<a href="https://colab.research.google.com/github/Usool-Data-Science/Natural-Language-Processing-Models/blob/main/Multilabel_sentimental_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import io
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import files
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Import the dataset using io and google.colab.files module

In [None]:
# Name the workbook is expected to be uploaded under.
WORKBOOK_NAME = 'Test of AI ML possibilities.xlsx'

uploaded = files.upload()

# `uploaded` is indexed by file name. Colab may store a re-uploaded file under
# a suffixed name (e.g. "name (1).xlsx"), so when the expected key is missing
# and exactly one file was uploaded, use that file instead of failing.
if WORKBOOK_NAME in uploaded:
    workbook_key = WORKBOOK_NAME
elif len(uploaded) == 1:
    workbook_key = next(iter(uploaded))
else:
    raise KeyError(
        'Expected an upload named {0!r}; got: {1}'.format(WORKBOOK_NAME, sorted(uploaded))
    )

# Skip the first 108 rows of the sheet and name the columns explicitly.
# NOTE(review): the original comment says "we dropped the numeric" column;
# confirm how the sheet's leading numeric column is handled by `names`.
phrases = pd.read_excel(
    io.BytesIO(uploaded[workbook_key]),
    skiprows=108,
    names=['phrase', 'class'],
)
phrases.head()

Saving Test of AI ML possibilities.xlsx to Test of AI ML possibilities.xlsx


Unnamed: 0,phrase,class
0,- asset management,4
1,- center,4
2,- co,4
3,- developer,4
4,- end development,4


## Check the distribution of the phrases

In [None]:
# Count how many times each distinct phrase occurs; a count above 1 would mean a duplicated phrase.
phrases['phrase'].value_counts()

- asset management     1
quality management     1
quality center         1
quality center 10.0    1
quality center alm     1
                      ..
fpsl                   1
fq events              1
fr                     1
Fra                    1
zypher                 1
Name: phrase, Length: 41523, dtype: int64

## Data Exploration and splitting

In [None]:
# Split the table by whether a class label is present. Each half is taken as
# an explicit copy so that later cells can modify the labelled rows without
# pandas emitting SettingWithCopyWarning and without touching `phrases`.
has_label = phrases['class'].notna()
labelled_phrases = phrases[has_label].copy()
unlabelled_phrases = phrases[~has_label].copy()

total_phrases = len(phrases)
# Guard the percentage against an empty table (avoids ZeroDivisionError).
labelled_pct = len(labelled_phrases) / total_phrases * 100 if total_phrases else 0.0

print('Total phrases: {0} \nTotal labelled phrases: {1} \nTotal unlabelled phrases: {2} \nPercentage Labelled: %{3:.2f}'.format(total_phrases, len(labelled_phrases), len(unlabelled_phrases), labelled_pct))


Total phrases: 41523 
Total labelled phrases: 7626 
Total unlabelled phrases: 33897 
Percentage Labelled: %18.37


In [None]:
# Defining some constants/hyperparameters
# NOTE(review): none of these three names is referenced in the visible cells;
# the training cell defines its own lowercase `batch_size`/`epochs` — confirm
# which values are intended to drive training.
BUFFER_SIZE = 1000 # for reshuffling
BATCH_SIZE = 100
NUM_EPOCHS = 20

In [None]:
# Rows whose class was entered as the string 'e' are recoded to class 4.
# `.loc` is used because the target is a set of row labels; `.at` only
# supports a single scalar row/column label.
index_to_change = labelled_phrases.index[labelled_phrases['class'] == 'e']
labelled_phrases.loc[index_to_change, 'class'] = 4

In [None]:
# Count occurrences of each distinct phrase among the labelled rows only.
labelled_phrases['phrase'].value_counts()

This is where I ensure that the `class` column is converted to int. Note that running the cell below should return `dtype('int64')`; otherwise later cells will fail with an error.

In [None]:
# Cast the label column to integers. The frame is rebound to the result of
# `astype` instead of assigning through `.loc[:, 'class']`: a full-column
# `.loc` assignment writes into the existing column in place on recent pandas,
# which can leave the dtype as `object`.
labelled_phrases = labelled_phrases.astype({'class': 'int'})

# Expected to display an integer dtype (dtype('int64') in the recorded run).
labelled_phrases['class'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


dtype('int64')

In [None]:
import random
# Shuffle the labelled rows once with a fixed seed so the split is repeatable.
labelled_phrases = labelled_phrases.sample(frac=1, replace=False, random_state=12345)
sentences, labels = labelled_phrases['phrase'], labelled_phrases['class']

# The first 80% of the shuffled rows train the model; the remainder is the test set.
training_size = int(len(sentences) * 0.8)

x_train, x_test = sentences.iloc[:training_size], sentences.iloc[training_size:]
y_train, y_test = labels.iloc[:training_size], labels.iloc[training_size:]

# Phrases without a label, to be classified once the model is trained.
x_future = unlabelled_phrases['phrase']

In [None]:
# Number of training examples per class label.
y_train.value_counts()

3    2960
4    2153
2     503
1     484
Name: class, dtype: int64

In [None]:
# Encoding settings.
max_words = 10000       # tokenizer vocabulary cap and width of each binary vector
vocab_size = 1000       # read again by the TSV export cell
embedding_dim = 16      # NOTE(review): not referenced in the visible cells
# Labels are used directly as one-hot positions, so one extra slot covers index 0.
num_classes = y_train.max() + 1

# Learn the vocabulary from the training phrases only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

# Phrase -> list of word indices, for each of the three phrase sets.
train_sequences, test_sequences, future_sequence = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test, x_future)
)

# Word-index lists -> fixed-width binary matrices (mode='binary').
x_train, x_test, x_future = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (train_sequences, test_sequences, future_sequence)
)

# One-hot encode the integer class labels.
y_train, y_test = (
    keras.utils.to_categorical(targets, num_classes) for targets in (y_train, y_test)
)

# Show the first encoded example and its width, for inputs and then targets.
for first_row in (x_train[0], y_train[0]):
    print(first_row)
    print(len(first_row))

[0. 0. 0. ... 0. 0. 0.]
10000
[0. 0. 0. 1. 0.]
5


In [None]:
# Inverse of the tokenizer vocabulary: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of word indices back into a space-separated string.

    Indices that are missing from `reverse_word_index` are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Feed-forward classifier over the binary bag-of-words vectors:
# Dense(512) -> ReLU -> Dropout(0.5) -> Dense(num_classes) -> softmax.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Early stopping to limit overfitting: stop once validation loss has not
# improved for 2 epochs and restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=2,
    verbose=0,
    restore_best_weights=True,
)

# Printed before any training; the recorded output for this cell is an empty list.
print(model.metrics_names)

[]


In [None]:
batch_size = 20
epochs = 20

# Train with 10% of the training rows held out for validation; the
# early-stopping callback monitors that validation split.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1,
)

# Evaluate on the held-out test rows; `score` is indexed as [loss, accuracy].
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/20
Epoch 2/20
Epoch 3/20
77/77 - 0s - loss: 0.6481 - accuracy: 0.6927 - 194ms/epoch - 3ms/step
Test loss: 0.648078978061676
Test accuracy: 0.6926605701446533


In [None]:
# Class probabilities for every unlabelled phrase (one row per phrase).
prob = model.predict(x_future)

# Predicted class = column with the highest probability in each row.
# A single vectorised argmax replaces the per-row Python loop; the result is
# still a list with one integer label per phrase.
future_class = list(np.argmax(prob, axis=1))



In [None]:
# Pair each unlabelled phrase with its predicted class, keeping the original row index.
newphrase = unlabelled_phrases['phrase']
predicted_data = newphrase.to_frame().assign(**{'class': future_class})
predicted_data

Unnamed: 0,phrase,class
45,.net 3.5,4
46,.net 4.0,4
47,.net 5,4
48,.net 6,4
50,.net architecture,4
...,...,...
41518,zwave,3
41519,zxing,3
41520,zynq,3
41521,zynq soc,3


## NETWORK VISUALIZATION

Visit http://projector.tensorflow.org/ and load both the vector file and the metadata file that are downloaded below, then click the "Sphereize" checkbox.

In [None]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 512)


In [None]:
import io

# Write the per-word weight vectors (vecs.tsv) and the matching words
# (meta.tsv), one line per word, for word indices 1 .. vocab_size - 1.
# The upper bound is clamped so that a vocabulary or weight matrix smaller than
# `vocab_size` cannot raise KeyError / IndexError part-way through the export.
# (Word indices are assumed to run contiguously from 1 - confirm.)
export_limit = min(vocab_size, len(weights), len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, export_limit):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join(map(str, embeddings)) + "\n")

In [None]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Row count of the full phrase table.
len(phrases)

41523

In [None]:
# Combine the human-labelled rows with the model's predictions for the
# unlabelled rows. `labelled_phrases` is used instead of `phrases.dropna()`
# because the 'e' -> 4 recode and the integer cast were applied to
# `labelled_phrases`, a separate object from `phrases`; merging the raw table
# would export the uncleaned labels. With no `on=` given, the merge joins on
# the columns the two frames share, so the outer join acts as a union of rows.
# sort=True keeps the row order independent of the earlier shuffle.
final_phrases = pd.merge(labelled_phrases.dropna(), predicted_data, how='outer', sort=True)
final_phrases

Unnamed: 0,phrase,class
0,- asset management,4
1,- center,4
2,- co,4
3,- developer,4
4,- end development,4
...,...,...
41518,zwave,3
41519,zxing,3
41520,zynq,3
41521,zynq soc,3


In [None]:
# Save the combined table as CSV (without the index column) and download it.
final_phrases.to_csv(
    'complete_data.csv',
    encoding='utf-8',
    index=False,
)
files.download('complete_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>