In [1]:
import numpy as np
import pandas as pd
import gensim,nltk,string,os,zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from nltk.stem import WordNetLemmatizer,PorterStemmer,LancasterStemmer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,SimpleRNN,LSTM,GRU
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping
from tensorflow.keras.optimizers import Adagrad,Adam,SGD,RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('C:\\Users\\raghav\\Desktop\\Files\\Jupyter_Notebook_Files\\NLP\\Word Embedding - Word2Vec & GloVe\\Botdata.csv',encoding='cp1252')

In [3]:
df.Responses.unique()

array(['Hello! how can i help you ?',
       'I am your virtual learning assistant',
       'I hope I was able to assist you, Good Bye',
       'Link: Machine Learning wiki ', 'Link: Neural Nets wiki',
       'Link: Olympus wiki', 'Please use respectful words',
       'Transferring the request to your PM'], dtype=object)

In [4]:
df.Tag.unique()

array(['Intro', 'Bot', 'Exit', 'SL', 'NN', 'Olympus', 'Profane', 'Ticket'],
      dtype=object)

In [5]:
def Data_Cleaning(x):
    Complete_data = []
    for i in x:
        single_row = []
        for j in word_tokenize(i):
            if j.lower() not in string.punctuation:
                single_row.append(j.lower())
        Complete_data.append(single_row)            
    return Complete_data

In [6]:
cleaned_data = Data_Cleaning(df['Pattern'])

In [7]:
cleaned_data

[['hi'],
 ['how', 'are', 'you'],
 ['hello', 'there'],
 ['hello'],
 ['whats', 'up'],
 ['hey'],
 ['yo'],
 ['listen'],
 ['please', 'help', 'me'],
 ['hi', 'there'],
 ['hello', 'bot'],
 ['whats', 'up', 'for', 'today'],
 ['hello', 'guys', 'i', 'need', 'a', 'help'],
 ['hey', 'there'],
 ['i', 'have', 'a', 'quick', 'question'],
 ['how', 'to', 'start'],
 ['online'],
 ['hey', 'ya'],
 ['talking', 'to', 'you', 'for', 'first', 'time'],
 ['anyone', 'there'],
 ['i', 'am', 'here', 'to', 'get', 'help'],
 ['someone', 'help', 'me', 'please'],
 ['ello'],
 ['wassuppp'],
 ['whats', 'happening', 'around', 'the', 'portal'],
 ['i', 'have', 'few', 'quick', 'questions'],
 ['i', 'need', 'a', 'help'],
 ['there'],
 ['what', 'is', 'your', 'name'],
 ['who', 'are', 'you'],
 ['how', 'do', 'they', 'call', 'you'],
 ['do', 'i', 'know', 'you'],
 ['who', 'is', 'there'],
 ['who', 'is', 'you'],
 ['your', 'name', 'please'],
 ['may', 'i', 'know', 'your', 'name'],
 ['speak', 'up'],
 ['are', 'you', 'a', 'human'],
 ['do', 'you', 'a

# We will create multiple vectorization techniques
## 1. Word2Vec (Self Trained)
## 2. Word2Vec (Pre-Trained)
## 3. GloVe(Pre-Trained)

## The above 3 data sources will be used on RNN, LSTM and GRU. We will have 9 different models

# 1. Word2Vec (Self Trained) - w2v

In [8]:
w2v = Word2Vec(cleaned_data,window=5,min_count=1,sg=0,vector_size=100)

In [9]:
w2v.wv.get_vector('hello')

array([-0.0071556 ,  0.00637592, -0.00201847, -0.00474921,  0.00495806,
       -0.00790987,  0.00294074,  0.00973763,  0.00686235, -0.00312551,
        0.00236688,  0.00485499,  0.00956698, -0.00275564,  0.00136494,
       -0.0042223 , -0.00316549,  0.00081706,  0.00216475, -0.00543462,
        0.00426822, -0.00508734, -0.00459894, -0.00905573,  0.00760234,
        0.00430341, -0.00228152,  0.00843985,  0.00194922, -0.00988891,
       -0.00773005, -0.00423402, -0.00730689,  0.00733085,  0.00502309,
        0.00208817, -0.00063066, -0.00746233,  0.00606645, -0.00528325,
       -0.00266641, -0.00587983, -0.00223673, -0.00931764,  0.00333597,
       -0.00745668,  0.00673162, -0.00749905, -0.00584616,  0.00012663,
       -0.00791986,  0.00530259, -0.00419767,  0.00035033,  0.00778899,
        0.00316128, -0.00018578,  0.00768974, -0.00019279,  0.00564487,
        0.00597251, -0.00798886,  0.00196761, -0.00948583, -0.00150069,
        0.00867612, -0.00244304, -0.00474332,  0.00266606,  0.00

# 2. Word2Vec (Pre-Trained) - pretrained_w2v

In [10]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [11]:
pretrained_w2v = api.load('fasttext-wiki-news-subwords-300')

In [12]:
pretrained_w2v.get_vector('india')

array([-0.082078  , -0.049793  , -0.0075161 , -0.020323  , -0.0092538 ,
       -0.072095  ,  0.033758  , -0.078571  ,  0.08059   , -0.052607  ,
       -0.020787  , -0.071582  ,  0.10044   ,  0.018694  , -0.0029067 ,
        0.039427  ,  0.14199   , -0.051042  ,  0.025533  ,  0.039338  ,
       -0.072977  ,  0.056744  ,  0.027786  ,  0.044195  ,  0.055468  ,
       -0.04315   ,  0.083226  , -0.10961   ,  0.047717  , -0.0097087 ,
        0.075673  ,  0.099839  ,  0.076693  ,  0.0033689 ,  0.0080532 ,
        0.07054   , -0.018284  ,  0.010139  , -0.030658  ,  0.088302  ,
       -0.096714  , -0.12911   ,  0.052357  , -0.15232   ,  0.030007  ,
        0.059222  ,  0.059283  , -0.066301  ,  0.049369  ,  0.049344  ,
       -0.065009  , -0.0059145 ,  0.028648  , -0.032529  , -0.080934  ,
       -0.066755  , -0.0057464 ,  0.00093308, -0.039737  ,  0.099084  ,
        0.018779  ,  0.027044  ,  0.1356    , -0.041382  ,  0.041289  ,
       -0.048331  , -0.067529  , -0.063287  ,  0.11987   , -0.05

# 3. GloVe(Pre-Trained) - Glove_dict

In [13]:
f = open("C:\\Users\\raghav\\Desktop\\Files\\Jupyter_Notebook_Files\\NLP\\Word Embedding - Word2Vec & GloVe\\glove.6B.50d.txt", encoding="utf8")

In [14]:
Glove_dict = {}

for i in f:
    Glove_dict[i.split()[0]] = i.split()[1:]

## Label Encoding 

In [15]:
LE = LabelEncoder()
y = LE.fit_transform(df['Responses'])

In [16]:
target = pd.get_dummies(y)

# Model1 = RNN with Word2Vec self trained - w2v

In [129]:
len(w2v.wv.key_to_index)

318

## Step1 : Using Tensorflow's Tokenizer, we are creating a text to sequences. The input to tokenizer is cleaned data that is in Word_Tokenized format. 

In [130]:
Tr= Tokenizer(num_words=319)
Tr.fit_on_texts(cleaned_data)
cleaned_data_seq = Tr.texts_to_sequences(cleaned_data)

## Step2: Create a numpy zero array (Tokenizers words length,Your word vector dimension). In our case it is (319,100)

In [131]:
embedding_matrix_Word2vec_Self = np.zeros((len(Tr.word_index)+1,100))

## Step3: Iterate over Tokenizer Word index items and get the word vector for each word and store it in numpy zero's matrix.

### We use Tokenizer's to get every word
### We use the word to get vector from word2vec model
### We save the vector in numpy zeros array

In [132]:
for word,i in Tr.word_index.items():
    if w2v.wv.get_vector(word) is not None:
        embedding_matrix_Word2vec_Self[i] = w2v.wv.get_vector(word)
    else:
        embedding_matrix_Word2vec_Self[i] = np.zeros((1,100))
        print(f"{word} not added")

## Step4: Pad the Sequences using the tokenizers text to sequence output

In [133]:
X = pad_sequences(cleaned_data_seq)
y = target

## Input Dimensions 

In [134]:
len(Tr.word_index)+1

319

## Output Dimensions  - 100 vector dimensions

In [135]:
max([len(i) for i in cleaned_data])

19

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
Model1 = Sequential()
Model1.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model1.add(SimpleRNN(100))
Model1.add(Dense(100,activation='relu'))
Model1.add(Dense(50,activation='relu'))
Model1.add(Dense(8,activation='softmax'))
Model1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [26]:
Model1.fit(X_train,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2d83f9c3760>

In [27]:
prediction = Model1.predict(X_test)

In [28]:
prediction[0]

array([0.2867351 , 0.15578812, 0.08798981, 0.1223792 , 0.21393375,
       0.00864877, 0.09907303, 0.02545232], dtype=float32)

In [29]:
value,classes_pred = tf.math.top_k(prediction, k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [54]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.17      0.11      0.13         9
           1       0.13      0.15      0.14        13
           2       0.25      0.09      0.13        11
           3       0.06      0.11      0.07         9
           4       0.14      0.29      0.19        14
           5       0.11      0.07      0.09        14
           6       0.12      0.09      0.11        11
           7       0.29      0.14      0.19        14

    accuracy                           0.14        95
   macro avg       0.16      0.13      0.13        95
weighted avg       0.16      0.14      0.14        95



# Model2  = RNN with Word2vec Pretrained

In [55]:
cleaned_data

[['hi'],
 ['how', 'are', 'you'],
 ['hello', 'there'],
 ['hello'],
 ['whats', 'up'],
 ['hey'],
 ['yo'],
 ['listen'],
 ['please', 'help', 'me'],
 ['hi', 'there'],
 ['hello', 'bot'],
 ['whats', 'up', 'for', 'today'],
 ['hello', 'guys', 'i', 'need', 'a', 'help'],
 ['hey', 'there'],
 ['i', 'have', 'a', 'quick', 'question'],
 ['how', 'to', 'start'],
 ['online'],
 ['hey', 'ya'],
 ['talking', 'to', 'you', 'for', 'first', 'time'],
 ['anyone', 'there'],
 ['i', 'am', 'here', 'to', 'get', 'help'],
 ['someone', 'help', 'me', 'please'],
 ['ello'],
 ['wassuppp'],
 ['whats', 'happening', 'around', 'the', 'portal'],
 ['i', 'have', 'few', 'quick', 'questions'],
 ['i', 'need', 'a', 'help'],
 ['there'],
 ['what', 'is', 'your', 'name'],
 ['who', 'are', 'you'],
 ['how', 'do', 'they', 'call', 'you'],
 ['do', 'i', 'know', 'you'],
 ['who', 'is', 'there'],
 ['who', 'is', 'you'],
 ['your', 'name', 'please'],
 ['may', 'i', 'know', 'your', 'name'],
 ['speak', 'up'],
 ['are', 'you', 'a', 'human'],
 ['do', 'you', 'a

## Step1: Tokenizer to get text sequence

In [141]:
TK2 = Tokenizer()
TK2.fit_on_texts(cleaned_data)
cleaned_data_Seq = TK2.texts_to_sequences(cleaned_data)

In [142]:
pretrained_w2v

<gensim.models.keyedvectors.KeyedVectors at 0x2d7dbeede50>

## Step2: Create a numpy zeros array ok tokenizer word index length+1 and embedding dimensions

In [143]:
pretrained_w2v_embedding = np.zeros((len(TK2.word_index)+1,300))

## Step3: Iterate over the tokenizer index items that gives word and word index, update the Numpy zeros using array index with vectors for each word.

In [144]:
for word,i in TK2.word_index.items():
    try:
        pretrained_w2v_embedding[i] = pretrained_w2v.get_vector(word)
    except:
        print(f"The term {word} is not available in the pretrained model.")

The term svm is not available in the pretrained model.
The term wassuppp is not available in the pretrained model.
The term techb=niques is not available in the pretrained model.
The term imputer is not available in the pretrained model.
The term diffult is not available in the pretrained model.
The term relu is not available in the pretrained model.
The term otimizer is not available in the pretrained model.
The term olypus is not available in the pretrained model.
The term shutttt is not available in the pretrained model.
The term upppp is not available in the pretrained model.
The term aiml is not available in the pretrained model.


## Step4: Find the max length of the cleaned data. Later pad the sequences using Tokenizer's Text to sequence content and perform train test split

In [145]:
max([len(i) for i in cleaned_data])

19

In [146]:
X = pad_sequences(cleaned_data_Seq,maxlen=19)
y = target

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Step5: Create an Embedding layer using the below mentioned:
### Input Dimensions: Tokenizer word index length + 1
### Output Dimensions: Embedding dimension
### Weights : The numpy zeros array that was filled with vectors

In [63]:
Model2 = Sequential()
Model2.add(Embedding(input_dim=len(TK2.word_index)+1,output_dim=300,weights=[pretrained_w2v_embedding]))
Model2.add(SimpleRNN(100))
Model2.add(Dense(150,activation='relu'))
Model2.add(Dense(75,activation='relu'))
Model2.add(Dense(30,activation='relu'))
Model2.add(Dense(8,activation='softmax'))
Model2.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

In [64]:
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.01,patience=10)
Model2.fit(X_train,y_train,epochs=100,callbacks=[ES],validation_data=(X_test,y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<keras.callbacks.History at 0x2d84c552340>

In [84]:
prediction_2 = Model2.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_2,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [86]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.64      0.75      0.69        12
           1       0.55      0.67      0.60         9
           2       0.62      0.42      0.50        12
           3       0.54      0.93      0.68        14
           4       0.86      0.46      0.60        13
           5       0.79      1.00      0.88        11
           6       0.71      0.45      0.56        11
           7       0.60      0.46      0.52        13

    accuracy                           0.64        95
   macro avg       0.66      0.64      0.63        95
weighted avg       0.66      0.64      0.63        95



# Model 3: RNN with GloVe(Pre-Trained) - Glove_dict

## Step1: Use Tokenizer to create text sequences

In [151]:
tk3 = Tokenizer()
tk3.fit_on_texts(cleaned_data)
data_seq = tk3.texts_to_sequences(cleaned_data)

## Step2: Create numpy zeros array using tokenizer word index length and 50 dimensions. Then, iterate over the tokenizer's word index items to get word and their index, using this update the numpy zeros array with the Glove vector.

In [152]:
Cleaned_data_Glove = np.zeros((len(tk3.word_index.items())+1,50))

In [153]:
for word,i in tk3.word_index.items():
    try:
        Cleaned_data_Glove[i] = Glove_dict[word]
    except:
        print(f"The word {word} is not available in Glove")  

The word knn is not available in Glove
The word wassuppp is not available in Glove
The word techb=niques is not available in Glove
The word imputer is not available in Glove
The word diffult is not available in Glove
The word softmax is not available in Glove
The word relu is not available in Glove
The word otimizer is not available in Glove
The word olypus is not available in Glove
The word shutttt is not available in Glove
The word upppp is not available in Glove
The word intrested is not available in Glove
The word aiml is not available in Glove


## Step3: Find the sequences max length, use this to create padded sequences and generate train test split

In [154]:
max([len(i) for i in cleaned_data])

19

In [155]:
X = pad_sequences(data_seq,maxlen=19)
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [156]:
ES=EarlyStopping(monitor='val_accuracy',min_delta=0.001,patience=10)

In [93]:
Model3 = Sequential()
Model3.add(Embedding(input_dim=len(tk3.word_index)+1, output_dim=50,input_length=19,weights=[Cleaned_data_Glove]))
Model3.add(SimpleRNN(100))
Model3.add(Dense(100,activation='tanh'))
Model3.add(Dense(50,activation='tanh'))
Model3.add(Dense(8,activation='softmax'))
Model3.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
Model3.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x2d852aade20>

In [94]:
len(tk3.word_index)

318

In [95]:
prediction_3 = Model3.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_3,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [96]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.56      0.56      0.56         9
           1       0.65      0.85      0.73        13
           2       0.43      0.55      0.48        11
           3       0.54      0.78      0.64         9
           4       0.75      0.64      0.69        14
           5       1.00      0.64      0.78        14
           6       0.50      0.36      0.42        11
           7       0.62      0.57      0.59        14

    accuracy                           0.62        95
   macro avg       0.63      0.62      0.61        95
weighted avg       0.65      0.62      0.62        95



# Model4 : LSTM with Word2vec Self trained

In [107]:
Model4 = Sequential()
Model4.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model4.add(LSTM(100))
Model4.add(Dense(100,activation='relu'))
Model4.add(Dense(50,activation='relu'))
Model4.add(Dense(8,activation='softmax'))
Model4.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.001,patience=10)
Model4.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<keras.callbacks.History at 0x2d8530b8280>

In [108]:
prediction_4 = Model4.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_4,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [109]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.39      0.75      0.51        12
           1       0.75      0.33      0.46         9
           2       0.67      0.50      0.57        12
           3       0.75      0.86      0.80        14
           4       1.00      0.46      0.63        13
           5       0.92      1.00      0.96        11
           6       0.54      0.64      0.58        11
           7       0.58      0.54      0.56        13

    accuracy                           0.64        95
   macro avg       0.70      0.63      0.63        95
weighted avg       0.70      0.64      0.64        95



# Model5 : LSTM with Word2vec Pretrained 

In [117]:
Model5 = Sequential()
Model5.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model5.add(LSTM(100))
Model5.add(Dense(100,activation='relu'))
Model5.add(Dense(50,activation='relu'))
Model5.add(Dense(8,activation='softmax'))
Model5.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.001,patience=10)
Model5.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


<keras.callbacks.History at 0x2d854b77c70>

In [119]:
prediction_5 = Model5.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_5,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [120]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.50      0.50      0.50        12
           1       0.71      0.56      0.63         9
           2       0.32      0.58      0.41        12
           3       0.68      0.93      0.79        14
           4       1.00      0.46      0.63        13
           5       0.85      1.00      0.92        11
           6       1.00      0.27      0.43        11
           7       0.54      0.54      0.54        13

    accuracy                           0.61        95
   macro avg       0.70      0.61      0.60        95
weighted avg       0.70      0.61      0.61        95



# Model6 : LSTM with GloVe Pretrained embeddings

In [126]:
Model6 = Sequential()
Model6.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model6.add(LSTM(100))
Model6.add(Dense(100,activation='relu'))
Model6.add(Dense(50,activation='relu'))
Model6.add(Dense(8,activation='softmax'))
Model6.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.001,patience=10)
Model6.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


<keras.callbacks.History at 0x2d852d639d0>

In [127]:
prediction_6 = Model6.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_6,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [128]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.44      0.44      0.44         9
           1       0.60      0.46      0.52        13
           2       0.35      0.64      0.45        11
           3       1.00      0.89      0.94         9
           4       0.76      0.93      0.84        14
           5       0.92      0.79      0.85        14
           6       0.40      0.18      0.25        11
           7       0.71      0.71      0.71        14

    accuracy                           0.64        95
   macro avg       0.65      0.63      0.63        95
weighted avg       0.66      0.64      0.64        95



# Model7 : GRU with Word2Vec Self trained embeddings 

In [138]:
Model7 = Sequential()
Model7.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model7.add(GRU(100))
Model7.add(Dense(100,activation='relu'))
Model7.add(Dense(50,activation='relu'))
Model7.add(Dense(8,activation='softmax'))
Model7.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.0001,patience=10,mode='max')
Model7.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<keras.callbacks.History at 0x2d86806a700>

In [139]:
prediction_7 = Model7.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_7,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [140]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.40      0.50      0.44        12
           1       0.57      0.44      0.50         9
           2       0.35      0.67      0.46        12
           3       0.80      0.86      0.83        14
           4       1.00      0.46      0.63        13
           5       0.79      1.00      0.88        11
           6       0.80      0.36      0.50        11
           7       0.70      0.54      0.61        13

    accuracy                           0.61        95
   macro avg       0.68      0.60      0.61        95
weighted avg       0.68      0.61      0.61        95



# Model8 : GRU with Word2Vec Pre trained embeddings 

In [148]:
Model8 = Sequential()
Model8.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model8.add(GRU(100))
Model8.add(Dense(100,activation='relu'))
Model8.add(Dense(50,activation='relu'))
Model8.add(Dense(8,activation='softmax'))
Model8.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.0001,patience=10,mode='max')
Model8.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<keras.callbacks.History at 0x2d86c63ed00>

In [149]:
prediction_8 = Model8.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_8,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [150]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.29      0.42      0.34        12
           1       0.75      0.33      0.46         9
           2       0.38      0.42      0.40        12
           3       0.75      0.86      0.80        14
           4       0.44      0.62      0.52        13
           5       0.92      1.00      0.96        11
           6       0.75      0.27      0.40        11
           7       0.73      0.62      0.67        13

    accuracy                           0.58        95
   macro avg       0.63      0.57      0.57        95
weighted avg       0.62      0.58      0.57        95



# Model9 : GRU with GloVe Pre trained embeddings 

In [157]:
Model9 = Sequential()
Model9.add(Embedding(input_dim = len(Tr.word_index)+1,output_dim =100,weights = [embedding_matrix_Word2vec_Self],input_length=19))
Model9.add(GRU(100))
Model9.add(Dense(100,activation='relu'))
Model9.add(Dense(50,activation='relu'))
Model9.add(Dense(8,activation='softmax'))
Model9.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
ES = EarlyStopping(monitor='val_accuracy',min_delta=0.0001,patience=10,mode='max')
Model9.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=[ES])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


<keras.callbacks.History at 0x2d870b71e80>

In [159]:
prediction_9 = Model9.predict(X_test)
value,classes_pred = tf.math.top_k(prediction_9,k=1, sorted=True, name=None)
classes_actual = y_test.idxmax(axis=1)

In [160]:
print(classification_report(classes_actual,classes_pred.numpy()))

              precision    recall  f1-score   support

           0       0.60      0.33      0.43         9
           1       0.53      0.69      0.60        13
           2       0.31      0.82      0.45        11
           3       1.00      0.89      0.94         9
           4       0.90      0.64      0.75        14
           5       0.92      0.79      0.85        14
           6       0.50      0.18      0.27        11
           7       1.00      0.71      0.83        14

    accuracy                           0.64        95
   macro avg       0.72      0.63      0.64        95
weighted avg       0.73      0.64      0.65        95

