In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,SimpleRNN,LSTM,GRU,Bidirectional

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Alexa reviews table; the file is tab-separated (sep='\t').
# NOTE(review): the absolute path looks like a Colab Google Drive mount — adjust when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/NLP(Classes)/alexa_reviews.tsv',sep='\t')

In [3]:
# Preview the first rows to sanity-check how the file was parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,verified_reviews,feedback
0,0,Love my Echo!,1
1,1,Loved it!,1
2,2,"Sometimes while playing a game, you can answer...",1
3,3,I have had a lot of fun with this thing. My 4 ...,1
4,4,Music,1


In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        3150 non-null   int64 
 1   verified_reviews  3150 non-null   object
 2   feedback          3150 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 74.0+ KB


In [5]:
# Tally rows per feedback label to check the class balance.
df['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [6]:
# Input is the raw review text; target is the binary feedback label.
X, y = df['verified_reviews'], df['feedback']

**DIVIDING INTO TRAINING AND TESTING DATA**

In [7]:
# Hold out 30% of the reviews for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified although the feedback classes are
# imbalanced — consider stratify=y (would change every result below).
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y,
    random_state=1,
    test_size=0.3,
)

**TRAIN DATA PRE-PROCESSING**

In [8]:
#Tokenization
# The word index is learned from the training reviews only.
tok = Tokenizer()
tok.fit_on_texts(texts=xtrain)

In [9]:
#Vocabulary length
# Number of distinct words indexed by the tokenizer (word->index map has the
# same size as its inverse index->word map).
vocab_len = len(tok.word_index)
vocab_len

3632

In [10]:
#Text to sequence
# Replace each training review by the list of its word indices.
train_sequence = tok.texts_to_sequences(texts=xtrain)

In [11]:
# Token count of every training review.
doc_len = [len(seq) for seq in train_sequence]

In [12]:
# 99th percentile of the tokenised review lengths.
np.quantile(doc_len,0.99)

151.96000000000004

In [13]:
# Padding/truncation length: the 99th-percentile review length in tokens,
# truncated to an int. Derived from doc_len rather than hand-copied, so it
# follows the training data (evaluates to 151 for the recorded run).
max_len=int(np.quantile(doc_len,0.99))

In [14]:
#Padding
# Bring every training sequence to exactly max_len tokens; zeros are added (and
# over-long reviews cut) at the front, which is the library default.
train_matrix = sequence.pad_sequences(
    train_sequence,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)
train_matrix

array([[   0,    0,    0, ..., 1234,   19,   44],
       [   0,    0,    0, ...,  765,    6, 1521],
       [   0,    0,    0, ...,    4,   30,   45],
       ...,
       [   0,    0,    0, ...,  315,   16,   49],
       [   0,    0,    0, ...,    0,  525,  570],
       [   0,    0,    0, ...,    1,   11,  141]], dtype=int32)

**TEST DATA PRE-PROCESSING**

In [15]:
# Encode the test reviews with the tokenizer fitted on the training set, then
# pad/truncate at the front to max_len, mirroring the training matrix.
test_sequence = tok.texts_to_sequences(texts=xtest)
test_matrix = sequence.pad_sequences(
    test_sequence,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)
test_matrix

array([[  0,   0,   0, ..., 655,  67,   8],
       [  0,   0,   0, ...,   4,  50, 976],
       [  0,   0,   0, ...,   7, 611, 134],
       ...,
       [  0,   0,   0, ..., 427,  11,  48],
       [  0,   0,   0, ...,  10,   1,  48],
       [  0,   0,   0, ..., 482,  15,  24]], dtype=int32)

**RNN**

In [16]:
#Single layer RNN
# Embedding -> SimpleRNN(128) -> Dense(64, tanh) -> single sigmoid unit.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    SimpleRNN(units=128),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 151, 50)           181650    
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               22912     
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 212,883
Trainable params: 212,883
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [18]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f6199472950>

In [19]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [20]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [21]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.69      0.27      0.39        73
           1       0.94      0.99      0.97       872

    accuracy                           0.93       945
   macro avg       0.82      0.63      0.68       945
weighted avg       0.92      0.93      0.92       945



In [22]:
#Bidirectional RNN
# Same stack as the single-layer RNN, with the SimpleRNN wrapped in Bidirectional.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    Bidirectional(SimpleRNN(units=128)),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 151, 50)           181650    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              45824     
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 243,987
Trainable params: 243,987
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [24]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f6199321850>

In [25]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [26]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [27]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.75      0.49      0.60        73
           1       0.96      0.99      0.97       872

    accuracy                           0.95       945
   macro avg       0.85      0.74      0.78       945
weighted avg       0.94      0.95      0.94       945



In [28]:
#Multiple layer RNN
# Two stacked SimpleRNN layers (128 then 64 units) ahead of the dense head.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    # return_sequences=True hands the full per-timestep output to the next recurrent layer.
    SimpleRNN(units=128, return_sequences=True),
    SimpleRNN(units=64),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 151, 50)           181650    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 151, 128)          22912     
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                12352     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 221,139
Trainable params: 221,139
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [30]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f6198cb3790>

In [31]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [32]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [33]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.57      0.36      0.44        73
           1       0.95      0.98      0.96       872

    accuracy                           0.93       945
   macro avg       0.76      0.67      0.70       945
weighted avg       0.92      0.93      0.92       945



**LSTM**

In [34]:
#Single layer LSTM
# Embedding -> LSTM(128) -> Dense(64, tanh) -> single sigmoid unit.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    LSTM(units=128),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 151, 50)           181650    
                                                                 
 lstm (LSTM)                 (None, 128)               91648     
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 281,619
Trainable params: 281,619
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [36]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f619331e9d0>

In [37]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [38]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [39]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.76      0.40      0.52        73
           1       0.95      0.99      0.97       872

    accuracy                           0.94       945
   macro avg       0.86      0.69      0.75       945
weighted avg       0.94      0.94      0.94       945



In [40]:
#Bidirectional LSTM
# Same stack as the single-layer LSTM, with the LSTM wrapped in Bidirectional.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    Bidirectional(LSTM(units=128)),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 151, 50)           181650    
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              183296    
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 64)                16448     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 381,459
Trainable params: 381,459
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [42]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f61923879d0>

In [43]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [44]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [45]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.77      0.47      0.58        73
           1       0.96      0.99      0.97       872

    accuracy                           0.95       945
   macro avg       0.86      0.73      0.78       945
weighted avg       0.94      0.95      0.94       945



In [46]:
#Multi-layer LSTM
# Two stacked LSTM layers of 128 units each ahead of the dense head.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    # return_sequences=True hands the full per-timestep output to the next recurrent layer.
    LSTM(units=128, return_sequences=True),
    LSTM(units=128),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 151, 50)           181650    
                                                                 
 lstm_2 (LSTM)               (None, 151, 128)          91648     
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 413,203
Trainable params: 413,203
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [48]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f61904969d0>

In [49]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [50]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [51]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.75      0.45      0.56        73
           1       0.96      0.99      0.97       872

    accuracy                           0.95       945
   macro avg       0.85      0.72      0.77       945
weighted avg       0.94      0.95      0.94       945



**GRU**

In [52]:
#GRU
# Embedding -> GRU(128) -> Dense(64, tanh) -> single sigmoid unit.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    GRU(units=128),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 151, 50)           181650    
                                                                 
 gru (GRU)                   (None, 128)               69120     
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dense_13 (Dense)            (None, 1)                 65        
                                                                 
Total params: 259,091
Trainable params: 259,091
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [54]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f618cada090>

In [55]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [56]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [57]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.64      0.41      0.50        73
           1       0.95      0.98      0.97       872

    accuracy                           0.94       945
   macro avg       0.80      0.70      0.73       945
weighted avg       0.93      0.94      0.93       945



In [58]:
#Bidirectional GRU
# Same stack as the single-layer GRU, with the GRU wrapped in Bidirectional.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    Bidirectional(GRU(units=128)),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 151, 50)           181650    
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              138240    
 nal)                                                            
                                                                 
 dense_14 (Dense)            (None, 64)                16448     
                                                                 
 dense_15 (Dense)            (None, 1)                 65        
                                                                 
Total params: 336,403
Trainable params: 336,403
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [60]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f618a5e9ed0>

In [61]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [62]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [63]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.78      0.53      0.63        73
           1       0.96      0.99      0.97       872

    accuracy                           0.95       945
   macro avg       0.87      0.76      0.80       945
weighted avg       0.95      0.95      0.95       945



In [64]:
#Multi-layer GRU
# Two stacked GRU layers of 128 units each ahead of the dense head.
model = Sequential([
    # vocab_len+1 rows because index 0 is the padding value (masked by mask_zero).
    Embedding(input_dim=vocab_len+1, output_dim=50, input_length=max_len, mask_zero=True),
    # return_sequences=True hands the full per-timestep output to the next recurrent layer.
    GRU(units=128, return_sequences=True),
    GRU(units=128),
    Dense(units=64, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 151, 50)           181650    
                                                                 
 gru_2 (GRU)                 (None, 151, 128)          69120     
                                                                 
 gru_3 (GRU)                 (None, 128)               99072     
                                                                 
 dense_16 (Dense)            (None, 64)                8256      
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 358,163
Trainable params: 358,163
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Binary cross-entropy matches the single sigmoid output; Adam with default settings.
model.compile(loss="binary_crossentropy", optimizer="adam")

In [66]:
# Train for 15 epochs in mini-batches of 64; the returned History is the cell output.
model.fit(x=train_matrix, y=ytrain, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f6189c83cd0>

In [67]:
# Score the padded test reviews with the trained model.
ypred = model.predict(x=test_matrix)

In [68]:
# Turn the scores into hard 0/1 labels at the 0.5 cut-off.
ypred = (ypred >= 0.5).astype(int)

In [69]:
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.71      0.47      0.56        73
           1       0.96      0.98      0.97       872

    accuracy                           0.94       945
   macro avg       0.83      0.72      0.77       945
weighted avg       0.94      0.94      0.94       945

