In [64]:
#Importing bunch of libraries.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM,Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Fetch the NLTK stop-word corpus used later by stopwords.words('english').
# The call reports the package as already up to date when a local copy exists.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Downloading the dataset from the kaggle.
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d sid321axn/amazon-alexa-reviews
!mkdir data
!unzip amazon-alexa-reviews -d /content/data/

Saving kaggle.json to kaggle.json
Downloading amazon-alexa-reviews.zip to /content
  0% 0.00/164k [00:00<?, ?B/s]
100% 164k/164k [00:00<00:00, 60.4MB/s]
Archive:  amazon-alexa-reviews.zip
  inflating: /content/data/amazon_alexa.tsv  


In [4]:
# Sanity check: list the extracted files in the data directory.
!ls data


amazon_alexa.tsv


In [22]:
# Read the tab-separated review file into a DataFrame and preview the first rows.
data_tb = pd.read_csv(
    '/content/data/amazon_alexa.tsv',
    sep='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are kept as ordinary text
)
data_tb.head()


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"""Sometimes while playing a game, you can answe...",1
3,5,31-Jul-18,Charcoal Fabric,"""I have had a lot of fun with this thing. My 4...",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [76]:
# Class balance of the target: number of reviews per feedback label.
data_tb.loc[:, 'feedback'].value_counts()


1    2893
0     257
Name: feedback, dtype: int64

In [23]:
# Number of reviews (rows) loaded into the DataFrame.
print(len(data_tb))

3150


In [24]:
#Lets perform basic EDA to get insights into data.
# describe() reports count/mean/std/quartiles; in the output below it covers
# the rating and feedback columns.
data_tb.describe()

Unnamed: 0,rating,feedback
count,3150.0,3150.0
mean,4.463175,0.918413
std,1.068506,0.273778
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [25]:
#Lets find the maximum and minimum length (in words) of a review so that we can size the LSTM input.
# Select the review text by column name rather than by position; the double
# brackets keep the original (n_reviews, 1) array shape.
verified_rev_array=data_tb[['verified_reviews']].values
# Count words on the review string itself (column 0 of each row). Iterating
# the 2-D array directly yields 1-element arrays whose str() is "['...']",
# which miscounts blank reviews as 2 tokens instead of 0.
verified_rev_array_len=[len(str(text).split()) for text in verified_rev_array[:,0]]
print("Max_text_length=",max(verified_rev_array_len))
print("Min_text_length=",min(verified_rev_array_len))

Max_text_length= 526
Min_text_length= 1


In [0]:
#Lets do the basic data preprocessing.
# For every review: keep letters only, lower-case, drop English stop words,
# stem the remaining words and join them back into one string.

# Built once up front: the stemmer and the stop-word set do not change between
# reviews, and rebuilding the set for every word is needlessly slow.
stemmer=PorterStemmer()
stop_words=set(stopwords.words('english'))

reviews=[]

# Iterate over the column itself so the loop follows the data size instead of
# a hard-coded row count.
for raw_review in data_tb['verified_reviews']:
  # A missing review (NaN) becomes an empty string instead of raising in re.sub.
  text='' if pd.isna(raw_review) else str(raw_review)
  text=re.sub('[^a-zA-Z]',' ',text) #Replace every non-letter (digits, punctuation, emoji) with a space.
  words=text.lower().split() #Lower-case, then split on whitespace.
  stemmed=[stemmer.stem(word) for word in words if word not in stop_words] #Drop stop words, stem the rest.
  reviews.append(' '.join(stemmed)) #Store the cleaned review as a single string.




In [80]:
# Encode every cleaned review as a sequence of integer word indices.

# Target label: the column at position 4 of the DataFrame.
Y = data_tb.iloc[:, 4].values

# num_words=2000 limits the encoding to the most frequent words seen during fitting.
tokenizer = Tokenizer(num_words=2000, lower=True, split=' ')
tokenizer.fit_on_texts(reviews)             # learn the word -> index vocabulary
X = tokenizer.texts_to_sequences(reviews)   # one list of word indices per review

# Show one encoded review as an example.
print(X[100])

[3, 45, 12, 70, 174, 18, 1003, 113, 45, 792, 85, 119, 727, 40, 2, 387, 728, 263, 41]


In [0]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# shuffled split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)


In [0]:
# Bring every encoded review to a common length of 526 (the longest review
# measured earlier) so the batches have a uniform shape.
X_train, X_test = [
    sequence.pad_sequences(encoded_split, maxlen=526)
    for encoded_split in (X_train, X_test)
]

In [83]:
#Build the LSTM model (many-to-one RNN: one prediction per review).
max_words=2000        # vocabulary size; same value as Tokenizer(num_words=2000)
output_vector_len=32  # size of the embedding vector learned for each word
max_review_len=526    # padded sequence length; same value as pad_sequences(maxlen=526)

model=Sequential()
model.add(Embedding(max_words,output_vector_len,input_length=max_review_len)) #Maps each word index to a 32-dimensional vector.
model.add(LSTM(100)) #LSTM layer with 100 units.
model.add(Dense(1,activation='sigmoid')) #Single sigmoid output for the binary feedback label.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 526, 32)           64000     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 117,301
Trainable params: 117,301
Non-trainable params: 0
_________________________________________________________________
None


In [85]:
# Fit on the training split, then score the held-out test split.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
)

# evaluate() returns the loss followed by the compiled metrics, so index 1 is accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy:%.2f%%" % (test_accuracy * 100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:93.02%


In [86]:
#Lets see the confusion matrix too.
from sklearn.metrics import confusion_matrix
# Sequential.predict_classes is deprecated and absent from newer Keras/TensorFlow
# releases. Thresholding the sigmoid output at 0.5 is the documented replacement
# for a single-unit binary classifier.
y_pred=(model.predict(X_test) > 0.5).astype("int32")
# scikit-learn layout: rows are the true classes, columns the predicted classes,
# i.e. [[TN, FP], [FN, TP]] for labels 0/1.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 21  33]
 [ 11 565]]


# Conclusion:-
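#### Looking at the confusion matrix (rows = true class, columns = predicted class, so TN=21, FP=33, FN=11, TP=565), we observe that the false positive rate is FP/(FP+TN) = 33/(33+21) ≈ 61.1%, which is high: most negative reviews are predicted as positive, as expected with such an imbalanced dataset. (The figure 5.51% is FP/(FP+TP), i.e. the false discovery rate, which is low.)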
#### And the false negative rate is FN/(FN+TP) = 11/(11+565) ≈ 1.9%, which is low. (The figure 34.375% is FN/(FN+TN), i.e. the false omission rate.)