# Amazon Food Review Classification Prediction Using RNN

In [3]:
# importing Libraries
import pandas as pd
import numpy as np

import tensorflow as tf
import keras
from keras.layers import Dense, SimpleRNN, Embedding 
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.preprocessing.text import Tokenizer #to tokenize the words into sequence of unique numbers
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [4]:
reviews=pd.read_csv(r"D:\Reviews.csv")

In [81]:
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [82]:
df=reviews.iloc[:,[6,9]]

In [83]:
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [84]:
df=df.rename(columns={'Score':'Y','Text':'X'})

In [85]:
df.Y.value_counts()

Y
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [86]:
df=df.replace({1:0,2:1,3:2,4:3,5:4})

In [87]:
# creating hyperparameters
max_num_words = 10000 #unique words to be considered in set of documents
seq_len=50 # how many of the unique words are preset in each document
embedding_size = 100 # vector length of each word

In [88]:
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(df['X']) # use the full dataset

In [89]:
from sklearn.model_selection import train_test_split

df_train,df_test=train_test_split(df,test_size=.2)

df_train_x=df_train.X
df_train_y=df_train.Y

df_test_x=df_test.X
df_test_y=df_test.Y

In [102]:
df_train_y=to_categorical(df_train_y)

In [104]:
#sequencing and padding train test X data

df_train_x=tokenizer.texts_to_sequences(df_train_x)
df_train_x=pad_sequences(df_train_x,maxlen=seq_len)

df_test_x=tokenizer.texts_to_sequences(df_test_x)
df_test_x=pad_sequences(df_test_x,maxlen=seq_len)

In [105]:
#model building
model=Sequential()

model.add(Embedding(input_dim=max_num_words,output_dim=embedding_size))
model.add(SimpleRNN(32))
model.add(Dense(5,activation='softmax'))

adam=Adam(learning_rate=0.001)

model.compile(optimizer=adam, loss='categorical_crossentropy',metrics=['accuracy'])

In [106]:
model.fit(df_train_x,df_train_y,validation_split=.2,batch_size=64,epochs=5)

Epoch 1/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 20ms/step - accuracy: 0.6842 - loss: 0.8946 - val_accuracy: 0.7239 - val_loss: 0.7588
Epoch 2/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 21ms/step - accuracy: 0.7573 - loss: 0.6778 - val_accuracy: 0.7359 - val_loss: 0.7411
Epoch 3/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 22ms/step - accuracy: 0.7907 - loss: 0.5909 - val_accuracy: 0.7394 - val_loss: 0.7542
Epoch 4/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 23ms/step - accuracy: 0.8108 - loss: 0.5388 - val_accuracy: 0.7426 - val_loss: 0.7814
Epoch 5/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 24ms/step - accuracy: 0.8272 - loss: 0.5002 - val_accuracy: 0.7412 - val_loss: 0.7630


<keras.src.callbacks.history.History at 0x20ba30f4ad0>

In [110]:
pred=model.predict(df_test_x)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step


In [112]:
pred_cat=pred.argmax(axis=1)

In [114]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [116]:
accuracy_score(df_test_y,pred_cat)

0.7452568804918596

In [120]:
confusion_matrix(df_test_y,pred_cat)

array([[ 7062,   802,   687,   366,  1598],
       [ 1389,  1953,   929,   532,  1128],
       [  777,   643,  3764,  1369,  2023],
       [  418,   382,  1325,  6552,  7403],
       [ 1006,   576,  1390,  4219, 65398]], dtype=int64)

In [None]:
# 5 X 5
# array([[ 7062,   802,   687,   366,  1598],
#        [ 1389,  1953,   929,   532,  1128],
#        [  777,   643,  3764,  1369,  2023],
#        [  418,   382,  1325,  6552,  7403],
#        [ 1006,   576,  1390,  4219, 65398]], dtype=int64)
# accuracy_score = 0.7452568804918596

In [124]:
df1=df.replace({1:0,2:0,3:1,4:1})

In [126]:
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(df1['X']) # use the full dataset

In [128]:
from sklearn.model_selection import train_test_split

df1_train,df1_test=train_test_split(df1,test_size=.2)

df1_train_x=df1_train.X
df1_train_y=df1_train.Y

df1_test_x=df1_test.X
df1_test_y=df1_test.Y

In [131]:
df1_train_y=to_categorical(df1_train_y)

In [133]:
#sequencing and padding train test X data

df1_train_x=tokenizer.texts_to_sequences(df1_train_x)
df1_train_x=pad_sequences(df1_train_x,maxlen=seq_len)

df1_test_x=tokenizer.texts_to_sequences(df1_test_x)
df1_test_x=pad_sequences(df1_test_x,maxlen=seq_len)

In [134]:
#model building
model=Sequential()

model.add(Embedding(input_dim=max_num_words,output_dim=embedding_size))
model.add(SimpleRNN(32))
model.add(Dense(2,activation='softmax'))

adam=Adam(learning_rate=0.001)

model.compile(optimizer=adam, loss='categorical_crossentropy',metrics=['accuracy'])

In [137]:
model.fit(df1_train_x,df1_train_y,validation_split=.2,batch_size=64,epochs=5)

Epoch 1/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 19ms/step - accuracy: 0.8540 - loss: 0.3397 - val_accuracy: 0.8883 - val_loss: 0.2727
Epoch 2/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 19ms/step - accuracy: 0.9042 - loss: 0.2408 - val_accuracy: 0.8962 - val_loss: 0.2608
Epoch 3/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 19ms/step - accuracy: 0.9203 - loss: 0.2069 - val_accuracy: 0.8990 - val_loss: 0.2602
Epoch 4/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 19ms/step - accuracy: 0.9330 - loss: 0.1785 - val_accuracy: 0.9017 - val_loss: 0.2618
Epoch 5/5
[1m5685/5685[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 20ms/step - accuracy: 0.9438 - loss: 0.1537 - val_accuracy: 0.9012 - val_loss: 0.2680


<keras.src.callbacks.history.History at 0x20c0517a180>

In [139]:
pred2=model.predict(df1_test_x)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step


In [146]:
pred2_cat=pred2.argmax(axis=1)

In [148]:
accuracy_score(df1_test_y,pred2_cat)

0.9016896676078142