In [None]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence #unique words

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN, Dropout, Embedding

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df=pd.read_table('/content/amazon_alexa.tsv')

In [None]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
df.drop(['date','variation'],inplace=True,axis=1)

In [None]:
df

Unnamed: 0,rating,verified_reviews,feedback
0,5,Love my Echo!,1
1,5,Loved it!,1
2,4,"Sometimes while playing a game, you can answer...",1
3,5,I have had a lot of fun with this thing. My 4 ...,1
4,5,Music,1
...,...,...,...
3145,5,"Perfect for kids, adults and everyone in betwe...",1
3146,5,"Listening to music, searching locations, check...",1
3147,5,"I do love these things, i have them running my...",1
3148,5,Only complaint I have is that the sound qualit...,1


In [None]:
df['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [None]:
def cleantext(text):
  tokens = word_tokenize(text.lower())
  ftoken = [t for t in tokens if(t.isalpha())]

  stop = stopwords.words("english")
  ctoken = [t for t in ftoken if(t not in stop)]

  lemma = WordNetLemmatizer()
  ltoken = [lemma.lemmatize(t) for t in ctoken]

  return " ".join(ltoken)

In [None]:
df["clean_review"]=df["verified_reviews"].apply(cleantext)

In [None]:
df.head()

Unnamed: 0,rating,verified_reviews,feedback,clean_review
0,5,Love my Echo!,1,love echo
1,5,Loved it!,1,loved
2,4,"Sometimes while playing a game, you can answer...",1,sometimes playing game answer question correct...
3,5,I have had a lot of fun with this thing. My 4 ...,1,lot fun thing yr old learns dinosaur control l...
4,5,Music,1,music


In [None]:
x = df["clean_review"]
y = df["feedback"]

In [None]:
x

0                                               love echo
1                                                   loved
2       sometimes playing game answer question correct...
3       lot fun thing yr old learns dinosaur control l...
4                                                   music
                              ...                        
3145                           perfect kid adult everyone
3146    listening music searching location checking ti...
3147    love thing running entire home tv light thermo...
3148    complaint sound quality great mostly use comma...
3149                                                 good
Name: clean_review, Length: 3150, dtype: object

In [None]:
y

0       1
1       1
2       1
3       1
4       1
       ..
3145    1
3146    1
3147    1
3148    1
3149    1
Name: feedback, Length: 3150, dtype: int64

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [None]:
sentlen = []

for sent in df["clean_review"]:
  sentlen.append(len(word_tokenize(sent)))

df["SentLen"] = sentlen
df.head()

Unnamed: 0,rating,verified_reviews,feedback,clean_review,SentLen
0,5,Love my Echo!,1,love echo,2
1,5,Loved it!,1,loved,1
2,4,"Sometimes while playing a game, you can answer...",1,sometimes playing game answer question correct...,17
3,5,I have had a lot of fun with this thing. My 4 ...,1,lot fun thing yr old learns dinosaur control l...,18
4,5,Music,1,music,1


In [None]:
max(sentlen)

245

In [None]:
max_len = np.quantile(sentlen, 0.95)
max_len

40.0

In [None]:
tok = Tokenizer(char_level=False, split=" ")


tok.fit_on_texts(xtrain)
tok.index_word

{1: 'love',
 2: 'echo',
 3: 'great',
 4: 'alexa',
 5: 'work',
 6: 'music',
 7: 'use',
 8: 'like',
 9: 'sound',
 10: 'device',
 11: 'easy',
 12: 'one',
 13: 'dot',
 14: 'set',
 15: 'speaker',
 16: 'good',
 17: 'product',
 18: 'thing',
 19: 'get',
 20: 'amazon',
 21: 'time',
 22: 'play',
 23: 'home',
 24: 'still',
 25: 'would',
 26: 'really',
 27: 'light',
 28: 'prime',
 29: 'day',
 30: 'smart',
 31: 'also',
 32: 'show',
 33: 'much',
 34: 'new',
 35: 'better',
 36: 'quality',
 37: 'well',
 38: 'bought',
 39: 'room',
 40: 'fun',
 41: 'far',
 42: 'need',
 43: 'alarm',
 44: 'got',
 45: 'even',
 46: 'could',
 47: 'everything',
 48: 'plus',
 49: 'feature',
 50: 'make',
 51: 'able',
 52: 'video',
 53: 'learning',
 54: 'say',
 55: 'turn',
 56: 'house',
 57: 'clock',
 58: 'nice',
 59: 'weather',
 60: 'buy',
 61: 'phone',
 62: 'want',
 63: 'ask',
 64: 'tv',
 65: 'little',
 66: 'know',
 67: 'hub',
 68: 'tell',
 69: 'used',
 70: 'using',
 71: 'spot',
 72: 'price',
 73: 'purchase',
 74: 'bulb',
 75:

In [None]:
vocab_len = len(tok.index_word)
vocab_len

2945

In [None]:
seqtrain = tok.texts_to_sequences(xtrain) #step1
seqtrain

[[5, 37, 1741, 180, 1230, 16, 24, 253, 8, 1231, 983],
 [24, 78, 69, 4, 92, 1232],
 [1, 10, 50, 203, 238, 278],
 [],
 [3,
  17,
  42,
  239,
  1742,
  14,
  260,
  21,
  59,
  120,
  1743,
  1744,
  5,
  112,
  69,
  43,
  57,
  6,
  86,
  1],
 [26, 188, 261, 446, 19, 829, 270, 573, 151],
 [133, 87, 40, 69, 204, 24, 53, 240],
 [1, 643, 155, 43, 57, 86, 1, 644, 163, 574, 79],
 [1, 2, 32, 51, 75, 381, 104, 22, 51, 262, 343, 76, 241],
 [90, 4, 724, 10, 41, 1233, 1234],
 [1,
  725,
  143,
  181,
  90,
  90,
  2,
  107,
  69,
  47,
  27,
  164,
  5,
  163,
  344,
  37,
  260,
  984,
  78,
  575,
  138,
  726,
  18,
  124,
  1745,
  1746,
  25,
  152,
  414],
 [182, 18, 53],
 [1],
 [1,
  18,
  98,
  25,
  576,
  577,
  1235,
  62,
  100,
  139,
  645,
  183,
  3,
  415,
  45,
  1236,
  18,
  8,
  985,
  183,
  727,
  481,
  140,
  134,
  73,
  242,
  164,
  44,
  28,
  29,
  243,
  301,
  72],
 [109,
  382,
  271,
  525,
  79,
  11,
  94,
  447,
  20,
  82,
  31,
  38,
  1747,
  27,
  74,
  2

In [None]:
seqmattrain = sequence.pad_sequences(seqtrain, maxlen= int(max_len)) #step2
seqmattrain

array([[   0,    0,    0, ...,    8, 1231,  983],
       [   0,    0,    0, ...,    4,   92, 1232],
       [   0,    0,    0, ...,  203,  238,  278],
       ...,
       [   0,    0,    0, ...,  191,    3,   17],
       [   0,    0,    0, ...,    0,    0,  385],
       [   0,    0,    0, ...,    1,    2,   71]], dtype=int32)

In [None]:
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(seqtest, maxlen=int(max_len))

In [None]:
vocab_len

2945

In [None]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,700, input_length=int(max_len), mask_zero=True))
rnn.add(SimpleRNN(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

rnn.add(Dense(units=1, activation="sigmoid"))

rnn.compile(optimizer="adam", loss="binary_crossentropy")

rnn.fit(seqmattrain, ytrain, batch_size=50, epochs=50)

ypred = rnn.predict(seqmattest)

ypred = ypred>0.5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
from sklearn.metrics import classification_report

print(classification_report(ytest,ypred))


              precision    recall  f1-score   support

           0       0.68      0.37      0.48        73
           1       0.95      0.99      0.97       872

    accuracy                           0.94       945
   macro avg       0.81      0.68      0.72       945
weighted avg       0.93      0.94      0.93       945



In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()
df['clean_review']=le.fit_transform(df['clean_review'])

In [None]:
df.head()

Unnamed: 0,rating,verified_reviews,feedback,clean_review,SentLen
0,5,Love my Echo!,1,1104,2
1,5,Loved it!,1,1360,1
2,4,"Sometimes while playing a game, you can answer...",1,1753,17
3,5,I have had a lot of fun with this thing. My 4 ...,1,1027,18
4,5,Music,1,1404,1


In [None]:
x=df['clean_review']
y=df['feedback']

In [None]:
#xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [None]:
# xtrain

1433    2132
2833    1806
1807    1089
1447       0
1328     776
        ... 
2763       0
905       10
1096     715
235      995
1061    1144
Name: clean_review, Length: 2205, dtype: int64

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sm=SMOTE(sampling_strategy='minority')
x_res,y_res=sm.fit_resample(x,y)

ValueError: ignored