In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, plot_roc_curve, plot_confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer                    
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, Conv1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping

In [31]:
# Load the review dataset from the shared data directory.
reviews_csv = '../data/reviews_Model.csv'
df = pd.read_csv(reviews_csv)

In [32]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,rating,date,app,store,review,clean_content,adj,noun,verb,emoji,...,neu_score,pos_score,compound_score,month,dayofweek,hour,minute,text_len,word_count,category
0,5,2020-09-16 20:26:28,shoppee,google,Orders mostly came early and products are good.,order come early product good,good,order product,come,,...,0.707,0.293,0.4404,9,3,20,26,47,8,Good Seller Service
1,4,2020-09-16 20:13:46,shoppee,google,Good and convenient,good convenient,good convenient,,,,...,0.408,0.592,0.4404,9,3,20,13,19,3,Good Overall Service
2,4,2020-09-16 20:11:18,shoppee,google,My first purchase experience...Happy with purc...,purchase experience happy purchase,first happy,purchase experience purchase,,,...,0.714,0.286,0.34,9,3,20,11,57,7,Good Seller Service
3,5,2020-09-16 20:08:54,shoppee,google,A lot of items at a very good deal.,lot item good deal,good,lot item deal,,,...,0.715,0.285,0.4927,9,3,20,8,35,9,Good Overall Service
4,5,2020-09-16 19:37:21,shoppee,google,Delivery is fast,delivery fast,fast,delivery,,,...,1.0,0.0,0.0,9,3,19,37,16,3,Good Overall Service


In [33]:
# Binary target variable: 1 when the star rating is above 3, otherwise 0.
# A vectorised comparison replaces the per-row label lookup, which was slow
# and only correct while the index is exactly 0..n-1.
df['rate'] = (df['rating'] > 3).astype(int)

In [34]:
# Class balance of the binary target, as proportions rather than raw counts.
rate_col = df['rate']
rate_col.value_counts(normalize=True)

1    0.796452
0    0.203548
Name: rate, dtype: float64

In [47]:
# Count missing values per column and show only the columns that have any.
null_counts = df.isna().sum()
null_counts[null_counts != 0]

adj      14433
noun     22056
verb     31922
emoji    69984
dtype: int64

In [36]:
## Keep only reviews that still have cleaned text, then renumber rows from 0
has_text = df['clean_content'].notna()
df = df.loc[has_text].reset_index(drop=True)

# Report what is left after the filter.
remaining_nulls = df.clean_content.isna().sum()
remaining_rows = df.shape[0]
print(f'Null values left in df: {remaining_nulls}')
print(f'Number of rows left: {remaining_rows}')

Null values left in df: 0
Number of rows left: 74685


In [37]:
# Hold out 20% of the reviews for testing. Stratifying on the target keeps the
# positive/negative ratio the same in both splits; the seed makes it repeatable.
features = df['clean_content']
target = df['rate']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [38]:
# Fit the tokenizer on the training text only, so the vocabulary is learned
# without looking at the test split.
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(X_train)

# Replace each review with its list of integer word indices.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length used for padding here and as the Embedding layer's
# input_length in the model cell. The name must be `maxlen`: that is what both
# places read, and only `max_length` was defined before (NameError on a fresh
# kernel).
# NOTE(review): 4000 tokens is far longer than the reviews previewed by
# df.head() (word_count under 10); a smaller value would make training much
# faster - confirm against the real length distribution before changing.
maxlen = 4000
max_length = maxlen  # old name kept as an alias

# Zero-pad at the end of each sequence up to `maxlen`.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Size of each learned word vector in the Embedding layer.
embedding_dim = 100

In [54]:
# 1-D convolutional text classifier: learned word embeddings -> one conv layer
# -> max over the sequence axis -> small dense head with a sigmoid output for
# the binary `rate` target.
cnn_layers = [
    Embedding(vocab_size, embedding_dim, input_length=maxlen),
    Conv1D(32, 8, activation='relu'),
    GlobalMaxPooling1D(),
    # The recorded summary shows (None, 32) both before and after this layer,
    # so Flatten does not change the shape here.
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
]
cnn_model = Sequential(cnn_layers)

# Binary cross-entropy matches the single sigmoid output; track accuracy.
cnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [55]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
cnn_model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 4000, 100)         918500    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 3993, 32)          25632     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 32)                0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 10)                330       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 11        
Total params: 944,473
Trainable params: 944,473
Non-trainable params: 0
________________________________________________

In [56]:
# Train the CNN on the padded training sequences for 10 epochs, showing the
# per-epoch progress bar.
cnn_model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    verbose=1,
)

Epoch 1/10
  29/1868 [..............................] - ETA: 9:34 - loss: 0.5338 - acc: 0.7909

KeyboardInterrupt: 

In [None]:
# evaluate
# Score the trained CNN on the held-out test split. The names now match the
# ones this notebook defines (cnn_model, X_test, y_test); the previous
# model/Xtest/ytest do not exist here and raised NameError.
# With metrics=['acc'], evaluate() returns [loss, accuracy].
loss, acc = cnn_model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))