In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
import keras
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [2]:
trip = pd.read_csv('trip_advisor_review.csv')

In [3]:
trip.head(10)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
5,love monaco staff husband stayed hotel crazy w...,5
6,"cozy stay rainy city, husband spent 7 nights m...",5
7,"excellent staff, housekeeping quality hotel ch...",4
8,"hotel stayed hotel monaco cruise, rooms genero...",5
9,excellent stayed hotel monaco past w/e delight...,5


In [4]:
trip.shape

(20491, 2)

In [5]:
trip.isnull().sum()

Review    0
Rating    0
dtype: int64

In [6]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [7]:
trip.Rating.value_counts()

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [8]:
trip.Review = trip.Review.str.lower()  #to ensure a uniform format

In [9]:
trip.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
trip_train, trip_test = train_test_split(trip, test_size = .2)

In [38]:
trip_train_x = trip_train.iloc[:,0]
trip_test_x = trip_test.iloc[:,0]

In [39]:
trip_train_y = trip_train.iloc[:,1]
trip_test_y = trip_test.iloc[:,1]

In [40]:
from tensorflow.keras.utils import to_categorical

In [41]:
trip_train_y = to_categorical(trip_train_y)   # bcz we want a sequence

In [42]:
trip_train_y

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [44]:
max_num_words = 40000   # max no. of words to be taken from the entire corpus/doc
seq_len = 250
embedding_size = 500    

In [45]:
tokenizer = Tokenizer(max_num_words)

In [46]:
tokenizer.fit_on_texts(trip.Review)

In [47]:
trip_train_x = tokenizer.texts_to_sequences(trip_train_x)
trip_train_x = pad_sequences(trip_train_x, maxlen = seq_len)

In [48]:
trip_test_x = tokenizer.texts_to_sequences(trip_test_x)
trip_test_x = pad_sequences(trip_test_x, maxlen = seq_len)

In [49]:
trip_train_x[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [50]:
trip_test_x[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,    55,  9725,    15,
         218,   616,   775,  2464,  2837,  4169,   154,   398,   172,
          74,   202, 11880,  5007,    79,    32,    27,  5759,    26,
         250,   599,   449,  4628,   166,  3022,  2896,   522,   908,
         127,   114,  5226,    11,  3600,    12,  2092,   218,  2773,
        1462, 12940,  5574,   515,   542,     2,    29,  2475,   218,
          51,     7,   422,    58,  4628,   166,    96,  4169,   164,
          69,   119,     8,   710,   108,    30,   189,  1030,   672,
           2,   237,

In [91]:
model = Sequential()                  # initialize the model

In [92]:
model.add(Embedding(input_dim = max_num_words,
                   input_shape = (seq_len,),
                   output_dim = embedding_size))

  super().__init__(**kwargs)


In [93]:
model.add(LSTM(5)) 
model.add(Dense(4, activation = 'softmax'))
adam = Adam(learning_rate = 0.001)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'] )

In [94]:
model.fit(trip_train_x, trip_train_y, epochs = 2, validation_split = .2)

Epoch 1/2
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 196ms/step - accuracy: 0.4696 - loss: 1.1118 - val_accuracy: 0.6651 - val_loss: 0.7691
Epoch 2/2
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 190ms/step - accuracy: 0.7246 - loss: 0.6600 - val_accuracy: 0.6828 - val_loss: 0.7053


<keras.src.callbacks.history.History at 0x1f6ef6ff0a0>

In [95]:
pred = model.predict(trip_test_x)
pred

[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step


array([[0.01642767, 0.5475081 , 0.36535916, 0.07070509],
       [0.01993179, 0.8995853 , 0.0535399 , 0.02694289],
       [0.00465424, 0.03388444, 0.30119494, 0.66026634],
       ...,
       [0.00318503, 0.00947281, 0.20589809, 0.78144413],
       [0.0068723 , 0.09052654, 0.6768565 , 0.22574459],
       [0.01084013, 0.03601812, 0.6077107 , 0.34543097]], dtype=float32)

In [96]:
pred_values = np.argmax(pred, axis=1)
pred_values

array([1, 1, 3, ..., 3, 2, 2], dtype=int64)

In [97]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [98]:
cm = confusion_matrix(trip_test_y, pred_values)
cm

array([[ 475,  156,   37],
       [ 163,  920,  540],
       [  32,  395, 1381]], dtype=int64)

In [99]:
accuracy_score(trip_test_y, pred_values)*100

67.72383508172724

In [34]:
trip.Rating.replace({2:1, 3:2, 4:2, 5:3}, inplace = True)  # Re-run the model from sampling after this step

In [100]:
model.summary()