In [None]:
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
# from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence #unique id
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the hotel reviews; the path is relative to the notebook's working directory.
df = pd.read_csv("hotel-reviews.csv")
# Last expression of the cell: renders the frame for a first visual check.
df

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy
...,...,...,...,...,...
38927,id49253,We arrived late at night and walked in to a ch...,Edge,Desktop,happy
38928,id49254,The only positive impression is location and p...,InternetExplorer,Mobile,not happy
38929,id49255,Traveling with friends for shopping and a show...,Firefox,Mobile,not happy
38930,id49256,The experience was just ok. We paid extra for ...,Chrome,Desktop,not happy


In [None]:
# Class balance of the target column before it is encoded.
df["Is_Response"].value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [None]:
# Encode the sentiment label as an integer target: "not happy" -> 0, "happy" -> 1.
# The result is assigned back rather than using replace(..., inplace=True) on the
# column selection: that form is chained in-place mutation, which pandas warns
# about and which no longer updates `df` once copy-on-write is enabled.
# astype("int64") pins a numeric dtype regardless of how replace infers it, and
# fails loudly if an unexpected label is present.
df["Is_Response"] = (
    df["Is_Response"].replace({"not happy": 0, "happy": 1}).astype("int64")
)

In [None]:
# Preview the first rows to confirm Is_Response now holds 0/1.
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,0


In [None]:
# Number of missing values in each column.
df.isnull().sum()

User_ID         0
Description     0
Browser_Used    0
Device_Used     0
Is_Response     0
dtype: int64

In [None]:
# Stop-word set and lemmatizer are built on first use and then shared by every
# call, so cleaning a whole column does not reload them once per review.
_CLEANTEXT_CACHE = {}

def cleantext(text):
  """Normalise one review into a space-separated string of lemmas.

  Steps, in order: lower-case the text, tokenize it with word_tokenize,
  keep only purely alphabetic tokens, drop English stop words, and
  lemmatize what is left with WordNetLemmatizer.

  Args:
    text: Raw review text. Any non-string value (for example None or a
      float NaN coming from a missing cell) is treated as an empty review.

  Returns:
    The surviving lemmas joined by single spaces; "" if nothing survives.
  """
  if not isinstance(text, str):
    return ""
  if not _CLEANTEXT_CACHE:
    # A frozenset gives constant-time membership tests in the filter below.
    _CLEANTEXT_CACHE["stop"] = frozenset(stopwords.words("english"))
    _CLEANTEXT_CACHE["lemma"] = WordNetLemmatizer()
  stop = _CLEANTEXT_CACHE["stop"]
  lemma = _CLEANTEXT_CACHE["lemma"]
  tokens = word_tokenize(text.lower())
  return " ".join(
    lemma.lemmatize(t) for t in tokens if t.isalpha() and t not in stop
  )

In [None]:
# Add a cleaned copy of every review; the raw Description column is kept as-is.
df["clean_Description"] = df["Description"].map(cleantext)

In [None]:
# Compare the new clean_Description column with the raw Description text.
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,clean_Description
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,room kind clean strong smell dog generally ave...
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,stayed crown plaza april april staff friendly ...
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0,booked hotel hotwire lowest price could find g...
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1,stayed husband son way alaska cruise loved hot...
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,0,girlfriend stayed celebrate th birthday planne...


In [None]:
# Model input is the cleaned review text; the target is the 0/1 sentiment label.
x, y = df["clean_Description"], df["Is_Response"]

In [None]:
# Hold out 30% of the reviews for evaluation; the fixed random_state makes the
# split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Token count of every cleaned review, in DataFrame row order.
sentlen = [len(word_tokenize(review)) for review in df["clean_Description"]]

# Keep the lengths next to the text so they can be inspected per review.
df["SentLen"] = sentlen
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,clean_Description,SentLen
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,0,room kind clean strong smell dog generally ave...,21
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,0,stayed crown plaza april april staff friendly ...,97
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,0,booked hotel hotwire lowest price could find g...,116
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,1,stayed husband son way alaska cruise loved hot...,50
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,0,girlfriend stayed celebrate th birthday planne...,134


In [None]:
# Length, in tokens, of the longest cleaned review.
max(sentlen)

1164

In [None]:
# 95th percentile of the cleaned-review lengths.
np.quantile(sentlen, 0.95)

190.0

In [None]:
# 95% of the cleaned reviews are at most 190 tokens long

In [None]:
# Sequence length used for padding: the 95th-percentile review length, so only
# the longest 5% of reviews are cut. np.quantile returns a float (190.0) while
# sequence lengths must be integers, so the conversion is done once here.
max_len = int(np.quantile(sentlen, 0.95))

In [None]:
# Word-level tokenizer that splits on spaces.
# char_level: if True, every character would be treated as a token.
tok = Tokenizer(char_level=False, split=" ")

# Build the vocabulary from the training split only, so no test-set words leak in.
tok.fit_on_texts(xtrain)

# Preview the first 20 index -> word pairs rather than printing the whole vocabulary.
list(tok.index_word.items())[:20]

{1: 'room',
 2: 'hotel',
 3: 'stay',
 4: 'great',
 5: 'staff',
 6: 'would',
 7: 'night',
 8: 'one',
 9: 'good',
 10: 'location',
 11: 'time',
 12: 'nice',
 13: 'stayed',
 14: 'bed',
 15: 'u',
 16: 'clean',
 17: 'service',
 18: 'breakfast',
 19: 'day',
 20: 'get',
 21: 'could',
 22: 'place',
 23: 'also',
 24: 'area',
 25: 'desk',
 26: 'restaurant',
 27: 'like',
 28: 'friendly',
 29: 'bathroom',
 30: 'well',
 31: 'comfortable',
 32: 'small',
 33: 'floor',
 34: 'front',
 35: 'really',
 36: 'back',
 37: 'free',
 38: 'even',
 39: 'helpful',
 40: 'two',
 41: 'view',
 42: 'walk',
 43: 'next',
 44: 'go',
 45: 'got',
 46: 'lobby',
 47: 'parking',
 48: 'street',
 49: 'little',
 50: 'price',
 51: 'new',
 52: 'first',
 53: 'door',
 54: 'right',
 55: 'around',
 56: 'much',
 57: 'minute',
 58: 'bar',
 59: 'close',
 60: 'lot',
 61: 'city',
 62: 'food',
 63: 'pool',
 64: 'thing',
 65: 'people',
 66: 'need',
 67: 'check',
 68: 'recommend',
 69: 'excellent',
 70: 'made',
 71: 'away',
 72: 'block',
 73: 

In [None]:
# Number of distinct words the tokenizer learned from the training split.
vocab_len = len(tok.index_word)
vocab_len

34231

In [None]:
# Step 1: turn every training review into its list of vocabulary indices.
seqtrain = tok.texts_to_sequences(xtrain) #step1
# Show only the first two encoded reviews; the full list has one entry per
# training review and would flood the output.
seqtrain[:2]

[[710,
  5475,
  22,
  186,
  13,
  2480,
  1235,
  22,
  6,
  168,
  134,
  226,
  7,
  21,
  2687,
  3176,
  192,
  192,
  346,
  1162,
  1363,
  11052,
  3560,
  33,
  38,
  100,
  1568,
  381,
  359,
  359,
  29,
  33,
  304,
  502,
  2019,
  8125,
  92,
  18691,
  114,
  102,
  5476,
  1546,
  2256,
  55,
  1447,
  74,
  829,
  990,
  22,
  38,
  273,
  7,
  2553,
  1082,
  1415,
  3,
  84,
  3,
  121,
  19,
  165,
  461,
  2,
  9,
  117,
  3901,
  5,
  361,
  1809,
  81,
  340,
  6,
  9,
  96,
  2286,
  784,
  2,
  309],
 [315,
  389,
  7,
  3,
  4092,
  99,
  2,
  357,
  915,
  763,
  1033,
  7,
  1042,
  222,
  1,
  10,
  2,
  4,
  112,
  10,
  4,
  40,
  72,
  57,
  42,
  747,
  42,
  488,
  216,
  842,
  139,
  240,
  126,
  110,
  293,
  539,
  49,
  2389,
  4,
  737,
  26,
  23,
  57,
  42,
  216,
  303,
  225,
  328,
  74,
  211,
  5,
  733,
  34,
  25,
  5,
  263,
  91,
  12467,
  11,
  1844,
  358,
  336,
  11,
  195,
  5,
  227,
  2863,
  4,
  2863,
  639,
  115,
  216,

In [None]:
# Step 2: bring every training sequence to exactly max_len ids. With the Keras
# defaults, shorter reviews are zero-padded at the front and longer ones are
# truncated at the front.
seqmattrain = sequence.pad_sequences(
    sequences=seqtrain,
    maxlen=int(max_len),
)
seqmattrain

array([[    0,     0,     0, ...,   784,     2,   309],
       [  110,   293,   539, ...,   293,   539, 18692],
       [    0,     0,     0, ...,  2287,  1984,     1],
       ...,
       [    0,     0,     0, ...,   119,  1253,   176],
       [    0,     0,     0, ...,   786,   105,  2267],
       [    0,     0,     0, ...,   100,    36,     1]], dtype=int32)

In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training split,
# then pad/truncate them to the same length as the training matrix.
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(
    sequences=seqtest,
    maxlen=int(max_len),
)

In [None]:
# Re-display the vocabulary size; the embedding layer below is sized as vocab_len + 1.
vocab_len

34231

In [None]:
# Binary sentiment classifier: embedding -> SimpleRNN -> dense head -> sigmoid.
rnn = Sequential()

# vocab_len + 1 rows: tokenizer indices start at 1 and id 0 is the padding value,
# which mask_zero=True marks so the recurrent layer skips padded timesteps.
# NOTE(review): the 190 here is the embedding width (output_dim), not the
# sequence length, even though it equals max_len - confirm that is intended.
rnn.add(Embedding(vocab_len+1,190, input_length=int(max_len), mask_zero=True))
rnn.add(SimpleRNN(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

# Single sigmoid unit: the output is the predicted probability of class 1 ("happy").
rnn.add(Dense(units=1, activation="sigmoid"))

# Track accuracy as well as the loss so the per-epoch training log is interpretable.
rnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

rnn.fit(seqmattrain, ytrain, batch_size=50, epochs=25)

# predict returns an (n, 1) column of probabilities; flatten it to 1-D so it has
# the same shape as ytest when handed to the sklearn metrics.
ypred = rnn.predict(seqmattest).ravel()

#set threshold
# Probabilities below 0.5 become class 0, everything else class 1.
ypred = np.where(ypred<0.5,0,1)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Per-class precision, recall and F1 of the thresholded predictions on the
# held-out split. print() is needed because the report is a multi-line string.
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.74      0.67      0.70      3737
           1       0.85      0.89      0.87      7943

    accuracy                           0.82     11680
   macro avg       0.79      0.78      0.79     11680
weighted avg       0.81      0.82      0.82     11680

