In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df=pd.read_excel("C:/Users/waghm/Desktop/Assignments/hotel_reviews.xlsx")
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4.0
1,ok nothing special charge diamond member hilto...,2.0
2,nice rooms not 4* experience hotel monaco seat...,3.0
3,"unique, great stay, wonderful time hotel monac...",5.0
4,"great stay great stay, went seahawk game aweso...",5.0
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5.0
20487,great location price view hotel great quick pl...,4.0
20488,"ok just looks nice modern outside, desk staff ...",2.0
20489,hotel theft ruined vacation hotel opened sept ...,1.0


In [3]:
#Data cleaning and preprocessing
import re
import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(df['Review'])):
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    

In [4]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [5]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
y=pd.get_dummies(df['Rating'])
y=y.iloc[:,1].values

In [7]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [8]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [9]:
y_pred=spam_detect_model.predict(X_test)

In [10]:
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix=confusion_matrix(y_test,y_pred)

In [15]:
confusion_matrix

array([[3236,  508],
       [ 103,  252]], dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score
acc =accuracy_score(y_test,y_pred)

In [13]:
acc

0.8509392534764577

In [17]:
import tensorflow as tf

In [18]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [19]:
### Vocabulary size
voc_size=5000

In [20]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[4481,
  2664,
  3521,
  2378,
  4283,
  4544,
  2465,
  860,
  2664,
  547,
  4081,
  4557,
  1438,
  1319,
  4977,
  4594,
  78,
  59,
  2378,
  2834,
  3293,
  3023,
  3118,
  4991,
  3342,
  1944,
  2910,
  3810,
  3810,
  1728,
  4481,
  3565,
  623,
  2789,
  1372,
  816,
  3714,
  790,
  401,
  4578,
  3851,
  3678,
  4045,
  3810,
  2697,
  1705,
  3086,
  4390,
  4957,
  1388,
  1485,
  649,
  2515,
  145,
  886,
  1838,
  3012,
  1245,
  988,
  2271,
  3600,
  4481,
  4024,
  860,
  4481,
  590,
  3195,
  2393,
  860,
  4310,
  1903,
  2218,
  791,
  1113,
  2633,
  1425,
  4481,
  182,
  1596,
  2378,
  2697],
 [3461,
  411,
  1145,
  1145,
  3876,
  2470,
  4287,
  4938,
  3331,
  4335,
  3315,
  547,
  3062,
  2433,
  3914,
  1527,
  4874,
  575,
  4660,
  3870,
  1527,
  3703,
  1559,
  1536,
  2664,
  3810,
  1319,
  1347,
  3278,
  4863,
  4126,
  3812,
  3543,
  3851,
  3760,
  3148,
  4777,
  4863,
  2006,
  2362,
  2456,
  2637,
  1527,
  3870,
  3834,
  4660,
  166

In [21]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[4481 4024  860 ... 1596 2378 2697]
 [3654 4408 1447 ... 3331 2664 3062]
 [  93 3334 1559 ... 3492 2480 4780]
 ...
 [3349 2664 2664 ... 1933 3398 4503]
 [3727 2179 1764 ... 3267  821 2110]
 [2140 1559  837 ... 1370 2145 4191]]


In [22]:
embedded_docs[0]

array([4481, 4024,  860, 4481,  590, 3195, 2393,  860, 4310, 1903, 2218,
        791, 1113, 2633, 1425, 4481,  182, 1596, 2378, 2697])

In [23]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
len(embedded_docs),y.shape

(20491, (20491,))

In [25]:
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [26]:
X_final.shape,y_final.shape

((20491, 20), (20491,))

In [27]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

# Model Training

In [28]:
### Finally Training
model.fit(X_train1,y_train1,validation_data=(X_test1,y_test1),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x266ae2b70d0>

# Performance Metrics And Accuracy

In [35]:
y_pred1=model.predict_classes(X_test1)

AttributeError: 'Sequential' object has no attribute 'predict_classes'

In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test1,y_pred1)

ValueError: Classification metrics can't handle a mix of binary and continuous targets