In [1]:
import pandas as pd
import numpy as np
import string

In [4]:
# dataset was downloaded from Kaggle
data = pd.read_csv('/content/Datafiniti_Hotel_Reviews_Jun19.csv')

In [5]:
# get only one star rating or five star rating
df = data[(data['reviews.rating']==1) | (data['reviews.rating']==5)]

In [6]:
# reset index for the new dataset df
df = df.reset_index().drop('index', axis=1)

In [7]:
# data cleaning and pre-processing
review = df['reviews.text'].str.lower()

In [8]:
review = review.str.translate(str.maketrans('', '', string.punctuation))

In [9]:
review = review.str.split()

In [11]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
sw = stopwords.words()
sw = set(sw)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
corpus = []
for i in range(len(review)):
	words = review[i]
	words = [ps.stem(word) for word in words if not word in sw]
	words = ' '.join(words)
	corpus.append(words)

In [13]:
corpus[:2]

['cheap excel locat price somewhat standard hace reserv room nice clean offer good continent breakfast plu compens front desk servic personnel excel carmel ac room fan air circul',
 'recent stay trip lexington friend group share feel would back routin wait 10 minut elev workout room tini 2 treadmil 2 cheap ellipt machin rock floor use everyth seem']

In [14]:
# transform the data
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x= cv.fit_transform(corpus).toarray()
y = df['reviews.rating'].values

In [15]:
# perform train_test_split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=13)

In [16]:
# train a model using GuassianNB as a classifier
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
gb.fit(x_train, y_train)
pred = gb.predict(x_test)

In [17]:
# evaluate the model using testing data
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

           1       0.34      0.57      0.43       168
           5       0.95      0.87      0.91      1455

    accuracy                           0.84      1623
   macro avg       0.65      0.72      0.67      1623
weighted avg       0.88      0.84      0.86      1623

[[  96   72]
 [ 183 1272]]


In [18]:
# train a model using MultinomialNB as a classifier
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()
mb.fit(x_train, y_train)
predict = mb.predict(x_test)

In [19]:
# evaluate the model using testing data
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, predict))
print(confusion_matrix(y_test, predict))

              precision    recall  f1-score   support

           1       0.90      0.85      0.87       168
           5       0.98      0.99      0.99      1455

    accuracy                           0.97      1623
   macro avg       0.94      0.92      0.93      1623
weighted avg       0.97      0.97      0.97      1623

[[ 142   26]
 [  16 1439]]


In [None]:
#results from the MultinomialNB showed a much improved accuracy (0.97). 