# Rating Models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanyas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanyas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("cleaned_nlp.csv")

In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [6]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Review'])
test_matrix = vectorizer.transform(test['Review'])

In [7]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Rating']
y_test = test['Rating']

## Multinomial Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
# train logistic regression model
clf = LogisticRegression(max_iter=2000, random_state=123, multi_class='multinomial').fit(X_train, y_train)

# predict labels
preds = clf.predict(X_test)

# calculatre accuracy
score = clf.score(X_test, y_test)
print(score)

# calculate confusion matrix
cm = confusion_matrix(y_test, preds) 
print(cm / len(y_test))

0.5873674504379899
[[0.04472107 0.01890272 0.00253573 0.00184417 0.00069156]
 [0.01936376 0.0373444  0.02028585 0.01106501 0.00299677]
 [0.00368834 0.01959428 0.03596127 0.04126325 0.01221761]
 [0.00138313 0.00783771 0.02766252 0.13854311 0.11157215]
 [0.00046104 0.00230521 0.00599355 0.10096819 0.3307976 ]]


In [36]:
from sklearn.metrics import accuracy_score
print("Accuracy of Logistic Regression:",accuracy_score(y_test, preds))
print(classification_report(preds,y_test))

Accuracy of Logistic Regression: 0.5873674504379899
              precision    recall  f1-score   support

           1       0.65      0.64      0.65       302
           2       0.41      0.43      0.42       373
           3       0.32      0.39      0.35       401
           4       0.48      0.47      0.48      1274
           5       0.75      0.72      0.74      1988

    accuracy                           0.59      4338
   macro avg       0.52      0.53      0.53      4338
weighted avg       0.60      0.59      0.59      4338



## Multinomial Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
model_gn = MultinomialNB()

In [41]:
model_gn.fit(X_train, y_train)

MultinomialNB()

In [42]:
predict_mn = model_gn.predict(X_test)

In [43]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_mn,y_test)

array([[ 105,   20,    2,    0,    0],
       [ 106,  111,   30,   10,    0],
       [  10,   23,   10,    1,    0],
       [  66,  204,  310,  654,  265],
       [  18,   47,   90,  685, 1586]])

In [44]:
print("Accuracy of NB:",accuracy_score(y_test, predict_mn))
print(classification_report(predict_mn,y_test))

Accuracy of NB: 0.5665058580289456
              precision    recall  f1-score   support

           1       0.34      0.83      0.49       127
           2       0.27      0.43      0.34       257
           3       0.02      0.23      0.04        44
           4       0.48      0.44      0.46      1499
           5       0.86      0.65      0.74      2426

    accuracy                           0.57      4353
   macro avg       0.40      0.52      0.41      4353
weighted avg       0.67      0.57      0.61      4353



## Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=100, n_estimators=30, criterion='entropy', min_samples_split=2).fit(X_train, y_train)

y_pred_rf=rf.predict(X_test)

print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, y_pred_rf))
print(classification_report(y_pred_rf,y_test))

Accuracy of Random Forest Classifier: 0.4974642692485016
              precision    recall  f1-score   support

           1       0.39      0.70      0.50       168
           2       0.06      0.46      0.11        52
           3       0.02      0.31      0.04        36
           4       0.27      0.34      0.30       984
           5       0.87      0.54      0.67      3098

    accuracy                           0.50      4338
   macro avg       0.32      0.47      0.32      4338
weighted avg       0.70      0.50      0.57      4338



## Decision Tree

In [71]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=4, min_samples_leaf=5)

In [72]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5,
                       min_samples_split=4)

In [73]:
y_pred_dt=dt.predict(X_test)

print("Accuracy of Decision Tree:",accuracy_score(y_test, y_pred_dt))
print(classification_report(y_pred_dt,y_test))

Accuracy of Decision Tree: 0.4598893499308437
              precision    recall  f1-score   support

           1       0.14      0.54      0.23        80
           2       0.31      0.25      0.28       480
           3       0.03      0.21      0.05        73
           4       0.21      0.35      0.26       747
           5       0.81      0.53      0.64      2958

    accuracy                           0.46      4338
   macro avg       0.30      0.37      0.29      4338
weighted avg       0.63      0.46      0.52      4338



## KNN

In [67]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train)

y_pred_knn=knn.predict(X_test)

print("Accuracy of k-nearest neighbours Classifier:",accuracy_score(y_test, y_pred_knn))
print(classification_report(y_pred_knn,y_test))

Accuracy of k-nearest neighbours Classifier: 0.4691101890272015
              precision    recall  f1-score   support

           1       0.35      0.34      0.34       307
           2       0.15      0.35      0.21       169
           3       0.15      0.24      0.18       313
           4       0.40      0.38      0.39      1303
           5       0.68      0.58      0.63      2246

    accuracy                           0.47      4338
   macro avg       0.35      0.38      0.35      4338
weighted avg       0.51      0.47      0.49      4338



## TensorFlow Neural Network

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import tensorflow as tf

In [9]:
stop_words = stopwords.words('english')

In [10]:
def process_text(text):
    text = re.sub(r'\d+', ' ', text)
    text = text.split()
    text = " ".join([word for word in text if word.lower().strip() not in stop_words])
    return text

In [11]:
reviews = df['Review'].apply(process_text)

In [12]:
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)

sequences = tokenizer.texts_to_sequences(reviews)

In [13]:
max_seq_length = np.max(list(map(lambda x: len(x), sequences)))

print("Max sequence length:", max_seq_length)

Max sequence length: 1752


In [14]:
inputs = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [15]:
inputs

array([[   8,    1,  175, ...,    0,    0,    0],
       [ 134,  132,  261, ...,    0,    0,    0],
       [   8,    9,   75, ...,    0,    0,    0],
       ...,
       [ 134,  708,    8, ...,    0,    0,    0],
       [   1, 3756, 2459, ...,    0,    0,    0],
       [  25, 1123,  186, ...,    0,    0,    0]], dtype=int32)

In [16]:
labels = np.array(df['Rating'].apply(lambda x: 1 if x == 5 else 0))

In [17]:
labels

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, train_size=0.7, random_state=100)

In [19]:
embedding_dim = 128

inputs = tf.keras.Input(shape=(max_seq_length,))

embedding = tf.keras.layers.Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    input_length=max_seq_length
)(inputs)

gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(128, return_sequences=True)
)(embedding)

flatten = tf.keras.layers.Flatten()(gru)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)


model = tf.keras.Model(inputs, outputs)

tf.keras.utils.plot_model(model)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [20]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)

In [21]:
history = model.fit(
    train_inputs,
    train_labels,
    validation_split=0.2,
    batch_size=32,
    epochs=2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,
            restore_best_weights=True
        )
    ]
)

Epoch 1/2
Epoch 2/2


In [22]:
model.evaluate(test_inputs, test_labels)



[0.48448285460472107, 0.7740728855133057, 0.8613401055335999]

## Conclusion
The best model for predicting overall rating based on reviews for a hotel is the Tensorflow Neural Network. It fit the ideal high Accuracy with a low loss. Unfortunately, based on time, I conducted a low epoch, however, the low epoch was able to attain the ideal. If we haf done more, we would have had a higher accuracy and lower loss. The rest of the models that I had such as the Random Forest classifer, Multinomial Logistic Regression, Decision Tree, etc, were pretty bad models. I did not do much hyperparameter tuning due to time, however, in the future I can conduct a GridSearch CV to find the best parameters. However based on how off some of the models are, I predict that my tensorflow neural network will remain the best model. 