# Rating Models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanyas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanyas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("cleaned_nlp.csv")

In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [6]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Review'])
test_matrix = vectorizer.transform(test['Review'])

In [7]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Rating']
y_test = test['Rating']

## Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
predict_lr = logreg.predict(X_test)

In [10]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_lr,y_test)

array([[ 164,   55,   13,    5,    2],
       [  76,  124,   69,   24,    9],
       [  25,   92,  144,   72,   35],
       [  10,   87,  197,  666,  427],
       [  12,   29,   66,  541, 1474]])

In [11]:
from sklearn.metrics import accuracy_score
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_lr))
print(classification_report(predict_lr,y_test))

Accuracy of Random Forest Classifier: 0.5821638750565867
              precision    recall  f1-score   support

           1       0.57      0.69      0.62       239
           2       0.32      0.41      0.36       302
           3       0.29      0.39      0.34       368
           4       0.51      0.48      0.49      1387
           5       0.76      0.69      0.72      2122

    accuracy                           0.58      4418
   macro avg       0.49      0.53      0.51      4418
weighted avg       0.60      0.58      0.59      4418



In [12]:
from sklearn.naive_bayes import MultinomialNB
model_gn = MultinomialNB()

In [13]:
model_gn.fit(X_train, y_train)

MultinomialNB()

In [14]:
predict_mn = model_gn.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_mn,y_test)

array([[ 107,   18,    2,    0,    1],
       [ 133,   94,   27,    8,    1],
       [  11,   20,   11,    1,    0],
       [  67,  210,  327,  621,  298],
       [  18,   26,   92,  631, 1621]])

In [16]:
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_mn))
print(classification_report(predict_mn,y_test))

Accuracy of Random Forest Classifier: 0.5647871116225547
              precision    recall  f1-score   support

           1       0.32      0.84      0.46       128
           2       0.26      0.36      0.30       263
           3       0.02      0.26      0.04        43
           4       0.49      0.41      0.45      1523
           5       0.84      0.68      0.75      2388

    accuracy                           0.56      4345
   macro avg       0.39      0.51      0.40      4345
weighted avg       0.66      0.56      0.60      4345



In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=50, n_estimators=20, criterion='entropy', min_samples_split=4).fit(X_train, y_train)

y_pred_rf=rf.predict(X_test)

print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, y_pred_rf))
print(classification_report(y_pred_rf,y_test))

Accuracy of Random Forest Classifier: 0.48584579976985043
              precision    recall  f1-score   support

           1       0.21      0.73      0.32        94
           2       0.05      0.33      0.09        60
           3       0.01      0.20      0.01        15
           4       0.19      0.36      0.25       668
           5       0.93      0.51      0.65      3508

    accuracy                           0.49      4345
   macro avg       0.28      0.43      0.27      4345
weighted avg       0.78      0.49      0.58      4345



In [12]:
reviews = df

In [13]:
#keeping only relevant columns and calculating sentence lengths
reviews = reviews[['Review', 'Rating']]
reviews.columns = ['Review', 'Rating']
reviews['Review_length'] = reviews['Review'].apply(lambda x: len(x.split()))
reviews.head()

Unnamed: 0,Review,Rating,Review_length
0,nice hotel expensive parking got good deal sta...,4,87
1,ok nothing special charge diamond member hilto...,2,250
2,nice rooms not 4* experience hotel monaco seat...,3,217
3,"unique, great stay, wonderful time hotel monac...",5,89
4,"great stay great stay, went seahawk game aweso...",5,191


In [14]:
#changing ratings to 0-numbering
zero_numbering = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews['Rating'] = reviews['Rating'].apply(lambda x: zero_numbering[x])

In [15]:
#mean sentence length
np.mean(reviews['Review_length'])

104.37582353228247

In [17]:
!pip install torch



In [16]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'torch'

In [60]:
tok = spacy
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [67]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [68]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length