# Rating Models

In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanyas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanyas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
df = pd.read_csv("cleaned_nlp.csv")

In [33]:
df = df.drop('Unnamed: 0', axis=1)

In [34]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [35]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [36]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Review'])
test_matrix = vectorizer.transform(test['Review'])

In [37]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Rating']
y_test = test['Rating']

## Logistic Regression with GridSearchCV

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
logreg = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logreg, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

NameError: name 'model' is not defined

In [22]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
predict_lr = logreg.predict(X_test)

In [24]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_lr,y_test)

array([[ 176,   69,   11,    6,    2],
       [  78,  115,   79,   15,   11],
       [  16,   70,  124,   95,   20],
       [  18,   70,  196,  630,  373],
       [  16,   30,   74,  482, 1527]])

In [25]:
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_lr))
print(classification_report(predict_lr,y_test))

Accuracy of Random Forest Classifier: 0.5977225191726703
              precision    recall  f1-score   support

           1       0.58      0.67      0.62       264
           2       0.32      0.39      0.35       298
           3       0.26      0.38      0.31       325
           4       0.51      0.49      0.50      1287
           5       0.79      0.72      0.75      2129

    accuracy                           0.60      4303
   macro avg       0.49      0.53      0.51      4303
weighted avg       0.62      0.60      0.61      4303



In [26]:
from sklearn.naive_bayes import MultinomialNB
model_gn = MultinomialNB()

In [27]:
model_gn.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
predict_mn = model_gn.predict(X_test)

In [29]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_mn,y_test)

array([[  94,   28,    3,    0,    0],
       [ 115,   94,   28,    4,    1],
       [   6,   11,    6,    1,    2],
       [  64,  188,  361,  646,  278],
       [  25,   33,   86,  577, 1652]])

In [30]:
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_mn))
print(classification_report(predict_mn,y_test))

Accuracy of Random Forest Classifier: 0.5791308389495701
              precision    recall  f1-score   support

           1       0.31      0.75      0.44       125
           2       0.27      0.39      0.32       242
           3       0.01      0.23      0.02        26
           4       0.53      0.42      0.47      1537
           5       0.85      0.70      0.77      2373

    accuracy                           0.58      4303
   macro avg       0.39      0.50      0.40      4303
weighted avg       0.68      0.58      0.62      4303



In [21]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=50, n_estimators=20, criterion='entropy', min_samples_split=4).fit(X_train, y_train)

y_pred_rf=rf.predict(X_test)

print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, y_pred_rf))
print(classification_report(y_pred_rf,y_test))

Accuracy of Random Forest Classifier: 0.4963978619567743
              precision    recall  f1-score   support

           1       0.21      0.73      0.33        88
           2       0.05      0.35      0.09        52
           3       0.01      0.19      0.02        26
           4       0.22      0.37      0.28       719
           5       0.92      0.52      0.67      3418

    accuracy                           0.50      4303
   macro avg       0.28      0.43      0.28      4303
weighted avg       0.77      0.50      0.58      4303

