# Rating Models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanyas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanyas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("cleaned_nlp.csv")

In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [6]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Review'])
test_matrix = vectorizer.transform(test['Review'])

In [7]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Rating']
y_test = test['Rating']

## Logistic Regression with GridSearchCV

In [8]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
predict_lr = logreg.predict(X_test)

In [10]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_lr,y_test)

array([[ 176,   65,   11,    3,    2],
       [  76,  113,   81,   26,    8],
       [  26,   93,  117,   98,   22],
       [  22,   68,  177,  621,  394],
       [  18,   28,   68,  512, 1465]])

In [12]:
from sklearn.metrics import accuracy_score
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_lr))
print(classification_report(predict_lr,y_test))

Accuracy of Random Forest Classifier: 0.5808857808857809
              precision    recall  f1-score   support

           1       0.55      0.68      0.61       257
           2       0.31      0.37      0.34       304
           3       0.26      0.33      0.29       356
           4       0.49      0.48      0.49      1282
           5       0.77      0.70      0.74      2091

    accuracy                           0.58      4290
   macro avg       0.48      0.51      0.49      4290
weighted avg       0.60      0.58      0.59      4290



In [13]:
from sklearn.naive_bayes import MultinomialNB
model_gn = MultinomialNB()

In [14]:
model_gn.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
predict_mn = model_gn.predict(X_test)

In [16]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_mn,y_test)

array([[  84,   21,    2,    1,    1],
       [ 125,  101,   31,    9,    2],
       [  14,   23,    8,    1,    1],
       [  73,  178,  327,  627,  269],
       [  22,   44,   86,  622, 1618]])

In [17]:
print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, predict_mn))
print(classification_report(predict_mn,y_test))

Accuracy of Random Forest Classifier: 0.5682983682983683
              precision    recall  f1-score   support

           1       0.26      0.77      0.39       109
           2       0.28      0.38      0.32       268
           3       0.02      0.17      0.03        47
           4       0.50      0.43      0.46      1474
           5       0.86      0.68      0.76      2392

    accuracy                           0.57      4290
   macro avg       0.38      0.48      0.39      4290
weighted avg       0.67      0.57      0.61      4290



In [18]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=50, n_estimators=20, criterion='entropy', min_samples_split=4).fit(X_train, y_train)

y_pred_rf=rf.predict(X_test)

print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, y_pred_rf))
print(classification_report(y_pred_rf,y_test))

Accuracy of Random Forest Classifier: 0.4801864801864802
              precision    recall  f1-score   support

           1       0.14      0.72      0.24        64
           2       0.04      0.28      0.08        58
           3       0.00      0.11      0.01        18
           4       0.20      0.37      0.26       691
           5       0.92      0.50      0.65      3459

    accuracy                           0.48      4290
   macro avg       0.26      0.40      0.25      4290
weighted avg       0.78      0.48      0.57      4290



In [19]:
!pip install torch



In [None]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs