# Rating Models

In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # regular expression libary.
import nltk # Natural Language toolkit
nltk.download("stopwords")  #downloading stopwords
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
nltk.download('wordnet')
import nltk as nlp

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanyas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sanyas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanyas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
df = pd.read_csv("cleaned_nlp.csv")

In [22]:
df = df.drop('Unnamed: 0', axis=1)

In [23]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [24]:
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [25]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Review'])
test_matrix = vectorizer.transform(test['Review'])

In [26]:
X_train = train_matrix
X_test = test_matrix
y_train = train['Rating']
y_test = test['Rating']

## Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

# train logistic regression model
clf = LogisticRegression(max_iter=2000, random_state=123, multi_class='multinomial').fit(X_train, y_train)

# predict labels
preds = clf.predict(X_test)

# calculatre accuracy
score = clf.score(X_test, y_test)
print(score)

# calculate confusion matrix
cm = confusion_matrix(y_test, preds) 
print(cm / len(y_test))

0.5873674504379899
[[0.04472107 0.01890272 0.00253573 0.00184417 0.00069156]
 [0.01936376 0.0373444  0.02028585 0.01106501 0.00299677]
 [0.00368834 0.01959428 0.03596127 0.04126325 0.01221761]
 [0.00138313 0.00783771 0.02766252 0.13854311 0.11157215]
 [0.00046104 0.00230521 0.00599355 0.10096819 0.3307976 ]]


In [36]:
from sklearn.metrics import accuracy_score
print("Accuracy of Logistic Regression:",accuracy_score(y_test, preds))
print(classification_report(preds,y_test))

Accuracy of Logistic Regression: 0.5873674504379899
              precision    recall  f1-score   support

           1       0.65      0.64      0.65       302
           2       0.41      0.43      0.42       373
           3       0.32      0.39      0.35       401
           4       0.48      0.47      0.48      1274
           5       0.75      0.72      0.74      1988

    accuracy                           0.59      4338
   macro avg       0.52      0.53      0.53      4338
weighted avg       0.60      0.59      0.59      4338



## Multinomial Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
model_gn = MultinomialNB()

In [41]:
model_gn.fit(X_train, y_train)

MultinomialNB()

In [42]:
predict_mn = model_gn.predict(X_test)

In [43]:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predict_mn,y_test)

array([[ 105,   20,    2,    0,    0],
       [ 106,  111,   30,   10,    0],
       [  10,   23,   10,    1,    0],
       [  66,  204,  310,  654,  265],
       [  18,   47,   90,  685, 1586]])

In [44]:
print("Accuracy of NB:",accuracy_score(y_test, predict_mn))
print(classification_report(predict_mn,y_test))

Accuracy of NB: 0.5665058580289456
              precision    recall  f1-score   support

           1       0.34      0.83      0.49       127
           2       0.27      0.43      0.34       257
           3       0.02      0.23      0.04        44
           4       0.48      0.44      0.46      1499
           5       0.86      0.65      0.74      2426

    accuracy                           0.57      4353
   macro avg       0.40      0.52      0.41      4353
weighted avg       0.67      0.57      0.61      4353



## Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=100, n_estimators=30, criterion='entropy', min_samples_split=2).fit(X_train, y_train)

y_pred_rf=rf.predict(X_test)

print("Accuracy of Random Forest Classifier:",accuracy_score(y_test, y_pred_rf))
print(classification_report(y_pred_rf,y_test))

Accuracy of Random Forest Classifier: 0.4974642692485016
              precision    recall  f1-score   support

           1       0.39      0.70      0.50       168
           2       0.06      0.46      0.11        52
           3       0.02      0.31      0.04        36
           4       0.27      0.34      0.30       984
           5       0.87      0.54      0.67      3098

    accuracy                           0.50      4338
   macro avg       0.32      0.47      0.32      4338
weighted avg       0.70      0.50      0.57      4338



## Decision Tree

In [71]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=4, min_samples_leaf=5)

In [72]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5,
                       min_samples_split=4)

In [73]:
y_pred_dt=dt.predict(X_test)

print("Accuracy of Decision Tree:",accuracy_score(y_test, y_pred_dt))
print(classification_report(y_pred_dt,y_test))

Accuracy of Decision Tree: 0.4598893499308437
              precision    recall  f1-score   support

           1       0.14      0.54      0.23        80
           2       0.31      0.25      0.28       480
           3       0.03      0.21      0.05        73
           4       0.21      0.35      0.26       747
           5       0.81      0.53      0.64      2958

    accuracy                           0.46      4338
   macro avg       0.30      0.37      0.29      4338
weighted avg       0.63      0.46      0.52      4338



## KNN

In [67]:
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train)

y_pred_knn=knn.predict(X_test)

print("Accuracy of k-nearest neighbours Classifier:",accuracy_score(y_test, y_pred_knn))
print(classification_report(y_pred_knn,y_test))

Accuracy of k-nearest neighbours Classifier: 0.4691101890272015
              precision    recall  f1-score   support

           1       0.35      0.34      0.34       307
           2       0.15      0.35      0.21       169
           3       0.15      0.24      0.18       313
           4       0.40      0.38      0.39      1303
           5       0.68      0.58      0.63      2246

    accuracy                           0.47      4338
   macro avg       0.35      0.38      0.35      4338
weighted avg       0.51      0.47      0.49      4338



In [38]:
import tez
import torch
import torch.nn as nn
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

In [39]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [40]:
class BertDataset:
    def __init__(self, texts, targets, max_len = 64):
        self.texts = texts
        self.targets = targets
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased",
            do_lower_case = False
            )
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            truncation = True
        )
        resp = {
            'ids': torch.tensor(inputs['input_ids'], dtype = torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype = torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype = torch.long),
            'targets': torch.tensor(self.targets[idx], dtype = torch.long),
        }
        return resp

In [41]:
class TextModel(tez.Model):
    def __init__(self, num_classes, num_train_steps):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(
            'bert-base-uncased', return_dict = False)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, num_classes)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = 'batch'
        
    def fetch_optimizer(self):
        opt = AdamW(self.parameters(), lr = 3e-5)
        return opt

    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps = 0,
            num_training_steps = self.num_train_steps)
        return sch

    def loss(self, outputs, targets):
        return nn.CrossEntropyLoss()(outputs, targets)

    def monitor_metrics(self, outputs, targets):
        out = torch.argmax(outputs, axis = 1).cpu().detach().numpy()
        tag = targets.cpu().detach().numpy()

        return {'accuracy' : accuracy_score(out, tag)}

    def forward(self, ids, mask, token_type_ids, targets = None):
        _, x = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        x = self.bert_drop(x)
        x = self.out(x)
        if targets is not None:
            loss = self.loss(x, targets)
            met = self.monitor_metrics(x, targets)
            return x, loss, met
        return x, 0, {}

In [42]:
def train_model(fold, df, train_idx, val_idx):
    df_train = df.iloc[train_idx, :].reset_index(drop = True)
    df_val = df.iloc[val_idx, :].reset_index(drop = True)

    train_dataset = BertDataset(df_train.Review.values, df_train.Rating.values)
    val_dataset = BertDataset(df_val.Review.values, df_val.Rating.values)
    whole_dataset = BertDataset(df.Review.values, df.Rating.values)
    
    n_train_steps = int(len(df_train) / TRAIN_BS * EPOCHS)
    model = TextModel(num_classes = 5,
                      num_train_steps= n_train_steps)

    es = tez.callbacks.EarlyStopping(monitor = 'valid_loss', patience = 1, model_path="model.bin")
    model.fit(
        train_dataset,
        valid_dataset = val_dataset,
        device = DEVICE,
        epochs = 200,
        train_bs = 32,
        callbacks = [es])
    model.load('model.bin', device = DEVICE)
    pred = model.predict(whole_dataset, device = DEVICE)
    return pd.Series([p for p in pred])

In [43]:
TRAIN_BS = 32
EPOCHS = 200
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

In [None]:
df['pred'] = 0
pred_folds = {}
for i, (train_idx, val_idx) in enumerate(skf.split(X = train, y = train.Rating)):
    pred = train_model(fold = i, df = df, train_idx = train_idx, val_idx = val_idx)
    pred_folds[i] = pred

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




  0%|          | 0/402 [00:00<?, ?it/s]