In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
import sys
sys.path.append('../Vectorizers')
from Word2VecVectorizer_PreTrained import getWord2VecVectorsPreTrained

In [2]:
data = pd.read_csv(r"..\Dataset\yelpReviewsDataset.csv")

In [3]:
with open(r"..\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [4]:
X = getWord2VecVectorsPreTrained(tokenized_Reviews)
y = data['Rating'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Train the Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('NB (with pretrained Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with pretrained Word2Vec vectorizer) accuracy is 28.15%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  4548  1074  1744  1597  1160
1  3324  1090  2118  2026  1435
2  2351   947  2316  2668  1658
3  2014   695  1799  3347  2139
4  2426   577  1390  2784  2773
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.31      0.45      0.37     10123
           2       0.25      0.11      0.15      9993
           3       0.25      0.23      0.24      9940
           4       0.27      0.33      0.30      9994
           5       0.30      0.28      0.29      9950

    accuracy                           0.28     50000
   macro avg       0.28      0.28      0.27     50000
weighted avg       0.28      0.28      0.27     50000



In [7]:
# Train the Logistic Regression classifier
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train, y_train)

predicted = LR.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('LR (with pretrained Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with pretrained Word2Vec vectorizer) accuracy is 32.06%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  4711  1813  1178   953  1468
1  3109  2198  1849  1356  1481
2  2019  1854  2323  2050  1694
3  1497  1382  1942  2787  2386
4  1774   955  1105  2106  4010
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.47      0.41     10123
           2       0.27      0.22      0.24      9993
           3       0.28      0.23      0.25      9940
           4       0.30      0.28      0.29      9994
           5       0.36      0.40      0.38      9950

    accuracy                           0.32     50000
   macro avg       0.31      0.32      0.31     50000
weighted avg       0.31      0.32      0.31     50000



In [8]:
# Train the Random Forest classifier
RF = RandomForestClassifier(n_jobs=-1)
RF.fit(X_train, y_train)

predicted = RF.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('RF (with pretrained Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with pretrained Word2Vec vectorizer) accuracy is 30.54%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  4339  1989  1285  1097  1413
1  2873  2188  1884  1514  1534
2  1927  1878  2271  2156  1708
3  1451  1499  1977  2547  2520
4  1595  1070  1341  2018  3926
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.43      0.39     10123
           2       0.25      0.22      0.24      9993
           3       0.26      0.23      0.24      9940
           4       0.27      0.25      0.26      9994
           5       0.35      0.39      0.37      9950

    accuracy                           0.31     50000
   macro avg       0.30      0.31      0.30     50000
weighted avg       0.30      0.31      0.30     50000

