In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
import sys
sys.path.append('../Vectorizers')
from Word2VecVectorizer import getWord2VecVectors

In [2]:
data = pd.read_csv(r"..\Dataset\yelpReviewsDataset.csv")

In [3]:
with open(r"..\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [10]:
X = getWord2VecVectors(tokenized_Reviews, 300)
y = data['Rating'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Train the Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('NB (with Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with Word2Vec vectorizer) accuracy is 28.39%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  3891  1147  2206   646  2233
1  2751  1301  2695   801  2445
2  1990  1082  3246  1121  2501
3  1684   945  2640  1517  3208
4  1705   688  2059  1256  4242
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.32      0.38      0.35     10123
           2       0.25      0.13      0.17      9993
           3       0.25      0.33      0.28      9940
           4       0.28      0.15      0.20      9994
           5       0.29      0.43      0.35      9950

    accuracy                           0.28     50000
   macro avg       0.28      0.28      0.27     50000
weighted avg       0.28      0.28      0.27     50000



In [13]:
# Train the Logistic Regression classifier
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train, y_train)

predicted = LR.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('LR (with Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with Word2Vec vectorizer) accuracy is 32.25%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  5292  1348  1080   887  1516
1  3744  1645  1605  1274  1725
2  2400  1576  2045  1984  1935
3  1726  1152  1708  2612  2796
4  1684   807  1046  1880  4533
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.52      0.42     10123
           2       0.25      0.16      0.20      9993
           3       0.27      0.21      0.23      9940
           4       0.30      0.26      0.28      9994
           5       0.36      0.46      0.40      9950

    accuracy                           0.32     50000
   macro avg       0.31      0.32      0.31     50000
weighted avg       0.31      0.32      0.31     50000



In [15]:
# Train the Random Forest classifier
RF = RandomForestClassifier(n_jobs=-1)
RF.fit(X_train, y_train)

predicted = RF.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('RF (with Word2Vec vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with Word2Vec vectorizer) accuracy is 30.79%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  4312  1873  1341  1087  1510
1  2887  2151  1873  1554  1528
2  1866  1952  2301  2064  1757
3  1436  1500  2021  2510  2527
4  1481  1090  1309  1949  4121
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.43      0.39     10123
           2       0.25      0.22      0.23      9993
           3       0.26      0.23      0.24      9940
           4       0.27      0.25      0.26      9994
           5       0.36      0.41      0.39      9950

    accuracy                           0.31     50000
   macro avg       0.30      0.31      0.30     50000
weighted avg       0.30      0.31      0.30     50000

