In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
import sys
sys.path.append('../Vectorizers')
from CountVectorizer import getCountVectors

In [2]:
# Load the raw Yelp reviews table; its 'Rating' column is used later as the target.
# Forward slashes keep this relative path valid on POSIX as well as Windows
# (the previous backslash form only resolved on Windows).
data = pd.read_csv("../Dataset/yelpReviewsDataset.csv")

In [3]:
# Load the tokenized reviews that are paired with the CSV rows in the split below
# (presumably one token sequence per review -- confirm against the tokenizer step).
# Forward slashes keep the relative path valid on POSIX as well as Windows.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files produced by this project's own pipeline.
with open("../TokenizedDataset/yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [4]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_Reviews,
    data['Rating'].values,
    test_size=0.2,
    random_state=42,
)

# Turn the train/test token splits into numerical feature vectors using the
# project helper imported from ../Vectorizers.
X_train_vectorized, X_test_vectorized = getCountVectors(X_train, X_test)

In [5]:
# Train the Naive Bayes classifier on the vectorized training split and
# evaluate it on the held-out test split.
MNB = MultinomialNB()
MNB.fit(X_train_vectorized, y_train)

predicted = MNB.predict(X_test_vectorized)
# scikit-learn metrics take (y_true, y_pred); use the same order as the
# confusion_matrix / classification_report calls below.
accuracy_score = metrics.accuracy_score(y_test, predicted)

print('NB (with count vectorizer) accuracy is', '{:04.2f}%'.format(accuracy_score * 100))
print('------------------------------------------------')
print('Confusion Matrix:')
# Rows are true classes, columns are predicted classes. The DataFrame shows
# positional indices in sorted label order, not the rating values themselves.
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with count vectorizer) accuracy is 54.62%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7074  2345   531    95    78
1  2818  3973  2686   382   134
2   947  1814  4502  2212   465
3   252   404  1768  4739  2831
4   203   163   372  2189  7023
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.70      0.66     10123
           2       0.46      0.40      0.43      9993
           3       0.46      0.45      0.45      9940
           4       0.49      0.47      0.48      9994
           5       0.67      0.71      0.69      9950

    accuracy                           0.55     50000
   macro avg       0.54      0.55      0.54     50000
weighted avg       0.54      0.55      0.54     50000



In [6]:
# Train the Logistic Regression classifier on the vectorized training split and
# evaluate it on the held-out test split. max_iter is raised to 5000 to give
# the lbfgs solver more iterations than the library default.
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train_vectorized, y_train)

predicted = LR.predict(X_test_vectorized)
# scikit-learn metrics take (y_true, y_pred); use the same order as the
# confusion_matrix / classification_report calls below.
accuracy_score = metrics.accuracy_score(y_test, predicted)

print('LR (with count vectorizer) accuracy is', '{:04.2f}%'.format(accuracy_score * 100))
print('------------------------------------------------')
print('Confusion Matrix:')
# Rows are true classes, columns are predicted classes. The DataFrame shows
# positional indices in sorted label order, not the rating values themselves.
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with count vectorizer) accuracy is 56.51%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7283  2119   510   112    99
1  2682  4486  2257   388   180
2   616  2182  4581  2063   498
3   122   313  1846  4745  2968
4    95   103   369  2223  7160
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.67      0.72      0.70     10123
           2       0.49      0.45      0.47      9993
           3       0.48      0.46      0.47      9940
           4       0.50      0.47      0.49      9994
           5       0.66      0.72      0.69      9950

    accuracy                           0.57     50000
   macro avg       0.56      0.56      0.56     50000
weighted avg       0.56      0.57      0.56     50000



In [7]:
# Train the Random Forest classifier on the vectorized training split and
# evaluate it on the held-out test split.
# random_state pins the forest's bootstrap/feature sampling so the reported
# numbers are reproducible across runs (same seed as the train/test split);
# n_jobs=-1 uses all available cores.
RF = RandomForestClassifier(n_jobs=-1, random_state=42)
RF.fit(X_train_vectorized, y_train)

predicted = RF.predict(X_test_vectorized)
# scikit-learn metrics take (y_true, y_pred); use the same order as the
# confusion_matrix / classification_report calls below.
accuracy_score = metrics.accuracy_score(y_test, predicted)

print('RF (with count vectorizer) accuracy is', '{:04.2f}%'.format(accuracy_score * 100))
print('------------------------------------------------')
print('Confusion Matrix:')
# Rows are true classes, columns are predicted classes. The DataFrame shows
# positional indices in sorted label order, not the rating values themselves.
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with count vectorizer) accuracy is 52.24%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  8046  1376   408   111   182
1  3787  3321  2037   448   400
2  1370  1989  3940  1728   913
3   430   518  1897  3541  3608
4   283   193   476  1727  7271
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.79      0.67     10123
           2       0.45      0.33      0.38      9993
           3       0.45      0.40      0.42      9940
           4       0.47      0.35      0.40      9994
           5       0.59      0.73      0.65      9950

    accuracy                           0.52     50000
   macro avg       0.51      0.52      0.51     50000
weighted avg       0.51      0.52      0.51     50000

