In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
import sys
sys.path.append('../Vectorizers')
from TFIDFVectorizer import getTFIDFVectors

In [2]:
data = pd.read_csv(r"..\Dataset\yelpReviewsDataset.csv")

In [3]:
with open(r"..\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tokenized_Reviews, data['Rating'].values, test_size=0.2, random_state=42)

# Convert text data to numerical feature vectors
X_train_vectorized, X_test_vectorized = getTFIDFVectors(X_train, X_test)

In [5]:
# Train the Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train_vectorized, y_train)

predicted = MNB.predict(X_test_vectorized)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('NB (with TFIDF vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with TFIDF vectorizer) accuracy is 54.38%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  6945  2574   459    90    55
1  2671  4519  2382   333    88
2   888  2070  4608  2053   321
3   254   485  2111  5079  2065
4   214   206   509  2983  6038
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.69      0.66     10123
           2       0.46      0.45      0.46      9993
           3       0.46      0.46      0.46      9940
           4       0.48      0.51      0.49      9994
           5       0.70      0.61      0.65      9950

    accuracy                           0.54     50000
   macro avg       0.55      0.54      0.54     50000
weighted avg       0.55      0.54      0.54     50000



In [6]:
# Train the Logistic Regression classifier
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train_vectorized, y_train)

predicted = LR.predict(X_test_vectorized)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('LR (with TFIDF vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with TFIDF vectorizer) accuracy is 58.57%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7508  2067   361    92    95
1  2589  4804  2117   314   169
2   585  2160  4785  1977   433
3   125   264  1849  5083  2673
4   113   101   296  2335  7105
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.74      0.71     10123
           2       0.51      0.48      0.50      9993
           3       0.51      0.48      0.49      9940
           4       0.52      0.51      0.51      9994
           5       0.68      0.71      0.70      9950

    accuracy                           0.59     50000
   macro avg       0.58      0.59      0.58     50000
weighted avg       0.58      0.59      0.58     50000



In [7]:
# Train the Random Forest classifier
RF = RandomForestClassifier(n_jobs=-1)
RF.fit(X_train_vectorized, y_train)

predicted = RF.predict(X_test_vectorized)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('RF (with TFIDF vectorizer) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with TFIDF vectorizer) accuracy is 52.15%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  8043  1406   369   115   190
1  3798  3315  2017   451   412
2  1376  2049  3845  1818   852
3   423   511  1878  3709  3473
4   300   186   458  1842  7164
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.79      0.67     10123
           2       0.44      0.33      0.38      9993
           3       0.45      0.39      0.42      9940
           4       0.47      0.37      0.41      9994
           5       0.59      0.72      0.65      9950

    accuracy                           0.52     50000
   macro avg       0.51      0.52      0.51     50000
weighted avg       0.51      0.52      0.51     50000

