In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
from fasttext import load_model
#import sys
#sys.path.append('./')

In [3]:
data = pd.read_csv(r"..\Dataset\yelpReviewsDataset.csv")

In [4]:
with open(r"..\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    corpus = pickle.load(file)

In [5]:
model = load_model(r'..\WordEmbeddings\FastText\yelp_review_polarity.bin')

In [6]:
# Create a function to generate text vectors
def text_vector(text):
    return model.get_sentence_vector(text)
# Convert text data to text vectors
X = [text_vector(text) for text in corpus]
y = data['Rating']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Train the Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('NB (with FastText Pretrained) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with FastText Pretrained) accuracy is 45.32%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  4402  3900  1724    76    21
1  2313  3871  3439   319    51
2   772  1900  4918  1720   630
3    58   250  2533  3571  3582
4    19   107  1169  2756  5899
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.43      0.50     10123
           2       0.39      0.39      0.39      9993
           3       0.36      0.49      0.41      9940
           4       0.42      0.36      0.39      9994
           5       0.58      0.59      0.59      9950

    accuracy                           0.45     50000
   macro avg       0.47      0.45      0.45     50000
weighted avg       0.47      0.45      0.45     50000



In [10]:
# Train the Logistic Regression classifier
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train, y_train)

predicted = LR.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('LR (with FastText Pretrained) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with FastText Pretrained) accuracy is 47.47%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  5752  2914  1369    71    17
1  3245  3523  2852   309    64
2  1101  1810  4459  2028   542
3    77   226  2131  4305  3255
4    19    74   933  3228  5696
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.56      0.57      0.57     10123
           2       0.41      0.35      0.38      9993
           3       0.38      0.45      0.41      9940
           4       0.43      0.43      0.43      9994
           5       0.59      0.57      0.58      9950

    accuracy                           0.47     50000
   macro avg       0.48      0.47      0.47     50000
weighted avg       0.48      0.47      0.47     50000



In [11]:
# Train the Random Forest classifier
RF = RandomForestClassifier(n_jobs=-1)
RF.fit(X_train, y_train)

predicted = RF.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('RF (with FastText Pretrained) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with FastText Pretrained) accuracy is 46.47%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  6386  2650   951    85    51
1  4062  3430  2039   332   130
2  1537  2187  3471  1870   875
3   143   372  1814  3907  3758
4    47   126   825  2911  6041
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.52      0.63      0.57     10123
           2       0.39      0.34      0.37      9993
           3       0.38      0.35      0.36      9940
           4       0.43      0.39      0.41      9994
           5       0.56      0.61      0.58      9950

    accuracy                           0.46     50000
   macro avg       0.46      0.46      0.46     50000
weighted avg       0.46      0.46      0.46     50000

