In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import pickle
#import sys
#sys.path.append('./')

In [2]:
data = pd.read_csv(r"..\Dataset\yelpReviewsDataset.csv")

In [3]:
with open(r"..\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    corpus = pickle.load(file)

In [4]:
X = corpus
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
labels = y_train.values.tolist()
inputdata = [f"__label__{label} {text}" for label, text in zip(labels, X_train)]
with open(r"..\Vectorizers\train_FastText.txt", "w", encoding="utf-8") as f:
    f.write('\n'.join(inputdata))

In [5]:
# Train the FastText model
from fasttext import train_supervised
model = train_supervised(input=r"..\Vectorizers\train_FastText.txt")

In [6]:
# Create a function to generate text vectors
def text_vector(text):
    return model.get_sentence_vector(text)
# Convert text data to text vectors
X = [text_vector(text) for text in corpus]
y = data['Rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Train the Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('NB (with FastText) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

NB (with FastText) accuracy is 58.82%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7419  2306   319    48    31
1  2410  5026  2228   264    65
2   497  2235  4970  2000   238
3   108   346  1964  5152  2424
4   156   166   332  2452  6844
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.73      0.72     10123
           2       0.50      0.50      0.50      9993
           3       0.51      0.50      0.50      9940
           4       0.52      0.52      0.52      9994
           5       0.71      0.69      0.70      9950

    accuracy                           0.59     50000
   macro avg       0.59      0.59      0.59     50000
weighted avg       0.59      0.59      0.59     50000



In [9]:
# Train the Logistic Regression classifier
LR = LogisticRegression(solver='lbfgs', max_iter=5000)
LR.fit(X_train, y_train)

predicted = LR.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('LR (with FastText) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

LR (with FastText) accuracy is 59.24%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7423  2137   392    83    88
1  2480  4940  2174   269   130
2   524  2098  5013  1969   336
3    93   248  1894  5253  2506
4    97    99   354  2411  6989
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.73      0.72     10123
           2       0.52      0.49      0.51      9993
           3       0.51      0.50      0.51      9940
           4       0.53      0.53      0.53      9994
           5       0.70      0.70      0.70      9950

    accuracy                           0.59     50000
   macro avg       0.59      0.59      0.59     50000
weighted avg       0.59      0.59      0.59     50000



In [10]:
# Train the Random Forest classifier
RF = RandomForestClassifier(n_jobs=-1)
RF.fit(X_train, y_train)

predicted = RF.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('RF (with FastText) accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

RF (with FastText) accuracy is 58.16%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  7292  2277   397    75    82
1  2506  4983  2082   298   124
2   513  2370  4745  1941   371
3    94   316  1932  5084  2568
4    89   125   389  2371  6976
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.72      0.71     10123
           2       0.49      0.50      0.50      9993
           3       0.50      0.48      0.49      9940
           4       0.52      0.51      0.51      9994
           5       0.69      0.70      0.70      9950

    accuracy                           0.58     50000
   macro avg       0.58      0.58      0.58     50000
weighted avg       0.58      0.58      0.58     50000

