In [81]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from sklearn import ensemble, linear_model, preprocessing, neighbors, datasets
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

#We're dealing with a multiclass classification problem with 5 possible target classes

In [89]:
#loading the datasets
X_train = pd.read_csv('data/X_train.csv', engine='python')
X_test = pd.read_csv('data/X_test.csv', engine='python')
y_train = pd.read_csv('data/y_train.csv', engine='python')['rating']
y_test = pd.read_csv('data/y_test.csv', engine='python')['rating']

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("X_train info:", X_train.info())
print("X_train dtypes:", X_train.dtypes)

X_train shape: (51336, 7)
X_test shape: (12834, 7)
y_train shape: (51336,)
y_test shape: (12834,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   number_reviews_scaled    51336 non-null  float64
 1   review_time_encoded      51336 non-null  int64  
 2   text_word_length_scaled  51336 non-null  float64
 3   Sentiment_VADER          51336 non-null  float64
 4   Sentiment_Blob           51336 non-null  float64
 5   bow_1920                 51336 non-null  int64  
 6   local_hour               51336 non-null  int64  
dtypes: float64(4), int64(3)
memory usage: 2.7 MB
X_train info: None
X_train dtypes: number_reviews_scaled      float64
review_time_encoded          int64
text_word_length_scaled    float64
Sentiment_VADER            float64
Sentiment_Blob             float64
bow_1920                     int64
local_hour   

In [90]:
#Train base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#Evaluate base model
base_model_score = model.score(X_test, y_test)
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
cm_2 = pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])
y_probas = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probas, pos_label=1)
roc_auc = auc(fpr, tpr)

print("Accuracy: ", base_model_score * 100, "%")
print("ROC AUC: ", roc_auc * 100, "%")
print("Confusion matrix: ", "\n", cm)
print("PD Crosstab: ", "\n", cm_2)
print("Classification report: ", "\n", cr)

Accuracy:  66.24590930341282 %
ROC AUC:  77.42213960136122 %
Confusion matrix:  
 [[2551    0   76    0  561]
 [ 514    0   48    0  301]
 [ 570    0   76    0  608]
 [ 261    0   44    0  952]
 [ 338    0   59    0 5875]]
PD Crosstab:  
 Prediction     1   3     5
Reality                   
1           2551  76   561
2            514  48   301
3            570  76   608
4            261  44   952
5            338  59  5875
Classification report:  
               precision    recall  f1-score   support

           1       0.60      0.80      0.69      3188
           2       0.00      0.00      0.00       863
           3       0.25      0.06      0.10      1254
           4       0.00      0.00      0.00      1257
           5       0.71      0.94      0.81      6272

    accuracy                           0.66     12834
   macro avg       0.31      0.36      0.32     12834
weighted avg       0.52      0.66      0.57     12834



In [66]:
#Bagging
#Bagging Classifier
#Removed as accuracy dropped for our model
bc = BaggingClassifier(n_estimators=1000, oob_score=True)
bc.fit(X_train, y_train)

#Evaluate model with Bagging Classifier
y_pred_bc = bc.predict(X_test)
cm_bc = confusion_matrix(y_test, y_pred_bc)
cr_bc = classification_report(y_test, y_pred_bc)
cm_2_bc = pd.crosstab(y_test, y_pred_bc, rownames=['Reality'], colnames=['Prediction'])

print("OOB Score: ", bc.oob_score_)
print("Confusion matrix: ", "\n", cm_bc)
print("PD Crosstab: ", "\n", cm_2_bc)
print("Classification report: ", "\n", cr_bc)

OOB Score:  0.6385187782452859
Confusion matrix:  
 [[2397  101  204   86  400]
 [ 423   54  144   42  200]
 [ 452   81  208   78  435]
 [ 221   42  102   84  808]
 [ 328   56  153  169 5566]]
PD Crosstab:  
 Prediction     1    2    3    4     5
Reality                              
1           2397  101  204   86   400
2            423   54  144   42   200
3            452   81  208   78   435
4            221   42  102   84   808
5            328   56  153  169  5566
Classification report:  
               precision    recall  f1-score   support

           1       0.63      0.75      0.68      3188
           2       0.16      0.06      0.09       863
           3       0.26      0.17      0.20      1254
           4       0.18      0.07      0.10      1257
           5       0.75      0.89      0.81      6272

    accuracy                           0.65     12834
   macro avg       0.40      0.39      0.38     12834
weighted avg       0.58      0.65      0.60     12834



In [67]:
#Boosting
#Adaptive Boosting
#Removed as accuracy did not improve
ac = AdaBoostClassifier(estimator=model, n_estimators=2000)
ac.fit(X_train, y_train)

#Evaluate model with Adaptive Boosting
y_pred_ac = ac.predict(X_test)
cr_ac = classification_report(y_test, y_pred_ac)
cm_2_ac = pd.crosstab(y_test, y_pred_ac, rownames=['Reality'], colnames=['Prediction'])

print("PD Crosstab: ", "\n", cm_2_ac)
print("Classification report: ", "\n", cr_ac)




PD Crosstab:  
 Prediction     1   3     5
Reality                   
1           2526  63   599
2            514  35   314
3            563  58   633
4            243  41   973
5            303  51  5918
Classification report:  
               precision    recall  f1-score   support

           1       0.61      0.79      0.69      3188
           2       0.00      0.00      0.00       863
           3       0.23      0.05      0.08      1254
           4       0.00      0.00      0.00      1257
           5       0.70      0.94      0.80      6272

    accuracy                           0.66     12834
   macro avg       0.31      0.36      0.31     12834
weighted avg       0.52      0.66      0.57     12834



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [101]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
cm_2 = pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])

print("Accuracy: ", base_model_score * 100, "%")
print("Confusion matrix: ", "\n", cm)
print("PD Crosstab: ", "\n", cm_2)
print("Classification report: ", "\n", cr)

Accuracy:  66.24590930341282 %
Confusion matrix:  
 [[2556    1  100    7  524]
 [ 521    1   78    4  259]
 [ 555    1  119    6  573]
 [ 261    0   56    5  935]
 [ 306    1   58    4 5903]]
PD Crosstab:  
 Prediction     1  2    3  4     5
Reality                          
1           2556  1  100  7   524
2            521  1   78  4   259
3            555  1  119  6   573
4            261  0   56  5   935
5            306  1   58  4  5903
Classification report:  
               precision    recall  f1-score   support

           1       0.61      0.80      0.69      3188
           2       0.25      0.00      0.00       863
           3       0.29      0.09      0.14      1254
           4       0.19      0.00      0.01      1257
           5       0.72      0.94      0.82      6272

    accuracy                           0.67     12834
   macro avg       0.41      0.37      0.33     12834
weighted avg       0.57      0.67      0.59     12834



In [99]:
# Current best model
# XGBoostClassifier with GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 300],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1],
    'scale_pos_weight': [1, 3, 5],
    'class_weight': ['balanced']
}
# Create and fit model with GridSearchCV
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob', 
    num_class=5,
    subsample=0.8,  # add these as fixed parameters instead
    colsample_bytree=0.8
)
grid_search = GridSearchCV(
    estimator=xgb_model, 
    param_grid=param_grid, 
    cv=5, 
    scoring='f1_weighted',  # better for imbalanced data
    verbose=2  # increased verbosity to see progress
)

# Fit with 0-based labels
grid_search.fit(X_train, y_train - 1)

# Print best parameters
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Now use the best parameters with the original XGBoost implementation
best_params = grid_search.best_params_

# Adjust labels to be 0-based
y_train_0based = y_train - 1
y_test_0based = y_test - 1

# Create DMatrix objects with 0-based labels
train = xgb.DMatrix(X_train, y_train_0based)
test = xgb.DMatrix(X_test, y_test_0based)

# Set parameters using GridSearchCV results
params = {'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 5,
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth']}

# Train model
model = xgb.train(params,  dtrain=train, 
    num_boost_round=best_params['n_estimators'],
    early_stopping_rounds=15, 
    evals=[(train, 'train'), (test, 'eval')])

# Get predictions and convert back to 1-based
preds_prob = model.predict(test)
preds = np.argmax(preds_prob, axis=1) + 1  

# Print metrics
print("\nFinal Model Performance:")
print("Confusion Matrix - XGBoost: \n", confusion_matrix(y_test, preds))
print("\nClassification Report - XGBoost: \n", classification_report(y_test, preds))
print("\nROC AUC - XGBoost: ", roc_auc_score(
    pd.get_dummies(y_test), 
    preds_prob, 
    multi_class='ovr'))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=1; total time=   0.9s
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=1; total time=   0.9s
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=1; total time=   0.9s
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=1; total time=   0.8s
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=1; total time=   0.8s
[CV] END class_weight=balanced, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200, scale_pos_weight=3; total time=   0.8s
[CV] END class_weight=balanced

In [97]:
#Ensemble method
#Voting Classifier
clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(random_state=123)
vclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2), ('lr', model)], voting='hard')
vclf.fit(X_train, y_train)
print("Voting Classifier Score:", vclf.score(X_test, y_test))

# Cross validation for each classifier
for clf, label in zip([clf1, clf2, model, vclf], 
                     ['KNN', 'Random Forest', 'Logistic Regression', 'Voting Classifier']):
    scores = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy','f1_weighted'])
    print(f"\n[{label}]:")
    print(f"Accuracy: {scores['test_accuracy'].mean():.2f} (+/- {scores['test_accuracy'].std():.2f})")
    print(f"F1 score: {scores['test_f1_weighted'].mean():.2f} (+/- {scores['test_f1_weighted'].std():.2f})")

#Stacking Classifier
sclf = StackingClassifier(
    estimators=[('knn', clf1), ('rf', clf2), ('lr', model)],
    final_estimator=LogisticRegression(),
    cv=5
)

# Cross validation for Stacking
scores = cross_validate(sclf, X_train, y_train, cv=5, scoring=['accuracy', 'f1_weighted'])
print("\n[Stacking Classifier]:")
print(f"Accuracy: {scores['test_accuracy'].mean():.2f} (+/- {scores['test_accuracy'].std():.2f})")
print(f"F1 score: {scores['test_f1_weighted'].mean():.2f} (+/- {scores['test_f1_weighted'].std():.2f})")

# Final fit and score
sclf.fit(X_train, y_train)
print("Stacking Classifier Score:", sclf.score(X_test, y_test))

Voting Classifier Score: 0.6577060931899642

[KNN]:
Accuracy: 0.59 (+/- 0.00)
F1 score: 0.57 (+/- 0.00)

[Random Forest]:
Accuracy: 0.65 (+/- 0.00)
F1 score: 0.60 (+/- 0.00)

[Logistic Regression]:
Accuracy: 0.66 (+/- 0.00)
F1 score: 0.57 (+/- 0.00)

[Voting Classifier]:
Accuracy: 0.65 (+/- 0.00)
F1 score: 0.58 (+/- 0.00)

[Stacking Classifier]:
Accuracy: 0.66 (+/- 0.00)
F1 score: 0.59 (+/- 0.00)
Stacking Classifier Score: 0.6663549945457379


In [95]:
#Base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#Random Oversampling
rOs = RandomOverSampler()
X_ro, y_ro = rOs.fit_resample(X_train, y_train)
print(dict(pd.Series(y_ro).value_counts()))

model.fit(X_ro, y_ro)
y_pred = model.predict(X_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

{2: np.int64(25088), 1: np.int64(25088), 5: np.int64(25088), 4: np.int64(25088), 3: np.int64(25088)}
col_0      1    2    3     4     5
rating                            
1       1986  419  418   263   102
2        247  195  232   130    59
3        217  240  390   250   157
4         86  115  224   371   461
5        137  168  420  1126  4421
                   pre       rec       spe        f1       geo       iba       sup

          1       0.74      0.62      0.93      0.68      0.76      0.56      3188
          2       0.17      0.23      0.92      0.20      0.46      0.19       863
          3       0.23      0.31      0.89      0.27      0.53      0.26      1254
          4       0.17      0.30      0.85      0.22      0.50      0.24      1257
          5       0.85      0.70      0.88      0.77      0.79      0.61      6272

avg / total       0.65      0.57      0.89      0.61      0.71      0.50     12834



In [98]:
from imblearn.combine import SMOTETomek
#Base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#SMOTE
smt = SMOTETomek()
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)
print(dict(pd.Series(y_resampled).value_counts()))

model.fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

{2: np.int64(23694), 1: np.int64(23405), 3: np.int64(23354), 4: np.int64(23017), 5: np.int64(22524)}
col_0      1    2    3     4     5
rating                            
1       1965  463  402   266    92
2        249  206  222   131    55
3        205  277  369   264   139
4         83  128  217   392   437
5        123  195  421  1227  4306
                   pre       rec       spe        f1       geo       iba       sup

          1       0.75      0.62      0.93      0.68      0.76      0.56      3188
          2       0.16      0.24      0.91      0.19      0.47      0.20       863
          3       0.23      0.29      0.89      0.26      0.51      0.25      1254
          4       0.17      0.31      0.84      0.22      0.51      0.25      1257
          5       0.86      0.69      0.89      0.76      0.78      0.60      6272

avg / total       0.65      0.56      0.90      0.60      0.70      0.49     12834



In [86]:
#Base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#SMOTE
smo = SMOTE()
X_sm, y_sm = smo.fit_resample(X_train, y_train)
print(dict(pd.Series(y_sm).value_counts()))

model.fit(X_sm, y_sm)
y_pred = model.predict(X_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

{2: np.int64(25088), 1: np.int64(25088), 5: np.int64(25088), 4: np.int64(25088), 3: np.int64(25088)}
col_0      1    2    3     4     5
rating                            
1       1998  436  395   252   107
2        256  205  221   123    58
3        219  259  368   252   156
4         88  121  219   378   451
5        144  174  429  1145  4380
                   pre       rec       spe        f1       geo       iba       sup

          1       0.74      0.63      0.93      0.68      0.76      0.56      3188
          2       0.17      0.24      0.92      0.20      0.47      0.20       863
          3       0.23      0.29      0.89      0.26      0.51      0.25      1254
          4       0.18      0.30      0.85      0.22      0.50      0.24      1257
          5       0.85      0.70      0.88      0.77      0.78      0.60      6272

avg / total       0.65      0.57      0.89      0.60      0.70      0.50     12834



In [87]:
#Base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#Random Undersampling
rUs = RandomUnderSampler()
X_ru, y_ru = rUs.fit_resample(X_train, y_train)
print(dict(pd.Series(y_ru).value_counts()))

model.fit(X_ru, y_ru)
y_pred = model.predict(X_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

{1: np.int64(3450), 2: np.int64(3450), 3: np.int64(3450), 4: np.int64(3450), 5: np.int64(3450)}
col_0      1    2    3     4     5
rating                            
1       1997  415  416   256   104
2        246  195  231   133    58
3        217  247  379   252   159
4         84  119  217   379   458
5        137  177  418  1121  4419
                   pre       rec       spe        f1       geo       iba       sup

          1       0.74      0.63      0.93      0.68      0.76      0.56      3188
          2       0.17      0.23      0.92      0.19      0.46      0.19       863
          3       0.23      0.30      0.89      0.26      0.52      0.25      1254
          4       0.18      0.30      0.85      0.22      0.51      0.24      1257
          5       0.85      0.70      0.88      0.77      0.79      0.61      6272

avg / total       0.65      0.57      0.89      0.61      0.71      0.50     12834



In [88]:
#Base model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

#ClusterCentroids
cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X_train, y_train)
print(dict(pd.Series(y_cc).value_counts()))

model.fit(X_cc, y_cc)
y_pred = model.predict(X_test)
print(pd.crosstab(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

{1: np.int64(3450), 2: np.int64(3450), 3: np.int64(3450), 4: np.int64(3450), 5: np.int64(3450)}
col_0      1    2    3     4     5
rating                            
1       1741  760  259   238   190
2        216  286  138   139    84
3        165  370  248   279   192
4         66  160  139   404   488
5         98  222  242  1357  4353
                   pre       rec       spe        f1       geo       iba       sup

          1       0.76      0.55      0.94      0.64      0.72      0.49      3188
          2       0.16      0.33      0.87      0.21      0.54      0.27       863
          3       0.24      0.20      0.93      0.22      0.43      0.17      1254
          4       0.17      0.32      0.83      0.22      0.52      0.25      1257
          5       0.82      0.69      0.85      0.75      0.77      0.58      6272

avg / total       0.64      0.55      0.88      0.58      0.68      0.47     12834



In [None]:
#Anomaly Detection

