# scikit-learn Classification Model Benchmarking

Testing out some different classifiers in scikit-learn using just the TFIDF features of the translated Arabic data to begin with.

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import precision_score, recall_score, average_precision_score

warnings.filterwarnings('ignore') 

X = pd.read_csv("dataset_features/data_stem_tfidf_features.csv", index_col=0)
y = pd.read_csv("translated_tweets_t1.csv")['label'].to_list()

np.random.seed(42)

In [2]:
models = [GaussianNB(), RandomForestClassifier(n_estimators=100), SGDClassifier(),
          AdaBoostClassifier(), GradientBoostingClassifier()]#, KNeighborsClassifier()] # KNN takes an extremely long time
models_str = ["Naive Bayes", "Random Forest", "SGD", "AdaBoost", "Gradient Boosting"]#, "KNN"]

for i, model in enumerate(models):
    scoring = {'acc': 'accuracy', 'prec_macro': 'precision_macro',
               'rec_micro': 'recall_macro', 'map':'average_precision'}
    scores = cross_validate(model, X, y, scoring=scoring, cv=5, 
                            return_train_score=True)
    prec_mean = np.mean(scores['test_prec_macro'])
    rec_mean = np.mean(scores['test_rec_micro'])
    print(models_str[i])
    print("Test F1 Score:", 
          round((2 * prec_mean * rec_mean) / (prec_mean + rec_mean), 3))
    print("Test MAP Score:", round(np.mean(scores['test_map']), 3), '\n')

# Baseline 'predict same for every instance' scores
baseline = [1]*len(y)
base_r = recall_score(y, baseline)
base_p = precision_score(y, baseline)
base_f = (2 * base_r * base_p) / (base_r + base_p)
base_map = average_precision_score(y, baseline)
print("Baseline F1 Score:", round(base_f, 3))
print("Baseline MAP Score:", round(base_map, 3))

Naive Bayes
Test F1 Score: 0.602
Test MAP Score: 0.392 

Random Forest
Test F1 Score: 0.606
Test MAP Score: 0.59 

SGD
Test F1 Score: 0.639
Test MAP Score: 0.519 

AdaBoost
Test F1 Score: 0.625
Test MAP Score: 0.514 

Gradient Boosting
Test F1 Score: 0.606
Test MAP Score: 0.541 

Baseline F1 Score: 0.468
Baseline MAP Score: 0.306


Adaboost, Gradient Boosting and Random Forest are the most promising here so focusing on those three for hyperparameter optimisation.

In [3]:
select_models = [RandomForestClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]
select_models_str = ["Random Forest", "Gradient Boosting", "AdaBoost"]

rf_params = {'n_estimators':[50, 100, 150, 200, 250, 300]}
gb_params = {'n_estimators':[50, 100, 150], 'learning_rate':[0.9, 0.5, 0.1, 0.01]}
ada_params = {'n_estimators':[50, 100, 150], 'learning_rate':[0.9, 0.5, 0.1, 0.01]}
model_params = [rf_params, gb_params, ada_params]
scoring = {'f': 'f1_macro', 'map':'average_precision'}

for i, model in enumerate(select_models):
    cv = GridSearchCV(model, model_params[i], cv=5, scoring=scoring, refit=False)
    cv.fit(X, y)
    best_map_mean = np.max(cv.cv_results_['mean_test_map'])
    best_f1_mean = cv.cv_results_['mean_test_f'][np.argmax(cv.cv_results_['mean_test_map'])]
    print(select_models_str[i])
    print("Best Test MAP Score:", round(best_map_mean, 3))
    print("Parameters:", cv.cv_results_['params'][np.argmax(cv.cv_results_['mean_test_map'])])
    print("Test F1 Score for Model w/ Best MAP:", round(best_f1_mean, 3), '\n')

Random Forest
Best Test MAP Score: 0.598
Parameters: {'n_estimators': 250}
Test F1 Score for Model w/ Best MAP: 0.531 

Gradient Boosting
Best Test MAP Score: 0.578
Parameters: {'learning_rate': 0.01, 'n_estimators': 150}
Test F1 Score for Model w/ Best MAP: 0.538 

AdaBoost
Best Test MAP Score: 0.58
Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Test F1 Score for Model w/ Best MAP: 0.537 

