In [None]:
import os
from pandas import *
import numpy as np
import pandas as pd
import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import metrics

In [None]:
from sklearn.model_selection import train_test_split

# Load the embeddings table. With header=None pandas treats every row of the
# CSV (including the first one) as data and labels the columns 0, 1, 2, ...
path_to_data = 'BERT-EMBEDDINGS.csv'
data_all = read_csv(path_to_data, header=None)  # NOTE(review): header=None does NOT skip the first row; it is read as data
# NOTE(review): `overall_labels` is not defined in the visible part of this
# file; presumably it is created in an earlier cell -- confirm.
Y_train = overall_labels
Y_train_transposed = np.transpose(Y_train)
# Keep the first 2000 entries (along the first axis) of the transposed labels.
Y_train = Y_train_transposed[0:2000]
# Training features: the first 2000 rows and ALL columns of the CSV as a NumPy array.
# NOTE(review): X_test below excludes column 0 (which is used as the test
# label), while this slice keeps it, so the train and test feature widths may
# differ -- confirm which columns are meant to be features.
X_train = data_all.iloc[0:2000,:].values

# Test labels: column 0 of the rows labelled 1..501 (.loc slices are
# label-based and include both end points).
Y_test = data_all.loc[1:501,0]


# Test features: the columns labelled 1..768 of the same rows.
# NOTE(review): rows 1..501 also lie inside the 0:2000 training slice, so the
# test rows overlap the training rows -- confirm this is intended.
X_test = data_all.loc[1:501,1:768]


# Registry of the classifiers to benchmark, keyed by the name shown in the
# results table. Insertion order matters: batch_classify only trains the first
# `no_classifiers` entries of this mapping. Estimators are created once here,
# with library defaults except for the hyperparameters spelled out below.
dict_classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Rbf SVM": SVC(kernel='rbf', gamma=0.45, C=3.7),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha=1),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "QDA": QuadraticDiscriminantAnalysis(),
}



def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 10, verbose = True,
                   classifiers = None, average = 'binary'):
    """
    Fit several classifiers on the train set and evaluate each on the test set.

    The trained models and their metrics are collected in a dictionary, which
    is easy to persist as a whole (e.g. with the pickle module).

    Usually the SVM, Random Forest and Gradient Boosting Classifier take quite
    some time to train, so it is best to try them on a smaller dataset first
    and decide from the test accuracy whether to keep them.

    Parameters
    ----------
    X_train, Y_train
        Features and labels passed to ``fit`` and used for the train score.
    X_test, Y_test
        Features and labels used for test accuracy, precision, recall and F1.
    no_classifiers : int
        Only the first ``no_classifiers`` entries of the classifier mapping
        (in insertion order) are trained.
    verbose : bool
        When true, print one line per classifier with its training time.
    classifiers : dict or None
        Mapping of display name -> estimator providing ``fit``, ``predict``
        and ``score``. ``None`` (the default) uses the module-level
        ``dict_classifiers``, which is the previous behaviour.
    average : str
        Forwarded as ``average=`` to precision, recall and F1. The default
        ``'binary'`` is scikit-learn's own default and only accepts binary
        targets; pass e.g. ``'macro'`` or ``'weighted'`` for multiclass labels.

    Returns
    -------
    dict
        ``name -> {'model', 'train_score', 'test_score', 'train_time',
        'precision', 'recall', 'f1-value'}``.

    Notes
    -----
    The estimators are fitted in place: the ``'model'`` entries are the very
    objects held in the classifier mapping, so calling this function again
    refits them.
    """
    if classifiers is None:
        classifiers = dict_classifiers

    dict_models = {}
    for classifier_name, classifier in list(classifiers.items())[:no_classifiers]:
        # process_time() measures CPU time of this process, not wall-clock time.
        t_start = time.process_time()
        classifier.fit(X_train, Y_train)
        t_diff = time.process_time() - t_start

        train_score = classifier.score(X_train, Y_train)

        # Predict the test set once and derive every test metric from it.
        # For scikit-learn classifiers score() is the accuracy of predict(),
        # so this equals classifier.score(X_test, Y_test) without a second pass.
        y_pred = classifier.predict(X_test)
        test_score = metrics.accuracy_score(Y_test, y_pred)
        precision_value = metrics.precision_score(Y_test, y_pred, average=average)
        recall_value = metrics.recall_score(Y_test, y_pred, average=average)
        f1_value = metrics.f1_score(Y_test, y_pred, average=average)

        dict_models[classifier_name] = {
            'model': classifier,
            'train_score': train_score,
            'test_score': test_score,
            'train_time': t_diff,
            'precision': precision_value,
            'recall': recall_value,
            'f1-value': f1_value,
        }
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models



def display_dict_models(dict_models, sort_by='test_score'):
    """
    Show the results of batch_classify as a table sorted in descending order.

    Parameters
    ----------
    dict_models : dict
        Mapping of classifier name -> result dict holding at least the keys
        'train_score', 'test_score', 'train_time', 'precision', 'recall' and
        'f1-value'.
    sort_by : str
        Name of the table column to sort by (largest value first).

    Returns
    -------
    None
        The table is only rendered, via IPython's ``display`` when that name
        is available and via ``print`` otherwise.
    """
    columns = ['classifier', 'train_score', 'test_score', 'train_time',
               'precision', 'recall', 'f1-value']
    metric_columns = columns[1:]

    # Build the frame straight from row records. Pre-filling a float table
    # with zeros and then writing the classifier names into it relies on
    # silent dtype upcasting, which pandas has deprecated.
    rows = [
        [name] + [results[col] for col in metric_columns]
        for name, results in dict_models.items()
    ]
    df_ = pd.DataFrame(rows, columns=columns)

    # `display` is only defined inside IPython/Jupyter sessions; fall back to
    # print so the function also works in a plain Python script.
    try:
        show = display
    except NameError:
        show = print
    show(df_.sort_values(by=sort_by, ascending=False))

# Train the first 10 registered classifiers on the train split and evaluate
# them on the test split, then show the results table (sorted by test_score,
# the default sort key of display_dict_models).
dict_models = batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 10)
display_dict_models(dict_models)
