# COMP47590 Advanced Machine Learning
# Performing a Machine Learning Benchmarking Experiment in Python

## Imports

Import the libraries we will use - **aeon** is the new one we use for nice benchmarking implementations.

In [34]:
import io
import random
import time
import pickle

import pandas as pd  # core data handling package
import numpy as np  # core data handling package
import matplotlib  # core plotting functionality
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns  # nicer plotting functionality

import sklearn  # For basic machine learning functionality
import sklearn.preprocessing
import sklearn.metrics
import sklearn.model_selection
import sklearn.tree
import sklearn.ensemble
import sklearn.neighbors  # KNeighborsClassifier (used in the kNN model setup)
import sklearn.svm  # SVC (used in the SVM model setup)

## Setup

Take only a sample of the dataset for fast testing

In [35]:
# Fraction of each dataset's rows kept for the experiment; used as `frac=` when the
# experiment loop samples the loaded data (0.1 keeps 10% of the rows).
data_sampling_rate = 0.1

Set up the number of folds for all grid searches (should be 5 - 10)

In [36]:
# Number of cross-validation folds passed as `cv=` to every hyper-parameter grid search.
grid_search_cv_folds = 2

Set up the number of folds for the final cross-validation of each tuned model (should be 5 - 10)

In [37]:
# Number of cross-validation folds passed as `cv=` to the final cross validation of each tuned model.
final_cv_folds = 2

Set up a dictionary to store simple model performance comparisons

In [38]:
# Per-model evaluation results, keyed by model name (starts out empty)
model_evaluation_results = {}

### Setup Datasets

Set up a dictionary to store details of datasets to be used in the experiment

In [39]:
# Registry of datasets used in the experiment: dataset name -> dict of dataset details
datasets = {}

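Register the details of each dataset (file location, target feature, number of classes and class names). The data itself is loaded later, inside the experiment loop.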

In [40]:
# MNIST: ten classes, each labelled with the string form of its integer label
dataset_details = {
    "file_name": '../Data/mnist_train.csv',
    "target_feature": "label",
    "num_classes": 10,
    "classes": {digit: str(digit) for digit in range(10)},
}
datasets["mnist"] = dataset_details

In [41]:
# Fashion-MNIST: ten clothing categories, keyed by integer label
dataset_details = {
    "file_name": '../Data/fashion-mnist_train.csv',
    "target_feature": "label",
    "num_classes": 10,
    "classes": {
        0: "T-shirt/top",
        1: "Trouser",
        2: "Pullover",
        3: "Dress",
        4: "Coat",
        5: "Sandal",
        6: "Shirt",
        7: "Sneaker",
        8: "Bag",
        9: "Ankle boot",
    },
}
datasets["mnist_fashion"] = dataset_details

In [42]:
# KMNIST: ten classes, each labelled with the string form of its integer label
dataset_details = {
    "file_name": '../Data/kmnist.csv',
    "target_feature": "label",
    "num_classes": 10,
    "classes": {label: str(label) for label in range(10)},
}
datasets["kmnist"] = dataset_details

In [43]:
# Dataset read from sign_mnist_train.csv: 24 classes labelled with the letters a-y.
# Label 9 has no entry in the mapping (the keys jump from 8 to 10).
# NOTE(review): the registry key is "mnist_chinese" although the file name and the letter
# labels look like Sign Language MNIST - confirm the key is intended before renaming it,
# since it is also the row label in the results tables.
dataset_details = {
    "file_name": '../Data/sign_mnist_train.csv',
    "target_feature": "label",
    "num_classes": 24,
    "classes": {
        0: "a", 1: "b", 2: "c", 3: "d", 4: "e", 5: "f",
        6: "g", 7: "h", 8: "i",
        10: "k", 11: "l", 12: "m", 13: "n", 14: "o", 15: "p",
        16: "q", 17: "r", 18: "s", 19: "t", 20: "u", 21: "v",
        22: "w", 23: "x", 24: "y",
    },
}
datasets["mnist_chinese"] = dataset_details

### Setup Models

Set up a dictionary of models and hyper-parameter tuning details that will be evaluated.

In [44]:
# Registry of models to evaluate: model name -> dict with 'base_model' and 'param_grid'
models = {}

Decision Tree

In [45]:
# Decision tree: search over the split criterion and the maximum depth (3, 6, ..., 48);
# min_samples_split is held at a single value of 50.
model_details = {
    'base_model': sklearn.tree.DecisionTreeClassifier(),
    'param_grid': {
        'criterion': ['gini', "entropy"],
        'max_depth': [depth for depth in range(3, 50, 3)],
        'min_samples_split': [50],
    },
}
models['decision_tree'] = model_details

kNN

In [46]:
# k-nearest neighbours: search over k = 1, 6, 11, ..., 46 followed by the extra small values 2 and 3
neighbour_counts = list(range(1, 50, 5)) + [2, 3]
model_details = {
    'base_model': sklearn.neighbors.KNeighborsClassifier(),
    'param_grid': {'n_neighbors': neighbour_counts},
}
models['kNN'] = model_details

Bagging

In [47]:
# model_details = dict()
# model_details['base_model'] = sklearn.ensemble.BaggingClassifier(
#     estimator=sklearn.tree.DecisionTreeClassifier(criterion="entropy", max_depth=6, min_samples_leaf=200))
# model_details['param_grid'] = {'n_estimators': list(range(50, 501, 50))}
# models['bagging'] = model_details

Gradient Boosting

In [48]:
#model_details = dict()
#model_details['base_model'] = sklearn.ensemble.GradientBoostingClassifier(max_depth = 3)
#model_details['param_grid'] = {'n_estimators': list(range(50, 501, 100)),
#                               'learning_rate': [0.001, 0.01, 0.1]}
#models['gradient_boosting'] = model_details

Random Forest

In [49]:
# Random forest (min_samples_split fixed at 200): search over the number of trees
# (50, 150, ..., 450) and the number of features considered per split (2, 4, 6, 8).
model_details = {
    'base_model': sklearn.ensemble.RandomForestClassifier(min_samples_split=200),
    'param_grid': {
        'n_estimators': [n_trees for n_trees in range(50, 501, 100)],
        'max_features': [n_features for n_features in range(2, 10, 2)],
    },
}
models['random_forest'] = model_details

SVM

In [50]:
# Support vector classifier: search over the regularisation strength C and the kernel type
model_details = {
    'base_model': sklearn.svm.SVC(),
    'param_grid': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
}
models['svm'] = model_details

## Experiment Loop

Iterate through each dataset and then through each model. For each combination perform a hyper-parameter tuning grid search and a final cross validation experiment.

In [51]:
# Benchmark loop: for every registered dataset, tune and then evaluate every registered model.
# Results are collected as data_model_evaluation_results[dataset_name][model_name] -> dict holding
# the grid search outcome ('best_params', 'best_score', 'cv_results') and the final cross
# validation outcome ('final_cv_results', 'final_cv_mean', 'final_cv_std_dev').
data_model_evaluation_results = dict()

# Fixed seed for the row subsample, so that re-running the notebook evaluates the same rows
# of each dataset and the reported scores can be reproduced.
sampling_random_state = 42

for dataset_name in datasets:

    print(dataset_name)

    dataset_details = datasets[dataset_name]

    file_name = dataset_details["file_name"]
    target_feature = dataset_details["target_feature"]
    # num_classes and classes are read here but not used further in this cell
    num_classes = dataset_details["num_classes"]
    classes = dataset_details["classes"]

    dataset = pd.read_csv(file_name)
    # Take a (seeded) sample from the dataset so everything runs smoothly
    dataset = dataset.sample(frac=data_sampling_rate, random_state=sampling_random_state)

    # Every column except the target is a descriptive feature
    X = dataset.loc[:, dataset.columns != target_feature]
    y = dataset[target_feature]

    # Results for the current dataset only, keyed by model name
    model_evaluation_results = dict()

    for model_name in models:
        print("\t{}".format(model_name))

        model_details = models[model_name]

        evaluation_results = dict()

        # Perform a grid search
        print('\t\tPerforming grid search')
        grid_search_results = sklearn.model_selection.GridSearchCV(model_details['base_model'],
                                                                   model_details['param_grid'],
                                                                   cv=grid_search_cv_folds, verbose=1,
                                                                   n_jobs=-1)
        grid_search_results.fit(X, y)
        print("\t\tBest Parameters: {}".format(grid_search_results.best_params_))

        # Store the grid search results
        evaluation_results['best_params'] = grid_search_results.best_params_
        evaluation_results['best_score'] = grid_search_results.best_score_
        evaluation_results['cv_results'] = grid_search_results.cv_results_

        # Perform final cross validation
        # NOTE: this reuses the same X, y that the grid search selected hyper-parameters on,
        # so the reported score may be optimistic compared with a held-out or nested evaluation.
        print('\t\tPerforming final cross validation')
        best_model = grid_search_results.best_estimator_
        cv_results = sklearn.model_selection.cross_validate(best_model, X, y, cv=final_cv_folds)
        print("\t\t{} +/- {}".format(cv_results['test_score'].mean(), cv_results['test_score'].std()))

        # Store the cross validation results
        evaluation_results['final_cv_results'] = cv_results
        evaluation_results['final_cv_mean'] = cv_results['test_score'].mean()
        evaluation_results['final_cv_std_dev'] = cv_results['test_score'].std()

        # Add all evaluation details to the evaluation dictionary
        model_evaluation_results[model_name] = evaluation_results

        # Checkpoint after every model: write the results gathered so far for the current
        # dataset to a timestamped file, so a long run can be recovered if it is interrupted.
        with open('evaluation_results_' + time.strftime("%Y%m%d_%H%M%S") + '.data', 'wb') as f:
            pickle.dump(model_evaluation_results, f)

    data_model_evaluation_results[dataset_name] = model_evaluation_results

mnist
	decision_tree
		Performing grid search
Fitting 2 folds for each of 32 candidates, totalling 64 fits
		Best Parameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 50}
		Performing final cross validation
		0.7095 +/- 0.017833333333333368
	kNN
		Performing grid search
Fitting 2 folds for each of 12 candidates, totalling 24 fits
		Best Parameters: {'n_neighbors': 3}
		Performing final cross validation
		0.9226666666666667 +/- 0.005333333333333357
	random_forest
		Performing grid search
Fitting 2 folds for each of 20 candidates, totalling 40 fits
		Best Parameters: {'max_features': 8, 'n_estimators': 350}
		Performing final cross validation
		0.834 +/- 0.0030000000000000027
	svm
		Performing grid search
Fitting 2 folds for each of 16 candidates, totalling 32 fits
		Best Parameters: {'C': 10, 'kernel': 'rbf'}
		Performing final cross validation
		0.9471666666666667 +/- 0.0008333333333332971
mnist_fashion
	decision_tree
		Performing grid search
Fitting 2 folds for ea

## Comparing Models

Generate a table of model performances from the dictionary containing the experiment results.

In [52]:
# Build one record per dataset: the dataset name under "Index" plus one entry per model
# holding that model's mean final cross validation score (None when it is missing).
data = [
    {
        "Index": dataset_key,
        **{model_key: model_results.get('final_cv_mean', None)
           for model_key, model_results in per_model_results.items()},
    }
    for dataset_key, per_model_results in data_model_evaluation_results.items()
]

# Rows are datasets, columns are models
results_df = pd.DataFrame(data).set_index("Index")
print(results_df)

               decision_tree       kNN  random_forest       svm
Index                                                          
mnist               0.709500  0.922667       0.834000  0.947167
mnist_fashion       0.737667  0.794833       0.780500  0.852500
kmnist              0.578000  0.894000       0.733167  0.894500
mnist_chinese       0.311726  0.799345       0.459213  0.908958


Convert to ranks.

In [53]:
# Rank the models within each dataset (across the columns of each row): the highest score
# gets rank 1, and tied scores share the smallest rank of their group.
ranks_df = results_df.rank(axis="columns", method="min", ascending=False)
print(ranks_df)

               decision_tree  kNN  random_forest  svm
Index                                                
mnist                    4.0  2.0            3.0  1.0
mnist_fashion            4.0  2.0            3.0  1.0
kmnist                   4.0  2.0            3.0  1.0
mnist_chinese            4.0  2.0            3.0  1.0
