In [1]:
import os
import pandas as pd

from pathlib import Path
from datetime import datetime

from src.models.LogisticRegression import LogisticRegressionClassifier
from src.models.SVM import SVMClassifier
from src.models.RandomForest import RFClassifier
from src.models.MLP import MLP
from src.models.KN import KN
from src.models.DecisionTree import DT
from sklearn.model_selection import StratifiedKFold

In [2]:
# Notebook-wide settings.
# The project root is taken to be the parent of the kernel's working directory.
current_notebook = "Training"
project_root_dir = Path.cwd().resolve().parents[0]

# Folder for the images of this notebook; created if it does not exist yet.
image_folder_path = os.path.join(project_root_dir, "notebooks", current_notebook)
os.makedirs(image_folder_path, exist_ok=True)

In [3]:
def display_score(name, score):
    """Print ``name`` and every metric of ``score`` on a single line.

    Each metric is rendered as ``key=value`` with four decimals and is
    followed by a tab character, so the printed line ends with a tab.

    Args:
        name: label printed in front of the metrics (a string).
        score: mapping from metric name (string) to numeric value.
    """
    prefix = name + ': '
    rendered_metrics = ''.join(
        metric + '={:.4f}'.format(score[metric]) + '\t' for metric in score
    )

    print(prefix + rendered_metrics)

In [4]:
def build_dict_from_results(name, score):
    """Return a result record made of the metrics in ``score`` plus a ``'name'`` entry.

    The record is a new dict: ``score`` itself is left untouched, so a score
    dict that is still held elsewhere (for example by the classifier that
    produced it) is not altered and two records never share the same object.

    Args:
        name: label stored under the ``'name'`` key.
        score: mapping from metric name to value.

    Returns:
        dict: a copy of ``score`` with ``'name'`` set to ``name``.
    """
    record = dict(score)
    record['name'] = name
    return record

## Data

In [5]:
# Read the training features and labels from <project root>/data/processed.
data_path = os.path.join(project_root_dir, 'data', 'processed')
features_csv = os.path.join(data_path, 'training_data.csv')
labels_csv = os.path.join(data_path, 'training_labels.csv')

data = pd.read_csv(features_csv)
labels = pd.read_csv(labels_csv)

In [6]:
# Summary statistics of the feature table; the saved output shows 792 rows per
# column, with means near 0 and standard deviations near 1.
data.describe()

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,...,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0
mean,0.0,3.364312e-17,1.300867e-16,1.014901e-16,-5.719331000000001e-17,1.166295e-16,-5.607187e-17,-1.7943e-17,-5.831474e-17,6.055762e-17,...,3.252168e-17,6.728624e-17,-2.0185870000000002e-17,5.3829000000000005e-17,-2.063445e-16,1.087794e-16,-1.570012e-17,2.0185870000000002e-17,4.9343250000000004e-17,-1.401797e-16
std,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,...,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632,1.000632
min,-0.881617,-0.7342717,-1.240112,-0.8066802,-0.7626971,-0.7345422,-1.095629,-0.3792727,-0.8283998,-1.15405,...,-0.5656381,-0.2606863,-0.7002438,-0.4547012,-1.046938,-0.2332711,-0.234417,-0.5220187,-0.6579067,-0.859462
25%,-0.781026,-0.6841764,-0.7047405,-0.60464,-0.6580867,-0.7345422,-0.7569981,-0.3792727,-0.6039509,-0.7929953,...,-0.5656381,-0.2606863,-0.6572363,-0.4547012,-0.7272258,-0.2332711,-0.234417,-0.5220187,-0.6579067,-0.8166401
50%,-0.378613,-0.4336742,-0.3223209,-0.3352186,-0.3442556,-0.4358417,-0.305433,-0.3792727,-0.1550532,-0.1911757,...,-0.4886857,-0.2606863,-0.4423305,-0.4161063,-0.2157383,-0.2332711,-0.234417,-0.4211254,-0.444567,-0.3030405
75%,0.426161,0.2802158,0.442479,0.2035897,0.1788499,0.3202531,0.4847627,-0.3792727,0.06939565,0.5309956,...,0.06919588,-0.2606863,0.2562342,-0.1075048,0.3597179,-0.2332711,-0.234417,0.0833931,0.2664437,0.4246247
max,3.444026,4.526119,3.807779,5.052796,5.200577,5.202191,4.209929,10.77247,6.803321,4.8639,...,6.205956,9.794375,6.908617,7.453904,5.922832,9.088584,12.68766,9.189677,5.670489,4.276973


In [7]:
# Summary statistics of the label table; the saved output shows 792 rows and
# species values ranging from 0 to 98.
labels.describe()

Unnamed: 0,species
count,792.0
mean,49.0
std,28.595439
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,98.0


In [8]:
# Stratified 5-fold splitter with seeded shuffling; this one object is handed
# to every classifier created below.
s_k_fold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Training

In [9]:
# Collects one result record (a dict) per evaluation run in this notebook.
score_list = list()

### Logistic regression

In [10]:
# LogisticRegressionClassifier is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
lr_clf = LogisticRegressionClassifier(s_k_fold)

Logistic Regression:Creating


In [11]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
lr_clf.cross_validate(data, labels, optimized=False)
display_score(name=lr_clf.name, score=lr_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=lr_clf.name, score=lr_clf.get_score())
)

Logistic Regression:Initialization
Logistic Regression: f1=0.9843	precision=0.9848	recall=0.9874	accuracy=0.9874	log_loss=0.1454	


In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# next cell reports optimized parameters - confirm where those come from.
lr_clf.optimize(data, labels)

In [12]:
# Cross-validate again, this time with optimized=True, and show the metrics.
lr_clf.cross_validate(data, labels, optimized=True)
display_score(name=lr_clf.name, score=lr_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=lr_clf.name, score=lr_clf.get_score())
)

Logistic Regression:Initialization
Logistic Regression:C:390.01768308021974 penalty:l2 solver:lbfgs 
Logistic Regression: f1=0.9843	precision=0.9848	recall=0.9874	accuracy=0.9874	log_loss=0.0542	


### SVM

In [13]:
# SVMClassifier is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
svm_clf = SVMClassifier(s_k_fold)

SVM:Creating


In [14]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
svm_clf.cross_validate(data, labels, optimized=False)
display_score(name=svm_clf.name, score=svm_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=svm_clf.name, score=svm_clf.get_score())
)

SVM:Initialization
SVM: f1=0.9749	precision=0.9783	recall=0.9785	accuracy=0.9785	log_loss=2.5780	


In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# next cell reports optimized parameters - confirm where those come from.
# NOTE(review): that saved output lists C of about 1.78e+74, which suggests the
# search range for C should be double-checked.
svm_clf.optimize(data, labels)

In [15]:
# Cross-validate again, this time with optimized=True, and show the metrics.
svm_clf.cross_validate(data, labels, optimized=True)
display_score(name=svm_clf.name, score=svm_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=svm_clf.name, score=svm_clf.get_score())
)

SVM:Initialization
SVM:probability:True kernel:poly gamma:auto degree:1 C:1.7782794100389228e+74 
SVM: f1=0.9808	precision=0.9834	recall=0.9836	accuracy=0.9836	log_loss=2.5311	


### Random forest

In [16]:
# RFClassifier is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
rf_clf = RFClassifier(s_k_fold)

Random Forest:Creating


In [17]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
rf_clf.cross_validate(data, labels, optimized=False)
display_score(name=rf_clf.name, score=rf_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=rf_clf.name, score=rf_clf.get_score())
)

Random Forest:Initialization
Random Forest: f1=0.9709	precision=0.9785	recall=0.9735	accuracy=0.9735	log_loss=0.9300	


In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
# NOTE(review): this cell has no execution count in the saved notebook, while the
# next cell's saved output lists optimized parameters - confirm where those come from.
rf_clf.optimize(data, labels)

In [None]:
# Cross-validate again, this time with optimized=True, and show the metrics.
rf_clf.cross_validate(data, labels, optimized=True)
display_score(name=rf_clf.name, score=rf_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=rf_clf.name, score=rf_clf.get_score())
)

Random Forest:Initialization
Random Forest:random_state:42 n_estimators:4000 min_samples_split:2 min_samples_leaf:1 max_features:None max_depth:12 criterion:entropy 


### Multi layer perceptron

In [None]:
# MLP is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
mlp_clf = MLP(s_k_fold)

In [None]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
mlp_clf.cross_validate(data, labels, optimized=False)
display_score(name=mlp_clf.name, score=mlp_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=mlp_clf.name, score=mlp_clf.get_score())
)

In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
mlp_clf.optimize(data, labels)

In [None]:
# Cross-validate again, this time with optimized=True, and show the metrics.
mlp_clf.cross_validate(data, labels, optimized=True)
display_score(name=mlp_clf.name, score=mlp_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=mlp_clf.name, score=mlp_clf.get_score())
)

### KNeighbors 

In [None]:
# KN is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
kn_clf = KN(s_k_fold)

In [None]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
kn_clf.cross_validate(data, labels, optimized=False)
display_score(name=kn_clf.name, score=kn_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=kn_clf.name, score=kn_clf.get_score())
)

In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
kn_clf.optimize(data, labels)

In [None]:
# Cross-validate again, this time with optimized=True, and show the metrics.
kn_clf.cross_validate(data, labels, optimized=True)
# Show the KNeighbors score itself (the cell used to print the MLP score under this name).
display_score(kn_clf.name, kn_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(build_dict_from_results(kn_clf.name, kn_clf.get_score()))

### Decision Tree

In [None]:
# DT is already imported in the notebook's first cell.
# Build the classifier around the shared stratified K-fold splitter.
dt_clf = DT(s_k_fold)

In [None]:
# Baseline run: cross-validate without hyper-parameter optimization and show the metrics.
dt_clf.cross_validate(data, labels, optimized=False)
display_score(name=dt_clf.name, score=dt_clf.get_score())

# Record the baseline metrics in the shared results list.
score_list.append(
    build_dict_from_results(name=dt_clf.name, score=dt_clf.get_score())
)

In [None]:
# Search for the best set of hyper-parameters (a random search, according to the
# original note); the search ranges were reportedly narrowed beforehand.
dt_clf.optimize(data, labels)

In [None]:
# Cross-validate again, this time with optimized=True, and show the metrics.
dt_clf.cross_validate(data, labels, optimized=True)
# Show the decision-tree name and score (the cell used to print the KNeighbors
# name together with the MLP score).
display_score(dt_clf.name, dt_clf.get_score())

# Record the optimized-run metrics in the shared results list.
score_list.append(build_dict_from_results(dt_clf.name, dt_clf.get_score()))