In [1]:
import os
import pandas as pd

from pathlib import Path
from datetime import datetime

from src.models.LogisticRegression import LogisticRegressionClassifier
from src.models.SVM import SVMClassifier
from src.models.RandomForest import RFClassifier
from src.models.MLP import MLP
from src.models.KN import KN
from src.models.DecisionTree import DT
from sklearn.model_selection import StratifiedKFold

In [2]:
# Notebook-wide settings: the project root is taken to be the kernel's working directory.
current_notebook = "Training"
project_root_dir = Path.cwd().resolve()

# Folder that receives this notebook's images; kept as a plain string and created up front.
image_folder_path = str(project_root_dir.joinpath("notebooks", current_notebook))
Path(image_folder_path).mkdir(parents=True, exist_ok=True)

In [3]:
def display_score(name, score):
    """Print one line summarising a model's scores.

    The line has the form ``<name>: <key>=<value>\\t<key>=<value>\\t...``,
    with a tab after every entry (including the last one).

    Args:
        name: Label printed in front of the scores.
        score: Mapping of metric name to value. Numeric values are rendered
            with four decimals; anything that cannot be formatted as a float
            (for example a string entry) is rendered with ``str()`` instead
            of raising.
    """
    entries = []
    for key, value in score.items():
        try:
            rendered = '{:.4f}'.format(value)
        except (TypeError, ValueError):
            # Non-numeric entry: show it as-is rather than aborting the cell.
            rendered = str(value)
        entries.append('{}={}\t'.format(key, rendered))

    print('{}: {}'.format(name, ''.join(entries)))

In [4]:
def build_dict_from_results(name, score):
    """Return a new result record: the given scores plus a ``'name'`` entry.

    The input mapping is copied rather than modified, so the caller's dict
    (which may be a model's own score dict) is left untouched and every
    returned record is independent of the others.

    Args:
        name: Label identifying the run this record belongs to.
        score: Mapping of metric name to value.

    Returns:
        A new dict with every entry of ``score`` and ``'name'`` set to ``name``.
    """
    return {**score, 'name': name}

## Data

In [5]:
# Load the processed training set and test set (features and labels) from data/processed.
data_path = os.path.join(project_root_dir, 'data', 'processed')

# Files are read in the same order as the names on the left-hand side.
data, labels, data_test, labels_test = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'training_data.csv',
        'training_labels.csv',
        'test_data.csv',
        'test_labels.csv',
    )
)

In [6]:
# Summary statistics of the training features; left as the last expression so it renders as a table.
data.describe()

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,...,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0,792.0
mean,0.203812,0.139585,0.245669,0.137671,0.127899,0.123728,0.206506,0.03401,0.108547,0.191768,...,0.083531,0.025926,0.09203,0.057494,0.150211,0.025024,0.018141,0.053752,0.103961,0.167327
std,0.231326,0.19022,0.198228,0.170772,0.167799,0.168549,0.188601,0.089729,0.131115,0.166275,...,0.147769,0.099515,0.131509,0.126524,0.143567,0.107343,0.077436,0.103034,0.158118,0.194811
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.023254,0.009523,0.106058,0.034481,0.017542,0.0,0.063826,0.0,0.02941,0.059996,...,0.0,0.0,0.005652,0.0,0.045871,0.0,0.0,0.0,0.0,0.008337
50%,0.116284,0.057144,0.181817,0.080461,0.07017,0.050314,0.148937,0.0,0.08823,0.16,...,0.011364,0.0,0.033896,0.00488,0.119258,0.0,0.0,0.010389,0.033711,0.108328
75%,0.302331,0.192854,0.333326,0.172416,0.157891,0.177673,0.297875,0.0,0.11764,0.280003,...,0.09375,0.0,0.125706,0.043901,0.201822,0.0,0.0,0.062338,0.146064,0.249996
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Summary statistics of the training labels (a single numeric `species` column in the recorded output).
labels.describe()

Unnamed: 0,species
count,792.0
mean,49.0
std,28.595439
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,98.0


In [8]:
# Splitter shared by every model below: 5 stratified folds, shuffled with a fixed seed (42).
s_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Training

In [9]:
# Collects one score dict per cross-validation run; converted to a DataFrame in "Result analysis".
# NOTE(review): training cells append to this list, so re-running one adds a duplicate entry;
# re-run this cell first to start from an empty list.
score_list = []

### Logistic regression

In [10]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.LogisticRegression import LogisticRegressionClassifier

# Create the project's logistic-regression classifier, handing it the shared splitter.
lr_clf = LogisticRegressionClassifier(s_k_fold)

Logistic Regression:Creating


In [11]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
lr_clf.cross_validate(data, labels, optimized=False)
display_score(lr_clf.name, lr_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(lr_clf.name, lr_clf.get_score()))

Logistic Regression:Initialization
Logistic Regression: f1=0.9813	precision=0.9817	recall=0.9848	accuracy=0.9848	log_loss=0.9275	


In [12]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
lr_clf.optimize(data, labels)

Logistic Regression:Initialization
Logistic Regression:Start optimization
Logistic Regression:end optimization


In [13]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
lr_clf.cross_validate(data, labels, optimized=True)
display_score(lr_clf.name, lr_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(lr_clf.name + '_o', lr_clf.get_score()))

Logistic Regression:Initialization
Logistic Regression:C:390.01768308021974 penalty:l2 solver:lbfgs 
Logistic Regression: f1=0.9874	precision=0.9876	recall=0.9899	accuracy=0.9899	log_loss=0.0722	


### SVM

In [14]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.SVM import SVMClassifier

# Create the project's SVM classifier, handing it the shared splitter.
svm_clf = SVMClassifier(s_k_fold)

SVM:Creating


In [15]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
svm_clf.cross_validate(data, labels, optimized=False)
display_score(svm_clf.name, svm_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(svm_clf.name, svm_clf.get_score()))

SVM:Initialization
SVM: f1=0.9860	precision=0.9863	recall=0.9886	accuracy=0.9886	log_loss=2.5717	


In [16]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
svm_clf.optimize(data, labels)

SVM:Initialization
SVM:Start optimization
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:    1.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    1.6s finished


SVM:end optimization


In [17]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
# NOTE(review): in the recorded run the tuned SVM scores below the baseline
# (accuracy 0.8270 vs 0.9886) after a search over only 5 candidates; the search space deserves a check.
svm_clf.cross_validate(data, labels, optimized=True)
display_score(svm_clf.name, svm_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(svm_clf.name + '_o', svm_clf.get_score()))

SVM:Initialization
SVM:probability:True kernel:linear C:0.1 
SVM: f1=0.8046	precision=0.8262	recall=0.8270	accuracy=0.8270	log_loss=2.6481	


### Random forest

In [18]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.RandomForest import RFClassifier

# Create the project's random-forest classifier, handing it the shared splitter.
rf_clf = RFClassifier(s_k_fold)

Random Forest:Creating


In [19]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
rf_clf.cross_validate(data, labels, optimized=False)
display_score(rf_clf.name, rf_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(rf_clf.name, rf_clf.get_score()))

Random Forest:Initialization
Random Forest: f1=0.9702	precision=0.9766	recall=0.9735	accuracy=0.9735	log_loss=0.8863	


In [20]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
rf_clf.optimize(data, labels)

Random Forest:Initialization
Random Forest:Start optimization
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   27.7s finished


Random Forest:end optimization


In [21]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
# NOTE(review): in the recorded run the tuned forest scores below the baseline
# (accuracy 0.9521 vs 0.9735, log_loss 1.0391 vs 0.8863); the search space deserves a check.
rf_clf.cross_validate(data, labels, optimized=True)
display_score(rf_clf.name, rf_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(rf_clf.name + '_o', rf_clf.get_score()))

Random Forest:Initialization
Random Forest:random_state:42 n_estimators:1000 min_samples_split:6 min_samples_leaf:2 max_features:auto max_depth:10 criterion:entropy 
Random Forest: f1=0.9450	precision=0.9519	recall=0.9521	accuracy=0.9521	log_loss=1.0391	


### Multi layer perceptron

In [22]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.MLP import MLP

# Create the project's multi-layer-perceptron classifier, handing it the shared splitter.
mlp_clf = MLP(s_k_fold)

MLP:Creating


In [None]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
# NOTE(review): the saved notebook shows no execution count and no score line for this cell,
# so the run appears not to have completed; re-run before relying on any MLP result.
mlp_clf.cross_validate(data, labels, optimized=False)
display_score(mlp_clf.name, mlp_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(mlp_clf.name, mlp_clf.get_score()))

MLP:Initialization


In [None]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
mlp_clf.optimize(data, labels)

In [None]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
mlp_clf.cross_validate(data, labels, optimized=True)
display_score(mlp_clf.name, mlp_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(mlp_clf.name + '_o', mlp_clf.get_score()))

### KNeighbors 

In [None]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.KN import KN

# Create the project's KNeighbors classifier, handing it the shared splitter.
kn_clf = KN(s_k_fold)

In [None]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
kn_clf.cross_validate(data, labels, optimized=False)
display_score(kn_clf.name, kn_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(kn_clf.name, kn_clf.get_score()))

In [None]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
kn_clf.optimize(data, labels)

In [None]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
kn_clf.cross_validate(data, labels, optimized=True)
display_score(kn_clf.name, kn_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(kn_clf.name + '_o', kn_clf.get_score()))

### Decision Tree

In [None]:
# NOTE(review): this import repeats the one in the first cell; it is redundant but harmless.
from src.models.DecisionTree import DT

# Create the project's decision-tree classifier, handing it the shared splitter.
dt_clf = DT(s_k_fold)

In [None]:
# Baseline: cross-validate with optimized=False (no hyperparameter search) and print the scores.
dt_clf.cross_validate(data, labels, optimized=False)
display_score(dt_clf.name, dt_clf.get_score())

# Record the baseline scores under the model's plain name.
score_list.append(build_dict_from_results(dt_clf.name, dt_clf.get_score()))

In [None]:
# Search for the best hyperparameters; per the original author's note this is a random search
# whose parameter ranges were tuned beforehand.
dt_clf.optimize(data, labels)

In [None]:
# Cross-validate again with optimized=True (after the optimize() cell) and print the scores.
dt_clf.cross_validate(data, labels, optimized=True)
display_score(dt_clf.name, dt_clf.get_score())

# The '_o' suffix distinguishes this run from the baseline entry in score_list.
score_list.append(build_dict_from_results(dt_clf.name + '_o', dt_clf.get_score()))

## Result analysis

In [None]:
# Build the comparison table: one row per entry of score_list (baseline and '_o' runs).
results = pd.DataFrame(score_list)

In [None]:
# Print every row of the comparison table, ordered by ascending log_loss.
# The row limit is raised only inside this block, so the notebook-wide pandas
# display settings are left exactly as they were.
with pd.option_context('display.max_rows', results.shape[0] + 1):
    print(results.sort_values('log_loss'))

In [None]:
# Evaluate each model on the test set and collect one result dict per model.
# The tuple order below fixes the order of the entries in validation_list.
# NOTE(review): the '_o' suffix presumes validation() scores the optimized model; confirm in the model classes.
validation_list = []

for clf in (lr_clf, svm_clf, mlp_clf, rf_clf, kn_clf, dt_clf):
    clf.validation(data_test, labels_test)
    validation_list.append(build_dict_from_results(clf.name + '_o', clf.get_score()))

In [None]:
# Tabulate the test-set scores and print them ordered by ascending log_loss.
# NOTE(review): this rebinds `results`, replacing the cross-validation table built earlier;
# re-running the earlier display cell after this one would show the test-set scores instead.
results = pd.DataFrame(validation_list)
print(results.sort_values('log_loss'))