# Multi-Model Classification Comparison Tool
This is a single-file classification model comparison template that allows quick comparison of the following models:
* Decision Tree Classification
* K-Nearest Neighbours
* Kernel SVM
* Logistic Regression
* Naive Bayes
* Random Forest Classification
* Extremely Randomized Trees
* Bagging Classifier
* AdaBoost
* Gradient Boosting Classifier

The available metrics that are used for comparison are listed below:
* Accuracy
* Precision
* Recall/Sensitivity
* F1 Score
* Logloss

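This was set up to allow the user to pass in the path to their .csv file as a string stored in the `FILE_PATH` constant in the **Defining Constants** section below, together with the name of the response column in `RESPONSE_COLUMN`. The template has been set up such that any number of predictor variables can be used. The response variable is identified by name and moved to the last column automatically in the **Importing the dataset** section, so it does not have to be the last column in the .csv. Note that, after this reordering, the first two columns of the dataset are dropped before modelling.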
***

### Importing the libraries

In [1]:
# Data Formatting and Manipulation Import
import numpy as np
import pandas as pd
# Plotting Imports
import matplotlib.pyplot as plt
# Supervised Classification Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, log_loss

### Defining Constants

In [2]:
# Relative path of the input CSV file
FILE_PATH = (
    '../../data_files/pre-processed_data/regression_outputs/'
    'single_20_regression_output.csv'
)
# Name of the column that holds the class label to predict
RESPONSE_COLUMN = 'endUseLabel'

### Importing the dataset

In [3]:
# Read the CSV named by FILE_PATH
training_data = pd.read_csv(FILE_PATH)
# Reorder the columns so the response column sits last; predictors and response
# can then be separated purely by position
cols = list(training_data.columns)
cols.append(cols.pop(cols.index(RESPONSE_COLUMN)))
training_data = (
    training_data[cols]
    .iloc[:, 2:]   # discard the first two columns (presumably identifiers -- TODO confirm)
    .fillna(0)     # missing values become 0
)
# Keep only the leading (at most two) characters of each label, parsed as an integer
training_data[RESPONSE_COLUMN] = training_data[RESPONSE_COLUMN].apply(lambda label: int(str(label)[:2]))
# Drop the rows whose label code is 99
training_data = training_data.loc[training_data[RESPONSE_COLUMN].ne(99)]
# Separate predictor (all but last column) and response (last column) numpy arrays
x, y = training_data.iloc[:, :-1].values, training_data.iloc[:, -1].values

### Splitting the dataset into the Training set and Test set

In [4]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### Feature Scaling

In [5]:
# Learn the scaling parameters from the training split only, then apply that same
# transform to both splits so no test-set statistics leak into training
sc = StandardScaler().fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

In [6]:
# Row labels: one per classifier compared in this notebook
models = [
    'decision_tree',
    'knn',
    'kernel_svm',
    'logistic_regression',
    'naive_bayse',
    'random_forest',
    'extremely_random_trees',
    'bagging',
    'adaboost',
    'gradientboost',
]
# Column labels: one per comparison metric
metrics = [
    'accuracy',
    'precision',
    'recall',
    'f1_score',
    'logloss',
]
# Empty (all-NaN) table that each model section below fills in
results = pd.DataFrame(columns=metrics, index=models)

***
## Decision Tree Classification

### Training the Decision Tree Classification model on the Training set

In [7]:
# Fit a single decision tree that splits on entropy, seeded for repeatability
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

### Making the Confusion Matrix

In [8]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['decision_tree', 'accuracy'] = accuracy
results.loc['decision_tree', 'precision'] = precision
results.loc['decision_tree', 'recall'] = recall
results.loc['decision_tree', 'f1_score'] = f1
results.loc['decision_tree', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 0  0  8  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  1  0  6]]
accuracy: 0.9428571428571428
precision: 0.9523809523809522
recall: 0.9428571428571428
f1: 0.944873949579832
logloss: 1.973644365423473


***
## K-Nearest Neighbors (K-NN)

### Training the K-NN model on the Training set

In [9]:
# Fit a 5-nearest-neighbour classifier using the Minkowski metric with p=2
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Making the Confusion Matrix

In [10]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['knn', 'accuracy'] = accuracy
results.loc['knn', 'precision'] = precision
results.loc['knn', 'recall'] = recall
results.loc['knn', 'f1_score'] = f1
results.loc['knn', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 1  0  7  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  1  0  6]]
accuracy: 0.9142857142857143
precision: 0.9264069264069263
recall: 0.9142857142857143
f1: 0.9142517006802722
logloss: 1.237491301716211


***
## Kernel SVM

### Training the Kernel SVM model on the Training set

In [11]:
# Fit an RBF-kernel SVM; probability=True enables predict_proba, which the
# log-loss calculation in the next cell calls
classifier = SVC(kernel='rbf', random_state=0, probability=True).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

### Making the Confusion Matrix

In [12]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['kernel_svm', 'accuracy'] = accuracy
results.loc['kernel_svm', 'precision'] = precision
results.loc['kernel_svm', 'recall'] = recall
results.loc['kernel_svm', 'f1_score'] = f1
results.loc['kernel_svm', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  4  0  0  0  1]
 [ 0  0  9  0  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  2  1  0  4]]
accuracy: 0.8285714285714286
precision: 0.8218181818181819
recall: 0.8285714285714286
f1: 0.8101133786848073
logloss: 0.5347282236561574


  _warn_prf(average, modifier, msg_start, len(result))


***
## Logistic Regression

### Training the Logistic Regression model on the Training set

In [13]:
# Fit a logistic regression classifier with library-default settings and a fixed seed
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Making the Confusion Matrix

In [14]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['logistic_regression', 'accuracy'] = accuracy
results.loc['logistic_regression', 'precision'] = precision
results.loc['logistic_regression', 'recall'] = recall
results.loc['logistic_regression', 'f1_score'] = f1
results.loc['logistic_regression', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  4  0  0  0  1]
 [ 1  0  8  0  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  0  1  0  1]
 [ 0  0  2  1  0  4]]
accuracy: 0.8
precision: 0.7702164502164501
recall: 0.8
f1: 0.7768064850771618
logloss: 0.49058690334057675


  _warn_prf(average, modifier, msg_start, len(result))


***
## Naive Bayes

### Training the Naive Bayes model on the Training set

In [15]:
# Fit a Gaussian Naive Bayes classifier with library-default settings
classifier = GaussianNB().fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

GaussianNB(priors=None, var_smoothing=1e-09)

### Making the Confusion Matrix

In [16]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
# (the row label keeps the 'naive_bayse' spelling used when the results table was created)
results.loc['naive_bayse', 'accuracy'] = accuracy
results.loc['naive_bayse', 'precision'] = precision
results.loc['naive_bayse', 'recall'] = recall
results.loc['naive_bayse', 'f1_score'] = f1
results.loc['naive_bayse', 'logloss'] = logloss

[[3 4 0 0 0 3]
 [0 5 0 0 0 0]
 [0 0 8 0 0 1]
 [0 0 0 2 0 0]
 [0 0 0 0 2 0]
 [0 0 2 1 4 0]]
accuracy: 0.5714285714285714
precision: 0.6279365079365079
recall: 0.5714285714285714
f1: 0.5247360158638354
logloss: 13.903797760599288


***
## Random Forest Classification

### Training the Random Forest Classification model on the Training set

In [17]:
# Fit a random forest of 100 trees that split on Gini impurity, seeded for repeatability
classifier = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

### Making the Confusion Matrix

In [18]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['random_forest', 'accuracy'] = accuracy
results.loc['random_forest', 'precision'] = precision
results.loc['random_forest', 'recall'] = recall
results.loc['random_forest', 'f1_score'] = f1
results.loc['random_forest', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 0  0  8  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  1  1]
 [ 0  0  0  1  0  6]]
accuracy: 0.9142857142857143
precision: 0.9309523809523809
recall: 0.9142857142857143
f1: 0.9143977591036414
logloss: 0.3103035665234523


***
## Extremely Randomized Trees

### Training the Extremely Randomized Tree model on the Training set

In [19]:
# Fit an extremely-randomized-trees ensemble of 100 trees that split on Gini impurity
classifier = ExtraTreesClassifier(n_estimators=100, criterion='gini', random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

### Making the Confusion Matrix

In [20]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['extremely_random_trees', 'accuracy'] = accuracy
results.loc['extremely_random_trees', 'precision'] = precision
results.loc['extremely_random_trees', 'recall'] = recall
results.loc['extremely_random_trees', 'f1_score'] = f1
results.loc['extremely_random_trees', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 0  0  8  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  1  0  6]]
accuracy: 0.9428571428571428
precision: 0.9523809523809522
recall: 0.9428571428571428
f1: 0.944873949579832
logloss: 1.0546157772314815


***
## Bagging Classifier

### Training the Bagging Classifier model on the Training set

In [21]:
# Fit a bagging ensemble of 100 estimators (base estimator left at the library default)
classifier = BaggingClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=100,
                  n_jobs=None, oob_score=False, random_state=0, verbose=0,
                  warm_start=False)

### Making the Confusion Matrix

In [22]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['bagging', 'accuracy'] = accuracy
results.loc['bagging', 'precision'] = precision
results.loc['bagging', 'recall'] = recall
results.loc['bagging', 'f1_score'] = f1
results.loc['bagging', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 0  0  8  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  1  0  6]]
accuracy: 0.9428571428571428
precision: 0.9523809523809522
recall: 0.9428571428571428
f1: 0.944873949579832
logloss: 0.19613896262455327


***
## AdaBoost Classifier

### Training the AdaBoost Classifier model on the Training set

In [23]:
# Fit an AdaBoost ensemble of 100 estimators (base estimator left at the library default)
classifier = AdaBoostClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
classifier  # echo the fitted estimator as the cell output

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=0)

### Making the Confusion Matrix

In [24]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['adaboost', 'accuracy'] = accuracy
results.loc['adaboost', 'precision'] = precision
results.loc['adaboost', 'recall'] = recall
results.loc['adaboost', 'f1_score'] = f1
results.loc['adaboost', 'logloss'] = logloss

[[7 0 0 0 0 3]
 [0 4 0 0 0 1]
 [0 0 0 0 0 9]
 [0 0 0 2 0 0]
 [0 0 0 0 0 2]
 [0 0 0 1 0 6]]
accuracy: 0.5428571428571428
precision: 0.5238095238095237
recall: 0.5428571428571428
f1: 0.4937068160597572
logloss: 1.1455907793657576


  _warn_prf(average, modifier, msg_start, len(result))


***
## Gradient Boosting Classifier

### Training the Gradient Boosting Classifier model on the Training set

In [25]:
# Fit a gradient-boosting ensemble of 100 stages, seeded for repeatability.
# The loss is left at its default rather than passed as loss='deviance': the default
# is that same objective, but the 'deviance' spelling was renamed to 'log_loss' in
# newer scikit-learn releases and is rejected there, whereas omitting the argument
# works on both old and new versions.
classifier = GradientBoostingClassifier(n_estimators=100, criterion='friedman_mse', random_state=0)
classifier.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### Making the Confusion Matrix

In [26]:
# Predict the test-set labels and print the confusion matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculating and displaying comparison metrics.
# zero_division=0 explicitly scores a class that is never predicted as 0; the
# default setting yields the same value but also raises an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
# labels=classifier.classes_ keeps the probability columns aligned with the classes
# seen during training even when a class is missing from y_test; without it
# log_loss infers the label set from y_test alone.
logloss = log_loss(y_true=y_test, y_pred=classifier.predict_proba(x_test), labels=classifier.classes_)
print("accuracy: "+str(accuracy))
print("precision: "+str(precision))
print("recall: "+str(recall))
print("f1: "+str(f1))
print("logloss: "+str(logloss))

# Storing comparison metrics in dataframe
results.loc['gradientboost', 'accuracy'] = accuracy
results.loc['gradientboost', 'precision'] = precision
results.loc['gradientboost', 'recall'] = recall
results.loc['gradientboost', 'f1_score'] = f1
results.loc['gradientboost', 'logloss'] = logloss

[[10  0  0  0  0  0]
 [ 0  5  0  0  0  0]
 [ 0  0  8  0  0  1]
 [ 0  0  0  2  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  1  0  6]]
accuracy: 0.9428571428571428
precision: 0.9523809523809522
recall: 0.9428571428571428
f1: 0.944873949579832
logloss: 0.21361263465331412


***
# Summary of Results

In [27]:
# Display the table of comparison metrics collected for every model above
results

Unnamed: 0,accuracy,precision,recall,f1_score,logloss
decision_tree,0.942857,0.952381,0.942857,0.944874,1.97364
knn,0.914286,0.926407,0.914286,0.914252,1.23749
kernel_svm,0.828571,0.821818,0.828571,0.810113,0.534728
logistic_regression,0.8,0.770216,0.8,0.776806,0.490587
naive_bayse,0.571429,0.627937,0.571429,0.524736,13.9038
random_forest,0.914286,0.930952,0.914286,0.914398,0.310304
extremely_random_trees,0.942857,0.952381,0.942857,0.944874,1.05462
bagging,0.942857,0.952381,0.942857,0.944874,0.196139
adaboost,0.542857,0.52381,0.542857,0.493707,1.14559
gradientboost,0.942857,0.952381,0.942857,0.944874,0.213613


In [28]:
# Cast the table to float so the min/max lookups below compare numeric values
results = results.astype(float)
# Log loss is the last column and lower is better: report the model with the smallest value
lowest_logloss_model = results.iloc[:, -1].idxmin()
print('logloss\t     ' + lowest_logloss_model)
# Every other column is higher-is-better: report the top model per metric.
# idxmax returns the first row label on ties, so models that tie for the best
# score are not all listed.
best_per_metric = results.iloc[:, :-1].idxmax()
print(best_per_metric)

logloss	     bagging
accuracy     decision_tree
precision    decision_tree
recall       decision_tree
f1_score     decision_tree
dtype: object
