# Binary Classification on Diseases

## Import

In [1]:
%config InlineBackend.figure_format = 'retina'
import time
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Shared notebook state.
# `datasets` is filled by the preprocessing cells below, one array per disease,
# in the same order as `dataset_names`.
datasets = []
dataset_names = [
    'Breast Cancer',
    'Diabetes',
    'Heart Disease',
    'Hepatitis',
    'Liver Patients',
    "Parkinson's Disease",
    'Tumors',
]
# Running figure index; draw_heatmap uses it to name the saved image and increments it.
count = 0
# Silence every library warning for the rest of the notebook.
warnings.filterwarnings('ignore')

### Breast Cancer

In [3]:
# Read the breast-cancer ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/breast/breast.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,﻿id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,2.0
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,2.0
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,2.0


In [4]:
values = df.values
# Column 0 is the sample id, not a feature: keep everything after it.
breast_data = values[:, 1:]
# Recode the class column: 2 (healthy) -> 0, 4 (malignant) -> 1; other values stay as they are.
raw_class = breast_data[:, -1]
breast_data[:, -1] = np.where(raw_class == 4, 1, np.where(raw_class == 2, 0, raw_class))
datasets.append(breast_data)
# Preview: first five rows, then shape, then dtype.
print(breast_data[:5], breast_data.shape, breast_data.dtype, sep='\n')

[[ 5.  1.  1.  1.  2.  1.  3.  1.  1.  0.]
 [ 5.  4.  4.  5.  7. 10.  3.  2.  1.  0.]
 [ 3.  1.  1.  1.  2.  2.  3.  1.  1.  0.]
 [ 6.  8.  8.  1.  3.  4.  3.  7.  1.  0.]
 [ 4.  1.  1.  3.  2.  1.  3.  1.  1.  0.]]
(683, 10)
float64


### Diabetes

In [5]:
# Read the diabetes ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/diabetes/diabetes.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,Class
0,1.0,1.0,22.0,22.0,22.0,19.0,18.0,14.0,49.895756,17.775994,5.27092,0.771761,0.018632,0.006864,0.003923,0.003923,0.486903,0.100025,1.0,b'0'
1,1.0,1.0,24.0,24.0,22.0,18.0,16.0,13.0,57.709936,23.799994,3.325423,0.234185,0.003903,0.003903,0.003903,0.003903,0.520908,0.144414,0.0,b'0'
2,1.0,1.0,62.0,60.0,59.0,54.0,47.0,33.0,55.831441,27.993933,12.687485,4.852282,1.393889,0.373252,0.041817,0.007744,0.530904,0.128548,0.0,b'1'
3,1.0,1.0,55.0,53.0,53.0,50.0,43.0,31.0,40.467228,18.445954,9.118901,3.079428,0.840261,0.272434,0.007653,0.001531,0.483284,0.11479,0.0,b'0'
4,1.0,1.0,44.0,44.0,44.0,41.0,39.0,27.0,18.026254,8.570709,0.410381,0.0,0.0,0.0,0.0,0.0,0.475935,0.123572,0.0,b'1'


In [6]:
values = df.values
diabetes_data = values[:, :]
# The Class column arrives as byte strings: b'0' (healthy) -> 0, b'1' (DR) -> 1.
for raw_label, encoded_label in ((b'0', 0), (b'1', 1)):
    diabetes_data[diabetes_data[:, -1] == raw_label, -1] = encoded_label
# With the labels numeric, the whole array can be converted to float64.
diabetes_data = diabetes_data.astype('float64')
datasets.append(diabetes_data)
# Preview: first five rows, then shape, then dtype.
print(diabetes_data[:5], diabetes_data.shape, diabetes_data.dtype, sep='\n')

[[1.0000000e+00 1.0000000e+00 2.2000000e+01 2.2000000e+01 2.2000000e+01
  1.9000000e+01 1.8000000e+01 1.4000000e+01 4.9895756e+01 1.7775994e+01
  5.2709200e+00 7.7176100e-01 1.8632000e-02 6.8640000e-03 3.9230000e-03
  3.9230000e-03 4.8690300e-01 1.0002500e-01 1.0000000e+00 0.0000000e+00]
 [1.0000000e+00 1.0000000e+00 2.4000000e+01 2.4000000e+01 2.2000000e+01
  1.8000000e+01 1.6000000e+01 1.3000000e+01 5.7709936e+01 2.3799994e+01
  3.3254230e+00 2.3418500e-01 3.9030000e-03 3.9030000e-03 3.9030000e-03
  3.9030000e-03 5.2090800e-01 1.4441400e-01 0.0000000e+00 0.0000000e+00]
 [1.0000000e+00 1.0000000e+00 6.2000000e+01 6.0000000e+01 5.9000000e+01
  5.4000000e+01 4.7000000e+01 3.3000000e+01 5.5831441e+01 2.7993933e+01
  1.2687485e+01 4.8522820e+00 1.3938890e+00 3.7325200e-01 4.1817000e-02
  7.7440000e-03 5.3090400e-01 1.2854800e-01 0.0000000e+00 1.0000000e+00]
 [1.0000000e+00 1.0000000e+00 5.5000000e+01 5.3000000e+01 5.3000000e+01
  5.0000000e+01 4.3000000e+01 3.1000000e+01 4.0467228e+01 1.8

### Heart Disease

In [7]:
# Read the heart-disease ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/heart/heart.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,﻿age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


In [8]:
values = df.values
heart_data = values[:, :]
# Binarise the last column: 0 (healthy) stays 0, any positive value (heart disease) becomes 1.
# Both masks are taken before writing; the first write only stores 0 over 0, so the order is irrelevant.
is_healthy = heart_data[:, -1] == 0
is_diseased = heart_data[:, -1] > 0
heart_data[is_healthy, -1] = 0
heart_data[is_diseased, -1] = 1
datasets.append(heart_data)
# Preview: first five rows, then shape, then dtype.
print(heart_data[:5], heart_data.shape, heart_data.dtype, sep='\n')

[[ 63.    1.    1.  145.  233.    1.    2.  150.    0.    2.3   3.    0.
    6.    0. ]
 [ 67.    1.    4.  160.  286.    0.    2.  108.    1.    1.5   2.    3.
    3.    1. ]
 [ 67.    1.    4.  120.  229.    0.    2.  129.    1.    2.6   2.    2.
    7.    1. ]
 [ 37.    1.    3.  130.  250.    0.    0.  187.    0.    3.5   3.    0.
    3.    0. ]
 [ 41.    0.    2.  130.  204.    0.    2.  172.    0.    1.4   1.    0.
    3.    0. ]]
(297, 14)
float64


### Hepatitis

In [9]:
# Read the hepatitis ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/hepatitis/hepatitis.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,﻿age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver big,liver firm,spleen palpable,spiders,ascites,varices,bilirubin,alk phosphate,sgot,albumin,protime,histology,class
5,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.9,95.0,28.0,4.0,75.0,1.0,2.0
10,39.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.3,78.0,30.0,4.4,85.0,1.0,2.0
11,32.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,59.0,249.0,3.7,54.0,1.0,2.0
12,41.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,81.0,60.0,3.9,52.0,1.0,2.0
13,30.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.2,57.0,144.0,4.9,78.0,1.0,2.0


In [10]:
values = df.values
hepatitis_data = values[:, :]
# Recode the class column: 2 (healthy) -> 0; every other value (1 = perished) is kept.
outcome = hepatitis_data[:, -1]
hepatitis_data[:, -1] = np.where(outcome == 2, 0, outcome)
datasets.append(hepatitis_data)
# Preview: first five rows, then shape, then dtype.
print(hepatitis_data[:5], hepatitis_data.shape, hepatitis_data.dtype, sep='\n')

[[ 34.    1.    2.    2.    2.    2.    2.    2.    2.    2.    2.    2.
    2.    0.9  95.   28.    4.   75.    1.    0. ]
 [ 39.    1.    1.    1.    2.    2.    2.    1.    1.    2.    2.    2.
    2.    1.3  78.   30.    4.4  85.    1.    0. ]
 [ 32.    1.    2.    1.    1.    2.    2.    2.    1.    2.    1.    2.
    2.    1.   59.  249.    3.7  54.    1.    0. ]
 [ 41.    1.    2.    1.    1.    2.    2.    2.    1.    2.    2.    2.
    2.    0.9  81.   60.    3.9  52.    1.    0. ]
 [ 30.    1.    2.    2.    1.    2.    2.    2.    1.    2.    2.    2.
    2.    2.2  57.  144.    4.9  78.    1.    0. ]]
(80, 20)
float64


### Liver Patients

In [11]:
# Read the liver-patients ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/liver/liver.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,age,gender,tb,db,alkphos,sgpt,sgot,tp,alb,ag,class
0,65.0,b'Female',0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1.0
1,62.0,b'Male',10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,62.0,b'Male',7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,58.0,b'Male',1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1.0
4,72.0,b'Male',3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1.0


In [12]:
values = df.values
liver_data = values[:, :]
# Class column: 2 (healthy) -> 0; 1 (unhealthy) is already the positive label.
is_healthy = liver_data[:, -1] == 2
liver_data[is_healthy, -1] = 0
# Gender column (index 1) arrives as byte strings: male -> 0, female -> 1.
for gender_label, gender_code in ((b'Male', 0), (b'Female', 1)):
    liver_data[liver_data[:, 1] == gender_label, 1] = gender_code
# Every column is numeric now, so the array can be converted to float64.
liver_data = liver_data.astype('float64')
datasets.append(liver_data)
# Preview: first five rows, then shape, then dtype.
print(liver_data[:5], liver_data.shape, liver_data.dtype, sep='\n')

[[6.50e+01 1.00e+00 7.00e-01 1.00e-01 1.87e+02 1.60e+01 1.80e+01 6.80e+00
  3.30e+00 9.00e-01 1.00e+00]
 [6.20e+01 0.00e+00 1.09e+01 5.50e+00 6.99e+02 6.40e+01 1.00e+02 7.50e+00
  3.20e+00 7.40e-01 1.00e+00]
 [6.20e+01 0.00e+00 7.30e+00 4.10e+00 4.90e+02 6.00e+01 6.80e+01 7.00e+00
  3.30e+00 8.90e-01 1.00e+00]
 [5.80e+01 0.00e+00 1.00e+00 4.00e-01 1.82e+02 1.40e+01 2.00e+01 6.80e+00
  3.40e+00 1.00e+00 1.00e+00]
 [7.20e+01 0.00e+00 3.90e+00 2.00e+00 1.95e+02 2.70e+01 5.90e+01 7.30e+00
  2.40e+00 4.00e-01 1.00e+00]]
(579, 11)
float64


### Parkinson's Disease

In [13]:
# Read the Parkinson's ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/pd_data/pd_data.arff')
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,﻿MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(\%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,1.0
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,1.0
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,1.0
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,1.0
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,1.0


In [14]:
values = df.values
# No relabelling is applied here; presumably the trailing `status` column
# is already 0/1 -- TODO confirm against the data file.
pd_data = values[:, :]
datasets.append(pd_data)
# Preview: first five rows, then shape, then dtype.
print(pd_data[:5], pd_data.shape, pd_data.dtype, sep='\n')

[[ 1.199920e+02  1.573020e+02  7.499700e+01  7.840000e-03  7.000000e-05
   3.700000e-03  5.540000e-03  1.109000e-02  4.374000e-02  4.260000e-01
   2.182000e-02  3.130000e-02  2.971000e-02  6.545000e-02  2.211000e-02
   2.103300e+01  4.147830e-01  8.152850e-01 -4.813031e+00  2.664820e-01
   2.301442e+00  2.846540e-01  1.000000e+00]
 [ 1.224000e+02  1.486500e+02  1.138190e+02  9.680000e-03  8.000000e-05
   4.650000e-03  6.960000e-03  1.394000e-02  6.134000e-02  6.260000e-01
   3.134000e-02  4.518000e-02  4.368000e-02  9.403000e-02  1.929000e-02
   1.908500e+01  4.583590e-01  8.195210e-01 -4.075192e+00  3.355900e-01
   2.486855e+00  3.686740e-01  1.000000e+00]
 [ 1.166820e+02  1.311110e+02  1.115550e+02  1.050000e-02  9.000000e-05
   5.440000e-03  7.810000e-03  1.633000e-02  5.233000e-02  4.820000e-01
   2.757000e-02  3.858000e-02  3.590000e-02  8.270000e-02  1.309000e-02
   2.065100e+01  4.298950e-01  8.252880e-01 -4.443179e+00  3.111730e-01
   2.342259e+00  3.326340e-01  1.000000e+00]
 

### Primary Tumors

In [15]:
# Read the primary-tumor ARFF file; element 0 of the result holds the records.
data = arff.loadarff('./dataset/tumor/tumor.arff')
# The 'histologic' column is kept; the statement that would drop it stays disabled:
# del df['histologic']
# Build the frame and discard every row that contains a missing value.
df = pd.DataFrame(data[0]).dropna()
df.head()

Unnamed: 0,﻿age,sex,histologic,degree,bone,bone marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal,class
2,1.0,2.0,2.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0
6,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0
7,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
8,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
10,2.0,1.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0


In [16]:
values = df.values
tumor_data = values[:, :]
# One-vs-rest labels: class 1 (lung tumor) stays 1, every class above 1 (other tumors) becomes 0.
tumor_class = tumor_data[:, -1]
tumor_data[:, -1] = np.where(tumor_class > 1, 0, tumor_class)
datasets.append(tumor_data)
# Preview: first five rows, then shape, then dtype.
print(tumor_data[:5], tumor_data.shape, tumor_data.dtype, sep='\n')

[[1. 2. 2. 3. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2. 1.]
 [2. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 1. 1. 1. 2. 2. 2. 1.]
 [2. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]
 [2. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2. 2. 2. 2. 1.]
 [2. 1. 1. 3. 1. 2. 2. 1. 2. 2. 2. 2. 2. 1. 2. 1. 2. 1.]]
(132, 18)
float64


## Classifiers

In [17]:
def draw_heatmap(accuracy, descriptor, parameters, parameter_label):
    """Plot accuracies as an annotated heatmap, save it, and show it.

    The figure is written to ``./Graphs/<count>.jpg`` where ``count`` is the
    module-level figure counter, which is incremented after every call.

    Args:
        accuracy: 2-D array of accuracies to plot. The call sites in this
            notebook pass a single column (``reshape(-1, 1)``) with one row
            per hyper-parameter value.
        descriptor: Text placed at the start of the title (e.g. 'SVM Training').
        parameters: Hyper-parameter values used as y tick labels.
        parameter_label: Hyper-parameter symbol, rendered in math mode in the
            y-axis label and the title.
    """
    import os  # local import: only needed to make sure the output folder exists

    global count
    plt.figure(figsize=(2, 4))
    ax = sns.heatmap(accuracy, annot=True, fmt='.3f', yticklabels=parameters, xticklabels=[])
    ax.collections[0].colorbar.set_label("Accuracy")
    ax.set(ylabel='$' + parameter_label + '$')
    # NOTE(review): widening the y-limits by half a cell looks like the usual
    # workaround for heatmap rows being clipped in matplotlib 3.1.1 -- confirm
    # whether it is still needed with the installed version.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.title(descriptor + ' w.r.t $' + parameter_label + '$')
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('./Graphs', exist_ok=True)
    plt.savefig('./Graphs/' + str(count) + '.jpg', bbox_inches='tight')
    plt.show()
    count += 1

In [18]:
def svm_classifier(train_data, train_label, test_data, test_label, plot=False):
    """Tune a linear-kernel SVM over C with 5-fold cross-validation.

    Args:
        train_data: Feature matrix the grid search is fitted on.
        train_label: Labels matching ``train_data``.
        test_data: Held-out feature matrix, used only for the final score.
        test_label: Labels matching ``test_data``.
        plot: When True, draw the per-C training and validation accuracy
            heatmaps with ``draw_heatmap``. Defaults to False (no plots).

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``: the mean cross-validation
        training score of the selected C, and the score on the test split of
        the estimator refit with that C on the whole training split.
    """
    C_list = [10**-3, 10**-2, 10**-1, 1, 10]
    parameters = {'C': C_list}
    classifier = GridSearchCV(SVC(kernel='linear'), parameters, cv=5, return_train_score=True)
    classifier.fit(train_data, train_label)

    training_accuracy = classifier.cv_results_['mean_train_score']
    validation_accuracy = classifier.cv_results_['mean_test_score']

    if plot:
        draw_heatmap(training_accuracy.reshape(-1, 1), 'SVM Training', C_list, 'C')
        draw_heatmap(validation_accuracy.reshape(-1, 1), 'SVM Validation', C_list, 'C')

    # best_index_ is the cv_results_ row that produced best_params_, so there is
    # no need to search C_list by value (which could leave the result unassigned).
    train_accuracy = training_accuracy[classifier.best_index_]

    # GridSearchCV refits the best configuration on the whole training split by
    # default (refit=True); reuse that estimator instead of fitting a second one.
    test_accuracy = classifier.best_estimator_.score(test_data, test_label)

    return train_accuracy, test_accuracy

In [19]:
def rf_classifier(train_data, train_label, test_data, test_label, plot=False):
    """Tune an entropy-criterion random forest over max_depth with 5-fold cross-validation.

    Args:
        train_data: Feature matrix the grid search is fitted on.
        train_label: Labels matching ``train_data``.
        test_data: Held-out feature matrix, used only for the final score.
        test_label: Labels matching ``test_data``.
        plot: When True, draw the per-depth training and validation accuracy
            heatmaps with ``draw_heatmap``. Defaults to False (no plots).

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``: the mean cross-validation
        training score of the selected depth, and the score on the test split
        of the forest refit with that depth on the whole training split.
    """
    D_list = np.array([1, 2, 3, 4, 5])
    parameters = {'max_depth': D_list}
    classifier = GridSearchCV(RandomForestClassifier(criterion="entropy"), parameters, cv=5, return_train_score=True)
    classifier.fit(train_data, train_label)

    training_accuracy = classifier.cv_results_['mean_train_score']
    validation_accuracy = classifier.cv_results_['mean_test_score']

    if plot:
        draw_heatmap(training_accuracy.reshape(-1, 1), 'RF Training', D_list, 'D')
        draw_heatmap(validation_accuracy.reshape(-1, 1), 'RF Validation', D_list, 'D')

    # best_index_ is the cv_results_ row that produced best_params_, so there is
    # no need to search D_list by value (which could leave the result unassigned).
    train_accuracy = training_accuracy[classifier.best_index_]

    # GridSearchCV refits the best configuration on the whole training split by
    # default (refit=True). Scoring that estimator avoids growing a second,
    # differently-randomised forest.
    test_accuracy = classifier.best_estimator_.score(test_data, test_label)

    return train_accuracy, test_accuracy

In [20]:
def dt_classifier(train_data, train_label, test_data, test_label, plot=False):
    """Tune an entropy-criterion decision tree over max_depth with 5-fold cross-validation.

    Args:
        train_data: Feature matrix the grid search is fitted on.
        train_label: Labels matching ``train_data``.
        test_data: Held-out feature matrix, used only for the final score.
        test_label: Labels matching ``test_data``.
        plot: When True, draw the per-depth training and validation accuracy
            heatmaps with ``draw_heatmap``. Defaults to False (no plots).

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``: the mean cross-validation
        training score of the selected depth, and the score on the test split
        of the tree refit with that depth on the whole training split.
    """
    D_list = np.array([1, 2, 3, 4, 5])
    parameters = {'max_depth': D_list}
    classifier = GridSearchCV(DecisionTreeClassifier(criterion="entropy"), parameters, cv=5, return_train_score=True)
    classifier.fit(train_data, train_label)

    training_accuracy = classifier.cv_results_['mean_train_score']
    validation_accuracy = classifier.cv_results_['mean_test_score']

    if plot:
        draw_heatmap(training_accuracy.reshape(-1, 1), 'DT Training', D_list, 'D')
        draw_heatmap(validation_accuracy.reshape(-1, 1), 'DT Validation', D_list, 'D')

    # best_index_ is the cv_results_ row that produced best_params_, so there is
    # no need to search D_list by value (which could leave the result unassigned).
    train_accuracy = training_accuracy[classifier.best_index_]

    # GridSearchCV refits the best configuration on the whole training split by
    # default (refit=True); reuse that estimator instead of fitting a second one.
    test_accuracy = classifier.best_estimator_.score(test_data, test_label)

    return train_accuracy, test_accuracy

In [21]:
def knn_classifier(train_data, train_label, test_data, test_label, plot=False):
    """Tune a k-nearest-neighbours classifier over n_neighbors with 5-fold cross-validation.

    Args:
        train_data: Feature matrix the grid search is fitted on.
        train_label: Labels matching ``train_data``.
        test_data: Held-out feature matrix, used only for the final score.
        test_label: Labels matching ``test_data``.
        plot: When True, draw the per-K training and validation accuracy
            heatmaps with ``draw_heatmap``. Defaults to False (no plots).

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``: the mean cross-validation
        training score of the selected K, and the score on the test split of
        the classifier refit with that K on the whole training split.
    """
    K_list = np.array([1, 3, 5, 7, 9, 11])
    parameters = {'n_neighbors': K_list}
    classifier = GridSearchCV(KNeighborsClassifier(), parameters, cv=5, return_train_score=True)
    classifier.fit(train_data, train_label)

    training_accuracy = classifier.cv_results_['mean_train_score']
    validation_accuracy = classifier.cv_results_['mean_test_score']

    if plot:
        draw_heatmap(training_accuracy.reshape(-1, 1), 'KNN Training', K_list, 'K')
        draw_heatmap(validation_accuracy.reshape(-1, 1), 'KNN Validation', K_list, 'K')

    # best_index_ is the cv_results_ row that produced best_params_, so there is
    # no need to search K_list by value (which could leave the result unassigned).
    train_accuracy = training_accuracy[classifier.best_index_]

    # GridSearchCV refits the best configuration on the whole training split by
    # default (refit=True); reuse that estimator instead of fitting a second one.
    test_accuracy = classifier.best_estimator_.score(test_data, test_label)

    return train_accuracy, test_accuracy

In [22]:
def logit_classifier(train_data, train_label, test_data, test_label, plot=False):
    """Tune a liblinear logistic regression over C with 5-fold cross-validation.

    Args:
        train_data: Feature matrix the grid search is fitted on.
        train_label: Labels matching ``train_data``.
        test_data: Held-out feature matrix, used only for the final score.
        test_label: Labels matching ``test_data``.
        plot: When True, draw the per-C training and validation accuracy
            heatmaps with ``draw_heatmap``. Defaults to False (no plots).

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``: the mean cross-validation
        training score of the selected C, and the score on the test split of
        the model refit with that C on the whole training split.
    """
    C_list = np.array([10**-3, 10**-2, 10**-1, 1, 10])
    parameters = {'C': C_list}
    classifier = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=5, return_train_score=True)
    classifier.fit(train_data, train_label)

    training_accuracy = classifier.cv_results_['mean_train_score']
    validation_accuracy = classifier.cv_results_['mean_test_score']

    if plot:
        draw_heatmap(training_accuracy.reshape(-1, 1), 'LOGIT Training', C_list, 'C')
        draw_heatmap(validation_accuracy.reshape(-1, 1), 'LOGIT Validation', C_list, 'C')

    # best_index_ is the cv_results_ row that produced best_params_. Indexing with
    # it avoids a float equality search through C_list, which could leave the
    # result unassigned.
    train_accuracy = training_accuracy[classifier.best_index_]

    # GridSearchCV refits the best configuration on the whole training split by
    # default (refit=True); reuse that estimator instead of fitting a second one.
    test_accuracy = classifier.best_estimator_.score(test_data, test_label)

    return train_accuracy, test_accuracy

## Classification

In [23]:
start_time = time.time()

# Seed the global NumPy generator so the shuffles below are repeatable; estimators
# created without random_state (e.g. the random forest) draw from the same generator.
SEED = 0
np.random.seed(SEED)

partition_values = [0.8, 0.5, 0.2]  # fraction of every dataset used for training
trials = 3                          # independent shuffles averaged per partition

# (display name, tuning function) pairs, in the order the results are reported.
classifiers = [
    ('Support Vector Machine', svm_classifier),
    ('Random Forest', rf_classifier),
    ('Decision Tree', dt_classifier),
    ('K Nearest Neighbors', knn_classifier),
    ('Logistic Regression', logit_classifier),
]

# Each preprocessing cell appends to `datasets`; running one of them twice would
# silently pair arrays with the wrong names, so fail loudly instead.
assert len(datasets) == len(dataset_names), \
    "datasets and dataset_names are out of sync - restart the kernel and run all cells"

for dataset_name, dataset in zip(dataset_names, datasets):
    print("\n", dataset_name)
    for partition in partition_values:
        train_scores = {label: [] for label, _ in classifiers}
        test_scores = {label: [] for label, _ in classifiers}

        for trial in range(trials):
            # Shuffle a copy of the rows: the arrays stored in `datasets` are left
            # untouched, so re-running this cell starts from the same data.
            shuffled = np.random.permutation(dataset)
            split = int(partition * len(shuffled))

            # The last column is the label, everything before it is a feature.
            train_data = shuffled[:split, :-1]
            train_label = shuffled[:split, -1]
            test_data = shuffled[split:, :-1]
            test_label = shuffled[split:, -1]

            for label, tune in classifiers:
                train_accuracy, test_accuracy = tune(train_data, train_label, test_data, test_label)
                train_scores[label].append(train_accuracy)
                test_scores[label].append(test_accuracy)

        # Display results
        print("Partition: ", partition)
        for label, _ in classifiers:
            print("Average train accuracy for " + label + " = ", round(sum(train_scores[label]) / trials, 3))
        for label, _ in classifiers:
            print("Average test accuracy for " + label + " = ", round(sum(test_scores[label]) / trials, 3))

print("\n---------------------------------------------------------------------")
print("Program terminated without error, total time elapsed (w/o heatmap): %.1f seconds." % (time.time() - start_time))


 Breast Cancer
Partition:  0.8
Average train accuracy for Support Vector Machine =  0.975
Average train accuracy for Random Forest =  0.984
Average train accuracy for Decision Tree =  0.985
Average train accuracy for K Nearest Neighbors =  0.982
Average train accuracy for Logistic Regression =  0.973
Average test accuracy for Support Vector Machine =  0.968
Average test accuracy for Random Forest =  0.964
Average test accuracy for Decision Tree =  0.951
Average test accuracy for K Nearest Neighbors =  0.966
Average test accuracy for Logistic Regression =  0.968
Partition:  0.5
Average train accuracy for Support Vector Machine =  0.971
Average train accuracy for Random Forest =  0.979
Average train accuracy for Decision Tree =  0.968
Average train accuracy for K Nearest Neighbors =  0.979
Average train accuracy for Logistic Regression =  0.971
Average test accuracy for Support Vector Machine =  0.966
Average test accuracy for Random Forest =  0.97
Average test accuracy for Decision Tre

Partition:  0.8
Average train accuracy for Support Vector Machine =  0.875
Average train accuracy for Random Forest =  0.985
Average train accuracy for Decision Tree =  0.939
Average train accuracy for K Nearest Neighbors =  0.93
Average train accuracy for Logistic Regression =  0.86
Average test accuracy for Support Vector Machine =  0.889
Average test accuracy for Random Forest =  0.915
Average test accuracy for Decision Tree =  0.88
Average test accuracy for K Nearest Neighbors =  0.829
Average test accuracy for Logistic Regression =  0.863
Partition:  0.5
Average train accuracy for Support Vector Machine =  0.889
Average train accuracy for Random Forest =  0.983
Average train accuracy for Decision Tree =  0.916
Average train accuracy for K Nearest Neighbors =  0.915
Average train accuracy for Logistic Regression =  0.878
Average test accuracy for Support Vector Machine =  0.857
Average test accuracy for Random Forest =  0.867
Average test accuracy for Decision Tree =  0.833
Average