In [1]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from parse import preprocess

In [2]:
def classification_report_with_accuracy_score(y_true, y_pred):
    """Print a classification report and return the accuracy.

    Written to be wrapped with ``make_scorer`` so that every
    cross-validation fold prints its own report while the fold's
    accuracy is what gets collected as the score.

    Parameters
    ----------
    y_true : true labels, passed straight to the sklearn metrics.
    y_pred : predicted labels, passed straight to the sklearn metrics.

    Returns
    -------
    The value of ``accuracy_score(y_true, y_pred)``.
    """
    report = classification_report(y_true, y_pred)
    print(report)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [3]:
# Load the raw CSV through the preprocess() helper imported from `parse`;
# the summary shown under this cell is printed during that call.
raw_csv_path = "rawfile_blood.csv"
data = preprocess(raw_csv_path)

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [4]:
# Count rows of data for each condition (label).
#
# value_counts() tallies every label in one vectorised pass and does not care
# how the frame is indexed, whereas a positional `data.at[i, ...]` loop only
# works while the index is exactly 0..len(data)-1.
condition_counts = data['condition'].value_counts()

# A label that never occurs is reported as 0, the same value a per-label
# counter would have kept if no row matched it.
frail = int(condition_counts.get('frail', 0))
frail_mci = int(condition_counts.get('frail_mci', 0))
mci = int(condition_counts.get('mci', 0))
prefrail_mci = int(condition_counts.get('prefrail_mci', 0))
prefrail = int(condition_counts.get('prefrail', 0))
robust = int(condition_counts.get('robust', 0))

# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 231
Prefrail: 221
Robust: 339


In [5]:
# Preview the first rows of the pre-parsed frame
data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,frail,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,frail,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,frail,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,frail,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [6]:
# List the column labels of the frame
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [7]:
# Grouping MCI and Robust
#
# Grouping:
# MCI, Prefrail_MCI, Frail_MCI --> MCI
# Prefrail, Frail, Robust --> Robust

# Only the labels that actually change are listed: 'mci', 'robust' and any
# other value pass through replace() untouched, so re-running this cell is a
# no-op. The mapping is applied to the whole column at once and assigned back,
# which does not depend on the frame having a 0..n-1 index the way a
# positional `data.at[i, ...]` loop does.
condition_groups = {
    'frail': 'robust',
    'frail_mci': 'mci',
    'prefrail_mci': 'mci',
    'prefrail': 'robust',
}
data['condition'] = data['condition'].replace(condition_groups)

# Frequencies of the grouped labels (most frequent first) and their label index
c = data['condition'].value_counts()
condition = c.index

c

robust    567
mci       440
Name: condition, dtype: int64

In [8]:
# Run Classification using 80/20 Train-Test Split
#
# Every classifier is trained and scored on the same feature matrix, the same
# encoded labels and the same split, so the data preparation is done once
# here instead of being repeated for each model.

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']

# Encode the labels as integers in order of descending class frequency
# (most frequent label -> 0). The encoded column is assigned back explicitly
# instead of calling `data['condition'].replace(..., inplace = True)`: an
# in-place method on a column selection is chained assignment, which pandas
# warns about and which leaves `data` unchanged under Copy-on-Write.
# Re-running is harmless while the frequency order stays the same, because
# already-encoded labels then map onto themselves.
c = data['condition'].value_counts()
condition = c.index
label_codes = {label: code for code, label in enumerate(condition)}
data['condition'] = data['condition'].map(label_codes)

# Labels are read only after encoding so `y` never depends on an in-place
# update propagating through an earlier reference to the column.
y = data['condition']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Each model keeps its own module-level name because later cells refer to
# them (e.g. log_model, lda_model).
# NOTE: LogisticRegression reports that it hit its iteration limit on these
# unscaled features (see the warning in this cell's output); enabling one of
# the scalers above or raising max_iter is the usual remedy.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
# The tree and the forest are randomised; a fixed seed makes their scores
# repeatable from run to run.
cart_model = DecisionTreeClassifier(random_state = 1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state = 1)

models = [
    ("Logistic Regression", log_model),
    ("Linear Discriminant Analysis", lda_model),
    ("K-Nearest Neigbors", knn_model),
    ("Classification and Regression Trees", cart_model),
    ("Gaussian Naive Bayes", gnb_model),
    ("Support Vector Machines", svm_model),
    ("Random Forest Classifier", rfc_model),
]

# Fit on the training part and report accuracy on the held-out part.
# Built-in round() is used so this works whether score() hands back a numpy
# scalar or a plain Python float.
for model_name, model in models:
    model.fit(X_train, y_train)
    print(model_name + ":", round(model.score(X_test, y_test), 3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.584
Linear Discriminant Analysis: 0.599
K-Nearest Neigbors: 0.525
Classification and Regression Trees: 0.584
Gaussian Naive Bayes: 0.619
Support Vector Machines: 0.584
Random Forest Classifier: 0.599


In [9]:
# Cross Validation Score
#
# 10-fold cross-validation accuracy (mean and standard deviation) for every
# classifier, all evaluated on the same feature matrix and encoded labels.

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']

# Encode the labels as integers in order of descending class frequency
# (most frequent label -> 0). The encoded column is assigned back explicitly
# instead of calling `data['condition'].replace(..., inplace = True)`: an
# in-place method on a column selection is chained assignment, which pandas
# warns about and which leaves `data` unchanged under Copy-on-Write.
# If the column is already encoded, the labels map onto themselves as long as
# the frequency order is unchanged.
c = data['condition'].value_counts()
condition = c.index
label_codes = {label: code for code, label in enumerate(condition)}
data['condition'] = data['condition'].map(label_codes)

# Labels are read only after encoding so `y` never depends on an in-place
# update propagating through an earlier reference to the column.
y = data['condition']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

# Not used by the cross-validation below; kept so X_train / X_test / y_train /
# y_test stay defined for any later cell that expects them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Each model keeps its own module-level name because later cells refer to
# them (e.g. log_model, lda_model).
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
# The tree and the forest are randomised; a fixed seed makes their scores
# repeatable from run to run.
cart_model = DecisionTreeClassifier(random_state = 1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state = 1)

models = [
    ("Logistic Regression", log_model),
    ("Linear Discriminant Analysis", lda_model),
    ("K-Nearest Neighbors", knn_model),
    ("Classification and Regression Trees", cart_model),
    ("Gaussian Naive Bayes", gnb_model),
    ("Support Vector Machines", svm_model),
    ("Random Forest Classifier", rfc_model),
]

for model_name, model in models:
    # cross_val_score fits clones of the estimator, so this fit has no effect
    # on the scores; it is kept so each model object is left fitted on the
    # full data set after the cell has run.
    model.fit(X, y)
    scores = cross_val_score(model, X, y, cv=10)
    print("%s: %0.2f accuracy with a standard deviation of %0.2f" % (model_name, scores.mean(), scores.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Logistic Regression: 0.63 accuracy with a standard deviation of 0.05
Linear Discriminant Analysis: 0.64 accuracy with a standard deviation of 0.06
K-Nearest Neighbors: 0.55 accuracy with a standard deviation of 0.06
Classification and Regression Trees: 0.52 accuracy with a standard deviation of 0.05
Gaussian Naive Bayes: 0.62 accuracy with a standard deviation of 0.03
Support Vector Machines: 0.57 accuracy with a standard deviation of 0.00
Random Forest Classifier: 0.62 accuracy with a standard deviation of 0.05


In [10]:
# Per-fold cross-validation results: the scorer prints a classification
# report for each fold and yields that fold's accuracy.

# Logistic Regression
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(estimator=log_model, X=X, y=y, scoring=report_scorer, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.64      0.79      0.71        57
           1       0.61      0.43      0.51        44

    accuracy                           0.63       101
   macro avg       0.63      0.61      0.61       101
weighted avg       0.63      0.63      0.62       101

              precision    recall  f1-score   support

           0       0.64      0.68      0.66        57
           1       0.55      0.50      0.52        44

    accuracy                           0.60       101
   macro avg       0.59      0.59      0.59       101
weighted avg       0.60      0.60      0.60       101



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.59      0.67      0.63        57
           1       0.49      0.41      0.44        44

    accuracy                           0.55       101
   macro avg       0.54      0.54      0.54       101
weighted avg       0.55      0.55      0.55       101

              precision    recall  f1-score   support

           0       0.64      0.86      0.74        57
           1       0.68      0.39      0.49        44

    accuracy                           0.65       101
   macro avg       0.66      0.62      0.61       101
weighted avg       0.66      0.65      0.63       101

              precision    recall  f1-score   support

           0       0.59      0.91      0.72        57
           1       0.62      0.18      0.28        44

    accuracy                           0.59       101
   macro avg       0.60      0.55      0.50       101
weighted avg       0.60      0.59      0.53       101



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.65      0.75      0.70        57
           1       0.60      0.48      0.53        44

    accuracy                           0.63       101
   macro avg       0.63      0.62      0.62       101
weighted avg       0.63      0.63      0.63       101

              precision    recall  f1-score   support

           0       0.66      0.82      0.73        57
           1       0.67      0.45      0.54        44

    accuracy                           0.66       101
   macro avg       0.66      0.64      0.64       101
weighted avg       0.66      0.66      0.65       101

              precision    recall  f1-score   support

           0       0.71      0.79      0.75        56
           1       0.68      0.59      0.63        44

    accuracy                           0.70       100
   macro avg       0.70      0.69      0.69       100
weighted avg       0.70      0.70      0.70       100



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.68      0.84      0.75        56
           1       0.71      0.50      0.59        44

    accuracy                           0.69       100
   macro avg       0.70      0.67      0.67       100
weighted avg       0.69      0.69      0.68       100

              precision    recall  f1-score   support

           0       0.57      0.77      0.65        56
           1       0.46      0.25      0.32        44

    accuracy                           0.54       100
   macro avg       0.51      0.51      0.49       100
weighted avg       0.52      0.54      0.51       100

[0.63366337 0.6039604  0.55445545 0.65346535 0.59405941 0.63366337
 0.66336634 0.7        0.69       0.54      ]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
# Linear Discriminant Analysis: print a classification report for each of the
# 10 folds, then the array of per-fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(estimator=lda_model, X=X, y=y, scoring=report_scorer, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.64      0.74      0.68        57
           1       0.57      0.45      0.51        44

    accuracy                           0.61       101
   macro avg       0.60      0.60      0.59       101
weighted avg       0.61      0.61      0.61       101

              precision    recall  f1-score   support

           0       0.60      0.53      0.56        57
           1       0.47      0.55      0.51        44

    accuracy                           0.53       101
   macro avg       0.54      0.54      0.53       101
weighted avg       0.54      0.53      0.54       101

              precision    recall  f1-score   support

           0       0.59      0.68      0.63        57
           1       0.49      0.39      0.43        44

    accuracy                           0.55       101
   macro avg       0.54      0.54      0.53       101
weighted avg       0.55      0.55      0.55       101

              preci

In [12]:
# K-Nearest Neighbors
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier().fit(X, y)
# 10-fold cross-validation; the scorer prints a classification report per fold
# and returns the fold accuracy.
per_fold_report = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=per_fold_report, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.56      0.61      0.58        57
           1       0.42      0.36      0.39        44

    accuracy                           0.50       101
   macro avg       0.49      0.49      0.49       101
weighted avg       0.50      0.50      0.50       101

              precision    recall  f1-score   support

           0       0.52      0.60      0.55        57
           1       0.34      0.27      0.30        44

    accuracy                           0.46       101
   macro avg       0.43      0.43      0.43       101
weighted avg       0.44      0.46      0.44       101

              precision    recall  f1-score   support

           0       0.54      0.58      0.56        57
           1       0.40      0.36      0.38        44

    accuracy                           0.49       101
   macro avg       0.47      0.47      0.47       101
weighted avg       0.48      0.49      0.48       101

              preci

In [13]:
# Classification and Regression Trees
# random_state is fixed so the per-fold scores are reproducible across
# Restart & Run All; the value matches the seed used for train_test_split.
cart_model = DecisionTreeClassifier(random_state=1)
# cross_val_score fits clones of the estimator, so this fit on the full data
# only affects the state of `cart_model` itself.
cart_model.fit(X, y)
# 10-fold cross-validation; the scorer prints a classification report per fold
# and returns the fold accuracy.
scores = cross_val_score(cart_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.60      0.47      0.53        57
           1       0.46      0.59      0.52        44

    accuracy                           0.52       101
   macro avg       0.53      0.53      0.52       101
weighted avg       0.54      0.52      0.53       101

              precision    recall  f1-score   support

           0       0.60      0.46      0.52        57
           1       0.47      0.61      0.53        44

    accuracy                           0.52       101
   macro avg       0.54      0.53      0.52       101
weighted avg       0.54      0.52      0.52       101

              precision    recall  f1-score   support

           0       0.57      0.56      0.57        57
           1       0.44      0.45      0.45        44

    accuracy                           0.51       101
   macro avg       0.51      0.51      0.51       101
weighted avg       0.52      0.51      0.52       101

              preci

In [14]:
# Gaussian Naive Bayes
# fit() returns the estimator itself, so construction and fitting can be chained.
gnb_model = GaussianNB().fit(X, y)
# 10-fold cross-validation; the scorer prints a classification report per fold
# and returns the fold accuracy.
per_fold_report = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=per_fold_report, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.65      0.74      0.69        57
           1       0.58      0.48      0.53        44

    accuracy                           0.62       101
   macro avg       0.61      0.61      0.61       101
weighted avg       0.62      0.62      0.62       101

              precision    recall  f1-score   support

           0       0.66      0.61      0.64        57
           1       0.54      0.59      0.57        44

    accuracy                           0.60       101
   macro avg       0.60      0.60      0.60       101
weighted avg       0.61      0.60      0.61       101

              precision    recall  f1-score   support

           0       0.65      0.81      0.72        57
           1       0.63      0.43      0.51        44

    accuracy                           0.64       101
   macro avg       0.64      0.62      0.62       101
weighted avg       0.64      0.64      0.63       101

              preci

In [15]:
# Support Vector Machines
# NOTE(review): the recorded output of this cell shows class 1 (almost) never
# being predicted; presumably X is unscaled here -- verify and consider scaling.
svm_model = SVC(gamma='auto').fit(X, y)  # fit() returns the estimator itself
# 10-fold cross-validation; the scorer prints a classification report per fold
# and returns the fold accuracy.
per_fold_report = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=per_fold_report, cv=10)
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72        57
           1       0.00      0.00      0.00        44

    accuracy                           0.56       101
   macro avg       0.28      0.50      0.36       101
weighted avg       0.32      0.56      0.41       101

              precision    recall  f1-score   support

           0       0.57      1.00      0.72        56
           1       1.00      0.02      0.04        44

    accuracy                           0.57       100
   macro avg       0.78      0.51      0.38       100
weighted avg       0.76      0.57      0.42       100

              precision    recall  f1-score   support

           0       0.57      1.00      0.72        56
           1       1.00      0.02      0.04        44

    accuracy                           0.57       100
   macro avg       0.78      0.51      0.38       100
weighted avg       0.76      0.57      0.42       100

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Random Forest Classifier
# random_state is fixed so the per-fold scores are reproducible across
# Restart & Run All; the value matches the seed used for train_test_split.
rfc_model = RandomForestClassifier(random_state=1)
# cross_val_score fits clones of the estimator, so this fit on the full data
# only affects the state of `rfc_model` itself.
rfc_model.fit(X, y)
# 10-fold cross-validation; the scorer prints a classification report per fold
# and returns the fold accuracy.
scores = cross_val_score(rfc_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.64      0.77      0.70        57
           1       0.59      0.43      0.50        44

    accuracy                           0.62       101
   macro avg       0.62      0.60      0.60       101
weighted avg       0.62      0.62      0.61       101

              precision    recall  f1-score   support

           0       0.57      0.46      0.50        57
           1       0.44      0.55      0.48        44

    accuracy                           0.50       101
   macro avg       0.50      0.50      0.49       101
weighted avg       0.51      0.50      0.50       101

              precision    recall  f1-score   support

           0       0.63      0.77      0.69        57
           1       0.58      0.41      0.48        44

    accuracy                           0.61       101
   macro avg       0.60      0.59      0.59       101
weighted avg       0.61      0.61      0.60       101

              preci

In [17]:
# Calculating accuracy metrics for Logistic Regression
# Refit on the training split only, then evaluate on the held-out test split.
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Accuracy Metrics for Logistic Regression:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, log_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Accuracy Metrics for Logistic Regression:

0.58416 

[[91 27]
 [57 27]] 

              precision    recall  f1-score   support

           0       0.61      0.77      0.68       118
           1       0.50      0.32      0.39        84

    accuracy                           0.58       202
   macro avg       0.56      0.55      0.54       202
weighted avg       0.57      0.58      0.56       202



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
# Calculating accuracy metrics for LDA
# Refit on the training split only, then evaluate on the held-out test split.
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Accuracy Metrics for LDA:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Accuracy Metrics for LDA:

0.59901 

[[92 26]
 [55 29]] 

              precision    recall  f1-score   support

           0       0.63      0.78      0.69       118
           1       0.53      0.35      0.42        84

    accuracy                           0.60       202
   macro avg       0.58      0.56      0.56       202
weighted avg       0.58      0.60      0.58       202



In [19]:
# Calculating accuracy metrics for KNN
# Refit on the training split only, then evaluate on the held-out test split.
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Accuracy Metrics for KNN:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Accuracy Metrics for KNN:

0.52475 

[[75 43]
 [53 31]] 

              precision    recall  f1-score   support

           0       0.59      0.64      0.61       118
           1       0.42      0.37      0.39        84

    accuracy                           0.52       202
   macro avg       0.50      0.50      0.50       202
weighted avg       0.52      0.52      0.52       202



In [20]:
# Calculating accuracy metrics for CART
# Refit on the training split only, then evaluate on the held-out test split.
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Accuracy Metrics for CART:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Accuracy Metrics for CART:

0.60891 

[[80 38]
 [41 43]] 

              precision    recall  f1-score   support

           0       0.66      0.68      0.67       118
           1       0.53      0.51      0.52        84

    accuracy                           0.61       202
   macro avg       0.60      0.59      0.60       202
weighted avg       0.61      0.61      0.61       202



In [21]:
# Calculating accuracy metrics for Gaussian Naive Bayes
# Refit on the training split only, then evaluate on the held-out test split.
gnb_model.fit(X_train, y_train)
gnb_pred = gnb_model.predict(X_test)

print('Accuracy Metrics for Gaussian Naive Bayes:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Accuracy Metrics for Gaussian Naive Bayes:

0.61881 

[[104  14]
 [ 63  21]] 

              precision    recall  f1-score   support

           0       0.62      0.88      0.73       118
           1       0.60      0.25      0.35        84

    accuracy                           0.62       202
   macro avg       0.61      0.57      0.54       202
weighted avg       0.61      0.62      0.57       202



In [22]:
# Calculating accuracy metrics for SVM
# Refit on the training split only, then evaluate on the held-out test split.
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Accuracy Metrics for SVM:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
# NOTE(review): the recorded output has an all-zero second column, i.e. class 1
# was never predicted -- check feature scaling before trusting this model.
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Accuracy Metrics for SVM:

0.58416 

[[118   0]
 [ 84   0]] 

              precision    recall  f1-score   support

           0       0.58      1.00      0.74       118
           1       0.00      0.00      0.00        84

    accuracy                           0.58       202
   macro avg       0.29      0.50      0.37       202
weighted avg       0.34      0.58      0.43       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Calculating accuracy metrics for Random Forest Classifier
# Refit on the training split only, then evaluate on the held-out test split.
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Accuracy Metrics for RFC:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round()` method only exists on NumPy scalars.
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Accuracy Metrics for RFC:

0.59901 

[[94 24]
 [57 27]] 

              precision    recall  f1-score   support

           0       0.62      0.80      0.70       118
           1       0.53      0.32      0.40        84

    accuracy                           0.60       202
   macro avg       0.58      0.56      0.55       202
weighted avg       0.58      0.60      0.57       202



In [24]:
# Only using Features provided

In [25]:
# Lookup table: dataset column code -> human-readable feature name (with unit
# where the label carries one). Entries are grouped by column-code prefix.
featureName_mapping = {
    # A*: vitamin B12, folate, homocysteine, vitamin D
    "A1_1": "Vitamin B12 (pmol/L)",
    "A1_2": "Serum Folate (nmol/L)",
    "A2_1": "Serum Homocysteine (µmol/L)",
    "A3_1": "25-hydroxy Vitamin D (nmol/L)",

    # B1_a*: haemoglobin and red-cell indices
    "B1_a": "Haemoglobin (g/L)",
    "B1_a1": "RBC (/L)",
    "B1_a2": "PCV (L/L)",
    "B1_a3": "MCV (fL)",
    "B1_a4": "MCH (pg)",
    "B1_a5": "MCHC (g/L)",
    "B1_a6": "RDW (%)",

    # B1_b*: white cell count and differential
    "B1_b": "White Cell Count (/L)",
    "B1_b1": "Neutrophils (/L)",
    "B1_b2": "Lymphocytes (/L)",
    "B1_b3": "Monocytes (/L)",
    "B1_b4": "Eosinophils (/L)",
    "B1_b5": "Basophils (/L)",

    # B1_c, B1_d: platelets and glucose
    "B1_c": "Platelets (/L)",
    "B1_d": "Glucose (mmol/L)",

    # B2_a*: cholesterol and triglyceride
    "B2_a1": "Total Cholesterol (mmol/L)",
    "B2_a2": "Triglyceride (mmol/L)",
    "B2_a3": "HDL Cholesterol (mmol/L)",
    "B2_a4": "LDL Cholesterol (mmol/L)",
    "B2_a5": "Total Cholesterol/HDL Ratio",

    # B2_b*: sodium, potassium, chloride
    "B2_b1": "Sodium (mmol/L)",
    "B2_b2": "Potassium (mmol/L)",
    "B2_b3": "Chloride (mmol/L)",

    # B2_c*: urea, creatinine, eGFR, uric acid, calcium, phosphate
    "B2_c1": "Urea (mmol/L)",
    "B2_c2": "Creatinine (umol/L)",
    "B2_c3": "eGFR (mL/min/1.73m2)",
    "B2_c4": "Uric Acid (mmol/L)",
    "B2_c5": "Calcium (mmol/L)",
    "B2_c6": "Corrected Calcium (mmol/L)",
    "B2_c7": "Phosphate (mmol/L)",

    # B2_d*: proteins, bilirubin and enzymes
    "B2_d1": "Total Protein (g/L)",
    "B2_d2": "Albumin (g/L)",
    "B2_d3": "Globulin (g/L)",
    "B2_d4": "Albumin/Globulin ratio",
    "B2_d5": "Alkaline Phosphatase (U/L)",
    "B2_d6": "Total Bilirubin (µmol/L)",
    "B2_d7": "GGT",
    "B2_d8": "AST",
    "B2_d9": "ALT",

    # B3: C-reactive protein
    "B3": "C-Reactive Protein",

    # B4_*: presumably urinalysis items (labels carry no specimen type) -- confirm
    "B4_a1": "Protein",
    "B4_a2": "pH",
    "B4_a3": "Glucose",
    "B4_a4": "Ketones",
    "B4_a5": "S.G.",
    "B4_a6": "Blood",
    "B4_b1": "Leucocytes (/L)",
    "B4_b2": "Erythrocytes (/L)",
    "B4_b3": "Epithelial Cells",

    # B5_*: thyroid hormones
    "B5_a1": "Free Thyroxine (FT4) (pmol/L)",
    "B5_a2": "Thyroid Stimulating Hormone (mIU/L)",
    "B5_a3": "Free Tri-iodothyronine (FT3) (pmol/L)",

    # B6: glycated haemoglobin
    "B6": "HbA1c",
}

In [26]:
# Column codes of the features selected for the reduced model.
feature_list = ['A1_2', 'A2_1', 'B2_c3', 'B2_d2', 'B5_a3','B6']

print("Selected Features:")
print(feature_list)
print()

# Partition the requested codes by whether they are columns of `data`.
# `exists` is reused by later cells to select the model features.
missing = [code for code in feature_list if code not in data.columns]
exists = [code for code in feature_list if code in data.columns]

# Look the readable names up directly in the mapping; a code without an entry
# falls back to the code itself instead of printing NaN.
print("Columns missing in parsed dataset:")
for code in missing:
    print(code, "-->", featureName_mapping.get(code, code))

print("\nColumns existing in parsed dataset:")
for code in exists:
    print(code, "-->", featureName_mapping.get(code, code))

Selected Features:
['A1_2', 'A2_1', 'B2_c3', 'B2_d2', 'B5_a3', 'B6']

Columns missing in parsed dataset:
A1_2 --> Serum Folate (nmol/L)
B2_c3 --> eGFR (mL/min/1.73m2)

Columns existing in parsed dataset:
A2_1 --> Serum Homocysteine (µmol/L)
B2_d2 --> Albumin (g/L)
B5_a3 --> Free Tri-iodothyronine (FT3) (pmol/L)
B6 --> HbA1c


In [27]:
# Reduce `data` to the identifier/label columns plus the selected features.
# Both pieces get a fresh 0..n-1 index so the join below aligns row by row.
data1 = data[exists].reset_index(drop=True)
data2 = data[['mtag', 'condition']].reset_index(drop=True)

# Identifier and label first, then the selected feature columns.
data_final = data2.join(data1)

# From here on `data` refers to the reduced frame.
data = data_final

data

Unnamed: 0,mtag,condition,A2_1,B2_d2,B5_a3,B6
0,ME02646,0,24,42,4.1,5.9
1,ME03109,0,23,42,4.6,8.5
2,ME06997,0,20,43,4.0,6.4
3,ME07149,0,16,42,4.7,6.1
4,ME07700,0,14,45,3.8,5.8
...,...,...,...,...,...,...
1002,MV00454,0,19,42,4.5,6.2
1003,MV00456,0,18,39,3.9,5.6
1004,MV00460,0,17,41,4.0,5.6
1005,MV00502,0,18,40,4.1,6.0


In [28]:
# Run Classification using 80/20 Train-Test Split
#
# The feature matrix, label encoding and train/test split are the same for every
# classifier, so they are built once here and shared by all seven models.

features = exists
X_old = data[features]

X = X_old
# Optional feature scaling (disabled; uncomment one line to enable):
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

# Encode the condition labels as integers ordered by frequency (most frequent
# label -> 0). A single map() pass is used because replacing one label at a
# time can clobber values when a raw label equals an already-assigned code, and
# a chained `data['condition'].replace(..., inplace=True)` does not update
# `data` under pandas Copy-on-Write.
label_codes = {label: code for code, label in enumerate(data['condition'].value_counts().index)}
data['condition'] = data['condition'].map(label_codes)
y = data['condition']

# Optional class balancing (disabled):
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# The estimators keep their notebook-level names because later cells reuse them.
# The tree-based models are seeded so repeated runs give the same scores.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier(random_state=1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state=1)

# Fit each model on the training split and report test-set accuracy.
for model_label, split_model in [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]:
    split_model.fit(X_train, y_train)
    # Built-in round() works for both NumPy scalars and plain Python floats.
    print(model_label, round(split_model.score(X_test, y_test), 3))

Logistic Regression: 0.604
Linear Discriminant Analysis: 0.589
K-Nearest Neigbors: 0.55
Classification and Regression Trees: 0.564
Gaussian Naive Bayes: 0.589
Support Vector Machines: 0.564
Random Forest Classifier: 0.579


In [29]:
# Cross Validation Score
#
# Mean and standard deviation of the 10-fold cross-validated accuracy for each
# classifier. The feature matrix and label encoding are the same for every
# model, so they are built once here.

features = exists
X_old = data[features]

X = X_old
# Optional feature scaling (disabled; uncomment one line to enable):
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

# Encode the condition labels as integers ordered by frequency (most frequent
# label -> 0). A single map() pass is used because replacing one label at a
# time can clobber values when a raw label equals an already-assigned code, and
# a chained `data['condition'].replace(..., inplace=True)` does not update
# `data` under pandas Copy-on-Write.
label_codes = {label: code for code, label in enumerate(data['condition'].value_counts().index)}
data['condition'] = data['condition'].map(label_codes)
y = data['condition']

# Optional class balancing (disabled):
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

# Not used by the cross-validation below; kept so the split variables stay
# defined for cells that evaluate on a hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# The estimators keep their notebook-level names because later cells reuse them.
# The tree-based models are seeded so repeated runs give the same scores.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier(random_state=1)
gnb_model = GaussianNB()
svm_model = SVC(gamma = 'auto')
rfc_model = RandomForestClassifier(random_state=1)

for model_label, cv_model in [
    ("Logistic Regression", log_model),
    ("Linear Discriminant Analysis", lda_model),
    ("K-Nearest Neighbors", knn_model),
    ("Classification and Regression Trees", cart_model),
    ("Gaussian Naive Bayes", gnb_model),
    ("Support Vector Machines", svm_model),
    ("Random Forest Classifier", rfc_model),
]:
    # cross_val_score works on clones, so this fit only leaves the named
    # estimator fitted on the full data, as before.
    cv_model.fit(X, y)
    scores = cross_val_score(cv_model, X, y, cv=10)
    print("%s: %0.2f accuracy with a standard deviation of %0.2f" % (model_label, scores.mean(), scores.std()))

Logistic Regression: 0.60 accuracy with a standard deviation of 0.05
Linear Discriminant Analysis: 0.60 accuracy with a standard deviation of 0.04
K-Nearest Neighbors: 0.53 accuracy with a standard deviation of 0.07
Classification and Regression Trees: 0.53 accuracy with a standard deviation of 0.04
Gaussian Naive Bayes: 0.58 accuracy with a standard deviation of 0.04
Support Vector Machines: 0.57 accuracy with a standard deviation of 0.05
Random Forest Classifier: 0.55 accuracy with a standard deviation of 0.07


In [30]:
# Showing Cross Validation Score for each iteration

# Logistic Regression
# 10-fold cross-validation; the custom scorer prints a classification report
# for every fold and returns that fold's accuracy.
per_fold_report = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=per_fold_report, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.57      0.68      0.62        57
           1       0.44      0.32      0.37        44

    accuracy                           0.52       101
   macro avg       0.50      0.50      0.49       101
weighted avg       0.51      0.52      0.51       101

              precision    recall  f1-score   support

           0       0.65      0.79      0.71        57
           1       0.62      0.45      0.53        44

    accuracy                           0.64       101
   macro avg       0.64      0.62      0.62       101
weighted avg       0.64      0.64      0.63       101

              precision    recall  f1-score   support

           0       0.62      0.89      0.73        57
           1       0.68      0.30      0.41        44

    accuracy                           0.63       101
   macro avg       0.65      0.60      0.57       101
weighted avg       0.65      0.63      0.59       101

              preci

In [31]:
# Linear Discriminant Analysis: the custom scorer prints a classification
# report for each fold and returns that fold's accuracy.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=10)
print(scores)

              precision    recall  f1-score   support

           0       0.59      0.75      0.66        57
           1       0.50      0.32      0.39        44

    accuracy                           0.56       101
   macro avg       0.54      0.54      0.53       101
weighted avg       0.55      0.56      0.54       101

              precision    recall  f1-score   support

           0       0.66      0.84      0.74        57
           1       0.68      0.43      0.53        44

    accuracy                           0.66       101
   macro avg       0.67      0.64      0.63       101
weighted avg       0.67      0.66      0.65       101

              precision    recall  f1-score   support

           0       0.61      0.89      0.73        57
           1       0.67      0.27      0.39        44

    accuracy                           0.62       101
   macro avg       0.64      0.58      0.56       101
weighted avg       0.64      0.62      0.58       101

              preci

In [32]:
# K-Nearest Neighbors
# No fit on the full data is needed before cross-validating: cross_val_score
# fits a clone of the estimator on each training fold. The scorer prints a
# classification report per fold and returns that fold's accuracy.
knn_model = KNeighborsClassifier()
scores = cross_val_score(knn_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.55      0.54      0.55        57
           1       0.42      0.43      0.43        44

    accuracy                           0.50       101
   macro avg       0.49      0.49      0.49       101
weighted avg       0.50      0.50      0.50       101

              precision    recall  f1-score   support

           0       0.60      0.68      0.64        57
           1       0.50      0.41      0.45        44

    accuracy                           0.56       101
   macro avg       0.55      0.55      0.54       101
weighted avg       0.56      0.56      0.56       101

              precision    recall  f1-score   support

           0       0.54      0.58      0.56        57
           1       0.40      0.36      0.38        44

    accuracy                           0.49       101
   macro avg       0.47      0.47      0.47       101
weighted avg       0.48      0.49      0.48       101

              preci

In [33]:
# Classification and Regression Trees
# The tree is seeded so the per-fold results are reproducible between runs.
# No fit on the full data is needed before cross-validating: cross_val_score
# fits a clone of the estimator on each training fold. The scorer prints a
# classification report per fold and returns that fold's accuracy.
cart_model = DecisionTreeClassifier(random_state = 1)
scores = cross_val_score(cart_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.56      0.49      0.52        57
           1       0.43      0.50      0.46        44

    accuracy                           0.50       101
   macro avg       0.50      0.50      0.49       101
weighted avg       0.50      0.50      0.50       101

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        57
           1       0.55      0.55      0.55        44

    accuracy                           0.60       101
   macro avg       0.60      0.60      0.60       101
weighted avg       0.60      0.60      0.60       101

              precision    recall  f1-score   support

           0       0.58      0.56      0.57        57
           1       0.46      0.48      0.47        44

    accuracy                           0.52       101
   macro avg       0.52      0.52      0.52       101
weighted avg       0.53      0.52      0.53       101

              preci

In [34]:
# Gaussian Naive Bayes
# No fit on the full data is needed before cross-validating: cross_val_score
# fits a clone of the estimator on each training fold. The scorer prints a
# classification report per fold and returns that fold's accuracy.
gnb_model = GaussianNB()
scores = cross_val_score(gnb_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.55      0.72      0.63        57
           1       0.41      0.25      0.31        44

    accuracy                           0.51       101
   macro avg       0.48      0.48      0.47       101
weighted avg       0.49      0.51      0.49       101

              precision    recall  f1-score   support

           0       0.65      0.84      0.73        57
           1       0.67      0.41      0.51        44

    accuracy                           0.65       101
   macro avg       0.66      0.63      0.62       101
weighted avg       0.66      0.65      0.63       101

              precision    recall  f1-score   support

           0       0.59      0.82      0.69        57
           1       0.52      0.25      0.34        44

    accuracy                           0.57       101
   macro avg       0.56      0.54      0.51       101
weighted avg       0.56      0.57      0.53       101

              preci

In [35]:
# Support Vector Machines
# No fit on the full data is needed before cross-validating: cross_val_score
# fits a clone of the estimator on each training fold. The scorer prints a
# classification report per fold and returns that fold's accuracy.
svm_model = SVC(gamma = 'auto')
scores = cross_val_score(svm_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.51      0.61      0.56        57
           1       0.33      0.25      0.29        44

    accuracy                           0.46       101
   macro avg       0.42      0.43      0.42       101
weighted avg       0.44      0.46      0.44       101

              precision    recall  f1-score   support

           0       0.62      0.79      0.69        57
           1       0.57      0.36      0.44        44

    accuracy                           0.60       101
   macro avg       0.59      0.58      0.57       101
weighted avg       0.60      0.60      0.58       101

              precision    recall  f1-score   support

           0       0.60      0.75      0.67        57
           1       0.52      0.34      0.41        44

    accuracy                           0.57       101
   macro avg       0.56      0.55      0.54       101
weighted avg       0.56      0.57      0.56       101

              preci

In [36]:
# Random Forest Classifier
# The forest is seeded so the per-fold results are reproducible between runs.
# No fit on the full data is needed before cross-validating: cross_val_score
# fits a clone of the estimator on each training fold. The scorer prints a
# classification report per fold and returns that fold's accuracy.
rfc_model = RandomForestClassifier(random_state = 1)
scores = cross_val_score(rfc_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.55      0.60      0.57        57
           1       0.41      0.36      0.39        44

    accuracy                           0.50       101
   macro avg       0.48      0.48      0.48       101
weighted avg       0.49      0.50      0.49       101

              precision    recall  f1-score   support

           0       0.62      0.67      0.64        57
           1       0.53      0.48      0.50        44

    accuracy                           0.58       101
   macro avg       0.57      0.57      0.57       101
weighted avg       0.58      0.58      0.58       101

              precision    recall  f1-score   support

           0       0.63      0.63      0.63        57
           1       0.52      0.52      0.52        44

    accuracy                           0.58       101
   macro avg       0.58      0.58      0.58       101
weighted avg       0.58      0.58      0.58       101

              preci

In [37]:
# Calculating accuracy metrics for Logistic Regression
# Fit on the training split, then evaluate on the held-out test split.
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Accuracy Metrics for Logistic Regression:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, log_pred), 5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Accuracy Metrics for Logistic Regression:

0.60396 

[[97 21]
 [59 25]] 

              precision    recall  f1-score   support

           0       0.62      0.82      0.71       118
           1       0.54      0.30      0.38        84

    accuracy                           0.60       202
   macro avg       0.58      0.56      0.55       202
weighted avg       0.59      0.60      0.57       202



In [38]:
# Calculating accuracy metrics for LDA
# Fit on the training split, then evaluate on the held-out test split.
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Accuracy Metrics for LDA:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Accuracy Metrics for LDA:

0.58911 

[[99 19]
 [64 20]] 

              precision    recall  f1-score   support

           0       0.61      0.84      0.70       118
           1       0.51      0.24      0.33        84

    accuracy                           0.59       202
   macro avg       0.56      0.54      0.51       202
weighted avg       0.57      0.59      0.55       202



In [39]:
# Calculating accuracy metrics for KNN
# Fit on the training split, then evaluate on the held-out test split.
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Accuracy Metrics for KNN:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Accuracy Metrics for KNN:

0.5495 

[[74 44]
 [47 37]] 

              precision    recall  f1-score   support

           0       0.61      0.63      0.62       118
           1       0.46      0.44      0.45        84

    accuracy                           0.55       202
   macro avg       0.53      0.53      0.53       202
weighted avg       0.55      0.55      0.55       202



In [40]:
# Calculating accuracy metrics for CART
# Fit on the training split, then evaluate on the held-out test split.
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Accuracy Metrics for CART:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Accuracy Metrics for CART:

0.4901 

[[61 57]
 [46 38]] 

              precision    recall  f1-score   support

           0       0.57      0.52      0.54       118
           1       0.40      0.45      0.42        84

    accuracy                           0.49       202
   macro avg       0.49      0.48      0.48       202
weighted avg       0.50      0.49      0.49       202



In [41]:
# Calculating accuracy metrics for Gaussian Naive Bayes
# Fit on the training split, then evaluate on the held-out test split.
gnb_model.fit(X_train, y_train)
gnb_pred = gnb_model.predict(X_test)

print('Accuracy Metrics for Gaussian Naive Bayes:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Accuracy Metrics for Gaussian Naive Bayes:

0.58911 

[[103  15]
 [ 68  16]] 

              precision    recall  f1-score   support

           0       0.60      0.87      0.71       118
           1       0.52      0.19      0.28        84

    accuracy                           0.59       202
   macro avg       0.56      0.53      0.50       202
weighted avg       0.57      0.59      0.53       202



In [42]:
# Calculating accuracy metrics for SVM
# Fit on the training split, then evaluate on the held-out test split.
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Accuracy Metrics for SVM:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Accuracy Metrics for SVM:

0.56436 

[[92 26]
 [62 22]] 

              precision    recall  f1-score   support

           0       0.60      0.78      0.68       118
           1       0.46      0.26      0.33        84

    accuracy                           0.56       202
   macro avg       0.53      0.52      0.50       202
weighted avg       0.54      0.56      0.53       202



In [43]:
# Calculating accuracy metrics for Random Forest Classifier
# Fit on the training split, then evaluate on the held-out test split.
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Accuracy Metrics for Random Forest:\n')
# Built-in round() works whether accuracy_score returns a Python float or a
# NumPy scalar; a plain Python float has no .round() method.
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Accuracy Metrics for Random Forest:

0.59406 

[[84 34]
 [48 36]] 

              precision    recall  f1-score   support

           0       0.64      0.71      0.67       118
           1       0.51      0.43      0.47        84

    accuracy                           0.59       202
   macro avg       0.58      0.57      0.57       202
weighted avg       0.59      0.59      0.59       202

