In [1]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from parse import preprocess

In [2]:
# Function to show classification report for Cross Validation
def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [3]:
# Pre-parse the dataset
data = preprocess("rawfile_blood.csv")

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1007

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [4]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

# Count rows of data for each condition
for i in range(0, len(data)):
	if data.at[i, 'condition'] == 'frail':
		frail += 1
	elif data.at[i, 'condition'] == 'frail_mci':
		frail_mci += 1
	elif data.at[i, 'condition'] == 'mci':
		mci += 1
	elif data.at[i, 'condition'] == 'prefrail_mci':
		prefrail_mci += 1
	elif data.at[i, 'condition'] == 'prefrail':
		prefrail += 1
	elif data.at[i, 'condition'] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 231
Prefrail: 221
Robust: 339


In [5]:
data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,frail,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,frail,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,frail,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,frail,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [6]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [7]:
# Grouping Pre-frail, Frail, and Robust

# Grouping:
# Frail, Frail_MCI, --> Frail
# Prefrail_MCI, Prefrail --> Prefrail
# MCI, Robust --> Robust

for i in range(0, len(data)):
	if data.at[i, 'condition'] == 'frail':
		data.at[i, 'condition'] = 'frail'
	elif data.at[i, 'condition'] == 'frail_mci':
		data.at[i, 'condition'] = 'frail'
	elif data.at[i, 'condition'] == 'mci':
		data.at[i, 'condition'] = 'robust'
	elif data.at[i, 'condition'] == 'prefrail_mci':
		data.at[i, 'condition'] = 'prefrail'
	elif data.at[i, 'condition'] == 'prefrail':
		data.at[i, 'condition'] = 'prefrail'
	elif data.at[i, 'condition'] == 'robust':
		data.at[i, 'condition'] = 'robust'

In [8]:
c = data['condition'].value_counts()
condition = c.index

c

robust      472
prefrail    452
frail        83
Name: condition, dtype: int64

In [9]:
# Run Classification using 80/20 Train-Test Split

# Logistic Regression
    
y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)

# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)

# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
svm_model = SVC(gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.55
Linear Discriminant Analysis: 0.55
K-Nearest Neigbors: 0.495
Classification and Regression Trees: 0.51
Gaussian Naive Bayes: 0.49
Support Vector Machines: 0.485
Random Forest Classifier: 0.554


In [10]:
# Cross Validation Score

# Run Classification using 80/20 Train-Test Split

# Logistic Regression
    
y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=10)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=10)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=10)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=10)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=10)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
svm_model = SVC(gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=10)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=10)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Logistic Regression: 0.52 accuracy with a standard deviation of 0.08
Linear Discriminant Analysis: 0.52 accuracy with a standard deviation of 0.05
K-Nearest Neighbors: 0.50 accuracy with a standard deviation of 0.03
Classification and Regression Trees: 0.45 accuracy with a standard deviation of 0.05
Gaussian Naive Bayes: 0.49 accuracy with a standard deviation of 0.03
Support Vector Machines: 0.47 accuracy with a standard deviation of 0.01
Random Forest Classifier: 0.51 accuracy with a standard deviation of 0.06


In [11]:
# Showing Cross Validation Score for each iteration

# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.63      0.62      0.62        47
           1       0.55      0.67      0.60        45
           2       0.00      0.00      0.00         9

    accuracy                           0.58       101
   macro avg       0.39      0.43      0.41       101
weighted avg       0.54      0.58      0.56       101



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

              precision    recall  f1-score   support

           0       0.52      0.47      0.49        47
           1       0.44      0.56      0.49        45
           2       0.00      0.00      0.00         9

    accuracy                           0.47       101
   macro avg       0.32      0.34      0.33       101
weighted avg       0.44      0.47      0.45       101

              precision    recall  f1-score   support

           0       0.53      0.55      0.54        47
           1       0.46      0.51      0.48        45
           2       0.00      0.00      0.00         9

    accuracy                           0.49       101
   macro avg       0.33      0.35      0.34       101
weighted avg       0.45      0.49      0.47       101

              precision    recall  f1-score   support

           0       0.65      0.73      0.69        48
           1       0.65      0.67      0.66        45
           2       0.00      0.00      0.00         8

    accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

              precision    recall  f1-score   support

           0       0.57      0.73      0.64        48
           1       0.57      0.51      0.54        45
           2       0.00      0.00      0.00         8

    accuracy                           0.57       101
   macro avg       0.38      0.41      0.39       101
weighted avg       0.53      0.57      0.55       101

              precision    recall  f1-score   support

           0       0.44      0.64      0.52        47
           1       0.27      0.20      0.23        46
           2       0.00      0.00      0.00         8

    accuracy                           0.39       101
   macro avg       0.24      0.28      0.25       101
weighted avg       0.33      0.39      0.35       101

              precision    recall  f1-score   support

           0       0.56      0.70      0.62        47
           1       0.54      0.48      0.51        46
           2       0.00      0.00      0.00         8

    accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

           0       0.58      0.70      0.63        47
           1       0.50      0.47      0.48        45
           2       0.00      0.00      0.00         8

    accuracy                           0.54       100
   macro avg       0.36      0.39      0.37       100
weighted avg       0.50      0.54      0.52       100

              precision    recall  f1-score   support

           0       0.57      0.64      0.60        47
           1       0.56      0.56      0.56        45
           2       0.50      0.12      0.20         8

    accuracy                           0.56       100
   macro avg       0.54      0.44      0.45       100
weighted avg       0.56      0.56      0.55       100

              precision    recall  f1-score   support

           0       0.46      0.62      0.53        47
           1       0.33      0.27      0.30        45
           2       1.00      0.12      0.22         8

    accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.57      0.62      0.59        47
           1       0.57      0.58      0.57        45
           2       0.25      0.11      0.15         9

    accuracy                           0.55       101
   macro avg       0.46      0.44      0.44       101
weighted avg       0.54      0.55      0.54       101

              precision    recall  f1-score   support

           0       0.67      0.34      0.45        47
           1       0.49      0.78      0.60        45
           2       0.00      0.00      0.00         9

    accuracy                           0.50       101
   macro avg       0.38      0.37      0.35       101
weighted avg       0.53      0.50      0.48       101

              precision    recall  f1-score   support

           0       0.53      0.55      0.54        47
           1       0.50      0.56      0.53        45
           2       0.50      0.11      0.18         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.55      0.62      0.58        47
           1       0.52      0.56      0.54        45
           2       0.00      0.00      0.00         9

    accuracy                           0.53       101
   macro avg       0.36      0.39      0.37       101
weighted avg       0.49      0.53      0.51       101

              precision    recall  f1-score   support

           0       0.50      0.66      0.57        47
           1       0.46      0.40      0.43        45
           2       0.00      0.00      0.00         9

    accuracy                           0.49       101
   macro avg       0.32      0.35      0.33       101
weighted avg       0.44      0.49      0.46       101

              precision    recall  f1-score   support

           0       0.51      0.64      0.57        47
           1       0.48      0.44      0.46        45
           2       0.00      0.00      0.00         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.46      0.64      0.54        47
           1       0.46      0.36      0.40        45
           2       0.00      0.00      0.00         8

    accuracy                           0.46       100
   macro avg       0.31      0.33      0.31       100
weighted avg       0.42      0.46      0.43       100

[0.53465347 0.48514851 0.4950495  0.5049505  0.54455446 0.46534653
 0.47524752 0.49       0.5        0.46      ]


In [14]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.47      0.43      0.44        47
           1       0.42      0.42      0.42        45
           2       0.15      0.22      0.18         9

    accuracy                           0.41       101
   macro avg       0.35      0.36      0.35       101
weighted avg       0.42      0.41      0.41       101

              precision    recall  f1-score   support

           0       0.49      0.45      0.47        47
           1       0.43      0.44      0.43        45
           2       0.00      0.00      0.00         9

    accuracy                           0.41       101
   macro avg       0.30      0.30      0.30       101
weighted avg       0.42      0.41      0.41       101

              precision    recall  f1-score   support

           0       0.58      0.60      0.59        47
           1       0.55      0.51      0.53        45
           2       0.18      0.22      0.20         9

    accuracy        

In [15]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.48      0.70      0.57        47
           1       0.52      0.27      0.35        45
           2       0.11      0.11      0.11         9

    accuracy                           0.46       101
   macro avg       0.37      0.36      0.34       101
weighted avg       0.46      0.46      0.43       101

              precision    recall  f1-score   support

           0       0.57      0.60      0.58        47
           1       0.53      0.42      0.47        45
           2       0.06      0.11      0.08         9

    accuracy                           0.48       101
   macro avg       0.39      0.38      0.38       101
weighted avg       0.51      0.48      0.49       101

              precision    recall  f1-score   support

           0       0.54      0.87      0.67        47
           1       0.36      0.11      0.17        45
           2       0.27      0.33      0.30         9

    accuracy        

In [16]:
# Support Vector Machines
svm_model = SVC(gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         9

    accuracy                           0.47       101
   macro avg       0.16      0.33      0.21       101
weighted avg       0.22      0.47      0.30       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         9

    accuracy                           0.47       101
   macro avg       0.16      0.33      0.21       101
weighted avg       0.22      0.47      0.30       101

              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       1.00      0.02      0.04        45
           2       0.00      0.00      0.00         9

    accuracy                           0.48       101
   macro avg       0.49      0.34      0.23       101
weighted avg       0.66      0.48      0.32       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.48      1.00      0.65        48
           1       1.00      0.02      0.04        45
           2       0.00      0.00      0.00         8

    accuracy                           0.49       101
   macro avg       0.49      0.34      0.23       101
weighted avg       0.67      0.49      0.33       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.48      1.00      0.64        48
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         8

    accuracy                           0.48       101
   macro avg       0.16      0.33      0.21       101
weighted avg       0.23      0.48      0.31       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        46
           2       0.00      0.00      0.00         8

    accuracy                           0.47       101
   macro avg       0.16      0.33      0.21       101
weighted avg       0.22      0.47      0.30       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        46
           2       0.00      0.00      0.00         8

    accuracy                           0.47       101
   macro avg       0.16      0.33      0.21       101
weighted avg       0.22      0.47      0.30       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         8

    accuracy                           0.47       100
   macro avg       0.16      0.33      0.21       100
weighted avg       0.22      0.47      0.30       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         8

    accuracy                           0.47       100
   macro avg       0.16      0.33      0.21       100
weighted avg       0.22      0.47      0.30       100

              precision    recall  f1-score   support

           0       0.47      1.00      0.64        47
           1       0.00      0.00      0.00        45
           2       0.00      0.00      0.00         8

    accuracy                           0.47       100
   macro avg       0.16      0.33      0.21       100
weighted avg       0.22      0.47      0.30       100

[0.46534653 0.46534653 0.47524752 0.48514851 0.47524752 0.46534653
 0.46534653 0.47       0.47       0.47      ]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.57      0.55      0.56        47
           1       0.53      0.64      0.58        45
           2       0.00      0.00      0.00         9

    accuracy                           0.54       101
   macro avg       0.36      0.40      0.38       101
weighted avg       0.50      0.54      0.52       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.45      0.28      0.34        47
           1       0.44      0.71      0.55        45
           2       0.00      0.00      0.00         9

    accuracy                           0.45       101
   macro avg       0.30      0.33      0.30       101
weighted avg       0.41      0.45      0.40       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      0.74      0.64        47
           1       0.54      0.47      0.50        45
           2       0.00      0.00      0.00         9

    accuracy                           0.55       101
   macro avg       0.37      0.40      0.38       101
weighted avg       0.50      0.55      0.52       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.58      0.73      0.65        48
           1       0.61      0.56      0.58        45
           2       0.00      0.00      0.00         8

    accuracy                           0.59       101
   macro avg       0.40      0.43      0.41       101
weighted avg       0.55      0.59      0.57       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.51      0.65      0.57        48
           1       0.47      0.42      0.45        45
           2       0.00      0.00      0.00         8

    accuracy                           0.50       101
   macro avg       0.33      0.36      0.34       101
weighted avg       0.45      0.50      0.47       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.44      0.55      0.49        47
           1       0.36      0.33      0.34        46
           2       0.00      0.00      0.00         8

    accuracy                           0.41       101
   macro avg       0.27      0.29      0.28       101
weighted avg       0.37      0.41      0.38       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.60      0.64      0.62        47
           1       0.53      0.59      0.56        46
           2       0.00      0.00      0.00         8

    accuracy                           0.56       101
   macro avg       0.38      0.41      0.39       101
weighted avg       0.52      0.56      0.54       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.59      0.70      0.64        47
           1       0.55      0.53      0.54        45
           2       0.00      0.00      0.00         8

    accuracy                           0.57       100
   macro avg       0.38      0.41      0.39       100
weighted avg       0.52      0.57      0.54       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.60      0.68      0.64        47
           1       0.53      0.56      0.54        45
           2       0.00      0.00      0.00         8

    accuracy                           0.57       100
   macro avg       0.38      0.41      0.39       100
weighted avg       0.52      0.57      0.55       100

              precision    recall  f1-score   support

           0       0.45      0.62      0.52        47
           1       0.31      0.24      0.27        45
           2       0.00      0.00      0.00         8

    accuracy                           0.40       100
   macro avg       0.25      0.29      0.26       100
weighted avg       0.35      0.40      0.37       100

[0.54455446 0.44554455 0.55445545 0.59405941 0.4950495  0.40594059
 0.56435644 0.57       0.57       0.4       ]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Calculating accuracy metrics for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Accuracy Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Accuracy Metrics for Logistic Regression:

0.5495 

[[67 31  0]
 [43 42  0]
 [ 6 11  2]] 

              precision    recall  f1-score   support

           0       0.58      0.68      0.63        98
           1       0.50      0.49      0.50        85
           2       1.00      0.11      0.19        19

    accuracy                           0.55       202
   macro avg       0.69      0.43      0.44       202
weighted avg       0.58      0.55      0.53       202



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
# Calculating accuracy metrics for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Accuracy Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Accuracy Metrics for LDA:

0.5495 

[[65 33  0]
 [39 44  2]
 [ 8  9  2]] 

              precision    recall  f1-score   support

           0       0.58      0.66      0.62        98
           1       0.51      0.52      0.51        85
           2       0.50      0.11      0.17        19

    accuracy                           0.55       202
   macro avg       0.53      0.43      0.44       202
weighted avg       0.54      0.55      0.53       202



In [20]:
# Calculating accuracy metrics for KNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Accuracy Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Accuracy Metrics for KNN:

0.49505 

[[61 37  0]
 [46 39  0]
 [10  9  0]] 

              precision    recall  f1-score   support

           0       0.52      0.62      0.57        98
           1       0.46      0.46      0.46        85
           2       0.00      0.00      0.00        19

    accuracy                           0.50       202
   macro avg       0.33      0.36      0.34       202
weighted avg       0.45      0.50      0.47       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Calculating accuracy metrics for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Accuracy Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Accuracy Metrics for CART:

0.53465 

[[63 32  3]
 [39 41  5]
 [10  5  4]] 

              precision    recall  f1-score   support

           0       0.56      0.64      0.60        98
           1       0.53      0.48      0.50        85
           2       0.33      0.21      0.26        19

    accuracy                           0.53       202
   macro avg       0.47      0.45      0.45       202
weighted avg       0.53      0.53      0.53       202



In [22]:
# Calculating accuracy metrics for Gaussian Naive Bayes
gnb_model.fit(X_train, y_train)
gnb_pred = gnb_model.predict(X_test)

print('Accuracy Metrics for Gaussian Naive Bayes:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Accuracy Metrics for Gaussian Naive Bayes:

0.4901 

[[73 23  2]
 [56 22  7]
 [ 8  7  4]] 

              precision    recall  f1-score   support

           0       0.53      0.74      0.62        98
           1       0.42      0.26      0.32        85
           2       0.31      0.21      0.25        19

    accuracy                           0.49       202
   macro avg       0.42      0.40      0.40       202
weighted avg       0.47      0.49      0.46       202



In [23]:
# Calculating accuracy metrics for SVM
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Accuracy Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Accuracy Metrics for LDA:

0.48515 

[[98  0  0]
 [85  0  0]
 [19  0  0]] 

              precision    recall  f1-score   support

           0       0.49      1.00      0.65        98
           1       0.00      0.00      0.00        85
           2       0.00      0.00      0.00        19

    accuracy                           0.49       202
   macro avg       0.16      0.33      0.22       202
weighted avg       0.24      0.49      0.32       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Calculating accuracy metrics for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Accuracy Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Accuracy Metrics for LDA:

0.55941 

[[72 26  0]
 [44 41  0]
 [ 8 11  0]] 

              precision    recall  f1-score   support

           0       0.58      0.73      0.65        98
           1       0.53      0.48      0.50        85
           2       0.00      0.00      0.00        19

    accuracy                           0.56       202
   macro avg       0.37      0.41      0.38       202
weighted avg       0.50      0.56      0.53       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Only using Features provided

In [26]:
# Conduct mapping for Feature Names
featureName_mapping = {
    "A1_1" : "Vitamin B12 (pmol/L)",
    "A1_2" : "Serum Folate (nmol/L)",
    "A2_1" : "Serum Homocysteine (µmol/L)",
    "A3_1" : "25-hydroxy Vitamin D (nmol/L)",
    "B1_a" : "Haemoglobin (g/L)",
    "B1_a1" : "RBC (/L)",
    "B1_a2" : "PCV (L/L)",
    "B1_a3" : "MCV (fL)",
    "B1_a4" : "MCH (pg)",
    "B1_a5" : "MCHC (g/L)",
    "B1_a6" : "RDW (%)",
    "B1_b" : "White Cell Count (/L)",
    "B1_b1" : "Neutrophils (/L)",
    "B1_b2" : "Lymphocytes (/L)",
    "B1_b3" : "Monocytes (/L)",
    "B1_b4" : "Eosinophils (/L)",
    "B1_b5" : "Basophils (/L)",
    "B1_c" : "Platelets (/L)",
    "B1_d" : "Glucose (mmol/L)",
    "B2_a1" : "Total Cholesterol (mmol/L)",
    "B2_a2" : "Triglyceride (mmol/L)",
    "B2_a3" : "HDL Cholesterol (mmol/L)",
    "B2_a4" : "LDL Cholesterol (mmol/L)",
    "B2_a5" : "Total Cholesterol/HDL Ratio",
    "B2_b1" : "Sodium (mmol/L)",
    "B2_b2" : "Potassium (mmol/L)",
    "B2_b3" : "Chloride (mmol/L)",
    "B2_c1" : 'Urea (mmol/L)',
    "B2_c2" : "Creatinine (umol/L)",
    "B2_c3" : "eGFR (mL/min/1.73m2)",
    "B2_c4" : "Uric Acid (mmol/L)",
    "B2_c5" : "Calcium (mmol/L)",
    "B2_c6" : "Corrected Calcium (mmol/L)",
    "B2_c7" : "Phosphate (mmol/L)",
    "B2_d1" : "Total Protein (g/L)",
    "B2_d2" : "Albumin (g/L)",
    "B2_d3" : "Globulin (g/L)",
    "B2_d4" : "Albumin/Globulin ratio",
    "B2_d5" : "Alkaline Phosphatase (U/L)",
    "B2_d6" : "Total Bilirubin (µmol/L)",
    "B2_d7" : "GGT",
    "B2_d8" : "AST",
    "B2_d9" : "ALT",
    "B3" : "C-Reactive Protein",
    "B4_a1" : "Protein",
    "B4_a2" : "pH",
    "B4_a3" : "Glucose",
    "B4_a4" : "Ketones",
    "B4_a5" : "S.G.",
    "B4_a6" : "Blood",
    "B4_b1" : "Leucocytes (/L)",
    "B4_b2" : "Erythrocytes (/L)",
    "B4_b3" : "Epithelial Cells",
    "B5_a1" : "Free Thyroxine (FT4) (pmol/L)",
    "B5_a2" : "Thyroid Stimulating Hormone (mIU/L)",
    "B5_a3" : "Free Tri-iodothyronine (FT3) (pmol/L)",
    "B6" : "HbA1c"
}

In [27]:
feature_list = ['A1_2', 'A2_1', 'B2_c3', 'B2_d2', 'B5_a3','B6']
missing = []
exists = []

print("Selected Features:")
print(feature_list)
print()

for items in feature_list:
    if items not in data.columns:
        missing.append(items)
    else:
        exists.append(items)

data1 = pd.Series(data=missing, name='MissingFeatures')

data1 = data1.map(featureName_mapping)

data2 = pd.Series(data=exists, name='ExistingFeatures')

data2 = data2.map(featureName_mapping)

print("Columns missing in parsed dataset:")

for i in range(0, len(data1)):
    print(missing[i], "-->", data1[i])

print("\nColumns existing in parsed dataset:")

for i in range(0, len(data2)):
    print(exists[i], "-->", data2[i])

Selected Features:
['A1_2', 'A2_1', 'B2_c3', 'B2_d2', 'B5_a3', 'B6']

Columns missing in parsed dataset:
A1_2 --> Serum Folate (nmol/L)
B2_c3 --> eGFR (mL/min/1.73m2)

Columns existing in parsed dataset:
A2_1 --> Serum Homocysteine (µmol/L)
B2_d2 --> Albumin (g/L)
B5_a3 --> Free Tri-iodothyronine (FT3) (pmol/L)
B6 --> HbA1c


In [28]:
data1 = data[exists]
data1 = data1.reset_index(drop=True)

data2 = data[['mtag', 'condition']]
data2 = data2.reset_index(drop=True)

data_final = data2.join(data1)

data = data_final

data

Unnamed: 0,mtag,condition,A2_1,B2_d2,B5_a3,B6
0,ME02646,2,24,42,4.1,5.9
1,ME03109,2,23,42,4.6,8.5
2,ME06997,2,20,43,4.0,6.4
3,ME07149,2,16,42,4.7,6.1
4,ME07700,2,14,45,3.8,5.8
...,...,...,...,...,...,...
1002,MV00454,0,19,42,4.5,6.2
1003,MV00456,0,18,39,3.9,5.6
1004,MV00460,0,17,41,4.0,5.6
1005,MV00502,0,18,40,4.1,6.0


In [29]:
# Run Classification using 80/20 Train-Test Split

# Logistic Regression
    
y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
svm_model = SVC(gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.545
Linear Discriminant Analysis: 0.564
K-Nearest Neigbors: 0.46
Classification and Regression Trees: 0.421
Gaussian Naive Bayes: 0.559
Support Vector Machines: 0.45
Random Forest Classifier: 0.441


In [30]:
# Cross Validation Score

# Run Classification using 80/20 Train-Test Split

# Logistic Regression
    
y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=10)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=10)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=10)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=10)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=10)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
svm_model = SVC(gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=10)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

y = data['condition']
features = exists

X_old = data[features]

X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

c = data['condition'].value_counts()
condition = c.index
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)
    
# Define undersample strategy
# sampling_strategy = {0: 83, 1: 83, 2: 83}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)
# X, y = undersample.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=10)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Logistic Regression: 0.52 accuracy with a standard deviation of 0.07
Linear Discriminant Analysis: 0.50 accuracy with a standard deviation of 0.05
K-Nearest Neighbors: 0.46 accuracy with a standard deviation of 0.03
Classification and Regression Trees: 0.43 accuracy with a standard deviation of 0.04
Gaussian Naive Bayes: 0.49 accuracy with a standard deviation of 0.03
Support Vector Machines: 0.46 accuracy with a standard deviation of 0.03
Random Forest Classifier: 0.44 accuracy with a standard deviation of 0.04


In [31]:
# Showing Cross Validation Score for each iteration

# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.56      0.51      0.53        47
           1       0.49      0.62      0.55        45
           2       0.00      0.00      0.00         9

    accuracy                           0.51       101
   macro avg       0.35      0.38      0.36       101
weighted avg       0.48      0.51      0.49       101

              precision    recall  f1-score   support

           0       0.62      0.60      0.61        47
           1       0.52      0.64      0.57        45
           2       0.00      0.00      0.00         9

    accuracy                           0.56       101
   macro avg       0.38      0.41      0.39       101
weighted avg       0.52      0.56      0.54       101



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/m

              precision    recall  f1-score   support

           0       0.59      0.83      0.69        47
           1       0.63      0.49      0.55        45
           2       0.00      0.00      0.00         9

    accuracy                           0.60       101
   macro avg       0.41      0.44      0.41       101
weighted avg       0.56      0.60      0.57       101

              precision    recall  f1-score   support

           0       0.62      0.77      0.69        48
           1       0.68      0.62      0.65        45
           2       0.00      0.00      0.00         8

    accuracy                           0.64       101
   macro avg       0.43      0.46      0.45       101
weighted avg       0.60      0.64      0.62       101

              precision    recall  f1-score   support

           0       0.47      0.65      0.54        48
           1       0.43      0.33      0.38        45
           2       0.00      0.00      0.00         8

    accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(res

              precision    recall  f1-score   support

           0       0.46      0.66      0.54        47
           1       0.39      0.28      0.33        46
           2       0.00      0.00      0.00         8

    accuracy                           0.44       101
   macro avg       0.28      0.31      0.29       101
weighted avg       0.39      0.44      0.40       101

              precision    recall  f1-score   support

           0       0.46      0.60      0.52        47
           1       0.45      0.39      0.42        46
           2       0.00      0.00      0.00         8

    accuracy                           0.46       101
   macro avg       0.30      0.33      0.31       101
weighted avg       0.42      0.46      0.43       101

              precision    recall  f1-score   support

           0       0.54      0.70      0.61        47
           1       0.46      0.40      0.43        45
           2       0.00      0.00      0.00         8

    accuracy        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(res

              precision    recall  f1-score   support

           0       0.54      0.79      0.64        47
           1       0.52      0.36      0.42        45
           2       0.00      0.00      0.00         8

    accuracy                           0.53       100
   macro avg       0.35      0.38      0.35       100
weighted avg       0.48      0.53      0.49       100

              precision    recall  f1-score   support

           0       0.50      0.64      0.56        47
           1       0.40      0.36      0.38        45
           2       0.00      0.00      0.00         8

    accuracy                           0.46       100
   macro avg       0.30      0.33      0.31       100
weighted avg       0.41      0.46      0.43       100

[0.51485149 0.56435644 0.6039604  0.64356436 0.45544554 0.43564356
 0.45544554 0.51       0.53       0.46      ]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.55      0.51      0.53        47
           1       0.49      0.60      0.54        45
           2       0.00      0.00      0.00         9

    accuracy                           0.50       101
   macro avg       0.35      0.37      0.36       101
weighted avg       0.47      0.50      0.49       101

              precision    recall  f1-score   support

           0       0.59      0.62      0.60        47
           1       0.52      0.60      0.56        45
           2       0.00      0.00      0.00         9

    accuracy                           0.55       101
   macro avg       0.37      0.41      0.39       101
weighted avg       0.51      0.55      0.53       101

              precision    recall  f1-score   support

           0       0.58      0.79      0.67        47
           1       0.54      0.44      0.49        45
           2       0.00      0.00      0.00         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [33]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.45      0.53      0.49        47
           1       0.43      0.44      0.44        45
           2       0.00      0.00      0.00         9

    accuracy                           0.45       101
   macro avg       0.30      0.33      0.31       101
weighted avg       0.41      0.45      0.42       101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      0.60      0.53        47
           1       0.51      0.47      0.49        45
           2       0.00      0.00      0.00         9

    accuracy                           0.49       101
   macro avg       0.33      0.35      0.34       101
weighted avg       0.45      0.49      0.46       101

              precision    recall  f1-score   support

           0       0.48      0.55      0.51        47
           1       0.45      0.47      0.46        45
           2       0.00      0.00      0.00         9

    accuracy                           0.47       101
   macro avg       0.31      0.34      0.32       101
weighted avg       0.42      0.47      0.44       101

              precision    recall  f1-score   support

           0       0.43      0.48      0.46        48
           1       0.43      0.44      0.43        45
           2       0.00      0.00      0.00         8

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.49      0.36      0.41        47
           1       0.44      0.58      0.50        45
           2       0.14      0.11      0.12         9

    accuracy                           0.44       101
   macro avg       0.36      0.35      0.35       101
weighted avg       0.44      0.44      0.43       101

              precision    recall  f1-score   support

           0       0.51      0.55      0.53        47
           1       0.51      0.51      0.51        45
           2       0.00      0.00      0.00         9

    accuracy                           0.49       101
   macro avg       0.34      0.35      0.35       101
weighted avg       0.46      0.49      0.47       101

              precision    recall  f1-score   support

           0       0.51      0.55      0.53        47
           1       0.39      0.33      0.36        45
           2       0.08      0.11      0.10         9

    accuracy        

In [35]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.55      0.70      0.62        47
           1       0.54      0.47      0.50        45
           2       0.00      0.00      0.00         9

    accuracy                           0.53       101
   macro avg       0.36      0.39      0.37       101
weighted avg       0.50      0.53      0.51       101

              precision    recall  f1-score   support

           0       0.59      0.68      0.63        47
           1       0.50      0.44      0.47        45
           2       0.00      0.00      0.00         9

    accuracy                           0.51       101
   macro avg       0.36      0.38      0.37       101
weighted avg       0.50      0.51      0.50       101

              precision    recall  f1-score   support

           0       0.53      0.85      0.65        47
           1       0.50      0.22      0.31        45
           2       0.00      0.00      0.00         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# Support Vector Machines
svm_model = SVC(gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.46      0.47      0.46        47
           1       0.43      0.51      0.47        45
           2       0.00      0.00      0.00         9

    accuracy                           0.45       101
   macro avg       0.30      0.33      0.31       101
weighted avg       0.41      0.45      0.42       101

              precision    recall  f1-score   support

           0       0.47      0.53      0.50        47
           1       0.52      0.56      0.54        45
           2       0.00      0.00      0.00         9

    accuracy                           0.50       101
   macro avg       0.33      0.36      0.35       101
weighted avg       0.45      0.50      0.47       101

              precision    recall  f1-score   support

           0       0.50      0.55      0.53        47
           1       0.47      0.51      0.49        45
           2       0.00      0.00      0.00         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      0.54      0.50        48
           1       0.48      0.49      0.48        45
           2       0.00      0.00      0.00         8

    accuracy                           0.48       101
   macro avg       0.32      0.34      0.33       101
weighted avg       0.44      0.48      0.46       101

              precision    recall  f1-score   support

           0       0.46      0.56      0.50        48
           1       0.43      0.40      0.41        45
           2       0.00      0.00      0.00         8

    accuracy                           0.45       101
   macro avg       0.30      0.32      0.31       101
weighted avg       0.41      0.45      0.42       101

              precision    recall  f1-score   support

           0       0.42      0.57      0.49        47
           1       0.38      0.30      0.34        46
           2       0.00      0.00      0.00         8

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.48      0.60      0.53        47
           1       0.47      0.43      0.45        46
           2       0.00      0.00      0.00         8

    accuracy                           0.48       101
   macro avg       0.32      0.34      0.33       101
weighted avg       0.44      0.48      0.45       101

              precision    recall  f1-score   support

           0       0.49      0.57      0.53        47
           1       0.43      0.42      0.43        45
           2       0.00      0.00      0.00         8

    accuracy                           0.46       100
   macro avg       0.31      0.33      0.32       100
weighted avg       0.43      0.46      0.44       100

              precision    recall  f1-score   support

           0       0.51      0.70      0.59        47
           1       0.46      0.36      0.40        45
           2       0.00      0.00      0.00         8

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.36      0.30      0.33        47
           1       0.37      0.49      0.42        45
           2       0.00      0.00      0.00         9

    accuracy                           0.36       101
   macro avg       0.24      0.26      0.25       101
weighted avg       0.33      0.36      0.34       101

              precision    recall  f1-score   support

           0       0.43      0.43      0.43        47
           1       0.41      0.47      0.44        45
           2       0.00      0.00      0.00         9

    accuracy                           0.41       101
   macro avg       0.28      0.30      0.29       101
weighted avg       0.38      0.41      0.39       101

              precision    recall  f1-score   support

           0       0.55      0.60      0.57        47
           1       0.49      0.53      0.51        45
           2       0.00      0.00      0.00         9

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.53      0.58      0.55        48
           1       0.50      0.53      0.52        45
           2       0.00      0.00      0.00         8

    accuracy                           0.51       101
   macro avg       0.34      0.37      0.36       101
weighted avg       0.47      0.51      0.49       101

              precision    recall  f1-score   support

           0       0.43      0.50      0.46        48
           1       0.34      0.33      0.34        45
           2       0.00      0.00      0.00         8

    accuracy                           0.39       101
   macro avg       0.26      0.28      0.27       101
weighted avg       0.36      0.39      0.37       101

              precision    recall  f1-score   support

           0       0.49      0.55      0.52        47
           1       0.46      0.46      0.46        46
           2       0.00      0.00      0.00         8

    accuracy        

In [38]:
# Calculating accuracy metrics for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Accuracy Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Accuracy Metrics for Logistic Regression:

0.54455 

[[70 28  0]
 [45 40  0]
 [ 8 11  0]] 

              precision    recall  f1-score   support

           0       0.57      0.71      0.63        98
           1       0.51      0.47      0.49        85
           2       0.00      0.00      0.00        19

    accuracy                           0.54       202
   macro avg       0.36      0.39      0.37       202
weighted avg       0.49      0.54      0.51       202



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Calculating accuracy metrics for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Accuracy Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Accuracy Metrics for LDA:

0.56436 

[[72 26  0]
 [44 41  0]
 [ 8 10  1]] 

              precision    recall  f1-score   support

           0       0.58      0.73      0.65        98
           1       0.53      0.48      0.51        85
           2       1.00      0.05      0.10        19

    accuracy                           0.56       202
   macro avg       0.70      0.42      0.42       202
weighted avg       0.60      0.56      0.54       202



In [40]:
# Calculating accuracy metrics for KNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Accuracy Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Accuracy Metrics for KNN:

0.4604 

[[57 41  0]
 [48 36  1]
 [13  6  0]] 

              precision    recall  f1-score   support

           0       0.48      0.58      0.53        98
           1       0.43      0.42      0.43        85
           2       0.00      0.00      0.00        19

    accuracy                           0.46       202
   macro avg       0.31      0.34      0.32       202
weighted avg       0.42      0.46      0.44       202



In [41]:
# Calculating accuracy metrics for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Accuracy Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Accuracy Metrics for CART:

0.41584 

[[46 43  9]
 [43 34  8]
 [ 8  7  4]] 

              precision    recall  f1-score   support

           0       0.47      0.47      0.47        98
           1       0.40      0.40      0.40        85
           2       0.19      0.21      0.20        19

    accuracy                           0.42       202
   macro avg       0.36      0.36      0.36       202
weighted avg       0.42      0.42      0.42       202



In [42]:
# Calculating accuracy metrics for Gaussian Naive Bayes
gnb_model.fit(X_train, y_train)
gnb_pred = gnb_model.predict(X_test)

print('Accuracy Metrics for Gaussian Naive Bayes:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Accuracy Metrics for Gaussian Naive Bayes:

0.55941 

[[82 16  0]
 [55 30  0]
 [ 8 10  1]] 

              precision    recall  f1-score   support

           0       0.57      0.84      0.67        98
           1       0.54      0.35      0.43        85
           2       1.00      0.05      0.10        19

    accuracy                           0.56       202
   macro avg       0.70      0.41      0.40       202
weighted avg       0.59      0.56      0.52       202



In [43]:
# Calculating accuracy metrics for SVM
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Accuracy Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Accuracy Metrics for SVM:

0.4505 

[[54 44  0]
 [48 37  0]
 [10  9  0]] 

              precision    recall  f1-score   support

           0       0.48      0.55      0.51        98
           1       0.41      0.44      0.42        85
           2       0.00      0.00      0.00        19

    accuracy                           0.45       202
   macro avg       0.30      0.33      0.31       202
weighted avg       0.41      0.45      0.43       202



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# Calculating accuracy metrics for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Accuracy Metrics for Random Forest:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Accuracy Metrics for Random Forest:

0.43564 

[[52 45  1]
 [48 35  2]
 [10  8  1]] 

              precision    recall  f1-score   support

           0       0.47      0.53      0.50        98
           1       0.40      0.41      0.40        85
           2       0.25      0.05      0.09        19

    accuracy                           0.44       202
   macro avg       0.37      0.33      0.33       202
weighted avg       0.42      0.44      0.42       202

