In [307]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from parse import preprocess

In [308]:
# Function to show classification report for Cross Validation
def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [309]:
# Pre-parse the dataset
data = preprocess("rawfile_blood.csv")

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1015


In [310]:
# Initialise counters for each condition
frail = 0
frail_mci = 0
mci = 0
prefrail_mci = 0
prefrail = 0
robust = 0

# Count rows of data for each condition
for i in range(0, len(data)):
	if data.at[i, 'condition'] == 'frail':
		frail += 1
	elif data.at[i, 'condition'] == 'frail_mci':
		frail_mci += 1
	elif data.at[i, 'condition'] == 'mci':
		mci += 1
	elif data.at[i, 'condition'] == 'prefrail_mci':
		prefrail_mci += 1
	elif data.at[i, 'condition'] == 'prefrail':
		prefrail += 1
	elif data.at[i, 'condition'] == 'robust':
		robust += 1
        
# Display number of rows (frequency) for each condition (label)
print("\n####################################################################")
print("Labels with frequencies:")
print("Frail:", frail)
print("Frail + MCI:", frail_mci)
print("MCI:", mci)
print("Prefrail + MCI:", prefrail_mci)
print("Prefrail:", prefrail)
print("Robust:", robust)


####################################################################
Labels with frequencies:
Frail: 7
Frail + MCI: 76
MCI: 133
Prefrail + MCI: 233
Prefrail: 223
Robust: 343


In [311]:
data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,frail,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,frail,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,frail,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,frail,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [312]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [313]:
c = data['condition'].value_counts()
condition = c.index
c

robust          343
prefrail_mci    233
prefrail        223
mci             133
frail_mci        76
frail             7
Name: condition, dtype: int64

In [314]:
# for i in range(0, len(data)):
# 	if data.at[i, 'condition'] == 'frail':
# 		data.at[i, 'condition'] = 'non-robust'
# 	elif data.at[i, 'condition'] == 'frail_mci':
# 		data.at[i, 'condition'] = 'non-robust'
# 	elif data.at[i, 'condition'] == 'mci':
# 		data.at[i, 'condition'] = 'non-robust'
# 	elif data.at[i, 'condition'] == 'prefrail_mci':
# 		data.at[i, 'condition'] = 'non-robust'
# 	elif data.at[i, 'condition'] == 'prefrail':
# 		data.at[i, 'condition'] = 'non-robust'
# 	elif data.at[i, 'condition'] == 'robust':
# 		data.at[i, 'condition'] = 'robust'

# df1 = data[data.condition == 'frail_mci']
# df1 = df1.reset_index(drop=True)

# df2 = data[data.condition == 'robust']
# df2 = df2.reset_index(drop=True)

# data = pd.concat([df1, df2], ignore_index=True)

In [315]:
data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,frail,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,frail,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,frail,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,frail,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [316]:
data.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
1010,MV00454,robust,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
1011,MV00456,robust,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
1012,MV00460,robust,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
1013,MV00502,robust,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
1014,MV00510,robust,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [317]:
c = data['condition'].value_counts()
condition = c.index
c

robust          343
prefrail_mci    233
prefrail        223
mci             133
frail_mci        76
frail             7
Name: condition, dtype: int64

In [318]:
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)

data.head(3)

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,5,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,5,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,5,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4


In [319]:
data.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
1010,MV00454,0,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
1011,MV00456,0,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
1012,MV00460,0,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
1013,MV00502,0,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
1014,MV00510,0,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [320]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [321]:
y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

In [322]:
X = X_old
X = StandardScaler().fit_transform(X_old)
X = MinMaxScaler().fit_transform(X_old)

In [323]:
# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 343, 1: 233, 2: 223, 3: 133, 4: 76, 5: 7})


In [324]:
# Undersample the majority class
# Define undersample strategy

# 75% of majority class
# sampling_strategy = {0: 254, 1: 231, 2: 221, 3: 133, 4: 76, 5: 7}

# 50% of majority class
# sampling_strategy = {0: 170, 1: 170, 2: 170, 3: 133, 4: 76, 5: 7}

# 25% of majority class
# sampling_strategy = {0: 85, 1: 85, 2: 85, 3: 85, 4: 76, 5: 7}

# sampling_strategy = {0: 76, 1: 76}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# 50% of majority class
# undersample = RandomUnderSampler(sampling_strategy=0.0413)

# 25% of majority class
# undersample = RandomUnderSampler(sampling_strategy=0.0826)

# X, y = undersample.fit_resample(X, y)

In [325]:
# Summarise the new class distribution
counter = Counter(y)
print(counter) 

Counter({0: 343, 1: 233, 2: 223, 3: 133, 4: 76, 5: 7})


In [326]:
y.shape

(1015,)

In [327]:
X.shape

(1015, 45)

In [328]:
# Transform the dataset using SMOTE
# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)

In [329]:
# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 343, 1: 233, 2: 223, 3: 133, 4: 76, 5: 7})


In [330]:
y.shape

(1015,)

In [331]:
X.shape

(1015, 45)

In [332]:
# Test 1: Using the entire dataset as both the train and test sets without splitting into separate train and test sets

In [333]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X, y)
print("Logistic Regression:", log_model.score(X, y).round(3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
print("Linear Discriminant Analysis:", lda_model.score(X, y).round(3))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
print("K-Nearest Neigbors:", knn_model.score(X, y).round(3))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
print("Classification and Regression Trees:", cart_model.score(X, y).round(3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X, y)
print("Gaussian Naive Bayes:", gnb_model.score(X, y).round(3))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
print("Support Vector Machines:", svm_model.score(X, y).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
print("Random Forest Classifier:", rfc_model.score(X, y).round(3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.427
Linear Discriminant Analysis: 0.448
K-Nearest Neigbors: 0.511
Classification and Regression Trees: 1.0
Gaussian Naive Bayes: 0.391
Support Vector Machines: 0.41
Random Forest Classifier: 1.0


In [334]:
# Test 2: Splitting the dataset into separate train and test sets

In [335]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

Logistic Regression: 0.382
Linear Discriminant Analysis: 0.357
K-Nearest Neigbors: 0.335


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classification and Regression Trees: 0.241
Gaussian Naive Bayes: 0.369
Support Vector Machines: 0.382
Random Forest Classifier: 0.387


In [336]:
# Test 3: 5-fold Cross Validation

In [337]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=5)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=5)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Logistic Regression: 0.37 accuracy with a standard deviation of 0.04
Linear Discriminant Analysis: 0.36 accuracy with a standard deviation of 0.03
K-Nearest Neighbors: 0.31 accuracy with a standard deviation of 0.02
Classification and Regression Trees: 0.27 accuracy with a standard deviation of 0.03
Gaussian Naive Bayes: 0.33 accuracy with a standard deviation of 0.03
Support Vector Machines: 0.37 accuracy with a standard deviation of 0.03
Random Forest Classifier: 0.35 accuracy with a standard deviation of 0.02


In [338]:
# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.42      0.90      0.57        68
           1       0.41      0.37      0.39        46
           2       0.44      0.16      0.23        45
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.42       203
   macro avg       0.21      0.24      0.20       203
weighted avg       0.33      0.42      0.33       203



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(res

              precision    recall  f1-score   support

           0       0.40      0.69      0.51        68
           1       0.34      0.43      0.38        47
           2       0.17      0.09      0.12        45
           3       0.50      0.04      0.07        26
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.35       203
   macro avg       0.23      0.21      0.18       203
weighted avg       0.31      0.35      0.29       203

              precision    recall  f1-score   support

           0       0.43      0.77      0.55        69
           1       0.37      0.40      0.39        47
           2       0.21      0.11      0.15        44
           3       0.17      0.04      0.06        26
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00         1

    accuracy                           0.38       203
   macro avg       0.20

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regre

              precision    recall  f1-score   support

           0       0.42      0.86      0.56        69
           1       0.33      0.32      0.32        47
           2       0.20      0.07      0.10        44
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         1

    accuracy                           0.38       203
   macro avg       0.16      0.21      0.16       203
weighted avg       0.26      0.38      0.29       203

              precision    recall  f1-score   support

           0       0.36      0.72      0.49        69
           1       0.20      0.22      0.21        46
           2       0.15      0.04      0.07        45
           3       0.33      0.04      0.07        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         1

    accuracy                           0.31       203
   macro avg       0.18

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [339]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.43      0.79      0.56        68
           1       0.46      0.46      0.46        46
           2       0.32      0.16      0.21        45
           3       0.20      0.04      0.06        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.41       203
   macro avg       0.23      0.24      0.21       203
weighted avg       0.35      0.41      0.35       203

              precision    recall  f1-score   support

           0       0.41      0.60      0.49        68
           1       0.42      0.45      0.43        47
           2       0.12      0.09      0.10        45
           3       0.11      0.04      0.06        26
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.33       203
   macro avg       0.18

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [340]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.36      0.71      0.48        68
           1       0.39      0.30      0.34        46
           2       0.20      0.09      0.12        45
           3       0.07      0.04      0.05        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.33       203
   macro avg       0.17      0.19      0.17       203
weighted avg       0.26      0.33      0.27       203

              precision    recall  f1-score   support

           0       0.36      0.59      0.44        68
           1       0.18      0.17      0.18        47
           2       0.21      0.16      0.18        45
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.27       203
   macro avg       0.13

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [341]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.41      0.50      0.45        68
           1       0.34      0.35      0.34        46
           2       0.24      0.18      0.20        45
           3       0.10      0.11      0.11        27
           4       0.33      0.20      0.25        15
           5       0.00      0.00      0.00         2

    accuracy                           0.32       203
   macro avg       0.24      0.22      0.23       203
weighted avg       0.31      0.32      0.31       203

              precision    recall  f1-score   support

           0       0.39      0.29      0.34        68
           1       0.26      0.28      0.27        47
           2       0.21      0.22      0.22        45
           3       0.07      0.08      0.07        26
           4       0.12      0.20      0.15        15
           5       0.00      0.00      0.00         2

    accuracy                           0.24       203
   macro avg       0.18

In [342]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.41      0.66      0.51        68
           1       0.41      0.24      0.30        46
           2       0.33      0.16      0.21        45
           3       0.24      0.30      0.26        27
           4       0.20      0.13      0.16        15
           5       0.00      0.00      0.00         2

    accuracy                           0.36       203
   macro avg       0.26      0.25      0.24       203
weighted avg       0.35      0.36      0.33       203

              precision    recall  f1-score   support

           0       0.37      0.50      0.42        68
           1       0.32      0.21      0.26        47
           2       0.11      0.04      0.06        45
           3       0.18      0.27      0.22        26
           4       0.19      0.27      0.22        15
           5       0.00      0.00      0.00         2

    accuracy                           0.28       203
   macro avg       0.20

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [343]:
# Support Vector Machines
svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.39      0.94      0.55        68
           1       0.46      0.35      0.40        46
           2       0.00      0.00      0.00        45
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.39       203
   macro avg       0.14      0.21      0.16       203
weighted avg       0.23      0.39      0.27       203

              precision    recall  f1-score   support

           0       0.41      0.79      0.54        68
           1       0.34      0.43      0.38        47
           2       0.08      0.02      0.04        45
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.37       203
   macro avg       0.14

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [344]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.40      0.82      0.54        68
           1       0.33      0.28      0.31        46
           2       0.25      0.13      0.17        45
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.37       203
   macro avg       0.16      0.21      0.17       203
weighted avg       0.26      0.37      0.29       203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.41      0.66      0.50        68
           1       0.27      0.38      0.32        47
           2       0.18      0.09      0.12        45
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         2

    accuracy                           0.33       203
   macro avg       0.14      0.19      0.16       203
weighted avg       0.24      0.33      0.27       203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.40      0.67      0.50        69
           1       0.31      0.40      0.35        47
           2       0.20      0.11      0.14        44
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00         1

    accuracy                           0.34       203
   macro avg       0.15      0.20      0.17       203
weighted avg       0.25      0.34      0.28       203



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.44      0.86      0.58        69
           1       0.32      0.28      0.30        47
           2       0.25      0.14      0.18        44
           3       0.00      0.00      0.00        27
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         1

    accuracy                           0.38       203
   macro avg       0.17      0.21      0.18       203
weighted avg       0.28      0.38      0.30       203

              precision    recall  f1-score   support

           0       0.35      0.67      0.46        69
           1       0.34      0.37      0.35        46
           2       0.00      0.00      0.00        45
           3       0.00      0.00      0.00        27
           4       0.67      0.13      0.22        15
           5       0.00      0.00      0.00         1

    accuracy                           0.32       203
   macro avg       0.23

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [345]:
# Calculating Performance Metrics for Holdout

In [346]:
# Calculating for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Performance Metrics for Logistic Regression:

0.38177 

[[108  19  20   0   0   0]
 [ 45  32  12   0   0   0]
 [ 47  18  15   0   0   0]
 [ 26  14  11   0   0   0]
 [ 12  19   3   0   0   0]
 [  3   2   0   0   0   0]] 

              precision    recall  f1-score   support

           0       0.45      0.73      0.56       147
           1       0.31      0.36      0.33        89
           2       0.25      0.19      0.21        80
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00         5

    accuracy                           0.38       406
   macro avg       0.17      0.21      0.18       406
weighted avg       0.28      0.38      0.32       406



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [347]:
# Calculating for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Performance Metrics for LDA:

0.35714 

[[92 25 26  3  1  0]
 [32 32 17  5  2  1]
 [39 20 17  1  1  2]
 [21 15 10  3  1  1]
 [ 8 17  5  3  1  0]
 [ 3  2  0  0  0  0]] 

              precision    recall  f1-score   support

           0       0.47      0.63      0.54       147
           1       0.29      0.36      0.32        89
           2       0.23      0.21      0.22        80
           3       0.20      0.06      0.09        51
           4       0.17      0.03      0.05        34
           5       0.00      0.00      0.00         5

    accuracy                           0.36       406
   macro avg       0.23      0.21      0.20       406
weighted avg       0.32      0.36      0.32       406



In [348]:
# Calculating for kNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Performance Metrics for KNN:

0.33498 

[[102  20  21   4   0   0]
 [ 47  25  15   1   1   0]
 [ 47  22   9   2   0   0]
 [ 35  12   4   0   0   0]
 [ 17   9   4   4   0   0]
 [  3   1   0   1   0   0]] 

              precision    recall  f1-score   support

           0       0.41      0.69      0.51       147
           1       0.28      0.28      0.28        89
           2       0.17      0.11      0.14        80
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00         5

    accuracy                           0.33       406
   macro avg       0.14      0.18      0.15       406
weighted avg       0.24      0.33      0.27       406



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [349]:
# Calculating for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Performance Metrics for CART:

0.24631 

[[54 31 40 16  6  0]
 [28 18 25 11  5  2]
 [32 15 20 10  2  1]
 [20 12 13  5  1  0]
 [13  9  6  3  3  0]
 [ 0  0  3  2  0  0]] 

              precision    recall  f1-score   support

           0       0.37      0.37      0.37       147
           1       0.21      0.20      0.21        89
           2       0.19      0.25      0.21        80
           3       0.11      0.10      0.10        51
           4       0.18      0.09      0.12        34
           5       0.00      0.00      0.00         5

    accuracy                           0.25       406
   macro avg       0.17      0.17      0.17       406
weighted avg       0.24      0.25      0.24       406



In [350]:
# Calculating for GNB
gnb_model.fit(X_train, y_train)
gnb_pred = log_model.predict(X_test)

print('Performance Metrics for GNB:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Performance Metrics for GNB:

0.38177 

[[108  19  20   0   0   0]
 [ 45  32  12   0   0   0]
 [ 47  18  15   0   0   0]
 [ 26  14  11   0   0   0]
 [ 12  19   3   0   0   0]
 [  3   2   0   0   0   0]] 

              precision    recall  f1-score   support

           0       0.45      0.73      0.56       147
           1       0.31      0.36      0.33        89
           2       0.25      0.19      0.21        80
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00         5

    accuracy                           0.38       406
   macro avg       0.17      0.21      0.18       406
weighted avg       0.28      0.38      0.32       406



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [351]:
# Calculating for Support Vector Machine
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Performance Metrics for SVM:

0.38177 

[[116  19  12   0   0   0]
 [ 55  27   7   0   0   0]
 [ 52  16  12   0   0   0]
 [ 36  12   3   0   0   0]
 [ 15  16   3   0   0   0]
 [  3   2   0   0   0   0]] 

              precision    recall  f1-score   support

           0       0.42      0.79      0.55       147
           1       0.29      0.30      0.30        89
           2       0.32      0.15      0.21        80
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00         5

    accuracy                           0.38       406
   macro avg       0.17      0.21      0.18       406
weighted avg       0.28      0.38      0.30       406



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [352]:
# Calculating for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Performance Metrics for RFC:

0.36207 

[[106  21  20   0   0   0]
 [ 46  28  13   2   0   0]
 [ 47  20  13   0   0   0]
 [ 31  12   8   0   0   0]
 [ 14  17   2   1   0   0]
 [  1   2   2   0   0   0]] 

              precision    recall  f1-score   support

           0       0.43      0.72      0.54       147
           1       0.28      0.31      0.30        89
           2       0.22      0.16      0.19        80
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00         5

    accuracy                           0.36       406
   macro avg       0.16      0.20      0.17       406
weighted avg       0.26      0.36      0.30       406



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [353]:
# Using Robust vs Non-Robust

In [354]:
# Pre-parse the dataset
data1 = preprocess("rawfile_blood.csv")

for i in range(0, len(data1)):
	if data1.at[i, 'condition'] == 'frail':
		data1.at[i, 'condition'] = 'non-robust'
	elif data1.at[i, 'condition'] == 'frail_mci':
		data1.at[i, 'condition'] = 'non-robust'
	elif data1.at[i, 'condition'] == 'mci':
		data1.at[i, 'condition'] = 'non-robust'
	elif data1.at[i, 'condition'] == 'prefrail_mci':
		data1.at[i, 'condition'] = 'non-robust'
	elif data1.at[i, 'condition'] == 'prefrail':
		data1.at[i, 'condition'] = 'non-robust'
	elif data1.at[i, 'condition'] == 'robust':
		data1.at[i, 'condition'] = 'robust'

# df1 = data1[data1.condition == 'frail_mci']
# df1 = df1.reset_index(drop=True)

# df2 = data1[data1.condition == 'robust']
# df2 = df2.reset_index(drop=True)

# data = pd.concat([df1, df2], ignore_index=True)

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1015


In [355]:
c = data1['condition'].value_counts()
condition = c.index
c

non-robust    672
robust        343
Name: condition, dtype: int64

In [356]:
for i in range(len(condition)):
    data1['condition'].replace(condition[i], i, inplace = True)

data1.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,0,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,0,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,0,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,0,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,0,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [357]:
data1.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
1010,MV00454,1,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
1011,MV00456,1,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
1012,MV00460,1,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
1013,MV00502,1,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
1014,MV00510,1,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [358]:
y = data1['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data1[features]

X = X_old
X = StandardScaler().fit_transform(X_old)
X = MinMaxScaler().fit_transform(X_old)

In [359]:
Counter(y)

Counter({0: 672, 1: 343})

In [360]:
# Undersample to balance the dataset

# Set Sampling Strategy
sampling_strategy = {0: 343, 1: 343}
undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# Undersample
X, y = undersample.fit_resample(X, y)

In [361]:
Counter(y)

Counter({0: 343, 1: 343})

In [362]:
# HOLDOUT METHOD:

In [363]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

Logistic Regression: 0.604
Linear Discriminant Analysis: 0.629
K-Nearest Neigbors: 0.502
Classification and Regression Trees: 0.545
Gaussian Naive Bayes: 0.596
Support Vector Machines: 0.607
Random Forest Classifier: 0.647


In [364]:
# CROSS VALIDATION

In [365]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=5)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=5)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Logistic Regression: 0.58 accuracy with a standard deviation of 0.04
Linear Discriminant Analysis: 0.60 accuracy with a standard deviation of 0.05
K-Nearest Neighbors: 0.57 accuracy with a standard deviation of 0.03
Classification and Regression Trees: 0.56 accuracy with a standard deviation of 0.03
Gaussian Naive Bayes: 0.57 accuracy with a standard deviation of 0.02
Support Vector Machines: 0.60 accuracy with a standard deviation of 0.02
Random Forest Classifier: 0.59 accuracy with a standard deviation of 0.01


In [366]:
# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.62      0.55      0.58        69
           1       0.60      0.67      0.63        69

    accuracy                           0.61       138
   macro avg       0.61      0.61      0.61       138
weighted avg       0.61      0.61      0.61       138

              precision    recall  f1-score   support

           0       0.60      0.52      0.56        69
           1       0.57      0.65      0.61        68

    accuracy                           0.58       137
   macro avg       0.59      0.58      0.58       137
weighted avg       0.59      0.58      0.58       137

              precision    recall  f1-score   support

           0       0.53      0.42      0.47        69
           1       0.51      0.62      0.56        68

    accuracy                           0.52       137
   macro avg       0.52      0.52      0.51       137
weighted avg       0.52      0.52      0.51       137

              preci

In [367]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.70      0.58      0.63        69
           1       0.64      0.75      0.69        69

    accuracy                           0.67       138
   macro avg       0.67      0.67      0.66       138
weighted avg       0.67      0.67      0.66       138

              precision    recall  f1-score   support

           0       0.65      0.62      0.64        69
           1       0.63      0.66      0.65        68

    accuracy                           0.64       137
   macro avg       0.64      0.64      0.64       137
weighted avg       0.64      0.64      0.64       137

              precision    recall  f1-score   support

           0       0.57      0.48      0.52        69
           1       0.54      0.63      0.59        68

    accuracy                           0.55       137
   macro avg       0.56      0.56      0.55       137
weighted avg       0.56      0.55      0.55       137

              preci

In [368]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.59      0.48      0.53        69
           1       0.56      0.67      0.61        69

    accuracy                           0.57       138
   macro avg       0.58      0.57      0.57       138
weighted avg       0.58      0.57      0.57       138

              precision    recall  f1-score   support

           0       0.54      0.39      0.45        69
           1       0.52      0.66      0.58        68

    accuracy                           0.53       137
   macro avg       0.53      0.53      0.52       137
weighted avg       0.53      0.53      0.52       137

              precision    recall  f1-score   support

           0       0.59      0.54      0.56        69
           1       0.57      0.62      0.59        68

    accuracy                           0.58       137
   macro avg       0.58      0.58      0.58       137
weighted avg       0.58      0.58      0.58       137

              preci

In [369]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.51      0.54      0.52        69
           1       0.51      0.48      0.49        69

    accuracy                           0.51       138
   macro avg       0.51      0.51      0.51       138
weighted avg       0.51      0.51      0.51       138

              precision    recall  f1-score   support

           0       0.53      0.45      0.48        69
           1       0.51      0.59      0.55        68

    accuracy                           0.52       137
   macro avg       0.52      0.52      0.52       137
weighted avg       0.52      0.52      0.52       137

              precision    recall  f1-score   support

           0       0.59      0.55      0.57        69
           1       0.58      0.62      0.60        68

    accuracy                           0.58       137
   macro avg       0.58      0.58      0.58       137
weighted avg       0.58      0.58      0.58       137

              preci

In [370]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.70      0.30      0.42        69
           1       0.56      0.87      0.68        69

    accuracy                           0.59       138
   macro avg       0.63      0.59      0.55       138
weighted avg       0.63      0.59      0.55       138

              precision    recall  f1-score   support

           0       0.69      0.35      0.46        69
           1       0.56      0.84      0.67        68

    accuracy                           0.59       137
   macro avg       0.62      0.59      0.57       137
weighted avg       0.62      0.59      0.57       137

              precision    recall  f1-score   support

           0       0.65      0.32      0.43        69
           1       0.54      0.82      0.65        68

    accuracy                           0.57       137
   macro avg       0.60      0.57      0.54       137
weighted avg       0.60      0.57      0.54       137

              preci

In [371]:
# Support Vector Machines
svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.62      0.51      0.56        69
           1       0.59      0.70      0.64        69

    accuracy                           0.60       138
   macro avg       0.61      0.60      0.60       138
weighted avg       0.61      0.60      0.60       138

              precision    recall  f1-score   support

           0       0.64      0.46      0.54        69
           1       0.57      0.74      0.65        68

    accuracy                           0.60       137
   macro avg       0.61      0.60      0.59       137
weighted avg       0.61      0.60      0.59       137

              precision    recall  f1-score   support

           0       0.62      0.41      0.49        69
           1       0.55      0.75      0.64        68

    accuracy                           0.58       137
   macro avg       0.59      0.58      0.56       137
weighted avg       0.59      0.58      0.56       137

              preci

In [372]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.59      0.59      0.59        69
           1       0.59      0.58      0.58        69

    accuracy                           0.59       138
   macro avg       0.59      0.59      0.59       138
weighted avg       0.59      0.59      0.59       138

              precision    recall  f1-score   support

           0       0.63      0.64      0.63        69
           1       0.63      0.62      0.62        68

    accuracy                           0.63       137
   macro avg       0.63      0.63      0.63       137
weighted avg       0.63      0.63      0.63       137

              precision    recall  f1-score   support

           0       0.62      0.62      0.62        69
           1       0.62      0.62      0.62        68

    accuracy                           0.62       137
   macro avg       0.62      0.62      0.62       137
weighted avg       0.62      0.62      0.62       137

              preci

In [373]:
# Calculating Performance Metrics for Holdout

In [374]:
# Calculating for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Performance Metrics for Logistic Regression:

0.60364 

[[78 50]
 [59 88]] 

              precision    recall  f1-score   support

           0       0.57      0.61      0.59       128
           1       0.64      0.60      0.62       147

    accuracy                           0.60       275
   macro avg       0.60      0.60      0.60       275
weighted avg       0.61      0.60      0.60       275



In [375]:
# Calculating for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Performance Metrics for LDA:

0.62909 

[[84 44]
 [58 89]] 

              precision    recall  f1-score   support

           0       0.59      0.66      0.62       128
           1       0.67      0.61      0.64       147

    accuracy                           0.63       275
   macro avg       0.63      0.63      0.63       275
weighted avg       0.63      0.63      0.63       275



In [376]:
# Calculating for kNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Performance Metrics for KNN:

0.50182 

[[49 79]
 [58 89]] 

              precision    recall  f1-score   support

           0       0.46      0.38      0.42       128
           1       0.53      0.61      0.57       147

    accuracy                           0.50       275
   macro avg       0.49      0.49      0.49       275
weighted avg       0.50      0.50      0.50       275



In [377]:
# Calculating for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Performance Metrics for CART:

0.54909 

[[63 65]
 [59 88]] 

              precision    recall  f1-score   support

           0       0.52      0.49      0.50       128
           1       0.58      0.60      0.59       147

    accuracy                           0.55       275
   macro avg       0.55      0.55      0.55       275
weighted avg       0.55      0.55      0.55       275



In [378]:
# Calculating for GNB
gnb_model.fit(X_train, y_train)
gnb_pred = log_model.predict(X_test)

print('Performance Metrics for GNB:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Performance Metrics for GNB:

0.60364 

[[78 50]
 [59 88]] 

              precision    recall  f1-score   support

           0       0.57      0.61      0.59       128
           1       0.64      0.60      0.62       147

    accuracy                           0.60       275
   macro avg       0.60      0.60      0.60       275
weighted avg       0.61      0.60      0.60       275



In [379]:
# Calculating for Support Vector Machine
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Performance Metrics for SVM:

0.60727 

[[68 60]
 [48 99]] 

              precision    recall  f1-score   support

           0       0.59      0.53      0.56       128
           1       0.62      0.67      0.65       147

    accuracy                           0.61       275
   macro avg       0.60      0.60      0.60       275
weighted avg       0.61      0.61      0.61       275



In [380]:
# Calculating for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Performance Metrics for RFC:

0.64364 

[[85 43]
 [55 92]] 

              precision    recall  f1-score   support

           0       0.61      0.66      0.63       128
           1       0.68      0.63      0.65       147

    accuracy                           0.64       275
   macro avg       0.64      0.64      0.64       275
weighted avg       0.65      0.64      0.64       275



In [381]:
# Using Robust vs Frail + MCI

In [382]:
# Pre-parse the dataset
data2 = preprocess("rawfile_blood.csv")

# for i in range(0, len(data1)):
# 	if data1.at[i, 'condition'] == 'frail':
# 		data1.at[i, 'condition'] = 'non-robust'
# 	elif data1.at[i, 'condition'] == 'frail_mci':
# 		data1.at[i, 'condition'] = 'non-robust'
# 	elif data1.at[i, 'condition'] == 'mci':
# 		data1.at[i, 'condition'] = 'non-robust'
# 	elif data1.at[i, 'condition'] == 'prefrail_mci':
# 		data1.at[i, 'condition'] = 'non-robust'
# 	elif data1.at[i, 'condition'] == 'prefrail':
# 		data1.at[i, 'condition'] = 'non-robust'
# 	elif data1.at[i, 'condition'] == 'robust':
# 		data1.at[i, 'condition'] = 'robust'

df1 = data2[data2.condition == 'frail_mci']
df1 = df1.reset_index(drop=True)

df2 = data2[data2.condition == 'robust']
df2 = df2.reset_index(drop=True)

data2 = pd.concat([df1, df2], ignore_index=True)

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1015


In [383]:
c = data2['condition'].value_counts()
condition = c.index
c

robust       343
frail_mci     76
Name: condition, dtype: int64

In [384]:
for i in range(len(condition)):
    data2['condition'].replace(condition[i], i, inplace = True)

data2.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME01378,1,241,20,33.5,150,5.25,0.46,87,29,...,10,21,22,17,1.3,7.0,1.01,0.69,4.7,5.9
1,ME02832,1,444,16,87.0,134,4.65,0.4,85,28,...,10,14,20,15,13.4,6.0,1.005,1.29,4.5,5.8
2,ME02909,1,1476,16,57.0,119,3.8,0.36,94,31,...,18,17,35,21,0.2,7.5,1.012,1.9,4.1,5.8
3,ME02998,1,339,18,63.8,135,4.89,0.42,86,28,...,13,16,25,13,16.8,5.0,1.017,1.32,4.0,6.0
4,ME03061,1,287,20,95.5,146,5.18,0.44,85,28,...,18,22,25,24,1.4,7.5,1.006,2.94,4.6,6.1


In [385]:
data2.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
414,MV00454,0,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
415,MV00456,0,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
416,MV00460,0,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
417,MV00502,0,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
418,MV00510,0,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [386]:
y = data2['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data2[features]

X = X_old
X = StandardScaler().fit_transform(X_old)
X = MinMaxScaler().fit_transform(X_old)

In [387]:
Counter(y)

Counter({1: 76, 0: 343})

In [388]:
# Undersample to balance the dataset

# Set Sampling Strategy
sampling_strategy = {0: 76, 1: 76}
undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# Undersample
X, y = undersample.fit_resample(X, y)

In [389]:
Counter(y)

Counter({0: 76, 1: 76})

In [390]:
# HOLDOUT METHOD:

In [391]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

Logistic Regression: 0.607
Linear Discriminant Analysis: 0.59
K-Nearest Neigbors: 0.492
Classification and Regression Trees: 0.59
Gaussian Naive Bayes: 0.705
Support Vector Machines: 0.557
Random Forest Classifier: 0.639


In [392]:
# CROSS VALIDATION

In [393]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=5)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=5)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Logistic Regression: 0.72 accuracy with a standard deviation of 0.08
Linear Discriminant Analysis: 0.69 accuracy with a standard deviation of 0.04
K-Nearest Neighbors: 0.65 accuracy with a standard deviation of 0.05
Classification and Regression Trees: 0.60 accuracy with a standard deviation of 0.05
Gaussian Naive Bayes: 0.74 accuracy with a standard deviation of 0.07
Support Vector Machines: 0.73 accuracy with a standard deviation of 0.08
Random Forest Classifier: 0.68 accuracy with a standard deviation of 0.10


In [394]:
# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.79      0.69      0.73        16
           1       0.71      0.80      0.75        15

    accuracy                           0.74        31
   macro avg       0.75      0.74      0.74        31
weighted avg       0.75      0.74      0.74        31

              precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       1.00      0.69      0.81        16

    accuracy                           0.84        31
   macro avg       0.88      0.84      0.84        31
weighted avg       0.88      0.84      0.84        31

              precision    recall  f1-score   support

           0       0.62      0.53      0.57        15
           1       0.59      0.67      0.62        15

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.60      0.60      0.60        30

              preci

In [395]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.71      0.62      0.67        16
           1       0.65      0.73      0.69        15

    accuracy                           0.68        31
   macro avg       0.68      0.68      0.68        31
weighted avg       0.68      0.68      0.68        31

              precision    recall  f1-score   support

           0       0.63      0.80      0.71        15
           1       0.75      0.56      0.64        16

    accuracy                           0.68        31
   macro avg       0.69      0.68      0.67        31
weighted avg       0.69      0.68      0.67        31

              precision    recall  f1-score   support

           0       0.65      0.73      0.69        15
           1       0.69      0.60      0.64        15

    accuracy                           0.67        30
   macro avg       0.67      0.67      0.67        30
weighted avg       0.67      0.67      0.67        30

              preci

In [396]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.61      0.88      0.72        16
           1       0.75      0.40      0.52        15

    accuracy                           0.65        31
   macro avg       0.68      0.64      0.62        31
weighted avg       0.68      0.65      0.62        31

              precision    recall  f1-score   support

           0       0.57      0.87      0.68        15
           1       0.75      0.38      0.50        16

    accuracy                           0.61        31
   macro avg       0.66      0.62      0.59        31
weighted avg       0.66      0.61      0.59        31

              precision    recall  f1-score   support

           0       0.59      0.67      0.62        15
           1       0.62      0.53      0.57        15

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.60      0.60      0.60        30

              preci

In [397]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.63      0.75      0.69        16
           1       0.67      0.53      0.59        15

    accuracy                           0.65        31
   macro avg       0.65      0.64      0.64        31
weighted avg       0.65      0.65      0.64        31

              precision    recall  f1-score   support

           0       0.53      0.60      0.56        15
           1       0.57      0.50      0.53        16

    accuracy                           0.55        31
   macro avg       0.55      0.55      0.55        31
weighted avg       0.55      0.55      0.55        31

              precision    recall  f1-score   support

           0       0.62      0.67      0.65        15
           1       0.64      0.60      0.62        15

    accuracy                           0.63        30
   macro avg       0.63      0.63      0.63        30
weighted avg       0.63      0.63      0.63        30

              preci

In [398]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        16
           1       0.80      0.80      0.80        15

    accuracy                           0.81        31
   macro avg       0.81      0.81      0.81        31
weighted avg       0.81      0.81      0.81        31

              precision    recall  f1-score   support

           0       0.70      0.93      0.80        15
           1       0.91      0.62      0.74        16

    accuracy                           0.77        31
   macro avg       0.80      0.78      0.77        31
weighted avg       0.81      0.77      0.77        31

              precision    recall  f1-score   support

           0       0.65      0.73      0.69        15
           1       0.69      0.60      0.64        15

    accuracy                           0.67        30
   macro avg       0.67      0.67      0.67        30
weighted avg       0.67      0.67      0.67        30

              preci

In [399]:
# Support Vector Machines
svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        16
           1       0.73      0.73      0.73        15

    accuracy                           0.74        31
   macro avg       0.74      0.74      0.74        31
weighted avg       0.74      0.74      0.74        31

              precision    recall  f1-score   support

           0       0.75      1.00      0.86        15
           1       1.00      0.69      0.81        16

    accuracy                           0.84        31
   macro avg       0.88      0.84      0.84        31
weighted avg       0.88      0.84      0.84        31

              precision    recall  f1-score   support

           0       0.60      0.60      0.60        15
           1       0.60      0.60      0.60        15

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.60      0.60      0.60        30

              preci

In [400]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.68      0.81      0.74        16
           1       0.75      0.60      0.67        15

    accuracy                           0.71        31
   macro avg       0.72      0.71      0.70        31
weighted avg       0.72      0.71      0.71        31

              precision    recall  f1-score   support

           0       0.65      1.00      0.79        15
           1       1.00      0.50      0.67        16

    accuracy                           0.74        31
   macro avg       0.83      0.75      0.73        31
weighted avg       0.83      0.74      0.73        31

              precision    recall  f1-score   support

           0       0.60      0.60      0.60        15
           1       0.60      0.60      0.60        15

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.60      0.60      0.60        30

              preci

In [401]:
# Calculating Performance Metrics for Holdout

In [402]:
# Calculating for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Performance Metrics for Logistic Regression:

0.60656 

[[21  9]
 [15 16]] 

              precision    recall  f1-score   support

           0       0.58      0.70      0.64        30
           1       0.64      0.52      0.57        31

    accuracy                           0.61        61
   macro avg       0.61      0.61      0.60        61
weighted avg       0.61      0.61      0.60        61



In [403]:
# Calculating for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Performance Metrics for LDA:

0.59016 

[[21  9]
 [16 15]] 

              precision    recall  f1-score   support

           0       0.57      0.70      0.63        30
           1       0.62      0.48      0.55        31

    accuracy                           0.59        61
   macro avg       0.60      0.59      0.59        61
weighted avg       0.60      0.59      0.59        61



In [404]:
# Calculating for kNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Performance Metrics for KNN:

0.4918 

[[19 11]
 [20 11]] 

              precision    recall  f1-score   support

           0       0.49      0.63      0.55        30
           1       0.50      0.35      0.42        31

    accuracy                           0.49        61
   macro avg       0.49      0.49      0.48        61
weighted avg       0.49      0.49      0.48        61



In [405]:
# Calculating for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Performance Metrics for CART:

0.5082 

[[17 13]
 [17 14]] 

              precision    recall  f1-score   support

           0       0.50      0.57      0.53        30
           1       0.52      0.45      0.48        31

    accuracy                           0.51        61
   macro avg       0.51      0.51      0.51        61
weighted avg       0.51      0.51      0.51        61



In [406]:
# Calculating for GNB
gnb_model.fit(X_train, y_train)
gnb_pred = log_model.predict(X_test)

print('Performance Metrics for GNB:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Performance Metrics for GNB:

0.60656 

[[21  9]
 [15 16]] 

              precision    recall  f1-score   support

           0       0.58      0.70      0.64        30
           1       0.64      0.52      0.57        31

    accuracy                           0.61        61
   macro avg       0.61      0.61      0.60        61
weighted avg       0.61      0.61      0.60        61



In [407]:
# Calculating for Support Vector Machine
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Performance Metrics for SVM:

0.55738 

[[21  9]
 [18 13]] 

              precision    recall  f1-score   support

           0       0.54      0.70      0.61        30
           1       0.59      0.42      0.49        31

    accuracy                           0.56        61
   macro avg       0.56      0.56      0.55        61
weighted avg       0.57      0.56      0.55        61



In [409]:
# Calculating for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Performance Metrics for RFC:

0.65574 

[[21  9]
 [12 19]] 

              precision    recall  f1-score   support

           0       0.64      0.70      0.67        30
           1       0.68      0.61      0.64        31

    accuracy                           0.66        61
   macro avg       0.66      0.66      0.66        61
weighted avg       0.66      0.66      0.66        61



In [410]:
# Pre-parse the dataset
data3 = preprocess("rawfile_blood.csv")

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
frail             9
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1123
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47

####################################################################
Number of Rows before Parsing NULLs in data:
1123
Number of Rows after Parsing NULLs in data:
1015


In [411]:
data3.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,frail,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,frail,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,frail,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,frail,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [412]:
data3.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [413]:
c = data3['condition'].value_counts()
condition = c.index
c

robust          343
prefrail_mci    233
prefrail        223
mci             133
frail_mci        76
frail             7
Name: condition, dtype: int64

In [414]:
for i in range(len(condition)):
    data3['condition'].replace(condition[i], i, inplace = True)

data3.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,5,196,24,46.5,121,3.93,0.37,95,31,...,7,12,13,6,0.2,6.0,1.011,1.14,4.1,5.9
1,ME03109,5,200,23,55.6,142,4.82,0.42,87,30,...,7,20,17,26,3.1,5.0,1.011,3.25,4.6,8.5
2,ME06997,5,441,20,76.8,105,4.54,0.41,90,30,...,5,16,19,15,1.4,7.0,1.023,2.14,4.0,6.4
3,ME07149,5,265,16,47.2,122,4.53,0.39,86,27,...,8,24,19,21,2.1,5.5,1.012,1.06,4.7,6.1
4,ME07700,5,425,14,31.3,124,4.44,0.38,85,28,...,6,20,23,23,6.0,5.5,1.013,1.95,3.8,5.8


In [415]:
data3.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
1010,MV00454,0,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
1011,MV00456,0,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
1012,MV00460,0,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
1013,MV00502,0,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
1014,MV00510,0,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [416]:
data3.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [417]:
y = data3['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data3[features]

In [418]:
X = X_old
X = StandardScaler().fit_transform(X_old)
X = MinMaxScaler().fit_transform(X_old)

In [419]:
# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 343, 1: 233, 2: 223, 3: 133, 4: 76, 5: 7})


In [420]:
# Oversample using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

counter = Counter(y)
print(counter)

Counter({5: 343, 4: 343, 3: 343, 1: 343, 2: 343, 0: 343})


In [421]:
y.shape

(2058,)

In [422]:
X.shape

(2058, 45)

In [423]:
# HOLDOUT METHOD

In [424]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", log_model.score(X_test, y_test).round(3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.404
Linear Discriminant Analysis: 0.421
K-Nearest Neigbors: 0.568
Classification and Regression Trees: 0.523
Gaussian Naive Bayes: 0.449
Support Vector Machines: 0.413
Random Forest Classifier: 0.728


In [425]:
# 5-fold Cross Validation

In [426]:
# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X, y)
scores = cross_val_score(log_model, X, y, cv=5)
print("Logistic Regression: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=5)
print("Linear Discriminant Analysis: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Classification and Regression Trees

cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5)
print("Classification and Regression Trees: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5)
print("Gaussian Naive Bayes: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Support Vector Machines

svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5)
print("Support Vector Machines: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Logistic Regression: 0.43 accuracy with a standard deviation of 0.01
Linear Discriminant Analysis: 0.45 accuracy with a standard deviation of 0.03
K-Nearest Neighbors: 0.63 accuracy with a standard deviation of 0.07
Classification and Regression Trees: 0.57 accuracy with a standard deviation of 0.07
Gaussian Naive Bayes: 0.44 accuracy with a standard deviation of 0.04
Support Vector Machines: 0.44 accuracy with a standard deviation of 0.01
Random Forest Classifier: 0.76 accuracy with a standard deviation of 0.08


In [427]:
# Logistic Regression
scores = cross_val_score(log_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.39      0.38      0.39        69
           1       0.38      0.22      0.28        69
           2       0.30      0.28      0.29        68
           3       0.35      0.35      0.35        68
           4       0.32      0.32      0.32        69
           5       0.65      1.00      0.79        69

    accuracy                           0.42       412
   macro avg       0.40      0.42      0.40       412
weighted avg       0.40      0.42      0.40       412

              precision    recall  f1-score   support

           0       0.39      0.39      0.39        69
           1       0.24      0.17      0.20        69
           2       0.29      0.16      0.21        68
           3       0.29      0.35      0.31        69
           4       0.38      0.37      0.38        68
           5       0.65      1.00      0.79        69

    accuracy                           0.41       412
   macro avg       0.37

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.31      0.32      0.31        69
           1       0.29      0.22      0.25        68
           2       0.31      0.23      0.27        69
           3       0.42      0.42      0.42        69
           4       0.47      0.43      0.45        68
           5       0.64      1.00      0.78        69

    accuracy                           0.44       412
   macro avg       0.41      0.44      0.41       412
weighted avg       0.41      0.44      0.41       412

              precision    recall  f1-score   support

           0       0.31      0.38      0.34        68
           1       0.39      0.28      0.32        68
           2       0.45      0.30      0.36        69
           3       0.32      0.32      0.32        69
           4       0.47      0.39      0.43        69
           5       0.65      1.00      0.79        68

    accuracy                           0.45       411
   macro avg       0.43

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [428]:
# Linear Discriminant Analysis
scores = cross_val_score(lda_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.38      0.41      0.39        69
           1       0.29      0.22      0.25        69
           2       0.29      0.26      0.27        68
           3       0.34      0.25      0.29        68
           4       0.35      0.36      0.36        69
           5       0.64      0.96      0.77        69

    accuracy                           0.41       412
   macro avg       0.38      0.41      0.39       412
weighted avg       0.38      0.41      0.39       412

              precision    recall  f1-score   support

           0       0.36      0.36      0.36        69
           1       0.22      0.19      0.20        69
           2       0.24      0.16      0.19        68
           3       0.34      0.36      0.35        69
           4       0.45      0.43      0.44        68
           5       0.68      1.00      0.81        69

    accuracy                           0.42       412
   macro avg       0.38

In [429]:
# K-Nearest Neigbors
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.30      0.16      0.21        69
           1       0.33      0.20      0.25        69
           2       0.39      0.35      0.37        68
           3       0.41      0.56      0.47        68
           4       0.61      0.87      0.72        69
           5       0.86      1.00      0.93        69

    accuracy                           0.52       412
   macro avg       0.48      0.52      0.49       412
weighted avg       0.48      0.52      0.49       412

              precision    recall  f1-score   support

           0       0.37      0.20      0.26        69
           1       0.36      0.32      0.34        69
           2       0.41      0.35      0.38        68
           3       0.55      0.67      0.61        69
           4       0.77      0.96      0.86        68
           5       0.78      1.00      0.88        69

    accuracy                           0.58       412
   macro avg       0.54

In [430]:
# Classification and Regression Trees
cart_model = DecisionTreeClassifier()
cart_model.fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.21      0.20      0.20        69
           1       0.37      0.33      0.35        69
           2       0.32      0.34      0.33        68
           3       0.45      0.41      0.43        68
           4       0.58      0.65      0.61        69
           5       0.97      1.00      0.99        69

    accuracy                           0.49       412
   macro avg       0.48      0.49      0.49       412
weighted avg       0.48      0.49      0.49       412

              precision    recall  f1-score   support

           0       0.30      0.25      0.27        69
           1       0.38      0.36      0.37        69
           2       0.35      0.34      0.34        68
           3       0.48      0.48      0.48        69
           4       0.53      0.65      0.58        68
           5       0.94      0.99      0.96        69

    accuracy                           0.51       412
   macro avg       0.50

In [431]:
# Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.33      0.26      0.29        69
           1       0.23      0.20      0.22        69
           2       0.20      0.16      0.18        68
           3       0.26      0.47      0.33        68
           4       0.47      0.30      0.37        69
           5       0.96      1.00      0.98        69

    accuracy                           0.40       412
   macro avg       0.41      0.40      0.39       412
weighted avg       0.41      0.40      0.40       412

              precision    recall  f1-score   support

           0       0.29      0.28      0.28        69
           1       0.36      0.20      0.26        69
           2       0.40      0.12      0.18        68
           3       0.21      0.36      0.27        69
           4       0.38      0.51      0.44        68
           5       0.90      1.00      0.95        69

    accuracy                           0.41       412
   macro avg       0.42

In [432]:
# Support Vector Machines
svm_model = SVC(kernel='linear', gamma = 'auto')
svm_model.fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.46      0.49      0.48        69
           1       0.32      0.23      0.27        69
           2       0.23      0.19      0.21        68
           3       0.38      0.38      0.38        68
           4       0.29      0.30      0.30        69
           5       0.75      1.00      0.86        69

    accuracy                           0.43       412
   macro avg       0.41      0.43      0.42       412
weighted avg       0.41      0.43      0.42       412

              precision    recall  f1-score   support

           0       0.38      0.41      0.39        69
           1       0.26      0.22      0.24        69
           2       0.39      0.22      0.28        68
           3       0.28      0.35      0.31        69
           4       0.43      0.38      0.41        68
           5       0.72      1.00      0.84        69

    accuracy                           0.43       412
   macro avg       0.41

In [433]:
# Random Forest Classifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5, scoring=make_scorer(classification_report_with_accuracy_score))
print(scores)

              precision    recall  f1-score   support

           0       0.42      0.57      0.48        69
           1       0.63      0.46      0.53        69
           2       0.44      0.46      0.45        68
           3       0.77      0.59      0.67        68
           4       0.81      0.91      0.86        69
           5       1.00      1.00      1.00        69

    accuracy                           0.67       412
   macro avg       0.68      0.66      0.67       412
weighted avg       0.68      0.67      0.67       412

              precision    recall  f1-score   support

           0       0.45      0.55      0.49        69
           1       0.49      0.54      0.51        69
           2       0.70      0.49      0.57        68
           3       0.71      0.65      0.68        69
           4       0.94      0.97      0.96        68
           5       0.97      1.00      0.99        69

    accuracy                           0.70       412
   macro avg       0.71

In [434]:
# Calculating Performance Metrics for Holdout

In [435]:
# Calculating for Logistic Regression
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
print(accuracy_score(y_test, log_pred).round(5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

Performance Metrics for Logistic Regression:

0.40413 

[[ 46   8  29  29  15  11]
 [ 18  24  21  26  34  18]
 [ 35  15  32  24  17  11]
 [ 35  21  13  44  16  10]
 [ 21  19  19   7  51  19]
 [  0   0   0   0   0 136]] 

              precision    recall  f1-score   support

           0       0.30      0.33      0.31       138
           1       0.28      0.17      0.21       141
           2       0.28      0.24      0.26       134
           3       0.34      0.32      0.33       139
           4       0.38      0.38      0.38       136
           5       0.66      1.00      0.80       136

    accuracy                           0.40       824
   macro avg       0.37      0.41      0.38       824
weighted avg       0.37      0.40      0.38       824



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [436]:
# Calculating for LDA
lda_model.fit(X_train, y_train)
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
print(accuracy_score(y_test, lda_pred).round(5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

Performance Metrics for LDA:

0.42112 

[[ 48  11  33  25  15   6]
 [ 20  33  25  22  26  15]
 [ 32  17  40  19  19   7]
 [ 35  27  19  44   9   5]
 [  9  27  27   8  51  14]
 [  0   0   5   0   0 131]] 

              precision    recall  f1-score   support

           0       0.33      0.35      0.34       138
           1       0.29      0.23      0.26       141
           2       0.27      0.30      0.28       134
           3       0.37      0.32      0.34       139
           4       0.42      0.38      0.40       136
           5       0.74      0.96      0.83       136

    accuracy                           0.42       824
   macro avg       0.40      0.42      0.41       824
weighted avg       0.40      0.42      0.41       824



In [437]:
# Calculating for kNN
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

Performance Metrics for KNN:

0.56796 

[[ 28  25  29  27  21   8]
 [ 18  42  15  29  22  15]
 [ 16  20  57  19  13   9]
 [ 19  16   8  86   9   1]
 [  1   4   5   7 119   0]
 [  0   0   0   0   0 136]] 

              precision    recall  f1-score   support

           0       0.34      0.20      0.25       138
           1       0.39      0.30      0.34       141
           2       0.50      0.43      0.46       134
           3       0.51      0.62      0.56       139
           4       0.65      0.88      0.74       136
           5       0.80      1.00      0.89       136

    accuracy                           0.57       824
   macro avg       0.53      0.57      0.54       824
weighted avg       0.53      0.57      0.54       824



In [438]:
# Calculating for CART
cart_model.fit(X_train, y_train)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
print(accuracy_score(y_test, cart_pred).round(5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

Performance Metrics for CART:

0.52549 

[[ 45  24  29  26  13   1]
 [ 20  54  21  21  22   3]
 [ 25  29  43  16  21   0]
 [ 18  16  18  79   7   1]
 [ 14  25  14   4  77   2]
 [  0   0   0   1   0 135]] 

              precision    recall  f1-score   support

           0       0.37      0.33      0.35       138
           1       0.36      0.38      0.37       141
           2       0.34      0.32      0.33       134
           3       0.54      0.57      0.55       139
           4       0.55      0.57      0.56       136
           5       0.95      0.99      0.97       136

    accuracy                           0.53       824
   macro avg       0.52      0.53      0.52       824
weighted avg       0.52      0.53      0.52       824



In [439]:
# Calculating for GNB
gnb_model.fit(X_train, y_train)
gnb_pred = log_model.predict(X_test)

print('Performance Metrics for GNB:\n')
print(accuracy_score(y_test, gnb_pred).round(5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

Performance Metrics for GNB:

0.40413 

[[ 46   8  29  29  15  11]
 [ 18  24  21  26  34  18]
 [ 35  15  32  24  17  11]
 [ 35  21  13  44  16  10]
 [ 21  19  19   7  51  19]
 [  0   0   0   0   0 136]] 

              precision    recall  f1-score   support

           0       0.30      0.33      0.31       138
           1       0.28      0.17      0.21       141
           2       0.28      0.24      0.26       134
           3       0.34      0.32      0.33       139
           4       0.38      0.38      0.38       136
           5       0.66      1.00      0.80       136

    accuracy                           0.40       824
   macro avg       0.37      0.41      0.38       824
weighted avg       0.37      0.40      0.38       824



In [440]:
# Calculating for Support Vector Machine
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
print(accuracy_score(y_test, svm_pred).round(5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

Performance Metrics for SVM:

0.41262 

[[ 52  10  28  25  15   8]
 [ 19  21  23  25  39  14]
 [ 37  13  37  21  16  10]
 [ 37  25  12  43  16   6]
 [ 26  25  13   8  51  13]
 [  0   0   0   0   0 136]] 

              precision    recall  f1-score   support

           0       0.30      0.38      0.34       138
           1       0.22      0.15      0.18       141
           2       0.33      0.28      0.30       134
           3       0.35      0.31      0.33       139
           4       0.37      0.38      0.37       136
           5       0.73      1.00      0.84       136

    accuracy                           0.41       824
   macro avg       0.38      0.41      0.39       824
weighted avg       0.38      0.41      0.39       824



In [441]:
# Calculating for Random Forest Classifier
rfc_model.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Performance Metrics for RFC:

0.70024 

[[ 69  21  30  16   2   0]
 [ 29  67  20  14  11   0]
 [ 30  12  75  13   3   1]
 [ 14   4   7 112   1   1]
 [  9   5   3   1 118   0]
 [  0   0   0   0   0 136]] 

              precision    recall  f1-score   support

           0       0.46      0.50      0.48       138
           1       0.61      0.48      0.54       141
           2       0.56      0.56      0.56       134
           3       0.72      0.81      0.76       139
           4       0.87      0.87      0.87       136
           5       0.99      1.00      0.99       136

    accuracy                           0.70       824
   macro avg       0.70      0.70      0.70       824
weighted avg       0.70      0.70      0.70       824

