In [113]:
# Import packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from parse import preprocess

In [114]:
# Function to show classification report for Cross Validation
def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [115]:
data = pd.read_csv("rawfile_blood_parsed_encoded.csv")

data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME02646,frail,2,0,0,0,0,0,1,1,...,1,1,1,1,3,,,1,1,3
1,ME03109,frail,2,0,0,1,1,1,1,1,...,1,1,1,1,3,,,1,1,0
2,ME06997,frail,2,1,1,0,1,1,1,1,...,1,1,1,1,3,,,1,1,2
3,ME07149,frail,2,1,0,0,1,0,1,1,...,1,1,1,1,3,,,1,1,3
4,ME07700,frail,2,1,0,0,0,0,1,1,...,1,1,1,1,3,,,1,1,3


In [116]:
# Drop columns with no ranges set in Data Dictionary

data = data.drop(['B4_a2', 'B4_a5'], axis=1)

data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d4,B2_d5,B2_d6,B2_d7,B2_d8,B2_d9,B3,B5_a2,B5_a3,B6
0,ME02646,frail,2,0,0,0,0,0,1,1,...,1,1,1,1,1,1,3,1,1,3
1,ME03109,frail,2,0,0,1,1,1,1,1,...,1,1,1,1,1,1,3,1,1,0
2,ME06997,frail,2,1,1,0,1,1,1,1,...,1,1,1,1,1,1,3,1,1,2
3,ME07149,frail,2,1,0,0,1,0,1,1,...,1,1,1,1,1,1,3,1,1,3
4,ME07700,frail,2,1,0,0,0,0,1,1,...,1,1,1,1,1,1,3,1,1,3


In [117]:
c = data['condition'].value_counts()
condition = c.index
c

robust          343
prefrail_mci    233
prefrail        223
mci             133
frail_mci        76
frail             7
Name: condition, dtype: int64

In [118]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [119]:
y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B5_a2', 'B5_a3', 'B6']

X = data[features]

# X = MinMaxScaler().fit_transform(X)
# X = StandardScaler().fit_transform(X)

In [120]:
# Undersample the majority class
# Define undersample strategy

# sampling_strategy = {'robust': 76, 'prefrail_mci': 76, 'prefrail': 76, 'mci': 76, 'frail_mci': 76}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# X, y = undersample.fit_resample(X, y)

# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({'robust': 343, 'prefrail_mci': 233, 'prefrail': 223, 'mci': 133, 'frail_mci': 76, 'frail': 7})


In [121]:
# Transform the dataset using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({'frail': 343, 'frail_mci': 343, 'mci': 343, 'prefrail_mci': 343, 'prefrail': 343, 'robust': 343})


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))

# Random Forest Classifier

rfc_model = RandomForestClassifier(random_state=1)
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

K-Nearest Neigbors: 0.539
Random Forest Classifier: 0.632


In [123]:
# Calculating for KNN
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(accuracy_score(y_test, knn_pred).round(5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

# Calculating for RFC
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Performance Metrics for KNN:

0.53883 

[[136   0   0   0   0   0]
 [  3 108   8   6   4   3]
 [  4  15  84  18  11  12]
 [  8  15  31  43  22  17]
 [  9  22  19  32  39  17]
 [  1  23  24  38  18  34]] 

              precision    recall  f1-score   support

       frail       0.84      1.00      0.92       136
   frail_mci       0.59      0.82      0.69       132
         mci       0.51      0.58      0.54       144
    prefrail       0.31      0.32      0.32       136
prefrail_mci       0.41      0.28      0.34       138
      robust       0.41      0.25      0.31       138

    accuracy                           0.54       824
   macro avg       0.51      0.54      0.52       824
weighted avg       0.51      0.54      0.52       824

Performance Metrics for RFC:

0.63228 

[[134   0   1   0   0   1]
 [  0 112   5   6   5   4]
 [  2   8  94   8  15  17]
 [  1  11  20  58  23  23]
 [  5   9  10  24  64  26]
 [  0   9  17  35  18  59]] 

              precision    recall  f1-score   s

In [124]:
# K-Nearest Neigbors

knn_model = KNeighborsClassifier()
knn_model.fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Random Forest Classifier

rfc_model = RandomForestClassifier()
rfc_model.fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

K-Nearest Neighbors: 0.62 accuracy with a standard deviation of 0.06
Random Forest Classifier: 0.71 accuracy with a standard deviation of 0.07
