In [61]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
from parse import preprocess

In [62]:
# Function to show classification report for Cross Validation
def classification_report_with_accuracy_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred) # return accuracy score

In [63]:
# Pre-parse the dataset
data = preprocess("rawfile_blood.csv")

robust          368
prefrail_mci    268
prefrail        250
mci             142
frail_mci        86
Name: condition, dtype: int64

####################################################################
Number of Rows of Dataframe:
1114
Number of Columns of Dataframe:
59

####################################################################
Threshold for number of NULLs in a column: 0.1095
Number of Columns before Parsing for Too Many NULLs in a column:
59
Number of Columns after Parsing for Too Many NULLs in a column:
51

Columns Removed:
B1_b5
B4_a1
B4_a3
B4_a4
B4_a6
B4_b1
B4_b3
B5_a1

####################################################################
Number of Rows before Parsing NULLs in data:
1114
Number of Rows after Parsing NULLs in data:
1000

####################################################################
Number of Columns after dropping A1_2, B1_b4, B2_c3, B4_b2 for inconsistent data types:
47


In [64]:
# # Initialise counters for each condition
# frail = 0
# frail_mci = 0
# mci = 0
# prefrail_mci = 0
# prefrail = 0
# robust = 0

# # Count rows of data for each condition
# for i in range(0, len(data)):
# 	if data.at[i, 'condition'] == 'frail':
# 		frail += 1
# 	if data.at[i, 'condition'] == 'frail_mci':
# 		frail_mci += 1
# 	elif data.at[i, 'condition'] == 'mci':
# 		mci += 1
# 	elif data.at[i, 'condition'] == 'prefrail_mci':
# 		prefrail_mci += 1
# 	elif data.at[i, 'condition'] == 'prefrail':
# 		prefrail += 1
# 	elif data.at[i, 'condition'] == 'robust':
# 		robust += 1
        
# # Display number of rows (frequency) for each condition (label)
# print("\n####################################################################")
# print("Labels with frequencies:")
# # print("Frail:", frail)
# print("Frail + MCI:", frail_mci)
# print("MCI:", mci)
# print("Prefrail + MCI:", prefrail_mci)
# print("Prefrail:", prefrail)
# print("Robust:", robust)

In [65]:
# indexNames = data[data.condition == 'mci'].index
# data.drop(indexNames, inplace=True)

# indexNames = data[data.condition == 'prefrail_mci'].index
# data.drop(indexNames, inplace=True)

# indexNames = data[data.condition == 'prefrail'].index
# data.drop(indexNames, inplace=True)
    
# print()
# print(data['condition'].value_counts())

In [66]:
data.head()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME01378,frail_mci,241,20,33.5,150,5.25,0.46,87,29,...,10,21,22,17,1.3,7.0,1.01,0.69,4.7,5.9
1,ME02832,frail_mci,444,16,87.0,134,4.65,0.4,85,28,...,10,14,20,15,13.4,6.0,1.005,1.29,4.5,5.8
2,ME02909,frail_mci,1476,16,57.0,119,3.8,0.36,94,31,...,18,17,35,21,0.2,7.5,1.012,1.9,4.1,5.8
3,ME02998,frail_mci,339,18,63.8,135,4.89,0.42,86,28,...,13,16,25,13,16.8,5.0,1.017,1.32,4.0,6.0
4,ME03061,frail_mci,287,20,95.5,146,5.18,0.44,85,28,...,18,22,25,24,1.4,7.5,1.006,2.94,4.6,6.1


In [67]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [68]:
c = data['condition'].value_counts()
condition = c.index
c

robust          339
prefrail_mci    231
prefrail        221
mci             133
frail_mci        76
Name: condition, dtype: int64

In [69]:
for i in range(len(condition)):
    data['condition'].replace(condition[i], i, inplace = True)

data.head(3)

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
0,ME01378,4,241,20,33.5,150,5.25,0.46,87,29,...,10,21,22,17,1.3,7.0,1.01,0.69,4.7,5.9
1,ME02832,4,444,16,87.0,134,4.65,0.4,85,28,...,10,14,20,15,13.4,6.0,1.005,1.29,4.5,5.8
2,ME02909,4,1476,16,57.0,119,3.8,0.36,94,31,...,18,17,35,21,0.2,7.5,1.012,1.9,4.1,5.8


In [70]:
data.tail()

Unnamed: 0,mtag,condition,A1_1,A2_1,A3_1,B1_a,B1_a1,B1_a2,B1_a3,B1_a4,...,B2_d6,B2_d7,B2_d8,B2_d9,B3,B4_a2,B4_a5,B5_a2,B5_a3,B6
995,MV00454,0,220,19,67.5,138,4.66,0.42,91,30,...,20,10,17,8,6.6,7.0,1.015,1.29,4.5,6.2
996,MV00456,0,334,18,51.0,139,4.63,0.42,91,30,...,16,22,35,40,1.0,6.0,1.015,1.88,3.9,5.6
997,MV00460,0,418,17,61.0,122,4.18,0.38,90,29,...,19,20,23,15,0.4,6.5,1.005,3.58,4.0,5.6
998,MV00502,0,393,18,43.1,136,4.57,0.43,94,30,...,13,11,22,23,0.7,7.0,1.009,0.92,4.1,6.0
999,MV00510,0,371,24,55.9,127,4.41,0.4,90,29,...,13,14,16,12,7.5,8.0,1.017,2.45,4.5,6.2


In [71]:
data.columns

Index(['mtag', 'condition', 'A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6'],
      dtype='object')

In [72]:
y = data['condition']

features = ['A1_1', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
X_old = data[features]

In [73]:
X = X_old
# X = StandardScaler().fit_transform(X_old)
# X = MinMaxScaler().fit_transform(X_old)

In [74]:
# Summarise the new class distribution
counter = Counter(y)
print(counter)

Counter({0: 339, 1: 231, 2: 221, 3: 133, 4: 76})


In [75]:
# Transform the dataset using SMOTE
# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)

In [76]:
# Summarise the new class distribution
# counter = Counter(y)
# print(counter)

In [77]:
y.shape

(1000,)

In [78]:
X.shape

(1000, 45)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [80]:
# y_train.value_counts()

In [81]:
# oversample = SMOTE(k_neighbors=2)
# X_train, y_train = oversample.fit_resample(X_train, y_train)

In [82]:
# y_train.value_counts()

In [83]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))

Random Forest Classifier: 0.35


In [84]:
# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)

In [85]:
X.shape

(1000, 45)

In [86]:
y.shape

(1000,)

In [87]:
y.value_counts()

0    339
1    231
2    221
3    133
4     76
Name: condition, dtype: int64

In [88]:
scores = cross_val_score(rfc_model, X, y, cv=10)
# scores = cross_val_score(rfc_model, X, y, cv=7)
print("Random Forest Classifier: %0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Random Forest Classifier: 0.37 accuracy with a standard deviation of 0.03


In [89]:
# Random Forest Classifier
scoresExtra = cross_val_score(rfc_model, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
# scoresExtra = cross_val_score(rfc_model, X, y, cv=7, scoring=make_scorer(classification_report_with_accuracy_score))
print(scoresExtra)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.38      0.91      0.53        33
           1       0.55      0.26      0.35        23
           2       0.22      0.09      0.12        23
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.38       100
   macro avg       0.23      0.25      0.20       100
weighted avg       0.30      0.38      0.29       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      0.82      0.60        34
           1       0.32      0.39      0.35        23
           2       0.23      0.14      0.17        22
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.40       100
   macro avg       0.21      0.27      0.23       100
weighted avg       0.29      0.40      0.32       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.48      0.85      0.62        34
           1       0.32      0.43      0.37        23
           2       0.11      0.05      0.06        22
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.40       100
   macro avg       0.18      0.27      0.21       100
weighted avg       0.26      0.40      0.31       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.40      0.71      0.51        34
           1       0.26      0.30      0.28        23
           2       0.23      0.14      0.17        22
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.34       100
   macro avg       0.18      0.23      0.19       100
weighted avg       0.25      0.34      0.28       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.45      0.74      0.56        34
           1       0.28      0.35      0.31        23
           2       0.13      0.09      0.11        22
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.35       100
   macro avg       0.17      0.23      0.19       100
weighted avg       0.24      0.35      0.28       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.38      0.68      0.49        34
           1       0.32      0.43      0.37        23
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00         8

    accuracy                           0.33       100
   macro avg       0.14      0.22      0.17       100
weighted avg       0.20      0.33      0.25       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.47      0.79      0.59        34
           1       0.36      0.52      0.43        23
           2       0.29      0.09      0.14        22
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00         7

    accuracy                           0.41       100
   macro avg       0.22      0.28      0.23       100
weighted avg       0.30      0.41      0.33       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.42      0.79      0.55        34
           1       0.33      0.35      0.34        23
           2       0.30      0.14      0.19        22
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00         7

    accuracy                           0.38       100
   macro avg       0.21      0.26      0.22       100
weighted avg       0.29      0.38      0.31       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.38      0.71      0.49        34
           1       0.36      0.35      0.36        23
           2       0.23      0.14      0.17        22
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00         7

    accuracy                           0.35       100
   macro avg       0.19      0.24      0.20       100
weighted avg       0.26      0.35      0.29       100

              precision    recall  f1-score   support

           0       0.40      0.74      0.52        34
           1       0.26      0.25      0.26        24
           2       0.46      0.27      0.34        22
           3       1.00      0.08      0.14        13
           4       0.00      0.00      0.00         7

    accuracy                           0.38       100
   macro avg       0.42      0.27      0.25       100
weighted avg       0.43      0.38      0.33       100

[0.38 0.4  0.4  0.34 0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
# Calculating accuracy metrics for RFC
rfc_pred = rfc_model.predict(X_test)

print('Accuracy Metrics for RFC:\n')
print(accuracy_score(y_test, rfc_pred).round(5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

Accuracy Metrics for RFC:

0.35 

[[48 10  7  0  0]
 [13 15  6  0  0]
 [38 10  7  0  0]
 [24  3  4  0  0]
 [ 3  9  3  0  0]] 

              precision    recall  f1-score   support

           0       0.38      0.74      0.50        65
           1       0.32      0.44      0.37        34
           2       0.26      0.13      0.17        55
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00        15

    accuracy                           0.35       200
   macro avg       0.19      0.26      0.21       200
weighted avg       0.25      0.35      0.27       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
