In [None]:
# Import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from collections import Counter
from parser_final import RobustFrailMCIpreprocess

In [None]:
# Scoring helper for cross-validation: shows the per-class report for a fold
def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report for one fold and return its accuracy.

    Wrapped with ``make_scorer`` so that ``cross_val_score`` prints a report
    for each fold while still collecting a numeric score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels of the fold.
    y_pred : array-like
        Labels predicted for the fold.

    Returns
    -------
    The value of ``accuracy_score(y_true, y_pred)``.
    """
    fold_report = classification_report(y_true, y_pred)
    print(fold_report)
    fold_accuracy = accuracy_score(y_true, y_pred)
    return fold_accuracy

In [None]:
# Human-readable labels for the coded feature columns

# Maps each coded column name of the dataset to a display label (test name and,
# where given, its unit). The notebook uses it to relabel the index of the RFE
# ranking tables via `Index.map(featureName_mapping)`.
featureName_mapping = {
    "A1_1" : "Vitamin B12 (pmol/L)",
    "A1_2" : "Serum Folate (nmol/L)",
    "A2_1" : "Serum Homocysteine (µmol/L)",
    "A3_1" : "25-hydroxy Vitamin D (nmol/L)",
    "B1_a" : "Haemoglobin (g/L)",
    "B1_a1" : "RBC (/L)",
    "B1_a2" : "PCV (L/L)",
    "B1_a3" : "MCV (fL)",
    "B1_a4" : "MCH (pg)",
    "B1_a5" : "MCHC (g/L)",
    "B1_a6" : "RDW (%)",
    "B1_b" : "White Cell Count (/L)",
    "B1_b1" : "Neutrophils (/L)",
    "B1_b2" : "Lymphocytes (/L)",
    "B1_b3" : "Monocytes (/L)",
    "B1_b4" : "Eosinophils (/L)",
    "B1_b5" : "Basophils (/L)",
    "B1_c" : "Platelets (/L)",
    "B1_d" : "Glucose (mmol/L)",
    "B2_a1" : "Total Cholesterol (mmol/L)",
    "B2_a2" : "Triglyceride (mmol/L)",
    "B2_a3" : "HDL Cholesterol (mmol/L)",
    "B2_a4" : "LDL Cholesterol (mmol/L)",
    "B2_a5" : "Total Cholesterol/HDL Ratio",
    "B2_b1" : "Sodium (mmol/L)",
    "B2_b2" : "Potassium (mmol/L)",
    "B2_b3" : "Chloride (mmol/L)",
    "B2_c1" : 'Urea (mmol/L)',
    # NOTE(review): this label spells the unit with 'u' while other labels use 'µ'
    "B2_c2" : "Creatinine (umol/L)",
    "B2_c3" : "eGFR (mL/min/1.73m2)",
    "B2_c4" : "Uric Acid (mmol/L)",
    "B2_c5" : "Calcium (mmol/L)",
    "B2_c6" : "Corrected Calcium (mmol/L)",
    "B2_c7" : "Phosphate (mmol/L)",
    "B2_d1" : "Total Protein (g/L)",
    "B2_d2" : "Albumin (g/L)",
    "B2_d3" : "Globulin (g/L)",
    "B2_d4" : "Albumin/Globulin ratio",
    "B2_d5" : "Alkaline Phosphatase (U/L)",
    "B2_d6" : "Total Bilirubin (µmol/L)",
    "B2_d7" : "GGT",
    "B2_d8" : "AST",
    "B2_d9" : "ALT",
    "B3" : "C-Reactive Protein",
    "B4_a1" : "Protein",
    "B4_a2" : "pH",
    "B4_a3" : "Glucose",
    "B4_a4" : "Ketones",
    "B4_a5" : "S.G.",
    "B4_a6" : "Blood",
    "B4_b1" : "Leucocytes (/L)",
    "B4_b2" : "Erythrocytes (/L)",
    "B4_b3" : "Epithelial Cells",
    "B5_a1" : "Free Thyroxine (FT4) (pmol/L)",
    "B5_a2" : "Thyroid Stimulating Hormone (mIU/L)",
    "B5_a3" : "Free Tri-iodothyronine (FT3) (pmol/L)",
    "B6" : "HbA1c"
}

In [None]:
# Test 1: New Dataset, 6 Classes

In [None]:
# Pre-parse the dataset with the project parser imported from parser_final.
# NOTE(review): presumably returns a pandas DataFrame that includes a 'condition'
# column plus the coded feature columns — confirm against parser_final.
data = RobustFrailMCIpreprocess("rawfile_blood.csv")

In [None]:
# Number of rows per condition label (value_counts sorts by descending count)
c = data['condition'].value_counts()
# The distinct labels in that same order; their positions become the integer codes
condition = c.index
# Display the class counts
c

In [None]:
# Encode every condition label as its position in `condition`.
# The whole mapping is applied in a single pass and the result is assigned back
# to the column. Calling replace(..., inplace=True) on the data['condition']
# selection is chained in-place mutation: pandas warns about it, and with
# Copy-on-Write enabled it leaves `data` untouched. A single pass also cannot
# re-map a value that was produced by an earlier replacement.
label_codes = {label: code for code, label in enumerate(condition)}
# Values without an entry (e.g. missing values, or codes from a previous run) pass through unchanged
data['condition'] = data['condition'].map(lambda value: label_codes.get(value, value))

data.head()

In [None]:
# Inspect the last rows of the dataset
data.tail()

In [None]:
# Target vector: the condition column
y = data['condition']

# Coded predictor columns used by every model in this section
features = ['A1_1', 'A1_2', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
# Unscaled predictors; kept because later cells read X_old.columns
X_old = data[features]

# Min-max scale every feature to [0, 1]. This is the only scaling that ever
# reached the models: the former `X = X_old` and StandardScaler assignments were
# overwritten on the next line, so they have been removed as dead code.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set ranges influence the training features.
X = MinMaxScaler().fit_transform(X_old)

In [None]:
# Summarise the class distribution of the target: Counter maps each value in y
# to the number of rows carrying it
counter = Counter(y)
print(counter)

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# One estimator per algorithm, all with library defaults unless noted
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
# probability=True is required for SVC.predict_proba, which the ROC cells call on
# this model; without it that call fails
svm_model = SVC(kernel='linear', gamma = 'auto', probability=True)
rfc_model = RandomForestClassifier()

# Fit each model on the training split and report its hold-out accuracy
for label, estimator in [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]:
    estimator.fit(X_train, y_train)
    # builtin round() works whether score() returns a NumPy scalar or a plain float
    print(label, round(estimator.score(X_test, y_test), 3))

In [None]:
# Hold-out metrics for Logistic Regression
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, log_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

In [None]:
# Hold-out metrics for Linear Discriminant Analysis
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

In [None]:
# Hold-out metrics for K-Nearest Neighbors
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

In [None]:
# Hold-out metrics for the decision tree (CART)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

In [None]:
# Hold-out metrics for Gaussian Naive Bayes
# Predict with gnb_model: the previous code used log_model here, so the "GNB"
# metrics were a copy of the Logistic Regression results.
gnb_pred = gnb_model.predict(X_test)

print('Performance Metrics for GNB:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

In [None]:
# Hold-out metrics for the Support Vector Machine
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

In [None]:
# Hold-out metrics for the Random Forest Classifier
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

In [None]:
# Class scores of every fitted model on the hold-out set.
# predict_proba returns one column per class; column index 1 is kept as the
# "positive outcome" probability that the ROC cells consume.
# NOTE: SVC only provides predict_proba when it was built with probability=True.

# Logistic Regression
log_probs = log_model.predict_proba(X_test)[:, 1]

# Linear Discriminant Analysis
lda_probs = lda_model.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_probs = knn_model.predict_proba(X_test)[:, 1]

# Classification and Regression Trees
cart_probs = cart_model.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]

# Support Vector Machines
svm_probs = svm_model.predict_proba(X_test)[:, 1]

# Random Forest Classifier
rfc_probs = rfc_model.predict_proba(X_test)[:, 1]

In [None]:
# Logistic Regression: ROC curve of class code 1 against all other classes

# log_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
log_gmeans = np.sqrt(log_tpr * (1-log_fpr))
# index of the threshold with the largest G-mean
log_ix = np.argmax(log_gmeans)
print("Logistic Regression:")
print("AUC: ", auc(log_fpr, log_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (log_thresholds[log_ix], log_gmeans[log_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(log_fpr, log_tpr, marker='.', label='Logistic Regression')
plt.scatter(log_fpr[log_ix], log_tpr[log_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Linear Discriminant Analysis: ROC curve of class code 1 against all other classes

# lda_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
lda_gmeans = np.sqrt(lda_tpr * (1-lda_fpr))
# index of the threshold with the largest G-mean
lda_ix = np.argmax(lda_gmeans)
print("Linear Discriminant Analysis:")
print("AUC: ", auc(lda_fpr, lda_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (lda_thresholds[lda_ix], lda_gmeans[lda_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(lda_fpr, lda_tpr, marker='.', label='Linear Discriminant Analysis')
plt.scatter(lda_fpr[lda_ix], lda_tpr[lda_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# K-Nearest Neighbor: ROC curve of class code 1 against all other classes

# knn_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
knn_gmeans = np.sqrt(knn_tpr * (1-knn_fpr))
# index of the threshold with the largest G-mean
knn_ix = np.argmax(knn_gmeans)
print("K-Nearest Neighbor:")
print("AUC: ", auc(knn_fpr, knn_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (knn_thresholds[knn_ix], knn_gmeans[knn_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(knn_fpr, knn_tpr, marker='.', label='K-Nearest Neighbor')
plt.scatter(knn_fpr[knn_ix], knn_tpr[knn_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Classification and Regression Tree: ROC curve of class code 1 against all other classes

# cart_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
cart_gmeans = np.sqrt(cart_tpr * (1-cart_fpr))
# index of the threshold with the largest G-mean
cart_ix = np.argmax(cart_gmeans)
print("Classification and Regression Tree:")
print("AUC: ", auc(cart_fpr, cart_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (cart_thresholds[cart_ix], cart_gmeans[cart_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(cart_fpr, cart_tpr, marker='.', label='Classification and Regression Tree')
plt.scatter(cart_fpr[cart_ix], cart_tpr[cart_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes: ROC curve of class code 1 against all other classes

# gnb_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
gnb_gmeans = np.sqrt(gnb_tpr * (1-gnb_fpr))
# index of the threshold with the largest G-mean
gnb_ix = np.argmax(gnb_gmeans)
print("Gaussian Naive Bayes:")
print("AUC: ", auc(gnb_fpr, gnb_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (gnb_thresholds[gnb_ix], gnb_gmeans[gnb_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(gnb_fpr, gnb_tpr, marker='.', label='Gaussian Naive Bayes')
plt.scatter(gnb_fpr[gnb_ix], gnb_tpr[gnb_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Support Vector Machines: ROC curve of class code 1 against all other classes

# svm_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
svm_gmeans = np.sqrt(svm_tpr * (1-svm_fpr))
# index of the threshold with the largest G-mean
svm_ix = np.argmax(svm_gmeans)
print("Support Vector Machines:")
print("AUC: ", auc(svm_fpr, svm_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (svm_thresholds[svm_ix], svm_gmeans[svm_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(svm_fpr, svm_tpr, marker='.', label='Support Vector Machines')
plt.scatter(svm_fpr[svm_ix], svm_tpr[svm_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Random Forest Classifier: ROC curve of class code 1 against all other classes

# rfc_probs is column 1 of predict_proba, so class code 1 is the positive label.
# pos_label is passed explicitly because roc_curve rejects a multi-class y_test
# ("multiclass format is not supported") unless it is told which label is positive.
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs, pos_label=1)
# G-mean = sqrt(TPR * (1 - FPR)) for each candidate threshold
rfc_gmeans = np.sqrt(rfc_tpr * (1-rfc_fpr))
# index of the threshold with the largest G-mean
rfc_ix = np.argmax(rfc_gmeans)
print("Random Forest Classifier:")
print("AUC: ", auc(rfc_fpr, rfc_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (rfc_thresholds[rfc_ix], rfc_gmeans[rfc_ix]))
# chance diagonal, the model's ROC curve, and the best-G-mean operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(rfc_fpr, rfc_tpr, marker='.', label='Random Forest Classifier')
plt.scatter(rfc_fpr[rfc_ix], rfc_tpr[rfc_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy of each algorithm on the full scaled dataset.
# Every model is re-created and fitted on all of X (fit() returns the estimator
# itself); cross_val_score then reports the mean and spread of the fold scores.

# Logistic Regression
log_model = LogisticRegression().fit(X, y)
scores = cross_val_score(log_model, X, y, cv=5)
print("Logistic Regression: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# Linear Discriminant Analysis
lda_model = LinearDiscriminantAnalysis().fit(X, y)
scores = cross_val_score(lda_model, X, y, cv=5)
print("Linear Discriminant Analysis: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# K-Nearest Neighbors
knn_model = KNeighborsClassifier().fit(X, y)
scores = cross_val_score(knn_model, X, y, cv=5)
print("K-Nearest Neighbors: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# Classification and Regression Trees
cart_model = DecisionTreeClassifier().fit(X, y)
scores = cross_val_score(cart_model, X, y, cv=5)
print("Classification and Regression Trees: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# Gaussian Naive Bayes
gnb_model = GaussianNB().fit(X, y)
scores = cross_val_score(gnb_model, X, y, cv=5)
print("Gaussian Naive Bayes: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# Support Vector Machines
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
scores = cross_val_score(svm_model, X, y, cv=5)
print("Support Vector Machines: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

# Random Forest Classifier
rfc_model = RandomForestClassifier().fit(X, y)
scores = cross_val_score(rfc_model, X, y, cv=5)
print("Random Forest Classifier: %0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

In [None]:
# Logistic Regression: 5-fold CV whose scorer prints a classification report
# for each fold and yields that fold's accuracy
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Linear Discriminant Analysis: 5-fold CV whose scorer prints a classification
# report for each fold and yields that fold's accuracy
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# K-Nearest Neighbors: 5-fold CV whose scorer prints a classification report
# for each fold and yields that fold's accuracy
knn_model = KNeighborsClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Classification and Regression Trees: 5-fold CV whose scorer prints a
# classification report for each fold and yields that fold's accuracy
cart_model = DecisionTreeClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(cart_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Gaussian Naive Bayes: 5-fold CV whose scorer prints a classification report
# for each fold and yields that fold's accuracy
gnb_model = GaussianNB().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Support Vector Machines: 5-fold CV whose scorer prints a classification
# report for each fold and yields that fold's accuracy
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Random Forest Classifier: 5-fold CV whose scorer prints a classification
# report for each fold and yields that fold's accuracy
rfc_model = RandomForestClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(rfc_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Logistic Regression: feature ranking by Recursive Feature Elimination (RFE)

# Keep 10 attributes. n_features_to_select is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument and raise a
# TypeError for the positional form RFE(model, 10).
rfe = RFE(log_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Ranking table sorted best-first (rank 1 = selected), relabelled with readable names
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Linear Discriminant Analysis: feature ranking by Recursive Feature Elimination (RFE)

# Keep 10 attributes. n_features_to_select is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument and raise a
# TypeError for the positional form RFE(model, 10).
rfe = RFE(lda_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Ranking table sorted best-first (rank 1 = selected), relabelled with readable names
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Classification and Regression Trees: feature ranking by Recursive Feature Elimination (RFE)

# Keep 10 attributes. n_features_to_select is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument and raise a
# TypeError for the positional form RFE(model, 10).
rfe = RFE(cart_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Ranking table sorted best-first (rank 1 = selected), relabelled with readable names
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Support Vector Machines: feature ranking by Recursive Feature Elimination (RFE)

# Keep 10 attributes. n_features_to_select is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument and raise a
# TypeError for the positional form RFE(model, 10).
rfe = RFE(svm_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Ranking table sorted best-first (rank 1 = selected), relabelled with readable names
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Random Forest Classifier: feature ranking by Recursive Feature Elimination (RFE)

# Keep 10 attributes. n_features_to_select is passed by keyword because current
# scikit-learn releases accept it as a keyword-only argument and raise a
# TypeError for the positional form RFE(model, 10).
rfe = RFE(rfc_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Ranking table sorted best-first (rank 1 = selected), relabelled with readable names
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt

# get a list of models to evaluate
def get_models():
    """Build one RFE -> decision-tree pipeline per feature-ranking estimator.

    Each pipeline first selects 10 features with RFE driven by one of the
    notebook's estimators (step ``'s'``) and then classifies with a fresh
    ``DecisionTreeClassifier`` (step ``'m'``).

    Returns
    -------
    dict
        Pipelines keyed by ``'LOG'``, ``'LDA'``, ``'CART'``, ``'SVM'`` and
        ``'RFC'``, in that order.
    """
    # estimator that ranks the features inside RFE, per pipeline key
    selectors = {
        'LOG': log_model,    # Logistic Regression
        'LDA': lda_model,    # Linear Discriminant Analysis
        'CART': cart_model,  # Classification & Regression Trees
        'SVM': svm_model,    # Support Vector Machines
        'RFC': rfc_model,    # Random Forest Classifier
    }
    models = dict()
    for key, selector in selectors.items():
        # n_features_to_select is keyword-only in current scikit-learn releases;
        # the positional form RFE(model, 10) raises a TypeError there
        rfe = RFE(selector, n_features_to_select=10)
        model = DecisionTreeClassifier()
        models[key] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Uses 10 splits repeated 3 times (seeded with ``random_state=1``), accuracy
    as the metric, and all available cores (``n_jobs=-1``).

    Returns
    -------
    The array of per-fold accuracy scores produced by ``cross_val_score``.
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=-1,
    )

# build the candidate pipelines
models = get_models()
# cross-validate each pipeline on the training split and keep its fold scores
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    names.append(name)
    results.append(scores)
    # pipeline key, mean accuracy, and standard deviation across folds
    print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
# one box per pipeline showing the spread of its fold accuracies
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Test 2: New Dataset, Robust and Non-Robust

In [None]:
# Reload the dataset from the raw file so this section starts from the original
# string condition labels rather than the integer codes assigned earlier.
# NOTE(review): presumably returns a pandas DataFrame with a 'condition' column —
# confirm against parser_final.
data = RobustFrailMCIpreprocess("rawfile_blood.csv")

In [None]:
# Collapse the five frail / pre-frail / MCI labels into one 'non-robust' class.
# 'robust' and any value not listed are left as they are.
# A boolean mask replaces the row-by-row loop over data.at[i, ...]: that loop used
# the positions 0..len(data)-1 as index labels, so it only worked when `data` had
# a default integer index.
non_robust_labels = ['frail', 'frail_mci', 'mci', 'prefrail_mci', 'prefrail']
data.loc[data['condition'].isin(non_robust_labels), 'condition'] = 'non-robust'

In [None]:
# Inspect the first rows of the dataset
data.head()

In [None]:
# Inspect the last rows of the dataset
data.tail()

In [None]:
# Number of rows per condition label (value_counts sorts by descending count)
c = data['condition'].value_counts()
# The distinct labels in that same order; their positions become the integer codes
condition = c.index
# Display the class counts
c

In [None]:
# Encode every condition label as its position in `condition`.
# The whole mapping is applied in a single pass and the result is assigned back
# to the column. Calling replace(..., inplace=True) on the data['condition']
# selection is chained in-place mutation: pandas warns about it, and with
# Copy-on-Write enabled it leaves `data` untouched. A single pass also cannot
# re-map a value that was produced by an earlier replacement.
label_codes = {label: code for code, label in enumerate(condition)}
# Values without an entry (e.g. missing values, or codes from a previous run) pass through unchanged
data['condition'] = data['condition'].map(lambda value: label_codes.get(value, value))

data.head()

In [None]:
# Inspect the last rows of the dataset
data.tail()

In [None]:
# Target vector: the condition column
y = data['condition']

# Coded predictor columns used by every model in this section
features = ['A1_1', 'A1_2', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
       'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
       'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
       'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
       'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
       'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
# Unscaled predictors
X_old = data[features]

# Min-max scale every feature to [0, 1]. This is the only scaling that ever
# reached the models: the former `X = X_old` and StandardScaler assignments were
# overwritten on the next line, so they have been removed as dead code.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set ranges influence the training features.
X = MinMaxScaler().fit_transform(X_old)

In [None]:
# Summarise the class distribution of the target: Counter maps each value in y
# to the number of rows carrying it
counter = Counter(y)
print(counter)

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# One estimator per algorithm, all with library defaults unless noted
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
# probability=True is required for SVC.predict_proba, which the ROC cells call on
# this model; without it that call fails
svm_model = SVC(kernel='linear', gamma = 'auto', probability=True)
rfc_model = RandomForestClassifier()

# Fit each model on the training split and report its hold-out accuracy
for label, estimator in [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]:
    estimator.fit(X_train, y_train)
    # builtin round() works whether score() returns a NumPy scalar or a plain float
    print(label, round(estimator.score(X_test, y_test), 3))

In [None]:
# Hold-out metrics for Logistic Regression
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, log_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

In [None]:
# Hold-out metrics for Linear Discriminant Analysis
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

In [None]:
# Hold-out metrics for K-Nearest Neighbors
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

In [None]:
# Hold-out metrics for the decision tree (CART)
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

In [None]:
# Hold-out metrics for Gaussian Naive Bayes
# Predict with gnb_model: the previous code used log_model here, so the "GNB"
# metrics were a copy of the Logistic Regression results.
gnb_pred = gnb_model.predict(X_test)

print('Performance Metrics for GNB:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

In [None]:
# Hold-out metrics for the Support Vector Machine
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

In [None]:
# Hold-out metrics for the Random Forest Classifier
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
# builtin round() works whether accuracy_score returns a NumPy scalar or a plain float
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
# rows = true classes, columns = predicted classes
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

In [None]:
# Class scores of every fitted model on the hold-out set.
# predict_proba returns one column per class; column index 1 is kept as the
# "positive outcome" probability that the ROC cells consume.
# NOTE: SVC only provides predict_proba when it was built with probability=True.

# Logistic Regression
log_probs = log_model.predict_proba(X_test)[:, 1]

# Linear Discriminant Analysis
lda_probs = lda_model.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_probs = knn_model.predict_proba(X_test)[:, 1]

# Classification and Regression Trees
cart_probs = cart_model.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]

# Support Vector Machines
svm_probs = svm_model.predict_proba(X_test)[:, 1]

# Random Forest Classifier
rfc_probs = rfc_model.predict_proba(X_test)[:, 1]

In [None]:
# Logistic Regression: ROC curve, AUC and the threshold with the best G-mean

# false / true positive rates at every candidate decision threshold
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs)
# G-mean = sqrt(specificity * sensitivity) = sqrt((1 - FPR) * TPR) per threshold
log_gmeans = np.sqrt((1-log_fpr) * log_tpr)
# position of the threshold that maximises the G-mean
log_ix = log_gmeans.argmax()
print("Logistic Regression:")
print("AUC: ", auc(log_fpr, log_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (log_thresholds[log_ix], log_gmeans[log_ix]))
# axis labels first, then the chance diagonal, the ROC curve and the chosen point
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(log_fpr, log_tpr, marker='.', label='Logistic Regression')
plt.scatter(log_fpr[log_ix], log_tpr[log_ix], marker='o', color='black', label='Best')
plt.legend()
plt.show()

In [None]:
# Linear Discriminant Analysis: ROC curve, AUC and the threshold with the best G-mean

# false / true positive rates at every candidate decision threshold
lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs)
# G-mean = sqrt(specificity * sensitivity) = sqrt((1 - FPR) * TPR) per threshold
lda_gmeans = np.sqrt((1-lda_fpr) * lda_tpr)
# position of the threshold that maximises the G-mean
lda_ix = lda_gmeans.argmax()
print("Linear Discriminant Analysis:")
print("AUC: ", auc(lda_fpr, lda_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (lda_thresholds[lda_ix], lda_gmeans[lda_ix]))
# axis labels first, then the chance diagonal, the ROC curve and the chosen point
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(lda_fpr, lda_tpr, marker='.', label='Linear Discriminant Analysis')
plt.scatter(lda_fpr[lda_ix], lda_tpr[lda_ix], marker='o', color='black', label='Best')
plt.legend()
plt.show()

In [None]:
# K-Nearest Neighbor: ROC curve, AUC and the threshold with the best G-mean

# false / true positive rates at every candidate decision threshold
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs)
# G-mean = sqrt(specificity * sensitivity) = sqrt((1 - FPR) * TPR) per threshold
knn_gmeans = np.sqrt((1-knn_fpr) * knn_tpr)
# position of the threshold that maximises the G-mean
knn_ix = knn_gmeans.argmax()
print("K-Nearest Neighbor:")
print("AUC: ", auc(knn_fpr, knn_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (knn_thresholds[knn_ix], knn_gmeans[knn_ix]))
# axis labels first, then the chance diagonal, the ROC curve and the chosen point
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(knn_fpr, knn_tpr, marker='.', label='K-Nearest Neighbor')
plt.scatter(knn_fpr[knn_ix], knn_tpr[knn_ix], marker='o', color='black', label='Best')
plt.legend()
plt.show()

In [None]:
# Classification and Regression Tree: ROC curve, AUC and the threshold with the best G-mean

# false / true positive rates at every candidate decision threshold
cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs)
# G-mean = sqrt(specificity * sensitivity) = sqrt((1 - FPR) * TPR) per threshold
cart_gmeans = np.sqrt((1-cart_fpr) * cart_tpr)
# position of the threshold that maximises the G-mean
cart_ix = cart_gmeans.argmax()
print("Classification and Regression Tree:")
print("AUC: ", auc(cart_fpr, cart_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (cart_thresholds[cart_ix], cart_gmeans[cart_ix]))
# axis labels first, then the chance diagonal, the ROC curve and the chosen point
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(cart_fpr, cart_tpr, marker='.', label='Classification and Regression Tree')
plt.scatter(cart_fpr[cart_ix], cart_tpr[cart_ix], marker='o', color='black', label='Best')
plt.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes: ROC curve, AUC and the threshold with the best G-mean

# false / true positive rates at every candidate decision threshold
gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs)
# G-mean = sqrt(specificity * sensitivity) = sqrt((1 - FPR) * TPR) per threshold
gnb_gmeans = np.sqrt((1-gnb_fpr) * gnb_tpr)
# position of the threshold that maximises the G-mean
gnb_ix = gnb_gmeans.argmax()
print("Gaussian Naive Bayes:")
print("AUC: ", auc(gnb_fpr, gnb_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (gnb_thresholds[gnb_ix], gnb_gmeans[gnb_ix]))
# axis labels first, then the chance diagonal, the ROC curve and the chosen point
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(gnb_fpr, gnb_tpr, marker='.', label='Gaussian Naive Bayes')
plt.scatter(gnb_fpr[gnb_ix], gnb_tpr[gnb_ix], marker='o', color='black', label='Best')
plt.legend()
plt.show()

In [None]:
# ROC analysis: Support Vector Machines

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs)
svm_gmeans = np.sqrt(svm_tpr * (1 - svm_fpr))
svm_ix = svm_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Support Vector Machines:")
print("AUC: ", auc(svm_fpr, svm_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (svm_thresholds[svm_ix], svm_gmeans[svm_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(svm_fpr, svm_tpr, marker='.', label='Support Vector Machines')
ax.scatter(svm_fpr[svm_ix], svm_tpr[svm_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Random Forest Classifier

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs)
rfc_gmeans = np.sqrt(rfc_tpr * (1 - rfc_fpr))
rfc_ix = rfc_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Random Forest Classifier:")
print("AUC: ", auc(rfc_fpr, rfc_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (rfc_thresholds[rfc_ix], rfc_gmeans[rfc_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(rfc_fpr, rfc_tpr, marker='.', label='Random Forest Classifier')
ax.scatter(rfc_fpr[rfc_ix], rfc_tpr[rfc_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy for every classifier.
# Each model is also fitted on the full X, y so the fitted object stays
# available to later cells; cross_val_score itself works on its own copies.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# (estimator, report line) pairs, evaluated in this order.
cv_reports = [
    (log_model, "Logistic Regression: %0.3f accuracy with a standard deviation of %0.3f"),
    (lda_model, "Linear Discriminant Analysis: %0.3f accuracy with a standard deviation of %0.3f"),
    (knn_model, "K-Nearest Neighbors: %0.3f accuracy with a standard deviation of %0.3f"),
    (cart_model, "Classification and Regression Trees: %0.3f accuracy with a standard deviation of %0.3f"),
    (gnb_model, "Gaussian Naive Bayes: %0.3f accuracy with a standard deviation of %0.3f"),
    (svm_model, "Support Vector Machines: %0.3f accuracy with a standard deviation of %0.3f"),
    (rfc_model, "Random Forest Classifier: %0.3f accuracy with a standard deviation of %0.3f"),
]
for cv_estimator, cv_template in cv_reports:
    cv_estimator.fit(X, y)
    scores = cross_val_score(cv_estimator, X, y, cv=5)
    print(cv_template % (scores.mean(), scores.std()))

In [None]:
# Logistic Regression: classification report printed per fold, fold accuracies returned
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Linear Discriminant Analysis: classification report printed per fold, fold accuracies returned
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# K-Nearest Neighbors: classification report printed per fold, fold accuracies returned
knn_model = KNeighborsClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Classification and Regression Trees: classification report printed per fold, fold accuracies returned
cart_model = DecisionTreeClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(cart_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Gaussian Naive Bayes: classification report printed per fold, fold accuracies returned
gnb_model = GaussianNB().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Support Vector Machines: classification report printed per fold, fold accuracies returned
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Random Forest Classifier: classification report printed per fold, fold accuracies returned
rfc_model = RandomForestClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(rfc_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Logistic Regression

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(log_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# NOTE(review): X_old comes from a cell outside this section; it must hold
# one column per feature of X_train, in the same order -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Linear Discriminant Analysis

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(lda_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# NOTE(review): X_old comes from a cell outside this section; it must hold
# one column per feature of X_train, in the same order -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Classification and Regression Trees

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(cart_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# NOTE(review): X_old comes from a cell outside this section; it must hold
# one column per feature of X_train, in the same order -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Support Vector Machines

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(svm_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# NOTE(review): X_old comes from a cell outside this section; it must hold
# one column per feature of X_train, in the same order -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Random Forest Classifier

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(rfc_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# NOTE(review): X_old comes from a cell outside this section; it must hold
# one column per feature of X_train, in the same order -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt

def get_models():
    """Build the RFE pipelines to compare.

    Every pipeline has two steps: 's' selects 10 features with RFE driven by
    one of the notebook-level estimators, and 'm' fits a fresh
    DecisionTreeClassifier on the selected features. Only the estimator
    steering the selection differs between pipelines.

    Returns:
        dict mapping the short name ('LOG', 'LDA', 'CART', 'SVM', 'RFC') to
        its Pipeline, in that insertion order.
    """
    # Estimators that drive the feature ranking; read from the notebook globals.
    selectors = {
        'LOG': log_model,    # Logistic Regression
        'LDA': lda_model,    # Linear Discriminant Analysis
        'CART': cart_model,  # Classification & Regression Trees
        'SVM': svm_model,    # Support Vector Machines
        'RFC': rfc_model,    # Random Forest Classifier
    }
    models = dict()
    for key, selector in selectors.items():
        # n_features_to_select is keyword-only in scikit-learn >= 1.0.
        rfe = RFE(selector, n_features_to_select=10)
        models[key] = Pipeline(steps=[('s', rfe), ('m', DecisionTreeClassifier())])
    return models

def evaluate_model(model, X, y):
    """Return the accuracy of `model` on every fold of a repeated stratified CV.

    Uses 10 stratified folds repeated 3 times with a fixed seed, so the
    returned array holds one score per fold per repeat. Folds are scored in
    parallel on all available cores (n_jobs=-1).
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

# Score every RFE pipeline on the training split and keep the per-fold accuracies.
models = get_models()
names, results = [], []
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    names.append(name)
    results.append(scores)
    # mean accuracy with its standard deviation in parentheses
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# One box per pipeline; showmeans adds a marker at the mean accuracy.
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Test 3: New Dataset, Robust and Non-Robust (76 samples)

In [None]:
# Pre-parse the dataset
data = RobustFrailMCIpreprocess("rawfile_final.csv")

In [None]:
# Keep only the Frail+MCI and Robust rows; every other condition is dropped.

# Frail+MCI rows first ...
df1 = data.loc[data['condition'] == 'frail_mci'].reset_index(drop=True)
# ... followed by the Robust rows.
df2 = data.loc[data['condition'] == 'robust'].reset_index(drop=True)

# Stack both groups under a fresh 0..n-1 index.
data = pd.concat([df1, df2], ignore_index=True)

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
# Number of rows per class (value_counts lists the most frequent class first).
c = data['condition'].value_counts()
# Class labels in that same order; a label's position here is used as its
# integer code by the encoding cell that follows.
condition = c.index
c

In [None]:
# Encode each class label as its position in `condition` (0, 1, ...).
# The result is assigned back to the column instead of calling an inplace
# method on data['condition']: that chained call acts on a temporary object
# and leaves `data` untouched when pandas Copy-on-Write is active.
label_codes = {label: code for code, label in enumerate(condition)}
# Values without a code (e.g. integers already encoded by an earlier run of
# this cell) pass through unchanged, so re-running the cell is harmless.
data['condition'] = data['condition'].map(lambda value: label_codes.get(value, value))

data.head()

In [None]:
data.tail()

In [None]:
# Predictor column codes, grouped by their prefix.
features = [
    'A1_1', 'A1_2', 'A2_1', 'A3_1',
    'B1_a', 'B1_a1', 'B1_a2', 'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6',
    'B1_b', 'B1_b1', 'B1_b2', 'B1_b3', 'B1_c', 'B1_d',
    'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5',
    'B2_b1', 'B2_b2', 'B2_b3',
    'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
    'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8', 'B2_d9',
    'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6',
]

# Target labels and the raw predictor matrix.
y = data['condition']
X = data[features]

# Standardise each column, then rescale it with MinMaxScaler.
# NOTE(review): both scalers are fitted on every row, i.e. before the
# train/test split, so test rows influence the scaling.
X = MinMaxScaler().fit_transform(StandardScaler().fit_transform(X))

In [None]:
# Undersample so that classes 0 and 1 are each reduced to 76 rows.
sampling_strategy = {0: 76, 1: 76}
# Fixed seed: without it a different random subset is drawn on every run and
# none of the downstream scores can be reproduced, even though the
# train/test split itself is seeded.
undersample = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=1)

# NOTE: this overwrites X and y, so the cell is not meant to be run twice in a row.
X, y = undersample.fit_resample(X, y)

# Transform the dataset using SMOTE
# sampling_strategy = {0: 100, 1: 100}
# oversample = SMOTE(sampling_strategy=sampling_strategy)
# X, y = oversample.fit_resample(X, y)

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# Every model is fitted on the training split and scored (accuracy) on the
# test split. Built-in round() is used so the print works whether score()
# returns a NumPy scalar or a plain Python float.

# Logistic Regression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Regression:", round(log_model.score(X_test, y_test), 3))

# Linear Discriminant Analysis

lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
print("Linear Discriminant Analysis:", round(lda_model.score(X_test, y_test), 3))

# K-Nearest Neighbors

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print("K-Nearest Neigbors:", round(knn_model.score(X_test, y_test), 3))

# Classification and Regression Trees (seeded so the tree is reproducible)

cart_model = DecisionTreeClassifier(random_state=1)
cart_model.fit(X_train, y_train)
print("Classification and Regression Trees:", round(cart_model.score(X_test, y_test), 3))

# Gaussian Naive Bayes

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
print("Gaussian Naive Bayes:", round(gnb_model.score(X_test, y_test), 3))

# Support Vector Machines
# probability=True is required because a later cell calls
# svm_model.predict_proba(X_test) for the ROC curve; without it that call
# raises AttributeError. random_state seeds the internal probability calibration.

svm_model = SVC(kernel='linear', gamma = 'auto', probability=True, random_state=1)
svm_model.fit(X_train, y_train)
print("Support Vector Machines:", round(svm_model.score(X_test, y_test), 3))

# Random Forest Classifier (seeded so the forest is reproducible)

rfc_model = RandomForestClassifier(random_state=1)
rfc_model.fit(X_train, y_train)
print("Random Forest Classifier:", round(rfc_model.score(X_test, y_test), 3))

In [None]:
# Logistic Regression: accuracy, confusion matrix and per-class report on the test split
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
print(round(accuracy_score(y_test, log_pred), 5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

In [None]:
# LDA: accuracy, confusion matrix and per-class report on the test split
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

In [None]:
# kNN: accuracy, confusion matrix and per-class report on the test split
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

In [None]:
# CART: accuracy, confusion matrix and per-class report on the test split
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

In [None]:
# GNB: accuracy, confusion matrix and per-class report on the test split.
# The predictions must come from gnb_model; using log_model here would
# report the logistic regression results under the GNB heading.
gnb_pred = gnb_model.predict(X_test)

print('Performance Metrics for GNB:\n')
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

In [None]:
# Support Vector Machine: accuracy, confusion matrix and per-class report on the test split
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

In [None]:
# Random Forest Classifier: accuracy, confusion matrix and per-class report on the test split
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

In [None]:
# Predicted probability of the positive outcome for every test row, one
# vector per classifier. predict_proba returns one column per class; only
# the second column is kept for the ROC analysis.

# Logistic Regression
log_probs = log_model.predict_proba(X_test)[:, 1]

# Linear Discriminant Analysis
lda_probs = lda_model.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_probs = knn_model.predict_proba(X_test)[:, 1]

# Classification and Regression Trees
cart_probs = cart_model.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]

# Support Vector Machines
svm_probs = svm_model.predict_proba(X_test)[:, 1]

# Random Forest Classifier
rfc_probs = rfc_model.predict_proba(X_test)[:, 1]

In [None]:
# ROC analysis: Logistic Regression

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs)
log_gmeans = np.sqrt(log_tpr * (1 - log_fpr))
log_ix = log_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Logistic Regression:")
print("AUC: ", auc(log_fpr, log_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (log_thresholds[log_ix], log_gmeans[log_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(log_fpr, log_tpr, marker='.', label='Logistic Regression')
ax.scatter(log_fpr[log_ix], log_tpr[log_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Linear Discriminant Analysis

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs)
lda_gmeans = np.sqrt(lda_tpr * (1 - lda_fpr))
lda_ix = lda_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Linear Discriminant Analysis:")
print("AUC: ", auc(lda_fpr, lda_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (lda_thresholds[lda_ix], lda_gmeans[lda_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(lda_fpr, lda_tpr, marker='.', label='Linear Discriminant Analysis')
ax.scatter(lda_fpr[lda_ix], lda_tpr[lda_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: K-Nearest Neighbor

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs)
knn_gmeans = np.sqrt(knn_tpr * (1 - knn_fpr))
knn_ix = knn_gmeans.argmax()  # position of the threshold with the highest G-mean

print("K-Nearest Neighbor:")
print("AUC: ", auc(knn_fpr, knn_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (knn_thresholds[knn_ix], knn_gmeans[knn_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(knn_fpr, knn_tpr, marker='.', label='K-Nearest Neighbor')
ax.scatter(knn_fpr[knn_ix], knn_tpr[knn_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Classification and Regression Tree

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs)
cart_gmeans = np.sqrt(cart_tpr * (1 - cart_fpr))
cart_ix = cart_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Classification and Regression Tree:")
print("AUC: ", auc(cart_fpr, cart_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (cart_thresholds[cart_ix], cart_gmeans[cart_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(cart_fpr, cart_tpr, marker='.', label='Classification and Regression Tree')
ax.scatter(cart_fpr[cart_ix], cart_tpr[cart_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Gaussian Naive Bayes

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs)
gnb_gmeans = np.sqrt(gnb_tpr * (1 - gnb_fpr))
gnb_ix = gnb_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Gaussian Naive Bayes:")
print("AUC: ", auc(gnb_fpr, gnb_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (gnb_thresholds[gnb_ix], gnb_gmeans[gnb_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(gnb_fpr, gnb_tpr, marker='.', label='Gaussian Naive Bayes')
ax.scatter(gnb_fpr[gnb_ix], gnb_tpr[gnb_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Support Vector Machines

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs)
svm_gmeans = np.sqrt(svm_tpr * (1 - svm_fpr))
svm_ix = svm_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Support Vector Machines:")
print("AUC: ", auc(svm_fpr, svm_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (svm_thresholds[svm_ix], svm_gmeans[svm_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(svm_fpr, svm_tpr, marker='.', label='Support Vector Machines')
ax.scatter(svm_fpr[svm_ix], svm_tpr[svm_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# ROC analysis: Random Forest Classifier

# Sweep every decision threshold; G-mean = sqrt(TPR * (1 - FPR)) rewards
# thresholds that do well on both classes at once.
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs)
rfc_gmeans = np.sqrt(rfc_tpr * (1 - rfc_fpr))
rfc_ix = rfc_gmeans.argmax()  # position of the threshold with the highest G-mean

print("Random Forest Classifier:")
print("AUC: ", auc(rfc_fpr, rfc_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (rfc_thresholds[rfc_ix], rfc_gmeans[rfc_ix]))

# Chance diagonal, ROC curve, and the G-mean-optimal operating point.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.plot(rfc_fpr, rfc_tpr, marker='.', label='Random Forest Classifier')
ax.scatter(rfc_fpr[rfc_ix], rfc_tpr[rfc_ix], marker='o', color='black', label='Best')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy for every classifier.
# Each model is also fitted on the full X, y so the fitted object stays
# available to later cells; cross_val_score itself works on its own copies.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# (estimator, report line) pairs, evaluated in this order.
cv_reports = [
    (log_model, "Logistic Regression: %0.3f accuracy with a standard deviation of %0.3f"),
    (lda_model, "Linear Discriminant Analysis: %0.3f accuracy with a standard deviation of %0.3f"),
    (knn_model, "K-Nearest Neighbors: %0.3f accuracy with a standard deviation of %0.3f"),
    (cart_model, "Classification and Regression Trees: %0.3f accuracy with a standard deviation of %0.3f"),
    (gnb_model, "Gaussian Naive Bayes: %0.3f accuracy with a standard deviation of %0.3f"),
    (svm_model, "Support Vector Machines: %0.3f accuracy with a standard deviation of %0.3f"),
    (rfc_model, "Random Forest Classifier: %0.3f accuracy with a standard deviation of %0.3f"),
]
for cv_estimator, cv_template in cv_reports:
    cv_estimator.fit(X, y)
    scores = cross_val_score(cv_estimator, X, y, cv=5)
    print(cv_template % (scores.mean(), scores.std()))

In [None]:
# Logistic Regression: classification report printed per fold, fold accuracies returned
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Linear Discriminant Analysis: classification report printed per fold, fold accuracies returned
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# K-Nearest Neighbors: classification report printed per fold, fold accuracies returned
knn_model = KNeighborsClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Classification and Regression Trees: classification report printed per fold, fold accuracies returned
cart_model = DecisionTreeClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(cart_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Gaussian Naive Bayes: classification report printed per fold, fold accuracies returned
gnb_model = GaussianNB().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Support Vector Machines: classification report printed per fold, fold accuracies returned
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Random Forest Classifier: classification report printed per fold, fold accuracies returned
rfc_model = RandomForestClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(rfc_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Logistic Regression

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(log_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# The index is `features`, the column list X was built from in this section,
# so the labels always line up with the columns of X_train.
rfe_df = pd.DataFrame(rfe.ranking_, index=features, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Linear Discriminant Analysis

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(lda_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# The index is `features`, the column list X was built from in this section,
# so the labels always line up with the columns of X_train.
rfe_df = pd.DataFrame(rfe.ranking_, index=features, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Classification and Regression Trees

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(cart_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# The index is `features`, the column list X was built from in this section,
# so the labels always line up with the columns of X_train.
rfe_df = pd.DataFrame(rfe.ranking_, index=features, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Support Vector Machines

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(svm_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# The index is `features`, the column list X was built from in this section,
# so the labels always line up with the columns of X_train.
rfe_df = pd.DataFrame(rfe.ranking_, index=features, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Random Forest Classifier

# Recursive Feature Elimination (RFE) keeping 10 attributes.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(rfc_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (RFE assigns rank 1 to the selected attributes).
# The index is `features`, the column list X was built from in this section,
# so the labels always line up with the columns of X_train.
rfe_df = pd.DataFrame(rfe.ranking_, index=features, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Swap the column codes for their readable names.
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt

def get_models():
    """Build the RFE pipelines to compare.

    Every pipeline has two steps: 's' selects 10 features with RFE driven by
    one of the notebook-level estimators, and 'm' fits a fresh
    DecisionTreeClassifier on the selected features. Only the estimator
    steering the selection differs between pipelines.

    Returns:
        dict mapping the short name ('LOG', 'LDA', 'CART', 'SVM', 'RFC') to
        its Pipeline, in that insertion order.
    """
    # Estimators that drive the feature ranking; read from the notebook globals.
    selectors = {
        'LOG': log_model,    # Logistic Regression
        'LDA': lda_model,    # Linear Discriminant Analysis
        'CART': cart_model,  # Classification & Regression Trees
        'SVM': svm_model,    # Support Vector Machines
        'RFC': rfc_model,    # Random Forest Classifier
    }
    models = dict()
    for key, selector in selectors.items():
        # n_features_to_select is keyword-only in scikit-learn >= 1.0.
        rfe = RFE(selector, n_features_to_select=10)
        models[key] = Pipeline(steps=[('s', rfe), ('m', DecisionTreeClassifier())])
    return models

def evaluate_model(model, X, y):
    """Return the accuracy of `model` on every fold of a repeated stratified CV.

    Uses 10 stratified folds repeated 3 times with a fixed seed, so the
    returned array holds one score per fold per repeat. Folds are scored in
    parallel on all available cores (n_jobs=-1).
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

# Build the candidate pipelines
models = get_models()
# Score every pipeline on the training split and report mean (std) accuracy
names = list(models)
results = []
for name in names:
    model = models[name]
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# One box per pipeline (means marked) for a side-by-side comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Test 4: New Dataset, Robust and Non-Robust (343 samples)

In [None]:
# Pre-parse the dataset
# RobustFrailMCIpreprocess comes from the project-local parser_final module;
# the following cells use its result as a DataFrame with a 'condition' column.
data = RobustFrailMCIpreprocess("rawfile_final.csv")

In [None]:
# Keep only the Frail+MCI and Robust participants

# Each subset is re-indexed from 0 before the two are stacked
df1 = data.loc[data['condition'] == 'frail_mci'].reset_index(drop=True)
df2 = data.loc[data['condition'] == 'robust'].reset_index(drop=True)

# Frail+MCI rows first, Robust rows after, under a fresh 0..n-1 index
data = pd.concat([df1, df2], ignore_index=True)

In [None]:
# First rows of the filtered table (the Frail+MCI rows were concatenated first)
data.head()

In [None]:
# Last rows of the filtered table (the Robust rows were concatenated last)
data.tail()

In [None]:
# Rows per class; value_counts() lists the most frequent class first
c = data['condition'].value_counts()
# Class labels in that same order; the next cell uses each label's position
# in this index as its integer code
condition = c.index
c

In [None]:
# Encode the class labels as integers: a label's position in `condition`
# becomes its code.
label_codes = {label: code for code, label in enumerate(condition)}
# Assign the result back instead of calling replace(..., inplace=True) on
# data['condition']: that chained in-place call acts on a temporary Series
# under pandas copy-on-write and would leave `data` unchanged.
# Values that are not class labels (e.g. codes from an earlier run of this
# cell) are kept as they are, so re-running the cell is harmless.
data['condition'] = data['condition'].map(lambda label: label_codes.get(label, label))

data.head()

In [None]:
# Last rows again, to check the 'condition' column after the encoding cell
data.tail()

In [None]:
# Target: the 'condition' column
y = data['condition']

# Predictor columns (raw column codes), grouped by prefix
features = [
    'A1_1', 'A1_2', 'A2_1', 'A3_1',
    'B1_a', 'B1_a1', 'B1_a2', 'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6',
    'B1_b', 'B1_b1', 'B1_b2', 'B1_b3', 'B1_c', 'B1_d',
    'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5',
    'B2_b1', 'B2_b2', 'B2_b3',
    'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
    'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8', 'B2_d9',
    'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6',
]
X = data[features]

# Standardise first, then min-max rescale; both scalers are fitted on all rows
for scaler in (StandardScaler(), MinMaxScaler()):
    X = scaler.fit_transform(X)

In [None]:
# Class balancing for this test: SMOTE with its default settings only.
# Disabled alternative (fixed-size random undersampling), kept for reference:
# sampling_strategy = {0: 76, 1: 76}
# undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

# X, y = undersample.fit_resample(X, y)

# Transform the dataset using SMOTE
# sampling_strategy = {0: 100, 1: 100}
# NOTE(review): resampling runs on the full X, y before the train/test split in
# the next cell, so synthetic samples can land in the test set; no random_state
# is set either, so results differ between runs -- confirm this is intended.
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
# Hold out 40% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# One instance per classifier; all use default hyper-parameters except the
# SVM, which is given a linear kernel
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# Fit each model on the training split and print its hold-out accuracy
labelled_models = [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]
for label, clf in labelled_models:
    clf.fit(X_train, y_train)
    # Built-in round() handles NumPy scalars and plain Python floats alike;
    # the .round() method exists only on NumPy scalars.
    print(label, round(clf.score(X_test, y_test), 3))

In [None]:
# Hold-out metrics for Logistic Regression: accuracy, confusion matrix, per-class report
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, log_pred), 5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

In [None]:
# Hold-out metrics for LDA: accuracy, confusion matrix, per-class report
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

In [None]:
# Hold-out metrics for kNN: accuracy, confusion matrix, per-class report
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

In [None]:
# Hold-out metrics for CART: accuracy, confusion matrix, per-class report
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

In [None]:
# Hold-out metrics for GNB: accuracy, confusion matrix, per-class report
# Predict with gnb_model: the earlier code called log_model here, which printed
# the Logistic Regression results under the GNB heading.
gnb_pred = gnb_model.predict(X_test)

print('Performance Metrics for GNB:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

In [None]:
# Hold-out metrics for the Support Vector Machine: accuracy, confusion matrix, per-class report
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

In [None]:
# Hold-out metrics for the Random Forest Classifier: accuracy, confusion matrix, per-class report
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

In [None]:
# Positive-class scores on the test split for every model; these feed the
# ROC cells. Column 1 of predict_proba is the probability of the second class.

# Logistic Regression
log_probs = log_model.predict_proba(X_test)[:, 1]

# Linear Discriminant Analysis
lda_probs = lda_model.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_probs = knn_model.predict_proba(X_test)[:, 1]

# Classification and Regression Trees
cart_probs = cart_model.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]

# Support Vector Machines
# svm_model is created without probability=True, so predict_proba is not
# available on it and the previous call raised AttributeError. The signed
# distance to the separating hyperplane is used instead: roc_curve only needs
# a score that ranks the positive class higher, so the ROC curve and AUC are
# unaffected. The SVM thresholds are therefore margins, not probabilities.
svm_probs = svm_model.decision_function(X_test)

# Random Forest Classifier
rfc_probs = rfc_model.predict_proba(X_test)[:, 1]

In [None]:
# Logistic Regression: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
log_gmeans = np.sqrt((1 - log_fpr) * log_tpr)
log_ix = log_gmeans.argmax()
print("Logistic Regression:")
print("AUC: ", auc(log_fpr, log_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (log_thresholds[log_ix], log_gmeans[log_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(log_fpr, log_tpr, marker='.', label='Logistic Regression')
plt.scatter(log_fpr[log_ix], log_tpr[log_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Linear Discriminant Analysis: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
lda_gmeans = np.sqrt((1 - lda_fpr) * lda_tpr)
lda_ix = lda_gmeans.argmax()
print("Linear Discriminant Analysis:")
print("AUC: ", auc(lda_fpr, lda_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (lda_thresholds[lda_ix], lda_gmeans[lda_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(lda_fpr, lda_tpr, marker='.', label='Linear Discriminant Analysis')
plt.scatter(lda_fpr[lda_ix], lda_tpr[lda_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# K-Nearest Neighbor: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
knn_gmeans = np.sqrt((1 - knn_fpr) * knn_tpr)
knn_ix = knn_gmeans.argmax()
print("K-Nearest Neighbor:")
print("AUC: ", auc(knn_fpr, knn_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (knn_thresholds[knn_ix], knn_gmeans[knn_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(knn_fpr, knn_tpr, marker='.', label='K-Nearest Neighbor')
plt.scatter(knn_fpr[knn_ix], knn_tpr[knn_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Classification and Regression Tree: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
cart_gmeans = np.sqrt((1 - cart_fpr) * cart_tpr)
cart_ix = cart_gmeans.argmax()
print("Classification and Regression Tree:")
print("AUC: ", auc(cart_fpr, cart_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (cart_thresholds[cart_ix], cart_gmeans[cart_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(cart_fpr, cart_tpr, marker='.', label='Classification and Regression Tree')
plt.scatter(cart_fpr[cart_ix], cart_tpr[cart_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
gnb_gmeans = np.sqrt((1 - gnb_fpr) * gnb_tpr)
gnb_ix = gnb_gmeans.argmax()
print("Gaussian Naive Bayes:")
print("AUC: ", auc(gnb_fpr, gnb_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (gnb_thresholds[gnb_ix], gnb_gmeans[gnb_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(gnb_fpr, gnb_tpr, marker='.', label='Gaussian Naive Bayes')
plt.scatter(gnb_fpr[gnb_ix], gnb_tpr[gnb_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Support Vector Machines: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
svm_gmeans = np.sqrt((1 - svm_fpr) * svm_tpr)
svm_ix = svm_gmeans.argmax()
print("Support Vector Machines:")
print("AUC: ", auc(svm_fpr, svm_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (svm_thresholds[svm_ix], svm_gmeans[svm_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(svm_fpr, svm_tpr, marker='.', label='Support Vector Machines')
plt.scatter(svm_fpr[svm_ix], svm_tpr[svm_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Random Forest Classifier: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
rfc_gmeans = np.sqrt((1 - rfc_fpr) * rfc_tpr)
rfc_ix = rfc_gmeans.argmax()
print("Random Forest Classifier:")
print("AUC: ", auc(rfc_fpr, rfc_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (rfc_thresholds[rfc_ix], rfc_gmeans[rfc_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(rfc_fpr, rfc_tpr, marker='.', label='Random Forest Classifier')
plt.scatter(rfc_fpr[rfc_ix], rfc_tpr[rfc_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy of every classifier on the full X, y.
# Each model is first refitted on all rows, so the refitted instances remain
# available under their usual names for the cells that follow.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# (model, report line) pairs, processed in the same order as before
cv_runs = [
    (log_model, "Logistic Regression: %0.3f accuracy with a standard deviation of %0.3f"),
    (lda_model, "Linear Discriminant Analysis: %0.3f accuracy with a standard deviation of %0.3f"),
    (knn_model, "K-Nearest Neighbors: %0.3f accuracy with a standard deviation of %0.3f"),
    (cart_model, "Classification and Regression Trees: %0.3f accuracy with a standard deviation of %0.3f"),
    (gnb_model, "Gaussian Naive Bayes: %0.3f accuracy with a standard deviation of %0.3f"),
    (svm_model, "Support Vector Machines: %0.3f accuracy with a standard deviation of %0.3f"),
    (rfc_model, "Random Forest Classifier: %0.3f accuracy with a standard deviation of %0.3f"),
]
for clf, line_format in cv_runs:
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=5)
    print(line_format % (scores.mean(), scores.std()))

In [None]:
# Logistic Regression
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Linear Discriminant Analysis
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# K-Nearest Neighbors
# Fresh model refitted on all rows (fit returns the estimator itself)
knn_model = KNeighborsClassifier().fit(X, y)
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Classification and Regression Trees
# Fresh model refitted on all rows (fit returns the estimator itself)
cart_model = DecisionTreeClassifier().fit(X, y)
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(cart_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Gaussian Naive Bayes
# Fresh model refitted on all rows (fit returns the estimator itself)
gnb_model = GaussianNB().fit(X, y)
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Support Vector Machines
# Fresh linear-kernel model refitted on all rows (fit returns the estimator itself)
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Random Forest Classifier
# Fresh model refitted on all rows (fit returns the estimator itself)
rfc_model = RandomForestClassifier().fit(X, y)
# The scorer prints a classification report for each fold and returns that
# fold's accuracy, so `scores` holds the five fold accuracies.
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(rfc_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Logistic Regression

# Recursive Feature Elimination (RFE) keeping the 10 best-ranked attributes.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, so the positional form RFE(estimator, 10) raises TypeError there.
rfe = RFE(log_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (selected attributes have rank 1).
# NOTE(review): X_old is defined outside this cell; its columns are assumed to
# line up with the columns of X_train -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_, index=X_old.columns, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Show readable feature names instead of the raw column codes
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Linear Discriminant Analysis

# Recursive Feature Elimination (RFE) keeping the 10 best-ranked attributes.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, so the positional form RFE(estimator, 10) raises TypeError there.
rfe = RFE(lda_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (selected attributes have rank 1).
# NOTE(review): X_old is defined outside this cell; its columns are assumed to
# line up with the columns of X_train -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_, index=X_old.columns, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Show readable feature names instead of the raw column codes
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Classification and Regression Trees

# Recursive Feature Elimination (RFE) keeping the 10 best-ranked attributes.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, so the positional form RFE(estimator, 10) raises TypeError there.
rfe = RFE(cart_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (selected attributes have rank 1).
# NOTE(review): X_old is defined outside this cell; its columns are assumed to
# line up with the columns of X_train -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_, index=X_old.columns, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Show readable feature names instead of the raw column codes
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Support Vector Machines

# Recursive Feature Elimination (RFE) keeping the 10 best-ranked attributes.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, so the positional form RFE(estimator, 10) raises TypeError there.
rfe = RFE(svm_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (selected attributes have rank 1).
# NOTE(review): X_old is defined outside this cell; its columns are assumed to
# line up with the columns of X_train -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_, index=X_old.columns, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Show readable feature names instead of the raw column codes
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Random Forest Classifier

# Recursive Feature Elimination (RFE) keeping the 10 best-ranked attributes.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, so the positional form RFE(estimator, 10) raises TypeError there.
rfe = RFE(rfc_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Rank table, best first (selected attributes have rank 1).
# NOTE(review): X_old is defined outside this cell; its columns are assumed to
# line up with the columns of X_train -- confirm.
rfe_df = pd.DataFrame(rfe.ranking_, index=X_old.columns, columns=['Rank']).sort_values(by='Rank', ascending=True)
# Show readable feature names instead of the raw column codes
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt

# get a list of models to evaluate
def get_models():
    """Build one RFE -> decision-tree pipeline per feature-ranking estimator.

    Each pipeline keeps the 10 features ranked best by RFE around the given
    estimator and then fits a fresh DecisionTreeClassifier on them, so the
    pipelines differ only in the estimator that drives the selection.

    Returns:
        dict: pipelines keyed by 'LOG', 'LDA', 'CART', 'SVM' and 'RFC'
        (in that insertion order).
    """
    # The ranking estimators are notebook globals created in earlier cells.
    selectors = (
        ('LOG', log_model),    # Logistic Regression
        ('LDA', lda_model),    # Linear Discriminant Analysis
        ('CART', cart_model),  # Classification & Regression Trees
        ('SVM', svm_model),    # Support Vector Machines
        ('RFC', rfc_model),    # Random Forest Classifier
    )
    models = dict()
    for key, estimator in selectors:
        # Keyword form: n_features_to_select is keyword-only in current
        # scikit-learn, so RFE(estimator, 10) raises TypeError there.
        rfe = RFE(estimator, n_features_to_select=10)
        model = DecisionTreeClassifier()
        models[key] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the per-fold accuracy of `model` under repeated stratified CV.

    Uses 10 stratified folds repeated 3 times (random_state=1); folds are
    evaluated in parallel with n_jobs=-1.
    """
    splitter = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
    return cross_val_score(model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1)

# Build the candidate pipelines
models = get_models()
# Score every pipeline on the training split and report mean (std) accuracy
names = list(models)
results = []
for name in names:
    model = models[name]
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# One box per pipeline (means marked) for a side-by-side comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Test 5: New Dataset, Robust and Non-Robust (100 samples)

In [None]:
# Pre-parse the dataset
# RobustFrailMCIpreprocess comes from the project-local parser_final module;
# the following cells use its result as a DataFrame with a 'condition' column.
data = RobustFrailMCIpreprocess("rawfile_final.csv")

In [None]:
# Keep only the Frail+MCI and Robust participants

# Each subset is re-indexed from 0 before the two are stacked
df1 = data.loc[data['condition'] == 'frail_mci'].reset_index(drop=True)
df2 = data.loc[data['condition'] == 'robust'].reset_index(drop=True)

# Frail+MCI rows first, Robust rows after, under a fresh 0..n-1 index
data = pd.concat([df1, df2], ignore_index=True)

In [None]:
# First rows of the filtered table (the Frail+MCI rows were concatenated first)
data.head()

In [None]:
# Last rows of the filtered table (the Robust rows were concatenated last)
data.tail()

In [None]:
# Rows per class; value_counts() lists the most frequent class first
c = data['condition'].value_counts()
# Class labels in that same order; the next cell uses each label's position
# in this index as its integer code
condition = c.index
c

In [None]:
# Encode the class labels as integers: a label's position in `condition`
# becomes its code.
label_codes = {label: code for code, label in enumerate(condition)}
# Assign the result back instead of calling replace(..., inplace=True) on
# data['condition']: that chained in-place call acts on a temporary Series
# under pandas copy-on-write and would leave `data` unchanged.
# Values that are not class labels (e.g. codes from an earlier run of this
# cell) are kept as they are, so re-running the cell is harmless.
data['condition'] = data['condition'].map(lambda label: label_codes.get(label, label))

data.head()

In [None]:
# Last rows again, to check the 'condition' column after the encoding cell
data.tail()

In [None]:
# Target: the 'condition' column
y = data['condition']

# Predictor columns (raw column codes), grouped by prefix
features = [
    'A1_1', 'A1_2', 'A2_1', 'A3_1',
    'B1_a', 'B1_a1', 'B1_a2', 'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6',
    'B1_b', 'B1_b1', 'B1_b2', 'B1_b3', 'B1_c', 'B1_d',
    'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5',
    'B2_b1', 'B2_b2', 'B2_b3',
    'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
    'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8', 'B2_d9',
    'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6',
]
X = data[features]

# Standardise first, then min-max rescale; both scalers are fitted on all rows
for scaler in (StandardScaler(), MinMaxScaler()):
    X = scaler.fit_transform(X)

In [None]:
# Two-stage class balancing for class codes 0 and 1.
# NOTE(review): both stages run on the full X, y before the train/test split in
# the next cell and without a random_state -- confirm this is intended.

# Stage 1: random undersampling with a target of 76 rows per class
undersample = RandomUnderSampler(sampling_strategy=dict.fromkeys((0, 1), 76))
X, y = undersample.fit_resample(X, y)

# Stage 2: SMOTE oversampling with a target of 100 rows per class
sampling_strategy = dict.fromkeys((0, 1), 100)
oversample = SMOTE(sampling_strategy=sampling_strategy)
X, y = oversample.fit_resample(X, y)

In [None]:
# Hold out 40% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

# One instance per classifier; all use default hyper-parameters except the
# SVM, which is given a linear kernel
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# Fit each model on the training split and print its hold-out accuracy
labelled_models = [
    ("Logistic Regression:", log_model),
    ("Linear Discriminant Analysis:", lda_model),
    ("K-Nearest Neigbors:", knn_model),
    ("Classification and Regression Trees:", cart_model),
    ("Gaussian Naive Bayes:", gnb_model),
    ("Support Vector Machines:", svm_model),
    ("Random Forest Classifier:", rfc_model),
]
for label, clf in labelled_models:
    clf.fit(X_train, y_train)
    # Built-in round() handles NumPy scalars and plain Python floats alike;
    # the .round() method exists only on NumPy scalars.
    print(label, round(clf.score(X_test, y_test), 3))

In [None]:
# Hold-out metrics for Logistic Regression: accuracy, confusion matrix, per-class report
log_pred = log_model.predict(X_test)

print('Performance Metrics for Logistic Regression:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, log_pred), 5), '\n')
print(confusion_matrix(y_test, log_pred), '\n')
print(classification_report(y_test, log_pred))

In [None]:
# Hold-out metrics for LDA: accuracy, confusion matrix, per-class report
lda_pred = lda_model.predict(X_test)

print('Performance Metrics for LDA:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, lda_pred), 5), '\n')
print(confusion_matrix(y_test, lda_pred), '\n')
print(classification_report(y_test, lda_pred))

In [None]:
# Hold-out metrics for kNN: accuracy, confusion matrix, per-class report
knn_pred = knn_model.predict(X_test)

print('Performance Metrics for KNN:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, knn_pred), 5), '\n')
print(confusion_matrix(y_test, knn_pred), '\n')
print(classification_report(y_test, knn_pred))

In [None]:
# Hold-out metrics for CART: accuracy, confusion matrix, per-class report
cart_pred = cart_model.predict(X_test)

print('Performance Metrics for CART:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, cart_pred), 5), '\n')
print(confusion_matrix(y_test, cart_pred), '\n')
print(classification_report(y_test, cart_pred))

In [None]:
# Hold-out metrics for GNB: accuracy, confusion matrix, per-class report
# Predict with gnb_model: the earlier code called log_model here, which printed
# the Logistic Regression results under the GNB heading.
gnb_pred = gnb_model.predict(X_test)

print('Performance Metrics for GNB:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, gnb_pred), 5), '\n')
print(confusion_matrix(y_test, gnb_pred), '\n')
print(classification_report(y_test, gnb_pred))

In [None]:
# Hold-out metrics for the Support Vector Machine: accuracy, confusion matrix, per-class report
svm_pred = svm_model.predict(X_test)

print('Performance Metrics for SVM:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, svm_pred), 5), '\n')
print(confusion_matrix(y_test, svm_pred), '\n')
print(classification_report(y_test, svm_pred))

In [None]:
# Hold-out metrics for the Random Forest Classifier: accuracy, confusion matrix, per-class report
rfc_pred = rfc_model.predict(X_test)

print('Performance Metrics for RFC:\n')
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the .round() method exists only on NumPy scalars.
print(round(accuracy_score(y_test, rfc_pred), 5), '\n')
print(confusion_matrix(y_test, rfc_pred), '\n')
print(classification_report(y_test, rfc_pred))

In [None]:
# Positive-class scores on the test split for every model; these feed the
# ROC cells. Column 1 of predict_proba is the probability of the second class.

# Logistic Regression
log_probs = log_model.predict_proba(X_test)[:, 1]

# Linear Discriminant Analysis
lda_probs = lda_model.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_probs = knn_model.predict_proba(X_test)[:, 1]

# Classification and Regression Trees
cart_probs = cart_model.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]

# Support Vector Machines
# svm_model is created without probability=True, so predict_proba is not
# available on it and the previous call raised AttributeError. The signed
# distance to the separating hyperplane is used instead: roc_curve only needs
# a score that ranks the positive class higher, so the ROC curve and AUC are
# unaffected. The SVM thresholds are therefore margins, not probabilities.
svm_probs = svm_model.decision_function(X_test)

# Random Forest Classifier
rfc_probs = rfc_model.predict_proba(X_test)[:, 1]

In [None]:
# Logistic Regression: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
log_gmeans = np.sqrt((1 - log_fpr) * log_tpr)
log_ix = log_gmeans.argmax()
print("Logistic Regression:")
print("AUC: ", auc(log_fpr, log_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (log_thresholds[log_ix], log_gmeans[log_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(log_fpr, log_tpr, marker='.', label='Logistic Regression')
plt.scatter(log_fpr[log_ix], log_tpr[log_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Linear Discriminant Analysis: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
lda_gmeans = np.sqrt((1 - lda_fpr) * lda_tpr)
lda_ix = lda_gmeans.argmax()
print("Linear Discriminant Analysis:")
print("AUC: ", auc(lda_fpr, lda_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (lda_thresholds[lda_ix], lda_gmeans[lda_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(lda_fpr, lda_tpr, marker='.', label='Linear Discriminant Analysis')
plt.scatter(lda_fpr[lda_ix], lda_tpr[lda_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# K-Nearest Neighbor: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
knn_gmeans = np.sqrt((1 - knn_fpr) * knn_tpr)
knn_ix = knn_gmeans.argmax()
print("K-Nearest Neighbor:")
print("AUC: ", auc(knn_fpr, knn_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (knn_thresholds[knn_ix], knn_gmeans[knn_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(knn_fpr, knn_tpr, marker='.', label='K-Nearest Neighbor')
plt.scatter(knn_fpr[knn_ix], knn_tpr[knn_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Classification and Regression Tree: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
cart_gmeans = np.sqrt((1 - cart_fpr) * cart_tpr)
cart_ix = cart_gmeans.argmax()
print("Classification and Regression Tree:")
print("AUC: ", auc(cart_fpr, cart_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (cart_thresholds[cart_ix], cart_gmeans[cart_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(cart_fpr, cart_tpr, marker='.', label='Classification and Regression Tree')
plt.scatter(cart_fpr[cart_ix], cart_tpr[cart_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
gnb_gmeans = np.sqrt((1 - gnb_fpr) * gnb_tpr)
gnb_ix = gnb_gmeans.argmax()
print("Gaussian Naive Bayes:")
print("AUC: ", auc(gnb_fpr, gnb_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (gnb_thresholds[gnb_ix], gnb_gmeans[gnb_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(gnb_fpr, gnb_tpr, marker='.', label='Gaussian Naive Bayes')
plt.scatter(gnb_fpr[gnb_ix], gnb_tpr[gnb_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Support Vector Machines: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
svm_gmeans = np.sqrt((1 - svm_fpr) * svm_tpr)
svm_ix = svm_gmeans.argmax()
print("Support Vector Machines:")
print("AUC: ", auc(svm_fpr, svm_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (svm_thresholds[svm_ix], svm_gmeans[svm_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(svm_fpr, svm_tpr, marker='.', label='Support Vector Machines')
plt.scatter(svm_fpr[svm_ix], svm_tpr[svm_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Random Forest Classifier: ROC curve, AUC and the threshold with the highest G-mean

# False-positive rate, true-positive rate and score cut-off per operating point
rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs)
# G-mean = sqrt(TPR * (1 - FPR)); the best cut-off is where it peaks
rfc_gmeans = np.sqrt((1 - rfc_fpr) * rfc_tpr)
rfc_ix = rfc_gmeans.argmax()
print("Random Forest Classifier:")
print("AUC: ", auc(rfc_fpr, rfc_tpr))
print('Best Threshold=%f, G-Mean=%.3f' % (rfc_thresholds[rfc_ix], rfc_gmeans[rfc_ix]))

# Chance diagonal, the model's curve, and a black dot on the best operating point
plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
plt.plot(rfc_fpr, rfc_tpr, marker='.', label='Random Forest Classifier')
plt.scatter(rfc_fpr[rfc_ix], rfc_tpr[rfc_ix], marker='o', color='black', label='Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# 5-fold cross-validated accuracy of every classifier on the full X, y.
# Each model is first refitted on all rows, so the refitted instances remain
# available under their usual names for the cells that follow.
log_model = LogisticRegression()
lda_model = LinearDiscriminantAnalysis()
knn_model = KNeighborsClassifier()
cart_model = DecisionTreeClassifier()
gnb_model = GaussianNB()
svm_model = SVC(kernel='linear', gamma = 'auto')
rfc_model = RandomForestClassifier()

# (model, report line) pairs, processed in the same order as before
cv_runs = [
    (log_model, "Logistic Regression: %0.3f accuracy with a standard deviation of %0.3f"),
    (lda_model, "Linear Discriminant Analysis: %0.3f accuracy with a standard deviation of %0.3f"),
    (knn_model, "K-Nearest Neighbors: %0.3f accuracy with a standard deviation of %0.3f"),
    (cart_model, "Classification and Regression Trees: %0.3f accuracy with a standard deviation of %0.3f"),
    (gnb_model, "Gaussian Naive Bayes: %0.3f accuracy with a standard deviation of %0.3f"),
    (svm_model, "Support Vector Machines: %0.3f accuracy with a standard deviation of %0.3f"),
    (rfc_model, "Random Forest Classifier: %0.3f accuracy with a standard deviation of %0.3f"),
]
for clf, line_format in cv_runs:
    clf.fit(X, y)
    scores = cross_val_score(clf, X, y, cv=5)
    print(line_format % (scores.mean(), scores.std()))

In [None]:
# Logistic Regression: the custom scorer prints a classification report for
# each of the 5 folds and returns that fold's accuracy
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(log_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Linear Discriminant Analysis: the custom scorer prints a classification
# report for each of the 5 folds and returns that fold's accuracy
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(lda_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# K-Nearest Neighbors: rebuild and fit the model, then print a classification
# report for each of the 5 folds and show the per-fold accuracies
knn_model = KNeighborsClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(knn_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Classification and Regression Trees: rebuild and fit the model, then print a
# classification report for each of the 5 folds and show the per-fold accuracies
cart_model = DecisionTreeClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(cart_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Gaussian Naive Bayes: rebuild and fit the model, then print a classification
# report for each of the 5 folds and show the per-fold accuracies
gnb_model = GaussianNB().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(gnb_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Support Vector Machines (linear kernel): rebuild and fit the model, then print
# a classification report for each of the 5 folds and show the per-fold accuracies
svm_model = SVC(kernel='linear', gamma = 'auto').fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(svm_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Random Forest Classifier: rebuild and fit the model, then print a
# classification report for each of the 5 folds and show the per-fold accuracies
rfc_model = RandomForestClassifier().fit(X, y)
report_scorer = make_scorer(classification_report_with_accuracy_score)
scores = cross_val_score(rfc_model, X, y, scoring=report_scorer, cv=5)
print(scores)

In [None]:
# Logistic Regression

# Recursive Feature Elimination (RFE) driven by the logistic-regression model,
# keeping 10 attributes. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 accepts only the estimator positionally; RFE(log_model, 10)
# raises a TypeError there.
rfe = RFE(log_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Summarise the selection: rank 1 marks a selected attribute, larger ranks were
# eliminated earlier (rfe.support_ holds the equivalent boolean mask).
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Replace the coded column names with their readable labels
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Linear Discriminant Analysis

# Recursive Feature Elimination (RFE) driven by the LDA model, keeping 10
# attributes. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 accepts only the estimator positionally; RFE(lda_model, 10)
# raises a TypeError there.
rfe = RFE(lda_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Summarise the selection: rank 1 marks a selected attribute, larger ranks were
# eliminated earlier (rfe.support_ holds the equivalent boolean mask).
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Replace the coded column names with their readable labels
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Classification and Regression Trees

# Recursive Feature Elimination (RFE) driven by the decision-tree model, keeping
# 10 attributes. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 accepts only the estimator positionally; RFE(cart_model, 10)
# raises a TypeError there.
rfe = RFE(cart_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Summarise the selection: rank 1 marks a selected attribute, larger ranks were
# eliminated earlier (rfe.support_ holds the equivalent boolean mask).
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Replace the coded column names with their readable labels
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Support Vector Machines

# Recursive Feature Elimination (RFE) driven by the SVM model, keeping 10
# attributes. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 accepts only the estimator positionally; RFE(svm_model, 10)
# raises a TypeError there.
rfe = RFE(svm_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Summarise the selection: rank 1 marks a selected attribute, larger ranks were
# eliminated earlier (rfe.support_ holds the equivalent boolean mask).
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Replace the coded column names with their readable labels
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
# Random Forest Classifier

# Recursive Feature Elimination (RFE) driven by the random-forest model, keeping
# 10 attributes. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 accepts only the estimator positionally; RFE(rfc_model, 10)
# raises a TypeError there.
rfe = RFE(rfc_model, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Summarise the selection: rank 1 marks a selected attribute, larger ranks were
# eliminated earlier (rfe.support_ holds the equivalent boolean mask).
rfe_df = pd.DataFrame(rfe.ranking_,index=X_old.columns,columns=['Rank']).sort_values(by='Rank',ascending=True)
# Replace the coded column names with their readable labels
rfe_df.index = rfe_df.index.map(featureName_mapping)
rfe_df

In [None]:
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt

# get a list of models to evaluate
def get_models():
    """Build one RFE -> decision-tree pipeline per feature-ranking estimator.

    Each pipeline first reduces the inputs to 10 features with Recursive
    Feature Elimination driven by one of the notebook-level estimators
    (log_model, lda_model, cart_model, svm_model, rfc_model), then fits a
    fresh DecisionTreeClassifier on the selected features.

    Returns:
        dict mapping the short names 'LOG', 'LDA', 'CART', 'SVM' and 'RFC'
        (in that insertion order) to the corresponding Pipeline.
    """
    # Short name -> estimator whose coefficients/importances rank the features
    ranking_estimators = {
        'LOG': log_model,     # Logistic Regression
        'LDA': lda_model,     # Linear Discriminant Analysis
        'CART': cart_model,   # Classification & Regression Trees
        'SVM': svm_model,     # Support Vector Machines
        'RFC': rfc_model,     # Random Forest Classifier
    }
    models = dict()
    for short_name, ranking_estimator in ranking_estimators.items():
        # n_features_to_select must be a keyword argument: scikit-learn >= 1.0
        # rejects RFE(estimator, 10) with a TypeError.
        rfe = RFE(ranking_estimator, n_features_to_select=10)
        model = DecisionTreeClassifier()
        models[short_name] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the accuracy scores of `model` under repeated stratified CV.

    The splitter uses 10 folds repeated 3 times with random_state=1, and the
    folds are scored with all available cores (n_jobs=-1).
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
        n_jobs=-1,
    )

# Build the candidate pipelines
models = get_models()
# Score every pipeline on the training split, keeping the fold scores and
# names in matching order for the plot
results, names = [], []
for name in models:
    model = models[name]
    scores = evaluate_model(model, X_train, y_train)
    names.append(name)
    results.append(scores)
    # One summary line per pipeline: mean accuracy (standard deviation)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# Box plot of the fold accuracies, one box per pipeline, with the mean marked
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# major_num = []
# log_acc = []
# lda_acc = []
# knn_acc = []
# cart_acc = []
# gnb_acc = []
# svm_acc = []
# rfc_acc = []

# log_auc_list = []
# lda_auc_list = []
# knn_auc_list = []
# cart_auc_list = []
# gnb_auc_list = []
# svm_auc_list = []
# rfc_auc_list = []

# for classnumber in range(76, 344):
#     y = data['condition']

#     features = ['A1_1', 'A1_2', 'A2_1', 'A3_1', 'B1_a', 'B1_a1', 'B1_a2',
#            'B1_a3', 'B1_a4', 'B1_a5', 'B1_a6', 'B1_b', 'B1_b1', 'B1_b2', 'B1_b3',
#            'B1_c', 'B1_d', 'B2_a1', 'B2_a2', 'B2_a3', 'B2_a4', 'B2_a5', 'B2_b1',
#            'B2_b2', 'B2_b3', 'B2_c1', 'B2_c2', 'B2_c4', 'B2_c5', 'B2_c6', 'B2_c7',
#            'B2_d1', 'B2_d2', 'B2_d3', 'B2_d4', 'B2_d5', 'B2_d6', 'B2_d7', 'B2_d8',
#            'B2_d9', 'B3', 'B4_a2', 'B4_a5', 'B5_a2', 'B5_a3', 'B6']
#     X = data[features]

#     X = StandardScaler().fit_transform(X)
#     X = MinMaxScaler().fit_transform(X)

#     sampling_strategy = {0: classnumber, 1: 76}
#     undersample = RandomUnderSampler(sampling_strategy=sampling_strategy)

#     X, y = undersample.fit_resample(X, y)

#     # Transform the dataset using SMOTE
#     oversample = SMOTE()
#     X, y = oversample.fit_resample(X, y)
    
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 1)

#     # Logistic Regression

#     log_model = LogisticRegression()
#     log_model.fit(X_train, y_train)
#     log_pred = log_model.predict(X_test)
# #     print("Logistic Regression:", log_model.score(X_test, y_test).round(3))
#     log_acc.append(log_model.score(X_test, y_test).round(3))

#     # Linear Discriminant Analysis

#     lda_model = LinearDiscriminantAnalysis()
#     lda_model.fit(X_train, y_train)
#     lda_pred = lda_model.predict(X_test)
# #     print("Linear Discriminant Analysis:", lda_model.score(X_test, y_test).round(3))
#     lda_acc.append(lda_model.score(X_test, y_test).round(3))

#     # K-Nearest Neighbors

#     knn_model = KNeighborsClassifier()
#     knn_model.fit(X_train, y_train)
#     knn_pred = knn_model.predict(X_test)
# #     print("K-Nearest Neigbors:", knn_model.score(X_test, y_test).round(3))
#     knn_acc.append(knn_model.score(X_test, y_test).round(3))

#     # Classification and Regression Trees

#     cart_model = DecisionTreeClassifier()
#     cart_model.fit(X_train, y_train)
#     cart_pred = cart_model.predict(X_test)
# #     print("Classification and Regression Trees:", cart_model.score(X_test, y_test).round(3))
#     cart_acc.append(cart_model.score(X_test, y_test).round(3))

#     # Gaussian Naive Bayes

#     gnb_model = GaussianNB()
#     gnb_model.fit(X_train, y_train)
#     gnb_pred = gnb_model.predict(X_test)
# #     print("Gaussian Naive Bayes:", gnb_model.score(X_test, y_test).round(3))
#     gnb_acc.append(gnb_model.score(X_test, y_test).round(3))

#     # Support Vector Machines

#     svm_model = SVC(kernel='linear', gamma = 'auto', probability=True)
#     svm_model.fit(X_train, y_train)
#     svm_pred = svm_model.predict(X_test)
# #     print("Support Vector Machines:", svm_model.score(X_test, y_test).round(3))
#     svm_acc.append(svm_model.score(X_test, y_test).round(3))

#     # Random Forest Classifier

#     rfc_model = RandomForestClassifier()
#     rfc_model.fit(X_train, y_train)
#     rfc_pred = rfc_model.predict(X_test)
# #     print("Random Forest Classifier:", rfc_model.score(X_test, y_test).round(3))
#     rfc_acc.append(rfc_model.score(X_test, y_test).round(3))
    
#     major_num.append(classnumber)
    
#     # Logistic Regression

#     # predict probabilities
#     log_probs = log_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     log_probs = log_probs[:, 1]
#     # calculate roc curves
#     log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_probs)
#     # calculate the g-mean for each threshold
#     log_gmeans = np.sqrt(log_tpr * (1-log_fpr))
#     # locate the index of the largest g-mean
#     log_ix = np.argmax(log_gmeans)
    
#     log_auc_list.append(auc(log_fpr, log_tpr))


#     # Linear Discriminant Analysis

#     # predict probabilities
#     lda_probs = lda_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     lda_probs = lda_probs[:, 1]
#     # calculate roc curves
#     lda_fpr, lda_tpr, lda_thresholds = roc_curve(y_test, lda_probs)
#     # calculate the g-mean for each threshold
#     lda_gmeans = np.sqrt(lda_tpr * (1-lda_fpr))
#     # locate the index of the largest g-mean
#     lda_ix = np.argmax(lda_gmeans)
    
#     lda_auc_list.append(auc(lda_fpr, lda_tpr))
    
#     # K-Nearest Neighbors

#     # predict probabilities
#     knn_probs = knn_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     knn_probs = knn_probs[:, 1]
#     # calculate roc curves
#     knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_probs)
#     # calculate the g-mean for each threshold
#     knn_gmeans = np.sqrt(knn_tpr * (1-knn_fpr))
#     # locate the index of the largest g-mean
#     knn_ix = np.argmax(knn_gmeans)
    
#     knn_auc_list.append(auc(knn_fpr, knn_tpr))

#     # Classification and Regression Trees

#     # predict probabilities
#     cart_probs = cart_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     cart_probs = cart_probs[:, 1]
#     # calculate roc curves
#     cart_fpr, cart_tpr, cart_thresholds = roc_curve(y_test, cart_probs)
#     # calculate the g-mean for each threshold
#     cart_gmeans = np.sqrt(cart_tpr * (1-cart_fpr))
#     # locate the index of the largest g-mean
#     cart_ix = np.argmax(cart_gmeans)
    
#     cart_auc_list.append(auc(cart_fpr, cart_tpr))

#     # Gaussian Naive Bayes

#     # predict probabilities
#     gnb_probs = gnb_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     gnb_probs = gnb_probs[:, 1]
#     # calculate roc curves
#     gnb_fpr, gnb_tpr, gnb_thresholds = roc_curve(y_test, gnb_probs)
#     # calculate the g-mean for each threshold
#     gnb_gmeans = np.sqrt(gnb_tpr * (1-gnb_fpr))
#     # locate the index of the largest g-mean
#     gnb_ix = np.argmax(gnb_gmeans)
    
#     gnb_auc_list.append(auc(gnb_fpr, gnb_tpr))

#     # Support Vector Machines

#     # predict probabilities
#     svm_probs = svm_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     svm_probs = svm_probs[:, 1]
#     # calculate roc curves
#     svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_probs)
#     # calculate the g-mean for each threshold
#     svm_gmeans = np.sqrt(svm_tpr * (1-svm_fpr))
#     # locate the index of the largest g-mean
#     svm_ix = np.argmax(svm_gmeans)
    
#     svm_auc_list.append(auc(svm_fpr, svm_tpr))

#     # Random Forest Classifier

#     # predict probabilities
#     rfc_probs = rfc_model.predict_proba(X_test)
#     # keep probabilities for the positive outcome only
#     rfc_probs = rfc_probs[:, 1]
#     # calculate roc curves
#     rfc_fpr, rfc_tpr, rfc_thresholds = roc_curve(y_test, rfc_probs)
#     # calculate the g-mean for each threshold
#     rfc_gmeans = np.sqrt(rfc_tpr * (1-rfc_fpr))
#     # locate the index of the largest g-mean
#     rfc_ix = np.argmax(rfc_gmeans)
    
#     rfc_auc_list.append(auc(rfc_fpr, rfc_tpr))

In [None]:
# # Logistic Regression

# plt.plot(major_num, log_acc, label='Logistic Regression')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Linear Discriminant Analysis

# plt.plot(major_num, lda_acc, label='Linear Discriminant Analysis')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # K-Nearest Neighbor

# plt.plot(major_num, knn_acc, label='K-Nearest Neighbor')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Classification & Regression Tree

# plt.plot(major_num, cart_acc, label='Classification & Regression Tree')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Gaussian Naive Bayes

# plt.plot(major_num, gnb_acc, label='Gaussian Naive Bayes')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Support Vector Machines

# plt.plot(major_num, svm_acc, label='Support Vector Machines')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Random Forest Classifier

# plt.plot(major_num, rfc_acc, label='Random Forest Classifier')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('Accuracy')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Logistic Regression

# plt.plot(major_num, log_auc_list, label='Logistic Regression')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Linear Discriminant Analysis

# plt.plot(major_num, lda_auc_list, label='Linear Discriminant Analysis')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # K-Nearest Neighbor

# plt.plot(major_num, knn_auc_list, label='K-Nearest Neighbor')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Classification & Regression Tree

# plt.plot(major_num, cart_auc_list, label='Classification & Regression Tree')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Gaussian Naive Bayes

# plt.plot(major_num, gnb_auc_list, label='Gaussian Naive Bayes')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Support Vector Machines

# plt.plot(major_num, svm_auc_list, label='Support Vector Machines')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()

In [None]:
# # Random Forest Classifier

# plt.plot(major_num, rfc_auc_list, label='Random Forest Classifier')
# # axis labels
# plt.xlabel('Majority Class Size')
# plt.ylabel('AUC')
# plt.legend()
# # show the plot
# plt.show()