Import necessary libraries for ML and Pre-Processing

In [1]:
!pip install feature-engine #Library for random sample imputer
from feature_engine.imputation import RandomSampleImputer

#Importing the necessary libraries for EDA and model building
import numpy as np 
import pandas as pd 
from sklearn import preprocessing
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
import time
warnings.simplefilter(action='ignore')

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#importing ML models from sklearn library
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

#Importing metrics functions from SK Learn
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from pprint import pprint

# # Used for Downloading MNIST
# from sklearn.datasets import fetch_mldata

# Used for Splitting Training and Test Sets
from sklearn.model_selection import train_test_split

%matplotlib inline



Importing the dataset

In [2]:
dataset = pd.read_csv('https://gist.githubusercontent.com/PUUDI/861771ffca8462507b487b6f75f2386d/raw/44e4760f1f6ee628c9674fe1c87e63bd4fbcf19d/gistfile1.txt')

preparing the data for classification

In [3]:
#dropping the converted Group data points
classes = ['Nondemented', 'Demented']
dataset.drop(dataset.loc[dataset['Group'] == 'Converted'].index, inplace=True) # dataset.Group[dataset.Group == 'Converted'] = 'Nondemented' 

#Dropping the unwanted columns that won't be needing to include in our model
dataset.drop(['Subject ID', 'MRI ID', 'Hand','CDR','MR Delay','Visit'], axis=1, inplace=True) # 'MR Delay''Visit','Age','EDUC','eTIV'

#Encoding binary variables
dataset['M/F'] = dataset['M/F'].apply(lambda x: ['M', 'F'].index(x))

#Encoding the class variable
dataset['Class'] = [classes.index(group) for group in dataset['Group']]

Imputing missing values using Random Sample technique

In [4]:
from feature_engine.imputation import RandomSampleImputer

imputer = RandomSampleImputer(
        random_state=['SES','MMSE'],
        seed='observation',
        seeding_method='add'
    )

# fit the imputer
imputer.fit(dataset)

dataset = imputer.transform(dataset)

In [5]:
dataset

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Class
0,Nondemented,0,87,14,2.0,27.0,1987,0.696,0.883,0
1,Nondemented,0,88,14,2.0,30.0,2004,0.681,0.876,0
2,Demented,0,75,12,1.0,23.0,1678,0.736,1.046,1
3,Demented,0,76,12,2.0,28.0,1738,0.713,1.010,1
4,Demented,0,80,12,2.0,22.0,1698,0.701,1.034,1
...,...,...,...,...,...,...,...,...,...,...
368,Demented,0,82,16,1.0,28.0,1693,0.694,1.037,1
369,Demented,0,86,16,1.0,26.0,1688,0.675,1.040,1
370,Nondemented,1,61,13,2.0,30.0,1319,0.801,1.331,0
371,Nondemented,1,63,13,2.0,30.0,1327,0.796,1.323,0


In [6]:
y = dataset.Class
dataset.drop(['Group','Class'], axis=1, inplace=True) # 'MR Delay''Visit',
X = dataset

train_feature, test_feature, train_label, test_label = train_test_split(X, y, test_size=0.2, random_state=42)

### Fitting model to the dataset with default parameters

In [7]:
nbc = GaussianNB()
#Fitting the model
nbc.fit(train_feature, train_label)

nbc_predict = nbc.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, nbc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, nbc_predict))

=== Confusion Matrix ===
[[36  2]
 [ 5 25]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.95      0.91        38
           1       0.93      0.83      0.88        30

    accuracy                           0.90        68
   macro avg       0.90      0.89      0.89        68
weighted avg       0.90      0.90      0.90        68



In [8]:
knn = KNeighborsClassifier()
knn.fit(train_feature, train_label)

knn_predict = knn.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, knn_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, knn_predict))

=== Confusion Matrix ===
[[26 12]
 [14 16]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.65      0.68      0.67        38
           1       0.57      0.53      0.55        30

    accuracy                           0.62        68
   macro avg       0.61      0.61      0.61        68
weighted avg       0.62      0.62      0.62        68



In [9]:
lda = LinearDiscriminantAnalysis()
lda.fit(train_feature, train_label)

lda_predict = lda.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, lda_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, lda_predict))

=== Confusion Matrix ===
[[38  0]
 [ 9 21]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.81      1.00      0.89        38
           1       1.00      0.70      0.82        30

    accuracy                           0.87        68
   macro avg       0.90      0.85      0.86        68
weighted avg       0.89      0.87      0.86        68



In [10]:
log = LogisticRegression(solver='liblinear')
log.fit(train_feature, train_label)

log_predict = log.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, log_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, log_predict))

=== Confusion Matrix ===
[[37  1]
 [ 5 25]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.97      0.93        38
           1       0.96      0.83      0.89        30

    accuracy                           0.91        68
   macro avg       0.92      0.90      0.91        68
weighted avg       0.92      0.91      0.91        68



In [11]:
svc = SVC(probability=True)
svc.fit(train_feature, train_label)

svc_predict = svc.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, svc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, svc_predict))

=== Confusion Matrix ===
[[38  0]
 [30  0]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.56      1.00      0.72        38
           1       0.00      0.00      0.00        30

    accuracy                           0.56        68
   macro avg       0.28      0.50      0.36        68
weighted avg       0.31      0.56      0.40        68



In [12]:
rf = RandomForestClassifier()
rf.fit(train_feature, train_label)


rf_predict = rf.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, rf_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, rf_predict))

=== Confusion Matrix ===
[[36  2]
 [ 2 28]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        38
           1       0.93      0.93      0.93        30

    accuracy                           0.94        68
   macro avg       0.94      0.94      0.94        68
weighted avg       0.94      0.94      0.94        68



In [13]:
gb = GradientBoostingClassifier(criterion = "friedman_mse")
gb.fit(train_feature, train_label)

gb_predict = gb.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, gb_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, gb_predict))

=== Confusion Matrix ===
[[35  3]
 [ 2 28]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.95      0.92      0.93        38
           1       0.90      0.93      0.92        30

    accuracy                           0.93        68
   macro avg       0.92      0.93      0.93        68
weighted avg       0.93      0.93      0.93        68



Fitting Models with default parameters

### Gaussian Naive Bayes  
> Default parameters
---
F1 Score: **0.86**
### K Nearest Neighbour Classifier 
> Default parameters
---
F1 Score: **0.64**
### Linear Discriminant Analysis  
> Default parameters
---
F1 Score: **0.85**
### Logistic Regression  
> Default parameters
---
F1 Score: **0.85**

### Support Vector Machine 
> Default parameters
---
F1 Score: **0.58**

### Random Forest Classifier 
> Default parameters
---
F1 Score: **0.89**
### Gradient Bossting Algorithm 
> Default parameters
---
F1 Score: **0.90**



### Applying cross validation technique to reduce overfitting and get generalized model scores





In [14]:
models = [RandomForestClassifier(), LogisticRegression(solver='liblinear'),LinearDiscriminantAnalysis(), GradientBoostingClassifier()]
names = ["Random Forest", "Logistic Regression","Linear Discriminant Analysis","Gradient Boosting"]
for model, name in zip(models, names):
    print(name)
    start = time.time()
    for score in ["accuracy", "precision", "recall"]:
        print(score," : ",cross_val_score(model, X, y ,scoring=score, cv=5).mean())
       
    print('Time elapsed: ',time.time() - start)
    print('\n')

Random Forest
accuracy  :  0.8332309043020192
precision  :  0.8454205069124423
recall  :  0.7664367816091955
Time elapsed:  2.301682233810425


Logistic Regression
accuracy  :  0.8035557506584723
precision  :  0.8376535541752933
recall  :  0.6836781609195401
Time elapsed:  0.09908485412597656


Linear Discriminant Analysis
accuracy  :  0.7974100087796312
precision  :  0.8669657234874626
recall  :  0.6294252873563219
Time elapsed:  0.0897073745727539


Gradient Boosting
accuracy  :  0.8035996488147499
precision  :  0.7735261454240201
recall  :  0.7873563218390804
Time elapsed:  1.251885175704956




### Balancing the data to reduce model bias towards one classification




In [15]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot
from numpy import where

counter = Counter(y)
print(counter)
# transform the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)

train_feature, test_feature, train_label, test_label = train_test_split(X, y, test_size=0.2, random_state=42)

Counter({0: 190, 1: 146})
Counter({0: 190, 1: 190})


In [16]:
models = [RandomForestClassifier(), LogisticRegression(solver='liblinear'),LinearDiscriminantAnalysis(), GradientBoostingClassifier()]
names = ["Random Forest", "Logistic Regression","Linear Discriminant Analysis","Gradient Boosting"]
for model, name in zip(models, names):
    print(name)
    start = time.time()
    for score in ["accuracy", "precision", "recall"]:
        print(score," : ",cross_val_score(model, X, y ,scoring=score, cv=5).mean())
       
    print('Time elapsed: ',time.time() - start)
    print('\n')

Random Forest
accuracy  :  0.8631578947368421
precision  :  0.871100517129929
recall  :  0.8631578947368421
Time elapsed:  2.2818374633789062


Logistic Regression
accuracy  :  0.805263157894737
precision  :  0.844797356011749
recall  :  0.7526315789473685
Time elapsed:  0.11090278625488281


Linear Discriminant Analysis
accuracy  :  0.8236842105263158
precision  :  0.8816535534207948
recall  :  0.7473684210526316
Time elapsed:  0.09803080558776855


Gradient Boosting
accuracy  :  0.8421052631578947
precision  :  0.8313056332144348
recall  :  0.8315789473684211
Time elapsed:  1.3309507369995117




### Applying Randomized Gride Search CV to find best parameters for the models

Grid Search CV for Random Forest

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Number of estimators in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
start = time.time()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 10, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(train_feature, train_label);
print('Time elapsed: ',time.time() - start)

print('\n')
pprint(rf_random.best_params_)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Time elapsed:  31.588361024856567


{'bootstrap': False,
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 2000}


Grid Search CV for Gradient Boosting

In [18]:
# Number of features to consider at every split
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
# Maximum number of levels in tree
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
max_depths = np.linspace(1, 32, 32, endpoint=True)
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
max_features = list(range(1,train_feature.shape[1]))
# Minimum number of samples required at each leaf node

# Create the random grid
random_grid_grad = {'learning_rate': learning_rates,
               'n_estimators': n_estimators,
               'max_depth':max_depths,
               'min_samples_split':min_samples_splits,
               'min_samples_leaf':min_samples_leafs,
               'max_features':max_features}

pprint(random_grid_grad)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
gb = GradientBoostingClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
gb_random = RandomizedSearchCV(estimator=gb, param_distributions=random_grid_grad,
                              n_iter = 50, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
gb_random.fit(train_feature, train_label);

gb_random.best_params_

{'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
 'max_depth': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32.]),
 'max_features': [1, 2, 3, 4, 5, 6, 7],
 'min_samples_leaf': array([0.1, 0.2, 0.3, 0.4, 0.5]),
 'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits


{'learning_rate': 0.25,
 'max_depth': 18.0,
 'max_features': 6,
 'min_samples_leaf': 0.2,
 'min_samples_split': 0.2,
 'n_estimators': 100}

Fitting model using the parameters obtained from Grid Search CV

In [19]:
models = [RandomForestClassifier(bootstrap=False,max_depth=None,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=750,random_state=40),  
          GradientBoostingClassifier(learning_rate=0.05,max_depth=24,max_features=6,min_samples_leaf=0.1,min_samples_split=2,n_estimators=100)]

names = ["Random Forest (parameters - Grid search CV)","Gradient Boost Classifier (parameters - Grid search CV)"]
for model, name in zip(models, names):
    print(name)
    start = time.time()
    for score in ["accuracy", "precision", "recall"]:
        print(score," : ",cross_val_score(model, X, y ,scoring=score, cv=5).mean())
       
    print('Time elapsed: ',time.time() - start)
    print('\n')

Random Forest (parameters - Grid search CV)
accuracy  :  0.868421052631579
precision  :  0.8654015042824226
recall  :  0.8736842105263157
Time elapsed:  13.794224262237549


Gradient Boost Classifier (parameters - Grid search CV)
accuracy  :  0.8631578947368421
precision  :  0.8911269791455092
recall  :  0.8263157894736842
Time elapsed:  1.2279579639434814




Fitting the Final Choosen model and getting the Confusion Matrix

In [20]:
rf_final = RandomForestClassifier(bootstrap=False,max_depth=None,max_features='sqrt',min_samples_leaf=1,min_samples_split=2,n_estimators=750,random_state=40)
rf_final.fit(train_feature, train_label)

rf_final_predict = rf_final.predict(test_feature)

print("=== Confusion Matrix ===")
print(confusion_matrix(test_label, rf_final_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(test_label, rf_final_predict))

=== Confusion Matrix ===
[[37  2]
 [ 3 34]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        39
           1       0.94      0.92      0.93        37

    accuracy                           0.93        76
   macro avg       0.93      0.93      0.93        76
weighted avg       0.93      0.93      0.93        76

