## Dataset and Analysis

Import all the necessary libraries and modules for machine learning and deep learning

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.model_selection import cross_val_score

import tensorflow as tf
from keras.models import Model, Sequential
from keras.layers import Input,Dense

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc

%matplotlib inline

sns.set()

Read the .csv file for our data

In [21]:
df = pd.read_csv('oasis_longitudinal.csv')

Filter all the patients according to their visits and show patients whose visit input == 1

In [22]:
df = df.loc[df['Visit']==1]

Reset the index values so that they are in sequence

In [23]:
df = df.reset_index(drop=True)

Drop unnecessary columns and replace the data which cannot be read

In [24]:
df['M/F'] = df['M/F'].replace(['F','M'], [0,1]) # M/F column
df['Group'] = df['Group'].replace(['Converted'], ['Demented']) # Target variable
df['Group'] = df['Group'].replace(['Demented', 'Nondemented'], [1,0]) # Target variable
df = df.drop(['MRI ID', 'Visit', 'Hand'], axis=1) # Drop unnecessary columns

## Data Preprocessing

Identify fields which have unreadable datatypes

In [25]:
pd.isnull(df).sum() 
#df["SES"].fillna(df.groupby("EDUC")["SES"].transform("median"), inplace=True)

Subject ID    0
Group         0
MR Delay      0
M/F           0
Age           0
EDUC          0
SES           8
MMSE          0
CDR           0
eTIV          0
nWBV          0
ASF           0
dtype: int64

Drop fields which have unreadable datatypes

In [26]:
df_dropna = df.dropna(axis=0, how='any')
pd.isnull(df_dropna).sum()
#pd.isnull(df).sum()

Subject ID    0
Group         0
MR Delay      0
M/F           0
Age           0
EDUC          0
SES           0
MMSE          0
CDR           0
eTIV          0
nWBV          0
ASF           0
dtype: int64

Differentiate patients as to demented and nondemented

In [27]:
df_dropna['Group'].value_counts()

0    72
1    70
Name: Group, dtype: int64

Reset indexes in sequential order

In [28]:
df = df_dropna.reset_index(drop=True)
df

Unnamed: 0,Subject ID,Group,MR Delay,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,0,0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0004,0,0,0,88,18,3.0,28.0,0.0,1215,0.710,1.444
2,OAS2_0005,0,0,1,80,12,4.0,28.0,0.0,1689,0.712,1.039
3,OAS2_0008,0,0,0,93,14,2.0,30.0,0.0,1272,0.698,1.380
4,OAS2_0009,1,0,1,68,12,2.0,27.0,0.5,1457,0.806,1.205
...,...,...,...,...,...,...,...,...,...,...,...,...
137,OAS2_0179,1,0,1,79,20,1.0,26.0,0.5,1548,0.711,1.134
138,OAS2_0183,0,0,0,66,13,2.0,30.0,0.0,1495,0.746,1.174
139,OAS2_0184,1,0,0,72,16,3.0,24.0,0.5,1354,0.733,1.296
140,OAS2_0185,1,0,1,80,16,1.0,28.0,0.5,1704,0.711,1.030


Set x and y inputs for Training and Validation Data

In [29]:
Y = df['Group'].values # Target for the model
X = df[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']] # Features we use

# splitting into three sets
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, random_state=10,stratify=df['Group'].values)

# Feature scaling
scaler = StandardScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

Print the Data

In [30]:
print('Number of demented samples in training data:',(np.asarray(Y_trainval)==1).sum())
print('Number of non-demented samples in training data:',(np.asarray(Y_trainval)==0).sum())
print('Traning data features:',X_trainval.shape)
print('Training data labels:',Y_trainval.shape)

print('Testing data features:',X_test_scaled.shape)
print('Testing data labels:',Y_test.shape)

Number of demented samples in training data: 52
Number of non-demented samples in training data: 54
Traning data features: (106, 8)
Training data labels: (106,)
Testing data features: (36, 8)
Testing data labels: (36,)


The above section of the code included fetching the file and filtering and modifying the data according to our use.

## Simple Feed-Forward Neural Network

Initialising the model

In [31]:
# model = Sequential()
# model.add(Dense(12,activation='relu',input_dim=X_train.shape[0]))
# model.add(Dense(8,activation='relu'))
# model.add(Dense(1,activation='sigmoid'))
input_shape = (X_trainval_scaled.shape[-1])
i = Input(shape = input_shape)
x = Dense(12,activation='relu',kernel_initializer='he_normal')(i)
x = Dense(8,activation='relu',kernel_initializer='he_normal')(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(i,x)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense_3 (Dense)              (None, 12)                108       
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 104       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


Compiling the model

In [32]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')
model.fit(X_trainval_scaled,Y_trainval,epochs=75,batch_size=4,validation_data=(X_test_scaled,Y_test))

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x7f91182f1fd0>

Printing the Output

In [33]:
loss,acc = model.evaluate(X_test_scaled,Y_test)
PredictedOutput = (model.predict(X_test_scaled) > 0.5).astype("int32")
print(PredictedOutput)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Test accuracy is", acc)
confusion_matrix(Y_test, PredictedOutput)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]]
Test accuracy is 0.8055555820465088


array([[12,  6],
       [ 1, 17]])

## SVM 

Building a SVM model

In [34]:
best_score = 0
kfolds = 5
for c_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: #iterate over the values we need to try for the parameter C
    for gamma_paramter in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: #iterate over the values we need to try for the parameter gamma
        for k_parameter in ['rbf', 'linear', 'poly', 'sigmoid']: # iterate over the values we need to try for the kernel parameter
            svmModel = SVC(kernel=k_parameter, C=c_paramter, gamma=gamma_paramter) #define the model
            # perform cross-validation
            scores = cross_val_score(svmModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')
            # the training set will be split internally into training and cross validation

            # compute mean cross-validation accuracy
            score = np.mean(scores)
            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score #store the score 
                best_parameter_c = c_paramter #store the parameter c
                best_parameter_gamma = gamma_paramter #store the parameter gamma
                best_parameter_k = k_parameter
            

# rebuild a model with best parameters to get score 
SelectedSVMmodel = SVC(C=best_parameter_c, gamma=best_parameter_gamma, kernel=best_parameter_k).fit(X_trainval_scaled, Y_trainval)

Printing output

In [35]:
test_score = SelectedSVMmodel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedSVMmodel.predict(X_test_scaled)
print(PredictedOutput)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on cross validation set is:", best_score)
print("Best parameter for c is: ", best_parameter_c)
print("Best parameter for gamma is: ", best_parameter_gamma)
print("Best parameter for kernel is: ", best_parameter_k)
print("Test accuracy with the best parameters is", test_score)
confusion_matrix(Y_test, PredictedOutput)

[0 0 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0]
Best accuracy on cross validation set is: 0.7264069264069264
Best parameter for c is:  10
Best parameter for gamma is:  0.1
Best parameter for kernel is:  sigmoid
Test accuracy with the best parameters is 0.6666666666666666


array([[13,  5],
       [ 7, 11]])

## Decision Tree Classification

Building a Decision Tree Classification Model

In [36]:
best_score = 0

for md in range(1, 9): # iterate different maximum depth values
    # train the model
    treeModel = DecisionTreeClassifier(random_state=0, max_depth=md, criterion='gini')
    # perform cross-validation
    scores = cross_val_score(treeModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # if we got a better score, store the score and parameters
    if score > best_score:
        best_score = score
        best_parameter = md

# Rebuild a model on the combined training and validation set        
SelectedDTModel = DecisionTreeClassifier(max_depth=best_parameter).fit(X_trainval_scaled, Y_trainval )


Printing Output

In [37]:
test_score = SelectedDTModel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedDTModel.predict(X_test_scaled)
print(PredictedOutput)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for the maximum depth is: ", best_parameter)
print("Test accuracy with best parameter is ", test_score)
confusion_matrix(Y_test, PredictedOutput)

[0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0]
Best accuracy on validation set is: 0.7731601731601732
Best parameter for the maximum depth is:  1
Test accuracy with best parameter is  0.8333333333333334


array([[17,  1],
       [ 5, 13]])

## Random Forest Classification

Building a random forest classification model

In [None]:
best_score = 0

for M in range(2, 15, 2): # combines M trees
    for d in range(1, 9): # maximum number of features considered at each split
        for m in range(1, 9): # maximum depth of the tree
            # train the model
            # n_jobs(4) is the number of parallel computing
            forestModel = RandomForestClassifier(n_estimators=M, max_features=d, n_jobs=4,
                                          max_depth=m, random_state=0)
        
            # perform cross-validation
            scores = cross_val_score(forestModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')

            # compute mean cross-validation accuracy
            score = np.mean(scores)

            # if we got a better score, store the score and parameters
            if score > best_score:
                best_score = score
                best_M = M
                best_d = d
                best_m = m

# Rebuild a model on the combined training and validation set        
SelectedRFModel = RandomForestClassifier(n_estimators=M, max_features=d,
                                          max_depth=m, random_state=0).fit(X_trainval_scaled, Y_trainval )

Printing Output

In [None]:
PredictedOutput = SelectedRFModel.predict(X_test_scaled)
print(PredictedOutput)
test_score = SelectedRFModel.score(X_test_scaled, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameters of M, d, m are: ", best_M, best_d, best_m)
print("Test accuracy with the best parameters is", test_score)
confusion_matrix(Y_test, PredictedOutput)


## AdaBoost Classification

Building AdaBoost classification Model

In [None]:
best_score = 0

for M in range(2, 15, 2): # combines M trees
    for lr in [0.0001, 0.001, 0.01, 0.1, 1]:
        # train the model
        boostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0)

        # perform cross-validation
        scores = cross_val_score(boostModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')

        # compute mean cross-validation accuracy
        score = np.mean(scores)

        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_M = M
            best_lr = lr

# Rebuild a model on the combined training and validation set        
SelectedBoostModel = AdaBoostClassifier(n_estimators=M, learning_rate=lr, random_state=0).fit(X_trainval_scaled, Y_trainval )


Printing Output

In [None]:
PredictedOutput = SelectedBoostModel.predict(X_test_scaled)
print(PredictedOutput)
test_score = SelectedRFModel.score(X_test_scaled, Y_test)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter of M is: ", best_M)
print("best parameter of LR is: ", best_lr)
print("Test accuracy with the best parameter is", test_score)
confusion_matrix(Y_test, PredictedOutput)

## Logistic Regression

Building a Logisting Regression Model

In [None]:
best_score=0
kfolds=5 # set the number of folds

for c in [0.001, 0.1, 1, 10, 100]:
    logRegModel = LogisticRegression(C=c)
    # perform cross-validation
    scores = cross_val_score(logRegModel, X_trainval_scaled, Y_trainval, cv=kfolds, scoring='accuracy')
    
    # compute mean cross-validation accuracy
    score = np.mean(scores)
    
    # Find the best parameters and score
    if score > best_score:
        best_score = score
        best_parameters = c

# rebuild a model on the combined training and validation set
SelectedLogRegModel = LogisticRegression(C=best_parameters).fit(X_trainval_scaled, Y_trainval)


Printing output

In [None]:
test_score = SelectedLogRegModel.score(X_test_scaled, Y_test)
PredictedOutput = SelectedLogRegModel.predict(X_test_scaled)
print(PredictedOutput)
test_recall = recall_score(Y_test, PredictedOutput, pos_label=1)
fpr, tpr, thresholds = roc_curve(Y_test, PredictedOutput, pos_label=1)
test_auc = auc(fpr, tpr)
print("Best accuracy on validation set is:", best_score)
print("Best parameter for regularization (C) is: ", best_parameters)
print("Test accuracy with best C parameter is", test_score)        
confusion_matrix(Y_test, PredictedOutput)
