# Alzheimer's Prediction System

This notebook includes the eda and modelling part using pycaret

### Importing libraries

In [3]:
!pip install pycaret

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
from pycaret.classification import *

from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'pycaret'

### Reading the dataset

In [85]:
data = pd.read_csv("data/ADNI_Training_Q3_APOE_CollectionADNI1Complete 1Yr 1.5T_July22.2014.csv")

In [86]:
data.columns

Index(['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Modality', 'Visit',
       'Acq.Date', 'DX.bl', 'EXAMDATE', 'AGE', 'PTGENDER', 'PTEDUCAT',
       'PTETHCAT', 'PTRACCAT', 'APOE4', 'MMSE', 'imputed_genotype',
       'APOE Genotype', 'Dx Codes for Submission'],
      dtype='object')

### Removing extra columns

In [87]:
remove_column = ['directory.id', 'Subject', 'RID', 'Image.Data.ID', 'Visit', 'Acq.Date', 'EXAMDATE', 'DX.bl']

data = data.drop(columns=remove_column)

In [88]:
cat_vars = ['PTGENDER', 'PTETHCAT', 'PTRACCAT', 'APOE4', 'imputed_genotype', 'APOE Genotype']

#### Converting the dataset to one-hot encoding

In [89]:
for var in cat_vars:
    if var in data.columns:
        print(f'Converting {var} to 1-hot encoding')
        one_hot_df = pd.get_dummies(data[var], prefix=var)
        data = pd.concat([data, one_hot_df], axis=1)
        data.drop(var, axis=1, inplace=True)
    else:
        print(f'Column {var} not found in DataFrame data.')

data.head(4)


Converting PTGENDER to 1-hot encoding
Converting PTETHCAT to 1-hot encoding
Converting PTRACCAT to 1-hot encoding
Converting APOE4 to 1-hot encoding
Converting imputed_genotype to 1-hot encoding
Converting APOE Genotype to 1-hot encoding


Unnamed: 0,Modality,AGE,PTEDUCAT,MMSE,Dx Codes for Submission,PTGENDER_Female,PTGENDER_Male,PTETHCAT_Hisp/Latino,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,...,APOE4_1,APOE4_2,imputed_genotype_False,imputed_genotype_True,"APOE Genotype_2,2","APOE Genotype_2,3","APOE Genotype_2,4","APOE Genotype_3,3","APOE Genotype_3,4","APOE Genotype_4,4"
0,MRI,81.3,18,20,AD,0,1,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,MRI,67.5,10,27,MCI,0,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,MRI,73.7,16,29,CN,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,MRI,80.4,13,25,MCI,1,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [90]:
data.columns

Index(['Modality', 'AGE', 'PTEDUCAT', 'MMSE', 'Dx Codes for Submission',
       'PTGENDER_Female', 'PTGENDER_Male', 'PTETHCAT_Hisp/Latino',
       'PTETHCAT_Not Hisp/Latino', 'PTETHCAT_Unknown', 'PTRACCAT_Asian',
       'PTRACCAT_Black', 'PTRACCAT_White', 'APOE4_0', 'APOE4_1', 'APOE4_2',
       'imputed_genotype_False', 'imputed_genotype_True', 'APOE Genotype_2,2',
       'APOE Genotype_2,3', 'APOE Genotype_2,4', 'APOE Genotype_3,3',
       'APOE Genotype_3,4', 'APOE Genotype_4,4'],
      dtype='object')

### Splitting the dataset

In [91]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['Dx Codes for Submission']), data['Dx Codes for Submission'], test_size=0.2, random_state=42)

### Training the Model

In [92]:
clf = setup(data=data, target="Dx Codes for Submission", session_id=123, log_experiment=True, experiment_name="alzheimer")

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Dx Codes for Submission
2,Target type,Multiclass
3,Target mapping,"AD: 0, CN: 1, MCI: 2"
4,Original data shape,"(628, 24)"
5,Transformed data shape,"(628, 24)"
6,Transformed train set shape,"(439, 24)"
7,Transformed test set shape,"(189, 24)"
8,Numeric features,22
9,Categorical features,1


In [93]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.7084,0.8457,0.7084,0.7191,0.7066,0.5312,0.5369,0.009
lr,Logistic Regression,0.7038,0.8514,0.7038,0.7163,0.7017,0.5214,0.5277,0.023
gbc,Gradient Boosting Classifier,0.6812,0.8153,0.6812,0.6947,0.6802,0.4859,0.4903,0.042
ridge,Ridge Classifier,0.6767,0.0,0.6767,0.6966,0.6746,0.4736,0.48,0.009
lightgbm,Light Gradient Boosting Machine,0.6583,0.8216,0.6583,0.6654,0.655,0.4527,0.4569,0.19
rf,Random Forest Classifier,0.647,0.7841,0.647,0.6555,0.6445,0.4314,0.4349,0.033
dt,Decision Tree Classifier,0.6239,0.6908,0.6239,0.6373,0.6218,0.4014,0.4081,0.01
et,Extra Trees Classifier,0.6198,0.7545,0.6198,0.6299,0.6172,0.3896,0.3934,0.032
knn,K Neighbors Classifier,0.6128,0.7566,0.6128,0.6156,0.6072,0.3716,0.376,0.011
ada,Ada Boost Classifier,0.5922,0.6717,0.5922,0.6075,0.5777,0.3415,0.3509,0.017


In [94]:
model = create_model('ridge')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6136,0.0,0.6136,0.6231,0.6144,0.359,0.3614
1,0.6818,0.0,0.6818,0.7361,0.6646,0.443,0.4695
2,0.75,0.0,0.75,0.7521,0.7505,0.5943,0.5948
3,0.6591,0.0,0.6591,0.6763,0.6538,0.4491,0.4554
4,0.75,0.0,0.75,0.7554,0.7507,0.602,0.6035
5,0.5455,0.0,0.5455,0.5817,0.5514,0.2781,0.2825
6,0.5909,0.0,0.5909,0.5926,0.5912,0.3481,0.3485
7,0.6136,0.0,0.6136,0.6723,0.607,0.3507,0.3663
8,0.8182,0.0,0.8182,0.8182,0.8182,0.7129,0.7129
9,0.7442,0.0,0.7442,0.7586,0.7447,0.5985,0.6052


In [95]:
tuned_model = tune_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6364,0.0,0.6364,0.6442,0.6345,0.3921,0.3966
1,0.7045,0.0,0.7045,0.7537,0.6875,0.487,0.5103
2,0.75,0.0,0.75,0.7521,0.7505,0.5943,0.5948
3,0.6818,0.0,0.6818,0.699,0.6811,0.4909,0.4963
4,0.75,0.0,0.75,0.7554,0.7507,0.602,0.6035
5,0.5455,0.0,0.5455,0.5817,0.5514,0.2781,0.2825
6,0.5682,0.0,0.5682,0.5785,0.5711,0.3102,0.311
7,0.5909,0.0,0.5909,0.6515,0.5873,0.3172,0.3288
8,0.8182,0.0,0.8182,0.8205,0.8185,0.7122,0.7128
9,0.7442,0.0,0.7442,0.7586,0.7447,0.5985,0.6052


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [96]:
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [97]:
final_model = finalize_model(model)

In [99]:
predictions = predict_model(final_model, data=X_test)

In [100]:
predictions

Unnamed: 0,Modality,AGE,PTEDUCAT,MMSE,PTGENDER_Female,PTGENDER_Male,PTETHCAT_Hisp/Latino,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,...,APOE4_2,imputed_genotype_False,imputed_genotype_True,"APOE Genotype_2,2","APOE Genotype_2,3","APOE Genotype_2,4","APOE Genotype_3,3","APOE Genotype_3,4","APOE Genotype_4,4",prediction_label
581,MRI,71.800003,14,26,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,MCI
591,MRI,71.699997,12,28,1,0,0,0,1,0,...,1,1,0,0,0,0,0,0,1,MCI
550,MRI,71.300003,14,27,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,CN
213,MRI,83.099998,16,29,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,CN
485,MRI,81.400002,20,29,0,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,MCI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,MRI,72.900002,14,29,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,MCI
319,MRI,77.300003,18,29,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,MCI
218,MRI,77.300003,16,26,1,0,0,1,0,0,...,1,0,1,0,0,0,0,0,1,MCI
344,MRI,81.800003,12,22,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,AD


### Final Notes
Since there was no significant improvent in the model accuracy, we won't be exporting the model