## Alzheimer

In [2]:
import pandas as pd
import numpy as np
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm
from tensorflow import keras
from sklearn.metrics import auc, roc_auc_score, accuracy_score, confusion_matrix,recall_score
from xgboost import XGBClassifier
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA
sys.path.append(os.path.relpath('../src'))
import utils as ut



## Data reading and preliminary analysis

In [3]:
### Origin: https://www.kaggle.com/datasets/rabieelkharoua/alzheimers-disease-dataset
alz = pd.read_csv('../data/class/raw/alzheimers_disease_data.csv',index_col='PatientID')
alz.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2149 entries, 4751 to 6899
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        2149 non-null   int64  
 1   Gender                     2149 non-null   int64  
 2   Ethnicity                  2149 non-null   int64  
 3   EducationLevel             2149 non-null   int64  
 4   BMI                        2149 non-null   float64
 5   Smoking                    2149 non-null   int64  
 6   AlcoholConsumption         2149 non-null   float64
 7   PhysicalActivity           2149 non-null   float64
 8   DietQuality                2149 non-null   float64
 9   SleepQuality               2149 non-null   float64
 10  FamilyHistoryAlzheimers    2149 non-null   int64  
 11  CardiovascularDisease      2149 non-null   int64  
 12  Diabetes                   2149 non-null   int64  
 13  Depression                 2149 non-null   int64  

## Model #3

This third model aims to take a better recall score by Neural Networks

In [64]:
alz_3 = alz[['MMSE','FunctionalAssessment','MemoryComplaints','BehavioralProblems','ADL','Diagnosis']] #,'mix'
x1_train, x1_test, y1_train, y1_test = ut.train_test(alz_3,test_size=.2)


In [65]:
x1_train

Unnamed: 0_level_0,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5531,20.610337,5.566873,0,0,0.232938
6092,8.973483,9.307896,0,0,0.442326
5571,11.968872,4.001694,0,0,4.918146
5608,27.648573,9.700073,1,0,5.157190
5344,25.410148,8.626984,0,0,9.413984
...,...,...,...,...,...
6068,19.624474,4.408669,0,0,3.027579
5462,21.986095,4.496684,0,0,6.747443
4950,9.529090,1.157719,0,0,9.116986
5070,7.669430,2.391914,0,0,0.905049


Fitting 5 folds for each of 120 candidates, totalling 600 fits




Accuracy 0.9441860465116279
AUC 0.9344708443771297
Recall 0.9013157894736842
[[269   9]
 [ 15 137]]
{'classifier': GradientBoostingClassifier(), 'classifier__max_depth': 2, 'classifier__min_samples_split': 2}


Less recall than in the baseline model. We go back to the same variables, and check whether it's possible a better output

In [76]:
alz_3b = alz[['MMSE','FunctionalAssessment','MemoryComplaints','BehavioralProblems','ADL','Diagnosis']] #,'mix'
x1_train, x1_test,x1_val, y1_train, y1_test,y1_val = ut.train_test(alz_3b,test_size=.2,val_size=.15,val_set=True)

In [66]:
y1_train.value_counts()

Diagnosis
0    1111
1     608
Name: count, dtype: int64

In [10]:
y1_test.value_counts()

Diagnosis
0    278
1    152
Name: count, dtype: int64

In [11]:
y1_val.value_counts()

Diagnosis
0    167
1     91
Name: count, dtype: int64

In [77]:
scaler = StandardScaler()
x1_train = scaler.fit_transform(x1_train)
x1_test = scaler.transform(x1_test)
x1_val = scaler.transform(x1_val)

In [68]:
type(x1_train)

numpy.ndarray

In [9]:
y1_train_np = np.array(y1_train).flatten()
print("Formato de y1_train:", type(y1_train_np), y1_train_np.shape)
print("Valores únicos en y1_train:", np.unique(y1_train))

Formato de y1_train: <class 'numpy.ndarray'> (1461,)
Valores únicos en y1_train: [0 1]


In [78]:
layers = [
    keras.layers.Input(shape=x1_train.shape[1:]),
    keras.layers.Flatten(),
    keras.layers.Dense(units = 300, activation='relu'),
    keras.layers.Dense(units = 100, activation='relu'),
    keras.layers.Dense(units = 1, activation='sigmoid')
]

model = keras.models.Sequential(layers)
model.compile(
    optimizer = "sgd",
    loss = "binary_crossentropy",
    metrics = ['accuracy',"recall"] #
)
#y1_train_np = np.array(y1_train).flatten()
#class_weights = {0: 1, 1: len(y1_train_np) / sum(y1_train_np == 1)}
history = model.fit(
    x1_train,
    y1_train,
    batch_size = 16,
    epochs = 1000,
    validation_data = (x1_val,y1_val),
    callbacks=keras.callbacks.EarlyStopping(patience=10),
    #class_weight = class_weights
)

Epoch 1/1000
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6472 - loss: 0.6626 - recall: 0.5467 - val_accuracy: 0.7597 - val_loss: 0.5819 - val_recall: 0.3516
Epoch 2/1000
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7942 - loss: 0.5528 - recall: 0.4225 - val_accuracy: 0.8217 - val_loss: 0.5102 - val_recall: 0.5385
Epoch 3/1000
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8346 - loss: 0.4838 - recall: 0.5992 - val_accuracy: 0.8372 - val_loss: 0.4588 - val_recall: 0.6374
Epoch 4/1000
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8593 - loss: 0.4300 - recall: 0.6815 - val_accuracy: 0.8333 - val_loss: 0.4238 - val_recall: 0.6484
Epoch 5/1000
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8759 - loss: 0.3737 - recall: 0.7355 - val_accuracy: 0.8411 - val_loss: 0.4020 - val_recall: 0.7033


In [79]:
pred = model.predict(x1_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [81]:
model.evaluate(x1_test, y1_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9091 - loss: 0.2789 - recall: 0.8960


[0.2563968300819397, 0.9209302067756653, 0.875]

In [82]:
accuracy_score(y1_test, pred.round()) #,normalize='pred'

0.9209302325581395

In [83]:
recall_score(y1_test, pred.round(),average='weighted')

np.float64(0.9209302325581395)

In [84]:
roc_auc_score(y1_test, pred.round(),average='weighted')

np.float64(0.9105215827338129)

In [27]:
alz_3b.head()

Unnamed: 0_level_0,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Diagnosis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4751,21.463532,6.518877,0,0,1.725883,0
4752,20.613267,7.118696,0,0,2.592424,0
4753,7.356249,5.895077,0,0,7.119548,0
4754,13.991127,8.965106,0,1,6.481226,0
4755,13.517609,6.045039,0,0,0.014691,0


In [19]:
x1_train

array([[-1.52581182,  0.87057639, -0.51046004, -0.41765692,  0.31069247],
       [ 0.88998419,  0.10193886, -0.51046004, -0.41765692, -0.00653084],
       [-0.8033478 ,  0.23896236,  1.95901719, -0.41765692,  0.69713778],
       ...,
       [ 1.53698846,  0.63393449, -0.51046004, -0.41765692, -1.55792193],
       [-1.71370962, -0.63337908, -0.51046004, -0.41765692, -0.1383277 ],
       [ 0.43542546, -0.14655547,  1.95901719, -0.41765692, -1.16920224]])

In [28]:
y1_train

PatientID
5037    0
5173    0
5812    0
6284    0
5363    1
       ..
4803    0
6263    0
5105    0
6860    0
4996    1
Name: Diagnosis, Length: 1461, dtype: int64

In [29]:
x1_test

array([[-0.09604496, -1.30809413, -0.51046004, -0.41765692, -0.98749752],
       [-1.17173722,  0.89545199, -0.51046004, -0.41765692, -0.59633045],
       [ 0.25222402, -1.40784416,  1.95901719, -0.41765692,  0.06173593],
       ...,
       [-0.23594164, -0.64226318, -0.51046004, -0.41765692, -1.59727151],
       [ 0.87391962,  0.22926522,  1.95901719, -0.41765692,  0.21920641],
       [-1.58806241,  0.22816292, -0.51046004, -0.41765692,  0.32175647]])

In [30]:
y1_test

PatientID
5202    1
6831    1
6407    1
5821    1
5581    1
       ..
6548    0
6175    0
4859    1
6567    0
4913    0
Name: Diagnosis, Length: 430, dtype: int64

In [45]:
model.predict(x1_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([[0.9772983 ],
       [0.02321894],
       [0.9844667 ],
       [0.47645706],
       [0.99961513],
       [0.03584076],
       [0.9501687 ],
       [0.03414219],
       [0.03218903],
       [0.996937  ],
       [0.03090922],
       [0.9778564 ],
       [0.01329451],
       [0.7705607 ],
       [0.1200042 ],
       [0.01322803],
       [0.02033255],
       [0.02070368],
       [0.031251  ],
       [0.9847618 ],
       [0.5244626 ],
       [0.6175678 ],
       [0.01683419],
       [0.89893806],
       [0.612797  ],
       [0.49752712],
       [0.8008729 ],
       [0.94124526],
       [0.07396176],
       [0.02051582],
       [0.06409417],
       [0.9983246 ],
       [0.04648422],
       [0.04824806],
       [0.7195949 ],
       [0.04411293],
       [0.00988217],
       [0.03456977],
       [0.9815368 ],
       [0.9722634 ],
       [0.92528975],
       [0.0672225 ],
       [0.501039  ],
       [0.09417542],
       [0.99923694],
       [0.6001875 ],
       [0.9987117 ],
       [0.041

In [46]:
pred = model.predict(x1_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [43]:
pred = round(pred)
pred

TypeError: type numpy.ndarray doesn't define __round__ method

In [50]:
confusion_matrix(y1_test, pred.round())

array([[258,  20],
       [ 19, 133]])

In [73]:
model.save("../models/class/model_3.keras")

In [72]:
result = model.predict(scaler.transform([[1,1,1,1,10]])).round()
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[[1.]]




In [54]:
type(x1_train)

numpy.ndarray