# Data Preparation for Alzheimer's Disease Prediction

This notebook walks through the essential data-preparation steps for predicting Alzheimer's disease.

## 1. Import Required Libraries

In [5]:
# Import libraries
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

warnings.filterwarnings("ignore")

In [15]:
# Read the raw CSV into a DataFrame and preview its first rows
dataset_path = 'data/alzheimers_disease_data.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [7]:
# Data exploration: structure, summary statistics, duplicates and one value count.

# Info: column names, non-null counts and dtypes (info() prints directly)
df.info()

# Describe: a bare expression in the middle of a cell is evaluated but never
# rendered, so the transposed summary is passed to display() explicitly.
# display() is provided by the IPython/Jupyter runtime without an import.
display(df.describe().T)

# Check for duplicates: vectorised count of fully duplicated rows
print('Duplicates:', df.duplicated().sum())

# Value counts for a key column (example: DoctorInCharge)
# Bracket access avoids any clash between column names and DataFrame attributes.
print(df['DoctorInCharge'].value_counts())

<class 'pandas.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64  
 14  Dep

In [8]:
# Summary statistics of the numeric columns, transposed so each column is a row
summary_stats = df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientID,2149.0,5825.0,620.507185,4751.0,5288.0,5825.0,6362.0,6899.0
Age,2149.0,74.908795,8.990221,60.0,67.0,75.0,83.0,90.0
Gender,2149.0,0.506282,0.500077,0.0,0.0,1.0,1.0,1.0
Ethnicity,2149.0,0.697534,0.996128,0.0,0.0,0.0,1.0,3.0
EducationLevel,2149.0,1.286645,0.904527,0.0,1.0,1.0,2.0,3.0
BMI,2149.0,27.655697,7.217438,15.008851,21.611408,27.823924,33.869778,39.992767
Smoking,2149.0,0.288506,0.453173,0.0,0.0,0.0,1.0,1.0
AlcoholConsumption,2149.0,10.039442,5.75791,0.002003,5.13981,9.934412,15.157931,19.989293
PhysicalActivity,2149.0,4.920202,2.857191,0.003616,2.570626,4.766424,7.427899,9.987429
DietQuality,2149.0,4.993138,2.909055,0.009385,2.458455,5.076087,7.558625,9.998346


In [9]:
# Check for duplicates
# Series.sum() counts the True flags in one vectorised call instead of
# iterating the boolean Series element by element with the builtin sum();
# int() keeps the displayed result a plain Python integer.
int(df.duplicated().sum())

0

In [10]:
# Frequency of each distinct value in a key column (example: DoctorInCharge)
df['DoctorInCharge'].value_counts()

DoctorInCharge
XXXConfid    2149
Name: count, dtype: int64

In [17]:
# Feature selection with Lasso (automatic)
# A cross-validated L1-penalised linear model is fitted on standardised
# features; columns whose coefficient is exactly zero are reported as deleted.
# LassoCV and StandardScaler come from the import cell at the top of the notebook.
# NOTE(review): LassoCV is a regression estimator, while the 'Diagnosis' values
# shown in the previews look like 0/1 labels -- confirm whether an L1-penalised
# classifier would be more appropriate.

# Use 'Diagnosis' as the target column.
# errors='ignore' keeps this cell runnable even when the identifier columns
# have already been removed from df by the later "drop" cell.
X = df.drop(columns=['Diagnosis', 'PatientID', 'DoctorInCharge'], errors='ignore')
y = df['Diagnosis']

# Put every feature on the same scale so the L1 penalty treats them alike
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lasso = LassoCV(cv=5, random_state=0).fit(X_scaled, y)

# One boolean mask drives both lists, so they always partition the columns
is_selected = lasso.coef_ != 0
selected_features = X.columns[is_selected]
deleted_features = X.columns[~is_selected]
print("Selected variables:", list(selected_features))
print("Deleted variables:", list(deleted_features))

# Explicit copy: later edits to selected_df cannot warn about, or depend on, df
selected_df = df[selected_features.tolist() + ['Diagnosis']].copy()
selected_df.head()

Selected variables: ['Age', 'EducationLevel', 'Smoking', 'SleepQuality', 'CardiovascularDisease', 'HeadInjury', 'Hypertension', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Disorientation']
Deleted variables: ['Gender', 'Ethnicity', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'FamilyHistoryAlzheimers', 'Diabetes', 'Depression', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'Confusion', 'PersonalityChanges', 'DifficultyCompletingTasks', 'Forgetfulness']


Unnamed: 0,Age,EducationLevel,Smoking,SleepQuality,CardiovascularDisease,HeadInjury,Hypertension,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Disorientation,Diagnosis
0,73,2,0,9.025679,0,0,0,56.150897,33.682563,162.189143,21.463532,6.518877,0,0,1.725883,0,0
1,89,0,0,7.151293,0,0,0,193.407996,79.028477,294.630909,20.613267,7.118696,0,0,2.592424,0,0
2,73,1,0,9.673574,0,0,0,153.322762,69.772292,83.638324,7.356249,5.895077,0,0,7.119548,1,0
3,74,1,1,8.392554,0,0,0,65.366637,68.457491,277.577358,13.991127,8.965106,0,1,6.481226,0,0
4,89,0,0,5.597238,0,0,0,92.8697,56.874305,291.19878,13.517609,6.045039,0,0,0.014691,0,0


In [12]:
# Drop unnecessary columns
# Rebinding df (instead of inplace=True) together with errors='ignore' makes
# the cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['PatientID', 'DoctorInCharge'], errors='ignore')

In [13]:
# Rescale the continuous columns in two passes: min-max first, then z-score.
# NOTE(review): standardisation subtracts the mean and divides by the standard
# deviation, so a preceding min-max rescale (a positive linear map) does not
# change the standardised values beyond floating-point rounding -- confirm
# whether the first pass is still needed.
columns = [
    'Age',
    'BMI',
    'AlcoholConsumption',
    'PhysicalActivity',
    'DietQuality',
    'SleepQuality',
    'SystolicBP',
    'DiastolicBP',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolHDL',
    'CholesterolTriglycerides',
    'MMSE',
    'FunctionalAssessment',
    'ADL',
]

# Pass 1: map each listed column through MinMaxScaler and write it back
min_max_scaler = MinMaxScaler()
normalized_values = min_max_scaler.fit_transform(df[columns])
df[columns] = normalized_values

# Pass 2: standardise the already-normalised columns and write them back
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df[columns])
df[columns] = standardized_values

In [14]:
# Preview the first five rows of the prepared DataFrame
df.head(n=5)

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,-0.212368,0,0,2,-0.655225,0,0.565923,0.492525,-1.253593,1.119918,...,0.497506,0,0,-1.104434,0,0,0,1,0,0
1,1.567757,0,0,0,-0.114751,0,-0.954895,0.945093,-1.538442,0.056836,...,0.704907,0,0,-0.810601,0,0,0,0,1,0
2,-0.212368,0,3,1,-1.366428,0,1.653006,1.023896,-1.088855,1.48738,...,0.281813,0,0,0.724491,0,1,0,1,0,0
3,-0.101111,1,0,1,0.851625,1,0.37693,1.227995,0.839804,0.760833,...,1.343346,0,1,0.508044,0,0,0,0,0,0
4,1.567757,0,0,0,-0.961607,0,1.461793,0.486696,-1.443293,-0.824566,...,0.333665,0,0,-1.684679,0,0,1,1,0,0
