##  EARLY ALZHEIMER'S DETECTION USING MACHINE LEARNING MODELS
####  Dataset: Rabie El Kharoua - Alzheimer's Disease Dataset (Kaggle)
#### Author: Smriti Reddy Uravakonda
#### Course: CS6140 - Machine Learning (Fall 2025)


In [1]:

# ------------------ 1. IMPORT LIBRARIES -------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [2]:

# ------------------ 2. LOAD & INSPECT DATA ----------------------------
# Read the dataset CSV; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("alzheimers_disease_data.csv")

# A notebook cell only renders its LAST bare expression, so intermediate
# summaries must be printed explicitly or they are silently discarded.
print(df.head())

# Basic info (df.info() writes its report to stdout by itself)
df.info()
print(df.describe())

# Check missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [None]:

# ------------------ 3. DATA PRE-PROCESSING ----------------------------
# Column roles for the CSV loaded in section 2. That file has no 'Group' or
# 'SES' column (see the df.info()/df.isnull() listing), so the target and
# the non-feature columns are named explicitly here.
# NOTE(review): the recorded output is cut off before the last two of the 35
# columns; 'Diagnosis' and 'DoctorInCharge' follow the dataset's Kaggle data
# card -- confirm against df.columns.
TARGET_COL = 'Diagnosis'
NON_FEATURE_COLS = ['PatientID', 'DoctorInCharge']

if TARGET_COL not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found; available columns: {list(df.columns)}"
    )

# Row identifiers / bookkeeping columns must not be used as predictors.
# errors='ignore' keeps the cell safe to re-run after the columns are gone.
df = df.drop(columns=NON_FEATURE_COLS, errors='ignore')

# Encode any remaining non-numeric columns as integer codes. The columns
# visible in the recorded df.info() output are already numeric, so this is
# a safety net rather than a required step.
for col in df.select_dtypes(exclude='number').columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Define features and target
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# Train-test split FIRST, so that every statistic learned below (medians,
# scaling mean/std) comes from the training rows only and nothing about the
# held-out rows leaks into the fitted transforms.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing numeric values with the *training* median
train_medians = X_train_raw.median(numeric_only=True)
X_train_raw = X_train_raw.fillna(train_medians)
X_test_raw = X_test_raw.fillna(train_medians)

# Scale numerical features (fit on train, apply to test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix in the same scaled space, kept because the evaluation cell
# cross-validates on X_scaled / y.
X_scaled = scaler.transform(X.fillna(train_medians))

# Handle imbalance with SMOTE (training split only, so the test set keeps
# its real class distribution)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:

# ------------------ 4. EXPLORATORY DATA ANALYSIS (EDA) ----------------
# Class balance of the original (un-resampled) target.
plt.figure(figsize=(6,4))
sns.countplot(x=y)
plt.title("Class Distribution (Before SMOTE)")
plt.show()

# Feature-feature correlations on the SMOTE-resampled, scaled training set.
# A dedicated, larger figure keeps the many feature labels legible.
plt.figure(figsize=(12, 10))
sns.heatmap(pd.DataFrame(X_train_res, columns=X.columns).corr(), cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Pairplot on selected variables.
# The target name is taken from y instead of being hard-coded, and only
# columns that actually exist in df are plotted: the loaded CSV lists Age,
# MMSE, FunctionalAssessment and ADL, but not CDR / nWBV / eTIV.
target_col = y.name
pairplot_candidates = ['Age', 'MMSE', 'FunctionalAssessment', 'ADL']
pairplot_cols = [col for col in pairplot_candidates if col in df.columns]
sns.pairplot(df[pairplot_cols + [target_col]], hue=target_col)
plt.show()

In [None]:

# ------------------ 5. FEATURE ENGINEERING ----------------------------
# Resampled training matrix as a labelled frame (built once, reused below).
train_frame = pd.DataFrame(X_train_res, columns=X.columns)

# Filtering – remove highly correlated (>0.85) features.
# Only the strict upper triangle of the absolute correlation matrix is kept,
# so each pair is inspected once and the later column of a pair is dropped.
corr_matrix = train_frame.corr().abs()
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)
exceeds_threshold = (upper > 0.85).any(axis=0)
drop_cols = upper.columns[exceeds_threshold].tolist()
X_filtered = train_frame.drop(columns=drop_cols)

# Embedding – PCA (retain 95% variance)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_filtered)
n_components_kept = X_pca.shape[1]
print(f"PCA reduced dimensions: {n_components_kept}")

# Wrapping – RFE with Logistic Regression
estimator = LogisticRegression(max_iter=1000)
rfe = RFE(estimator, n_features_to_select=8)
rfe.fit(X_filtered, y_train_res)
selected_cols = X_filtered.columns[rfe.support_]
print("Selected Features via RFE:", list(selected_cols))

In [None]:

# ------------------ 6. MODEL DEVELOPMENT ------------------------------
# Baseline (untuned) classifiers, all trained on the SMOTE-resampled
# training split and scored on the untouched test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42)  # probability=True enables predict_proba
}

results = {}

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    # roc_auc_score expects a 1-D score vector for a binary target (the
    # probability of the class with the greater label); the full
    # (n_samples, 2) matrix is rejected. The one-vs-rest path is kept for
    # targets with more than two classes.
    if y_proba.shape[1] == 2:
        roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1": f1_score(y_test, y_pred, average='weighted'),
        "ROC-AUC": roc_auc
    }

# One row per model, one column per metric (rendered as the cell output).
pd.DataFrame(results).T

In [None]:

# ------------------ 7. HYPERPARAMETER TUNING --------------------------
# Example: Random Forest
# imbalanced-learn's Pipeline applies samplers such as SMOTE during fit
# only, which is what makes resampling inside cross-validation possible.
from imblearn.pipeline import Pipeline as ImbPipeline

rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# SMOTE must run INSIDE each CV fold. Searching over data that was already
# oversampled puts synthetic points, interpolated from training-fold rows,
# into the validation folds and inflates the CV score.
rf_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])
pipeline_grid = {f"rf__{param}": values for param, values in rf_params.items()}

# n_jobs=-1 runs the 72 candidates x 5 folds in parallel.
grid_rf = GridSearchCV(rf_pipeline, pipeline_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)

# Strip the pipeline step prefix so the printed names match rf_params.
best_rf_params = {key.split('__', 1)[1]: value for key, value in grid_rf.best_params_.items()}
print("Best RF Parameters:", best_rf_params)

# GridSearchCV (refit=True by default) has already refitted the best
# pipeline on the whole training split, so no extra .fit() call is needed;
# the fitted forest is pulled out of the pipeline for the later cells.
best_rf = grid_rf.best_estimator_.named_steps['rf']
y_pred_tuned = best_rf.predict(X_test)

In [None]:
# ------------------ 8. EVALUATION METRICS -----------------------------
# Held-out test performance of the tuned Random Forest.
report_text = classification_report(y_test, y_pred_tuned)
print(report_text)

conf_mat = confusion_matrix(y_test, y_pred_tuned)
print("Confusion Matrix:\n", conf_mat)

# 5-fold cross-validated accuracy of the tuned configuration, computed on
# the full scaled feature matrix and original labels.
scores = cross_val_score(best_rf, X_scaled, y, cv=5, scoring='accuracy')
mean_cv_accuracy = np.mean(scores)
print("Cross-validation Accuracy (RF):", mean_cv_accuracy)

In [None]:

# ------------------ 9. ANALYSIS & VISUALIZATION -----------------------
from sklearn.metrics import RocCurveDisplay

# best_rf was trained on the full scaled matrix, i.e. one importance per
# column of X. Indexing by X_filtered.columns would raise a length-mismatch
# error whenever the correlation filter removed a column.
feat_imp = pd.Series(best_rf.feature_importances_, index=X.columns).sort_values(ascending=False)
feat_imp.head(10).plot(kind='barh')
plt.title("Top 10 Feature Importances - Random Forest")
plt.show()

# ROC curve comparison (optional)
# All curves are drawn on ONE shared axes; without ax=..., every
# from_estimator call opens its own figure and the title would only land
# on the last one.
fig, ax = plt.subplots()
for name, model in models.items():
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax)
ax.set_title("ROC Curves - Untuned Models")
plt.show()


#### ------------------ 10. CONCLUSION -----------------------------------

Summary:
- Logistic Regression served as baseline.
- Random Forest with tuning achieved highest accuracy.
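- PCA and RFE were applied for dimensionality reduction; the models in Sections 6–7 are trained on the full scaled feature set, so any generalization gain from the reduced feature sets still has to be evaluated.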
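- Key predictive features: MMSE, CDR, nWBV, Age (note: CDR and nWBV do not appear among the columns listed for the dataset loaded in Section 2; verify this list against the feature-importance plot in Section 9).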

Next Steps:
- Experiment with Gradient Boosting or XGBoost.
- Expand dataset with external ADNI sources for robustness.


