In [30]:
pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [31]:
!pip install pandas numpy scikit-learn lightgbm


Defaulting to user installation because normal site-packages is not writeable


In [32]:
pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [33]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [34]:

# Load the dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded output echoes this read_csv line the way a pandas
# DtypeWarning (mixed-type columns) does -- confirm the warning is gone after
# this change.
df = pd.read_csv("Dementia Prediction Dataset.csv", low_memory=False)

# --- 1. Create the Binary Target Variable ---
# NOTE(review): presumably code 4 in NACCUDSD denotes a dementia diagnosis --
# confirm against the dataset's data dictionary.
DEMENTIA_CODE = 4
# Vectorised comparison: 1 where NACCUDSD equals the dementia code, 0 otherwise
# (missing values compare unequal and therefore become 0).
df['DementiaStatus'] = df['NACCUDSD'].eq(DEMENTIA_CODE).astype(int)


# --- 2. Define the list of non-medical features to use ---
non_medical_features = [
    'VISITYR',
    'BIRTHYR',
    'SEX',
    'EDUC',
    'RACE',
    'MARISTAT',
    'INDEPEND',
    'RESIDENC'
]

# 3. Create the 'Age' Feature and Finalize Feature Set 'X'
# Select only the needed columns, derive Age as visit year minus birth year,
# then drop the two year columns that Age replaces. Every step returns a new
# frame, so `df` is not modified here and the cell can be re-run safely.
X = (
    df[non_medical_features]
    .assign(Age=lambda frame: frame['VISITYR'] - frame['BIRTHYR'])
    .drop(columns=['VISITYR', 'BIRTHYR'])
)


#  4. Define the Final Target Variable 'y'
y = df['DementiaStatus']


#  5. Perform the Train-Test Split
# Stratify on 'y' so both splits keep the same class proportions (the classes
# are imbalanced); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: the split must account for every row exactly once.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"


# 6. Verification
print("Data loading and preparation successful!")
print("\nFinal features being used for the model:")
print(X_train.columns.tolist())

print("\nDistribution of Dementia in the training set:")
print(y_train.value_counts(normalize=True))

print("\nShape of training features:", X_train.shape)
print("Shape of testing features:", X_test.shape)

  df = pd.read_csv("Dementia Prediction Dataset.csv")


Data loading and preparation successful!

Final features being used for the model:
['SEX', 'EDUC', 'RACE', 'MARISTAT', 'INDEPEND', 'RESIDENC', 'Age']

Distribution of Dementia in the training set:
DementiaStatus
0    0.704962
1    0.295038
Name: proportion, dtype: float64

Shape of training features: (156156, 7)
Shape of testing features: (39040, 7)


In [35]:
# Train every candidate model inside the same preprocessing pipeline and
# collect AUC-ROC, F1 and recall on the held-out test set.

# Define the preprocessing step.
# NOTE(review): the original cell referenced `preprocessor` without defining it
# anywhere in this notebook, so it only ran on leftover kernel state and fails
# with NameError after Restart & Run All. The definition below is a
# reconstruction; the previously recorded metrics came from a preprocessor that
# is not visible here and may not reproduce exactly -- confirm the intended
# preprocessing.
numeric_features = ['EDUC', 'Age']
# Every remaining column is treated as a coded category.
# NOTE(review): assumes SEX/RACE/MARISTAT/INDEPEND/RESIDENC hold category codes
# -- verify against the data dictionary.
categorical_features = [col for col in X_train.columns if col not in numeric_features]

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns: fill gaps with the median, then standardise.
        ('num',
         Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                         ('scaler', StandardScaler())]),
         numeric_features),
        # Categorical columns: fill gaps with the most frequent code, then
        # one-hot encode; codes unseen during fit are encoded as all zeros.
        ('cat',
         Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(handle_unknown='ignore'))]),
         categorical_features),
    ]
)

# Define the models
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

# Dictionary to store results (model name -> metric name -> value)
results = {}

# Loop through each model
for name, model in models.items():
    print(f"\n--- Training {name} ---")

    # Create a full pipeline that first preprocesses the data and then runs the model,
    # so the preprocessing is fitted on the training data only.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])

    # Train the pipeline on the raw training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the raw test data
    y_pred = pipeline.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class (label 1).
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Calculate and store metrics
    auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    results[name] = {"AUC-ROC": auc, "F1-Score": f1, "Recall": recall}

    # Print results for the current model
    print(classification_report(y_test, y_pred))
    print(f"AUC-ROC = {auc:.3f}")


--- Training Logistic Regression ---
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     27522
           1       0.81      0.85      0.83     11518

    accuracy                           0.90     39040
   macro avg       0.87      0.88      0.88     39040
weighted avg       0.90      0.90      0.90     39040

AUC-ROC = 0.914

--- Training Random Forest ---
              precision    recall  f1-score   support

           0       0.92      0.93      0.92     27522
           1       0.83      0.80      0.82     11518

    accuracy                           0.89     39040
   macro avg       0.87      0.87      0.87     39040
weighted avg       0.89      0.89      0.89     39040

AUC-ROC = 0.912

--- Training LightGBM ---
[LightGBM] [Info] Number of positive: 46072, number of negative: 110084
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008598 seconds.
You can set `force_row_wise=true` to remo



              precision    recall  f1-score   support

           0       0.93      0.92      0.93     27522
           1       0.82      0.84      0.83     11518

    accuracy                           0.90     39040
   macro avg       0.88      0.88      0.88     39040
weighted avg       0.90      0.90      0.90     39040

AUC-ROC = 0.924


In [36]:
# --- 4. Final Comparison Table ---
# `results` maps model name -> {metric: value}; transposing puts one model per
# row and one metric per column. Rows are then ranked by AUC-ROC, best first.
metrics_by_model = pd.DataFrame(results).transpose()
results_df = metrics_by_model.sort_values("AUC-ROC", ascending=False)

print("\n--- Final Model Comparison ---")
print(results_df)


--- Final Model Comparison ---
                      AUC-ROC  F1-Score    Recall
LightGBM             0.923678  0.830983  0.839208
Logistic Regression  0.914094  0.829891  0.849453
Random Forest        0.911915  0.815328  0.802657
