In [21]:
import xgboost as xgb 
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

In [8]:
df=pd.read_csv("prepared_medical_dataset.csv")
df.head(10)

Unnamed: 0,BIRTHMO,BIRTHYR,SEX,PRIMLANG,EDUC,MARISTAT,NACCLIVS,INDEPEND,RESIDENC,HANDED,...,DEPOTHR,PSYCDIS,NACCTBI,VISION,VISCORR,VISWCORR,HEARING,HEARAID,DEMENTED,EDUC.1
0,5,1952,1,1,16,1,4,1,1,2,...,0,0,1,0,1,1,1,0,0,16
1,2,1945,1,1,12,3,1,1,1,2,...,0,0,0,0,1,1,0,0,0,12
2,7,1936,1,1,20,1,2,1,1,2,...,0,0,0,0,1,1,0,1,1,20
3,8,1949,1,8,18,1,2,1,1,2,...,0,0,0,0,1,1,0,1,0,18
4,10,1945,1,1,14,1,2,2,1,1,...,0,0,0,0,1,1,0,0,1,14
5,1,1957,2,2,9,1,2,1,1,2,...,0,0,0,0,1,1,1,0,0,9
6,12,1947,1,1,18,1,2,1,1,2,...,0,0,0,0,1,1,0,1,1,18
7,7,1952,2,1,14,3,1,1,1,2,...,0,0,0,0,1,1,1,0,0,14
8,7,1951,2,1,14,1,2,1,1,1,...,1,0,0,1,0,8,1,0,0,14
9,10,1963,1,1,16,1,2,2,1,1,...,0,0,0,1,0,8,1,0,1,16


In [9]:
X = df.drop('DEMENTED', axis=1)  
y = df['DEMENTED']               

print(f"Features: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Features: (106523, 54)
Target distribution:
DEMENTED
0    73313
1    33210
Name: count, dtype: int64


In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

Training set: (85218, 54)
Testing set: (21305, 54)


In [None]:
# Create and train XGBoost model
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]), 
    random_state=42,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8843

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91     14663
           1       0.80      0.84      0.82      6642

    accuracy                           0.88     21305
   macro avg       0.86      0.87      0.87     21305
weighted avg       0.89      0.88      0.89     21305



In [19]:
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc_score:.4f}")



AUC Score: 0.9236


In [None]:
# Optimized Decision Tree
dt_optimized = DecisionTreeClassifier(
    max_depth=8,               
    min_samples_split=20,    
    min_samples_leaf=10,       
    max_features='sqrt',       
    random_state=42
)

dt_optimized.fit(X_train, y_train)
y_pred_dt_opt = dt_optimized.predict(X_test)

accuracy_dt_opt = accuracy_score(y_test, y_pred_dt_opt)
print(f"Optimized Decision Tree Accuracy: {accuracy_dt_opt:.4f}")
print("\nOptimized Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt_opt))

Optimized Decision Tree Accuracy: 0.8824

Optimized Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     14663
           1       0.80      0.83      0.81      6642

    accuracy                           0.88     21305
   macro avg       0.86      0.87      0.86     21305
weighted avg       0.88      0.88      0.88     21305

