In [1]:
# ===== IMPORTS =====
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'imblearn'

In [2]:
plt.style.use('default')
sns.set_style('white')

In [4]:
df = pd.read_csv('Dementia Prediction Dataset.csv')

  df = pd.read_csv('Dementia Prediction Dataset.csv')


In [6]:
feats = ['AGIT', 'ALCFREQ', 'ALCOHOL', 'ANX', 'ANYMEDS', 'APA', 'APPSEV', 'BEANX',
         'BEAPATHY', 'BEDEP', 'BEDISIN', 'BEIRRIT', 'BEOTHR', 'BIRTHYR', 'CBSTROKE',
         'DEL', 'DELSEV', 'DEP2YRS', 'DEPD', 'ENERGY', 'FORMVER', 'GAMES', 'HALLSEV',
         'HEIGHT', 'INBIRMO', 'INCALLS', 'INCONTF', 'MARISTAT', 'MOFALLS', 'NACCBEHF',
         'NACCDAYS', 'NACCLIVS', 'NACCREAS', 'NACCREFR', 'NITE', 'NITESEV', 'PACKSPER',
         'RESIDENC', 'SEX', 'SLEEPOTH', 'TAXES']

In [7]:
final_df = pd.concat([df[feats], df['DEMENTED']], axis=1)

In [8]:
df = final_df

In [9]:
# Domain knowledge features
df['AGE'] = 2025 - df['BIRTHYR']
final_df['BEHAV_COMPOSITE'] = final_df['BEANX'] + final_df['BEDEP'] + final_df['BEIRRIT'] + final_df['BEAPATHY']

In [10]:
df.drop('BIRTHYR', axis=1, inplace=True)

In [11]:
feats = [f for f in feats if f != 'BIRTHYR'] + ['AGE', 'BEHAV_COMPOSITE']

In [12]:
print(f"Total features after feature engineering: {len(feats)}")

Total features after feature engineering: 42


In [19]:
# ===== DATA PREPROCESSING =====
print("\n=== DATA PREPROCESSING ===")

X = df.drop('DEMENTED', axis=1)
y = df['DEMENTED']


=== DATA PREPROCESSING ===


In [20]:
missing_indicators = []

for col in feats:
    if X[col].isnull().mean() > 0.05:
        name = f"{col} (missing values)"
        X[name] = X[col].isnull().astype(int)
        missing_indicators.append(name)

In [21]:
# ===== IMPORTS =====
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, precision_recall_curve
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('default')
sns.set_palette("husl")

# ===== DATA LOADING =====
df = pd.read_csv('Dementia Prediction Dataset.csv')

# Original features
feats = ['AGIT', 'ALCFREQ', 'ALCOHOL', 'ANX', 'ANYMEDS', 'APA', 'APPSEV', 'BEANX',
         'BEAPATHY', 'BEDEP', 'BEDISIN', 'BEIRRIT', 'BEOTHR', 'BIRTHYR', 'CBSTROKE',
         'DEL', 'DELSEV', 'DEP2YRS', 'DEPD', 'ENERGY', 'FORMVER', 'GAMES', 'HALLSEV',
         'HEIGHT', 'INBIRMO', 'INCALLS', 'INCONTF', 'MARISTAT', 'MOFALLS', 'NACCBEHF',
         'NACCDAYS', 'NACCLIVS', 'NACCREAS', 'NACCREFR', 'NITE', 'NITESEV', 'PACKSPER',
         'RESIDENC', 'SEX', 'SLEEPOTH', 'TAXES']

# Create working dataframe with target
final_df = pd.concat([df[feats], df['DEMENTED']], axis=1)

# ===== ADVANCED FEATURE ENGINEERING =====
print("=== FEATURE ENGINEERING ===")

# Create new features from existing numerical data
final_df['AGE'] = 2025 - final_df['BIRTHYR']  # Convert birth year to age
final_df['BEHAV_COMPOSITE'] = final_df['BEANX'] + final_df['BEDEP'] + final_df['BEIRRIT'] + final_df['BEAPATHY']
final_df['COG_FUNCTION_COMPOSITE'] = final_df['FORMVER'] + final_df['GAMES'] + final_df['TAXES']

# Drop the original BIRTHYR as we have AGE
final_df.drop('BIRTHYR', axis=1, inplace=True)

# Update features list
feats = [f for f in feats if f != 'BIRTHYR'] + ['AGE', 'BEHAV_COMPOSITE', 'COG_FUNCTION_COMPOSITE']

print(f"Total features after engineering: {len(feats)}")

# ===== DATA PREPROCESSING =====
print("\n=== DATA PREPROCESSING ===")

# Separate features and target
X = final_df.drop('DEMENTED', axis=1)
y = final_df['DEMENTED']

print(f"Original dataset shape: {X.shape}")

# Create missing indicators for columns with >5% missing values
missing_indicators = []
for col in X.columns:
    missing_rate = X[col].isnull().mean()
    if missing_rate > 0.05:
        indicator_name = f"{col}_MISSING"
        X[indicator_name] = X[col].isnull().astype(int)
        missing_indicators.append(indicator_name)
        print(f"Created missing indicator for {col} (missing rate: {missing_rate:.2%})")

print(f"Total missing indicators created: {len(missing_indicators)}")

# Handle missing values using KNN Imputation
print("Applying KNN Imputation for missing values...")
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed = knn_imputer.fit_transform(X)

# Convert back to DataFrame
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# Scale features using RobustScaler (handles outliers better)
print("Scaling features with RobustScaler...")
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_scaled = pd.DataFrame(X_scaled, columns=X_imputed.columns)

print(f"Dataset shape after preprocessing: {X_scaled.shape}")

# ===== ADVANCED FEATURE SELECTION =====
print("\n=== FEATURE SELECTION ===")

# 1. Remove low variance features
print("Step 1: Removing low variance features...")
var_threshold = VarianceThreshold(threshold=0.01)
X_low_var = var_threshold.fit_transform(X_scaled)
selected_features = X_scaled.columns[var_threshold.get_support()].tolist()
print(f"Features after low variance removal: {len(selected_features)}")

# 2. Remove highly correlated features
print("Step 2: Removing highly correlated features...")
correlation_matrix = pd.DataFrame(X_low_var, columns=selected_features).corr().abs()
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

to_drop = []
for column in upper_tri.columns:
    high_corr = upper_tri[column][upper_tri[column] > 0.95]
    if not high_corr.empty:
        to_drop.append(column)

if to_drop:
    selected_features = [col for col in selected_features if col not in to_drop]
    print(f"Dropped {len(to_drop)} highly correlated features: {to_drop}")
    print(f"Features after correlation removal: {len(selected_features)}")

# 3. Univariate feature selection
print("Step 3: Univariate feature selection...")
selector = SelectKBest(score_func=mutual_info_classif, k=min(25, len(selected_features)))
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
    X_scaled[selected_features], y, test_size=0.2, random_state=42, stratify=y
)
X_selected = selector.fit_transform(X_train_temp, y_train_temp)
selected_features = np.array(selected_features)[selector.get_support()].tolist()
print(f"Features after univariate selection: {len(selected_features)}")

# ===== TRAIN-TEST SPLIT =====
print("\n=== TRAIN-TEST SPLIT ===")
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled[selected_features], y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Class distribution - Training: {y_train.value_counts().to_dict()}")
print(f"Class distribution - Test: {y_test.value_counts().to_dict()}")

# ===== INITIAL RANDOM FOREST FOR FEATURE IMPORTANCE =====
print("\n=== INITIAL FEATURE IMPORTANCE ANALYSIS ===")
rf_initial = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
rf_initial.fit(X_train, y_train)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': rf_initial.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 most important features:")
print(feature_importance.head(15))

# Select top features based on importance (keep at least 15, at most 20)
n_features_final = min(20, max(15, len(selected_features)))
top_features = feature_importance.head(n_features_final)['feature'].tolist()
print(f"\nSelected top {len(top_features)} features for final model")

# Update datasets with top features
X_train_final = X_train[top_features]
X_test_final = X_test[top_features]

# ===== COMPREHENSIVE HYPERPARAMETER TUNING =====
print("\n=== HYPERPARAMETER TUNING ===")

# Extensive parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 15, 20, 25, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7, 0.9],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample', {0: 1, 1: 2}, {0: 1, 1: 3}]
}

print("Starting RandomizedSearchCV with 100 iterations...")
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=100,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_final, y_train)

print(f"Best score from random search: {random_search.best_score_:.4f}")
print(f"Best parameters from random search: {random_search.best_params_}")

# Refined grid search around best parameters
print("\nStarting refined GridSearchCV...")
best_params = random_search.best_params_

# Create refined grid
refined_param_grid = {}
for param, value in best_params.items():
    if param == 'n_estimators':
        refined_param_grid[param] = [max(50, value - 50), value, min(600, value + 50)]
    elif param == 'max_depth' and value is not None:
        refined_param_grid[param] = [max(5, value - 5), value, value + 5]
    elif param == 'min_samples_split':
        refined_param_grid[param] = [max(2, value - 2), value, value + 2]
    elif param == 'min_samples_leaf':
        refined_param_grid[param] = [max(1, value - 1), value, value + 1]
    elif param == 'max_features':
        refined_param_grid[param] = [value]  # Keep the best value
    elif param == 'bootstrap':
        refined_param_grid[param] = [value]  # Keep the best value
    elif param == 'class_weight':
        refined_param_grid[param] = [value]  # Keep the best value

# Remove empty entries
refined_param_grid = {k: v for k, v in refined_param_grid.items() if v}

# grid_search = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42),
#     param_grid=refined_param_grid,
#     cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
#     scoring='roc_auc',
#     verbose=1,
#     n_jobs=-1
# )
#
# grid_search.fit(X_train_final, y_train)

final_model = random_search.best_estimator_
# ===== COMPREHENSIVE MODEL EVALUATION =====
print("\n=== MODEL EVALUATION ===")

# Predictions
y_pred = final_model.predict(X_test_final)
y_pred_proba = final_model.predict_proba(X_test_final)[:, 1]

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Dementia', 'Dementia'])
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', colorbar=False)
plt.title('Confusion Matrix - Optimized Random Forest', fontsize=14, fontweight='bold')
plt.grid(False)
plt.show()

# Classification Report
print("Detailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Dementia', 'Dementia']))

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=3, label=f'Random Forest (AUC = {auc_score:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', alpha=0.5, label='Random Classifier')
plt.fill_between(fpr, tpr, alpha=0.2, color='darkorange')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve - Dementia Prediction', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
average_precision = np.mean(precision)

plt.figure(figsize=(10, 8))
plt.plot(recall, precision, color='blue', lw=3, label=f'Average Precision = {average_precision:.4f}')
plt.xlabel('Recall', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title('Precision-Recall Curve', fontsize=14, fontweight='bold')
plt.legend(loc="upper right", fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

# ===== FEATURE IMPORTANCE ANALYSIS =====
print("\n=== FEATURE IMPORTANCE ANALYSIS ===")

feature_importance_final = pd.DataFrame({
    'feature': top_features,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=True)

plt.figure(figsize=(12, 10))
plt.barh(feature_importance_final['feature'], feature_importance_final['importance'],
         color='skyblue', edgecolor='navy')
plt.xlabel('Feature Importance', fontsize=12)
plt.title('Top Feature Importances - Optimized Random Forest', fontsize=14, fontweight='bold')
plt.grid(True, axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# ===== MODEL EXPLAINABILITY WITH SHAP =====
print("\n=== MODEL EXPLAINABILITY ===")
try:
    import shap

    print("Generating SHAP explanations...")
    shap.initjs()

    # Create explainer
    explainer = shap.TreeExplainer(final_model)
    shap_values = explainer.shap_values(X_test_final)

    # Summary plot
    plt.figure(figsize=(12, 10))
    shap.summary_plot(shap_values[1], X_test_final, feature_names=top_features, show=False)
    plt.title('SHAP Summary Plot - Impact on Dementia Prediction', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Feature importance from SHAP
    shap_importance = np.abs(shap_values[1]).mean(0)
    shap_feature_importance = pd.DataFrame({
        'feature': top_features,
        'shap_importance': shap_importance
    }).sort_values('shap_importance', ascending=False)

    print("\nTop 10 features by SHAP importance:")
    print(shap_feature_importance.head(10))

except ImportError:
    print("SHAP not installed. Install with: pip install shap")

# ===== FINAL PERFORMANCE SUMMARY =====
print("\n" + "="*60)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("="*60)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"AUC-ROC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Average Precision: {average_precision:.4f}")

# Save the final model and preprocessing objects
print("\n=== SAVING MODEL ===")
joblib.dump(final_model, 'optimized_random_forest_dementia.pkl')
joblib.dump(top_features, 'selected_features.pkl')
joblib.dump(scaler, 'robust_scaler.pkl')
joblib.dump(knn_imputer, 'knn_imputer.pkl')

print("Model and preprocessing objects saved successfully!")

# ===== COMPARISON WITH BASELINE =====
print("\n=== COMPARISON WITH BASELINE ===")
# Simple Random Forest without tuning
baseline_rf = RandomForestClassifier(n_estimators=100, random_state=42)
baseline_rf.fit(X_train_final, y_train)
baseline_pred_proba = baseline_rf.predict_proba(X_test_final)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_pred_proba)

print(f"Baseline Random Forest AUC: {baseline_auc:.4f}")
print(f"Optimized Random Forest AUC: {auc_score:.4f}")
print(f"Improvement: {auc_score - baseline_auc:.4f}")

print("\nðŸŽ¯ OPTIMIZATION COMPLETE! ðŸŽ¯")

=== FEATURE ENGINEERING ===
Total features after engineering: 43

=== DATA PREPROCESSING ===
Original dataset shape: (195196, 43)
Total missing indicators created: 0
Applying KNN Imputation for missing values...
Scaling features with RobustScaler...
Dataset shape after preprocessing: (195196, 43)

=== FEATURE SELECTION ===
Step 1: Removing low variance features...
Features after low variance removal: 43
Step 2: Removing highly correlated features...
Step 3: Univariate feature selection...
Features after univariate selection: 25

=== TRAIN-TEST SPLIT ===
Training set shape: (156156, 25)
Test set shape: (39040, 25)
Class distribution - Training: {0: 110084, 1: 46072}
Class distribution - Test: {0: 27522, 1: 11518}

=== INITIAL FEATURE IMPORTANCE ANALYSIS ===
Top 15 most important features:
                   feature  importance
24  COG_FUNCTION_COMPOSITE    0.222482
22                   TAXES    0.204415
11                   GAMES    0.129855
16                NACCDAYS    0.093503
15    

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


KeyboardInterrupt: 