In [None]:
# %%
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import xgboost as xgb

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, roc_curve, auc, RocCurveDisplay
)

optuna.logging.set_verbosity(optuna.logging.WARNING)

xgb.set_config(verbosity=0)
%config InlineBackend.figure_format = 'svg'
plt.rcParams.update({'text.usetex': False})
# Load the cohort table from the working directory.
cohort_data = pd.read_csv('cohort_data_new.csv')
print("Dataset shape:", cohort_data.shape)

# Identifier columns: any column whose lower-cased name contains one of
# these substrings is removed before modelling.
drop_cols = [
    col for col in cohort_data.columns
    if any(marker in col.lower() for marker in ('icustay_id', 'subject'))
]
cohort_data = cohort_data.drop(columns=drop_cols, errors='ignore')

# Labels come from the 'target' column. Features are every remaining numeric
# column, with +/-inf mapped to NaN. No imputation is applied in this section,
# so NaNs are passed to the model as-is.
y = cohort_data['target']
X = (
    cohort_data
    .drop(columns=['target'])
    .select_dtypes(include=['number'])
    .replace([np.inf, -np.inf], np.nan)
)

# Stratified, shuffled 70/30 hold-out split.
# NOTE(review): this split is seeded with 42, not RANDOM_STATE -- confirm
# that the two different seeds are intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y, shuffle=True
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"% Readmissions in Train: {np.mean(y_train) * 100:.2f}")
print(f"% Readmissions in Test: {np.mean(y_test) * 100:.2f}")

# Seed reused by the CV splitter, the XGBoost models and the Optuna sampler.
RANDOM_STATE = 229
# Shuffled, stratified 10-fold splitter used inside the tuning objective.
cv = StratifiedKFold(
    random_state=RANDOM_STATE,
    shuffle=True,
    n_splits=10,
)

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier.

    Draws one hyperparameter configuration from ``trial``, then for every
    split produced by the module-level ``cv`` splitter on ``X_train`` /
    ``y_train`` fits a fresh ``xgb.XGBClassifier`` on the training part and
    scores the held-out part with ``roc_auc_score`` on the predicted
    positive-class probabilities.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        Mean ROC AUC across the folds.
    """
    params = {
        # Fixed settings (not tuned).
        "objective": "binary:logistic",
        "eval_metric": "auc",
        # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
        # releases; kept so the configuration matches the final-model cell.
        "use_label_encoder": False,
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        # Tuned settings; learning rate is sampled on a log scale.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 5),
        "max_leaves": trial.suggest_int("max_leaves", 2, 10),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 8),
        "n_estimators": trial.suggest_int("n_estimators", 400, 1000),
        "alpha": trial.suggest_float("alpha", 0.1, 1.0),
        "lambda": trial.suggest_float("lambda", 0.8, 1.5),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 1.5),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
    }

    aucs = []
    for train_idx, valid_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        model = xgb.XGBClassifier(**params)
        # No eval_set is passed: early stopping is not used, so monitoring the
        # validation fold would only compute a per-round AUC that is thrown
        # away. The fold is scored once, below, after training has finished.
        model.fit(X_tr, y_tr)

        y_pred = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, y_pred))

    return np.mean(aucs)

# %% [markdown]
# ## Run Optuna optimization

# %%
# Maximise the objective's mean CV AUC with a TPE sampler seeded by RANDOM_STATE.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    direction="maximize",
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the winning configuration, one "name: value" pair per line.
print("\nBest Parameters Found:")
for param_name, param_value in study.best_params.items():
    print(f"{param_name}: {param_value}")

print(f"\nBest Cross-Validation AUC: {study.best_value:.4f}")

# %% [markdown]
# ## Train final model on full training set

# %%
# Tuned values from the study, followed by the same fixed settings the
# objective used, so the final model matches the best trial's configuration.
best_params = {
    **study.best_params,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "use_label_encoder": False,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}

# Refit once on the complete training split.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# %% [markdown]
# ## Evaluate on test set

# %%
# Positive-class probabilities on the hold-out split, and hard labels
# obtained with a 0.5 cut-off.
y_proba_test = final_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_proba_test >= 0.5).astype(int)

test_auroc = roc_auc_score(y_test, y_proba_test)
print(f"\nFinal Test ROC AUC: {test_auroc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

# Confusion matrix heatmap (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred_test)
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# ROC curve drawn directly from the fitted estimator.
roc_display = RocCurveDisplay.from_estimator(final_model, X_test, y_test)
roc_display.ax_.set_title("ROC Curve - XGBoost")
plt.show()

# %% [markdown]
# ## Feature Importance (Top 20)

# %%
# Pair each feature column with the model's importance score and keep the
# 20 highest-scoring rows.
importances = final_model.feature_importances_
feat_imp = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
    .iloc[:20]
)

# Horizontal bar chart, most important feature at the top.
imp_fig, imp_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=feat_imp, x='importance', y='feature', ax=imp_ax)
imp_ax.set_title("Top 20 Feature Importances (XGBoost)")
imp_fig.tight_layout()
plt.show()



Dataset shape: (30489, 97)
Train shape: (21342, 94), Test shape: (9147, 94)
% Readmissions in Train: 10.74
% Readmissions in Test: 10.75


  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Objective value per trial over the course of the study.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig.show()

# Estimated importance of each tuned hyperparameter.
importance_fig = optuna.visualization.plot_param_importances(study)
importance_fig.show()