In [28]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

from catboost import CatBoostClassifier, Pool

import shap
import optuna

## Loading the dataset, pre-processing, and analysing the data

In [29]:
cohort_data = pd.read_csv('../cohort_data_new.csv')
cohort_data

Unnamed: 0,icustay_id,anion_gap_mean,anion_gap_sd,anion_gap_min,anion_gap_max,bicarbonate_mean,bicarbonate_sd,bicarbonate_min,bicarbonate_max,calcium_total_mean,...,urea_nitrogen_min,urea_nitrogen_max,white_blood_cells_mean,white_blood_cells_sd,white_blood_cells_min,white_blood_cells_max,age,gender,icu_los_hours,target
0,200003,13.375000,3.583195,9.0,21.0,25.250000,3.105295,18.0,28.0,7.771429,...,10.0,21.0,26.471429,13.176711,13.2,43.9,48,M,141,0
1,200007,15.500000,2.121320,14.0,17.0,23.000000,1.414214,22.0,24.0,8.900000,...,8.0,10.0,10.300000,1.272792,9.4,11.2,44,M,30,0
2,200009,9.500000,2.121320,8.0,11.0,23.333333,2.081666,21.0,25.0,8.000000,...,15.0,21.0,12.471429,1.471637,10.5,14.3,47,F,51,0
3,200012,,,,,,,,,,...,,,4.900000,,4.9,4.9,33,F,10,0
4,200014,10.000000,1.732051,9.0,12.0,24.000000,1.000000,23.0,25.0,7.733333,...,21.0,24.0,13.233333,2.203028,10.7,14.7,85,M,41,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30484,299992,15.375000,2.856153,11.0,25.0,23.125000,2.609556,15.0,26.0,8.307143,...,8.0,23.0,14.134783,3.781727,8.1,22.1,41,M,499,0
30485,299993,9.400000,1.341641,8.0,11.0,29.600000,2.073644,26.0,31.0,8.000000,...,12.0,15.0,12.600000,0.605530,12.0,13.3,26,M,67,0
30486,299994,16.157895,2.477973,13.0,24.0,21.631579,3.451417,17.0,31.0,8.100000,...,28.0,63.0,10.076190,2.642329,5.3,14.5,74,F,152,1
30487,299998,11.500000,1.732051,10.0,14.0,23.500000,1.290994,22.0,25.0,8.800000,...,20.0,22.0,9.900000,1.210372,7.9,11.0,87,M,46,1


In [30]:
print(f"Dataset shape: {cohort_data.shape}")
print(f"Readmission rate: {cohort_data['target'].mean() * 100:.2f}%")

Dataset shape: (30489, 93)
Readmission rate: 10.74%


In [31]:
lab_cols = [
    'anion_gap_mean', 'anion_gap_min', 'anion_gap_max', 'anion_gap_sd',
    'bicarbonate_mean', 'bicarbonate_min', 'bicarbonate_max', 'bicarbonate_sd',
    'calcium_total_mean', 'calcium_total_min', 'calcium_total_max', 'calcium_total_sd',
    'chloride_mean', 'chloride_min', 'chloride_max', 'chloride_sd',
    'creatinine_mean', 'creatinine_min', 'creatinine_max', 'creatinine_sd',
    'glucose_mean', 'glucose_min', 'glucose_max', 'glucose_sd',
    'hematocrit_mean', 'hematocrit_min', 'hematocrit_max', 'hematocrit_sd',
    'hemoglobin_mean', 'hemoglobin_min', 'hemoglobin_max', 'hemoglobin_sd',
    'mchc_mean', 'mchc_min', 'mchc_max', 'mchc_sd',
    'mcv_mean', 'mcv_min', 'mcv_max', 'mcv_sd',
    'magnesium_mean', 'magnesium_min', 'magnesium_max', 'magnesium_sd',
    'pt_mean', 'pt_min', 'pt_max', 'pt_sd',
    'phosphate_mean', 'phosphate_min', 'phosphate_max', 'phosphate_sd',
    'platelet_count_mean', 'platelet_count_min', 'platelet_count_max', 'platelet_count_sd',
    'potassium_mean', 'potassium_min', 'potassium_max', 'potassium_sd',
    'rdw_mean', 'rdw_min', 'rdw_max', 'rdw_sd',
    'red_blood_cells_mean', 'red_blood_cells_min', 'red_blood_cells_max', 'red_blood_cells_sd',
    'sodium_mean', 'sodium_min', 'sodium_max', 'sodium_sd',
    'urea_nitrogen_mean', 'urea_nitrogen_min', 'urea_nitrogen_max', 'urea_nitrogen_sd',
    'white_blood_cells_mean', 'white_blood_cells_min', 'white_blood_cells_max', 'white_blood_cells_sd',
    'age', 'icu_los_hours'
]

REmove the ICUstay_id and the gender

In [32]:
drop_cols = [c for c in cohort_data.columns if 'icustay_id' in c.lower() or 'gender' in c.lower()]
df = cohort_data.drop(columns=['icustay_id', 'gender'], errors='ignore')

X = df.drop(columns=['target'])
y = df['target']
X

Unnamed: 0,anion_gap_mean,anion_gap_sd,anion_gap_min,anion_gap_max,bicarbonate_mean,bicarbonate_sd,bicarbonate_min,bicarbonate_max,calcium_total_mean,calcium_total_sd,...,urea_nitrogen_mean,urea_nitrogen_sd,urea_nitrogen_min,urea_nitrogen_max,white_blood_cells_mean,white_blood_cells_sd,white_blood_cells_min,white_blood_cells_max,age,icu_los_hours
0,13.375000,3.583195,9.0,21.0,25.250000,3.105295,18.0,28.0,7.771429,0.292770,...,15.571429,4.577377,10.0,21.0,26.471429,13.176711,13.2,43.9,48,141
1,15.500000,2.121320,14.0,17.0,23.000000,1.414214,22.0,24.0,8.900000,,...,9.000000,1.414214,8.0,10.0,10.300000,1.272792,9.4,11.2,44,30
2,9.500000,2.121320,8.0,11.0,23.333333,2.081666,21.0,25.0,8.000000,,...,17.333333,3.214550,15.0,21.0,12.471429,1.471637,10.5,14.3,47,51
3,,,,,,,,,,,...,,,,,4.900000,,4.9,4.9,33,10
4,10.000000,1.732051,9.0,12.0,24.000000,1.000000,23.0,25.0,7.733333,0.057735,...,23.000000,1.732051,21.0,24.0,13.233333,2.203028,10.7,14.7,85,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30484,15.375000,2.856153,11.0,25.0,23.125000,2.609556,15.0,26.0,8.307143,0.255597,...,16.000000,4.662524,8.0,23.0,14.134783,3.781727,8.1,22.1,41,499
30485,9.400000,1.341641,8.0,11.0,29.600000,2.073644,26.0,31.0,8.000000,0.216025,...,13.000000,1.224745,12.0,15.0,12.600000,0.605530,12.0,13.3,26,67
30486,16.157895,2.477973,13.0,24.0,21.631579,3.451417,17.0,31.0,8.100000,0.316228,...,44.578947,12.102873,28.0,63.0,10.076190,2.642329,5.3,14.5,74,152
30487,11.500000,1.732051,10.0,14.0,23.500000,1.290994,22.0,25.0,8.800000,0.416333,...,20.750000,0.957427,20.0,22.0,9.900000,1.210372,7.9,11.0,87,46


trying out feature engineering

In [33]:
X["renal_ratio"] = X["urea_nitrogen_mean"] / (X["creatinine_mean"] + 1e-5)
X["creatinine_range"] = X["creatinine_max"] - X["creatinine_min"]
X["bun_range"] = X["urea_nitrogen_max"] - X["urea_nitrogen_min"]

X["anion_gap_bicarb_ratio"] = X["anion_gap_mean"] / (X["bicarbonate_mean"] + 1e-5)
X["anion_gap_range"] = X["anion_gap_max"] - X["anion_gap_min"]

X["hb_hct_ratio"] = X["hemoglobin_mean"] / (X["hematocrit_mean"] + 1e-5)
X["rbc_variability"] = X["red_blood_cells_sd"]
X["plt_range"] = X["platelet_count_max"] - X["platelet_count_min"]

X["sodium_range"] = X["sodium_max"] - X["sodium_min"]
X["potassium_range"] = X["potassium_max"] - X["potassium_min"]
X["chloride_range"] = X["chloride_max"] - X["chloride_min"]

X["glucose_range"] = X["glucose_max"] - X["glucose_min"]
X["glucose_instability"] = X["glucose_sd"]

X["age_los_interaction"] = X["age"] * X["icu_los_hours"]
X["icu_los_log"] = np.log1p(X["icu_los_hours"])

X

Unnamed: 0,anion_gap_mean,anion_gap_sd,anion_gap_min,anion_gap_max,bicarbonate_mean,bicarbonate_sd,bicarbonate_min,bicarbonate_max,calcium_total_mean,calcium_total_sd,...,hb_hct_ratio,rbc_variability,plt_range,sodium_range,potassium_range,chloride_range,glucose_range,glucose_instability,age_los_interaction,icu_los_log
0,13.375000,3.583195,9.0,21.0,25.250000,3.105295,18.0,28.0,7.771429,0.292770,...,0.330890,0.141657,17.0,4.0,1.1,6.0,78.0,26.596187,6768,4.955827
1,15.500000,2.121320,14.0,17.0,23.000000,1.414214,22.0,24.0,8.900000,,...,0.345695,0.049497,22.0,3.0,0.1,2.0,16.0,11.313709,1320,3.433987
2,9.500000,2.121320,8.0,11.0,23.333333,2.081666,21.0,25.0,8.000000,,...,0.342468,0.194398,146.0,2.0,0.7,3.0,35.0,24.748737,2397,3.951244
3,,,,,,,,,,,...,0.335484,,0.0,,,,,,330,2.397895
4,10.000000,1.732051,9.0,12.0,24.000000,1.000000,23.0,25.0,7.733333,0.057735,...,0.333838,0.347707,17.0,6.0,0.4,6.0,14.0,7.810250,3485,3.737670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30484,15.375000,2.856153,11.0,25.0,23.125000,2.609556,15.0,26.0,8.307143,0.255597,...,0.346878,0.290223,759.0,12.0,1.8,13.0,116.0,26.268023,20459,6.214608
30485,9.400000,1.341641,8.0,11.0,29.600000,2.073644,26.0,31.0,8.000000,0.216025,...,0.354777,0.084261,102.0,4.0,0.9,8.0,61.0,23.776038,1742,4.219508
30486,16.157895,2.477973,13.0,24.0,21.631579,3.451417,17.0,31.0,8.100000,0.316228,...,0.365978,0.405553,162.0,18.0,3.3,16.0,367.0,80.865024,11248,5.030438
30487,11.500000,1.732051,10.0,14.0,23.500000,1.290994,22.0,25.0,8.800000,0.416333,...,0.328358,0.268179,26.0,4.0,0.5,3.0,76.0,32.269697,4002,3.850148


In [34]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.30, random_state=7, stratify=y)
X_tr, X_val, y_tr, y_val = train_test_split(X_train_full, y_train_full, test_size=0.30, random_state=7, stratify=y_train_full)

train_pool = Pool(X_tr, y_tr)
valid_pool = Pool(X_val, y_val)

In [35]:
def objective(trial):

    params = {
        "iterations": trial.suggest_int("iterations", 500, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 5),
        "random_strength": trial.suggest_float("random_strength", 0, 2),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_seed": 42,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "task_type": "CPU",
        "verbose": False,
        "od_type": "Iter",
        "od_wait": 120
    }

    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, verbose=False)
    pred_val = model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, pred_val)

    return val_auc

In [37]:
sampler = optuna.samplers.TPESampler(seed=7)
study = optuna.create_study(direction="maximize", sampler=sampler)

study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best Trial:")
print(study.best_trial.params)
print("Best AUROC:", study.best_value)

[I 2025-11-29 15:50:59,592] A new study created in memory with name: no-name-1399fbae-9bcc-497d-9858-724d3613ff47


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-29 15:51:09,992] Trial 0 finished with value: 0.6967069523286334 and parameters: {'iterations': 690, 'learning_rate': 0.08549750026376444, 'depth': 7, 'l2_leaf_reg': 1.4804560990495679, 'bagging_temperature': 4.889947559983013, 'random_strength': 1.0769917408208673, 'border_count': 144}. Best is trial 0 with value: 0.6967069523286334.
[I 2025-11-29 15:51:24,274] Trial 1 finished with value: 0.6948918085820668 and parameters: {'iterations': 680, 'learning_rate': 0.004623340751759673, 'depth': 7, 'l2_leaf_reg': 1.090661513591819, 'bagging_temperature': 4.018695180521878, 'random_strength': 0.7618822662970768, 'border_count': 46}. Best is trial 0 with value: 0.6967069523286334.
[I 2025-11-29 15:51:27,041] Trial 2 finished with value: 0.6900992390486073 and parameters: {'iterations': 1220, 'learning_rate': 0.17913211790965036, 'depth': 5, 'l2_leaf_reg': 0.22718093648132454, 'bagging_temperature': 4.656030098445108, 'random_strength': 0.049798455100696026, 'border_count': 166}. B

In [None]:
best_params = study.best_trial.params
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "random_seed": 7,
    "task_type": "CPU",
    "verbose": 200
})

final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_full, y_train_full, eval_set=(X_test, y_test), use_best_model=True)
test_proba = final_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_proba)

print("Final Test AUROC =", test_auc)

0:	test: 0.5859241	best: 0.5859241 (0)	total: 23.3ms	remaining: 49.2s
200:	test: 0.7108459	best: 0.7109532 (194)	total: 4.84s	remaining: 45.9s
400:	test: 0.7241235	best: 0.7241602 (399)	total: 9.97s	remaining: 42.4s
600:	test: 0.7292453	best: 0.7292453 (600)	total: 15.6s	remaining: 39.1s
800:	test: 0.7321171	best: 0.7321243 (799)	total: 19.7s	remaining: 32.2s
1000:	test: 0.7336854	best: 0.7336854 (1000)	total: 23.5s	remaining: 26s
1200:	test: 0.7354617	best: 0.7354922 (1196)	total: 27.2s	remaining: 20.6s
1400:	test: 0.7359232	best: 0.7359277 (1385)	total: 31.5s	remaining: 15.9s
1600:	test: 0.7361282	best: 0.7362525 (1557)	total: 36.5s	remaining: 11.6s
1800:	test: 0.7363546	best: 0.7367030 (1734)	total: 40.8s	remaining: 6.95s
2000:	test: 0.7360221	best: 0.7367030 (1734)	total: 45.6s	remaining: 2.44s
2107:	test: 0.7357347	best: 0.7367030 (1734)	total: 48s	remaining: 0us

bestTest = 0.7367030304
bestIteration = 1734

Shrink model to first 1735 iterations.

ðŸ”¥ Final Test AUROC = 0.736703