In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.utils import resample, shuffle

from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='svg'
plt.rcParams.update({
    'text.usetex':False
})
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Loading the dataset, pre-processing, and analysing the data

In [21]:
cohort_data = pd.read_csv('cohort_data.csv')
cohort_data

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,age,icu_los_hours,min_wbc,max_wbc,mean_wbc,std_wbc,...,max_po2,mean_po2,std_po2,last_po2,min_pco2,max_pco2,mean_pco2,std_pco2,last_pco2,target
0,28162,194362,285686,F,18,49.0,,,,,...,,,,,,,,,,0
1,22190,166880,290052,F,18,117.0,0.0,0.0,0.0,,...,221.0,213.333333,6.806859,208.0,37.0,42.0,38.666667,2.886751,37.0,0
2,7717,159770,260370,F,18,34.0,9.0,9.0,9.0,,...,274.0,216.000000,82.024387,158.0,40.0,40.0,40.000000,0.000000,40.0,0
3,69145,128969,213687,F,19,82.0,,,,,...,112.0,100.666667,11.503623,101.0,16.0,20.0,18.000000,2.000000,20.0,0
4,68035,145990,288524,F,19,22.0,,,,,...,58.0,58.000000,,58.0,37.0,37.0,37.000000,,37.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29278,26644,170124,296631,M,89,0.0,0.0,0.0,0.0,,...,,,,,,,,,,1
29279,21296,172695,266878,M,89,18.0,,,,,...,,,,,,,,,,0
29280,27515,123997,232528,M,89,93.0,,,,,...,229.0,195.000000,35.618347,219.0,26.0,39.0,30.750000,5.737305,26.0,1
29281,23531,193022,234645,M,89,234.0,,,,,...,100.0,100.000000,,100.0,43.0,43.0,43.000000,,43.0,0


In [22]:
# Data available in most of the patients:
df = cohort_data.dropna(subset=['icu_los_hours', 'min_hemoglobin', 'min_hematocrit', 'min_platelet count', 'min_sodium', 'min_potassium', 'min_chloride', 'min_bicarbonate', 'min_glucose', 'min_creatinine', 'min_urea nitrogen', 'min_ph', 'min_po2', 'min_pco2', 'target'])

# Drop sparse columns (missing values for many ICU entries)
df = df.dropna(axis=1, how='any')
df

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,age,icu_los_hours,min_hemoglobin,max_hemoglobin,mean_hemoglobin,last_hemoglobin,...,last_ph,min_po2,max_po2,mean_po2,last_po2,min_pco2,max_pco2,mean_pco2,last_pco2,target
1,22190,166880,290052,F,18,117.0,9.4,11.9,10.266667,9.5,...,7.44,208.0,221.0,213.333333,208.0,37.0,42.0,38.666667,37.0,0
2,7717,159770,260370,F,18,34.0,11.6,12.4,12.000000,11.6,...,5.00,158.0,274.0,216.000000,158.0,40.0,40.0,40.000000,40.0,0
3,69145,128969,213687,F,19,82.0,10.8,10.9,10.850000,10.8,...,7.55,89.0,112.0,100.666667,101.0,16.0,20.0,18.000000,20.0,0
5,42842,162017,267868,F,20,403.0,4.6,10.8,8.522222,4.6,...,7.23,42.0,188.0,89.125000,188.0,20.0,50.0,36.375000,37.0,0
6,88518,158955,224060,F,20,15.0,12.1,12.1,12.100000,12.1,...,7.26,43.0,43.0,43.000000,43.0,43.0,43.0,43.000000,43.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29268,26701,189710,242094,M,77,310.0,11.9,12.1,12.000000,12.1,...,8.00,127.0,162.0,144.500000,127.0,43.0,47.0,45.000000,43.0,0
29275,19609,155814,274163,M,84,27.0,11.5,11.5,11.500000,11.5,...,8.00,82.0,82.0,82.000000,82.0,62.0,62.0,62.000000,62.0,0
29277,28848,137939,219453,M,88,277.0,12.1,12.1,12.100000,12.1,...,7.47,89.0,106.0,97.500000,106.0,37.0,39.0,38.000000,39.0,0
29280,27515,123997,232528,M,89,93.0,12.9,12.9,12.900000,12.9,...,7.35,152.0,229.0,195.000000,219.0,26.0,39.0,30.750000,26.0,1


Summary of the filtered data

In [23]:
print(df.info())        # Data types and non-null counts
print(df.describe())    # Summary stats for numeric columns
print(df.describe(include='object')) 

<class 'pandas.core.frame.DataFrame'>
Index: 14899 entries, 1 to 29281
Data columns (total 59 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   subject_id           14899 non-null  int64  
 1   hadm_id              14899 non-null  int64  
 2   icustay_id           14899 non-null  int64  
 3   gender               14899 non-null  object 
 4   age                  14899 non-null  int64  
 5   icu_los_hours        14899 non-null  float64
 6   min_hemoglobin       14899 non-null  float64
 7   max_hemoglobin       14899 non-null  float64
 8   mean_hemoglobin      14899 non-null  float64
 9   last_hemoglobin      14899 non-null  float64
 10  min_hematocrit       14899 non-null  float64
 11  max_hematocrit       14899 non-null  float64
 12  mean_hematocrit      14899 non-null  float64
 13  last_hematocrit      14899 non-null  float64
 14  min_platelet count   14899 non-null  float64
 15  max_platelet count   14899 non-null  floa

In [24]:
df = df.drop('subject_id', axis=1)
df = df.drop('hadm_id', axis=1)
df = df.drop('icustay_id', axis=1)
df = df.drop('gender', axis=1)
df

Unnamed: 0,age,icu_los_hours,min_hemoglobin,max_hemoglobin,mean_hemoglobin,last_hemoglobin,min_hematocrit,max_hematocrit,mean_hematocrit,last_hematocrit,...,last_ph,min_po2,max_po2,mean_po2,last_po2,min_pco2,max_pco2,mean_pco2,last_pco2,target
1,18,117.0,9.4,11.9,10.266667,9.5,26.1,26.6,26.350000,26.1,...,7.44,208.0,221.0,213.333333,208.0,37.0,42.0,38.666667,37.0,0
2,18,34.0,11.6,12.4,12.000000,11.6,31.5,34.9,33.500000,34.1,...,5.00,158.0,274.0,216.000000,158.0,40.0,40.0,40.000000,40.0,0
3,19,82.0,10.8,10.9,10.850000,10.8,30.4,31.7,31.050000,30.4,...,7.55,89.0,112.0,100.666667,101.0,16.0,20.0,18.000000,20.0,0
5,20,403.0,4.6,10.8,8.522222,4.6,13.8,32.2,26.133333,13.8,...,7.23,42.0,188.0,89.125000,188.0,20.0,50.0,36.375000,37.0,0
6,20,15.0,12.1,12.1,12.100000,12.1,35.8,35.8,35.800000,35.8,...,7.26,43.0,43.0,43.000000,43.0,43.0,43.0,43.000000,43.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29268,77,310.0,11.9,12.1,12.000000,12.1,32.9,36.0,34.425000,32.9,...,8.00,127.0,162.0,144.500000,127.0,43.0,47.0,45.000000,43.0,0
29275,84,27.0,11.5,11.5,11.500000,11.5,36.8,36.8,36.800000,36.8,...,8.00,82.0,82.0,82.000000,82.0,62.0,62.0,62.000000,62.0,0
29277,88,277.0,12.1,12.1,12.100000,12.1,34.0,34.2,34.100000,34.0,...,7.47,89.0,106.0,97.500000,106.0,37.0,39.0,38.000000,39.0,0
29280,89,93.0,12.9,12.9,12.900000,12.9,41.8,42.1,41.950000,42.1,...,7.35,152.0,229.0,195.000000,219.0,26.0,39.0,30.750000,26.0,1


In [25]:
dataset = np.array(df)
y = np.array(df['target'])
X = np.array(df.drop('target', axis=1))
print(np.shape(dataset))
print(np.shape(X))
print(np.shape(y))

# Random shuffle and split 70-30 into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
print(f'% Readmissions in Train: {np.mean(y_train) * 100}')
print(f'% Readmissions in Test: {np.mean(y_test) * 100}')

(14899, 55)
(14899, 54)
(14899,)
% Readmissions in Train: 6.66410969412216
% Readmissions in Test: 7.0022371364653235


### XGBoost 

In [28]:
import numpy as np
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

RANDOM_STATE = 229
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)


def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "use_label_encoder": False,
        "random_state": RANDOM_STATE,
        "n_jobs": -1,

        # Search around your known good values
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 5),
        "max_leaves": trial.suggest_int("max_leaves", 2, 10),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 8),
        "n_estimators": trial.suggest_int("n_estimators", 400, 1000),
        "alpha": trial.suggest_float("alpha", 0.1, 1.0),
        "lambda": trial.suggest_float("lambda", 0.8, 1.5),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 1.5),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
    }

    aucs = []
    for train_idx, valid_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[valid_idx]
        y_tr, y_val = y_train[train_idx], y_train[valid_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        y_pred = model.predict_proba(X_val)[:, 1]
        aucs.append(roc_auc_score(y_val, y_pred))

    return np.mean(aucs)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest Parameters Found:")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

print("\nBest Cross-Validation AUC:", study.best_value)

best_params = study.best_params
best_params.update({
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "use_label_encoder": False,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
})

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

[I 2025-11-11 18:24:27,143] A new study created in memory with name: no-name-d4e9e242-aeb0-4e0a-a32b-75b0028f60e8


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-11 18:24:35,429] Trial 0 finished with value: 0.6585032460759072 and parameters: {'learning_rate': 0.01250331001543945, 'max_depth': 10, 'max_delta_step': 4, 'max_leaves': 4, 'min_child_weight': 6.456232181430629, 'n_estimators': 682, 'alpha': 0.4346791939075747, 'lambda': 1.2687132500980753, 'scale_pos_weight': 0.8541608132425591, 'subsample': 0.9897997300994384}. Best is trial 0 with value: 0.6585032460759072.
[I 2025-11-11 18:24:45,447] Trial 1 finished with value: 0.6447294852707957 and parameters: {'learning_rate': 0.039760682577662444, 'max_depth': 9, 'max_delta_step': 5, 'max_leaves': 4, 'min_child_weight': 2.37463831194187, 'n_estimators': 817, 'alpha': 0.3181256942236034, 'lambda': 1.25254872668324, 'scale_pos_weight': 1.3110181408002357, 'subsample': 0.9008305830079517}. Best is trial 0 with value: 0.6585032460759072.
[I 2025-11-11 18:24:57,675] Trial 2 finished with value: 0.6130202946228439 and parameters: {'learning_rate': 0.09027408151278808, 'max_depth': 6, 'm

In [29]:
y_proba_test = final_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_proba_test >= 0.5).astype(int)

test_auroc = roc_auc_score(y_test, y_proba_test)
print(f"\nFinal Test ROC AUC: {test_auroc:.4f}")


Final Test ROC AUC: 0.6390
