In [53]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, roc_curve

from catboost import CatBoostClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
import shap

## Loading the dataset, pre-processing, and analysing the data

In [54]:
cohort_data = pd.read_csv('../cohort_data_new.csv')
cohort_data

Unnamed: 0,icustay_id,anion_gap_mean,anion_gap_sd,anion_gap_min,anion_gap_max,bicarbonate_mean,bicarbonate_sd,bicarbonate_min,bicarbonate_max,calcium_total_mean,...,urea_nitrogen_min,urea_nitrogen_max,white_blood_cells_mean,white_blood_cells_sd,white_blood_cells_min,white_blood_cells_max,age,gender,icu_los_hours,target
0,200003,13.375000,3.583195,9.0,21.0,25.250000,3.105295,18.0,28.0,7.771429,...,10.0,21.0,26.471429,13.176711,13.2,43.9,48,M,141,0
1,200007,15.500000,2.121320,14.0,17.0,23.000000,1.414214,22.0,24.0,8.900000,...,8.0,10.0,10.300000,1.272792,9.4,11.2,44,M,30,0
2,200009,9.500000,2.121320,8.0,11.0,23.333333,2.081666,21.0,25.0,8.000000,...,15.0,21.0,12.471429,1.471637,10.5,14.3,47,F,51,0
3,200012,,,,,,,,,,...,,,4.900000,,4.9,4.9,33,F,10,0
4,200014,10.000000,1.732051,9.0,12.0,24.000000,1.000000,23.0,25.0,7.733333,...,21.0,24.0,13.233333,2.203028,10.7,14.7,85,M,41,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30484,299992,15.375000,2.856153,11.0,25.0,23.125000,2.609556,15.0,26.0,8.307143,...,8.0,23.0,14.134783,3.781727,8.1,22.1,41,M,499,0
30485,299993,9.400000,1.341641,8.0,11.0,29.600000,2.073644,26.0,31.0,8.000000,...,12.0,15.0,12.600000,0.605530,12.0,13.3,26,M,67,0
30486,299994,16.157895,2.477973,13.0,24.0,21.631579,3.451417,17.0,31.0,8.100000,...,28.0,63.0,10.076190,2.642329,5.3,14.5,74,F,152,1
30487,299998,11.500000,1.732051,10.0,14.0,23.500000,1.290994,22.0,25.0,8.800000,...,20.0,22.0,9.900000,1.210372,7.9,11.0,87,M,46,1


In [55]:
print(f"Dataset shape: {cohort_data.shape}")
print(f"Readmission rate: {cohort_data['target'].mean() * 100:.2f}%")

Dataset shape: (30489, 93)
Readmission rate: 10.74%


In [56]:
lab_cols = [
    'anion_gap_mean', 'anion_gap_min', 'anion_gap_max', 'anion_gap_sd',
    'bicarbonate_mean', 'bicarbonate_min', 'bicarbonate_max', 'bicarbonate_sd',
    'calcium_total_mean', 'calcium_total_min', 'calcium_total_max', 'calcium_total_sd',
    'chloride_mean', 'chloride_min', 'chloride_max', 'chloride_sd',
    'creatinine_mean', 'creatinine_min', 'creatinine_max', 'creatinine_sd',
    'glucose_mean', 'glucose_min', 'glucose_max', 'glucose_sd',
    'hematocrit_mean', 'hematocrit_min', 'hematocrit_max', 'hematocrit_sd',
    'hemoglobin_mean', 'hemoglobin_min', 'hemoglobin_max', 'hemoglobin_sd',
    'mchc_mean', 'mchc_min', 'mchc_max', 'mchc_sd',
    'mcv_mean', 'mcv_min', 'mcv_max', 'mcv_sd',
    'magnesium_mean', 'magnesium_min', 'magnesium_max', 'magnesium_sd',
    'pt_mean', 'pt_min', 'pt_max', 'pt_sd',
    'phosphate_mean', 'phosphate_min', 'phosphate_max', 'phosphate_sd',
    'platelet_count_mean', 'platelet_count_min', 'platelet_count_max', 'platelet_count_sd',
    'potassium_mean', 'potassium_min', 'potassium_max', 'potassium_sd',
    'rdw_mean', 'rdw_min', 'rdw_max', 'rdw_sd',
    'red_blood_cells_mean', 'red_blood_cells_min', 'red_blood_cells_max', 'red_blood_cells_sd',
    'sodium_mean', 'sodium_min', 'sodium_max', 'sodium_sd',
    'urea_nitrogen_mean', 'urea_nitrogen_min', 'urea_nitrogen_max', 'urea_nitrogen_sd',
    'white_blood_cells_mean', 'white_blood_cells_min', 'white_blood_cells_max', 'white_blood_cells_sd',
    'age', 'icu_los_hours'
]

REmove the ICUstay_id and the gender

In [57]:
drop_cols = [c for c in cohort_data.columns if 'icustay_id' in c.lower() or 'gender' in c.lower()]
df = cohort_data.drop(columns=['icustay_id', 'gender'], errors='ignore')

X = df.drop(columns=['target'])
y = df['target']

Creating the final datasets

In [58]:
# test-train split
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)

# train-validation split
X_train, X_eval, y_train, y_eval = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=7, stratify=y_train_full)

print(f'% Readmissions in train: {np.mean(y_train) * 100}')
print(f'% Readmissions in test: {np.mean(y_test) * 100}')
print(f'% Readmissions in validation: {np.mean(y_eval) * 100}')

% Readmissions in train: 10.74369101010777
% Readmissions in test: 10.746692904777523
% Readmissions in validation: 10.74496329845385


XgBoost Model training

In [59]:
# this was found earlier!
best_params = {
        "learning_rate": 0.007016279495713262,
        "max_depth": 18,
        "min_child_weight": 6,
        "gamma": 0.8330494433115034,
        "subsample": 0.5173581010881821,
        "colsample_bytree": 0.8198100446974683,
        "reg_alpha": 0.25804770055836257,
        "reg_lambda": 0.8758805054640171,
        "n_estimators": 1829,
        "max_delta_step": 4,
        "scale_pos_weight": (y_train_full.value_counts()[0] / y_train_full.value_counts()[1]),
        "random_state": 7,
        "eval_metric": "auc",
        "tree_method": "hist",
        "use_label_encoder": False
    }
xgb_model = xgb.XGBClassifier(**best_params)
xgb_model.fit(X_train_full, y_train_full)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8198100446974683
,device,
,early_stopping_rounds,
,enable_categorical,False


CatBoost training

In [61]:
# again, these parameters were found earlier!
cat_model = CatBoostClassifier(
    iterations=2108,
    learning_rate=0.007197701469778457,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=7,
    l2_leaf_reg=1.5001408865424133,
    subsample=0.8,
    verbose=200,
    bagging_temperature= 1.3142064644354503, 
    random_strength= 1.359510180945487, 
    border_count= 80
)

# --- Train ---
cat_model.fit(X_train, y_train, eval_set=(X_eval, y_eval))

0:	test: 0.5351140	best: 0.5351140 (0)	total: 30.7ms	remaining: 1m 4s
200:	test: 0.6835070	best: 0.6835070 (200)	total: 4.99s	remaining: 47.3s
400:	test: 0.6925095	best: 0.6925095 (400)	total: 10.1s	remaining: 42.9s
600:	test: 0.6959786	best: 0.6959786 (600)	total: 15.5s	remaining: 39s
800:	test: 0.6989890	best: 0.6990185 (799)	total: 20.4s	remaining: 33.3s
1000:	test: 0.7010260	best: 0.7010600 (998)	total: 25.2s	remaining: 27.9s
1200:	test: 0.7027030	best: 0.7027030 (1200)	total: 30s	remaining: 22.7s
1400:	test: 0.7045858	best: 0.7045858 (1400)	total: 34.8s	remaining: 17.6s
1600:	test: 0.7057196	best: 0.7057196 (1600)	total: 39.7s	remaining: 12.6s
1800:	test: 0.7074747	best: 0.7075177 (1798)	total: 44.5s	remaining: 7.59s
2000:	test: 0.7085137	best: 0.7085866 (1966)	total: 49.4s	remaining: 2.64s
2107:	test: 0.7087822	best: 0.7088692 (2099)	total: 52.2s	remaining: 0us

bestTest = 0.7088692038
bestIteration = 2099

Shrink model to first 2100 iterations.


<catboost.core.CatBoostClassifier at 0x2da1dc2fe80>

In [62]:
# --- Evaluate ---
# inference of probs from the XGBoost
y_proba_xgb_train = xgb_model.predict_proba(X_train_full)[:, 1]
y_proba_cat_train = cat_model.predict_proba(X_train_full)[:, 1]
y_proba_xgb_test = xgb_model.predict_proba(X_test)[:, 1]
y_proba_cat_test = cat_model.predict_proba(X_test)[:, 1]

stack_train = np.column_stack((y_proba_cat_train, y_proba_xgb_train))
stack_test = np.column_stack((
    y_proba_cat_test,
    y_proba_xgb_test
))

print("Shape of stacking train matrix:", stack_train.shape)

Shape of stacking train matrix: (21342, 2)


In [63]:
meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(stack_train, y_train_full)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [64]:
y_proba_stack = meta_model.predict_proba(stack_test)[:, 1]

auc_xgb = roc_auc_score(y_test, y_proba_xgb_test)
auc_cat = roc_auc_score(y_test, y_proba_cat_test)
auc_stack = roc_auc_score(y_test, y_proba_stack)

print(f"AUC XGBoost: {auc_xgb:.4f}")
print(f"AUC CatBoost: {auc_cat:.4f}")
print(f"AUC Stacked Model: {auc_stack:.4f}")

AUC XGBoost: 0.7358
AUC CatBoost: 0.7336
AUC Stacked Model: 0.7363
