In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from holoviews.plotting.bokeh.styles import alpha
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier


import warnings
warnings.filterwarnings('ignore')


In [2]:
# importing csv
df = pd.read_csv('extractedMimic.csv')


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4559 entries, 0 to 4558
Columns: 106 entries, icustay_id to crystalloid_bolus
dtypes: float64(57), int64(39), object(10)
memory usage: 3.7+ MB


In [4]:
df.drop(df[df['age'] < 18].index, inplace=True)

df.describe()

Unnamed: 0,icustay_id,hadm_id,suspected_infection_time_poe_days,positiveculture_poe,blood_culture_positive,age,is_male,race_white,race_black,race_hispanic,...,glucose_min1,glucose_max1,glucose_mean,rrt,subject_id,hadm_id.1,icustay_id.1,urineoutput,colloid_bolus,crystalloid_bolus
count,4555.0,4555.0,4555.0,4555.0,4555.0,4555.0,4555.0,4555.0,4555.0,4555.0,...,4525.0,4525.0,4525.0,4555.0,4555.0,4555.0,4555.0,4555.0,508.0,3361.0
mean,250637.474863,149922.364874,0.084101,0.136334,0.381339,65.173619,0.565971,0.718771,0.086279,0.032931,...,111.232044,411.037348,177.261026,0.043469,68174.064105,149922.364874,250637.474863,1842.618441,382.694882,645.671229
std,28759.579721,28710.589841,0.290748,0.34318,0.485769,17.622075,0.495683,0.449649,0.280806,0.178475,...,36.773707,14863.345644,2123.773411,0.203932,18470.812604,28710.589841,28759.579721,1535.550605,134.934798,370.024064
min,200075.0,100003.0,-0.991076,0.0,0.0,18.0209,0.0,0.0,0.0,0.0,...,12.0,57.0,52.444444,0.0,165.0,100003.0,200075.0,0.0,150.0,250.0
25%,225575.5,125404.5,-0.075677,0.0,0.0,53.78485,0.0,0.0,0.0,0.0,...,89.0,130.0,112.888889,0.0,53134.0,125404.5,225575.5,897.5,250.0,500.0
50%,250984.0,149667.0,0.034965,0.0,0.0,66.5915,1.0,1.0,0.0,0.0,...,106.0,166.0,134.0,0.0,68391.0,149667.0,250984.0,1560.0,500.0,500.0
75%,275436.0,175042.5,0.157309,0.0,1.0,79.5404,1.0,1.0,0.0,0.0,...,127.0,217.0,165.0,0.0,83771.5,175042.5,275436.0,2460.0,500.0,1000.0
max,299998.0,199962.0,0.995139,1.0,1.0,91.4,1.0,1.0,1.0,1.0,...,480.0,999999.0,142966.8571,1.0,99982.0,199962.0,299998.0,50515.0,1000.0,11000.0


In [5]:
X = df[['urineoutput', 'lactate_min','bun_mean','sysbp_min', 'metastatic_cancer', 'inr_max', 'age', 'sodium_max', 'aniongap_max', 'creatinine_min', 'spo2_mean']]

y = df['thirtyday_expire_flag']

In [6]:
X.fillna(X.median(), inplace=True)

In [7]:
X.isnull().sum()

urineoutput          0
lactate_min          0
bun_mean             0
sysbp_min            0
metastatic_cancer    0
inr_max              0
age                  0
sodium_max           0
aniongap_max         0
creatinine_min       0
spo2_mean            0
dtype: int64

In [8]:
X.shape

(4555, 11)

In [9]:
# Step 3: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes of the resulting datasets
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (3644, 11)
Testing set shape: (911, 11)


In [10]:
# Step 1: Train the XGBoost model with hyperparameter tuning
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='logloss', 
    booster='gbtree', 
    objective='binary:logistic', 
    n_estimators=90,      # Increased number of trees for better learning
    max_depth=6,           # Control the depth of trees
    learning_rate=0.05,    # Reduced learning rate
    subsample=0.8,         # Control overfitting by using 80% of data for each tree
    colsample_bytree=0.8,  # Feature subsampling
    gamma=0.05,
)
xgb_model.fit(X_train, y_train)


In [11]:
# Step 2: Make predictions on the test set
y_pred = xgb_model.predict(X_test)  # Predicted labels (0 or 1)
y_probs = xgb_model.predict_proba(X_test)[:, 1]  # Predicted probabilities for class 1


In [12]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Assuming y_test, y_pred, and y_probs are already defined
# y_test: True class labels
# y_pred: Predicted class labels
# y_probs: Predicted probabilities (for AUC)

# Step 3: Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)  # Accuracy
auc = roc_auc_score(y_test, y_probs)  # AUC
precision = precision_score(y_test, y_pred, average='binary')  # Precision (binary classification)
recall = recall_score(y_test, y_pred, average='binary')  # Recall (binary classification)
conf_matrix = confusion_matrix(y_test, y_pred)  # Confusion matrix
class_report = classification_report(y_test, y_pred)  # Classification report

# Print evaluation metrics
print(f"Model Accuracy: {accuracy * 100:.5f}%")
print(f"Model Precision: {precision * 100:.5f}%")
print(f"Model Recall: {recall * 100:.5f}%")
print(f"Model AUC: {auc * 100:.5f}%")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Model Accuracy: 84.74204%
Model Precision: 76.54321%
Model Recall: 34.06593%
Model AUC: 83.34012%
Confusion Matrix:
[[710  19]
 [120  62]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       729
           1       0.77      0.34      0.47       182

    accuracy                           0.85       911
   macro avg       0.81      0.66      0.69       911
weighted avg       0.84      0.85      0.82       911



In [14]:
import joblib
joblib.dump(xgb_model, 'model.pkl')  # Save as 'model.pkl'


['model.pkl']