# Healthcare Readmission Prediction Pipeline

In [1]:

# Step 1: Import Libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import brier_score_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')


In [4]:

# Step 2: Load Dataset
# Read the raw encounter table and report its size before any cleaning.
df = pd.read_csv('diabetic_data.csv')
print("Shape of data:", df.shape)

# The file encodes missing values as the literal string '?'; convert those to
# NaN, then discard the listed columns (errors='ignore' tolerates columns that
# are already absent).
df = (
    df
    .replace('?', np.nan)
    .drop(
        columns=['weight', 'payer_code', 'medical_specialty', 'encounter_id', 'patient_nbr'],
        errors='ignore',
    )
)


Shape of data: (101766, 50)


In [5]:

# Step 3: Data Types and Target Setup
# Binary target: any readmission ('<30' or '>30') is the positive class.
TARGET_COL = 'readmitted'
TARGET_MAP = {'NO': 0, '>30': 1, '<30': 1}

# Derive y without mutating `df`, so this cell is idempotent: re-running it
# re-maps the original string labels instead of mapping already-mapped 0/1
# values to NaN.
y = df[TARGET_COL].map(TARGET_MAP)

# Fail loudly on labels outside TARGET_MAP rather than carrying NaN targets
# into the stratified split.
_unmapped = y.isna()
if _unmapped.any():
    _bad_labels = sorted(df.loc[_unmapped, TARGET_COL].astype(str).unique())
    raise ValueError(f"Unexpected values in '{TARGET_COL}': {_bad_labels}")
y = y.astype(int)

X = df.drop(columns=[TARGET_COL])

# Column lists describe the feature matrix only (the target is excluded).
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()


In [6]:

# Step 4: Normalise ID-code dtypes, then split the data
# These two columns hold integer ID codes (not time-based values); force a
# plain int dtype when they are present.
# NOTE(review): the codes look nominal rather than ordinal - confirm whether
# they should be one-hot encoded instead of scaled as numbers.
for id_col in ('admission_type_id', 'discharge_disposition_id'):
    if id_col in X.columns:
        X[id_col] = X[id_col].astype(int)

# Split dataset: 70% train, then the remaining 30% is halved into validation
# and test. Both splits are stratified on the target and share one seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Train:", X_train.shape, "Validation:", X_val.shape, "Test:", X_test.shape)


Train: (71236, 44) Validation: (15265, 44) Test: (15265, 44)


In [12]:
# ✅ Step 5 (Fixed): Dynamic Preprocessing Setup
# Build a ColumnTransformer from whichever column kinds are actually present.
# All classes used here come from the notebook's top-level import cell.

# Re-identify columns after cleaning: object dtype -> categorical,
# numpy numeric dtype -> numerical.
categorical_cols = [c for c in X.columns if X[c].dtype == 'object']
numerical_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]

print("Detected categorical columns:", len(categorical_cols))
print("Detected numerical columns:", len(numerical_cols))

# Defaults kept so both names always exist, even when a column kind is absent.
numeric_transformer = 'drop'
categorical_transformer = 'drop'

# Only register a transformer for a column kind that has at least one column.
transformers = []
if numerical_cols:
    # KNN-impute missing numeric values, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])
    transformers.append(('num', numeric_transformer, numerical_cols))
if categorical_cols:
    # Fill missing categories with the mode, then one-hot encode; categories
    # unseen during fit are ignored at transform time.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    transformers.append(('cat', categorical_transformer, categorical_cols))

# An empty transformer list would only fail later, far from the cause.
if not transformers:
    raise ValueError("No categorical or numerical feature columns detected in X.")

preprocessor = ColumnTransformer(transformers=transformers)
print("✅ Preprocessor successfully created with available columns.")


Detected categorical columns: 33
Detected numerical columns: 11
✅ Preprocessor successfully created with available columns.


In [None]:
# Candidate classifiers, keyed by display name. Every model is seeded so
# repeated runs give the same results.
models = {
    # n_jobs=-1 builds trees on all cores; with a fixed random_state the
    # fitted forest is the same as a single-threaded run.
    'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    # NOTE(review): probability=True makes SVC run an extra internal
    # cross-validation at fit time, and kernel SVC scales poorly with the
    # number of samples - expect this model to dominate the runtime.
    'SVM': SVC(probability=True, random_state=42)
}

In [None]:
# --- Step 6: Train, Calibrate (robust to sklearn version) ---
# All classes used here come from the notebook's top-level import cell.


def _calibrate_prefit(fitted_clf, X_cal, y_cal, method='isotonic'):
    """Fit a probability calibrator on top of an already-fitted classifier.

    Parameters
    ----------
    fitted_clf : fitted scikit-learn compatible classifier.
    X_cal, y_cal : held-out calibration features and labels.
    method : calibration method passed to CalibratedClassifierCV.

    Returns
    -------
    The fitted CalibratedClassifierCV.

    Picks the API available in the installed scikit-learn:
    * >= 1.6: wrap the classifier in FrozenEstimator (cv='prefit' is deprecated);
    * older releases: cv='prefit' with `estimator=`;
    * oldest releases: cv='prefit' with `base_estimator=`.
    """
    try:
        from sklearn.frozen import FrozenEstimator
    except ImportError:
        FrozenEstimator = None

    if FrozenEstimator is not None:
        calibrator = CalibratedClassifierCV(estimator=FrozenEstimator(fitted_clf), method=method)
    else:
        try:
            calibrator = CalibratedClassifierCV(estimator=fitted_clf, cv='prefit', method=method)
        except TypeError:
            # fallback for older sklearn releases
            calibrator = CalibratedClassifierCV(base_estimator=fitted_clf, cv='prefit', method=method)
    return calibrator.fit(X_cal, y_cal)


# Make sure labels are integer
y_train = y_train.astype(int)
y_val   = y_val.astype(int)
y_test  = y_test.astype(int)

# Fit the preprocessor on training data only and transform all sets
X_train_tr = preprocessor.fit_transform(X_train)
X_val_tr   = preprocessor.transform(X_val)
X_test_tr  = preprocessor.transform(X_test)

results = {}
calibrated_models = {}  # name -> fitted calibrator, kept for later inspection

for name, model in models.items():
    print(f"\nTraining {name}...")
    # clone the model to ensure a fresh unfitted estimator each loop
    clf = clone(model)

    # The calibrator works from decision_function when the estimator has one,
    # so an estimator's own `probability=True` fit (e.g. SVC's internal
    # cross-validated Platt scaling) is wasted work here. Disable it on the clone.
    if hasattr(clf, 'decision_function') and 'probability' in clf.get_params():
        clf.set_params(probability=False)

    # fit base model on transformed training data
    clf.fit(X_train_tr, y_train)

    # Calibrate the fitted model on the validation set
    calibrator = _calibrate_prefit(clf, X_val_tr, y_val, method='isotonic')
    calibrated_models[name] = calibrator

    # Evaluate on test set
    y_proba = calibrator.predict_proba(X_test_tr)[:, 1]
    y_pred  = (y_proba > 0.5).astype(int)

    brier = brier_score_loss(y_test, y_proba)
    auc   = roc_auc_score(y_test, y_proba)
    acc   = accuracy_score(y_test, y_pred)

    results[name] = {'Brier': brier, 'AUC': auc, 'Accuracy': acc}
    print(f"{name}: Brier={brier:.4f}, AUC={auc:.4f}, Accuracy={acc:.4f}")

# summary: one row per model, one column per metric
results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n", results_df)



Training RandomForest...
RandomForest: Brier=0.2205, AUC=0.6925, Accuracy=0.6437

Training XGBoost...
XGBoost: Brier=0.2163, AUC=0.7049, Accuracy=0.6474

Training SVM...


In [None]:

# Step 7: Calibration Curves
# Reliability curves of the raw models (fitted on the training split, without
# the extra calibration step), evaluated on the test split.

# The preprocessing does not depend on the model, so fit/transform once
# instead of once per model.
X_train_cc = preprocessor.fit_transform(X_train)
X_test_cc = preprocessor.transform(X_test)

fig, ax = plt.subplots(figsize=(7, 5))
for name, model in models.items():
    # Fit a clone so the estimators stored in `models` stay unfitted.
    fitted = clone(model).fit(X_train_cc, y_train)
    y_prob = fitted.predict_proba(X_test_cc)[:, 1]
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    ax.plot(prob_pred, prob_true, marker='o', label=name)

# Diagonal = perfect calibration.
ax.plot([0, 1], [0, 1], '--', color='gray')
ax.set_title("Calibration Curves")
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("True Probability")
ax.legend()
plt.show()


In [None]:

# Step 8: Feature Importance (Random Forest)
# Fit preprocessing + a seeded random forest as one pipeline so the importances
# can be matched to the transformer's output feature names.
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])
rf_pipe.fit(X_train, y_train)
rf_model = rf_pipe.named_steps['model']

if hasattr(rf_model, 'feature_importances_'):
    feature_names = rf_pipe.named_steps['preprocessor'].get_feature_names_out()
    importances = rf_model.feature_importances_

    # Rank all features by importance and keep the 20 largest.
    ranked = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    feat_imp = ranked.iloc[:20]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=feat_imp.values, y=feat_imp.index)
    plt.title("Top 20 Important Features")
    plt.show()


In [None]:

# Step 9: Clinical Utility Summary
print("=== Clinical Utility Summary ===")
print("The best calibrated model provides reliable probability estimates for hospital readmission.")

# Back the statement above with the measured metrics when the comparison
# table exists. Lower Brier score = better calibrated probabilities.
# The lookup is guarded so this cell still runs if the comparison table was
# never produced.
_metrics = globals().get('results_df')
if _metrics is not None and len(_metrics) > 0 and 'Brier' in _metrics.columns:
    _best_name = _metrics['Brier'].idxmin()
    _best = _metrics.loc[_best_name]
    print(
        f"Lowest test-set Brier score: {_best_name} "
        f"(Brier={_best['Brier']:.4f}, AUC={_best['AUC']:.4f}, Accuracy={_best['Accuracy']:.4f})"
    )

print("Top predictors may include medication changes, lab procedures, and number of inpatient visits.")
print("These can guide follow-up scheduling and preventive care.")
