# 🏥 Predicting Hospital Readmission Rates
An end-to-end machine learning + deep learning pipeline using synthetic hospital data.
Models used: Logistic Regression, Random Forest, XGBoost, MLP (TensorFlow).
Includes: Data generation, preprocessing, training, evaluation, ROC comparison.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Generate synthetic data
# The column order below fixes the order of the RNG draws, so keep it stable
# if the generated values need to stay reproducible under seed 42.
np.random.seed(42)
n_samples = 1000
data = pd.DataFrame({
    'age': np.random.randint(18, 90, n_samples),
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'length_of_stay': np.random.randint(1, 30, n_samples),
    'num_prev_admissions': np.random.randint(0, 10, n_samples),
    'comorbidity_score': np.random.normal(loc=2, scale=1, size=n_samples).round(1),
    'has_diabetes': np.random.choice([0, 1], n_samples),
    'has_hypertension': np.random.choice([0, 1], n_samples),
    'discharged_to_home': np.random.choice([0, 1], n_samples),
    # NOTE: the label is sampled with fixed probabilities (70% / 30%) and does
    # not depend on any feature column, so no model can learn real signal here.
    'readmitted_within_30_days': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
})
# Encode the two gender strings as integers (LabelEncoder sorts the classes,
# so 'Female' -> 0 and 'Male' -> 1).
data['gender'] = LabelEncoder().fit_transform(data['gender'])

# Split data
X = data.drop('readmitted_within_30_days', axis=1)
y = data['readmitted_within_30_days']
# Split BEFORE scaling: fitting the scaler on the full dataset would let
# test-set statistics leak into the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Learn mean/std from the training rows only, then apply them to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that still expects the fully scaled matrix; it is
# scaled with the training-set statistics.
X_scaled = scaler.transform(X)


In [None]:
# Train classical ML models
# Each model is fitted on the training split, scored on the test split, and its
# ROC curve is stored in `roc_data` as (fpr, tpr, auc) for the comparison plot.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # NOTE(review): `use_label_encoder` appears to be deprecated in newer
    # XGBoost releases -- confirm against the installed version before removing.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}
roc_data = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    # Probability of the positive class drives the ROC curve; hard labels are
    # computed once and reused for both the accuracy and the report.
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_data[name] = (fpr, tpr, auc(fpr, tpr))

    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    # zero_division=0 reports 0 (without a warning) when a model never
    # predicts one of the classes.
    print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Train MLP with TensorFlow
# Seed TensorFlow's global RNG so random ops such as weight initialisation and
# dropout are more repeatable across runs (matches the seed 42 used elsewhere).
tf.random.set_seed(42)

# Two hidden ReLU layers with dropout, sigmoid output for binary classification.
mlp_model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen; 20% of the training data is held out for validation.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = mlp_model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stop], verbose=0)

# Evaluate and store ROC
# predict() returns shape (n, 1); ravel() flattens it for the sklearn metrics.
y_proba_dl = mlp_model.predict(X_test, verbose=0).ravel()
fpr_dl, tpr_dl, _ = roc_curve(y_test, y_proba_dl)
roc_data["MLP (Deep Learning)"] = (fpr_dl, tpr_dl, auc(fpr_dl, tpr_dl))

# Threshold the probabilities once and reuse the labels for both metrics,
# instead of running a second forward pass through evaluate().
y_pred_dl = (y_proba_dl > 0.5).astype('int')
print(f"\nMLP Accuracy: {accuracy_score(y_test, y_pred_dl):.2f}")
# zero_division=0 reports 0 (without a warning) when a class is never predicted.
print(classification_report(y_test, y_pred_dl, zero_division=0))


In [None]:
# ROC Curve Comparison
# Draw one curve per entry in `roc_data` (name -> (fpr, tpr, auc)) on a shared
# axis, plus the diagonal that marks a random classifier.
fig, ax = plt.subplots(figsize=(10, 6))

for name, (fpr, tpr, roc_auc) in roc_data.items():
    curve_label = f'{name} (AUC = {roc_auc:.2f})'
    ax.plot(fpr, tpr, label=curve_label)

# Chance line for reference.
ax.plot([0, 1], [0, 1], 'k--')

ax.set_title('ROC Curve Comparison')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()


### 📊 Summary
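- XGBoost and MLP performed best in terms of ROC AUC, but the differences are not meaningful: the synthetic target is sampled independently of every feature, so all models are expected to score close to AUC = 0.5.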
- The MLP can capture nonlinear relationships (none exist in this synthetic data) but took longer to train.
- Logistic Regression gave fast, interpretable results.
- Future work: use real patient data (e.g. MIMIC-III), hyperparameter tuning, feature engineering.