# Model Training: Hospital Readmission Prediction
## MSDS692 - Data Science Practicum
### Sai Teja Lakkapally

This notebook focuses on training and evaluating machine learning models for predicting 30-day hospital readmissions.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from sklearn.calibration import calibration_curve

# Set style
# NOTE(review): the 'seaborn-v0_8' style name is only recognised by matplotlib >= 3.6
# (older releases call it 'seaborn') — confirm the environment's matplotlib version.
plt.style.use('seaborn-v0_8')
# Applied after the style sheet; presumably so the "husl" palette, rather than the
# style sheet's own colours, is what later plots pick up — keep this order.
sns.set_palette("husl")
# Allow up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

print("Libraries imported successfully!")

In [None]:
# Import project modules
import sys
sys.path.append('../src')

from etl import DataETL
from features import FeatureEngineer
from model import ReadmissionModel

## 1. Data Preparation and Feature Engineering

In [None]:
# Load and prepare data
# Runs the project's ETL pipeline (DataETL from ../src/etl) and keeps its result in
# `data`, which the feature-engineering cell consumes next.
print("Step 1: Loading and preparing data...")
etl = DataETL()
# NOTE(review): run_pipeline() is opaque from this notebook; the only thing relied on
# here is that its return value exposes `.shape` (presumably a pandas DataFrame).
data = etl.run_pipeline()

print(f"Original dataset shape: {data.shape}")

In [None]:
# Feature engineering
# Turns the ETL output into a feature matrix `X`, a target `y`, and the list of
# feature names. All three are notebook-level names reused by later cells.
print("Step 2: Feature engineering...")
feature_engineer = FeatureEngineer()
X, y, feature_names = feature_engineer.prepare_features(data)

# Sanity-check output: shapes, feature count, and the class proportions of the target.
# `y.value_counts(...)` implies `y` is a pandas Series (or similar) — confirm in features.py.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

In [None]:
# Handle class imbalance
# Resamples the full feature matrix / target through the project's FeatureEngineer,
# requesting the 'smote' method, then reports the resulting shape and class proportions.
# NOTE(review): the resampling here runs on ALL rows, before any train/test split.
# If a hold-out set is later carved out of X_balanced / y_balanced it will contain
# resampled rows, which can inflate test metrics — verify how these outputs are used.
print("Step 3: Handling class imbalance...")
X_balanced, y_balanced = feature_engineer.handle_imbalance(X, y, method='smote')

# y_balanced is wrapped in pd.Series because it may come back as a plain array.
print(f"After balancing - Features shape: {X_balanced.shape}")
print(f"After balancing - Target distribution:\n{pd.Series(y_balanced).value_counts(normalize=True)}")

In [None]:
# Train-test split
print("Step 4: Train-test split...")

# Split the ORIGINAL (un-resampled) data. Splitting the already-balanced arrays
# would put resampled rows into the test set, and rows derived from test-set
# records into the training set, so every test metric would be optimistically
# biased (data leakage). Stratifying on `y` keeps the true readmission rate in
# both partitions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Balance the training portion only; the test set stays untouched so that it
# reflects the class distribution the model will face in practice.
X_train, y_train = feature_engineer.handle_imbalance(
    X_train_raw, y_train_raw, method='smote'
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training target distribution:\n{pd.Series(y_train).value_counts(normalize=True)}")
print(f"Test target distribution:\n{pd.Series(y_test).value_counts(normalize=True)}")

## 2. Baseline Model Training

In [None]:
# Initialize model trainer
# A single ReadmissionModel instance (from ../src/model) is reused by every later
# cell: training, evaluation, plotting, and saving all go through `model_trainer`.
model_trainer = ReadmissionModel(random_state=42)

# Train baseline models
print("Training Baseline Models...")
print("=" * 50)

# Requests 5 cross-validation folds on the training split. The per-model CV scores
# are read back from `model_trainer.results` in the next cell.
model_trainer.train_baseline_models(X_train, y_train, cv_folds=5)

In [None]:
# Display baseline model results
print("Baseline Model Performance (Cross-Validation):")
print("=" * 50)

# Only these entries of `model_trainer.results` count as baselines for this table.
baseline_model_keys = ['logistic_regression', 'random_forest']

# One formatted row per baseline model, in the order the trainer stored them.
baseline_results = [
    {
        'Model': name.replace('_', ' ').title(),
        'ROC-AUC Mean': f"{cv_scores['cv_roc_auc_mean']:.4f}",
        'ROC-AUC Std': f"{cv_scores['cv_roc_auc_std']:.4f}",
        'PR-AUC Mean': f"{cv_scores['cv_ap_mean']:.4f}",
        'PR-AUC Std': f"{cv_scores['cv_ap_std']:.4f}",
    }
    for name, cv_scores in model_trainer.results.items()
    if name in baseline_model_keys
]

baseline_df = pd.DataFrame(baseline_results)
print(baseline_df.to_string(index=False))

## 3. Advanced Model Training

In [None]:
# Train advanced models
print("Training Advanced Models...")
print("=" * 50)

# Same training split and fold count as the baseline run, so the cross-validation
# numbers in `model_trainer.results` are comparable across all models.
model_trainer.train_advanced_models(X_train, y_train, cv_folds=5)

In [None]:
# Display all model results
print("All Model Performance (Cross-Validation):")
print("=" * 50)

# Every model the trainer has recorded so far, formatted to four decimals.
all_results = [
    {
        'Model': name.replace('_', ' ').title(),
        'ROC-AUC Mean': f"{cv_scores['cv_roc_auc_mean']:.4f}",
        'ROC-AUC Std': f"{cv_scores['cv_roc_auc_std']:.4f}",
        'PR-AUC Mean': f"{cv_scores['cv_ap_mean']:.4f}",
        'PR-AUC Std': f"{cv_scores['cv_ap_std']:.4f}",
    }
    for name, cv_scores in model_trainer.results.items()
]

all_models_df = pd.DataFrame(all_results)
print(all_models_df.to_string(index=False))

## 4. Model Evaluation on Test Set

In [None]:
# Evaluate all models on test set
print("Evaluating Models on Test Set...")
print("=" * 50)

# `test_results` is keyed by model name; later cells read the per-model entries
# 'roc_auc', 'pr_auc', 'y_pred' and 'y_pred_proba' from it.
test_results = model_trainer.evaluate_models(X_test, y_test)

In [None]:
# Display test results
print("Test Set Performance:")
print("=" * 50)

# One row per evaluated model with its held-out ROC-AUC and PR-AUC.
test_performance = [
    {
        'Model': name.replace('_', ' ').title(),
        'ROC-AUC': f"{scores['roc_auc']:.4f}",
        'PR-AUC': f"{scores['pr_auc']:.4f}",
    }
    for name, scores in test_results.items()
]

test_df = pd.DataFrame(test_performance)
print(test_df.to_string(index=False))

In [None]:
# Visualize model comparison
# Delegates the plotting to the trainer, passing the per-model test-set results
# produced by evaluate_models().
model_trainer.plot_model_comparison(test_results)

## 5. Detailed Model Analysis

In [None]:
# Best model analysis
# The trainer exposes the name of the model it considers best; pull that model's
# test-set entry so the headline metrics and predictions can be reported.
best_model_name = model_trainer.best_model_name
best_model_results = test_results[best_model_name]

best_model_title = best_model_name.replace('_', ' ').title()
separator = "=" * 50

print(f"BEST MODEL: {best_model_title}")
print(separator)
print(f"ROC-AUC: {best_model_results['roc_auc']:.4f}")
print(f"PR-AUC: {best_model_results['pr_auc']:.4f}")

# Classification report for best model (hard predictions vs. true test labels)
y_pred_best = best_model_results['y_pred']
class_labels = ['Not Readmitted', 'Readmitted']
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, target_names=class_labels))

In [None]:
# Confusion matrix for best model
cm = confusion_matrix(y_test, y_pred_best)

# Both axes use the same class names: rows are true labels, columns are predictions.
class_labels = ['Not Readmitted', 'Readmitted']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name.replace("_", " ").title()}', fontweight='bold')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# ROC and PR curves for all models
# Imported once at the top of the cell instead of on every loop iteration.
from sklearn.metrics import roc_curve, precision_recall_curve

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One pass per model: the same predicted probabilities feed both panels, so the
# ROC curve (left) and the precision-recall curve (right) are drawn together.
for model_name, results in test_results.items():
    y_pred_proba = results['y_pred_proba']
    display_name = model_name.replace("_", " ").title()

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    ax1.plot(fpr, tpr, label=f'{display_name} (AUC = {results["roc_auc"]:.3f})', linewidth=2)

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    ax2.plot(recall, precision, label=f'{display_name} (AP = {results["pr_auc"]:.3f})', linewidth=2)

# ROC panel: the diagonal is the reference line for a random classifier.
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random Classifier')
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curves - All Models', fontweight='bold')
ax1.legend()
ax1.grid(alpha=0.3)

# PR panel
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision-Recall Curves - All Models', fontweight='bold')
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Display feature importance for tree-based models
print("Top Features by Importance:")
print("=" * 50)

for model_name in ['random_forest', 'xgboost', 'lightgbm']:
    # Skip models for which the trainer recorded no importance table.
    if model_name not in model_trainer.feature_importance:
        continue

    importance_df = model_trainer.feature_importance[model_name]

    print(f"\n{model_name.replace('_', ' ').title()} - Top 10 Features:")
    print("-" * 40)

    # First ten rows as stored by the trainer
    # (presumably already sorted by importance, descending — confirm in model.py).
    top_features = importance_df.head(10)
    for feature, importance in zip(top_features['feature'], top_features['importance']):
        print(f"{feature}: {importance:.4f}")

In [None]:
# Visualize feature importance for best tree-based model
# Only drawn when the trainer recorded an importance table for the best model.
if model_trainer.best_model_name in model_trainer.feature_importance:
    importance_df = model_trainer.feature_importance[model_trainer.best_model_name]
    top_20 = importance_df.head(20)

    plt.figure(figsize=(12, 8))
    bars = plt.barh(top_20['feature'], top_20['importance'], color='skyblue')
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Feature Importance - {model_trainer.best_model_name.replace("_", " ").title()}',
              fontweight='bold')
    # barh draws the first row at the bottom; invert so the first row is on top.
    plt.gca().invert_yaxis()

    # Add value labels
    # (this line had lost its '#', which made the whole cell a syntax error)
    for bar in bars:
        width = bar.get_width()
        # Small x-offset keeps the text from touching the end of the bar.
        plt.text(width + 0.001, bar.get_y() + bar.get_height()/2,
                 f'{width:.4f}', ha='left', va='center')

    plt.tight_layout()
    plt.show()
else:
    # Make the skip explicit instead of producing no output at all.
    print(f"No feature importances available for "
          f"{model_trainer.best_model_name.replace('_', ' ').title()}; skipping plot.")

## 7. Model Calibration

In [None]:
# Calibration curves
fig, ax = plt.subplots(figsize=(10, 8))

# One reliability curve per model, using 10 probability bins.
for model_name, results in test_results.items():
    prob_true, prob_pred = calibration_curve(y_test, results['y_pred_proba'], n_bins=10)
    ax.plot(prob_pred, prob_true, 's-', label=f'{model_name.replace("_", " ").title()}')

# Reference diagonal: predicted probability equals observed frequency.
ax.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Curves - All Models', fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

## 8. Model Saving and Final Results

In [None]:
# Save trained models
import os

import joblib

# joblib.dump() does not create missing directories, so make sure the output
# folder exists before anything is written (no-op when it is already there).
os.makedirs('../models/', exist_ok=True)

print("Saving trained models...")
model_trainer.save_models('../models/')
print("✓ Models saved successfully!")

# Save feature names and the fitted preprocessor alongside the models, so that
# inference code can rebuild the same feature matrix.
joblib.dump(feature_names, '../models/feature_names.pkl')
joblib.dump(feature_engineer.preprocessor, '../models/preprocessor.pkl')
print("✓ Feature names and preprocessor saved!")

In [None]:
# Final performance summary
banner = "=" * 60

print("FINAL MODEL PERFORMANCE SUMMARY")
print(banner)

# Headline numbers for the model the trainer flagged as best.
best_name = model_trainer.best_model_name
print(f"\nBEST MODEL: {best_name.replace('_', ' ').title()}")
print(f"ROC-AUC Score: {test_results[best_name]['roc_auc']:.4f}")
print(f"PR-AUC Score: {test_results[best_name]['pr_auc']:.4f}")

print("\nDATASET INFORMATION:")
print(f"• Training samples: {X_train.shape[0]:,}")
print(f"• Test samples: {X_test.shape[0]:,}")
print(f"• Total features: {len(feature_names)}")
print("• Feature categories: Medical, SDOH, Demographic")

# Test-set metrics for every evaluated model, one bullet per model.
print("\nMODEL COMPARISON:")
for name, scores in test_results.items():
    print(f"• {name.replace('_', ' ').title()}: ROC-AUC = {scores['roc_auc']:.4f}, PR-AUC = {scores['pr_auc']:.4f}")

# Fixed checklist text; these lines are not computed from the results above.
print("\nKEY SUCCESS METRICS:")
for achievement in (
    "✓ Integrated medical + SDOH data successfully",
    "✓ Handled class imbalance effectively",
    "✓ Achieved strong predictive performance",
    "✓ All models trained and evaluated",
    "✓ Models saved for deployment",
):
    print(achievement)

print("\n" + banner)
print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
print(banner)

## Next Steps

1. **Model Interpretation**: Use SHAP for explainability (see interpretability notebook)
2. **Fairness Analysis**: Evaluate model performance across demographic groups
3. **Dashboard Deployment**: Create interactive dashboard for stakeholders
4. **Model Monitoring**: Set up monitoring for production deployment