# Hospital Readmission Prediction Project
## Phase 4-5: Feature Engineering & Data Preparation

**Author:** Vindya Siriwardhana  
**Previous Findings:**
- Readmission rate (<30 days): 11.16%
- Class imbalance: 7.96:1 (needs SMOTE!)
- Total patients: 101,766

---
## SETUP & LOAD PREVIOUS WORK

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# FutureWarning output - consider narrowing the filter; confirm intent.
warnings.filterwarnings('ignore')
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style to all later figures.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): draw figures inline in the notebook.
%matplotlib inline

print("‚úÖ Libraries imported successfully!")

In [None]:
# Read the raw CSV used in Phase 1-3 into memory.
df = pd.read_csv('/mnt/user-data/uploads/diabetic_data.csv')

# Binary target: 1 where `readmitted` is exactly '<30', 0 for every other value.
df['readmitted_binary'] = df['readmitted'].eq('<30').astype(int)

n_rows, n_cols = df.shape
readmit_pct = df['readmitted_binary'].sum() / len(df) * 100
print(f"‚úÖ Data loaded: {n_rows:,} rows √ó {n_cols} columns")
print(f"üìä Readmission rate: {readmit_pct:.2f}%")

---
## PHASE 4: FEATURE ENGINEERING (Steps 10-12)

### Step 10: Create New Features

In [None]:
section_rule = "=" * 80
print(f"\n{section_rule}")
print("STEP 10: FEATURE ENGINEERING")
print(section_rule)

# Work on an independent copy so the loaded frame `df` stays untouched.
df_fe = df.copy()

print("\nüîß Creating new features...")

In [None]:
# 1. AGE: map each 10-year bracket label ('[0-10)' ... '[90-100)') to its midpoint.
age_mapping = {f'[{lo}-{lo + 10})': lo + 5 for lo in range(0, 100, 10)}
df_fe['age_numeric'] = df_fe['age'].map(age_mapping)

# Coarse age bands. pd.cut uses right-closed intervals by default, i.e.
# (0, 40], (40, 60], (60, 80], (80, 100].
age_band_edges = [0, 40, 60, 80, 100]
age_band_names = ['Young', 'Middle', 'Senior', 'Elderly']
df_fe['age_category'] = pd.cut(
    df_fe['age_numeric'],
    bins=age_band_edges,
    labels=age_band_names,
)

print(
    "‚úÖ Age features created:",
    "   - age_numeric (5-95)",
    "   - age_category (Young/Middle/Senior/Elderly)",
    sep="\n",
)

In [None]:
# 2. POLYPHARMACY: 1 when the encounter lists five or more medications, else 0.
df_fe['polypharmacy'] = df_fe['num_medications'].ge(5).astype(int)

poly_flag = df_fe['polypharmacy']
print("\n‚úÖ Polypharmacy feature created:")
print(f"   Patients with polypharmacy: {poly_flag.sum():,} ({poly_flag.mean()*100:.1f}%)")

In [None]:
# 3. COMORBIDITY: re-expose the diagnosis count under a descriptive name
#    and flag encounters with seven or more diagnoses.
diagnosis_counts = df_fe['number_diagnoses']
df_fe['comorbidity_count'] = diagnosis_counts
df_fe['high_comorbidity'] = diagnosis_counts.ge(7).astype(int)

high_flag = df_fe['high_comorbidity']
print("\n‚úÖ Comorbidity features created:")
print(f"   Average diagnoses: {diagnosis_counts.mean():.1f}")
print(f"   High comorbidity patients: {high_flag.sum():,} ({high_flag.mean()*100:.1f}%)")

In [None]:
# 4. LENGTH OF STAY: bucket days in hospital into right-closed bands
#    (0, 3] -> Short, (3, 7] -> Medium, (7, 14] -> Long.
los_edges = [0, 3, 7, 14]
los_names = ['Short', 'Medium', 'Long']
df_fe['los_category'] = pd.cut(df_fe['time_in_hospital'], bins=los_edges, labels=los_names)

print("\n‚úÖ Length of stay categories created:")
los_counts = df_fe['los_category'].value_counts()
print(los_counts)

In [None]:
# 5. PREVIOUS VISITS: one 0/1 flag per visit type (1 when the count is above zero).
for visit_kind in ('emergency', 'inpatient', 'outpatient'):
    df_fe[f'had_{visit_kind}'] = (df_fe[f'number_{visit_kind}'] > 0).astype(int)

# Combined number of prior visits across the three visit types.
prior_total = df_fe['number_emergency'] + df_fe['number_inpatient']
prior_total = prior_total + df_fe['number_outpatient']
df_fe['total_previous_visits'] = prior_total

emergency_n = df_fe['had_emergency'].sum()
inpatient_n = df_fe['had_inpatient'].sum()
print("\n‚úÖ Previous visit features created:")
print(f"   Patients with emergency history: {emergency_n:,}")
print(f"   Patients with inpatient history: {inpatient_n:,}")

In [None]:
# 6. LAB INTENSITY: 1 when more than 50 lab procedures were recorded, else 0.
df_fe['high_lab_procedures'] = df_fe['num_lab_procedures'].gt(50).astype(int)

high_lab_n = df_fe['high_lab_procedures'].sum()
print("\n‚úÖ Lab procedures feature created:")
print(f"   Patients with high lab intensity: {high_lab_n:,}")

### Step 11: Encode Diagnosis Codes

In [None]:
step11_rule = "=" * 80
print(f"\n{step11_rule}")
print("STEP 11: DIAGNOSIS CODE ENCODING")
print(step11_rule)

# Function to categorize ICD-9 diagnosis codes
def categorize_diagnosis(diag):
    """Categorize an ICD-9 diagnosis code into a major disease category.

    Args:
        diag: Raw diagnosis value. May be a string such as '428', '250.83'
            or 'V45', a number, the placeholder '?', or a missing value.

    Returns:
        'Unknown' for missing values and '?'; otherwise one of
        'Circulatory', 'Respiratory', 'Digestive', 'Diabetes', 'Injury',
        'Neoplasms', 'Genitourinary', 'Musculoskeletal', 'Symptoms' or
        'Other' (V/E codes, non-numeric text and codes outside the listed
        ranges).
    """
    if pd.isna(diag) or diag == '?':
        return 'Unknown'

    diag = str(diag)

    # Supplementary V/E codes are not mapped to a numeric range here.
    if diag.startswith(('V', 'E')):
        return 'Other'

    try:
        code = float(diag)
    except ValueError:
        # Only a failed numeric parse is expected; do not mask other errors.
        return 'Other'

    # Half-open [low, high) spans so that decimal sub-codes such as '459.9'
    # or '999.9' stay in the category of their three-digit parent code
    # (the same convention the 250.xx diabetes range relies on).
    # 785-788 are carved out of the generic 'Symptoms' span, which is
    # therefore tested last.
    category_spans = (
        ('Circulatory', ((390, 460), (785, 786))),      # Heart disease, hypertension
        ('Respiratory', ((460, 520), (786, 787))),      # COPD, pneumonia
        ('Digestive', ((520, 580), (787, 788))),        # GI issues
        ('Diabetes', ((250, 251),)),                    # Diabetes complications
        ('Injury', ((800, 1000),)),                     # Injuries, poisoning
        ('Neoplasms', ((140, 240),)),                   # Cancer
        ('Genitourinary', ((580, 630), (788, 789))),    # Kidney, urinary
        ('Musculoskeletal', ((710, 740),)),             # Arthritis, etc.
        ('Symptoms', ((780, 800),)),                    # General symptoms
    )
    for category, spans in category_spans:
        if any(low <= code < high for low, high in spans):
            return category
    return 'Other'

# Apply to all three diagnosis columns
# Derive one '<column>_category' field per diagnosis column.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    df_fe[f'{diag_col}_category'] = df_fe[diag_col].apply(categorize_diagnosis)

print("\n‚úÖ Diagnosis categories created!")
print("\nüìä Primary Diagnosis Distribution:")
primary_distribution = df_fe['diag_1_category'].value_counts()
print(primary_distribution)

In [None]:
# Create flags for major conditions
# 0/1 flags derived from the PRIMARY diagnosis category only (diag_1_category).
primary_flag_map = {
    'has_circulatory': 'Circulatory',
    'has_respiratory': 'Respiratory',
    'has_diabetes_complication': 'Diabetes',
}
for flag_name, category_name in primary_flag_map.items():
    df_fe[flag_name] = df_fe['diag_1_category'].eq(category_name).astype(int)

print("\n‚úÖ Major condition flags created:")
print(f"   Circulatory conditions: {df_fe['has_circulatory'].sum():,}")
print(f"   Respiratory conditions: {df_fe['has_respiratory'].sum():,}")
print(f"   Diabetes complications: {df_fe['has_diabetes_complication'].sum():,}")

### Step 12: Medication Features

In [None]:
step12_rule = "=" * 80
print(f"\n{step12_rule}")
print("STEP 12: MEDICATION FEATURES")
print(step12_rule)

# Diabetes medication columns: single-agent drugs first, then combination products.
single_agent_meds = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'insulin',
]
combination_meds = [
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone',
]
med_columns = single_agent_meds + combination_meds

# Count medications that were changed/prescribed
# (not 'No' or 'Steady')
def count_active_meds(row, columns=None):
    """Count medication fields in *row* whose value is neither 'No' nor 'Steady'.

    Args:
        row: Record with an ``.index`` of labels and label-based lookup
            (a pandas Series when called through ``DataFrame.apply(..., axis=1)``).
        columns: Medication labels to inspect. Defaults to the module-level
            ``med_columns`` list, which keeps ``df.apply(count_active_meds, axis=1)``
            working unchanged.

    Returns:
        Number of inspected labels that are present in ``row.index`` and whose
        value is anything other than 'No' or 'Steady'.
    """
    if columns is None:
        columns = med_columns
    inactive_values = ('No', 'Steady')
    return sum(
        1
        for med in columns
        # Labels absent from the row are skipped rather than raising KeyError.
        if med in row.index and row[med] not in inactive_values
    )

# Vectorised count of medication columns whose value is neither 'No' nor
# 'Steady'. Gives the same per-row result as applying count_active_meds row by
# row, without a Python-level call for every record.
present_med_columns = [med for med in med_columns if med in df_fe.columns]
is_inactive = df_fe[present_med_columns].isin(['No', 'Steady'])
df_fe['diabetes_meds_count'] = (~is_inactive).sum(axis=1)

print("\n‚úÖ Diabetes medication count created:")
print(df_fe['diabetes_meds_count'].describe())

In [None]:
# 0/1 flags read straight from two source columns:
#   medication_changed <- `change` equals 'Ch'
#   on_diabetes_med    <- `diabetesMed` equals 'Yes'
flag_specs = (
    ('medication_changed', 'change', 'Ch', 'Medication changed'),
    ('on_diabetes_med', 'diabetesMed', 'Yes', 'On diabetes meds'),
)
for flag_col, source_col, positive_value, _ in flag_specs:
    df_fe[flag_col] = df_fe[source_col].eq(positive_value).astype(int)

print("\n‚úÖ Medication change features created:")
for flag_col, _, _, caption in flag_specs:
    flag_values = df_fe[flag_col]
    print(f"   {caption}: {flag_values.sum():,} ({flag_values.mean()*100:.1f}%)")

In [None]:
# Summary of new features
print("\n" + "="*80)
print("FEATURE ENGINEERING COMPLETE!")
print("="*80)

new_features = [
    'age_numeric', 'age_category', 'polypharmacy', 'comorbidity_count',
    'high_comorbidity', 'los_category', 'had_emergency', 'had_inpatient',
    'had_outpatient', 'total_previous_visits', 'high_lab_procedures',
    'diag_1_category', 'diag_2_category', 'diag_3_category',
    'has_circulatory', 'has_respiratory', 'has_diabetes_complication',
    'diabetes_meds_count', 'medication_changed', 'on_diabetes_med'
]

print(f"\nüìä Created {len(new_features)} new features:")
for i, feat in enumerate(new_features, 1):
    print(f"   {i:2d}. {feat}")

print(f"\nüìà Total features now: {df_fe.shape[1]}")

---
## PHASE 5: DATA PREPARATION (Steps 13-15)

### Step 13: Select Features & Prepare for Modeling

In [None]:
step13_rule = "=" * 80
print(f"\n{step13_rule}")
print("STEP 13: FEATURE SELECTION & PREPARATION")
print(step13_rule)

# Numeric model inputs, grouped by origin and concatenated in a fixed order.
demographic_inputs = ['age_numeric']
hospital_stay_inputs = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_diagnoses',
]
previous_visit_inputs = [
    'number_outpatient', 'number_emergency', 'number_inpatient',
    'total_previous_visits',
]
engineered_inputs = [
    'polypharmacy', 'high_comorbidity', 'high_lab_procedures',
    'had_emergency', 'had_inpatient', 'had_outpatient',
    'has_circulatory', 'has_respiratory', 'has_diabetes_complication',
    'diabetes_meds_count', 'medication_changed', 'on_diabetes_med',
]
feature_columns = (
    demographic_inputs
    + hospital_stay_inputs
    + previous_visit_inputs
    + engineered_inputs
)

# Text-valued columns; these are turned into integer codes in a later cell.
categorical_features = ['gender', 'age_category', 'los_category', 'diag_1_category']

n_numeric = len(feature_columns)
n_categorical = len(categorical_features)
print(f"\nüìä Selected {n_numeric} numerical features")
print(f"üìä Selected {n_categorical} categorical features")

In [None]:
# Build the modelling frame: selected numeric + categorical inputs and the target.
df_model = df_fe[feature_columns + categorical_features + ['readmitted_binary']].copy()

# Count missing values per column
missing = df_model.isnull().sum()
if missing.sum() > 0:
    print("\n‚ö†Ô∏è Missing values found:")
    print(missing[missing > 0])

    # Fill with the column median (numeric) or most frequent value (categorical).
    # The filled column is assigned back explicitly: calling
    # fillna(inplace=True) on `df_model[col]` is a chained in-place update,
    # which leaves df_model unchanged under pandas Copy-on-Write and is
    # deprecated in pandas 2.x.
    for col in feature_columns:
        if df_model[col].isnull().any():
            df_model[col] = df_model[col].fillna(df_model[col].median())

    for col in categorical_features:
        if df_model[col].isnull().any():
            df_model[col] = df_model[col].fillna(df_model[col].mode()[0])

    print("‚úÖ Missing values filled!")
else:
    print("\n‚úÖ No missing values in selected features!")

In [None]:
# Turn each categorical column into integer codes, keeping the fitted encoder
# per source column in `le_dict`.
# NOTE(review): LabelEncoder presumably assigns codes in sorted label order, so
# bands like Young/Middle/Senior/Elderly would not keep their natural order - confirm.
print("\nüîß Encoding categorical features...")

le_dict = {}
for cat_col in categorical_features:
    as_text = df_model[cat_col].astype(str)
    encoder = LabelEncoder().fit(as_text)
    df_model[f'{cat_col}_encoded'] = encoder.transform(as_text)
    le_dict[cat_col] = encoder
    print(f"   ‚úì {cat_col}: {len(encoder.classes_)} categories")

# Model inputs = numeric columns followed by the encoded categorical columns.
encoded_names = [f'{cat_col}_encoded' for cat_col in categorical_features]
final_features = feature_columns + encoded_names

print(f"\n‚úÖ Total features for modeling: {len(final_features)}")

In [None]:
# Feature matrix and target vector taken from the modelling frame.
X, y = df_model[final_features], df_model['readmitted_binary']

print(f"\nüìä X shape: {X.shape}")
print(f"üìä y shape: {y.shape}")
print(f"\nüéØ Target distribution:")
print(y.value_counts())

negatives = (y == 0).sum()
positives = (y == 1).sum()
print(f"\n‚öñÔ∏è Class imbalance: {negatives / positives:.2f}:1")

### Step 13 (continued): Train-Test Split

In [None]:
split_rule = "=" * 80
print(f"\n{split_rule}")
print("TRAIN-TEST SPLIT")
print(split_rule)

# 70/30 split, stratified on y so both parts keep the same class proportions.
split_options = {'test_size': 0.30, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nüìä Training set: {X_train.shape[0]:,} samples")
print(f"üìä Test set: {X_test.shape[0]:,} samples")

# Per-split class counts and positive rate.
for split_name, split_target in (('Train', y_train), ('Test', y_test)):
    print(f"\nüéØ {split_name} set class distribution:")
    print(split_target.value_counts())
    print(f"   Readmission rate: {split_target.mean()*100:.2f}%")

### Step 14: Handle Class Imbalance with SMOTE

In [None]:
smote_rule = "=" * 80
print(f"\n{smote_rule}")
print("STEP 14: HANDLING CLASS IMBALANCE (7.96:1)")
print(smote_rule)

train_neg = (y_train == 0).sum()
train_pos = (y_train == 1).sum()
print("\n‚ö†Ô∏è BEFORE SMOTE:")
print(f"   Class 0 (Not readmitted): {train_neg:,}")
print(f"   Class 1 (Readmitted <30): {train_pos:,}")
print(f"   Imbalance ratio: {train_neg / train_pos:.2f}:1")

# Oversample the training split only; the test split is left untouched.
# NOTE(review): X_train holds 0/1 flags and '*_encoded' integer codes; plain
# SMOTE presumably interpolates between neighbours, which can yield fractional
# values in those columns - consider SMOTENC; confirm.
print("\nüîß Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

balanced_neg = (y_train_balanced == 0).sum()
balanced_pos = (y_train_balanced == 1).sum()
print("\n‚úÖ AFTER SMOTE:")
print(f"   Class 0 (Not readmitted): {balanced_neg:,}")
print(f"   Class 1 (Readmitted <30): {balanced_pos:,}")
print(f"   Imbalance ratio: {balanced_neg / balanced_pos:.2f}:1")
print(f"\nüìä Training set size increased from {X_train.shape[0]:,} to {X_train_balanced.shape[0]:,}")

In [None]:
# Visualize class balance before and after resampling, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    (axes[0], y_train, 'Before SMOTE (Imbalanced)'),
    (axes[1], y_train_balanced, 'After SMOTE (Balanced)'),
]
for ax, class_labels, panel_title in panels:
    # value_counts() orders by frequency, and the balanced counts tie, so the
    # bar order is not guaranteed to be class 0 then class 1. sort_index()
    # pins that order so the hard-coded tick labels and colours below always
    # line up with the right class.
    class_counts = pd.Series(class_labels).value_counts().sort_index()
    class_counts.plot(kind='bar', ax=ax, color=['green', 'red'])
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_xticklabels(['Not Readmitted', 'Readmitted <30'], rotation=0)

plt.tight_layout()
plt.show()

print("\n‚úÖ Class imbalance handled!")

### Step 15: Scale Features

In [None]:
print("\n" + "="*80)
print("STEP 15: FEATURE SCALING")
print("="*80)

# Initialize scaler
scaler = StandardScaler()

# Fit on the balanced training data, then reuse the fitted statistics on the
# test set so no test information enters the scaling parameters.
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)  # Use same scaler on test

# Convert back to DataFrame for easier handling.
# NOTE(review): both frames get a fresh 0..n-1 index here, while y_test keeps
# the row labels from the split - use positional (not label-based) alignment
# when combining X_test_scaled with y_test; confirm downstream usage.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=final_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=final_features)

print("\n‚úÖ Features scaled using StandardScaler!")
print(f"\nüìä Training set (scaled & balanced): {X_train_scaled.shape}")
print(f"üìä Test set (scaled): {X_test_scaled.shape}")

# Show example of scaling. The "original" statistics are taken from the same
# rows the scaler was fitted on (the balanced training data) so that the
# before/after figures describe one dataset rather than mixing the pre-SMOTE
# split with the post-SMOTE scaled frame.
age_before_scaling = pd.DataFrame(X_train_balanced, columns=final_features)['age_numeric']
age_after_scaling = X_train_scaled['age_numeric']
print("\nüìà Example - 'age_numeric' scaling:")
print(f"   Original mean: {age_before_scaling.mean():.2f}")
print(f"   Scaled mean: {age_after_scaling.mean():.2f}")
print(f"   Original std: {age_before_scaling.std():.2f}")
print(f"   Scaled std: {age_after_scaling.std():.2f}")

---
## 💾 SAVE PREPARED DATA

In [None]:
save_rule = "=" * 80
print(f"\n{save_rule}")
print("SAVING PREPARED DATA")
print(save_rule)

# Persist everything the modelling phase needs in a single pickle file.
import pickle

# Scaled/balanced splits plus the fitted preprocessing objects.
data_dict = dict(
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_balanced,
    y_test=y_test,
    feature_names=final_features,
    scaler=scaler,
    label_encoders=le_dict,
)

with open('/home/claude/hospital_readmission_prepared_data.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

print("\nüíæ Saved:")
saved_items = (
    "   ‚úì X_train (scaled & balanced)",
    "   ‚úì X_test (scaled)",
    "   ‚úì y_train (balanced)",
    "   ‚úì y_test",
    "   ‚úì Feature names",
    "   ‚úì Scaler",
    "   ‚úì Label encoders",
)
print("\n".join(saved_items))

print("\nüìÅ File: hospital_readmission_prepared_data.pkl")

---
## 📊 PHASE 4-5 SUMMARY

In [None]:
# Closing recap: completed steps, final dataset figures and the plan for Phase 6.
recap_rule = "=" * 80
print(f"\n{recap_rule}")
print("PHASE 4-5 COMPLETED SUMMARY")
print(recap_rule)

completed_steps = (
    "   Step 10: ‚úì Feature engineering (20 new features)",
    "   Step 11: ‚úì Diagnosis codes encoded",
    "   Step 12: ‚úì Medication features created",
    "   Step 13: ‚úì Train-test split (70-30)",
    "   Step 14: ‚úì Class imbalance handled (SMOTE)",
    "   Step 15: ‚úì Features scaled (StandardScaler)",
)
print("\n‚úÖ COMPLETED STEPS:")
print("\n".join(completed_steps))

n_features = len(final_features)
n_train_rows = X_train_scaled.shape[0]
n_test_rows = X_test_scaled.shape[0]
print("\nüìä FINAL DATASET STATISTICS:")
print(f"   ‚Ä¢ Total features: {n_features}")
print(f"   ‚Ä¢ Training samples (balanced): {n_train_rows:,}")
print(f"   ‚Ä¢ Test samples: {n_test_rows:,}")
print("   ‚Ä¢ Class balance (train): 1:1 (perfect!)")
print("   ‚Ä¢ Features scaled: ‚úì (mean=0, std=1)")

upcoming_steps = (
    "   Step 16: Train Logistic Regression (baseline)",
    "   Step 17: Train Random Forest",
    "   Step 18: Train XGBoost",
    "   Step 19: Hyperparameter tuning",
)
print("\nüéØ NEXT STEPS (Phase 6):")
print("\n".join(upcoming_steps))

print(f"\n{recap_rule}")
print("Ready to proceed to Phase 6: Model Building!")
print(recap_rule)