In [None]:
# Data Preprocessing Pipeline for Hospital Readmission Prediction
# This script handles data cleaning, feature engineering, and bias detection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

class DataPreprocessor:
    """
    A class to handle all data preprocessing steps for hospital readmission prediction
    """
    
    def __init__(self):
        """Create the reusable transformer objects held by the preprocessor."""
        # Numeric helpers: a median-strategy imputer and a standard scaler.
        self.imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()
        # LabelEncoder instances are stored here, keyed by column name,
        # when categorical columns are encoded.
        self.label_encoders = {}
        
    def clean_data(self, df):
        """
        Clean the raw data by handling errors and inconsistencies.

        Operates on a copy of ``df`` and, in order:

        * drops exact duplicate rows;
        * replaces ``age`` values outside 0-120 with the median of the
          in-range ages;
        * replaces non-positive ``length_of_stay`` values with the median of
          the positive values;
        * sets negative ``num_medications`` to 0;
        * upper-cases ``gender`` and rewrites ``insurance_type`` with
          underscores turned into spaces and title casing.

        Args:
            df: DataFrame containing at least the columns ``age``,
                ``length_of_stay``, ``num_medications``, ``gender`` and
                ``insurance_type``.

        Returns:
            The cleaned copy; ``df`` itself is not modified.
        """
        print("Step 1: Cleaning raw data...")

        # Make a copy to avoid modifying original data
        cleaned_df = df.copy()

        # Remove duplicate records
        initial_size = len(cleaned_df)
        cleaned_df = cleaned_df.drop_duplicates()
        duplicates_removed = initial_size - len(cleaned_df)
        print(f"  - Removed {duplicates_removed} duplicate records")

        # Fix obvious data errors
        # Age should be between 0 and 120
        age_errors = (cleaned_df['age'] < 0) | (cleaned_df['age'] > 120)
        if age_errors.sum() > 0:
            print(f"  - Found {age_errors.sum()} age errors, fixing...")
            # Take the median over valid rows only; including the bad values
            # would skew the replacement toward the very errors being fixed.
            valid_age_median = cleaned_df.loc[~age_errors, 'age'].median()
            cleaned_df.loc[age_errors, 'age'] = valid_age_median

        # Length of stay should be positive
        los_errors = cleaned_df['length_of_stay'] <= 0
        if los_errors.sum() > 0:
            print(f"  - Found {los_errors.sum()} length of stay errors, fixing...")
            # Same reasoning as for age: ignore the invalid rows in the median.
            valid_los_median = cleaned_df.loc[~los_errors, 'length_of_stay'].median()
            cleaned_df.loc[los_errors, 'length_of_stay'] = valid_los_median

        # Number of medications should be non-negative
        med_errors = cleaned_df['num_medications'] < 0
        if med_errors.sum() > 0:
            print(f"  - Found {med_errors.sum()} medication count errors, fixing...")
            cleaned_df.loc[med_errors, 'num_medications'] = 0

        # Standardize text fields
        cleaned_df['gender'] = cleaned_df['gender'].str.upper()
        cleaned_df['insurance_type'] = cleaned_df['insurance_type'].str.replace('_', ' ').str.title()

        print(f"  - Data cleaning complete. Final size: {len(cleaned_df)} records")
        return cleaned_df
    
    def handle_missing_values(self, df):
        """
        Handle missing values in the dataset.

        Prints a per-column summary of missing counts and percentages, then
        fills gaps on a copy of ``df``: the listed numerical columns get their
        column median, the listed categorical columns get their most frequent
        value (first mode). A categorical column with no non-missing values
        has no mode and is left unchanged.

        Args:
            df: DataFrame containing the numerical columns ``age``,
                ``length_of_stay``, ``num_medications``,
                ``chronic_conditions``, ``previous_admissions``,
                ``distance_from_hospital`` and the categorical columns
                ``gender``, ``insurance_type``, ``discharge_destination``.

        Returns:
            A copy of ``df`` with the missing values filled; ``df`` itself is
            not modified.
        """
        print("Step 2: Handling missing values...")

        # Check for missing values
        missing_counts = df.isnull().sum()
        missing_percent = (missing_counts / len(df)) * 100

        missing_info = pd.DataFrame({
            'Missing Count': missing_counts,
            'Missing Percent': missing_percent
        })

        print("  Missing values summary:")
        print(missing_info[missing_info['Missing Count'] > 0])

        # Handle missing values based on column type
        processed_df = df.copy()

        # For numerical columns, use median imputation
        numerical_cols = ['age', 'length_of_stay', 'num_medications', 'chronic_conditions',
                          'previous_admissions', 'distance_from_hospital']

        for col in numerical_cols:
            if processed_df[col].isnull().sum() > 0:
                median_value = processed_df[col].median()
                # Assign the result back instead of calling
                # fillna(..., inplace=True) on the selected column: that
                # chained in-place form does not update the frame when pandas
                # Copy-on-Write is active.
                processed_df[col] = processed_df[col].fillna(median_value)
                print(f"  - Filled {col} missing values with median: {median_value:.2f}")

        # For categorical columns, use mode imputation
        categorical_cols = ['gender', 'insurance_type', 'discharge_destination']

        for col in categorical_cols:
            if processed_df[col].isnull().sum() > 0:
                modes = processed_df[col].mode()
                if modes.empty:
                    # Every value is missing, so there is nothing to impute from.
                    print(f"  - Skipped {col}: no non-missing values to compute a mode")
                    continue
                mode_value = modes[0]
                processed_df[col] = processed_df[col].fillna(mode_value)
                print(f"  - Filled {col} missing values with mode: {mode_value}")

        return processed_df
    
    def feature_engineering(self, df):
        """
        Derive binary risk flags and a composite burden score.

        Adds, on a copy of ``df``, the 0/1 columns ``polypharmacy``,
        ``elderly``, ``frequent_admissions``, ``long_stay``,
        ``high_risk_discharge`` and ``far_from_hospital``, followed by the
        weighted-sum column ``comorbidity_burden``.

        Args:
            df: DataFrame containing ``num_medications``, ``age``,
                ``previous_admissions``, ``length_of_stay``,
                ``discharge_destination``, ``distance_from_hospital`` and
                ``chronic_conditions``.

        Returns:
            The copy with the new columns appended; ``df`` is not modified.
        """
        print("Step 3: Engineering new features...")

        engineered_df = df.copy()

        # Flags of the form "source column strictly greater than a cutoff":
        # (new column, source column, cutoff, progress message)
        leading_flags = (
            ('polypharmacy', 'num_medications', 5,
             "  - Created polypharmacy flag (>5 medications)"),
            ('elderly', 'age', 75,
             "  - Created elderly flag (>75 years)"),
            ('frequent_admissions', 'previous_admissions', 2,
             "  - Created frequent admissions flag (>2 previous)"),
            ('long_stay', 'length_of_stay', 7,
             "  - Created long stay flag (>7 days)"),
        )
        for flag_name, source_col, cutoff, message in leading_flags:
            exceeds = engineered_df[source_col].gt(cutoff)
            engineered_df[flag_name] = exceeds.astype(int)
            print(message)

        # Any discharge destination other than 'Home' is flagged
        not_home = engineered_df['discharge_destination'].ne('Home')
        engineered_df['high_risk_discharge'] = not_home.astype(int)
        print("  - Created high-risk discharge flag (not home)")

        # Distance flag comes after the discharge flag to keep column order
        is_far = engineered_df['distance_from_hospital'].gt(20)
        engineered_df['far_from_hospital'] = is_far.astype(int)
        print("  - Created far from hospital flag (>20 miles)")

        # Weighted sum of chronic conditions, medication count and age
        condition_part = engineered_df['chronic_conditions'] * 0.3
        medication_part = engineered_df['num_medications'] * 0.1
        age_part = engineered_df['age'] * 0.01
        engineered_df['comorbidity_burden'] = condition_part + medication_part + age_part
        print("  - Created comorbidity burden score")

        return engineered_df
    
    def encode_categorical_variables(self, df):
        """
        Encode categorical variables for machine learning
        """
        print("Step 4: Encoding categorical variables...")
        
        encoded_df = df.copy()
        categorical_columns = ['gender', 'insurance_type', 'discharge_destination']
        
        for col in categorical_columns:
            if col not in self.label_encoders:
                self.label_encoders[col] = LabelEncoder()
            
            encoded_df[f'{col}_encoded'] = self.label_encoders[col].fit_transform(encoded_df[col])
            print(f"  - Encoded {col}: {dict(zip(self.label_encoders[col].classes_, self.label_encoders[col].transform(self.label_encoders[col].classes_)))}")
        
        return encoded_df
    
    def detect_bias(self, df):
        """
        Detect potential bias in the dataset.

        Prints and collects the record count and mean of
        ``readmitted_30_days`` per gender, per insurance type and per age
        band (``<50``, ``50-65``, ``65-80``, ``>80``), then prints a
        chi-square test of independence between the target and gender, and
        between the target and insurance type.

        The input frame is only read: the age band is computed as a local
        Series rather than written into ``df``.

        Args:
            df: DataFrame containing ``gender``, ``insurance_type``, ``age``
                and ``readmitted_30_days``.

        Returns:
            dict with keys ``'gender'``, ``'insurance'`` and ``'age_group'``,
            each mapping to a DataFrame with ``count`` and ``mean`` columns
            indexed by group.
        """
        print("Step 5: Detecting potential bias...")

        # Check readmission rates by demographic groups
        bias_report = {}

        # Gender bias
        gender_bias = df.groupby('gender')['readmitted_30_days'].agg(['count', 'mean'])
        bias_report['gender'] = gender_bias
        print("  Gender bias analysis:")
        print(gender_bias)

        # Insurance bias
        insurance_bias = df.groupby('insurance_type')['readmitted_30_days'].agg(['count', 'mean'])
        bias_report['insurance'] = insurance_bias
        print("\n  Insurance bias analysis:")
        print(insurance_bias)

        # Age group bias
        # include_lowest=True keeps age 0 in the first band; without it the
        # left edge is open and such rows fall outside every bin.
        age_group = pd.cut(
            df['age'],
            bins=[0, 50, 65, 80, 120],
            labels=['<50', '50-65', '65-80', '>80'],
            include_lowest=True,
        ).rename('age_group')
        # observed=False reports every band, including ones with no rows.
        age_bias = (
            df['readmitted_30_days']
            .groupby(age_group, observed=False)
            .agg(['count', 'mean'])
        )
        bias_report['age_group'] = age_bias
        print("\n  Age group bias analysis:")
        print(age_bias)

        # Statistical significance test for bias
        from scipy.stats import chi2_contingency

        # Test for gender bias
        gender_crosstab = pd.crosstab(df['gender'], df['readmitted_30_days'])
        chi2_gender, p_gender = chi2_contingency(gender_crosstab)[:2]
        print(f"\n  Gender bias test: Chi2={chi2_gender:.3f}, p-value={p_gender:.3f}")

        # Test for insurance bias
        insurance_crosstab = pd.crosstab(df['insurance_type'], df['readmitted_30_days'])
        chi2_insurance, p_insurance = chi2_contingency(insurance_crosstab)[:2]
        print(f"  Insurance bias test: Chi2={chi2_insurance:.3f}, p-value={p_insurance:.3f}")

        return bias_report
    
    def create_visualizations(self, df):
        """
        Create visualizations to understand the data better
        """
        print("Step 6: Creating data visualizations...")
        
        # Set up the plotting style
        plt.style.use('default')
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        
        # 1. Age distribution
        axes[0, 0].hist(df['age'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0, 0].set_title('Age Distribution')
        axes[0, 0].set_xlabel('Age')
        axes[0, 0].set_ylabel('Count')
        
        # 2. Readmission rate by gender
        gender_rates = df.groupby('gender')['readmitted_30_days'].mean()
        axes[0, 1].bar(gender_rates.index, gender_rates.values, color=['pink', 'lightblue'])
        axes[0, 1].set_title('Readmission Rate by Gender')
        axes[0, 1].set_ylabel('Readmission Rate')
        
        # 3. Length of stay distribution
        axes[0, 2].hist(df['length_of_stay'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
        axes[0, 2].set_title('Length of Stay Distribution')
        axes[0, 2].set_xlabel('Days')
        axes[0, 2].set_ylabel('Count')
        
        # 4. Chronic conditions vs readmission
        chronic_rates = df.groupby('chronic_conditions')['readmitted_30_days'].mean()
        axes[1, 0].plot(chronic_rates.index, chronic_rates.values, marker='o', color='red')
        axes[1, 0].set_title('Readmission Rate by Chronic Conditions')
        axes[1, 0].set_xlabel('Number of Chronic Conditions')
        axes[1, 0].set_ylabel('Readmission Rate')
        
        # 5. Insurance type distribution
        insurance_counts = df['insurance_type'].value_counts()
        axes[1, 1].pie(insurance_counts.values, labels=insurance_counts.index, autopct='%1.1f%%')
        axes[1, 1].set_title('Insurance Type Distribution')
        
        # 6. Correlation heatmap of numerical features
        numerical_features = ['age', 'length_of_stay', 'num_medications', 'chronic_conditions', 
                             'previous_admissions', 'distance_from_hospital', 'readmitted_30_days']
        correlation_matrix = df[numerical_features].corr()
        
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                   ax=axes[1, 2], square=True)
        axes[1, 2].set_title('Feature Correlation Matrix')
        
        plt.tight_layout()
        plt.show()
        
        # Additional plot: Readmission rate by multiple factors
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        
        # Create a comprehensive analysis
        multi_factor_analysis = df.groupby(['elderly', 'polypharmacy', 'frequent_admissions'])['readmitted_30_days'].mean().reset_index()
        multi_factor_analysis['group'] = (
            multi_factor_analysis['elderly'].astype(str) + '_' +
            multi_factor_analysis['polypharmacy'].astype(str) + '_' +
            multi_factor_analysis['frequent_admissions'].astype(str)
        )
        
        bars = ax.bar(range(len(multi_factor_analysis)), multi_factor_analysis['readmitted_30_days'])
        ax.set_title('Readmission Rate by Risk Factor Combinations')
        ax.set_xlabel('Risk Factor Combinations\n(Elderly_Polypharmacy_FrequentAdmissions)')
        ax.set_ylabel('Readmission Rate')
        ax.set_xticks(range(len(multi_factor_analysis)))
        ax.set_xticklabels(multi_factor_analysis['group'], rotation=45)
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x

## Model Training and Evaluation

The following steps demonstrate how to train a Random Forest model for hospital readmission prediction and evaluate its performance using precision and recall metrics.

In [None]:
# Example: Model Training and Evaluation with Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assume df is your preprocessed DataFrame and 'readmitted_30_days' is the target
# (i.e. it already contains the engineered flags and the *_encoded columns
# listed below).
# Select features and target
target = 'readmitted_30_days'
features = [
    'age', 'length_of_stay', 'num_medications', 'chronic_conditions',
    'previous_admissions', 'distance_from_hospital',
    'polypharmacy', 'elderly', 'frequent_admissions', 'long_stay',
    'high_risk_discharge', 'far_from_hospital', 'comorbidity_burden',
    'gender_encoded', 'insurance_type_encoded', 'discharge_destination_encoded'
]

X = df[features]
y = df[target]

# Split data into train and test sets.
# stratify=y keeps the readmitted / not-readmitted proportions the same in
# both splits, so precision and recall are not distorted by a test set whose
# class balance happens to differ from the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Random Forest model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Predict on test set
y_pred = rf.predict(X_test)

# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")