# Advanced Imputation: K-NN and MICE

This notebook applies K-NN and MICE imputation to specific columns that benefit from relationship-aware imputation.

**Target Columns:**
- **MICE**: BP_systolic, BP_diastolic (the two measurements are strongly correlated with each other)
- **K-NN**: temperature, heart_rate, resp_rate, o2sat (vital signs cluster)
- **K-NN**: creatinine, blood_wbc (multi-factor relationships)

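**Note**: This notebook is meant to be run AFTER the temporal filling steps in `3_b2s_handle_missing_values.ipynb`. Step 1 below repeats that temporal filling, so the notebook also works when run standalone.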


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Resolve the project root: three levels above this file when __file__ is
# defined (script execution), otherwise two levels above the current working
# directory (e.g. an interactive notebook session).
if '__file__' in globals():
    PROJECT_ROOT = Path(__file__).parent.parent.parent
else:
    PROJECT_ROOT = Path.cwd().parent.parent

# Input: the outlier-handled dataset stored in the silver data layer.
file_path = PROJECT_ROOT.joinpath("data", "silver", "bronze_outliers_handled.csv")
print(f"Loading data from: {file_path}")

df = pd.read_csv(file_path)

# Quick sanity report on what was loaded.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.shape[1]}")


: 

## Step 1: Apply Temporal Filling (if not already done)

This replicates the temporal filling logic from the main notebook to ensure we have the baseline before applying K-NN/MICE.


In [None]:
# Parse admission timestamps; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df["admittime"] = pd.to_datetime(df["admittime"], errors="coerce")

# Sort by class, patient, and time so that forward/backward fills follow each
# patient's admissions in chronological order.
df = df.sort_values(["y", "subject_id", "admittime"])

# Columns that benefit from temporal filling
temporal_columns = [
    "BMI", "creatinine", "temperature", "heart_rate",
    "resp_rate", "o2sat", "BP_systolic", "BP_diastolic"
]

# Restrict to the columns this dataset actually contains. The same list is used
# for the fill AND for the summary below, so an absent column no longer raises
# a KeyError when the summary is computed.
available_temporal_columns = [col for col in temporal_columns if col in df.columns]

print("Applying temporal filling...")
# NOTE(review): groupby excludes rows whose grouping key is NaN by default —
# confirm that "y" and "subject_id" are always populated.
for col in available_temporal_columns:
    # Forward fill within patient groups
    df[col] = df.groupby(["y", "subject_id"])[col].ffill()
    # Backward fill within patient groups (covers leading gaps that a forward
    # fill cannot reach)
    df[col] = df.groupby(["y", "subject_id"])[col].bfill()

print("Temporal filling completed.")

# Check remaining missing values (only columns that still have gaps are shown)
missing_after_temporal = df[available_temporal_columns].isna().sum()
print("\nMissing values after temporal fill:")
print(missing_after_temporal[missing_after_temporal > 0])


## Step 2: MICE Imputation for Blood Pressure Pair

BP_systolic and BP_diastolic have a strong correlation. MICE can model this relationship better than simple group medians.


In [None]:
def apply_mice_bp(df):
    """Impute missing BP_systolic / BP_diastolic values with MICE.

    Fits an ``IterativeImputer`` with a random-forest estimator on the two
    blood pressure columns plus whichever of heart_rate, anchor_age, BMI,
    temperature and icu_admission are present in ``df``, then writes the
    imputed values into the rows where a BP value was missing.

    Args:
        df: DataFrame to impute. It is modified in place.

    Returns:
        The same DataFrame object. It is returned untouched when a BP column
        is absent, when no BP value is missing, or when neither BP column has
        any observed value to learn from.
    """
    bp_columns = ['BP_systolic', 'BP_diastolic']

    # Both columns are required: the point of this step is to model the pair.
    if not all(col in df.columns for col in bp_columns):
        print("BP columns not found. Skipping MICE for BP.")
        return df

    missing_bp = df[bp_columns].isna().any(axis=1).sum()
    if missing_bp == 0:
        print("No missing BP values. Skipping MICE.")
        return df

    print(f"Rows with missing BP: {missing_bp} ({missing_bp/len(df)*100:.2f}%)")

    # Features to use for BP imputation, restricted to those available
    candidate_features = [
        'BP_systolic', 'BP_diastolic', 'heart_rate', 'anchor_age',
        'BMI', 'temperature', 'icu_admission'
    ]
    bp_features = [f for f in candidate_features if f in df.columns]

    bp_data = df[bp_features].copy()

    # Handle categorical variables
    if 'icu_admission' in bp_data.columns:
        bp_data = pd.get_dummies(bp_data, columns=['icu_admission'], drop_first=True, prefix='icu')

    # By default the imputer leaves features with no observed values out of its
    # output, which would break the column mapping below. Drop them up front.
    bp_data = bp_data.dropna(axis=1, how='all')

    imputable = [col for col in bp_columns if col in bp_data.columns]
    if not imputable:
        print("No observed BP values to learn from. Skipping MICE.")
        return df

    print("Applying MICE imputation for BP...")

    # Use RandomForest as estimator for better handling of non-linear relationships
    mice_imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
        max_iter=10,
        random_state=42,
        imputation_order='ascending',  # Start with least missing
        verbose=0
    )

    bp_data_imputed = pd.DataFrame(
        mice_imputer.fit_transform(bp_data),
        columns=bp_data.columns,
        index=bp_data.index,
    )

    # Write back only where the original value was missing, so observed
    # measurements are never touched (same policy as the K-NN steps).
    for col in imputable:
        missing_mask = df[col].isna()
        df.loc[missing_mask, col] = bp_data_imputed.loc[missing_mask, col]

    print("MICE imputation for BP completed.")
    print(f"Remaining missing BP_systolic: {df['BP_systolic'].isna().sum()}")
    print(f"Remaining missing BP_diastolic: {df['BP_diastolic'].isna().sum()}")

    return df

# Apply MICE for BP (Step 2) and continue with the DataFrame it returns
df = apply_mice_bp(df)


## Step 3: K-NN Imputation for Vital Signs Cluster

Temperature, heart_rate, resp_rate, and o2sat are highly correlated. K-NN can leverage these relationships.


In [None]:
def apply_knn_vitals(df, n_neighbors=5):
    """Impute missing vital signs with K-NN.

    The targets are temperature, heart_rate, resp_rate and o2sat (whichever
    are present). Neighbours are found on standardized features: the vital
    signs themselves plus anchor_age, BMI, icu_admission, BP_systolic and
    BP_diastolic when available. Imputed values are converted back to the
    original scale and written only into rows where the value was missing.

    Args:
        df: DataFrame to impute. It is modified in place.
        n_neighbors: Number of neighbours used by ``KNNImputer``.

    Returns:
        The same DataFrame object. It is returned untouched when no vital sign
        column exists, when nothing is missing, or when no vital sign has both
        observed and missing values.
    """
    vital_signs = [
        v for v in ('temperature', 'heart_rate', 'resp_rate', 'o2sat')
        if v in df.columns
    ]
    if not vital_signs:
        print("Vital signs columns not found. Skipping K-NN.")
        return df

    missing_vitals = df[vital_signs].isna().any(axis=1).sum()
    if missing_vitals == 0:
        print("No missing vital signs. Skipping K-NN.")
        return df

    print(f"Rows with missing vital signs: {missing_vitals} ({missing_vitals/len(df)*100:.2f}%)")

    # Features to use for vital signs imputation, restricted to those available
    vital_features = vital_signs + [
        'anchor_age', 'BMI', 'icu_admission', 'BP_systolic', 'BP_diastolic'
    ]
    vital_features = [f for f in vital_features if f in df.columns]

    vital_data = df[vital_features].copy()

    # Handle categorical variables
    if 'icu_admission' in vital_data.columns:
        vital_data = pd.get_dummies(vital_data, columns=['icu_admission'], drop_first=True, prefix='icu')

    # By default the imputer leaves features with no observed values out of its
    # output; the scaler's inverse transform would then fail on the column
    # count. Drop such features before scaling.
    vital_data = vital_data.dropna(axis=1, how='all')

    # Only vital signs that have something to learn from AND something to fill
    # are worth the (expensive) neighbour search.
    targets = [
        col for col in vital_signs
        if col in vital_data.columns and df[col].isna().any()
    ]
    if not targets:
        print("No vital sign with both observed and missing values. Skipping K-NN.")
        return df

    vital_cols = vital_data.columns.tolist()

    # Scale features for K-NN (important for distance calculation)
    print("Scaling features for K-NN...")
    scaler = StandardScaler()
    vital_data_scaled = scaler.fit_transform(vital_data)

    print(f"Applying K-NN imputation (k={n_neighbors}) for vital signs...")
    knn_imputer = KNNImputer(
        n_neighbors=n_neighbors,
        weights='distance',  # Weight by distance
        metric='euclidean'
    )
    vital_data_imputed = knn_imputer.fit_transform(vital_data_scaled)

    # Inverse transform to get original scale
    vital_data_imputed = pd.DataFrame(
        scaler.inverse_transform(vital_data_imputed),
        columns=vital_cols,
        index=vital_data.index,
    )

    # Update only the originally-missing cells of the vital sign columns
    for col in targets:
        missing_mask = df[col].isna()
        df.loc[missing_mask, col] = vital_data_imputed.loc[missing_mask, col]

    print("K-NN imputation for vital signs completed.")
    for col in vital_signs:
        remaining = df[col].isna().sum()
        print(f"  Remaining missing {col}: {remaining}")

    return df

# Apply K-NN for vital signs (Step 3) with k=5 and continue with the returned DataFrame
df = apply_knn_vitals(df, n_neighbors=5)


## Step 4: K-NN Imputation for Lab Values (Creatinine and Blood WBC)

These have complex multi-factor relationships that K-NN can capture better than group medians.


In [None]:
def apply_knn_labs(df, n_neighbors=5):
    """Impute missing creatinine and blood_wbc values with K-NN.

    Each lab column is imputed separately with its own predictor set, on
    standardized features. Columns are processed in the order creatinine,
    blood_wbc, so the blood_wbc pass sees creatinine values already filled by
    the first pass. Only cells that were missing are overwritten.

    Args:
        df: DataFrame to impute. It is modified in place.
        n_neighbors: Number of neighbours used by ``KNNImputer``.

    Returns:
        The same DataFrame object. A lab column is left untouched when it has
        no missing values or no observed values at all.
    """
    # Predictors used alongside each lab column itself
    predictors_by_lab = {
        'creatinine': ['anchor_age', 'BMI', 'blood_wbc',
                       'BP_systolic', 'BP_diastolic', 'gender', 'temperature'],
        'blood_wbc': ['temperature', 'heart_rate', 'creatinine',
                      'anchor_age', 'BMI', 'gender'],
    }

    lab_columns = [name for name in predictors_by_lab if name in df.columns]
    if not lab_columns:
        print("Lab columns not found. Skipping K-NN.")
        return df

    missing_labs = df[lab_columns].isna().any(axis=1).sum()
    if missing_labs == 0:
        print("No missing lab values. Skipping K-NN.")
        return df

    print(f"Rows with missing lab values: {missing_labs} ({missing_labs/len(df)*100:.2f}%)")

    # Process each lab column separately (they may have different relationships)
    for lab_col in lab_columns:
        missing_mask = df[lab_col].isna()
        if not missing_mask.any():
            print(f"No missing values for {lab_col}. Skipping.")
            continue

        print(f"\nProcessing {lab_col}...")

        # Nothing to learn from: K-NN cannot produce a value for this column.
        if missing_mask.all():
            print(f"  {lab_col} has no observed values to learn from. Skipping.")
            continue

        # Target column first, then its available predictors
        lab_features = [lab_col] + predictors_by_lab[lab_col]
        lab_features = [f for f in lab_features if f in df.columns]

        lab_data = df[lab_features].copy()

        # Handle categorical variables
        if 'gender' in lab_data.columns:
            lab_data = pd.get_dummies(lab_data, columns=['gender'], drop_first=True, prefix='gender')

        # By default the imputer leaves features with no observed values out
        # of its output; the scaler's inverse transform would then fail on the
        # column count. Drop such predictors before scaling.
        lab_data = lab_data.dropna(axis=1, how='all')
        lab_cols = lab_data.columns.tolist()

        # Scale features for K-NN
        scaler = StandardScaler()
        lab_data_scaled = scaler.fit_transform(lab_data)

        print(f"  Applying K-NN imputation (k={n_neighbors})...")
        knn_imputer = KNNImputer(
            n_neighbors=n_neighbors,
            weights='distance',
            metric='euclidean'
        )
        lab_data_imputed = knn_imputer.fit_transform(lab_data_scaled)

        # Inverse transform back to the original units
        lab_data_imputed = pd.DataFrame(
            scaler.inverse_transform(lab_data_imputed),
            columns=lab_cols,
            index=lab_data.index,
        )

        # Update only the originally-missing cells of this lab column
        df.loc[missing_mask, lab_col] = lab_data_imputed.loc[missing_mask, lab_col]

        remaining = df[lab_col].isna().sum()
        print(f"  Remaining missing {lab_col}: {remaining}")

    return df

# Apply K-NN for lab values (Step 4) with k=5 and continue with the returned DataFrame
df = apply_knn_labs(df, n_neighbors=5)


## Step 5: Fallback to Group Median (for any remaining missing values)

If K-NN/MICE couldn't impute some values (e.g., all neighbors also missing), fall back to group medians.


In [None]:
def apply_fallback_median(df):
    """Fill any values still missing with progressively coarser medians.

    For each target column that still has gaps, missing cells are filled with,
    in order:

    1. the median of the row's group — (y, age_group, icu_admission) for
       o2sat / BP_systolic / BP_diastolic, (y, age_group, gender) otherwise;
    2. the median of the row's ``y`` class, when the group median is undefined;
    3. the overall column median, for anything still missing.

    Grouping columns that are absent from ``df`` are left out of the key. An
    ``age_group`` column is derived from ``anchor_age`` only when the frame
    does not already have one, and only that derived column is removed again
    before returning; a pre-existing ``age_group`` is preserved.

    Args:
        df: DataFrame to fill. It is not modified; a copy is filled instead.

    Returns:
        A new DataFrame with the fallback medians applied.
    """
    # Work on a copy so the helper column never leaks into the caller's frame.
    df = df.copy()

    # Columns that might still have missing values
    target_columns = [
        'BP_systolic', 'BP_diastolic', 'temperature', 'heart_rate',
        'resp_rate', 'o2sat', 'creatinine', 'blood_wbc', 'BMI'
    ]
    target_columns = [c for c in target_columns if c in df.columns]

    # Derive age_group only when needed, and remember that we did so: only a
    # column created here may be dropped at the end.
    created_age_group = False
    if 'age_group' not in df.columns and 'anchor_age' in df.columns:
        df['age_group'] = pd.cut(
            df['anchor_age'],
            bins=[0, 18, 40, 65, 120],
            labels=['child', 'adult', 'middle_age', 'elderly']
        )
        created_age_group = True

    print("Applying fallback group medians...")

    for col in target_columns:
        if not df[col].isna().any():
            continue

        # Group median by (y, age_group, icu_admission) or (y, age_group, gender)
        if col in ['o2sat', 'BP_systolic', 'BP_diastolic']:
            preferred_key = ['y', 'age_group', 'icu_admission']
        else:
            preferred_key = ['y', 'age_group', 'gender']
        group_key = [k for k in preferred_key if k in df.columns]

        # Both median tables are computed from the column as it is now, i.e.
        # before any fallback value is written into it.
        group_fill = None
        if group_key:
            group_median = df.groupby(group_key, observed=False)[col].median()
            # Left join on the key columns yields one fill value per row
            # (NaN where the row's group has no observed value).
            group_fill = df[group_key].join(
                group_median.rename('_group_median'), on=group_key
            )['_group_median']

        class_fill = None
        if 'y' in df.columns:
            # Vectorized per-class lookup; rows whose y has no median get NaN
            # and fall through to the overall median below.
            class_fill = df['y'].map(df.groupby('y')[col].median())

        filled = df[col]
        if group_fill is not None:
            filled = filled.fillna(group_fill)
        if class_fill is not None:
            filled = filled.fillna(class_fill)

        remaining = filled.isna().sum()
        if remaining > 0:
            print(f"  {col}: {remaining} missing values remain (using overall median)")
            filled = filled.fillna(filled.median())

        df[col] = filled

    # Remove age_group only if it was created here
    if created_age_group:
        df = df.drop(columns=['age_group'])

    print("Fallback medians applied.")
    return df

# Apply fallback (Step 5) and continue with the DataFrame it returns
df = apply_fallback_median(df)


## Step 6: Validation and Summary


In [None]:
# Columns targeted by the imputation steps, limited to those present in df
target_columns = [
    c for c in (
        'BP_systolic', 'BP_diastolic', 'temperature', 'heart_rate',
        'resp_rate', 'o2sat', 'creatinine', 'blood_wbc', 'BMI',
    )
    if c in df.columns
]

print("Final Missing Values Summary:")
print("=" * 50)

# Build the NaN mask once and derive both the counts and the percentages from it
na_flags = df[target_columns].isna()
missing_summary = pd.DataFrame({
    'missing_count': na_flags.sum(),
    'missing_percentage': (na_flags.mean() * 100).round(2)
})
missing_summary = missing_summary.sort_values('missing_percentage', ascending=False)

print(missing_summary)

total_missing = missing_summary['missing_count'].sum()
print(f"\nTotal missing values: {total_missing}")
print(f"Total rows: {len(df)}")

# Final verdict: warn if anything is still missing, otherwise report success
if total_missing != 0:
    print(f"\n⚠ {total_missing} missing values remain.")
else:
    print("\n✓ All target columns successfully imputed!")


## Step 7: Save Results


In [None]:
# Both outputs are written to the silver data layer
silver_dir = PROJECT_ROOT / "data" / "silver"

# Imputed dataset (row index is not written)
output_path = silver_dir / "bronze_missing_values_handled_advanced.csv"
df.to_csv(output_path, index=False)

print(f"Dataset saved to: {output_path}")
print(f"Final shape: {df.shape}")

# Per-column missing-value summary; the index holds the column names, so keep it
comparison_path = silver_dir / "imputation_comparison_summary.csv"
missing_summary.to_csv(comparison_path, index=True)
print(f"Comparison summary saved to: {comparison_path}")


## Notes

1. **Performance**: K-NN and MICE are computationally expensive. For 158K rows, this may take 10-30 minutes.

2. **Memory**: Ensure sufficient RAM (recommended: 8GB+).

3. **Comparison**: Compare results with the baseline approach to evaluate improvement.

4. **Tuning**: Adjust `n_neighbors` for K-NN (default: 5) based on your data characteristics.

5. **Integration**: This notebook can be integrated into the main pipeline or run separately for comparison.
