# Sepsis Prediction Data Analysis & Preprocessing

This notebook analyzes the unified parquet dataset, performs standardization, handles missing values, and creates data splits.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Set plot style: apply seaborn's "whitegrid" theme globally, so it affects
# every figure drawn later in this notebook.
sns.set(style="whitegrid")

## 1. Load Data

In [None]:
# Location of the unified dataset, relative to the notebook's working directory.
data_path = "../data/unified/unified_data.parquet"

# Read the whole Parquet file into memory and report its (rows, columns) shape.
df = pd.read_parquet(data_path)
dataset_shape = df.shape
print(f"Dataset Shape: {dataset_shape}")

# Preview the first rows as the cell's rich output.
df.head()

## 2. Data Analysis Report

In [None]:
# Column Types
print(df.dtypes)

# Unique Patients
n_patients = df['PatientID'].nunique()
print(f"Number of unique patients: {n_patients}")

# Class Balance (SepsisLabel), counted per row.
sepsis_counts = df['SepsisLabel'].value_counts()
print("\nClass Balance (Rows):")
print(sepsis_counts)
# .get(1, 0) instead of [1]: value_counts() only lists labels that occur, so a
# dataset without any positive row would otherwise raise KeyError here.
n_sepsis_rows = sepsis_counts.get(1, 0)
print(f"Sepsis Prevalence (Rows): {n_sepsis_rows / len(df):.2%}")

# Patient-level Class Balance: a patient's label is the maximum SepsisLabel
# over all of that patient's rows.
patient_labels = df.groupby('PatientID')['SepsisLabel'].max()
print("\nClass Balance (Patients):")
print(patient_labels.value_counts())
print(f"Sepsis Prevalence (Patients): {patient_labels.sum() / n_patients:.2%}")

### Missing Values Analysis

In [None]:
# Per-column number of null cells, and the same figure as a percentage of all rows.
n_rows = len(df)
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / n_rows) * 100

# One row per column, most-missing first.
missing_df = (
    pd.DataFrame({'Missing Count': missing_counts, 'Missing %': missing_pct})
    .sort_values(by='Missing %', ascending=False)
)

# Bar chart of missingness; column names are rotated so they stay legible.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=missing_df.index, y=missing_df['Missing %'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title("Percentage of Missing Values per Column")
ax.set_ylabel("%")
plt.show()

# Table view of the 20 columns with the highest missing percentage.
missing_df.head(20)

## 3. Data Splitting
We split by `PatientID`, so every row of a given patient lands in exactly one split and no patient's data leaks between splits. Patients are divided into Train (70%), Validation (15%), and Test (15%).

In [None]:
# Fixed seed so the patient-level split is reproducible across runs.
SPLIT_SEED = 42

# One outcome per patient: the maximum SepsisLabel over that patient's rows.
patient_outcome = df.groupby('PatientID')['SepsisLabel'].max()
patient_ids = patient_outcome.index.to_numpy()

# Stratify on the patient-level outcome so train/val/test keep approximately the
# same share of positive patients; a plain random split can leave the
# smaller splits with a noticeably different prevalence.
# (train_test_split raises ValueError if a class has fewer than 2 patients.)
train_ids, temp_ids = train_test_split(
    patient_ids,
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=patient_outcome.to_numpy(),
)
# The 30% hold-out is halved into validation and test (15% each of the total).
val_ids, test_ids = train_test_split(
    temp_ids,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=patient_outcome.loc[temp_ids].to_numpy(),
)

print(f"Train Patients: {len(train_ids)}")
print(f"Val Patients: {len(val_ids)}")
print(f"Test Patients: {len(test_ids)}")

# No patient may appear in more than one split (this is the leakage guarantee).
assert not set(train_ids) & set(val_ids), "patient overlap between train and val"
assert not set(train_ids) & set(test_ids), "patient overlap between train and test"
assert not set(val_ids) & set(test_ids), "patient overlap between val and test"

# Create masks for splitting
train_mask = df['PatientID'].isin(train_ids)
val_mask = df['PatientID'].isin(val_ids)
test_mask = df['PatientID'].isin(test_ids)

# .copy() so later processing never writes into views of df.
train_df = df[train_mask].copy()
val_df = df[val_mask].copy()
test_df = df[test_mask].copy()

# Every row must be assigned to exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), \
    "some rows were not assigned to a split (missing PatientID?)"

## 4. Standardization & Missingness Handling
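We compute the per-feature mean and standard deviation on the training set ONLY and reuse them to standardize all three splits. For every feature we also add a `<feature>_mask` column (1 = observed, 0 = missing); missing values are then filled with 0, which is the training mean after standardization.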

In [None]:
# Columns that are not standardized: the patient ID, the label, and the four
# columns that preprocess_split passes through unchanged.
non_feature_cols = {'PatientID', 'SepsisLabel', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS'}
feature_cols = [col for col in df.columns if col not in non_feature_cols]
print(f"Feature Columns: {feature_cols}")

# Statistics come from the training rows only.
train_features = train_df[feature_cols]
train_mean = train_features.mean()

# A zero std is replaced by 1.0 so the later division cannot divide by zero.
train_std = train_features.std().replace(0, 1.0)

def preprocess_split(split_df, mean, std, feature_cols, forward_fill=False):
    """Standardize features, impute missing values, and append observation masks.

    Parameters
    ----------
    split_df : pd.DataFrame
        Rows of one split. Must contain 'PatientID', 'ICULOS', 'Unit1', 'Unit2',
        'HospAdmTime', 'SepsisLabel' and every column in ``feature_cols``.
    mean, std : pd.Series
        Per-feature statistics indexed by feature name. Pass the statistics
        computed on the training split for every split.
    feature_cols : list of str
        Columns to standardize and to build masks for.
    forward_fill : bool, default False
        If True, missing feature values are first filled with the previous
        observed value of the same patient (rows are taken in their existing
        order, so they are assumed to be time-ordered within each patient --
        verify against the data). Values still missing afterwards are filled
        with 0. The default keeps the plain fill-with-0 behavior.

    Returns
    -------
    pd.DataFrame
        Columns in this order: 'PatientID', 'ICULOS', 'Unit1', 'Unit2',
        'HospAdmTime', the standardized features, one ``<feature>_mask`` column
        per feature (1 = observed, 0 = missing), and 'SepsisLabel'.
    """
    features = split_df[feature_cols]

    # The mask is taken from the raw values, before any imputation, so it always
    # records what was actually observed (1 = Observed, 0 = Missing).
    mask = features.notna().astype(int)
    mask.columns = [f"{c}_mask" for c in feature_cols]

    if forward_fill:
        # Per-patient forward fill: never carries a value across patients.
        features = split_df.groupby('PatientID')[feature_cols].ffill()

    # Guard against a zero std here as well, so the function is safe even when
    # the caller did not sanitize it; dividing by 0 would produce inf values.
    safe_std = std.replace(0, 1.0)

    # Standardize, then fill the remaining NaNs with 0, which is the mean on
    # the standardized scale.
    scaled_features = ((features - mean) / safe_std).fillna(0)

    # Concatenate: PatientID, Other Cols, Scaled Features, Masks, Label
    result = pd.concat([
        split_df[['PatientID', 'ICULOS', 'Unit1', 'Unit2', 'HospAdmTime']],
        scaled_features,
        mask,
        split_df[['SepsisLabel']]
    ], axis=1)

    return result

# Run the same preprocessing on each split, always with the TRAIN statistics.
raw_splits = {"Train": train_df, "Val": val_df, "Test": test_df}
processed_splits = {}
for split_name, split_frame in raw_splits.items():
    print(f"Processing {split_name}...")
    processed_splits[split_name] = preprocess_split(
        split_frame, train_mean, train_std, feature_cols
    )

# Expose the results under their individual names.
train_processed = processed_splits["Train"]
val_processed = processed_splits["Val"]
test_processed = processed_splits["Test"]

# Preview the processed training rows.
train_processed.head()

## 5. Save Processed Splits

In [None]:
# Destination folder for the processed splits; created if it does not exist yet.
output_dir = "../data/processed_splits"
os.makedirs(output_dir, exist_ok=True)

# Write each processed split to its own Parquet file.
outputs = (
    ("train.parquet", train_processed),
    ("val.parquet", val_processed),
    ("test.parquet", test_processed),
)
for file_name, frame in outputs:
    frame.to_parquet(os.path.join(output_dir, file_name))

print(f"Saved splits to {output_dir}")