In [None]:
# %%
# --- Phase 1: Load Data ---
import pandas as pd
import numpy as np

# Read the raw clinical table of the VitalDB dataset
# (patient demographics, surgical information, and outcomes).
# A missing file is reported and replaced by an empty frame so that the
# emptiness check guarding the cohort-building step can skip it cleanly.
try:
    df_clinical = pd.read_csv('../data/raw/clinical_data.csv')
except FileNotFoundError:
    print("Error: 'clinical_data.csv' not found.")
    print("Please ensure the file is located in a '../data/raw/' directory relative to the notebook.")
    df_clinical = pd.DataFrame()  # empty placeholder keeps the name defined
else:
    print("Successfully loaded clinical_data.csv")
    print(f"Initial dataset size: {df_clinical.shape[0]} cases")

# %%
# --- Phase 2: Create Death Cohort and Label ---

if not df_clinical.empty:
    print("Creating cohort for in-hospital mortality prediction...")

    # The project plan restricts the analysis to one surgical population:
    # keep only the rows whose 'department' is exactly 'General surgery'.
    # .copy() detaches the cohort so adding a column does not touch df_clinical.
    df_cohort = df_clinical.loc[df_clinical['department'].eq('General surgery')].copy()
    print(f"\nFiltered for General Surgery: {len(df_cohort)} cases remaining.")

    # Target: 'death_inhosp' (in-hospital mortality), already coded 1 = death,
    # 0 = survival. It is mirrored into an integer 'death_label' column for
    # clarity and consistency.
    df_cohort['death_label'] = df_cohort['death_inhosp'].astype(int)

    # Report how rare the outcome is in this cohort (mean of a 0/1 column, in %).
    death_incidence = df_cohort['death_label'].mean() * 100
    print(f"Final cohort size: {len(df_cohort)} cases")
    print(f"In-hospital mortality incidence: {death_incidence:.2f}%")

    # Persist the labeled cohort for future reference.
    # This corresponds to the 'final_cohort_with_labels.csv' from the original script.
    # A failed write is reported but does not stop the notebook.
    try:
        df_cohort.to_csv('../data/processed/final_cohort_with_death_label.csv', index=False)
    except OSError as e:
        print(f"\nCould not save the file. Please check permissions for the 'data/processed/' directory.")
        print(e)
    else:
        print("\nSuccessfully saved the final labeled cohort for mortality.")

    display(df_cohort.head())

# %%
# --- Phase 3: Split Data ---
from sklearn.model_selection import train_test_split

if 'df_cohort' in locals() and not df_cohort.empty:
    print("Splitting data into training and testing sets...")

    # Define features (X) and target (y).
    # Dropped columns fall into three groups:
    #   * the label and the column it was derived from,
    #   * identifiers that must not be used as features,
    #   * outcome leakers, i.e. values only known after the operation.
    #
    # 'icu_days' and 'dis' are listed explicitly because the feature preview
    # printed after imputation still contained both columns: the original
    # list only named 'los_postop' / 'los_icu'.
    # NOTE(review): 'dis' is presumably the discharge time relative to case
    # start and 'icu_days' the post-operative ICU stay - confirm against the
    # VitalDB data dictionary.
    # NOTE(review): other end-of-case timestamps (caseend, aneend, opend) and
    # the intraop_* columns are still kept although the output files are
    # named "preop" - confirm whether that is intended.
    features_to_drop = [
        'death_label', 'death_inhosp',
        'los_postop', 'los_icu',
        'icu_days', 'dis',
        'subjectid' # caseid is kept for now for mapping purposes
    ]

    # Only drop what is actually present so a schema without some of these
    # columns does not raise a KeyError.
    existing_features_to_drop = [col for col in features_to_drop if col in df_cohort.columns]

    X = df_cohort.drop(columns=existing_features_to_drop)
    y = df_cohort['death_label']

    # Perform an 80/20 split.
    # 'stratify=y' keeps the proportion of mortality cases the same in the
    # training and testing sets; random_state pins the split for re-runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

    print(f"\nData split complete.")
    print(f"Training set size: {len(X_train)} cases")
    print(f"Testing set size:  {len(X_test)} cases")
    print(f"Mortality incidence in training set: {y_train.mean()*100:.2f}%")
    print(f"Mortality incidence in testing set:  {y_test.mean()*100:.2f}%")

# %%
# --- Phase 4: Handle Missing Data ---

if 'X_train' in locals():
    print("Handling missing data by imputing with -99...")

    # Set the identifier aside so it takes no part in the imputation,
    # then work on the remaining feature columns of each split.
    X_train_caseids, X_test_caseids = X_train['caseid'], X_test['caseid']
    X_train_features, X_test_features = (
        split.drop(columns=['caseid']) for split in (X_train, X_test)
    )

    # Every missing value is replaced by the marker value -99.
    X_train_imputed_features, X_test_imputed_features = (
        features.fillna(-99) for features in (X_train_features, X_test_features)
    )

    # --- RE-INTEGRATE CASEID ---
    # Both pieces get a fresh 0..n-1 index before being glued side by side,
    # with 'caseid' as the first column.
    X_train_imputed = pd.concat(
        [
            X_train_caseids.reset_index(drop=True),
            X_train_imputed_features.reset_index(drop=True),
        ],
        axis=1,
    )
    X_test_imputed = pd.concat(
        [
            X_test_caseids.reset_index(drop=True),
            X_test_imputed_features.reset_index(drop=True),
        ],
        axis=1,
    )

    print("\nImputation complete.")
    display(X_train_imputed.head())

# %%
# --- Phase 5: Handle Outliers ---

if 'X_train_imputed' in locals():
    print("Handling outliers...")

    # Every column except the 'caseid' identifier is handed to the outlier step.
    continuous_cols = [name for name in X_train_imputed.columns if name != 'caseid']

    # --- ADDED: Sanity check before outlier handling ---
    print("\n--- Sanity Check: Before Outlier Handling ---")
    # Summary statistics for a few key variables (only those actually present),
    # shown so they can be compared with the same table after clipping.
    cols_to_check = ['age', 'bmi', 'preop_cr', 'preop_hb']
    display(
        X_train_imputed.loc[
            :, [name for name in cols_to_check if name in X_train_imputed.columns]
        ].describe()
    )


    # Winsorise feature columns using thresholds learned from the training set only.
    def handle_outliers(df, train_df, cols, sentinel=-99):
        """Clip columns of ``df`` to the 1st-99th percentile range seen in ``train_df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to clean. It is not modified; a processed copy is returned.
        train_df : pandas.DataFrame
            Frame the clipping thresholds are learned from. Pass the training
            split for both the training and the test call so no information
            flows from test data into the thresholds.
        cols : iterable of str
            Candidate columns. A column is processed only if it exists in both
            frames; 'caseid' is never processed.
        sentinel : number or None, default -99
            Marker used for imputed missing values. Rows equal to it are left
            out of the percentile computation and are not clipped, so the
            marker neither distorts the thresholds nor gets overwritten by
            the lower threshold. Pass ``None`` to treat every value as data.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` whose processed columns are numeric. Entries that
            cannot be parsed as numbers become NaN and are left for the
            caller to refill.
        """
        df_processed = df.copy()
        # The identifier is excluded without requiring it to exist, so frames
        # that carry no 'caseid' column are accepted as well.
        train_columns = set(train_df.columns) - {'caseid'}

        for col in cols:
            if col not in df_processed.columns or col not in train_columns:
                continue

            train_values = pd.to_numeric(train_df[col], errors='coerce')
            values = pd.to_numeric(df_processed[col], errors='coerce')
            # NOTE(review): a column that holds only text is coerced to NaN
            # here and, after the caller's fillna, becomes a constant marker
            # column - confirm that discarding such columns is intended.
            df_processed[col] = values

            # Thresholds come from genuinely observed training values only.
            observed = train_values.dropna()
            if sentinel is not None:
                observed = observed[observed != sentinel]
            if observed.empty:
                continue

            low_p_1, high_p_99 = np.percentile(observed, [1, 99])

            # Clip to the 1st and 99th percentile range, then put the missing
            # marker back wherever it was so "missing" stays distinguishable.
            clipped = values.clip(lower=low_p_1, upper=high_p_99)
            if sentinel is not None:
                clipped = clipped.where(values != sentinel, sentinel)
            df_processed[col] = clipped
        return df_processed

    # Clean both splits against the training split, then restore the -99
    # marker for any NaNs that pd.to_numeric may have introduced.
    X_train_cleaned = handle_outliers(X_train_imputed, X_train_imputed, continuous_cols).fillna(-99)
    X_test_cleaned = handle_outliers(X_test_imputed, X_train_imputed, continuous_cols).fillna(-99)

    print("\nOutlier handling complete.")

    # --- ADDED: Sanity check after outlier handling ---
    print("\n--- Sanity Check: After Outlier Handling ---")
    # Same summary table as before the step, so the effect of clipping is visible.
    display(
        X_train_cleaned.loc[
            :, [name for name in cols_to_check if name in X_train_cleaned.columns]
        ].describe()
    )


# %%
# --- Phase 6: Save Final Datasets ---

if 'X_train_cleaned' in locals():
    # Write the cleaned feature tables (which still carry 'caseid' for mapping)
    # followed by the matching label series needed for training and evaluation.
    # All files are written without the index, in the order listed.
    try:
        for frame, destination in (
            (X_train_cleaned, '../data/processed/preop_train_cleaned_death_cohort.csv'),
            (X_test_cleaned, '../data/processed/preop_test_cleaned_death_cohort.csv'),
            (y_train, '../data/processed/preop_train_labels_death_cohort.csv'),
            (y_test, '../data/processed/preop_test_labels_death_cohort.csv'),
        ):
            frame.to_csv(destination, index=False)

        print("\nCleaned preoperative data (with caseids) and labels saved to 'data/processed/'.")
    except OSError as e:
        # A failed write is reported instead of aborting the notebook.
        print(f"\nCould not save the final files. Please check permissions for the 'data/processed/' directory.")
        print(e)


Successfully loaded clinical_data.csv
Initial dataset size: 6388 cases
Creating cohort for in-hospital mortality prediction...

Filtered for General Surgery: 4930 cases remaining.
Final cohort size: 4930 cases
In-hospital mortality incidence: 0.89%

Successfully saved the final labeled cohort for mortality.


Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,death_label
0,1,5955,0,11542,-552,10848.0,1668,10368,-236220,627780,...,120,0.0,100,70,0,10,0,0,0,0
1,2,2487,0,15741,-1039,14921.0,1721,14621,-221160,1506840,...,150,0.0,0,100,0,20,0,0,0,0
2,3,2861,0,4394,-590,4210.0,1090,3010,-218640,40560,...,0,0.0,0,50,0,0,0,0,0,0
3,4,1903,0,20990,-778,20222.0,2522,17822,-201120,576480,...,80,0.0,100,100,0,50,0,0,0,0
4,5,4416,0,21531,-1009,22391.0,2591,20291,-67560,3734040,...,0,0.0,0,160,0,10,900,0,2100,0


Splitting data into training and testing sets...

Data split complete.
Training set size: 3944 cases
Testing set size:  986 cases
Mortality incidence in training set: 0.89%
Mortality incidence in testing set:  0.91%
Handling missing data by imputing with -99...

Imputation complete.


Unnamed: 0,caseid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,icu_days,...,intraop_colloid,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca
0,5275,0,8966,-817,8723.0,2123,8123,-221640,383160,0,...,0,100,0.0,100,55,0,0,0,0,0
1,6356,0,22116,-1191,21609.0,2109,20409,-201540,835260,0,...,0,100,0.0,100,140,0,15,0,0,0
2,5227,0,8217,187,8167.0,3367,7267,-200580,404220,0,...,0,100,0.0,50,40,0,0,0,0,0
3,5074,0,6029,72,6492.0,1092,5945,-133860,211740,0,...,0,120,0.0,100,50,0,10,0,0,0
4,4751,0,4030,-2172,4128.0,1728,3228,-201960,143640,0,...,0,0,0.0,0,50,0,40,0,0,0


Handling outliers...

Outlier handling complete.


Unnamed: 0,caseid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,icu_days,...,intraop_colloid,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca
0,5275,0,8966.0,-817.0,8723.0,2123.0,8123.0,-221640.0,383160.0,0,...,0,100,0.0,100,55,0,0,0,0,0
1,6356,0,22116.0,-1191.0,21609.0,2109.0,20409.0,-201540.0,835260.0,0,...,0,100,0.0,100,140,0,15,0,0,0
2,5227,0,8217.0,187.0,8167.0,3367.0,7267.0,-200580.0,404220.0,0,...,0,100,0.0,50,40,0,0,0,0,0
3,5074,0,6029.0,72.0,6492.0,1092.0,5945.0,-133860.0,211740.0,0,...,0,120,0.0,100,50,0,10,0,0,0
4,4751,0,4030.0,-2172.0,4128.0,1728.0,3228.0,-201960.0,143640.0,0,...,0,0,0.0,0,50,0,40,0,0,0



Cleaned preoperative data (with caseids) and labels saved to 'data/processed/'.
