In [1]:
import pandas as pd

# Read the labelled cohort that the previous notebook wrote to disk.
df = pd.read_csv('../data/processed/final_cohort_with_labels.csv')

# Quick sanity check: row count plus a peek at the first rows.
print("Successfully loaded the final labeled cohort.")
print(f"Cohort size: {df.shape[0]} cases")
display(df.head(5))

Successfully loaded the final labeled cohort.
Cohort size: 2566 cases


Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca,aki_label
0,3594,1,0,27769,-575,26725.0,2125,26725,-216840,1079160,...,120,0.0,0,110,0,0,0,0,0,0
1,6198,3,0,10013,-314,9886.0,1786,8986,-219300,471900,...,70,0.0,100,60,0,0,0,0,300,0
2,3417,4,0,16177,-447,15813.0,1833,15333,-215940,1598460,...,0,0.0,0,90,0,0,0,0,0,0
3,734,5,0,20889,-76,20624.0,4724,19424,-114060,490740,...,0,0.0,0,95,0,0,0,0,0,0
4,1580,6,0,13925,-1646,13594.0,2194,13294,-219180,558420,...,90,0.0,0,110,0,10,130,0,300,0


In [3]:
from sklearn.model_selection import train_test_split

# --- Phase 4: Split Data ---

print("Splitting data into training and testing sets...")

# Target (y) is the AKI label; every other column of 'df' is a feature (X).
y = df['aki_label']
X = df.drop(columns='aki_label')

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
# Stratifying on y keeps the share of AKI-positive cases equal in both
# partitions, which matters because the positive class is rare.
# NOTE(review): rows are split per case; if one subjectid can own several
# cases, the same patient may end up in both partitions - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

print("\nData split complete.")
print(f"Training set size: {X_train.shape[0]} cases")
print(f"Testing set size:  {X_test.shape[0]} cases")
print(f"AKI incidence in training set: {100 * y_train.mean():.2f}%")
print(f"AKI incidence in testing set:  {100 * y_test.mean():.2f}%")

Splitting data into training and testing sets...

Data split complete.
Training set size: 2052 cases
Testing set size:  514 cases
AKI incidence in training set: 6.48%
AKI incidence in testing set:  6.42%


In [4]:
# --- Phase 4: Handle Missing Data ---

print("Handling missing data by imputing with -99...")

# Count the missing cells in each split before anything is changed.
print(f"Missing values in X_train before imputation: {X_train.isna().sum().sum()}")
print(f"Missing values in X_test before imputation:  {X_test.isna().sum().sum()}")

# The project plan prescribes the constant -99 as the missing-value marker.
# fillna returns new frames, so X_train / X_test themselves stay untouched.
X_train_imputed = X_train.fillna(value=-99)
X_test_imputed = X_test.fillna(value=-99)

# Both counts must now be zero.
print(f"\nMissing values in X_train after imputation: {X_train_imputed.isna().sum().sum()}")
print(f"Missing values in X_test after imputation:  {X_test_imputed.isna().sum().sum()}")

print("\nImputation complete.")
display(X_train_imputed.head(5))

Handling missing data by imputing with -99...
Missing values in X_train before imputation: 25366
Missing values in X_test before imputation:  6341

Missing values in X_train after imputation: 0
Missing values in X_test after imputation:  0

Imputation complete.


  X_test_imputed = X_test.fillna(-99)


Unnamed: 0,caseid,subjectid,casestart,caseend,anestart,aneend,opstart,opend,adm,dis,...,intraop_colloid,intraop_ppf,intraop_mdz,intraop_ftn,intraop_rocu,intraop_vecu,intraop_eph,intraop_phe,intraop_epi,intraop_ca
1162,4331,2785,0,8695,-579,8241.0,741,7941,-1347720,380280,...,0,100,0.0,50,90,0,5,50,0,300
358,3253,841,0,9507,-189,9411.0,1611,8511,-221280,556320,...,0,0,0.0,0,80,0,0,0,0,0
10,4320,19,0,18754,-491,19069.0,2869,17869,-903720,2897880,...,0,0,0.0,0,100,0,35,0,0,300
2473,1401,5882,0,12786,-2273,11887.0,1087,11887,-202860,747540,...,500,150,0.0,100,90,0,15,0,0,0
1386,357,3294,0,6281,-2062,6518.0,1118,5318,-220260,125340,...,0,100,0.0,0,60,0,0,90,0,0


In [6]:
import numpy as np
import pandas as pd

# --- Phase 4: Handle Outliers (Corrected) ---
print("Handling outliers...")

# Continuous variables screened for outliers (names taken from the data dictionary).
continuous_cols = [
    'age',
    'height',
    'weight',
    'bmi',
    'preop_hb',
    'preop_plt',
    'preop_pt',
    'preop_aptt',
    'preop_na',
    'preop_k',
    'preop_gluc',
    'preop_alb',
    'preop_ast',
    'preop_alt',
    'preop_bun',
    'preop_cr',
]

# Outlier handling from the project plan, with robust type conversion and
# awareness of the missing-value sentinel used by the imputation step.
def handle_outliers(df, train_df, continuous_cols, missing_value=-99, random_state=None):
    """Replace extreme values in continuous columns with random in-range draws.

    All thresholds are percentiles of ``train_df`` only, so the function can be
    applied to the training frame and to held-out data without the held-out
    data influencing the cut-offs.

    For each name in ``continuous_cols`` that is a column of both frames:

    * the column is coerced to float (non-numeric entries become NaN);
    * values below the training 1st percentile are replaced with draws from
      Uniform(p0.5, p5);
    * values above the training 99.5th percentile are replaced with draws from
      Uniform(p95, p99.5).

    NaN cells and cells equal to ``missing_value`` are treated as "missing":
    they are excluded from the percentile computation and are never replaced.
    Without this, a sentinel such as -99 drags the lower percentiles down to
    the sentinel itself and can be overwritten with a random value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a copy is returned.
    train_df : pandas.DataFrame
        Frame the percentile thresholds are computed from.
    continuous_cols : iterable of str
        Candidate column names. Names missing from either frame are skipped.
    missing_value : scalar or None, default -99
        Sentinel that marks imputed/missing cells. Pass ``None`` to treat
        every numeric value as real data.
    random_state : int or None, default None
        Seed for the replacement draws. ``None`` uses NumPy's global
        ``np.random`` stream, so ``np.random.seed`` still controls it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every processed column has float dtype.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    df_processed = df.copy()

    for col in continuous_cols:
        # A column must exist on both sides: in df to be cleaned, in train_df
        # to derive thresholds from.
        if col not in df_processed.columns or col not in train_df.columns:
            continue

        # errors='coerce' turns non-numeric strings into NaN. Casting to float
        # up front lets the float replacements be written without relying on
        # implicit dtype upcasting of integer columns.
        train_values = pd.to_numeric(train_df[col], errors='coerce').astype(float)
        values = pd.to_numeric(df_processed[col], errors='coerce').astype(float)
        df_processed[col] = values

        # Separate real measurements from missing markers.
        is_real = values.notna()
        if missing_value is not None:
            is_real &= values != missing_value
            train_values = train_values[train_values != missing_value]
        train_values = train_values.dropna()

        # No usable training data for this column: no thresholds can be derived.
        if train_values.empty:
            continue

        p0_5, p1, p5, p95, p99_5 = np.percentile(
            train_values.to_numpy(), [0.5, 1, 5, 95, 99.5]
        )

        # Boolean masks (rather than index labels) stay correct even when the
        # frame's index contains duplicate labels.
        low_mask = is_real & (values < p1)
        high_mask = is_real & (values > p99_5)

        n_low = int(low_mask.sum())
        n_high = int(high_mask.sum())
        if n_low:
            df_processed.loc[low_mask, col] = rng.uniform(p0_5, p5, size=n_low)
        if n_high:
            df_processed.loc[high_mask, col] = rng.uniform(p95, p99_5, size=n_high)

    return df_processed

# handle_outliers draws its replacement values from NumPy's global random
# stream. Seed it so the cleaned frames - and the CSV files written below -
# are identical on every Restart & Run All.
np.random.seed(42)

# Clean both splits. Thresholds always come from the training set (second
# argument) so the test set never influences the cut-offs.
X_train_cleaned = handle_outliers(X_train_imputed, X_train_imputed, continuous_cols)
X_test_cleaned = handle_outliers(X_test_imputed, X_train_imputed, continuous_cols)

# Numeric coercion inside handle_outliers can turn non-numeric strings into
# NaN; map those to the same -99 sentinel used by the imputation step.
# Reassigning (instead of inplace=True) keeps the cell idempotent.
X_train_cleaned = X_train_cleaned.fillna(-99)
X_test_cleaned = X_test_cleaned.fillna(-99)

# Save the cleaned preoperative data to the processed folder
X_train_cleaned.to_csv('../data/processed/preop_train_cleaned.csv', index=False)
X_test_cleaned.to_csv('../data/processed/preop_test_cleaned.csv', index=False)

print("\nOutlier handling complete.")
print("Cleaned preoperative data saved to 'data/processed/'.")

# Display the summary statistics of a column to see the effect
print("\nExample: 'bmi' column statistics after outlier handling:")
print(X_train_cleaned['bmi'].describe())

Handling outliers...

Outlier handling complete.
Cleaned preoperative data saved to 'data/processed/'.

Example: 'bmi' column statistics after outlier handling:
count    2052.000000
mean       22.954648
std         3.350702
min        15.031774
25%        20.700000
50%        22.750000
75%        25.000000
max        34.914081
Name: bmi, dtype: float64
