# Data Wrangler Script

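Cleans the preoperative cohort data, pivots the long-format waveform feature file into a wide format, and merges the two into a single master feature file that models can be trained on.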

In [6]:
import pandas as pd
from pathlib import Path
import numpy as np
import sys
from sklearn.model_selection import train_test_split

# Banner shown at the top of the run.
print("--- Part 1: Comprehensive Feature Wrangler ---")
print("This script will clean preoperative data and merge it with waveform features.")

# --- 1. Configuration and Paths ---

# Seed handed to train_test_split further down.
RANDOM_STATE = 42

# Name of the label column shared by the cohort and waveform files.
TARGET_COLUMN = 'aki_label'

# --- Path Configuration ---
# The project root is the parent of the current working directory; the
# processed-data folder is created on demand if it does not exist yet.
try:
    PROJECT_ROOT = Path.cwd().parent
    PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
except Exception as path_error:
    print(f"Error setting up paths: {path_error}")
    print("Please ensure you are running this from a subdirectory of your project root.")
    sys.exit(1)

# Inputs: the cohort table (from cohort generation) and the long-format
# waveform feature table (from catch22 prep).
COHORT_FILE = PROCESSED_DIR / 'aki_pleth_ecg_co2_awp.csv'
LONG_WAVEFORM_FILE = PROCESSED_DIR / 'aki_pleth_ecg_co2_awp_inf.csv'

# Output: the merged wide-format feature table.
WIDE_FEATURES_FILE = PROCESSED_DIR / 'aki_features_master_wide.csv'

print(f"Project Root:        {PROJECT_ROOT}")
print(f"Cohort (Preop) file: {COHORT_FILE}")
print(f"Waveform (long) file:{LONG_WAVEFORM_FILE}")
print(f"Output (wide) file:  {WIDE_FEATURES_FILE}")

# --- 2. Feature Definitions ---

# Columns pulled from the cohort file, in the order they are selected.
PREOP_FEATURES_TO_SELECT = [
    'caseid',
    TARGET_COLUMN,
    'age',
    'sex',
    'emop',
    'department',
    'bmi',
    'approach',
    'preop_htn',
    'preop_dm',
    'preop_ecg',
    'preop_pft',
    'preop_hb',
    'preop_plt',
    'preop_pt',
    'preop_aptt',
    'preop_na',
    'preop_k',
    'preop_gluc',
    'preop_alb',
    'preop_ast',
    'preop_alt',
    'preop_bun',
    'preop_cr',
    'preop_hco3',
]

# Continuous columns that go through outlier handling.
# ('height' and 'weight' are not in the select list, so they are absent here.)
CONTINUOUS_COLS = [
    'age',
    'bmi',
    'preop_hb',
    'preop_plt',
    'preop_pt',
    'preop_aptt',
    'preop_na',
    'preop_k',
    'preop_gluc',
    'preop_alb',
    'preop_ast',
    'preop_alt',
    'preop_bun',
    'preop_cr',
    'preop_hco3',
]

# Categorical columns used for category merging and one-hot encoding.
CATEGORICAL_COLS = [
    'sex',
    'emop',
    'department',
    'approach',
    'preop_htn',
    'preop_dm',
    'preop_ecg',
    'preop_pft',
]

# Column-name prefixes that identify waveform-derived features
# (kept for later logic that separates preop from waveform features).
WAVEFORM_PREFIXES = ['SNUADC_PLETH', 'SNUADC_ECG_II', 'Primus_CO2', 'Primus_AWP']

print(f"Defined {len(PREOP_FEATURES_TO_SELECT)} preop, {len(CONTINUOUS_COLS)} continuous, {len(CATEGORICAL_COLS)} categorical features.")

# --- 3. Outlier Handling Function ---
# This is the exact function from your plan
def handle_outliers(df, train_df, continuous_cols, random_state=None):
    """
    Handles outliers based on training set percentiles.
    Replaces outliers with random values from a plausible range.

    For every column in ``continuous_cols`` that exists in ``df``:

    * values below the training 1st percentile are replaced with draws from
      ``uniform(0.5th percentile, 5th percentile)``;
    * values above the training 99.5th percentile are replaced with draws
      from ``uniform(95th percentile, 99.5th percentile)``.

    All percentiles are computed from ``train_df`` only, so the same
    thresholds can be applied to a held-out frame without leakage.

    Args:
        df: DataFrame to clean. It is not modified; a copy is returned.
        train_df: DataFrame the percentile thresholds are computed from.
            Must contain every processed column (a missing column raises
            ``KeyError``).
        continuous_cols: Iterable of column names to process. Names that are
            not present in ``df`` are skipped.
        random_state: Optional seed for the replacement draws. ``None``
            (the default) draws from NumPy's global random state, so an
            external ``np.random.seed(...)`` still controls the result.

    Returns:
        A copy of ``df`` in which every processed column has been coerced to
        float (non-numeric entries become NaN) and outliers are replaced.
        NaN entries are never treated as outliers.
    """
    # `np.random` (the module) and `RandomState` both expose `.uniform`, so
    # the default path keeps drawing from the global state as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    df_processed = df.copy()
    for col in continuous_cols:
        if col not in df_processed.columns:
            continue

        # Coerce to float up front: replacements are floats, and writing
        # floats into an integer column is deprecated in recent pandas.
        col_numeric = pd.to_numeric(df_processed[col], errors='coerce').astype(float)
        df_processed[col] = col_numeric

        # Thresholds come ONLY from the valid numeric training values.
        train_values = (
            pd.to_numeric(train_df[col], errors='coerce')
            .dropna()
            .to_numpy(dtype=float)
        )
        if train_values.size == 0:
            print(f"Warning: No valid data for '{col}' in training set. Skipping outlier handling for this column.")
            continue

        low_p_0_5, low_p_1, low_p_5 = np.percentile(train_values, [0.5, 1, 5])
        high_p_95, high_p_99_5 = np.percentile(train_values, [95, 99.5])

        # NOTE(review): the low cut-off is the 1st percentile while the high
        # cut-off is the 99.5th; kept as-is from the original plan.
        # Positional boolean masks (rather than index labels) keep the
        # assignment correct even when `df` has duplicate index labels.
        low_mask = (col_numeric < low_p_1).to_numpy()
        high_mask = (col_numeric > high_p_99_5).to_numpy()

        values = col_numeric.to_numpy(dtype=float, copy=True)
        # Draw low replacements first, then high, matching the original
        # order of consumption from the random stream.
        values[low_mask] = rng.uniform(low_p_0_5, low_p_5, size=int(low_mask.sum()))
        values[high_mask] = rng.uniform(high_p_95, high_p_99_5, size=int(high_mask.sum()))

        df_processed[col] = values
    return df_processed

print("Outlier handling function defined.")

# --- 4. Process Preoperative Data ---
print("\n--- Processing Preoperative Data ---")

# 4.1 Load and Select Preop Data
print(f"Loading cohort file from {COHORT_FILE}...")
try:
    cohort_df = pd.read_csv(COHORT_FILE)
    print(f"Successfully loaded. Shape: {cohort_df.shape}")
except FileNotFoundError:
    print(f"ERROR: Cohort file not found at {COHORT_FILE}")
    sys.exit(1)
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    sys.exit(1)

# Select only the columns we need
try:
    preop_df = cohort_df[PREOP_FEATURES_TO_SELECT].copy()
    print(f"Selected {len(PREOP_FEATURES_TO_SELECT)} columns. Shape: {preop_df.shape}")
except KeyError as e:
    print(f"ERROR: A column is missing from {COHORT_FILE}. Missing key: {e}")
    print("Please check PREOP_FEATURES_TO_SELECT against the cohort file columns.")
    sys.exit(1)

# 4.2 Split Data (to prevent leakage)
# All statistics used below (rare-department list, category levels, outlier
# percentiles) are derived from the training split only.
print("Splitting preoperative data into train/test sets for safe processing...")
X_preop = preop_df.drop(columns=[TARGET_COLUMN, 'caseid'])
y_preop = preop_df[TARGET_COLUMN]
caseid_series = preop_df['caseid']

X_train, X_test, y_train, y_test, caseid_train, caseid_test = train_test_split(
    X_preop, y_preop, caseid_series,
    test_size=0.2, # Same split as the trainer script
    random_state=RANDOM_STATE,
    stratify=y_preop
)
# Columns are assigned on these frames below; take explicit copies so pandas
# does not treat them as slices of X_preop (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# 4.3 Handle Categorical Data
print("Processing categorical variables...")
# Merge 'department' counts < 30 (counted on the training split)
dept_counts = X_train['department'].value_counts()
depts_to_merge = dept_counts[dept_counts < 30].index.tolist()

if depts_to_merge:
    print(f"Merging {len(depts_to_merge)} departments into 'other': {depts_to_merge}")
    X_train['department'] = X_train['department'].replace(depts_to_merge, 'other')
    X_test['department'] = X_test['department'].replace(depts_to_merge, 'other')

# One-Hot Encoding
print("Applying one-hot encoding...")
# Encode both splits against the TRAINING category levels. Running
# get_dummies(drop_first=True) independently on each split can drop a
# different "first" level in the test split when a training category is
# absent there, which silently mis-encodes those rows after alignment.
train_categoricals = X_train[CATEGORICAL_COLS].copy()
test_categoricals = X_test[CATEGORICAL_COLS].copy()
for col in CATEGORICAL_COLS:
    # get_dummies passes numeric columns through unchanged, so only the
    # non-numeric ones need shared category levels.
    if pd.api.types.is_numeric_dtype(train_categoricals[col]):
        continue
    train_levels = pd.Categorical(train_categoricals[col])
    unseen_mask = test_categoricals[col].notna() & ~test_categoricals[col].isin(train_levels.categories)
    if unseen_mask.any():
        # Values never seen in training cannot be encoded; they become
        # missing and therefore all-zero across this column's dummies.
        print(f"Warning: {int(unseen_mask.sum())} test rows have '{col}' values not seen in training; encoded as all zeros.")
    train_categoricals[col] = train_levels
    test_categoricals[col] = pd.Categorical(test_categoricals[col], categories=train_levels.categories)

X_train_dummies = pd.get_dummies(train_categoricals, drop_first=True, dtype=int)
X_test_dummies = pd.get_dummies(test_categoricals, drop_first=True, dtype=int)

# Align columns - this is critical (kept as a safety net; with shared
# category levels the two frames should already have identical columns)
X_train_aligned, X_test_aligned = X_train_dummies.align(
    X_test_dummies, join='left', axis=1, fill_value=0
)
print(f"Created {X_train_aligned.shape[1]} one-hot encoded features.")

# Drop original categorical columns and add encoded ones
X_train = X_train.drop(columns=CATEGORICAL_COLS)
X_test = X_test.drop(columns=CATEGORICAL_COLS)

X_train = pd.concat([X_train, X_train_aligned], axis=1)
X_test = pd.concat([X_test, X_test_aligned], axis=1)

# 4.4 Handle Outliers
print("Handling outliers on continuous variables...")
# Outlier replacement draws random values; seed NumPy's global generator so
# the generated feature file is reproducible from run to run.
np.random.seed(RANDOM_STATE)
X_train_cleaned = handle_outliers(X_train, X_train, CONTINUOUS_COLS)
X_test_cleaned = handle_outliers(X_test, X_train, CONTINUOUS_COLS) # Use X_train stats

# 4.5 Handle Missing Data (Imputation)
print("Imputing all remaining missing values with -99...")
X_train_imputed = X_train_cleaned.fillna(-99)
X_test_imputed = X_test_cleaned.fillna(-99)

print(f"Missing values in train after imputation: {X_train_imputed.isnull().sum().sum()}")
print(f"Missing values in test after imputation: {X_test_imputed.isnull().sum().sum()}")

# 4.6 Recombine Processed Preop Data
print("Recombining processed preoperative data...")
X_preop_processed = pd.concat([X_train_imputed, X_test_imputed])

# Re-attach IDs and labels. The IDs/labels are concatenated in the same
# train-then-test order as the features, so after both sides get a fresh
# 0..n-1 index the rows line up one-to-one.
preop_final_df = pd.DataFrame({
    'caseid': pd.concat([caseid_train, caseid_test]),
    TARGET_COLUMN: pd.concat([y_train, y_test])
}).reset_index(drop=True)

# Align indexes for concat
X_preop_processed = X_preop_processed.reset_index(drop=True)
preop_final_df = pd.concat([preop_final_df, X_preop_processed], axis=1)

print(f"Final processed preoperative data shape: {preop_final_df.shape}")

# --- 5. Process Waveform Data ---
print("\n--- Processing Waveform Data ---")

# 5.1 Read the long-format waveform feature table.
print(f"Loading long-format data from {LONG_WAVEFORM_FILE}...")
try:
    long_df = pd.read_csv(LONG_WAVEFORM_FILE)
except FileNotFoundError:
    print(f"ERROR: Input file not found at {LONG_WAVEFORM_FILE}")
    print("Please ensure the preparation script has been run and the file exists.")
    sys.exit(1)
else:
    print(f"Successfully loaded. Shape: {long_df.shape}")

# 5.2 Reshape long -> wide: one row per (caseid, label), one column per
# (feature, waveform) pair.
print("Pivoting waveform data from long to wide format...")
id_cols = ['caseid', TARGET_COLUMN]
pivot_col = 'waveform'
# Every column that is neither an identifier nor the pivot key is a feature.
non_feature_cols = id_cols + [pivot_col]
feature_cols = [name for name in long_df.columns if name not in non_feature_cols]

if not feature_cols:
    print("ERROR: No waveform feature columns found. Check your input CSV.")
    raise ValueError("No feature columns detected to pivot.")

waveform_wide_df = pd.pivot_table(
    long_df,
    index=id_cols,
    columns=pivot_col,
    values=feature_cols,
)
print(f"Pivot complete. Shape before flattening: {waveform_wide_df.shape}")

# 5.3 Collapse the two-level (feature, waveform) column labels into single
# "<waveform>_<feature>" names; slashes in waveform names (e.g. in
# 'SNUADC/PLETH') become underscores.
print("Flattening and cleaning column names...")
new_cols = [
    f"{waveform_name.replace('/', '_')}_{feature_name}"
    for feature_name, waveform_name in waveform_wide_df.columns
]

waveform_wide_df.columns = new_cols
waveform_wide_df = waveform_wide_df.reset_index()
print(f"Column names flattened. Shape after reset_index: {waveform_wide_df.shape}")

# 5.4 Fill the gaps left by the pivot (missing windows) with 0, counting them
# first so the number can be reported.
print("Imputing missing waveform values (from missing windows) with 0...")
nan_count_before = waveform_wide_df.isna().sum().sum()
waveform_wide_df = waveform_wide_df.fillna(0)
print(f"Total NaN values filled: {nan_count_before}")

# --- 6. Final Merge and Save ---
print("\n--- Merging Processed Data ---")
print(f"Merging waveform data (shape: {waveform_wide_df.shape}) with preop data (shape: {preop_final_df.shape})...")

# Left merge: every row of the waveform table is kept and the cleaned
# preoperative columns are attached where caseid and label both match.
merge_keys = ['caseid', TARGET_COLUMN]
master_df = waveform_wide_df.merge(preop_final_df, on=merge_keys, how='left')

# Both inputs were fully imputed beforehand, so any NaN present now comes
# from the merge (e.g. a caseid in the waveform file but not in the preop
# file, which shouldn't happen if the cohort file is the source).
merge_nan_count = master_df.isnull().sum().sum()
if merge_nan_count > 0:
    print(f"WARNING: Merge introduced {merge_nan_count} NaN values. This may indicate a mismatch.")
    print("Imputing these with -99 (assuming they are missing preop features)...")
    master_df = master_df.fillna(-99)

print(f"Final master DataFrame shape: {master_df.shape}")

# --- 7. Save Master File ---
print(f"\nSaving new master wide-format file to {WIDE_FEATURES_FILE}...")
master_df.to_csv(WIDE_FEATURES_FILE, index=False)

print("---")
print("✅ Comprehensive data wrangler script complete.")
print("\n--- Final Output Sample (first 7 columns) ---")
preview = master_df.iloc[:, :7].head()
print(preview)



--- Part 1: Comprehensive Feature Wrangler ---
This script will clean preoperative data and merge it with waveform features.
Project Root:        d:\Projects\aki_prediction_project
Cohort (Preop) file: d:\Projects\aki_prediction_project\data\processed\aki_pleth_ecg_co2_awp.csv
Waveform (long) file:d:\Projects\aki_prediction_project\data\processed\aki_pleth_ecg_co2_awp_inf.csv
Output (wide) file:  d:\Projects\aki_prediction_project\data\processed\aki_features_master_wide.csv
Defined 25 preop, 15 continuous, 8 categorical features.
Outlier handling function defined.

--- Processing Preoperative Data ---
Loading cohort file from d:\Projects\aki_prediction_project\data\processed\aki_pleth_ecg_co2_awp.csv...
Successfully loaded. Shape: (3462, 75)
Selected 25 columns. Shape: (3462, 25)
Splitting preoperative data into train/test sets for safe processing...
Train set shape: (2769, 23), Test set shape: (693, 23)
Processing categorical variables...
Applying one-hot encoding...
Created 34 one-ho