In [4]:
"""
Phase 2: Data Splitting, Imputation, and Augmentation for AKI Cohort (v1.5)

Description:
This script takes the preprocessed AKI feature matrix from Phase 1 and prepares it 
for model training.
1. Splits the data into training, validation, and test sets.
2. Applies feature scaling.
3. Imputes missing values using a SimpleImputer.
4. Balances the imputed training set using a hybrid SMOTE and CTGAN strategy.

Outputs:
- All processed files will be saved in a new 'aki' subfolder.
- scaler_aki.joblib & imputer_aki.joblib: Saved objects for the AKI pipeline.
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import joblib
from pathlib import Path

# --- Configuration for AKI Dataset ---
CONFIG = {
    "INPUT_FILE": "../data/preprocessed/aki_feature_matrix.csv",
    "OUTPUT_DIR": Path("../data/processed/aki/"),
    "TARGET_COLUMN": "kdigo_aki",
    "ID_COLUMNS": ['subject_id', 'hadm_id', 'stay_id'],
    "TEST_SIZE": 0.2,
    "VALIDATION_SIZE": 0.1,
    "RANDOM_STATE": 42
}

def split_scale_impute_data(df, config):
    """
    Loads, splits, scales, and imputes the data.
    The scaler and imputer are fit ONLY on the training data and saved.
    """
    print("--- 1. Splitting, Scaling, and Imputing AKI Data ---")
    
    X = df.drop(columns=[config["TARGET_COLUMN"]])
    y = df[config["TARGET_COLUMN"]]
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"], stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=config["VALIDATION_SIZE"], random_state=config["RANDOM_STATE"], stratify=y_train_val
    )
    
    id_cols = [col for col in config["ID_COLUMNS"] if col in X_train.columns]
    feature_cols = [col for col in X_train.columns if col not in id_cols]
    
    # --- Feature Scaling ---
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_val_scaled = X_val.copy()
    X_test_scaled = X_test.copy()
    
    X_train_scaled.loc[:, feature_cols] = scaler.fit_transform(X_train[feature_cols])
    X_val_scaled.loc[:, feature_cols] = scaler.transform(X_val[feature_cols])
    X_test_scaled.loc[:, feature_cols] = scaler.transform(X_test[feature_cols])
    print("Scaling complete.")
    
    # --- Imputation ---
    imputer = SimpleImputer(strategy='mean')
    
    X_train_imputed = X_train_scaled.copy()
    X_val_imputed = X_val_scaled.copy()
    X_test_imputed = X_test_scaled.copy()
    
    X_train_imputed.loc[:, feature_cols] = imputer.fit_transform(X_train_scaled[feature_cols])
    X_val_imputed.loc[:, feature_cols] = imputer.transform(X_val_scaled[feature_cols])
    X_test_imputed.loc[:, feature_cols] = imputer.transform(X_test_scaled[feature_cols])
    print("Imputation complete.")
    
    return (X_train_imputed, y_train), (X_val_imputed, y_val), (X_test_imputed, y_test), scaler, imputer

def augment_training_data(X_train_imputed, y_train):
    """
    Applies the hybrid SMOTE and CTGAN augmentation strategy to the now-complete training set.
    """
    print("\n--- 2. Augmenting AKI Training Data ---")
    
    id_cols = [col for col in CONFIG["ID_COLUMNS"] if col in X_train_imputed.columns]
    X_train_features = X_train_imputed.drop(columns=id_cols)
    
    train_df_features = pd.concat([X_train_features, y_train.reset_index(drop=True)], axis=1)
    minority_df = train_df_features[train_df_features[CONFIG["TARGET_COLUMN"]] == 1]
    majority_df = train_df_features[train_df_features[CONFIG["TARGET_COLUMN"]] == 0]
    
    num_to_generate = len(majority_df) - len(minority_df)
    num_from_smote = num_to_generate // 2
    num_from_gan = num_to_generate - num_from_smote

    # --- SMOTE ---
    print(f"Original training distribution:\n{y_train.value_counts(normalize=True)}")
    print("\nApplying SMOTE...")
    explicit_strategy = {
        0: len(majority_df), 
        1: len(minority_df) + num_from_smote
    }
    smote = SMOTE(sampling_strategy=explicit_strategy, random_state=CONFIG["RANDOM_STATE"])
    X_smote_aug, y_smote_aug = smote.fit_resample(X_train_features, y_train)
    
    smote_generated_df = pd.concat([pd.DataFrame(X_smote_aug), pd.DataFrame(y_smote_aug)], axis=1).iloc[len(X_train_features):]
    smote_generated_df.columns = train_df_features.columns

    # --- CTGAN ---
    print("Detecting metadata for CTGAN...")
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=minority_df)

    print("Training CTGAN on original minority data...")
    ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=False)
    ctgan.fit(minority_df)
    gan_generated_df = ctgan.sample(num_rows=num_from_gan)
    
    # --- Combine ---
    print("\nCombining original and synthetic data...")
    final_train_df_features = pd.concat([majority_df, minority_df, smote_generated_df, gan_generated_df], ignore_index=True)
    
    final_train_df = final_train_df_features.sample(frac=1, random_state=CONFIG["RANDOM_STATE"]).reset_index(drop=True)
    
    X_train_aug = final_train_df.drop(columns=[CONFIG["TARGET_COLUMN"]])
    y_train_aug = final_train_df[CONFIG["TARGET_COLUMN"]]
    
    print(f"\nFinal augmented training set distribution:\n{y_train_aug.value_counts(normalize=True)}")
    return X_train_aug, y_train_aug

def save_artifacts(train_data, val_data, test_data, scaler, imputer, output_dir):
    """Saves all processed data splits and the scaler/imputer objects to disk."""
    print("\n--- 3. Saving Artifacts ---")
    output_dir.mkdir(exist_ok=True, parents=True)
    
    X_train, y_train = train_data
    X_val, y_val = val_data
    X_test, y_test = test_data
    
    X_train.to_csv(output_dir / "train_X.csv", index=False)
    y_train.to_csv(output_dir / "train_y.csv", index=False)
    X_val.to_csv(output_dir / "val_X.csv", index=False)
    y_val.to_csv(output_dir / "val_y.csv", index=False)
    X_test.to_csv(output_dir / "test_X.csv", index=False)
    y_test.to_csv(output_dir / "test_y.csv", index=False)
    
    joblib.dump(scaler, output_dir / "scaler_aki.joblib")
    joblib.dump(imputer, output_dir / "imputer_aki.joblib")
    
    print(f"Processed AKI data saved to '{output_dir.resolve()}'")

def main():
    """Main execution pipeline for Phase 2 for the AKI cohort."""
    try:
        df = pd.read_csv(CONFIG["INPUT_FILE"])
    except FileNotFoundError:
        print(f"Error: Input file not found at '{CONFIG['INPUT_FILE']}'")
        return
        
    train_data, val_data, test_data, scaler, imputer = split_scale_impute_data(df, CONFIG)
    X_train_aug, y_train_aug = augment_training_data(train_data[0], train_data[1])
    
    save_artifacts((X_train_aug, y_train_aug), val_data, test_data, scaler, imputer, CONFIG["OUTPUT_DIR"])
    
    print("\nPhase 2 processing for AKI complete.")

if __name__ == "__main__":
    main()


--- 1. Splitting, Scaling, and Imputing AKI Data ---
Scaling complete.
Imputation complete.

--- 2. Augmenting AKI Training Data ---
Original training distribution:
kdigo_aki
1    0.748931
0    0.251069
Name: proportion, dtype: float64

Applying SMOTE...



Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[-0.19932743  0.45290586  0.12678921 ...  1.36603248  1.17036249
  1.23558582]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[-1.18474485  0.8440636   0.8440636  ... -1.18474485  0.8440636
 -1.18474485]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[ 0.71379918 -1.96035734  0.51812919 ...  0.38768253  1.36603248
 -0.32977409]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[ 0.8440636   0.8440636  -1.18474485 ...  0.8440636  -1.18474485
 -1.18474485]' h

ValueError: With over-sampling methods, the number of samples in a class should be greater or equal to the original number of samples. Originally, there is 14718 samples and 9826 samples are asked.