In [7]:
"""
Phase 2: Data Splitting, Hybrid Augmentation, and Preparation for Imputation

Description:
This script takes the preprocessed feature matrix from Phase 1 and prepares it 
for model training. It performs three critical steps:
1. Splits the data into training, validation, and test sets.
2. Applies feature scaling (StandardScaler) to normalize the data.
3. Balances the training set using a hybrid SMOTE and CTGAN strategy.

Outputs:
- train_X.csv, train_y.csv: The scaled and balanced training data.
- val_X.csv, val_y.csv: The scaled validation data.
- test_X.csv, test_y.csv: The scaled test data.
- scaler.joblib: The saved scaler object, crucial for consistent scaling.
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
import joblib
from pathlib import Path

# --- Configuration ---
# Updated to match the paths you are using
CONFIG = {
    "INPUT_FILE": "../data/preprocessed/sepsis_feature_matrix.csv",
    "OUTPUT_DIR": Path("../data/processed/"),
    "TARGET_COLUMN": "hospital_expire_flag",
    "ID_COLUMNS": ['subject_id', 'hadm_id', 'stay_id'],
    "TEST_SIZE": 0.2,
    "VALIDATION_SIZE": 0.1,
    "RANDOM_STATE": 42
}

def split_scale_impute_data(df, config):
    """
    Loads, splits, scales, and imputes the data.
    The scaler and imputer are fit ONLY on the training data and saved.
    """
    print("--- 1. Splitting, Scaling, and Imputing Data ---")
    
    X = df.drop(columns=[config["TARGET_COLUMN"]])
    y = df[config["TARGET_COLUMN"]]
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"], stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=config["VALIDATION_SIZE"], random_state=config["RANDOM_STATE"], stratify=y_train_val
    )
    
    id_cols = [col for col in config["ID_COLUMNS"] if col in X_train.columns]
    feature_cols = [col for col in X_train.columns if col not in id_cols]
    
    # --- Feature Scaling ---
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_val_scaled = X_val.copy()
    X_test_scaled = X_test.copy()
    
    X_train_scaled.loc[:, feature_cols] = scaler.fit_transform(X_train[feature_cols])
    X_val_scaled.loc[:, feature_cols] = scaler.transform(X_val[feature_cols])
    X_test_scaled.loc[:, feature_cols] = scaler.transform(X_test[feature_cols])
    print("Scaling complete.")
    
    # --- Imputation ---
    imputer = SimpleImputer(strategy='mean')
    
    X_train_imputed = X_train_scaled.copy()
    X_val_imputed = X_val_scaled.copy()
    X_test_imputed = X_test_scaled.copy()
    
    X_train_imputed.loc[:, feature_cols] = imputer.fit_transform(X_train_scaled[feature_cols])
    X_val_imputed.loc[:, feature_cols] = imputer.transform(X_val_scaled[feature_cols])
    X_test_imputed.loc[:, feature_cols] = imputer.transform(X_test_scaled[feature_cols])
    print("Imputation complete.")
    
    return (X_train_imputed, y_train), (X_val_imputed, y_val), (X_test_imputed, y_test), scaler, imputer

def augment_training_data(X_train_imputed, y_train):
    """
    Applies the hybrid SMOTE and CTGAN augmentation strategy to the now-complete training set.
    """
    print("\n--- 2. Augmenting Training Data ---")
    
    id_cols = [col for col in CONFIG["ID_COLUMNS"] if col in X_train_imputed.columns]
    X_train_features = X_train_imputed.drop(columns=id_cols)
    
    train_df_features = pd.concat([X_train_features, y_train.reset_index(drop=True)], axis=1)
    minority_df = train_df_features[train_df_features[CONFIG["TARGET_COLUMN"]] == 1]
    majority_df = train_df_features[train_df_features[CONFIG["TARGET_COLUMN"]] == 0]
    
    num_to_generate = len(majority_df) - len(minority_df)
    num_from_smote = num_to_generate // 2
    num_from_gan = num_to_generate - num_from_smote

    # --- SMOTE ---
    print("\nApplying SMOTE...")
    smote = SMOTE(sampling_strategy={1: len(minority_df) + num_from_smote}, random_state=CONFIG["RANDOM_STATE"])
    X_smote_aug, y_smote_aug = smote.fit_resample(X_train_features, y_train)
    
    smote_generated_df = pd.concat([pd.DataFrame(X_smote_aug), pd.DataFrame(y_smote_aug)], axis=1).iloc[len(X_train_features):]
    smote_generated_df.columns = train_df_features.columns

    # --- CTGAN (with Metadata Detection Fix) ---
    print("Detecting metadata for CTGAN...")
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=minority_df)

    print("Training CTGAN on original minority data...")
    # Pass the detected metadata object to the synthesizer
    ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=False)
    ctgan.fit(minority_df)
    gan_generated_df = ctgan.sample(num_rows=num_from_gan)
    
    # --- Combine ---
    print("\nCombining original and synthetic data...")
    final_train_df_features = pd.concat([majority_df, minority_df, smote_generated_df, gan_generated_df], ignore_index=True)
    
    final_train_df = final_train_df_features.sample(frac=1, random_state=CONFIG["RANDOM_STATE"]).reset_index(drop=True)
    
    X_train_aug = final_train_df.drop(columns=[CONFIG["TARGET_COLUMN"]])
    y_train_aug = final_train_df[CONFIG["TARGET_COLUMN"]]
    
    print(f"\nFinal augmented training set distribution:\n{y_train_aug.value_counts(normalize=True)}")
    return X_train_aug, y_train_aug

def save_artifacts(train_data, val_data, test_data, scaler, imputer, output_dir):
    """Saves all processed data splits and the scaler/imputer objects to disk."""
    print("\n--- 3. Saving Artifacts ---")
    output_dir.mkdir(exist_ok=True, parents=True)
    
    X_train, y_train = train_data
    X_val, y_val = val_data
    X_test, y_test = test_data
    
    X_train.to_csv(output_dir / "train_X.csv", index=False)
    y_train.to_csv(output_dir / "train_y.csv", index=False)
    X_val.to_csv(output_dir / "val_X.csv", index=False)
    y_val.to_csv(output_dir / "val_y.csv", index=False)
    X_test.to_csv(output_dir / "test_X.csv", index=False)
    y_test.to_csv(output_dir / "test_y.csv", index=False)
    
    joblib.dump(scaler, output_dir / "scaler.joblib")
    joblib.dump(imputer, output_dir / "imputer.joblib")
    
    print(f"Processed data saved to '{output_dir.resolve()}'")

def main():
    """Main execution pipeline for Phase 2."""
    try:
        df = pd.read_csv(CONFIG["INPUT_FILE"])
    except FileNotFoundError:
        print(f"Error: Input file not found at '{CONFIG['INPUT_FILE']}'")
        return
        
    train_data, val_data, test_data, scaler, imputer = split_scale_impute_data(df, CONFIG)
    X_train_aug, y_train_aug = augment_training_data(train_data[0], train_data[1])
    
    save_artifacts((X_train_aug, y_train_aug), val_data, test_data, scaler, imputer, CONFIG["OUTPUT_DIR"])
    
    print("\nPhase 2 processing complete.")

if __name__ == "__main__":
    main()


--- 1. Splitting, Scaling, and Imputing Data ---
Scaling complete.
Imputation complete.

--- 2. Augmenting Training Data ---

Applying SMOTE...
Detecting metadata for CTGAN...
Training CTGAN on original minority data...



Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[ 0.07684979  0.07684979 -1.87512084 ... -0.11834727 -0.11834727
 -1.87512084]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[ 0.89783048 -1.11379601  0.89783048 ...  0.89783048 -1.11379601
 -1.11379601]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[-0.89913552 -0.37861002  1.18296648 ...  0.98776941  0.46724391
  0.59737529]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[ 0.89783048 -1.11379601 -1.11379601 ...  0.89783048  0.89783048
  0.89783048]' 


Combining original and synthetic data...

Final augmented training set distribution:
hospital_expire_flag
1.0    0.5
0.0    0.5
Name: proportion, dtype: float64

--- 3. Saving Artifacts ---
Processed data saved to 'C:\Users\vamsi\cost-effective-diagnosis-using-a-transformer-driven-rl-policy\data\processed'

Phase 2 processing complete.
