In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def build_pipeline(df, target="SepsisLabel", missing_threshold=0.85):
    """
    Build and fit the preprocessing pipeline on the given dataframe.

    Feature columns whose fraction of missing values exceeds
    ``missing_threshold`` are removed. Remaining float64/int64 columns are
    median-imputed and standardised; the columns ``Gender``, ``Unit1`` and
    ``Unit2`` (those that are still present) are imputed with the most
    frequent value and one-hot encoded. Columns that fall in neither group
    are not passed through by the column transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature columns and the ``target`` column.
    target : str
        Name of the label column. It is never treated as a feature and is
        never removed by the missing-value filter.
    missing_threshold : float
        Feature columns with a missing fraction strictly greater than this
        value are dropped before fitting.

    Returns
    -------
    tuple
        ``(pipeline, X_preprocessed_df, y, kept_columns)`` where
        ``pipeline`` is the fitted pipeline, ``X_preprocessed_df`` is the
        transformed feature matrix as a DataFrame (fresh default index,
        columns named by the preprocessor), ``y`` is ``df[target]`` with the
        index of ``df``, and ``kept_columns`` is the column index of the
        reduced dataframe (target included).

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in dataframe")

    # 1. Drop highly missing features. The target is excluded from the
    #    candidates: a sparsely populated label must not vanish here,
    #    otherwise the later lookups of `target` fail with KeyError.
    missing_frac = df.isnull().mean()
    features_to_drop = [
        col
        for col in missing_frac[missing_frac > missing_threshold].index
        if col != target
    ]
    df_reduced = df.drop(columns=features_to_drop)

    # 2. Separate numeric and categorical features.
    #    Only categorical candidates that survived step 1 are kept.
    cat_features = [
        c for c in ("Gender", "Unit1", "Unit2")
        if c in df_reduced.columns and c != target
    ]
    # Filter by name instead of `.drop(columns=[target])` so a target with a
    # non-float64/int64 dtype (absent from this selection) does not raise.
    numeric_columns = df_reduced.select_dtypes(include=["float64", "int64"]).columns
    num_features = [
        c for c in numeric_columns
        if c != target and c not in cat_features
    ]

    # 3. Define transformers
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # 4. Column transformer.
    #    sparse_threshold=0 forces a dense result; the one-hot encoder emits
    #    sparse blocks and the stacked output would otherwise be a sparse
    #    matrix whenever its density is low, which the DataFrame
    #    construction below does not handle.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_features),
            ("cat", categorical_transformer, cat_features)
        ],
        sparse_threshold=0
    )

    # 5. Pipeline
    pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

    # 6. Fit + transform pipeline
    X = df_reduced.drop(columns=[target])
    y = df_reduced[target]
    X_preprocessed = pipeline.fit_transform(X)

    # Get feature names after preprocessing
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

    # Convert to DataFrame
    X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

    return pipeline, X_preprocessed_df, y, df_reduced.columns

def apply_pipeline(df, pipeline, target="SepsisLabel"):
    """
    Run an already-fitted pipeline over a dataframe.

    The ``target`` column is split off and handed back unchanged as ``y``;
    every other column is passed to ``pipeline.transform`` and the result is
    wrapped in a DataFrame whose columns are the output feature names
    reported by the pipeline's ``'preprocessor'`` step.
    Returns: preprocessed X, y
    """
    # Split features from the label; the input dataframe is left untouched.
    features = df.drop(columns=[target])
    labels = df[target]

    # Transform first, then ask the preprocessor how it named its outputs.
    transformed = pipeline.transform(features)
    preprocessor = pipeline.named_steps['preprocessor']
    transformed_frame = pd.DataFrame(
        transformed,
        columns=preprocessor.get_feature_names_out(),
    )

    return transformed_frame, labels

def main(input_folder, output_folder, target="SepsisLabel"):
    """
    Preprocess every ``.psv`` file of a folder and write the results out.

    The preprocessing pipeline is fitted on a single file: the first one in
    sorted filename order. The retained columns and the imputation/scaling
    statistics therefore come from that file alone. The fitted pipeline is
    then applied to every file (the first one included), and each result is
    written, with the target column appended, under the same filename in
    ``output_folder`` using ``|`` as separator.

    Parameters
    ----------
    input_folder : str
        Directory containing the pipe-separated ``.psv`` input files.
    output_folder : str
        Directory receiving the processed files; created if missing.
    target : str
        Name of the label column present in every file.

    Returns
    -------
    None
        Returns early, after printing a message, when the folder holds no
        ``.psv`` file.
    """
    # Create output folder if not exists
    os.makedirs(output_folder, exist_ok=True)

    # List all PSV files in input folder. os.listdir gives no ordering
    # guarantee, so sort: otherwise the file chosen to fit the pipeline
    # (and with it every output) would depend on the filesystem.
    psv_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.psv'))
    if not psv_files:
        print("No PSV files found in the input folder.")
        return

    # Load first file to build and fit pipeline
    first_file_path = os.path.join(input_folder, psv_files[0])
    df_first = pd.read_csv(first_file_path, sep='|')

    # Build pipeline using first file (assuming it's representative)
    pipeline, _, _, _ = build_pipeline(df_first, target=target)

    # Process each file and save processed data
    for filename in psv_files:
        print(f"Processing {filename}...")
        file_path = os.path.join(input_folder, filename)
        df = pd.read_csv(file_path, sep='|')

        # Apply pipeline
        X_preprocessed, y = apply_pipeline(df, pipeline, target=target)

        # Combine processed features with target column; the target's index
        # is reset so it lines up with the freshly indexed feature frame.
        df_preprocessed = pd.concat([X_preprocessed, y.reset_index(drop=True)], axis=1)

        # Save processed file
        output_file_path = os.path.join(output_folder, filename)
        df_preprocessed.to_csv(output_file_path, sep='|', index=False)
        print(f"Saved processed file to: {output_file_path}")

    print("All files processed successfully.")

# Example usage: adjust the folder names below to match your data layout
if __name__ == "__main__":
    # Input/output folder pair for each dataset
    training_input, training_output = "training", "processed_training"
    training_setB_input, training_setB_output = (
        "training_setB",
        "processed_training_setB",
    )

    # First dataset: the training folder
    print("Processing training folder...")
    main(training_input, training_output)

    # Second dataset: the training_setB folder
    print("Processing training_setB folder...")
    main(training_setB_input, training_setB_output)


Processing training folder...
Processing p000001.psv...
Saved processed file to: processed_training\p000001.psv
Processing p000002.psv...
Saved processed file to: processed_training\p000002.psv
Processing p000003.psv...
Saved processed file to: processed_training\p000003.psv
Processing p000004.psv...
Saved processed file to: processed_training\p000004.psv
Processing p000005.psv...
Saved processed file to: processed_training\p000005.psv
Processing p000006.psv...
Saved processed file to: processed_training\p000006.psv
Processing p000007.psv...
Saved processed file to: processed_training\p000007.psv
Processing p000008.psv...
Saved processed file to: processed_training\p000008.psv
Processing p000009.psv...
Saved processed file to: processed_training\p000009.psv
Processing p000010.psv...
Saved processed file to: processed_training\p000010.psv
Processing p000011.psv...
Saved processed file to: processed_training\p000011.psv
Processing p000012.psv...
Saved processed file to: processed_trainin