In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

## New Label Creation

In [2]:
# Load a single pipe-separated (.psv) file from training set A to inspect its
# columns, and display its last rows.
df = pd.read_csv("../data/training_setA/p000001.psv", sep="|")
df.tail()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
49,76.0,,,85.0,53.0,,17.0,,,,...,,,,83.14,0,,,-0.03,50,0
50,81.0,99.0,,99.0,51.0,,17.0,,,,...,,,,83.14,0,,,-0.03,51,0
51,85.0,100.0,,103.0,48.0,,26.0,,,,...,,,,83.14,0,,,-0.03,52,0
52,86.0,93.0,,87.0,44.0,,22.0,,,,...,,,,83.14,0,,,-0.03,53,0
53,84.0,85.0,,78.0,44.0,,18.0,,,,...,,,,83.14,0,,,-0.03,54,0


In [3]:
def make_time_to_sepsis_label(
    df: pd.DataFrame,
    time_col: str = "ICULOS",
    label_col: str = "SepsisLabel",
    group_col: str | None = None,
    horizon_hours: float = 24.0,
    out_col: str = "NewLabel",
) -> pd.DataFrame:
    """
    Create a time-to-sepsis label:
      - label = 1 if current sepsisLabel == 1
      - label = 0 if there is no future sepsis onset after this point
      - otherwise label = max(1 - (t_sepsis - t)/horizon_hours, 0)
    
    Supports multiple sepsis episodes per patient (e.g., 0->1, back to 0, later 0->1 again).

    Args:
        df: DataFrame containing at least [time_col, label_col], and optionally group_col.
        time_col: Name of the time variable (in hours).
        label_col: Name of the binary sepsis indicator column (0/1).
        group_col: Optional patient identifier column. If provided, labeling is done per patient.
        horizon_hours: Linear ramp window before sepsis onset (default 24).
        out_col: Name of output column to write.

    Returns:
        A DataFrame with a new float column `out_col` added.
    """
    def _label_one_patient(g: pd.DataFrame) -> pd.DataFrame:
        g = g.sort_values(time_col).copy()

        # Current sepsis state (ensure 0/1 integers)
        sepsis = g[label_col].fillna(0).astype(int)

        # Onset indices are 0->1 transitions
        onset = (sepsis.shift(fill_value=0) == 0) & (sepsis == 1)

        # For each row, get the next onset time at/after the current row
        #   - Where onset is True, keep its time; else NaN
        #   - bfill gives the nearest future onset time (or NaN if none ahead)
        next_onset_time = g[time_col].where(onset).bfill()

        # Hours until the next onset (NaN if none ahead)
        delta_hours = (next_onset_time - g[time_col]).astype(float)

        # Linear ramp up to 1 in the last `horizon_hours` before onset
        ramp = 1.0 - (delta_hours / horizon_hours)
        ramp = ramp.clip(lower=0.0, upper=1.0)

        # If there is no future onset, label = 0
        ramp[next_onset_time.isna()] = 0.0

        # If currently septic, label = 1 (overrides ramp)
        new_label = ramp.where(sepsis == 0, 1.0)

        g[out_col] = new_label.astype(float)
        # Restore original order of rows
        return g.sort_index()

    if group_col is None:
        return _label_one_patient(df)
    else:
        return df.groupby(group_col, group_keys=False).apply(_label_one_patient)


In [None]:
def build_training_set_labels(
    input_dir: str,
    output_csv: str,
    horizon_hours: float = 24.0,
    make_label_fn=None,  # pass your make_time_to_sepsis_label here if not in scope
) -> pd.DataFrame:
    """
    Parse all .psv files under `input_dir`, create NewLabel using `make_label_fn`,
    and concatenate into one CSV at `output_csv`.

    Assumes each file is a single patient's time series with columns including ICULOS and SepsisLabel.
    Adds a PatientID column (derived from the filename: e.g., 'p000001') as the first column.

    Args:
        input_dir: Folder that directly contains the pipe-delimited .psv files.
        output_csv: Destination CSV path; its parent directory is created if missing.
        horizon_hours: Ramp window forwarded to `make_label_fn`.
        make_label_fn: Callable invoked with the keyword arguments
            df, time_col, label_col, group_col, horizon_hours, out_col. It may return
            a Series of labels or a DataFrame (ideally containing "NewLabel").

    Returns:
        The concatenated DataFrame (files in sorted filename order, rows sorted by ICULOS
        within each file).

    Raises:
        ValueError: `make_label_fn` is None, or a file lacks ICULOS/SepsisLabel.
        FileNotFoundError: no .psv files exist in `input_dir`.
        TypeError: `make_label_fn` returned something other than a Series or DataFrame.
    """
    if make_label_fn is None:
        raise ValueError("Please provide `make_time_to_sepsis_label` via `make_label_fn`.")

    paths = sorted(glob.glob(os.path.join(input_dir, "*.psv")))
    if not paths:
        raise FileNotFoundError(f"No .psv files found in: {input_dir}")

    required_cols = ("ICULOS", "SepsisLabel")
    all_dfs: list[pd.DataFrame] = []

    for path in paths:
        patient_id = os.path.splitext(os.path.basename(path))[0]  # e.g. 'p000001'

        df = pd.read_csv(path, sep="|")
        if any(col not in df.columns for col in required_cols):
            raise ValueError(f"File {path} missing required columns ICULOS/SepsisLabel.")

        # Standardize types & order: unparseable times become NaN, unparseable or
        # missing sepsis flags become 0.
        df["ICULOS"] = pd.to_numeric(df["ICULOS"], errors="coerce")
        df["SepsisLabel"] = pd.to_numeric(df["SepsisLabel"], errors="coerce").fillna(0).astype(int)
        df = df.sort_values("ICULOS").reset_index(drop=True)

        # Add PatientID so we can track provenance after concatenation
        df.insert(0, "PatientID", patient_id)

        # Create labels (single patient at a time, hence group_col=None)
        labeled = make_label_fn(
            df=df,
            time_col="ICULOS",
            label_col="SepsisLabel",
            group_col=None,
            horizon_hours=horizon_hours,
            out_col="NewLabel",
        )

        # Be tolerant of the return type (Series vs DataFrame)
        if isinstance(labeled, pd.Series):
            df["NewLabel"] = labeled.values
        elif isinstance(labeled, pd.DataFrame):
            if "NewLabel" not in labeled.columns:
                # Fallback: take the first column as the label
                df["NewLabel"] = labeled.iloc[:, 0].values
            elif set(df.index) == set(labeled.index) and len(labeled.columns) > 1:
                # The full frame came back with NewLabel attached: use it as is
                df = labeled
            else:
                # Only the label column (or a differently indexed frame) came back
                df["NewLabel"] = labeled["NewLabel"].values
        else:
            raise TypeError(
                "make_time_to_sepsis_label must return a pandas Series or DataFrame."
            )

        all_dfs.append(df)

    big_df = pd.concat(all_dfs, ignore_index=True)

    # `to_csv` does not create missing folders, so make sure the target exists.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    big_df.to_csv(output_csv, index=False)
    return big_df


# Label every patient file of training set A with a 24-hour ramp and write the
# combined table to one CSV.
big = build_training_set_labels(
    input_dir="../data/training_setA",
    output_csv="../data/agg/training_setA_labeled.csv",
    horizon_hours=24.0,
    make_label_fn=make_time_to_sepsis_label,
)

# Same for training set B.
bigB = build_training_set_labels(
    input_dir="../data/training_setB",
    output_csv="../data/agg/training_setB_labeled.csv",
    horizon_hours=24.0,
    make_label_fn=make_time_to_sepsis_label,
)


In [6]:
# Reload the two labeled CSVs from disk and stack them (A first, then B) under
# a fresh 0..n-1 index.
df_cleaned = pd.concat([pd.read_csv("../data/agg/training_setA_labeled.csv"),
                       pd.read_csv("../data/agg/training_setB_labeled.csv")],
                       ignore_index=True)
df_cleaned

Unnamed: 0,PatientID,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,NewLabel
0,p000001,,,,,,,,,,...,,,83.14,0,,,-0.03,1,0,0.0
1,p000001,97.0,95.0,,98.0,75.33,,19.0,,,...,,,83.14,0,,,-0.03,2,0,0.0
2,p000001,89.0,99.0,,122.0,86.00,,22.0,,,...,,,83.14,0,,,-0.03,3,0,0.0
3,p000001,90.0,95.0,,,,,30.0,,24.0,...,,,83.14,0,,,-0.03,4,0,0.0
4,p000001,103.0,88.5,,122.0,91.33,,24.5,,,...,,,83.14,0,,,-0.03,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,p120000,80.0,96.0,,115.0,87.00,65.0,15.0,,,...,,,62.00,0,,,0.00,31,0,0.0
1552206,p120000,74.0,97.0,,114.0,83.00,67.0,15.0,,,...,,,62.00,0,,,0.00,32,0,0.0
1552207,p120000,78.0,98.0,,110.0,83.00,69.0,15.0,,,...,,,62.00,0,,,0.00,33,0,0.0
1552208,p120000,82.0,99.0,36.6,124.0,91.00,71.0,16.0,,,...,,,62.00,0,,,0.00,34,0,0.0


In [10]:
# Column-wise means over the combined A+B table for every column except
# PatientID, ICULOS, SepsisLabel and NewLabel (pandas skips NaNs by default).
df_means = df_cleaned.drop(columns=["PatientID", "ICULOS", "SepsisLabel", "NewLabel"]).mean()
df_means

HR                   84.581443
O2Sat                97.193955
Temp                 36.977228
SBP                 123.750465
MAP                  82.400100
DBP                  63.830556
Resp                 18.726498
EtCO2                32.957657
BaseExcess           -0.689919
HCO3                 24.075481
FiO2                  0.554839
pH                    7.378934
PaCO2                41.021869
SaO2                 92.654188
AST                 260.223385
BUN                  23.915452
Alkalinephos        102.483661
Calcium               7.557531
Chloride            105.827910
Creatinine            1.510699
Bilirubin_direct      1.836177
Glucose             136.932283
Lactate               2.646666
Magnesium             2.051450
Phosphate             3.544238
Potassium             4.135528
Bilirubin_total       2.114059
TroponinI             8.290099
Hct                  30.794093
Hgb                  10.430833
PTT                  41.231193
WBC                  11.446405
Fibrinog

In [7]:
# Column-wise medians over the combined A+B table for every column except
# PatientID, ICULOS, SepsisLabel and NewLabel (pandas skips NaNs by default).
# These medians are what the imputation call further down passes as `avg_series`.
df_medians = df_cleaned.drop(columns=["PatientID", "ICULOS", "SepsisLabel", "NewLabel"]).median()
df_medians

HR                   83.500
O2Sat                98.000
Temp                 37.000
SBP                 121.000
MAP                  80.000
DBP                  62.000
Resp                 18.000
EtCO2                33.000
BaseExcess            0.000
HCO3                 24.000
FiO2                  0.500
pH                    7.380
PaCO2                40.000
SaO2                 97.000
AST                  41.000
BUN                  17.000
Alkalinephos         74.000
Calcium               8.300
Chloride            106.000
Creatinine            0.940
Bilirubin_direct      0.445
Glucose             127.000
Lactate               1.800
Magnesium             2.000
Phosphate             3.300
Potassium             4.100
Bilirubin_total       0.900
TroponinI             0.300
Hct                  30.300
Hgb                  10.300
PTT                  32.400
WBC                  10.300
Fibrinogen          250.000
Platelets           181.000
Age                  64.000
Gender              

## Remove NaNs (impute missing values)

In [None]:
import os
import glob
from typing import Iterable, Optional
import pandas as pd
import numpy as np

def impute_psv_folder(
    input_dir: str,
    output_dir: str,
    avg_series: Optional[pd.Series],
    exclude_cols: Optional[Iterable[str]] = ("PatientID", "ICULOS", "SepsisLabel"),
    sort_by_time: bool = True,
    time_col: str = "ICULOS",
) -> list[str]:
    """
    For each .psv in `input_dir`, create an imputed version in `output_dir`
    with the same filename. Imputation is column-wise and follows:
        1) forward-fill (most recent past value),
        2) THEN (only if still NaN) backward-fill (nearest future value),
        3) THEN (only if still NaN) fill with `avg_series[col]` if available (and not NaN).

    Note that step 2 copies values from later rows into earlier ones, so leading
    gaps are filled with information from the future of the same file.

    Columns excluded from imputation: by default ["PatientID", "ICULOS", "SepsisLabel"].
    If a listed exclude column doesn't exist in a file, it's ignored.

    Parameters
    ----------
    input_dir : folder containing .psv files (pipe-delimited)
    output_dir : destination folder (created if missing); if it is the same folder
        as `input_dir`, the source files are overwritten
    avg_series : pd.Series whose index holds column names and whose values are the
        global fallback statistic for each column (e.g. mean or median); None
        disables step 3
    exclude_cols : columns to skip from imputation (a single name may be given as a
        plain string)
    sort_by_time : if True and `time_col` exists, sort ascending before imputation
    time_col : time column name (default "ICULOS")

    Returns
    -------
    list[str]
        Paths of written files, in sorted filename order.

    Raises
    ------
    FileNotFoundError
        If `input_dir` contains no .psv files.
    """
    os.makedirs(output_dir, exist_ok=True)
    paths = sorted(glob.glob(os.path.join(input_dir, "*.psv")))
    if not paths:
        raise FileNotFoundError(f"No .psv files found under: {input_dir}")

    # Normalize the exclude list; a bare string would otherwise be split into characters.
    if isinstance(exclude_cols, str):
        exclude_cols = [exclude_cols]
    exclude_set = set(exclude_cols or [])

    # An empty Series makes every fallback lookup return NaN, i.e. "no fallback".
    fallback = avg_series if avg_series is not None else pd.Series(dtype=float)

    written = []
    for path in paths:
        df = pd.read_csv(path, sep="|")

        # Chronological order is required for forward/backward fill to be meaningful;
        # mergesort is stable, so rows with equal times keep their file order.
        if sort_by_time and time_col in df.columns:
            df = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)

        cols_to_impute = [c for c in df.columns if c not in exclude_set]

        if cols_to_impute:
            # 1) + 2) ffill then bfill: each step only touches cells that are still
            #    NaN, so values filled forward are never overwritten.
            df[cols_to_impute] = df[cols_to_impute].ffill().bfill()

            # 3) Columns that were entirely empty in this file fall back to the
            #    global statistic; without a usable one they stay NaN (by spec).
            for col in cols_to_impute:
                fill_value = fallback.get(col, np.nan)
                if pd.notna(fill_value) and df[col].isna().any():
                    df[col] = df[col].fillna(fill_value)

        # Write with identical filename into output_dir
        out_path = os.path.join(output_dir, os.path.basename(path))
        df.to_csv(out_path, sep="|", index=False)
        written.append(out_path)

    return written



# Write imputed copies of both raw training sets into sibling "*_imputed"
# folders; PatientID, ICULOS and SepsisLabel are left untouched.
# NOTE(review): `avg_series` is documented as global means, but the medians
# (`df_medians`) are passed here, while the saved output of the following cell
# shows mean-valued fills (e.g. DBP 63.830556, EtCO2 32.957657). Confirm which
# statistic is intended and re-run so that outputs match the code.
imputed_paths = impute_psv_folder(
    input_dir="../data/training_setA",
    output_dir="../data/training_setA_imputed",
    avg_series=df_medians,
    exclude_cols=("PatientID", "ICULOS", "SepsisLabel"),
)
imputed_pathsB = impute_psv_folder(
    input_dir="../data/training_setB",
    output_dir="../data/training_setB_imputed",
    avg_series=df_medians,
    exclude_cols=("PatientID", "ICULOS", "SepsisLabel"),
)


In [None]:
# Repeat the labeling step for the imputed files: build one labeled CSV per
# imputed training set (written directly under ../data).
big_imputed = build_training_set_labels(
    input_dir="../data/training_setA_imputed",
    output_csv="../data/training_setA_imputed_labeled.csv",
    horizon_hours=24.0,
    make_label_fn=make_time_to_sepsis_label,
)
bigB_imputed = build_training_set_labels(
    input_dir="../data/training_setB_imputed",
    output_csv="../data/training_setB_imputed_labeled.csv",     
    horizon_hours=24.0,
    make_label_fn=make_time_to_sepsis_label,
)

# Reload both labeled CSVs and stack them (A first, then B) under a fresh index.
df_cleaned_imputed = pd.concat([pd.read_csv("../data/training_setA_imputed_labeled.csv"),
                       pd.read_csv("../data/training_setB_imputed_labeled.csv")],
                       ignore_index=True)

df_cleaned_imputed

Unnamed: 0,PatientID,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,NewLabel
0,p000001,97.0,95.0,36.11,98.0,75.33,63.830556,19.0,32.957657,24.000000,...,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,1,0,0.0
1,p000001,97.0,95.0,36.11,98.0,75.33,63.830556,19.0,32.957657,24.000000,...,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,2,0,0.0
2,p000001,89.0,99.0,36.11,122.0,86.00,63.830556,22.0,32.957657,24.000000,...,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,3,0,0.0
3,p000001,90.0,95.0,36.11,122.0,86.00,63.830556,30.0,32.957657,24.000000,...,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,4,0,0.0
4,p000001,103.0,88.5,36.11,122.0,91.33,63.830556,24.5,32.957657,24.000000,...,287.385706,317.0,83.14,0,0.496571,0.503429,-0.03,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,p120000,80.0,96.0,36.40,115.0,87.00,65.000000,15.0,32.957657,-0.689919,...,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,31,0,0.0
1552206,p120000,74.0,97.0,36.40,114.0,83.00,67.000000,15.0,32.957657,-0.689919,...,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,32,0,0.0
1552207,p120000,78.0,98.0,36.40,110.0,83.00,69.000000,15.0,32.957657,-0.689919,...,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,33,0,0.0
1552208,p120000,82.0,99.0,36.60,124.0,91.00,71.000000,16.0,32.957657,-0.689919,...,287.385706,216.0,62.00,0,0.496571,0.503429,0.00,34,0,0.0


In [14]:
# Persist the combined imputed + labeled table (A and B) as a single CSV,
# without writing the row index.
df_cleaned_imputed.to_csv("../data/cleaned_imputed_labeled.csv", index=False)