## Data pipeline for mortality decompensation prediction.

## Imports and Globals

In [14]:
import os
import random
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from cyclops.plotter import plot_timeline, set_bars_color, setup_plot
from cyclops.processors.aggregate import Aggregator, tabular_as_aggregated
from cyclops.processors.cleaning import (
    combine_events,
    convert_to_events,
    normalize_events,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    DISCHARGE_DISPOSITION,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    HOSPITAL_ID,
    RESTRICT_TIMESTAMP,
    SEX,
    TIMESTEP,
    TRIAGE_LEVEL,
)
from cyclops.processors.constants import (
    ALL,
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    FIRST,
    LAST,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    SMH,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.processors.statics import compute_statics
from cyclops.processors.string_ops import replace_if_string_match, to_lower
from cyclops.processors.util import (
    create_indicator_variables,
    gather_columns,
    pivot_aggregated_events_to_features,
)
from cyclops.query import gemini
from cyclops.utils.file import load_dataframe, save_dataframe, yield_dataframes
from notebooks.usecases.gemini.mortality_decompensation.constants import (
    ENCOUNTERS_FILE,
    QUERIED_DIR,
    WINDOW_DURATION,
    TIMESTEP_SIZE,
    MORTALITY,
    LOS,
    YEARS,
    PREDICT_OFFSET,
)

## Query cohort and labs

In [None]:
encounters_interface = gemini.patient_encounters(
    er_admin_table=gemini.get_table(gemini.ER_ADMIN),
    years=YEARS,
    died=True,
    died_binarize_col=MORTALITY,
)
encounters = encounters_interface.run()
save_dataframe(encounters, ENCOUNTERS_FILE)

labs_interface = gemini.events(
    years=YEARS,
    event_category="lab"
)
labs_interface.save_in_grouped_batches(QUERIED_DIR, ENCOUNTER_ID, int(1e7))

## Load data

In [2]:
encounters = load_dataframe(ENCOUNTERS_FILE)

2022-09-29 09:25:22,744 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/home/krishnanam/cyclops/notebooks/usecases/gemini/mortality_decompensation/./data/encounters.parquet


## Map triage level

In [3]:
def remap(triage_level):
    map_ = {
        "1": "RESUSCITATION",
        "2": "EMERGENCY",
        "3": "URGENT",
        "4": "SEMI-URGENT",
        "5": "NON-URGENT",
    }
    return map_.get(triage_level, "UNKNOWN")


encounters[TRIAGE_LEVEL] = encounters[TRIAGE_LEVEL].apply(remap)

## Query imaging and transfusions, map them such that they can be treated as events

In [4]:
imaging_interface = gemini.imaging(years=YEARS)
imaging = imaging_interface.run()
imaging = pd.merge(
    encounters, imaging, on=ENCOUNTER_ID, how="inner"
)

imaging = imaging.rename(
    columns={
        "imaging_test_description": EVENT_NAME,
        "performed_date_time": EVENT_TIMESTAMP,
    }
)
imaging[EVENT_CATEGORY] = "imaging"
imaging[EVENT_VALUE] = 1


transfusions_interface = gemini.blood_transfusions(years=YEARS)
transfusions = transfusions_interface.run()
transfusions = pd.merge(
    encounters, transfusions, on=ENCOUNTER_ID, how="inner"
)

transfusions = transfusions.rename(
    columns={"issue_date_time": EVENT_TIMESTAMP}
)
transfusions[EVENT_NAME] = transfusions["rbc_mapped"]
transfusions[EVENT_NAME] = transfusions[EVENT_NAME].apply(
    lambda x: "rbc" if x else "non-rbc"
)
transfusions[EVENT_VALUE] = 1
transfusions[EVENT_CATEGORY] = "transfusions"

2022-09-29 09:25:41,462 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-09-29 09:25:41,463 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 9.167026 s
2022-09-29 09:25:46,311 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-09-29 09:25:46,312 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.523020 s


##  Query interventions, map them such that they can be treated as events

In [5]:
interventions_interface = gemini.interventions(years=YEARS)
interventions = interventions_interface.run()
interventions = pd.merge(
    encounters, interventions, on=ENCOUNTER_ID, how="inner"
)

interventions[EVENT_VALUE] = 1
interventions[EVENT_CATEGORY] = "interventions"

binary_mapped_cols = [
    "endoscopy_mapped",
    "gi_endoscopy_mapped",
    "bronch_endoscopy_mapped",
    "dialysis_mapped",
    "inv_mech_vent_mapped",
    "surgery_mapped",
]
interventions["intervention_episode_start_time"].loc[
    interventions["intervention_episode_start_time"].isna()
] = "12:00:00"
interventions[EVENT_TIMESTAMP] = pd.to_datetime(
    interventions["intervention_episode_start_date"].astype(str)
    + " "
    + interventions["intervention_episode_start_time"].astype(str)
)
interventions[EVENT_TIMESTAMP] = interventions[EVENT_TIMESTAMP].astype(
    "datetime64[ns]"
)
interventions["unmapped_intervention"] = ~(
    interventions["endoscopy_mapped"]
    | interventions["gi_endoscopy_mapped"]
    | interventions["bronch_endoscopy_mapped"]
    | interventions["dialysis_mapped"]
    | interventions["inv_mech_vent_mapped"]
    | interventions["surgery_mapped"]
)
interventions[EVENT_NAME] = interventions[
    binary_mapped_cols + ["unmapped_intervention"]
].idxmax(axis=1)

2022-09-29 09:25:57,025 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-09-29 09:25:57,027 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 1.097104 s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interventions["intervention_episode_start_time"].loc[


## Filter out encounters that had less than TIMESTEP_SIZE

In [6]:
encounters[LOS] = (
    encounters[DISCHARGE_TIMESTAMP] - encounters[ADMIT_TIMESTAMP]
)
encounters = encounters.loc[
    encounters[LOS] >= pd.to_timedelta(TIMESTEP_SIZE, unit="h")
]

## Get encounters that ended in mortality outcome

In [7]:
encounters_mortality = encounters.loc[
    encounters[MORTALITY] == True
]

## Get encounters which result in death within timeframe

In [8]:
encounters_mortality_within_risk_timeframe = encounters_mortality.loc[
    encounters_mortality[LOS]
    <= pd.to_timedelta(PREDICT_OFFSET + WINDOW_DURATION, unit="h")
]

## Create death events

In [9]:
mortality_events = convert_to_events(
    encounters_mortality_within_risk_timeframe,
    event_name=f"death",
    event_category="general",
    timestamp_col=DISCHARGE_TIMESTAMP,
)
mortality_events = pd.merge(
    mortality_events, encounters_mortality, on=ENCOUNTER_ID, how="inner"
)
mortality_events = mortality_events[
    [
        ENCOUNTER_ID,
        EVENT_NAME,
        EVENT_TIMESTAMP,
        ADMIT_TIMESTAMP,
        EVENT_VALUE,
        EVENT_CATEGORY,
    ]
]
mortality_events[EVENT_VALUE] = 1

## Get admission/discharge events

In [10]:
admit_events = convert_to_events(
    encounters,
    event_name="admission",
    event_category="general",
    timestamp_col=ADMIT_TIMESTAMP,
)
disch_events = convert_to_events(
    encounters,
    event_name="discharge",
    event_category="general",
    timestamp_col=DISCHARGE_TIMESTAMP,
)

## Define aggregator

In [12]:
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=TIMESTEP_SIZE,
    window_duration=WINDOW_DURATION,
)

## Process labs

In [16]:
# Aggregate
skip_n = 0
generator = yield_dataframes(QUERIED_DIR, skip_n=skip_n, log=False)

for save_count, events in enumerate(generator):
    # Aggregate
    events = events.reset_index(drop=True)
    tmp_features = TemporalFeatures(
        events,
        features=EVENT_VALUE,
        by=[ENCOUNTER_ID, EVENT_NAME],
        timestamp_col=EVENT_TIMESTAMP,
        aggregator=aggregator,
    )
    aggregated = tmp_features.aggregate()

    save_dataframe(
        aggregated,
        join(usecase_params.AGGREGATED_DIR, "batch_" + f"{save_count + skip_n:04d}"),
    )
    del events

TypeError: Could not convert 20 @CKD to numeric

## Normalize all event data (names, string operations)

In [None]:
imaging = normalize_events(imaging)
transfusions = normalize_events(transfusions)
intervention_events = normalize_events(interventions)

mortality_events = normalize_events(mortality_events)
admit_events = normalize_events(admit_events)
disch_events = normalize_events(disch_events)

## Combine different event data, save

In [None]:
combined_events = combine_events(
    [
        intervention_events,
        imaging_events,
        transfusion_events,
        lab_events,
        mortality_events,
        admit_events,
        disch_events,
    ]
)
save_dataframe(combined_events, os.path.join(BASE_DATA_PATH, "combined_events"))

## Load combined events

In [None]:
combined_events = load_dataframe(os.path.join(BASE_DATA_PATH, "combined_events"))

## Aggregate combined events, save

In [None]:

combined_events = combined_events.reset_index(drop=True)

temporal_features = TemporalFeatures(
    combined_events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)
aggregated = temporal_features.aggregate()
save_dataframe(aggregated, os.path.join(BASE_DATA_PATH, "aggregated"))

## Load aggregated events

In [None]:
aggregated = load_dataframe(os.path.join(BASE_DATA_PATH, "aggregated"))

## Convert aggregated to vectorized

In [None]:
vectorized = aggregator.vectorize(aggregated)

## Add to feature handler, with indicator variables.

In [None]:
temporal_features = load_dataframe(os.path.join(BASE_DATA_PATH, "temporal_features"))
feature_handler.add_features(temporal_features)
feature_handler.drop_features(["death"])

already_indicators = [
    "ct",
    "dialysis_mapped",
    "echo",
    "endoscopy_mapped",
    "interventional",
    "inv_mech_vent_mapped",
    "mri",
    "non-rbc",
    "other",
    "rbc",
    "surgery_mapped",
    "ultrasound",
    "unmapped_intervention",
    "x-ray",
]

temporal_features = feature_handler.features["temporal"]
indicators = create_indicator_variables(
    temporal_features[
        [col for col in temporal_features if col not in already_indicators]
    ]
)
feature_handler.add_features(indicators)
feature_handler.features["temporal"].columns

## Compute static features, save it

In [None]:
encounters_train_val_test_static_cols = gather_columns(
    encounters_train_val_test,
    [
        ENCOUNTER_ID,
        AGE,
        SEX,
        HOSPITAL_ID,
        ADMIT_TIMESTAMP,
        DISCHARGE_TIMESTAMP,
        TRIAGE_LEVEL,
    ],
)
static_features = compute_statics(encounters_train_val_test_static_cols)
save_dataframe(static_features, os.path.join(BASE_DATA_PATH, "static_features"))

##  Load static features, add to feature handler, save all features

In [None]:
static_features = load_dataframe(os.path.join(BASE_DATA_PATH, "static_features"))
feature_handler.add_features(
    static_features, reference_cols=[HOSPITAL_ID, ADMIT_TIMESTAMP, DISCHARGE_TIMESTAMP]
)
feature_handler.save(BASE_DATA_PATH, "features")

## Create new feature handler, load saved features from file

In [None]:
feature_handler1 = FeatureHandler()
feature_handler1.load(BASE_DATA_PATH, "features")

In [None]:
len(feature_handler1.features["temporal"].columns), len(
    feature_handler1.features["static"].columns
)

## Impute temporal features

## Add static features to temporal, (duplicate for each timestep)

In [None]:
# Train model with both static and temporal features.
merged_static_temporal = temporal.combine_first(static)
numerical_cols += ["age"]

# Train model with just temporal features
# merged_static_temporal = temporal
# static

## Create labels

In [None]:
num_timesteps = int(AGGREGATION_WINDOW / AGGREGATION_BUCKET_SIZE)
encounter_ids = list(merged_static_temporal.index.get_level_values(0).unique())
num_encounters = len(encounter_ids)

# All zeroes.
labels = np.zeros((num_encounters, num_timesteps))

# Set mortality within timeframe encounters to all 1s.
labels[
    [
        encounter_ids.index(enc_id)
        for enc_id in list(encounters_mortality_within_risk_timeframe[ENCOUNTER_ID])
    ]
] = 1

# Get which timestep discharge occurs and set those and following timesteps label values to be -1.
aggregated_discharge_events = aggregated_events.loc[
    aggregated_events[EVENT_NAME] == "discharge"
]
aggregated_mortality_events = aggregated_events.loc[
    aggregated_events[EVENT_NAME] == "death"
]
for enc_id in list(aggregated_discharge_events[ENCOUNTER_ID]):
    timestep_discharge = aggregated_discharge_events.loc[
        aggregated_discharge_events[ENCOUNTER_ID] == enc_id
    ]["timestep"]
    labels[encounter_ids.index(enc_id)][int(timestep_discharge) + 1 :] = -1

# Lookahead for each timestep, and see if death occurs in risk timeframe.
for enc_id in list(encounters_mortality_within_risk_timeframe[ENCOUNTER_ID]):
    mortality_encounter = mortality_events.loc[mortality_events[ENCOUNTER_ID] == enc_id]
    ts_ends = timestep_end_timestamps.loc[enc_id]["timestep_end_timestamp"]
    mortality_ts = mortality_encounter["event_timestamp"]
    for ts_idx, ts_end in enumerate(ts_ends):
        if not (
            mortality_ts <= ts_end + pd.to_timedelta(timeframe * 24, unit="h")
        ).all():
            labels[encounter_ids.index(enc_id)][ts_idx] = 0


mortality_risk_targets = labels

## Create train/val/test splits

In [None]:
def create_train_test_split(
    encounters: pd.DataFrame,
    fractions: Optional[List] = [0.8, 0.2],
    split_column: Optional[str] = None,
    split_values: List = None,
    seed: int = 3,
) -> tuple:
    """Split encounters into train/test.

    Parameters
    ----------
    encounters: pandas.DataFrame
        Dataframe with encounter IDs.
    fractions: list, optional
        Fraction of samples to use for train, test sets.
    split_column: str, optional
        If 'split_column' is specified, then that column is used to split.
    split_values: list, optional
        Along with 'split_column', a list of lists can be specified for filtering.
        e.g. [[2008], [2009, 2010]] for train/test split based on year.
    seed: int, optional
        Seed for random number generator.

    Returns
    -------
    tuple
        (train IDs, test IDs)

    """
    if split_column:
        if split_column not in encounters.columns:
            raise ValueError("Specified 'split column' not found in input dataframe")
        if not split_values:
            raise ValueError("Specify train/test split values for the 'split column'.!")
        train_encounters = encounters[ENCOUNTER_ID].loc[
            encounters[split_column].isin(split_values[0])
        ]
        test_encounters = encounters[ENCOUNTER_ID].loc[
            encounters[split_column].isin(split_values[1])
        ]
        return train_encounters, test_encounters

    encounter_ids = list(encounters[ENCOUNTER_ID].unique())
    random.seed(seed)
    random.shuffle(encounter_ids)
    num_train = int(fractions[0] * len(encounter_ids))

    return encounter_ids[0:num_train], encounter_ids[num_train:]


split_type = "hospital_id"

if split_type == "year":
    encounters_train_val_test["year"] = encounters_train_val_test[
        "admit_timestamp"
    ].dt.year
    train_ids, val_test_ids = create_train_test_split(
        encounters_train_val_test,
        split_column="year",
        split_values=[[2015, 2016, 2017, 2018, 2019], [2020]],
    )
    encounters_train_val_test[HOSPITAL_ID].value_counts(), encounters_train_val_test[
        "year"
    ].value_counts()
elif split_type == "hospital_id":
    train_ids, val_test_ids = create_train_test_split(
        encounters_train_val_test,
        split_column=HOSPITAL_ID,
        split_values=[["SBK", "UHNTG", "THPC", "THPM", "UHNTW", "SMH"], ["MSH"]],
    )
elif split_type == "random":
    train_ids, val_test_ids = create_train_test_split(encounters_train_val_test)


val_ids, test_ids = create_train_test_split(
    encounters_train_val_test.loc[
        encounters_train_val_test[ENCOUNTER_ID].isin(val_test_ids)
    ],
    [0.5, 0.5],
)
print(
    f"Train set: {len(train_ids)}, Val set: {len(val_ids)}, Test set: {len(test_ids)}"
)

X = merged_static_temporal[
    np.in1d(temporal.index.get_level_values(0), static.index.get_level_values(0))
]

y_train, y_val, y_test = [
    mortality_risk_targets[np.in1d(encounter_ids, ids)]
    for ids in [train_ids, val_ids, test_ids]
]
X_train, X_val, X_test = [
    X[np.in1d(X.index.get_level_values(0), ids)]
    for ids in [train_ids, val_ids, test_ids]
]

len(X), len(X_train), len(X_val), len(X_test), len(mortality_risk_targets), len(
    y_train
), len(y_val), len(y_test)
assert len(X.index.get_level_values(0).unique()) == len(mortality_risk_targets)

## Save train/val/test ids

In [None]:
train_ids_df = pd.DataFrame({"train": train_ids})
val_ids_df = pd.DataFrame({"val": val_ids})
test_ids_df = pd.DataFrame({"test": test_ids})
train_val_test_ids = pd.concat([train_ids_df, val_ids_df, test_ids_df], axis=1)
save_dataframe(
    train_val_test_ids, os.path.join(BASE_DATA_PATH, split_type, "train_val_test_ids")
)

## Normalize data

In [None]:
from sklearn.preprocessing import StandardScaler

X_train_normalized = X_train.copy()
X_val_normalized = X_val.copy()
X_test_normalized = X_test.copy()

for col in numerical_cols:
    scaler = StandardScaler().fit(X_train[col].values.reshape(-1, 1))
    X_train_normalized[col] = pd.Series(
        np.squeeze(scaler.transform(X_train[col].values.reshape(-1, 1))),
        index=X_train[col].index,
    )
    X_val_normalized[col] = pd.Series(
        np.squeeze(scaler.transform(X_val[col].values.reshape(-1, 1))),
        index=X_val[col].index,
    )
    X_test_normalized[col] = pd.Series(
        np.squeeze(scaler.transform(X_test[col].values.reshape(-1, 1))),
        index=X_test[col].index,
    )

## Save inputs and labels as numpy array

In [None]:
os.makedirs(os.path.join(BASE_DATA_PATH, split_type), exist_ok=True)

X_train_normalized.to_parquet(
    os.path.join(BASE_DATA_PATH, split_type, "X_train.parquet")
)
X_val_normalized.to_parquet(os.path.join(BASE_DATA_PATH, split_type, "X_val.parquet"))
X_test_normalized.to_parquet(os.path.join(BASE_DATA_PATH, split_type, "X_test.parquet"))

np.save(os.path.join(BASE_DATA_PATH, split_type, "y_train.npy"), y_train)
np.save(os.path.join(BASE_DATA_PATH, split_type, "y_val.npy"), y_val)
np.save(os.path.join(BASE_DATA_PATH, split_type, "y_test.npy"), y_test)

## Load train/val/test data

In [None]:
X_train_normalized = pd.read_parquet(
    os.path.join(BASE_DATA_PATH, split_type, "X_train.parquet")
)
X_val_normalized = pd.read_parquet(
    os.path.join(BASE_DATA_PATH, split_type, "X_val.parquet")
)
X_test_normalized = pd.read_parquet(
    os.path.join(BASE_DATA_PATH, split_type, "X_test.parquet")
)

y_train = np.load(os.path.join(BASE_DATA_PATH, split_type, "y_train.npy"))
y_val = np.load(os.path.join(BASE_DATA_PATH, split_type, "y_val.npy"))
y_test = np.load(os.path.join(BASE_DATA_PATH, split_type, "y_test.npy"))

## Reshape inputs

In [None]:
def reshape_inputs(inputs, num_timesteps):
    inputs = inputs.unstack()
    num_encounters = inputs.shape[0]
    inputs = inputs.values.reshape((num_encounters, num_timesteps, -1))

    return inputs


X_train_normalized_npy = reshape_inputs(X_train_normalized, num_timesteps)
X_val_normalized_npy = reshape_inputs(X_val_normalized, num_timesteps)
X_test_normalized_npy = reshape_inputs(X_test_normalized, num_timesteps)

## Save inputs as npy arrays

In [None]:
np.save(os.path.join(BASE_DATA_PATH, split_type, "X_train.npy"), X_train_normalized_npy)
np.save(os.path.join(BASE_DATA_PATH, split_type, "X_val.npy"), X_val_normalized_npy)
np.save(os.path.join(BASE_DATA_PATH, split_type, "X_test.npy"), X_test_normalized_npy)

## Plot timeline

In [None]:
encounter_id = random.choice(encounter_ids)
encounter_id = 13772609

# Show examples:
# 12779290, 13772609 - lookahead
# 11454928, 15795997 - no mortality
# 15253874 - all 1s

print(encounter_id)
combined_events_encounter = combined_events.loc[
    combined_events["encounter_id"] == encounter_id
]
fig = plot_timeline(combined_events_encounter, return_fig=True)

fig = fig.update_layout(width=2000, height=800)

ts_starts = timestep_start_timestamps.loc[encounter_id]["timestep_start_timestamp"]
ts_ends = timestep_end_timestamps.loc[encounter_id]["timestep_end_timestamp"]
for ts_end in ts_ends:
    fig.add_vline(ts_end)

label_encounter = labels[encounter_ids.index(encounter_id)]

for i in range(num_timesteps):
    label_ts = label_encounter[i]
    color_map = {-1: "grey", 0: "green", 1: "red"}
    fig.add_vrect(
        x0=ts_starts[i],
        x1=ts_ends[i],
        fillcolor=color_map[label_ts],
        opacity=0.25,
        line_width=0,
    )

fig.show()