## Data pipeline for predicting risk of mortality.

## Imports

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from cyclops.feature_handler import FeatureHandler
from cyclops.plotter import set_bars_color, setup_plot
from cyclops.processor import run_data_pipeline
from cyclops.processors.aggregate import Aggregator
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    DISCHARGE_DISPOSITION,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    HOSPITAL_ID,
    LENGTH_OF_STAY_IN_ER,
    RESTRICT_TIMESTAMP,
    SEX,
    TRIAGE_LEVEL,
    WINDOW_START_TIMESTAMP,
)
from cyclops.processors.constants import SMH
from cyclops.processors.events import (
    combine_events,
    convert_to_events,
    normalize_events,
)
from cyclops.processors.impute import Imputer
from cyclops.processors.statics import compute_statics
from cyclops.processors.util import (
    fill_missing_timesteps,
    gather_columns,
    pivot_aggregated_events_to_features,
)
from cyclops.query import gemini
from cyclops.utils.file import save_dataframe

# Name of the binarized outcome column. query() passes the same string as
# `died_binarize_col`, and the column is later used to select the encounters
# that ended in mortality.
MORTALITY = "mortality"
# Presumably the label key for length of stay; not referenced in the cells
# visible here — TODO confirm it is still needed.
LOS = "los"

2022-06-12 00:21:35,556 INFO cyclops.orm     - Database setup, ready to run queries!


## Query

In [None]:
def query():
    """Run the encounter, lab and vitals queries and save their results.

    A patient-encounter query is built from the ER admin table with
    ``years=[2018, 2019, 2020]``, ``died=True`` and the outcome binarized
    into the ``"mortality"`` column (exact filter semantics live in
    ``gemini.patient_encounters`` — not visible here). Lab and vitals event
    queries are then built on top of that encounter query.

    All three queries are executed, their row counts are printed, and each
    result is saved under ``BASE_DATA_PATH`` as ``admin_er``, ``labs`` and
    ``vitals`` respectively (the reader cell expects ``.parquet`` files, so
    ``save`` presumably appends that extension).
    """
    admin_table = gemini.get_table(gemini.ER_ADMIN)
    encounters = gemini.patient_encounters(
        er_admin_table=admin_table,
        years=[2018, 2019, 2020],
        died=True,
        died_binarize_col="mortality",
    )
    # Both event queries are derived from the (not yet executed) encounter query.
    labs, vitals = (
        gemini.events(
            patient_encounters_table=encounters.query, event_category=category
        )
        for category in ("lab", "vitals")
    )

    # (save name, query) pairs, in the order they are run, reported and saved.
    extracts = (
        ("admin_er", encounters),
        ("labs", labs),
        ("vitals", vitals),
    )

    for _, extract in extracts:
        extract.run()

    for _, extract in extracts:
        print(f"{len(extract.data)} rows extracted!")

    for save_name, extract in extracts:
        extract.save(os.path.join(BASE_DATA_PATH, save_name))


# Directory that query() saves its extracts into; created if it is missing.
BASE_DATA_PATH = "/mnt/nfs/project/delirium/drift_exp/_extract"
os.makedirs(BASE_DATA_PATH, exist_ok=True)

# Files the reader cell expects query() to have produced.
query_files = [
    os.path.join(BASE_DATA_PATH, parquet_name)
    for parquet_name in ("admin_er.parquet", "labs.parquet", "vitals.parquet")
]

# NOTE: the "skip when all files already exist" guard below is disabled, so the
# extraction is re-run unconditionally every time this cell executes.
# if not np.array([os.path.isfile(query_file) for query_file in query_files]).all():
query()

2022-06-12 00:21:41,814 INFO cyclops.orm     - Query returned successfully!
2022-06-12 00:21:41,818 INFO cyclops.utils.profile - Finished executing function run_query in 5.143023 s
2022-06-12 00:32:27,671 INFO cyclops.orm     - Query returned successfully!
2022-06-12 00:32:27,676 INFO cyclops.utils.profile - Finished executing function run_query in 645.855187 s


## Read saved query data

In [None]:
# Same directory as the query cell; presumably re-declared so this cell can be
# run on its own against previously saved extracts.
BASE_DATA_PATH = "/mnt/nfs/project/delirium/drift_exp/_extract"

# Load the three saved extracts (encounters, then vitals, then labs).
encounters_data, vitals_data, labs_data = (
    pd.read_parquet(os.path.join(BASE_DATA_PATH, parquet_name))
    for parquet_name in ("admin_er.parquet", "vitals.parquet", "labs.parquet")
)

## Get encounters that ended in mortality outcome

In [None]:
# Keep only the encounters whose mortality flag compares equal to True. The
# explicit comparison (rather than using the column itself as the mask) is
# kept so the result does not depend on the column having a boolean dtype.
encounters_mortality = encounters_data[encounters_data[MORTALITY].eq(True)]

## Shift the discharge timestamp back by K hours (K = 24, i.e. risk of mortality within 24 hours) and store it in a new column

In [None]:
offset = 24  # in hours

# Add a column holding the discharge timestamp moved `offset` hours earlier.
# assign() returns a new frame, so the filtered selection taken from
# `encounters_data` is never written to in place.
encounters_mortality = encounters_mortality.assign(
    death_timestamp_offset=encounters_mortality[DISCHARGE_TIMESTAMP]
    - pd.Timedelta(offset, unit="h")
)
# Notebook display: original vs. shifted timestamp side by side.
encounters_mortality[[DISCHARGE_TIMESTAMP, "death_timestamp_offset"]]

## Convert "death_timestamp_offset" to event

In [None]:
# Turn every mortality encounter into a "death" event whose timestamp is the
# offset discharge time stored in "death_timestamp_offset".
mortality_events = convert_to_events(
    encounters_mortality, event_name="death", timestamp_col="death_timestamp_offset"
)
# Left-join the encounter columns back onto the events (presumably to bring in
# ADMIT_TIMESTAMP, which is selected below — confirm against convert_to_events).
mortality_events = pd.merge(
    mortality_events, encounters_mortality, on=ENCOUNTER_ID, how="left"
)
# Take an explicit copy of the column subset: assigning into the bare
# selection is chained assignment and makes pandas emit SettingWithCopyWarning.
mortality_events = mortality_events[
    [ENCOUNTER_ID, EVENT_NAME, EVENT_TIMESTAMP, ADMIT_TIMESTAMP, EVENT_VALUE]
].copy()
# Every death event carries the value 1.
mortality_events[EVENT_VALUE] = 1
mortality_events

## Filter labs and vitals for the mortality subset

In [None]:
# Labs: keep rows belonging to a mortality encounter, then normalise them.
labs_mortality = labs_data[
    labs_data[ENCOUNTER_ID].isin(encounters_mortality[ENCOUNTER_ID])
]
lab_events = normalize_events(labs_mortality)

# Vitals: same filter and normalisation.
vitals_mortality = vitals_data[
    vitals_data[ENCOUNTER_ID].isin(encounters_mortality[ENCOUNTER_ID])
]
vitals_events = normalize_events(vitals_mortality)

# The death events built earlier go through the same normalisation so all
# three frames can be combined.
mortality_events = normalize_events(mortality_events)

## Combine different event data

In [None]:
# Merge the normalised lab, vitals and death events into a single events frame
# (how rows are merged/ordered is decided by combine_events — not visible here).
combined_events = combine_events([lab_events, vitals_events, mortality_events])

## Aggregate events

In [None]:
# Aggregate the combined events with bucket_size=6 and window=72
# (presumably both in hours, i.e. 12 buckets per encounter — TODO confirm
# units against the Aggregator documentation).
aggregator = Aggregator(bucket_size=6, window=72)
aggregated_events = aggregator(combined_events)

## Pivot aggregated events to get column-wise temporal features, add to feature handler

In [None]:
# Fresh handler that collects the temporal features here and the static
# features in a later cell.
feature_handler = FeatureHandler()
# Pivot the aggregated events into feature columns; np.mean is passed as the
# aggregation function (presumably applied when several values share a cell —
# TODO confirm).
temporal_features = pivot_aggregated_events_to_features(aggregated_events, np.mean)
feature_handler.add_features(temporal_features)
# Notebook display: the "death" column of the stored temporal features.
feature_handler.features["temporal"]["death"]

## Pivot the table to get a column that can be used to create labels. A value of 1 marks the timestep in which death happens (shifted by the offset)

In [None]:
# Keep only the aggregated "death" events and pivot them into a "death" column.
aggregated_mortality = aggregated_events.loc[aggregated_events[EVENT_NAME] == "death"]
pivoted_mortality = pivot_aggregated_events_to_features(aggregated_mortality, np.mean)
# Second index level of the rows where death == 1 (presumably the timestep
# index produced by the aggregation — confirm against the pivot helper).
timesteps_mortality = pivoted_mortality.loc[
    pivoted_mortality["death"] == 1
].index.get_level_values(1)

# value_counts() orders its result by frequency, not by timestep. Sort by
# index so that every count is plotted against its own timestep, and take the
# x values from the same object so both sequences always have equal length.
timesteps_mortality_counts = timesteps_mortality.value_counts().sort_index()
timesteps = timesteps_mortality_counts.index.tolist()

fig, axs = plt.subplots(1, 1, figsize=(10, 5), tight_layout=True)
ts_vals = plt.bar(timesteps, timesteps_mortality_counts.tolist(), alpha=0.5)
set_bars_color(ts_vals, "r")
# Place the ticks at the bar centres (not at 0..n-1) so the labels stay under
# their bars even when some timesteps have no deaths.
axs.set_xticks(
    [bar.get_x() + bar.get_width() / 2 for bar in ts_vals],
    timesteps,
    rotation="vertical",
    fontsize=20,
)
setup_plot(
    axs,
    "Mortality encounter distribution over timesteps",
    "timestep",
    "Num. encounters that resulted in mortality in that timestep",
    ["Count"],
)
plt.show()

## Compute static features, add to feature handler

In [None]:
# Narrow the mortality encounters down to the columns handed to
# compute_statics (identifier, demographics, hospital, timestamps, triage).
encounters_mortality = gather_columns(
    encounters_mortality,
    [
        ENCOUNTER_ID,
        AGE,
        SEX,
        HOSPITAL_ID,
        ADMIT_TIMESTAMP,
        DISCHARGE_TIMESTAMP,
        TRIAGE_LEVEL,
    ],
)
static_features = compute_statics(encounters_mortality)
# Hospital id and the admit/discharge timestamps are passed as reference_cols
# (presumably kept for reference rather than used as model features —
# TODO confirm against FeatureHandler.add_features).
feature_handler.add_features(
    static_features, reference_cols=[HOSPITAL_ID, ADMIT_TIMESTAMP, DISCHARGE_TIMESTAMP]
)
# NOTE(review): saved with ("test_features", "features"), whereas the next cell
# loads with ("/mnt/nfs/project/delirium/drift_exp", "test_features") — confirm
# that both calls resolve to the same files.
feature_handler.save("test_features", "features")

In [None]:
# Load previously saved features into a second, fresh handler and plot the
# static ones.
feature_handler1 = FeatureHandler()
feature_handler1.load("/mnt/nfs/project/delirium/drift_exp", "test_features")
feature_handler1.plot_features(aggregate_type="static")

In [None]:
# Notebook display: the "death" column of the temporal features held by the
# reloaded handler.
feature_handler1.features["temporal"]["death"]