# Query

In [None]:
from mimic_mortality_consts import (
    CLEANED_DIR,
    CONST_NAME,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    OUTCOME_DEATH_PROCESSED,
    PREDICT_OFFSET,
    QUERIED_DIR,
)

In [None]:
# Show the value of CONST_NAME and block until the user responds, so the
# constants in use can be checked before any querying starts.
# NOTE(review): input() needs an interactive kernel; an unattended
# "Run All" (e.g. via nbconvert) will presumably stop here - confirm intended.
input(f"WARNING: LOADING CONSTANTS FROM {CONST_NAME}")

In [None]:
import numpy as np
import pandas as pd

from cyclops.processors.column_names import ENCOUNTER_ID
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.file import load_dataframe, save_dataframe

## Patient encounters

In [None]:
# Start from the stock patient-encounters query.
base_encounters_interface = mimic.patient_encounters()

# Remove these five columns from the query before it is executed.
drop_encounter_columns = qp.Drop(
    ["insurance", "language", "marital_status", "edregtime", "edouttime"],
)
encounters_query = drop_encounter_columns(base_encounters_interface.query)

# Wrap the trimmed query in a fresh interface and run it.
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()

# Preview the first rows of the result.
encounters.head(5)

Create death indicator

Hospital expire flag (`hospital_expire_flag` column):
 - 1 - Death in hospital
 - 0 - Survived past discharge

In [None]:
# Drop encounters ending in death which don't have a death timestamp
invalid = (encounters["hospital_expire_flag"] == 1) & (encounters["deathtime"].isna())

# Take an explicit copy of the filtered rows: the column assignment below would
# otherwise write into a slice of the original frame, which triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
encounters = encounters[~invalid].copy()

# (Died in hospital) & (Death timestamp is defined) - the second condition is
# guaranteed by the filter above, so the flag alone defines the outcome.
encounters[OUTCOME_DEATH] = encounters["hospital_expire_flag"] == 1
encounters.head(5)

In [None]:
# Fraction of the remaining encounters whose death outcome is True
encounters[OUTCOME_DEATH].eq(True).sum() / encounters.shape[0]

In [None]:
# Write the encounters table (now including the OUTCOME_DEATH column) to
# ENCOUNTERS_FILE; the Clean section loads it back from the same location.
save_dataframe(encounters, ENCOUNTERS_FILE)

## Events

In [None]:
# Start from the stock events query and remove three columns from it.
base_events_interface = mimic.events()
drop_event_columns = qp.Drop(["warning", "itemid", "storetime"])
events_query = drop_event_columns(base_events_interface.query)

# Wrap the trimmed query in a fresh interface and save its results under
# QUERIED_DIR, grouped on ENCOUNTER_ID.
# NOTE(review): the last argument is presumably the batch size - confirm
# against save_in_grouped_batches.
events_interface = mimic.get_interface(events_query)
events_interface.save_in_grouped_batches(QUERIED_DIR, ENCOUNTER_ID, 10_000_000)

# Clean

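This section can be run entirely separately from the Query section above: it re-imports everything it needs and reloads the saved encounters and queried event batches from disk.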

In [None]:
from mimic_mortality_consts import (
    CLEANED_DIR,
    CONST_NAME,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    OUTCOME_DEATH_PROCESSED,
    PREDICT_OFFSET,
    QUERIED_DIR,
)

In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
)
from cyclops.processors.constants import TARGETS
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.file import (
    join,
    load_dataframe,
    save_consequtive_dataframes,
    save_dataframe,
    yield_dataframes,
)
from cyclops.utils.pandas import add_years_approximate

In [None]:
# Show the value of CONST_NAME and block until the user responds, so the
# constants in use can be checked before any cleaning starts.
# NOTE(review): input() needs an interactive kernel; an unattended
# "Run All" (e.g. via nbconvert) will presumably stop here - confirm intended.
input(f"WARNING: LOADING CONSTANTS FROM {CONST_NAME}")

In [None]:
# Reload the encounters table from ENCOUNTERS_FILE (the location the Query
# section saves it to), so this section does not depend on kernel state.
encounters = load_dataframe(ENCOUNTERS_FILE)

## Events

In [None]:
# skip_n is handed to yield_dataframes and is also added to the batch counter
# when the cleaned output files are named, so output numbering stays aligned.
# NOTE(review): 15 looks like a resume point left over from an interrupted run;
# presumably it should be 0 when cleaning from scratch - confirm.
skip_n = 15
generator = yield_dataframes(QUERIED_DIR, log=False, skip_n=skip_n)

In [None]:
# --- Loop-invariant preparation (built once instead of once per batch) ---

# Per-encounter year difference used to re-date the event timestamps.
year_shifts = encounters[[ENCOUNTER_ID, "anchor_year_difference"]]

# Death target events for every encounter whose outcome is death. Each batch
# below only selects the rows belonging to its own encounters.
death_targets = encounters.loc[
    encounters[OUTCOME_DEATH] == True,  # noqa: E712 - element-wise comparison
    [ENCOUNTER_ID, "deathtime"],
]
death_targets = death_targets.rename({"deathtime": EVENT_TIMESTAMP}, axis=1)
death_targets[EVENT_NAME] = OUTCOME_DEATH
death_targets[EVENT_CATEGORY] = TARGETS
death_targets[EVENT_VALUE] = 1

# Offset death time targets such that the model is predicting death time in
# advance: the target is placed PREDICT_OFFSET hours before the death timestamp.
death_targets[EVENT_TIMESTAMP] -= pd.DateOffset(hours=PREDICT_OFFSET)

# --- Per-batch cleaning ---
for save_count, events in enumerate(generator):
    events = events.drop(["stay_id"], axis=1)

    # Reverse deidentified dating. The inner merge also discards events whose
    # encounter is not present in the encounters table.
    events = pd.merge(year_shifts, events, on=ENCOUNTER_ID)
    events[EVENT_TIMESTAMP] = add_years_approximate(
        events[EVENT_TIMESTAMP], events["anchor_year_difference"]
    )
    events = events.drop("anchor_year_difference", axis=1)

    # Create the target as a timeseries event for the encounters in this batch
    target_events = death_targets[
        death_targets[ENCOUNTER_ID].isin(events[ENCOUNTER_ID])
    ]

    # Include target. ignore_index gives the combined frame a fresh, unique
    # index; otherwise the target rows keep their labels from the encounters
    # table and collide with the event rows' labels in the saved batch.
    events = pd.concat([events, target_events], ignore_index=True)

    # Preprocessing. Event values are deliberately left as-is here
    # (normalize_values is not applied).
    events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
    events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])

    # Concatenate event name and category since some names are the same in
    # different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
    events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]

    # Number the output by position in the full batch sequence (skipped batches
    # included) so file names line up with the queried batches.
    save_dataframe(events, join(CLEANED_DIR, f"batch_{save_count + skip_n:04d}"))

    # Release the batch before the next one is loaded.
    del events
    del target_events