# Imports

In [None]:
import pandas as pd

from cyclops.processors.clean import normalize_categories, normalize_names
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
)
from cyclops.query import process as qp
from cyclops.query.mimiciv import MIMICIVQuerier
from cyclops.utils.common import add_years_approximate
from cyclops.utils.file import join, load_dataframe, save_dataframe, yield_dataframes
from use_cases.mimiciv.mortality_decompensation.constants import (
    CLEANED_DIR,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    QUERIED_DIR,
)

# Querier shared by the query cells below: they call `patient_encounters()`,
# `events()` and `get_interface()` on it.
mimic = MIMICIVQuerier()

# Query

## Patient encounters

In [None]:
# Start from the patient-encounters query and remove the listed columns
unused_encounter_columns = [
    "insurance",
    "language",
    "marital_status",
    "edregtime",
    "edouttime",
]
drop_unused = qp.Drop(unused_encounter_columns)
encounters_query = drop_unused(mimic.patient_encounters().query)

# Wrap the modified query in a fresh interface and execute it
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()

# Preview the first five rows
encounters.head(5)

Create death indicator

Hospital expire flag:
 - 1 - Death in hospital
 - 0 - Survived past discharge

In [None]:
# Drop encounters ending in death which don't have a death timestamp
died_in_hospital = encounters["hospital_expire_flag"] == 1
invalid = died_in_hospital & encounters["deathtime"].isna()

# Take an explicit copy of the filtered rows so that the column assignment
# below writes to an independent frame rather than to a slice of the original
# (this is what triggers pandas' SettingWithCopyWarning).
encounters = encounters[~invalid].copy()

# (Died in hospital) & (Death timestamp is defined): after the filter above,
# every remaining in-hospital death has a death timestamp.
encounters[OUTCOME_DEATH] = encounters["hospital_expire_flag"] == 1
encounters.head(5)

In [None]:
# Fraction of encounters with the death outcome: the mean of the boolean
# indicator column created above equals (number of True values) / (row count).
encounters[OUTCOME_DEATH].mean()

In [None]:
# Persist the filtered encounters, including the death indicator column; the
# cleaning section reloads this file instead of re-running the query.
save_dataframe(encounters, ENCOUNTERS_FILE)

## Events

In [None]:
# Start from the events query and remove the listed columns
unused_event_columns = ["warning", "itemid", "storetime"]
events_query = qp.Drop(unused_event_columns)(mimic.events().query)

# Wrap the modified query in a fresh interface and write the results to
# QUERIED_DIR in batches grouped by encounter ID.
# NOTE(review): the third argument is presumably the batch size in rows — confirm.
events_interface = mimic.get_interface(events_query)
events_interface.save_in_grouped_batches(QUERIED_DIR, ENCOUNTER_ID, 1_000_000)

# Clean / Preprocess

This section can be run entirely separately from the querying section above: it only reads the files that section saved.

## Patient encounters

In [None]:
# Reload the encounters saved by the query section so that this section does
# not depend on the in-memory result of the query.
encounters = load_dataframe(ENCOUNTERS_FILE)

## Events

In [None]:
# Number of batches passed to `yield_dataframes` as `skip_n`; the same offset is
# added to the batch number when the cleaned batches are saved below.
# NOTE(review): presumably used to resume an interrupted run — confirm.
skip_n = 0
# Yields the queried event batches stored in QUERIED_DIR
generator = yield_dataframes(QUERIED_DIR, skip_n=skip_n, log=True)

In [None]:
# Per-encounter year offsets used to reverse the deidentified dating. The
# selection does not depend on the batch, so it is built once, not per batch.
year_offsets = encounters[[ENCOUNTER_ID, "anchor_year_difference"]]

# Clean each queried batch of events and save it under a zero-padded batch name
for save_count, events in enumerate(generator):
    events = events.drop(["stay_id"], axis=1)

    # Reverse deidentified dating: attach each encounter's year offset, shift
    # the event timestamps by it, then discard the helper column.
    # pd.merge defaults to an inner join, so events whose encounter is not
    # present in `encounters` are dropped here.
    events = pd.merge(year_offsets, events, on=ENCOUNTER_ID)
    events[EVENT_TIMESTAMP] = add_years_approximate(
        events[EVENT_TIMESTAMP], events["anchor_year_difference"]
    )
    events = events.drop("anchor_year_difference", axis=1)

    # Preprocessing
    events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
    events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
    # events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

    # Concatenate event name and category since some names are the same in
    # different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
    events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]

    # Offset the batch number by skip_n so that file names line up with the
    # position of the batch in QUERIED_DIR even when leading batches are skipped.
    save_dataframe(events, join(CLEANED_DIR, f"batch_{save_count + skip_n:04d}"))

    # Drop the reference to the processed batch before the next one is loaded
    del events