In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.file import (
    join,
    load_dataframe,
    process_dir_save_path,
    save_consequtive_dataframes,
    save_dataframe,
    yield_dataframes,
)

In [None]:
# Shared name for the death outcome: passed to the encounters query as
# `died_binarize_col`, used to filter the encounters table, and reused as the
# event name of the target events created later in this notebook.
OUTCOME_DEATH = "outcome_death"

## Patient encounters

In [None]:
# Start from the MIMIC patient-encounters query, passing OUTCOME_DEATH as the
# `died_binarize_col` argument.
encounters_interface = mimic.patient_encounters(died_binarize_col=OUTCOME_DEATH)

# Drop four columns from the query before it is executed.
encounters_query = qp.Drop(
    ["insurance", "language", "marital_status", "hospital_expire_flag"]
)(encounters_interface.query)

# Wrap the reduced query in a new interface, run it, and display the result.
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()
encounters

In [None]:
# Save the encounters table under the name "encounters.parquet". The cells
# below keep using the in-memory `encounters` variable rather than this file.
save_dataframe(encounters, "encounters.parquet")

## Events

In [None]:
# Build the MIMIC events query and drop three columns from it.
events_interface = mimic.events()
events_query = qp.Drop(["warning", "itemid", "storetime"])(events_interface.query)
events_interface = mimic.get_interface(events_query)

# Write the query results to ./test_batches2, grouped by encounter ID.
# NOTE(review): 10_000_000 is presumably a per-batch size limit — confirm
# against the signature of `save_in_grouped_batches`.
events_interface.save_in_grouped_batches("./test_batches2", ENCOUNTER_ID, 10_000_000)

In [None]:
# Re-save the batches from ./test_batches2 into ./0raw2.
# NOTE(review): the meaning of the third argument (4) is not visible in this
# notebook — presumably the number of consecutive batch files combined per
# output file; confirm against `save_consequtive_dataframes`.
save_consequtive_dataframes("./test_batches2", "./0raw2", 4)

In [None]:
# Counter used to number the cleaned batch files as they are saved.
save_count = 0

# Iterator over the saved batches; the per-batch cells pull one item at a time.
# NOTE(review): this reads from ./test_batches2, not from the ./0raw2 directory
# written by the previous cell — confirm that this is intended.
generator = yield_dataframes("./test_batches2")

# Output directory for the cleaned batches, passed through the project's
# save-path helper (presumably validates/creates the directory — confirm).
save_dir = process_dir_save_path("./1cleaned")

## Batch processing: re-run the cells below once per batch

In [None]:
# Pull the next batch from the generator. This is kept as its own statement so
# the batch stays bound to `events` even if the column drop below fails.
events = next(generator)

# Remove the stay_id column and display the batch.
events = events.drop(columns=["stay_id"])
events

In [None]:
import warnings

from pandas.errors import PerformanceWarning


def add_years_approximate(
    timestamp_series: pd.Series, years_series: pd.Series
) -> pd.Series:
    """Add a per-row number of years to timestamps using vectorized operations.

    The shifted timestamps are rebuilt from the year, month, day, hour and
    minute components of the input, so seconds and smaller units are dropped.

    Results are typically either exact or off by one day: every timestamp that
    falls on February 29 is moved to February 28 before the new timestamp is
    assembled (regardless of whether the target year is a leap year), which
    guarantees that the shifted date exists.

    Parameters
    ----------
    timestamp_series : pandas.Series
        Datetime series; it is read through the ``.dt`` accessor.
    years_series : pandas.Series
        Number of years to add to each timestamp. It is combined with
        ``timestamp_series`` by index, so both should share the same index.

    Returns
    -------
    pandas.Series
        The shifted timestamps, as returned by ``pandas.to_datetime``.

    """
    # Add to the years column
    year = timestamp_series.dt.year + years_series

    # Handle the other columns
    month = timestamp_series.dt.month
    day = timestamp_series.dt.day
    hour = timestamp_series.dt.hour
    minute = timestamp_series.dt.minute

    # Create new timestamp column
    data = pd.DataFrame(
        {"year": year, "month": month, "day": day, "hour": hour, "minute": minute}
    )

    # Subtract 1 from potentially invalid leap days to avoid issues.
    # A single .loc assignment is used instead of chained indexing
    # (data["day"][mask] -= 1): the chained form may modify a temporary copy,
    # and under pandas Copy-on-Write it never updates `data`, which would leave
    # Feb 29 in place and make pd.to_datetime fail for non-leap target years.
    leap_days = (month == 2) & (day == 29)
    data.loc[leap_days, "day"] -= 1

    return pd.to_datetime(data)


def add_years_exact(timestamp_series: pd.Series, years_series: pd.Series) -> pd.Series:
    """Add a per-row number of years to timestamps using calendar offsets.

    Each entry of ``years_series`` is converted to a ``pandas.DateOffset`` and
    added to the matching timestamp. The conversion runs row by row, so a
    ``PerformanceWarning`` is emitted on every call to point users towards
    ``add_years_approximate``.

    Parameters
    ----------
    timestamp_series : pandas.Series
        Datetime series to shift.
    years_series : pandas.Series
        Number of years to add to each timestamp.

    Returns
    -------
    pandas.Series
        The shifted timestamps.

    """
    slow_path_message = (
        "Computing the exact addition cannot be vectorized and is very slow. "
        "Consider using the quick, approximate calculation."
    )
    warnings.warn(slow_path_message, PerformanceWarning)

    # One DateOffset object per row; this is the non-vectorized part.
    year_offsets = years_series.apply(lambda n_years: pd.DateOffset(years=n_years))
    return timestamp_series + year_offsets

In [None]:
# Reverse deidentified dating
# Attach each encounter's `anchor_year_difference` to its events. The merge
# uses the default inner join, so events whose encounter ID is missing from
# `encounters` are dropped.
events = encounters[[ENCOUNTER_ID, "anchor_year_difference"]].merge(
    events, on=ENCOUNTER_ID
)

# Shift every event timestamp by its encounter's year difference.
events[EVENT_TIMESTAMP] = add_years_approximate(
    events[EVENT_TIMESTAMP], events["anchor_year_difference"]
)

# The helper column is no longer needed once the timestamps are shifted.
events = events.drop(columns="anchor_year_difference")
events

In [None]:
# Create the target as a timeseries event
# Keep the encounter ID and death time of every encounter flagged as a death,
# and expose the death time under the event timestamp column.
# NOTE(review): `deathtime` is used as-is, while the event timestamps in the
# cell above are shifted by `anchor_year_difference` — confirm both are on the
# same timeline.
target_events = encounters.loc[
    encounters[OUTCOME_DEATH] == True, [ENCOUNTER_ID, "deathtime"]
].rename(columns={"deathtime": EVENT_TIMESTAMP})

# Describe each row as a death event in the targets category with value 1.
target_events[EVENT_NAME] = OUTCOME_DEATH
target_events[EVENT_CATEGORY] = TARGETS
target_events[EVENT_VALUE] = 1
target_events.head()

In [None]:
# Include target
# Append the death target rows to the current batch of events. The index is
# not reset, so index labels from the two frames may repeat in the result.
# NOTE(review): `target_events` is built from the full `encounters` table, so
# every batch receives the death rows of all encounters, not only of those
# present in this batch — confirm that downstream steps expect this.
# Re-running this cell appends the target rows again.
events = pd.concat([events, target_events])

In [None]:
# Preprocessing
# Normalize event names and categories with the cyclops cleaning helpers.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
# Value normalization is currently disabled.
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Concatenate event name and category since some names are the same in
# different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
# The name becomes "<category> - <name>"; this uses the already-normalized
# columns, so the two assignments above must run first.
events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]
events.head(5)

In [None]:
# Save the cleaned batch under save_dir as batch_NNNN (zero-padded counter),
# then advance the counter so the next batch gets a new file name.
save_dataframe(events, join(save_dir, f"batch_{save_count:04d}"))
save_count = save_count + 1