# Imports

In [None]:
import time

import pandas as pd

from cyclops.processors.clean import normalize_names, normalize_values
from cyclops.processors.column_names import (
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_VALUE,
)
from cyclops.processors.feature.split import intersect_datasets
from cyclops.utils.file import join, save_dataframe
from drift_detection.gemini.mortality.constants import (
    CLEANED_DIR,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    QUERIED_DIR,
    TARGET_TIMESTAMP,
)
from drift_detection.gemini.query import main

# Query

In [None]:
# Run the query and report how long it took, in seconds.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
t = time.perf_counter()
cohort, events = main()
print(time.perf_counter() - t)
cohort

In [None]:
# Proportion of cohort rows flagged with the death outcome
num_deaths = cohort[OUTCOME_DEATH].sum()
num_deaths / cohort.shape[0]

In [None]:
# Preview the events returned by the query
events

In [None]:
# Intersect over encounter IDs so that cohort and events both contain only
# the encounters common to the two datasets; both names are rebound to the
# intersected results.
cohort, events = intersect_datasets([cohort, events], ENCOUNTER_ID)

In [None]:
# Persist the intersected (not yet cleaned) events under the queried directory
queried_batch_path = join(QUERIED_DIR, "batch_0000.parquet")
save_dataframe(events, queried_batch_path)

# Clean / Preprocess

In [None]:
# For encounters with a death outcome, the discharge timestamp is used as the
# target timestamp; every other encounter gets a missing value after the
# left merge back onto the cohort.
died_mask = cohort[OUTCOME_DEATH] == True  # noqa: E712
death_events = cohort.loc[died_mask, [ENCOUNTER_ID, DISCHARGE_TIMESTAMP]].rename(
    columns={DISCHARGE_TIMESTAMP: TARGET_TIMESTAMP}
)
cohort = cohort.merge(death_events, on=ENCOUNTER_ID, how="left")
cohort

In [None]:
# Write the cohort table to the encounters file
save_dataframe(cohort, ENCOUNTERS_FILE)

In [None]:
# Normalize the event names and the (string) event values
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Coerce the values to numbers: anything that cannot be parsed becomes NaN,
# and those rows are then discarded.
numeric_values = pd.to_numeric(events[EVENT_VALUE], errors="coerce")
events[EVENT_VALUE] = numeric_values
print("Length before:", len(events))
has_numeric_value = events[EVENT_VALUE].notna()
events = events[has_numeric_value]
print("Length after:", len(events))
events

In [None]:
# Persist the cleaned, numeric-only events under the cleaned directory
cleaned_batch_path = join(CLEANED_DIR, "batch_0000.parquet")
save_dataframe(events, cleaned_batch_path)