In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.split import split_idx
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import load_array, load_dataframe, save_dataframe

In [None]:
# Admission years queried for the cohort: 2015 through 2020, with 2017 excluded.
YEARS = [year for year in range(2015, 2021) if year != 2017]
# Earliest year in the cohort.
MIN_YEAR = min(YEARS)

# Raw name given to the mortality target event when it is first created.
TARGET_NAME = "death"
# Name used to pull the target out of the vectorized temporal data.
TARGET_NAME_PROCESSED = "targets - death"

# Querying

## Patient encounters

In [None]:
# Start from the patient-encounters query for the selected years (with the
# "died" options set below), and drop the fields this notebook does not use.
encounters_query = qp.Drop(
    ["insurance", "language", "marital_status", "hospital_expire_flag"]
)(
    mimic.patient_encounters(
        years=YEARS, died=True, died_binarize_col="died"
    ).query
)

# Wrap the modified query in an interface again and execute it.
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()
encounters

In [None]:
# Persist the queried encounters so the processing section can reload them
# from "encounters.parquet" without re-running the query.
save_dataframe(encounters, "encounters.parquet")

## Events

In [None]:
# Build the events query minus the listed fields, wrap it in an interface and
# run it with `limit=1000000`.
events_query = qp.Drop(["warning", "itemid", "storetime"])(mimic.events().query)
events_interface = mimic.get_interface(events_query)
events = events_interface.run(limit=1000000)

# Reverse the deidentified dating: attach each encounter's year offset to its
# events (default inner join on the encounter ID).
events = encounters[[ENCOUNTER_ID, "anchor_year_difference"]].merge(
    events, on=ENCOUNTER_ID
)


def add_offset(row):
    """Shift one event's timestamp by its encounter's anchor-year difference.

    Parameters
    ----------
    row : pandas.Series
        A single events row holding the EVENT_TIMESTAMP value and the
        "anchor_year_difference" value.

    Returns
    -------
    pandas.Series
        A copy of the row whose EVENT_TIMESTAMP has been moved by
        "anchor_year_difference" calendar years.

    """
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so write the shifted timestamp into a copy of the row.
    shifted = row.copy()
    shifted[EVENT_TIMESTAMP] = row[EVENT_TIMESTAMP] + pd.DateOffset(
        years=row["anchor_year_difference"]
    )
    return shifted


# Shift every event row by its year offset, then discard the helper column
# that was only needed for the shift.
events = events.apply(add_offset, axis=1).drop(columns="anchor_year_difference")

In [None]:
# Create the target as a timeseries event: one row per encounter flagged as
# died, timestamped with "deathtime" and carrying a constant value of 1.
# NOTE(review): these timestamps are not passed through add_offset — confirm
# "deathtime" is already on the same time axis as the shifted events.
target_events = (
    encounters.loc[encounters["died"] == True, [ENCOUNTER_ID, "deathtime"]]
    .rename(columns={"deathtime": EVENT_TIMESTAMP})
    .assign(
        **{
            EVENT_NAME: TARGET_NAME,
            EVENT_CATEGORY: TARGETS,
            EVENT_VALUE: 1,
        }
    )
)
target_events.head(5)

In [None]:
# Include target: append the target rows to the events. The two frames carry
# unrelated indexes (merge output vs. encounter rows), so renumber the result
# instead of keeping overlapping index labels.
events = pd.concat([events, target_events], ignore_index=True)

In [None]:
# Preprocessing: normalize the event names and categories.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
# Value normalization is currently left disabled.
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Concatenate event name and category since some names are the same in
# different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
# NOTE: this rewrites EVENT_NAME from itself, so re-running the cell on the
# same frame prefixes the names a second time.
events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]

# Update target name having included the category. The guard keeps a re-run of
# this cell from stacking the prefix onto an already-updated name.
_target_prefix = TARGETS + " - "
if not TARGET_NAME.startswith(_target_prefix):
    TARGET_NAME = _target_prefix + TARGET_NAME

# Kept as the last expression so the preview is actually displayed.
events.head(5)

In [None]:
# Persist the preprocessed events (including the target rows) so the
# processing section can reload them from "events.parquet".
save_dataframe(events, "events.parquet")

------------------------------------------------------------------------------------------------

# Processing

## Tabular

In [None]:
# Reload the encounters saved by the querying section and preview five rows.
encounters = load_dataframe("encounters.parquet")
encounters.head(5)

In [None]:
# Encounter-level columns used as tabular features: two named by project
# constants, the rest by their literal column names.
features = [AGE, SEX] + [
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

# Wrap the encounters as tabular features, identified by encounter ID.
tab_features = TabularFeatures(
    by=ENCOUNTER_ID,
    features=features,
    data=encounters,
)

In [None]:
# Display the `types` attribute of the tabular features
# (presumably the type assigned to each feature — verify).
tab_features.types

In [None]:
# Display the mapping stored in the metadata of the "admission_location" feature.
tab_features.meta["admission_location"].get_mapping()

In [None]:
# Collect the features of type ORDINAL; they are passed to `vectorize` as
# `to_binary_indicators` in the next step.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# Vectorize the tabular features, requesting binary indicators for the
# ordinal features.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

In [None]:
# Display the shape of the vectorized tabular data.
tab_vectorized.shape

In [None]:
# Display the axis names of the vectorized tabular data.
tab_vectorized.axis_names

In [None]:
# Display the index along the FEATURES axis
# (presumably the feature names after vectorization — verify).
tab_vectorized.get_index(FEATURES)

In [None]:
# Collect the features of type NUMERIC; the normalization step later
# standardizes only these.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

## Temporal

In [None]:
# Reload the events saved by the querying section.
events = load_dataframe("events.parquet")

# Work on a 1-in-20 subsample. The fixed seed makes the subsample (and all
# downstream shapes and splits derived from it) reproducible across runs.
events = events.sample(n=len(events) // 20, random_state=0)

events.head(5)

In [None]:
# Keep only the most popular events: the 100 most frequent event names.
top_events = events[EVENT_NAME].value_counts().head(100).index

# Force include the target. TARGET_NAME_PROCESSED is used (as in the later
# split_out step) so this cell does not depend on TARGET_NAME having been
# rewritten by the querying section in the same kernel session.
top_events = np.unique(np.append(top_events, TARGET_NAME_PROCESSED))

events = events[events[EVENT_NAME].isin(top_events)]
events.head(5)

In [None]:
# Preview the target rows that remain after filtering. The processed target
# name is used so the check works when only the processing section is run.
events[events[EVENT_NAME] == TARGET_NAME_PROCESSED].head(5)

In [None]:
# Configure the temporal aggregation: event values are aggregated with MEAN
# per (encounter, event name), with time handled per encounter.
# NOTE(review): timestep_size=8 and window_duration=24 are presumably in
# hours — confirm against the Aggregator documentation.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [None]:
# Sampling and filtering left gaps in the index; renumber the rows from zero.
events = events.reset_index(drop=True)

# Wrap the events as temporal features: the event value is the feature,
# identified per (encounter, event name) and aggregated with the aggregator
# configured above.
tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [None]:
# Run the aggregation on the temporal features and preview five rows.
aggregated = tmp_features.aggregate()
aggregated.head(5)

In [None]:
# Persist the aggregated events to "aggregated.parquet".
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Vectorize the aggregated events with the aggregator and display the shape.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

In [None]:
# Display the axis names of the vectorized temporal data.
temp_vectorized.axis_names

## Splitting

Keep only the encounters that are present in both the tabular and the temporal datasets, and align the two datasets along the encounter axis.

In [None]:
# Shapes of both datasets before intersecting them over encounters.
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Intersect the two datasets along the ENCOUNTER_ID axis, replacing both
# variables with the intersected versions.
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)

In [None]:
# Shapes of both datasets after the intersection, for comparison with the
# shapes shown before it.
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Standardize only the numeric features (e.g., not binary indicators)
tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map={feat: STANDARD for feat in numeric_features},
)

In [None]:
# Standardize all features
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

### Dataset splits

Split into training, validation, and testing datasets.

Both datasets are split using the same encounter indices so that the tabular and temporal splits remain aligned.

In [None]:
# Split both datasets together along the ENCOUNTER_ID axis. Two fractions are
# given and three splits are unpacked, so the third split presumably receives
# the remainder — verify against split_vectorized.
# NOTE(review): no seed is passed here; confirm whether the split is random
# and, if so, whether it should be seeded for reproducibility.
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized], [0.8, 0.1], axes=ENCOUNTER_ID
)
tab_train_X, tab_val_X, tab_test_X = tab_splits
temp_train, temp_val, temp_test = temp_splits

In [None]:
# Shapes of the tabular training, validation and testing splits.
tab_train_X.shape, tab_val_X.shape, tab_test_X.shape

In [None]:
# Shapes of the temporal training, validation and testing splits.
temp_train.shape, temp_val.shape, temp_test.shape

### Split features/targets

Split out the targets in the temporal data.

In [None]:
# Separate the entry named TARGET_NAME_PROCESSED on the EVENT_NAME axis from
# the training split, giving features (X) and targets (y), and show both shapes.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_train_X.shape, temp_train_y.shape

In [None]:
# Same feature/target separation for the validation split.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_val_X.shape, temp_val_y.shape

In [None]:
# Same feature/target separation for the testing split.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_test_X.shape, temp_test_y.shape

### Normalization

In [None]:
# Group the three tabular splits so each receives the same two calls.
splits = tab_train_X, tab_val_X, tab_test_X

# NOTE(review): every split calls fit_normalizer() on itself, so the
# validation and test data appear to be normalized with their own statistics
# rather than the training split's — confirm this is intended.
for split in splits:
    split.fit_normalizer()
    split.normalize()

# Re-unpack the same tuple; assumes normalize() works in place — TODO confirm.
tab_train_X, tab_val_X, tab_test_X = splits

In [None]:
# Group the three temporal feature splits so each receives the same two calls.
# The target splits (temp_*_y) are not included.
splits = temp_train_X, temp_val_X, temp_test_X

# NOTE(review): every split calls fit_normalizer() on itself, so the
# validation and test data appear to be normalized with their own statistics
# rather than the training split's — confirm this is intended.
for split in splits:
    split.fit_normalizer()
    split.normalize()

# Re-unpack the same tuple; assumes normalize() works in place — TODO confirm.
temp_train_X, temp_val_X, temp_test_X = splits

In [None]:
# Share of NaN entries in the temporal training features, scaled to 0-100 so
# the displayed value matches its "Percentage" label.
"Percentage null:", 100 * np.isnan(temp_train_X.data).sum() / temp_train_X.data.size

## Save

In [None]:
# Write every split to its own .npy file, grouped by split.

# Training
tab_train_X.save("tab_train_X.npy")
temp_train_X.save("temp_train_X.npy")
temp_train_y.save("temp_train_y.npy")

# Validation
tab_val_X.save("tab_val_X.npy")
temp_val_X.save("temp_val_X.npy")
temp_val_y.save("temp_val_y.npy")

# Testing
tab_test_X.save("tab_test_X.npy")
temp_test_X.save("temp_test_X.npy")
temp_test_y.save("temp_test_y.npy")