In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import load_array, load_dataframe, save_dataframe

In [None]:
# Name of the outcome column as it appears in the tabular cohort data.
OUTCOME_DEATH = "outcome_death"
# Name used to look the outcome up among the temporal event names.
OUTCOME_DEATH_PROCESSED = TARGETS + " - " + OUTCOME_DEATH

# Targets split out of the tabular and temporal vectorized data, respectively.
TAB_TARGETS = [OUTCOME_DEATH]
TEMP_TARGETS = [OUTCOME_DEATH_PROCESSED]

# Passed to split_vectorized, whose result is unpacked into three splits
# (train, val, test) — presumably the train/val fractions with the remainder
# going to test; TODO confirm against split_vectorized.
SPLIT_FRACTIONS = [0.8, 0.1]
# Number of hours subtracted from the death event timestamps.
PREDICT_OFFSET_HRS = 24

In [None]:
# Display the tabular target names.
TAB_TARGETS

# Tabular-specific processing

In [None]:
# Load the cohort table from "encounters.parquet" and preview its first rows.
cohort = load_dataframe("encounters.parquet")
cohort.head(5)

In [None]:
# Cohort columns handed to TabularFeatures. The outcome column is included
# here; it is split out as the tabular target further down (TAB_TARGETS).
features = [
    AGE,
    SEX,
    OUTCOME_DEATH,
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

# Replace the existing index with a fresh 0..n-1 index before building features.
cohort = cohort.reset_index(drop=True)

# Wrap the cohort as tabular features keyed by encounter ID.
tab_features = TabularFeatures(
    data=cohort,
    features=features,
    by=ENCOUNTER_ID,
)

In [None]:
# Inspect the `types` attribute of the tabular features.
tab_features.types

In [None]:
# List the keys for which feature metadata is available.
tab_features.meta.keys()

In [None]:
# Inspect the mapping stored in the metadata of "admission_location".
tab_features.meta["admission_location"].get_mapping()

In [None]:
# Collect the features typed as ORDINAL; they are passed to vectorize() as
# `to_binary_indicators` in the next step.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# Vectorize the tabular features, passing the ordinal features as
# `to_binary_indicators`.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

In [None]:
# Shape of the vectorized tabular data.
tab_vectorized.shape

In [None]:
# Axis names of the vectorized tabular data.
tab_vectorized.axis_names

In [None]:
# Collect the features typed as NUMERIC; these are the ones given a
# normalizer in the combined-processing section.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

In [None]:
# Persist the vectorized tabular data to "tab_vectorized.npy".
tab_vectorized.save("tab_vectorized.npy")

# Temporal-specific processing

In [None]:
# chunk = 1

In [None]:
# Load the events table and keep a random 5% subsample (len / 20 rows) so the
# rest of the notebook runs quickly.
events = load_dataframe("events.parquet")
# A fixed random_state makes the subsample — and therefore every downstream
# result — reproducible across Restart Kernel -> Run All.
events = events.sample(n=int(len(events) / 20), random_state=0)
events.head(5)

In [None]:
# Normalize the event names in place; value normalization is currently disabled.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

In [None]:
# Keep only the most popular events (value_counts sorts by descending
# frequency, so the first 100 entries are the 100 most frequent names).
top_events = events[EVENT_NAME].value_counts().head(100).index

# Force include the target. The cells below look the target up in EVENT_NAME
# under OUTCOME_DEATH_PROCESSED, so that name must survive the filter too;
# keeping only OUTCOME_DEATH could silently drop the target events whenever
# they are not among the top 100. Names absent from the data are harmless.
top_events = np.unique(
    np.append(top_events, [OUTCOME_DEATH, OUTCOME_DEATH_PROCESSED])
)

events = events[events[EVENT_NAME].isin(top_events)]
events.head(5)

In [None]:
# Preview the events whose name equals the processed death-outcome name.
events[events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED].head(5)

In [None]:
# Offset death time - i.e., should predict death time in advance
is_death_event = events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED
# Assign through .loc: the chained form `events[mask][col] = ...` writes to a
# temporary copy and leaves `events` unchanged, so the offset was never applied.
# NOTE: this cell is not idempotent — re-running it shifts the timestamps again.
events.loc[is_death_event, EVENT_TIMESTAMP] = events.loc[
    is_death_event, EVENT_TIMESTAMP
] - pd.DateOffset(hours=PREDICT_OFFSET_HRS)

In [None]:
# Configure the aggregation of event values: the mean of EVENT_VALUE is taken
# per (encounter, event name) group, with time handled per encounter.
# NOTE(review): timestep_size / window_duration are presumably in hours — confirm
# against the Aggregator documentation.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [None]:
# Replace the index left over from sampling/filtering with a fresh 0..n-1 index.
events = events.reset_index(drop=True)

# Wrap the events as temporal features of EVENT_VALUE, keyed by encounter and
# event name, and attach the aggregator configured above.
tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [None]:
# Run the aggregation on the temporal features and preview the result.
aggregated = tmp_features.aggregate()
aggregated.head(5)

In [None]:
# Persist the aggregated events to "aggregated.parquet".
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Vectorize the aggregated events with the aggregator and show the shape.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

In [None]:
# Axis names of the vectorized temporal data.
temp_vectorized.axis_names

In [None]:
# Persist the vectorized temporal data to "temp_vectorized.npy".
temp_vectorized.save("temp_vectorized.npy")

# Combined processing

## Prepare splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
# Shapes of the tabular and temporal data before aligning them.
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Intersect the two datasets over the encounter axis, then show the resulting
# shapes. Both names are rebound to the intersected versions.
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Only the numeric features get a normalizer (binary indicators are left
# untouched): map every numeric feature name to the STANDARD method.
normalizer_map = dict.fromkeys(numeric_features, STANDARD)

# Register the per-feature normalizers on the FEATURES axis; they are fitted
# and applied later, after the data has been split.
tab_vectorized.add_normalizer(FEATURES, normalizer_map=normalizer_map)

In [None]:
# Standardize all events: register the STANDARD method on the EVENT_NAME axis.
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Split both datasets along the encounter axis in a single call, so the
# tabular and temporal splits are produced together.
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized],
    SPLIT_FRACTIONS,
    axes=ENCOUNTER_ID,
)

# Each result holds three parts: train, validation and test.
(tab_train, tab_val, tab_test), (temp_train, temp_val, temp_test) = (
    tab_splits,
    temp_splits,
)

In [None]:
# Shapes of the tabular train / validation / test splits.
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
# Shapes of the temporal train / validation / test splits.
temp_train.shape, temp_val.shape, temp_test.shape

## Split features/targets

Split out the targets in both the tabular and the temporal data.

In [None]:
# Split the tabular training data into inputs and targets along the FEATURES axis.
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Split the tabular validation data into inputs and targets along the FEATURES axis.
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Split the tabular test data into inputs and targets along the FEATURES axis.
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
# Split the temporal training data into inputs and targets along the EVENT_NAME axis.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

In [None]:
# Split the temporal validation data into inputs and targets along the EVENT_NAME axis.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
# Split the temporal test data into inputs and targets along the EVENT_NAME axis.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

### Normalization

In [None]:
# Input (X) splits only; the targets are deliberately left unnormalized.
splits = (
    tab_train_X, tab_val_X, tab_test_X,
    temp_train_X, temp_val_X, temp_test_X,
)

# Fit and apply the normalizers registered earlier on each split.
# NOTE(review): every split — validation and test included — fits its own
# normalizer here; confirm this is intended rather than reusing the statistics
# fitted on the training split.
for split in splits:
    split.fit_normalizer()
    split.normalize()

# Rebind the names from the tuple (the loop calls the methods on the objects
# themselves and discards any return values).
tab_train_X, tab_val_X, tab_test_X, temp_train_X, temp_val_X, temp_test_X = splits

## Save

In [None]:
# Serialize each split to "<name>.pkl" in the working directory.
# NOTE(review): the tabular targets (tab_train_y / tab_val_y / tab_test_y) are
# not in this list — confirm they are intentionally not persisted.
vectorized = [
    # Tabular inputs
    (tab_train_X, "tab_train_X"),
    (tab_val_X, "tab_val_X"),
    (tab_test_X, "tab_test_X"),
    # Temporal inputs and targets
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
]

for vec, name in vectorized:
    with open(f"{name}.pkl", "wb") as handle:
        pickle.dump(vec, handle)