In [None]:
import pickle
from functools import reduce

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.vectorize import (
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import (
    np_ffill,
    np_ffill_bfill,
    np_fill_null_num,
    np_fill_null_zero,
)
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import (
    join,
    load_array,
    load_dataframe,
    save_dataframe,
    yield_dataframes,
)

In [None]:
# --- Shared settings -------------------------------------------------------
# Fractions handed to split_vectorized; two fractions produce the three
# train/validation/test splits unpacked further down.
SPLIT_FRACTIONS = [0.8, 0.1]
# Number of hours by which the death event timestamps are moved earlier.
PREDICT_OFFSET = 24

# --- Tabular ---------------------------------------------------------------
OUTCOME_DEATH = "outcome_death"
TAB_TARGETS = [OUTCOME_DEATH]

# --- Temporal --------------------------------------------------------------
# Directory of the cleaned event files read with yield_dataframes.
TEMP_DIR = "./2cleaned"
# Event name under which the death outcome appears in the event data.
OUTCOME_DEATH_PROCESSED = " - ".join([TARGETS, OUTCOME_DEATH])
TEMP_TARGETS = [OUTCOME_DEATH_PROCESSED]

# Tabular-specific processing

In [None]:
# Load the encounter cohort from encounters.parquet and preview the first rows.
cohort = load_dataframe("encounters.parquet")
cohort.head(5)

In [None]:
# Start from a clean 0..n-1 row index before building the features.
cohort = cohort.reset_index(drop=True)

# Cohort columns handed to TabularFeatures. The death outcome is part of the
# list because it is split out as the tabular target later in the notebook.
features = [AGE, SEX, OUTCOME_DEATH]
features += [
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

# Tabular features keyed by encounter.
tab_features = TabularFeatures(data=cohort, features=features, by=ENCOUNTER_ID)

In [None]:
# Inspect the `types` attribute of the tabular features.
tab_features.types

In [None]:
# Keys of the tabular feature metadata mapping.
tab_features.meta.keys()

In [None]:
# Mapping stored in the metadata of the "admission_location" feature
# (presumably category label <-> encoded value; confirm against cyclops).
tab_features.meta["admission_location"].get_mapping()

In [None]:
# Features typed as ORDINAL; these are passed as `to_binary_indicators`
# when the tabular features are vectorized.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# Vectorize the tabular features, requesting binary indicators for the ordinal ones.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

In [None]:
# Shape of the vectorized tabular data.
tab_vectorized.shape

In [None]:
# Axis names of the vectorized tabular data.
tab_vectorized.axis_names

In [None]:
# Features typed as NUMERIC; only these are standardized when the tabular
# normalizer is added later in the notebook.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

# Temporal-specific processing

In [None]:
# Select the events to model: for every cleaned event file take the `top_n`
# most frequent event names among rows with a non-null value, keep only the
# names that make the cut in every file, then add the target event.
# `reduce` and `np` come from the import cell at the top of the notebook.
top_n = 150
all_top_events = []
for events in yield_dataframes(TEMP_DIR):
    # Keep only the most popular events where the values are not null
    has_value = events[EVENT_VALUE].notna()
    file_top_events = events.loc[has_value, EVENT_NAME].value_counts().head(top_n).index
    all_top_events.append(file_top_events)

    # Drop the reference before the next file is loaded so that two full
    # frames are not held in memory at once.
    del events

# `reduce` without an initial value raises an opaque TypeError on an empty
# sequence; fail with a message that points at the actual problem instead.
if not all_top_events:
    raise ValueError(f"No event dataframes were found in {TEMP_DIR!r}")

# Take only the events common to every file
top_events = reduce(np.intersect1d, all_top_events)

# Force include the target
top_events = np.unique(np.append(top_events, OUTCOME_DEATH_PROCESSED))

top_events

In [None]:
# Number of selected event names (the target event included).
len(top_events)

In [None]:
# events = events[events[EVENT_NAME].isin(top_events)]
# events.head(5)

In [None]:
# Load the first batch of cleaned events. The generator is created in this
# cell so that it runs on a fresh kernel (Restart & Run All); previously
# `generator` was only defined in a later cell, which raised NameError here.
generator = yield_dataframes(TEMP_DIR)
events = next(generator)

In [None]:
# events = load_dataframe("events.parquet")
# events = events.sample(n=int(len(events) / 20))

In [None]:
# Fresh generator over the event dataframes stored in TEMP_DIR.
generator = yield_dataframes(TEMP_DIR)

In [None]:
# Keep only the rows whose event name is one of the selected top events
# (which include the target event), then display the result.
events = events[events[EVENT_NAME].isin(top_events)]
events

In [None]:
# Preview the first rows of the target (death outcome) event.
events[events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED].head(5)

In [None]:
# Offset death time targets such that the model is predicting death time in advance.
#
# The assignment must go through a single `.loc[rows, column]` call: the
# chained form `events[mask][col] = ...` writes into a temporary copy and
# leaves `events` untouched, so the offset was silently never applied.
#
# NOTE: the shift is applied in place, so re-running this cell moves the
# timestamps back by another PREDICT_OFFSET hours.
is_death_event = events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED
events.loc[is_death_event, EVENT_TIMESTAMP] = events.loc[
    is_death_event, EVENT_TIMESTAMP
] - pd.DateOffset(hours=PREDICT_OFFSET)

In [None]:
# Aggregator for the temporal events: EVENT_VALUE is aggregated with MEAN for
# every (encounter, event name) pair, with time handled per encounter.
# NOTE(review): timestep_size=8 and window_duration=24 are presumably hours
# (three timesteps per window) — confirm against the Aggregator documentation.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,
)

In [None]:
# Reset to a clean 0..n-1 row index after the filtering above.
events = events.reset_index(drop=True)

# Temporal features over the event values, keyed by encounter and event name
# and aggregated with the aggregator configured above.
tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [None]:
# Run the aggregation and preview the first rows of the result.
aggregated = tmp_features.aggregate()
aggregated.head(5)

In [None]:
# Persist the aggregated events so the aggregation does not have to be re-run.
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Reload the aggregated events saved to aggregated.parquet.
aggregated = load_dataframe("aggregated.parquet")

In [None]:
# Vectorize the aggregated events and show the resulting shape.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

In [None]:
# Axis names of the vectorized temporal data.
temp_vectorized.axis_names

In [None]:
# Impute the target events over time: once a target value has been observed it
# is carried forward, and every remaining null is set to 0.
for target in TEMP_TARGETS:
    # Position of the target along the EVENT_NAME axis.
    event_ind = temp_vectorized.get_index_map(EVENT_NAME)[target]
    # NOTE(review): the index expression assumes EVENT_NAME is the third axis — confirm.
    index_exp = vec_index_exp[:, :, event_ind]

    # Forward fill the target values, e.g., [nan, nan, 1., nan, nan] -> [nan, nan, 1, 1, 1]
    temp_vectorized.impute_over_axis(TIMESTEP, np_ffill, index_exp=index_exp)

    # Fill remaining values with 0, e.g., [nan, nan, 1, 1, 1] -> [0, 0, 1, 1, 1]
    # or [nan, nan, nan, nan, nan] -> [0, 0, 0, 0, 0]
    temp_vectorized.impute_over_axis(TIMESTEP, np_fill_null_zero, index_exp=index_exp)

In [None]:
# Inspect the slice at index 0 of the two leading axes after imputing the targets.
temp_vectorized.data[0, 0]

In [None]:
# Shape of the vectorized temporal data.
temp_vectorized.shape

In [None]:
# Forward fill then backward fill to get rid of each of the timestep nulls.
# No index expression is given, so this is applied to the whole array.
temp_vectorized.impute_over_axis(TIMESTEP, np_ffill_bfill)

In [None]:
# Fill those all-null timesteps with feature mean
# (since forward and backward filling still leaves them all null)

axis = temp_vectorized.get_axis(EVENT_NAME)

for i in range(temp_vectorized.data.shape[axis]):
    # NOTE(review): the index expression assumes EVENT_NAME is the third axis — confirm.
    index_exp = vec_index_exp[:, :, i]
    data_slice = temp_vectorized.data[index_exp]

    # A feature without a single observed value has no mean to impute with.
    # Skipping it avoids numpy's "Mean of empty slice" warning and a fill with
    # NaN, which presumably changes nothing.
    if np.isnan(data_slice).all():
        continue

    mean = np.nanmean(data_slice)

    def func(x):
        """Replace the nulls of `x` with the mean of the current feature."""
        return np_fill_null_num(x, mean)

    temp_vectorized.impute_over_axis(TIMESTEP, func, index_exp=index_exp)

In [None]:
# Inspect the slice at index 0 of the two leading axes after the mean imputation.
temp_vectorized.data[0, 0]

# Combined processing

# Prepare splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
# Shapes of the tabular and temporal data before aligning them.
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Intersect the two datasets over the ENCOUNTER_ID axis and show the shapes
# afterwards.
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Standardize only the numeric features (e.g., not binary indicators)
normalizer_map = {feat: STANDARD for feat in numeric_features}

# Register the normalizer on the FEATURES axis; fitting and applying it are
# done by the fit_normalizer()/normalize() calls in the normalization cell.
tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map=normalizer_map,
)

In [None]:
# Standardize all events: register a STANDARD normalizer on the EVENT_NAME axis.
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Split both datasets together over the ENCOUNTER_ID axis using SPLIT_FRACTIONS,
# then unpack each into its train/validation/test parts.
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized], SPLIT_FRACTIONS, axes=ENCOUNTER_ID
)
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits

In [None]:
# Shapes of the tabular train/validation/test splits.
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
# Shapes of the temporal train/validation/test splits.
temp_train.shape, temp_val.shape, temp_test.shape

## Split features/targets

Split the targets out of the features in both the tabular and the temporal data.

In [None]:
# Tabular training split: separate TAB_TARGETS from the rest along the FEATURES axis.
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Tabular validation split: separate TAB_TARGETS from the rest along the FEATURES axis.
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Tabular test split: separate TAB_TARGETS from the rest along the FEATURES axis.
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
# Temporal training split: separate TEMP_TARGETS from the rest along the EVENT_NAME axis.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

In [None]:
# Temporal validation split: separate TEMP_TARGETS from the rest along the EVENT_NAME axis.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
# Temporal test split: separate TEMP_TARGETS from the rest along the EVENT_NAME axis.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

### Normalization

In [None]:
# Fit and apply the registered normalizers on every feature split.
#
# NOTE(review): each split — validation and test included — is fitted on its
# own data here. Normalization statistics are usually fitted on the training
# split only and reused for the others; confirm whether this is intended.
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
)

for split in splits:
    # The return values are not kept: both calls are relied upon to update
    # `split` itself. Unpacking `splits` back into the six names afterwards
    # would only rebind them to the same objects, so that step is dropped.
    split.fit_normalizer()
    split.normalize()

## Save

In [None]:
# Serialize every split to "<name>.pkl" in the current working directory.
vectorized = [
    # Tabular
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
    # Temporal
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
]
for vec, name in vectorized:
    # One pickle file per split, named after the variable holding it.
    with open("".join([name, ".pkl"]), "wb") as handle:
        pickle.dump(vec, handle)