In [None]:
# from wanglabconsts import (
from mimicmortalityconsts import (
    AGGREGATED_DIR,
    AGGREGATED_FILE,
    CLEANED_DIR,
    CONST_NAME,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    OUTCOME_DEATH_PROCESSED,
    PREDICT_OFFSET,
    QUERIED_DIR,
    SPLIT_FRACTIONS,
    TAB_FEATURES_FILE,
    TAB_TARGETS,
    TAB_VEC_COMB,
    TAB_VECTORIZED_FILE,
    TEMP_TARGETS,
    TEMP_VECTORIZED_FILE,
    TIMESTEP_SIZE,
    VECTORIZED_DIR,
    WINDOW_DURATION,
)

In [None]:
# Block until the user responds, so they can confirm which constants module
# (CONST_NAME) is in effect before any data is read or written.
# NOTE(review): input() needs an interactive kernel - confirm before running this
# notebook headlessly.
input(f"WARNING: LOADING CONSTANTS FROM {CONST_NAME}")

In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator, tabular_as_aggregated
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    ALL,
    FEATURES,
    MEAN,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.vectorize import (
    Vectorized,
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import (
    np_ffill,
    np_ffill_bfill,
    np_fill_null_num,
    np_fill_null_zero,
)
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import (
    join,
    load_array,
    load_dataframe,
    load_pickle,
    process_dir_save_path,
    save_dataframe,
    save_pickle,
    yield_dataframes,
    yield_pickled_files,
)

# Get tabular features

In [None]:
# Load the pickled tabular features object from TAB_FEATURES_FILE.
# NOTE(review): unpickling executes arbitrary code - only load files you produced.
tab_features = load_pickle(TAB_FEATURES_FILE)

In [None]:
# Tabular features of type NUMERIC; used further down to build the normalizer map.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

In [None]:
# Tabular features of type ORDINAL; these are passed as `to_binary_indicators`
# when the tabular data is vectorized or extracted.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# Vectorize the tabular features, converting the ordinal ones to binary indicators.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)
tab_vectorized.shape

# Temporal-specific processing

In [None]:
from functools import reduce

# Select the events to keep: the most frequent non-null events of every cleaned
# batch, restricted to those that appear in the top list of *all* batches.
# NOTE(review): TOP_N_EVENTS is not defined or imported anywhere in the visible part
# of this notebook - confirm where it should come from (e.g. the constants module),
# otherwise this cell raises NameError on a fresh kernel.
all_top_events = []
for events in yield_dataframes(CLEANED_DIR, log=False):
    # Keep only the most popular events where the values are not null
    top_events = (
        events[EVENT_NAME][~events[EVENT_VALUE].isna()]
        .value_counts()[:TOP_N_EVENTS]
        .index
    )

    all_top_events.append(top_events)

    # Drop the reference before the generator loads the next batch
    del events

# Take only the events common to every file
top_events = reduce(np.intersect1d, tuple(all_top_events))

# Force include the target
top_events = np.unique(np.append(top_events, OUTCOME_DEATH_PROCESSED))

top_events

In [None]:
len(top_events)

In [None]:
# Aggregator reused by the aggregation, vectorization and tabular-alignment cells.
# It applies MEAN to EVENT_VALUE, aggregating by (ENCOUNTER_ID, EVENT_NAME); the
# timestep size and window duration come from the constants module.
# NOTE(review): the exact meaning/units of time_by, timestep_size and window_duration
# are defined by cyclops' Aggregator - confirm there.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=TIMESTEP_SIZE,
    window_duration=WINDOW_DURATION,
)

In [None]:
# Set up data generator
skip_n = 0
generator = yield_dataframes(CLEANED_DIR, skip_n=skip_n, log=False)

for save_count, events in enumerate(generator):
    # Take only the top events
    events = events[events[EVENT_NAME].isin(top_events)]

    # Aggregate
    events = events.reset_index(drop=True)
    tmp_features = TemporalFeatures(
        events,
        features=EVENT_VALUE,
        by=[ENCOUNTER_ID, EVENT_NAME],
        timestamp_col=EVENT_TIMESTAMP,
        aggregator=aggregator,
    )

    aggregated = tmp_features.aggregate()

    save_dataframe(
        aggregated, join(AGGREGATED_DIR, "batch_" + f"{save_count + skip_n:04d}")
    )

In [None]:
# Set up data generator
skip_n = 0
generator = yield_dataframes(AGGREGATED_DIR, skip_n=skip_n, log=False)

for save_count, aggregated in enumerate(generator):
    vec = aggregator.vectorize(aggregated)
    save_pickle(vec, join(VECTORIZED_DIR, "batch_" + f"{save_count + skip_n:04d}"))

In [None]:
# Stitch the per-batch vectorized arrays into one array along the encounter axis
# and save the result.
vecs = list(yield_pickled_files(VECTORIZED_DIR))
encounter_axis = vecs[0].get_axis(ENCOUNTER_ID)
res = np.concatenate([vec.data for vec in vecs], axis=encounter_axis)

# Copy the index arrays first: assigning into `vecs[0].indexes` directly would
# overwrite the first batch's own encounter index with the concatenated one.
indexes = [ind.copy() for ind in vecs[0].indexes]
indexes[encounter_axis] = np.concatenate([vec.indexes[encounter_axis] for vec in vecs])

temp_vectorized = Vectorized(res, indexes, vecs[0].axis_names)
del res
save_pickle(temp_vectorized, TEMP_VECTORIZED_FILE)

In [None]:
temp_vectorized.shape

In [None]:
temp_vectorized.axis_names

In [None]:
# Process targets - this is not real imputation, just using the imputation functions
# to do preprocessing
for target in TEMP_TARGETS:
    event_ind = temp_vectorized.get_index_map(EVENT_NAME)[target]
    # Select this target's slice; the event axis is addressed as the third axis here
    index_exp = vec_index_exp[:, :, event_ind]

    # Forward fill target values, e.g., [nan, nan, 1., nan, nan] -> [nan, nan, 1, 1, 1]
    temp_vectorized.impute_over_axis(TIMESTEP, np_ffill, index_exp=index_exp)

    # Fill remaining values with 0, e.g., [nan, nan, 1, 1, 1] -> [0, 0, 1, 1, 1]
    # or [nan, nan, nan, nan, nan] -> [0, 0, 0, 0, 0]
    temp_vectorized.impute_over_axis(TIMESTEP, np_fill_null_zero, index_exp=index_exp)

# Persist the processed targets: a later cell reloads TEMP_VECTORIZED_FILE, so
# without this save the target processing above would be lost on reload.
save_pickle(temp_vectorized, TEMP_VECTORIZED_FILE)

# Combined processing

In [None]:
# Reload the temporal vectorized data from TEMP_VECTORIZED_FILE.
# NOTE(review): this replaces the in-memory object, so any in-memory changes made
# since the file was last written are discarded.
temp_vectorized = load_pickle(TEMP_VECTORIZED_FILE)

In [None]:
# Tabular data as a DataFrame (ordinal features as binary indicators), with the
# index moved into regular columns by reset_index().
tab = tab_features.get_data(to_binary_indicators=ordinal_features).reset_index()
tab

In [None]:
# Take only the encounters with temporal events
# (np.isin replaces np.in1d, which is deprecated as of NumPy 2.0)
has_temporal = np.isin(
    tab[ENCOUNTER_ID].values, temp_vectorized.get_index(ENCOUNTER_ID)
)
tab = tab[has_temporal]
tab

In [None]:
# Convert the tabular data into the aggregated event format (EVENT_NAME /
# EVENT_VALUE columns keyed by ENCOUNTER_ID) so it can go through the same
# aggregator.vectorize step as the temporal events.
# The number of timesteps is derived from the aggregator's window and timestep sizes.
# NOTE(review): strategy=ALL presumably repeats each tabular value at every
# timestep - confirm against tabular_as_aggregated.
tab_aggregated = tabular_as_aggregated(
    tab=tab,
    index=ENCOUNTER_ID,
    var_name=EVENT_NAME,
    value_name=EVENT_VALUE,
    strategy=ALL,
    num_timesteps=aggregator.window_duration // aggregator.timestep_size,
)
tab_aggregated

In [None]:
# Vectorize the tabular-as-events data with the same aggregator used for the
# temporal events.
tab_aggregated_vec = aggregator.vectorize(tab_aggregated)

In [None]:
# Build the combined dataset by appending the tabular "events" to the temporal
# events along the event axis.
event_axis = temp_vectorized.get_axis(EVENT_NAME)

# np.concatenate joins the arrays positionally, so both inputs must list the same
# encounters in the same order; otherwise values from different encounters would
# be combined silently.
assert np.array_equal(
    temp_vectorized.get_index(ENCOUNTER_ID),
    tab_aggregated_vec.get_index(ENCOUNTER_ID),
), "temporal and tabular encounter indexes are not aligned"

res = np.concatenate([temp_vectorized.data, tab_aggregated_vec.data], axis=event_axis)

# Copy the temporal indexes so temp_vectorized itself is left untouched
indexes = [ind.copy() for ind in temp_vectorized.indexes]
indexes[event_axis] = np.concatenate(
    [temp_vectorized.indexes[event_axis], tab_aggregated_vec.indexes[event_axis]]
)
comb_vectorized = Vectorized(res, indexes, temp_vectorized.axis_names)

In [None]:
comb_vectorized.shape

In [None]:
# Sanity check: splitting the temporal event names back out of the combined data
# must reproduce the temporal array exactly (NaNs are treated as equal).
_, save = comb_vectorized.split_out(EVENT_NAME, temp_vectorized.get_index(EVENT_NAME))
assert np.array_equal(save.data, temp_vectorized.data, equal_nan=True)

In [None]:
# Sanity check: splitting the tabular event names back out of the combined data
# must reproduce the tabular array exactly (NaNs are treated as equal).
_, save = comb_vectorized.split_out(
    EVENT_NAME, tab_aggregated_vec.get_index(EVENT_NAME)
)
assert np.array_equal(save.data, tab_aggregated_vec.data, equal_nan=True)

In [None]:
# Don't include the tabular targets
# (split_out returns a pair; the first element is what remains, the second is the
# split-out TAB_TARGETS part, which is discarded here)
comb_vectorized, _ = comb_vectorized.split_out(EVENT_NAME, TAB_TARGETS)

In [None]:
comb_vectorized.shape

In [None]:
comb_vectorized.get_index(EVENT_NAME)

In [None]:
# Fraction of NaN entries in the vectorized tabular data
np.isnan(tab_aggregated_vec.data).sum() / tab_aggregated_vec.data.size

In [None]:
# Fraction of NaN entries in the vectorized temporal data
np.isnan(temp_vectorized.data).sum() / temp_vectorized.data.size

In [None]:
# Fraction of NaN entries in the combined data
np.isnan(comb_vectorized.data).sum() / comb_vectorized.data.size

# Prepare splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

In [None]:
# Intersect the three datasets over the ENCOUNTER_ID axis, then display the
# resulting shapes to compare with the shapes before intersection.
tab_vectorized, temp_vectorized, comb_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

In [None]:
# Normalize only numeric features (e.g., not binary indicators)
# Note: Normalization is not occurring, we are only preparing the object
normalizer_map = {feat: STANDARD for feat in numeric_features}

tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map=normalizer_map,
)

In [None]:
# Normalize all events
# Note: Normalization is not occurring, we are only preparing the object
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

# The combined data gets the same STANDARD method over its event axis
comb_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Split all three datasets in one call over the ENCOUNTER_ID axis, using
# SPLIT_FRACTIONS, then unpack each result into train / validation / test.
# NOTE(review): no seed is passed here - confirm whether split_vectorized shuffles
# and, if so, how the split is made reproducible.
tab_splits, temp_splits, comb_splits = split_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized],
    SPLIT_FRACTIONS,
    axes=ENCOUNTER_ID,
)
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits
comb_train, comb_val, comb_test = comb_splits

In [None]:
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
temp_train.shape, temp_val.shape, temp_test.shape

In [None]:
comb_train.shape, comb_val.shape, comb_test.shape

## Split features/targets

Split out the targets from the tabular, temporal, and combined data.

In [None]:
# Tabular training split: separate TAB_TARGETS (y) from the remaining features (X)
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Tabular validation split: separate TAB_TARGETS (y) from the remaining features (X)
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Tabular test split: separate TAB_TARGETS (y) from the remaining features (X)
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
# Temporal training split: separate the TEMP_TARGETS events (y) from the rest (X)
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

In [None]:
# Temporal validation split: separate the TEMP_TARGETS events (y) from the rest (X)
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
# Temporal test split: separate the TEMP_TARGETS events (y) from the rest (X)
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

In [None]:
# Combined training split: separate the TEMP_TARGETS events (y) from the rest (X)
comb_train_X, comb_train_y = comb_train.split_out(EVENT_NAME, TEMP_TARGETS)
comb_train_X.shape, comb_train_y.shape

In [None]:
# Combined validation split: separate the TEMP_TARGETS events (y) from the rest (X)
comb_val_X, comb_val_y = comb_val.split_out(EVENT_NAME, TEMP_TARGETS)
comb_val_X.shape, comb_val_y.shape

In [None]:
# Combined test split: separate the TEMP_TARGETS events (y) from the rest (X)
comb_test_X, comb_test_y = comb_test.split_out(EVENT_NAME, TEMP_TARGETS)
comb_test_X.shape, comb_test_y.shape

In [None]:
def impute(temp_vec):
    """Fill the nulls of a vectorized temporal dataset and return it.

    Two passes are made over the TIMESTEP axis:

    1. forward fill followed by backward fill (``np_ffill_bfill``);
    2. for every event, any value that is still null is replaced by the NaN-ignoring
       mean of that event's slice of ``temp_vec.data``.

    The mean is computed from the object passed in, i.e. per dataset/split.
    The event axis is addressed as the third axis when slicing.

    Parameters
    ----------
    temp_vec
        Object exposing ``data``, ``get_axis`` and ``impute_over_axis``.

    Returns
    -------
    The same ``temp_vec`` object that was passed in.

    """
    # Forward fill then backward fill to get rid of each of the timestep nulls
    temp_vec.impute_over_axis(TIMESTEP, np_ffill_bfill)

    # Series that were entirely null are untouched by the pass above, so fall back
    # to the per-event mean for them
    num_events = temp_vec.data.shape[temp_vec.get_axis(EVENT_NAME)]

    for event_ind in range(num_events):
        event_slice = vec_index_exp[:, :, event_ind]
        event_mean = np.nanmean(temp_vec.data[event_slice])

        def fill_with_mean(values):
            return np_fill_null_num(values, event_mean)

        temp_vec.impute_over_axis(TIMESTEP, fill_with_mean, index_exp=event_slice)

    return temp_vec


# Impute the feature part of every temporal and combined split.
# impute() returns the very object it is given, so the names need no rebinding.
# NOTE(review): each split is filled with its own per-event mean rather than the
# training mean - confirm this is intended.
for split_X in (
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
):
    impute(split_X)

### Normalization

In [None]:
# Fit and apply the previously attached normalizers on the feature part of every split.
# The loop calls methods on the split objects themselves, so the existing variable
# names keep pointing at the same objects and need no re-unpacking afterwards.
# NOTE(review): every split fits its own normalizer here, so validation/test data
# are not scaled with the training statistics - confirm this is intended.
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
)

for split in splits:
    split.fit_normalizer()
    split.normalize()

## Save

In [None]:
# Store data (serialize)
# Each entry pairs an object with the file stem it is saved under; the full path is
# TAB_VEC_COMB followed by the stem and the ".pkl" extension.
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
    (comb_train_X, "comb_train_X"),
    (comb_train_y, "comb_train_y"),
    (comb_val_X, "comb_val_X"),
    (comb_val_y, "comb_val_y"),
    (comb_test_X, "comb_test_X"),
    (comb_test_y, "comb_test_y"),
]
for vec, name in vectorized:
    save_path = TAB_VEC_COMB + name + ".pkl"
    save_pickle(vec, save_path)