# Shared notebook for processing temporal features.

# Imports

In [None]:
from functools import reduce

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import (
    Aggregator,
    tabular_as_aggregated,
    timestamp_ffill_agg,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    RESTRICT_TIMESTAMP,
    TIMESTEP,
)
from cyclops.processors.constants import ALL, FEATURES, MEAN, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.vectorize import (
    Vectorized,
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import np_ffill_bfill, np_fill_null_num
from cyclops.utils.file import (
    join,
    load_dataframe,
    load_pickle,
    save_dataframe,
    save_pickle,
    yield_dataframes,
    yield_pickled_files,
)
from drift_detection.gemini.utils import get_use_case_params

# Choose dataset and use-case

In [None]:
# Select which dataset / use-case pair this notebook run processes.
DATASET = "gemini"
USE_CASE = "mortality"

# Parameter object for the chosen pair; every path and processing constant used
# below is read from it.
use_case_params = get_use_case_params(DATASET, USE_CASE)
# input() blocks until the user presses Enter, so execution pauses here and
# shows which constants were loaded (presumably a deliberate confirmation step
# before later cells write files).
input(f"WARNING: LOADING CONSTANTS FROM {use_case_params}")

In [None]:
# Load the encounters table and replace its stored index with a fresh
# 0..n-1 RangeIndex (the old index is dropped, not kept as a column).
cohort = load_dataframe(use_case_params.ENCOUNTERS_FILE)
cohort = cohort.reset_index(drop=True)
# Preview the first five rows.
cohort.head(5)

In [None]:
# Wrap the cohort table as tabular features keyed by ENCOUNTER_ID, forcing the
# feature types listed in the use-case parameters.
tab_features = TabularFeatures(
    data=cohort,
    features=use_case_params.TAB_FEATURES,
    by=ENCOUNTER_ID,
    force_types=use_case_params.TAB_FEATURES_TYPES,
)

# Feature names grouped by type: the numeric ones are reused later when the
# normalizer is set up, the ordinal ones for binary-indicator encoding.
numeric_features = tab_features.features_by_type(NUMERIC)
ordinal_features = tab_features.features_by_type(ORDINAL)

# Print the value mapping of the first ordinal feature (if any) for inspection.
if len(ordinal_features) > 0:
    print(ordinal_features[0], "mapping:")
    print(tab_features.meta[ordinal_features[0]].get_mapping())

# Vectorize with the ordinal features converted to binary indicators, then
# persist both the vectorized data and the feature object.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)
save_pickle(tab_vectorized, use_case_params.TAB_VECTORIZED_FILE)
save_pickle(tab_features, use_case_params.TAB_FEATURES_FILE)

In [None]:
load_dataframe(use_case_params.ENCOUNTERS_FILE)

In [None]:
# Re-read the encounters file and keep only the encounter ID plus the
# admission, discharge and target timestamp columns.
# NOTE(review): the same file was already loaded into `cohort`; this reads it
# from disk again.
timestamps = load_dataframe(use_case_params.ENCOUNTERS_FILE)[
    [
        ENCOUNTER_ID,
        ADMIT_TIMESTAMP,
        DISCHARGE_TIMESTAMP,
        use_case_params.TARGET_TIMESTAMP,
    ]
]
# Admission time per encounter, indexed by encounter ID, with the column
# renamed to RESTRICT_TIMESTAMP. It is passed as `window_start_time` when the
# events are aggregated.
start_timestamps = (
    timestamps[[ENCOUNTER_ID, ADMIT_TIMESTAMP]]
    .set_index(ENCOUNTER_ID)
    .rename({ADMIT_TIMESTAMP: RESTRICT_TIMESTAMP}, axis=1)
)
start_timestamps

# Temporal-specific processing

In [None]:
# Determine which events to keep.
# For every cleaned events batch, count how often each event name occurs among
# the rows whose value is not null and keep the TOP_N_EVENTS most frequent.
all_top_events = []
for events in yield_dataframes(use_case_params.CLEANED_DIR, log=False):
    top_events = (
        events.loc[events[EVENT_VALUE].notna(), EVENT_NAME]
        .value_counts()
        # Explicitly positional: value_counts() is sorted by frequency.
        .iloc[: use_case_params.TOP_N_EVENTS]
        .index
    )

    all_top_events.append(top_events)

    # Drop the reference so the batch can be freed before the next one loads.
    del events

# reduce() without an initial value raises an opaque TypeError on an empty
# sequence, so fail early with an actionable message instead.
if not all_top_events:
    raise ValueError(
        f"No event batches were found in {use_case_params.CLEANED_DIR}; "
        "cannot determine the top events."
    )

# Take only the events common to every file
top_events = reduce(np.intersect1d, all_top_events)

top_events

In [None]:
len(top_events)

In [None]:
# Aggregator configuration shared by the temporal and the tabular processing:
# event values are aggregated with MEAN per (encounter, event name), timed by
# EVENT_TIMESTAMP, using the timestep size and window duration from the
# use-case parameters.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=use_case_params.TIMESTEP_SIZE,
    window_duration=use_case_params.WINDOW_DURATION,
)

In [None]:
# Aggregate
# `skip_n` is handed to the generator and is also added to the output batch
# number, so output file names stay stable when a run is resumed with
# skip_n > 0 (presumably skip_n batches are skipped on input - confirm against
# yield_dataframes).
skip_n = 0
generator = yield_dataframes(use_case_params.CLEANED_DIR, skip_n=skip_n, log=False)

for save_count, events in enumerate(generator):
    # Take only the top events
    events = events[events[EVENT_NAME].isin(top_events)]

    # Aggregate
    # The index is reset first so the filtered rows are numbered 0..n-1 again.
    events = events.reset_index(drop=True)
    tmp_features = TemporalFeatures(
        events,
        features=EVENT_VALUE,
        by=[ENCOUNTER_ID, EVENT_NAME],
        timestamp_col=EVENT_TIMESTAMP,
        aggregator=aggregator,
    )

    # Windows start at each encounter's admission time (see start_timestamps).
    aggregated = tmp_features.aggregate(window_start_time=start_timestamps)

    # One output file per input batch, named batch_0000, batch_0001, ...
    save_dataframe(
        aggregated,
        join(use_case_params.AGGREGATED_DIR, "batch_" + f"{save_count + skip_n:04d}"),
    )
    # Drop the reference so the batch can be freed before the next one loads.
    del events

In [None]:
# Vectorize
# Turn every aggregated batch into a vectorized object and pickle it under the
# same batch number (offset by skip_n, as in the aggregation step).
skip_n = 0
generator = yield_dataframes(use_case_params.AGGREGATED_DIR, skip_n=skip_n, log=False)
for save_count, aggregated in enumerate(generator):
    vec = aggregator.vectorize(aggregated)
    save_pickle(
        vec,
        join(use_case_params.VECTORIZED_DIR, "batch_" + f"{save_count + skip_n:04d}"),
    )

In [None]:
# Take all Vectorized objects and turn them into a single object
vecs = list(yield_pickled_files(use_case_params.VECTORIZED_DIR))
if not vecs:
    raise ValueError(
        f"No vectorized batches were found in {use_case_params.VECTORIZED_DIR}."
    )

# All batches are stacked along the encounter axis; the remaining axes are
# taken from the first batch and are assumed to be identical across batches.
encounter_axis = vecs[0].get_axis(ENCOUNTER_ID)
res = np.concatenate([vec.data for vec in vecs], axis=encounter_axis)

# Work on a shallow copy of the index container: assigning into
# `vecs[0].indexes` directly would silently change the first batch as well.
indexes = list(vecs[0].indexes)
indexes[encounter_axis] = np.concatenate([vec.indexes[encounter_axis] for vec in vecs])
temp_vectorized = Vectorized(res, indexes, vecs[0].axis_names)
# `temp_vectorized` holds the data now; drop the extra notebook-level name.
del res

In [None]:
temp_vectorized.shape

In [None]:
temp_vectorized.axis_names

## Target creation

In [None]:
def compute_timestep(timestamps, event):
    """Add the time-since-admission and timestep index of ``event``.

    Two columns are written to ``timestamps``:

    * ``"<event>_after_admit"``: ``timestamps[event]`` minus the admission
      timestamp column.
    * ``"<event>_timestep"``: that difference divided by the timestep size
      (``use_case_params.TIMESTEP_SIZE`` hours), rounded down.

    The frame is modified in place and the same object is returned.
    """
    after_admit_col = f"{event}_after_admit"
    timestamps[after_admit_col] = timestamps[event] - timestamps[ADMIT_TIMESTAMP]

    step = pd.Timedelta(f"{use_case_params.TIMESTEP_SIZE} hour")
    timestamps[f"{event}_timestep"] = np.floor(timestamps[after_admit_col] / step)
    return timestamps


# The prediction target time is the target timestamp moved PREDICT_OFFSET
# hours earlier.
timestamps["target"] = timestamps[use_case_params.TARGET_TIMESTAMP] - pd.DateOffset(
    hours=use_case_params.PREDICT_OFFSET
)
# Adds "target_after_admit" / "target_timestep" and the corresponding
# discharge columns.
timestamps = compute_timestep(timestamps, "target")
timestamps = compute_timestep(timestamps, DISCHARGE_TIMESTAMP)
timestamps

In [None]:
timestamps[~timestamps[use_case_params.TARGET_TIMESTAMP].isna()]

In [None]:
# Encounter IDs in the order in which they appear along the encounter axis of
# the temporal data, as a one-column frame so it can be merged on ENCOUNTER_ID.
encounter_order = pd.Series(
    temp_vectorized.get_index(ENCOUNTER_ID), name=ENCOUNTER_ID
).to_frame()
encounter_order

In [None]:
# Name of the discharge timestep column written by compute_timestep.
discharge_timestep = f"{DISCHARGE_TIMESTAMP}_timestep"
timesteps = timestamps[[ENCOUNTER_ID, "target_timestep", discharge_timestep]]
# Left merge: one row per entry of encounter_order, in that order, so the rows
# line up with the encounter axis of the temporal data.
aligned_timestamps = encounter_order.merge(timesteps, how="left", on=ENCOUNTER_ID)
aligned_timestamps

In [None]:
# Number of timesteps in the aggregation window.
num_timesteps = int(use_case_params.WINDOW_DURATION / use_case_params.TIMESTEP_SIZE)
# NOTE(review): `shape` is not read by any other cell visible in this notebook.
shape = (len(aligned_timestamps), num_timesteps)

# Array derived from the target timestep of each encounter. fill_nan=2 acts as
# a sentinel that is reset to 0 further down.
# NOTE(review): presumably the result has one row per encounter and one column
# per timestep - confirm against timestamp_ffill_agg.
arr1 = timestamp_ffill_agg(
    aligned_timestamps["target_timestep"], num_timesteps, fill_nan=2
)
# Same construction from the discharge timestep, written with val=-1.
arr2 = timestamp_ffill_agg(
    aligned_timestamps[discharge_timestep], num_timesteps, val=-1, fill_nan=2
)
# np.minimum keeps the smaller entry, so a -1 in arr2 overrides any larger value
# in arr1; assuming no value exceeds 2, the sentinel survives only where both
# arrays hold it.
targets = np.minimum(arr1, arr2)
# Remaining sentinels become 0.
targets[targets == 2] = 0
# Spot-check a hard-coded range of rows (compare with aligned_timestamps).
targets[126:146]

In [None]:
aligned_timestamps.iloc[126:146]

In [None]:
# Insert a length-1 axis at position 0 and another at position 2, turning the
# 2-D target array into a 4-D one.
# NOTE(review): presumably this matches the axis layout of temp_vectorized so
# the targets can be concatenated over EVENT_NAME - confirm.
targets = np.expand_dims(np.expand_dims(targets, 0), 2)
targets.shape

In [None]:
temp_vectorized.shape

In [None]:
# Include target
# Append the target array along the EVENT_NAME axis, labelled with TEMP_TARGETS.
# The commented-out lines below remove a previously added target; presumably
# they are meant for re-running this cell - confirm before relying on them.
# temp_vectorized = temp_vectorized.remove_with_index(EVENT_NAME, TEMP_TARGETS)
# print(temp_vectorized.shape)
temp_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, targets, use_case_params.TEMP_TARGETS
)
temp_vectorized.shape

In [None]:
# Sanity check: the target entries that were just added must not contain NaN.
only_targets = temp_vectorized.take_with_index(EVENT_NAME, use_case_params.TEMP_TARGETS)
assert not np.isnan(only_targets.data).any()

In [None]:
save_pickle(temp_vectorized, use_case_params.TEMP_VECTORIZED_FILE)

# Combined processing

In [None]:
temp_vectorized = load_pickle(use_case_params.TEMP_VECTORIZED_FILE)

In [None]:
# Tabular data with the ordinal features converted to binary indicators; the
# index is moved back into the frame so that ENCOUNTER_ID is a regular column.
tab = tab_features.get_data(to_binary_indicators=ordinal_features).reset_index()

# Take only the encounters with temporal events
# (np.isin is the supported replacement for the deprecated np.in1d and returns
# the same boolean mask for 1-D input.)
has_temporal_events = np.isin(
    tab[ENCOUNTER_ID].values, temp_vectorized.get_index(ENCOUNTER_ID)
)
tab = tab[has_temporal_events]

# Aggregate tabular
# Reshape the tabular features into the aggregated event layout over the same
# number of timesteps as the temporal data, using the ALL strategy.
tab_aggregated = tabular_as_aggregated(
    tab=tab,
    index=ENCOUNTER_ID,
    var_name=EVENT_NAME,
    value_name=EVENT_VALUE,
    strategy=ALL,
    num_timesteps=aggregator.window_duration // aggregator.timestep_size,
)
tab_aggregated

In [None]:
# Vectorize tabular
# The same aggregator as for the temporal events is used for vectorizing.
tab_aggregated_vec = aggregator.vectorize(tab_aggregated)
tab_aggregated_vec.shape

In [None]:
temp_vectorized.shape

In [None]:
tab_aggregated_vec.shape

In [None]:
# Combine
# Append the vectorized tabular features to the temporal data along the
# EVENT_NAME axis, labelled with the tabular feature names.
comb_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, tab_aggregated_vec.data, tab_aggregated_vec.get_index(EVENT_NAME)
)
comb_vectorized.shape

In [None]:
# Don't include any of the tabular targets - split out to avoid label leakage
# The second return value (the split-out targets) is discarded.
comb_vectorized, _ = comb_vectorized.split_out(EVENT_NAME, use_case_params.TAB_TARGETS)
comb_vectorized.shape

In [None]:
comb_vectorized.get_index(EVENT_NAME)

In [None]:
np.isnan(tab_aggregated_vec.data).sum() / tab_aggregated_vec.data.size

In [None]:
tab_aggregated_vec.data

In [None]:
np.isnan(temp_vectorized.data).sum() / temp_vectorized.data.size

In [None]:
np.isnan(comb_vectorized.data).sum() / comb_vectorized.data.size

# Prepare splits

In [None]:
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

In [None]:
# Restrict the three datasets to their common encounters along ENCOUNTER_ID
# and show the resulting shapes.
tab_vectorized, temp_vectorized, comb_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
# Normalize only numeric features (e.g., not binary indicators)
# Note: Normalization is not occurring here, we are only doing the setup
# Every numeric tabular feature is mapped to the STANDARD method.
normalizer_map = {feat: STANDARD for feat in numeric_features}

tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map=normalizer_map,
)

In [None]:
# Normalize all events
# Note: Normalization is not occurring here, we are only doing the setup
# The STANDARD method is registered over the EVENT_NAME axis of both the
# temporal and the combined data.
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

comb_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Split all three datasets in a single call over the ENCOUNTER_ID axis using
# SPLIT_FRACTIONS, then unpack each result into train / validation / test.
tab_splits, temp_splits, comb_splits = split_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized],
    use_case_params.SPLIT_FRACTIONS,
    axes=ENCOUNTER_ID,
)
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits
comb_train, comb_val, comb_test = comb_splits

In [None]:
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
temp_train.shape, temp_val.shape, temp_test.shape

In [None]:
comb_train.shape, comb_val.shape, comb_test.shape

## Split features/targets

Split out the targets from the features in the tabular, temporal, and combined data.

In [None]:
# Tabular training split: separate the TAB_TARGETS entries of the FEATURES axis
# (y) from the remaining features (X).
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Tabular validation split: separate the TAB_TARGETS entries of the FEATURES
# axis (y) from the remaining features (X).
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Tabular test split: separate the TAB_TARGETS entries of the FEATURES axis (y)
# from the remaining features (X).
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
# Temporal training split: separate the TEMP_TARGETS entries of the EVENT_NAME
# axis (y) from the remaining events (X).
temp_train_X, temp_train_y = temp_train.split_out(
    EVENT_NAME, use_case_params.TEMP_TARGETS
)
temp_train_X.shape, temp_train_y.shape

In [None]:
# Temporal validation split: separate the TEMP_TARGETS entries of the
# EVENT_NAME axis (y) from the remaining events (X).
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
# Temporal test split: separate the TEMP_TARGETS entries of the EVENT_NAME axis
# (y) from the remaining events (X).
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

In [None]:
# Combined training split: the targets are the temporal ones (TEMP_TARGETS);
# the tabular targets were already removed from the combined data.
comb_train_X, comb_train_y = comb_train.split_out(
    EVENT_NAME, use_case_params.TEMP_TARGETS
)
comb_train_X.shape, comb_train_y.shape

In [None]:
# Combined validation split: separate the TEMP_TARGETS entries of the
# EVENT_NAME axis (y) from the remaining entries (X).
comb_val_X, comb_val_y = comb_val.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
comb_val_X.shape, comb_val_y.shape

In [None]:
# Combined test split: separate the TEMP_TARGETS entries of the EVENT_NAME axis
# (y) from the remaining entries (X).
comb_test_X, comb_test_y = comb_test.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
comb_test_X.shape, comb_test_y.shape

In [None]:
def impute(temp_vec):
    """Fill the nulls of a vectorized temporal dataset in place.

    The nulls are first forward and then backward filled over the TIMESTEP
    axis. Series that are null at every timestep are left unchanged by that
    step, so afterwards the remaining nulls of each event are filled with the
    mean of that event's non-null values.

    Parameters
    ----------
    temp_vec
        Vectorized object exposing ``data``, ``get_axis`` and
        ``impute_over_axis``. It is modified in place.

    Returns
    -------
    The same ``temp_vec`` object, so that ``x = impute(x)`` works.
    """
    # Forward fill then backward fill to get rid of all of the timestep nulls
    temp_vec.impute_over_axis(TIMESTEP, np_ffill_bfill)

    # Fill those all-null timesteps with feature mean
    # (since forward and backward filling still leaves them all null)
    axis = temp_vec.get_axis(EVENT_NAME)

    # Full slices for every axis in front of the event axis. Building the
    # index from `axis` keeps it in sync with get_axis() instead of assuming
    # that EVENT_NAME is always the third axis.
    leading = (slice(None),) * axis

    for i in range(temp_vec.data.shape[axis]):
        index_exp = vec_index_exp[leading + (i,)]
        data_slice = temp_vec.data[index_exp]
        # Mean over all non-null values of this event, taken after the
        # forward/backward fill above.
        mean = np.nanmean(data_slice)
        # The lambda is used immediately within this iteration, so closing
        # over `mean` is safe here.
        func = lambda x: np_fill_null_num(x, mean)  # noqa: E731
        temp_vec.impute_over_axis(TIMESTEP, func, index_exp=index_exp)

    return temp_vec


# Impute the feature part (X) of every temporal and combined split; the targets
# (y) are not imputed. Each split is imputed separately, so the per-event mean
# used by impute() is computed from that split's own data.
temp_train_X = impute(temp_train_X)
temp_val_X = impute(temp_val_X)
temp_test_X = impute(temp_test_X)

comb_train_X = impute(comb_train_X)
comb_val_X = impute(comb_val_X)
comb_test_X = impute(comb_test_X)

### Normalization

In [None]:
# Feature parts (X) of all nine splits; the targets are not normalized.
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
)

# NOTE(review): fit_normalizer() is called on every split, including the
# validation and test ones. If it estimates its statistics from the split's own
# data, validation/test are normalized with their own statistics rather than
# the training ones - confirm that this is intended.
for split in splits:
    split.fit_normalizer()
    split.normalize()

# Unpacks the same objects back into the same names; the two method calls above
# return nothing that is stored, so this only matters if they work in place.
(
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
) = splits

## Save

In [None]:
# Store data (serialize)
# Pairs of (object, file stem) for every feature/target split.
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
    (comb_train_X, "comb_train_X"),
    (comb_train_y, "comb_train_y"),
    (comb_val_X, "comb_val_X"),
    (comb_val_y, "comb_val_y"),
    (comb_test_X, "comb_test_X"),
    (comb_test_y, "comb_test_y"),
]
# The path is built by plain string concatenation, so TAB_VEC_COMB has to end
# with whatever separator or prefix the file names are expected to follow.
for vec, name in vectorized:
    save_pickle(vec, use_case_params.TAB_VEC_COMB + name + ".pkl")