## Shared notebook for processing tabular features.

In [None]:
# Dataset and use case names; together they select which use-case
# constants module gets imported by this notebook.
DATASET = "mimic"
USECASE = "mortality_decompensation"

In [None]:
import importlib

# Import notebooks.usecases.<DATASET>.<USECASE>.constants dynamically so the
# same notebook can be pointed at a different dataset/use case.
usecase_params = importlib.import_module(
    f"notebooks.usecases.{DATASET}.{USECASE}.constants"
)

In [None]:
# Block until the user responds, showing which constants module was loaded
# so it can be checked before any processing runs.
input(f"WARNING: LOADING CONSTANTS FROM {usecase_params}")

In [None]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    ALL,
    FEATURES,
    MEAN,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.utils.common import print_dict
from cyclops.utils.file import (
    join,
    load_array,
    load_dataframe,
    load_pickle,
    process_dir_save_path,
    save_dataframe,
    save_pickle,
)

# Tabular-specific processing

In [None]:
# Load the cohort from the file named by the use case's ENCOUNTERS_FILE
# and preview its first 5 rows.
cohort = load_dataframe(usecase_params.ENCOUNTERS_FILE)
cohort.head(5)

In [None]:
# Count how many rows fall under each value of the "admission_location" column.
cohort["admission_location"].value_counts()

In [None]:
# Replace the existing index with a fresh default one (the old index is dropped).
cohort = cohort.reset_index(drop=True)
# Build the tabular feature container from the cohort, restricted to the
# use case's TAB_FEATURES and passing ENCOUNTER_ID as the `by` column.
tab_features = TabularFeatures(
    data=cohort,
    features=usecase_params.TAB_FEATURES,
    by=ENCOUNTER_ID,
)

In [None]:
# Inspect the `types` attribute of the tabular features.
tab_features.types

In [None]:
# List the keys of the tabular features' metadata mapping.
tab_features.meta.keys()

In [None]:
# Collect the features whose type is ORDINAL and display them.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# When at least one ordinal feature exists, print the mapping stored in the
# metadata of the first one as an example.
if len(ordinal_features) > 0:
    print(ordinal_features[0], "mapping:")
    print(tab_features.meta[ordinal_features[0]].get_mapping())

In [None]:
# Vectorize the tabular features, passing the ordinal features as
# `to_binary_indicators` (presumably expanding them into binary indicator
# columns — confirm against TabularFeatures.vectorize).
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

In [None]:
# Collect the features whose type is NUMERIC and display them; this list is
# what the normalizer map is built from further down.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

In [None]:
# Shape of the vectorized tabular data.
tab_vectorized.shape

In [None]:
# Axis names of the vectorized tabular data.
tab_vectorized.axis_names

In [None]:
# Persist the vectorized data to the use case's TAB_VECTORIZED_FILE.
save_pickle(tab_vectorized, usecase_params.TAB_VECTORIZED_FILE)

In [None]:
# Persist the TabularFeatures object to the use case's TAB_FEATURES_FILE.
save_pickle(tab_features, usecase_params.TAB_FEATURES_FILE)

# Prepare splits

In [None]:
# Only the numeric features get a normalizer (binary indicators, for example,
# are left out of the map).
# Note: no normalization occurs in this cell; it only attaches the
# normalizer configuration to the vectorized object.
normalizer_map = dict.fromkeys(numeric_features, STANDARD)

tab_vectorized.add_normalizer(FEATURES, normalizer_map=normalizer_map)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Shape before splitting, for comparison with the per-split shapes.
tab_vectorized.shape

In [None]:
# Split by ENCOUNTER_ID using the fractions configured for the use case,
# then unpack the result into train / validation / test.
tab_splits = tab_vectorized.split_by_fraction(
    ENCOUNTER_ID, usecase_params.SPLIT_FRACTIONS
)
tab_train, tab_val, tab_test = tab_splits

In [None]:
# Shapes of the train, validation and test splits.
tab_train.shape, tab_val.shape, tab_test.shape

## Split features/targets

Split out the targets from the tabular data.

In [None]:
# Split the use case's TAB_TARGETS out of the FEATURES axis of the training
# split into an (X, y) pair, then show both shapes.
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, usecase_params.TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Split the use case's TAB_TARGETS out of the FEATURES axis of the validation
# split into an (X, y) pair, then show both shapes.
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, usecase_params.TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Split the use case's TAB_TARGETS out of the FEATURES axis of the test
# split into an (X, y) pair, then show both shapes.
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, usecase_params.TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

### Normalization

In [None]:
# Fit and apply the normalizer on each feature split in turn.
# NOTE(review): fit_normalizer() is called on every split, so validation and
# test are presumably scaled with their own statistics rather than the
# training split's — confirm this is intended (possible train/test mismatch).
splits = tab_train_X, tab_val_X, tab_test_X

for split in splits:
    split.fit_normalizer()
    split.normalize()

# Rebinds the same three objects that were packed into `splits` above.
tab_train_X, tab_val_X, tab_test_X = splits

## Save

In [None]:
# Store data: pair each split with the name it is saved under.
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
]
# Pickle every split to the path formed by appending its name to TAB_UNALIGNED.
# NOTE(review): the name is concatenated with `+`, not joined as a path
# component — assumes TAB_UNALIGNED already ends with the intended
# separator/prefix; confirm against the use case constants.
for vec, name in vectorized:
    save_pickle(vec, usecase_params.TAB_UNALIGNED + name)