Open question: should we filter to keep only the main diagnosis? Why aren't the other diagnoses important?

Instead, we could get the encounter/main diagnoses in a separate table and then join it with the entire diagnoses table, so that the other diagnoses are kept.

In [None]:
import pickle
import time

import numpy as np
import pandas as pd
from consts import OUTCOME_DEATH, OUTCOME_EDEMA
from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_labs,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    main,
)

from cyclops.processors.aggregate import Aggregator, tabular_as_aggregated
from cyclops.processors.cleaning import normalize_names, normalize_values
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    ALL,
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    FIRST,
    LAST,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.utils.file import load_dataframe, save_dataframe

In [None]:
# Fractions handed to split_vectorized. Two fractions are listed while three
# splits (train/val/test) are unpacked from its result later in this notebook;
# presumably the remainder forms the last split - confirm.
SPLIT_FRACTIONS = [0.7, 0.2]
# Targets split out of the tabular data.
TAB_TARGETS = [OUTCOME_DEATH, OUTCOME_EDEMA]

# Event name under which the death outcome is added to the temporal events.
OUTCOME_DEATH_TEMP = "outcome_death_temp"
# Targets split out of the temporal data.
TEMP_TARGETS = [OUTCOME_DEATH_TEMP]
# Hours subtracted from the discharge timestamp when creating the death event.
PREDICT_OFFSET_HRS = 24

# Tabular-specific processing

In [None]:
# Load the cohort table from the relative path "cohort.parquet".
cohort = load_dataframe("cohort.parquet")

In [None]:
# Columns of the cohort table used as tabular features.
features = [
    HOSPITAL_ID,
    AGE,
    SEX,
    DIAGNOSIS_TRAJECTORY,
    "readmission",
    "from_nursing_home_mapped",
    "from_acute_care_institution_mapped",
    "los_derived",
    "prev_encounter_count",
]

# Replace the index with a fresh default one. drop=True discards the old index
# directly instead of materializing it as a column and dropping a column that
# is assumed to be called "index" (which fails for a named index and would
# remove a genuine "index" column if the table had one).
cohort = cohort.reset_index(drop=True)

# Build the tabular features per encounter, with TAB_TARGETS marked as targets.
tab_features = TabularFeatures(
    cohort,
    features,
    by=ENCOUNTER_ID,
    targets=TAB_TARGETS,
)

In [None]:
# Display the `types` attribute of the tabular features.
tab_features.types

In [None]:
# List the names that have an entry in the tabular feature metadata.
tab_features.meta.keys()

In [None]:
# Show the mapping held by the metadata entry for SEX.
tab_features.meta[SEX].get_mapping()

In [None]:
# Show the mapping held by the metadata entry for DIAGNOSIS_TRAJECTORY.
tab_features.meta[DIAGNOSIS_TRAJECTORY].get_mapping()

In [None]:
# Show the mapping held by the metadata entry for the OUTCOME_DEATH target.
tab_features.meta[OUTCOME_DEATH].get_mapping()

In [None]:
# Show the mapping held by the metadata entry for HOSPITAL_ID.
tab_features.meta[HOSPITAL_ID].get_mapping()

In [None]:
# Show the mapping held by the metadata entry for "readmission".
tab_features.meta["readmission"].get_mapping()

In [None]:
# Features reported as ORDINAL; they are later passed as
# `to_binary_indicators` when vectorizing and when fetching the tabular data.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

In [None]:
# Vectorize the tabular features, passing the ordinal features as
# `to_binary_indicators` (presumably expands them into indicator columns - confirm).
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

In [None]:
# Shape of the vectorized tabular data.
tab_vectorized.shape

In [None]:
# Axis names of the vectorized tabular data.
tab_vectorized.axis_names

In [None]:
# Features reported as NUMERIC; used later to build the normalizer map.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

In [None]:
# Save the vectorized tabular data under the relative path "tab_vectorized.npy".
tab_vectorized.save("tab_vectorized.npy")

# Temporal-specific processing

In [None]:
# Load the lab events and keep a random subsample of 1/100 of the rows.
events = load_dataframe("labs.parquet")
# A fixed random_state makes the subsample - and every artifact derived from
# it further down - reproducible across runs.
events = events.sample(n=len(events) // 100, random_state=0)
events.head(5)

In [None]:
# Distinct event names as loaded (this cell precedes the normalize_names call).
events[EVENT_NAME].unique()

In [None]:
# Run the event names and values through the cyclops cleaning helpers and
# write the results back into the same columns.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

In [None]:
# Distinct event names after normalize_names has been applied.
events[EVENT_NAME].unique()

In [None]:
# Create the target as a timeseries event: one row per encounter whose
# OUTCOME_DEATH flag is set, carrying the encounter ID and discharge timestamp.
# A single .loc selection followed by .copy() gives death_events its own data,
# so the column assignments below never write into a slice of `cohort`
# (which is what triggers pandas' SettingWithCopyWarning).
death_events = cohort.loc[
    cohort[OUTCOME_DEATH] == True,  # noqa: E712 - element-wise comparison
    [ENCOUNTER_ID, DISCHARGE_TIMESTAMP],
].copy()

# Offset death time - i.e., should predict death time in advance
death_events[DISCHARGE_TIMESTAMP] = death_events[DISCHARGE_TIMESTAMP] - pd.DateOffset(
    hours=PREDICT_OFFSET_HRS
)

# Reshape into the event layout: timestamp, event name and a constant value of 1.
death_events = death_events.rename(columns={DISCHARGE_TIMESTAMP: EVENT_TIMESTAMP})
death_events[EVENT_NAME] = OUTCOME_DEATH_TEMP
death_events[EVENT_VALUE] = 1
death_events.head(5)

In [None]:
# Append the death target events to the lab events. ignore_index is not set,
# so each frame keeps its own index labels in the result.
events = pd.concat([events, death_events])

In [None]:
# Aggregation settings: EVENT_VALUE is aggregated with MEAN for each
# (encounter, event name) pair, with time handled per encounter.
# window_duration // timestep_size = 24 // 8 = 3, the same quotient later used
# as num_timesteps for the tabular data. Units are presumably hours - confirm.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,
)

In [None]:
# Give the concatenated events a fresh default index, discarding the old labels.
events = events.reset_index(drop=True)

# Temporal features over EVENT_VALUE, grouped by encounter and event name and
# tied to the aggregator configured above.
tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [None]:
# Aggregate the temporal features and preview the first rows.
aggregated = tmp_features.aggregate()
aggregated.head(5)

In [None]:
# Persist the aggregated temporal data under "aggregated.parquet".
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Vectorize the aggregated temporal data and show the resulting shape.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

In [None]:
# Axis names of the vectorized temporal data.
temp_vectorized.axis_names

In [None]:
# Save the vectorized temporal data under the relative path "temp_vectorized.npy".
temp_vectorized.save("temp_vectorized.npy")

# Combined processing

In [None]:
# Fetch the tabular data with the same to_binary_indicators setting that was
# used for vectorizing.
tab = tab_features.get_data(to_binary_indicators=ordinal_features)

# Recast the tabular data into the aggregated (event name / event value)
# layout. num_timesteps is derived from the aggregator so that it matches the
# temporal data: window_duration // timestep_size.
# NOTE(review): strategy=ALL presumably repeats each value at every timestep - confirm.
tab_aggregated = tabular_as_aggregated(
    tab=tab,
    index=ENCOUNTER_ID,
    var_name=EVENT_NAME,
    value_name=EVENT_VALUE,
    strategy=ALL,
    num_timesteps=aggregator.window_duration // aggregator.timestep_size,
)
tab_aggregated

In [None]:
# Display the aggregated temporal data.
aggregated

In [None]:
# Stack the temporal and tabular aggregated frames, then sort by index.
comb_aggregated = pd.concat([aggregated, tab_aggregated])
comb_aggregated = comb_aggregated.sort_index()
comb_aggregated

In [None]:
# Sanity check: index entries of the combined data that repeat an earlier entry.
comb_aggregated.index[comb_aggregated.index.duplicated()]

In [None]:
# Display the combined aggregated data.
comb_aggregated

In [None]:
# Vectorize the combined aggregated data and show the resulting shape.
comb_vectorized = aggregator.vectorize(comb_aggregated)
comb_vectorized.shape

# Prepare before splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
# Shapes of the tabular and temporal data before intersecting over encounters.
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Intersect the three datasets along the ENCOUNTER_ID axis and rebind the
# names to the results, then show the shapes after intersecting.
tab_vectorized, temp_vectorized, comb_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

Add normalizers before the splits for convenience (but do not fit or normalize yet).

In [None]:
# Only the numeric features are mapped to the STANDARD normalizer; anything
# else (e.g., binary indicators) is left out of the map.
normalizer_map = dict.fromkeys(numeric_features, STANDARD)

# Attach the per-feature normalizers along FEATURES (nothing is fitted here).
tab_vectorized.add_normalizer(FEATURES, normalizer_map=normalizer_map)

In [None]:
# Standardize all events: one normalization method (STANDARD) is given for
# EVENT_NAME instead of a per-event map.
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

In [None]:
# Standardize the combined features: the numeric tabular features plus every
# event name found on the temporal data's EVENT_NAME axis.
temp_normalizer_map = {feat: STANDARD for feat in temp_vectorized.get_index(EVENT_NAME)}

# Merge into a NEW dict instead of calling update() in place. The existing
# normalizer_map object has already been passed to
# tab_vectorized.add_normalizer; mutating it here could silently add the
# temporal event names to the tabular normalizer if that call kept a reference.
normalizer_map = {**normalizer_map, **temp_normalizer_map}

comb_vectorized.add_normalizer(
    EVENT_NAME,
    normalizer_map=normalizer_map,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
# Split all three datasets in a single call along the ENCOUNTER_ID axis; one
# group of splits comes back per input dataset, in input order.
tab_splits, temp_splits, comb_splits = split_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized],
    SPLIT_FRACTIONS,
    axes=ENCOUNTER_ID,
)
# Each group unpacks into training, validation and testing splits.
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits
comb_train, comb_val, comb_test = comb_splits

In [None]:
# Shapes of the tabular train/val/test splits.
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
# Shapes of the temporal train/val/test splits.
temp_train.shape, temp_val.shape, temp_test.shape

In [None]:
# Shapes of the combined train/val/test splits.
comb_train.shape, comb_val.shape, comb_test.shape

## Split features & targets

Split out the targets from the features in the tabular, temporal, and combined data.

In [None]:
# Tabular training split: pull TAB_TARGETS out of FEATURES into X and y.
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
# Tabular validation split: pull TAB_TARGETS out of FEATURES into X and y.
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
# Tabular testing split: pull TAB_TARGETS out of FEATURES into X and y.
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
# Temporal training split: pull TEMP_TARGETS out of EVENT_NAME into X and y.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

In [None]:
# Temporal validation split: pull TEMP_TARGETS out of EVENT_NAME into X and y.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
# Temporal testing split: pull TEMP_TARGETS out of EVENT_NAME into X and y.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

In [None]:
# Combined training split: both the tabular and the temporal targets are
# pulled out of EVENT_NAME into y.
comb_train_X, comb_train_y = comb_train.split_out(
    EVENT_NAME, TAB_TARGETS + TEMP_TARGETS
)
comb_train_X.shape, comb_train_y.shape

In [None]:
# Combined validation split: tabular and temporal targets are pulled out into y.
comb_val_X, comb_val_y = comb_val.split_out(EVENT_NAME, TAB_TARGETS + TEMP_TARGETS)
comb_val_X.shape, comb_val_y.shape

In [None]:
# Combined testing split: tabular and temporal targets are pulled out into y.
comb_test_X, comb_test_y = comb_test.split_out(EVENT_NAME, TAB_TARGETS + TEMP_TARGETS)
comb_test_X.shape, comb_test_y.shape

## Normalization

In [None]:
# Feature (X) splits to normalize; the target (y) splits are left untouched.
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
)

# NOTE(review): every split - validation and test included - fits its own
# normalizer here, so val/test are scaled with their own statistics rather
# than the training split's. Confirm this is intended.
for split in splits:
    # Both calls are made for their effect on the object; their return values
    # are discarded. The names above therefore already refer to the processed
    # objects, and no re-unpacking of `splits` is needed afterwards.
    split.fit_normalizer()
    split.normalize()

# Save

In [None]:
# Store data (serialize)
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_val_X, "tab_val_X"),
    (tab_test_X, "tab_test_X"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
    (comb_train_X, "comb_train_X"),
    (comb_train_y, "comb_train_y"),
    (comb_val_X, "comb_val_X"),
    (comb_val_y, "comb_val_y"),
    (comb_test_X, "comb_test_X"),
    (comb_test_y, "comb_test_y"),
]
for vec, name in vectorized:
    with open(name + ".pkl", "wb") as handle:
        pickle.dump(vec, handle)

In [None]:
"""
tab_train_X.save("tab_train_X.npy")
tab_val_X.save("tab_val_X.npy")
tab_test_X.save("tab_test_X.npy")

temp_train_X.save("temp_train_X.npy")
temp_val_X.save("temp_val_X.npy")
temp_test_X.save("temp_test_X.npy")

temp_train_y.save("temp_train_y.npy")
temp_val_y.save("temp_val_y.npy")
temp_test_y.save("temp_test_y.npy")
"""