Open question: should we filter to keep only the main diagnosis? Why aren't the other diagnoses important?

Instead, we could fetch the encounter-level (main) diagnoses into a separate table, then join it with the full diagnoses table so that the other diagnoses are kept.

In [None]:
import time

from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    get_labs,
    main,
)

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import normalize_names, normalize_values
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import GroupbyNormalizer
from cyclops.utils.file import load_dataframe, save_dataframe

In [None]:
# Call main() from the querying module to obtain the cohort and labs, and
# print how long the call took, in seconds.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval is not affected by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
cohort, labs = main()
print(time.perf_counter() - start_time)
# Display the cohort.
cohort

In [None]:
# Sum of the "outcome_death" column divided by the number of cohort rows.
# NOTE(review): this is the death rate only if the column is 0/1 or boolean — confirm.
cohort["outcome_death"].sum() / len(cohort)

In [None]:
# Sum of the "outcome_edema" column divided by the number of cohort rows.
# NOTE(review): this is the edema rate only if the column is 0/1 or boolean — confirm.
cohort["outcome_edema"].sum() / len(cohort)

## Tabular

In [None]:
# Cohort columns used as tabular input features.
features = [
    HOSPITAL_ID,
    AGE,
    SEX,
    DIAGNOSIS_TRAJECTORY,
    "readmission",
    "from_nursing_home_mapped",
    "from_acute_care_institution_mapped",
    "los_derived",
    "prev_encounter_count",
]

# Cohort columns treated as prediction targets.
outcome_columns = ["outcome_death", "outcome_edema"]

# Wrap the cohort in a TabularFeatures object with the chosen features/targets.
tab_features = TabularFeatures(cohort, features, targets=outcome_columns)

In [None]:
# Display the `types` attribute of the tabular features
# (presumably the type assigned to each feature — confirm).
tab_features.types

In [None]:
# List the keys that have an entry in the tabular features' metadata.
tab_features.meta.keys()

In [None]:
# Show the mapping stored in the metadata for the SEX feature
# (presumably encoded value <-> original category — confirm).
tab_features.meta[SEX].get_mapping()

In [None]:
# Show the mapping stored in the metadata for the DIAGNOSIS_TRAJECTORY feature
# (presumably encoded value <-> original category — confirm).
tab_features.meta[DIAGNOSIS_TRAJECTORY].get_mapping()

In [None]:
# Show the mapping stored in the metadata for the "outcome_death" target
# (presumably encoded value <-> original category — confirm).
tab_features.meta["outcome_death"].get_mapping()

In [None]:
# Show the mapping stored in the metadata for the hospital feature.
# Uses the HOSPITAL_ID constant (the same name the feature list is built
# with) instead of a hard-coded string, so the lookup cannot drift from the
# column name.
tab_features.meta[HOSPITAL_ID].get_mapping()

In [None]:
# Show the mapping stored in the metadata for the "readmission" feature
# (presumably encoded value <-> original category — confirm).
tab_features.meta["readmission"].get_mapping()

In [None]:
# Normalization method per feature column (AGE -> MIN_MAX, length of stay
# -> STANDARD), registered for the feature columns and then applied.
normalization_methods = {AGE: MIN_MAX, "los_derived": STANDARD}
feature_normalizer = GroupbyNormalizer(normalization_methods)
tab_features.add_normalizer(FEATURES, feature_normalizer)
tab_features.normalize(FEATURES)

# Columns passed as `to_binary_indicators` when extracting the data.
indicator_columns = [DIAGNOSIS_TRAJECTORY, HOSPITAL_ID, "readmission"]
tabular = tab_features.get_data(
    features_only=True,
    to_binary_indicators=indicator_columns,
)
tabular

In [None]:
# Column labels of the extracted tabular data.
tabular.columns

In [None]:
# Raw values of the extracted tabular data
# (an array, assuming `tabular` is a pandas DataFrame — confirm).
tabular.values

## Temporal

In [None]:
# Keep a copy of labs so the original can be restored after the in-place
# modifications made to `labs` further down.
labs_copy = labs.copy()
# Uncomment to restore labs from the saved copy:
#labs = labs_copy.copy()

In [None]:
# Distinct event names in labs (before the names are normalized).
labs[EVENT_NAME].unique()

In [None]:
# Compute the cleaned name and value columns first, then write both back
# into labs, overwriting the original columns.
cleaned_names = normalize_names(labs[EVENT_NAME])
cleaned_values = normalize_values(labs[EVENT_VALUE])
labs[EVENT_NAME] = cleaned_names
labs[EVENT_VALUE] = cleaned_values

In [None]:
# Distinct event names in labs (after the names have been normalized).
labs[EVENT_NAME].unique()

In [None]:
# Normalizer for event values, grouped by event name. It only takes effect
# if the commented-out add_normalizer call below is re-enabled.
feature_normalizer = GroupbyNormalizer({EVENT_VALUE: STANDARD}, by=EVENT_NAME)

# Aggregate event values with MEAN, grouped by encounter and event name.
# NOTE(review): timestep_size and window_duration are presumably in hours,
# giving 24 / 8 = 3 timesteps per window — confirm against Aggregator.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    #agg_meta_for=EVENT_VALUE,  # Optional
)

# Replace the index with a fresh 0..n-1 range. drop=True discards the old
# index directly; the previous reset_index().drop("index", axis=1) raised a
# KeyError when the index was named (or a MultiIndex) and would drop a real
# column if one was already called "index".
labs = labs.reset_index(drop=True)

# Build the temporal features over the event values, using the aggregator
# defined above.
features = [EVENT_VALUE]
tmp_features = TemporalFeatures(
    labs,
    features=features,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)
#tmp_features.add_normalizer(FEATURES, feature_normalizer)
tmp_features.get_data()

In [None]:
#tmp_features.normalize(FEATURES)

In [None]:
# Run the aggregation configured on the temporal features and preview the
# first five rows of the result.
aggregated = tmp_features.aggregate()
aggregated.head(5)

In [None]:
# Expected vectorized result: numpy.ndarray of shape (2000, 20, 3), with axes:
#   encounters
#   events (event_names)
#   timesteps/buckets: 24 / 8 = 3

In [None]:
# Convert the aggregated events into an array plus the index maps needed to
# look entries up in it, then show the array's shape.
events_vectorized, group_indices = aggregator.vectorize(aggregated)
events_vectorized.shape

In [None]:
# numpy is not imported by the notebook's import cell, so import it here;
# without this, np.squeeze raises a NameError.
import numpy as np

# Remove all length-1 axes from the vectorized array, then show the shape.
# NOTE(review): np.squeeze without `axis` also drops any other singleton
# axis (e.g. a single encounter or event); pass an explicit axis if only one
# specific axis should be removed.
events_vectorized = np.squeeze(events_vectorized)
events_vectorized.shape

In [None]:
# Unpack the three index maps returned alongside the vectorized array
# (presumably label -> array position for each axis — confirm).
agg_col_map, encounter_id_map, event_name_map = group_indices

In [None]:
# Example lookup: the values for one hard-coded encounter ID and event name,
# using the index maps to find their positions in the array.
# Raises KeyError if either key is absent from the maps.
events_vectorized[
    encounter_id_map[29991695], event_name_map["spo2 desat limit - alarms"]
]

In [None]:
# Display the encounter index map.
encounter_id_map

# Dataset splits

In [None]:
# Save the aggregated events to "aggregated.parquet"
# (a relative path — presumably resolved against the working directory).
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Save the tabular features to "tabular.parquet"
# (a relative path — presumably resolved against the working directory).
save_dataframe(tabular, "tabular.parquet")

In [None]:
# Shape of the tabular values.
tabular.values.shape

In [None]:
# Split the vectorized events into train, validation and test sets.
# NOTE(review): split_data is not imported in the visible import cell —
# confirm where it is defined and import it, otherwise this is a NameError.
# NOTE(review): [0.7, 0.2] presumably gives the train and validation
# fractions, with the remainder going to test — confirm.
train_data, val_data, test_data = split_data(events_vectorized, [0.7, 0.2])

In [None]:
# Shape of the training split.
train_data.shape

In [None]:
# Shape of the validation split.
val_data.shape

In [None]:
# Shape of the test split.
test_data.shape