Filtering to keep only the main diagnosis... should this be done? Why aren't the other diagnoses important?

Instead, we could get the encounter/main diagnoses in a separate table, then join it with the entire diagnoses table to keep the other diagnoses.

In [None]:
import time

from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    main,
)

from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import GroupbyNormalizer
from cyclops.processors.aggregate import Aggregator

In [None]:
# Time how long the query and the filtering below take.
t = time.time()
cohort, labs = main()
# Discard lab rows whose event value is the literal string "PND".
labs = labs[labs["event_value"] != "PND"]
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result has to be assigned for the "subject_id" column to actually be removed.
cohort = cohort.drop("subject_id", axis=1)
print(time.time() - t)
cohort

In [None]:
from cyclops.processors.diagnoses import process_diagnoses

# Run the diagnosis codes through process_diagnoses, store the result in the
# cohort under the returned series' own name, then remove the raw code column.
trajectory = process_diagnoses(cohort[DIAGNOSIS_CODE])
cohort[trajectory.name] = trajectory
cohort = cohort.drop(columns=DIAGNOSIS_CODE)
cohort

In [None]:
# Sum of the "outcome_death" column divided by the number of cohort rows
# (i.e. the death rate, assuming the column is a 0/1 flag).
cohort["outcome_death"].sum() / cohort.shape[0]

In [None]:
# The cohort may not contain an "outcome_edema" column, so the lookup is
# guarded. Only the missing-column case is handled; any other error is a real
# problem and should surface instead of being silently swallowed.
try:
    print(cohort["outcome_edema"].sum() / len(cohort))
except KeyError:
    print("outcome_edema column not found in cohort; skipping.")

## Tabular

In [None]:
# Cohort columns used as tabular features. Each name must appear only once:
# a repeated entry (HOSPITAL_ID was listed twice) would request the same
# column a second time.
features = [
    HOSPITAL_ID,
    AGE,
    SEX,
    DIAGNOSIS_TRAJECTORY,
    "readmission",
    "from_nursing_home_mapped",
    "from_acute_care_institution_mapped",
    "los_derived",
    "prev_encounter_count",
]

# Wrap the cohort, declaring "outcome_death" as the target column.
tab_features = TabularFeatures(
    cohort,
    features,
    targets=["outcome_death"],
)

# Fetch the data, asking for HOSPITAL_ID and "readmission" as indicators.
df = tab_features.get_data(to_indicators=[HOSPITAL_ID, "readmission"])
df

In [None]:
from cyclops.processors.feature.type_handling import collect_indicators
# Call collect_indicators on the HOSPITAL_ID and "readmission" columns of df;
# it returns a pair, unpacked here as (data, meta).
data, meta = collect_indicators(df, [HOSPITAL_ID, "readmission"])
data

In [None]:
# Display the second value returned by collect_indicators.
meta

In [None]:
# Inspect the `types` attribute of the tabular features object.
tab_features.types

In [None]:
# Show the mapping held in the metadata entry for the SEX feature.
tab_features.meta[SEX].get_mapping()

In [None]:
# Show the mapping held in the metadata entry for the "outcome_death" target.
tab_features.meta["outcome_death"].get_mapping()

In [None]:
# List the keys of the features object's metadata container.
tab_features.meta.keys()

In [None]:
# Show the mapping held in the metadata entry for the "readmission" feature.
tab_features.meta["readmission"].get_mapping()

In [None]:
# Configure per-column normalization: MIN_MAX for AGE, STANDARD for
# "los_derived". No `by` argument is given here.
feature_normalizer = GroupbyNormalizer({AGE: MIN_MAX, "los_derived": STANDARD})
# Register the normalizer under the FEATURES key, then run it.
tab_features.add_normalizer(FEATURES, feature_normalizer)
tab_features.normalize(FEATURES)

In [None]:
# Call the inverse of the FEATURES normalization (presumably restoring the
# pre-normalization values -- confirm against the TabularFeatures API).
tab_features.inverse_normalize(FEATURES)

## Temporal

In [None]:
import time

from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    main,
)

from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import GroupbyNormalizer
from cyclops.processors.aggregate import Aggregator

In [None]:
import pandas as pd

import cyclops.query.process as qp
from cyclops.query import gemini

def get_labs(limit: int = 1000) -> pd.DataFrame:
    """Query lab events and return a subset of their columns.

    The "lab" events query is built with ``drop_null_event_names=True`` and
    then restricted to the encounter ID, event name, event value, event value
    unit and event timestamp columns.

    Parameters
    ----------
    limit : int, default 1000
        Forwarded as ``limit`` to the query interface's ``run`` call
        (presumably a cap on the number of rows fetched).

    Returns
    -------
    pd.DataFrame
        The result of running the query.

    """
    # NOTE(review): drop_null_event_values=True was commented out in the
    # original call and is still not passed -- confirm whether events with
    # null values should be kept.
    table = gemini.events("lab", drop_null_event_names=True).query

    kept_columns = [
        ENCOUNTER_ID,
        EVENT_NAME,
        EVENT_VALUE,
        EVENT_VALUE_UNIT,
        EVENT_TIMESTAMP,
    ]
    table = qp.FilterColumns(kept_columns)(table)

    return gemini.get_interface(table).run(limit=limit)

# Fetch the lab events (this rebinds `labs`) and display them.
labs = get_labs()
labs

In [None]:
from cyclops.processors.cleaning import normalize_values
# Pass the lab events through the cleaning helper and keep its result.
labs = normalize_values(labs)

In [None]:
# Aggregation settings: mean of EVENT_VALUE, using EVENT_TIMESTAMP and
# ENCOUNTER_ID, grouped on (ENCOUNTER_ID, EVENT_NAME). A window_duration=24
# argument is currently disabled and therefore not passed.
aggregator = Aggregator(
    {EVENT_VALUE: "mean"},
    EVENT_TIMESTAMP,
    ENCOUNTER_ID,
    [ENCOUNTER_ID, EVENT_NAME],
    8,
    agg_meta_for=EVENT_VALUE,
)

# Temporal feature container over the lab events, with the aggregator attached.
tmp_features = TemporalFeatures(
    labs,
    [EVENT_VALUE],
    [ENCOUNTER_ID, EVENT_NAME],
    EVENT_TIMESTAMP,
    aggregator=aggregator,
)

# STANDARD normalization of EVENT_VALUE, computed separately per EVENT_NAME.
feature_normalizer = GroupbyNormalizer({EVENT_VALUE: STANDARD}, by=EVENT_NAME)
tmp_features.add_normalizer(FEATURES, feature_normalizer)
tmp_features.normalize(FEATURES)
tmp_features.get_data()

In [None]:
# Run the aggregation configured on the temporal features and display it.
aggregated = tmp_features.aggregate()
aggregated

In [None]:
# Inspect the `window_times` attribute of the attached aggregator.
tmp_features.aggregator.window_times

In [None]:
# Turn the index back into regular columns, then order the rows by
# encounter, event name and timestep.
aggregated = aggregated.reset_index().sort_values([ENCOUNTER_ID, EVENT_NAME, TIMESTEP])
aggregated