Filtering to keep only the main diagnosis: should this be done? Why aren't the other diagnoses important?

Instead, we could get the encounter/main diagnoses in another table, then join on the entire diagnoses table to keep the other diagnoses.

In [None]:
import time
import numpy as np
import pickle

from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    get_labs,
    main,
)

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import normalize_names, normalize_values
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import GroupbyNormalizer
from cyclops.processors.feature.split import intersect_datasets, split_datasets
from cyclops.utils.file import load_dataframe, save_dataframe

In [None]:
import numpy as np

from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.constants import MIN_MAX, STANDARD

In [None]:
# Feature name -> position lookup that is handed to the normalizer together
# with the toy array below.
feat_map = dict(A=0, B=1)

# Small 3-D float array used to try out the vectorized normalizer.
data = np.asarray(
    [
        [[1, 2, 3], [3, 2, 100]],
        [[4, 5, 2], [9, 20, 10]],
    ],
    dtype=float,
)
data.shape

In [None]:
data

In [None]:
data[:, 0, :]

In [None]:
data[:, 1, :]

In [None]:
feat_map = {"A": 0, "B": 1}

In [None]:
# Build a normalizer that uses the STANDARD method along axis 1 and fit it on
# the toy array (axis 1 is presumably the feature axis indexed by feat_map —
# TODO confirm).
normalizer = VectorizedNormalizer(axis=1, normalization_method=STANDARD)
normalizer.fit(data, feat_map)

In [None]:
normalizer.transform(data, feat_map)

In [None]:
normalizer.normalizers

In [None]:
# Fractions passed to the dataset split calls. Two fractions are given while
# three datasets are unpacked from the split, so the remainder presumably
# forms the third (test) split — TODO confirm.
SPLIT_FRACTIONS = [0.7, 0.2]

# Querying

In [None]:
# Run the querying entry point and report how long it took, in seconds.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted while the query is running.
t = time.perf_counter()
cohort, labs = main()
print(time.perf_counter() - t)
cohort

In [None]:
save_dataframe(cohort, "cohort_raw.parquet")
save_dataframe(labs, "labs_raw.parquet")

In [None]:
cohort["outcome_death"].sum() / len(cohort)

In [None]:
cohort["outcome_edema"].sum() / len(cohort)

In [None]:
# Intersect over the ENCOUNTER_IDs
cohort, labs = intersect_datasets([cohort, labs], ENCOUNTER_ID)

In [None]:
len(cohort[ENCOUNTER_ID].unique())

In [None]:
len(labs[ENCOUNTER_ID].unique())

In [None]:
labs[EVENT_NAME].unique()

In [None]:
labs[EVENT_NAME] = normalize_names(labs[EVENT_NAME])
labs[EVENT_VALUE] = normalize_values(labs[EVENT_VALUE])

In [None]:
labs[EVENT_NAME].unique()

In [None]:
save_dataframe(cohort, "cohort.parquet")
save_dataframe(labs, "labs.parquet")

# Processing

## Tabular

In [None]:
cohort = load_dataframe("cohort.parquet")

In [None]:
# Cohort columns to expose as tabular features.
features = [
    HOSPITAL_ID,
    AGE,
    SEX,
    DIAGNOSIS_TRAJECTORY,
    "readmission",
    "from_nursing_home_mapped",
    "from_acute_care_institution_mapped",
    "los_derived",
    "prev_encounter_count",
]

# Normalization method per column: MIN_MAX for AGE, STANDARD for "los_derived".
normalizer = GroupbyNormalizer({AGE: MIN_MAX, "los_derived": STANDARD})

# Replace the index with a fresh 0..n-1 range. ``drop=True`` discards the old
# index directly instead of materializing it as an "index" column and dropping
# that column afterwards, which fails when the index carries a name.
cohort = cohort.reset_index(drop=True)

tab_features = TabularFeatures(
    cohort,
    features,
    by=ENCOUNTER_ID,
    targets=["outcome_death", "outcome_edema"],
    # Use the normalizer built above; ``feature_normalizer`` was never
    # defined and raised a NameError.
    normalizers={FEATURES: normalizer},
)

In [None]:
tab_features.types

In [None]:
tab_features.meta.keys()

In [None]:
tab_features.meta[SEX].get_mapping()

In [None]:
tab_features.meta[DIAGNOSIS_TRAJECTORY].get_mapping()

In [None]:
tab_features.meta["outcome_death"].get_mapping()

In [None]:
tab_features.meta["hospital_id"].get_mapping()

In [None]:
tab_features.meta["readmission"].get_mapping()

## Temporal

In [None]:
labs = load_dataframe("labs.parquet")

In [None]:
# Only the event value is used as a temporal feature.
features = [EVENT_VALUE]

# Aggregate event values with MEAN per (encounter, event name) group.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    #agg_meta_for=EVENT_VALUE,  # Optional
)
# Normalize event values with the STANDARD method, grouped by event name.
normalizer = GroupbyNormalizer({EVENT_VALUE: STANDARD}, by=EVENT_NAME)

# Replace the index with a fresh 0..n-1 range. ``drop=True`` discards the old
# index directly instead of materializing it as an "index" column and dropping
# that column afterwards, which fails when the index carries a name.
labs = labs.reset_index(drop=True)

tmp_features = TemporalFeatures(
    labs,
    features=features,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
    normalizers={FEATURES: normalizer},
)

# Normalize and get the data
tmp_features.normalize(FEATURES)
tmp_features.get_data()

In [None]:
aggregated = tmp_features.aggregate()
aggregated

In [None]:
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
temp_vectorized, group_indices = aggregator.vectorize(aggregated)
temp_vectorized.shape

In [None]:
# Remove length-1 axes from the vectorized array and show the resulting shape.
# NOTE(review): np.squeeze without ``axis`` removes *every* axis of length 1,
# so an axis that only happens to have one entry (e.g. a single encounter)
# would disappear as well and shift the positions used for indexing later —
# confirm whether an explicit ``axis`` is needed.
temp_vectorized = np.squeeze(temp_vectorized)
temp_vectorized.shape

In [None]:
np.save("temp_vectorized.npy", temp_vectorized)

In [None]:
agg_col_map, encounter_id_map, temp_feat_map = group_indices

In [None]:
dict(list(encounter_id_map.items())[0:10])

In [None]:
# Look up the hemoglobin values of one encounter.
# ``event_name_map`` is never defined in this notebook; the third mapping
# unpacked from ``group_indices`` is ``temp_feat_map``, which presumably
# indexes the EVENT_NAME axis because the aggregator groups by
# [ENCOUNTER_ID, EVENT_NAME] — TODO confirm.
temp_vectorized[
    encounter_id_map[15999943], temp_feat_map["hemoglobin"]
]

# Dataset processing

In [None]:
# NOTE(review): ``split_features`` is not imported or defined anywhere in the
# visible notebook (only ``split_datasets`` and ``intersect_datasets`` are
# imported), and it is called with an empty list — confirm the intended call.
tab_splits, temp_splits = split_features([])
# Unpack the variable assigned on the previous line (``tab_splits``; the
# original read ``tab_split``, which is not defined at this point).
tab_train, tab_val, tab_test = tab_splits

In [None]:
tab_train.normalize()
tab_val.normalize()
tab_test.normalize()

In [None]:
# Apply the normalizers registered under FEATURES, then fetch the data with
# ``features_only=True``.
tab_features.normalize(FEATURES)
tabular = tab_features.get_data(
    features_only=True,
    # Presumably expands these columns into binary indicator columns — TODO confirm.
    to_binary_indicators=[DIAGNOSIS_TRAJECTORY, HOSPITAL_ID, "readmission"]
)
tabular

In [None]:
# Split over ENCOUNTER_ID
train_tab, val_tab, test_tab = tab_features.split(SPLIT_FRACTIONS)

In [None]:
len(train_tab._data)

In [None]:
len(val_tab._data)

In [None]:
len(test_tab._data)

In [None]:
len(tab_features._data) == len(train_tab._data) + len(val_tab._data) + len(test_tab._data)

In [None]:
tabular.columns

In [None]:
tab_vectorized = tab_features.vectorize(
    to_binary_indicators=[DIAGNOSIS_TRAJECTORY, HOSPITAL_ID, "readmission"]
)
tab_vectorized.data.shape

In [None]:
split = tab_vectorized.get_by_value(None, ['hospital_id_THPM', 'hospital_id_UHNTG'])
split.data.shape

In [None]:
# Show the first ten entries of the mapping.
# NOTE(review): ``tab_by_map`` is not assigned anywhere in the visible
# notebook — confirm where it is supposed to come from.
dict(list(tab_by_map.items())[0:10])

In [None]:
# Show the first ten entries of the mapping.
# NOTE(review): ``tab_feat_map`` is not assigned anywhere in the visible
# notebook — confirm where it is supposed to come from.
dict(list(tab_feat_map.items())[0:10])

In [None]:
# Persist the tabular dataframe and the vectorized tabular array.
save_dataframe(tabular, "tabular.parquet")
# Save the underlying array (``.data``) rather than the wrapper object:
# np.save would pickle the wrapper into an object array, which np.load
# refuses to read by default, and the later ``.shape`` access on the loaded
# value expects a plain array.
np.save("tab_vectorized.npy", tab_vectorized.data)

# Round-trip an object through pickle.
# NOTE(review): the original dumped the undefined name ``a``;
# ``tab_vectorized`` is assumed to be the intended object, and
# 'filename.pickle' still looks like a placeholder path — confirm both.
with open('filename.pickle', 'wb') as handle:
    pickle.dump(tab_vectorized, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Only unpickle files produced by this notebook: pickle.load can execute
# arbitrary code from an untrusted file.
with open('filename.pickle', 'rb') as handle:
    b = pickle.load(handle)

Compare ENCOUNTER_ID

In [None]:
tabular = load_dataframe("tabular.parquet")
tab_vectorized = np.load("tab_vectorized.npy")
temp_vectorized = np.load("temp_vectorized.npy")


In [None]:
tab_vectorized.shape

In [None]:
temp_vectorized.shape

In [None]:
# Compare ENCOUNTER_ID
# Encounter IDs in the row order of the tabular dataframe.
tab_encounter_ids = tabular[ENCOUNTER_ID].values

# Flip the map so that every stored value points back at its key.
encounter_id_map_inv = dict(
    zip(encounter_id_map.values(), encounter_id_map.keys())
)
# Read the keys back in position order 0..n-1 so they can be compared
# element-wise with the tabular encounter IDs.
temp_encounter_ids = np.array(
    [
        encounter_id_map_inv[position]
        for position in range(len(encounter_id_map))
    ]
)

In [None]:
tab_encounter_ids

In [None]:
temp_encounter_ids

In [None]:
assert np.array_equal(tab_encounter_ids, temp_encounter_ids)

# Dataset splits

Split and save the datasets

In [None]:
# Split both vectorized datasets with the same fractions. The notebook-wide
# SPLIT_FRACTIONS constant (defined as [0.7, 0.2]) replaces the repeated
# literal so this split stays in sync with tab_features.split(SPLIT_FRACTIONS).
tab_split, temp_split = split_datasets(
    [tab_vectorized, temp_vectorized], SPLIT_FRACTIONS
)
tab_train, tab_val, tab_test = tab_split
temp_train, temp_val, temp_test = temp_split

In [None]:
# Print the shape of every split, labelled with its name.
for split_name, split_array in (
    ("tab_train", tab_train),
    ("tab_val", tab_val),
    ("tab_test", tab_test),
    ("temp_train", temp_train),
    ("temp_val", temp_val),
    ("temp_test", temp_test),
):
    print(split_name, split_array.shape)

In [None]:
# Write every split to its own .npy file, in the same order as before.
for split_path, split_array in (
    ("tab_train.npy", tab_train),
    ("tab_val.npy", tab_val),
    ("tab_test.npy", tab_test),
    ("temp_train.npy", temp_train),
    ("temp_val.npy", temp_val),
    ("temp_test.npy", temp_test),
):
    np.save(split_path, split_array)