Filtering to keep only the main diagnosis: should this be done? Why aren't the other diagnoses important?

Instead, we could get the encounter/main diagnoses in a separate table and then join it with the entire diagnoses table to keep the other diagnoses.

In [31]:
import pickle
import time

import numpy as np
from querying import (
    get_bt_for_cohort,
    get_cohort,
    get_er_for_cohort,
    get_labs,
    get_most_recent_encounters,
    get_non_cardiac_diagnoses,
    main,
)

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import normalize_names, normalize_values
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.utils.file import load_dataframe, save_dataframe

In [3]:
# Fractions of encounters assigned to the first two dataset splits. The result
# of split_vectorized is unpacked into three splits (train, validation, test)
# further down, so the remaining encounters end up in the third (test) split.
SPLIT_FRACTIONS = [0.7, 0.2]

# Processing

## Tabular

In [4]:
# Load the previously saved cohort table from disk.
cohort = load_dataframe("cohort.parquet")

2022-07-19 17:34:57,326 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from cohort.parquet


In [6]:
# Cohort columns used as tabular features.
features = [
    HOSPITAL_ID,
    AGE,
    SEX,
    DIAGNOSIS_TRAJECTORY,
    "readmission",
    "from_nursing_home_mapped",
    "from_acute_care_institution_mapped",
    "los_derived",
    "prev_encounter_count",
]

# No normalizer is built here: normalization is registered later on the
# vectorized data via add_normalizer. The previous unused
# `GroupbyNormalizer(...)` assignment referenced a name that is not imported
# and raised NameError on a fresh kernel.

# Replace the existing index with a fresh default one, discarding the old index.
cohort = cohort.reset_index(drop=True)

# Wrap the cohort as tabular features keyed by encounter, declaring the two
# outcome columns as targets.
tab_features = TabularFeatures(
    cohort,
    features,
    by=ENCOUNTER_ID,
    targets=["outcome_death", "outcome_edema"],
)

In [7]:
# Inspect the feature type assigned to each feature and target column.
tab_features.types

{'prev_encounter_count': 'numeric',
 'diagnosis_trajectory': 'ordinal',
 'from_nursing_home_mapped': 'binary',
 'readmission': 'ordinal',
 'hospital_id': 'ordinal',
 'from_acute_care_institution_mapped': 'binary',
 'age': 'numeric',
 'outcome_edema': 'binary',
 'sex': 'binary',
 'outcome_death': 'binary',
 'los_derived': 'numeric'}

In [8]:
# List the columns that have a metadata entry.
tab_features.meta.keys()

dict_keys(['prev_encounter_count', 'diagnosis_trajectory', 'from_nursing_home_mapped', 'readmission', 'hospital_id', 'from_acute_care_institution_mapped', 'age', 'outcome_edema', 'sex', 'outcome_death', 'los_derived'])

In [9]:
# Show the mapping from encoded values back to the original SEX categories.
tab_features.meta[SEX].get_mapping()

{0: 'F', 1: 'M'}

In [10]:
# Show the mapping from encoded values back to the diagnosis trajectory groups.
tab_features.meta[DIAGNOSIS_TRAJECTORY].get_mapping()

{0: 'A00_B99',
 1: 'C00_D49',
 2: 'D50_D89',
 3: 'E00_E89',
 4: 'F01_F99',
 5: 'G00_G99',
 6: 'H00_H59',
 7: 'H60_H95',
 8: 'I00_I99',
 9: 'J00_J99',
 10: 'K00_K95',
 11: 'L00_L99',
 12: 'M00_M99',
 13: 'N00_N99',
 14: 'O00_O99',
 15: 'Q00_Q99',
 16: 'R00_R99',
 17: 'S00_T88',
 18: 'V00_Y99',
 19: 'Z00_Z99'}

In [11]:
# Show the value mapping stored for the "outcome_death" target column.
tab_features.meta["outcome_death"].get_mapping()

{False: False, True: True}

In [12]:
# Show the mapping from encoded values back to the hospital identifiers.
# Uses the imported HOSPITAL_ID column-name constant, consistent with the
# feature list and the other metadata lookups (SEX, DIAGNOSIS_TRAJECTORY).
tab_features.meta[HOSPITAL_ID].get_mapping()

{0: 'MSH', 1: 'SBK', 2: 'SMH', 3: 'THPC', 4: 'THPM', 5: 'UHNTG', 6: 'UHNTW'}

In [13]:
# Show the mapping from encoded values back to the readmission categories.
tab_features.meta["readmission"].get_mapping()

{0: '',
 1: 'new_to_acute',
 2: 'nota',
 3: 'planned_from_acute',
 4: 'unplanned_7_day_acute',
 5: 'unplanned_7_day_day_surg',
 6: 'unplanned_8_to_28_day_acute'}

In [14]:
# Collect the names of the features typed as ORDINAL; they are passed to
# vectorize() as to_binary_indicators in the next step.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

['diagnosis_trajectory', 'readmission', 'hospital_id']

In [15]:
# Vectorize the tabular features, requesting binary indicators for the ordinal
# features.
# NOTE(review): presumably each ordinal feature is expanded into one indicator
# column per category (consistent with the 42 columns reported below) - confirm.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())


In [16]:
# Shape of the vectorized tabular data; axes are named in `axis_names`.
tab_vectorized.shape

(107686, 42)

In [17]:
# Names of the axes of the vectorized tabular data, in the same order as `shape`.
tab_vectorized.axis_names

['encounter_id', 'features']

In [18]:
# Collect the names of the features typed as NUMERIC; they are used later to
# build the normalizer map for the tabular data.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

['prev_encounter_count', 'age', 'los_derived']

In [28]:
# Persist the vectorized tabular data to disk.
tab_vectorized.save("tab_vectorized.npy")

2022-07-19 17:54:43,164 [1;37mINFO[0m cyclops.utils.file - Saving array to tab_vectorized.npy


'tab_vectorized.npy'

## Temporal

In [19]:
# Load the previously saved lab events table from disk.
labs = load_dataframe("labs.parquet")

# Keep a random 1/20 subsample of the lab events. The seed is fixed so that
# Restart & Run All selects the same rows (and therefore reproduces the same
# downstream shapes and outputs) on every run.
labs = labs.sample(n=int(len(labs) / 20), random_state=0)

labs.head(5)

2022-07-19 17:35:13,952 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from labs.parquet


Unnamed: 0,encounter_id,event_name,event_value,event_value_unit,event_timestamp
2526779,15983490,glucose point of care,6.1,mmol/L,2019-03-29 09:03:00
1401506,13749220,mean cell volume,89.1,fL,2013-10-22 12:04:00
5288291,12274326,bicarbonate,22.0,mmol/L,2013-10-09 06:00:00
3808443,12117011,potassium,3.6,mmol/L,2015-04-04 08:17:00
7205612,13790718,white blood cell count,6.2,x10e9/L,2019-10-28 09:15:00


In [None]:
# Distinct event names before name normalization.
labs[EVENT_NAME].unique()

In [None]:
# Normalize the event names and event values, overwriting the columns of `labs`.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# outputs shown further down may come from un-normalized labs - confirm.
labs[EVENT_NAME] = normalize_names(labs[EVENT_NAME])
labs[EVENT_VALUE] = normalize_values(labs[EVENT_VALUE])

In [None]:
# Distinct event names after name normalization.
labs[EVENT_NAME].unique()

In [20]:
# Configure aggregation of lab events: EVENT_VALUE is aggregated with MEAN,
# grouped by encounter and event name, with timing handled per encounter.
# NOTE(review): timestep_size and window_duration are presumably in hours
# (24 / 8 matches the 3 timesteps in the vectorized output below) - confirm.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [21]:
# Replace the index left over from sampling with a fresh default index.
labs = labs.reset_index(drop=True)

# Wrap the lab events as temporal features keyed by encounter and event name,
# attaching the aggregator configured above.
tmp_features = TemporalFeatures(
    labs,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [22]:
# Run the aggregation; the result is indexed by encounter, event name, and
# timestep. This took roughly 9 minutes in the logged run.
aggregated = tmp_features.aggregate()
aggregated.head(5)

2022-07-19 17:35:18,134 [1;37mINFO[0m cyclops.processors.cleaning - Dropped nulls over columns: event_timestamp. Removed 9 rows.
2022-07-19 17:44:37,649 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 559.585993 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value,timestep_start
encounter_id,event_name,timestep,Unnamed: 3_level_1,Unnamed: 4_level_1
11100040,pt,0,12.5,2018-07-03 22:44:00
11100040,glucose point of care,0,14.1,2018-07-03 22:44:00
11100041,alp,0,37.0,2016-12-26 11:53:00
11100041,white blood cell count,0,6.02,2016-12-26 11:53:00
11100072,hemoglobin,0,98.0,2016-08-13 12:06:00


In [23]:
# Persist the aggregated lab events to disk.
save_dataframe(aggregated, "aggregated.parquet")

2022-07-19 17:44:37,677 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to aggregated.parquet


'aggregated.parquet'

In [25]:
# Convert the aggregated events into a 4-axis array; the axes are listed by
# `axis_names` below. This took several minutes in the logged run.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

2022-07-19 17:54:41,699 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 278.507680 s


(1, 100381, 53, 3)

In [26]:
# Names of the axes of the vectorized temporal data, in the same order as `shape`.
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timesteps']

In [27]:
# Persist the vectorized temporal data to disk.
temp_vectorized.save("temp_vectorized.npy")

2022-07-19 17:54:41,731 [1;37mINFO[0m cyclops.utils.file - Saving array to temp_vectorized.npy


'temp_vectorized.npy'

## Prepare before splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [29]:
# Shapes before alignment: the two datasets cover different numbers of encounters.
tab_vectorized.shape, temp_vectorized.shape

((107686, 42), (1, 100381, 53, 3))

In [32]:
# Align the tabular and temporal data along the encounter axis; afterwards both
# have the same number of encounters (see the shapes displayed below).
# Note: this re-binds both names, so the pre-intersection objects are replaced.
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape

((100381, 42), (1, 100381, 53, 3))

In [33]:
# Standardize only the numeric features (e.g., not binary indicators).
# Registers a STANDARD normalizer per numeric feature along the FEATURES axis;
# fitting and applying happen later via fit_normalizer() / normalize().
tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map={feat: STANDARD for feat in numeric_features},
)

In [34]:
# Standardize all features.
# Registers the STANDARD method along the EVENT_NAME axis; fitting and applying
# happen later via fit_normalizer() / normalize().
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

### Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [35]:
# Split both datasets along the encounter axis using SPLIT_FRACTIONS; each
# result unpacks into three splits (train, validation, test).
# NOTE(review): no seed is passed here - confirm whether split_vectorized is
# deterministic or accepts a seed, so the splits are reproducible.
# NOTE(review): the tabular splits are named *_X, but no tabular split_out is
# shown, so they presumably still contain the outcome target columns - confirm.
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized], SPLIT_FRACTIONS, axes=ENCOUNTER_ID
)
tab_train_X, tab_val_X, tab_test_X = tab_splits
temp_train, temp_val, temp_test = temp_splits

In [36]:
# Shapes of the tabular train / validation / test splits.
tab_train_X.shape, tab_val_X.shape, tab_test_X.shape

((70267, 42), (20076, 42), (10038, 42))

In [37]:
# Shapes of the temporal train / validation / test splits; the encounter counts
# match the tabular splits above.
temp_train.shape, temp_val.shape, temp_test.shape

((1, 70267, 53, 3), (1, 20076, 53, 3), (1, 10038, 53, 3))

### Split features & targets

Split out the targets in the temporal data.

In [38]:
# Separate the target event(s) from the remaining events along the EVENT_NAME
# axis of the training split.
# FIXME: TARGET_NAME_PROCESSED is neither imported nor defined in this notebook,
# so this cell raises NameError. Define it (the event name of the target as it
# appears on the EVENT_NAME axis) before running this cell.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_train_X.shape, temp_train_y.shape

NameError: name 'TARGET_NAME_PROCESSED' is not defined

In [39]:
# Separate the target event(s) from the remaining events along the EVENT_NAME
# axis of the validation split.
# FIXME: TARGET_NAME_PROCESSED is neither imported nor defined in this notebook,
# so this cell raises NameError. Define it before running this cell.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_val_X.shape, temp_val_y.shape

NameError: name 'TARGET_NAME_PROCESSED' is not defined

In [None]:
# Separate the target event(s) from the remaining events along the EVENT_NAME
# axis of the test split.
# FIXME: TARGET_NAME_PROCESSED is neither imported nor defined in this notebook,
# so this cell raises NameError. Define it before running this cell.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, [TARGET_NAME_PROCESSED])
temp_test_X.shape, temp_test_y.shape

### Normalization

In [None]:
# Fit and apply the normalizers registered earlier with add_normalizer on every
# feature split (targets are not included in this tuple).
# NOTE(review): each split, including validation and test, calls
# fit_normalizer() itself. If fitting computes statistics from the split it is
# called on, validation/test are scaled with their own statistics instead of
# the training statistics - confirm this is intended.
splits = tab_train_X, tab_val_X, tab_test_X, temp_train_X, temp_val_X, temp_test_X

for split in splits:
    split.fit_normalizer()
    split.normalize()

# Re-bind the names to the same objects held in `splits`.
# NOTE(review): the return values of fit_normalizer()/normalize() are ignored,
# so this presumably relies on both methods working in place - confirm.
tab_train_X, tab_val_X, tab_test_X, temp_train_X, temp_val_X, temp_test_X = splits

### Save

In [1]:
# Store data (serialize)
# Each (object, name) pair is pickled to "<name>.pkl" in the working directory.
# This cell needs every split variable listed below to exist; the saved output
# (NameError) comes from running it on a kernel where they were not defined.
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_val_X, "tab_val_X"),
    (tab_test_X, "tab_test_X"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
]
for vec, name in vectorized:
    # The context manager closes the file even if pickling fails.
    with open(name + '.pkl', 'wb') as handle:
        pickle.dump(vec, handle)

# Load data (deserialize)
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
#with open('filename.pickle', 'rb') as handle:
#    unserialized_data = pickle.load(handle)

NameError: name 'tab_train_X' is not defined

In [None]:
"""
tab_train_X.save("tab_train_X.npy")
tab_val_X.save("tab_val_X.npy")
tab_test_X.save("tab_test_X.npy")

temp_train_X.save("temp_train_X.npy")
temp_val_X.save("temp_val_X.npy")
temp_test_X.save("temp_test_X.npy")

temp_train_y.save("temp_train_y.npy")
temp_val_y.save("temp_val_y.npy")
temp_test_y.save("temp_test_y.npy")
"""