In [1]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.vectorize import (
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import (
    np_ffill,
    np_ffill_bfill,
    np_fill_null_num,
    np_fill_null_zero,
)
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import (
    join,
    load_array,
    load_dataframe,
    save_dataframe,
    yield_dataframes,
    process_dir_save_path,
)

2022-08-10 17:15:12,881 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


In [2]:
SPLIT_FRACTIONS = [0.8, 0.1]
PREDICT_OFFSET = 24

# Tabular
OUTCOME_DEATH = "outcome_death"
TAB_TARGETS = [OUTCOME_DEATH]

# Temporal
OUTCOME_DEATH_PROCESSED = TARGETS + " - " + OUTCOME_DEATH
TEMP_DIR = "./2cleaned"
TEMP_TARGETS = [OUTCOME_DEATH_PROCESSED]

TEMP_AGG_DIR = process_dir_save_path("./3aggregated")
TIMESTEP_SIZE = 24
WINDOW_DURATION = 144

# Tabular-specific processing

In [3]:
cohort = load_dataframe("encounters.parquet")
cohort.head(5)

2022-08-10 17:15:12,913 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from encounters.parquet


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,outcome_death
0,19731189,21820217,2012-06-21 16:39:00,2012-06-27 12:00:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,M,0,2012,NaT,-149,False
1,14523215,29575656,2018-12-30 21:07:00,2019-01-01 11:38:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,M,0,2018,NaT,-118,False
2,10487271,28274967,2009-09-13 13:37:00,2009-09-14 18:30:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,M,0,2009,NaT,-164,False
3,12188356,23159459,2018-02-12 00:38:00,2018-02-14 14:01:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2018,NaT,-123,False
4,16487201,22587598,2018-06-10 21:58:00,2018-06-20 12:02:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2018,NaT,-103,False


In [4]:
features = [
    AGE,
    SEX,
    OUTCOME_DEATH,
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

cohort = cohort.reset_index(drop=True)

tab_features = TabularFeatures(
    data=cohort,
    features=features,
    by=ENCOUNTER_ID,
)

In [5]:
tab_features.types

{'ethnicity': 'ordinal',
 'admission_location': 'ordinal',
 'admission_type': 'ordinal',
 'outcome_death': 'numeric',
 'sex': 'binary',
 'discharge_location': 'ordinal',
 'age': 'numeric'}

In [6]:
tab_features.meta.keys()

dict_keys(['ethnicity', 'admission_location', 'admission_type', 'outcome_death', 'sex', 'discharge_location', 'age'])

In [7]:
tab_features.meta["admission_location"].get_mapping()

{0: 'AMBULATORY SURGERY TRANSFER',
 1: 'CLINIC REFERRAL',
 2: 'EMERGENCY ROOM',
 3: 'INFORMATION NOT AVAILABLE',
 4: 'INTERNAL TRANSFER TO OR FROM PSYCH',
 5: 'PACU',
 6: 'PHYSICIAN REFERRAL',
 7: 'PROCEDURE SITE',
 8: 'TRANSFER FROM HOSPITAL',
 9: 'TRANSFER FROM SKILLED NURSING FACILITY',
 10: 'WALK-IN/SELF REFERRAL',
 11: 'nan'}

In [8]:
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

['ethnicity', 'admission_location', 'admission_type', 'discharge_location']

In [9]:
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is tryin

In [10]:
tab_vectorized.shape

(523740, 44)

In [11]:
tab_vectorized.axis_names

['encounter_id', 'features']

In [12]:
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

['outcome_death', 'age']

# Temporal-specific processing

In [13]:
from functools import reduce

import numpy as np

top_n = 150
all_top_events = []
for i, events in enumerate(yield_dataframes(TEMP_DIR)):
    # Keep only the most popular events where the values are not null
    top_events = (
        events[EVENT_NAME][~events[EVENT_VALUE].isna()].value_counts()[:top_n].index
    )

    all_top_events.append(top_events)

    del events

# Take only the events common to every file
top_events = reduce(np.intersect1d, tuple(all_top_events))

# Force include the target
top_events = np.unique(np.append(top_events, OUTCOME_DEATH_PROCESSED))

top_events

2022-08-10 17:15:15,553 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0000.parquet
2022-08-10 17:15:18,236 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0001.parquet
2022-08-10 17:15:20,962 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0002.parquet
2022-08-10 17:15:23,727 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0003.parquet
2022-08-10 17:15:26,448 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0004.parquet
2022-08-10 17:15:29,034 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0005.parquet
2022-08-10 17:15:31,710 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0006.parquet
2022-08-10 17:15:34,419 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0007.parquet
2022-08-10 17:15:37,104 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2c

array(['access lines - invasive - arterial line dressing occlusive',
       'access lines - invasive - arterial line placed in outside facility',
       'access lines - invasive - multi lumen dressing occlusive',
       'access lines - invasive - multi lumen placed in outside facility',
       'access lines - invasive - picc line placed in outside facility',
       'access lines - peripheral - 18 gauge dressing occlusive',
       'access lines - peripheral - 18 gauge placed in outside facility',
       'access lines - peripheral - 18 gauge placed in the field',
       'access lines - peripheral - 20 gauge dressing occlusive',
       'access lines - peripheral - 20 gauge placed in outside facility',
       'access lines - peripheral - 20 gauge placed in the field',
       'alarms - alarms on',
       'alarms - arterial blood pressure alarm - high',
       'alarms - arterial blood pressure alarm - low',
       'alarms - heart rate alarm - high',
       'alarms - heart rate alarm - low',


In [14]:
len(top_events)

134

In [15]:
generator = yield_dataframes(TEMP_DIR)
save_count = 0

In [16]:
import time

-------------------------------------------------------------------

In [371]:
# events = load_dataframe("events.parquet")
# events = events.sample(n=int(len(events) / 20))

In [388]:
t = time.time()
events = next(generator)

2022-08-10 17:53:55,129 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./2cleaned/batch_0022.parquet


In [389]:
events = events[events[EVENT_NAME].isin(top_events)]
events

Unnamed: 0,encounter_id,subject_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
0,26871943,13334871.0,2018-09-19 13:05:00,30,30.0,insp/min,alarms,alarms - resp alarm - high
1,26871943,13334871.0,2018-09-19 12:51:00,12.7,12.7,sec,labs,labs - prothrombin time
2,26871943,13334871.0,2018-09-19 12:51:00,40.3,40.3,sec,labs,labs - ptt
3,26871943,13334871.0,2018-09-19 12:51:00,1.2,1.2,,labs,labs - inr
4,26871943,13334871.0,2018-09-19 13:02:00,0,0.0,mmHg,routine vital signs,routine vital signs - arterial blood pressure ...
...,...,...,...,...,...,...,...,...
523469,22484462,,2016-12-23 22:25:00,,1.0,,targets,targets - outcome_death
523536,20087370,,2013-09-02 23:36:00,,1.0,,targets,targets - outcome_death
523668,28105246,,2009-12-19 18:44:00,,1.0,,targets,targets - outcome_death
523674,20127441,,2009-08-03 03:07:00,,1.0,,targets,targets - outcome_death


In [390]:
events[events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED].head(5)

Unnamed: 0,encounter_id,subject_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
14,20252617,,2012-06-25 12:25:00,,1.0,,targets,targets - outcome_death
3914,21514453,,2013-03-18 15:15:00,,1.0,,targets,targets - outcome_death
6594,28369314,,2015-07-02 04:22:00,,1.0,,targets,targets - outcome_death
6595,20943686,,2012-12-28 21:10:00,,1.0,,targets,targets - outcome_death
6631,22029552,,2012-04-11 15:00:00,,1.0,,targets,targets - outcome_death


In [391]:
# Offset death time targets such that the model is predicting death time in advance
is_death_event = events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED
events[is_death_event][EVENT_TIMESTAMP] = events[is_death_event][
    EVENT_TIMESTAMP
] - pd.DateOffset(hours=PREDICT_OFFSET)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events[is_death_event][EVENT_TIMESTAMP] = events[is_death_event][


In [392]:
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=TIMESTEP_SIZE,
    window_duration=WINDOW_DURATION,
)

In [393]:
events = events.reset_index(drop=True)

tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [394]:
aggregated = tmp_features.aggregate()
aggregated.head(5)

2022-08-10 17:54:00,868 [1;37mINFO[0m cyclops.processors.cleaning - Dropped nulls over columns: event_timestamp. Removed 191 rows.
2022-08-10 17:54:22,569 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 22.334663 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value,timestep_start
encounter_id,event_name,timestep,Unnamed: 3_level_1,Unnamed: 4_level_1
20000094,targets - outcome_death,0,1.0,2017-03-03 09:21:00
20001305,targets - outcome_death,0,1.0,2017-03-27 19:23:00
20002810,targets - outcome_death,0,1.0,2018-07-05 06:05:00
20003465,targets - outcome_death,0,1.0,2017-12-06 16:30:00
20003587,targets - outcome_death,0,1.0,2011-04-18 02:15:00


In [395]:
#save_dataframe(aggregated, "aggregated.parquet")

In [396]:
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

2022-08-10 17:54:29,389 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 6.799073 s


(1, 10910, 134, 6)

In [397]:
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timestep']

In [398]:
for target in TEMP_TARGETS:
    event_ind = temp_vectorized.get_index_map(EVENT_NAME)[target]
    index_exp = vec_index_exp[:, :, event_ind]

    # Forward fill the the target values, e.g., [nan, nan, 1., nan, nan] -> [nan, nan, 1, 1, 1]
    temp_vectorized.impute_over_axis(TIMESTEP, np_ffill, index_exp=index_exp)

    # Fill remaining values with 0, e.g., [nan, nan, 1, 1, 1] -> [0, 0, 1, 1, 1]
    # or [nan, nan, nan, nan, nan] -> [0, 0, 0, 0, 0]
    temp_vectorized.impute_over_axis(TIMESTEP, np_fill_null_zero, index_exp=index_exp)

In [399]:
# Forward fill then backward fill to get rid of each of the timestep nulls
temp_vectorized.impute_over_axis(TIMESTEP, np_ffill_bfill)

In [400]:
# Fill those all-null timesteps with feature mean
# (since forward and backward filling still leaves them all null)
axis = temp_vectorized.get_axis(EVENT_NAME)

for i in range(temp_vectorized.data.shape[axis]):
    index_exp = vec_index_exp[:, :, i]
    data_slice = temp_vectorized.data[index_exp]
    mean = np.nanmean(data_slice)
    func = lambda x: np_fill_null_num(x, mean)
    temp_vectorized.impute_over_axis(TIMESTEP, func, index_exp=index_exp)

In [401]:
path = join(TEMP_AGG_DIR, "batch_" + f"{save_count:04d}")
path

'./3aggregated/batch_0022'

In [402]:
with open(path + ".pkl", "wb") as handle:
    pickle.dump(temp_vectorized, handle)

save_count += 1

In [403]:
print(time.time() - t)

71.56541299819946


# Combined processing

# Prepare splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [None]:
tab_vectorized.shape, temp_vectorized.shape

In [None]:
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape

In [None]:
# Standardize only the numeric features (e.g., not binary indicators)
normalizer_map = {feat: STANDARD for feat in numeric_features}

tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map=normalizer_map,
)

In [None]:
# Standardize all events
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [None]:
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized], SPLIT_FRACTIONS, axes=ENCOUNTER_ID
)
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits

In [None]:
tab_train.shape, tab_val.shape, tab_test.shape

In [None]:
temp_train.shape, temp_val.shape, temp_test.shape

## Split features/targets

Split out the targets in the temporal data.

In [None]:
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

In [None]:
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

In [None]:
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

In [None]:
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

In [None]:
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

In [None]:
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

### Normalization

In [None]:
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
)

for split in splits:
    split.fit_normalizer()
    split.normalize()

(
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
) = splits

## Save

In [None]:
# Store data (serialize)
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
]
for vec, name in vectorized:
    with open(name + ".pkl", "wb") as handle:
        pickle.dump(vec, handle)