# Shared notebook for processing temporal features.

# Imports

In [1]:
from functools import reduce
import numpy as np
import pandas as pd

from cyclops.processors.aggregate import (
    Aggregator,
    tabular_as_aggregated,
    timestamp_ffill_agg,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    RESTRICT_TIMESTAMP,
    TIMESTEP,
)
from cyclops.processors.column_names import ENCOUNTER_ID
from cyclops.processors.constants import FEATURES, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TabularFeatures
from cyclops.processors.constants import ALL, FEATURES, MEAN, NUMERIC, ORDINAL, STANDARD
from cyclops.processors.feature.feature import TemporalFeatures
from cyclops.processors.feature.vectorize import (
    Vectorized,
    intersect_vectorized,
    split_vectorized,
    vec_index_exp,
)
from cyclops.processors.impute import np_ffill_bfill, np_fill_null_num
from cyclops.utils.file import (
    join,
    load_dataframe,
    load_pickle,
    save_dataframe,
    save_pickle,
    yield_dataframes,
    yield_pickled_files,
)
from drift_detection.gemini.utils import get_use_case_params

# Choose dataset and use-case

In [6]:
# Dataset / use-case selection: these two constants are the only inputs to
# get_use_case_params, which supplies every path and setting used below.
DATASET = "gemini"
USE_CASE = "mortality"

use_case_params = get_use_case_params(DATASET, USE_CASE)
# Deliberate pause: input() blocks until Enter is pressed, so the loaded
# constants are shown before any file is read or written.
# NOTE(review): this also stalls unattended runs (e.g. executing the notebook
# without an interactive stdin) — confirm that is acceptable.
input(f"WARNING: LOADING CONSTANTS FROM {use_case_params}")



''

In [7]:
# Read the encounter table and replace whatever index was stored with it by a
# fresh 0..n-1 range.
cohort = load_dataframe(use_case_params.ENCOUNTERS_FILE).reset_index(drop=True)
cohort.head(5)

2022-11-08 19:23:55,265 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/encounters.parquet


Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,los_derived,prev_encounter_count,diagnosis_code,diagnosis_trajectory,admit_via_ambulance,triage_level,deathtime
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,84,M,SMH,False,planned_from_acute,False,False,30.499306,1,G459,G00_G99,ground,resuscitation,NaT
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,76,F,SMH,False,new_to_acute,False,False,0.69375,0,R55,R00_R99,ground,emergent,NaT
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,72,F,SMH,False,new_to_acute,False,False,3.059028,0,J441,J00_J99,no_ambulance,emergent,NaT
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,84,M,SMH,False,unplanned_8_to_28_day_acute,False,False,9.659028,0,T826,S00_T88,no_ambulance,emergent,NaT
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,65,M,SMH,False,planned_from_acute,True,False,5.290278,7,N390,N00_N99,ground,emergent,NaT


In [8]:
# Wrap the cohort in a TabularFeatures object keyed by encounter.
tab_features = TabularFeatures(
    data=cohort,
    features=use_case_params.TAB_FEATURES,
    by=ENCOUNTER_ID,
    force_types=use_case_params.TAB_FEATURES_TYPES,
)

# Feature names grouped by type; both lists are reused by later cells.
numeric_features, ordinal_features = (
    tab_features.features_by_type(feature_type) for feature_type in (NUMERIC, ORDINAL)
)

# Print the category mapping of the first ordinal feature as a quick sanity check.
if len(ordinal_features) != 0:
    first_ordinal = ordinal_features[0]
    print(first_ordinal, "mapping:")
    print(tab_features.meta[first_ordinal].get_mapping())

# Vectorize with the ordinal features expanded to binary indicators, then
# persist both the vectorized data and the feature object itself.
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)
save_pickle(tab_vectorized, use_case_params.TAB_VECTORIZED_FILE)
save_pickle(tab_features, use_case_params.TAB_FEATURES_FILE)

admit_via_ambulance mapping:
{0: '', 1: 'air', 2: 'ground', 3: 'no_ambulance', 4: 'no_info'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/tab_features.pkl'

In [9]:
load_dataframe(use_case_params.ENCOUNTERS_FILE)

2022-11-08 19:24:01,393 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/encounters.parquet


Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,los_derived,prev_encounter_count,diagnosis_code,diagnosis_trajectory,admit_via_ambulance,triage_level,deathtime
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,84,M,SMH,False,planned_from_acute,False,False,30.499306,1,G459,G00_G99,ground,resuscitation,NaT
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,76,F,SMH,False,new_to_acute,False,False,0.693750,0,R55,R00_R99,ground,emergent,NaT
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,72,F,SMH,False,new_to_acute,False,False,3.059028,0,J441,J00_J99,no_ambulance,emergent,NaT
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,84,M,SMH,False,unplanned_8_to_28_day_acute,False,False,9.659028,0,T826,S00_T88,no_ambulance,emergent,NaT
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,65,M,SMH,False,planned_from_acute,True,False,5.290278,7,N390,N00_N99,ground,emergent,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143044,15999822,2012-04-09 21:44:00,2012-04-10 17:21:00,61,M,THPM,False,new_to_acute,False,False,0.817361,0,I635,I00_I99,ground,emergent,NaT
143045,15999864,2018-04-30 20:30:00,2018-05-06 12:46:00,58,F,THPC,False,new_to_acute,False,False,5.677778,0,L032,L00_L99,no_ambulance,emergent,NaT
143046,15999918,2019-09-18 05:59:00,2019-09-20 14:45:00,68,F,THPM,False,new_to_acute,False,False,2.365278,0,R55,R00_R99,ground,emergent,NaT
143047,15999943,2015-01-17 08:17:00,2015-01-23 11:33:00,78,F,THPM,False,nota,False,False,6.136111,0,J09,J00_J99,ground,urgent,NaT


In [10]:
# Per-encounter timestamps used below for window alignment and target creation.
timestamp_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    use_case_params.TARGET_TIMESTAMP,
]
timestamps = load_dataframe(use_case_params.ENCOUNTERS_FILE)[timestamp_columns]

# Admission time indexed by encounter, exposed under the RESTRICT_TIMESTAMP
# column name; it is passed to the aggregation step as the window start.
start_timestamps = timestamps[[ENCOUNTER_ID, ADMIT_TIMESTAMP]].set_index(ENCOUNTER_ID)
start_timestamps = start_timestamps.rename(
    columns={ADMIT_TIMESTAMP: RESTRICT_TIMESTAMP}
)
start_timestamps

2022-11-08 19:24:01,655 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/encounters.parquet


Unnamed: 0_level_0,restrict_timestamp
encounter_id,Unnamed: 1_level_1
11100040,2018-07-03 21:36:00
11100041,2016-12-26 18:21:00
11100072,2016-08-13 15:20:00
11100095,2015-12-20 18:36:00
11100097,2019-05-23 06:09:00
...,...
15999822,2012-04-09 21:44:00
15999864,2018-04-30 20:30:00
15999918,2019-09-18 05:59:00
15999943,2015-01-17 08:17:00


# Temporal-specific processing

In [11]:
# Determine which events to keep.
# For every cleaned batch, count the rows per event name whose value is not
# null and keep the TOP_N_EVENTS most frequent names.
all_top_events = []
for events in yield_dataframes(use_case_params.CLEANED_DIR, log=False):
    names_with_value = events.loc[events[EVENT_VALUE].notna(), EVENT_NAME]
    # value_counts() sorts by descending frequency, so the first N rows are the
    # N most common events; .iloc makes the positional slice explicit.
    top_events = (
        names_with_value.value_counts().iloc[: use_case_params.TOP_N_EVENTS].index
    )
    all_top_events.append(top_events)

    # Release the batch before the next one is loaded.
    del events

if not all_top_events:
    # Without this guard, reduce() fails with an unhelpful "empty sequence" error.
    raise ValueError(
        f"No event files were yielded from {use_case_params.CLEANED_DIR}; "
        "cannot determine the top events."
    )

# Take only the events common to every file.
# With a single batch, reduce() returns that batch's Index unchanged (ordered by
# frequency); with several batches np.intersect1d returns a sorted, unique array.
top_events = reduce(np.intersect1d, all_top_events)

top_events

Index(['sodium', 'potassium', 'lymphocyte', 'hemoglobin', 'bicarbonate',
       'creatinine', 'white blood cell count', 'platelet count',
       'mean cell volume', 'hematocrit', 'neutrophils',
       'glucose point of care', 'glucose random', 'urinalysis',
       'blood urea nitrogen', 'calcium', 'albumin', 'inr', 'x-ray', 'alt',
       'alp', 'bilirubin', 'ast', 'pt', 'aptt', 'lactate venous', 'ct',
       'troponin', 'arterial pao2', 'arterial paco2', 'unmapped_intervention',
       'arterial ph', 'high sensitivity troponin', 'venous pco2', 'ketone',
       'tsh', 'ultrasound', 'ldh', 'venous ph', 'urine specific gravity',
       'echo', 'vitamin b12', 'lactate arterial', 'urine sodium', 'rbc',
       'urine osmolality', 'ferritin', 'serum osmolality', 'mri',
       'endoscopy_mapped', 'crp', 'other', 'non-rbc', 'calcium, ionized',
       'hba1c', 'interventional', 'esr', 'fibrinogen', 'serum alcohol',
       'glucose fasting', 'd-dimer', 'inv_mech_vent_mapped', 'vitamin d',
       

In [12]:
len(top_events)

67

In [13]:
# Shared aggregation settings. The same object is reused below to aggregate the
# event batches and to vectorize both the temporal and the tabular aggregates.
# NOTE(review): presumably events are bucketed per encounter into
# WINDOW_DURATION / TIMESTEP_SIZE timesteps and EVENT_VALUE is averaged within
# each (encounter, event) bucket — confirm against the Aggregator documentation.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=use_case_params.TIMESTEP_SIZE,
    window_duration=use_case_params.WINDOW_DURATION,
)

In [14]:
# Aggregate every cleaned batch and write one dataframe file per batch.
skip_n = 0  # forwarded to yield_dataframes; presumably skips that many leading batches
generator = yield_dataframes(use_case_params.CLEANED_DIR, skip_n=skip_n, log=False)

for batch_number, events in enumerate(generator, start=skip_n):
    # Keep only the selected events and renumber the remaining rows.
    is_top_event = events[EVENT_NAME].isin(top_events)
    events = events[is_top_event].reset_index(drop=True)

    tmp_features = TemporalFeatures(
        events,
        features=EVENT_VALUE,
        by=[ENCOUNTER_ID, EVENT_NAME],
        timestamp_col=EVENT_TIMESTAMP,
        aggregator=aggregator,
    )
    # Windows start at each encounter's admission time (see start_timestamps).
    aggregated = tmp_features.aggregate(window_start_time=start_timestamps)

    # Numbering continues from skip_n, so file names stay stable across runs
    # that skip batches.
    batch_path = join(use_case_params.AGGREGATED_DIR, f"batch_{batch_number:04d}")
    save_dataframe(aggregated, batch_path)
    del events

2022-11-08 19:24:47,367 [1;37mINFO[0m cyclops.processors.clean - Dropped nulls over columns: event_timestamp. Removed 6861 rows.
2022-11-08 19:33:30,129 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 527.408176 s
2022-11-08 19:33:30,137 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/2-agg/batch_0000.parquet


In [15]:
# Vectorize each aggregated batch and pickle it under the matching batch number.
skip_n = 0
generator = yield_dataframes(use_case_params.AGGREGATED_DIR, skip_n=skip_n, log=False)
for batch_number, aggregated in enumerate(generator, start=skip_n):
    vec = aggregator.vectorize(aggregated)
    vec_path = join(use_case_params.VECTORIZED_DIR, f"batch_{batch_number:04d}")
    save_pickle(vec, vec_path)

2022-11-08 19:35:10,078 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 91.606014 s
2022-11-08 19:35:10,086 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/3-vec/batch_0000.pkl


In [16]:
# Take all Vectorized objects and turn them into a single object by
# concatenating them along the encounter axis.
vecs = list(yield_pickled_files(use_case_params.VECTORIZED_DIR))
if not vecs:
    raise ValueError(
        f"No vectorized batches were found in {use_case_params.VECTORIZED_DIR}."
    )

encounter_axis = vecs[0].get_axis(ENCOUNTER_ID)
res = np.concatenate([vec.data for vec in vecs], axis=encounter_axis)

# Work on a shallow copy of the first batch's index list. Assigning into
# vecs[0].indexes directly would leave that batch with an encounter index that
# no longer matches its own data.
indexes = list(vecs[0].indexes)
indexes[encounter_axis] = np.concatenate([vec.indexes[encounter_axis] for vec in vecs])

# Every other axis is assumed to be identical across batches, so the first
# batch's remaining indexes and axis names are reused — TODO confirm.
temp_vectorized = Vectorized(res, indexes, vecs[0].axis_names)
del res

2022-11-08 19:35:21,529 [1;37mINFO[0m cyclops.utils.file - Loading pickled data from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/3-vec/batch_0000.pkl


In [17]:
temp_vectorized.shape

(1, 138187, 67, 6)

In [18]:
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timestep']

## Target creation

In [19]:
def compute_timestep(timestamps, event, admit_col=None, timestep_size=None):
    """Express an event timestamp as an offset from admission, in whole timesteps.

    Two columns are added to a copy of ``timestamps``:

    * ``f"{event}_after_admit"``: the timedelta ``timestamps[event] - admission``.
    * ``f"{event}_timestep"``: that timedelta divided by the timestep size and
      floored, as a float. It is NaN where the event timestamp is missing and
      negative where the event precedes admission.

    Parameters
    ----------
    timestamps : pandas.DataFrame
        Frame holding the admission column and the ``event`` column.
    event : str
        Name of the datetime column to convert.
    admit_col : str, optional
        Name of the admission column. Defaults to ``ADMIT_TIMESTAMP``.
    timestep_size : int or float, optional
        Timestep length in hours. Defaults to ``use_case_params.TIMESTEP_SIZE``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns added. The input frame is not
        modified, so callers must use the return value.
    """
    if admit_col is None:
        admit_col = ADMIT_TIMESTAMP
    if timestep_size is None:
        timestep_size = use_case_params.TIMESTEP_SIZE

    # Copy first so a frame the caller still holds is never changed behind its back.
    result = timestamps.copy()

    after_admit = result[event] - result[admit_col]
    result[f"{event}_after_admit"] = after_admit
    # Timedelta / Timedelta yields floats; np.floor is applied to the whole
    # Series at once and keeps NaN for missing timestamps.
    result[f"{event}_timestep"] = np.floor(
        after_admit / pd.Timedelta(f"{timestep_size} hour")
    )
    return result


# The prediction target is the target timestamp moved PREDICT_OFFSET hours earlier.
prediction_offset = pd.DateOffset(hours=use_case_params.PREDICT_OFFSET)
timestamps["target"] = timestamps[use_case_params.TARGET_TIMESTAMP] - prediction_offset

# Convert both the shifted target and the discharge time into timestep offsets
# from admission.
for event_col in ("target", DISCHARGE_TIMESTAMP):
    timestamps = compute_timestep(timestamps, event_col)
timestamps

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,deathtime,target,target_after_admit,target_timestep,discharge_timestamp_after_admit,discharge_timestamp_timestep
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,NaT,NaT,NaT,,30 days 11:59:00,30.0
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,NaT,NaT,NaT,,0 days 16:39:00,0.0
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,NaT,NaT,NaT,,3 days 01:25:00,3.0
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,NaT,NaT,NaT,,9 days 15:49:00,9.0
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,NaT,NaT,NaT,,5 days 06:58:00,5.0
...,...,...,...,...,...,...,...,...,...
143044,15999822,2012-04-09 21:44:00,2012-04-10 17:21:00,NaT,NaT,NaT,,0 days 19:37:00,0.0
143045,15999864,2018-04-30 20:30:00,2018-05-06 12:46:00,NaT,NaT,NaT,,5 days 16:16:00,5.0
143046,15999918,2019-09-18 05:59:00,2019-09-20 14:45:00,NaT,NaT,NaT,,2 days 08:46:00,2.0
143047,15999943,2015-01-17 08:17:00,2015-01-23 11:33:00,NaT,NaT,NaT,,6 days 03:16:00,6.0


In [20]:
timestamps[~timestamps[use_case_params.TARGET_TIMESTAMP].isna()]

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,deathtime,target,target_after_admit,target_timestep,discharge_timestamp_after_admit,discharge_timestamp_timestep
20,11100856,2019-08-11 18:24:00,2019-08-14 17:15:00,2019-08-14 17:15:00,2019-07-31 17:15:00,-12 days +22:51:00,-12.0,2 days 22:51:00,2.0
32,11101834,2020-03-31 16:43:00,2020-04-06 12:04:00,2020-04-06 12:04:00,2020-03-23 12:04:00,-9 days +19:21:00,-9.0,5 days 19:21:00,5.0
37,11102089,2011-01-02 17:00:00,2011-01-26 06:00:00,2011-01-26 06:00:00,2011-01-12 06:00:00,9 days 13:00:00,9.0,23 days 13:00:00,23.0
56,11103117,2019-03-17 23:37:00,2019-04-01 23:03:00,2019-04-01 23:03:00,2019-03-18 23:03:00,0 days 23:26:00,0.0,14 days 23:26:00,14.0
86,11104649,2017-06-11 16:39:00,2017-06-12 11:30:00,2017-06-12 11:30:00,2017-05-29 11:30:00,-14 days +18:51:00,-14.0,0 days 18:51:00,0.0
...,...,...,...,...,...,...,...,...,...
143009,15998960,2010-12-15 17:35:00,2010-12-19 15:00:00,2010-12-19 15:00:00,2010-12-05 15:00:00,-11 days +21:25:00,-11.0,3 days 21:25:00,3.0
143032,15999355,2011-08-29 21:52:00,2011-09-06 19:45:00,2011-09-06 19:45:00,2011-08-23 19:45:00,-7 days +21:53:00,-7.0,7 days 21:53:00,7.0
143033,15999370,2017-04-23 15:01:00,2017-04-26 05:07:00,2017-04-26 05:07:00,2017-04-12 05:07:00,-12 days +14:06:00,-12.0,2 days 14:06:00,2.0
143035,15999393,2017-01-05 16:22:00,2017-01-10 03:55:00,2017-01-10 03:55:00,2016-12-27 03:55:00,-10 days +11:33:00,-10.0,4 days 11:33:00,4.0


In [21]:
# One-column frame listing the encounter ids in the order they appear along the
# encounter axis of temp_vectorized; used to align the timestamps with it.
encounter_order = pd.Series(
    temp_vectorized.get_index(ENCOUNTER_ID), name=ENCOUNTER_ID
).to_frame()
encounter_order

Unnamed: 0,encounter_id
0,11100040
1,11100041
2,11100072
3,11100095
4,11100097
...,...
138182,15999822
138183,15999864
138184,15999918
138185,15999943


In [22]:
# Name of the discharge timestep column produced by compute_timestep.
discharge_timestep = f"{DISCHARGE_TIMESTAMP}_timestep"
timesteps = timestamps[[ENCOUNTER_ID, "target_timestep", discharge_timestep]]

# A left join on encounter_order keeps exactly the vectorized encounters, in
# the vectorized order.
aligned_timestamps = encounter_order.merge(timesteps, on=ENCOUNTER_ID, how="left")
aligned_timestamps

Unnamed: 0,encounter_id,target_timestep,discharge_timestamp_timestep
0,11100040,,30.0
1,11100041,,0.0
2,11100072,,3.0
3,11100095,,9.0
4,11100097,,5.0
...,...,...,...
138182,15999822,,0.0
138183,15999864,,5.0
138184,15999918,,2.0
138185,15999943,,6.0


In [23]:
# Number of timesteps in each encounter's window.
num_timesteps = int(use_case_params.WINDOW_DURATION / use_case_params.TIMESTEP_SIZE)
# NOTE(review): `shape` is not referenced anywhere else in the visible notebook.
shape = (len(aligned_timestamps), num_timesteps)

# One row per encounter, one column per timestep. fill_nan=2 is a temporary
# sentinel for rows whose target timestep is NaN; it is reset to 0 further down.
arr1 = timestamp_ffill_agg(
    aligned_timestamps["target_timestep"], num_timesteps, fill_nan=2
)
# Same layout for discharge, written with val=-1. Judging by the displayed rows,
# -1 appears from the discharge timestep onward — TODO confirm against
# timestamp_ffill_agg.
arr2 = timestamp_ffill_agg(
    aligned_timestamps[discharge_timestep], num_timesteps, val=-1, fill_nan=2
)
# Element-wise minimum: a -1 from arr2 takes precedence over any 0/1 in arr1,
# and the sentinel 2 survives only where both arrays hold it.
targets = np.minimum(arr1, arr2)
# Remaining sentinels mean "no target and no discharge information": label them 0.
targets[targets == 2] = 0
targets[126:146]

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -1., -1., -1., -1., -1.],
       [ 0.,  0., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -1., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., -1., -1., -1.],
       [-1., -1., -1., -1., -1., -1.],
       [ 0.,  0., -1., -1., -1., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [-1., -1., -1., -1., -1., -1.]])

In [24]:
aligned_timestamps.iloc[126:146]

Unnamed: 0,encounter_id,target_timestep,discharge_timestamp_timestep
126,11106664,,20.0
127,11106666,,1.0
128,11106673,,2.0
129,11106716,,14.0
130,11106811,,6.0
131,11106827,,10.0
132,11106961,,1.0
133,11106980,,5.0
134,11107038,,10.0
135,11107053,,17.0


In [25]:
# Insert singleton axes at positions 0 and 2 so the 2-D target array lines up
# with the 4-D layout of temp_vectorized.
# NOTE(review): this cell re-binds `targets`, so running it twice adds further axes.
targets = targets[np.newaxis, :, np.newaxis, :]
targets.shape

(1, 138187, 1, 6)

In [26]:
temp_vectorized.shape

(1, 138187, 67, 6)

In [27]:
# Include target
# temp_vectorized = temp_vectorized.remove_with_index(EVENT_NAME, TEMP_TARGETS)
# print(temp_vectorized.shape)
temp_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, targets, use_case_params.TEMP_TARGETS
)
temp_vectorized.shape

(1, 138187, 68, 6)

In [28]:
only_targets = temp_vectorized.take_with_index(EVENT_NAME, use_case_params.TEMP_TARGETS)
assert np.isnan(only_targets.data).sum() == 0

In [29]:
save_pickle(temp_vectorized, use_case_params.TEMP_VECTORIZED_FILE)

2022-11-08 19:35:23,838 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/temp_vectorized.pkl


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/temp_vectorized.pkl'

# Combined processing

In [30]:
temp_vectorized = load_pickle(use_case_params.TEMP_VECTORIZED_FILE)

2022-11-08 19:35:39,067 [1;37mINFO[0m cyclops.utils.file - Loading pickled data from /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/temp_vectorized.pkl


In [31]:
# Tabular data with the ordinal features expanded to binary indicators; the
# index is reset so ENCOUNTER_ID is available as a regular column.
tab = tab_features.get_data(to_binary_indicators=ordinal_features).reset_index()

# Take only the encounters with temporal events.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D input
# the two return the same boolean mask.
tab = tab[np.isin(tab[ENCOUNTER_ID].values, temp_vectorized.get_index(ENCOUNTER_ID))]

# Aggregate tabular: reshape to the (encounter, event name, timestep) layout of
# the temporal data. NOTE(review): presumably strategy=ALL repeats each static
# value across every timestep, as the displayed output suggests — confirm.
tab_aggregated = tabular_as_aggregated(
    tab=tab,
    index=ENCOUNTER_ID,
    var_name=EVENT_NAME,
    value_name=EVENT_VALUE,
    strategy=ALL,
    num_timesteps=aggregator.window_duration // aggregator.timestep_size,
)
tab_aggregated

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat].replace(mapping, inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value
encounter_id,event_name,timestep,Unnamed: 3_level_1
11100040,admit_via_ambulance_,0,0.0
11100040,admit_via_ambulance_,1,0.0
11100040,admit_via_ambulance_,2,0.0
11100040,admit_via_ambulance_,3,0.0
11100040,admit_via_ambulance_,4,0.0
...,...,...,...
15999969,triage_level_urgent,1,0.0
15999969,triage_level_urgent,2,0.0
15999969,triage_level_urgent,3,0.0
15999969,triage_level_urgent,4,0.0


In [32]:
# Vectorize tabular
tab_aggregated_vec = aggregator.vectorize(tab_aggregated)
tab_aggregated_vec.shape

2022-11-08 19:37:07,241 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 35.168415 s


(1, 138187, 52, 6)

In [33]:
temp_vectorized.shape

(1, 138187, 68, 6)

In [34]:
tab_aggregated_vec.shape

(1, 138187, 52, 6)

In [35]:
# Combine
comb_vectorized = temp_vectorized.concat_over_axis(
    EVENT_NAME, tab_aggregated_vec.data, tab_aggregated_vec.get_index(EVENT_NAME)
)
comb_vectorized.shape

(1, 138187, 120, 6)

In [36]:
# Don't include any of the tabular targets - split out to avoid label leakage
comb_vectorized, _ = comb_vectorized.split_out(EVENT_NAME, use_case_params.TAB_TARGETS)
comb_vectorized.shape

(1, 138187, 119, 6)

In [37]:
comb_vectorized.get_index(EVENT_NAME)

array(['admit_via_ambulance_', 'admit_via_ambulance_air',
       'admit_via_ambulance_ground', 'admit_via_ambulance_no_ambulance',
       'admit_via_ambulance_no_info', 'age', 'albumin', 'alp', 'alt',
       'aptt', 'arterial paco2', 'arterial pao2', 'arterial ph', 'ast',
       'bicarbonate', 'bilirubin', 'blood urea nitrogen', 'calcium',
       'calcium, ionized', 'creatinine', 'crp', 'ct', 'd-dimer',
       'diagnosis_trajectory_A00_B99', 'diagnosis_trajectory_C00_D49',
       'diagnosis_trajectory_D50_D89', 'diagnosis_trajectory_E00_E89',
       'diagnosis_trajectory_F01_F99', 'diagnosis_trajectory_G00_G99',
       'diagnosis_trajectory_H00_H59', 'diagnosis_trajectory_H60_H95',
       'diagnosis_trajectory_I00_I99', 'diagnosis_trajectory_J00_J99',
       'diagnosis_trajectory_K00_K95', 'diagnosis_trajectory_L00_L99',
       'diagnosis_trajectory_M00_M99', 'diagnosis_trajectory_N00_N99',
       'diagnosis_trajectory_O00_O99', 'diagnosis_trajectory_Q00_Q99',
       'diagnosis_traject

In [38]:
np.isnan(tab_aggregated_vec.data).sum() / tab_aggregated_vec.data.size

0.0

In [39]:
tab_aggregated_vec.data

array([[[[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 1.],
         ...,
         [1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 1.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        ...,

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 1.],
         ...,
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1.

In [40]:
np.isnan(temp_vectorized.data).sum() / temp_vectorized.data.size

0.8710733976990827

In [41]:
np.isnan(comb_vectorized.data).sum() / comb_vectorized.data.size

0.49775622725661867

# Prepare splits

In [42]:
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

((143049, 52), (1, 138187, 68, 6), (1, 138187, 119, 6))

In [43]:
tab_vectorized, temp_vectorized, comb_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape, comb_vectorized.shape

((138187, 52), (1, 138187, 68, 6), (1, 138187, 119, 6))

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [51]:
# Register a STANDARD normalizer for the numeric features only; binary
# indicators are left out of the map.
# Note: no normalization occurs here, this is only the setup.
normalizer_map = dict.fromkeys(numeric_features, STANDARD)

tab_vectorized.add_normalizer(FEATURES, normalizer_map=normalizer_map)

In [52]:
# Register a STANDARD normalizer over the event axis of both the temporal and
# the combined data.
# Note: no normalization occurs here, this is only the setup.
for event_vec in (temp_vectorized, comb_vectorized):
    event_vec.add_normalizer(EVENT_NAME, normalization_method=STANDARD)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular, temporal, and combined encounters remain aligned.

In [53]:
# Split all three datasets together along the encounter axis so that every
# split holds the same encounters in each dataset.
# NOTE(review): no random seed is passed here; whether split_vectorized shuffles
# (and how to make that reproducible) is not visible in this notebook — confirm.
tab_splits, temp_splits, comb_splits = split_vectorized(
    [tab_vectorized, temp_vectorized, comb_vectorized],
    use_case_params.SPLIT_FRACTIONS,
    axes=ENCOUNTER_ID,
)
# Each result unpacks into (train, validation, test).
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits
comb_train, comb_val, comb_test = comb_splits

In [54]:
tab_train.shape, tab_val.shape, tab_test.shape

((110550, 52), (13818, 52), (13819, 52))

In [55]:
temp_train.shape, temp_val.shape, temp_test.shape

((1, 110550, 68, 6), (1, 13818, 68, 6), (1, 13819, 68, 6))

In [56]:
comb_train.shape, comb_val.shape, comb_test.shape

((1, 110550, 119, 6), (1, 13818, 119, 6), (1, 13819, 119, 6))

## Split features/targets

Split the targets out of the tabular, temporal, and combined data.

In [57]:
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

((110550, 51), (110550, 1))

In [58]:
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

((13818, 51), (13818, 1))

In [59]:
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, use_case_params.TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

((13819, 51), (13819, 1))

In [60]:
temp_train_X, temp_train_y = temp_train.split_out(
    EVENT_NAME, use_case_params.TEMP_TARGETS
)
temp_train_X.shape, temp_train_y.shape

((1, 110550, 67, 6), (1, 110550, 1, 6))

In [61]:
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

((1, 13818, 67, 6), (1, 13818, 1, 6))

In [62]:
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

((1, 13819, 67, 6), (1, 13819, 1, 6))

In [63]:
comb_train_X, comb_train_y = comb_train.split_out(
    EVENT_NAME, use_case_params.TEMP_TARGETS
)
comb_train_X.shape, comb_train_y.shape

((1, 110550, 118, 6), (1, 110550, 1, 6))

In [64]:
comb_val_X, comb_val_y = comb_val.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
comb_val_X.shape, comb_val_y.shape

((1, 13818, 118, 6), (1, 13818, 1, 6))

In [65]:
comb_test_X, comb_test_y = comb_test.split_out(EVENT_NAME, use_case_params.TEMP_TARGETS)
comb_test_X.shape, comb_test_y.shape

((1, 13819, 118, 6), (1, 13819, 1, 6))

<cyclops.processors.feature.vectorize.Vectorized at 0x7f295c460550>

In [66]:
def impute(temp_vec):
    """Fill the nulls of a vectorized temporal dataset along the timestep axis.

    The work is done in two passes:

    1. Forward fill, then backward fill, every series over ``TIMESTEP``.
    2. For each position along the ``EVENT_NAME`` axis, fill what is still null
       (series with no observation at all) with the mean of that event's
       values, computed from ``temp_vec`` itself after pass 1.

    ``temp_vec`` is presumably modified in place, since the return values of
    ``impute_over_axis`` are discarded; the same object is returned.

    NOTE(review): the slice ``vec_index_exp[:, :, pos]`` assumes the event axis
    is at position 2, although the axis number is looked up dynamically for the
    loop bound.
    """
    # Pass 1: propagate observed values across the timesteps of each series.
    temp_vec.impute_over_axis(TIMESTEP, np_ffill_bfill)

    # Pass 2: per-event mean for series that are still entirely null.
    event_axis = temp_vec.get_axis(EVENT_NAME)
    num_events = temp_vec.data.shape[event_axis]

    for event_pos in range(num_events):
        event_slice = vec_index_exp[:, :, event_pos]
        # np.nanmean ignores the remaining nulls. For an event with no values at
        # all it presumably yields NaN with a warning, leaving that event null.
        event_mean = np.nanmean(temp_vec.data[event_slice])

        def fill_with_mean(values):
            return np_fill_null_num(values, event_mean)

        temp_vec.impute_over_axis(TIMESTEP, fill_with_mean, index_exp=event_slice)

    return temp_vec


# Impute each feature split independently; targets are not imputed.
# NOTE(review): impute() derives its fill means from the object it is given, so
# the validation and test splits are filled using their own statistics rather
# than the training ones — confirm this is intended.
temp_train_X = impute(temp_train_X)
temp_val_X = impute(temp_val_X)
temp_test_X = impute(temp_test_X)

comb_train_X = impute(comb_train_X)
comb_val_X = impute(comb_val_X)
comb_test_X = impute(comb_test_X)

  mean = np.nanmean(data_slice)


### Normalization

In [67]:
# Fit and apply a normalizer on every feature split; the *_y target splits are
# deliberately not part of this tuple.
# NOTE(review): fit_normalizer() is called on each split separately, so the
# validation and test data are presumably scaled with their own statistics
# rather than with those of the training split — confirm this is intended.
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
)

for split in splits:
    split.fit_normalizer()
    split.normalize()

# Re-binds the same nine objects to the same names. normalize() presumably
# works in place, since its return value is discarded in the loop above.
(
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
    comb_train_X,
    comb_val_X,
    comb_test_X,
) = splits

## Save

In [68]:
# Store data (serialize): every feature/target split is pickled to its own file.
# Each entry pairs the object with the suffix used in its file name.
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_train_y, "tab_train_y"),
    (tab_val_X, "tab_val_X"),
    (tab_val_y, "tab_val_y"),
    (tab_test_X, "tab_test_X"),
    (tab_test_y, "tab_test_y"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
    (comb_train_X, "comb_train_X"),
    (comb_train_y, "comb_train_y"),
    (comb_val_X, "comb_val_X"),
    (comb_val_y, "comb_val_y"),
    (comb_test_X, "comb_test_X"),
    (comb_test_y, "comb_test_y"),
]
# TAB_VEC_COMB is used as a plain string prefix (directory plus file-name
# prefix), so the target path is prefix + suffix + ".pkl".
for vec, name in vectorized:
    save_pickle(vec, use_case_params.TAB_VEC_COMB + name + ".pkl")

2022-11-08 14:52:53,239 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_X.pkl
2022-11-08 14:52:55,557 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_train_y.pkl
2022-11-08 14:52:56,913 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_X.pkl
2022-11-08 14:52:57,356 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_val_y.pkl
2022-11-08 14:52:57,788 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality/./data/4-final/aligned_tab_test_X.pkl
2022-11-08 14:52:58,239 [1;37mINFO[0m cyclops.utils.file - Pickling data to /mnt/nfs/project/