In [6]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.vectorize import intersect_vectorized, split_vectorized
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import load_array, load_dataframe, save_dataframe

In [7]:
OUTCOME_DEATH = "outcome_death"
OUTCOME_DEATH_PROCESSED = TARGETS + " - " + OUTCOME_DEATH

TAB_TARGETS = [OUTCOME_DEATH]
TEMP_TARGETS = [OUTCOME_DEATH_PROCESSED]

SPLIT_FRACTIONS = [0.8, 0.1]
PREDICT_OFFSET_HRS = 24

In [8]:
TAB_TARGETS

['outcome_death']

# Tabular-specific processing

In [9]:
cohort = load_dataframe("encounters.parquet")
cohort.head(5)

2022-07-25 13:37:06,139 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from encounters.parquet


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,outcome_death
0,18660483,26256527,2015-07-01 09:44:00,2015-07-03 09:58:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2015,NaT,-116,False
1,17978591,21300810,2015-10-10 00:21:00,2015-10-12 14:10:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2015,NaT,-134,False
2,16420748,23082241,2018-10-25 22:30:00,2018-10-28 12:58:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2018,NaT,-151,False
3,15206017,25518306,2018-10-22 15:28:00,2018-10-23 19:13:00,NaT,EU OBSERVATION,CLINIC REFERRAL,,BLACK/AFRICAN AMERICAN,2110-10-22 11:52:00,2110-10-22 17:37:00,F,76,1942,NaT,-92,
4,14695283,23679175,2018-06-24 18:54:00,2018-06-25 20:15:00,NaT,EU OBSERVATION,PHYSICIAN REFERRAL,,WHITE,2176-06-24 14:46:00,2176-06-25 20:15:00,F,90,1928,NaT,-158,


In [10]:
features = [
    AGE,
    SEX,
    OUTCOME_DEATH,
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

cohort = cohort.reset_index(drop=True)

tab_features = TabularFeatures(
    data=cohort,
    features=features,
    by=ENCOUNTER_ID,
)

In [11]:
tab_features.types

{'ethnicity': 'ordinal',
 'sex': 'binary',
 'admission_location': 'ordinal',
 'outcome_death': 'numeric',
 'age': 'numeric',
 'admission_type': 'ordinal',
 'discharge_location': 'ordinal'}

In [12]:
tab_features.meta.keys()

dict_keys(['ethnicity', 'sex', 'admission_location', 'outcome_death', 'age', 'admission_type', 'discharge_location'])

In [13]:
tab_features.meta["admission_location"].get_mapping()

{0: 'AMBULATORY SURGERY TRANSFER',
 1: 'CLINIC REFERRAL',
 2: 'EMERGENCY ROOM',
 3: 'INFORMATION NOT AVAILABLE',
 4: 'INTERNAL TRANSFER TO OR FROM PSYCH',
 5: 'PACU',
 6: 'PHYSICIAN REFERRAL',
 7: 'PROCEDURE SITE',
 8: 'TRANSFER FROM HOSPITAL',
 9: 'TRANSFER FROM SKILLED NURSING FACILITY',
 10: 'WALK-IN/SELF REFERRAL',
 11: 'nan'}

In [14]:
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

['ethnicity', 'admission_location', 'admission_type', 'discharge_location']

In [15]:
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is tryin

In [16]:
tab_vectorized.shape

(224764, 44)

In [17]:
tab_vectorized.axis_names

['encounter_id', 'features']

In [18]:
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

['outcome_death', 'age']

In [19]:
tab_vectorized.save("tab_vectorized.npy")

2022-07-25 13:37:07,623 [1;37mINFO[0m cyclops.utils.file - Saving array to tab_vectorized.npy


'tab_vectorized.npy'

# Temporal-specific processing

In [None]:
# chunk = 1

In [20]:
events = load_dataframe("events.parquet")
events = events.sample(n=int(len(events) / 20))
events.head(5)

2022-07-25 13:37:07,674 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from events.parquet


Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
450760,26432496,17521365.0,38504972.0,2015-06-26 05:41:00,Repositioning for optimizing respiratory status,,,care plans,care plans - altered respiratory status ncp - ...
587343,22390175,17316181.0,31689230.0,2019-02-21 16:00:00,Portex,,,respiratory,respiratory - trach tube manufacturer
522275,23274561,17305750.0,36213134.0,2018-02-24 22:29:00,Grade 0,,,access lines - peripheral,access lines - peripheral - 22 g infiltration ...
42239,20574570,17566492.0,34188547.0,2019-01-29 20:00:00,Normal for Race,,,skin - assessment,skin - assessment - skin color
572467,22390175,17316181.0,31689230.0,2019-02-04 16:00:00,CPOT,,,pain/sedation,pain/sedation - cpot-pain assessment method


In [21]:
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

In [22]:
# Keep only the most popular events
top_events = events[EVENT_NAME].value_counts()[:100].index

# Force include the target
top_events = np.unique(np.append(top_events, OUTCOME_DEATH))

events = events[events[EVENT_NAME].isin(top_events)]
events.head(5)

Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
450760,26432496,17521365.0,38504972.0,2015-06-26 05:41:00,Repositioning for optimizing respiratory status,,,care plans,care plans - altered respiratory status ncp - ...
42239,20574570,17566492.0,34188547.0,2019-01-29 20:00:00,Normal for Race,,,skin - assessment,skin - assessment - skin color
47624,22310579,17337797.0,35798022.0,2018-08-19 16:00:00,Bed alarm activated,,,restraint/support systems,restraint/support systems - safety measures
53089,24181447,17412125.0,35377129.0,2018-08-13 12:05:00,Name,1.0,,neurological,neurological - orientation
249141,25996095,17443783.0,37012343.0,2015-06-28 14:33:00,Stool management,,,care plans,care plans - altered skin integrity ncp - inte...


In [23]:
events[events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED].head(5)

Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
202141,25753285,,,2020-04-28 08:53:00,,1.0,,targets,targets - outcome_death
67998,20920997,,,2018-06-24 01:35:00,,1.0,,targets,targets - outcome_death
56159,28511446,,,2018-10-03 23:30:00,,1.0,,targets,targets - outcome_death
151995,26875920,,,2018-09-13 18:05:00,,1.0,,targets,targets - outcome_death
8598,26611016,,,2016-08-30 07:10:00,,1.0,,targets,targets - outcome_death


In [24]:
# Offset death time - i.e., should predict death time in advance
is_death_event = events[EVENT_NAME] == OUTCOME_DEATH_PROCESSED
events[is_death_event][EVENT_TIMESTAMP] = events[is_death_event][
    EVENT_TIMESTAMP
] - pd.DateOffset(hours=PREDICT_OFFSET_HRS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events[is_death_event][EVENT_TIMESTAMP] = events[is_death_event][EVENT_TIMESTAMP] - pd.DateOffset(


In [25]:
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [26]:
events = events.reset_index(drop=True)

tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [27]:
aggregated = tmp_features.aggregate()
aggregated.head(5)

2022-07-25 13:37:08,025 [1;37mINFO[0m cyclops.processors.cleaning - Dropped nulls over columns: event_timestamp. Removed 6 rows.
2022-07-25 13:37:11,599 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 3.577729 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value,timestep_start
encounter_id,event_name,timestep,Unnamed: 3_level_1,Unnamed: 4_level_1
20004718,targets - outcome_death,0,1.0,2015-01-10 14:04:00
20078746,pulmonary - cough type,1,,2018-10-02 21:00:00
20078746,neurological - pupil size right,0,,2018-10-02 13:00:00
20078746,treatments - oral care,0,,2018-10-02 13:00:00
20078746,restraint/support systems - restraint device,2,,2018-10-03 05:00:00


In [28]:
save_dataframe(aggregated, "aggregated.parquet")

2022-07-25 13:37:11,609 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to aggregated.parquet


'aggregated.parquet'

In [29]:
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

2022-07-25 13:37:12,658 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 1.037816 s


(1, 314, 100, 3)

In [30]:
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timesteps']

In [31]:
temp_vectorized.save("temp_vectorized.npy")

2022-07-25 13:37:12,671 [1;37mINFO[0m cyclops.utils.file - Saving array to temp_vectorized.npy


'temp_vectorized.npy'

# Combined processing

# Prepare splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [32]:
tab_vectorized.shape, temp_vectorized.shape

((224764, 44), (1, 314, 100, 3))

In [33]:
tab_vectorized, temp_vectorized = intersect_vectorized(
    [tab_vectorized, temp_vectorized], axes=ENCOUNTER_ID
)
tab_vectorized.shape, temp_vectorized.shape

((314, 44), (1, 314, 100, 3))

In [34]:
# Standardize only the numeric features (e.g., not binary indicators)
normalizer_map = {feat: STANDARD for feat in numeric_features}

tab_vectorized.add_normalizer(
    FEATURES,
    normalizer_map=normalizer_map,
)

In [35]:
# Standardize all events
temp_vectorized.add_normalizer(
    EVENT_NAME,
    normalization_method=STANDARD,
)

## Dataset splits

Split into training, validation, and testing datasets such that the tabular and temporal encounters remain aligned.

In [36]:
tab_splits, temp_splits = split_vectorized(
    [tab_vectorized, temp_vectorized], SPLIT_FRACTIONS, axes=ENCOUNTER_ID
)
tab_train, tab_val, tab_test = tab_splits
temp_train, temp_val, temp_test = temp_splits

In [37]:
tab_train.shape, tab_val.shape, tab_test.shape

((251, 44), (32, 44), (31, 44))

In [38]:
temp_train.shape, temp_val.shape, temp_test.shape

((1, 251, 100, 3), (1, 32, 100, 3), (1, 31, 100, 3))

## Split features/targets

Split out the targets in the temporal data.

In [39]:
tab_train_X, tab_train_y = tab_train.split_out(FEATURES, TAB_TARGETS)
tab_train_X.shape, tab_train_y.shape

((251, 43), (251, 1))

In [40]:
tab_val_X, tab_val_y = tab_val.split_out(FEATURES, TAB_TARGETS)
tab_val_X.shape, tab_val_y.shape

((32, 43), (32, 1))

In [41]:
tab_test_X, tab_test_y = tab_test.split_out(FEATURES, TAB_TARGETS)
tab_test_X.shape, tab_test_y.shape

((31, 43), (31, 1))

In [42]:
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, TEMP_TARGETS)
temp_train_X.shape, temp_train_y.shape

((1, 251, 99, 3), (1, 251, 1, 3))

In [43]:
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, TEMP_TARGETS)
temp_val_X.shape, temp_val_y.shape

((1, 32, 99, 3), (1, 32, 1, 3))

In [44]:
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, TEMP_TARGETS)
temp_test_X.shape, temp_test_y.shape

((1, 31, 99, 3), (1, 31, 1, 3))

### Normalization

In [45]:
splits = (
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
)

for split in splits:
    split.fit_normalizer()
    split.normalize()

(
    tab_train_X,
    tab_val_X,
    tab_test_X,
    temp_train_X,
    temp_val_X,
    temp_test_X,
) = splits

## Save

In [46]:
# Store data (serialize)
vectorized = [
    (tab_train_X, "tab_train_X"),
    (tab_val_X, "tab_val_X"),
    (tab_test_X, "tab_test_X"),
    (temp_train_X, "temp_train_X"),
    (temp_train_y, "temp_train_y"),
    (temp_val_X, "temp_val_X"),
    (temp_val_y, "temp_val_y"),
    (temp_test_X, "temp_test_X"),
    (temp_test_y, "temp_test_y"),
]
for vec, name in vectorized:
    with open(name + ".pkl", "wb") as handle:
        pickle.dump(vec, handle)