In [52]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import VectorizedNormalizer
from cyclops.processors.feature.split import split_idx
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import load_dataframe, save_dataframe

In [53]:
# Name of the outcome event before any category prefix is attached.
TARGET_NAME = "death"

# Admission years requested from the encounters query.
# NOTE(review): 2017 is absent from this list — confirm that is intentional.
YEARS = [2015, 2016, 2018, 2019, 2020]
MIN_YEAR = min(YEARS)

# Querying

## Patient encounters

In [54]:
# Start from the patient-encounters query for the configured years, asking
# for a "died" column, and strip the columns listed below.
dropped_encounter_columns = [
    "insurance",
    "language",
    "marital_status",
    "hospital_expire_flag",
]
encounters_query = qp.Drop(dropped_encounter_columns)(
    mimic.patient_encounters(
        years=YEARS, died=True, died_binarize_col="died"
    ).query
)

# Wrap the modified query in a new interface and execute it.
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()
encounters

2022-07-15 11:46:08,986 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-07-15 11:46:08,987 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.809693 s


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,died
0,17978591,21300810,2015-10-10 00:21:00,2015-10-12 14:10:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2015,NaT,-134,False
1,17763996,26149939,2018-03-29 22:14:00,2018-04-01 12:35:00,NaT,ELECTIVE,,HOME,UNKNOWN,NaT,NaT,M,0,2018,NaT,-155,False
2,12901523,26546757,2018-08-14 17:32:00,2018-08-17 03:18:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,F,0,2018,NaT,-126,False
3,17871631,20600400,2015-12-03 12:43:00,2015-12-05 12:05:00,NaT,ELECTIVE,,HOME,ASIAN,NaT,NaT,F,0,2015,NaT,-126,False
4,13048111,23204476,2018-11-01 14:35:00,2018-11-05 15:55:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,M,0,2018,NaT,-156,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224759,11512653,24728972,2015-08-24 18:56:00,2015-09-01 17:55:00,NaT,URGENT,TRANSFER FROM HOSPITAL,SKILLED NURSING FACILITY,WHITE,NaT,NaT,M,83,1932,NaT,-165,False
224760,10877983,21810481,2018-08-03 20:44:00,2018-08-10 13:30:00,NaT,OBSERVATION ADMIT,EMERGENCY ROOM,ACUTE HOSPITAL,WHITE,2183-08-03 17:50:00,2183-08-03 23:30:00,F,70,1948,NaT,-165,False
224761,12482183,28145020,2015-12-16 10:39:00,2015-12-17 18:40:00,NaT,URGENT,TRANSFER FROM HOSPITAL,HOME,OTHER,2138-12-15 23:29:00,2138-12-16 12:27:00,F,66,1949,NaT,-123,False
224762,11985678,21946538,2018-09-13 01:55:00,2018-09-17 10:34:00,NaT,OBSERVATION ADMIT,EMERGENCY ROOM,PSYCH FACILITY,WHITE,2187-09-12 16:38:00,2187-09-13 03:23:00,M,49,1969,NaT,-169,False


In [55]:
# Persist the queried encounters; the Processing section reloads this file.
save_dataframe(encounters, "encounters.parquet")

2022-07-15 11:46:09,007 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to encounters.parquet


'encounters.parquet'

## Events

In [56]:
# Query events without the warning/itemid/storetime columns, capped at
# one million rows.
dropped_event_columns = ["warning", "itemid", "storetime"]
events_query = qp.Drop(dropped_event_columns)(mimic.events().query)
events_interface = mimic.get_interface(events_query)
events = events_interface.run(limit=1000000)

# Reverse the deidentified dating: attach each encounter's year offset to its
# events. pd.merge defaults to an inner join, so events whose encounter is not
# in `encounters` are dropped here.
encounter_year_offsets = encounters[[ENCOUNTER_ID, "anchor_year_difference"]]
events = pd.merge(encounter_year_offsets, events, on=ENCOUNTER_ID)


def add_offset(row):
    """Shift one event row's timestamp by its encounter's year offset.

    Parameters
    ----------
    row : pandas.Series
        A row holding an ``EVENT_TIMESTAMP`` entry and an
        ``"anchor_year_difference"`` entry (a number of years).

    Returns
    -------
    pandas.Series
        A copy of ``row`` whose ``EVENT_TIMESTAMP`` has been moved by
        ``anchor_year_difference`` years. The input row is left unmodified.

    """
    # Work on a copy: pandas advises against mutating the object that
    # DataFrame.apply hands to a user function.
    shifted = row.copy()
    shifted[EVENT_TIMESTAMP] += pd.DateOffset(years=row["anchor_year_difference"])
    return shifted


# Shift every event timestamp row by row, then discard the helper column.
# NOTE(review): a row-wise apply over all events is slow; consider shifting
# once per distinct offset value instead.
events = events.apply(add_offset, axis=1).drop(columns="anchor_year_difference")

2022-07-15 11:46:13,184 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-07-15 11:46:13,185 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 3.846352 s


In [57]:
# Create the target as a timeseries event: one row per encounter flagged as
# died, timestamped with its "deathtime" and carrying a constant value of 1.
died_mask = encounters["died"] == True
target_events = (
    encounters.loc[died_mask, [ENCOUNTER_ID, "deathtime"]]
    .rename(columns={"deathtime": EVENT_TIMESTAMP})
    .assign(
        **{
            EVENT_NAME: TARGET_NAME,
            EVENT_CATEGORY: TARGETS,
            EVENT_VALUE: 1,
        }
    )
)
target_events.head(5)

Unnamed: 0,encounter_id,event_timestamp,event_name,event_category,event_value
833,23121290,2018-09-22 12:46:00,death,targets,1
1080,24112055,2018-03-13 01:15:00,death,targets,1
1428,24657556,2018-05-09 00:00:00,death,targets,1
1744,24422389,2018-06-06 15:00:00,death,targets,1
1745,21834123,2018-02-28 04:30:00,death,targets,1


In [58]:
# Include target
# The row labels of both frames are kept as they are (no ignore_index), and
# re-running this cell appends the target rows a second time.
events = pd.concat([events, target_events])

In [59]:
# Preprocessing
# NOTE: this cell is not re-run safe — each run prefixes the event names and
# TARGET_NAME with the category again. Re-create `events` and TARGET_NAME
# before running it a second time.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Concatenate event name and category since some names are the same in
# different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]

# Update target name having included the category
TARGET_NAME = TARGETS + " - " + TARGET_NAME

# Preview last: only the final expression of a cell is displayed, so a
# mid-cell `events.head(5)` would be evaluated and silently discarded.
events.head(5)

In [60]:
# Persist the preprocessed events (target rows included); the Temporal
# section reloads this file.
save_dataframe(events, "events.parquet")

2022-07-15 11:47:02,442 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to events.parquet


'events.parquet'

------------------------------------------------------------------------------------------------

# Processing

## Tabular

In [61]:
# Reload the encounters saved by the Querying section and preview them.
encounters = load_dataframe("encounters.parquet")
encounters.head(5)

2022-07-15 11:47:02,721 [1;37mINFO[0m cyclops.utils.file - Loading dataframe from encounters.parquet


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,died
0,17978591,21300810,2015-10-10 00:21:00,2015-10-12 14:10:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2015,NaT,-134,False
1,17763996,26149939,2018-03-29 22:14:00,2018-04-01 12:35:00,NaT,ELECTIVE,,HOME,UNKNOWN,NaT,NaT,M,0,2018,NaT,-155,False
2,12901523,26546757,2018-08-14 17:32:00,2018-08-17 03:18:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,F,0,2018,NaT,-126,False
3,17871631,20600400,2015-12-03 12:43:00,2015-12-05 12:05:00,NaT,ELECTIVE,,HOME,ASIAN,NaT,NaT,F,0,2015,NaT,-126,False
4,13048111,23204476,2018-11-01 14:35:00,2018-11-05 15:55:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,M,0,2018,NaT,-156,False


In [62]:
# Encounter-level columns to expose as tabular features.
# NOTE(review): "discharge_location" has a DIED category, which encodes the
# prediction target — confirm it should be a model input.
descriptive_columns = [
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]
features = [AGE, SEX, *descriptive_columns]

# One row of features per encounter.
tab_features = TabularFeatures(data=encounters, features=features, by=ENCOUNTER_ID)

In [63]:
# Inspect the type assigned to each tabular feature.
tab_features.types

{'age': 'numeric',
 'discharge_location': 'ordinal',
 'sex': 'binary',
 'ethnicity': 'ordinal',
 'admission_location': 'ordinal',
 'admission_type': 'ordinal'}

In [64]:
# Show the integer code -> category mapping stored for admission_location.
tab_features.meta["admission_location"].get_mapping()

{0: 'AMBULATORY SURGERY TRANSFER',
 1: 'CLINIC REFERRAL',
 2: 'EMERGENCY ROOM',
 3: 'INFORMATION NOT AVAILABLE',
 4: 'INTERNAL TRANSFER TO OR FROM PSYCH',
 5: 'PACU',
 6: 'PHYSICIAN REFERRAL',
 7: 'PROCEDURE SITE',
 8: 'TRANSFER FROM HOSPITAL',
 9: 'TRANSFER FROM SKILLED NURSING FACILITY',
 10: 'WALK-IN/SELF REFERRAL',
 11: 'nan'}

In [65]:
# Names of the features typed as ordinal; these are expanded into binary
# indicator columns when vectorizing.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

['discharge_location', 'ethnicity', 'admission_location', 'admission_type']

In [66]:
# Vectorize the tabular features, turning every ordinal feature into a set
# of binary indicator columns (one per category).
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is tryin

In [67]:
# Size of the vectorized tabular data along each of its axes.
tab_vectorized.shape

(224764, 43)

In [68]:
# Names of the axes, in the same order as the shape.
tab_vectorized.axis_names

['encounter_id', 'features']

In [69]:
# Labels along the features axis, including the generated indicator columns.
tab_vectorized.get_index(FEATURES)

array(['age', 'sex', 'discharge_location_ACUTE HOSPITAL',
       'discharge_location_AGAINST ADVICE',
       'discharge_location_ASSISTED LIVING',
       'discharge_location_CHRONIC/LONG TERM ACUTE CARE',
       'discharge_location_DIED',
       'discharge_location_HEALTHCARE FACILITY',
       'discharge_location_HOME', 'discharge_location_HOME HEALTH CARE',
       'discharge_location_HOSPICE', 'discharge_location_OTHER FACILITY',
       'discharge_location_PSYCH FACILITY', 'discharge_location_REHAB',
       'discharge_location_SKILLED NURSING FACILITY',
       'ethnicity_AMERICAN INDIAN/ALASKA NATIVE', 'ethnicity_ASIAN',
       'ethnicity_BLACK/AFRICAN AMERICAN', 'ethnicity_HISPANIC/LATINO',
       'ethnicity_OTHER', 'ethnicity_UNABLE TO OBTAIN',
       'ethnicity_UNKNOWN', 'ethnicity_WHITE',
       'admission_location_AMBULATORY SURGERY TRANSFER',
       'admission_location_CLINIC REFERRAL',
       'admission_location_EMERGENCY ROOM',
       'admission_location_INFORMATION NOT AVAILA

In [70]:
# Names of the features typed as numeric; used later to pick which tabular
# features get normalized.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

['age']

## Temporal

In [71]:
# Reload the preprocessed events saved by the Querying section.
events = load_dataframe("events.parquet")

# Work on a 1-in-20 subsample of the events. A fixed random_state keeps the
# subsample — and every shape, split and statistic derived from it below —
# identical from one run to the next.
events = events.sample(n=len(events) // 20, random_state=0)

events.head(5)

2022-07-15 11:47:04,026 [1;37mINFO[0m cyclops.utils.file - Loading dataframe from events.parquet


Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
225351,23960370,15803381.0,31334098.0,2018-01-21 05:00:00,,0.0,,pain/sedation,pain/sedation - pain level response
412784,25600206,15874174.0,34675677.0,2018-03-25 16:00:00,Small,,,respiratory,respiratory - sputum amount
226155,23960370,15803381.0,31334098.0,2018-01-22 15:00:00,,,,routine vital signs,routine vital signs - ectopy type 1
247185,26736249,15790886.0,33411044.0,2015-06-12 20:30:00,Present,,,gi/gu,gi/gu - bowel sounds
135995,25124601,15861146.0,33546403.0,2018-06-30 20:00:00,-1 Awakens to voice (eye opening/contact) > 10...,-1.0,,pain/sedation,pain/sedation - richmond-ras scale


In [72]:
# Keep only the most popular events: the 100 most frequent event names.
event_name_counts = events[EVENT_NAME].value_counts()
most_frequent_names = event_name_counts.iloc[:100].index

# Force include the target, whether or not it made the cut on frequency.
top_events = np.unique(np.append(most_frequent_names, TARGET_NAME))

is_top_event = events[EVENT_NAME].isin(top_events)
events = events[is_top_event]
events.head(5)

Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
226155,23960370,15803381.0,31334098.0,2018-01-22 15:00:00,,,,routine vital signs,routine vital signs - ectopy type 1
247185,26736249,15790886.0,33411044.0,2015-06-12 20:30:00,Present,,,gi/gu,gi/gu - bowel sounds
135995,25124601,15861146.0,33546403.0,2018-06-30 20:00:00,-1 Awakens to voice (eye opening/contact) > 10...,-1.0,,pain/sedation,pain/sedation - richmond-ras scale
343697,28975630,15838849.0,30123607.0,2018-11-19 12:00:00,City,1.0,,neurological,neurological - orientation
147801,28808173,15964158.0,39695400.0,2019-07-16 08:00:00,Weak Palpable,,,cardiovascular (pulses),cardiovascular (pulses) - posttib. pulses r


In [73]:
# Check that the target rows survived the sampling and filtering above.
events[events[EVENT_NAME] == TARGET_NAME].head(5)

Unnamed: 0,encounter_id,subject_id,stay_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
147735,25452841,,,2015-07-21 11:50:00,,1.0,,targets,targets - death
128559,29787714,,,2018-09-09 05:48:00,,1.0,,targets,targets - death
19769,27733975,,,2018-07-22 21:56:00,,1.0,,targets,targets - death
40257,26184834,,,2018-01-20 05:15:00,,1.0,,targets,targets - death
220825,27349721,,,2015-09-15 09:40:00,,1.0,,targets,targets - death


In [74]:
# Aggregate event values by mean for each (encounter, event name) pair, with
# time bucketed per encounter.
# NOTE(review): timestep_size=8 and window_duration=24 are presumably hours
# (the aggregated preview shows timestep starts 8 hours apart) — confirm.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [75]:
# Replace the row labels left over from sampling/filtering with a fresh
# 0..n-1 index before building the temporal features.
events = events.reset_index(drop=True)

# Temporal features over the event values, keyed by encounter and event name
# and aggregated with the aggregator configured above.
tmp_features = TemporalFeatures(
    events,
    features=EVENT_VALUE,
    by=[ENCOUNTER_ID, EVENT_NAME],
    timestamp_col=EVENT_TIMESTAMP,
    aggregator=aggregator,
)

In [76]:
# Run the aggregation and preview the result.
aggregated = tmp_features.aggregate()
aggregated.head(5)

2022-07-15 11:47:04,333 [1;37mINFO[0m cyclops.processors.cleaning - Dropped nulls over columns: event_timestamp. Removed 6 rows.
2022-07-15 11:47:07,832 [1;37mINFO[0m cyclops.utils.profile - Finished executing function __call__ in 3.502922 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,event_value,timestep_start
encounter_id,event_name,timestep,Unnamed: 3_level_1,Unnamed: 4_level_1
20118636,targets - death,0,1.0,2019-07-20 01:50:00
20127587,targets - death,0,1.0,2015-02-02 00:00:00
20170028,targets - death,0,1.0,2018-08-18 19:00:00
20187583,treatments - turn,1,,2018-09-22 23:00:00
20187583,cardiovascular - lle temp,2,,2018-09-23 07:00:00


In [77]:
# Persist the aggregated events.
save_dataframe(aggregated, "aggregated.parquet")

2022-07-15 11:47:07,843 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to aggregated.parquet


'aggregated.parquet'

In [78]:
# Vectorize the aggregated events and check the resulting shape.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.shape

2022-07-15 11:47:08,510 [1;37mINFO[0m cyclops.utils.profile - Finished executing function vectorize in 0.650070 s


(1, 337, 100, 3)

In [79]:
# Names of the axes, in the same order as the shape.
temp_vectorized.axis_names

['aggfuncs', 'encounter_id', 'event_name', 'timesteps']

## Dataset splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [80]:
# Encounter IDs present in each vectorized dataset.
tab_encounters = set(tab_vectorized.get_index(ENCOUNTER_ID))
temp_encounters = set(temp_vectorized.get_index(ENCOUNTER_ID))

# Keep only the IDs found in both; the array order follows set iteration order.
shared_encounters = tab_encounters & temp_encounters
encounters_intersection = np.array(list(shared_encounters))

In [81]:
# Restrict both datasets to the shared encounters, in the same order, so
# they stay aligned along the encounter axis.
tab_vectorized, temp_vectorized = (
    vectorized.take_with_index(ENCOUNTER_ID, encounters_intersection)
    for vectorized in (tab_vectorized, temp_vectorized)
)

In [82]:
# Tabular shape after restricting to the shared encounters.
tab_vectorized.shape

(337, 43)

In [83]:
# Temporal shape after restricting to the shared encounters.
temp_vectorized.shape

(1, 337, 100, 3)

Split into training, validation, and testing datasets.

Split using the same indices such that the tabular and temporal datasets remain aligned.

In [84]:
# Index splits for train/validation/test (80% / 10% / 10%) over the shared
# encounters.
# NOTE(review): no seed is passed here; if split_idx shuffles, the split will
# differ between runs — confirm.
splits = split_idx(
    fractions=[0.8, 0.1, 0.1],
    data_len=len(encounters_intersection),
)

In [85]:
# Split both datasets along the encounter axis with the same index splits so
# the tabular and temporal parts of each split cover the same encounters.
tab_train_X, tab_val_X, tab_test_X = tab_vectorized.split_by_indices(
    ENCOUNTER_ID, splits
)

temp_train, temp_val, temp_test = temp_vectorized.split_by_indices(ENCOUNTER_ID, splits)

In [86]:
# Shapes of the tabular train / validation / test splits.
tab_train_X.shape, tab_val_X.shape, tab_test_X.shape

((270, 43), (33, 43), (34, 43))

In [87]:
# Shapes of the temporal train / validation / test splits.
temp_train.shape, temp_val.shape, temp_test.shape

((1, 270, 100, 3), (1, 33, 100, 3), (1, 34, 100, 3))

### Split features/targets

Split out the targets in the temporal data.

In [88]:
# Training split: separate the target event (y) from the remaining events (X)
# along the event-name axis.
temp_train_X, temp_train_y = temp_train.split_out(EVENT_NAME, [TARGET_NAME])
temp_train_X.shape, temp_train_y.shape

((1, 270, 99, 3), (1, 270, 1, 3))

In [89]:
# Validation split: separate the target event (y) from the remaining events
# (X) along the event-name axis.
temp_val_X, temp_val_y = temp_val.split_out(EVENT_NAME, [TARGET_NAME])
temp_val_X.shape, temp_val_y.shape

((1, 33, 99, 3), (1, 33, 1, 3))

In [90]:
# Test split: separate the target event (y) from the remaining events (X)
# along the event-name axis.
temp_test_X, temp_test_y = temp_test.split_out(EVENT_NAME, [TARGET_NAME])
temp_test_X.shape, temp_test_y.shape

((1, 34, 99, 3), (1, 34, 1, 3))

### Normalization

In [91]:
# Attach a normalizer to each tabular split and normalize it; only the
# numeric features are mapped to a (standard) normalization method.
# NOTE(review): every split gets its own normalizer — confirm whether the
# validation/test splits should instead reuse the training split's statistics.
# NOTE(review): `splits` previously held the index splits from split_idx;
# re-binding it here means the split-by-indices cell cannot be re-run without
# first re-running split_idx.
splits = (tab_train_X, tab_val_X, tab_test_X)

for split in splits:
    normalizer = VectorizedNormalizer(
        split.get_axis(FEATURES),
        normalizer_map=dict.fromkeys(numeric_features, STANDARD),
    )
    split.add_normalizer(normalizer)
    split.normalize()

tab_train_X, tab_val_X, tab_test_X = splits

In [92]:
# Only normalizing the inputs since the outputs are binary
# The normalizer is set up along the event-name axis with the standard
# method.
# NOTE(review): every split gets its own normalizer — confirm whether the
# validation/test splits should instead reuse the training split's statistics.
# NOTE(review): `splits` is re-bound here, replacing whatever it held before.
splits = temp_train_X, temp_val_X, temp_test_X

for split in splits:
    normalizer = VectorizedNormalizer(
        split.get_axis(EVENT_NAME),
        normalization_method=STANDARD,
    )
    split.add_normalizer(normalizer)
    split.normalize()

temp_train_X, temp_val_X, temp_test_X = splits

In [93]:
# Share of NaN entries in the temporal training inputs. The value is a
# fraction between 0 and 1, not a percentage, so it is labelled as such.
"Fraction null:", np.isnan(temp_train_X.data).sum() / temp_train_X.data.size

('Percentage null:', 0.9930914079062227)

## Save