In [21]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.file import (
    join,
    load_dataframe,
    process_dir_save_path,
    save_consequtive_dataframes,
    save_dataframe,
    yield_dataframes,
)

In [2]:
OUTCOME_DEATH = "outcome_death"

## Patient encounters

In [3]:
encounters_interface = mimic.patient_encounters(died_binarize_col=OUTCOME_DEATH)

encounters_query = encounters_interface.query
encounters_query = qp.Drop(
    ["insurance", "language", "marital_status", "hospital_expire_flag"]
)(encounters_query)

encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()
encounters

2022-07-28 18:01:49,300 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-07-28 18:01:49,301 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 4.956333 s


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,outcome_death
0,14849152,25726085,2018-04-21 21:18:00,2018-04-24 12:57:00,NaT,ELECTIVE,,HOME,ASIAN,NaT,NaT,M,0,2018,NaT,-112,False
1,19731189,21820217,2012-06-21 16:39:00,2012-06-27 12:00:00,NaT,ELECTIVE,,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,M,0,2012,NaT,-149,False
2,14523215,29575656,2018-12-30 21:07:00,2019-01-01 11:38:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,M,0,2018,NaT,-118,False
3,10487271,28274967,2009-09-13 13:37:00,2009-09-14 18:30:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,M,0,2009,NaT,-164,False
4,12188356,23159459,2018-02-12 00:38:00,2018-02-14 14:01:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2018,NaT,-123,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523735,11906002,29443220,2015-08-03 18:08:00,2015-08-04 18:25:00,NaT,OBSERVATION ADMIT,EMERGENCY ROOM,HOME,WHITE,2161-08-03 14:34:00,2161-08-03 19:39:00,M,65,1950,NaT,-146,False
523736,12770900,22279098,2010-10-20 08:37:00,2010-10-25 12:57:00,NaT,URGENT,PHYSICIAN REFERRAL,HOME,BLACK/AFRICAN AMERICAN,NaT,NaT,F,29,1981,NaT,-143,False
523737,19072817,26415000,2012-09-06 23:06:00,2012-09-12 16:59:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,WHITE,2194-09-06 17:09:00,2194-09-07 01:06:00,F,91,1921,NaT,-182,False
523738,16221445,22515498,2012-08-18 13:23:00,2012-08-19 16:47:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,WHITE,2169-08-18 07:57:00,2169-08-18 14:40:00,F,67,1945,NaT,-157,False


In [4]:
save_dataframe(encounters, "encounters.parquet")

2022-07-28 18:01:49,325 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to encounters.parquet


'encounters.parquet'

## Events

In [3]:
events_interface = mimic.events()

events_query = events_interface.query
events_query = qp.Drop(["warning", "itemid", "storetime"])(events_query)
events_interface = mimic.get_interface(events_query)

events_interface.save_in_grouped_batches("./test_batches2", ENCOUNTER_ID, int(1e7))

2022-07-28 14:29:01,651 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-07-28 14:29:01,652 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 34.050622 s
2022-07-28 14:30:02,815 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0000.parquet
2022-07-28 14:31:31,051 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0001.parquet
2022-07-28 14:32:40,506 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0002.parquet
2022-07-28 14:33:52,125 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0003.parquet
2022-07-28 14:35:00,202 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0004.parquet
2022-07-28 14:36:09,528 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/batch_0005.parquet
2022-07-28 14:37:18,628 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./test_batches2/bat

In [4]:
save_consequtive_dataframes("./test_batches2", "./0raw2", 4)

2022-07-28 16:14:40,371 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0000.parquet
2022-07-28 16:14:42,437 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0001.parquet
2022-07-28 16:14:44,664 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0002.parquet
2022-07-28 16:14:46,634 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0003.parquet


4


2022-07-28 16:14:51,039 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_0.parquet
2022-07-28 16:15:03,860 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0004.parquet
2022-07-28 16:15:05,990 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0005.parquet
2022-07-28 16:15:07,862 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0006.parquet
2022-07-28 16:15:09,708 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0007.parquet


4


2022-07-28 16:15:14,164 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_1.parquet
2022-07-28 16:15:27,400 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0008.parquet
2022-07-28 16:15:29,536 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0009.parquet
2022-07-28 16:15:31,461 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0010.parquet
2022-07-28 16:15:33,422 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0011.parquet


4


2022-07-28 16:15:37,895 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_2.parquet
2022-07-28 16:15:50,287 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0012.parquet
2022-07-28 16:15:52,455 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0013.parquet
2022-07-28 16:15:54,381 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0014.parquet
2022-07-28 16:15:56,442 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0015.parquet


4


2022-07-28 16:16:00,903 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_3.parquet
2022-07-28 16:16:13,372 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0016.parquet
2022-07-28 16:16:15,669 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0017.parquet
2022-07-28 16:16:17,581 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0018.parquet
2022-07-28 16:16:19,475 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0019.parquet


4


2022-07-28 16:16:24,017 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_4.parquet
2022-07-28 16:16:36,796 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0020.parquet
2022-07-28 16:16:38,926 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0021.parquet
2022-07-28 16:16:40,840 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0022.parquet
2022-07-28 16:16:42,808 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0023.parquet


4


2022-07-28 16:16:47,341 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_5.parquet
2022-07-28 16:16:59,590 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0024.parquet
2022-07-28 16:17:01,764 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0025.parquet
2022-07-28 16:17:03,696 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0026.parquet
2022-07-28 16:17:05,532 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0027.parquet


4


2022-07-28 16:17:10,015 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_6.parquet
2022-07-28 16:17:22,319 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0028.parquet
2022-07-28 16:17:24,496 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0029.parquet
2022-07-28 16:17:26,469 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0030.parquet
2022-07-28 16:17:28,345 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0031.parquet


4


2022-07-28 16:17:32,930 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_7.parquet
2022-07-28 16:17:45,585 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0032.parquet
2022-07-28 16:17:48,428 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./0raw2/batch_8.parquet


In [13]:
save_count = 0
generator = yield_dataframes("./test_batches2")
save_dir = "./1cleaned"
save_dir = process_dir_save_path(save_dir)

# RUNS IN BATCHES - RUN AFTER HERE

In [23]:
events = next(generator)
events = events.drop(["stay_id"], axis=1)
events

2022-07-28 18:05:38,310 [1;37mINFO[0m cyclops.utils.file - Loading DataFrame from ./test_batches2/batch_0001.parquet


Unnamed: 0,subject_id,encounter_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
0,13547939,20544092,2130-06-20 01:00:00,4,4.0,L/min,Alarms,Minute Volume Alarm - Low
1,13547939,20544092,2130-06-20 01:00:00,10,10.0,L/min,Alarms,Minute Volume Alarm - High
2,13547939,20544092,2130-06-20 01:00:00,5,5.0,cmH2O,Respiratory,PEEP set
3,13547939,20544092,2130-06-20 01:00:00,50,50.0,,Respiratory,Inspired O2 Fraction
4,13547939,20544092,2130-06-20 01:00:00,36.9,36.9,°C,Respiratory,Inspired Gas Temp.
...,...,...,...,...,...,...,...,...
9989910,13412848,20439864,2151-02-23 23:03:00,1,1.0,,Adm History/FHPA,Unable to assess activity / mobility
9989911,13412848,20439864,2151-02-23 23:03:00,0,0.0,,Adm History/FHPA,Intravenous / IV access prior to admission
9989912,13412848,20439864,2151-02-23 23:03:00,0,0.0,,Adm History/FHPA,ETOH
9989913,13412848,20439864,2151-02-23 23:03:00,0,0.0,,Adm History/FHPA,Recreational drug use


In [24]:
import warnings

from pandas.errors import PerformanceWarning


def add_years_approximate(
    timestamp_series: pd.Series, years_series: pd.Series
) -> pd.Series:
    """

    Approximates are typically either exact or incorrect by one day, e.g., on leap days.

    """
    # Add to the years column
    year = timestamp_series.dt.year + years_series

    # Handle the other columns
    month = timestamp_series.dt.month
    day = timestamp_series.dt.day
    hour = timestamp_series.dt.hour
    minute = timestamp_series.dt.minute

    # Create new timestamp column
    data = pd.DataFrame(
        {"year": year, "month": month, "day": day, "hour": hour, "minute": minute}
    )

    # Subtract 1 from potentially invalid leap days to avoid issues
    leap_days = (month == 2) & (day == 29)
    data["day"][leap_days] -= 1

    return pd.to_datetime(data)


def add_years_exact(timestamp_series: pd.Series, years_series: pd.Series) -> pd.Series:
    warnings.warn(
        "Computing the exact addition cannot be vectorized and is very slow. Consider using the quick, approximate calculation.",
        PerformanceWarning,
    )
    return timestamp_series + years_series.apply(lambda x: pd.DateOffset(years=x))

In [25]:
# Reverse deidentified dating
events = pd.merge(
    encounters[[ENCOUNTER_ID, "anchor_year_difference"]], events, on=ENCOUNTER_ID
)
events[EVENT_TIMESTAMP] = add_years_approximate(
    events[EVENT_TIMESTAMP], events["anchor_year_difference"]
)
events = events.drop("anchor_year_difference", axis=1)
events

Unnamed: 0,encounter_id,subject_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
0,20620061,11481318,2018-05-18 14:25:00,80.9,80.9,kg,General,Admission Weight (Kg)
1,20620061,11481318,2018-05-18 14:25:00,65,65.0,Inch,General,Height
2,20620061,11481318,2018-05-18 14:25:00,165,165.0,cm,General,Height (cm)
3,20620061,11481318,2018-05-18 16:00:00,SR (Sinus Rhythm),,,Routine Vital Signs,Heart Rhythm
4,20620061,11481318,2018-05-18 16:00:00,To Pain,2.0,,Neurological,GCS - Eye Opening
...,...,...,...,...,...,...,...,...
9989910,20534655,14408427,2009-02-12 11:00:00,WNL,,,Access Lines - Peripheral,20 Gauge Site Appear
9989911,20534655,14408427,2009-02-12 12:00:00,Calm/Cooperative,4.0,,Pain/Sedation,Riker-SAS Scale
9989912,20534655,14408427,2009-02-12 12:00:00,Yes,,,Pain/Sedation,Pain Present
9989913,20534655,14408427,2009-02-12 12:00:00,Aching,,,Pain/Sedation,Pain Type


In [26]:
# Create the target as a timeseries event
target_events = encounters[encounters[OUTCOME_DEATH] == True]
target_events = target_events[[ENCOUNTER_ID, "deathtime"]]
target_events = target_events.rename({"deathtime": EVENT_TIMESTAMP}, axis=1)
target_events[EVENT_NAME] = OUTCOME_DEATH
target_events[EVENT_CATEGORY] = TARGETS
target_events[EVENT_VALUE] = 1
target_events.head(5)

Unnamed: 0,encounter_id,event_timestamp,event_name,event_category,event_value
15,20252617,2012-06-25 12:25:00,outcome_death,targets,1
4030,20317279,2012-03-01 20:40:00,outcome_death,targets,1
4147,24422389,2018-06-06 15:00:00,outcome_death,targets,1
4148,21834123,2018-02-28 04:30:00,outcome_death,targets,1
4167,22256496,2009-08-03 15:30:00,outcome_death,targets,1


In [27]:
# Include target
events = pd.concat([events, target_events])

In [28]:
# Preprocessing
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# Concatenate event name and category since some names are the same in
# different categories, e.g., 'flow' for categories 'heartware' and 'ecmo'
events[EVENT_NAME] = events[EVENT_CATEGORY] + " - " + events[EVENT_NAME]
events.head(5)

Unnamed: 0,encounter_id,subject_id,event_timestamp,value,event_value,event_value_unit,event_category,event_name
0,20620061,11481318.0,2018-05-18 14:25:00,80.9,80.9,kg,general,general - admission weight
1,20620061,11481318.0,2018-05-18 14:25:00,65,65.0,Inch,general,general - height
2,20620061,11481318.0,2018-05-18 14:25:00,165,165.0,cm,general,general - height
3,20620061,11481318.0,2018-05-18 16:00:00,SR (Sinus Rhythm),,,routine vital signs,routine vital signs - heart rhythm
4,20620061,11481318.0,2018-05-18 16:00:00,To Pain,2.0,,neurological,neurological - gcs - eye opening


In [29]:
save_dataframe(events, join(save_dir, "batch_" + f"{save_count:04d}"))
save_count += 1

2022-07-28 18:06:05,269 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to ./1cleaned/batch_0001.parquet
