# Imports

In [1]:
import time
import pandas as pd
from cyclops.processors.clean import normalize_names, normalize_values
from cyclops.processors.column_names import (
    DISCHARGE_TIMESTAMP,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_VALUE,
)
from cyclops.processors.feature.split import intersect_datasets
from cyclops.utils.file import join, save_dataframe
from drift_detection.gemini.mortality.constants import (
    CLEANED_DIR,
    ENCOUNTERS_FILE,
    OUTCOME_DEATH,
    QUERIED_DIR,
    TARGET_TIMESTAMP,
)
from drift_detection.gemini.query import main

2022-11-08 13:03:20,075 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


# Query

In [2]:
# Run the cohort and events queries and report how long the call took.
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed
# by system clock adjustments while this long-running query executes.
query_start = time.perf_counter()
cohort, events = main()
print(time.perf_counter() - query_start)
cohort

2022-11-08 13:03:26,162 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-11-08 13:03:26,167 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 6.004662 s
2022-11-08 13:03:28,959 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-11-08 13:03:28,964 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.793898 s
2022-11-08 13:03:49,658 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-11-08 13:03:49,663 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 19.888311 s
2022-11-08 13:04:04,505 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-11-08 13:04:05,606 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.988089 s
2022-11-08 13:04:08,146 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-11-08 13:04:08,150 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_quer

767.5443470478058


Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,los_derived,prev_encounter_count,diagnosis_code,diagnosis_trajectory,admit_via_ambulance,triage_level
0,15320930,2017-10-22 10:30:00,2018-03-14 16:00:00,88,F,THPC,False,nota,False,False,143.229170,0,I500,I00_I99,no_ambulance,emergent
1,15535918,2016-08-30 12:00:00,2016-09-09 18:09:00,28,F,THPC,False,nota,False,False,10.256250,0,O0803,O00_O99,no_ambulance,emergent
2,15612180,2014-11-12 04:11:00,2014-11-20 18:53:00,65,F,THPC,False,nota,False,False,8.612500,0,G939,G00_G99,no_ambulance,emergent
3,15682885,2017-04-06 05:15:00,2017-04-07 22:06:00,73,F,THPC,False,new_to_acute,False,False,1.702083,0,S32010,S00_T88,ground,emergent
4,15712185,2012-11-14 03:30:00,2012-11-26 14:42:00,65,M,THPC,False,new_to_acute,False,False,12.466666,0,G952,G00_G99,no_ambulance,urgent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143318,12349773,2020-02-11 10:15:00,2020-02-11 13:15:00,63,M,SBK,True,planned_from_acute,False,False,0.125000,0,Z5181,Z00_Z99,no_info,no_info
143319,13651026,2020-04-14 19:14:00,2020-04-27 12:15:00,95,F,UHNTG,True,nota,False,False,12.709028,6,C569,C00_D49,ground,urgent
143320,13933236,2018-12-29 20:53:00,2019-01-21 13:00:00,57,M,UHNTG,True,nota,False,False,22.671528,0,Z515,Z00_Z99,no_ambulance,emergent
143321,15508246,2018-04-05 18:14:00,2018-04-17 18:03:00,91,M,THPM,True,nota,False,False,11.992361,1,Z515,Z00_Z99,ground,urgent


In [3]:
# Fraction of cohort rows whose OUTCOME_DEATH flag is set.
death_count = cohort[OUTCOME_DEATH].sum()
death_count / len(cohort)

0.10419123239117238

In [4]:
# Display the events frame returned by main() alongside the cohort.
events

Unnamed: 0,encounter_id,event_name,event_value,event_value_unit,event_timestamp,event_category
0,15320930,Mean Cell Volume (MCV),,fL,2017-11-07 06:00:00,labs
1,15320930,Mean Cell Volume (MCV),,fL,2017-11-07 20:36:00,labs
2,15320930,Mean Cell Volume (MCV),,fL,2017-11-08 10:00:00,labs
3,15320930,Mean Cell Volume (MCV),,fL,2017-11-27 06:00:00,labs
4,15320930,Mean Cell Volume (MCV),,fL,2017-12-12 06:00:00,labs
...,...,...,...,...,...,...
17502598,14802293,unmapped_intervention,1,,2015-03-25 00:00:00,interventions
17502599,14777944,unmapped_intervention,1,,2015-03-27 00:00:00,interventions
17502600,14531456,unmapped_intervention,1,,2015-03-30 00:00:00,interventions
17502601,14974045,unmapped_intervention,1,,2014-05-29 00:00:00,interventions


In [5]:
# Intersect over encounter IDs to get only those encounters common to both
# the cohort and the events frames. Both names are rebound to the results,
# so every later cell works on the intersected data.
cohort, events = intersect_datasets([cohort, events], ENCOUNTER_ID)

In [6]:
# Write the intersected (still uncleaned) events under QUERIED_DIR.
queried_batch_path = join(QUERIED_DIR, "batch_0000.parquet")
save_dataframe(events, queried_batch_path)

2022-11-08 13:16:27,570 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/0-queried/batch_0000.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/0-queried/batch_0000.parquet'

# Clean / Preprocess

In [7]:
# Attach a TARGET_TIMESTAMP column to the cohort: for encounters flagged with
# OUTCOME_DEATH it holds the discharge timestamp; all other rows get a
# missing value from the left merge.
died_mask = cohort[OUTCOME_DEATH] == True  # noqa: E712
death_events = cohort.loc[died_mask, [ENCOUNTER_ID, DISCHARGE_TIMESTAMP]].rename(
    columns={DISCHARGE_TIMESTAMP: TARGET_TIMESTAMP}
)

# Drop any TARGET_TIMESTAMP column left over from a previous run of this cell.
# Without this, re-running the cell would merge a second copy and pandas would
# emit "<name>_x" / "<name>_y" columns instead of a single target column.
cohort = cohort.drop(columns=[TARGET_TIMESTAMP], errors="ignore").merge(
    death_events, on=ENCOUNTER_ID, how="left"
)
cohort

Unnamed: 0,encounter_id,admit_timestamp,discharge_timestamp,age,sex,hospital_id,outcome_death,readmission,from_nursing_home_mapped,from_acute_care_institution_mapped,los_derived,prev_encounter_count,diagnosis_code,diagnosis_trajectory,admit_via_ambulance,triage_level,deathtime
0,11100040,2018-07-03 21:36:00,2018-08-03 09:35:00,84,M,SMH,False,planned_from_acute,False,False,30.499306,1,G459,G00_G99,ground,resuscitation,NaT
1,11100041,2016-12-26 18:21:00,2016-12-27 11:00:00,76,F,SMH,False,new_to_acute,False,False,0.693750,0,R55,R00_R99,ground,emergent,NaT
2,11100072,2016-08-13 15:20:00,2016-08-16 16:45:00,72,F,SMH,False,new_to_acute,False,False,3.059028,0,J441,J00_J99,no_ambulance,emergent,NaT
3,11100095,2015-12-20 18:36:00,2015-12-30 10:25:00,84,M,SMH,False,unplanned_8_to_28_day_acute,False,False,9.659028,0,T826,S00_T88,no_ambulance,emergent,NaT
4,11100097,2019-05-23 06:09:00,2019-05-28 13:07:00,65,M,SMH,False,planned_from_acute,True,False,5.290278,7,N390,N00_N99,ground,emergent,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143044,15999822,2012-04-09 21:44:00,2012-04-10 17:21:00,61,M,THPM,False,new_to_acute,False,False,0.817361,0,I635,I00_I99,ground,emergent,NaT
143045,15999864,2018-04-30 20:30:00,2018-05-06 12:46:00,58,F,THPC,False,new_to_acute,False,False,5.677778,0,L032,L00_L99,no_ambulance,emergent,NaT
143046,15999918,2019-09-18 05:59:00,2019-09-20 14:45:00,68,F,THPM,False,new_to_acute,False,False,2.365278,0,R55,R00_R99,ground,emergent,NaT
143047,15999943,2015-01-17 08:17:00,2015-01-23 11:33:00,78,F,THPM,False,nota,False,False,6.136111,0,J09,J00_J99,ground,urgent,NaT


In [8]:
# Persist the cohort, including the merged TARGET_TIMESTAMP column, to
# ENCOUNTERS_FILE. The call's return value is shown as the cell output.
save_dataframe(cohort, ENCOUNTERS_FILE)

2022-11-08 13:21:01,087 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/encounters.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/encounters.parquet'

In [9]:
# Normalize the event names.
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])

# Normalize the string values, then coerce them to numbers; anything that
# cannot be parsed becomes NaN.
events[EVENT_VALUE] = pd.to_numeric(
    normalize_values(events[EVENT_VALUE]),
    errors="coerce",
)

# Keep only rows with a numeric value, reporting the row count either side.
print("Length before:", len(events))
events = events.loc[~events[EVENT_VALUE].isna()]
print("Length after:", len(events))
events

Length before: 17502603
Length after: 17029291


Unnamed: 0,encounter_id,event_name,event_value,event_value_unit,event_timestamp,event_category
17371788,11100040,unmapped_intervention,1.0,,2018-07-04 00:00:00,interventions
15174588,11100040,glucose point of care,10.9,mmol/L,2018-07-11 21:08:00,labs
15174587,11100040,glucose point of care,10.9,mmol/L,2018-07-04 21:18:00,labs
15174586,11100040,glucose point of care,10.8,mmol/L,2018-07-06 12:25:00,labs
15174585,11100040,glucose point of care,10.7,mmol/L,2018-07-18 17:15:00,labs
...,...,...,...,...,...,...
66069,15999969,albumin,28.0,g/L,2015-02-20 03:50:00,labs
66068,15999969,aptt,26.5,SEC,2015-02-17 22:00:00,labs
66067,15999969,aptt,24.8,SEC,2015-02-19 05:50:00,labs
66076,15999969,arterial paco2,31.0,MMHG,2015-02-19 05:50:00,labs


In [10]:
# Write the cleaned, numeric-only events under CLEANED_DIR.
cleaned_batch_path = join(CLEANED_DIR, "batch_0000.parquet")
save_dataframe(events, cleaned_batch_path)

2022-11-08 13:26:45,574 [1;37mINFO[0m cyclops.utils.file - Saving dataframe to /mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/1-cleaned/batch_0000.parquet


'/mnt/nfs/project/delirium/drift_exp/OCT-18-2022/gemini/mortality_decompensation/./data/1-cleaned/batch_0000.parquet'