## Imports

In [1]:
import sys
from functools import reduce

sys.path.append("..")

import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import select, func, extract, desc
from sqlalchemy.sql.expression import and_

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

import config
import cyclops
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    DISCHARGE_DISPOSITION,
    READMISSION,
    AGE,
    SEX,
    TOTAL_COST,
    CITY,
    PROVINCE,
    COUNTRY,
    LANGUAGE,
    LENGTH_OF_STAY_IN_ER,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    LAB_TEST_NAME,
    LAB_TEST_TIMESTAMP,
    LAB_TEST_RESULT_VALUE,
    LAB_TEST_RESULT_UNIT,
    REFERENCE_RANGE,
)
from cyclops.processors.constants import EMPTY_STRING
from cyclops.processors.admin import AdminProcessor
from cyclops.processors.vitals import VitalsProcessor
from cyclops.processors.labs import LabsProcessor
from cyclops.processors.outcomes import OutcomesProcessor
from cyclops.processors.feature_handler import FeatureHandler
from cyclops.orm import Database

%load_ext autoreload
%load_ext nb_black

2022-03-10 16:17:42,677 [1;37mINFO[0m config          - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,690 [1;37mINFO[0m cyclops.processors.base - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,728 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,754 [1;37mINFO[0m cyclops.processors.admin - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,776 [1;37mINFO[0m cyclops.processors.vitals - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,828 [1;37mINFO[0m cyclops.processors.labs - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:42,910 [1;37mINFO[0m cyclops.processors.outcomes - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-03-10 16:17:43,815 [1;37mINFO[0m cyclops.processors.feature_handler - Log file is /mnt/nfs/home/krishnanam/

<IPython.core.display.Javascript object>

## Load config, setup ORM

In [2]:
# Read the configuration from the default YAML glob and build the Database
# handle that every query cell below goes through.
config_glob = "../configs/default/*.yaml"
cfg = config.read_config(config_glob)
db = Database(cfg)

2022-03-10 16:17:44,832 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


<IPython.core.display.Javascript object>

## Quick check to see counts/sparsity of a certain field on GEMINI

In [3]:
UNIQUE_VALUES = "unique_values"
UNIQUE_VALUE_COUNTS = "unique_value_counts"

# Column whose value distribution is inspected; swap it to check another field.
unique_vals_field = db.public.ip_administrative.language

# Count rows per distinct value, most frequent first.
# COUNT(*) is used instead of COUNT(column): COUNT(column) skips NULLs, so a
# NULL group would be reported with a count of 0 and missing values would be
# invisible in a check that is meant to show sparsity.
query = (
    select(
        unique_vals_field.label(UNIQUE_VALUES),
        func.count().label(UNIQUE_VALUE_COUNTS),
    )
    .group_by(UNIQUE_VALUES)
    .order_by(desc(UNIQUE_VALUE_COUNTS))
)

counts = db.run_query(query)
print(f"Num. of unique values: {len(counts)}")
print(counts.head(100))

2022-03-10 16:17:45,242 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:17:45,245 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.145718 s


Num. of unique values: 206
         unique_values  unique_value_counts
0                                    156141
1              English                96756
2              ENGLISH                36205
3                  ENG                26434
4                Other                 6306
..                 ...                  ...
95      Spanish & Eng.                    7
96         Yugoslavian                    7
97                 GUJ                    6
98  Filipino (Ilocano)                    6
99             Ilocano                    6

[100 rows x 2 columns]


<IPython.core.display.Javascript object>

## Extract slices

In [4]:
# HDF5 file that the query cells write to and the processing cells read from.
EXTRACT_SAVE_PATH = "/mnt/nfs/project/delirium/_extract/first_models/extract.h5"

# Cohort filters applied to ip_administrative in every query below.
HOSPITALS = ["SMH"]
YEARS = [2018, 2019, 2020]

# Row caps passed to .limit() in the lab and vitals queries.
# LIMIT_MEDICATIONS is not referenced by any query visible in this notebook.
LIMIT_LABS = 100_000
LIMIT_MEDICATIONS = 100_000
LIMIT_VITALS = 100_000

<IPython.core.display.Javascript object>

## Query just admin data

In [5]:
# Admin-only extract: one row per ip_administrative record for the selected
# hospitals and admission years.
query = select(
    db.public.ip_administrative.genc_id.label(ENCOUNTER_ID),
    db.public.ip_administrative.hospital_id.label(HOSPITAL_ID),
    db.public.ip_administrative.admit_date_time.label(ADMIT_TIMESTAMP),
    db.public.ip_administrative.discharge_date_time.label(DISCHARGE_TIMESTAMP),
    db.public.ip_administrative.discharge_disposition.label(DISCHARGE_DISPOSITION),
    db.public.ip_administrative.age.label(AGE),
    db.public.ip_administrative.gender.label(SEX),
).where(
    and_(
        db.public.ip_administrative.hospital_id.in_(HOSPITALS),
        extract("year", db.public.ip_administrative.admit_date_time).in_(YEARS),
    )
)
# Named `admin_data` (not `admin_labs_data`): this frame holds no lab columns,
# and the admin + labs cell assigns `admin_labs_data` itself.
admin_data = db.run_query(query)
print(f"{len(admin_data)} rows fetched!")
# Persist under the key the processing cells read back with pd.read_hdf.
admin_data.to_hdf(
    EXTRACT_SAVE_PATH,
    key="query_gemini_admin",
)

2022-03-10 16:17:45,628 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:17:45,630 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.091586 s


10756 rows fetched!


<IPython.core.display.Javascript object>

## Query to get admin + labs

In [6]:
# Short handles for the two tables involved in this extract.
ip_admin = db.public.ip_administrative
lab = db.public.lab

# Join lab rows to their encounter on genc_id, keep the selected hospitals and
# admission years, and skip labs whose mapped test name equals EMPTY_STRING.
# Use limit here, since lab queries can have millions of rows!
query = (
    select(
        ip_admin.genc_id.label(ENCOUNTER_ID),
        ip_admin.hospital_id.label(HOSPITAL_ID),
        ip_admin.admit_date_time.label(ADMIT_TIMESTAMP),
        ip_admin.discharge_date_time.label(DISCHARGE_TIMESTAMP),
        lab.lab_test_name_mapped.label(LAB_TEST_NAME),
        lab.result_value.label(LAB_TEST_RESULT_VALUE),
        lab.result_unit.label(LAB_TEST_RESULT_UNIT),
        lab.sample_collection_date_time.label(LAB_TEST_TIMESTAMP),
        lab.reference_range.label(REFERENCE_RANGE),
    )
    .join(lab.data, ip_admin.genc_id == lab.genc_id)
    .where(
        and_(
            ip_admin.hospital_id.in_(HOSPITALS),
            extract("year", ip_admin.admit_date_time).in_(YEARS),
            lab.lab_test_name_mapped != EMPTY_STRING,
        )
    )
    .limit(LIMIT_LABS)
)
admin_labs_data = db.run_query(query)
print(f"{len(admin_labs_data)} rows fetched!")
admin_labs_data.to_hdf(EXTRACT_SAVE_PATH, key="query_gemini_admin_labs")

2022-03-10 16:18:10,646 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:18:10,651 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 24.685511 s


100000 rows fetched!


<IPython.core.display.Javascript object>

## Query to get admin + vitals

In [7]:
# Short handles for the two tables involved in this extract.
ip_admin = db.public.ip_administrative
vitals = db.public.vitals

# Join vitals rows to their encounter on genc_id, keep the selected hospitals
# and admission years, and skip measurements whose mapped name equals
# EMPTY_STRING. The result is capped at LIMIT_VITALS rows.
query = (
    select(
        ip_admin.genc_id.label(ENCOUNTER_ID),
        ip_admin.hospital_id.label(HOSPITAL_ID),
        ip_admin.age.label(AGE),
        ip_admin.gender.label(SEX),
        ip_admin.residence_code,
        ip_admin.admit_date_time.label(ADMIT_TIMESTAMP),
        ip_admin.discharge_date_time.label(DISCHARGE_TIMESTAMP),
        vitals.measurement_mapped.label(VITAL_MEASUREMENT_NAME),
        vitals.measurement_value.label(VITAL_MEASUREMENT_VALUE),
        vitals.measure_date_time.label(VITAL_MEASUREMENT_TIMESTAMP),
        vitals.reference_range.label(REFERENCE_RANGE),
    )
    .join(vitals.data, ip_admin.genc_id == vitals.genc_id)
    .where(
        and_(
            ip_admin.hospital_id.in_(HOSPITALS),
            extract("year", ip_admin.admit_date_time).in_(YEARS),
            vitals.measurement_mapped != EMPTY_STRING,
        )
    )
    .limit(LIMIT_VITALS)
)
admin_vitals_data = db.run_query(query)
print(f"{len(admin_vitals_data)} rows fetched!")
admin_vitals_data.to_hdf(EXTRACT_SAVE_PATH, key="query_gemini_admin_vitals")

2022-03-10 16:18:11,845 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:18:11,848 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.851991 s


100000 rows fetched!


<IPython.core.display.Javascript object>

## Query Length of Stay in ER

In [8]:
# Short handles for the two tables involved in this extract.
ip_admin = db.public.ip_administrative
er_admin = db.public.er_administrative

# Pair each encounter with its derived ER stay duration by joining the ER
# admin table on genc_id, restricted to the selected hospitals and years.
query = (
    select(
        ip_admin.genc_id.label(ENCOUNTER_ID),
        er_admin.duration_er_stay_derived.label(LENGTH_OF_STAY_IN_ER),
    )
    .join(er_admin.data, ip_admin.genc_id == er_admin.genc_id)
    .where(
        and_(
            ip_admin.hospital_id.in_(HOSPITALS),
            extract("year", ip_admin.admit_date_time).in_(YEARS),
        )
    )
)
los_er_data = db.run_query(query)
print(f"{len(los_er_data)} rows fetched!")
los_er_data.to_hdf(EXTRACT_SAVE_PATH, key="query_gemini_los_er")

2022-03-10 16:18:12,375 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:18:12,378 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.079155 s


10419 rows fetched!


<IPython.core.display.Javascript object>

## Process Admin data

In [9]:
%autoreload 2

data = pd.read_hdf(EXTRACT_SAVE_PATH, key=f"query_gemini_admin")
must_have_columns = [ENCOUNTER_ID, AGE, SEX]

admin_processor = AdminProcessor(data, must_have_columns)
admin_features = admin_processor.process()

2022-03-10 16:18:12,451 [1;37mINFO[0m cyclops.processors.base - Processing raw admin data...
2022-03-10 16:18:12,455 [1;37mINFO[0m cyclops.processors.base - # samples: 10756, # encounters: 10756
2022-03-10 16:18:12,457 [1;37mINFO[0m cyclops.processors.admin - # admin features: 2, # encounters: 10756
2022-03-10 16:18:18,601 [1;37mINFO[0m cyclops.utils.profile - Finished executing function process in 6.149496 s


<IPython.core.display.Javascript object>

## Process Labs data

In [10]:
%autoreload 2

data = pd.read_hdf(EXTRACT_SAVE_PATH, key=f"query_gemini_admin_labs")
must_have_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    LAB_TEST_NAME,
    LAB_TEST_TIMESTAMP,
    LAB_TEST_RESULT_VALUE,
    LAB_TEST_RESULT_UNIT,
    REFERENCE_RANGE,
]
labs_processor = LabsProcessor(data, must_have_columns)
labs_features = labs_processor.process()

2022-03-10 16:18:18,905 [1;37mINFO[0m cyclops.processors.base - Processing raw lab data...
2022-03-10 16:18:18,910 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 10269
2022-03-10 16:18:18,938 [1;37mINFO[0m cyclops.processors.base - Aggregating labs within aggregation window...
2022-03-10 16:18:18,942 [1;37mINFO[0m cyclops.processors.base - # samples: 43307, # encounters: 10168
2022-03-10 16:18:19,004 [1;37mINFO[0m cyclops.processors.base - Remove text in parentheses and normalize lab test names...
2022-03-10 16:18:19,007 [1;37mINFO[0m cyclops.processors.base - # samples: 43307, # encounters: 10168
2022-03-10 16:18:19,027 [1;37mINFO[0m cyclops.processors.base - Drop unsupported...
2022-03-10 16:18:19,030 [1;37mINFO[0m cyclops.processors.base - # samples: 30060, # encounters: 9822
2022-03-10 16:18:19,078 [1;37mINFO[0m cyclops.processors.base - Fixing inequalities and removing outlier values...
2022-03-10 16:18:19,081 [1;37mINFO[0m cyclops.pro

<IPython.core.display.Javascript object>

## Process vitals data

In [11]:
%autoreload 2

data = pd.read_hdf(EXTRACT_SAVE_PATH, key=f"query_gemini_admin_vitals")
must_have_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE,
]

vitals_processor = VitalsProcessor(data, must_have_columns)
vitals_features = vitals_processor.process()

print("Reference range unique values")
print(vitals_processor.data[REFERENCE_RANGE].unique())

2022-03-10 16:18:23,776 [1;37mINFO[0m cyclops.processors.base - Processing raw vitals data...
2022-03-10 16:18:23,781 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 493
2022-03-10 16:18:23,797 [1;37mINFO[0m cyclops.processors.base - Aggregating vitals within aggregation window...
2022-03-10 16:18:23,801 [1;37mINFO[0m cyclops.processors.base - # samples: 11794, # encounters: 482
2022-03-10 16:18:23,819 [1;37mINFO[0m cyclops.processors.base - Drop oxygen flow rate, saturation samples (unsupported)...
2022-03-10 16:18:23,822 [1;37mINFO[0m cyclops.processors.base - # samples: 8940, # encounters: 482
2022-03-10 16:18:23,852 [1;37mINFO[0m cyclops.processors.base - Convert Positive/Negative to 1/0...
2022-03-10 16:18:23,854 [1;37mINFO[0m cyclops.processors.base - # samples: 8940, # encounters: 482
2022-03-10 16:18:23,858 [1;37mINFO[0m cyclops.processors.base - Fill empty result string values with NaN...
2022-03-10 16:18:23,860 [1;37mINFO[0m cyclop

Reference range unique values
['0-40' '20-500' 'NA-NA' '20-200' '50-250' '25-45' '20-220' '0-100']


<IPython.core.display.Javascript object>

## Merge processed admin, labs, vitals

## Look at some outcomes: readmission codes on GEMINI

In [12]:
# Readmission codes on GEMINI
# Pull the (variable, value, description) lookup rows for ip_administrative.
lookup_query = select(
    db.public.lookup_ip_administrative.variable,
    db.public.lookup_ip_administrative.value,
    db.public.lookup_ip_administrative.description,
).subquery()
admin_lookup_data = db.run_query(lookup_query)

# Keep only the rows that describe the "readmission" variable.
readmission_codes = admin_lookup_data.loc[
    admin_lookup_data["variable"] == "readmission"
]
# The loop variable is `description`, not `desc`: `desc` is the sqlalchemy
# function imported at the top of the notebook, and rebinding it to a string
# here would break any query cell that is re-run afterwards.
for code, description in zip(
    readmission_codes["value"], readmission_codes["description"]
):
    print(code, description)

2022-03-10 16:18:24,834 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-10 16:18:24,837 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.019360 s


1 Planned readmission from previous acute care (no time restriction)
2 Unplanned readmission within 7 days following discharge from acute care
3 Unplanned readmission 8 to 28 days following discharge from acute care
4 Unplanned readmission within 7 days following discharge from day surgery
5 New patient to the acute care unit
9 None of the above


<IPython.core.display.Javascript object>

## Process outcomes data

In [13]:
%autoreload 2

admin_data = pd.read_hdf(EXTRACT_SAVE_PATH, key=f"query_gemini_admin")
los_er_data = pd.read_hdf(EXTRACT_SAVE_PATH, key=f"query_gemini_los_er")

data = pd.merge(admin_data, los_er_data, how="outer")
must_have_columns = [
    ENCOUNTER_ID,
    AGE,
    SEX,
    DISCHARGE_DISPOSITION,
    LENGTH_OF_STAY_IN_ER,
]
outcomes_processor = OutcomesProcessor(data, must_have_columns)
outcome_targets = outcomes_processor.process()
outcome_targets

2022-03-10 16:18:24,947 [1;37mINFO[0m cyclops.processors.base - Processing raw outcomes data...
2022-03-10 16:18:24,951 [1;37mINFO[0m cyclops.processors.base - # samples: 10756, # encounters: 10756
2022-03-10 16:18:24,972 [1;37mINFO[0m cyclops.utils.profile - Finished executing function process in 0.024523 s


Unnamed: 0,mortality_in_hospital,length_of_stay_in_er
11934843,False,33.450000
11939459,False,11.100000
11940320,False,17.433332
11966194,False,25.783333
11967049,False,10.833333
...,...,...
11876913,False,13.983334
11916858,False,40.083332
11906350,False,13.733334
11921461,False,32.583332


<IPython.core.display.Javascript object>

## Merge processed admin, labs, vitals, outcomes

In [14]:
# Start from a fresh handler so re-running this cell does not add the same
# features twice to an existing one.
feature_handler = FeatureHandler()

# Register each processed feature set in the same order as before:
# admin, labs, vitals, outcomes.
feature_sets = (admin_features, labs_features, vitals_features, outcome_targets)
for feature_set in feature_sets:
    feature_handler.add_features(feature_set)

# Length of the combined features followed by the length of each input set.
print(len(feature_handler.features), *map(len, feature_sets))

10756 10756 9822 482 10756


<IPython.core.display.Javascript object>

## Data quality report (admin + labs + vitals + outcomes)

In [None]:
# Show which columns the handler reports as numerical and as categorical.
print(feature_handler.get_numerical_features())
print(feature_handler.get_categorical_features())

# Pass the same two lists to evidently as the column mapping.
column_mapping = ColumnMapping(
    numerical_features=feature_handler.get_numerical_features(),
    categorical_features=feature_handler.get_categorical_features(),
)

# Build the data quality report with the merged features as reference data
# and no current data, then write it to an HTML file.
dashboard = Dashboard(tabs=[DataQualityTab()])
dashboard.calculate(
    reference_data=feature_handler.features,
    current_data=None,
    column_mapping=column_mapping,
)
dashboard.save("data_processed.html")

['age', 'mean cell volume', 'arterial paco2', 'venous pco2', 'arterial ph', 'respiratory rate', 'weight', 'CAM', 'Diastolic BP', 'Systolic BP', 'Temperature', 'Heart Rate', 'fio2', 'length_of_stay_in_er']
['sex', 'mortality_in_hospital']
