## Imports

In [1]:
import sys

sys.path.append("..")

import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import select, func, extract, desc
from sqlalchemy.sql.expression import and_

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

import config
import cyclops
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    AGE,
    SEX,
    TOTAL_COST,
    CITY,
    PROVINCE,
    COUNTRY,
    LANGUAGE,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE,
)
from cyclops.processors.constants import EMPTY_STRING
from cyclops.processors.admin import AdminProcessor
from cyclops.processors.vitals import VitalsProcessor
from cyclops.processors.feature_handler import FeatureHandler
from cyclops.orm import Database

%load_ext autoreload
%load_ext nb_black

2022-03-06 13:57:24,078 [1;37mINFO[0m config          - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-06 13:57:26,348 [1;37mINFO[0m cyclops.processors.base - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-06 13:57:26,354 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-06 13:57:26,359 [1;37mINFO[0m cyclops.processors.admin - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-06 13:57:26,372 [1;37mINFO[0m cyclops.processors.vitals - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-06 13:57:26,441 [1;37mINFO[0m cyclops.orm     - Log file is /mnt/nfs/home/krishnanam/log.log


<IPython.core.display.Javascript object>

## Load config, setup ORM

In [2]:
# Read the project configuration; the argument is a glob pattern covering the
# YAML files in the default config directory (relative to this notebook).
cfg = config.read_config("../configs/default/*.yaml")
# Build the ORM wrapper from that configuration. `db` is the handle every query
# cell below goes through (`db.public.<table>` and `db.run_query`).
db = Database(cfg)

2022-03-06 13:57:27,281 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


<IPython.core.display.Javascript object>

## Quick check to see counts/sparsity of a certain field on GEMINI

In [3]:
# Frequency table for a single column: one row per distinct value together with
# the COUNT of that column in the group, ordered from most to least frequent.
UNIQUE_VALUES = "unique_values"
UNIQUE_VALUE_COUNTS = "unique_value_counts"
unique_vals_field = db.public.ip_administrative.language  # column under inspection

value_col = unique_vals_field.label(UNIQUE_VALUES)
count_col = func.count(unique_vals_field).label(UNIQUE_VALUE_COUNTS)
query = (
    select(value_col, count_col)
    .group_by(UNIQUE_VALUES)
    .order_by(desc(UNIQUE_VALUE_COUNTS))
)

counts = db.run_query(query)
print(f"Num. of unique values: {len(counts)}")
# Only the top 100 values are printed; the full result stays in `counts`.
print(counts.head(100))

2022-03-06 13:57:27,435 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-06 13:57:27,438 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.137188 s


Num. of unique values: 206
         unique_values  unique_value_counts
0                                    156141
1              English                96756
2              ENGLISH                36205
3                  ENG                26434
4                Other                 6306
..                 ...                  ...
95      Spanish & Eng.                    7
96         Yugoslavian                    7
97                 GUJ                    6
98  Filipino (Ilocano)                    6
99             Ilocano                    6

[100 rows x 2 columns]


<IPython.core.display.Javascript object>

## Extract slices

In [4]:
# Slice definition shared by the query cells below.
# HOSPITAL is compared against ip_administrative.hospital_id.
HOSPITAL = "SMH"
# YEAR is compared against the year extracted from ip_administrative.admit_date_time.
YEAR = 2020
# Row caps passed to .limit() in the lab, pharmacy and vitals queries respectively.
LIMIT_LABS = 100000
LIMIT_MEDICATIONS = 100000
LIMIT_VITALS = 100000

<IPython.core.display.Javascript object>

## Query to get admin + diagnosis

In [5]:
# Encounters for the selected hospital and admit year, joined to their
# diagnosis rows on the encounter id (genc_id).
admin_tbl = db.public.ip_administrative
diagnosis_tbl = db.public.diagnosis

site_and_year = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract("year", admin_tbl.admit_date_time) == YEAR,
)
encounter_match = admin_tbl.genc_id == diagnosis_tbl.genc_id

query = (
    select(admin_tbl.x, diagnosis_tbl.x)
    .where(site_and_year)
    .join(diagnosis_tbl.x, encounter_match)
)

data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-06 13:57:28,073 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-06 13:57:28,076 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.594527 s


15840 rows fetched!


<IPython.core.display.Javascript object>

## Query to get admin + labs

In [6]:
# Encounters for the selected hospital and admit year, joined to their lab rows
# on the encounter id (genc_id). Rows whose mapped lab test name is empty are
# excluded; the comparison uses the shared EMPTY_STRING constant (as the vitals
# query does) instead of a bare "" literal so both filters stay in sync.
# NOTE(review): there is no ORDER BY, so which rows survive the LIMIT is up to
# the database — confirm that an arbitrary sample is acceptable here.
query = (
    select(db.public.ip_administrative.x, db.public.lab.x)
    .where(
        and_(
            db.public.ip_administrative.hospital_id == HOSPITAL,
            extract("year", db.public.ip_administrative.admit_date_time) == YEAR,
        )
    )
    .join(db.public.lab.x, db.public.ip_administrative.genc_id == db.public.lab.genc_id)
    .where(db.public.lab.lab_test_name_mapped != EMPTY_STRING)
    .limit(LIMIT_LABS)
)  # Use limit here, since lab queries can have millions of rows!
data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-06 13:57:45,816 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-06 13:57:45,820 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 17.718374 s


100000 rows fetched!


<IPython.core.display.Javascript object>

## Query to get admin + pharmacy

In [7]:
# Encounters for the selected hospital and admit year, joined to their pharmacy
# rows on the encounter id (genc_id).
admin_tbl = db.public.ip_administrative
pharmacy_tbl = db.public.pharmacy

site_and_year = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract("year", admin_tbl.admit_date_time) == YEAR,
)
encounter_match = admin_tbl.genc_id == pharmacy_tbl.genc_id

# Use limit here, since pharmacy queries can have millions of rows!
query = (
    select(admin_tbl.x, pharmacy_tbl.x)
    .where(site_and_year)
    .join(pharmacy_tbl.x, encounter_match)
    .limit(LIMIT_MEDICATIONS)
)
data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-06 13:57:49,235 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-06 13:57:49,238 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 3.376549 s


76421 rows fetched!


<IPython.core.display.Javascript object>

## Query to get admin + vitals

In [8]:
# HDF5 file that caches the extract so the processing cells can reload it
# without hitting the database again.
EXTRACT_SAVE_PATH = "/mnt/nfs/project/delirium/_extract/vitals/extract.h5"

admin_tbl = db.public.ip_administrative
vitals_tbl = db.public.vitals

# Every selected column is relabelled with the shared cyclops column-name
# constant, so the processors can look columns up by those names.
selected_columns = [
    admin_tbl.genc_id.label(ENCOUNTER_ID),
    admin_tbl.hospital_id.label(HOSPITAL_ID),
    admin_tbl.admit_date_time.label(ADMIT_TIMESTAMP),
    admin_tbl.discharge_date_time.label(DISCHARGE_TIMESTAMP),
    admin_tbl.age.label(AGE),
    admin_tbl.gender.label(SEX),
    admin_tbl.language.label(LANGUAGE),
    admin_tbl.total_cost.label(TOTAL_COST),
    vitals_tbl.measurement_mapped.label(VITAL_MEASUREMENT_NAME),
    vitals_tbl.measurement_value.label(VITAL_MEASUREMENT_VALUE),
    vitals_tbl.measure_date_time.label(VITAL_MEASUREMENT_TIMESTAMP),
    vitals_tbl.reference_range.label(REFERENCE_RANGE),
]
site_and_year = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract("year", admin_tbl.admit_date_time) == YEAR,
)

query = (
    select(*selected_columns)
    .where(site_and_year)
    .join(vitals_tbl.x, admin_tbl.genc_id == vitals_tbl.genc_id)
    # Skip vitals rows whose mapped measurement name is empty.
    .where(vitals_tbl.measurement_mapped != EMPTY_STRING)
    .limit(LIMIT_VITALS)
)
vitals_data = db.run_query(query)
print(f"{len(vitals_data)} rows fetched!")

# Store under a key that encodes the slice, so other slices can share the file.
extract_key = f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
vitals_data.to_hdf(EXTRACT_SAVE_PATH, key=extract_key)

2022-03-06 13:57:50,893 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-06 13:57:50,896 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 1.543095 s
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['hospital_id', 'sex', 'language', 'total_cost',
       'vital_measurement_name', 'vital_measurement_value', 'reference_range'],
      dtype='object')]

  pytables.to_hdf(


100000 rows fetched!


<IPython.core.display.Javascript object>

## Process Admin data

In [9]:
%autoreload 2

data = pd.read_hdf(
    EXTRACT_SAVE_PATH, key=f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
)
must_have_columns = [ENCOUNTER_ID, AGE, SEX]

admin_processor = AdminProcessor(data, must_have_columns)
admin_features = admin_processor.process()

feature_handler = FeatureHandler()
feature_handler.add_features(admin_features)

2022-03-06 13:58:01,240 [1;37mINFO[0m cyclops.processors.base - Processing raw admin data...
2022-03-06 13:58:01,244 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 639
2022-03-06 13:58:01,248 [1;37mINFO[0m cyclops.processors.admin - # admin features: 2, # encounters: 639
2022-03-06 13:58:01,603 [1;37mINFO[0m cyclops.utils.profile - Finished executing function process in 0.363192 s


<IPython.core.display.Javascript object>

## Merge back admin + vitals

## Process vitals data

In [10]:
%autoreload 2

data = pd.read_hdf(
    EXTRACT_SAVE_PATH, key=f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
)
must_have_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE,
]
feature_handler = FeatureHandler()
vitals_processor = VitalsProcessor(data, must_have_columns)
vitals_features = vitals_processor.process()
feature_handler.add_features(vitals_features)

print("Reference range unique values")
print(vitals_processor.data[REFERENCE_RANGE].unique())

2022-03-06 13:58:01,823 [1;37mINFO[0m cyclops.processors.base - Processing raw vitals data...
2022-03-06 13:58:01,827 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 639
2022-03-06 13:58:01,842 [1;37mINFO[0m cyclops.processors.base - Filtering vitals within aggregation window...
2022-03-06 13:58:01,845 [1;37mINFO[0m cyclops.processors.base - # samples: 15807, # encounters: 629
2022-03-06 13:58:01,870 [1;37mINFO[0m cyclops.processors.base - Drop oxygen flow rate, saturation samples...
2022-03-06 13:58:01,872 [1;37mINFO[0m cyclops.processors.base - # samples: 11954, # encounters: 629
2022-03-06 13:58:01,909 [1;37mINFO[0m cyclops.processors.base - Convert Positive/Negative to 1/0...
2022-03-06 13:58:01,911 [1;37mINFO[0m cyclops.processors.base - # samples: 11954, # encounters: 629
2022-03-06 13:58:01,915 [1;37mINFO[0m cyclops.processors.vitals - Converting string result values to numeric...
2022-03-06 13:58:01,921 [1;37mINFO[0m cyclops.processo

Reference range unique values
['NA-NA' '20-200' '20-220' '25-45' '0-40' '50-250' '20-500' '0-100']


<IPython.core.display.Javascript object>

## Data quality report (admin + vitals)

In [11]:
# Build an evidently data-quality dashboard over the processed features and
# write it to an HTML file named after the slice.
dashboard = Dashboard(tabs=[DataQualityTab()])
# ColumnMapping takes feature names as a list; `.columns` is presumably a
# pandas Index, so convert it to a plain list before handing it over (an Index
# cannot be truth-tested and does not match the declared parameter type).
column_mapping = ColumnMapping(numerical_features=list(vitals_features.columns))
# Only a reference dataset is supplied; there is no second dataset to compare
# against, hence current_data=None.
dashboard.calculate(
    reference_data=feature_handler.df, current_data=None, column_mapping=column_mapping
)
dashboard.save(f"vitals_processed_{HOSPITAL}_{YEAR}.html")

<IPython.core.display.Javascript object>