## Imports

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import select, func, extract, desc
from sqlalchemy.sql.expression import and_

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

import config
import cyclops
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    AGE,
    SEX,
    TOTAL_COST,
    CITY,
    PROVINCE,
    COUNTRY,
    LANGUAGE,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE
)
from cyclops.processors.constants import EMPTY_STRING
from cyclops.processors.admin import AdminProcessor
from cyclops.processors.vitals import VitalsProcessor
from cyclops.processors.feature_handler import FeatureHandler
from cyclops.orm import Database

%load_ext autoreload

2022-03-04 21:24:26,163 [1;37mINFO[0m config          - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-04 21:24:26,171 [1;37mINFO[0m cyclops.processors.base - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-04 21:24:26,177 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-04 21:24:26,182 [1;37mINFO[0m cyclops.processors.admin - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-04 21:24:26,187 [1;37mINFO[0m cyclops.processors.vitals - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-04 21:24:26,267 [1;37mINFO[0m cyclops.orm     - Log file is /mnt/nfs/home/krishnanam/log.log


## Load config, setup ORM

In [2]:
# Read the default YAML configuration files, then build the database handle
# that every query cell below runs against.
config_glob = "../configs/default/*.yaml"
cfg = config.read_config(config_glob)
db = Database(cfg)

2022-03-04 21:24:26,977 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


## Quick check to see counts/sparsity of a certain field on GEMINI

In [3]:
# Frequency table for a single column: one row per distinct value, ordered
# from most to least frequent. Point `unique_vals_field` at another column
# to profile that column instead.
UNIQUE_VALUES = 'unique_values'
UNIQUE_VALUE_COUNTS = 'unique_value_counts'
unique_vals_field = db.public.ip_administrative.language

# Count rows with COUNT(*) rather than COUNT(column): COUNT(column) skips
# NULLs, so a NULL group would be reported with a count of 0 and missing
# values would be invisible in a sparsity check. Counts for non-NULL values
# are identical either way.
query = select(
    unique_vals_field.label(UNIQUE_VALUES),
    func.count().label(UNIQUE_VALUE_COUNTS),
)
query = query.group_by(UNIQUE_VALUES).order_by(desc(UNIQUE_VALUE_COUNTS))

counts = db.run_query(query)
print(f"Num. of unique values: {len(counts)}")
# Cap the printout so a high-cardinality column does not flood the output.
print(counts.head(100))

2022-03-04 21:24:27,126 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-04 21:24:27,128 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.13529729843139648 s


Num. of unique values: 206
         unique_values  unique_value_counts
0                                    156141
1              English                96756
2              ENGLISH                36205
3                  ENG                26434
4                Other                 6306
..                 ...                  ...
95      Spanish & Eng.                    7
96         Yugoslavian                    7
97                 GUJ                    6
98  Filipino (Ilocano)                    6
99             Ilocano                    6

[100 rows x 2 columns]


## Extract slices

In [4]:
# Slice definition shared by the extraction queries below.
HOSPITAL = "SMH"  # compared against ip_administrative.hospital_id
YEAR = 2020  # compared against the year of admit_date_time

# Row caps for the lab, pharmacy and vitals queries (currently all equal).
LIMIT_LABS = LIMIT_MEDICATIONS = LIMIT_VITALS = 100_000

## Query to get admin + diagnosis

In [5]:
# Admin rows for HOSPITAL admitted in YEAR, joined to the diagnosis table on
# the shared genc_id key.
admin_tbl = db.public.ip_administrative
diagnosis_tbl = db.public.diagnosis

in_slice = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract('year', admin_tbl.admit_date_time) == YEAR,
)
query = (
    select(admin_tbl.x, diagnosis_tbl.x)
    .where(in_slice)
    .join(diagnosis_tbl.x, admin_tbl.genc_id == diagnosis_tbl.genc_id)
)

data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-04 21:24:27,753 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-04 21:24:27,756 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.5941524505615234 s


15840 rows fetched!


## Query to get admin + labs

In [6]:
# Admin rows for HOSPITAL admitted in YEAR, joined to the lab table on
# genc_id, keeping only labs whose mapped test name is not the empty string.
admin_tbl = db.public.ip_administrative
lab_tbl = db.public.lab

in_slice = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract('year', admin_tbl.admit_date_time) == YEAR,
)
query = (
    select(admin_tbl.x, lab_tbl.x)
    .where(in_slice)
    .join(lab_tbl.x, admin_tbl.genc_id == lab_tbl.genc_id)
    .where(lab_tbl.lab_test_name_mapped != '')
    # Cap the result: lab queries can return millions of rows.
    .limit(LIMIT_LABS)
)
data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-04 21:25:17,331 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-04 21:25:17,335 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 49.564553022384644 s


100000 rows fetched!


## Query to get admin + pharmacy

In [7]:
# Admin rows for HOSPITAL admitted in YEAR, joined to the pharmacy table on
# the shared genc_id key.
admin_tbl = db.public.ip_administrative
pharmacy_tbl = db.public.pharmacy

in_slice = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract('year', admin_tbl.admit_date_time) == YEAR,
)
query = (
    select(admin_tbl.x, pharmacy_tbl.x)
    .where(in_slice)
    .join(pharmacy_tbl.x, admin_tbl.genc_id == pharmacy_tbl.genc_id)
    # Cap the result: pharmacy queries can return millions of rows.
    .limit(LIMIT_MEDICATIONS)
)
data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-04 21:25:20,761 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-04 21:25:20,764 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 3.401991605758667 s


76421 rows fetched!


## Query to get admin + vitals

In [8]:
# HDF5 file the extract is written to; the processing cells read it back.
EXTRACT_SAVE_PATH = "/mnt/nfs/project/delirium/_extract/vitals/extract.h5"

admin_tbl = db.public.ip_administrative
vitals_tbl = db.public.vitals

# Columns to fetch, relabelled with the shared cyclops column-name constants.
selected_columns = [
    admin_tbl.genc_id.label(ENCOUNTER_ID),
    admin_tbl.hospital_id.label(HOSPITAL_ID),
    admin_tbl.admit_date_time.label(ADMIT_TIMESTAMP),
    admin_tbl.discharge_date_time.label(DISCHARGE_TIMESTAMP),
    admin_tbl.age.label(AGE),
    admin_tbl.gender.label(SEX),
    admin_tbl.language.label(LANGUAGE),
    admin_tbl.total_cost.label(TOTAL_COST),
    vitals_tbl.measurement_mapped.label(VITAL_MEASUREMENT_NAME),
    vitals_tbl.measurement_value.label(VITAL_MEASUREMENT_VALUE),
    vitals_tbl.measure_date_time.label(VITAL_MEASUREMENT_TIMESTAMP),
    vitals_tbl.reference_range.label(REFERENCE_RANGE),
]
in_slice = and_(
    admin_tbl.hospital_id == HOSPITAL,
    extract('year', admin_tbl.admit_date_time) == YEAR,
)
query = (
    select(*selected_columns)
    .where(in_slice)
    .join(vitals_tbl.x, admin_tbl.genc_id == vitals_tbl.genc_id)
    # Keep only vitals whose mapped measurement name is not EMPTY_STRING.
    .where(vitals_tbl.measurement_mapped != EMPTY_STRING)
    .limit(LIMIT_VITALS)
)
vitals_data = db.run_query(query)
print(f"{len(vitals_data)} rows fetched!")

# Store the extract under a key that encodes the hospital/year slice.
hdf_key = f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
vitals_data.to_hdf(EXTRACT_SAVE_PATH, key=hdf_key)

2022-03-04 21:25:22,099 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-04 21:25:22,102 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 1.23238205909729 s
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['hospital_id', 'sex', 'language', 'total_cost',
       'vital_measurement_name', 'vital_measurement_value', 'reference_range'],
      dtype='object')]

  pytables.to_hdf(


100000 rows fetched!


## Process Admin data

In [9]:
%autoreload 2

data = pd.read_hdf(
    EXTRACT_SAVE_PATH,
    key=f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
)
must_have_columns = [
    ENCOUNTER_ID,
    AGE,
    SEX
]

admin_processor = AdminProcessor(data, must_have_columns)
admin_features = admin_processor.process()

feature_handler = FeatureHandler()
feature_handler.add_features(admin_features)
print(admin_features)

2022-03-04 21:25:22,895 [1;37mINFO[0m cyclops.processors.base - Processing raw admin data...
2022-03-04 21:25:22,899 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 583
2022-03-04 21:25:22,902 [1;37mINFO[0m cyclops.processors.admin - # admin features: 2, # encounters: 583
2022-03-04 21:25:23,231 [1;37mINFO[0m cyclops.utils.profile - Finished executing function process in 0.33580493927001953 s


          age sex
11250661   79   M
11250769   73   M
11250994   88   F
11251071   61   M
11251119   51   F
...       ...  ..
11494456   62   M
11494588   75   F
11494627   89   F
11494762   74   M
11495242   54   M

[583 rows x 2 columns]


## Inspect scaled admin features

In [10]:
# Show the scaled view of the features currently held by the handler.
scaled_features = feature_handler.df_scaled
print(scaled_features)

          age  sex
0    0.857230    1
1    0.528293    1
2    1.350635    0
3   -0.129581    1
4   -0.677810    0
..        ...  ...
578 -0.074758    1
579  0.637938    0
580  1.405458    0
581  0.583116    1
582 -0.513341    1

[583 rows x 2 columns]


## Process vitals data

In [11]:
%autoreload 2

data = pd.read_hdf(
    EXTRACT_SAVE_PATH,
    key=f"query_gemini_admin_vitals_{HOSPITAL}_{YEAR}"
)
must_have_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE
]
feature_handler = FeatureHandler()
vitals_processor = VitalsProcessor(data, must_have_columns)
vitals_features = vitals_processor.process()
feature_handler.add_features(vitals_features)

print("Reference range unique values")
print(vitals_processor.data[REFERENCE_RANGE].unique())

2022-03-04 21:25:23,552 [1;37mINFO[0m cyclops.processors.base - Processing raw vitals data...
2022-03-04 21:25:23,556 [1;37mINFO[0m cyclops.processors.base - # samples: 100000, # encounters: 583
2022-03-04 21:25:23,578 [1;37mINFO[0m cyclops.processors.base - Filtering vitals within aggregation window...
2022-03-04 21:25:23,581 [1;37mINFO[0m cyclops.processors.base - # samples: 14429, # encounters: 573
2022-03-04 21:25:23,603 [1;37mINFO[0m cyclops.processors.base - Drop oxygen flow rate, saturation samples...
2022-03-04 21:25:23,605 [1;37mINFO[0m cyclops.processors.base - # samples: 10876, # encounters: 573
2022-03-04 21:25:23,639 [1;37mINFO[0m cyclops.processors.base - Convert Positive/Negative to 1/0...
2022-03-04 21:25:23,641 [1;37mINFO[0m cyclops.processors.base - # samples: 10876, # encounters: 573
2022-03-04 21:25:23,645 [1;37mINFO[0m cyclops.processors.vitals - Converting string result values to numeric...
2022-03-04 21:25:23,651 [1;37mINFO[0m cyclops.processo

Reference range unique values
['20-220' '0-40' '50-250' '25-45' '20-500' '20-200' 'NA-NA' '0-100']


## Data quality report (processed vitals features)

In [12]:
# Build an evidently data-quality dashboard over the features held by the
# feature handler and save it to an HTML file named after the slice.
dashboard = Dashboard(tabs=[DataQualityTab()])

# ColumnMapping expects `numerical_features` to be a list of column names.
# Pass a plain list rather than the pandas Index: an Index has no unambiguous
# truth value, so any `if features` / `features or []` check on it would raise.
column_mapping = ColumnMapping(
    numerical_features=list(vitals_features.columns)
)

# Only a reference dataset is supplied; there is no current dataset to
# compare against.
dashboard.calculate(reference_data=feature_handler.df,
                    current_data=None,
                    column_mapping=column_mapping)
dashboard.save(f"vitals_processed_{HOSPITAL}_{YEAR}.html")