## Imports

In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import select, extract
from sqlalchemy.sql.expression import and_

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

import config
import cyclops
from cyclops.processors.column_names import (
    ENCOUNTER_ID,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
    DISCHARGE_TIMESTAMP,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE
)
from cyclops.processors.constants import EMPTY_STRING
from cyclops.processors.vitals import VitalsProcessor
from cyclops.processors.feature_handler import FeatureHandler
from cyclops.orm import Database

%load_ext autoreload

2022-03-02 17:22:25,336 [1;37mINFO[0m config          - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-02 17:22:25,348 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-02 17:22:25,353 [1;37mINFO[0m cyclops.processors.vitals - Log file is /mnt/nfs/home/krishnanam/log.log
2022-03-02 17:22:25,424 [1;37mINFO[0m cyclops.orm     - Log file is /mnt/nfs/home/krishnanam/log.log


## Load config and set up the ORM

In [2]:
# Glob handed to config.read_config; it targets the YAML files in the
# default config directory (relative to this notebook's location).
config_glob = "../configs/default/*.yaml"

cfg = config.read_config(config_glob)

# ORM handle used by every query cell in this notebook.
db = Database(cfg)

2022-03-02 17:22:26,097 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


## Extract slices

In [3]:
# Slice selection: the queries filter on this hospital id and on the year of
# the admission timestamp.
HOSPITAL = "SMH"
YEAR = 2020

# Row caps passed to .limit() on the lab, pharmacy and vitals queries.
LIMIT_LABS = LIMIT_MEDICATIONS = LIMIT_VITALS = 100_000

## Query to get admin + diagnosis

In [4]:
# Short aliases for the two tables involved in this query.
admin = db.public.ip_administrative
diagnosis = db.public.diagnosis

# Restrict to encounters at HOSPITAL whose admission year is YEAR.
in_slice = and_(
    admin.hospital_id == HOSPITAL,
    extract('year', admin.admit_date_time) == YEAR,
)

# Administrative rows joined to diagnosis rows on the encounter key (genc_id).
query = (
    select(admin.x, diagnosis.x)
    .where(in_slice)
    .join(diagnosis.x, admin.genc_id == diagnosis.genc_id)
)

data = db.run_query(query)
print(f"{len(data)} rows fetched!")

2022-03-02 17:22:27,187 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-03-02 17:22:27,190 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 1.0613429546356201 s


15840 rows fetched!


## Query to get admin + labs

In [None]:
# Short aliases for the two tables involved in this query.
admin = db.public.ip_administrative
lab = db.public.lab

# Restrict to encounters at HOSPITAL whose admission year is YEAR.
in_slice = and_(
    admin.hospital_id == HOSPITAL,
    extract('year', admin.admit_date_time) == YEAR,
)

# Administrative rows joined to lab rows on the encounter key (genc_id),
# dropping lab rows whose mapped test name is the empty string.
query = (
    select(admin.x, lab.x)
    .where(in_slice)
    .join(lab.x, admin.genc_id == lab.genc_id)
    .where(lab.lab_test_name_mapped != '')
    # Use limit here, since lab queries can have millions of rows!
    .limit(LIMIT_LABS)
)

data = db.run_query(query)
print(f"{len(data)} rows fetched!")

## Query to get admin + pharmacy

In [None]:
# Short aliases for the two tables involved in this query.
admin = db.public.ip_administrative
pharmacy = db.public.pharmacy

# Restrict to encounters at HOSPITAL whose admission year is YEAR.
in_slice = and_(
    admin.hospital_id == HOSPITAL,
    extract('year', admin.admit_date_time) == YEAR,
)

# Administrative rows joined to pharmacy rows on the encounter key (genc_id).
query = (
    select(admin.x, pharmacy.x)
    .where(in_slice)
    .join(pharmacy.x, admin.genc_id == pharmacy.genc_id)
    # Use limit here, since pharmacy queries can have millions of rows!
    .limit(LIMIT_MEDICATIONS)
)

data = db.run_query(query)
print(f"{len(data)} rows fetched!")

## Query to get admin + vitals

In [None]:
# Short aliases for the two tables involved in this query.
admin = db.public.ip_administrative
vitals = db.public.vitals

# Project only the columns needed downstream, relabelled with the shared
# column-name constants imported from cyclops.processors.column_names.
selected_columns = [
    admin.genc_id.label(ENCOUNTER_ID),
    admin.hospital_id.label(HOSPITAL_ID),
    admin.admit_date_time.label(ADMIT_TIMESTAMP),
    admin.discharge_date_time.label(DISCHARGE_TIMESTAMP),
    vitals.measurement_mapped.label(VITAL_MEASUREMENT_NAME),
    vitals.measurement_value.label(VITAL_MEASUREMENT_VALUE),
    vitals.measure_date_time.label(VITAL_MEASUREMENT_TIMESTAMP),
    vitals.reference_range.label(REFERENCE_RANGE),
]

# Restrict to encounters at HOSPITAL whose admission year is YEAR.
in_slice = and_(
    admin.hospital_id == HOSPITAL,
    extract('year', admin.admit_date_time) == YEAR,
)

# Join vitals on the encounter key, skip rows whose mapped measurement name
# equals EMPTY_STRING, and cap the number of rows returned.
query = (
    select(*selected_columns)
    .where(in_slice)
    .join(vitals.x, admin.genc_id == vitals.genc_id)
    .where(vitals.measurement_mapped != EMPTY_STRING)
    .limit(LIMIT_VITALS)
)

vitals_data = db.run_query(query)
print(f"{len(vitals_data)} rows fetched!")

# Persist the extract under a key that encodes the hospital and year, so the
# processing cell can reload it without re-running the query.
hdf_key = f"query_gemini_vitals_{HOSPITAL}_{YEAR}"
vitals_data.to_hdf("/mnt/nfs/project/delirium/_extract/vitals/extract.h5", key=hdf_key)

## Process vitals data

In [None]:
%autoreload 2

# Reload the vitals extract that the query cell stored under the
# hospital/year-specific key.
hdf_key = f"query_gemini_vitals_{HOSPITAL}_{YEAR}"
data = pd.read_hdf("/mnt/nfs/project/delirium/_extract/vitals/extract.h5", key=hdf_key)

# Column names passed to VitalsProcessor as its second argument.
must_have_columns = [
    ENCOUNTER_ID,
    ADMIT_TIMESTAMP,
    VITAL_MEASUREMENT_NAME,
    VITAL_MEASUREMENT_VALUE,
    VITAL_MEASUREMENT_TIMESTAMP,
    REFERENCE_RANGE,
]

feature_handler = FeatureHandler()
vitals_processor = VitalsProcessor(data, must_have_columns)

# Run the processor and register its output with the feature handler.
vitals_features = vitals_processor.process()
feature_handler.add_features(vitals_features)

# Show which distinct reference-range values the processor's data holds.
print("Reference range unique values")
reference_ranges = vitals_processor.data[REFERENCE_RANGE].unique()
print(reference_ranges)

## Data quality report (vitals)

In [None]:
# Build an evidently dashboard containing only the data-quality tab.
dashboard = Dashboard(tabs=[DataQualityTab()])

# ColumnMapping declares numerical_features as a list of column names.
# vitals_features.columns is a pandas Index, so convert it explicitly; this
# avoids ambiguous-truth-value or element-wise `+` surprises if evidently
# treats the field as a plain list.
# NOTE(review): assumes every column of vitals_features is numeric and is
# present in feature_handler.df — confirm.
numerical_columns = list(vitals_features.columns)
column_mapping = ColumnMapping(numerical_features=numerical_columns)

# Only a reference dataset is supplied; current_data is deliberately None.
dashboard.calculate(
    reference_data=feature_handler.df,
    current_data=None,
    column_mapping=column_mapping,
)

# Write the report next to the notebook, named after the hospital and year.
dashboard.save(f"vitals_processed_{HOSPITAL}_{YEAR}.html")