### This notebook shows examples of how to use the cyclops.processor API on GEMINI.

## Get all patient encounters (including ER data) at St. Michael's Hospital from March 1, 2020 to March 15, 2020, together with their diagnoses, vitals, labs, and interventions.

In [1]:
from cyclops.processors.column_names import AGE, DIAGNOSIS_CODE, ENCOUNTER_ID, SEX
from cyclops.processors.constants import SMH
from cyclops.processors.utils import gather_columns
from cyclops.query import gemini

# NOTE: everything below is commented out, so only the imports above execute
# when this cell runs. Uncommented, the code would:
#   1. build a patients query for SMH between 2020-03-01 and 2020-03-15 with
#      ER data included,
#   2. derive diagnoses, vitals, lab and intervention queries from it,
#   3. run each query and print the number of rows returned,
#   4. call `.save(".", <name>)` on each query (presumably writing the files
#      that the processing cell reads back -- confirm).

# patients = gemini.patients(
#     hospitals=[SMH], from_date="2020-03-01", to_date="2020-03-15", include_er_data=True
# )
# patients_diagnoses = gemini.diagnoses(patients=patients)
# patients_vitals = gemini.events(patients=patients, category="vitals")
# patients_labs = gemini.events(patients=patients, category="lab")
# patients_interventions = gemini.events(patients=patients, category="intervention")

# diagnoses_data = patients_diagnoses.run()
# vitals_data = patients_vitals.run()
# labs_data = patients_labs.run()
# interventions_data = patients_interventions.run()

# print(f"{len(diagnoses_data)} diagnoses rows extracted!")
# print(f"{len(vitals_data)} vitals rows extracted!")
# print(f"{len(labs_data)} labs rows extracted!")
# print(f"{len(interventions_data)} interventions rows extracted!")

# patients_diagnoses.save(".", "diagnoses")
# patients_vitals.save(".", "vitals")
# patients_labs.save(".", "lab")
# patients_interventions.save(".", "intervention")

2022-04-25 08:28:44,363 [1;37mINFO[0m cyclops.processors.utils - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:44,526 [1;37mINFO[0m cyclops.config  - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:44,619 [1;37mINFO[0m cyclops.query.utils - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:44,665 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:44,670 [1;37mINFO[0m cyclops.orm     - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:44,676 [1;37mINFO[0m cyclops.query.interface - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:45,278 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


## Process queried data into features (static + temporal).

In [2]:
import numpy as np
import pandas as pd

from cyclops.processor import Aggregator, Imputer, featurize

# Reload the four query result tables from parquet files in the working
# directory (presumably the files written by the `.save(...)` calls in the
# query cell -- confirm).
diagnoses_data, vitals_data, labs_data, interventions_data = (
    pd.read_parquet(path)
    for path in ("diagnoses.gzip", "vitals.gzip", "lab.gzip", "intervention.gzip")
)

# Gather the encounter id, age, sex and diagnosis code columns from the
# diagnoses table.
# NOTE(review): this frame is not passed to `featurize` below -- confirm
# whether the static features were meant to be included.
static_columns = [ENCOUNTER_ID, AGE, SEX, DIAGNOSIS_CODE]
static_diagnoses_data = gather_columns(diagnoses_data, static_columns)

# Build the feature handler from the labs and vitals tables.
imputer = Imputer(strategy="none")
aggregator = Aggregator(bucket_size=6, window=4)
feature_handler = featurize(
    temporal_data=[labs_data, vitals_data],
    imputer=imputer,
    aggregator=aggregator,
)

# Median and mean of the `duration_er_stay_derived` column.
er_stay_duration = diagnoses_data["duration_er_stay_derived"]
print(np.median(er_stay_duration), np.mean(er_stay_duration))

2022-04-25 08:28:45,322 [1;37mINFO[0m cyclops.processors.aggregate - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:45,329 [1;37mINFO[0m cyclops.processors.diagnoses - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:45,334 [1;37mINFO[0m cyclops.processors.events - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:46,007 [1;37mINFO[0m cyclops.processors.feature_handler - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 08:28:46,230 [1;37mINFO[0m cyclops.processors.utils - Cleaning raw event data...
2022-04-25 08:28:46,235 [1;37mINFO[0m cyclops.processors.utils - # samples: 14899, # encounters: 150
2022-04-25 08:28:46,237 [1;37mINFO[0m cyclops.processors.utils - # columns: 59, # encounters: 150
2022-04-25 08:28:46,262 [1;37mINFO[0m cyclops.processors.utils - Drop unsupported events...
2022-04-25 08:28:46,264 [1;37mINFO[0m cyclops.processors.utils - # samples: 

16.3 19.34077708604869


## Inspect the feature handler and check its features.

In [3]:
# Print the numerical feature names first, then the categorical ones,
# one list per line.
for list_feature_names in (
    feature_handler.get_numerical_features,
    feature_handler.get_categorical_features,
):
    print(list_feature_names())

# Uncomment to inspect the handler's underlying `features` attribute:
# feature_handler.features

['sodium', 'urine sodium', 'urine specific gravity', 'bicarbonate', 'tsh', 'blood urea nitrogen', 'vitamin b12', 'white blood cell count', 'high sensitivity troponin', 'lymphocyte', 'neutrophils', 'albumin', 'alp', 'alt', 'aptt', 'ast', 'bilirubin', 'urinalysis', 'calcium', 'creatinine', 'd-dimer', 'esr', 'serum alcohol', 'ferritin', 'fibrinogen', 'glucose point of care', 'glucose random', 'hematocrit', 'hemoglobin', 'crp', 'inr', 'ketone', 'lactate venous', 'ldh', 'mean cell volume', 'serum osmolality', 'urine osmolality', 'venous pco2', 'arterial ph', 'venous ph', 'platelet count', 'arterial pao2', 'potassium', 'pt', 'diastolic bp', 'oxygen saturation', 'heart rate', 'respiratory rate', 'systolic bp', 'temperature']
['vitamin d', 'hba1c', 'calcium, ionized', 'lactate arterial', 'arterial paco2', 'oxygen flow rate']


## Look at data quality report (patient statics + labs + vitals)

In [4]:
# NOTE: this cell is fully commented out and does nothing when run.
# Uncommented, it would build an evidently `Dashboard` with a `DataQualityTab`,
# map the handler's numerical and categorical feature names into a
# `ColumnMapping`, calculate the dashboard with `feature_handler.features` as
# the reference data (no current data), and save it as "data_processed.html".

# from evidently import ColumnMapping
# from evidently.dashboard import Dashboard
# from evidently.dashboard.tabs import DataQualityTab

# dashboard = Dashboard(tabs=[DataQualityTab()])
# column_mapping = ColumnMapping(
#     numerical_features=feature_handler.get_numerical_features(),
#     categorical_features=feature_handler.get_categorical_features(),
# )
# dashboard.calculate(
#     reference_data=feature_handler.features,
#     current_data=None,
#     column_mapping=column_mapping,
# )
# dashboard.save(f"data_processed.html")