### This notebook shows examples of how to use the cyclops.processor API on GEMINI.

## Get all patient encounters (including ER data) at St. Michael's Hospital from March 1, 2020 to March 15, 2020, along with their diagnoses, vitals, labs, and interventions.

In [1]:
from cyclops.processors.column_names import AGE, DIAGNOSIS_CODE, ENCOUNTER_ID, SEX
from cyclops.processors.constants import SMH
from cyclops.processors.utils import gather_columns
from cyclops.query import gemini

# patients = gemini.patients(
#     hospitals=[SMH], from_date="2020-03-01", to_date="2020-03-15", include_er_data=True
# )
# patients_diagnoses = gemini.diagnoses(patients=patients)
# patients_vitals = gemini.events(patients=patients, category="vitals")
# patients_labs = gemini.events(patients=patients, category="lab")
# patients_interventions = gemini.events(patients=patients, category="intervention")

# diagnoses_data = patients_diagnoses.run()
# vitals_data = patients_vitals.run()
# labs_data = patients_labs.run()
# interventions_data = patients_interventions.run()

# print(f"{len(diagnoses_data)} diagnoses rows extracted!")
# print(f"{len(vitals_data)} vitals rows extracted!")
# print(f"{len(labs_data)} labs rows extracted!")
# print(f"{len(interventions_data)} interventions rows extracted!")

# patients_diagnoses.save(".", "diagnoses")
# patients_vitals.save(".", "vitals")
# patients_labs.save(".", "lab")
# patients_interventions.save(".", "intervention")

2022-04-25 06:23:05,412 [1;37mINFO[0m cyclops.processors.utils - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:05,558 [1;37mINFO[0m cyclops.config  - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:05,632 [1;37mINFO[0m cyclops.query.utils - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:05,638 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:05,643 [1;37mINFO[0m cyclops.orm     - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:05,648 [1;37mINFO[0m cyclops.query.interface - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:06,212 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


## Process queried data into features (static + temporal).

In [2]:
import pandas as pd

from cyclops.processor import Aggregator, Imputer, featurize

# Load the saved query results from parquet files in the working directory
# (presumably the files written by the commented-out `save` calls in the
# query cell -- TODO confirm).
labs_data = pd.read_parquet("lab.gzip")
vitals_data = pd.read_parquet("vitals.gzip")
interventions_data = pd.read_parquet("intervention.gzip")

# Run the diagnoses table through gather_columns with the encounter ID, age,
# sex and diagnosis code column-name constants.
static_columns = [ENCOUNTER_ID, AGE, SEX, DIAGNOSIS_CODE]
diagnoses_data = gather_columns(pd.read_parquet("diagnoses.gzip"), static_columns)

# NOTE(review): only labs and vitals are handed to featurize below;
# diagnoses_data and interventions_data are prepared above but are not part
# of this call -- confirm whether that is intended.
temporal_tables = [labs_data, vitals_data]
feature_handler = featurize(
    temporal_data=temporal_tables,
    imputer=Imputer(strategy="none"),
    aggregator=Aggregator(bucket_size=12),
)

2022-04-25 06:23:06,244 [1;37mINFO[0m cyclops.processors.aggregate - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:06,253 [1;37mINFO[0m cyclops.processors.diagnoses - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:06,270 [1;37mINFO[0m cyclops.processors.events - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:06,856 [1;37mINFO[0m cyclops.processors.feature_handler - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-04-25 06:23:07,040 [1;37mINFO[0m cyclops.processors.utils - Cleaning raw event data...
2022-04-25 06:23:07,045 [1;37mINFO[0m cyclops.processors.utils - # samples: 14899, # encounters: 150
2022-04-25 06:23:07,047 [1;37mINFO[0m cyclops.processors.utils - # columns: 59, # encounters: 150
2022-04-25 06:23:07,066 [1;37mINFO[0m cyclops.processors.utils - Drop unsupported events...
2022-04-25 06:23:07,070 [1;37mINFO[0m cyclops.processors.utils - # samples: 

0 <class 'pandas.core.indexes.multi.MultiIndex'>


2022-04-25 06:23:07,830 [1;37mINFO[0m cyclops.processors.utils - Fixing inequalities and removing outlier values...
2022-04-25 06:23:07,833 [1;37mINFO[0m cyclops.processors.utils - # samples: 20244, # encounters: 150
2022-04-25 06:23:07,835 [1;37mINFO[0m cyclops.processors.utils - # columns: 58, # encounters: 150
2022-04-25 06:23:07,891 [1;37mINFO[0m cyclops.processors.utils - Convert Positive/Negative to 1/0...
2022-04-25 06:23:07,894 [1;37mINFO[0m cyclops.processors.utils - # samples: 20244, # encounters: 150
2022-04-25 06:23:07,896 [1;37mINFO[0m cyclops.processors.utils - # columns: 58, # encounters: 150
2022-04-25 06:23:07,901 [1;37mINFO[0m cyclops.processors.utils - Fill empty result string values with NaN...
2022-04-25 06:23:07,904 [1;37mINFO[0m cyclops.processors.utils - # samples: 20244, # encounters: 150
2022-04-25 06:23:07,906 [1;37mINFO[0m cyclops.processors.utils - # columns: 58, # encounters: 150
2022-04-25 06:23:07,910 [1;37mINFO[0m cyclops.processors.

13 <class 'pandas.core.indexes.multi.MultiIndex'>


## Inspect the feature handler and check the features.

In [5]:
# Print the feature names the handler reports as numerical, then those it
# reports as categorical (one list per line, in that order).
numerical_features = feature_handler.get_numerical_features()
print(numerical_features)

categorical_features = feature_handler.get_categorical_features()
print(categorical_features)

['sodium', 'urine specific gravity', 'bicarbonate', 'tsh', 'vitamin b12', 'white blood cell count', 'high sensitivity troponin', 'lymphocyte', 'neutrophils', 'albumin', 'alp', 'alt', 'aptt', 'ast', 'bilirubin', 'urinalysis', 'calcium', 'creatinine', 'ferritin', 'glucose point of care', 'glucose random', 'hematocrit', 'hemoglobin', 'inr', 'lactate venous', 'ldh', 'mean cell volume', 'venous pco2', 'venous ph', 'platelet count', 'potassium', 'pt', 'oxygen saturation', 'heart rate', 'respiratory rate', 'temperature', 'systolic bp', 'diastolic bp', 'weight']
['blood urea nitrogen', 'calcium, ionized', 'd-dimer', 'serum alcohol', 'crp', 'ketone', 'lactate arterial', 'arterial paco2', 'arterial ph', 'arterial pao2']


## Look at the data quality report (patient static features + labs + vitals)

In [4]:
# from evidently import ColumnMapping
# from evidently.dashboard import Dashboard
# from evidently.dashboard.tabs import DataQualityTab

# dashboard = Dashboard(tabs=[DataQualityTab()])
# column_mapping = ColumnMapping(
#     numerical_features=feature_handler.get_numerical_features(),
#     categorical_features=feature_handler.get_categorical_features(),
# )
# dashboard.calculate(
#     reference_data=feature_handler.features,
#     current_data=None,
#     column_mapping=column_mapping,
# )
# dashboard.save(f"data_processed.html")