# This notebook shows examples of how to use the `cyclops.processor` API on GEMINI.

## Get all patient encounters (including ER data) at St. Michael's Hospital from March 1, 2020 to March 15, 2020, together with their diagnoses, vitals, labs, and interventions.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd

from cyclops.feature_handler import FeatureHandler
from cyclops.processor import featurize
from cyclops.processors.aggregate import Aggregator
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    ENCOUNTER_ID,
    HOSPITAL_ID,
    SEX,
)
from cyclops.processors.constants import SMH
from cyclops.processors.impute import Imputer
from cyclops.processors.utils import gather_columns
from cyclops.query import gemini

# Cohort query: encounters at SMH admitted between 2020-03-01 and 2020-03-15,
# with ER data included.
patients = gemini.patients(
    hospitals=[SMH],
    from_date="2020-03-01",
    to_date="2020-03-15",
    include_er_data=True,
)

# One query object per data source, each built on top of the cohort query.
# The three event queries differ only in their category.
patients_diagnoses = gemini.diagnoses(patients=patients)
patients_vitals, patients_labs, patients_interventions = (
    gemini.events(patients=patients, category=event_category)
    for event_category in ("vitals", "lab", "intervention")
)

# Execute the queries in a fixed order: diagnoses, vitals, labs, interventions.
diagnoses_data = patients_diagnoses.run()
vitals_data = patients_vitals.run()
labs_data = patients_labs.run()
interventions_data = patients_interventions.run()

# Report how many rows each query returned (one line per data source).
print(
    f"{len(diagnoses_data)} diagnoses rows extracted!",
    f"{len(vitals_data)} vitals rows extracted!",
    f"{len(labs_data)} labs rows extracted!",
    f"{len(interventions_data)} interventions rows extracted!",
    sep="\n",
)

# Persist every result in the current directory; the second argument is the
# file name stem (the captured log shows e.g. "./intervention.gzip").
patients_diagnoses.save(".", "diagnoses")
patients_vitals.save(".", "vitals")
patients_labs.save(".", "lab")
patients_interventions.save(".", "intervention")

2022-05-03 23:58:56,355 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!
2022-05-03 23:58:57,180 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-03 23:58:57,183 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.806349 s
2022-05-03 23:58:59,763 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-03 23:58:59,766 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.581149 s
2022-05-03 23:59:37,124 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-03 23:59:37,129 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 37.360674 s
2022-05-03 23:59:37,476 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-03 23:59:37,479 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 0.345615 s
2022-05-03 23:59:37,481 [1;37mINFO[0m cyclops.query.interface - Saving queried data to ./

1068 diagnoses rows extracted!
20244 vitals rows extracted!
14899 labs rows extracted!
350 interventions rows extracted!


2022-05-03 23:59:37,705 [1;37mINFO[0m cyclops.query.interface - Saving queried data to ./intervention.gzip


## Process queried data into features (static + temporal).

In [2]:
# Reload the saved query results from the working directory, so this cell can
# run without hitting the database again.
diagnoses_data = pd.read_parquet("diagnoses.gzip")
vitals_data = pd.read_parquet("vitals.gzip")
labs_data = pd.read_parquet("lab.gzip")
interventions_data = pd.read_parquet("intervention.gzip")

# Static inputs: keep only these columns of the diagnoses table.
static_columns = [
    ENCOUNTER_ID,
    AGE,
    SEX,
    DIAGNOSIS_CODE,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
]
static_diagnoses_data = gather_columns(diagnoses_data, static_columns)

# Temporal aggregation settings.
# NOTE(review): units of bucket_size/window are not visible here (presumably
# hours) -- confirm against the Aggregator documentation.
aggregator = Aggregator(bucket_size=6, window=120)

# Median imputation for both feature groups. The captured log shows features
# being dropped when their missingness exceeds the threshold; the encounter
# threshold presumably works the same way per encounter -- TODO confirm.
static_imputer = Imputer(
    strategy="median",
    encounter_missingness_threshold=0.25,
    feature_missingness_threshold=0.5,
)
temporal_imputer = Imputer(
    strategy="median",
    encounter_missingness_threshold=0.95,
    feature_missingness_threshold=0.75,
)

# Build the feature handler from the static table plus labs and vitals.
# interventions_data is loaded above but is not passed to featurize here.
feature_handler = featurize(
    static_data=[static_diagnoses_data],
    temporal_data=[labs_data, vitals_data],
    aggregator=aggregator,
    static_imputer=static_imputer,
    temporal_imputer=temporal_imputer,
    reference_cols=[HOSPITAL_ID, ADMIT_TIMESTAMP],
)

2022-05-03 23:59:55,438 [1;37mINFO[0m cyclops.processors.utils - Processing raw diagnosis codes...
2022-05-03 23:59:55,441 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-03 23:59:55,485 [1;37mINFO[0m cyclops.processors.utils - Grouping ICD codes to trajectories...
2022-05-03 23:59:55,487 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-03 23:59:55,489 [1;37mINFO[0m cyclops.processors.diagnoses - # diagnosis features: 19, # encounters: 150
2022-05-03 23:59:55,574 [1;37mINFO[0m cyclops.utils.profile - Finished executing function group_diagnosis_codes_to_trajectories in 0.136107 s
2022-05-03 23:59:55,598 [1;37mINFO[0m cyclops.processors.utils - Gathering static features...
2022-05-03 23:59:55,601 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-03 23:59:55,603 [1;37mINFO[0m cyclops.processors.utils - # columns: 5, # encounters: 150
2022-05-03 23:59:55,777 [1;37mINF

2022-05-04 00:00:18,897 [1;37mINFO[0m cyclops.processors.impute - Dropping vitamin b12 feature, missingness is higher than threshold!
2022-05-04 00:00:18,900 [1;37mINFO[0m cyclops.processors.impute - Dropping white blood cell count feature, missingness is higher than threshold!
2022-05-04 00:00:18,903 [1;37mINFO[0m cyclops.processors.impute - Dropping high sensitivity troponin feature, missingness is higher than threshold!
2022-05-04 00:00:18,906 [1;37mINFO[0m cyclops.processors.impute - Dropping vitamin d feature, missingness is higher than threshold!
2022-05-04 00:00:18,909 [1;37mINFO[0m cyclops.processors.impute - Dropping hba1c feature, missingness is higher than threshold!
2022-05-04 00:00:18,912 [1;37mINFO[0m cyclops.processors.impute - Dropping lymphocyte feature, missingness is higher than threshold!
2022-05-04 00:00:18,915 [1;37mINFO[0m cyclops.processors.impute - Dropping neutrophils feature, missingness is higher than threshold!
2022-05-04 00:00:18,918 [1;37mI

## Plot example temporal features.

In [None]:
# Encounter whose temporal trajectory is plotted. The ID is hard-coded and is
# used as a .loc key below, so it must be present in the temporal features.
encounter_id = 11289767

# Temporal features to show for that encounter.
plot_features = [
    "oxygen saturation",
    "sodium",
    "temperature",
    "respiratory rate",
    "diastolic bp",
]

# Names of all numerical temporal features known to the handler.
temporal_features = feature_handler.get_numerical_feature_names()["temporal"]

# The selected features for the chosen encounter only.
features_encounter = (
    feature_handler.features["temporal"].loc[encounter_id][plot_features]
)

feature_handler.plot_features(encounter_id, names=plot_features)

## Plot histogram of static features.

In [None]:
# Plot the static "age" feature across encounters (a histogram, per the
# section heading). The call is left as the cell's last expression, so any
# value it returns is displayed as the cell output.
feature_handler.plot_features(aggregate_type="static", names="age")

## Create new FeatureHandler and load features from file.

In [3]:
# Write the current features to the working directory under the
# "test_features" prefix (the captured log shows separate static and temporal
# files being written).
feature_handler.save(".", "test_features")
# Rebind the name to a fresh, empty handler; the in-memory features built
# earlier are no longer reachable through `feature_handler` after this line.
feature_handler = FeatureHandler()
# Repopulate the new handler from the files written above.
feature_handler.load(".", "test_features")

2022-05-04 00:00:29,685 [1;37mINFO[0m cyclops.feature_handler - Saving static features to ./test_features_static.gzip
2022-05-04 00:00:29,704 [1;37mINFO[0m cyclops.feature_handler - Saving temporal features to ./test_features_temporal.gzip
2022-05-04 00:00:29,715 [1;37mINFO[0m cyclops.feature_handler - Loading features from file...
2022-05-04 00:00:29,718 [1;37mINFO[0m cyclops.feature_handler - Found file to load for static features...
2022-05-04 00:00:29,720 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded static features from file...
2022-05-04 00:00:29,727 [1;37mINFO[0m cyclops.feature_handler - Found file to load for temporal features...
2022-05-04 00:00:29,733 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded temporal features from file...
