### This notebook shows examples of how to use the cyclops.processor API on GEMINI.

## Get all patient encounters including ER data, with diagnoses from St. Michael's hospital from March 1 2020 to March 15 2020, with vitals and labs and interventions.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd

from cyclops.feature_handler import FeatureHandler
from cyclops.processor import featurize
from cyclops.processors.aggregate import Aggregator
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    ENCOUNTER_ID,
    HOSPITAL_ID,
    SEX,
)
from cyclops.processors.constants import SMH
from cyclops.processors.impute import Imputer
from cyclops.processors.util import gather_columns
from cyclops.query import gemini

patients = gemini.patients(
    hospitals=[SMH], from_date="2020-03-01", to_date="2020-03-15", include_er_data=True
)
patients_diagnoses = gemini.diagnoses(patients=patients)
patients_vitals = gemini.events(patients=patients, category="vitals")
patients_labs = gemini.events(patients=patients, category="lab")
patients_interventions = gemini.events(patients=patients, category="intervention")

diagnoses_data = patients_diagnoses.run()
vitals_data = patients_vitals.run()
labs_data = patients_labs.run()
interventions_data = patients_interventions.run()

print(f"{len(diagnoses_data)} diagnoses rows extracted!")
print(f"{len(vitals_data)} vitals rows extracted!")
print(f"{len(labs_data)} labs rows extracted!")
print(f"{len(interventions_data)} interventions rows extracted!")

patients_diagnoses.save(".", "diagnoses")
patients_vitals.save(".", "vitals")
patients_labs.save(".", "lab")
patients_interventions.save(".", "intervention")

2022-05-11 17:50:02,385 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!
2022-05-11 17:50:12,250 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-11 17:50:12,252 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 9.847437 s
2022-05-11 17:50:14,694 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-11 17:50:14,698 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.443027 s
2022-05-11 17:52:58,846 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-11 17:52:58,849 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 164.149198 s
2022-05-11 17:53:01,355 [1;37mINFO[0m cyclops.orm     - Query returned successfully!
2022-05-11 17:53:01,358 [1;37mINFO[0m cyclops.utils.profile - Finished executing function run_query in 2.506609 s
2022-05-11 17:53:01,360 [1;37mINFO[0m cyclops.query.interface - Saving queried data to .

1068 diagnoses rows extracted!
20244 vitals rows extracted!
14899 labs rows extracted!
350 interventions rows extracted!


2022-05-11 17:53:01,592 [1;37mINFO[0m cyclops.query.interface - Saving queried data to ./intervention.gzip


## Process queried data into features. (Static + Temporal).

In [2]:
diagnoses_data = pd.read_parquet("diagnoses.gzip")
vitals_data = pd.read_parquet("vitals.gzip")
labs_data = pd.read_parquet("lab.gzip")
interventions_data = pd.read_parquet("intervention.gzip")

static_diagnoses_data = gather_columns(
    diagnoses_data,
    [ENCOUNTER_ID, AGE, SEX, DIAGNOSIS_CODE, HOSPITAL_ID, ADMIT_TIMESTAMP],
)
feature_handler = featurize(
    static_data=[static_diagnoses_data],
    temporal_data=[labs_data, vitals_data],
    aggregator=Aggregator(bucket_size=6, window=120),
    static_imputer=Imputer(
        strategy="median",
        encounter_missingness_threshold=0.25,
        feature_missingness_threshold=0.5,
    ),
    temporal_imputer=Imputer(
        strategy="median",
        encounter_missingness_threshold=0.95,
        feature_missingness_threshold=0.75,
    ),
    reference_cols=[HOSPITAL_ID, ADMIT_TIMESTAMP],
)

2022-05-11 17:53:01,771 [1;37mINFO[0m cyclops.processors.util - Processing raw diagnosis codes...
2022-05-11 17:53:01,775 [1;37mINFO[0m cyclops.processors.util - # samples: 1068, # encounters: 150
2022-05-11 17:53:01,823 [1;37mINFO[0m cyclops.processors.util - Grouping ICD codes to trajectories...
2022-05-11 17:53:01,826 [1;37mINFO[0m cyclops.processors.util - # samples: 1068, # encounters: 150
2022-05-11 17:53:01,828 [1;37mINFO[0m cyclops.processors.diagnoses - # diagnosis features: 19, # encounters: 150
2022-05-11 17:53:01,858 [1;37mINFO[0m cyclops.utils.profile - Finished executing function group_diagnosis_codes_to_trajectories in 0.086733 s
2022-05-11 17:53:01,879 [1;37mINFO[0m cyclops.processors.util - Gathering static features...
2022-05-11 17:53:01,881 [1;37mINFO[0m cyclops.processors.util - # samples: 1068, # encounters: 150
2022-05-11 17:53:01,883 [1;37mINFO[0m cyclops.processors.util - # columns: 5, # encounters: 150
2022-05-11 17:53:02,038 [1;37mINFO[0m c

2022-05-11 17:53:27,239 [1;37mINFO[0m cyclops.processors.impute - Dropping bilirubin feature, missingness is higher than threshold!
2022-05-11 17:53:27,242 [1;37mINFO[0m cyclops.processors.impute - Dropping calcium feature, missingness is higher than threshold!
2022-05-11 17:53:27,245 [1;37mINFO[0m cyclops.processors.impute - Dropping calcium, ionized feature, missingness is higher than threshold!
2022-05-11 17:53:27,247 [1;37mINFO[0m cyclops.processors.impute - Dropping creatinine feature, missingness is higher than threshold!
2022-05-11 17:53:27,250 [1;37mINFO[0m cyclops.processors.impute - Dropping esr feature, missingness is higher than threshold!
2022-05-11 17:53:27,253 [1;37mINFO[0m cyclops.processors.impute - Dropping serum alcohol feature, missingness is higher than threshold!
2022-05-11 17:53:27,256 [1;37mINFO[0m cyclops.processors.impute - Dropping ferritin feature, missingness is higher than threshold!
2022-05-11 17:53:27,259 [1;37mINFO[0m cyclops.processors.

## Plot example temporal features.

In [3]:
plot_features = [
    "oxygen saturation",
    "sodium",
    "temperature",
    "respiratory rate",
    "diastolic bp",
]

encounter_id = 11289767
temporal_features = feature_handler.get_numerical_feature_names()["temporal"]

features_encounter = feature_handler.features["temporal"].loc[encounter_id][
    plot_features
]
feature_handler.plot_features(encounter_id, names=plot_features)

KeyError: "['sodium'] not in index"

## Plot histogram of static features.

In [None]:
feature_handler.plot_features(aggregate_type="static", names="age")

## Create new FeatureHandler and load features from file.

In [None]:
feature_handler.save(".", "test_features")
feature_handler = FeatureHandler()
feature_handler.load(".", "test_features")