### This notebook shows examples of how to use the cyclops.processor API on GEMINI.

## Get all patient encounters (including ER data) from St. Michael's Hospital between March 1, 2020 and March 15, 2020, together with their diagnoses, vitals, labs and interventions.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd

from cyclops.feature_handler import FeatureHandler
from cyclops.processor import featurize
from cyclops.processors.aggregate import Aggregator
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    ENCOUNTER_ID,
    HOSPITAL_ID,
    SEX,
)
from cyclops.processors.constants import SMH
from cyclops.processors.impute import Imputer
from cyclops.processors.utils import gather_columns
from cyclops.query import gemini

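# NOTE: the query code below is intentionally commented out. It appears to produce
# the files that the next section reads back with pd.read_parquet; uncomment it to
# re-run the query against GEMINI (confirm the saved file names match).
# patients = gemini.patients(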
#     hospitals=[SMH], from_date="2020-03-01", to_date="2020-03-15", include_er_data=True
# )
# patients_diagnoses = gemini.diagnoses(patients=patients)
# patients_vitals = gemini.events(patients=patients, category="vitals")
# patients_labs = gemini.events(patients=patients, category="lab")
# patients_interventions = gemini.events(patients=patients, category="intervention")

# diagnoses_data = patients_diagnoses.run()
# vitals_data = patients_vitals.run()
# labs_data = patients_labs.run()
# interventions_data = patients_interventions.run()

# print(f"{len(diagnoses_data)} diagnoses rows extracted!")
# print(f"{len(vitals_data)} vitals rows extracted!")
# print(f"{len(labs_data)} labs rows extracted!")
# print(f"{len(interventions_data)} interventions rows extracted!")

# patients_diagnoses.save(".", "diagnoses")
# patients_vitals.save(".", "vitals")
# patients_labs.save(".", "lab")
# patients_interventions.save(".", "intervention")

2022-05-02 17:44:05,076 [1;37mINFO[0m cyclops.utils.profile - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,084 [1;37mINFO[0m cyclops.processors.impute - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,090 [1;37mINFO[0m cyclops.feature_handler - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,109 [1;37mINFO[0m cyclops.processors.utils - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,115 [1;37mINFO[0m cyclops.processors.aggregate - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,122 [1;37mINFO[0m cyclops.processors.diagnoses - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,155 [1;37mINFO[0m cyclops.processors.events - Log file is /mnt/nfs/home/krishnanam/vector-delirium/log.log
2022-05-02 17:44:05,159 [1;37mINFO[0m cyclops.processors.outcomes - Log file is /mnt/nfs/home/kr

## Process queried data into features (static + temporal).

In [12]:
# Load the query results that were previously saved to the working directory.
diagnoses_data = pd.read_parquet("diagnoses.gzip")
vitals_data = pd.read_parquet("vitals.gzip")
labs_data = pd.read_parquet("lab.gzip")
# NOTE(review): interventions are loaded here but are not passed to featurize
# below -- confirm whether they should be part of temporal_data.
interventions_data = pd.read_parquet("intervention.gzip")

# Keep only the columns needed for static features and for reference.
static_columns = [
    ENCOUNTER_ID,
    AGE,
    SEX,
    DIAGNOSIS_CODE,
    HOSPITAL_ID,
    ADMIT_TIMESTAMP,
]
static_diagnoses_data = gather_columns(diagnoses_data, static_columns)

# Build the processing components up front so each configuration is easy to
# read and tweak independently of the featurize call.
aggregator = Aggregator(bucket_size=6, window=120)
static_imputer = Imputer(
    strategy="median",
    encounter_missingness_threshold=0.25,
    feature_missingness_threshold=0.5,
)
temporal_imputer = Imputer(
    strategy="median",
    encounter_missingness_threshold=0.95,
    feature_missingness_threshold=0.75,
)

# Turn the raw tables into a FeatureHandler holding static + temporal features.
feature_handler = featurize(
    static_data=[static_diagnoses_data],
    temporal_data=[labs_data, vitals_data],
    aggregator=aggregator,
    static_imputer=static_imputer,
    temporal_imputer=temporal_imputer,
    reference_cols=[HOSPITAL_ID, ADMIT_TIMESTAMP],
)

2022-05-02 18:04:03,746 [1;37mINFO[0m cyclops.processors.utils - Processing raw diagnosis codes...
2022-05-02 18:04:03,752 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-02 18:04:03,797 [1;37mINFO[0m cyclops.processors.utils - Grouping ICD codes to trajectories...
2022-05-02 18:04:03,800 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-02 18:04:03,802 [1;37mINFO[0m cyclops.processors.diagnoses - # diagnosis features: 19, # encounters: 150
2022-05-02 18:04:03,884 [1;37mINFO[0m cyclops.utils.profile - Finished executing function group_diagnosis_codes_to_trajectories in 0.138546 s
2022-05-02 18:04:03,906 [1;37mINFO[0m cyclops.processors.utils - Gathering static features...
2022-05-02 18:04:03,908 [1;37mINFO[0m cyclops.processors.utils - # samples: 1068, # encounters: 150
2022-05-02 18:04:03,910 [1;37mINFO[0m cyclops.processors.utils - # columns: 5, # encounters: 150
2022-05-02 18:04:04,070 [1;37mINF

2022-05-02 18:04:27,713 [1;37mINFO[0m cyclops.processors.impute - Dropping white blood cell count feature, missingness is higher than threshold!
2022-05-02 18:04:27,716 [1;37mINFO[0m cyclops.processors.impute - Dropping high sensitivity troponin feature, missingness is higher than threshold!
2022-05-02 18:04:27,719 [1;37mINFO[0m cyclops.processors.impute - Dropping vitamin d feature, missingness is higher than threshold!
2022-05-02 18:04:27,721 [1;37mINFO[0m cyclops.processors.impute - Dropping hba1c feature, missingness is higher than threshold!
2022-05-02 18:04:27,724 [1;37mINFO[0m cyclops.processors.impute - Dropping lymphocyte feature, missingness is higher than threshold!
2022-05-02 18:04:27,726 [1;37mINFO[0m cyclops.processors.impute - Dropping neutrophils feature, missingness is higher than threshold!
2022-05-02 18:04:27,729 [1;37mINFO[0m cyclops.processors.impute - Dropping albumin feature, missingness is higher than threshold!
2022-05-02 18:04:27,731 [1;37mINFO

## Plot example temporal features.

In [13]:
# Names of the temporal features to plot for a single encounter.
plot_features = ["oxygen saturation", "sodium", "temperature"]
plot_features += ["respiratory rate", "diastolic bp"]

# Encounter whose temporal features are inspected and plotted.
encounter_id = 11289767
temporal_features = feature_handler.get_numerical_feature_names()["temporal"]

# Select the rows for this encounter first, then narrow to the plotted columns.
encounter_rows = feature_handler.features["temporal"].loc[encounter_id]
features_encounter = encounter_rows[plot_features]
feature_handler.plot_features(encounter_id, names=plot_features)

## Plot histogram of static features.

In [14]:
# Plot the static "age" feature (a histogram, per the section heading above).
feature_handler.plot_features(
    names="age",
    aggregate_type="static",
)

## Create new FeatureHandler and load features from file.

In [15]:
# Round-trip the features through disk: save from the current handler, then
# load them back into a freshly constructed FeatureHandler.
features_dir, features_prefix = ".", "test_features"

feature_handler.save(features_dir, features_prefix)
feature_handler = FeatureHandler()
feature_handler.load(features_dir, features_prefix)

2022-05-02 18:04:52,596 [1;37mINFO[0m cyclops.feature_handler - Saving static features to ./test_features_static.gzip
2022-05-02 18:04:52,623 [1;37mINFO[0m cyclops.feature_handler - Saving temporal features to ./test_features_temporal.gzip
2022-05-02 18:04:52,637 [1;37mINFO[0m cyclops.feature_handler - Loading features from file...
2022-05-02 18:04:52,639 [1;37mINFO[0m cyclops.feature_handler - Found file to load for static features...
2022-05-02 18:04:52,640 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded static features from file...
2022-05-02 18:04:52,655 [1;37mINFO[0m cyclops.feature_handler - Found file to load for temporal features...
2022-05-02 18:04:52,664 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded temporal features from file...
