In [None]:
import os
import sys

sys.path.append("..")

import re
from collections import OrderedDict

import config
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy
from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection
from sqlalchemy import desc, func, select
from sqlalchemy.sql.expression import and_, or_

import cyclops
import cyclops.query_mimic as qm
import cyclops.query_utils as q_utils
from cyclops.orm import Database
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    DIAGNOSIS_CODE,
    ENCOUNTER_ID,
    HOSPITAL_ID,
    LAB_TEST_NAME,
    LAB_TEST_RESULT_UNIT,
    LAB_TEST_RESULT_VALUE,
    LAB_TEST_TIMESTAMP,
    REFERENCE_RANGE,
)
from cyclops.processors.diagnosis_codes import DiagnosisProcessor
from cyclops.processors.feature_handler import FeatureHandler
from cyclops.processors.labs import LabsProcessor
from cyclops.queries import query_gemini_delirium_diagnosis, query_gemini_delirium_lab

%load_ext autoreload
%autoreload 2 # Load when external files are updated
%load_ext nb_black

# Setup ORM

In [None]:
import getpass

# NOTE(review): USER is overwritten unconditionally, exactly as before;
# presumably the config/Database layer reads it as the database user — confirm.
os.environ["USER"] = "postgres"
# Do not keep the database password in the notebook: reuse PGPASSWORD when the
# environment already provides it, otherwise ask for it without echoing.
if not os.environ.get("PGPASSWORD"):
    os.environ["PGPASSWORD"] = getpass.getpass("PostgreSQL password: ")
# Read the default configuration files and build the database handle from them.
cfg = config.read_config("../configs/default/*.yaml")
db = Database(cfg)

MIMIC EDA
See sample_code/delirium.ipynb

In sample_code/data_layer.ipynb:
from evidently.dashboard.tabs import DataQualityTab

# Feature extraction

In [None]:
from cyclops.processors.column_names import DIAGNOSIS_CODE, ENCOUNTER_ID

In [None]:
# Attributes removed from the query result.
unused_attributes = [
    "anchor_year",
    "dod",
    "anchor_year_group_start",
    "anchor_year_group_end",
    "year",
    "anchor_year_difference",
]
# Source attribute name -> name used in the rest of the notebook.
renamed_attributes = {
    "hadm_id": ENCOUNTER_ID,
    "icd_code": DIAGNOSIS_CODE,
    "anchor_age": "age",
}

# Patient diagnoses (version=10, ICD titles included) joined with patients,
# then trimmed and renamed.
query = qm.join_with_patients(
    db, qm.patient_diagnoses(db, version=10, include_icd_title=True)
)
query = q_utils.rename_attributes(
    q_utils.drop_attributes(query, unused_attributes), renamed_attributes
)

data = db.run_query(query)
data

## Target

We now have all the data we need, so we can determine the targets. In this case, the target is any diagnosis code relating to delirium:

In [None]:
# Flag every row whose ICD title mentions "delirium" (case-insensitive).
# na=False makes rows with a missing title count as non-matches; without it
# those rows would hold NaN and the column could not be used as a boolean mask
# in the cells that index with data["target"].
data["target"] = data["icd_title"].str.contains("delirium", case=False, na=False)
data.head()

Get the IDs of the encounters that have a target diagnosis:

In [None]:
# Encounter IDs of the rows flagged as target, de-duplicated, as a NumPy array.
target_encounters = (
    data.loc[data["target"], ENCOUNTER_ID].drop_duplicates().to_numpy()
)
target_encounters

Alternatively, we could have queried this data again:

In [None]:
# Same targets obtained with a dedicated query: diagnoses selected by the
# substring "delirium" (version=10), with ID and code columns renamed.
query = q_utils.rename_attributes(
    qm.patient_diagnoses_by_substring(db, "delirium", version=10),
    {"hadm_id": ENCOUNTER_ID, "icd_code": DIAGNOSIS_CODE},
)
target_df = db.run_query(query)
target_df.head()

In [None]:
# Encounter ID column of the re-queried targets (not de-duplicated).
target_encounters2 = target_df.loc[:, ENCOUNTER_ID].values
target_encounters2

Aside from the order (and any repeated entries), these contain the same encounter IDs:

In [None]:
# Compare the two results as sets, i.e. ignoring order and repeated IDs.
set(target_encounters) == set(target_encounters2)

## Features

In [None]:
# The diagnosis processor is given only the encounter ID and diagnosis code.
must_have_columns = [ENCOUNTER_ID, DIAGNOSIS_CODE]
data_diagnosis = data.loc[:, must_have_columns]

diagnosis_processor = DiagnosisProcessor(data_diagnosis, must_have_columns)
diagnosis_features = diagnosis_processor.process()

# Name the index of the processed features after the encounter ID.
diagnosis_features.index.names = [ENCOUNTER_ID]
diagnosis_features.head()

### Patient statics features

In [None]:
# Get statics: one row per encounter.
# `data` carries a diagnosis code on every row, so the same encounter ID can
# appear on several rows (the notebook already de-duplicates encounter IDs when
# it builds target_encounters). Keep a single row per encounter so that joining
# on ENCOUNTER_ID cannot multiply the feature rows.
statics = data[[ENCOUNTER_ID, "gender", "age"]].drop_duplicates(subset=ENCOUNTER_ID)
statics.head()

In [None]:
# Join features with statics, then index the joined table by encounter ID.
diagnosis_features = pd.merge(
    diagnosis_features, statics, on=ENCOUNTER_ID
).set_index(ENCOUNTER_ID)
diagnosis_features.head()

## Label leakage

In [None]:
# Diagnosis codes found on target rows; none of them should also be a feature.
target_codes = set(data.loc[data["target"], DIAGNOSIS_CODE].values)
# TODO: the overlap check itself is still disabled (feature_codes is undefined):
# feature_codes = None
# assert len(target_codes.intersection(feature_codes)) == 0

## Combine features and target

In [None]:
# Mark each row of the feature table whose index value (encounter ID) is one of
# the target encounters. np.isin is used because np.in1d is deprecated in NumPy.
is_target = np.isin(diagnosis_features.index, target_encounters)
# Store the flag as 0/1 integers in a "target" column.
diagnosis_features["target"] = is_target.astype(int)
diagnosis_features.head()

In [None]:
# Fraction (0-1, not a percentage) of rows with a positive target
diagnosis_features["target"].mean()

### AGE BROKEN - FIX

NOTE: Some ages are 0, and I still think they should not be.

Does this age actually correspond to the hospital visit? This needs to be sorted out. Where is the approximate year in the hospital table?

# Feature Store

## Dataset Prep

Introduction: https://docs.feast.dev/

Quickstart: https://docs.feast.dev/getting-started/quickstart

Quickstart Colab: https://colab.research.google.com/github/feast-dev/feast/blob/master/examples/quickstart/quickstart.ipynb

Feature Repository: https://docs.feast.dev/reference/feature-repository



In [None]:
%%sh
pip install wheel
pip install feast -U -q
pip install Pygments -q
echo "Please restart your runtime now (Runtime -> Restart runtime). This ensures that the correct dependencies are loaded."

In [None]:
# Run the Feast CLI's `init` command with the name feature_repo; the next cell
# changes into a directory of that name.
!feast init feature_repo

In [None]:
# Change the notebook's working directory to feature_repo and list its contents
# recursively. The directory change persists, so relative paths in the cells
# below (e.g. "data/...", repo_path=".") are resolved inside feature_repo.
%cd feature_repo
!ls -R

In [None]:
# Print feature_store.yaml through pygmentize (installed above as Pygments).
!pygmentize feature_store.yaml

In [None]:
import pandas as pd

# Load data/driver_stats.parquet into a DataFrame and display it.
pd.read_parquet("data/driver_stats.parquet")

In [None]:
# Print example.py through pygmentize using the terminal16m formatter.
!pygmentize -f terminal16m example.py

In [None]:
# Run the Feast CLI's `apply` command in the current (feature_repo) directory.
!feast apply

In [None]:
from datetime import datetime, timedelta

import pandas as pd
from feast import FeatureStore

# The entity dataframe is the dataframe we want to enrich with feature values:
# one row per driver with a label and the timestamp of the event.
entity_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "label_driver_reported_satisfaction": [1, 5, 3],
        "event_timestamp": [
            datetime.now() - timedelta(minutes=11),
            datetime.now() - timedelta(minutes=36),
            datetime.now() - timedelta(minutes=73),
        ],
    }
)

# Feature store rooted at the current directory.
store = FeatureStore(repo_path=".")

# Fetch the three driver_hourly_stats features for the entity rows.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
training_df.info()

print()
print("----- Example features -----\n")
print(training_df.head())

In [None]:
from datetime import datetime

# Run the Feast CLI's `materialize-incremental` command, passing the current
# time in ISO 8601 form as its argument.
# NOTE(review): datetime.now() is naive local time — confirm how the CLI
# interprets a timestamp that carries no timezone.
!feast materialize-incremental {datetime.now().isoformat()}

In [None]:
print("--- Data directory ---")
!ls data

import sqlite3

import pandas as pd

con = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query(
        "SELECT * FROM feature_repo_driver_hourly_stats", con
    ).columns.tolist()
)
con.close()

In [None]:
from pprint import pprint

from feast import FeatureStore

# Feature store rooted at the current directory.
store = FeatureStore(repo_path=".")

# Features to look up and the driver IDs to look them up for.
online_features = [
    "driver_hourly_stats:conv_rate",
    "driver_hourly_stats:acc_rate",
    "driver_hourly_stats:avg_daily_trips",
]
driver_rows = [{"driver_id": driver_id} for driver_id in (1004, 1005)]

# Fetch the online feature values and convert the response to a dict.
feature_vector = store.get_online_features(
    features=online_features,
    entity_rows=driver_rows,
).to_dict()

pprint(feature_vector)

# H2O Baseline Handler

In [None]:
!pip install h2o

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Initialise h2o before any frames or models are created in the cells below.
h2o.init()

In [None]:
# EMPTY STRING COLUMN CAUSING ISSUES?
# Drop the column whose name is the empty string. errors="ignore" keeps the
# cell re-runnable: it rebinds diagnosis_features, so on a second run the ""
# column is already gone and a plain drop would raise KeyError.
diagnosis_features = diagnosis_features.drop(columns=[""], errors="ignore")

In [None]:
# Hand the pandas feature table over to H2O.
frame = h2o.H2OFrame(diagnosis_features)

# Everything except age is categorical (diagnoses/gender), so each of those
# columns is converted to a categorical with asfactor().
categorical_columns = [name for name in frame.columns if name != "age"]
for name in categorical_columns:
    frame[name] = frame[name].asfactor()

# Split with ratio 0.8 and a fixed seed; the two parts are used as the
# training and validation frames.
training_frame, validation_frame = frame.split_frame(ratios=[0.8], seed=1234)

In [None]:
# Size of the training split, as reported by len().
len(training_frame)

In [None]:
# Size of the validation split, as reported by len().
len(validation_frame)

In [None]:
# Response column, and every other column of the feature table as predictors.
y = "target"
x = [column for column in diagnosis_features.columns if column != y]

In [None]:
# AN ASIDE: Is this of any use to us, perhaps in the feature handler?
# The snippet below is kept inside a string literal, so none of it is executed;
# train_x and test_x are not defined anywhere in the visible notebook.
"""
# Automatic label preprocessing?
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
train_x = train_x.apply(
    lambda col: label.fit_transform(col), axis=0, result_type="expand"
)
test_x = test_x.apply(
    lambda col: label.fit_transform(col), axis=0, result_type="expand"
)
train_x
"""

In [None]:
# AutoML run capped at 30 models and 300 seconds, seeded, with class balancing.
automl = H2OAutoML(
    max_models=30,
    max_runtime_secs=300,
    seed=1,
    balance_classes=True,
)
automl.train(
    x=x,
    y=y,
    training_frame=training_frame,
    validation_frame=validation_frame,
)

# Note: `leader` holds the leaderboard, not the leading model (automl.leader).
leader = automl.leaderboard
leader.head()

In [None]:
# Show every row of the leaderboard by asking head() for leader.nrows rows.
leader.head(rows=leader.nrows)  # Entire leaderboard

In [None]:
# Names of the parameters recorded on the leading model.
automl.leader.params.keys()

In [None]:
# h2o.get_model looks a model up by its id string, so pass the leader's
# model_id rather than the model object itself.
m = h2o.get_model(automl.leader.model_id)

In [None]:
# Predict on the validation frame with the leading model and convert the
# result to a pandas DataFrame for inspection.
pred = automl.leader.predict(validation_frame)
pred_df = pred.as_data_frame(use_pandas=True)
pred_df.head()

In [None]:
# True labels of the validation frame: pull the "target" column into pandas
# and take its single column as a 1-D array.
validation_targets = validation_frame["target"].as_data_frame(use_pandas=True)
true_arr = validation_targets.iloc[:, 0].values
true_arr

In [None]:
# Predicted labels: the "predict" column of the prediction table as an array.
pred_arr = pred_df.loc[:, "predict"].values
pred_arr

In [None]:
# Number of positions where prediction and true label agree, divided by
# len(validation_frame) (presumably its row count — confirm).
(pred_arr == true_arr).sum() / len(validation_frame)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Raw confusion-matrix counts (rows: true labels, columns: predicted labels).
cm = confusion_matrix(true_arr, pred_arr)
# from_predictions is a classmethod that computes the matrix and plots it, so
# it is called on the class. Calling it on a hand-built instance ignored that
# instance and computed the matrix a second time.
cmd = ConfusionMatrixDisplay.from_predictions(true_arr, pred_arr)