In [1]:
import pickle

import numpy as np
import pandas as pd

from cyclops.processors.aggregate import Aggregator
from cyclops.processors.cleaning import (
    normalize_categories,
    normalize_names,
    normalize_values,
)
from cyclops.processors.column_names import (
    ADMIT_TIMESTAMP,
    AGE,
    CARE_UNIT,
    DIAGNOSIS_CODE,
    DIAGNOSIS_TRAJECTORY,
    ENCOUNTER_ID,
    EVENT_CATEGORY,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    EVENT_VALUE_UNIT,
    HOSPITAL_ID,
    SEX,
    SUBJECT_ID,
    TIMESTEP,
    YEAR,
)
from cyclops.processors.constants import (
    BINARY,
    BY,
    CATEGORICAL_INDICATOR,
    FEATURE_INDICATOR_ATTR,
    FEATURE_MAPPING_ATTR,
    FEATURE_TYPE_ATTR,
    FEATURE_TYPES,
    FEATURES,
    MEAN,
    MIN_MAX,
    MISSING_CATEGORY,
    NUMERIC,
    ORDINAL,
    STANDARD,
    STRING,
    TARGETS,
)
from cyclops.processors.feature.feature import TabularFeatures, TemporalFeatures
from cyclops.processors.feature.normalization import GroupbyNormalizer
from cyclops.processors.feature.split import split_datasets
from cyclops.query import mimic
from cyclops.query import process as qp
from cyclops.utils.common import print_dict
from cyclops.utils.file import load_dataframe, save_dataframe

2022-07-14 15:16:15,822 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


In [None]:
# Calendar years of encounters to query.
# NOTE(review): 2017 is absent from this list — confirm that is intentional.
YEARS = [2015, 2016, 2018, 2019, 2020]
MIN_YEAR = min(YEARS)

# Name of the target (outcome) event. Both the singular and the list spelling
# are defined so that every cell in the notebook refers to the same value.
TARGET_NAME = "death"
TARGET_NAMES = [TARGET_NAME]

# Querying

In [None]:
# Build the encounters query for the selected years. `died=True` together with
# `died_binarize_col="died"` is presumably what produces the "died" column that
# later cells filter on — TODO confirm against mimic.patient_encounters.
encounters_interface = mimic.patient_encounters(
    years=YEARS, died=True, died_binarize_col="died"
)
encounters_query = encounters_interface.query
# Drop columns that are not referenced anywhere else in this notebook.
encounters_query = qp.Drop(
    ["insurance", "language", "marital_status", "hospital_expire_flag"]
)(encounters_query)
# Wrap the modified query in an interface again so that it can be executed.
encounters_interface = mimic.get_interface(encounters_query)
encounters = encounters_interface.run()
encounters

In [None]:
# Build one target event per encounter in which the patient died.
# `.eq(True)` keeps the original `== True` semantics: rows whose "died" value
# is missing are not selected.
died_mask = encounters["died"].eq(True)

# Rows and columns are selected in a single `.loc` and the new columns are
# added through a method chain, so they land on an independent frame rather
# than on a slice of `encounters` (which triggers SettingWithCopyWarning).
# NOTE(review): event timestamps are shifted by "anchor_year_difference" in the
# events cell, while "deathtime" is used here as-is — confirm both are on the
# same time axis before they are concatenated.
target_events = (
    encounters.loc[died_mask, [ENCOUNTER_ID, "deathtime"]]
    .rename(columns={"deathtime": EVENT_TIMESTAMP})
    # TARGET_NAMES is the list of target names from the config cell; this
    # notebook has a single target, so its first entry names every row.
    .assign(**{EVENT_NAME: TARGET_NAMES[0], EVENT_VALUE: 1})
)
target_events.head(5)

In [None]:
# Cache the queried encounters on disk; the Processing section reloads this file.
save_dataframe(encounters, "encounters.parquet")

In [None]:
# Query the events, dropping columns that are not used in this notebook.
events_interface = mimic.events()
events_query = events_interface.query
events_query = qp.Drop(["warning", "itemid", "storetime"])(events_query)
events_interface = mimic.get_interface(events_query)
# NOTE(review): `limit=1000000` presumably caps the number of rows fetched —
# confirm that this cap is intended for the final dataset.
events = events_interface.run(limit=1000000)
# Attach each encounter's year offset to its events. pd.merge defaults to an
# inner join, so events without a matching encounter are dropped here.
events = pd.merge(
    encounters[[ENCOUNTER_ID, "anchor_year_difference"]], events, on=ENCOUNTER_ID
)


def add_offset(row):
    """Shift a row's event timestamp by its encounter's year offset.

    Reads the (signed) number of years from ``row["anchor_year_difference"]``,
    adds it to ``row[EVENT_TIMESTAMP]`` in place, and returns the same row so
    the function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    year_shift = pd.DateOffset(years=row["anchor_year_difference"])
    row[EVENT_TIMESTAMP] = row[EVENT_TIMESTAMP] + year_shift
    return row


# Shift event timestamps by each encounter's year offset.
# Rows that share an offset are shifted together with one vectorised addition
# per distinct offset, instead of calling a Python function once per row; this
# also leaves the dtypes of the other columns untouched.
year_offsets = events["anchor_year_difference"]
for offset_years in year_offsets.unique():
    rows_with_offset = year_offsets == offset_years
    # int() turns the NumPy scalar into a plain int for DateOffset and fails
    # loudly if an offset is missing (NaN).
    events.loc[rows_with_offset, EVENT_TIMESTAMP] += pd.DateOffset(
        years=int(offset_years)
    )
events = events.drop("anchor_year_difference", axis=1)

# Include target as an event
events = pd.concat([events, target_events])

events

In [None]:
# Cache the events (including the appended target events) on disk; the
# Temporal section reloads this file.
save_dataframe(events, "events.parquet")

------------------------------------------------------------------------------------------------

# Processing

## Tabular

In [2]:
# Start the processing stage from the cached query result rather than re-querying.
encounters = load_dataframe("encounters.parquet")
encounters.head(5)

2022-07-14 15:16:15,924 [1;37mINFO[0m cyclops.utils.file - Loading dataframe from encounters.parquet


Unnamed: 0,subject_id,encounter_id,admit_timestamp,discharge_timestamp,deathtime,admission_type,admission_location,discharge_location,ethnicity,edregtime,edouttime,sex,age,birth_year,dod,anchor_year_difference,died
0,19759112,20610510,2018-07-03 02:10:00,2018-07-05 12:15:00,NaT,ELECTIVE,,HOME,UNKNOWN,NaT,NaT,M,0,2018,NaT,-140,False
1,17978591,21300810,2015-10-10 00:21:00,2015-10-12 14:10:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2015,NaT,-134,False
2,16420748,23082241,2018-10-25 22:30:00,2018-10-28 12:58:00,NaT,ELECTIVE,,HOME,WHITE,NaT,NaT,F,0,2018,NaT,-151,False
3,15206017,25518306,2018-10-22 15:28:00,2018-10-23 19:13:00,NaT,EU OBSERVATION,CLINIC REFERRAL,,BLACK/AFRICAN AMERICAN,2110-10-22 11:52:00,2110-10-22 17:37:00,F,76,1942,NaT,-92,
4,14695283,23679175,2018-06-24 18:54:00,2018-06-25 20:15:00,NaT,EU OBSERVATION,PHYSICIAN REFERRAL,,WHITE,2176-06-24 14:46:00,2176-06-25 20:15:00,F,90,1928,NaT,-158,


In [3]:
# Encounter-level columns to turn into tabular features.
features = [
    AGE,
    SEX,
    "admission_type",
    "admission_location",
    "discharge_location",
    "ethnicity",
]

# Wrap the encounters table as tabular features.
# `by=ENCOUNTER_ID` presumably identifies one row per encounter — TODO confirm
# against TabularFeatures.
tab_features = TabularFeatures(
    data=encounters,
    features=features,
    by=ENCOUNTER_ID,
)

In [4]:
tab_features.types

{'sex': 'binary',
 'discharge_location': 'ordinal',
 'admission_location': 'ordinal',
 'ethnicity': 'ordinal',
 'admission_type': 'ordinal',
 'age': 'numeric'}

In [5]:
tab_features.meta["admission_location"].get_mapping()

{0: 'AMBULATORY SURGERY TRANSFER',
 1: 'CLINIC REFERRAL',
 2: 'EMERGENCY ROOM',
 3: 'INFORMATION NOT AVAILABLE',
 4: 'INTERNAL TRANSFER TO OR FROM PSYCH',
 5: 'PACU',
 6: 'PHYSICIAN REFERRAL',
 7: 'PROCEDURE SITE',
 8: 'TRANSFER FROM HOSPITAL',
 9: 'TRANSFER FROM SKILLED NURSING FACILITY',
 10: 'WALK-IN/SELF REFERRAL',
 11: 'nan'}

In [6]:
# Collect the features typed as ordinal; they are passed to `vectorize` below
# to be expanded into binary indicator columns.
ordinal_features = tab_features.features_by_type(ORDINAL)
ordinal_features

['discharge_location', 'admission_location', 'ethnicity', 'admission_type']

In [7]:
# Vectorize the tabular features, asking for the ordinal features to be
# converted to binary indicators (the resulting feature index contains names of
# the form "<feature>_<category>").
tab_vectorized = tab_features.vectorize(to_binary_indicators=ordinal_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feat] = data[feat].replace(self.meta[feat].get_mapping())
A value is tryin

In [8]:
tab_vectorized.shape

(224764, 43)

In [9]:
tab_vectorized.axis_names

['encounter_id', 'features']

In [10]:
tab_vectorized.get_index(FEATURES)

array(['sex', 'age', 'discharge_location_ACUTE HOSPITAL',
       'discharge_location_AGAINST ADVICE',
       'discharge_location_ASSISTED LIVING',
       'discharge_location_CHRONIC/LONG TERM ACUTE CARE',
       'discharge_location_DIED',
       'discharge_location_HEALTHCARE FACILITY',
       'discharge_location_HOME', 'discharge_location_HOME HEALTH CARE',
       'discharge_location_HOSPICE', 'discharge_location_OTHER FACILITY',
       'discharge_location_PSYCH FACILITY', 'discharge_location_REHAB',
       'discharge_location_SKILLED NURSING FACILITY',
       'admission_location_AMBULATORY SURGERY TRANSFER',
       'admission_location_CLINIC REFERRAL',
       'admission_location_EMERGENCY ROOM',
       'admission_location_INFORMATION NOT AVAILABLE',
       'admission_location_INTERNAL TRANSFER TO OR FROM PSYCH',
       'admission_location_PACU', 'admission_location_PHYSICIAN REFERRAL',
       'admission_location_PROCEDURE SITE',
       'admission_location_TRANSFER FROM HOSPITAL',
   

In [None]:
# Numeric features are the ones passed to the normalization map later on.
# The TabularFeatures object is named `tab_features`; the singular
# `tab_feature` is not defined anywhere in the notebook.
numeric_features = tab_features.features_by_type(NUMERIC)
numeric_features

## Temporal

In [None]:
events = load_dataframe("events.parquet")

# Process
events[EVENT_NAME] = normalize_names(events[EVENT_NAME])
events[EVENT_CATEGORY] = normalize_categories(events[EVENT_CATEGORY])
# events[EVENT_VALUE] = normalize_values(events[EVENT_VALUE])

# The same event name can mean different things in different categories
# (e.g., 'flow' for categories heartware and ecmo), so qualify each name with
# its category.
qualified_names = events[EVENT_NAME] + " - " + events[EVENT_CATEGORY]

# The target events were appended to the events without a category, so
# qualifying them would replace their name and they could no longer be matched
# against TARGET_NAMES downstream. Keep the target names untouched.
# NOTE(review): assumes normalize_names leaves the target name unchanged — confirm.
is_target = events[EVENT_NAME].isin(TARGET_NAMES)
events[EVENT_NAME] = qualified_names.where(~is_target, events[EVENT_NAME])
events.head(5)

In [None]:
# Keep only the most popular events, including the target
top_events = events[EVENT_NAME].value_counts()[:150].index
top_events = np.append(top_events, TARGET_NAME).unique()
print(top_events)
events = events[events[EVENT_NAME].isin(top_events)]
events

In [None]:
events.head(5)

In [None]:
# Inspect the target events. `na=False` treats rows with a missing event name
# as non-matches, so the boolean mask never contains NaN (which would make the
# row selection raise).
events[events[EVENT_NAME].str.contains("death", na=False)].head(5)

In [None]:
# Configure the aggregation: event values are aggregated with MEAN, grouped by
# encounter and event name, with time measured per encounter.
# NOTE(review): the units of `timestep_size` and `window_duration` are not
# visible here — presumably hours; confirm against the Aggregator documentation.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=8,
    window_duration=24,  # Optional
    # agg_meta_for=EVENT_VALUE,  # Optional
)

In [None]:
# Aggregate the filtered events with the aggregator configured in the previous
# cell. (`tmp_features` is not defined anywhere in this notebook.)
# NOTE(review): assumes Aggregator instances are callable on the events
# DataFrame — confirm against the installed cyclops version.
aggregated = aggregator(events)
aggregated.head(5)

In [None]:
aggregated.head(100)

In [None]:
# Cache the aggregated events on disk so the aggregation need not be repeated.
save_dataframe(aggregated, "aggregated.parquet")

In [None]:
# Convert the aggregated events into the vectorized representation and show
# the shape of the underlying data.
temp_vectorized = aggregator.vectorize(aggregated)
temp_vectorized.data.shape

In [None]:
temp_vectorized.axis_names

In [None]:
# tmp_features.normalize(FEATURES)
# tmp_features.get_data()

In [None]:
# Remove length-1 axes from the vectorized temporal data.
# NOTE(review): this rebinds `temp_vectorized`, yet the "Dataset splits" cells
# still call `.get_index(...)` / `.take_with_index(...)` on it — confirm the
# squeezed result keeps those methods, or bind it to a separate name.
temp_vectorized = np.squeeze(temp_vectorized)
temp_vectorized.shape

In [None]:
# Split the temporal data into train/validation/test sets.
# `split_data` is neither imported nor defined in this notebook; the import
# cell provides `split_datasets` from cyclops.processors.feature.split.
# NOTE(review): assumes split_datasets(data, fractions) accepts these arguments
# and that the fractions [0.7, 0.2] leave the remainder as the third split —
# confirm against the installed cyclops version.
train_data, val_data, test_data = split_datasets(temp_vectorized, [0.7, 0.2])

In [None]:
train_data.shape

In [None]:
val_data.shape

In [None]:
test_data.shape

## Dataset splits

Take only the encounters available in all of the datasets and align the datasets over encounters.

In [26]:
# Encounter IDs present in each vectorized dataset, as sets.
tab_encounters = set(tab_vectorized.get_index(ENCOUNTER_ID))
temp_encounters = set(temp_vectorized.get_index(ENCOUNTER_ID))

# Only encounters found in both datasets can be aligned; `&` is set intersection.
encounters_intersection = np.array(list(tab_encounters & temp_encounters))

In [24]:
# Restrict both datasets to the shared encounters, passing the same ID array to
# each so that they select the same encounters.
tab_vectorized = tab_vectorized.take_with_index(ENCOUNTER_ID, encounters_intersection)

temp_vectorized = temp_vectorized.take_with_index(ENCOUNTER_ID, encounters_intersection)

Split into training, validation, and testing datasets.

Split using the same indices such that the tabular and temporal datasets remain aligned.

In [None]:
# Compute one set of train/validation/test indices (80/10/10) over the shared
# encounters; the same indices are applied to both datasets below.
# NOTE(review): `split_idx` is not imported in the visible import cell — add the
# import from wherever cyclops defines it. No seed is passed either; if
# split_idx shuffles, the split is not reproducible — confirm.
splits = split_idx(
    fractions=[0.8, 0.1, 0.1],
    data_len=len(encounters_intersection),
)

In [None]:
# Apply the same index splits along the encounter axis of both datasets.
tab_train_X, tab_val_X, tab_test_X = tab_vectorized.split_by_indices(
    ENCOUNTER_ID, splits
)

# The temporal splits still contain the targets; they are separated in the
# next section.
temp_train, temp_val, temp_test = temp_vectorized.split_by_indices(ENCOUNTER_ID, splits)

### Split features/targets

Split out the targets in the temporal data.

In [None]:
# For each temporal split, separate the entries named in TARGET_NAMES along the
# features axis (y) from the remaining features (X).
# NOTE(review): this requires a feature whose name exactly matches an entry of
# TARGET_NAMES — confirm the target event name survives the event-name
# processing in the Temporal section.
temp_train_X, temp_train_y = temp_train.split_out(FEATURES, TARGET_NAMES)
temp_val_X, temp_val_y = temp_val.split_out(FEATURES, TARGET_NAMES)
temp_test_X, temp_test_y = temp_test.split_out(FEATURES, TARGET_NAMES)

### Normalization

In [None]:
# Standard-normalize the numeric tabular features.
# NOTE(review): `tab_splits` and `VectorizedNormalizer` are not defined or
# imported in the visible notebook, and the loop variable `split` is never
# used — every iteration acts on `tab_vectorized`. Presumably the body should
# act on `split` (tab_train_X / tab_val_X / tab_test_X); confirm before running.
for split in tab_splits:
    tab_normalizer = VectorizedNormalizer(
        tab_vectorized.get_axis(FEATURES),
        normalization_map={feat: STANDARD for feat in numeric_features},
    )
    tab_vectorized.add_normalizer(FEATURES, tab_normalizer)
    tab_vectorized.normalize(FEATURES)

In [None]:
# Standard-normalize the temporal data.
# NOTE(review): as written this loop never touches the temporal data:
# `temp_splits` and `VectorizedNormalizer` are not defined or imported in the
# visible notebook, the loop variable `split` is unused, and the body builds
# and applies the normalizer on `tab_vectorized` (using the EVENT_VALUE axis to
# build it but the FEATURES axis to apply it). Presumably it should act on
# temp_train_X / temp_val_X / temp_test_X; confirm before running.
for split in temp_splits:
    tab_normalizer = VectorizedNormalizer(
        tab_vectorized.get_axis(EVENT_VALUE),
        normalization_method=STANDARD,
    )
    tab_vectorized.add_normalizer(FEATURES, tab_normalizer)
    tab_vectorized.normalize(FEATURES)

## Save