# Mortality Prediction using Tabular Data

This notebook presents the use case of predicting the risk of mortality for patients in the MIMIC-IV dataset.

In [None]:
import glob

import dask.dataframe as dd
import pandas as pd
import yaml
from datasets import Dataset
from datasets.features import ClassLabel
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from cyclops.datasets.slicer import SliceSpec
from cyclops.evaluate.metrics import MetricCollection, create_metric
from cyclops.models.catalog import create_model
from cyclops.models.constants import CONFIG_ROOT
from cyclops.process.aggregate import Aggregator
from cyclops.process.column_names import (
    AGE,
    ENCOUNTER_ID,
    EVENT_NAME,
    EVENT_TIMESTAMP,
    EVENT_VALUE,
    SEX,
)
from cyclops.process.constants import MEAN, NUMERIC, STRING
from cyclops.process.feature.feature import TabularFeatures
from cyclops.tasks.mortality_prediction import MortalityPrediction
from cyclops.utils.file import join, load_dataframe, process_dir_save_path

## Constants

In [None]:
# Dataset and use-case identifiers; both are used as path components below.
DATASET = "mimiciv"
CONST_NAME = "mortality_decompensation"

# Root directory holding the artifacts of this use case.
USECASE_ROOT_DIR = join("/mnt/data", "cyclops", "use_cases", DATASET, CONST_NAME)
# NOTE(review): process_dir_save_path presumably validates/creates the
# directory before returning its path — confirm.
DATA_DIR = process_dir_save_path(join(USECASE_ROOT_DIR, "./data"))
ENCOUNTERS_FILE = join(DATA_DIR, "encounters.parquet")
EVENTS_DIR = join(DATA_DIR, "./1-cleaned")

# Name of the binary target column.
OUTCOME_DEATH = "outcome_death"
# Non-event features that are combined with the selected events.
TAB_FEATURES = [AGE, SEX, "admission_type", "admission_location"]

# Only the first entry (train fraction) is read in the visible notebook.
SPLIT_FRACTIONS = [0.8, 0.1, 0.1]

## Features
A list of selected events, alongside age, sex, admission type, and admission location, is used as the features.

In [None]:
# Events used as model features. The names are matched verbatim against the
# EVENT_NAME column when filtering the events and later become the column
# names of the pivoted feature table, so they must not be edited cosmetically.
selected_events = [
    "routine vital signs - arterial blood pressure diastolic",
    "routine vital signs - arterial blood pressure mean",
    "routine vital signs - arterial blood pressure systolic",
    "routine vital signs - heart rate",
    "routine vital signs - non invasive blood pressure diastolic",
    "routine vital signs - non invasive blood pressure mean",
    "routine vital signs - non invasive blood pressure systolic",
    "labs - arterial base excess",
    "labs - arterial co2 pressure",
    "labs - arterial o2 pressure",
    "labs - bun",
    "labs - calcium non-ionized",
    "labs - chloride",
    "labs - creatinine",
    "labs - glucose",
    "labs - glucose finger stick",
    "labs - hco3",
    "labs - hematocrit",
    "labs - hemoglobin",
    "labs - inr",
    "labs - magnesium",
    "labs - ph",
    "labs - phosphorous",
    "labs - platelet count",
    "labs - potassium",
    "labs - prothrombin time",
    "labs - ptt",
    "labs - sodium",
    "labs - tco2  arterial",
    "labs - wbc",
    "respiratory - apnea interval",
    "respiratory - fspn high",
    "respiratory - inspired o2 fraction",
    "respiratory - mean airway pressure",
    "respiratory - minute volume",
    "respiratory - o2 flow",
    "respiratory - o2 saturation pulseoxymetry",
    "respiratory - paw high",
    "respiratory - peak insp. pressure",
    "respiratory - peep set",
    "respiratory - respiratory rate",
    "respiratory - tidal volume",
    "respiratory - ventilator mode",
    "respiratory - vti high",
]

In [None]:
# Full, alphabetically ordered list of model input features:
# the selected events plus the tabular encounter features.
features_list = sorted([*selected_events, *TAB_FEATURES])

## Data Loading

The tabular features are extracted from the encounter records, and the events are extracted from the saved parquet files.

In [None]:
# Load the encounter records; they are merged with the aggregated events
# further below to provide the tabular features.
encounters_df = load_dataframe(ENCOUNTERS_FILE)
encounters_df

In [None]:
# Collect the cleaned event files. glob.glob already returns a list, but in
# arbitrary order, so sort it to keep the partition order reproducible.
parquet_files = sorted(glob.glob(join(EVENTS_DIR, "*.parquet")))
if not parquet_files:
    # Fail early with a clear message instead of an opaque error from dask.
    raise FileNotFoundError(f"No parquet files found in {EVENTS_DIR}")
# Read all files lazily as one dask dataframe and show its partition count.
ddf = dd.read_parquet(path=parquet_files)
ddf.npartitions

## Data Preprocessing

In the preprocessing step, the values of selected events are aggregated. After creating a dataframe that contains all the features, Scikit-learn transformations are used to preprocess the categorical and numerical data.

In [None]:
# Instantiate the aggregator.
# Event values are aggregated with MEAN, grouped by encounter and event name,
# using EVENT_TIMESTAMP as the time column.
# NOTE(review): timestep_size=24 and window_duration=72 are presumably hours
# (a 72-hour window in 24-hour steps) — confirm against the Aggregator docs.
aggregator = Aggregator(
    aggfuncs={EVENT_VALUE: MEAN},
    timestamp_col=EVENT_TIMESTAMP,
    time_by=ENCOUNTER_ID,
    agg_by=[ENCOUNTER_ID, EVENT_NAME],
    timestep_size=24,
    window_duration=72,
)

In [None]:
# A function to filter data based on the time events were recorded.
# Here, the deathtime must be at least 24 hours after the last event.
def stop_bound(data: pd.DataFrame):
    """Filter rows by the gap between the ``stop`` time and the death time.

    A row is kept when its OUTCOME_DEATH value is not 1, or when its
    ``deathtime`` is more than 24 hours after its ``stop`` value. Rows with
    outcome 1 and a missing ``deathtime`` are dropped, because the
    comparison evaluates to False for them.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with ``deathtime``, ``stop`` and OUTCOME_DEATH columns.

    Returns
    -------
    pd.DataFrame
        The subset of ``data`` that satisfies the rule above.

    """
    no_death_outcome = data[OUTCOME_DEATH] != 1
    earliest_allowed_death = data["stop"] + pd.Timedelta(hours=24)
    died_late_enough = data["deathtime"] > earliest_allowed_death
    return data[died_late_enough | no_death_outcome]


# Perform aggregation on data partitions
def process_partition(partition):
    """Aggregate one partition of events into a per-encounter feature table.

    Rows flagged with ``hospital_expire_flag == 1`` but lacking a
    ``deathtime`` are discarded, the OUTCOME_DEATH column is derived from
    ``hospital_expire_flag``, the event values are aggregated, and the
    result is pivoted so that each event name becomes a column.

    Parameters
    ----------
    partition : pd.DataFrame
        One partition of the events data. It must contain the
        ``hospital_expire_flag`` and ``deathtime`` columns plus the columns
        the aggregator reads.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ENCOUNTER_ID with one column per event name that
        holds the aggregated EVENT_VALUE.

    """
    died = partition["hospital_expire_flag"] == 1
    invalid = died & partition["deathtime"].isna()
    # Take an explicit copy: assigning a new column onto a filtered slice is
    # chained assignment (SettingWithCopyWarning) and may silently write to a
    # temporary instead of the frame that is used afterwards.
    partition = partition.loc[~invalid].copy()
    partition[OUTCOME_DEATH] = (partition["hospital_expire_flag"] == 1).astype(int)
    # Aggregate events values over the window duration
    aggregated = aggregator.aggregate_values(partition, stop_bound_func=stop_bound)
    # Convert the events rows into feature columns
    aggregated = aggregated.reset_index()
    return aggregated.pivot(
        index=ENCOUNTER_ID, columns=EVENT_NAME, values=EVENT_VALUE
    )

In [None]:
# Keep only the selected events, aggregate every partition lazily, and then
# materialize the result as a single pandas dataframe.
selected_events = sorted(selected_events)
is_selected_event = ddf[EVENT_NAME].isin(selected_events)
ddf = ddf.loc[is_selected_event]
# Empty frame describing the expected output columns of process_partition.
meta_df = pd.DataFrame(columns=selected_events)
ddf = ddf.map_partitions(process_partition, meta=meta_df)
df = ddf.compute()
df

In [None]:
# Merge events data with encounters data, which can be used for slicing during
# evaluation. pandas merges default to an inner join, so only encounters
# present in both frames are kept.
df = df.merge(encounters_df, on=ENCOUNTER_ID).set_index(ENCOUNTER_ID)
# NOTE(review): the pivoted event table only carries event-value columns, so
# OUTCOME_DEATH is presumably supplied by encounters_df — confirm.
df[OUTCOME_DEATH] = df[OUTCOME_DEATH].astype(int)
df

In [None]:
# The data is heavily unbalanced: compute the ratio of label-0 rows to
# label-1 rows. The value is reused later as XGBoost's scale_pos_weight.
class_counts = df[OUTCOME_DEATH].value_counts()
class_ratio = class_counts[0] / class_counts[1]
class_ratio

In [None]:
# Use a TabularFeatures object to detect the feature types.
# SEX, admission_location and admission_type are explicitly forced to STRING
# rather than left to detection.
tab_features = TabularFeatures(
    data=df.reset_index(),
    features=features_list,
    by=ENCOUNTER_ID,
    targets=OUTCOME_DEATH,
    force_types={
        SEX: STRING,
        "admission_location": STRING,
        "admission_type": STRING,
    },
)

In [None]:
# Names of the features typed as STRING, in alphabetical order.
categorical_features = sorted(tab_features.features_by_type(STRING))
categorical_features

In [None]:
# Names of the features typed as NUMERIC, in alphabetical order.
numeric_features = sorted(tab_features.features_by_type(NUMERIC))
numeric_features

In [None]:
# Extract the positions of the features within df[features_list], the subset
# of the dataframe that is used for modeling. The column index is built once
# instead of re-slicing the dataframe for every feature.
feature_columns = df[features_list].columns
categorical_indices = [
    feature_columns.get_loc(column) for column in categorical_features
]
numeric_indices = [feature_columns.get_loc(column) for column in numeric_features]

In [None]:
# define the preprocessor
#   - numeric columns: mean imputation followed by standardization
#   - categorical columns: one-hot encoding; categories not seen during fit
#     are ignored instead of raising
# Columns are addressed by their position within df[features_list]; columns
# not listed in either group are passed through unchanged.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_indices),
        ("cat", categorical_transformer, categorical_indices),
    ],
    remainder="passthrough",
)
# fit the preprocessor for later use
# NOTE(review): the fit uses the full dataframe, i.e. it happens before the
# train/test split, so the imputation and scaling statistics include test
# rows — confirm whether the task re-fits the transforms on the train split.
preprocessor = preprocessor.fit(df[features_list])

## Dataset Creation

The dataframe is converted to a Hugging Face Dataset, which is necessary for evaluation and optional for training and inference.

In [None]:
# Build a Hugging Face Dataset from the merged dataframe.
dataset = Dataset.from_pandas(df)
# Remove leftover cache files associated with this dataset, if any.
dataset.cleanup_cache_files()
dataset

In [None]:
# Split the data into train and test subsets while preserving the same class
# ratio in both subsets. Stratification needs a ClassLabel column, hence the
# cast of the target. Only the train fraction of SPLIT_FRACTIONS is used here.
dataset = dataset.cast_column(OUTCOME_DEATH, ClassLabel(num_classes=2))
dataset = dataset.train_test_split(
    train_size=SPLIT_FRACTIONS[0], stratify_by_column=OUTCOME_DEATH, seed=42
)
dataset["train"]

## Model Creation

The CyclOps Model API is used to create models using estimators from the Scikit-learn package. The configuration of the model is based on the corresponding config files, which include the necessary parameters for instantiating the Scikit-learn estimators, as well as optional parameters for hyperparameter search.

In [None]:
mlp_name = "mlp"
config_path = join(CONFIG_ROOT, mlp_name + ".yaml")
# Read the YAML config with an explicit encoding instead of relying on the
# platform default.
with open(config_path, encoding="utf-8") as f:
    mlp_config = yaml.safe_load(f)
# Take the best-model parameters out of the config so that only the remaining
# entries are forwarded to create_model.
mlp_best_model_params = mlp_config.pop("best_model_params", None)

mlp_model = create_model(mlp_name, **mlp_config)

In [None]:
xgb_name = "xgb_classifier"
config_path = join(CONFIG_ROOT, xgb_name + ".yaml")
# Read the YAML config with an explicit encoding instead of relying on the
# platform default.
with open(config_path, encoding="utf-8") as f:
    xgb_config = yaml.safe_load(f)
# Take the entries that must not be forwarded to create_model out of the config.
xgb_best_model_params = xgb_config.pop("best_model_params", None)
xgb_fit_params = xgb_config.pop("fit_params", None)

# handle imbalanced data: weight the positive class by the
# negative-to-positive ratio computed from the label counts
xgb_config["model_params"]["scale_pos_weight"] = class_ratio

xgb_model = create_model(xgb_name, **xgb_config)

## Mortality Prediction Task

The CyclOps Task API is used to create a Mortality Prediction Task based on the available models and dataset. The task can contain multiple models that can be trained and used for prediction individually. This is particularly useful when comparing the performance of multiple models during the evaluation step.

In [None]:
# Create the task with the XGBoost model registered under its name, using
# features_list as the task features and OUTCOME_DEATH as the task target.
mortality_task = MortalityPrediction(
    {xgb_name: xgb_model}, task_features=features_list, task_target=[OUTCOME_DEATH]
)

In [None]:
# Register the MLP with the task and list the models the task knows about.
# NOTE(review): only the model name is passed, so the configured mlp_model
# object created earlier is not handed to the task — presumably add_model
# builds a model from the name alone; confirm this is intended.
mortality_task.add_model(mlp_name)
mortality_task.list_models()

In [None]:
# Display the parameters reported by the task for its models.
mortality_task.list_models_params()

### Training

The `train` method can be used for Hugging Face datasets, numpy arrays or dataframes (containing only the relevant columns). 
If the input data is a Hugging Face Dataset and not preprocessed already, a `ColumnTransformer` object can be passed as transforms.

In [None]:
# Train the XGBoost model on the train split; the fitted ColumnTransformer is
# passed as transforms because the dataset has not been preprocessed yet.
mortality_task.train(
    dataset["train"],
    model_name=xgb_name,
    transforms=preprocessor,
)

In [None]:
# Train the MLP model on the same train split with the same transforms.
mortality_task.train(
    dataset["train"],
    model_name=mlp_name,
    transforms=preprocessor,
)

### Prediction

In the prediction phase, the task object allows for a variety of data inputs, including numpy arrays, pandas dataframes, and Hugging Face Datasets.

When using a Hugging Face dataset as the input, it is possible to obtain the entire dataset with the added prediction column as the output of the predict method.

In [None]:
# Run prediction with the XGBoost model on the test split. The result is named
# after the model that actually produced it (model_name=xgb_name).
# only_predictions=False keeps the dataset columns alongside the predictions,
# as described in the text above.
ds_with_xgb_preds = mortality_task.predict(
    dataset,
    model_name=xgb_name,
    prediction_column_prefix="preds",
    transforms=preprocessor,
    only_predictions=False,
    proba=True,
    splits_mapping={"test": "test"},
)
ds_with_xgb_preds.to_pandas()

### Evaluation

Evaluation is typically performed on a Hugging Face dataset. To evaluate the models, you can also provide a slice specification to see how well they perform for different slices of data based on the feature values.

In addition to the dataset and slice specification, you need to specify the desired evaluation metrics. This can be done by providing a MetricCollection object, a list of metrics, or metric names.


In [None]:
# Slice definitions used during evaluation. Each list entry describes one
# slice as a mapping from column name to the condition on that column; an
# entry with several columns describes a compound slice.
spec_list = [
    {"sex": {"value": "M"}},  # feature value is M
    {
        "age": {
            "min_value": 18,
            "max_value": 65,
            "min_inclusive": True,
            "max_inclusive": False,
        }
    },  # feature value is between 18 and 65, inclusive of 18, exclusive of 65
    {
        "admission_type": {"value": ["EW EMER.", "DIRECT EMER.", "URGENT"]}
    },  # feature value is in the list
    {
        "admission_location": {
            "value": ["PHYSICIAN REFERRAL", "CLINIC REFERRAL", "WALK-IN/SELF REFERRAL"],
            "negate": True,
        }
    },  # feature value is NOT in the list
    {
        "dod": {"max_value": "2019-12-01", "keep_nulls": False}
    },  # possibly before COVID-19
    {
        "dod": {"max_value": "2019-12-01", "negate": True, "keep_nulls": False}
    },  # possibly during COVID-19
    {"admit_timestamp": {"month": [6, 7, 8, 9], "keep_nulls": False}},  # months 6-9
    {
        "sex": {"value": "F"},
        "race": {
            "value": [
                "BLACK/AFRICAN AMERICAN",
                "BLACK/CARIBBEAN ISLAND",
                "BLACK/CAPE VERDEAN",
                "BLACK/AFRICAN",
            ]
        },
        "age": {"min_value": 25, "max_value": 40},
    },  # compound slice
]


# create the slice functions
slice_spec = SliceSpec(spec_list)

In [None]:
# define metrics: one binary-task metric per name, grouped in one collection
metric_names = ["accuracy", "precision", "recall", "f1_score", "auroc"]
metrics = [create_metric(metric_name, task="binary") for metric_name in metric_names]
metric_collection = MetricCollection(metrics)

# run evaluation of the XGBoost model on the test split, using the slices
# defined in slice_spec; the call returns the results together with a dataset
# (presumably the test split with the prediction column added — confirm)
results, dataset_with_preds = mortality_task.evaluate(
    dataset["test"],
    metric_collection,
    model_names=xgb_name,
    transforms=preprocessor,
    prediction_column_prefix="preds",
    slice_spec=slice_spec,
    batch_size=200,
)

In [None]:
# Display the evaluation results returned by the task.
results