# NIHCXR Synthetic Drift - Categorical Shift

## Load Libraries

In [2]:
from cyclops.monitor import (
    Detector,
    Experimenter,
    Reductor,
    SyntheticShiftApplicator,
    TSTester,
)
from cyclops.monitor.plotter import plot_drift_samples_pval
from cyclops.monitor.utils import Loader

## Query Data

In [1]:
import os

import numpy as np
import pandas as pd
from datasets import Dataset, Image

from cyclops.monitor.utils import nihcxr_preprocess

In [2]:
def nihcxr_preprocess(df: pd.DataFrame, nihcxr_dir: str) -> pd.DataFrame:
    """Preprocess the NIHCXR dataframe.

    Add an ``image`` column holding the path to each image file and append
    one-hot encoded pathology columns derived from the ``Finding Labels``
    column.

    Args:
        df (pd.DataFrame): NIHCXR dataframe. Must contain the ``Image Index``
            and ``Finding Labels`` columns. It is left unmodified.
        nihcxr_dir (str): Root directory of the NIHCXR data; image paths are
            built as ``<nihcxr_dir>/images/<Image Index>``.

    Returns:
        pd.DataFrame: A new, pre-processed NIHCXR dataframe.
    """
    # Add the path column on a new frame so the caller's dataframe is not
    # mutated as a side effect.
    df = df.assign(
        image=df["Image Index"].apply(
            lambda x: os.path.join(nihcxr_dir, "images", x)
        )
    )

    # Create one-hot encoded pathologies; multiple findings in one cell are
    # separated by "|".
    pathologies = df["Finding Labels"].str.get_dummies(sep="|")

    # Append the one-hot encoded pathologies as extra columns.
    return pd.concat([df, pathologies], axis=1)


# Root directory of the NIHCXR data on disk.
nihcxr_dir = "/home/akore/NIHCXR"

# Read the Data_Entry_2017.csv table and run it through the preprocessing step.
data_entry_csv = os.path.join(nihcxr_dir, "Data_Entry_2017.csv")
df = nihcxr_preprocess(pd.read_csv(data_entry_csv), nihcxr_dir)

# Build a `datasets.Dataset` from the dataframe (dropping the pandas index)
# and cast its "image" column to the `Image` feature.
nih_ds = Dataset.from_pandas(df, preserve_index=False).cast_column("image", Image())

In [21]:
# Number of entries in the "Patient Gender" column.
gender_column = nih_ds["Patient Gender"]
len(gender_column)

112120

In [22]:
# Indexing a column of `nih_ds` returned a plain Python list here, so calling
# `.isin` on it directly failed with
# "AttributeError: 'list' object has no attribute 'isin'".
# Wrap the column in a pandas Series to get a boolean mask instead.
pd.Series(nih_ds["Patient Gender"]).isin(["M"])

In [28]:
# Keep only the rows whose "Patient Age" is one of 30, 31, ..., 35.
# The filtered dataset is only displayed; it is not assigned to a variable.
target_ages = [30, 31, 32, 33, 34, 35]
nih_ds.filter(
    lambda batch: [age in target_ages for age in batch["Patient Age"]],
    batched=True,
)

  0%|          | 0/113 [00:00<?, ?ba/s]

Dataset({
    features: ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11', 'image', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax'],
    num_rows: 10102
})

In [8]:
from cyclops.evaluate.slicing import SlicingConfig

In [None]:
# define the slices
# Range bounds use the `min_value` / `max_value` keys, matching the
# "Patient Age" slice spec used in the executed SlicingConfig cell of this
# notebook (plain `min` / `max` were inconsistent with it).
feature_values = [
    {"Patient Gender": {"value": "M"}},
    {"Patient Gender": {"value": "F"}},
    {"Patient Age": {"min_value": 25, "max_value": 40}},
    {"Patient Age": {"min_value": 65}},
    {"View Position": {"value": "PA"}},
]

# create the slice functions
slice_config = SlicingConfig(feature_values=feature_values)

In [64]:
# Slice spec for "Patient Age" between 25 and 40, with both bounds inclusive.
age_range = {
    "min_value": 25,
    "max_value": 40,
    "min_inclusive": True,
    "max_inclusive": True,
}
s = SlicingConfig(feature_values=[{"Patient Age": age_range}])

# Apply each generated slice function to the dataset as a batched filter.
# `ds` is overwritten on every iteration, so it ends up holding the last slice.
for slice_name, slice_fn in s.get_slices().items():
    ds = nih_ds.filter(slice_fn, batched=True)

  0%|          | 0/113 [00:00<?, ?ba/s]

## Initialize Reductor, Tester & Detector

In [None]:
# Reductor configured with the "TAE_txrv_CNN" method.
reductor = Reductor(dr_method="TAE_txrv_CNN")

# Tester configured with the "mmd" method.
tester = TSTester(tester_method="mmd")

# Combine both into the detector and fit it on the full NIH dataset, showing
# the Loader message while the fit runs.
detector = Detector(reductor=reductor, tester=tester)
with Loader("Initializing the detector..."):
    detector.fit(nih_ds, progress=False)

## Setup Baseline Experiment

In [30]:
import xgboost

In [31]:
# Hyperparameters for the XGBoost classifier, collected in one place.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}

# Build the classifier from the parameters above.
xgb = xgboost.XGBClassifier(**xgb_params)

In [33]:
# Fit on random data: a 100 x 100 array of uniform values and 100 random 0/1 labels.
train_features = np.random.rand(100, 100)
train_labels = np.random.randint(0, 2, 100)
xgb.fit(train_features, train_labels)

In [38]:
# Predict class probabilities for a random 5 x 100 input array.
query_features = np.random.rand(5, 100)
xgb.predict_proba(query_features)

array([[0.25060946, 0.74939054],
       [0.8712652 , 0.1287348 ],
       [0.34347236, 0.65652764],
       [0.9052336 , 0.09476641],
       [0.50999343, 0.4900066 ]], dtype=float32)

In [None]:
# Baseline experiment: built with the fitted detector and no shift applicator.
experiment_type = "sensitivity_test"
baseline_experiment = Experimenter(experiment_type, detector=detector)

## Setup Drift Experiments (Categorical Shift)

In [None]:
# Each shift is described by a (shift type, column, target category) triple;
# the three lists below are read in lockstep.
shift_type = ["categorical_shift"] * 3
cat_col = ["gender", "view", "age"]
target_categories = ["M", "PA", "18-35"]

# One shift applicator per triple.
shiftapplicators = [
    SyntheticShiftApplicator(
        shift_type=kind,
        categorical_column=column,
        target_category=category,
    )
    for kind, column, category in zip(shift_type, cat_col, target_categories)
]

# One experiment per shift applicator, all sharing the same detector.
experiments = [
    Experimenter(
        "sensitivity_test",
        detector=detector,
        shiftapplicator=applicator,
    )
    for applicator in shiftapplicators
]

## Run Experiments

In [None]:
# NOTE(review): `dataset`, `metadata` and `metadata_mapping` are not defined in
# any cell visible in this notebook — confirm where they are meant to come from.
run_args = (dataset, metadata, metadata_mapping)

# Run the baseline first, then every drift experiment, with the same arguments.
baseline_results = baseline_experiment.run(*run_args)
drift_results = [experiment.run(*run_args) for experiment in experiments]

## Gather Results

In [None]:
# Collect all results under readable labels: "baseline" plus one
# "<column>: <target category>" entry per drift experiment.
results_dict = {"baseline": baseline_results}
for itr, result in enumerate(drift_results):
    label = f"{cat_col[itr]}: {target_categories[itr]}"
    results_dict[label] = result

## Plot Experimental Results

In [None]:
# Plot the collected results.
# NOTE(review): 0.05 is presumably the p-value threshold used by the plot —
# confirm against the plotter's signature.
pval_threshold = 0.05
plot_drift_samples_pval(results_dict, pval_threshold)