# Prepare CheXpert+ Dataset

This notebook converts the CheXpert+ tables into the MIMIC-CXR-style format expected by our code. The one exception is that we leave the original image path structure intact and include logic to handle loading images from the source. For the data tables, we require the following:
* Across all tables, we require patient, study, and dicom IDs to be globally unique.
* For the metadata table, we use columns:
    * `subject_id`
    * `study_id`
    * `dicom_id` - **unique primary key**
    * `ViewPosition` - column denoting the X-ray view
* For the splits table, we use columns:
    * `subject_id`
    * `study_id`
    * `dicom_id` - **unique primary key**
    * `split` - where the values for split should be one of:
        * `train`
        * `validate`
        * `test`
* For the report table, we use columns:
    * `study_id` - **unique primary key**, only 1 report per study
    * `findings`
    * `impression`

In [None]:
import os
import pandas as pd

# Directory that the generated CSV tables are written to.
output_dir = "/opt/gpudata/chexpertplus"

# Load the CheXpert+ table that the rest of the notebook reshapes.
df = pd.read_csv(filepath_or_buffer="/opt/gpudata/chexpertplus/df_chexpert_plus_240401.csv")

In [None]:
# Build globally unique study and dicom IDs out of the "/"-separated
# components of each image path (components 1, 2 and 3 are used).
ids = df["path_to_image"].str.split("/")
file_stem = ids.str[3].str.split(".").str[0]  # remove file ext

# Each ID is prefixed with its parent ID so it stays unique across the table.
df["subject_id"] = ids.str[1]
df["study_id"] = df["subject_id"] + "_" + ids.str[2]
df["dicom_id"] = df["study_id"] + "_" + file_stem

In [None]:
# The image for patient32368 is corrupted, so drop every row of that subject
# and renumber the remaining rows from zero.
corrupted_subject = "patient32368"
keep_rows = df["subject_id"].ne(corrupted_subject)
df = df.loc[keep_rows].reset_index(drop=True)

In [None]:
# The provided validation split is repurposed as the test split; a new
# validation split is carved out of the training subjects instead.
is_train = df["split"].eq("train")
train_subjects = df.loc[is_train, "subject_id"].drop_duplicates()

# Fixed random_state so the same 400 training subjects are drawn on every run.
new_validate = set(train_subjects.sample(n=400, replace=False, random_state=42))

# Relabel the provided "valid" rows as "test" first ...
df["split"] = df["split"].replace({"valid": "test"})

# ... then move every row of a sampled subject into "validate".
in_new_validate = df["subject_id"].isin(new_validate)
df.loc[in_new_validate, "split"] = "validate"

In [None]:
# ID columns shared by the split and metadata tables.
id_columns = ["subject_id", "study_id", "dicom_id"]

# Split table: the ID columns plus the split label.
split_df = df[id_columns + ["split"]]

# Metadata table: "ap_pa" is exposed under the name "ViewPosition".
metadata_df = df[id_columns + ["frontal_lateral", "ap_pa"]]
metadata_df = metadata_df.rename(columns={"ap_pa": "ViewPosition"})

# Report table: the report sections are renamed to "findings" / "impression".
report_columns = {
    "section_findings": "findings",
    "section_impression": "impression",
}
report_df = df[["study_id", *report_columns]].rename(columns=report_columns)

In [None]:
# ViewPosition is derived from the ap_pa column, which is only used for frontal images.
# Wherever it is missing, fall back to the more general frontal_lateral value.
view_position = metadata_df["ViewPosition"]
metadata_df["ViewPosition"] = view_position.mask(view_position.isna(), metadata_df["frontal_lateral"])

In [None]:
# CheXpert+ de-identification created near-identical duplicate reports for a single study
# see: https://github.com/Stanford-AIMI/chexpert-plus/issues/13
# Keep one report per study: order by study_id, then retain the first row of each study.
# NOTE(review): sort_values runs with its default sort kind, which pandas does not
# document as stable, so the retained row may not be the first one in file order - confirm
# whether that matters before relying on which duplicate survives.
reports_by_study = report_df.sort_values("study_id")
report_df = reports_by_study.drop_duplicates("study_id", keep="first").reset_index(drop=True)

# Order the two dicom-level tables by dicom_id and renumber their rows from zero.
split_df, metadata_df = (
    table.sort_values("dicom_id").reset_index(drop=True)
    for table in (split_df, metadata_df)
)

In [None]:
# Sanity-check the primary key of every table before anything is written:
# study_id for the reports, dicom_id for the split and metadata tables.
for table, key in ((report_df, "study_id"), (split_df, "dicom_id"), (metadata_df, "dicom_id")):
    assert table[key].is_unique

In [None]:
# Write each table into output_dir as a CSV file, leaving out the row index.
for filename, table in (
    ("split.csv", split_df),
    ("metadata.csv", metadata_df),
    ("report.csv", report_df),
):
    table.to_csv(os.path.join(output_dir, filename), index=False)