### View Projection Metadata
Download the projection metadata table from this Kaggle dataset, in which the author manually annotated every image with its imaging view (frontal or lateral), and save it to the path given by `src_csv` below: https://www.kaggle.com/datasets/raddar/chest-xrays-indiana-university

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# --- Inputs -----------------------------------------------------------------
# Directory of PNG images and directory of per-study XML reports, plus the
# projection (view) annotation table that is read into `df`.
image_dir = "/opt/gpudata/openi/pngs"
report_dir = "/opt/gpudata/openi/reports"
src_csv = "/opt/gpudata/openi/IU-xray-projections.csv"

# --- Outputs ----------------------------------------------------------------
# CSV files written by the final cell of this notebook.
metadata_csv = "/opt/gpudata/openi/metadata.csv"
split_csv = "/opt/gpudata/openi/split.csv"
report_csv = "/opt/gpudata/openi/report.csv"

In [None]:
# Load the per-image projection table. Sorting by (uid, filename) fixes the row
# order so that later steps do not depend on the order of rows in the CSV.
df = pd.read_csv(src_csv)
df = df.sort_values(["uid", "filename"])

# Rewrite the table's filenames: prepend "CXR", then special-case names
# starting "CXR1_IM" to "CXR1_1_IM".
# NOTE(review): presumably this matches the naming of the PNGs on disk; the
# existence check in the next cell confirms it.
# regex=False makes the replacement a literal string match on every pandas
# version (before pandas 2.0 the default was regex=True).
df["filename"] = "CXR" + df["filename"]
df["filename"] = df["filename"].str.replace("CXR1_IM", "CXR1_1_IM", regex=False)

# as described in paper, only 1 study per patient: https://pmc.ncbi.nlm.nih.gov/articles/PMC5009925/
df["subject_id"] = "patient_" + df["uid"].astype(str)
# maybe the second identifier in the filename is the study ID? (e.g. 5678 in CXR1234_IM-5678-9999.png)
# doesn't really matter, just using the same patient ID for simplicity
df["study_id"] = "study_" + df["uid"].astype(str)
# Strip the ".dcm.png" extension to get the image identifier. The match must be
# literal: as a regex, each "." would match any character.
df["dicom_id"] = df["filename"].str.replace(".dcm.png", "", regex=False)
# Expose the annotated projection under the column name "ViewPosition".
df["ViewPosition"] = df["projection"]

In [None]:
# Sanity check: every image referenced by the table must exist in `image_dir`
# as "<dicom_id>.png". All missing paths are collected first so that a single
# run reports the whole problem instead of stopping at the first missing file.
# An explicit exception is used rather than `assert`, which is skipped when
# Python runs with -O.
missing_images = []
for fname in df["dicom_id"]:
    fpath = os.path.join(image_dir, fname + ".png")
    if not os.path.exists(fpath):
        missing_images.append(fpath)

if missing_images:
    raise FileNotFoundError(
        f"{len(missing_images)} image file(s) are missing, e.g. {missing_images[:5]}"
    )

In [None]:
# Parse one XML report per unique uid into a row of {section name: section text}.
# Section names come from the "Label" attribute of each <AbstractText> tag,
# lower-cased; each row also carries the matching "study_<uid>" identifier.
data = []
for id_ in tqdm(df["uid"].drop_duplicates()):
    fpath = os.path.join(report_dir, f"{id_}.xml")
    # Open in binary mode so the XML parser detects the document's encoding
    # itself instead of relying on the platform's default text encoding.
    # A missing report raises FileNotFoundError naming the path.
    with open(fpath, "rb") as f:
        soup = BeautifulSoup(f, features="xml")
    tags = soup.find_all("AbstractText")
    section_names = [t.attrs["Label"].lower() for t in tags]
    section_texts = [t.text for t in tags]
    # Section names become dict keys below, so a duplicate would silently
    # overwrite an earlier section; fail loudly instead.
    if len(section_names) != len(set(section_names)):
        raise ValueError(f"duplicate report sections in {fpath}: {section_names}")
    datum = dict(zip(section_names, section_texts))
    datum["study_id"] = f"study_{id_}"
    data.append(datum)
report_df = pd.DataFrame(data)

In [None]:
# Hold out 20% of the unique uids with a fixed seed, then cut the held-out set
# in two: the first half becomes the test set, the remainder the validation set.
val_test_ids = (
    df["uid"]
    .drop_duplicates()
    .sample(frac=0.2, replace=False, random_state=42)
)
n_test = len(val_test_ids) // 2
test_ids = val_test_ids.iloc[:n_test]
val_ids = val_test_ids.iloc[n_test:]

# Splitting is done on uid, so all rows sharing a uid land in the same split.
# Rows default to "train" and are then relabelled by membership.
in_validate = df["uid"].isin(val_ids)
in_test = df["uid"].isin(test_ids)
df["split"] = "train"
df.loc[in_validate, "split"] = "validate"
df.loc[in_test, "split"] = "test"

In [None]:
# Display the number of rows of `df` assigned to each split.
df["split"].value_counts()

In [None]:
# Identifier columns shared by the metadata and split tables.
id_columns = ["subject_id", "study_id", "dicom_id"]

# Per-image view position and per-image split assignment, as independent copies.
metadata_df = df[id_columns + ["ViewPosition"]].copy()
split_df = df[id_columns + ["split"]].copy()

# Keep only the study identifier and the four report sections, in this order.
report_columns = ["study_id", "comparison", "indication", "findings", "impression"]
report_df = report_df[report_columns]

In [None]:
# Write the three tables to their output paths, omitting the DataFrame index.
for frame, out_path in (
    (metadata_df, metadata_csv),
    (split_df, split_csv),
    (report_df, report_csv),
):
    frame.to_csv(out_path, index=False)