# PI-CAI Preprocessing for FPN-MIL

1. **Step 1 — Align modalities**: Resample ADC and HBV onto T2W grid; save `t2w.mha`, `adc_to_t2w.mha`, `hbv_to_t2w.mha` per patient.
2. **Step 2 — Labels & paths**: Load metadata CSVs, build label table (patient_id, isup, cs_pca), map each patient to fold and file paths.
3. **Step 3 — Export**: Save labels CSV for the MIL pipeline (`data/picai_labels.csv`).

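Set `INPUT_ROOT` (used by Step 1) and `DATA_ROOT` (used by Step 2) to the dataset location for your environment (Kaggle or local). Both default to the Kaggle input directory `/kaggle/input/prostate-cancer-pi-cai-dataset`.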

In [None]:
# # ============================================================
# # PI-CAI (Kaggle) — Preprocessing Step 1
# # Align modalities by resampling ADC + HBV onto T2W grid
# # Expected dataset structure:
# #   /kaggle/input/prostate-cancer-pi-cai-dataset/
# #       picai_public_images_fold0/10000/10000_..._t2w.mha
# #       picai_public_images_fold0/10000/10000_..._adc.mha
# #       picai_public_images_fold0/10000/10000_..._hbv.mha
# #   ... folds 0..4
# #
# # Output:
# #   /kaggle/working/picai_step1_aligned/foldX/patientID/
# #       t2w.mha, adc_to_t2w.mha, hbv_to_t2w.mha
# # ============================================================

# !pip -q install SimpleITK

# import os
# from pathlib import Path
# import SimpleITK as sitk

# INPUT_ROOT = Path("/kaggle/input/prostate-cancer-pi-cai-dataset")
# OUTPUT_ROOT = Path("/kaggle/working/picai_step1_aligned")
# OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# FOLDS = [0, 1, 2, 3, 4]

# def read_mha(p: Path) -> sitk.Image:
#     img = sitk.ReadImage(str(p))
#     return sitk.Cast(img, sitk.sitkFloat32)

# def resample_to_t2(moving: sitk.Image, t2: sitk.Image, interp=sitk.sitkLinear) -> sitk.Image:
#     """
#     Resample 'moving' image onto the grid of 't2' using identity transform.
#     This assumes the images already share the same physical space (often true in PI-CAI).
#     If misalignment is found later, a registration step (step 1b) should be added.
#     """
#     identity = sitk.Transform(3, sitk.sitkIdentity)
#     return sitk.Resample(moving, t2, identity, interp, 0.0, moving.GetPixelID())

# def find_modality_files(patient_dir: Path):
#     """
#     Returns paths for adc, hbv, t2w inside a patient folder.
#     Files look like: 10000_1000000_adc.mha
#     """
#     adc = next(patient_dir.glob("*_adc.mha"), None)
#     hbv = next(patient_dir.glob("*_hbv.mha"), None)
#     t2w = next(patient_dir.glob("*_t2w.mha"), None)
#     return adc, hbv, t2w

# def save(img: sitk.Image, out_path: Path):
#     out_path.parent.mkdir(parents=True, exist_ok=True)
#     sitk.WriteImage(img, str(out_path), useCompression=True)

# # ---- main loop ----
# total_patients = 0
# done_patients = 0
# skipped = 0

# # Count patients (optional)
# for fold in FOLDS:
#     fold_dir = INPUT_ROOT / f"picai_public_images_fold{fold}"
#     if fold_dir.exists():
#         total_patients += sum(1 for p in fold_dir.iterdir() if p.is_dir())

# print("Total patient folders found:", total_patients)

# for fold in FOLDS:
#     fold_dir = INPUT_ROOT / f"picai_public_images_fold{fold}"
#     if not fold_dir.exists():
#         continue

#     out_fold_dir = OUTPUT_ROOT / f"fold{fold}"
#     out_fold_dir.mkdir(parents=True, exist_ok=True)

#     patient_dirs = sorted([p for p in fold_dir.iterdir() if p.is_dir()])

#     for pi, patient_dir in enumerate(patient_dirs):
#         patient_id = patient_dir.name

#         adc_path, hbv_path, t2w_path = find_modality_files(patient_dir)
#         if adc_path is None or hbv_path is None or t2w_path is None:
#             skipped += 1
#             continue

#         # Read
#         t2w = read_mha(t2w_path)
#         adc = read_mha(adc_path)
#         hbv = read_mha(hbv_path)

#         # Resample ADC/HBV to T2W grid
#         adc_to_t2w = resample_to_t2(adc, t2w, interp=sitk.sitkLinear)
#         hbv_to_t2w = resample_to_t2(hbv, t2w, interp=sitk.sitkLinear)

#         # Save (keep simple stable names)
#         out_patient = out_fold_dir / patient_id
#         save(t2w,        out_patient / "t2w.mha")
#         save(adc_to_t2w, out_patient / "adc_to_t2w.mha")
#         save(hbv_to_t2w, out_patient / "hbv_to_t2w.mha")

#         done_patients += 1
#         if done_patients == 1 or done_patients % 50 == 0:
#             print(f"Aligned & saved {done_patients}/{total_patients}  (last: fold{fold}/{patient_id})")

# print("Done.")
# print("Saved patients:", done_patients, "| Skipped (missing files):", skipped)
# print("Output folder:", OUTPUT_ROOT)

In [None]:
# import pandas as pd
# from pathlib import Path

# DATA_ROOT = Path("/kaggle/input/prostate-cancer-pi-cai-dataset")

# # ---- CSV paths (metadata file names expected under DATA_ROOT) ----
# csv_lesion       = DATA_ROOT / "Metadata with lesion info.csv"
# csv_no_lesion    = DATA_ROOT / "Metadata without lesion info.csv"
# csv_isup         = DATA_ROOT / "Metadata(for ISUP).csv"
# csv_isup_no_les  = DATA_ROOT / "Metadata(for ISUP without lesion info).csv"

# csvs = {
#     "lesion": csv_lesion,
#     "no_lesion": csv_no_lesion,
#     "isup": csv_isup,
#     "isup_no_lesion": csv_isup_no_les,
# }

# # ---- Load and inspect ----
# dfs = {}
# for name, p in csvs.items():
#     if p.exists():
#         df = pd.read_csv(p)
#         dfs[name] = df
#         print(f"\n{name}: {p.name}")
#         print("shape:", df.shape)
#         print("columns:", list(df.columns)[:30], ("..." if len(df.columns)>30 else ""))
#     else:
#         print(f"\n{name}: MISSING -> {p}")

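# # Pick the best source for ISUP labels (prefer 'isup' if it exists).
# # NOTE(review): `or` between DataFrames raises ValueError (ambiguous truth value) whenever
# # the "isup" CSV was loaded; use
# # `dfs["isup"] if "isup" in dfs else dfs.get("isup_no_lesion")` instead.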
# df_isup_src = dfs.get("isup") or dfs.get("isup_no_lesion")
# if df_isup_src is None:
#     raise RuntimeError("No ISUP metadata CSV found. Check file names in the dataset.")

# # ---- Helper: find likely column names robustly ----
# def find_col(df, candidates):
#     cols = {c.lower(): c for c in df.columns}
#     for cand in candidates:
#         if cand.lower() in cols:
#             return cols[cand.lower()]
#     return None

# # Identify patient-id and ISUP columns (names vary across releases)
# pid_col = find_col(df_isup_src, ["patient_id", "patient", "case_id", "subject_id", "id"])
# isup_col = find_col(df_isup_src, ["isup", "isup_grade", "isup_grade_group", "grade_group"])

# if pid_col is None or isup_col is None:
#     print("\nCould not auto-detect columns.")
#     print("Columns available:", list(df_isup_src.columns))
#     raise RuntimeError("Please check which column is patient id and which column is ISUP in your CSV.")

# print("\nDetected patient id column:", pid_col)
# print("Detected ISUP column:", isup_col)

# # ---- Build label table ----
# labels = df_isup_src[[pid_col, isup_col]].copy()
# labels = labels.rename(columns={pid_col: "patient_id", isup_col: "isup"})
# labels["patient_id"] = labels["patient_id"].astype(str)
# labels["isup"] = pd.to_numeric(labels["isup"], errors="coerce")

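# # csPCa definition used in PI-CAI: ISUP >= 2.
# # NOTE: an ISUP value that failed to parse (NaN after the coercion above) compares False
# # here, so that row is labelled 0 rather than being flagged as missing.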
# labels["cs_pca"] = (labels["isup"] >= 2).astype("int")

# print("\nLabel table preview:")
# display(labels.head())

# # ---- Map each patient to its fold and file paths ----
# def find_patient_in_folds(patient_id: str):
#     for fold in range(5):
#         pdir = DATA_ROOT / f"picai_public_images_fold{fold}" / patient_id
#         if pdir.exists():
#             return fold, pdir
#     return None, None

# def modality_paths(patient_dir: Path):
#     # files look like: 10000_1000000_t2w.mha, etc.
#     t2w = next(patient_dir.glob("*_t2w.mha"), None)
#     adc = next(patient_dir.glob("*_adc.mha"), None)
#     hbv = next(patient_dir.glob("*_hbv.mha"), None)
#     return t2w, adc, hbv

# folds, t2ws, adcs, hbvs, exist_flags = [], [], [], [], []
# for pid in labels["patient_id"]:
#     fold, pdir = find_patient_in_folds(pid)
#     if pdir is None:
#         folds.append(None); t2ws.append(None); adcs.append(None); hbvs.append(None); exist_flags.append(False)
#         continue
#     t2w, adc, hbv = modality_paths(pdir)
#     folds.append(fold)
#     t2ws.append(str(t2w) if t2w else None)
#     adcs.append(str(adc) if adc else None)
#     hbvs.append(str(hbv) if hbv else None)
#     exist_flags.append(bool(t2w and adc and hbv))

# labels["fold"] = folds
# labels["t2w_path"] = t2ws
# labels["adc_path"] = adcs
# labels["hbv_path"] = hbvs
# labels["has_all_modalities"] = exist_flags

# print("\nPatients with all 3 modalities:", labels["has_all_modalities"].sum(), "/", len(labels))
# print("Patients missing folder or modality files:", (~labels["has_all_modalities"]).sum())

# display(labels.sample(5))
