# PI-CAI preprocessing on Kaggle (picai_prep)

Runs the **official** [picai_prep](https://github.com/DIAGNijmegen/picai_prep) pipeline: **MHA → nnU-Net raw** (resampled T2W/ADC/HBV to shared voxel spacing).

**Run order:** 1) Install `picai_prep` with pip 2) Paths 3) **PREPROCESSING** (the cell that runs `MHA2nnUNetConverter`) 4) View preprocessed images.

**Add dataset:** [Prostate Cancer (PI-CAI) Dataset](https://www.kaggle.com/datasets/varshithpsingh/prostate-cancer-pi-cai-dataset). **Output:** set `OUTPUT_ROOT` in the Paths cell (default: `/kaggle/working`).

In [None]:
# Install official PI-CAI preprocessing
!pip install -q picai_prep

In [None]:
import os
from pathlib import Path

# Paths — set these for your environment
KAGGLE_INPUT = Path("/kaggle/input/prostate-cancer-pi-cai-dataset")  # PI-CAI dataset (add as input)

# Where to save preprocessed output (default: Kaggle working so you can Save Version)
# For local runs use e.g. Path("./picai_preprocessed")
OUTPUT_ROOT = Path("/kaggle/working")

# Folds to process; fold N is read from KAGGLE_INPUT / "picai_public_images_fold<N>"
FOLDS = [3, 4]

# Check dataset is present — fail early with an actionable message
if not KAGGLE_INPUT.exists():
    raise FileNotFoundError(
        f"Dataset not found at {KAGGLE_INPUT}. "
        "Add the PI-CAI dataset (e.g. 'Prostate Cancer (PI-CAI) Dataset') to this notebook."
    )

# Report, per configured fold, how many patient directories are available
for f in FOLDS:
    fold_dir = KAGGLE_INPUT / f"picai_public_images_fold{f}"
    if not fold_dir.is_dir():
        print(f"fold{f}: MISSING")
        continue
    # Count directories only, so stray files are not reported as patients
    n = sum(1 for entry in fold_dir.iterdir() if entry.is_dir())
    print(f"fold{f}: {n} patient dirs")

OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
print("\nInput:", KAGGLE_INPUT)
print("Output:", OUTPUT_ROOT.resolve())

In [None]:
# Diagnostic: show input structure (picai_prep expects patient_id/patient_id_study_id_{t2w,adc,hbv}.mha)
required_suffixes = ("_t2w.mha", "_adc.mha", "_hbv.mha")

# Inspect the first configured fold that is actually present in the input dataset
sample_fold = next(
    (
        candidate
        for candidate in (KAGGLE_INPUT / f"picai_public_images_fold{f}" for f in FOLDS)
        if candidate.exists()
    ),
    None,
)

if sample_fold is None:
    print(
        f"None of the configured folds {FOLDS} were found. Listing input root:",
        list(KAGGLE_INPUT.iterdir())[:20],
    )
else:
    patients = sorted(p.name for p in sample_fold.iterdir() if p.is_dir())[:3]
    print(f"Sample patient dirs in {sample_fold.name}:", patients)
    if not patients:
        # Without this guard the file check below would run on an undefined list
        print("  -> No patient directories found in", sample_fold)
    else:
        pid = patients[0]
        files = sorted(entry.name for entry in (sample_fold / pid).iterdir())
        print(f"  {pid}/ files:", files)
        # The case is usable when every modality has at least one matching file
        has_all = all(
            any(name.lower().endswith(suffix) for name in files)
            for suffix in required_suffixes
        )
        print("  -> OK, structure matches picai_prep (t2w, adc, hbv present)" if has_all else "  -> Missing t2w/adc/hbv .mha")

## PREPROCESSING — Run this cell

**Run the cell below** to convert MHA → nnU-Net raw (resampled T2W/ADC/HBV). Output is saved to `OUTPUT_ROOT` (set in the Paths cell). Then run "View preprocessed images" to see the result.

In [None]:
import json
from picai_prep import MHA2nnUNetConverter
from picai_prep.examples.mha2nnunet.picai_archive import generate_mha2nnunet_settings

# Every fold is handled on its own: it gets a dedicated settings file,
# task name and output folder under OUTPUT_ROOT.
for fold in FOLDS:
    fold_name = f"picai_public_images_fold{fold}"
    task_name = f"Task2201_picai_fold{fold}"

    input_dir = KAGGLE_INPUT / fold_name
    if not input_dir.exists():
        print(f"Skip fold {fold}: {input_dir} not found")
        continue

    settings_path = OUTPUT_ROOT / f"mha2nnunet_settings_fold{fold}.json"
    output_dir = OUTPUT_ROOT / f"nnUNet_raw_data_fold{fold}"
    output_dir.mkdir(parents=True, exist_ok=True)
    task_dir = output_dir / task_name

    # Step 1: let picai_prep write the conversion settings for this fold
    # (no lesion masks are required for preprocessing, hence annotations_dir=None)
    generate_mha2nnunet_settings(
        archive_dir=str(input_dir),
        output_path=str(settings_path),
        annotations_dir=None,
        task=task_name,
    )

    # Step 2: drop every "annotation_path" entry and write the settings back,
    # so conversion runs without lesion masks
    settings = json.loads(settings_path.read_text())
    for entry in settings["archive"]:
        entry.pop("annotation_path", None)
    settings_path.write_text(json.dumps(settings, indent=4))

    n_cases = len(settings["archive"])
    print(f"Fold {fold}: {n_cases} cases (no annotations)")
    if not n_cases:
        print(f"  WARNING: No cases in archive for fold {fold}. Check input dir: {input_dir}")
        continue

    # Step 3: make sure imagesTr exists so the converter can write into it
    (task_dir / "imagesTr").mkdir(parents=True, exist_ok=True)

    # Step 4: convert MHA → nnU-Net raw
    archive = MHA2nnUNetConverter(
        scans_dir=str(input_dir),
        output_dir=str(output_dir),
        mha2nnunet_settings=str(settings_path),
        annotations_dir=None,
    )
    archive.convert()

    # The task folder must exist before the dataset description is written
    task_dir.mkdir(parents=True, exist_ok=True)
    archive.create_dataset_json()

    print(f"Done fold {fold} -> {output_dir}")

In [None]:
# Show conversion log: FIRST part (errors appear here) and LAST part
from pathlib import Path

HEAD_LINES = 120  # lines shown from the start of the log
TAIL_LINES = 25   # lines shown from the end of the log

# Fall back to the Kaggle default when the Paths cell has not been run
root = OUTPUT_ROOT if 'OUTPUT_ROOT' in globals() else Path("/kaggle/working")
logs = sorted(root.glob("**/picai_prep_*.log"))
if logs:
    # Path order would always favour the last folder name; pick the log that
    # was written most recently instead.
    latest_log = max(logs, key=lambda p: p.stat().st_mtime)
    print("Log file:", latest_log)
    # errors="replace" keeps the cell usable if the log holds undecodable bytes
    with open(latest_log, errors="replace") as f:
        lines = f.readlines()
    if len(lines) <= HEAD_LINES + TAIL_LINES:
        # Short log: print it once instead of overlapping head and tail sections
        print(f"=== Full log, {len(lines)} lines (look for 'CASE' and 'Error:' for why conversion failed) ===")
        print("".join(lines))
    else:
        print("=== First 120 lines (look for 'CASE' and 'Error:' for why conversion failed) ===")
        print("".join(lines[:HEAD_LINES]))
        print("...")
        print("=== Last 25 lines ===")
        print("".join(lines[-TAIL_LINES:]))
else:
    print("No picai_prep log found.")

### If imagesTr is empty: check the conversion log

Run the cell above to see why conversion failed (it prints the beginning and the end of the `picai_prep_*.log` conversion log).

## View preprocessed images

Load one preprocessed case and display axial slices for T2W, ADC, and HBV side by side.

In [None]:
# View preprocessed T2W / ADC / HBV slices (run after preprocessing cells above)
try:
    import nibabel as nib
except ImportError:
    # Install into the interpreter that runs this kernel, then retry the import
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "nibabel"])
    import nibabel as nib
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Defaults are applied only when the Paths cell has not been run, so values
# configured there are never overwritten.
if 'OUTPUT_ROOT' not in globals():
    OUTPUT_ROOT = Path("/kaggle/working")
if 'FOLDS' not in globals():
    FOLDS = [3, 4]

FIRST_CHANNEL_SUFFIX = "_0000.nii.gz"
# nnU-Net channel suffix per modality (0000=T2W, 0001=ADC, 0002=HBV)
MODALITIES = (("T2W", "0000"), ("ADC", "0001"), ("HBV", "0002"))


def task_location(fold):
    """Return ``(task_name, task_path)`` of a fold's task folder under OUTPUT_ROOT."""
    task_name = f"Task2201_picai_fold{fold}"
    return task_name, OUTPUT_ROOT / f"nnUNet_raw_data_fold{fold}" / task_name


def find_first_case(folds):
    """Locate the first preprocessed case among ``folds``.

    ``imagesTr`` is searched first; if it holds no ``*_0000.nii.gz`` file the
    whole task folder is searched recursively. Candidates are sorted so the
    same case is chosen on every run.

    Returns ``(case_dir, base)`` — the directory holding the files and the case
    name without channel suffix — or ``(None, None)`` when nothing is found.
    """
    for fold in folds:
        task_name, task_path = task_location(fold)
        if not task_path.exists():
            continue
        images_tr = task_path / "imagesTr"
        hits = sorted(images_tr.glob("*" + FIRST_CHANNEL_SUFFIX)) if images_tr.exists() else []
        if not hits:
            hits = sorted(task_path.rglob("*" + FIRST_CHANNEL_SUFFIX))
        if hits:
            first = hits[0]
            case_name = first.name[: -len(FIRST_CHANNEL_SUFFIX)]
            print(f"Showing case: {task_name} / {first.parent.relative_to(task_path)} / {case_name}")
            return first.parent, case_name
    return None, None


def take_slice(vol, axis, idx):
    """Return the 2D slice ``idx`` of the 3D volume ``vol`` along ``axis``."""
    index = [slice(None)] * vol.ndim
    index[axis] = idx
    return vol[tuple(index)]


case_dir, base = find_first_case(FOLDS)

if case_dir is None or base is None:
    print("No preprocessed case found.")
    print("  -> Run the PREPROCESSING cell above, then re-run this cell.")
    # Debug: show what is inside each configured task folder
    found_task_dir = False
    for fold in FOLDS:
        task_name, tp = task_location(fold)
        if not tp.exists():
            continue
        found_task_dir = True
        print(f"  -> Inside {task_name}:", [x.name for x in tp.iterdir()])
        imtr = tp / "imagesTr"
        if imtr.exists():
            files = sorted(imtr.glob("*.nii.gz"))[:10]
            print("  -> imagesTr (first 10):", [f.name for f in files])
        else:
            print("  -> imagesTr/ does not exist (conversion may have failed; check picai_prep_*.log)")
    if not found_task_dir and OUTPUT_ROOT.exists():
        # No task folder at all: list whatever nnUNet output folders exist
        for p in sorted(OUTPUT_ROOT.iterdir()):
            if p.is_dir() and "nnUNet" in p.name:
                sub = [x.name for x in p.iterdir()][:5]
                print(f"     {p.name}/  {sub}")
else:
    modality_paths = [(name, case_dir / f"{base}_{channel}.nii.gz") for name, channel in MODALITIES]
    missing = [path.name for _, path in modality_paths if not path.exists()]
    if missing:
        # Report an incomplete case clearly instead of failing inside nib.load
        print(f"Case {base} is incomplete; missing files: {missing}")
        print("  -> Check the picai_prep_*.log for conversion errors.")
    else:
        # Load the 3 modalities (nnU-Net: 0000=T2W, 0001=ADC, 0002=HBV)
        mods = [(name, nib.load(str(path)).get_fdata()) for name, path in modality_paths]
        t2w = mods[0][1]

        # Axial = slice dimension (usually the one with fewest slices in prostate MRI)
        slice_axis = int(np.argmin(t2w.shape))
        n_slices = t2w.shape[slice_axis]
        slice_idx = n_slices // 2
        # Middle slice and its neighbours 3 slices away, clamped to the valid range
        slices_to_show = [
            max(0, min(s, n_slices - 1))
            for s in (slice_idx - 3, slice_idx, slice_idx + 3)
        ]

        fig, axes = plt.subplots(len(slices_to_show), len(mods), figsize=(10, 10))
        for col, (name, vol) in enumerate(mods):
            for row, si in enumerate(slices_to_show):
                ax = axes[row, col]
                ax.imshow(take_slice(vol, slice_axis, si).T, cmap="gray", origin="lower")
                # Hide ticks only: set_axis_off() would also hide the row label
                ax.set_xticks([])
                ax.set_yticks([])
                if row == 0:
                    ax.set_title(name)
                if col == 0:
                    ax.set_ylabel(f"slice {si}")
        fig.suptitle("Preprocessed axial slices (T2W, ADC, HBV)")
        fig.tight_layout()
        plt.show()

## Optional: Zip output for download

Kaggle lets you download `/kaggle/working/`. Zipping reduces the number of files if you download manually.

In [None]:
# # Zip each fold's nnU-Net raw data (optional) — uses Python stdlib
# import shutil
# from pathlib import Path
# if 'OUTPUT_ROOT' not in globals(): OUTPUT_ROOT = Path(".")

# for fold in FOLDS:
#     src = OUTPUT_ROOT / f"nnUNet_raw_data_fold{fold}"
#     if not src.exists():
#         continue
#     zip_path = OUTPUT_ROOT / f"nnUNet_raw_data_fold{fold}.zip"
#     shutil.make_archive(str(zip_path).replace(".zip", ""), "zip", OUTPUT_ROOT, src.name)
#     print(f"Created {zip_path.name}")

## Summary

- **Preprocessed data:** `nnUNet_raw_data_fold3/` and `nnUNet_raw_data_fold4/` under `OUTPUT_ROOT`.
- Each contains resampled T2W, ADC, HBV in nnU-Net raw format (one NIfTI file per modality per case: `_0000` = T2W, `_0001` = ADC, `_0002` = HBV).
- **To use as input in another notebook:** On Kaggle, set `OUTPUT_ROOT = Path("/kaggle/working")`, run preprocessing, then click **Save Version** (or create a new Dataset from the output). The saved output becomes a dataset you can add as **Input** to any notebook.
- Use these paths in your FPN-MIL pipeline (e.g. load NIfTIs and extract 2D slices/patches) or for nnU-Net/picai_baseline training.