In [10]:
import shutil
from pathlib import Path
import pandas as pd
import os
import re
# ==== CONFIGURE THESE PATHS ====
# Dataset root, expressed relative to the current working directory (plain str).
datasets_path = os.path.join(os.path.pardir, "an2dl2526c2")
# Image patches sit directly in BASE_DIR; their masks are in its "masks" subfolder.
BASE_DIR = Path(datasets_path, "preprocessing_results", "train_patches")
IMAGES_DIR = BASE_DIR
MASKS_DIR = BASE_DIR / "masks"
# CSV listing image names and labels (plain str path).
CSV_PATH = os.path.join(datasets_path, "train_labels.csv")
# Destination root for the per-label folders.
OUTPUT_DIR = Path("temp") / "by_label"

In [11]:
def sanitize_label_for_folder(label: str) -> str:
    """
    Build a filesystem-friendly folder name from a class label.

    Surrounding whitespace is trimmed first; then every run of characters
    other than ASCII letters, digits, '_', '.' and '-' is collapsed into a
    single underscore.

    Examples:
        'HER2(+)'   -> 'HER2_'
        'Luminal A' -> 'Luminal_A'
    """
    trimmed = label.strip()
    # One or more consecutive characters outside the allowed set.
    unsafe_run = re.compile(r"[^A-Za-z0-9_.-]+")
    return unsafe_run.sub("_", trimmed)

def run_split():
    """
    Copy every image patch (and its mask, when present) into a per-label tree.

    Reads CSV_PATH as a header-less, two-column file (image name, label).
    For each row, IMAGES_DIR is searched for ``<stem>_*.png`` patches, which
    are copied to ``OUTPUT_DIR/<sanitized label>/images``. The matching mask
    (the first "img_" in the patch file name replaced by "mask_") is looked
    up in MASKS_DIR and copied to ``OUTPUT_DIR/<sanitized label>/masks``.

    Rows with a missing/empty image name or label are skipped with a warning
    instead of being filed under a bogus "nan" name. Other problems (no
    patches for a row, missing mask) are also reported via printed warnings.

    Returns:
        None. Progress and problems are reported on stdout.
    """
    # CSV assumed like:
    # img_0001.png,HER2(+)
    # img_0002.png,Luminal A
    # dtype=str keeps names and labels exactly as written in the file
    # (no numeric type inference on either column).
    df = pd.read_csv(
        CSV_PATH,
        header=None,
        names=["csv_image_name", "label"],
        dtype=str,
    )

    for raw_name, raw_label in zip(df["csv_image_name"], df["label"]):
        # Empty cells are parsed as NaN; str(NaN) would turn into the text "nan".
        csv_img_name = "" if pd.isna(raw_name) else str(raw_name).strip()   # e.g. "img_0001.png"
        label_raw = "" if pd.isna(raw_label) else str(raw_label).strip()    # e.g. "HER2(+)"
        if not csv_img_name or not label_raw:
            print(f"[WARNING] Skipping CSV row with missing image name or label: {raw_name!r}, {raw_label!r}")
            continue

        base_stem = Path(csv_img_name).stem                 # "img_0001"

        # Find ALL image patches for this base:
        # e.g. img_0001_0001.png, img_0001_0002.png, img_0001_px.png, ...
        patch_images = sorted(IMAGES_DIR.glob(f"{base_stem}_*.png"))

        if not patch_images:
            print(f"[WARNING] No patches found for base {base_stem}")
            continue

        label_folder = sanitize_label_for_folder(label_raw)
        label_dir = OUTPUT_DIR / label_folder
        label_img_dir = label_dir / "images"
        label_mask_dir = label_dir / "masks"

        label_img_dir.mkdir(parents=True, exist_ok=True)
        label_mask_dir.mkdir(parents=True, exist_ok=True)

        for img_path in patch_images:
            # Derive corresponding mask name:
            # img_0001_abc.png -> mask_0001_abc.png
            mask_name = img_path.name.replace("img_", "mask_", 1)
            mask_path = MASKS_DIR / mask_name

            # ---- Copy image patch ----
            # The glob just returned this path, so this only guards against
            # it having become unreadable in the meantime.
            if img_path.exists():
                shutil.copy2(img_path, label_img_dir / img_path.name)
            else:
                print(f"[WARNING] Image patch not found (this should not happen): {img_path}")

            # ---- Copy mask patch ----
            if mask_path.exists():
                shutil.copy2(mask_path, label_mask_dir / mask_path.name)
            else:
                print(f"[WARNING] Mask patch not found for {img_path.name} -> expected {mask_path.name}")

    print(f"Done! Files organized under: {OUTPUT_DIR}")


In [12]:
# Run the split: copies image patches and their masks into OUTPUT_DIR,
# one subfolder per (sanitized) label.
run_split()

Done! Files organized under: temp\by_label
