In [2]:
# Standard libraries
import os
import shutil
from glob import glob
from pathlib import Path

# Image processing
import cv2
import numpy as np

# For splitting the dataset
from sklearn.model_selection import train_test_split
import kagglehub

# Ask kagglehub for the NuInsSeg dataset and keep the local folder it returns
path = kagglehub.dataset_download(
    "ipateam/nuinsseg"
)

# Show where the files ended up so the location can be checked by eye
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\rita\.cache\kagglehub\datasets\ipateam\nuinsseg\versions\5


In [3]:
# Root where each organ folder lives, containing subfolders:
#  - 'tissue images'
#  - 'mask binary'
#  - 'distance maps'
#  - 'label masks modify'
#  - 'vague areas/mask binary'
from pathlib import Path

# ✅ Root folder that contains the organ folders (NuInsSeg has organ-based structure).
# Derived from the `path` returned by kagglehub.dataset_download() in the first cell,
# so the notebook does not depend on one user's absolute cache location.
RAW_ROOT = Path(path)

# ✅ Where you want to store processed/resized/split data
OUT_ROOT = Path("data")  # you can change this to e.g., "processed_nuinsseg"
SPLITS = ["train", "val", "test"]

# ✅ Image size for all images and masks, passed to cv2.resize as (width, height)
TARGET_SIZE = (256, 256)

# ✅ Split ratios — both are fractions of the WHOLE dataset; the split cell
# rescales val_frac by 1 / (1 - test_frac) before the second split.
test_frac = 0.10    # 10% of all = test
val_frac = 0.10     # 10% of all = val
random_seed = 42


In [4]:
# Build the output tree: OUT_ROOT/<split>/<modality> for every split and modality
modality_folders = ("images", "masks", "distance_maps", "label_masks", "vague_masks")
for split in SPLITS:
    split_dir = OUT_ROOT / split
    for sub in modality_folders:
        # parents/exist_ok make this safe to re-run on an existing tree
        split_dir.joinpath(sub).mkdir(parents=True, exist_ok=True)
print("Directory scaffold created under", OUT_ROOT)

Directory scaffold created under data


In [5]:
# Collect tuples of paths: (tissue_img, mask_bin, distance_map, label_mask, vague_mask)
data_tuples = []

# Path.iterdir() and Path.glob() yield entries in arbitrary, filesystem-dependent
# order. The seeded train/val/test split further down is only reproducible across
# machines if the list it receives is always in the same order, so sort both levels.
organ_dirs = sorted(d for d in RAW_ROOT.iterdir() if d.is_dir())

for organ in organ_dirs:
    # define per-organ subdirs
    tissue_dir = organ / "tissue images"
    mask_dir   = organ / "mask binary"
    dist_dir   = organ / "distance maps"
    label_dir  = organ / "label masks modify"
    vague_dir  = organ / "vague areas" / "mask binary"

    # skip the whole organ as soon as one expected folder is absent
    # (only the first missing folder is reported)
    missing = next(
        (d for d in [tissue_dir, mask_dir, dist_dir, label_dir, vague_dir] if not d.exists()),
        None,
    )
    if missing is not None:
        print(f"⚠️ Skipping {organ.name}: missing {missing}")
        continue

    # a sample is kept only when all five modalities exist for the same file stem
    for img_path in sorted(tissue_dir.glob("*.png")):
        stem = img_path.stem
        m1 = mask_dir / f"{stem}.png"
        m2 = dist_dir / f"{stem}.png"
        m3 = label_dir / f"{stem}.tif"   # label masks are stored as .tif, the rest as .png
        m4 = vague_dir / f"{stem}.png"
        if m1.exists() and m2.exists() and m3.exists() and m4.exists():
            data_tuples.append((img_path, m1, m2, m3, m4))
        else:
            print(f"⚠️ Missing file for {stem} in {organ.name}")

# the organ count is the number of organ directories scanned, including skipped ones
print(f"✅ Found {len(data_tuples)} complete data tuples across {len(organ_dirs)} organs.")

✅ Found 665 complete data tuples across 31 organs.


In [6]:
# Stage 1: carve the test set off the full list of samples
train_val, test = train_test_split(data_tuples, test_size=test_frac, random_state=random_seed)

# Stage 2: split the remainder into train and val. val_frac is rescaled by the
# size of the remainder so it is expressed relative to train_val.
val_share_of_rest = val_frac/(1-test_frac)
train, val = train_test_split(train_val, test_size=val_share_of_rest, random_state=random_seed)

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

Train: 531, Val: 67, Test: 67


In [7]:
def process_and_save(split_list, split_name):
    """
    Resize every modality of each sample to TARGET_SIZE and save it under
    OUT_ROOT/<split_name>/<modality folder>, keeping the source file name.

    Parameters
    ----------
    split_list : iterable of 5-tuples of pathlib.Path
        (tissue image, binary mask, distance map, label mask, vague mask).
    split_name : str
        Split folder to write into, e.g. "train", "val" or "test".

    Raises
    ------
    OSError
        If a source file cannot be loaded or an output file cannot be saved.
        cv2.imread returns None for a file it cannot load and cv2.imwrite
        returns False when saving fails, so both results are checked here.
    """
    split_dir = OUT_ROOT / split_name

    # One entry per tuple position: (output sub-folder, imread flag, interpolation).
    # Only the tissue image is interpolated (cubic); every mask-like modality uses
    # nearest-neighbour so resizing never produces values absent from the source.
    modalities = (
        ("images",        cv2.IMREAD_COLOR,     cv2.INTER_CUBIC),
        ("masks",         cv2.IMREAD_GRAYSCALE, cv2.INTER_NEAREST),
        ("distance_maps", cv2.IMREAD_GRAYSCALE, cv2.INTER_NEAREST),
        ("label_masks",   cv2.IMREAD_UNCHANGED, cv2.INTER_NEAREST),
        ("vague_masks",   cv2.IMREAD_GRAYSCALE, cv2.INTER_NEAREST),
    )

    for (img, msk, dist, lbl, vmask) in split_list:
        # Phase 1: load + resize all five modalities before writing anything, so a
        # sample with an unreadable file leaves no partial output behind.
        pending = []
        for src, (sub, read_flag, interp) in zip((img, msk, dist, lbl, vmask), modalities):
            arr = cv2.imread(str(src), read_flag)
            if arr is None:
                raise OSError(f"Could not read {src}")
            resized = cv2.resize(arr, TARGET_SIZE, interpolation=interp)
            pending.append((split_dir / sub / src.name, resized))

        # Phase 2: write the resized arrays
        for dst, resized in pending:
            # create the folder here so the function does not rely on another cell
            dst.parent.mkdir(parents=True, exist_ok=True)
            if not cv2.imwrite(str(dst), resized):
                raise OSError(f"Could not write {dst}")

# Process the three splits one after another, in train → val → test order
for split_items, split_label in ((train, "train"), (val, "val"), (test, "test")):
    process_and_save(split_items, split_label)
print("Resizing & copying complete.")

Resizing & copying complete.


In [8]:
# Sanity check: count the files present in each modality folder of every split
for split in SPLITS:
    counts = {}
    for sub in ["images","masks","distance_maps","label_masks","vague_masks"]:
        folder = OUT_ROOT/split/sub
        # "*.*" matches any entry whose name contains a dot
        counts[sub] = len(list(folder.glob("*.*")))
    print(f"{split}: ", counts)

train:  {'images': 531, 'masks': 531, 'distance_maps': 531, 'label_masks': 531, 'vague_masks': 531}
val:  {'images': 67, 'masks': 67, 'distance_maps': 67, 'label_masks': 67, 'vague_masks': 67}
test:  {'images': 67, 'masks': 67, 'distance_maps': 67, 'label_masks': 67, 'vague_masks': 67}
