Split the images into 3 folders (train / val / test)

In [1]:
from pathlib import Path
import shutil
import json
import csv
import random
import re
from datetime import datetime

# =========================
# CONFIG
# =========================
# Source directory holding the images to be split.
INTERIM_DIR = Path("../../data/interim/Stage1/color_clahe_1500x1000_noborder_aug")
# Root directory that receives the split images, splits.json and index.csv.
PROCESSED_DIR = Path("../../data/processed/Stage1")

# One sub-folder per split, all under <PROCESSED_DIR>/images.
IMAGES_DIR = PROCESSED_DIR.joinpath("images")
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    IMAGES_DIR.joinpath(split) for split in ("train", "val", "test")
)

# Split proportions and the seed used for the shuffle.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.85, 0.10, 0.05
SEED = 42

# Lower-case file suffixes (including the dot) that count as images.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff"}

# =========================
# SETUP
# =========================
random.seed(SEED)

# List the source first: if INTERIM_DIR is missing or holds no images the
# cell stops here, before any output folder has been created.
# - sorted(): gives the seeded shuffle the same starting order on every run,
#   independent of the order in which the filesystem lists entries.
# - is_file(): skips directories whose name happens to end in an image
#   suffix; they would otherwise make shutil.copy2 fail during the copy step.
image_paths = sorted(
    p for p in INTERIM_DIR.iterdir()
    if p.is_file() and p.suffix.lower() in IMAGE_EXTS
)

assert len(image_paths) > 0, "No images found in interim directory"
print(f"Found {len(image_paths)} images")

for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    split_dir.mkdir(parents=True, exist_ok=True)
    # The copy step below never empties the split folders, so entries left by
    # an earlier run (different seed, ratios or source) would end up mixed
    # with the new split. Surface that instead of letting it pass silently.
    n_existing = sum(1 for _ in split_dir.iterdir())
    if n_existing:
        print(
            f"WARNING: {split_dir} already holds {n_existing} entries; "
            "entries not overwritten by this run are left in place"
        )

# =========================
# SPLIT DATASET
# =========================
# TEST_RATIO does not size the test split (test simply takes whatever is left
# after train and val), so check that the three ratios are consistent.
# Otherwise the ratios recorded in splits.json would not describe the split
# that was actually produced.
assert min(TRAIN_RATIO, VAL_RATIO, TEST_RATIO) >= 0, "Split ratios must be non-negative"
assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, (
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1"
)

# In-place shuffle; reproducible because the RNG was seeded with SEED and
# image_paths was sorted beforehand.
random.shuffle(image_paths)

n = len(image_paths)
# int() truncates, so any images lost to rounding fall into the test split.
n_train = int(n * TRAIN_RATIO)
n_val = int(n * VAL_RATIO)

train_imgs = image_paths[:n_train]
val_imgs = image_paths[n_train:n_train + n_val]
test_imgs = image_paths[n_train + n_val:]

# =========================
# COPY IMAGES
# =========================
def copy_split(imgs, split_dir, split_name):
    """Copy each image in ``imgs`` into ``split_dir`` and report where it went.

    Files keep their original name and are copied with ``shutil.copy2``.

    Args:
        imgs: Iterable of source image paths (objects exposing ``.name``).
        split_dir: Destination directory as a ``Path``; it is not created here.
        split_name: Split label used to build the returned relative paths.

    Returns:
        List of ``"images/<split_name>/<filename>"`` strings, one per input
        image, in input order.
    """
    copied = []
    for src in imgs:
        filename = src.name
        shutil.copy2(src, split_dir / filename)
        copied.append(f"images/{split_name}/{filename}")
    return copied

# Copy the three splits in train / val / test order and keep the relative
# paths returned for each of them.
train_rel, val_rel, test_rel = (
    copy_split(members, target_dir, label)
    for members, target_dir, label in (
        (train_imgs, TRAIN_DIR, "train"),
        (val_imgs, VAL_DIR, "val"),
        (test_imgs, TEST_DIR, "test"),
    )
)

# =========================
# SAVE splits.json
# =========================
splits = dict(train=train_rel, val=val_rel, test=test_rel)

# Manifest: creation time, source folder, seed and ratios, followed by the
# per-split counts and the per-split file lists.
manifest = {
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "source": str(INTERIM_DIR),
    "seed": SEED,
    "ratios": dict(train=TRAIN_RATIO, val=VAL_RATIO, test=TEST_RATIO),
    "counts": {name: len(paths) for name, paths in splits.items()},
    "splits": splits,
}

with open(PROCESSED_DIR / "splits.json", "w") as f:
    json.dump(manifest, f, indent=2)

# =========================
# INDEX.CSV
# =========================
def extract_timestamp(filename: str) -> str:
    """Pull the scan timestamp out of a filename.

    Searches ``filename`` for ``YYYY-MM-DD_HH-MM-SS-sss`` (anywhere in the
    string) and returns it rewritten as ``YYYY-MM-DDTHH:MM:SS.sss``.
    Returns an empty string when the pattern does not occur.
    """
    found = re.search(r"(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})-(\d{3})", filename)
    if found is None:
        return ""
    date_part, time_part, frac = found.groups()
    # HH-MM-SS -> HH:MM:SS
    clock = ":".join(time_part.split("-"))
    return date_part + "T" + clock + "." + frac

# The stage label comes from the processed folder name so that it always
# matches the dataset this index describes; a literal "Stage0" here disagreed
# with the Stage1 paths used for every other field.
stage_name = PROCESSED_DIR.name

# newline="" is what the csv module expects for files it writes; utf-8 keeps
# filenames with non-ASCII characters identical across platforms.
with open(PROCESSED_DIR / "index.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=[
            "filepath",
            "split",
            "stage",
            "scan_session_id",
            "filename",
            "scan_timestamp",
            "source_interim_path",
        ],
    )
    writer.writeheader()

    # One row per image, grouped by split in train / val / test order.
    for split_name, imgs in [
        ("train", train_imgs),
        ("val", val_imgs),
        ("test", test_imgs),
    ]:
        for p in imgs:
            writer.writerow(
                {
                    # Same relative form as the entries stored in splits.json.
                    "filepath": f"images/{split_name}/{p.name}",
                    "split": split_name,
                    "stage": stage_name,
                    # The source folder name doubles as the session id.
                    "scan_session_id": INTERIM_DIR.name,
                    "filename": p.name,
                    # Empty string when the filename carries no timestamp.
                    "scan_timestamp": extract_timestamp(p.name),
                    "source_interim_path": str(p),
                }
            )

print("✅ Dataset frozen and organized into train / val / test")


Found 875 images
✅ Dataset frozen and organized into train / val / test


Automate labelling for Stage1, since all images will have the same labels

For train.json

In [3]:
from pathlib import Path
import json

IMG_DIR = Path("../../data/processed/Stage1/images/train")
OUT_JSON = Path("../../data/labels/Stage1/train.json")

# Same suffixes the split step copies into the image folders; a narrower set
# here would leave any .bmp / .tiff image in IMG_DIR silently unlabelled.
LABEL_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff"}

OUT_JSON.parent.mkdir(parents=True, exist_ok=True)

# Every image receives the same two labels. Entries are sorted by path so the
# JSON is stable between runs; is_file() ignores sub-directories.
annotations = [
    {
        "image": p.name,
        "no_contraband": 1,
        "isolated_items": 1,
    }
    for p in sorted(IMG_DIR.iterdir())
    if p.is_file() and p.suffix.lower() in LABEL_IMAGE_EXTS
]

with open(OUT_JSON, "w") as f:
    json.dump(annotations, f, indent=2)

print(f"✅ JSON generated with {len(annotations)} samples → {OUT_JSON}")


✅ JSON generated with 743 samples → ../../data/labels/Stage1/train.json


For val.json

In [None]:
from pathlib import Path
import json

IMG_DIR = Path("../../data/processed/Stage1/images/val")
OUT_JSON = Path("../../data/labels/Stage1/val.json")

# Same suffixes the split step copies into the image folders; a narrower set
# here would leave any .bmp / .tiff image in IMG_DIR silently unlabelled.
LABEL_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff"}

OUT_JSON.parent.mkdir(parents=True, exist_ok=True)

# Every image receives the same two labels. Entries are sorted by path so the
# JSON is stable between runs; is_file() ignores sub-directories.
annotations = [
    {
        "image": p.name,
        "no_contraband": 1,
        "isolated_items": 1,
    }
    for p in sorted(IMG_DIR.iterdir())
    if p.is_file() and p.suffix.lower() in LABEL_IMAGE_EXTS
]

with open(OUT_JSON, "w") as f:
    json.dump(annotations, f, indent=2)

print(f"✅ JSON generated with {len(annotations)} samples → {OUT_JSON}")
