In [None]:
SRC_JSON   = "../dataset/annotations/instances_all.json"  # COCO annotation file that is loaded and split
IMAGES_ALL = "../dataset/images/all"                   # directory the image files are read from
OUT_ROOT   = "../dataset"                              # root for the generated annotations/ and images/<split>/ folders

SPLIT = {"train": 0.8, "val": 0.1, "test": 0.1}           # fractions per split; must sum to 1.0
SEED  = 42                                                # seed for the shuffle that precedes the split
COPY_FILES = True   # True = copy the image files, False = move them
KEEP_IMAGES_WITHOUT_ANNOT = True  # True: images without annotations stay in the split


In [2]:
import json, os, random, shutil
from pathlib import Path
from collections import defaultdict

In [3]:
def ensure_sum_one(d):
    """Validate that the split fractions in ``d`` describe a proper partition.

    Args:
        d: Mapping of split name to its fraction, e.g. ``{"train": 0.8, "val": 0.2}``.

    Raises:
        ValueError: If any fraction is negative or NaN, or if the fractions do
            not sum to 1.0 within an absolute tolerance of 1e-6.
    """
    for name, frac in d.items():
        # "not (x >= 0)" instead of "x < 0" so that NaN is rejected as well.
        if not (frac >= 0):
            raise ValueError(f"SPLIT-Anteil ungültig: '{name}' = {frac} (muss >= 0 sein)")
    s = sum(d.values())
    # Negated "<=" so that a NaN sum fails the check instead of slipping through.
    if not (abs(s - 1.0) <= 1e-6):
        raise ValueError(f"SPLIT-Summenfehler: {s:.4f} != 1.0")

In [4]:
def load_coco(path):
    """Read a COCO JSON file and make sure its mandatory sections are present.

    Args:
        path: Location of the JSON file; opened as UTF-8 text.

    Returns:
        The parsed JSON content, unmodified.

    Raises:
        KeyError: If "images", "annotations" or "categories" is missing; the
            first missing key (in that order) is reported.
    """
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    required = ("images", "annotations", "categories")
    k = next((name for name in required if name not in data), None)
    if k is not None:
        raise KeyError(f"COCO JSON fehlt Schlüssel: '{k}'")
    return data

In [5]:
def filter_existing_images(coco, images_dir, keep_missing=False):
    """Separate the image records of ``coco`` by whether their file exists on disk.

    Args:
        coco: Dict with an "images" list; each record needs a "file_name".
        images_dir: Directory the "file_name" values are resolved against.
        keep_missing: If False, records whose file is absent are removed from
            the returned dict. If True, they are kept and only a warning is
            printed.

    Returns:
        Tuple ``(coco_out, miss)``. ``coco_out`` is ``coco`` itself unless
        records were removed, in which case it is a shallow copy with a
        filtered "images" list. ``miss`` lists the records whose file was not
        found.
    """
    present, miss = [], []
    for record in coco["images"]:
        on_disk = (Path(images_dir) / record["file_name"]).exists()
        (present if on_disk else miss).append(record)

    # Nothing missing: hand the input back untouched.
    if not miss:
        return coco, miss

    if keep_missing:
        print(f"[Warn] {len(miss)} Dateien fehlen auf Disk, bleiben aber in JSON (kann später brechen).")
        return coco, miss

    print(f"[Info] Entferne {len(miss)} referenzierte, aber fehlende Bilder.")
    return {**coco, "images": present}, miss

In [6]:
def index_by_image(annotations):
    """Group annotation records by the image they refer to.

    Args:
        annotations: Iterable of annotation dicts, each with an "image_id" key.

    Returns:
        A ``defaultdict(list)`` mapping every encountered image id to its
        annotations, in input order.
    """
    grouped = {}
    for ann in annotations:
        grouped.setdefault(ann["image_id"], []).append(ann)
    # Callers get a defaultdict, so looking up an unknown id yields an empty list.
    return defaultdict(list, grouped)


In [7]:
def subset_annotations(annotations, image_ids, keep_empty=True):
    """Return the annotations that belong to one of the given images.

    Args:
        annotations: Iterable of annotation dicts; the image reference is read
            with ``.get("image_id")``, so records without that key are only
            matched if ``None`` is among ``image_ids``.
        image_ids: Iterable of image ids to keep. Any iterable is accepted
            (set, list, generator, ...); it is turned into a set once so that
            membership tests are constant-time and one-shot iterators are not
            exhausted by the first lookup.
        keep_empty: Kept for interface compatibility; it does not change the
            result. Excluding images without annotations has to happen when
            the caller selects ``image_ids``.

    Returns:
        A new list with the matching annotation dicts in their original order.
    """
    if not isinstance(image_ids, (set, frozenset)):
        image_ids = set(image_ids)
    return [a for a in annotations if a.get("image_id") in image_ids]

In [8]:
def write_coco(path, images, annotations, categories):
    """Write a COCO-style JSON file containing the three given sections.

    Args:
        path: Target file as ``str`` or ``Path``; missing parent directories
            are created.
        images: Value stored under the "images" key.
        annotations: Value stored under the "annotations" key.
        categories: Value stored under the "categories" key.

    Only these three keys are written; the file is encoded as UTF-8.
    """
    # Coerce so that plain string paths work too, as they do for load_coco.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"images": images, "annotations": annotations, "categories": categories}, f)

In [9]:
def transfer_images(imgs, src_root, dst_root, do_copy=True):
    """Copy or move the files of the given image records into ``dst_root``.

    Args:
        imgs: Iterable of image records; each needs a "file_name" that is
            resolved relative to both roots (sub-directories are preserved).
        src_root: Directory the files are read from (``str`` or ``Path``).
        dst_root: Directory the files are written to (``str`` or ``Path``);
            created if it does not exist.
        do_copy: True copies with ``shutil.copy2``, False moves with
            ``shutil.move``.

    Returns:
        Tuple ``(n_ok, n_miss)``: number of files transferred and number of
        records whose source file did not exist.
    """
    # Normalise once so that string roots work and the loop does no repeated wrapping.
    src_root = Path(src_root)
    dst_root = Path(dst_root)
    dst_root.mkdir(parents=True, exist_ok=True)

    transfer = shutil.copy2 if do_copy else shutil.move
    n_ok, n_miss = 0, 0
    for im in imgs:
        src = src_root / im["file_name"]
        if not src.exists():
            n_miss += 1
            continue
        dst = dst_root / im["file_name"]
        # Create sub-directories only for files that are actually transferred,
        # so missing sources do not leave empty folders behind.
        dst.parent.mkdir(parents=True, exist_ok=True)
        transfer(src, dst)
        n_ok += 1
    return n_ok, n_miss

In [10]:
# Validate the configured fractions before any data is touched.
ensure_sum_one(SPLIT)

# Load the source annotations and drop image records whose file is not on disk.
coco, missing_on_disk = filter_existing_images(
    load_coco(SRC_JSON),
    IMAGES_ALL,
    keep_missing=False,
)

# Short aliases for the three COCO sections used by the following cells.
images, anns, cats = coco["images"], coco["annotations"], coco["categories"]

In [11]:
# Shuffle deterministically. A copy is shuffled so that `images` keeps its
# original order: shuffling in place would make a re-run of this cell reshuffle
# an already shuffled list and produce a different split each time.
random.seed(SEED)
shuffled_images = list(images)
random.shuffle(shuffled_images)

n = len(shuffled_images)
n_train = int(round(SPLIT["train"] * n))
n_val   = int(round(SPLIT["val"]   * n))
# The remainder goes to test; clamp val if rounding overshoots the total.
if n_train + n_val > n:
    n_val = max(0, n - n_train)
n_test  = max(0, n - n_train - n_val)

splits = {
    "train": shuffled_images[:n_train],
    "val":   shuffled_images[n_train:n_train+n_val],
    "test":  shuffled_images[n_train+n_val:],
}

print(f"[Split] total={n}  ->  train={len(splits['train'])}, val={len(splits['val'])}, test={len(splits['test'])}")
if missing_on_disk:
    print(f"[Note] {len(missing_on_disk)} in JSON gelistete Bilder fehl(t)en auf Disk und wurden ausgeschlossen.")

[Split] total=589  ->  train=471, val=59, test=59


In [12]:

# Write one COCO JSON per split and copy/move the matching image files.
ann_out_dir = Path(OUT_ROOT) / "annotations"
img_out_root = Path(OUT_ROOT) / "images"

# Honour KEEP_IMAGES_WITHOUT_ANNOT: when it is False, images that no annotation
# refers to are dropped from every split before anything is written.
annotated_ids = None if KEEP_IMAGES_WITHOUT_ANNOT else {a.get("image_id") for a in anns}

for split_name, split_imgs in splits.items():
    if annotated_ids is not None:
        split_imgs = [im for im in split_imgs if im["id"] in annotated_ids]

    img_ids = {im["id"] for im in split_imgs}
    split_anns = subset_annotations(anns, img_ids, keep_empty=KEEP_IMAGES_WITHOUT_ANNOT)

    out_json = ann_out_dir / f"instances_{split_name}.json"
    write_coco(out_json, split_imgs, split_anns, cats)

    dst_img_dir = img_out_root / split_name
    copied, missing = transfer_images(split_imgs, Path(IMAGES_ALL), dst_img_dir, do_copy=COPY_FILES)
    action = "kopiert" if COPY_FILES else "verschoben"
    print(f"[{split_name}] {action}: {copied} | fehlend: {missing} | json: {out_json}")

[train] kopiert: 471 | fehlend: 0 | json: ../dataset/annotations/instances_train.json
[val] kopiert: 59 | fehlend: 0 | json: ../dataset/annotations/instances_val.json
[test] kopiert: 59 | fehlend: 0 | json: ../dataset/annotations/instances_test.json
