# Dataset Preparation for YOLOv11 and Detectron2 (Per-Letter Training)

This notebook prepares a unified dataset for:

- **YOLOv11 (Ultralytics)** object detection with **multi-class letters** (all letters in one dataset, trainable per letter via `classes=[...]`).
- **Detectron2**, also trained **per-letter**, i.e. one Detectron2 model per letter using filtered single-letter COCO annotations.


In [7]:
from pathlib import Path
import os
import json
import random
import shutil

# ==== CONFIGURATION ====
# Everything a reader might want to tune lives in this block; each path is
# defined exactly once so later steps cannot pick up diverging values.

# Path to the original COCO annotations file (multi-class letters)
COCO_PATH = Path("annotations/annotations.json")

# Root directory of the original images with script modes as subfolders
#   images_root/
#       french/...
#       german/...
IMAGES_ROOT = Path("images")

# Target root directory for the prepared dataset
TARGET_ROOT = Path("ASC_dataset2")

# Validation split ratio
VAL_RATIO = 0.2  # 20% for validation
RANDOM_SEED = 42

# The cleaned COCO file and all later outputs are written below TARGET_ROOT,
# so make sure it exists before anything tries to write into it.
TARGET_ROOT.mkdir(parents=True, exist_ok=True)
print(f"COCO_PATH: {COCO_PATH.resolve()}")
print(f"IMAGES_ROOT: {IMAGES_ROOT.resolve()}")
print(f"TARGET_ROOT: {TARGET_ROOT.resolve()}")


# === OPTIONAL CLEANING STEP ===

# Destination of the cleaned COCO file (entries for missing images removed).
# IMAGES_ROOT is intentionally NOT redefined here: the cleaning step reuses
# the single definition from the configuration above.
CLEAN_COCO_PATH = TARGET_ROOT / "annotations_clean.json"


def clean_coco_missing_images(raw_coco_path: Path,
                              images_root: Path,
                              out_coco_path: Path,
                              script_modes=("french", "german")):
    """
    Write a copy of a COCO file that only references images present on disk.

    An image is kept when ``images_root / <script_mode> / <file_name>`` exists
    for at least one of ``script_modes``. Annotations pointing at removed
    images are dropped, and so are categories that no remaining annotation
    uses. The output contains only the "images", "annotations" and
    "categories" keys.

    Args:
        raw_coco_path: COCO JSON file to read.
        images_root: Directory containing one subfolder per script mode.
        out_coco_path: Where the cleaned COCO JSON is written. Missing parent
            directories are created.
        script_modes: Subfolder names of ``images_root`` to search. Defaults
            to the two script modes used in this notebook.

    Returns:
        None. A summary of what was kept/removed is printed.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with raw_coco_path.open("r", encoding="utf-8") as f:
        coco = json.load(f)

    images = coco["images"]
    annotations = coco["annotations"]
    categories = coco["categories"]

    def image_exists(fname: str) -> bool:
        # The file counts as present if it is found under any script mode.
        return any((images_root / mode / fname).exists() for mode in script_modes)

    # Keep only images that exist on disk
    kept_images = []
    kept_image_ids = set()
    missing_files = []

    for img in images:
        fname = img["file_name"]
        if image_exists(fname):
            kept_images.append(img)
            kept_image_ids.add(img["id"])
        else:
            missing_files.append(fname)

    # Keep only annotations whose image_id is still present
    kept_annotations = [ann for ann in annotations if ann["image_id"] in kept_image_ids]

    # Keep only categories that are still referenced by a kept annotation
    used_cat_ids = {ann["category_id"] for ann in kept_annotations}
    kept_categories = [cat for cat in categories if cat["id"] in used_cat_ids]

    cleaned_coco = {
        "images": kept_images,
        "annotations": kept_annotations,
        "categories": kept_categories,
    }

    # Creating the parent here lets the function be used outside the notebook,
    # where nobody has prepared the output directory beforehand.
    out_coco_path.parent.mkdir(parents=True, exist_ok=True)
    with out_coco_path.open("w", encoding="utf-8") as f:
        json.dump(cleaned_coco, f, indent=4)

    print(f"Total images in original COCO: {len(images)}")
    print(f"Images found on disk:          {len(kept_images)}")
    print(f"Images missing on disk:        {len(missing_files)}")
    if missing_files:
        print("Example missing files (up to 10):", missing_files[:10])

    print(f"Total annotations (original):  {len(annotations)}")
    print(f"Annotations kept:              {len(kept_annotations)}")

    print(f"Categories kept:               {len(kept_categories)}")
    print(f"Cleaned COCO written to:       {out_coco_path}")


# Drop COCO entries whose image file is gone and write the result to
# CLEAN_COCO_PATH.
clean_coco_missing_images(
    raw_coco_path=COCO_PATH,
    images_root=IMAGES_ROOT,
    out_coco_path=CLEAN_COCO_PATH,
)

# IMPORTANT: every later step reads COCO_PATH, so rebind it to the cleaned file
COCO_PATH = CLEAN_COCO_PATH
print("COCO_PATH updated to:", COCO_PATH)


COCO_PATH: /home/suliman/midrash_auto_annotate_asc/annotations/annotations.json
IMAGES_ROOT: /home/suliman/midrash_auto_annotate_asc/images
TARGET_ROOT: /home/suliman/midrash_auto_annotate_asc/ASC_dataset2
Total images in original COCO: 53
Images found on disk:          37
Images missing on disk:        16
Example missing files (up to 10): ['003_000_00.jpg', '030_000_00.jpg', '030_000_01.jpg', '030_000_02.jpg', '036_000_02.jpg', '034_000_01.jpg', '035_000_00.jpg', '035_000_02.jpg', '035_000_03.jpg', '034_000_00.jpg']
Total annotations (original):  10063
Annotations kept:              6379
Categories kept:               6
Cleaned COCO written to:       ASC_dataset2/annotations_clean.json
COCO_PATH updated to: ASC_dataset2/annotations_clean.json


## Step 1: Split COCO and Images into Train/Val (Preserving Script Modes)

This step:
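- Loads the COCO file referenced by `COCO_PATH` (after the cleaning step above, this is `ASC_dataset2/annotations_clean.json`).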
- Infers the **script mode** (`french` / `german`) of each image by searching under `images/`.
- Splits images randomly into **train** and **val** using `VAL_RATIO`.
- Writes two COCO files under `ASC_dataset2/`:
  - `annotations_train.json`
  - `annotations_val.json`
- Copies images into:
  - `ASC_dataset2/images/train/french`, `ASC_dataset2/images/train/german`
  - `ASC_dataset2/images/val/french`,   `ASC_dataset2/images/val/german`


In [8]:
def split_coco_and_images(coco_path: Path,
                          images_root: Path,
                          target_root: Path,
                          val_ratio: float = 0.2,
                          seed: int = 42):
    """Randomly split a COCO dataset into train/val and copy the images.

    Each image is looked up under ``images_root/french`` and
    ``images_root/german`` to determine its script mode. The shuffled image
    list is cut at ``int(len(images) * (1 - val_ratio))``; the first part
    becomes the training split and the rest the validation split.

    Outputs written below ``target_root``:
      - ``annotations_train.json`` and ``annotations_val.json``
      - ``images/<split>/<script_mode>/<file_name>`` copies of the images

    Re-running with a different seed or ratio moves images between splits.
    Copies of a processed image left in another split/script-mode folder by
    an earlier run are removed, so no image ends up in both train and val.
    Files of images that are no longer listed in the COCO file are not touched.

    Args:
        coco_path: COCO JSON file to split.
        images_root: Directory with ``french`` / ``german`` subfolders.
        target_root: Output root; created if missing.
        val_ratio: Fraction of images assigned to validation, within [0, 1].
        seed: Seed passed to ``random.seed`` before shuffling.

    Raises:
        ValueError: If ``val_ratio`` is outside [0, 1].
        FileNotFoundError: If an image listed in the COCO file is found in
            neither script-mode folder.
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio}.")

    script_modes = ("french", "german")
    splits = ("train", "val")

    random.seed(seed)

    with coco_path.open("r", encoding="utf-8") as f:
        coco = json.load(f)

    images = coco["images"]
    annotations = coco["annotations"]
    categories = coco["categories"]

    # Helper: find full path and script mode (french/german) for each image
    def find_image_path(fname: str):
        for script_mode in script_modes:
            p = images_root / script_mode / fname
            if p.exists():
                return p, script_mode
        raise FileNotFoundError(f"Image {fname} not found under {images_root}/french or german.")

    # Keep (image entry, source path, script mode) side by side instead of
    # writing temporary fields into the COCO image dicts.
    records = [(img, *find_image_path(img["file_name"])) for img in images]

    # Shuffle and split. The permutation depends only on the seed and the
    # number of items, so shuffling the records orders the images exactly as
    # shuffling the bare image list would.
    random.shuffle(records)
    split_idx = int(len(records) * (1.0 - val_ratio))
    train_records = records[:split_idx]
    val_records = records[split_idx:]

    train_images = [rec[0] for rec in train_records]
    val_images = [rec[0] for rec in val_records]

    train_ids = {img["id"] for img in train_images}
    val_ids = {img["id"] for img in val_images}

    train_annotations = [ann for ann in annotations if ann["image_id"] in train_ids]
    val_annotations = [ann for ann in annotations if ann["image_id"] in val_ids]

    train_coco = {
        "images": train_images,
        "annotations": train_annotations,
        "categories": categories,
    }
    val_coco = {
        "images": val_images,
        "annotations": val_annotations,
        "categories": categories,
    }

    # Write split COCO files
    target_root.mkdir(parents=True, exist_ok=True)
    train_json = target_root / "annotations_train.json"
    val_json = target_root / "annotations_val.json"

    with train_json.open("w", encoding="utf-8") as f:
        json.dump(train_coco, f, indent=4)
    with val_json.open("w", encoding="utf-8") as f:
        json.dump(val_coco, f, indent=4)

    # Create target image directories
    images_dir = target_root / "images"
    for split in splits:
        for script_mode in script_modes:
            (images_dir / split / script_mode).mkdir(parents=True, exist_ok=True)

    # Copy images into target structure
    def copy_images(record_list, split_name: str):
        for img, src, script_mode in record_list:
            fname = img["file_name"]
            dst = images_dir / split_name / script_mode / fname

            # Remove copies a previous run left in any other split/script-mode
            # folder; otherwise the image would sit in train AND val.
            for other_split in splits:
                for other_mode in script_modes:
                    stale = images_dir / other_split / other_mode / fname
                    if stale == dst or not stale.exists():
                        continue
                    # Never delete the source image itself, in case
                    # images_root points inside the target tree.
                    if stale.resolve() == src.resolve():
                        continue
                    stale.unlink()

            dst.parent.mkdir(parents=True, exist_ok=True)
            if not dst.exists():
                shutil.copy2(src, dst)

    copy_images(train_records, "train")
    copy_images(val_records, "val")

    print(f"Train images: {len(train_images)}, Val images: {len(val_images)}")
    print(f"Wrote: {train_json}")
    print(f"Wrote: {val_json}")


# Run the split using the notebook-level configuration
split_coco_and_images(
    coco_path=COCO_PATH,
    images_root=IMAGES_ROOT,
    target_root=TARGET_ROOT,
    val_ratio=VAL_RATIO,
    seed=RANDOM_SEED,
)


Train images: 29, Val images: 8
Wrote: ASC_dataset2/annotations_train.json
Wrote: ASC_dataset2/annotations_val.json


## Step 2: Create Multi-class YOLO Labels (One Dataset for All Letters)

We now convert the split COCO files into YOLO format, **preserving script modes**. The output
structure will be:

```text
ASC_dataset2/
  labels/
    train/french/*.txt
    train/german/*.txt
    val/french/*.txt
    val/german/*.txt
```

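- Each `.txt` file contains one line per bounding box in the form `class_id cx cy w h`, where `cx cy w h` are the box center and size normalized by the image width and height.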
- `class_id` is a YOLO index (0..N-1) derived from the COCO `categories`.
- This is **multi-class**, keeping the distinct letters.

Later, in YOLOv11 you can train **one letter at a time** by using the `classes=[index]`
and `single_cls=True` options.


In [9]:
def coco_to_yolo_multiclass(image_root: Path, coco_json: Path):
    """Convert COCO annotations to YOLO txt labels (multi-class),
    preserving script mode subdirectories (french/german).

    The labels directory mirrors ``image_root`` with the *last* path component
    named ``images`` replaced by ``labels``. Parent directories that merely
    contain the substring "images" (e.g. ``my_images_project``) are left
    untouched. If no component is exactly ``images``, the function falls back
    to a plain substring replacement on the whole path.

    One ``<stem>.txt`` file is written per image that has at least one
    annotation; each line is ``class_id cx cy w h`` with the box center and
    size divided by the image width/height. ``class_id`` is the position of
    the annotation's category in the list of categories sorted by COCO id.

    Args:
        image_root: Split image directory, e.g. ASC_dataset2/images/train,
            containing ``french`` / ``german`` subfolders.
        coco_json: COCO file of the same split,
            e.g. ASC_dataset2/annotations_train.json.

    Returns:
        None. Images that cannot be found under ``image_root`` are reported
        with a warning and skipped.
    """
    # Swap only the final "images" component. A blanket str.replace would also
    # rewrite parent folders whose name contains "images".
    parts = list(image_root.parts)
    if "images" in parts:
        last_idx = len(parts) - 1 - parts[::-1].index("images")
        parts[last_idx] = "labels"
        labels_root = Path(*parts)
    else:
        labels_root = Path(str(image_root).replace("images", "labels"))
    labels_root.mkdir(parents=True, exist_ok=True)

    with coco_json.open("r", encoding="utf-8") as f:
        coco = json.load(f)

    image_info = {img["id"]: img for img in coco["images"]}

    # Build category_id -> YOLO class index
    categories = sorted(coco["categories"], key=lambda c: c["id"])
    cat_id_to_yolo = {cat["id"]: idx for idx, cat in enumerate(categories)}

    # Group annotations by image
    anns_by_img = {}
    for ann in coco["annotations"]:
        anns_by_img.setdefault(ann["image_id"], []).append(ann)

    for img_id, anns in anns_by_img.items():
        img = image_info[img_id]
        fname = img["file_name"]
        width, height = img["width"], img["height"]

        # Determine script mode by checking subdirs
        script_mode_found = None
        for script_mode in ["french", "german"]:
            candidate = image_root / script_mode / fname
            if candidate.exists():
                script_mode_found = script_mode
                break

        if script_mode_found is None:
            print(f"WARNING: image {fname} not found under {image_root}/french or german.")
            continue

        labels_dir = labels_root / script_mode_found
        labels_dir.mkdir(parents=True, exist_ok=True)

        stem = Path(fname).stem
        label_path = labels_dir / f"{stem}.txt"

        with label_path.open("w", encoding="utf-8") as lf:
            for ann in anns:
                # COCO bbox is [x_min, y_min, width, height] in pixels; YOLO
                # wants the normalized box center and size.
                x, y, w, h = ann["bbox"]
                cx = (x + w / 2) / width
                cy = (y + h / 2) / height
                nw = w / width
                nh = h / height

                class_id = cat_id_to_yolo[ann["category_id"]]
                lf.write(f"{class_id} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}\n")

    print(f"YOLO labels created under: {labels_root}")
    print("Category mapping (COCO id -> YOLO index):")
    for cat in categories:
        print(f"  {cat['id']} -> {cat_id_to_yolo[cat['id']]} : {cat['name']}")


# Convert both splits; the train split is processed first, then val
for split_name in ("train", "val"):
    coco_to_yolo_multiclass(
        TARGET_ROOT / "images" / split_name,
        TARGET_ROOT / f"annotations_{split_name}.json",
    )


YOLO labels created under: ASC_dataset2/labels/train
Category mapping (COCO id -> YOLO index):
  1 -> 0 : Aleph
  2 -> 1 : He
  3 -> 2 : Mem
  4 -> 3 : Shin
  5 -> 4 : Mem Sofit
  6 -> 5 : Tav
YOLO labels created under: ASC_dataset2/labels/val
Category mapping (COCO id -> YOLO index):
  1 -> 0 : Aleph
  2 -> 1 : He
  3 -> 2 : Mem
  4 -> 3 : Shin
  5 -> 4 : Mem Sofit
  6 -> 5 : Tav


## Step 3: Per-Letter COCO Files for Detectron2 (Single Letter per Model)

Now we prepare **per-letter COCO files** so that Detectron2 can be trained on one letter at a time,
similar to how YOLO can be trained with `classes=[i]`.

For each COCO `category_id` (letter class), we create:

- `ASC_dataset2/per_letter/train_cat_<ID>.json`
- `ASC_dataset2/per_letter/val_cat_<ID>.json`

Each of these files contains:
- Only annotations where `category_id == <ID>`.
- Only images that have at least one annotation of that letter.
- A `categories` list with a **single entry**: that specific letter.

When you register one of these in Detectron2, it will internally map that
single category to class index 0, so you can use `NUM_CLASSES = 1` in your config.


In [11]:
def make_per_letter_coco_splits(base_train_json: Path,
                                base_val_json: Path,
                                target_root: Path):
    """Write one single-category COCO file per letter and split.

    The categories are taken from ``base_train_json``. For every category,
    both the train and the val COCO data are reduced to the annotations of
    that category, the images those annotations reference, and a one-element
    ``categories`` list. A category without annotations in both splits is
    skipped.

    Files are written to ``target_root / "per_letter"`` as:
      - train_cat_<ID>.json
      - val_cat_<ID>.json
    """
    out_dir = target_root / "per_letter"
    out_dir.mkdir(parents=True, exist_ok=True)

    def read_coco(path: Path):
        with path.open("r") as fh:
            return json.load(fh)

    train_coco = read_coco(base_train_json)
    val_coco = read_coco(base_val_json)

    all_categories = train_coco["categories"]
    category_by_id = {entry["id"]: entry for entry in all_categories}

    def restrict(coco_dict, wanted_id):
        # Annotations of the requested letter, plus only the images they use.
        matching = [ann for ann in coco_dict["annotations"] if ann["category_id"] == wanted_id]
        referenced = {ann["image_id"] for ann in matching}
        return {
            "images": [im for im in coco_dict["images"] if im["id"] in referenced],
            "annotations": matching,
            "categories": [category_by_id[wanted_id]],
        }

    for category in all_categories:
        cat_id = category["id"]
        name = category.get("name", f"cat_{cat_id}")

        # Insertion order (train, then val) is also the write/print order.
        per_split = {
            "train": restrict(train_coco, cat_id),
            "val": restrict(val_coco, cat_id),
        }

        # Nothing to write when the letter is absent from both splits.
        if not any(subset["annotations"] for subset in per_split.values()):
            print(f"Skipping category {cat_id} ({name}): no annotations in train or val.")
            continue

        written = []
        for split_name, subset in per_split.items():
            out_path = out_dir / f"{split_name}_cat_{cat_id}.json"
            with out_path.open("w") as fh:
                json.dump(subset, fh, indent=4)
            written.append(out_path)

        print(f"Wrote per-letter COCO for category {cat_id} ({name}):")
        for out_path in written:
            print(f"  {out_path}")


# Build the per-letter COCO files from the train/val splits written in Step 1
make_per_letter_coco_splits(
    base_train_json=TARGET_ROOT / "annotations_train.json",
    base_val_json=TARGET_ROOT / "annotations_val.json",
    target_root=TARGET_ROOT,
)


Wrote per-letter COCO for category 1 (Aleph):
  ASC_dataset2/per_letter/train_cat_1.json
  ASC_dataset2/per_letter/val_cat_1.json
Wrote per-letter COCO for category 2 (He):
  ASC_dataset2/per_letter/train_cat_2.json
  ASC_dataset2/per_letter/val_cat_2.json
Wrote per-letter COCO for category 3 (Mem):
  ASC_dataset2/per_letter/train_cat_3.json
  ASC_dataset2/per_letter/val_cat_3.json
Wrote per-letter COCO for category 4 (Shin):
  ASC_dataset2/per_letter/train_cat_4.json
  ASC_dataset2/per_letter/val_cat_4.json
Wrote per-letter COCO for category 5 (Mem Sofit):
  ASC_dataset2/per_letter/train_cat_5.json
  ASC_dataset2/per_letter/val_cat_5.json
Wrote per-letter COCO for category 6 (Tav):
  ASC_dataset2/per_letter/train_cat_6.json
  ASC_dataset2/per_letter/val_cat_6.json
