# Data Understanding

In [1]:
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from collections import Counter, defaultdict

from rich.console import Console
from rich.progress import track
from rich.table import Table

# Rich console that the table cells in this section print to.
console = Console()

# INFO-level logging; every record is prefixed with a timestamp and its level.
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
logger = logging.getLogger("dataset_audit")

In [2]:
# Dataset root, given relative to the notebook's working directory.
ROOT = Path("../../data")

TRAIN_SRC = ROOT.joinpath("train")   # annotated
TEST_SRC = ROOT.joinpath("test1")    # NOT annotated (inference only)

COUNTRIES = ["India", "Czech", "Japan"]

# Stop immediately when the expected directory layout is not present.
assert TRAIN_SRC.exists(), "Train directory missing"
assert TEST_SRC.exists(), "Test directory missing"

logger.info("Dataset paths validated")

2026-02-02 11:57:58,370 | INFO | Dataset paths validated


In [3]:
def get_xml_dir(country_root: Path):
    """
    Return the VOC annotation folder of one country directory.

    Handles: annotations/xmls/*.xml

    Fails with an AssertionError ("Missing <path>") when the folder is absent.
    """
    candidate = country_root.joinpath("annotations", "xmls")
    assert candidate.exists(), f"Missing {candidate}"
    return candidate

In [4]:
def scan_train_classes():
    """
    Count annotated object classes across every country's training XML files.

    Every ``*.xml`` file under ``<TRAIN_SRC>/<country>/annotations/xmls`` is
    parsed and the text of each ``<object>/<name>`` element is tallied.

    Files that cannot be parsed, and objects whose ``<name>`` is missing or
    blank, are reported through ``logger.warning`` and skipped so that a single
    bad annotation does not abort the whole audit.

    Returns:
        tuple: ``(class_counter, per_country, total_xml)`` where
            ``class_counter`` is a Counter of class name -> object count,
            ``per_country`` maps country -> Counter of class name -> count, and
            ``total_xml`` is the number of XML files found (including any that
            failed to parse).
    """
    class_counter = Counter()
    per_country = defaultdict(Counter)
    total_xml = 0

    for country in COUNTRIES:
        xml_dir = get_xml_dir(TRAIN_SRC / country)

        # Sorted so progress and warnings appear in a stable order between runs.
        xml_files = sorted(xml_dir.glob("*.xml"))
        total_xml += len(xml_files)

        for xml in track(xml_files, description=f"[cyan]{country}"):
            try:
                root = ET.parse(xml).getroot()
            except ET.ParseError as exc:
                logger.warning("Skipping unparsable annotation %s: %s", xml, exc)
                continue

            for obj in root.findall("object"):
                # findtext returns None when <name> is absent; an empty element
                # yields ""/None as well, so normalise both to "".
                cls = (obj.findtext("name") or "").strip()
                if not cls:
                    logger.warning("Object without a class name in %s", xml)
                    continue

                class_counter[cls] += 1
                per_country[country][cls] += 1

    return class_counter, per_country, total_xml

In [5]:
# Scan once; the three results feed the table cells that follow.
(
    train_classes,
    train_country_classes,
    train_xmls,
) = scan_train_classes()

logger.info(f"Annotated training XML files: {train_xmls}")

2026-02-02 11:58:00,601 | INFO | Annotated training XML files: 21041


In [6]:
# Overall class distribution, most frequent class first.
table = Table(title="TRAIN SET – VOC Class Distribution")
table.add_column("Class name", style="bold cyan")
table.add_column("Count", justify="right")

for class_name, occurrences in train_classes.most_common():
    # Rich table cells must be strings, so the count is converted explicitly.
    table.add_row(class_name, str(occurrences))

console.print(table)

In [7]:
# One table per country, each sorted by descending object count.
for country in COUNTRIES:
    table = Table(title=f"{country} – TRAIN Class Distribution")
    table.add_column("Class name", style="bold green")
    table.add_column("Count", justify="right")

    country_counts = train_country_classes[country]
    for class_name, occurrences in country_counts.most_common():
        table.add_row(class_name, str(occurrences))

    console.print(table)

# Dataset understanding

| Code    | Meaning (standard RDD)           |
| ------- | -------------------------------- |
| D00     | Longitudinal crack               |
| D10     | Transverse crack                 |
| D20     | Alligator crack                  |
| D40     | Pothole                          |
| D44     | White line blur                  |
| D50     | Utility hole (manhole cover)     |
| D43     | Crosswalk blur                   |
| D01/D11 | Construction joint (long./lat.)  |
| D0w0    | Noise / typo (1 sample → ignore) |


In [8]:
# Class index written at the start of every YOLO label line.
BINARY_CLASS_ID = 0
# Name chosen for the single merged class.
BINARY_CLASS_NAME = "road_anomaly"

# Data Training

In [9]:
import random
import shutil
import logging
import xml.etree.ElementTree as ET
from pathlib import Path
from PIL import Image

from rich.console import Console
from rich.progress import track

In [10]:
# Fresh rich console for the dataset-building section.
console = Console()

# NOTE: logging.basicConfig only configures a root logger that has no handlers
# yet, so if the earlier logging cell already ran in this kernel this call
# changes nothing; it matters only when this section is run on its own.
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
logger = logging.getLogger("binary_builder")

In [11]:
# --- Input locations -------------------------------------------------------
ROOT = Path("../../data")

TRAIN_SRC = ROOT.joinpath("train")   # annotated
TEST_SRC = ROOT.joinpath("test1")    # inference only (DO NOT TOUCH)

COUNTRIES = ["India", "Czech", "Japan"]

# --- Output locations (YOLO layout) ----------------------------------------
OUT = ROOT.joinpath("yolo_binary")

TRAIN_OUT = OUT.joinpath("train")
VAL_OUT = OUT.joinpath("val")

# --- Split parameters ------------------------------------------------------
VAL_RATIO = 0.2   # fraction of each country's images held out for validation
SEED = 42

# Seed Python's global RNG so shuffles that rely on it start from a known state.
random.seed(SEED)

In [12]:
# Create <split>/images and <split>/labels for both splits; existing
# directories are left as they are.
for split in (TRAIN_OUT, VAL_OUT):
    for subdir in ("images", "labels"):
        (split / subdir).mkdir(parents=True, exist_ok=True)

logger.info("YOLO output directories created")

2026-02-02 11:58:01,757 | INFO | YOLO output directories created


In [13]:
def get_xml_dir(country_root: Path):
    """
    Return ``<country_root>/annotations/xmls``, asserting that it exists.

    NOTE(review): this has the same body as the ``get_xml_dir`` defined earlier
    in this notebook; consider keeping a single definition.
    """
    candidate = country_root.joinpath("annotations", "xmls")
    assert candidate.exists(), f"Missing {candidate}"
    return candidate

In [14]:
def voc_to_yolo_binary(xml_path: Path, img_path: Path):
    """
    Convert one Pascal-VOC annotation file into single-class YOLO label lines.

    Every ``<object>`` in the XML is converted regardless of its class name;
    each resulting line uses ``BINARY_CLASS_ID`` as the class index, followed by
    the box centre and size normalised by the real image dimensions (read from
    ``img_path``, not from the XML).

    Boxes are clamped to the image bounds and corner order is normalised.
    Objects with a missing/non-numeric ``<bndbox>`` or with zero area after
    clamping are logged and skipped, since they cannot form a valid YOLO row.

    Args:
        xml_path: Path to the VOC ``.xml`` annotation.
        img_path: Path to the image the annotation belongs to.

    Returns:
        list[str]: ``"<class> <xc> <yc> <w> <h>"`` lines, six decimals each;
        empty when the file contains no usable object.
    """
    root = ET.parse(xml_path).getroot()

    # Image.open keeps the file handle open until closed; with thousands of
    # images that leaks descriptors, so close it as soon as the size is known.
    with Image.open(img_path) as img:
        w, h = img.size

    yolo_lines = []

    for obj in root.findall("object"):
        box = obj.find("bndbox")
        if box is None:
            logger.warning("Object without <bndbox> in %s", xml_path)
            continue

        try:
            # float(None) raises TypeError when a coordinate tag is missing.
            xmin, ymin, xmax, ymax = (
                float(box.findtext(tag)) for tag in ("xmin", "ymin", "xmax", "ymax")
            )
        except (TypeError, ValueError):
            logger.warning("Malformed <bndbox> coordinates in %s", xml_path)
            continue

        # Clamp to the image and normalise corner order so that every emitted
        # value stays inside [0, 1].
        left, right = sorted((min(max(xmin, 0.0), w), min(max(xmax, 0.0), w)))
        top, bottom = sorted((min(max(ymin, 0.0), h), min(max(ymax, 0.0), h)))

        if right <= left or bottom <= top:
            logger.warning("Zero-area box skipped in %s", xml_path)
            continue

        xc = ((left + right) / 2) / w
        yc = ((top + bottom) / 2) / h
        bw = (right - left) / w
        bh = (bottom - top) / h

        yolo_lines.append(
            f"{BINARY_CLASS_ID} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}"
        )

    return yolo_lines

In [15]:
def split_images(country_root: Path):
    """
    Split one country's images into reproducible train and validation lists.

    Only regular files with a common image extension under
    ``<country_root>/images`` are considered. The list is sorted before being
    shuffled with a private ``random.Random(SEED)``, so the result depends
    neither on filesystem listing order nor on how often the global RNG was
    used before. Re-running the build therefore yields the same split and
    cannot move an image from train to val while its old copy stays on disk.

    Args:
        country_root: Directory of one country, containing ``images/``.

    Returns:
        tuple[list[Path], list[Path]]: ``(train_images, val_images)``; the last
        ``VAL_RATIO`` share of the shuffled list becomes validation.
    """
    image_suffixes = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

    img_dir = country_root / "images"
    images = sorted(
        path
        for path in img_dir.glob("*")
        if path.is_file() and path.suffix.lower() in image_suffixes
    )

    # Dedicated generator: same seed + same sorted input => same permutation.
    random.Random(SEED).shuffle(images)

    split_idx = int(len(images) * (1 - VAL_RATIO))
    return images[:split_idx], images[split_idx:]

In [16]:
def build_split(country: str, images, split_out: Path):
    """
    Write the YOLO files for one country's image subset into ``split_out``.

    For each image the matching ``<stem>.xml`` is looked up in the country's
    annotation folder. Images without an XML file, or whose conversion yields
    no label lines, are left out. Kept images are copied to
    ``split_out/images`` and their labels written to ``split_out/labels``,
    both prefixed with the country name.

    Returns:
        int: number of images kept.
    """
    annotations = get_xml_dir(TRAIN_SRC / country)
    image_dest = split_out / "images"
    label_dest = split_out / "labels"
    progress_label = f"[cyan]{country} → {split_out.name}"

    written = 0
    for image_path in track(images, description=progress_label):
        annotation = annotations / f"{image_path.stem}.xml"
        if annotation.exists():
            label_rows = voc_to_yolo_binary(annotation, image_path)
            if label_rows:
                # Country prefix keeps file names unique across countries.
                shutil.copy(image_path, image_dest / f"{country}_{image_path.name}")
                label_file = label_dest / f"{country}_{image_path.stem}.txt"
                label_file.write_text("\n".join(label_rows))
                written += 1

    return written

In [17]:
# Running totals of images actually written to each split.
train_count, val_count = 0, 0

for country in COUNTRIES:
    country_root = TRAIN_SRC / country
    train_imgs, val_imgs = split_images(country_root)

    written_train = build_split(country, train_imgs, TRAIN_OUT)
    written_val = build_split(country, val_imgs, VAL_OUT)

    train_count += written_train
    val_count += written_val

logger.info(f"Final TRAIN samples: {train_count}")
logger.info(f"Final VAL samples:   {val_count}")

# Sanity floor on dataset size; a far smaller count means the build went wrong.
assert train_count > 5000, "Too few training samples"
assert val_count > 1000, "Too few validation samples"

2026-02-02 11:58:12,602 | INFO | Final TRAIN samples: 11651
2026-02-02 11:58:12,603 | INFO | Final VAL samples:   2918


In [18]:
# Dataset descriptor: absolute dataset root, image folders relative to it,
# and the single class. The class name comes from BINARY_CLASS_NAME so the
# YAML cannot drift from the constant declared for the merged class.
(OUT / "data.yaml").write_text(f"""
path: {OUT.resolve()}
train: train/images
val: val/images

nc: 1
names: ["{BINARY_CLASS_NAME}"]
""".strip(), encoding="utf-8")

logger.info("data.yaml created")

2026-02-02 11:58:12,642 | INFO | data.yaml created
