In [2]:
from pathlib import Path
from collections import Counter

# Directory scanned for both the images (*.jpg) and the label files (*.txt).
DATA_DIR = Path("../../../data/combined_annotatedv2")

image_files = list(DATA_DIR.glob("*.jpg"))
label_files = list(DATA_DIR.glob("*.txt"))

print(f"Total images: {len(image_files)}")
print(f"Total label files: {len(label_files)}")

# Tallies filled in by the scan below.
class_counter = Counter()  # class id (int) or class name (str) -> number of lines
empty_labels = 0           # zero-byte label files
non_yolo_files = 0         # NOTE: despite the name, this counts *lines*, not files

for label_path in label_files:
    # Zero-byte files are tallied and never opened.
    if not label_path.stat().st_size:
        empty_labels += 1
        continue

    with label_path.open("r", encoding="utf-8", errors="ignore") as handle:
        for raw_line in handle:
            tokens = raw_line.split()

            # Only lines with at least 5 whitespace-separated tokens are
            # counted; blank or shorter lines are ignored.
            if len(tokens) >= 5:
                first_token = tokens[0]

                if first_token.isdigit():
                    # Numeric class id in the first column.
                    class_key = int(first_token)
                else:
                    # String label: everything except the last four tokens
                    # is treated as the (possibly multi-word) class name.
                    non_yolo_files += 1
                    class_key = " ".join(tokens[:-4])

                class_counter[class_key] += 1

# Keys are printed in first-seen order (Counter keeps insertion order).
print("\nClass distribution (raw):")
for class_key, line_count in class_counter.items():
    print(f"{class_key}: {line_count}")

print(f"\nEmpty annotation files: {empty_labels}")
print(f"Non-YOLO (string-label) lines detected: {non_yolo_files}")


Total images: 25600
Total label files: 25600

Class distribution (raw):
4: 6918
6: 4224
7: 548
0: 12810
1: 2340
2: 7456
3: 708
5: 906

Empty annotation files: 8317
Non-YOLO (string-label) lines detected: 0


In [4]:
from pathlib import Path

# Build a 2-class copy of the dataset: images are copied byte-for-byte and
# every label line has its class id remapped (CRACK_CLASSES -> 0,
# POTHOLE_CLASSES -> 1) while the four box fields are kept verbatim.
SRC_DIR = Path("../../../data/combined_annotatedv2")
DST_DIR = Path("../../../data/combined_annotated_2class")

DST_DIR.mkdir(exist_ok=True)

# Source class ids grouped by the new class they collapse into.
CRACK_CLASSES = {0, 1, 2, 3, 4, 5}
POTHOLE_CLASSES = {6, 7}

# Copy images
for img in SRC_DIR.glob("*.jpg"):
    (DST_DIR / img.name).write_bytes(img.read_bytes())

# Convert labels
skipped_lines = 0        # lines that are not "<int> <num> <num> <num> <num> ..."
unknown_class_lines = 0  # well-formed lines whose class id is in neither group
processed_boxes = 0      # lines written to the destination labels

for label in SRC_DIR.glob("*.txt"):
    dst_label = DST_DIR / label.name

    # Keep zero-byte label files as zero-byte files in the destination.
    if label.stat().st_size == 0:
        dst_label.write_text("", encoding="utf-8")
        continue

    new_lines = []

    # Same decoding as the statistics cell so both cells see identical lines
    # regardless of the platform's default encoding.
    with open(label, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.split()

            # Skip invalid / metadata lines (blank lines land here too).
            if len(parts) < 5:
                skipped_lines += 1
                continue

            try:
                cls = int(parts[0])
                # Actually parse the four box fields: plain unpacking never
                # raises, so non-numeric coordinates used to be copied through.
                for field in parts[1:5]:
                    float(field)
            except ValueError:
                skipped_lines += 1
                continue

            # Keep the original text of the box fields so no precision is lost.
            x, y, w, h = parts[1:5]

            if cls in CRACK_CLASSES:
                new_cls = 0
            elif cls in POTHOLE_CLASSES:
                new_cls = 1
            else:
                # Class id outside both groups: drop the box, but count it
                # instead of discarding it silently.
                unknown_class_lines += 1
                continue

            new_lines.append(f"{new_cls} {x} {y} {w} {h}")
            processed_boxes += 1

    dst_label.write_text("\n".join(new_lines), encoding="utf-8")

print(f"Processed boxes: {processed_boxes}")
print(f"Skipped malformed lines: {skipped_lines}")
# Only reported when it happens, so a clean run prints exactly the two lines above.
if unknown_class_lines:
    print(f"Dropped boxes with unmapped class ids: {unknown_class_lines}")


Processed boxes: 35910
Skipped malformed lines: 8


In [5]:
from pathlib import Path

# Sanity check on the converted labels: count every line that does not split
# into exactly 5 whitespace-separated fields (blank lines count as bad too).
bad = 0
for label_path in Path("../../../data/combined_annotated_2class").glob("*.txt"):
    with open(label_path) as handle:
        # Each mismatching line contributes True (== 1) to the sum.
        bad += sum(len(row.split()) != 5 for row in handle)

print("Bad YOLO lines:", bad)


Bad YOLO lines: 0
