## Initial dataset 

In [15]:
import os
import xml.etree.ElementTree as ET

def count_images(list_path):
    """Return the number of non-blank lines in the text file at ``list_path``.

    A line counts when it still contains characters after stripping
    surrounding whitespace. When ``list_path`` is not an existing regular
    file, a "File not found" message is printed and 0 is returned.
    """
    if os.path.isfile(list_path):
        with open(list_path, "r", encoding="utf-8") as handle:
            # Stream the file line by line; only non-blank entries are tallied.
            return sum(1 for raw_line in handle if raw_line.strip())

    print(f"File not found: {list_path}")
    return 0

# List files for each split; every non-blank line is counted as one image.
train_list = "data_tu/image_names_train.txt"
val_list   = "data_tu/image_names_val.txt"

nb_train = count_images(train_list)
nb_val   = count_images(val_list)
nb_total = nb_train + nb_val

print(f"Number of TRAIN images     : {nb_train}")
print(f"Number of VALIDATION images: {nb_val}")
print(f"Total number of images     : {nb_total}")

# count_images returns 0 for a missing list file, so both counts can be zero;
# guard the ratio instead of failing with ZeroDivisionError.
if nb_total:
    percentage = (nb_val / nb_total) * 100
    print("Validation set represents", round(percentage), "% of the dataset")
else:
    percentage = 0.0
    print("Validation share not computed: no images were counted")

Number of TRAIN images     : 1151
Number of VALIDATION images: 288
Total number of images     : 1439
Validation set represents 20 % of the dataset


## New dataset with data augmentation

In [16]:
from collections import defaultdict

def count_files_in_folder(folder_path, extensions={".jpg", ".jpeg", ".png"}):
    """Count the regular files directly inside ``folder_path`` with a given extension.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).
        extensions: Extensions to accept, each including the leading dot
            (e.g. ``{".jpg", ".png"}``). A single string is treated as one
            extension. Matching is case-insensitive on both sides.

    Returns:
        The number of matching files, or 0 (after printing a
        "Folder not found" message) when ``folder_path`` is not a directory.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        return 0

    if isinstance(extensions, str):
        extensions = (extensions,)
    # Normalise once so ".JPG" and ".jpg" are equivalent; the default set is
    # only read here, never mutated.
    wanted = {ext.lower() for ext in extensions}

    return sum(
        1
        for fname in os.listdir(folder_path)
        if os.path.splitext(fname)[1].lower() in wanted
        # os.listdir also yields sub-directories; a folder named like an
        # image must not be counted as a file.
        and os.path.isfile(os.path.join(folder_path, fname))
    )


def count_objects_per_class(annotations_folder):
    """Count annotated objects per class name across the XML files of a folder.

    Every file whose name ends in ``.xml`` (case-insensitive, matching how
    ``count_files_in_folder`` compares extensions) is parsed. Each ``<object>``
    element that is a direct child of the root adds one to the class given by
    the stripped text of its ``<name>`` child; objects with a missing or blank
    name are ignored.

    Args:
        annotations_folder: Directory containing the XML annotation files.

    Returns:
        A ``defaultdict(int)`` mapping class name to object count. It is empty
        when the folder does not exist (a "Folder not found" message is
        printed in that case).
    """
    class_counts = defaultdict(int)

    if not os.path.isdir(annotations_folder):
        print(f"Folder not found: {annotations_folder}")
        return class_counts

    # Sorted so the insertion order of classes does not depend on the
    # platform-specific order returned by os.listdir.
    for fname in sorted(os.listdir(annotations_folder)):
        if not fname.lower().endswith(".xml"):
            continue

        xml_path = os.path.join(annotations_folder, fname)
        try:
            root = ET.parse(xml_path).getroot()
        except Exception as exc:
            # Best effort: one unreadable or malformed file must not abort the
            # whole count, but it is reported so the gap is visible.
            print(f"Skipping unreadable annotation: {xml_path} ({exc})")
            continue

        for obj in root.findall("object"):
            name = (obj.findtext("name") or "").strip()
            # Strip before testing so a whitespace-only name is not counted
            # under the empty-string class.
            if name:
                class_counts[name] += 1

    return class_counts


# Folders of the exported dataset: images on one side, XML annotations on the other.
images_folder = "export_michelin_v1/images"
annotations_folder = "export_michelin_v1/annotations"

# Only .jpg/.png images are counted here (narrower than the helper's default set).
nb_images = count_files_in_folder(images_folder, extensions={".jpg", ".png"})
nb_xml = count_files_in_folder(annotations_folder, extensions={".xml"})

print(f"Number of images      : {nb_images}")
print(f"Number of annotations : {nb_xml}")

# This check compares the two totals only; it does not pair files by name.
count_gap = abs(nb_images - nb_xml)
if count_gap:
    print("Mismatch between images and annotations")
    print(f"Difference: {count_gap}")
else:
    print("Images and annotations match")

print("\nObject count per class")

class_counts = count_objects_per_class(annotations_folder)

def extract_value(cls_name):
    """Return the integer in the second ``:``-separated field of ``cls_name``.

    Used as a sort key, e.g. ``"temoin:25"`` -> ``25``.

    Returns:
        The parsed integer, or the sentinel ``999`` when ``cls_name`` has no
        second field, that field is not an integer, or ``cls_name`` is not a
        string.
    """
    try:
        return int(cls_name.split(":")[1])
    # Only the failures the parsing itself can produce are handled; a bare
    # ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, TypeError, ValueError):
        return 999

# Order classes by the number embedded in their name. The name itself is a
# secondary key so that classes sharing a value (for instance every name
# extract_value cannot parse, which all map to 999) are listed in a
# deterministic order instead of whatever order the counts were collected in.
sorted_classes = sorted(
    class_counts.items(),
    key=lambda item: (extract_value(item[0]), item[0]),
)

total_objects = sum(class_counts.values())

# Keep the 12-character name column, widening it only when a class name is
# longer so the counts stay aligned.
name_width = max(12, max(map(len, class_counts), default=0))

for cls, count in sorted_classes:
    print(f"{cls:{name_width}} : {count}")

print(f"\nTotal annotated objects: {total_objects}")

Number of images      : 1409
Number of annotations : 1409
Images and annotations match

Object count per class
temoin:0     : 143
temoin:25    : 200
temoin:50    : 200
temoin:75    : 200
temoin:80    : 200
temoin:90    : 200
temoin:95    : 66
temoin:100   : 200

Total annotated objects: 1409
