In [1]:
# One-time setup (use %pip so the packages install into this kernel's environment):
# %pip install tqdm
# %pip install ultralytics

In [2]:
from pathlib import Path
import pandas as pd
import os
from tqdm.notebook import tqdm
import yaml as yml
from IPython.display import display
from ultralytics.data.utils import verify_image_label,IMG_FORMATS, img2label_paths

In [3]:
# Name of the dataset description file looked up in every dataset root.
YAML_FILE_NAME = r"data.yaml"
# Relative train/val weights (9:1). Not referenced by the visible cells;
# presumably the target split the ratios in the stats table are compared to.
DEFAULT_RATIO = {"train":9, "val":1}


def get_files_with_extensions(folder_path, extensions):
    """Return the files directly inside ``folder_path`` whose extension is listed.

    Args:
        folder_path: ``pathlib.Path`` of the directory to scan (not recursive).
        extensions: Collection of lower-case extensions without the leading
            dot, e.g. ``{"jpg", "png"}`` or ``["txt"]``.

    Returns:
        list of ``pathlib.Path`` objects, in the order ``iterdir()`` yields them.

    Raises:
        FileNotFoundError / NotADirectoryError: if ``folder_path`` cannot be
            listed (propagated from ``Path.iterdir``).

    Notes:
        * Entries without an extension (``README``, ``.gitkeep``) are skipped
          instead of raising ``IndexError``.
        * The comparison is case-insensitive, so ``photo.JPG`` matches ``jpg``.
        * Sub-directories are ignored even if their name ends in a listed
          extension.
    """
    matches = []
    for file in folder_path.iterdir():
        # ``suffix`` is "" for extension-less names; strip the dot and normalise case.
        extension = file.suffix[1:].lower()
        if extension and extension in extensions and file.is_file():
            matches.append(file)
    return matches
class YOLODataset:
    """One YOLO-format dataset folder and the sanity checks run against it.

    Expected layout below ``dataset_path``::

        images/train   images/val
        labels/train   labels/val
        data.yaml

    ``get_stats()`` scans the folders and populates the ``train_*``, ``val_*``,
    ``total_*`` and ``background_images`` attributes. The other checks call it
    automatically if it has not been run yet.
    """

    # Index of the message string in the sequence returned by verify_image_label.
    _VERIFY_MSG_INDEX = 9

    def __init__(self, dataset_path):
        """Derive all sub-paths from ``dataset_path`` (a ``pathlib.Path``); no disk access."""
        self.path = dataset_path
        self.dataset_name = self.path.name
        self.image_dir = self.path / r"images"
        self.label_dir = self.path / r"labels"
        self.train_images_path = self.image_dir / "train"
        self.train_labels_path = self.label_dir / "train"
        self.val_images_path = self.image_dir / "val"
        self.val_labels_path = self.label_dir / "val"
        self.yaml_file = self.path / YAML_FILE_NAME

    def __str__(self):
        """Return the dataset folder name."""
        return self.dataset_name

    @staticmethod
    def _images_without_label(images, labels):
        """Return the images whose stem has no label file with the same stem."""
        labelled_stems = {label.stem for label in labels}
        return [image for image in images if image.stem not in labelled_stems]

    def get_stats(self):
        """Scan the dataset folders and return a one-row summary ``DataFrame``.

        Side effects: sets ``train_images``, ``val_images``, ``train_labels``,
        ``val_labels``, ``total_images``, ``total_labels``, the matching
        ``*_count`` attributes, ``background_images``, ``BG_count``,
        ``train_ratio`` and ``val_ratio`` on the instance.

        Returns:
            ``pandas.DataFrame`` with a single row and the columns
            ``dataset_name``, ``train_images_count``, ``train_labels_count``,
            ``val_images_count``, ``val_labels_count``, ``total_images_count``,
            ``total_labels_count``, ``train_ratio``, ``val_ratio``, ``BG_count``.

        Raises:
            FileNotFoundError: if one of the four image/label folders is missing.
        """
        self.train_images = get_files_with_extensions(self.train_images_path, IMG_FORMATS)
        self.val_images = get_files_with_extensions(self.val_images_path, IMG_FORMATS)
        self.train_labels = get_files_with_extensions(self.train_labels_path, ["txt"])
        self.val_labels = get_files_with_extensions(self.val_labels_path, ["txt"])
        self.total_images = self.train_images + self.val_images
        self.total_labels = self.train_labels + self.val_labels

        # Count the number of images and labels
        self.train_images_count, self.train_labels_count = len(self.train_images), len(self.train_labels)
        self.val_images_count, self.val_labels_count = len(self.val_images), len(self.val_labels)
        self.total_images_count, self.total_labels_count = len(self.total_images), len(self.total_labels)

        # Background images are images without a label file. Match per split:
        # a label with the same stem in the *other* split does not belong to
        # the image and must not hide it.
        self.background_images = (
            self._images_without_label(self.train_images, self.train_labels)
            + self._images_without_label(self.val_images, self.val_labels)
        )
        self.BG_count = len(self.background_images)

        # Share of train / val images; 0.0 for an empty dataset instead of
        # raising ZeroDivisionError.
        if self.total_images_count:
            self.train_ratio = self.train_images_count / self.total_images_count
            self.val_ratio = self.val_images_count / self.total_images_count
        else:
            self.train_ratio = 0.0
            self.val_ratio = 0.0

        summary = {
            "dataset_name": self.dataset_name,
            "train_images_count": self.train_images_count,
            "train_labels_count": self.train_labels_count,
            "val_images_count": self.val_images_count,
            "val_labels_count": self.val_labels_count,
            "total_images_count": self.total_images_count,
            "total_labels_count": self.total_labels_count,
            "train_ratio": self.train_ratio,
            "val_ratio": self.val_ratio,
            "BG_count": self.BG_count,
        }
        return pd.DataFrame([summary])

    def check_yaml_file(self):
        """Print every problem found in the dataset's ``data.yaml``; print nothing if it is fine.

        Checks performed:
            * the file exists and contains a mapping,
            * the last component of ``path`` equals the dataset folder name,
            * ``train`` is ``images/train``; ``val`` and ``test`` are ``images/val``,
            * ``names`` is ``{0: "person"}`` (the list form ``["person"]`` is
              treated as equivalent).

        Every failing check is reported, not only the first one, and each
        message names the dataset so the output stays attributable when this
        is called in a loop. Missing keys are reported as ``None``.
        """
        if not self.yaml_file.exists():
            print(rf"for {self.dataset_name} yaml file does not exist")
            return

        with open(self.yaml_file, "r", encoding="utf-8") as file:
            data = yml.safe_load(file)

        # An empty file loads as None; anything but a mapping cannot be checked.
        if not isinstance(data, dict):
            print(rf"for {self.dataset_name} yaml file is empty or not a mapping")
            return

        path_value = data.get("path")
        if path_value is None or Path(str(path_value)).name != self.dataset_name:
            print(rf"for {self.dataset_name} yaml file has invalid path Path: {path_value}")

        expected_splits = (("train", r"images/train"), ("val", r"images/val"), ("test", r"images/val"))
        for key, expected in expected_splits:
            if data.get(key) != expected:
                print(rf"for {self.dataset_name} yaml file has invalid path {key}: {data.get(key)}")

        names = data.get("names")
        # Normalise the list form to the index -> name mapping before comparing.
        normalised_names = dict(enumerate(names)) if isinstance(names, list) else names
        if normalised_names != {0: "person"}:
            print(rf"for {self.dataset_name} yaml file has invalid names: {names}")

    def check_labels_format(self):
        """Check every label file for 5-field lines and class id 0.

        Zero-byte label files are deleted from disk (as before). Deleted or
        already-missing files are also dropped from ``self.total_labels`` so
        the method can be run repeatedly; call ``get_stats()`` again to refresh
        the counts afterwards.

        Blank lines are ignored. A line is counted as invalid when it does not
        have exactly five whitespace-separated fields or when its first field
        is not an integer.

        Returns:
            ``(labels_with_invalid_format, labels_with_wrong_class_id)``: two
            lists of label paths, each file listed at most once per list.
        """
        if not hasattr(self, "total_labels"):
            self.get_stats()

        labels_with_invalid_format = []
        labels_with_wrong_class_id = []
        remaining_labels = []

        for label in self.total_labels:
            if not os.path.exists(label):
                # Removed by an earlier run (or externally); nothing to check.
                continue
            if os.stat(label).st_size == 0:
                os.remove(label)
                continue
            remaining_labels.append(label)

            has_invalid_line = False
            has_wrong_class_id = False
            with open(label, "r") as file:
                for line in file:
                    fields = line.split()
                    if not fields:
                        continue
                    if len(fields) != 5:
                        has_invalid_line = True
                        continue
                    try:
                        class_id = int(fields[0])
                    except ValueError:
                        has_invalid_line = True
                        continue
                    if class_id != 0:
                        has_wrong_class_id = True

            if has_invalid_line:
                labels_with_invalid_format.append(label)
            if has_wrong_class_id:
                labels_with_wrong_class_id.append(label)

        self.total_labels = remaining_labels
        return labels_with_invalid_format, labels_with_wrong_class_id

    def verify_images_and_labels(self):
        """Run Ultralytics' ``verify_image_label`` on every image and print its messages.

        Label paths are derived from the image paths with ``img2label_paths``.
        Only non-empty messages are printed; nothing is returned.
        """
        if not hasattr(self, "total_images"):
            self.get_stats()

        label_paths = img2label_paths([str(file) for file in self.total_images])
        progress = tqdm(
            zip(self.total_images, label_paths),
            desc=f"Verifying {self.dataset_name}",
            total=len(self.total_images),
        )
        for image_path, label_path in progress:
            # NOTE(review): the tuple layout is presumably (im_file, lb_file,
            # prefix, keypoint, num_cls, nkpt, ndim, single_cls) - confirm
            # against the installed ultralytics version.
            result = verify_image_label((image_path, label_path, image_path.name, False, 1, 0, 0, True))
            message = result[self._VERIFY_MSG_INDEX]
            if message:
                print(message)

# Path to the directory containing the datasets

In [4]:
# NOTE: machine-specific absolute path. Point this at the folder that holds one
# sub-directory per dataset before running the notebook elsewhere.
root_dir = Path(r"E:\analysis\Task_cycle\2025_04_TC04\datacheck")
# iterdir() yields entries in arbitrary order; sort so the tables and logs in
# the cells below always list the datasets in the same order.
datasets = [YOLODataset(sub_dir) for sub_dir in sorted(root_dir.iterdir()) if sub_dir.is_dir()]
# Empty placeholder for the per-dataset statistics table.
datasets_stats = pd.DataFrame()

# Check dataset statistics

In [5]:
# Build the summary table from scratch on every run: one single-row frame per
# dataset, concatenated once. Re-running this cell therefore never duplicates
# rows, and it avoids the quadratic cost of growing a DataFrame inside a loop.
stats_frames = [dataset.get_stats() for dataset in datasets]
# pd.concat raises on an empty list, so fall back to an empty frame.
datasets_stats = pd.concat(stats_frames, ignore_index=True) if stats_frames else pd.DataFrame()
display(datasets_stats)

Unnamed: 0,dataset_name,train_images_count,train_labels_count,val_images_count,val_labels_count,total_images_count,total_labels_count,train_ratio,val_ratio,BG_count
0,2025_04_tc04_dataset_06,28096,28096,3111,3111,31207,31207,0.900311,0.099689,0
1,2025_04_tc04_dataset_09,26894,24619,2746,2746,29640,27365,0.907355,0.092645,2275


# check data.yaml file

In [6]:
# Validate the data.yaml of every dataset. check_yaml_file() only prints when it
# finds a problem, so no output from this cell means no problem was reported.
for dataset in datasets:
    dataset.check_yaml_file()

# Check label file format

In [7]:
# Set to True to rewrite the class id of every line in the flagged label files
# to 0. Destructive: the label files are modified in place.
FIX_WRONG_CLASS_IDS = False


def force_class_id_zero(label_path):
    """Rewrite ``label_path`` in place so that every non-blank line starts with class id 0.

    All other fields are kept and re-joined with single spaces; blank lines
    are dropped.
    """
    with open(label_path, "r") as file:
        lines = file.readlines()
    mod_lines = []
    for line in lines:
        fields = line.strip().split()
        if not fields:
            # A blank line has no class id to replace.
            continue
        fields[0] = "0"
        mod_lines.append(" ".join(fields) + "\n")
    with open(label_path, "w") as file:
        file.writelines(mod_lines)


# Results of every dataset, keyed by dataset name, so they stay available after
# the loop: (labels_with_invalid_format, labels_with_wrong_class_id).
label_errors = {}
# Same shape as the value returned by check_labels_format().
errors = ([], [])
for dataset in datasets:
    errors = dataset.check_labels_format()
    label_errors[dataset.dataset_name] = errors
    print(dataset)
    print(errors)
    if FIX_WRONG_CLASS_IDS:
        for label_path in errors[1]:
            force_class_id_zero(label_path)

2025_04_tc04_dataset_06
([], [])
2025_04_tc04_dataset_09
([], [])


# Verify images and labels with the Ultralytics YOLOv8 checker

In [8]:
# Run the Ultralytics image/label verification on every dataset. Each call shows
# a progress bar and prints only the non-empty messages returned by the checker.
for dataset in datasets:
    dataset.verify_images_and_labels()

Verifying 2025_04_tc04_dataset_06:   0%|          | 0/31207 [00:00<?, ?it/s]

Verifying 2025_04_tc04_dataset_09:   0%|          | 0/29640 [00:00<?, ?it/s]

