In [3]:
import os
import glob


In [1]:
%pwd

'/home/uday/Practise/github_personal/pothole_detection/pothole_detection/research'

In [4]:
# Step up one directory so later relative paths (e.g. config/config.yaml)
# resolve; in the recorded run this moves from research/ to its parent.
# NOTE(review): the path is relative, so re-running this cell moves up one
# more level each time — restart the kernel before running it again.
os.chdir("../")

In [5]:
%pwd

'/home/uday/Practise/github_personal/pothole_detection/pothole_detection'

In [24]:
from pathlib import Path
from typing import List
from dataclasses import dataclass

In [25]:
@dataclass(frozen=True)
class DataValidatorConfig:
    """Immutable settings that tell DataValidation where the dataset lives.

    DataValidation joins ``data_dir``, ``data_folder`` and then either
    ``train_folder`` or ``val_folder`` to locate each split, and looks for
    every entry of ``sub_dirs`` inside each split directory.
    """
    # Root directory under which the dataset folder is located.
    data_dir: Path
    # Name of the dataset folder inside data_dir.
    data_folder: str
    # Name of the training split folder inside the dataset folder.
    train_folder: str
    # Name of the validation split folder inside the dataset folder.
    val_folder: str
    # Sub-directories required in every split; DataValidation reads index 0
    # as the image directory and index 1 as the label directory.
    sub_dirs: List[str]

In [26]:
from potholeDetector.constants import *
from potholeDetector.utils.common import *

In [36]:
class ConfigurationManager:
    """Loads the project's YAML files and builds typed config objects from them."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Read the configuration and parameter YAML files.

        Args:
            config_filepath: Path to the main configuration YAML file.
            params_filepath: Path to the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_validation_config(self) -> DataValidatorConfig:
        """
        Build the data-validation settings from the ``data_validation`` section.

        The section is read with attribute access, so ``read_yaml`` is assumed
        to return an attribute-accessible mapping (TODO confirm against
        potholeDetector.utils.common).

        Returns:
            DataValidatorConfig: Frozen settings object for DataValidation.
        """
        section = self.config.data_validation

        # Build the result directly from the YAML section: the getter no
        # longer stashes copies of these values on the manager itself, and
        # keyword arguments keep the mapping correct if the dataclass fields
        # are ever reordered.
        return DataValidatorConfig(
            # The dataclass declares data_dir as a Path; YAML yields a string.
            data_dir=Path(section.data_dir),
            data_folder=section.data_folder,
            train_folder=section.train_folder,
            val_folder=section.val_folder,
            sub_dirs=section.sub_dirs,
        )

In [37]:
import os
import glob
import zipfile
from pathlib import Path

In [47]:
class DataValidation:
    """
    Validates a YOLO-style dataset layout and prunes unpaired files.

    The train and validation split directories are resolved once from the
    config as ``data_dir/data_folder/train_folder`` and
    ``data_dir/data_folder/val_folder``.
    """

    # The annotation is quoted so the class can be defined even when the
    # config dataclass is not (yet) in scope.
    def __init__(self, config: "DataValidatorConfig"):
        """
        Args:
            config: Settings providing data_dir, data_folder, train_folder,
                val_folder and sub_dirs.
        """
        self.config = config
        dataset_root = os.path.join(self.config.data_dir, self.config.data_folder)
        self.train_dir = os.path.join(dataset_root, self.config.train_folder)
        self.valid_dir = os.path.join(dataset_root, self.config.val_folder)

    def validate_folder_structure(self):
        """
        Validate the folder structure of the dataset for YOLO.

        Both split directories must exist and each must contain every
        sub-directory listed in ``config.sub_dirs``.

        Returns:
            bool: True if the folder structure is valid, False otherwise.
        """
        for split_dir in (self.train_dir, self.valid_dir):
            if not os.path.isdir(split_dir):
                # Report the problem through the return value; `raise False`
                # is itself an error (TypeError), not a "False" result.
                return False
            for folder in self.config.sub_dirs:
                if not os.path.isdir(os.path.join(split_dir, folder)):
                    return False
        return True

    def validate_image_label_counts(self):
        """
        Make images and label files pair up one-to-one in both splits.

        Any image without a label, or label without an image, is deleted
        from disk. Returns nothing.
        """
        for split_dir in (self.train_dir, self.valid_dir):
            self._validate_counts_for_folder(split_dir)

    def _validate_counts_for_folder(self, folder_path):
        """
        Delete unpaired image and label files inside one split folder.

        ``config.sub_dirs[0]`` is used as the image directory and
        ``config.sub_dirs[1]`` as the label directory. Files are paired by
        their name without extension.

        Note: only files matching ``*.jpg`` are treated as images, so a label
        whose image uses any other extension is considered unpaired and is
        deleted.

        Args:
            folder_path (str): Path to the split folder containing the image
                and label sub-directories.
        """
        required_subdirs = self.config.sub_dirs
        image_dir = os.path.join(folder_path, required_subdirs[0])
        label_dir = os.path.join(folder_path, required_subdirs[1])

        image_stems = set(self._get_file_names(image_dir, ".jpg"))
        label_stems = set(self._get_file_names(label_dir, ".txt"))

        # A stem present on only one side has no partner. Whichever of the two
        # files actually exists is removed; _delete_file ignores the other.
        for stem in image_stems ^ label_stems:
            self._delete_file(os.path.join(image_dir, f"{stem}.jpg"))
            self._delete_file(os.path.join(label_dir, f"{stem}.txt"))

    def _get_file_names(self, directory, extension):
        """
        Get the names (without extension) of files with a given extension.

        Args:
            directory (str): Directory path.
            extension (str): File extension (e.g., ".jpg", ".txt").

        Returns:
            list: File names with directory and extension stripped, in the
                order returned by glob.
        """
        # Escape the directory so characters such as "[" or "*" in the path
        # are matched literally rather than interpreted as glob wildcards.
        pattern = os.path.join(glob.escape(os.fspath(directory)), f"*{extension}")
        return [Path(match).stem for match in glob.glob(pattern)]

    def _delete_file(self, file_path):
        """
        Delete a file if it exists; do nothing otherwise.

        Args:
            file_path (str): Path to the file.
        """
        if os.path.isfile(file_path):
            os.remove(file_path)


In [49]:
# Run the data-validation stage against the project configuration.
# (The previous `try/except Exception as e: raise e` wrapper only re-raised
# the same exception, so errors simply propagate on their own here.)
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)

# Check the layout first and stop before any files are deleted if it is wrong;
# the boolean result must not be silently discarded.
if not data_validation.validate_folder_structure():
    raise FileNotFoundError(
        "Dataset folder structure is invalid: expected sub-directories "
        f"{list(data_validation_config.sub_dirs)} inside "
        f"{data_validation.train_dir!r} and {data_validation.valid_dir!r}"
    )

data_validation.validate_image_label_counts()


[2023-07-22 13:35:57,278: INFO:common:YAML file: config/config.yaml loaded successfully]
[2023-07-22 13:35:57,279: INFO:common:YAML file: params.yaml loaded successfully]
