In [1]:
import os
import yaml
import logging
import sys
import shutil
import random
from pathlib import Path
# Move two directories up from the current file location
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

# Now import the LoggerManager
from Research_docs.utils.my_logger_module import LoggerManager

In [2]:

# Initialize the notebook-wide LoggerManager. The functions defined below reach
# the underlying logger through its `.logger` attribute (e.g. `logger.logger.info`).
logger = LoggerManager("dataset_logs", category="dataset_processing")



✅ Logger handlers initialized. Log file: C:\Users\sathish\Downloads\FL_ModelForAV\Research_docs\logs\dataset_processing\dataset_logs\log_2025-02-20_17-29-36.log
🧹 Checking for old logs to clean...
✅ Logging initialized for 'dataset_logs' in category 'dataset_processing'. Logs saved in C:\Users\sathish\Downloads\FL_ModelForAV\Research_docs\logs\dataset_processing\dataset_logs\log_2025-02-20_17-29-36.log


In [3]:
# Label name -> integer class id.
# Several names are aliases that share one id (person/pedestrian -> 0,
# motor/motorcycle -> 6, bike/bicycle -> 7), so the dict has 16 keys but only
# 13 distinct ids (0-12).
CATEGORY_MAPPING = {
    "person": 0,
    "pedestrian": 0,  # Alias: merged into person
    "rider": 1,
    "car": 2,
    "truck": 3,
    "bus": 4,
    "train": 5,
    "motor": 6,  # Motorcycle
    "motorcycle": 6,  # Alias: merged into motor
    "bike": 7,  # Bicycle
    "bicycle": 7,  # Alias: merged into bike
    "traffic light": 8,
    "traffic sign": 9,
    "trailer": 10,
    "other person": 11,
    "other vehicle": 12
}

In [4]:
# Build the class-name list so that CATEGORY_NAMES[i] is the name of class id i.
# CATEGORY_MAPPING contains alias keys that share an id (e.g. "person" and
# "pedestrian" are both 0). Listing every key would yield 16 names for 13 ids and
# shift every index after the first alias, so keep only the first name declared
# for each id.
_name_by_class_id = {}
for _label, _class_id in CATEGORY_MAPPING.items():
    _name_by_class_id.setdefault(_class_id, _label)
CATEGORY_NAMES = [_name_by_class_id[_class_id] for _class_id in sorted(_name_by_class_id)]


In [5]:
# ========== Utility Functions ==========

def get_image_files(directory: Path):
    """Return the image files (.jpg / .png) located directly inside `directory`.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A sorted list of `Path` objects. The list is empty when `directory`
        does not exist or is not a directory.
    """
    # is_dir() (rather than exists()) also covers the case where the path is a
    # regular file, on which iterdir() would raise.
    if not directory.is_dir():
        return []
    # Match extensions case-insensitively (".JPG" is still an image), skip
    # sub-directories that merely look like images, and sort because iterdir()
    # yields entries in arbitrary order.
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in ('.jpg', '.png')
    )


def write_list_to_file(file_path: Path, data_list: list):
    """Write each item of `data_list` on its own line of `file_path` (UTF-8).

    The file is overwritten if it exists; missing parent directories are created.
    An empty `data_list` produces an empty file rather than a single blank line.

    Args:
        file_path: Destination file.
        data_list: Items to write, one per line.

    Raises:
        IOError: If the file cannot be written. The underlying exception is
            attached as `__cause__`.
    """
    try:
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with file_path.open('w', encoding='utf-8') as f:
            # One terminated line per item; nothing at all for an empty list.
            f.writelines(f"{item}\n" for item in data_list)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise IOError(f"❌ Error writing to {file_path}: {e}") from e



def copy_files(files: list, source_dir: Path, dest_dir: Path):
    """Copy the files named by `files` from `source_dir` into `dest_dir`.

    Only the `.name` of each entry is used: the file is read from
    `source_dir / name` and written to `dest_dir / name` (metadata preserved
    via `shutil.copy2`).
    """
    for name in (entry.name for entry in files):
        shutil.copy2(source_dir / name, dest_dir / name)


def create_directories_if_not_exist(*dirs):
    """Make sure every given directory exists, creating missing parents too."""
    mkdir_options = {"parents": True, "exist_ok": True}
    for target in dirs:
        target.mkdir(**mkdir_options)


# def log_missing_directory(directory: Path, description: str):
#     """Logs a warning if a directory is missing."""
#     if not directory.exists():
#         logging.warning(f"Skipping {description}, directory missing: {directory}")



In [6]:
def create_batches(batch_root: Path, num_batches: int, logger):
    """Make sure `batch_root/batch_1` ... `batch_root/batch_<num_batches>` exist.

    Progress is reported through `logger.logger.info`.
    """
    logger.logger.info(f"Creating {num_batches} batch directories under {batch_root}...")

    numbered_paths = (batch_root / f"batch_{number}" for number in range(1, num_batches + 1))
    for batch_path in numbered_paths:
        batch_path.mkdir(parents=True, exist_ok=True)
        logger.logger.info(f"✅ Created/Verified batch directory: {batch_path}")


In [7]:
def save_directory_structure(batch_root: Path, logger):
    """Write a `directory_structure.txt` report into every sub-directory of `batch_root`.

    The report states, for each of train.txt / val.txt / test.txt, whether that
    file exists in the sub-directory. `batch_root` is created when missing.
    Write failures are logged as errors and do not stop the scan.
    """
    logger.logger.info("📂 Saving directory structure for all batches...")

    # Guarantee the root is there before scanning it
    batch_root.mkdir(parents=True, exist_ok=True)

    tracked_names = ["train.txt", "val.txt", "test.txt"]

    for batch_path in batch_root.iterdir():
        if batch_path.is_dir():
            # One status line per tracked split file
            report_lines = []
            for name in tracked_names:
                if (batch_path / name).exists():
                    report_lines.append(f"{name}: Exists")
                else:
                    report_lines.append(f"{name}: Missing")

            try:
                write_list_to_file(batch_path / "directory_structure.txt", report_lines)
                logger.logger.info(f"✅ directory_structure.txt saved in {batch_path}.")
            except Exception as err:
                logger.logger.error(f"❌ Error writing directory structure in {batch_path}: {err}")


In [8]:
def create_splits(batch_root: Path, logger):
    """Rewrite train.txt, val.txt and test.txt in `batch_root`.

    Each file lists the resolved paths of the images found in
    `batch_root/images/<split>`. `batch_root` is created when missing.
    A failure on one split file is logged and the remaining splits are still written.
    """
    logger.logger.info("📂 Generating dataset split files (train.txt, val.txt, test.txt)...")

    # Guarantee the root is there
    batch_root.mkdir(parents=True, exist_ok=True)

    images_root = batch_root / "images"

    # Output file for every split
    targets = {
        "train": batch_root / "train.txt",
        "val": batch_root / "val.txt",
        "test": batch_root / "test.txt",
    }

    # Collect all listings first, then write
    listings = {}
    for split_name in ["train", "val", "test"]:
        listings[split_name] = get_image_files(images_root / split_name)

    for split_name in targets:
        try:
            resolved = [str(image.resolve()) for image in listings[split_name]]
            write_list_to_file(targets[split_name], resolved)
            logger.logger.info(f"✅ {split_name}.txt updated.")
        except Exception as err:
            logger.logger.error(f"❌ Error updating {split_name}.txt: {err}")


In [9]:
import random
import shutil
from pathlib import Path

def create_mini_dataset(source_root: Path, batch_root: Path, num_images=10, num_batches=5, logger=None):
    """Create `num_batches` small random subsets of a dataset under `batch_root/batches/`.

    For every batch and every split (train / val / test), up to `num_images`
    images are drawn at random from `source_root/images/<split>` and copied to
    `batch_root/batches/batch_<n>/images/<split>`. A label file with the same
    stem in `source_root/labels/<split>` is copied alongside when it exists.
    Each batch is sampled independently, so batches may share images.

    Args:
        source_root: Dataset root containing `images/` and `labels/`.
        batch_root: Destination root; a `batches/` folder is created inside it.
        num_images: Maximum number of images per split in each batch.
        num_batches: Number of batches to create.
        logger: Object exposing a `.logger` attribute with logging methods.
            When None, a standard-library logger is used.

    Returns:
        None. Returns early (after logging an error) if `source_root` is missing.
    """
    # `logger` defaults to None: fall back to a stdlib logger instead of
    # failing on `None.logger`.
    log = logger.logger if logger is not None else logging.getLogger(__name__)
    log.info(f"📂 Creating {num_batches} mini batches inside {batch_root / 'batches'}...")

    # Ensure source dataset exists
    if not source_root.exists():
        log.error(f"❌ Source directory {source_root} does not exist.")
        return

    # Ensure destination folders exist
    batch_root.mkdir(parents=True, exist_ok=True)
    batches_dir = batch_root / "batches"
    batches_dir.mkdir(parents=True, exist_ok=True)

    images_src = source_root / "images"
    labels_src = source_root / "labels"
    splits = ("train", "val", "test")

    # List every split once up front; the listing does not change between
    # batches. Sorting makes the selection reproducible when `random` is seeded.
    # None marks a split whose image folder is missing.
    candidates = {}
    for split in splits:
        src_images = images_src / split
        if src_images.is_dir():
            candidates[split] = sorted(
                f for f in src_images.iterdir()
                if f.is_file() and f.suffix.lower() in ('.jpg', '.png')
            )
        else:
            candidates[split] = None

    for batch_num in range(1, num_batches + 1):
        batch_dir = batches_dir / f"batch_{batch_num}"
        batch_dir.mkdir(parents=True, exist_ok=True)

        for split in splits:
            src_images = images_src / split
            src_labels = labels_src / split
            dest_images = batch_dir / "images" / split
            dest_labels = batch_dir / "labels" / split

            # Destination folders are created even for skipped splits
            dest_images.mkdir(parents=True, exist_ok=True)
            dest_labels.mkdir(parents=True, exist_ok=True)

            image_files = candidates[split]
            if image_files is None:
                log.warning(f"⚠️ Skipping {split} in batch_{batch_num}, images folder missing: {src_images}")
                continue
            if not image_files:
                log.warning(f"⚠️ No images found in {src_images}, skipping.")
                continue

            # Never ask for more images than are available
            selected_images = random.sample(image_files, min(num_images, len(image_files)))

            for image in selected_images:
                shutil.copy2(image, dest_images / image.name)

                # Copy corresponding label if exists
                label_path = src_labels / f"{image.stem}.txt"
                if label_path.exists():
                    shutil.copy2(label_path, dest_labels / label_path.name)

    log.info(f"✅ Mini batches created successfully inside {batches_dir}")


In [10]:
import yaml
from pathlib import Path

def ensure_data_yaml_exists(batch_root: Path, logger):
    """(Re)write the split lists and `data.yaml` of every mini batch in `batch_root/batches/`.

    For each batch directory this:
      * creates `images/` and `labels/` if they are missing,
      * overwrites `train.txt`, `val.txt` and `test.txt` with the file names
        (not full paths) of the images found in `images/<split>`, one per line,
      * overwrites `data.yaml` with the dataset root, the relative image
        folders, the class count and the class names.

    Args:
        batch_root: Root that contains the `batches/` directory.
        logger: Object exposing a `.logger` attribute with logging methods.

    Returns:
        None. Returns early (after logging an error) if `batches/` is missing.
    """
    logger.logger.info("📄 Creating/updating data.yaml and dataset split files for all mini batches...")

    # Ensure batch root exists
    batch_root.mkdir(parents=True, exist_ok=True)

    batches_dir = batch_root / "batches"
    if not batches_dir.exists():
        logger.logger.error(f"❌ Batches directory {batches_dir} does not exist. Run `create_mini_dataset` first.")
        return

    # Class names indexed by class id; `nc` is derived from this list so the
    # two can never disagree.
    class_names = [
        "person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle",
        "traffic light", "traffic sign", "trailer", "other person", "other vehicle"
    ]

    # Sorted so batches are processed (and logged) in a stable order
    for batch_dir in sorted(batches_dir.iterdir()):
        if not batch_dir.is_dir():
            continue  # Skip non-directory files

        data_yaml_path = batch_dir / "data.yaml"
        images_root = batch_dir / "images"
        labels_root = batch_dir / "labels"

        images_root.mkdir(parents=True, exist_ok=True)
        labels_root.mkdir(parents=True, exist_ok=True)

        for split in ("train", "val", "test"):
            split_dir = images_root / split
            if split_dir.is_dir():
                # iterdir() order is arbitrary; sort for reproducible files
                image_files = sorted(
                    f.name for f in split_dir.iterdir()
                    if f.is_file() and f.suffix.lower() in ('.jpg', '.png')
                )
            else:
                image_files = []

            # One name per line; an empty split yields an empty file instead of
            # a file holding a single blank line.
            with (batch_dir / f"{split}.txt").open('w', encoding='utf-8') as f:
                f.writelines(f"{name}\n" for name in image_files)

            logger.logger.info(f"✅ {split}.txt updated in {batch_dir} with {len(image_files)} images.")

        try:
            logger.logger.info(f"📄 Overwriting data.yaml in {batch_dir}...")

            data_yaml_content = {
                'path': str(batch_dir.resolve()),  # Root dataset path
                'train': "images/train",
                'val': "images/val",
                'test': "images/test",
                'nc': len(class_names),  # Number of classes
                'names': list(class_names),
            }

            # sort_keys=False keeps the key order declared above
            with data_yaml_path.open('w', encoding='utf-8') as f:
                yaml.dump(data_yaml_content, f, default_flow_style=False, sort_keys=False)

            logger.logger.info(f"✅ data.yaml updated successfully in {batch_dir}.")
        except Exception as e:
            logger.logger.error(f"❌ Error updating data.yaml in {batch_dir}: {e}")


In [14]:
# Scratch calculation: 69863 divided into 10 equal parts.
# NOTE(review): presumably the training-image count over the number of batches —
# confirm, or delete this cell; nothing else in the notebook uses the result.
print(69863/10)

6986.3


In [None]:
# Build the mini dataset: 10 batches with 10 randomly sampled images per split,
# then write the split lists and data.yaml for every batch.

# Logger for this run
logger = LoggerManager("dataset_logs", category="dataset_processing")

# Source dataset and destination root for the generated batches
source_root = Path(r"C:\Users\sathish\Downloads\FL_ModelForAV\data\bdd100k")
batch_root = Path(r"C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini")

try:
    # Make sure the destination root exists
    batch_root.mkdir(parents=True, exist_ok=True)

    # `num_images` was left without a value (a SyntaxError). 10 matches the
    # "with 10 images" lines in this cell's recorded output.
    create_mini_dataset(source_root, batch_root, num_images=10, num_batches=10, logger=logger)

    # Write train/val/test lists and data.yaml inside each batch
    ensure_data_yaml_exists(batch_root, logger)
finally:
    # Close the logger even if a step above raised
    logger.close_logger()


2025-02-20 17:29:36,321 - INFO - 📂 Creating 10 mini batches inside C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches...
2025-02-20 17:29:39,562 - INFO - ✅ Mini batches created successfully inside C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches
2025-02-20 17:29:39,563 - INFO - 📄 Creating/updating data.yaml and dataset split files for all mini batches...
2025-02-20 17:29:39,565 - INFO - ✅ train.txt updated in C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches\batch_1 with 10 images.
2025-02-20 17:29:39,566 - INFO - ✅ val.txt updated in C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches\batch_1 with 10 images.
2025-02-20 17:29:39,566 - INFO - ✅ test.txt updated in C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches\batch_1 with 10 images.
2025-02-20 17:29:39,567 - INFO - 📄 Overwriting data.yaml in C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini\batches\batch_1...
2025-02-20 17:29:39,568 - INFO - ✅ data

✅ Logger 'dataset_logs' closed successfully.


In [15]:
import random
from pathlib import Path

def create_equal_batches(source_root: Path, batch_root: Path, num_batches: int, logger=None):
    """
    Split the train, val and test images into `num_batches` equally sized, disjoint batches.

    Every split is shuffled once and cut into consecutive chunks of
    `len(split) // num_batches` images, so no image appears in more than one
    batch. The `len(split) % num_batches` leftover images are not copied.
    Images go to `batch_root/batches/batch_<n>/images/<split>`; a label file
    with the same stem in `source_root/labels/<split>` is copied to the
    matching `labels/<split>` folder when it exists.

    Args:
        source_root (Path): Path to the source dataset directory.
        batch_root (Path): Path to the directory where batches will be created.
        num_batches (int): Number of batches to create; must be at least 1.
        logger (LoggerManager, optional): Object exposing a `.logger` attribute.
            When None, a standard-library logger is used.

    Returns:
        None. Returns early (after logging an error) if `num_batches` is
        smaller than 1 or `source_root` does not exist.
    """
    # `logger` defaults to None: fall back to a stdlib logger instead of
    # failing on `None.logger`.
    log = logger.logger if logger is not None else logging.getLogger(__name__)
    log.info(f"📂 Creating {num_batches} equal batches inside {batch_root / 'batches'}...")

    # Guard the division below (0 would raise ZeroDivisionError, negatives
    # would silently create nothing).
    if num_batches < 1:
        log.error(f"❌ num_batches must be at least 1, got {num_batches}.")
        return

    # Ensure source dataset exists
    if not source_root.exists():
        log.error(f"❌ Source directory {source_root} does not exist.")
        return

    # Ensure destination folders exist
    batch_root.mkdir(parents=True, exist_ok=True)
    batches_dir = batch_root / "batches"
    batches_dir.mkdir(parents=True, exist_ok=True)

    images_src = source_root / "images"
    labels_src = source_root / "labels"
    splits = ("train", "val", "test")

    # Shuffle each split once. Sampling separately for every batch would let
    # the same image land in several batches while others are never used.
    shuffled = {}
    per_batch = {}
    for split in splits:
        images = get_image_files(images_src / split)
        shuffled[split] = random.sample(images, len(images))  # shuffled copy
        per_batch[split] = len(images) // num_batches

    for batch_num in range(1, num_batches + 1):
        batch_dir = batches_dir / f"batch_{batch_num}"
        batch_dir.mkdir(parents=True, exist_ok=True)

        for split in splits:
            split_images = batch_dir / "images" / split
            split_labels = batch_dir / "labels" / split
            split_images.mkdir(parents=True, exist_ok=True)
            split_labels.mkdir(parents=True, exist_ok=True)

            # This batch's slice of the shuffled split
            start = (batch_num - 1) * per_batch[split]
            for image in shuffled[split][start:start + per_batch[split]]:
                shutil.copy2(image, split_images / image.name)

                # Copy corresponding label if exists
                label_path = labels_src / split / f"{image.stem}.txt"
                if label_path.exists():
                    shutil.copy2(label_path, split_labels / label_path.name)

        log.info(f"✅ Batch {batch_num} created with {per_batch['train']} train, {per_batch['val']} val, and {per_batch['test']} test images.")

    log.info(f"✅ All {num_batches} batches created successfully inside {batches_dir}")

# Example usage
# NOTE(review): `batch_root` is the same directory the mini-dataset cell writes
# to, and neither function clears existing `batches/batch_<n>` folders, so files
# from both runs accumulate there — confirm this is intended.
source_root = Path(r"C:\Users\sathish\Downloads\FL_ModelForAV\data\bdd100k")
batch_root = Path(r"C:\Users\sathish\Downloads\FL_ModelForAV\test\bdd100_mini")
num_batches = 10  # Number of batches to create

# Initialize Logger
logger = LoggerManager("dataset_logs", category="dataset_processing")

try:
    # Create equal batches
    create_equal_batches(source_root, batch_root, num_batches, logger)

    # Ensure data.yaml files exist in each batch
    ensure_data_yaml_exists(batch_root, logger)
finally:
    # Close the logger even if one of the steps above raised
    logger.close_logger()

✅ Logger 'dataset_logs' closed successfully.
