In [1]:
import json
import os
import random
import shutil
from tqdm import tqdm
from typing import Tuple, List, Dict
from pycocotools import mask as maskUtils

def create_directory_structure(base_dir: str) -> None:
    """Ensures `<base_dir>/<split>/<kind>` exists for every split and kind.

    The splits are train, valid and test; each one gets an images and a
    labels folder. Directories that already exist are left untouched.
    """
    split_names = ('train', 'valid', 'test')
    content_kinds = ('images', 'labels')
    targets = [
        os.path.join(base_dir, split_name, kind)
        for split_name in split_names
        for kind in content_kinds
    ]
    for target in targets:
        os.makedirs(target, exist_ok=True)

def split_dataset(image_ids: List[int], train_ratio: float, val_ratio: float, test_ratio: float = 0) -> Tuple[List[int], List[int], List[int]]:
    """
    Splits the dataset into train, validation, and test sets.

    The IDs are shuffled with the module-level `random` generator, so call
    `random.seed(...)` beforehand for a reproducible split. The input list is
    not modified.

    Args:
        image_ids: List of all image IDs
        train_ratio: Proportion of data for training (0-1)
        val_ratio: Proportion of data for validation (0-1)
        test_ratio: Proportion of data for testing (0-1)

    Returns:
        Tuple of lists containing image IDs for each split. When `test_ratio`
        is 0 the test list is empty and the validation list receives every ID
        that is not in the training list, so no ID is ever left out.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1
            (within a tolerance of 0.001).
    """
    if min(train_ratio, val_ratio, test_ratio) < 0:
        raise ValueError("Split ratios must be non-negative")
    if not 0.999 <= train_ratio + val_ratio + test_ratio <= 1.001:
        raise ValueError("Split ratios must sum to 1")

    shuffled_ids = list(image_ids)
    random.shuffle(shuffled_ids)

    total = len(shuffled_ids)
    train_end = int(total * train_ratio)
    train_ids = shuffled_ids[:train_end]

    if test_ratio > 0:
        val_end = int(total * (train_ratio + val_ratio))
        return train_ids, shuffled_ids[train_end:val_end], shuffled_ids[val_end:]

    # No test split: validation takes everything that is left. Cutting at
    # int(total * (train_ratio + val_ratio)) could silently drop trailing IDs
    # when the ratios sum to slightly less than 1 (tolerance / float rounding).
    return train_ids, shuffled_ids[train_end:], []

def _annotation_to_yolo_lines(ann: Dict, class_index: int,
                              image_width: float, image_height: float) -> List[str]:
    """
    Builds the YOLO label lines (without trailing newline) for one COCO annotation.

    Non-crowd annotations with a polygon segmentation produce one line per
    polygon: the class index followed by the x/y pairs normalized by the image
    width/height. Everything else falls back to a single bounding-box line
    (class index, normalized center x/y, normalized width/height) when the
    annotation has a 'bbox'. Returns an empty list if neither can be written.
    """
    segmentation = ann.get('segmentation')

    # Only polygon segmentations (a list of flat [x1, y1, x2, y2, ...] lists)
    # can be written as YOLO polygons. Other representations (e.g. an RLE mask
    # stored as a dict) fall through to the bounding box below.
    if isinstance(segmentation, list) and segmentation and ann.get('iscrowd', 0) == 0:
        lines = []
        for seg in segmentation:
            # A polygon needs at least 3 points and an even number of values.
            # A 2-point "polygon" would be indistinguishable from a bbox line.
            if len(seg) < 6 or len(seg) % 2 != 0:
                continue
            coords = []
            for i in range(0, len(seg), 2):
                coords.append(seg[i] / image_width)
                coords.append(seg[i + 1] / image_height)
            lines.append(f"{class_index} " + " ".join(map(str, coords)))
        if lines:
            return lines

    if 'bbox' in ann:
        # COCO bbox is [x_min, y_min, width, height] in pixels
        x_min, y_min, width, height = ann['bbox']
        x_center = (x_min + width/2) / image_width
        y_center = (y_min + height/2) / image_height
        width = width / image_width
        height = height / image_height
        return [f"{class_index} {x_center} {y_center} {width} {height}"]

    return []


def coco_to_yolo(coco_json_path: str, image_dir: str, output_dir: str, 
                 train_ratio: float = 0.8, val_ratio: float = 0.2, 
                 test_ratio: float = 0.0, seed: int = 42) -> None:
    """
    Converts COCO annotations to YOLO format with train/valid/test splits.

    Only images that have at least one annotation are included. For each split
    the source image is copied to `<output_dir>/<split>/images` (unless it is
    already there) and a label file with the same base name is written to
    `<output_dir>/<split>/labels`. Images that are not found in `image_dir`
    are reported and get no label file. A `data.yaml` listing the split paths,
    the number of classes and the class names is written to `output_dir`.
    Class indices follow the order of the 'categories' list in the COCO file.

    Note: this seeds the module-level `random` generator with `seed`.

    Args:
        coco_json_path: Path to the COCO JSON file
        image_dir: Directory containing the source images
        output_dir: Directory where the YOLO-formatted dataset will be saved
        train_ratio: Proportion of data for training (default: 0.8)
        val_ratio: Proportion of data for validation (default: 0.2)
        test_ratio: Proportion of data for testing (default: 0.0)
        seed: Random seed for reproducibility
    """
    random.seed(seed)
    
    # Load COCO JSON (explicit UTF-8 so the result does not depend on the
    # platform's default encoding)
    print("Loading COCO annotations...")
    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)
    
    # Create category mappings
    categories = coco_data['categories']
    category_id_to_index = {cat['id']: idx for idx, cat in enumerate(categories)}
    
    # Create image mappings
    image_id_to_info = {img['id']: img for img in coco_data['images']}
    
    # Group annotations by image
    print("Processing annotations...")
    image_to_annotations = {}
    skipped_annotations = 0
    for ann in coco_data['annotations']:
        image_id = ann['image_id']
        if image_id not in image_id_to_info:
            # Annotation points at an image that is not listed in 'images'
            skipped_annotations += 1
            continue
        image_to_annotations.setdefault(image_id, []).append(ann)
    
    # Create directory structure
    create_directory_structure(output_dir)
    
    # Split dataset
    all_image_ids = list(image_to_annotations.keys())
    train_ids, val_ids, test_ids = split_dataset(all_image_ids, train_ratio, val_ratio, test_ratio)
    
    # Process each split
    splits = {
        'train': train_ids,
        'valid': val_ids,
        'test': test_ids
    }
    
    missing_images = []
    copied_images = []
    
    for split_name, split_ids in splits.items():
        if not split_ids:  # Skip empty splits
            continue
            
        print(f"\nProcessing {split_name} split...")
        for image_id in tqdm(split_ids, desc=f"Converting {split_name} annotations"):
            image_info = image_id_to_info[image_id]
            image_width = image_info['width']
            image_height = image_info['height']
            image_file_name = image_info['file_name']
            label_file_name = os.path.splitext(image_file_name)[0] + '.txt'
            
            # Setup paths
            src_image_path = os.path.join(image_dir, image_file_name)
            dst_image_path = os.path.join(output_dir, split_name, 'images', image_file_name)
            label_file_path = os.path.join(output_dir, split_name, 'labels', label_file_name)
            
            if not os.path.exists(src_image_path):
                missing_images.append(image_file_name)
                continue  # Skip creating labels for missing images
            
            # file_name may contain sub-directories; make sure they exist
            os.makedirs(os.path.dirname(dst_image_path), exist_ok=True)
            os.makedirs(os.path.dirname(label_file_path), exist_ok=True)
            
            # Copy image
            if not os.path.exists(dst_image_path):
                shutil.copy2(src_image_path, dst_image_path)
                copied_images.append(image_file_name)
            
            # Write labels
            with open(label_file_path, 'w', encoding='utf-8') as label_file:
                for ann in image_to_annotations[image_id]:
                    class_index = category_id_to_index.get(ann['category_id'])
                    if class_index is None:
                        # Category is not listed in 'categories'
                        skipped_annotations += 1
                        continue
                    for line in _annotation_to_yolo_lines(ann, class_index, image_width, image_height):
                        label_file.write(line + "\n")
    
    # Generate data.yaml
    data_yaml_path = os.path.join(output_dir, 'data.yaml')
    with open(data_yaml_path, 'w', encoding='utf-8') as data_yaml_file:
        data_yaml_file.write("train: ./train/images\n")
        data_yaml_file.write("val: ./valid/images\n")
        if test_ratio > 0:
            data_yaml_file.write("test: ./test/images\n")
        data_yaml_file.write(f"\nnc: {len(categories)}\n")
        names = [cat['name'] for cat in categories]
        data_yaml_file.write(f"names: {names}\n")
    
    # Print summary
    print("\nConversion completed!")
    print("Dataset statistics:")
    print(f"- Train images: {len(train_ids)}")
    print(f"- Validation images: {len(val_ids)}")
    if test_ratio > 0:
        print(f"- Test images: {len(test_ids)}")
    print("\nProcessing summary:")
    print(f"- Total images copied: {len(copied_images)}")
    if missing_images:
        print(f"- Missing images: {len(missing_images)}")
        print("  First few missing images:", missing_images[:5])
        print("  Check if the image_dir path is correct and contains all images.")
    if skipped_annotations:
        print(f"- Skipped annotations (unknown image or category id): {skipped_annotations}")

In [3]:
# Example usage
if __name__ == "__main__":
    # Output location for the YOLO-formatted dataset
    output_dir = r'D:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\Car Parts\Final_Parts_Labels Corrected YOLO'
    # Directory containing the original images
    image_dir = r'D:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\Car Parts\Final_Parts_Labels Corrected COCO\images'
    # COCO annotation file describing those images
    coco_json_path = r'd:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\Car Parts\Final_Parts_Labels Corrected COCO\annotations\instances_default.json'

    # Convert with an 80% train / 20% validation split and no test split
    coco_to_yolo(
        coco_json_path,
        image_dir,
        output_dir,
        train_ratio=0.8,
        val_ratio=0.2,
        test_ratio=0.0,
        seed=101,
    )

Loading COCO annotations...
Processing annotations...

Processing train split...


Converting train annotations: 100%|██████████| 217/217 [00:01<00:00, 199.46it/s]



Processing valid split...


Converting valid annotations: 100%|██████████| 55/55 [00:00<00:00, 191.55it/s]


Conversion completed!
Dataset statistics:
- Train images: 217
- Validation images: 55

Processing summary:
- Total images copied: 272



