# Importing Libraries

In [None]:
import json
import os
import shutil
from sklearn.model_selection import train_test_split

# Defining Functions

In [5]:
def _validate_split_ratios(train_split, val_split, test_split):
    """Raise ValueError unless every ratio is in [0, 1] and they sum to 1."""
    ratios = {
        'train_split': train_split,
        'val_split': val_split,
        'test_split': test_split,
    }
    for ratio_name, ratio in ratios.items():
        # A sum check alone lets combinations such as 1.2 / -0.2 / 0.0 through.
        if not 0.0 <= ratio <= 1.0:
            raise ValueError(f"{ratio_name} must be between 0 and 1, got {ratio}")

    if abs(train_split + val_split + test_split - 1.0) > 1e-10:
        raise ValueError("Split ratios must sum to 1")


def _partition_images(images, train_split, val_split, test_split):
    """Return a dict mapping each produced split name to its list of images."""
    if test_split == 0:
        # Binary split between train and val; no test set is produced.
        train_images, val_images = train_test_split(
            images,
            train_size=train_split,
            test_size=val_split,
            random_state=42
        )
        return {'train': train_images, 'val': val_images}

    # Three-way split: carve off train first, then divide the remainder
    # between val and test according to their relative proportions.
    train_images, remaining_images = train_test_split(
        images,
        train_size=train_split,
        random_state=42
    )
    relative_val_ratio = val_split / (val_split + test_split)
    val_images, test_images = train_test_split(
        remaining_images,
        train_size=relative_val_ratio,
        random_state=42
    )
    return {'train': train_images, 'val': val_images, 'test': test_images}


def _copy_split_images(split_imgs, image_dir, images_out_dir):
    """Copy each image of a split into images_out_dir, warning on missing files."""
    for img in split_imgs:
        src_path = os.path.join(image_dir, img['file_name'])
        dst_path = os.path.join(images_out_dir, img['file_name'])
        if os.path.exists(src_path):
            # file_name may contain sub-directories; make sure the target
            # folder exists before copying.
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy2(src_path, dst_path)
        else:
            print(f"Warning: Image not found: {src_path}")


def split_coco_dataset(
    coco_json_path,
    image_dir,
    output_dir,
    train_split=0.7,
    val_split=0.2,
    test_split=0.1
):
    """
    Split COCO dataset into train, validation and test sets.

    For every produced split, writes
    ``<output_dir>/<split>/instances_<split>.json`` and copies the split's
    images into ``<output_dir>/<split>/images``. Images are partitioned with
    a fixed random seed (42), so repeated runs on the same input give the
    same partition. When ``test_split`` is 0 only the train and val splits
    are produced. A short summary is printed for each split.

    Args:
        coco_json_path (str): Path to COCO JSON file
        image_dir (str): Directory containing images
        output_dir (str): Directory to save split datasets
        train_split (float): Proportion of data for training (default: 0.7)
        val_split (float): Proportion of data for validation (default: 0.2)
        test_split (float): Proportion of data for testing (default: 0.1)

    Raises:
        ValueError: If a ratio lies outside [0, 1] or the ratios do not
            sum to 1.
        KeyError: If the COCO JSON has no 'images' entry.
    """
    # Validate before touching the filesystem.
    _validate_split_ratios(train_split, val_split, test_split)

    # Read COCO JSON file. The encoding is explicit so the result does not
    # depend on the platform's locale default.
    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    split_images = _partition_images(
        coco_data['images'], train_split, val_split, test_split
    )
    all_annotations = coco_data.get('annotations', [])

    for split_name, split_imgs in split_images.items():
        # Create output directories
        split_dir = os.path.join(output_dir, split_name)
        images_out_dir = os.path.join(split_dir, 'images')
        os.makedirs(images_out_dir, exist_ok=True)

        # Keep only the annotations that belong to images in this split.
        split_image_ids = {img['id'] for img in split_imgs}
        split_coco = {
            'info': coco_data.get('info', {}),
            'licenses': coco_data.get('licenses', []),
            'categories': coco_data.get('categories', []),
            'images': split_imgs,
            'annotations': [
                ann for ann in all_annotations
                if ann['image_id'] in split_image_ids
            ]
        }

        # Save JSON
        split_json_path = os.path.join(split_dir, f'instances_{split_name}.json')
        with open(split_json_path, 'w', encoding='utf-8') as f:
            json.dump(split_coco, f, indent=2)

        # Copy images
        _copy_split_images(split_imgs, image_dir, images_out_dir)

        print(f"{split_name} set:")
        print(f"  Images: {len(split_imgs)}")
        print(f"  Annotations: {len(split_coco['annotations'])}\n")

# Usage

In [6]:
# Example usage -- edit the three paths below to match your own setup.
coco_json_path = r'd:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\Exterior Damage Final COCO (Augmented) Label Names Corrected\annotations\instances_default.json'
image_dir = r'D:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\Exterior Damage Final COCO (Augmented) Label Names Corrected\images'
output_dir = r'D:\OneDrive - Personal\DS\FleetBlox\Data\Final\Inspection\Exterior\splitted'

# Split 80% train / 20% val; the test ratio is 0.0 for this run.
split_coco_dataset(
    coco_json_path,
    image_dir,
    output_dir,
    train_split=0.8,
    val_split=0.2,
    test_split=0.0,
)

train set:
  Images: 885
  Annotations: 25131

val set:
  Images: 222
  Annotations: 7435

