In [1]:
import os
import shutil
import json
import random
from tqdm import tqdm  # Import progress bar

# --- Source dataset layout -------------------------------------------------
# Everything under the full COCO 2017 download hangs off a single root; the
# per-split image folders and the annotations folder are derived from it.
original_base_path = "coco2017"
(
    original_train_images_path,
    original_val_images_path,
    original_test_images_path,
    original_annotations_path,
) = (
    os.path.join(original_base_path, leaf)
    for leaf in ("train2017", "val2017", "test2017", "annotations")
)

# --- Subset layout ---------------------------------------------------------
# The reduced copy mirrors the source layout under its own root.
subset_base_path = "cocosubset"
(
    subset_train_images_path,
    subset_val_images_path,
    subset_test_images_path,
    subset_annotations_path,
) = (
    os.path.join(subset_base_path, leaf)
    for leaf in ("train2017", "val2017", "test2017", "annotations")
)

# --- Instance-annotation files ---------------------------------------------
# Only the train and val splits have an instances_*.json file here; the same
# file names are used on the source and subset sides.
original_train_annotations_path, original_val_annotations_path = (
    os.path.join(original_annotations_path, json_name)
    for json_name in ("instances_train2017.json", "instances_val2017.json")
)
subset_train_annotations_path, subset_val_annotations_path = (
    os.path.join(subset_annotations_path, json_name)
    for json_name in ("instances_train2017.json", "instances_val2017.json")
)

# --- Images per split ------------------------------------------------------
# Counts were picked by the author to target roughly 2GB in total
# (author's estimate: ~1.4GB train, ~0.3GB val, ~0.3GB test).
train_subset_size, val_subset_size, test_subset_size = 5600, 1200, 1200

# Function to create image subset and annotations
def create_subset(original_images_path, original_annotations_path, subset_images_path, subset_annotations_path, subset_size, dataset_name):
    """Sample images from a COCO-style split and write a matching subset.

    Loads the annotation JSON, randomly picks images from its "images" list,
    writes a new annotation JSON containing only those images and the
    annotations that reference them, then copies the image files.

    Args:
        original_images_path: Directory holding the source image files.
        original_annotations_path: Path to the source annotation JSON file.
        subset_images_path: Directory to copy the selected images into
            (created if missing).
        subset_annotations_path: Path of the annotation JSON file to write
            (its parent directory is created if missing).
        subset_size: Number of images to sample. If it exceeds the number of
            images listed in the annotation file, all of them are used and a
            warning is printed.
        dataset_name: Label used in progress messages.

    Raises:
        ValueError: If ``subset_size`` is negative (from ``random.sample``).
        OSError: If a file cannot be read, written, or copied.
        KeyError: If the annotation JSON lacks one of the "info", "licenses",
            "images", "annotations" or "categories" keys.
    """
    print(f"\nProcessing {dataset_name} dataset...")

    # Load COCO annotations
    with open(original_annotations_path, "r") as f:
        coco_data = json.load(f)

    # Get a random subset of images; never ask random.sample for more items
    # than exist, which would raise ValueError.
    all_images = coco_data["images"]
    sample_size = min(subset_size, len(all_images))
    if sample_size < subset_size:
        print(f"Warning: only {len(all_images)} images available for {dataset_name}; requested {subset_size}.")
    subset_images = random.sample(all_images, sample_size)

    # Build the id lookup once; constructing it inside the comprehension's
    # condition would rebuild the whole set for every annotation.
    selected_ids = {img["id"] for img in subset_images}

    # Create new annotation file
    subset_annotations = {
        "info": coco_data["info"],
        "licenses": coco_data["licenses"],
        "images": subset_images,
        "annotations": [ann for ann in coco_data["annotations"] if ann["image_id"] in selected_ids],
        "categories": coco_data["categories"],
    }

    # Save new annotations file
    os.makedirs(os.path.dirname(subset_annotations_path), exist_ok=True)
    with open(subset_annotations_path, "w") as f:
        json.dump(subset_annotations, f, indent=4)

    # Copy selected images with progress bar
    os.makedirs(subset_images_path, exist_ok=True)
    print(f"Copying {len(subset_images)} images for {dataset_name}...")
    for img in tqdm(subset_images, desc=f"Copying {dataset_name}", unit="img"):
        src = os.path.join(original_images_path, img["file_name"])
        dst = os.path.join(subset_images_path, img["file_name"])
        shutil.copy(src, dst)

# Create train subset
create_subset(original_train_images_path, original_train_annotations_path, subset_train_images_path, subset_train_annotations_path, train_subset_size, "Train")

# Create validation subset
create_subset(original_val_images_path, original_val_annotations_path, subset_val_images_path, subset_val_annotations_path, val_subset_size, "Validation")

# Create test subset (random selection if no annotations)
if os.path.exists(original_test_images_path):
    os.makedirs(subset_test_images_path, exist_ok=True)
    available_test_files = os.listdir(original_test_images_path)
    # Clamp so a small test folder does not make random.sample raise ValueError.
    test_images = random.sample(available_test_files, min(test_subset_size, len(available_test_files)))
    print("\nCopying test images...")
    for img in tqdm(test_images, desc="Copying Test", unit="img"):
        shutil.copy(os.path.join(original_test_images_path, img), os.path.join(subset_test_images_path, img))

# Copy the remaining annotation files (for compatibility).
# The filtered instances_*.json files written above live in the destination
# folder under the same names as the originals, so they must be excluded here:
# copytree with dirs_exist_ok=True would otherwise overwrite them with the
# full, unfiltered annotation files.
print("\nCopying annotations...")
shutil.copytree(
    original_annotations_path,
    subset_annotations_path,
    ignore=shutil.ignore_patterns(
        os.path.basename(subset_train_annotations_path),
        os.path.basename(subset_val_annotations_path),
    ),
    dirs_exist_ok=True,
)

print("\nSubset creation complete! (~2GB total)")



Processing Train dataset...
Copying 5600 images for Train...


Copying Train: 100%|██████████| 5600/5600 [01:05<00:00, 85.90img/s] 



Processing Validation dataset...
Copying 1200 images for Validation...


Copying Validation: 100%|██████████| 1200/1200 [00:06<00:00, 175.06img/s]



Copying test images...


Copying Test: 100%|██████████| 1200/1200 [00:11<00:00, 100.78img/s]



Copying annotations...

Subset creation complete! (~2GB total)
