In [None]:
import os
import urllib.request
import zipfile

# Work inside a dedicated data directory; later cells use paths relative to it.
# The chdir is guarded so that re-running this cell in the same kernel does not
# create a nested coco_subset/coco_subset/... tree.
data_root = 'coco_subset'
if os.path.basename(os.getcwd()) != data_root:
    os.makedirs(data_root, exist_ok=True)
    os.chdir(data_root)


def _fetch_and_extract(url, zip_name, expected_path):
    """Download a zip archive and unpack it into the current directory, once.

    Args:
        url: Address of the zip archive to download.
        zip_name: Local file name used for the downloaded archive.
        expected_path: A path that exists once the archive has been extracted;
            used to detect that a previous run already finished.

    Raises:
        zipfile.BadZipFile: If the local archive is corrupt. The bad file is
            removed before re-raising so the next run downloads it again.

    Errors raised by the download itself are not caught and propagate to the
    caller.
    """
    # The archive is deleted only after a successful extraction, so
    # "output present and archive absent" means an earlier run completed.
    if os.path.exists(expected_path) and not os.path.exists(zip_name):
        return

    if not os.path.exists(zip_name):
        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete archive on the next run.
        partial_name = zip_name + '.part'
        urllib.request.urlretrieve(url, partial_name)
        os.replace(partial_name, zip_name)

    try:
        with zipfile.ZipFile(zip_name, 'r') as zip_ref:
            zip_ref.extractall('.')
    except zipfile.BadZipFile:
        os.remove(zip_name)  # Drop the corrupt archive so a re-run can recover
        raise
    os.remove(zip_name)  # Clean up zip


# Download annotations (instances_val2017.json is used later only to list the images)
ann_url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
ann_zip = 'annotations_trainval2017.zip'
_fetch_and_extract(
    ann_url,
    ann_zip,
    os.path.join('annotations', 'instances_val2017.json'),  # Extracted annotation file
)

# Download val2017 images
img_url = 'http://images.cocodataset.org/zips/val2017.zip'
img_zip = 'val2017.zip'
_fetch_and_extract(img_url, img_zip, 'val2017')  # Extracts to val2017/ directory

In [None]:
from pycocotools.coco import COCO
import cv2
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

# Paths (relative to the current working directory)
ann_file = 'annotations/instances_val2017.json'
coco = COCO(ann_file)
img_dir = 'val2017'
output_dir = 'new_dataset'
os.makedirs(output_dir, exist_ok=True)

# Get all image IDs listed in the annotation file (no category filtering)
img_ids = coco.getImgIds()  # Or slice for smaller subset: img_ids = img_ids[:100]
images = coco.loadImgs(img_ids)

saved_pairs = 0      # Pairs for which both files were written successfully
skipped_files = []   # Source file names that could not be read or written

for img_info in tqdm(images, desc="Processing images"):
    img_path = os.path.join(img_dir, img_info['file_name'])

    # Load colored image (label). cv2.imread returns None instead of raising
    # when the file is missing or unreadable, so check before using it.
    colored = cv2.imread(img_path)
    if colored is None:
        skipped_files.append(img_info['file_name'])
        continue

    # Convert to grayscale (B/W input)
    gray = cv2.cvtColor(colored, cv2.COLOR_BGR2GRAY)
    # Expand gray to 3-channel for saving as JPG (all channels identical)
    gray_3ch = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

    # Save input (B/W) and label (colored).
    # cv2.imwrite expects BGR channel order, which is what cv2.imread produced,
    # so the colored image is written as loaded; converting it to RGB first
    # would store the label with red and blue swapped.
    base_name = os.path.splitext(img_info['file_name'])[0]
    input_ok = cv2.imwrite(os.path.join(output_dir, f'input_{base_name}.jpg'), gray_3ch)
    label_ok = cv2.imwrite(os.path.join(output_dir, f'label_{base_name}.jpg'), colored)

    if input_ok and label_ok:
        saved_pairs += 1
    else:
        skipped_files.append(img_info['file_name'])

print(f"Dataset created in {output_dir}/ with {saved_pairs} pairs.")
if skipped_files:
    print(f"Skipped {len(skipped_files)} images that could not be read or written.")