In [0]:
import tensorflow as tf
from datasets import load_dataset
# Load the "cppe-5" dataset via `datasets.load_dataset`; the result is indexed
# by split name further down (cppe5['train']).
cppe5 = load_dataset("cppe-5")

In [0]:
import os
import json
from PIL import Image
import io
from tqdm import tqdm

# Single destination directory: save_batch writes both the images and their
# JSON annotation files here.
output_path = "/Volumes/shm/default/cppe5"
# exist_ok=True keeps this from failing when the directory already exists.
os.makedirs(output_path, exist_ok=True)

# Prepare all examples first
def prepare_batch(examples, batch_size=50):
    """Group dataset records into batches of per-image save jobs.

    Every record is read through the keys ``image``, ``image_id``, ``width``,
    ``height`` and ``objects`` and becomes one tuple
    ``(position, image, file_name, annotation)``, where ``position`` is the
    record's index in ``examples`` and ``file_name`` is
    ``image_<image_id zero-padded to 4 digits>.png``.

    Args:
        examples: Iterable of mapping-like records.
        batch_size: A batch is closed as soon as it holds at least this many
            jobs.

    Returns:
        A list of batches (each a list of job tuples) in input order. A
        trailing, partially filled batch is kept.
    """
    grouped = []
    pending = []

    for position, record in enumerate(examples):
        picture = record['image']
        numeric_id = int(record['image_id'])
        file_name = f"image_{numeric_id:04d}.png"

        # Per-image annotation record; carries the generated file name so the
        # annotation can be matched back to its image.
        meta = dict(
            image_id=record['image_id'],
            file_name=file_name,
            width=record['width'],
            height=record['height'],
            objects=record['objects'],
        )

        pending.append((position, picture, file_name, meta))

        # Close the batch once it is full and start a fresh one.
        if len(pending) >= batch_size:
            grouped.append(pending)
            pending = []

    # Keep whatever is left over as a final, shorter batch.
    if pending:
        grouped.append(pending)

    return grouped

# Process batches
def save_batch(batch):
    """Write every image of ``batch`` to ``output_path`` with a JSON sidecar.

    Args:
        batch: Iterable of ``(idx, image, image_filename, annotation)``
            tuples. ``image`` must provide ``save(path, optimize=True)`` and
            ``annotation`` must be serializable by ``json.dump``.

    The annotation is stored next to the image under the same base name with
    a ``.json`` extension. Saving is best-effort: an exception raised while
    writing one item is reported with ``print`` (tagged with ``idx``) and the
    loop continues with the next item.
    """
    for idx, image, image_filename, annotation in batch:
        try:
            # Save image
            image_path = os.path.join(output_path, image_filename)
            image.save(image_path, optimize=True)

            # Derive the sidecar name from the extension only. A plain
            # str.replace(".png", ".json") leaves names without a literal
            # ".png" unchanged, so the JSON would overwrite the image that
            # was just written; it also rewrites ".png" in the middle of a
            # name.
            base_name, _ = os.path.splitext(image_filename)
            annotation_path = os.path.join(output_path, base_name + ".json")
            with open(annotation_path, 'w') as f:
                json.dump(annotation, f)

        except Exception as e:
            # Deliberate best-effort: one bad item must not abort the batch.
            print(f"Error processing example {idx}: {e}")

# Build every batch from the 'train' split up front, then write the batches
# out one at a time; tqdm reports progress per batch, not per image.
# NOTE(review): `batches` keeps a reference to every image object until this
# loop finishes — confirm the split fits in memory.
batches = prepare_batch(cppe5['train'], batch_size=50)
for batch in tqdm(batches, desc="Processing batches"):
    save_batch(batch)