# CPPE Object Detection Optimization


Dataset: https://huggingface.co/datasets/rishitdagli/cppe-5

In [0]:
import tensorflow as tf
import tensorflow_io as tfio

def create_dataset(file_pattern, batch_size=8, is_training=True,
                   compression_type="GZIP", shuffle_buffer_size=1000):
    """Build a batched, prefetched ``tf.data`` pipeline from TFRecord shards.

    Args:
        file_pattern: Glob pattern (or list of patterns) matching the
            TFRecord files to read.
        batch_size: Number of examples per batch.
        is_training: When True, the parsed examples are cached, shuffled and
            passed through ``augment_data``. When False, none of these are
            applied and the shard order is kept deterministic.
        compression_type: Compression of the TFRecord files (``"GZIP"``,
            ``"ZLIB"`` or ``""``/``None`` for uncompressed). It must match
            the way the shards were written.
        shuffle_buffer_size: Size of the example shuffle buffer used when
            ``is_training`` is True.

    Returns:
        A ``tf.data.Dataset`` of batched elements produced by
        ``parse_tf_example`` (and ``augment_data`` when training).
    """
    autotune = tf.data.experimental.AUTOTUNE

    # list_files shuffles the file order by default. Only do that for
    # training so evaluation iterates the shards in a stable order.
    files = tf.data.Dataset.list_files(file_pattern, shuffle=is_training)

    dataset = tf.data.TFRecordDataset(
        files,
        compression_type=compression_type,
        num_parallel_reads=autotune,
    )

    # Parse TFRecords
    dataset = dataset.map(parse_tf_example, num_parallel_calls=autotune)

    if is_training:
        # Cache the parsed examples (in memory) *before* shuffling and
        # augmentation so every epoch sees a fresh order and fresh random
        # augmentations while decoding happens only once.
        dataset = dataset.cache()
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
        dataset = dataset.map(augment_data, num_parallel_calls=autotune)

    # Batch the data
    dataset = dataset.batch(batch_size)

    # Prefetch so input preparation overlaps with the training step
    dataset = dataset.prefetch(autotune)

    return dataset

def parse_tf_example(example_proto, image_size=(512, 512), max_objects=30,
                     num_classes=5):
    """Parse one serialized ``tf.train.Example`` into model-ready tensors.

    Args:
        example_proto: Scalar string tensor holding a serialized example with
            the ``image/encoded``, ``image/object/bbox/*`` and
            ``image/object/class/label`` features.
        image_size: ``(height, width)`` the decoded image is resized to.
        max_objects: Fixed number of box/label rows returned per image.
            Images with more objects are truncated, images with fewer are
            zero-padded.
        num_classes: Depth of the one-hot label encoding.

    Returns:
        ``(image, (bboxes, labels))`` where ``image`` is float32 scaled to
        [0, 1] and resized to ``image_size``, ``bboxes`` has ``max_objects``
        rows of ``[xmin, ymin, xmax, ymax]`` and ``labels`` has
        ``max_objects`` one-hot rows of width ``num_classes``.
    """
    # Define the features to extract from TFRecord
    feature_description = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
        'image/object/class/label': tf.io.VarLenFeature(tf.int64),
    }

    # Parse the example
    features = tf.io.parse_single_example(example_proto, feature_description)

    # Decode the image. decode_image detects the encoding from the bytes, so
    # it handles the PNG data stored by create_tf_example as well as JPEG.
    # expand_animations=False guarantees a rank-3 result, which resize needs.
    image = tf.io.decode_image(
        features['image/encoded'], channels=3, expand_animations=False
    )

    # Normalize image to [0, 1]
    image = tf.cast(image, tf.float32) / 255.0

    # Resize image to model input size
    image = tf.image.resize(image, image_size)

    # Get bounding boxes (variable length per image)
    xmins = tf.sparse.to_dense(features['image/object/bbox/xmin'])
    ymins = tf.sparse.to_dense(features['image/object/bbox/ymin'])
    xmaxs = tf.sparse.to_dense(features['image/object/bbox/xmax'])
    ymaxs = tf.sparse.to_dense(features['image/object/bbox/ymax'])

    # Get class labels
    class_labels = tf.sparse.to_dense(features['image/object/class/label'])

    # Truncate to max_objects before padding: padding alone would be asked
    # for a negative amount (and fail) on images with more objects.
    bboxes = tf.stack([xmins, ymins, xmaxs, ymaxs], axis=1)[:max_objects]
    bboxes = pad_to_fixed_size(bboxes, max_objects, 4)

    # Labels are stored 1-indexed, so shift to 0-indexed before one-hot
    # encoding. Padding rows stay all-zero.
    labels = tf.one_hot(class_labels[:max_objects] - 1, depth=num_classes)
    labels = pad_to_fixed_size(labels, max_objects, num_classes)

    return image, (bboxes, labels)

def pad_to_fixed_size(tensor, max_objects, feature_size):
    """Pad or truncate a rank-2 tensor to exactly ``max_objects`` rows.

    Args:
        tensor: Rank-2 tensor of shape ``[num_objects, feature_size]``.
        max_objects: Number of rows in the result (Python int). Extra rows
            are dropped; missing rows are filled with zeros.
        feature_size: Expected size of the second dimension (Python int),
            used to attach a static shape to the result.

    Returns:
        Tensor of static shape ``[max_objects, feature_size]``.
    """
    # Drop surplus rows first; otherwise the padding amount below would be
    # negative and tf.pad would fail.
    tensor = tensor[:max_objects]
    missing_rows = max_objects - tf.shape(tensor)[0]
    padded = tf.pad(tensor, [[0, missing_rows], [0, 0]])
    # The row count is only known at run time; pin the shape so downstream
    # batching and model code see fully defined dimensions.
    return tf.ensure_shape(padded, [max_objects, feature_size])

def augment_data(image, labels):
    """Randomly jitter an image and flip it together with its boxes.

    Args:
        image: Float image tensor; expected to be scaled to [0, 1] as
            produced by ``parse_tf_example``.
        labels: ``(bboxes, class_labels)`` where ``bboxes`` holds normalized
            ``[xmin, ymin, xmax, ymax]`` rows (all-zero rows are padding).

    Returns:
        ``(image, (bboxes, class_labels))`` after augmentation. Class labels
        are returned unchanged.
    """
    # Photometric jitter
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
    # Jitter can push pixel values outside the normalized range.
    image = tf.clip_by_value(image, 0.0, 1.0)

    bboxes, class_labels = labels

    def _flipped():
        # Mirror x-coordinates: the old right edge becomes the new left edge.
        mirrored = tf.stack([
            1.0 - bboxes[:, 2],  # new xmin = 1 - old xmax
            bboxes[:, 1],        # ymin stays the same
            1.0 - bboxes[:, 0],  # new xmax = 1 - old xmin
            bboxes[:, 3],        # ymax stays the same
        ], axis=1)
        # Leave all-zero padding rows untouched; mirroring them would turn
        # them into [1, 0, 1, 0] and they would no longer look like padding.
        is_padding = tf.reduce_all(tf.equal(bboxes, 0.0), axis=1, keepdims=True)
        return tf.image.flip_left_right(image), tf.where(is_padding, bboxes, mirrored)

    def _unchanged():
        return image, bboxes

    # Random flip left-right with probability 0.5
    do_flip = tf.random.uniform([], 0, 1) > 0.5
    image, bboxes = tf.cond(do_flip, _flipped, _unchanged)

    return image, (bboxes, class_labels)


In [0]:
import tensorflow as tf
import json
import os
import numpy as np
from tqdm import tqdm

def _bytes_feature(value):
    """Wrap a single bytes value (or eager string tensor) in a tf.train.Feature."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList cannot unpack an EagerTensor, so pull out the raw bytes first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap an iterable of floats in a tf.train.Feature (float_list)."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap an iterable of integers in a tf.train.Feature (int64_list)."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def create_tf_example(image_path, img_shape=(224, 224)):
    """
    Create a TFRecord example from an image and its annotation.

    The annotation is read from a JSON file next to the image (same name,
    ``.json`` extension) and must contain ``objects.bbox`` as a list of
    ``[x, y, width, height]`` pixel boxes and ``objects.category`` as a list
    of 0-indexed class ids.

    Args:
        image_path: Path to the PNG image file
        img_shape: Currently unused (the image bytes are stored as-is, without
            resizing); kept so existing callers keep working.

    Returns:
        tf.train.Example with the encoded image, its size and file name, the
        boxes normalized to ``[xmin, ymin, xmax, ymax]`` and 1-indexed labels.

    Raises:
        ValueError: If the number of boxes and categories differ.
    """
    # Load image (raw encoded bytes are what gets stored)
    img_data = tf.io.read_file(image_path)

    # Swap only the file extension; str.replace would also rewrite any
    # other '.png' occurring earlier in the path.
    annotation_path = os.path.splitext(image_path)[0] + '.json'
    with open(annotation_path, 'r', encoding='utf-8') as f:
        ann = json.load(f)

    # Get image dimensions as plain Python ints
    img = tf.io.decode_png(img_data, channels=3)
    height, width = (int(dim) for dim in tf.shape(img)[:2].numpy())

    # Extract bounding boxes and categories. reshape keeps the arrays 2-D/1-D
    # even when the image has no objects, so the column slicing below works.
    objects = ann['objects']
    bboxes = np.asarray(objects['bbox'], dtype=np.float32).reshape(-1, 4)
    categories = np.asarray(objects['category'], dtype=np.int64).reshape(-1)
    if len(bboxes) != len(categories):
        raise ValueError(
            f"{annotation_path}: {len(bboxes)} boxes but "
            f"{len(categories)} categories"
        )

    # Convert bboxes to normalized format [xmin, ymin, xmax, ymax]
    # Original format is [x, y, width, height]
    xmins = bboxes[:, 0] / width
    ymins = bboxes[:, 1] / height
    xmaxs = (bboxes[:, 0] + bboxes[:, 2]) / width
    ymaxs = (bboxes[:, 1] + bboxes[:, 3]) / height

    # Create TF Example
    feature = {
        'image/encoded': _bytes_feature(img_data),
        'image/height': _int64_feature([height]),
        'image/width': _int64_feature([width]),
        'image/filename': _bytes_feature(os.path.basename(image_path).encode('utf8')),
        'image/source_id': _bytes_feature(str(ann.get('image_id', 0)).encode('utf8')),
        'image/object/bbox/xmin': _float_feature(xmins.tolist()),
        'image/object/bbox/ymin': _float_feature(ymins.tolist()),
        'image/object/bbox/xmax': _float_feature(xmaxs.tolist()),
        'image/object/bbox/ymax': _float_feature(ymaxs.tolist()),
        # Add 1 because TF OD API uses 1-indexed classes
        'image/object/class/label': _int64_feature((categories + 1).tolist()),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))

def convert_to_tfrecord(image_paths, output_path, num_shards=10,
                        compression_type="GZIP"):
    """
    Convert a list of images and their annotations to TFRecord format.

    Images that fail to convert are reported and skipped; the remaining
    images are still written.

    Args:
        image_paths: List of paths to image files
        output_path: Base path for output TFRecord files; shard ``i`` is
            written to ``{output_path}-{i:05d}-of-{num_shards:05d}.tfrecord``
        num_shards: Number of shards to split the dataset into (>= 1)
        compression_type: Compression applied to the shards (``"GZIP"``,
            ``"ZLIB"`` or ``""``/``None``). The default matches the
            ``compression_type="GZIP"`` that ``create_dataset`` reads with.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
    """
    if num_shards < 1:
        raise ValueError(f"num_shards must be >= 1, got {num_shards}")

    # Calculate number of examples per shard (consecutive images share a shard)
    num_examples = len(image_paths)
    examples_per_shard = int(np.ceil(num_examples / num_shards))

    options = tf.io.TFRecordOptions(compression_type=compression_type)
    writers = []
    num_written = 0
    try:
        # Create TFRecord writers. This happens inside the try so that
        # already-opened writers are closed if a later one cannot be created.
        for i in range(num_shards):
            shard_path = f"{output_path}-{i:05d}-of-{num_shards:05d}.tfrecord"
            writers.append(tf.io.TFRecordWriter(shard_path, options=options))

        # Process each image and write to TFRecord
        for idx, image_path in enumerate(tqdm(image_paths, desc="Converting to TFRecord")):
            try:
                tf_example = create_tf_example(image_path)
                shard_idx = idx // examples_per_shard
                writers[shard_idx].write(tf_example.SerializeToString())
                num_written += 1
            except Exception as e:
                # Deliberate best effort: one bad image must not abort the run.
                print(f"Error processing {image_path}: {e}")
    finally:
        # Always flush and close, even on interruption.
        for writer in writers:
            writer.close()

    print(f"Created {num_shards} TFRecord files at {output_path}")
    if num_written != num_examples:
        print(f"Skipped {num_examples - num_written} of {num_examples} images due to errors")

def convert_dataset_to_tfrecord(data_dir, output_dir, split_ratio=0.8, seed=None):
    """
    Convert an entire dataset to TFRecord format with train/val split.

    Args:
        data_dir: Directory containing images (``*.png``) and annotations
        output_dir: Directory to save TFRecord files (created if missing)
        split_ratio: Fraction of the images assigned to the training set,
            between 0 and 1; the rest becomes the validation set
        seed: Optional seed for the shuffle. With a seed the same directory
            always yields the same split; with ``None`` the global NumPy
            random state is used.

    Raises:
        ValueError: If ``split_ratio`` is outside [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get all image paths. os.listdir returns entries in arbitrary order, so
    # sort first to make a seeded shuffle reproducible.
    image_paths = sorted(
        os.path.join(data_dir, name)
        for name in os.listdir(data_dir)
        if name.endswith('.png')
    )

    # Shuffle image paths
    if seed is None:
        np.random.shuffle(image_paths)
    else:
        np.random.RandomState(seed).shuffle(image_paths)

    # Split into train and validation sets
    split_idx = int(len(image_paths) * split_ratio)
    train_paths = image_paths[:split_idx]
    val_paths = image_paths[split_idx:]

    # Convert train and validation sets to TFRecord (about 1000 images per
    # shard, at least one shard each)
    convert_to_tfrecord(train_paths, os.path.join(output_dir, 'train'), num_shards=max(1, len(train_paths) // 1000))
    convert_to_tfrecord(val_paths, os.path.join(output_dir, 'val'), num_shards=max(1, len(val_paths) // 1000))

    print(f"Converted {len(train_paths)} training images and {len(val_paths)} validation images to TFRecord format")

In [0]:
# Smoke test: build a tf.train.Example from a single image. The matching
# .json annotation is looked up next to the image by create_tf_example.
tf_example = create_tf_example('/Volumes/shm/default/cppe5/image_0001.png')