This notebook is used to preprocess the CocoDoom dataset to allow for faster training.

Each image will be loaded, preprocessed, and saved as a tensor shard in the same location as the original images.

In [1]:
# Add project directory to path for imports
from transformers import DetrImageProcessorFast
import sys
import os
sys.path.append(os.path.join(os.pardir))

from PIL import Image
from Vision.datasets import CocoDoomDataset
from transformers import DetrImageProcessor

import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# create preprocessor
processor = DetrImageProcessor.from_pretrained(
    "facebook/detr-resnet-50",
    size={"shortest_edge": 200, "longest_edge": 320}
)

# create dataset instance
dataset = CocoDoomDataset(
    data_dir=os.path.join(os.pardir, os.pardir, "datasets", "cocodoom"),
    annotation_file_name="run-train.json",
    processor=processor
)

loading annotations into memory...
Done (t=0.93s)
creating index...
index created!
loading annotations into memory...
Done (t=0.92s)
creating index...
index created!
Loaded run-train.json
Number of images: 50732
Number of Categories: 94


In [3]:
# cache X GB of data
from tqdm import tqdm

def cache_files():
    split_name = "train"
    saved, skipped = 0, 0
    save_root = os.path.join(
        os.pardir, os.pardir, "datasets", "cocodoom", "preprocessed"
    )
    os.makedirs(save_root, exist_ok=True)

    cache_size_gb = 64
    # use uniform distribution to sample what idx should be cached
    dist = torch.distributions.Uniform(0, len(dataset))
    approx_item_size = 0.012 # in GB
    num_items_to_cache = int(cache_size_gb / approx_item_size)
    print(f"Caching approximately {num_items_to_cache} items (~{cache_size_gb} GB)")
    sampled_indices = dist.sample((num_items_to_cache,)).long().tolist()


    for i in tqdm(sampled_indices, desc=f"Preprocessing {split_name}"):
        image, target, img_file_name = dataset.get_image(i)

        encoding = processor(
            images=image,
            annotations=target,
            return_tensors="pt"
        )

        pixel_values = encoding['pixel_values'].squeeze()
        target = dict(encoding['labels'][0])

        # reduce format of target tensors
        # target['boxes'] = target['boxes'].to(torch.float16)
        # del target['size']
        # del target['orig_size']
        # # we only have 94 categories
        # target['class_labels'] = target['class_labels'].to(torch.int16)
        # del target['area']  # remove area to save space
        # del target['iscrowd'] # remove iscrowd to save space

        # modify file name to have .pt extension
        # pt_file_name = os.path.splitext(img_file_name)[0] + ".pt"
        pt_file_name = f"{i}.pt"
        save_path = os.path.join(save_root, pt_file_name)

        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path), exist_ok=True)

        if os.path.exists(save_path):
            skipped += 1
            continue

        torch.save(
            {
                "pixel_values": pixel_values,
                "labels": target
            },
            save_path
        )
        saved += 1

    print(f"{split_name}: saved {saved}, skipped {skipped}")

# cache_files()

In [4]:
pixel_values, labels = dataset[0]
print(f"Loaded pixel values shape: {pixel_values.shape}")
print(f"Loaded labels: {labels}")

print(f"{labels['class_labels'].dtype}")

Loaded pixel values shape: torch.Size([3, 200, 320])
Loaded labels: {'size': tensor([200, 320]), 'image_id': tensor([1010000002]), 'class_labels': tensor([0, 0]), 'boxes': tensor([[0.4328, 0.6225, 0.0531, 0.1250],
        [0.5484, 0.5700, 0.0469, 0.1000]]), 'area': tensor([180., 115.]), 'iscrowd': tensor([0, 0]), 'orig_size': tensor([200, 320])}
torch.int64


In [5]:
# benchmarking dataset, which includes preprocessing
import time

start_time = time.perf_counter()
for i in range(1000):
    pixel_values, labels = dataset[i]
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken to fetch 1000 items: {elapsed_time:.2f} seconds")

Time taken to fetch 1000 items: 0.90 seconds


In [6]:
# benchmarking disk-cached dataset
import time
from Vision.datasets import PreprocessedDataset

cached_dataset = PreprocessedDataset(
    dataset=dataset,
    cache_dir=os.path.join(os.pardir, os.pardir, "datasets", "cocodoom", "preprocessed")
)

start_time = time.perf_counter()
for i in range(1000):
    pixel_values, labels = cached_dataset[i]
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Time taken to fetch 1000 items: {elapsed_time:.2f} seconds")

Time taken to fetch 1000 items: 0.95 seconds


# Benchmarking Processors

In [7]:
processor1 = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
processor2 = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
processor3 = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")

In [8]:
import time

start_time = time.perf_counter()
for i in range(1000):
    img, target, _ = dataset.get_image(i)
    processor1(images=img, annotations=target, return_tensors="pt")

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Processor 1 time taken to process 1000 items: {elapsed_time:.2f} seconds")

start_time = time.perf_counter()
for i in range(1000):
    img, target, _ = dataset.get_image(i)
    processor2(images=img, annotations=target, return_tensors="pt")

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Processor 2 time taken to process 1000 items: {elapsed_time:.2f} seconds")

start_time = time.perf_counter()
for i in range(1000):
    img, target, _ = dataset.get_image(i)
    processor3(images=img, annotations=target, return_tensors="pt")

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Processor 3 time taken to process 1000 items: {elapsed_time:.2f} seconds")


Processor 1 time taken to process 1000 items: 13.39 seconds
Processor 2 time taken to process 1000 items: 13.39 seconds
Processor 3 time taken to process 1000 items: 2.66 seconds


# Converting CocoDoom to YOLO Format

In [9]:
import json
from pathlib import Path

def convert_coco_to_yolo(coco_json_path, output_dir):
    """
    Convert COCO format annotations to YOLO format.

    This is necessary because the standard YOLO converter
    does not work if the file paths contain subdirectories.
    
    Args:
        coco_json_path: Path to COCO JSON file
        output_dir: Root directory to save YOLO format labels
    """
    # Load COCO JSON
    with open(coco_json_path, 'r') as f:
        coco_data = json.load(f)
    
    # Create mapping from image_id to annotations
    img_annotations = {}
    for ann in coco_data['annotations']:
        img_id = ann['image_id']
        if img_id not in img_annotations:
            img_annotations[img_id] = []
        img_annotations[img_id].append(ann)
    
    # Process each image
    for image in tqdm(coco_data['images'], desc=f"Converting {Path(coco_json_path).name}"):
        img_id = image['id']
        img_width = image['width']
        img_height = image['height']
        file_name = image['file_name']
        
        # Create output label path based on image path structure
        # labels/path/to/image.txt
        label_file = Path(output_dir) / 'labels' / file_name
        label_file = label_file.with_suffix('.txt')
        
        # Create parent directories if they don't exist
        label_file.parent.mkdir(parents=True, exist_ok=True)
        
        # Get annotations for this image
        annotations = img_annotations.get(img_id, [])
        
        # Write YOLO format labels
        with open(label_file, 'w') as f:
            for ann in annotations:
                # COCO bbox format: [x, y, width, height] in image coordinates
                bbox = ann['bbox']
                x, y, w, h = bbox
                
                # Convert to YOLO format: [x_center, y_center, width, height] normalized
                x_center = (x + w / 2) / img_width
                y_center = (y + h / 2) / img_height
                w_norm = w / img_width
                h_norm = h / img_height
                
                # Clamp values to [0, 1]
                x_center = max(0, min(1, x_center))
                y_center = max(0, min(1, y_center))
                w_norm = max(0, min(1, w_norm))
                h_norm = max(0, min(1, h_norm))
                
                # Class ID from category_id
                class_id = ann['category_id']
                
                # Write to file: class_id x_center y_center width height (all normalized)
                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}\n")

# Convert all splits
coco_root = "/home/xavier/projects/datasets/cocodoom"
output_root = "/home/xavier/projects/datasets/cocodoom/yolo"

for split in ['run-train.json', 'run-val.json', 'run-test.json']:
    coco_json_path = os.path.join(coco_root, split)
    if os.path.exists(coco_json_path):
        convert_coco_to_yolo(coco_json_path, output_root)
    else:
        print(f"Skipping {split} - file not found")

print("Conversion complete!")

Converting run-train.json: 100%|██████████| 50732/50732 [00:01<00:00, 42100.81it/s]
Converting run-val.json: 100%|██████████| 9510/9510 [00:00<00:00, 40736.04it/s]
Converting run-test.json: 100%|██████████| 5907/5907 [00:00<00:00, 40833.81it/s]

Conversion complete!





In [10]:
import shutil
from pathlib import Path
import json
import yaml

def create_yaml_file(path, categories):
    # construct names dictionary with sequential IDs
    names = {i: cat['name'] for i, cat in enumerate(categories)}

    yaml_content = {
        'path': str(Path(path).absolute()),
        'train': 'images/train',
        'val': 'images/val',
        'test': 'images/test',
        'names': names
    }

    yaml_path = Path(path) / 'data.yaml'
    with open(yaml_path, 'w') as f:
        yaml.dump(yaml_content, f)
    
    print(f"\ndata.yaml created at {yaml_path}")
    print(yaml.dump(yaml_content))


def organize_yolo_dataset(yolo_root, coco_root, coco_data):
    """
    Organize YOLO dataset into flat structure expected by YOLO.
    
    Creates:
    - images/train/, images/val/, images/test/
    - labels/train/, labels/val/, labels/test/
    - data.yaml
    
    Args:
        yolo_root: Path to YOLO dataset root
        coco_root: Path to original COCO dataset root
    """
    yolo_path = Path(yolo_root)
    coco_path = Path(coco_root)
    
    # Create directory structure
    for split_name in ['train', 'val', 'test']:
        (yolo_path / 'images' / split_name).mkdir(parents=True, exist_ok=True)
        (yolo_path / 'labels' / split_name).mkdir(parents=True, exist_ok=True)
    
    # Load COCO JSONs to get image file names
    splits = {
        'train': 'run-train.json',
        'val': 'run-val.json',
        'test': 'run-test.json'
    }
    
    # Process each split
    for split_name, json_file in splits.items():
        json_path = coco_path / json_file
        
        if not json_path.exists():
            print(f"Skipping {split_name} - file not found")
            continue
        
        with open(json_path, 'r') as f:
            coco_data = json.load(f)
        
        print(f"\nProcessing {split_name} split...")
        
        # Copy images and labels
        for image in tqdm(coco_data['images'], desc=f"{split_name}"):
            file_name = image['file_name']
            
            # Find the original image file
            # The file_name includes the run/map structure like "run1/map01/rgb/image.png"
            src_img = coco_path / file_name
            
            if src_img.exists():
                # Create a unique flat filename: run1_map01_rgb_image.png
                flat_name = file_name.replace('/', '_')
                dst_img = yolo_path / 'images' / split_name / flat_name

                # skip if already exists
                if dst_img.exists():
                    continue
                
                # Copy image (use symlink for speed if on same filesystem)
                try:
                    os.symlink(src_img.absolute(), dst_img)
                except (FileExistsError, OSError):
                    # If symlink fails, copy the file
                    if not dst_img.exists():
                        shutil.copy2(src_img, dst_img)
                
                # Copy corresponding label
                nested_label = yolo_path / 'labels' / file_name
                nested_label = nested_label.with_suffix('.txt')
                
                if nested_label.exists():
                    flat_label = yolo_path / 'labels' / split_name / flat_name
                    flat_label = flat_label.with_suffix('.txt')
                    
                    if not flat_label.exists():
                        shutil.copy2(nested_label, flat_label)
            else:
                print(f"Warning: Image not found: {src_img}")

# Run the organization
yolo_root = "/home/xavier/projects/datasets/cocodoom/yolo"
coco_root = "/home/xavier/projects/datasets/cocodoom"

with open(Path(coco_root) / 'run-train.json', 'r') as f:
    coco_data = json.load(f)

# organize_yolo_dataset(yolo_root, coco_root, coco_data)
print("\nYOLO dataset organization complete!")

# Generate data.yaml
create_yaml_file(yolo_root, coco_data['categories'])


YOLO dataset organization complete!

data.yaml created at /home/xavier/projects/datasets/cocodoom/yolo/data.yaml
names:
  0: POSSESSED
  1: SHOTGUY
  2: VILE
  3: FIRE
  4: UNDEAD
  5: TRACER
  6: SMOKE
  7: FATSO
  8: FATSHOT
  9: CHAINGUY
  10: TROOP
  11: SERGEANT
  12: HEAD
  13: BRUISER
  14: BRUISERSHOT
  15: KNIGHT
  16: SKULL
  17: SPIDER
  18: BABY
  19: CYBORG
  20: PAIN
  21: WOLFSS
  22: BARREL
  23: TROOPSHOT
  24: HEADSHOT
  25: ROCKET
  26: PLASMA
  27: BFG
  28: ARACHPLAZ
  29: PUFF
  30: BLOOD
  31: TFOG
  32: EXTRABFG
  33: MISC0
  34: MISC1
  35: MISC2
  36: MISC3
  37: MISC4
  38: MISC10
  39: MISC11
  40: MISC12
  41: INV
  42: MISC13
  43: INS
  44: MISC14
  45: MISC15
  46: MEGA
  47: CLIP
  48: MISC17
  49: MISC18
  50: MISC19
  51: MISC20
  52: MISC21
  53: MISC22
  54: MISC23
  55: MISC24
  56: MISC25
  57: CHAINGUN
  58: MISC26
  59: MISC27
  60: MISC28
  61: SHOTGUN
  62: SUPERSHOTGUN
  63: MISC29
  64: MISC30
  65: MISC32
  66: MISC33
  67: MISC34
  68: MI