### Install Dependencies

In [None]:
!pip install uv

# Latests transformers + vision libraries
!echo "0"
!uv pip install --system --prerelease if-necessary-or-explicit -q -U git+https://github.com/huggingface/transformers
!echo "1"
!uv pip install --system --prerelease if-necessary-or-explicit -q -U timm torchvision Pillow
!echo "2"
# fix broken numpy
!pip install -q -U "numpy>=2.0.0,<3.0.0"



# For training
!echo "3"
!uv pip install --system --prerelease if-necessary-or-explicit -q -U datasets accelerate
!echo "4"
!pip install -q -U "albumentations==1.4.6" torchmetrics

!pip install -q  -U "numpy>=2.0.0,<3.0.0"
import os
os.kill(os.getpid(), 9)

Collecting uv
  Downloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.6.14-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.6.14
0
1
2
3
4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.5/153.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m961.5/961.5 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### Load Dataset from Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! ls
! ls drive/MyDrive/_ColabFiles/walmartai_veggievision
! unzip -qq drive/MyDrive/_ColabFiles/walmartai_veggievision/dataset_kavan_patel_coco.zip -d data

drive  sample_data
dataset_kavan_patel_coco.zip  instances_train.json  instances_val.json


### Create a Custom Data Class

`image_processor` expects annotations in the following format: `{'image_id': int, 'annotations': List[Dict]}`. Currently, we have `'annotations': List[Dict]`.

In [3]:
import os
import json
import torch
from torch.utils.data import Dataset
from PIL import Image

class CustomCOCODataset(Dataset):
    def __init__(self, images_dir, annotation_file, image_processor):
        self.images_dir = images_dir
        self.image_processor = image_processor

        with open(annotation_file, 'r') as f:
            coco = json.load(f)

        # Build index
        self.images = {img["id"]: img for img in coco["images"]}
        self.annotations = {}
        for ann in coco["annotations"]:
            image_id = ann["image_id"]
            if image_id not in self.annotations:
                self.annotations[image_id] = []
            self.annotations[image_id].append(ann)

        self.ids = list(self.images.keys())
        self.categories = {cat["id"]: cat["name"] for cat in coco["categories"]}

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        image_id = self.ids[idx]
        image_info = self.images[image_id]
        image_path = os.path.join(self.images_dir, image_info["file_name"])
        image = Image.open(image_path).convert("RGB")

        # COCO annotation format: bbox = [x, y, width, height]
        anns = self.annotations.get(image_id, [])
        boxes = [ann["bbox"] for ann in anns]
        class_labels = [ann["category_id"] for ann in anns]

        # Format for AutoImageProcessor
        encoded = self.image_processor(
            images=image,
            annotations={"image_id": image_id, "annotations": anns},
            return_tensors="pt"
        )

        encoded["image_id"] = image_id
        return encoded


### Preprocess the Data

In [4]:
from transformers import AutoImageProcessor

checkpoint = "PekingU/rtdetr_v2_r18vd"  # or mobilenetv3
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

train_dataset = CustomCOCODataset(
    images_dir="/content/data/dataset_kavan_patel_coco/train",
    annotation_file="/content/data/dataset_kavan_patel_coco/annotations/instances_train.json",
    image_processor=image_processor,
)

val_dataset = CustomCOCODataset(
    images_dir="/content/data/dataset_kavan_patel_coco/val",
    annotation_file="/content/data/dataset_kavan_patel_coco/annotations/instances_val.json",
    image_processor=image_processor,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


### Define a custom collate_fn to batch images together

In [5]:
from transformers import DefaultDataCollator
from torch.utils.data.dataloader import default_collate
from torch.utils.data import DataLoader


def collate_fn(batch):
    # Batch is a list of dicts. We want to return a dict of lists.
    pixel_values = [example["pixel_values"].squeeze() for example in batch]
    encoding = {"pixel_values": torch.stack(pixel_values)}

    # Process labels as-is (they are already in list-of-dict form)
    encoding["labels"] = [example["labels"][0] for example in batch] # Access the first element of 'labels' to get the desired dictionary.
    print(f'encoding["labels"]', encoding["labels"])

    return encoding
# debug collate
loader = DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch.keys())
print(batch['pixel_values'].shape)


encoding["labels"] [{'size': tensor([640, 640]), 'image_id': tensor([0]), 'class_labels': tensor([8]), 'boxes': tensor([[0.5053, 0.6431, 0.7338, 0.4945]]), 'area': tensor([148641.5000]), 'iscrowd': tensor([0]), 'orig_size': tensor([4032, 3024])}, {'size': tensor([640, 640]), 'image_id': tensor([1]), 'class_labels': tensor([13]), 'boxes': tensor([[0.5327, 0.6796, 0.4296, 0.3313]]), 'area': tensor([58300.6289]), 'iscrowd': tensor([0]), 'orig_size': tensor([4032, 3024])}]
dict_keys(['pixel_values', 'labels'])
torch.Size([2, 3, 640, 640])


In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

### Define `Trainer` and model args

In [10]:
from transformers import AutoModelForObjectDetection, TrainingArguments, Trainer

id2label = {0: "banana-bag", 1: "banana", 2: "Blackberries", 3: "Raspberries", 4: "lemon-bag", 5: "lemon", 6: "grapes-bag", 7: "grapes", 8: "tomato-bag", 9: "tomato", 10: "apple-bag", 11: "apple", 12: "chili-bag", 13: "chili"}  # Change to your class mapping
label2id = {v: k for k, v in id2label.items()}


model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

training_args = TrainingArguments(
    output_dir="./rtdetr_v2_mobile_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=25,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=image_processor,
    data_collator=collate_fn,
)


Some weights of RTDetrV2ForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_v2_r18vd and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([14]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([14, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([14]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([14, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([14]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([14, 256]) in the model instantiated
- model

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Train

In [9]:
import os
os.environ["WANDB_DISABLED"] = "true"  # to skip wandb login for testing

trainer.train()

encoding["labels"] [{'size': tensor([640, 640]), 'image_id': tensor([0]), 'class_labels': tensor([8]), 'boxes': tensor([[0.5053, 0.6431, 0.7338, 0.4945]]), 'area': tensor([148641.5000]), 'iscrowd': tensor([0]), 'orig_size': tensor([4032, 3024])}, {'size': tensor([640, 640]), 'image_id': tensor([1]), 'class_labels': tensor([13]), 'boxes': tensor([[0.5327, 0.6796, 0.4296, 0.3313]]), 'area': tensor([58300.6289]), 'iscrowd': tensor([0]), 'orig_size': tensor([4032, 3024])}, {'size': tensor([640, 640]), 'image_id': tensor([2]), 'class_labels': tensor([7]), 'boxes': tensor([[0.5176, 0.4150, 0.4499, 0.7940]]), 'area': tensor([146314.4062]), 'iscrowd': tensor([0]), 'orig_size': tensor([3024, 4032])}, {'size': tensor([640, 640]), 'image_id': tensor([3]), 'class_labels': tensor([14]), 'boxes': tensor([[0.3872, 0.6731, 0.6095, 0.3224]]), 'area': tensor([80487.1094]), 'iscrowd': tensor([0]), 'orig_size': tensor([4032, 3024])}, {'size': tensor([640, 640]), 'image_id': tensor([4]), 'class_labels': te

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Evaluate

In [None]:
trainer.evaluate()

# from pprint import pprint

# metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="eval")
# pprint(metrics)

### Push to Hugging Face Hug for future use

In [None]:
trainer.push_to_hub()

### Inference

In [None]:
import torch
import requests
from PIL import Image, ImageDraw

device = "cuda"

url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
image = Image.open(requests.get(url, stream=True).raw)



In [None]:
inputs = image_processor(images=[image], return_tensors="pt")
inputs = inputs.to(device)
with torch.no_grad():
    outputs = model(**inputs)
target_sizes = torch.tensor([image.size[::-1]])

result = image_processor.post_process_object_detection(outputs, threshold=0.4, target_sizes=target_sizes)[0]

for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"

In [None]:
image_with_boxes = image.copy()
draw = ImageDraw.Draw(image_with_boxes)

for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    x, y, x2, y2 = tuple(box)
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    text_label = model.config.id2label[label.item()]
    draw.text((x, y), f"{text_label} [ {score.item():.2f} ]", fill="blue")

image_with_boxes