In [None]:
import os
import random
import glob
import shutil


import json
import yaml

from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

from lxml import etree as ET

import pandas as pd
import matplotlib.pyplot as plt

import cv2

from sklearn.model_selection import train_test_split

### Load Rt Detr V2 image processor and pretrained model from hugging face 

In [None]:
import numpy as np
from transformers import AutoImageProcessor
checkpoint = "PekingU/rtdetr_v2_r50vd"
image_size = 480

# set image to 480 x480
image_processor = AutoImageProcessor.from_pretrained(
    checkpoint,
    do_resize=True,
    size={"width": image_size, "height": image_size},
    use_fast=True,
)

In [6]:
import albumentations as A

train_augmentation_and_transform = A.Compose(
    [
        A.Perspective(p=0.1),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.5),
        A.HueSaturationValue(p=0.1),
    ],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25, min_width=1, min_height=1),
)

# to make sure boxes are clipped to image size and there is no boxes with area < 1 pixel
validation_transform = A.Compose(
    [A.NoOp()],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=1, min_width=1, min_height=1),
)

  check_for_updates()


In [7]:
from transformers import AutoModelForObjectDetection
import torch
classname2idx = {"logo": 0}
idx2classname =  {0:"logo"}
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    id2label=idx2classname,
    label2id=classname2idx,
    ignore_mismatched_sizes=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of RTDetrV2ForObjectDetection were not initialized from the model checkpoint at PekingU/rtdetr_v2_r50vd and are newly initialized because the shapes did not match:
- model.decoder.class_embed.0.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([1]) in the model instantiated
- model.decoder.class_embed.0.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([1, 256]) in the model instantiated
- model.decoder.class_embed.1.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([1]) in the model instantiated
- model.decoder.class_embed.1.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([1, 256]) in the model instantiated
- model.decoder.class_embed.2.bias: found shape torch.Size([80]) in the checkpoint and torch.Size([1]) in the model instantiated
- model.decoder.class_embed.2.weight: found shape torch.Size([80, 256]) in the checkpoint and torch.Size([1, 256]) in the model instantiated
- model.decod

RTDetrV2ForObjectDetection(
  (model): RTDetrV2Model(
    (backbone): RTDetrV2ConvEncoder(
      (model): RTDetrResNetBackbone(
        (embedder): RTDetrResNetEmbeddings(
          (embedder): Sequential(
            (0): RTDetrResNetConvLayer(
              (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
              (normalization): RTDetrV2FrozenBatchNorm2d()
              (activation): ReLU()
            )
            (1): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrV2FrozenBatchNorm2d()
              (activation): ReLU()
            )
            (2): RTDetrResNetConvLayer(
              (convolution): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (normalization): RTDetrV2FrozenBatchNorm2d()
              (activation): ReLU()
            )
          )
          (pooler)

In [8]:
from torch.utils.data import Dataset
from PIL import Image


class LogoDataset(Dataset):
    def __init__(self, dataset, image_processor, transform=None):
        self.dataset = dataset
        self.image_processor = image_processor
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]

        image_path = sample["file_path"]
        image = Image.open(image_path)
        formatted_annotations = sample["annotation"]

        # Convert image to RGB numpy array
        image = np.array(image.convert("RGB"))

        result = self.image_processor(
            images=image, annotations=formatted_annotations, return_tensors="pt"
        )

        # Image processor expands batch dimension, lets squeeze it
        result = {k: v[0] for k, v in result.items()}

        return result


## Load transformed dataset from pkl

In [9]:
import pickle
train_path = "/kaggle/input/dataset-logo/train_dataset.pkl"
valid_path = "/kaggle/input/dataset-logo/validation_dataset.pkl"

with open(train_path, "rb") as f:
    train = pickle.load(f)

with open(valid_path, "rb") as f:
    valid = pickle.load(f)

### set training arguments, evualting model after every epoch

In [10]:
from transformers import TrainingArguments, Trainer
import torch

# 1) Clear any stray allocations
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir="rtdetr-v2-finetune-on-logo",
    num_train_epochs=5,
    max_grad_norm=0.1,
    learning_rate=5e-5,
    warmup_steps=300,
    per_device_train_batch_size=8,
    dataloader_num_workers=2,
    metric_for_best_model="eval_map",
    greater_is_better=True,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    save_total_limit=2,
    remove_unused_columns=False,
    eval_do_concat_batches=False
)


In [11]:
import torch

def collate_fn(batch):
    data = {}
    data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
    data["labels"] = [x["labels"] for x in batch]
    return data

### create custom MapEvaluator

In [12]:
import numpy as np
from dataclasses import dataclass
from transformers.image_transforms import center_to_corners_format
from torchmetrics.detection.mean_ap import MeanAveragePrecision


@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor


class MAPEvaluator:

    def __init__(self, image_processor, threshold=0.00, id2label=None):
        self.image_processor = image_processor
        self.threshold = threshold
        self.id2label = id2label

    def collect_image_sizes(self, targets):
        """Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
        image_sizes = []
        for batch in targets:
            batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
            image_sizes.append(batch_image_sizes)
        return image_sizes

    def collect_targets(self, targets, image_sizes):
        post_processed_targets = []
        for target_batch, image_size_batch in zip(targets, image_sizes):
            for target, size in zip(target_batch, image_size_batch):

                # here we have "yolo" format (x_center, y_center, width, height) in relative coordinates 0..1
                # and we need to convert it to "pascal" format (x_min, y_min, x_max, y_max) in absolute coordinates
                height, width = size
                boxes = torch.tensor(target["boxes"])
                boxes = center_to_corners_format(boxes)
                boxes = boxes * torch.tensor([[width, height, width, height]])

                labels = torch.tensor(target["class_labels"])
                post_processed_targets.append({"boxes": boxes, "labels": labels})
        return post_processed_targets

    def collect_predictions(self, predictions, image_sizes):
        post_processed_predictions = []
        for batch, target_sizes in zip(predictions, image_sizes):
            batch_logits, batch_boxes = batch[1], batch[2]
            output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
            post_processed_output = self.image_processor.post_process_object_detection(
                output, threshold=self.threshold, target_sizes=target_sizes
            )
            post_processed_predictions.extend(post_processed_output)
        return post_processed_predictions

    @torch.no_grad()
    def __call__(self, evaluation_results):
        # 1) Pre- and post-process your preds & targets
        predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

        image_sizes = self.collect_image_sizes(targets)
        post_processed_targets = self.collect_targets(targets, image_sizes)
        post_processed_predictions = self.collect_predictions(predictions, image_sizes)
    
        # 2) Instantiate the metric under the name "evaluator"
        evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
        evaluator.warn_on_many_detections = False
        evaluator.update(post_processed_predictions, post_processed_targets)

        metrics = evaluator.compute()
    
        # …then your wrapping of per-class metrics, rounding, etc…
        return metrics


eval_compute_metrics_fn = MAPEvaluator(image_processor=image_processor, threshold=0.01, id2label=idx2classname)

In [None]:
import os
os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")

In [14]:
from transformers import logging as hf_logging

# suppress everything below ERROR
hf_logging.set_verbosity_error()


## take 5% subset of whole dataset from both train and valid dataset

In [None]:
# taking first 5% records only  from train dataset
import torch
from torch.utils.data import random_split, DataLoader

# 1. compute split sizes
total = len(train)
n_small = int(0.05 * total)
n_rest  = total - n_small

# 2. do the random split (with a fixed seed for reproducibility)
generator = torch.Generator().manual_seed(42)
small_train_ds, _ = random_split(
    train,
    [n_small, n_rest],
    generator=generator
)
print(len(small_train_ds))
small_train_ds[0]

6346


{'pixel_values': tensor([[[1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          ...,
          [1.0000, 1.0000, 1.0000,  ..., 0.9922, 0.9922, 0.9922],
          [1.0000, 1.0000, 1.0000,  ..., 0.9922, 0.9922, 0.9922],
          [1.0000, 1.0000, 1.0000,  ..., 0.9922, 0.9922, 0.9922]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.0000, 1.0000,  ..., 0.9961, 0.9961, 0.9961],
          [1.0000, 1.000

In [16]:
# taking first 5% records only  from valid dataset
import torch
from torch.utils.data import random_split, DataLoader

# 1. compute split sizes
valid_total = len(valid)
n_small_valid = int(0.05 * valid_total)
n_rest_valid  = valid_total - n_small_valid

# 2. do the random split (with a fixed seed for reproducibility)
generator = torch.Generator().manual_seed(42)
small_valid_ds, _ = random_split(
    valid,
    [n_small_valid, n_rest_valid],
    generator=generator
)
print(len(small_valid_ds))
small_valid_ds[0]["pixel_values"].shape

1586


torch.Size([3, 480, 480])

In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_ds,
    eval_dataset=small_valid_ds,
    tokenizer=image_processor,
    data_collator=collate_fn,
    compute_metrics=eval_compute_metrics_fn,
)
print("start")


  trainer = Trainer(


start


In [18]:

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjainshabrahul[0m ([33mrahuljain[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Map,Map 50,Map 75,Map Small,Map Medium,Map Large,Mar 1,Mar 10,Mar 100,Mar Small,Mar Medium,Mar Large,Map Per Class,Mar 100 Per Class,Classes
1,75.1041,16.517601,0.09862,0.186916,0.091947,0.00232,0.060296,0.154176,0.20744,0.57435,0.743444,0.194444,0.61383,0.792817,0.09862,0.743444,0
2,14.7469,15.840319,0.154669,0.2734,0.152305,0.003309,0.087499,0.187939,0.246722,0.611498,0.747086,0.222222,0.591702,0.804603,0.154669,0.747086,0
3,13.7803,15.52908,0.191681,0.348199,0.186753,0.008547,0.091824,0.256377,0.280749,0.631478,0.763944,0.288889,0.642553,0.809693,0.191681,0.763944,0
4,12.6367,15.292116,0.1581,0.29474,0.151084,0.021814,0.100844,0.20877,0.256816,0.62898,0.763736,0.294444,0.628085,0.814086,0.1581,0.763736,0
5,12.06,15.252739,0.166947,0.309448,0.158867,0.028637,0.095921,0.228708,0.27513,0.632362,0.769771,0.25,0.650426,0.815411,0.166947,0.769771,0


TrainOutput(global_step=3970, training_loss=21.25415850814704, metrics={'train_runtime': 2798.6307, 'train_samples_per_second': 11.338, 'train_steps_per_second': 1.419, 'total_flos': 5.62267331875584e+18, 'train_loss': 21.25415850814704, 'epoch': 5.0})

In [23]:
trainer.save_model("/kaggle/working/final_model")  


In [24]:
import shutil
from IPython.display import FileLink, display

# 1) Set these paths:
src_folder   = "/kaggle/working/final_model"            # <-- the folder you want to download
archive_base = "/kaggle/working/rtdetr-v2-finetuned-model"    # <-- zip will be created at this path + ".zip"

# 2) Create the ZIP
shutil.make_archive(archive_base, 'zip', src_folder)

# 3) Display a download link
zip_path = archive_base + ".zip"
display(FileLink(zip_path, result_html_prefix="Click here to download: "))
