In [1]:
!pip install -q datasets transformers[torch] timm accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


import albumentations as A

from transformers import DetrFeatureExtractor, AutoModelForObjectDetection, TrainingArguments,Trainer

## Model - DETR with a ResNet-50 backbone

In [3]:
# Class name -> integer id, handed to the model configuration.
# NOTE(review): the sample row displayed later in this notebook has
# category_id 2, which is not an id in this mapping - confirm that the
# dataset's category ids line up with these labels.
label2id = {"logo": 0, "text": 1}

# Reverse lookup: integer id -> class name.
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [4]:

# Hub checkpoint name; the model cell loads its weights from the same name.
feature_extractor_checkpoint = "facebook/detr-resnet-50"
# Load the image preprocessor published with that checkpoint.
feature_extractor = DetrFeatureExtractor.from_pretrained(feature_extractor_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:

# Load the pretrained detector and register this notebook's label maps in its
# configuration.
model = AutoModelForObjectDetection.from_pretrained(
    feature_extractor_checkpoint,
    id2label=id2label,
    label2id=label2id,
    # NOTE(review): presumably required because the checkpoint's classification
    # head was sized for a different label set than the one above - confirm.
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

## Dataset

In [6]:
# NOTE(review): download_mode="force_redownload" presumably bypasses the local
# cache so the data is fetched again on every run - confirm this is intended.
dataset = load_dataset("bastienp/visible-watermark-pita", download_mode="force_redownload")

Downloading readme:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/740k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [7]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'bbox', 'id', 'area', 'image_id', 'category_id'],
        num_rows: 98
    })
    test: Dataset({
        features: ['image', 'bbox', 'id', 'area', 'image_id', 'category_id'],
        num_rows: 30
    })
    val: Dataset({
        features: ['image', 'bbox', 'id', 'area', 'image_id', 'category_id'],
        num_rows: 18
    })
})

In [8]:
# Inspect a single training row (image plus one box, area and category id).
dataset["train"][4]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'bbox': [270, 133, 483, 58],
 'id': 99,
 'area': 28014,
 'image_id': 30200,
 'category_id': 2}

## Preprocessing

In [9]:
# Augmentation pipeline used by transform_aug_ann on the training split.
# Boxes are declared in "coco" format and their labels travel in the
# "category_ids" field so they stay aligned with the boxes.
preprocess = A.Compose([
    # Bring every image to 512x512.
    A.Resize(512, 512),
    # Photometric jitter, applied with probability 0.3.
    A.RandomBrightnessContrast(p=0.3),
    # Geometric jitter (shift / scale / rotate), applied with probability 0.5.
    A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.50, rotate_limit=45, p=.5),
], bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]))

# Resize-only variant with the same box settings; it is not referenced by any
# other cell visible in this notebook.
preprocess_viz = A.Compose([
    A.Resize(512, 512),
], bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]))

In [10]:
def clamp_coco_bbox(bbox, img_width, img_height):
    """Clip a COCO-format box so that it lies entirely inside the image.

    Args:
        bbox: Sequence ``(x, y, width, height)`` where ``(x, y)`` is the
            top-left corner of the box.
        img_width: Image width, in the same units as ``bbox``.
        img_height: Image height, in the same units as ``bbox``.

    Returns:
        list: ``[x, y, width, height]`` of the clipped box. A box that lies
        completely outside the image collapses to zero width and/or height
        instead of producing a negative extent.
    """
    x, y, width, height = bbox

    # Move the top-left corner inside the image.
    left = max(0, min(x, img_width))
    top = max(0, min(y, img_height))

    # Shrink the extent by however far the corner had to move (so the far
    # edge stays where it was), then cap it at the image border. When the
    # corner did not move, the original extent is kept exactly.
    width = max(0, min(width - (left - x), img_width - left))
    height = max(0, min(height - (top - y), img_height - top))

    return [left, top, width, height]


In [11]:

def formatted_anns(image_id, category, area, bbox):
    """Build the COCO-style annotation records for one image.

    Args:
        image_id: Identifier copied into every record.
        category: Sequence of category ids, one per box.
        area: Sequence of box areas, indexed in parallel with ``category``.
        bbox: Sequence of boxes, indexed in parallel with ``category``.

    Returns:
        list[dict]: One record per entry of ``category`` with the keys
        ``image_id``, ``category_id``, ``iscrowd``, ``area`` and ``bbox``
        (the box is copied into a new list).
    """
    return [
        {
            "image_id": image_id,
            "category_id": category_id,
            # COCO spells this key in lower case; the previous "isCrowd"
            # spelling is not a COCO field name.
            "iscrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        for i, category_id in enumerate(category)
    ]

In [12]:
# transforming a batch
def transform_aug_ann(examples):
    """Augment a batch of single-box examples and encode it for the model.

    Args:
        examples: Batch dict with parallel lists under ``"image"``,
            ``"bbox"``, ``"category_id"``, ``"area"`` and ``"image_id"``.
            Each row carries exactly one box.

    Returns:
        The output of ``feature_extractor`` for the augmented images and
        their annotations, requested as PyTorch tensors.
    """
    image_ids = examples["image_id"]
    images, bboxes, areas, categories = [], [], [], []
    for image, bbox, category, area in zip(
        examples["image"], examples["bbox"], examples["category_id"], examples["area"]
    ):
        image = np.array(image.convert("RGB"))
        # Image arrays are laid out (height, width, channels), whereas
        # clamp_coco_bbox takes the width first.
        img_height, img_width = image.shape[:2]

        out = preprocess(
            image=image,
            bboxes=[clamp_coco_bbox(bbox, img_width, img_height)],
            category_ids=[category],
        )

        # NOTE(review): the dataset's area is forwarded unchanged even though
        # the augmentation may have resized the box - confirm this is acceptable.
        areas.append([area])
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category_ids"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, areas, bboxes)
    ]

    return feature_extractor(images=images, annotations=targets, return_tensors="pt")

In [13]:
# Attach the augmentation/encoding function to the training split only.
dataset["train"] = dataset["train"].with_transform(transform_aug_ann)

In [14]:
def collate_fn(batch):
    """Assemble a list of encoded samples into one model-ready batch.

    Args:
        batch: List of samples, each a mapping with ``"pixel_values"`` and
            ``"labels"`` entries.

    Returns:
        dict: ``"pixel_values"`` and ``"pixel_mask"`` as returned by
        ``feature_extractor.pad`` for the samples' pixel values, plus
        ``"labels"`` as a plain list with one entry per sample.
    """
    padded = feature_extractor.pad(
        [sample["pixel_values"] for sample in batch],
        return_tensors="pt",
    )
    return {
        "pixel_values": padded["pixel_values"],
        "pixel_mask": padded["pixel_mask"],
        "labels": [sample["labels"] for sample in batch],
    }

## Train

In [15]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    # Same directory name that the save_model cell writes to.
    output_dir="detr-resnet-50_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=50,
    save_steps=200,
    logging_steps=20,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    # NOTE(review): presumably needed so the raw columns read by
    # transform_aug_ann ("image", "bbox", ...) are not dropped - confirm.
    remove_unused_columns=False,
)

In [16]:
# Wire the model, hyper-parameters, batching function and training split
# together. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"],
    # NOTE(review): presumably passed so the preprocessor is saved together
    # with the model checkpoints - confirm.
    tokenizer=feature_extractor,
)


In [None]:
# Run the fine-tuning loop configured above.
trainer.train()


Step,Training Loss


In [None]:
# Write the fine-tuned model to the same directory name used as output_dir.
trainer.save_model("detr-resnet-50_finetuned")

In [None]:
# Turn the trainer's logged history into a table, one row per log entry.
df_logs = pd.DataFrame(trainer.state.log_history)
# Preview the first rows.
df_logs.head()

In [None]:
# Plot the loss against the training step rather than the log-row index.
# Rows without a "loss" value are dropped first so they leave no gap.
ax = df_logs.dropna(subset=["loss"]).plot(x="step", y="loss", title="Training Metrics")
ax.set_xlabel("step")
ax.set_ylabel("training loss");