In [1]:
!pip install datasets transformers albumentations timm evaluate
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting timm
  Downloading timm-0.9.2-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.4 MB/s[0m eta [36m0

In [2]:
from datasets import load_dataset
dataset = load_dataset('keremberke/protective-equipment-detection', 'full')
dataset

Downloading builder script:   0%|          | 0.00/6.11k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/904M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 6473
    })
    validation: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 3570
    })
    test: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'objects'],
        num_rows: 1935
    })
})

In [3]:
import albumentations
import numpy as np
import torch

transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=0.4),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)

In [4]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection

name = "facebook/detr-resnet-50"
image_processor = AutoImageProcessor.from_pretrained(name)

Downloading (…)rocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:
def format_annotation(category, area, bbox, image_id):
  annotation = []
  for item in range(len(category)):
    annotation.append({
        'image_id':image_id,
        'area': area[item],
        'bbox': list(bbox[item]),
        # "isCrowd": 0,
        'category_id': category[item]
    })
  return annotation

In [6]:
def batch_transform(samples):
  image_ids = samples['image_id']
  category, area, bbox, images = [], [], [], []
  for image, obj in zip(samples['image'], samples['objects']):
    # print(np.array(image.convert('RGB')).shape)
    image = np.array(image.convert('RGB'))[:,:,::-1]
    out = transform(image=image, category=obj['category'], bboxes=obj['bbox'])

    area.append(obj['area'])
    bbox.append(out['bboxes'])
    category.append(out['category'])
    images.append(out['image'])

    targets = [
        {"image_id": id_, "annotations": format_annotation(category=cat_, area=ar_, bbox=box_, image_id=id_)}
        for cat_, ar_, box_, id_  in zip(category, area, bbox, image_ids)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

In [7]:
train_data = dataset['train'].with_transform(batch_transform)
eval_data = dataset['validation'].with_transform(batch_transform)
train_data[0]

{'pixel_values': tensor([[[ 0.8789,  0.8447,  0.7933,  ..., -0.5253, -0.6965, -0.8164],
          [ 0.8789,  0.8447,  0.7933,  ..., -0.5082, -0.6452, -0.7479],
          [ 0.8789,  0.8447,  0.7933,  ..., -0.4911, -0.5767, -0.6452],
          ...,
          [-1.4329, -1.3987, -1.3644,  ..., -1.2103, -1.2103, -1.2103],
          [-1.4500, -1.4158, -1.3815,  ..., -1.2103, -1.2274, -1.2274],
          [-1.4672, -1.4329, -1.3987,  ..., -1.2103, -1.2274, -1.2274]],
 
         [[ 0.9405,  0.9055,  0.8529,  ...,  0.1527, -0.0924, -0.2500],
          [ 0.9405,  0.9055,  0.8529,  ...,  0.1702, -0.0574, -0.1800],
          [ 0.9405,  0.9055,  0.8529,  ...,  0.1877,  0.0126, -0.0924],
          ...,
          [-1.2304, -1.1954, -1.1604,  ..., -0.4076, -0.3901, -0.3901],
          [-1.2479, -1.2129, -1.1779,  ..., -0.4251, -0.4076, -0.4076],
          [-1.2654, -1.2304, -1.1954,  ..., -0.4426, -0.4251, -0.4076]],
 
         [[ 1.1062,  1.0714,  1.0191,  ..., -0.6367, -0.6541, -0.6541],
          [ 

In [8]:
def collate_fn(batch):
    pixel_vals = [i["pixel_values"] for i in batch]
    encoding = image_processor.pad(pixel_vals, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["pixel_mask"] = encoding["pixel_mask"]
    batch["labels"] = labels
    return batch

In [9]:
names = dataset["train"].features['objects'].feature['category'].names
id2label, label2id = dict(), dict()

for i in range(len(names)):
  id2label[i]=names[i]
  label2id[names[i]] = i

In [12]:
model = AutoModelForObjectDetection.from_pretrained(name, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

Some weights of DetrForObjectDetection were not initialized from the model checkpoint at facebook/detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.weight: found shape torch.Size([92, 256]) in the checkpoint and torch.Size([11, 256]) in the model instantiated
- class_labels_classifier.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([11]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import evaluate
# evaluate.list_evaluation_modules()
metrics = evaluate.load('mean_iou')

Downloading builder script:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="detr-resnet-50_safety",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    # max_steps=50,
    fp16=True,
    save_steps=2000,
    logging_steps=15,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
)

In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_data,
    tokenizer=image_processor,
    eval_dataset=eval_data,
    # compute_metrics=metrics.compute
)

trainer.train()



Step,Training Loss
15,5.304
30,5.3817
45,4.738


TrainOutput(global_step=50, training_loss=5.1215966796875, metrics={'train_runtime': 23.8417, 'train_samples_per_second': 16.777, 'train_steps_per_second': 2.097, 'total_flos': 2.3895953856e+16, 'train_loss': 5.1215966796875, 'epoch': 0.06})

## Evaluation

In [16]:
saved_model_name = 'safety'
trainer.save_model(saved_model_name)

In [38]:
### OPTIONAL - REDUCE TEST DATA SIZE FOR FAST EVALUATION
# dataset["test"] = dataset["test"][:10]

[1466, 920, 1500, 1923, 297, 656, 1064, 1223, 758, 1780]

In [39]:
import json
import os

# format annotations the same as for training, no need for data augmentation
def val_formatted_anns(image_id, objects):
    annotations = []
    for i in range(0, len(objects["id"])):
        new_ann = {
            "id": objects["id"][i],
            "category_id": objects["category"][i],
            "iscrowd": 0,
            "image_id": image_id,
            "area": objects["area"][i],
            "bbox": objects["bbox"][i],
        }
        annotations.append(new_ann)

    return annotations


# Save images and annotations into the files torchvision.datasets.CocoDetection expects
def save_cppe5_annotation_file_images(cppe5):
    output_json = {}
    path_output_cppe5 = f"{os.getcwd()}/cppe5/"

    if not os.path.exists(path_output_cppe5):
        os.makedirs(path_output_cppe5)

    path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json")
    categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label]
    output_json["images"] = []
    output_json["annotations"] = []
    for i in range(len(cppe5)):
        print(cppe5)
        ann = val_formatted_anns(cppe5["image_id"][i], cppe5["objects"][i])
        output_json["images"].append(
            {
                "id": cppe5["image_id"][i],
                "width": cppe5["image"][i].width,
                "height": cppe5["image"][i].height,
                "file_name": f"{cppe5['image_id'][i]}.png",
            }
        )
        output_json["annotations"].extend(ann)
    output_json["categories"] = categories_json

    with open(path_anno, "w") as file:
        json.dump(output_json, file, ensure_ascii=False, indent=4)

    for im, img_id in zip(cppe5["image"], cppe5["image_id"]):
        path_img = os.path.join(path_output_cppe5, f"{img_id}.png")
        im.save(path_img)

    return path_output_cppe5, path_anno

In [40]:
import torchvision


class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, image_processor, ann_file):
        super().__init__(img_folder, ann_file)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        # read in PIL image and target in COCO format
        img, target = super(CocoDetection, self).__getitem__(idx)

        # preprocess image and target: converting target to DETR format,
        # resizing + normalization of both image and target)
        image_id = self.ids[idx]
        target = {"image_id": image_id, "annotations": target}
        encoding = self.image_processor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()  # remove batch dimension
        target = encoding["labels"][0]  # remove batch dimension

        return {"pixel_values": pixel_values, "labels": target}


im_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")

# path_output_cppe5, path_anno = save_cppe5_annotation_file_images(dataset["test"])
# test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [41]:
test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [42]:
path_output_cppe5, path_anno = save_cppe5_annotation_file_images(dataset["test"])

{'image_id': [1466, 920, 1500, 1923, 297, 656, 1064, 1223, 758, 1780], 'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DDF60>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DE950>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DE170>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DCE50>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x266 at 0x7DA05C1DF820>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DD1E0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2048x1536 at 0x7DA05C1DCD60>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DD7E0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DC8E0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7DA05C1DD030>], 'width': [1280, 1280, 1280, 1280, 400, 1280, 2048, 1280, 1280, 1280], 'height'

In [None]:
import evaluate
from tqdm import tqdm

# model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco)
val_dataloader = torch.utils.data.DataLoader(
    test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn
)

with torch.no_grad():
    for idx, batch in enumerate(tqdm(val_dataloader)):
        pixel_values = batch["pixel_values"]
        pixel_mask = batch["pixel_mask"]

        labels = [
            {k: v for k, v in t.items()} for t in batch["labels"]
        ]  # these are in DETR format, resized + normalized

        # forward pass
        outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
        results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to COCO api

        module.add(prediction=results, reference=labels)
        del batch

results = module.compute()
print(results)

Downloading builder script:   0%|          | 0.00/6.40k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/14.9k [00:00<?, ?B/s]

  0%|          | 0/242 [00:00<?, ?it/s]

# **Inference**

In [None]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = ""
image = Image.open(requests.get(url, stream=True).raw)

processor = DetrImageProcessor.from_pretrained(saved_model_name)
model = DetrForObjectDetection.from_pretrained(saved_model_name)

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# Minimum required accuracy
min_acc = 0.8
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )


In [None]:
def inference(image, model, processor):
  inputs = processor(images=image, return_tensors="pt")
  outputs = model(**inputs)

  # Minimum required accuracy
  min_acc = 0.8
  target_sizes = torch.tensor([image.size[::-1]])
  results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

  for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
      box = [round(i, 2) for i in box.tolist()]
      print(
              f"Detected {model.config.id2label[label.item()]} with confidence "
              f"{round(score.item(), 3)} at location {box}"
      )

In [25]:
# !zip -r 'safety_model.zip' 'safety'

  adding: safety/ (stored 0%)
  adding: safety/config.json (deflated 61%)
  adding: safety/preprocessor_config.json (deflated 47%)
  adding: safety/training_args.bin (deflated 48%)
  adding: safety/pytorch_model.bin (deflated 7%)


## Dataset Visualisation

In [None]:
from PIL import ImageDraw, Image
image = dataset['train'][1]['image']
dataset['train'][1]

{'image_id': 5335,
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,
 'width': 1280,
 'height': 720,
 'objects': {'id': [11158, 11159, 11160, 11161],
  'area': [8475, 14268, 19436, 18328],
  'bbox': [[814.0, 60.0, 113.0, 75.0],
   [971.0, 404.0, 82.0, 174.0],
   [369.0, 298.0, 113.0, 172.0],
   [250.0, 376.0, 116.0, 158.0]],
  'category': [4, 0, 0, 0]}}

In [None]:
dataset["train"].features["objects"].feature["category"].names

['glove',
 'goggles',
 'helmet',
 'mask',
 'no_glove',
 'no_goggles',
 'no_helmet',
 'no_mask',
 'no_shoes',
 'shoes']

In [None]:
draw = ImageDraw.Draw(image)
obj = dataset['train'][1]['objects']
for idx, cat in enumerate(obj['category']):
  x,y,w,h = tuple(obj['bbox'][idx])
  draw.rectangle((x,y,x+w,y+h), (255, 255, 60*cat), 2)

In [None]:
image.save('04.png')