In [None]:
!pip install -q ultralytics

In [None]:
# https://cocodataset.org/#download
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip -qo val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip -qo annotations_trainval2017.zip

--2023-11-06 16:12:11--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.25.143, 16.182.71.81, 52.217.69.20, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.25.143|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip.1’


2023-11-06 16:13:05 (14.6 MB/s) - ‘val2017.zip.1’ saved [815585330/815585330]

--2023-11-06 16:13:12--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 16.182.73.137, 52.217.101.172, 3.5.9.207, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|16.182.73.137|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip.1’


2023-11-06 16:13:34 (11.2 MB/s) - ‘annotations_trainval2017.zip.1’ saved [252907541/252907541]



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision.datasets import CocoDetection
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [None]:
# Directory with the validation images; passed as `root` to COCODataset.
# Presumably created by unzipping val2017.zip in the download cell - confirm.
COCO_PATH = "val2017"
# Annotation JSON passed as `annFile` to COCODataset (loaded there with COCO(...)).
COCO_ANN_PATH = "annotations/instances_val2017.json"

In [None]:
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO

class COCODataset(torch.utils.data.Dataset):
    """COCO-format detection dataset yielding ``(image, target)`` pairs.

    Args:
        root: Directory that contains the image files.
        annFile: Path to the COCO annotation JSON (loaded with ``COCO``).
        transforms: Optional callable applied to the image only.

    Each item is ``(img, target)`` where ``target`` is a dict with:
        boxes:    float32 tensor ``[N, 4]`` in ``[xmin, ymin, xmax, ymax]`` form
                  (shape ``[0, 4]`` when the image has no usable annotation).
        labels:   int64 tensor ``[N]`` holding each annotation's ``category_id``.
        image_id: int64 tensor ``[1]`` with the COCO image id.
        area:     float32 tensor ``[N]`` with each annotation's ``area``.
        iscrowd:  int64 tensor ``[N]`` with each annotation's ``iscrowd`` flag.
    """

    def __init__(self, root, annFile, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annFile)
        # Sorted so that a given index always maps to the same image.
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, index):
        coco = self.coco
        img_id = self.ids[index]
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        file_name = coco.loadImgs(img_id)[0]['file_name']

        # Convert so every image has three channels regardless of the
        # mode it is stored in on disk.
        img = Image.open(os.path.join(self.root, file_name)).convert("RGB")

        boxes, labels, areas, iscrowd = [], [], [], []
        for ann in annotations:
            # COCO stores boxes as [xmin, ymin, width, height]; the detection
            # models expect [xmin, ymin, xmax, ymax].
            xmin, ymin, width, height = ann['bbox']
            if width <= 0 or height <= 0:
                # Degenerate boxes are rejected by torchvision detection
                # models during training, so drop the whole annotation.
                continue
            boxes.append([xmin, ymin, xmin + width, ymin + height])
            # Use the real category id instead of a constant label of 1, so
            # the targets match the multi-class heads of the models.
            labels.append(ann['category_id'])
            areas.append(ann['area'])
            iscrowd.append(ann.get('iscrowd', 0))

        my_annotation = {
            # reshape keeps the [N, 4] layout even when N == 0.
            "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([img_id]),
            "area": torch.as_tensor(areas, dtype=torch.float32),
            "iscrowd": torch.as_tensor(iscrowd, dtype=torch.int64),
        }

        if self.transforms is not None:
            img = self.transforms(img)

        return img, my_annotation

    def __len__(self):
        return len(self.ids)

In [None]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection import retinanet_resnet50_fpn
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection import RetinaNet_ResNet50_FPN_Weights
from torchvision.models.detection import SSDLite320_MobileNet_V3_Large_Weights
from torchvision.transforms import functional as F
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import math
from sklearn.metrics import f1_score

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Collate helper: regroup a batch of samples field by field.
def collate_fn(batch):
    """Transpose a batch of samples into one tuple per field.

    ``[(a1, b1), (a2, b2)]`` becomes ``((a1, a2), (b1, b2))``.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Image-only preprocessing: convert the PIL image to a tensor.
# Resizing is currently disabled (see the commented-out line).
data_transform = transforms.Compose(
    [
        # transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)
coco_dataset = COCODataset(root=COCO_PATH, annFile=COCO_ANN_PATH, transforms=data_transform)


dataset_size = len(coco_dataset)
batch_size = 6 # NOTE(review): the previous comment said 4 "so that it would run on our gpus" - confirm 6 fits in GPU memory
# 80/20 split point; only used by the commented-out full-dataset samplers below.
train_split = int(0.8 * dataset_size)
indices = list(range(dataset_size))

# TODO: debug subset only - trains on the first 100 images and validates on
# images 1000..1099. Switch to the commented-out samplers for the full split.
train_sampler = SubsetRandomSampler(indices[:100])
val_sampler = SubsetRandomSampler(indices[1000:1100])
# train_sampler = SubsetRandomSampler(indices[:train_split])
# val_sampler = SubsetRandomSampler(indices[train_split:])

# The identity collate keeps each batch as a list of (image, target) pairs,
# which is the layout train_the_model indexes into (item[0] / item[1]).
train_loader = DataLoader(coco_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=lambda x: x)
val_loader = DataLoader(coco_dataset, batch_size=batch_size, sampler=val_sampler, collate_fn=lambda x: x)
# Three detectors, each created with its DEFAULT weights.
model = fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
model_retina = retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT)
model_ssd = ssdlite320_mobilenet_v3_large(weights=SSDLite320_MobileNet_V3_Large_Weights.DEFAULT)
num_classes = 91 # number of classes in the dataset

# Replace the Faster R-CNN box predictor with a new FastRCNNPredictor that
# has num_classes outputs. The other two detectors keep their original heads.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# NOTE(review): this optimizer and scheduler are built from `model` (the
# Faster R-CNN) only; the parameters of model_retina and model_ssd are not included.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)


# Train the model
def _annotated_only(batch):
    """Split a batch of (image, target) pairs, dropping images without boxes.

    Returns two parallel lists (images, targets) already moved to `device`.
    """
    images, targets = [], []
    for image, target in batch:
        if target['boxes'].shape[0] == 0:
            continue
        images.append(image.to(device))
        targets.append({k: v.to(device) for k, v in target.items()})
    return images, targets


def train_the_model(model, num_epochs=2, save_path="fine_tuned_model.pth"):
    """Fine-tune `model` on `train_loader` and report a label F1 on `val_loader`.

    Args:
        model: Detection model that returns a dict of losses in train mode and
            a list of prediction dicts (with a ``'labels'`` entry) in eval mode.
            It must already be on `device`.
        num_epochs: Number of passes over `train_loader`.
        save_path: Where the final ``state_dict`` is written. The default is
            shared, so training several models without overriding it leaves
            only the last one on disk.

    The reported F1 compares predicted and true label lists position by
    position, padding the shorter list with 0. It is a rough indicator, not a
    detection metric such as mAP.
    """
    # Build the optimizer from the model that was passed in. A module-level
    # optimizer created for one specific model would leave every other model
    # untouched, and a shared scheduler would carry its step count across runs.
    trainable = [p for p in model.parameters() if p.requires_grad]
    model_optimizer = torch.optim.SGD(trainable, lr=0.005, momentum=0.9, weight_decay=0.0005)
    model_scheduler = torch.optim.lr_scheduler.StepLR(model_optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        model.train()

        train_loss = []
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            images, targets = _annotated_only(batch)
            if not images:
                # Every image in this batch was un-annotated; nothing to learn from.
                continue
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            # Store plain floats so no tensors are kept alive between
            # iterations and the printed mean is a number, not a tensor repr.
            train_loss.extend(loss.item() for loss in loss_dict.values())
            model_optimizer.zero_grad()
            losses.backward()
            model_optimizer.step()

        model.eval()
        true_labels = []
        pred_labels = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Val Epoch {epoch + 1}/{num_epochs}"):
                images, targets = _annotated_only(batch)
                if not images:
                    continue
                predictions = model(images)

                # Pair each prediction with the target of the same image;
                # both lists come from the same filtered batch.
                for prediction, target in zip(predictions, targets):
                    pred = prediction['labels'].cpu().tolist()
                    true = target['labels'].cpu().tolist()
                    gap = len(true) - len(pred)
                    if gap > 0:
                        pred.extend([0] * gap)
                    elif gap < 0:
                        # Many low-score predictions: pad the targets instead
                        # of truncating the predictions.
                        true.extend([0] * (-gap))
                    pred_labels.extend(pred)
                    true_labels.extend(true)

        mean_loss = sum(train_loss) / len(train_loss) if train_loss else float("nan")
        f1 = f1_score(true_labels, pred_labels, average='macro') if true_labels else float("nan")
        print("F1 Score: ", f1, "Train Loss: ", mean_loss)
        model_scheduler.step()

    # Save the fine-tuned model
    torch.save(model.state_dict(), save_path)


loading annotations into memory...
Done (t=0.58s)
creating index...
index created!


Downloading: "https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth" to /root/.cache/torch/hub/checkpoints/retinanet_resnet50_fpn_coco-eeacb38b.pth
100%|██████████| 130M/130M [00:01<00:00, 105MB/s]


In [None]:
# Train the detectors one at a time. `del mdl` would only unbind the loop
# variable: the module-level names still reference each model, so its weights
# would stay on the GPU and later runs would run out of memory. Move each model
# back to the CPU when its run is over (even if it failed), then release the
# cached blocks.
for mdl in [model, model_ssd, model_retina]:
    mdl.to(device)
    try:
        train_the_model(mdl)
    finally:
        mdl.to("cpu")
        torch.cuda.empty_cache()

Epoch 1/2:   0%|          | 0/17 [00:00<?, ?it/s]

Val Epoch 1/2:   0%|          | 0/17 [00:00<?, ?it/s]

F1 Score:  0.10309946829205033 Train Loss:  tensor(0.4377, device='cuda:0')
model_eval:  FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1,

Epoch 2/2:   0%|          | 0/17 [00:00<?, ?it/s]

Val Epoch 2/2:   0%|          | 0/17 [00:00<?, ?it/s]

F1 Score:  0.15675034591816564 Train Loss:  tensor(0.1613, device='cuda:0')
model_eval:  FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1,

Epoch 1/2:   0%|          | 0/17 [00:00<?, ?it/s]

Val Epoch 1/2:   0%|          | 0/17 [00:00<?, ?it/s]

F1 Score:  0.0010345785307596166 Train Loss:  tensor(3.5049, device='cuda:0')
model_eval:  SSD(
  (backbone): SSDLiteFeatureExtractorMobileNet(
    (features): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, aff

Epoch 2/2:   0%|          | 0/17 [00:00<?, ?it/s]

Val Epoch 2/2:   0%|          | 0/17 [00:00<?, ?it/s]

F1 Score:  0.0010164026356371794 Train Loss:  tensor(3.4500, device='cuda:0')
model_eval:  SSD(
  (backbone): SSDLiteFeatureExtractorMobileNet(
    (features): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, aff

Epoch 1/2:   0%|          | 0/17 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [None]:
# Sanity check: list the annotation JSON to confirm it is present on disk.
! ls -l ./annotations/instances_val2017.json

In [None]:
# TODO: scratch cell with placeholder text only - remove before sharing the notebook.