<a href="https://colab.research.google.com/github/Samin-Sadaf7/Image_works/blob/main/Transformer_Yolo8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
import os
HOME = os.getcwd()
print(HOME)

/content


In [3]:
!pip install --upgrade torch ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.68-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.68-py3-none-any.whl (913 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m913.6/913.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.68 ultralytics-thop-2.0.14


In [4]:
from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()

Ultralytics 8.3.68 🚀 Python-3.11.11 torch-2.5.1+cu121 CPU (Intel Xeon 2.20GHz)
Setup complete ✅ (2 CPUs, 12.7 GB RAM, 31.1/107.7 GB disk)


In [5]:
from ultralytics import YOLO

from IPython.display import display, Image

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import torch
import torch.nn as nn
from einops import rearrange, repeat

class VisionTransformer(nn.Module):
    def __init__(self, img_size=224, patch_size=16, num_classes=1000, dim=768, depth=12, heads=12, mlp_dim=3072, dropout=0.1, emb_dropout=0.1):
        super(VisionTransformer, self).__init__()
        assert img_size % patch_size == 0, "Image size must be divisible by patch size"
        num_patches = (img_size // patch_size) ** 2
        patch_dim = (3 * patch_size ** 2)

        self.patch_size = patch_size
        self.patch_to_embedding = nn.Linear(patch_dim, dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, heads, mlp_dim, dropout),
            depth
        )

        self.to_cls_token = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        p = self.patch_size
        # Break image into patches
        patches = rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=p, p2=p)
        tokens = self.patch_to_embedding(patches)

        # Add class token and positional embeddings
        b, n, _ = tokens.shape
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b=b)
        tokens = torch.cat((cls_tokens, tokens), dim=1)
        tokens += self.pos_embedding[:, :(n + 1)]
        tokens = self.dropout(tokens)

        # Apply transformer
        tokens = self.transformer(tokens)
        cls_token = tokens[:, 0]
        return self.mlp_head(cls_token)

In [17]:
from ultralytics import YOLO

class YOLOv8WithViT(nn.Module):
    def __init__(self, vit_config, yolo_weights="yolov8n.pt"):
        super(YOLOv8WithViT, self).__init__()
        # Load YOLOv8 model
        self.yolo = YOLO(yolo_weights).model

        # Replace or augment backbone
        self.vit = VisionTransformer(**vit_config)
        self.yolo.model[0] = nn.Sequential(
            self.vit,
            nn.Conv2d(vit_config['dim'], 256, kernel_size=1)  # Match YOLO backbone output channels
        )

    def forward(self, x):
        return self.yolo(x)

In [18]:
# Define ViT configuration
vit_config = {
    "img_size": 224,
    "patch_size": 16,
    "num_classes": 15,
    "dim": 768,
    "depth": 12,
    "heads": 12,
    "mlp_dim": 3072,
    "dropout": 0.1,
    "emb_dropout": 0.1
}

In [19]:
# Create the model
model = YOLOv8WithViT(vit_config)



In [20]:
model

YOLOv8WithViT(
  (yolo): DetectionModel(
    (model): Sequential(
      (0): Sequential(
        (0): VisionTransformer(
          (patch_to_embedding): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (transformer): TransformerEncoder(
            (layers): ModuleList(
              (0-11): 12 x TransformerEncoderLayer(
                (self_attn): MultiheadAttention(
                  (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
                )
                (linear1): Linear(in_features=768, out_features=3072, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear2): Linear(in_features=3072, out_features=768, bias=True)
                (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (dropout1): Dropout(p=0.1, inplace=False)
  

In [13]:
original_model = YOLO('yolov8n.pt')

In [21]:
model.train= original_model.train

In [22]:
model

YOLOv8WithViT(
  (yolo): DetectionModel(
    (model): Sequential(
      (0): Sequential(
        (0): VisionTransformer(
          (patch_to_embedding): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (transformer): TransformerEncoder(
            (layers): ModuleList(
              (0-11): 12 x TransformerEncoderLayer(
                (self_attn): MultiheadAttention(
                  (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
                )
                (linear1): Linear(in_features=768, out_features=3072, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
                (linear2): Linear(in_features=3072, out_features=768, bias=True)
                (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (dropout1): Dropout(p=0.1, inplace=False)
  

In [23]:
# Fine-tune the model
model.train(data='/content/drive/MyDrive/Dataset- -Conference/dataset.yaml', epochs=50)

Ultralytics 8.3.68 🚀 Python-3.11.11 torch-2.5.1+cu121 CPU (Intel Xeon 2.20GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=/content/drive/MyDrive/Dataset- -Conference/dataset.yaml, epochs=50, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show

100%|██████████| 755k/755k [00:00<00:00, 21.1MB/s]


Overriding model.yaml nc=80 with nc=15

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytic

[34m[1mtrain: [0mScanning /content/drive/MyDrive/Dataset- -Conference/train/labels.cache... 2096 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2096/2096 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


  check_for_updates()
[34m[1mval: [0mScanning /content/drive/MyDrive/Dataset- -Conference/valid/labels.cache... 256 images, 0 backgrounds, 0 corrupt: 100%|██████████| 256/256 [00:00<?, ?it/s]


Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000526, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 50 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  0%|          | 0/131 [00:10<?, ?it/s]


KeyboardInterrupt: 