<a href="https://colab.research.google.com/github/Samin-Sadaf7/Image_works/blob/main/VisionTransformers_Image_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install einops



In [23]:
import torch
from torchvision.datasets import OxfordIIITPet
import matplotlib.pyplot as plt
from random import random
from torchvision.transforms import ToTensor, Resize
from torchvision.transforms.functional import to_pil_image

In [24]:
# Preprocessing steps applied to every image, in order:
# resize to 144x144, then convert to a tensor.
to_tensor = [
    Resize((144, 144)),
    ToTensor(),
]

In [25]:
class Compose:
    """Chain image transforms while passing the target through untouched.

    Instances are called with ``(image, target)`` and return a pair of the
    same form, so only the image is ever modified.
    """

    def __init__(self, transforms):
        """Store ``transforms``, an iterable of one-argument callables."""
        self.transforms = transforms

    def __call__(self, image, target):
        """Apply each stored transform to ``image`` in order.

        Returns:
            ``(transformed_image, target)`` with ``target`` unchanged.
        """
        transformed = image
        for transform in self.transforms:
            transformed = transform(transformed)
        return transformed, target

In [26]:
def show_images(images, num_samples=40, cols=8):
    """Plot up to ``num_samples`` evenly spaced images from ``images``.

    Args:
        images: Sized, indexable collection whose items are sequences with
            the image in position 0 (e.g. ``(image, target)`` pairs). The
            image must be something ``to_pil_image`` accepts.
        num_samples: Maximum number of images to draw; must be >= 1.
        cols: Number of columns in the subplot grid; must be >= 1.

    Raises:
        ValueError: If ``num_samples`` or ``cols`` is smaller than 1.
    """
    if num_samples < 1 or cols < 1:
        raise ValueError("num_samples and cols must both be >= 1")

    plt.figure(figsize=(15, 15))
    # Prints the collection's repr (for a torchvision dataset, its summary).
    print(images)

    # Size the stride from the argument itself, not from a global, and never
    # let it drop to 0 when there are fewer items than requested samples.
    total = len(images)
    step = max(total // num_samples, 1)
    rows = num_samples // cols + 1

    # Index only the items that are drawn instead of iterating (and thereby
    # loading) every item; the cap keeps the plot count within the grid.
    picked = range(0, total, step)[:num_samples]
    for slot, index in enumerate(picked, start=1):
        plt.subplot(rows, cols, slot)
        plt.imshow(to_pil_image(images[index][0]))

In [27]:
# Build the Oxford-IIIT Pet dataset rooted at the current directory
# (download=True asks torchvision to fetch it), running every sample through
# the Compose(to_tensor) pipeline, then preview a grid of images.
dataset = OxfordIIITPet(
    root=".",
    download=True,
    transforms=Compose(to_tensor),
)
show_images(dataset)

Dataset OxfordIIITPet
    Number of datapoints: 3680
    Root location: .
    <__main__.Compose object at 0x79654e2e0610>


In [34]:
from torch import nn
from einops.layers.torch import Rearrange
from torch import Tensor

class PatchEmbedding(nn.Module):
    """Cut an image batch into square patches and linearly embed each one.

    The einops pattern maps ``(b, c, h*p, w*p)`` to ``(b, h*w, p*p*c)``, so
    the input height and width need to be multiples of ``patch_size``. A
    linear layer then projects every flattened patch to ``emb_size``.

    Args:
        in_channels: Number of channels in the input images.
        patch_size: Side length, in pixels, of each square patch.
        emb_size: Size of the embedding produced for each patch.
    """

    def __init__(self, in_channels = 3, patch_size = 8, emb_size = 128):
        # Initialise nn.Module first, before any attribute is set on self.
        super().__init__()
        self.patch_size = patch_size
        self.projection = nn.Sequential(
            # Flatten each (p1 x p2 x c) patch into one vector per patch.
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size),
            nn.Linear(patch_size * patch_size * in_channels, emb_size)
        )

    def forward(self, x: Tensor) -> Tensor:
        """Return patch embeddings of shape ``(b, num_patches, emb_size)``."""
        return self.projection(x)

In [35]:
# Take the image of the first sample and add a leading batch axis.
sample_datapoint = dataset[0][0].unsqueeze(0)
print("Initial shape: ", sample_datapoint.shape)

# Run it through a freshly initialised PatchEmbedding to inspect the output shape.
embedding = PatchEmbedding()(sample_datapoint)
print("Patches shape: ", embedding.shape)

Initial shape:  torch.Size([1, 3, 144, 144])
Patches shape:  torch.Size([1, 324, 128])


In [36]:
from einops import rearrange

class Attention(nn.Module):
    """Multi-head self-attention over inputs laid out as (batch, tokens, dim).

    The input is passed through separate learned query/key/value linear
    projections, which are then fed to ``nn.MultiheadAttention``.

    Args:
        dim: Embedding size of each token.
        n_heads: Number of attention heads (forwarded to MultiheadAttention).
        dropout: Dropout value forwarded to MultiheadAttention.
    """

    def __init__(self, dim, n_heads, dropout):
        super().__init__()
        self.n_heads = n_heads
        # batch_first=True: every caller in this notebook passes
        # (batch, tokens, dim). With the default layout the first axis is
        # treated as the sequence, so attention would mix different samples
        # of the batch instead of the tokens of one sample.
        self.attention = nn.MultiheadAttention(
            dim, n_heads, dropout=dropout, batch_first=True
        )
        self.q = torch.nn.Linear(dim, dim)
        self.k = torch.nn.Linear(dim, dim)
        self.v = torch.nn.Linear(dim, dim)

    def forward(self, x):
        """Return the attention output, same shape as ``x``."""
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        # Feed the projections (not the raw input) so the q/k/v layers
        # actually take part in the computation and receive gradients.
        attn_output, _ = self.attention(q, k, v)
        return attn_output

In [37]:
# Shape check: pass a (1, 5, 128) tensor of ones through Attention.
Attention(dim=128, n_heads=4, dropout=0.0)(
    torch.ones(1, 5, 128)
).shape

torch.Size([1, 5, 128])

In [38]:
class PreNorm(nn.Module):
    """Apply LayerNorm over the last ``dim`` features, then call ``fn``.

    Args:
        dim: Feature size handed to ``nn.LayerNorm``.
        fn: Callable invoked on the normalised tensor.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalise ``x`` and forward it, with any ``kwargs``, to ``fn``."""
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)

In [39]:
# Shape check: wrap Attention in PreNorm and pass a (1, 5, 128) tensor of ones.
norm = PreNorm(
    128,
    Attention(dim=128, n_heads=4, dropout=0.0),
)
norm(torch.ones(1, 5, 128)).shape

torch.Size([1, 5, 128])

In [41]:
class FeedForward(nn.Sequential):
    """MLP block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: Input and output feature size.
        hidden_dim: Feature size between the two linear layers.
        dropout: Probability used by both dropout layers.
    """

    def __init__(self, dim, hidden_dim, dropout=0):
        layers = [
            nn.Linear(dim, hidden_dim),   # expand to the hidden width
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),   # project back to the input width
            nn.Dropout(dropout),
        ]
        super().__init__(*layers)

# Shape check: pass a (1, 5, 128) tensor of ones through FeedForward.
ff = FeedForward(
    dim=128,
    hidden_dim=256,
)
ff(torch.ones(1, 5, 128)).shape

torch.Size([1, 5, 128])

In [42]:
class ResidualAdd(nn.Module):
    """Wrap ``fn`` with a skip connection: ``forward(x) = fn(x) + x``.

    Args:
        fn: Callable applied to the input; its result must be addable to
            the input.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(x, **kwargs) + x`` without modifying any tensor in place."""
        # Out-of-place add: an in-place ``+=`` on fn's result would overwrite
        # the caller's tensor whenever fn returns its argument, and breaks
        # backward when fn's output is saved for gradient computation.
        return self.fn(x, **kwargs) + x

In [43]:
# Shape check: wrap Attention in ResidualAdd and pass a (1, 5, 128) tensor of ones.
residual_att = ResidualAdd(
    Attention(dim=128, n_heads=4, dropout=0.0)
)
residual_att(torch.ones(1, 5, 128)).shape

torch.Size([1, 5, 128])