In [None]:
!pip install torchinfo
import torch
import torchvision

import torch.utils.data as Dataloader
from torchvision import transforms,datasets
from torch import nn
from torchinfo import summary

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
class PatchEmbedding(nn.Module):
  """Turn a batch of images into a sequence of patch embeddings.

  A convolution whose kernel size and stride both equal ``patch_size`` maps
  each non-overlapping patch to one ``embedding_dim``-sized vector; the
  resulting spatial grid is then flattened into a sequence.

  Args:
    in_channels: Number of channels of the input images.
    patch_size: Side length of each square patch (used as both kernel size
      and stride of the convolution).
    embedding_dim: Length of the vector produced for every patch.
  """

  def __init__(self,in_channels:int=3,
               patch_size:int=16,
               embedding_dim:int=768) :
    super().__init__()

    # kernel_size == stride and no padding, so patches do not overlap
    self.patcher = nn.Conv2d(in_channels = in_channels,
                             out_channels = embedding_dim,
                             kernel_size = patch_size,
                             stride = patch_size,
                             padding=0)
    # merges the two spatial dimensions (dims 2 and 3) into a single one
    self.flatten = nn.Flatten(start_dim=2,end_dim=3)

  def forward(self,x) :
    """Embed the patches of ``x``.

    Args:
      x: 4-D tensor laid out as (batch, channels, height, width).

    Returns:
      Tensor of shape (batch, num_patches, embedding_dim).
    """
    # NOTE: no shape printing here -- forward runs for every batch, so debug
    # prints would flood the output of any training or evaluation loop.
    # image to patches: (batch, embedding_dim, grid_h, grid_w)
    x = self.patcher(x)
    # flatten the patch grid: (batch, embedding_dim, num_patches)
    x_flattened = self.flatten(x)
    # permute to batch_size * num_patches * embedding_dim
    return x_flattened.permute(0,2,1)


In [None]:
class MultiHeadSelfAttentionBlock(nn.Module):
  """Layer norm -> multi-head self-attention -> residual connection.

  Args:
    embed_dim: Size of the last dimension of the token tensor.
    num_heads: Number of attention heads.
    att_dropout: Dropout probability passed to the attention layer.
  """

  def __init__(self,
               embed_dim:int=768,
               num_heads:int=12,
               att_dropout:float=0,
               ):
    super().__init__()
    # normalisation applied to the tokens before attention
    self.layer_norm = nn.LayerNorm(embed_dim)
    # batch_first=True: tensors are laid out as (batch, sequence, feature)
    self.msa = nn.MultiheadAttention(embed_dim,
                                     num_heads,
                                     dropout=att_dropout,
                                     batch_first=True)

  def forward(self, input):
    """Return the attention output added to the un-normalised ``input``."""
    normed = self.layer_norm(input)
    # self-attention: the same tensor serves as query, key and value;
    # the attention weights are not requested, only the output is kept
    attended = self.msa(normed, normed, normed, need_weights=False)[0]
    # skip connection around norm + attention
    return attended + input



In [None]:
class MultiLayerPerceptronBlock(nn.Module):
  """Layer norm -> two-layer GELU MLP -> residual connection.

  Args:
    embed_dim: Size of the last dimension of the input and output tensors.
    mlp_size: Width of the hidden layer.
    dropout: Dropout probability applied after each linear layer's stage.
  """

  def __init__(self,
               embed_dim:int=768,
               mlp_size:int=3072,
               dropout:float=0.1) :
    super().__init__()
    # normalisation applied before the MLP
    self.layer_norm = nn.LayerNorm(embed_dim)
    # expand to mlp_size, apply GELU, project back to embed_dim
    self.mlp = nn.Sequential(
        nn.Linear(in_features=embed_dim, out_features=mlp_size),
        nn.GELU(),
        nn.Dropout(dropout),
        nn.Linear(in_features=mlp_size, out_features=embed_dim),
        nn.Dropout(dropout),
    )

  def forward(self,input):
    """Return the MLP output added to the un-normalised ``input``."""
    hidden = self.mlp(self.layer_norm(input))
    # skip connection around norm + MLP
    return hidden + input

In [None]:
class TransformerEncoderBlock(nn.Module):
  """One encoder layer: an attention sub-block followed by an MLP sub-block.

  Args:
    embed_dim: Size of the last dimension of the token tensor.
    mlp_size: Hidden width of the MLP sub-block.
    num_heads: Number of attention heads.
    att_dropout: Dropout probability inside the attention sub-block.
    mlp_dropout: Dropout probability inside the MLP sub-block.
  """

  def __init__(self,
               embed_dim:int=768,
               mlp_size:int=3072,
               num_heads:int=12,
               att_dropout:float=0.0,
               mlp_dropout:float=0.1):
    super().__init__()
    # attention sub-block (layer norm + multi-head self-attention + skip connection)
    self.msa = MultiHeadSelfAttentionBlock(
        embed_dim=embed_dim,
        num_heads=num_heads,
        att_dropout=att_dropout,
    )
    # MLP sub-block (layer norm + multi-layer perceptron + skip connection)
    self.mlp = MultiLayerPerceptronBlock(
        embed_dim=embed_dim,
        mlp_size=mlp_size,
        dropout=mlp_dropout,
    )

  def forward(self,input):
    """Apply the attention sub-block, then the MLP sub-block."""
    return self.mlp(self.msa(input))



In [None]:
# Build a single encoder block with its default hyperparameters
transformer_block = TransformerEncoderBlock()

# Per-layer output shapes and parameter counts for a (1, 196, 768) input
summary(transformer_block, input_size=(1,196,768))

Layer (type:depth-idx)                   Output Shape              Param #
TransformerEncoderBlock                  [1, 196, 768]             --
├─MultiHeadSelfAttentionBlock: 1-1       [1, 196, 768]             --
│    └─LayerNorm: 2-1                    [1, 196, 768]             1,536
│    └─MultiheadAttention: 2-2           [1, 196, 768]             2,362,368
├─MultiLayerPerceptronBlock: 1-2         [1, 196, 768]             --
│    └─LayerNorm: 2-3                    [1, 196, 768]             1,536
│    └─Sequential: 2-4                   [1, 196, 768]             --
│    │    └─Linear: 3-1                  [1, 196, 3072]            2,362,368
│    │    └─GELU: 3-2                    [1, 196, 3072]            --
│    │    └─Dropout: 3-3                 [1, 196, 3072]            --
│    │    └─Linear: 3-4                  [1, 196, 768]             2,360,064
│    │    └─Dropout: 3-5                 [1, 196, 768]             --
Total params: 7,087,872
Trainable params: 7,087,872
Non-tr

In [None]:
class VIT(nn.Module):
  """Vision Transformer image classifier.

  The image is split into patches and embedded, a learnable class token is
  prepended, learnable positional embeddings are added, the sequence passes
  through a stack of transformer encoder blocks, and the class-token output
  is fed to a LayerNorm + Linear classification head.

  Args:
    img_size: Side length of the (square) input images.
    num_classes: Number of output logits.
    num_transformer_block: Number of stacked encoder blocks.
    in_channels: Number of channels of the input images.
    patch_size: Side length of each square patch; must divide ``img_size``.
    embed_dim: Embedding size used throughout the model.
    mlp_size: Hidden width of the MLP inside every encoder block.
    num_heads: Number of attention heads inside every encoder block.
    att_dropout: Dropout probability inside the attention layers.
    mlp_dropout: Dropout probability inside the MLP layers.
    embedding_dropout: Dropout probability applied to the embedded sequence.

  Raises:
    AssertionError: If ``img_size`` is not divisible by ``patch_size``.
  """

  def __init__(self,
               img_size:int=224,
               num_classes:int=2,
               num_transformer_block:int=12,
               in_channels:int=3,
               patch_size:int=16,
               embed_dim:int=768,
               mlp_size:int=3072,
               num_heads:int=12,
               att_dropout:float=0.0,
               mlp_dropout:float=0.1,
               embedding_dropout:float=0.0
               ):
    super().__init__()

    assert img_size % patch_size == 0, f"Input image size must be divisible by patch size, image shape: {img_size}, patch size: {patch_size}"

    # Exact integer arithmetic: patches per side, squared (no float round-trip)
    self.num_patches = (img_size // patch_size) ** 2

    # Learnable class token, one per model; expanded over the batch in forward.
    # NOTE(review): torch.rand draws from uniform [0, 1); confirm that a
    # non-zero-mean initialisation is intended for this and the positional
    # embedding below.
    self.classEmbedding = nn.Parameter(data=torch.rand((1,1,embed_dim)),requires_grad=True)

    self.patchEmbedding = PatchEmbedding(in_channels=in_channels,patch_size=patch_size,embedding_dim=embed_dim)

    # One learnable position vector per patch plus one for the class token
    self.positionalEmbedding = nn.Parameter(data=torch.rand((1,self.num_patches+1,embed_dim)),requires_grad=True)

    self.embedding_dropout = nn.Dropout(p=embedding_dropout)

    # Stack of identical encoder blocks applied one after another
    self.transformerEncoder = nn.Sequential(*[TransformerEncoderBlock(embed_dim=embed_dim,
                                                                      mlp_size=mlp_size,
                                                                      num_heads=num_heads,
                                                                      att_dropout=att_dropout,
                                                                      mlp_dropout=mlp_dropout) for _ in range(num_transformer_block)])
    # Classification head applied to the class-token output only
    self.classifier = nn.Sequential(
        nn.LayerNorm(normalized_shape=embed_dim),
        nn.Linear(in_features=embed_dim,out_features=num_classes)
    )

  def forward(self,input):
    """Classify a batch of images.

    Args:
      input: Image tensor laid out as (batch, in_channels, height, width).
        The number of patches it yields must equal ``self.num_patches``,
        otherwise adding the positional embedding fails.

    Returns:
      Logits tensor of shape (batch, num_classes).
    """
    batchSize = input.shape[0]
    # (batch, num_patches, embed_dim)
    x = self.patchEmbedding(input)
    # replicate the single class token for every sample (expand makes no copy)
    cls_tokens = self.classEmbedding.expand((batchSize,-1,-1))
    # prepend the class token: (batch, num_patches + 1, embed_dim)
    x = torch.concat((cls_tokens,x),dim=1)
    # positional embedding broadcasts over the batch dimension
    x = self.positionalEmbedding + x
    x = self.embedding_dropout(x)
    x = self.transformerEncoder(x)
    # index 0 along the sequence dimension is the class token
    output = self.classifier(x[:,0])
    return output









In [None]:
# Smoke test: push one random 3-channel 224x224 image through a default VIT
image = torch.rand(1, 3, 224, 224)

model = VIT()

# One row of logits per image in the batch
output = model(image)
print(output)

torch.Size([1, 3, 224, 224])
torch.Size([1, 768, 14, 14])
tensor([[ 0.2373, -0.3200]], grad_fn=<AddmmBackward0>)


In [None]:
# No input size given, so only the layer tree and parameter counts are listed
summary(model=model)

Layer (type:depth-idx)                                            Param #
VIT                                                               152,064
├─PatchEmbedding: 1-1                                             --
│    └─Conv2d: 2-1                                                590,592
│    └─Flatten: 2-2                                               --
├─Dropout: 1-2                                                    --
├─Sequential: 1-3                                                 --
│    └─TransformerEncoderBlock: 2-3                               --
│    │    └─MultiHeadSelfAttentionBlock: 3-1                      2,363,904
│    │    └─MultiLayerPerceptronBlock: 3-2                        4,723,968
│    └─TransformerEncoderBlock: 2-4                               --
│    │    └─MultiHeadSelfAttentionBlock: 3-3                      2,363,904
│    │    └─MultiLayerPerceptronBlock: 3-4                        4,723,968
│    └─TransformerEncoderBlock: 2-5                         

NameError: name 'tar' is not defined