# Vision Transformer (ViT)

## Модель

In [None]:
import torch
from torch import nn

In [None]:
# Simulate a small batch of input data.
SEED = 0  # fixed seed so that Restart & Run All reproduces the same tensor
torch.manual_seed(SEED)

n_features = 10  # number of input features
n_classes = 3  # number of target classes
batch_size = 5

data = torch.randn((batch_size, n_features))
print(data.shape)
print(data)

torch.Size([5, 10])
tensor([[ 0.5724, -0.6533,  1.1049, -0.4849, -1.7450,  1.1229, -0.9716,  0.7787,
          1.1459, -0.2301],
        [-0.2087, -0.3661, -0.7727,  1.9516,  1.5495,  1.6531, -1.0622,  0.5459,
          0.2340, -1.3714],
        [ 0.1229, -1.1388,  0.8286,  1.2500, -0.6799,  1.2717,  2.5585,  0.5955,
         -0.0385, -0.9558],
        [-0.9188,  1.2414,  0.1035, -0.8419,  1.6002, -0.3624,  0.7136, -0.1262,
         -0.9676, -1.4855],
        [-1.1576,  0.6840, -0.5990, -1.0105,  0.4938, -0.5943,  1.1988, -0.7912,
          1.4762,  0.3026]])


In [None]:
# Define a simple model: a single fully connected layer
model = nn.Linear(in_features=n_features, out_features=n_classes)

In [None]:
# Apply the model to the data and show the shape and values of the result
answer = model(data)
print(answer.shape, answer, sep='\n')

torch.Size([5, 3])
tensor([[-0.1430, -0.1978, -0.2459],
        [ 0.0568,  0.2415,  1.2328],
        [ 0.3432,  0.4882, -0.8710],
        [-0.0182,  0.0482,  0.2527],
        [ 0.3254, -0.0420,  0.0331]], grad_fn=<AddmmBackward>)


In [None]:
# The same model written as an nn.Module subclass
class SimpleNN(nn.Module):
    """A single linear layer mapping ``n_features`` inputs to ``n_classes`` outputs."""

    def __init__(self, n_features, n_classes):
        super(SimpleNN, self).__init__()
        self.lin = nn.Linear(in_features=n_features, out_features=n_classes)

    def forward(self, x):
        """Return the output of the linear layer applied to ``x``."""
        out = self.lin(x)
        return out

In [None]:
# Apply the class-based model to the same data
model = SimpleNN(n_features=n_features, n_classes=n_classes)

answer = model(data)
print(answer.shape, answer, sep='\n')

torch.Size([5, 3])
tensor([[ 0.5724,  0.0264, -0.9001],
        [ 0.3813,  0.7960, -0.4653],
        [ 0.2878, -0.9297, -0.8761],
        [-0.4443, -0.0975, -0.2454],
        [-0.1670, -0.1594,  0.1794]], grad_fn=<AddmmBackward>)


In [None]:
!pip install torchsummary
from torchsummary import summary

model = SimpleNN(n_features, n_classes).cuda()

# 5, 10
input_size = (batch_size, n_features)
print(summary(model, input_size))

You should consider upgrading via the '/home/yessense/PycharmProjects/scene_vae/venv/bin/python -m pip install --upgrade pip' command.[0m
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 5, 3]              33
Total params: 33
Trainable params: 33
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------
None


In [None]:
# The same model wrapped in nn.Sequential
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=n_classes),
)

answer = model(data)
print(answer.shape, answer, sep='\n')

In [None]:
# The same model stored in an nn.ModuleList
model = nn.ModuleList(
    [nn.Linear(in_features=n_features, out_features=n_classes)]
)

# nn.ModuleList is only a container and has no forward pass of its own, so
# `model(data)` is deliberately not called here; the layer is picked out by
# index and applied directly instead.
answer = model[0](data)
print(answer.shape, answer, sep='\n')


torch.Size([5, 3])
tensor([[-0.3989, -1.1825,  0.7602],
        [-0.0798,  0.7053, -0.0033],
        [-0.2838, -0.0166, -0.5549],
        [-0.1167,  0.6097, -0.4273],
        [-0.2631,  0.2675, -0.2055]], grad_fn=<AddmmBackward>)


In [None]:
# Check which layers end up among the model parameters
class ParametersCheck(nn.Module):
    """Stores an identical linear layer in four different kinds of attribute.

    The layer is kept directly, inside ``nn.Sequential``, inside
    ``nn.ModuleList`` and inside a plain Python ``list``, so the result of
    ``parameters()`` can be compared across the four storage styles.
    """

    def __init__(self, n_features, n_classes):
        super().__init__()

        def new_layer():
            # Every call builds a fresh, independently initialised layer.
            return nn.Linear(n_features, n_classes)

        self.lin = new_layer()
        self.seq = nn.Sequential(new_layer())
        self.module_list = nn.ModuleList([new_layer()])
        # A plain Python list, not an nn.Module container.
        self.list_of_layers = [new_layer()]


In [None]:
model = ParametersCheck(n_features, n_classes)

# List the shape of every parameter the module reports, numbered from 1
for i, param in enumerate(model.parameters()):
    print(f'Параметр #{i + 1}.', f'\t{param.shape}', sep='\n')

Параметр #1.
	torch.Size([3, 10])
Параметр #2.
	torch.Size([3])
Параметр #3.
	torch.Size([3, 10])
Параметр #4.
	torch.Size([3])
Параметр #5.
	torch.Size([3, 10])
Параметр #6.
	torch.Size([3])


## ViT

![alt text](https://drive.google.com/uc?export=view&id=1J5TvycDPs8pzfvlXvtO5MCFBy64yp9Fa)

In [None]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary

![](https://amaarora.github.io/images/vit-01.png)

## Часть 1. Patch Embedding, CLS Token, Position Encoding

![](https://amaarora.github.io/images/vit-02.png)

In [None]:
# Input image batch laid out as `B, C, H, W`
x = torch.randn(1, 3, 224, 224)
# A 2D convolution whose kernel size equals its stride (16) maps every
# non-overlapping 16x16 patch to one 768-dimensional vector.
conv = nn.Conv2d(3, 768, kernel_size=16, stride=16)
# (B, 768, 14, 14) -> (B, 768, 196) -> (B, 196, 768). Flattening only the two
# spatial axes keeps the batch axis and avoids hard-coding the patch count.
patches = conv(x).flatten(2).transpose(1, 2)
patches.shape

torch.Size([196, 768])

In [None]:
class PatchEmbedding(nn.Module):
    """Image to Patch Embedding.

    Splits a square image into non-overlapping ``patch_size`` x ``patch_size``
    patches and projects each patch to an ``embed_dim``-dimensional vector.

    Args:
        img_size: side length of the (square) input image in pixels.
        patch_size: side length of one (square) patch in pixels.
        in_chans: number of input image channels.
        embed_dim: size of the embedding produced for every patch.

    Raises:
        ValueError: if ``img_size`` is not a multiple of ``patch_size``.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()

        if img_size % patch_size != 0:
            raise ValueError(
                f"img_size ({img_size}) must be divisible by patch_size ({patch_size})"
            )

        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2

        # A convolution with kernel_size == stride == patch_size looks at each
        # patch exactly once, i.e. it is a linear projection of every patch.
        self.patch_embeddings = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, image):
        """Map ``image`` of shape (B, C, H, W) to patch tokens of shape (B, N, embed_dim)."""
        # (B, C, H, W) -> (B, embed_dim, H / patch_size, W / patch_size)
        patches = self.patch_embeddings(image)
        # Merge the two spatial axes into one token axis and move it in front
        # of the embedding axis: (B, embed_dim, N) -> (B, N, embed_dim).
        patches = patches.flatten(2).transpose(1, 2)

        return patches

In [None]:
# Sanity check: embed one random image and look at the shape of the result
patch_embed = PatchEmbedding()
x = torch.randn(1, 3, 224, 224)
patch_embed(x).size()

torch.Size([1, 196, 768])

![](https://amaarora.github.io/images/vit-03.png)

## Часть 2. Transformer Encoder

![](https://amaarora.github.io/images/ViT.png)

![](https://amaarora.github.io/images/vit-07.png)

In [None]:
class MLP(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        in_features: size of the last axis of the input.
        hidden_features: size of the hidden layer; defaults to ``in_features``.
        out_features: size of the last axis of the output; defaults to ``in_features``.
        drop: dropout probability applied after the activation and after the
            second linear layer.
    """
    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.):
        super().__init__()

        if hidden_features is None:
            hidden_features = in_features
        if out_features is None:
            out_features = in_features

        # Linear Layers
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)

        # Activation(s)
        self.act = nn.GELU()
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the network along the last axis of ``x``; leading axes are kept."""
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x

In [None]:
# Sanity check: run the MLP on a random tensor and look at the output shape
x = torch.randn(1, 197, 768)
mlp = MLP(in_features=768, hidden_features=3072, out_features=768)
out = mlp(x)
out.size()

torch.Size([1, 197, 768])

In [None]:
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: embedding size of every token (last axis of the input).
        num_heads: number of attention heads; must divide ``dim``.
        qkv_bias: whether the joint query/key/value projection has a bias.
        attn_drop: dropout probability applied to the attention weights.
        out_drop: dropout probability applied after the output projection.

    Raises:
        ValueError: if ``dim`` is not divisible by ``num_heads``.
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., out_drop=0.):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Scaling the logits by 1 / sqrt(head_dim) before the softmax.
        self.scale = head_dim ** -0.5

        # One linear layer produces queries, keys and values at once.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.out = nn.Linear(dim, dim)
        self.out_drop = nn.Dropout(out_drop)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, dim); the shape is preserved."""
        B, N, C = x.shape

        # Attention
        # (B, N, 3 * C) -> (B, N, 3, heads, head_dim) -> (3, B, heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)

        # (B, heads, N, N): similarity of every token with every other token.
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        # Weighted sum of values, then merge the heads back: (B, N, C).
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        # Out projection
        x = self.out(x)
        x = self.out_drop(x)

        return x


![](https://amaarora.github.io/images/vit-08.png)

In [None]:
# attn = (q @ k.transpose(-2, -1)) * self.scale
# attn = attn.softmax(dim=-1)

In [None]:
# Sanity check: run attention on a random tensor and look at the output shape
x = torch.randn(1, 197, 768)
attention = Attention(dim=768, num_heads=8)
out = attention(x)
out.size()

torch.Size([1, 197, 768])

In [None]:
class Block(nn.Module):
    """One pre-norm Transformer encoder block.

    Computes ``x + Dropout(Attention(LayerNorm(x)))`` followed by
    ``x + MLP(LayerNorm(x))``.

    Args:
        dim: embedding size of every token.
        num_heads: number of attention heads.
        mlp_ratio: hidden size of the MLP relative to ``dim``.
        drop_rate: dropout probability used for the attention weights, for the
            attention branch output and inside the MLP.
    """
    def __init__(self, dim, num_heads=8, mlp_ratio=4, drop_rate=0.):
        super().__init__()

        # Normalization
        self.norm1 = nn.LayerNorm(dim)

        # Attention
        self.attn = Attention(dim, num_heads=num_heads, attn_drop=drop_rate)

        # Dropout
        self.drop = nn.Dropout(drop_rate)

        # Normalization
        self.norm2 = nn.LayerNorm(dim)

        # MLP
        self.mlp = MLP(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),  # mlp_ratio may be a float
            out_features=dim,
            drop=drop_rate,
        )

    def forward(self, x):
        """Apply the block to ``x`` of shape (B, N, dim); the shape is preserved."""
        # Attention
        x = x + self.drop(self.attn(self.norm1(x)))

        # MLP (dropout is applied inside the MLP itself)
        x = x + self.mlp(self.norm2(x))
        return x

In [None]:
# Sanity check: run one encoder block on a random tensor
x = torch.randn(1, 197, 768)
block = Block(768, 8)
# Call `block` itself, not the standalone `attention` module from an earlier cell.
out = block(x)
out.shape

torch.Size([1, 197, 768])

В оригинальной реализации теперь используется [DropPath](https://github.com/rwightman/pytorch-image-models/blob/e98c93264cde1657b188f974dc928b9d73303b18/timm/layers/drop.py)

In [None]:
class Transformer(nn.Module):
    """A stack of ``depth`` ``Block`` modules applied one after another."""

    def __init__(self, depth, dim, num_heads=8, mlp_ratio=4, drop_rate=0.):
        super().__init__()
        layers = []
        for _ in range(depth):
            layers.append(Block(dim, num_heads, mlp_ratio, drop_rate))
        # nn.ModuleList registers every block as a submodule.
        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Pass ``x`` through every block in order and return the result."""
        out = x
        for layer in self.blocks:
            out = layer(out)
        return out

In [None]:
# Sanity check: run the whole encoder stack on a random tensor
x = torch.randn(1, 197, 768)
# Kept under its own name so that `block` keeps referring to a single Block.
transformer = Transformer(12, 768)
# Call the stack itself, not the standalone `attention` module from an earlier cell.
out = transformer(x)
out.shape

torch.Size([1, 197, 768])

![](https://amaarora.github.io/images/vit-06.png)

In [None]:
from torch.nn.modules.normalization import LayerNorm

class ViT(nn.Module):
    """Vision Transformer: patch embedding, Transformer encoder and a linear classifier.

    Args:
        img_size: side length of the (square) input image in pixels.
        patch_size: side length of one (square) patch in pixels.
        in_chans: number of input image channels.
        num_classes: number of output classes (size of the returned logits).
        embed_dim: embedding size of every token.
        depth: number of encoder blocks.
        num_heads: number of attention heads per block.
        mlp_ratio: hidden size of each block's MLP relative to ``embed_dim``.
        qkv_bias: stored on the module only; ``Transformer`` takes no such
            argument, so it cannot be forwarded to the attention layers and a
            warning is issued when it is set.
        drop_rate: dropout probability used after the position encoding and
            inside the encoder.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000,
                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., 
                 qkv_bias=False, drop_rate=0.,):
        super().__init__()

        # Store the configuration
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.qkv_bias = qkv_bias
        if qkv_bias:
            import warnings
            warnings.warn(
                "qkv_bias=True is not forwarded: Transformer has no qkv_bias argument"
            )

        # Patch Embeddings, CLS Token, Position Encoding
        self.patch_embed = PatchEmbedding(
            img_size=img_size, patch_size=patch_size,
            in_chans=in_chans, embed_dim=embed_dim,
        )
        num_patches = (img_size // patch_size) ** 2
        # One learnable token that is prepended to every sequence of patches.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One learnable position vector per token (patches + CLS token).
        self.pos_embed = nn.Parameter(torch.randn(1, num_patches + 1, embed_dim) * 0.02)
        self.pos_drop = nn.Dropout(drop_rate)

        # Transformer Encoder
        self.transformer = Transformer(
            depth=depth, dim=embed_dim, num_heads=num_heads,
            mlp_ratio=mlp_ratio, drop_rate=drop_rate,
        )
        self.norm = LayerNorm(embed_dim)

        # Classifier
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Map images ``x`` of shape (B, C, H, W) to class logits of shape (B, num_classes)."""
        # Patch Embeddings, CLS Token, Position Encoding
        x = self.patch_embed(x)                              # (B, N, embed_dim)
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)                # (B, N + 1, embed_dim)
        x = self.pos_drop(x + self.pos_embed)

        # Transformer Encoder
        x = self.transformer(x)
        x = self.norm(x)

        # Classifier: only the CLS token (position 0) is used for prediction
        x = self.classifier(x[:, 0])

        return x

In [None]:
# Sanity check: classify one random image and look at the shape of the logits
x = torch.randn(1, 3, 224, 224)
vit = ViT()
out = vit(x)
out.size()

torch.Size([1, 1000])

# Домашнее задание


1. Выбрать датасет для классификации изображений с размером изображений не менее 64x64.
2. Обучить ViT на таком датасете.
3. Попробовать поменять размерности и посмотреть, что поменяется при обучении.


Примечание:
- Датасеты можно взять [тут](https://pytorch.org/vision/stable/datasets.html#built-in-datasets) или найти в другом месте.
- Из-за того, что ViT учится медленно, количество примеров в датасете можно ограничить до 1–5 тысяч.