In [1]:
# import packages here
import torch

from torch.autograd import Variable
from torch import Tensor
import torch.nn as nn

In [5]:
#--------------------------------------------------
#       Model Training Function
#--------------------------------------------------
import torch.optim as optim
import time

def trainModel(net, trainloader, train_option, testloader=None):
  loss_func = nn.CrossEntropyLoss()
  lr = train_option['lr']
  epoch = train_option['epoch']
  device = train_option['device'] if 'device' in train_option.keys() else 'cpu'
  log_iter = train_option['log_iter'] if 'log_iter' in train_option.keys() else 20
  eval_epoch = 1

  if 'optimizer' in train_option.keys():
    optimizer = train_option['optimizer']
  else:
    optimizer = optim.Adam(net.parameters(), lr=lr)

  start_time = time.time()
  if device == 'gpu':
    net = net.cuda()

  iters = 0
  running_loss = 0.0
  for ep in range(epoch):
    net.train()
    for iter, (x, y) in enumerate(trainloader):
      iters += 1
      batch_x = Variable(x).float()
      batch_y = Variable(y).long()
      if device == 'gpu':
        batch_x = batch_x.cuda()
        batch_y = batch_y.cuda()

      outputs = net(batch_x)
      loss = loss_func(outputs, batch_y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      running_loss += loss.item()

      time_lapse = time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))
      if iter % log_iter == 0:
        print('Epoch:{:2d} | Iter:{:5d} | Time: {} | Train Loss: {:.4f} | Average Loss: {:.4f} '.format(ep+1, iter, time_lapse, loss.item(), running_loss/iters))

    if testloader is not None and ep % eval_epoch == 0:
      evalModel(net, testloader)


In [6]:
#--------------------------------------------------
#       Model Evaluating Function
#--------------------------------------------------
import time

def evalModel(net, testloader):
  acc = 0.0
  count = 0
  start_time = time.time()
  device = 'gpu' if next(net.parameters()).is_cuda else 'cpu'
  net.eval()

  for iter, (x, y) in enumerate(testloader):
        count += x.shape[0]
        batch_x = Variable(x).float()
        batch_y = Variable(y).long()
        if device == 'gpu':
          batch_x = batch_x.cuda()
          batch_y = batch_y.cuda()
        outputs = net(batch_x)
        acc += torch.sum(outputs.max(1)[1]==batch_y)

  time_lapse = time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))
  print('Accuracy: {:5f} | Time: {}'.format(acc/count,time_lapse))


First I will build a Transformer Encoder Layer. Vison Transformers consist of multiple transformer encoder layers (i.e. the left block of the following image). My implementation includes a multi-head attention layer, a feed forward layer, two norm layers and two residual connection.

<img src="https://production-media.paperswithcode.com/method_collections/trans.jpeg" width="400">

In [7]:
class TransformerEncoderLayer(nn.Module):
  def __init__(self,
               embedding_dims = 128,
               dropout=0.1,
               mlp_hidden_dim = 32,
               num_heads = 2,
               ):
    super().__init__()


    # Multihead Attn
    self.attention = nn.MultiheadAttention(embed_dim=embedding_dims, num_heads=num_heads, dropout=dropout)

    # Feedforward Layer
    self.feed_forward = nn.Sequential(
      nn.Linear(embedding_dims, mlp_hidden_dim),
      nn.ReLU(),
      nn.Linear(mlp_hidden_dim, embedding_dims),
    )

    # Layer Norm
    self.norm1 = nn.LayerNorm(embedding_dims)
    self.norm2 = nn.LayerNorm(embedding_dims)

    # Dropout
    self.dropout = nn.Dropout(dropout)


  def forward(self, x):

    # Multi-Head Self-Attention
    attn_output, _ = self.attention(x, x, x)
    x = x + self.dropout(attn_output)
    x = self.norm1(x)

    # Feedforward Layer
    ff_output = self.feed_forward(x)
    x = x + self.dropout(ff_output)
    x = self.norm2(x)


    return x

<img src=https://production-media.paperswithcode.com/methods/Screen_Shot_2021-01-26_at_9.43.31_PM_uI4jjMq.png width="400">

Now I will start coding the Vision Transformer. My ViT consists of:


**1) Data Preprocessing**: Implementing the patchify process to transfer the image (batch_size, channel, height, width) to a sequence of tokens (batch_size, num_tokens, embedding_dimension).


**2) Positional Encoding**

**3) Extra Learnable [CLASS] embedding**

**4) Transformer Encoder**: Constructing a transformer encoder by TransformerEncoderLayer you have already built.

**5) Prediction**: Building the MLP Head and making classification.

I will train my ViT on MNIST and
 - report the test accuracy
 - save my ViT model in a checkpoint file named: **./vit_trained.pt**  

In [8]:
class ViT(nn.Module):
    def __init__(self,
                 img_size=64,
                 in_channels=1,
                 patch_size=16,
                 embedding_dims=128,
                 num_transformer_layers=2,
                 dropout=0.1,
                 mlp_hidden_dim=128,
                 num_heads=2,
                 num_classes=22):
        super().__init__()
        '''
    img_size: img height and width.
    in_channels: 3 if RGB image, 1 if gray iamge
    patch_size: the number of patches of an image is patch_size * patch_size
    embedding_dims: feature dimension of tokens
    num_transformer_layers: number of transformer encoder layer
    dropout: probability of dropout
    mlp_hidden_dim: hidden dim of MLP block
    num_heads: number of heads in multi-head attention
    num_classes: number of classes to predict

    You are suggested but NOT required to use all inputs.
    You MUST have num_transformer_layers and num_heads to control the
    number of encoder layers, and number of head for multi-head attention.

    Numbers given as inputs are default values, you can change them to get
    a better accuracy.
    '''

        # Patch Embedding
        self.num_patches = (img_size // patch_size) ** 2
        self.patch_embedding = nn.Conv2d(in_channels, embedding_dims, kernel_size=patch_size, stride=patch_size)

        # Position Embedding
        self.position_embedding = nn.Parameter(torch.zeros(1, self.num_patches, embedding_dims))

        # Class Embedding
        self.class_embedding = nn.Parameter(torch.rand(1, embedding_dims))

        # Transformer Encoder Layers
        self.transformer_encoder_layers = nn.ModuleList()
        for _ in range(num_transformer_layers):
            self.transformer_encoder_layers.append(TransformerEncoderLayer(
                embedding_dims, dropout, mlp_hidden_dim, num_heads))

        # Classification Head
        _classification_head_shape = (self.num_patches + 1) * embedding_dims
        self.classification_head = nn.Linear(_classification_head_shape, num_classes)

    def forward(self, x: Tensor):

        # Patch Embedding
        x = self.patch_embedding(x)

        # Reshape into sequence of tokens
        x = x.flatten(2).transpose(1, 2)

        # Position Embedding
        x = x + self.position_embedding

        # Class Embedding
        x = torch.stack([torch.vstack((self.class_embedding, x[i])) for i in range(len(x))])

        # Transformer Encoder Layers
        for layer in self.transformer_encoder_layers:
            x = layer(x)

        # Classification Head
        x = x.flatten(1)
        x = self.classification_head(x)

        return x


In [9]:
# from torchinfo import summary
# net = ViT()
# first_x_batch = trainloader_small[0][0]
# summary(net, input_size=first_x_batch.shape)

In [10]:
#load MNIST dataset
from torchvision.datasets.mnist import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

transform = ToTensor()
train_set = MNIST(root='./sample_data', train=True, download=True, transform=transform)
test_set = MNIST(root='./sample_data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
test_loader = DataLoader(test_set, shuffle=False, batch_size=128)

In [11]:
#--------------------------------------------------
#       Start Training & Evaluation
#--------------------------------------------------

# define training options
train_option = {}
train_option['lr'] = 0.001
train_option['epoch'] = 10
train_option['device'] = 'gpu'

# start training
net = ViT()
trainModel(net, train_loader, train_option, test_loader)


Epoch: 1 | Iter:    0 | Time: 00:00:03 | Train Loss: 3.1802 | Average Loss: 3.1802 
Epoch: 1 | Iter:   20 | Time: 00:00:04 | Train Loss: 1.0668 | Average Loss: 1.3398 
Epoch: 1 | Iter:   40 | Time: 00:00:04 | Train Loss: 0.9968 | Average Loss: 1.1111 
Epoch: 1 | Iter:   60 | Time: 00:00:05 | Train Loss: 0.8847 | Average Loss: 1.0144 
Epoch: 1 | Iter:   80 | Time: 00:00:06 | Train Loss: 0.6256 | Average Loss: 0.9372 
Epoch: 1 | Iter:  100 | Time: 00:00:07 | Train Loss: 0.6232 | Average Loss: 0.8811 
Epoch: 1 | Iter:  120 | Time: 00:00:08 | Train Loss: 0.5890 | Average Loss: 0.8317 
Epoch: 1 | Iter:  140 | Time: 00:00:09 | Train Loss: 0.6382 | Average Loss: 0.7966 
Epoch: 1 | Iter:  160 | Time: 00:00:10 | Train Loss: 0.5434 | Average Loss: 0.7661 
Epoch: 1 | Iter:  180 | Time: 00:00:11 | Train Loss: 0.7342 | Average Loss: 0.7407 
Epoch: 1 | Iter:  200 | Time: 00:00:11 | Train Loss: 0.4170 | Average Loss: 0.7200 
Epoch: 1 | Iter:  220 | Time: 00:00:12 | Train Loss: 0.6140 | Average Loss: 

Accuracy on Test Set: 0.909100 at epoch 10 | Time for training 00:03:22 | Time for testing 00:00:01

In [12]:
# save model as vit_trained.pt
# torch.save(net.state_dict(), 'vit_trained.pt')

<!--Write your report here in markdown or html-->
