# Chapter 02: A Deeper Look into Transformers

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [3]:
# Seed torch's random number generators once, up front, so that a fresh
# "Restart & Run All" reproduces the same layer weights and random inputs.
SEED = 42
torch.manual_seed(SEED)

# Pick the best available backend: NVIDIA CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
DEVICE

'mps'

## Encoder and decoder stacks

## Encoder Part of the Transformer

### Simplified encoder layer example

<img src="https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fwww.researchgate.net%2Fpublication%2F334288604%2Ffigure%2Ffig1%2FAS%3A778232232148992%401562556431066%2FThe-Transformer-encoder-structure.ppm&f=1&nofb=1&ipt=41cff290b603e54651f280652c0c36716a4db949384a706382450d2c4de21547">

In [24]:
class EncoderLayer(nn.Module):
    """Single post-norm Transformer encoder layer with a GLU feed-forward block.

    Data flow: self-attention -> dropout -> residual add -> LayerNorm, followed by
    feed-forward -> dropout -> residual add -> LayerNorm.

    The layer is created on the default device; move it with ``.to(device)``
    so that every sub-layer ends up on the same device.

    Args:
        d_model: Size of each token vector (last dimension of the input).
        nhead: Number of attention heads passed to ``nn.MultiheadAttention``.
        dim_feedforward: Width of the feed-forward hidden layer after GLU gating.
        dropout: Dropout probability used inside attention, at the end of the
            feed-forward block and on both residual branches.
        mask: Optional default attention mask (a tensor or ``None``). It is
            used by ``forward`` whenever no mask is passed to the call itself.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, mask=None):
        super().__init__()

        # Self Attention Layer
        # batch_first=True -> inputs/outputs are laid out as (batch, seq, d_model).
        # No device is hard-coded here, so all sub-layers start on the same device.
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=nhead,
            dropout=dropout,
            batch_first=True,
        )

        # FFN Layer
        self.feed_forward = nn.Sequential(
            # Project to twice the hidden width because GLU halves the last dim.
            nn.Linear(d_model, 2 * dim_feedforward),
            nn.GLU(dim=-1),  # shape: dim_feedforward
            nn.Linear(in_features=dim_feedforward, out_features=d_model),
            # NOTE(review): forward() applies dropout2 right after this one, so the
            # feed-forward output is dropped twice. Kept as-is to preserve the
            # module layout and state_dict keys.
            nn.Dropout(dropout),
        )

        # Normalization Layers
        self.norm1 = nn.LayerNorm(normalized_shape=d_model)
        self.norm2 = nn.LayerNorm(normalized_shape=d_model)

        # Dropout layers
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Default attention mask. Registered as a non-persistent buffer so it
        # follows the module through .to(...) without entering the state_dict.
        self.register_buffer("default_mask", mask, persistent=False)

    def forward(self, x, mask=None):
        """Encode a batch of token sequences.

        Args:
            x: Input tensor laid out as (batch, seq, d_model).
            mask: Optional attention mask forwarded as ``attn_mask`` to
                ``nn.MultiheadAttention``. When ``None``, the mask given to the
                constructor (if any) is used instead.

        Returns:
            Tensor with the same shape as ``x``.
        """
        if mask is None:
            mask = self.default_mask

        # Self Attention Layer
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.self_attn(
            query=x, key=x, value=x, attn_mask=mask, need_weights=False
        )

        x = x + self.dropout1(attn_output)  # -------> Residual connection
        x = self.norm1(x)  # ---------> Normalisation

        # FFN Layer
        ff_output = self.feed_forward(x)

        x = x + self.dropout2(ff_output)  # -------> Residual connection
        x = self.norm2(x)  # ---------> Normalisation

        return x

In [25]:
# Hyperparameters for the demo encoder layer
d_model = 512            # width of every token vector
dim_feedforward = 1024   # hidden width of the feed-forward block
nhead = 1                # number of attention heads
dropout = 0.1            # dropout probability
mask = None              # no attention mask

In [35]:
# Build the layer from the hyperparameters, then move all of its weights to DEVICE.
encoder = EncoderLayer(
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    mask=mask,
)
encoder = encoder.to(DEVICE)

In [36]:
# Display the module tree to check the sub-layers and their sizes.
encoder

EncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (feed_forward): Sequential(
    (0): Linear(in_features=512, out_features=2048, bias=True)
    (1): GLU(dim=-1)
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): Dropout(p=0.1, inplace=False)
  )
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

- 1 sentence
- 10 tokens
- each token → 512-d vector

In [88]:
# One sentence of 10 tokens, each a d_model-dimensional vector.
# Created directly on DEVICE (no CPU round-trip) and sized from d_model so it
# always matches the encoder's expected input width.
X = torch.randn(1, 10, d_model, device=DEVICE)

In [89]:
# The output is produced on the encoder's device, so no extra .to(DEVICE) is needed.
# No eval() call appears before this cell, so dropout is active in this forward pass.
y = encoder(X)

In [90]:
# The layer projects back to d_model, so the output shape should match the input shape.
y.shape

torch.Size([1, 10, 512])

In [91]:
# Inspect the encoded token vectors.
y

tensor([[[-1.0316, -1.2937, -0.6671,  ...,  0.3537,  0.4786, -0.1570],
         [ 0.2058, -0.9301, -0.1024,  ..., -0.2799,  0.5939, -0.1482],
         [-0.9173,  1.0031,  1.0584,  ...,  1.5927, -0.3666,  0.7721],
         ...,
         [-0.5479, -1.6185,  0.2500,  ..., -0.2237, -0.4919,  1.0004],
         [ 0.3741, -1.0538, -1.5171,  ..., -0.3846,  0.1506, -0.6575],
         [-0.7019, -0.4913, -0.5312,  ...,  0.7327,  0.3981,  0.9056]]],
       device='mps:0', grad_fn=<NativeLayerNormBackward0>)

In [92]:
# Show the raw input for comparison with the encoded output y.
X

tensor([[[-0.9320, -1.2575, -0.8526,  ...,  0.1295,  0.3823, -0.1893],
         [ 0.2127, -0.9730, -0.3256,  ...,  0.1059,  0.4513,  0.1268],
         [-0.8601,  1.3270,  0.7551,  ...,  1.6539, -0.4757,  0.8001],
         ...,
         [-0.9603, -1.3108,  0.0224,  ...,  0.0707, -0.3812,  1.0555],
         [ 0.4042, -0.4574, -1.9427,  ..., -0.0130,  0.3644, -0.3996],
         [-0.6386, -0.4629, -0.4960,  ...,  1.1980,  0.4254,  1.0251]]],
       device='mps:0')