# Encoder network implementation
---
### Formula
$$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$



#### Shapes (Attention)
|Object             |Shape                    | Comment                                       |
|-------------------|-------------------------|-----------------------------------------------|
|$q_i, k_i$         | $d_k$                   | Query and Key representation (64 in paper)    |
|$v_i$              | $d_v$                   | Value representation (64 in paper)            |
|$x_i$              | $d_{model}$             | Word representation (embedding, 512 in paper) |
|$W^{(Q)}, W^{(K)}$ | $d_{model} \times d_k$  | Weight matrix                                 |
|$W^{(V)}$          | $d_{model} \times d_v$  | Weight matrix                                 |


$T$ - Sequence length

$X \rightarrow (T \times d_{model})$

$Q = XW^{(Q)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_k) \rightarrow (T \times d_k)$

$K = XW^{(K)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_k) \rightarrow (T \times d_k)$

$V = XW^{(V)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_v) \rightarrow (T \times d_v)$

$QK^T \rightarrow (T \times d_k) \times (d_k \times T) \rightarrow (T \times T)$

$\frac{QK^T}{\sqrt{d_k}} \rightarrow (T \times T)$

$softmax(\frac{QK^T}{\sqrt{d_k}})V \rightarrow (T \times T) \times (T \times d_v) \rightarrow (T \times d_v)$







In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
 
import numpy as np
import matplotlib.pyplot as plt

 <img src="./images/attention.png" alt="Attention and Multi-head Attention" width="505" />
 
*Image from: https://data-science-blog.com/blog/2021/04/07/multi-head-attention-mechanism/*

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` independently for ``n_heads``
    heads, concatenates the heads and projects the result back to ``d_model``.
    The value dimension is tied to the key dimension (``d_v == d_k``).

    Args:
        d_k: Per-head size of the query/key/value projections.
        d_model: Size of the input and output feature dimension.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_k, d_model, n_heads):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_k
        self.n_heads = n_heads

        # One linear layer produces the projections of all heads at once.
        self.query = nn.Linear(d_model, d_k * n_heads)
        self.key = nn.Linear(d_model, d_k * n_heads)
        self.value = nn.Linear(d_model, d_k * n_heads)

        # Final projection applied to the concatenated heads.
        self.out = nn.Linear(d_k * n_heads, d_model)

    def forward(self, q, k, v, mask=None):
        """Apply attention.

        Args:
            q: Query input of shape (N, T_q, d_model).
            k: Key input of shape (N, T_k, d_model).
            v: Value input of shape (N, T_k, d_model).
            mask: Optional (N, T_k) tensor; key positions where the mask is 0
                are excluded from attention.

        Returns:
            Tensor of shape (N, T_q, d_model).
        """
        # Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

        q = self.query(q)  # N x T_q x (h*d_k)
        k = self.key(k)    # N x T_k x (h*d_k)
        v = self.value(v)  # N x T_k x (h*d_v), d_v == d_k

        N = q.shape[0]    # batch size
        T_q = q.shape[1]  # query sequence length
        T_k = k.shape[1]  # key/value sequence length (may differ from T_q)

        # Changing shapes (required for the batched matrix multiplication)
        # view: (N, T, h*d_k) -> (N, T, h, d_k)
        # transpose: (N, T, h, d_k) -> (N, h, T, d_k)
        q = q.view(N, T_q, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(N, T_k, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(N, T_k, self.n_heads, self.d_k).transpose(1, 2)

        # Matrix product (not elementwise):
        # (N, h, T_q, d_k) @ (N, h, d_k, T_k) -> (N, h, T_q, T_k)
        attention_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

        if mask is not None:
            # Mask has (N, T_k) shape, so we add two inner dimensions to
            # broadcast over heads and query positions. Zeros become -inf so
            # that softmax assigns them zero weight.
            # NOTE: a row whose keys are all masked yields NaN after softmax.
            attention_scores = attention_scores.masked_fill(
                mask[:, None, None, :] == 0, float('-inf')
            )
        attention_weights = F.softmax(attention_scores, dim=-1)

        # (N, h, T_q, T_k) @ (N, h, T_k, d_k) -> (N, h, T_q, d_k)
        A = attention_weights @ v

        # Concatenate heads: (N, h, T_q, d_k) -> (N, T_q, h, d_k) -> (N, T_q, h*d_k)
        A = A.transpose(1, 2).contiguous().view(N, T_q, self.n_heads * self.d_k)

        return self.out(A)
        
        
                


 <img src="./images/TransformerBlock.png" alt="Transformer Block" width="300" />
 
*Image from: Duan, Wenying & Jiang, Liu & Wang, Ning & Rao, Hong. (2019). Pre-Trained Bidirectional Temporal Representation for Crowd Flows Prediction in Regular Region. IEEE Access. PP. 1-1. 10.1109/ACCESS.2019.2944990.* 

In [None]:
class TransformetBlock(nn.Module):
    """Encoder block: self-attention and a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalization, i.e. ``LayerNorm(x + sublayer(x))``. Dropout is applied to
    the block output.

    Args:
        d_k: Per-head query/key/value size passed to the attention layer.
        d_model: Feature size of the block input and output.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the block output.
    """

    def __init__(self, d_k, d_model, n_heads, dropout=0.1):
        super().__init__()

        self.attention = MultiHeadAttention(d_k, d_model, n_heads)

        # One normalization per sub-layer; names match their use in forward().
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Position-wise feed-forward network with a 4x wider hidden layer.
        self.ff = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model)
        )

    def forward(self, x, mask=None):
        """Run the block.

        Args:
            x: Input tensor; the attention layer expects (N, T, d_model).
            mask: Optional (N, T) mask forwarded to the attention layer;
                positions where it is 0 are not attended to.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention: query, key and value are all the same sequence.
        x = self.ln1(x + self.attention(x, x, x, mask))
        x = self.ln2(x + self.ff(x))
        x = self.dropout(x)
        return x

### Positional encoding


$$PE_{(pos, 2i)}=\sin(pos/10000^{2i/d_{model}})$$
$$PE_{(pos, 2i+1)}=\cos(pos/10000^{2i/d_{model}})$$


This line requires additional explanation:
```python
div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
```
Here `exp_term` holds the even indices $2i$, so this is mathematically equal to $1/(10000^{2i/d_{model}})$; we compute it through `exp` and `log` for numerical stability only. Let's do the math:

$$10000^{\frac{2i}{d_{model}}}$$
$$=(e^{\log(10000)})^{\frac{2i}{d_{model}}}$$
$$=e^{\frac{2i \cdot \log(10000)}{d_{model}}}$$

Additionally, we use $-\log(10000)$ because we want $1/(10000^{2i/d_{model}})$ and
$$a^{-n} = \frac{1}{a^n}$$

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    ``PE(pos, 2i) = sin(pos / 10000^(2i / d_model))`` and
    ``PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))``.

    Args:
        d_model: Embedding size.
        max_len: Longest sequence length the precomputed table supports.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model, max_len=2048, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # [ [0], [1], [2], ..., [max_len-1] ]
        # 2d array of size max_len x 1
        position = torch.arange(max_len).unsqueeze(1)

        # [0, 2, 4, ...]
        exp_term = torch.arange(0, d_model, 2)

        # 1 / 10000^(2i / d_model), computed as exp(-2i * log(10000) / d_model)
        div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)

        pe[0, :, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns,
        # so the frequency vector is trimmed to fit.
        pe[0, :, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Buffer: follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the positional encoding, followed by dropout.

        Args:
            x: Tensor of shape (N, T, d_model) with ``T <= max_len``.

        Raises:
            ValueError: If the sequence is longer than the encoding table.
        """
        # x.shape: N x T x D
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)
        
        
        

In [4]:
# Demo: the even indices [0, 2, 4, ...] produced the same way as exp_term
# in PositionalEncoding, here for d_model = 10.
torch.arange(0, 10, 2)

tensor([0, 2, 4, 6, 8])