# Transformer model architecture
![](./images/transformer_architecture.png)

# Transformer implementation

In [1]:
import torch

## Embed

### Embedding

In [2]:
class Embedder(torch.nn.Module):
    """Look up a learned dense vector for every token id.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding table).
        d_model: Size of each embedding vector.
    """

    def __init__(self, vocab_size: int, d_model: int) -> None:
        # Module.__init__ must run before sub-modules are assigned,
        # otherwise parameter registration fails.
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the embeddings for the integer token ids in ``x``.

        The result has the shape of ``x`` with a trailing ``d_model`` axis.
        """
        return self.embed(x)

### Positional encoding

$$ PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i/d_{model}}\right) $$

$$ PE_{(pos, 2i + 1)} = \cos\left(pos / 10000^{2i/d_{model}}\right) $$

In [3]:
import math

class PositionalEncoder(torch.nn.Module):
    """Add fixed sinusoidal position information to embedded tokens.

    The table follows
    ``PE(pos, 2i) = sin(pos / 10000^(2i / d_model))`` and
    ``PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))``.

    Args:
        d_model: Size of each embedding vector.
        max_seq_len: Number of positions pre-computed in the table; inputs
            longer than this are not supported.
    """

    def __init__(self, d_model: int, max_seq_len: int = 80) -> None:
        super().__init__()
        self.d_model = d_model

        # Build the constant table once, vectorised. The angles are computed
        # in float64 so the float32 table is accurate to rounding.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        # Even column indices 0, 2, 4, ... are the "2i" of the formula, so
        # the exponent is column / d_model (not 2 * column / d_model).
        even_columns = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, even_columns / d_model)

        pe_matrix = torch.zeros(max_seq_len, d_model)
        pe_matrix[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones.
        pe_matrix[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe_matrix = pe_matrix.unsqueeze(0)     # Add one dimension for batch size
        self.register_buffer('pe', pe_matrix)  # Register as persistent buffer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the positional encoding of its first positions.

        ``x`` is an embedded sentence with dim
        (batch, number of words, vector dimension).
        """
        seq_len = x.size(1)
        # The leading batch axis of size 1 broadcasts over the whole batch.
        return x + self.pe[:, :seq_len]

## Model layers

### Scaled Dot-Product Attention layer

![](images/scaled_dot_product_attention.png)

In [4]:
import math
import torch.nn.functional as F

# Given Query, Key, Value, calculate the final weighted value
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Queries, (batch_size, seq_len, d_k).
        k: Keys, same shape as ``q``.
        v: Values, (batch_size, seq_len, d_v).
        mask: Optional tensor; after a new axis is inserted at dim 1,
            positions where it equals 0 get a score of -1e9 before the softmax.
        dropout: Optional callable applied to the attention weights.

    Returns:
        The attention-weighted values.
    """
    d_k = q.shape[-1]
    scores = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)  # (batch_size, seq_len, seq_len)

    if mask is not None:
        # A large negative score turns into a ~0 weight after the softmax.
        blocked = mask.unsqueeze(1) == 0
        scores = scores.masked_fill(blocked, value=-1e9)

    # Normalise over the key axis (the last one).
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights @ v

### Multi-Head Attention layer

![](images/multi_head_attention.png)

In [5]:
class MultiHeadAttention(torch.nn.Module):
    """Multi-head self-attention built from independent per-head projections.

    Each head projects the input to its own query/key/value space, runs
    scaled dot-product attention, and the head outputs are concatenated on
    the feature axis and projected back to ``d_model``.

    Args:
        n_heads: Number of attention heads.
        d_model: Size of the input and output feature vectors.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, n_heads, d_model, dropout=0.1):
        super().__init__()

        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = self.d_v = d_model // n_heads

        # Every head needs its own projections: sharing one set of weights
        # would make all heads compute the same attention output.
        self.q_linears = torch.nn.ModuleList(
            [torch.nn.Linear(d_model, self.d_k) for _ in range(n_heads)]
        )
        self.k_linears = torch.nn.ModuleList(
            [torch.nn.Linear(d_model, self.d_k) for _ in range(n_heads)]
        )
        self.v_linears = torch.nn.ModuleList(
            [torch.nn.Linear(d_model, self.d_v) for _ in range(n_heads)]
        )

        self.dropout = torch.nn.Dropout(dropout)
        self.out = torch.nn.Linear(n_heads * self.d_v, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of size (batch_size, seq_len, d_model).

        ``mask`` is forwarded unchanged to ``scaled_dot_product_attention``.
        Returns a tensor of size (batch_size, seq_len, d_model).
        """
        head_outputs = []
        for q_linear, k_linear, v_linear in zip(
            self.q_linears, self.k_linears, self.v_linears
        ):
            q = q_linear(x)  # size: (batch_size, seq_len, d_k)
            k = k_linear(x)  # size: (batch_size, seq_len, d_k)
            v = v_linear(x)  # size: (batch_size, seq_len, d_v)

            # Scaled Dot-Product attention -> (batch_size, seq_len, d_v)
            head_outputs.append(
                scaled_dot_product_attention(q, k, v, mask, self.dropout)
            )

        # Concat on the feature axis: (batch_size, seq_len, n_heads*d_v)
        concat = torch.cat(head_outputs, dim=-1)

        # Linear layer to recover to original shape: (batch_size, seq_len, d_model)
        return self.out(concat)

### Feed Forward layer

In [6]:
class FeedForward(torch.nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Size of the input and output feature vectors.
        d_ff: Size of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        self.linear_1 = torch.nn.Linear(d_model, d_ff)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear_2 = torch.nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Transform the last axis of ``x`` from ``d_model`` to ``d_model``."""
        hidden = self.dropout(torch.relu(self.linear_1(x)))
        return self.linear_2(hidden)

### Layer Normalization layer