# Encoder network implementation
---
### Formula
$$Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$



#### Shapes (Attention)
|Object             |Shape                    | Comment                                       |
|-------------------|-------------------------|-----------------------------------------------|
|$q_i, k_i$         | $d_k$                   | Query and Key representation (64 in paper)    |
|$v_i$              | $d_v$                   | Value representation (64 in paper)            |
|$x_i$              | $d_{model}$             | Word representation (embedding, 512 in paper) |
|$W^{(Q)}, W^{(K)}$ | $d_{model} \times d_k$  | Weight matrix                                 |
|$W^{(V)}$          | $d_{model} \times d_v$  | Weight matrix                                 |


$ T $ - Sequence length

$X (T \times d_{model}) $ 

$Q = XW^{(Q)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_k) \rightarrow (T \times d_k)$

$K = XW^{(K)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_k) \rightarrow (T \times d_k)$

$V = XW^{(V)} \rightarrow (T \times d_{model}) \times (d_{model} \times d_v) \rightarrow (T \times d_v)$

$QK^T \rightarrow (T \times d_k) \times (d_k \times T) \rightarrow (T \times T)$

$\frac{QK^T}{\sqrt{d_k}} \rightarrow (T \times T)$

$softmax(\frac{QK^T}{\sqrt{d_k}})V \rightarrow (T \times T) \times (T \times d_v) \rightarrow (T \times d_v)$







In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
 
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

 <img src="./images/attention.png" alt="Attention and Multi-head Attention" width="505" />
 
*Image from [Attention is All you need](https://arxiv.org/abs/1706.03762) paper*

In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` independently for ``n_heads``
    heads, concatenates the heads and projects the result back to
    ``d_model``. The value dimension equals the key dimension (d_v == d_k).

    Args:
        d_k: Size of the query/key/value representation of a single head.
        d_model: Size of the input (and output) token representation.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_k, d_model, n_heads):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_k
        self.n_heads = n_heads

        # One linear layer produces the projections of all heads at once
        self.query = nn.Linear(d_model, d_k * n_heads)
        self.key = nn.Linear(d_model, d_k * n_heads)
        self.value = nn.Linear(d_model, d_k * n_heads)

        # Final projection applied to the concatenated heads
        self.out = nn.Linear(d_k * n_heads, d_model)

    def forward(self, q, k, v, pad_mask=None):
        """Apply attention.

        Args:
            q: Query input of shape (N, T_q, d_model).
            k: Key input of shape (N, T_k, d_model).
            v: Value input of shape (N, T_k, d_model).
            pad_mask: Optional (N, T_k) mask; key positions whose mask
                value equals 0 receive zero attention weight.

        Returns:
            Tensor of shape (N, T_q, d_model).
        """
        # Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

        q = self.query(q)  # N x T_q x (h*d_k)
        k = self.key(k)    # N x T_k x (h*d_k)
        v = self.value(v)  # N x T_k x (h*d_v) # d_v == d_k

        N = q.shape[0]    # batch size
        T_q = q.shape[1]  # query sequence length
        # Keys/values may have a different length than the queries
        # (e.g. cross-attention), so each tensor is reshaped with its own length.
        T_k = k.shape[1]
        T_v = v.shape[1]

        # Changing shapes (required for matrix multiplication)
        # view: (N, T, h*d_k) -> (N, T, h, d_k)
        # transpose: (N, T, h, d_k) -> (N, h, T, d_k)
        q = q.view(N, T_q, self.n_heads, self.d_k).transpose(1, 2)
        k = k.view(N, T_k, self.n_heads, self.d_k).transpose(1, 2)
        v = v.view(N, T_v, self.n_heads, self.d_k).transpose(1, 2)

        # (N, h, T_q, d_k) x (N, h, d_k, T_k) -> (N, h, T_q, T_k)
        attention_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)

        if pad_mask is not None:
            # Mask has (N, T_k) shape, so we need to add two (inner) dimensions
            # We also change zeros with -inf, so that softmax will ignore these values
            attention_scores = attention_scores.masked_fill(
                pad_mask[:, None, None, :] == 0, float('-inf')
            )
        attention_weights = F.softmax(attention_scores, dim=-1)

        # (N, h, T_q, T_k) x (N, h, T_k, d_k) -> (N, h, T_q, d_k)
        A = attention_weights @ v

        # Reshape (N, h, T_q, d_k) -> (N, T_q, h, d_k) -> (N, T_q, h*d_k)
        A = A.transpose(1, 2)

        # Concatenate the heads
        A = A.contiguous().view(N, T_q, self.n_heads * self.d_k)

        return self.out(A)
        
        
                


 <img src="./images/TransformerBlock.png" alt="Transformer Block" width="300" />
 
*Image from: Duan, Wenying & Jiang, Liu & Wang, Ning & Rao, Hong. (2019). Pre-Trained Bidirectional Temporal Representation for Crowd Flows Prediction in Regular Region. IEEE Access. PP. 1-1. 10.1109/ACCESS.2019.2944990.* 

In [3]:
class TransformerBlock(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Both sub-layers are wrapped in a residual connection, and layer
    normalisation is applied after each residual addition. Dropout is applied
    at the end of the feed-forward network and once more to the block output.

    Args:
        d_k: Per-head query/key/value size passed to the attention layer.
        d_model: Size of the token representation.
        n_heads: Number of attention heads.
        dropout: Dropout probability.
    """

    def __init__(self, d_k, d_model, n_heads, dropout=0.1):
        super().__init__()

        # The feed-forward network expands to 4x the model size internally
        hidden_size = 4 * d_model

        self.attention = MultiHeadAttention(d_k, d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.ff = nn.Sequential(
            nn.Linear(d_model, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x, pad_mask=None):
        """Run the block on ``x``; ``pad_mask`` is forwarded to the attention layer."""
        # Self-attention: the same tensor serves as query, key and value
        attended = self.attention(x, x, x, pad_mask)
        x = self.norm1(x + attended)

        # Position-wise feed-forward network with its own residual connection
        transformed = self.ff(x)
        x = self.norm2(x + transformed)

        return self.dropout(x)

### Positional encoding


$$PE_{(pos, 2i)}=sin(pos/10000^{2i/d_{model}})$$
$$PE_{(pos, 2i+1)}=cos(pos/10000^{2i/d_{model}})$$


The following line deserves some additional explanation:
```python
div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
```
Mathematically, this is exactly $1/(10000^{2i/d_{model}})$ (note that `exp_term` already holds the even indices $2i$); computing it through `exp` and `log` is done for numerical stability only. Let's do the math:

$$10,000^{\frac{2i}{d_{model}}}$$
$$=(e^{log(10,000)})^{\frac{2i}{d_{model}}}$$
$$=e^{\frac{2i * log(10,000)}{d_{model}}}$$

Additionally, we use $-log(10,000)$ because we want $1/(10000^{2i/d_{model}})$ and
$$a^{-n} = \frac{1}{a^n}$$ 

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    Even feature indices hold ``sin(pos / 10000^(2i / d_model))`` and odd
    feature indices hold ``cos(pos / 10000^(2i / d_model))``. The table is
    precomputed once for ``max_len`` positions and stored as a buffer, so it
    moves with the module across devices but is not a trainable parameter.

    Args:
        d_model: Size of the token representation (may be even or odd).
        max_len: Number of positions for which encodings are precomputed.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, max_len=2048, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        # [ [0], [1], [2], ..., [max_len-1] ]
        # 2d array of size max_len x 1
        position = torch.arange(max_len).unsqueeze(1)

        # [0, 2, 4, ...]
        exp_term = torch.arange(0, d_model, 2)

        # 1 / 10000^(2i / d_model), computed as exp(-2i * log(10000) / d_model)
        div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))
        angles = position * div_term  # max_len x ceil(d_model / 2)

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd index than even indices,
        # so only the first d_model // 2 frequencies are used for the cosine.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` (N x T x D) with the first T encodings added, after dropout."""
        # x.shape: N x T x D
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
        
        
        

### Encoder

In [5]:
class Encoder(nn.Module):
    """Transformer encoder with a classification head on the first token.

    Token ids are embedded, positional encodings are added, the result is
    passed through ``n_layers`` transformer blocks, and the representation of
    the first position is normalised and projected to ``n_classes`` outputs.
    """

    def __init__(
        self,
        vocab_size : int,
        max_len : int,
        d_k : int,
        d_model : int,
        n_heads : int,
        n_layers : int,
        n_classes : int,
        dropout : float = 0.1,
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout=dropout)

        # nn.Sequential is used purely as a container here; the blocks are
        # invoked one by one in forward() because each also needs the mask.
        self.transformer_blocks = nn.Sequential(
            *(
                TransformerBlock(d_k, d_model, n_heads, dropout=dropout)
                for _ in range(n_layers)
            )
        )

        self.norm = nn.LayerNorm(d_model)
        self.out = nn.Linear(d_model, n_classes)

    def forward(self, x, pad_mask = None):
        """Map token ids ``x`` to class outputs; ``pad_mask`` is passed to every block."""
        hidden = self.pos_encoding(self.embedding(x))

        for layer in self.transformer_blocks:
            hidden = layer(hidden, pad_mask)

        # Text classification: only the representation of the first token
        # is kept, which drops the sequence dimension.
        # N x T x D -> N x D
        first_token = hidden[:, 0, :]

        return self.out(self.norm(first_token))
        
        

In [6]:
# Small model used for the shape check in the next cell.
# Note that d_k * n_heads == d_model (16 * 4 == 64) with these settings.
model = Encoder(
    vocab_size=20_000,
    max_len = 1024,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    n_classes = 5,
    dropout = 0.1,
)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print (device)
model.to(device)

cuda


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadAttention(
        (query): Linear(in_features=64, out_features=64, bias=True)
        (key): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (out): Linear(in_features=64, out_features=64, bias=True)
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (ff): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerBlock(
      (attention): MultiHeadAtt

Let's test if the model works

In [7]:
batch_size = 16
nr_words = 512
# Random token ids in [0, 20000), matching the vocab_size of the model above
x = np.random.randint(0, 20_000, size=(batch_size, nr_words))
x_t = torch.tensor(x).to(device)

# Padding mask: 1 = attend to the token, 0 = masked out.
# Here the second half of every sequence is masked.
mask = np.ones((batch_size, nr_words))
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)

# Forward pass; the last expression displays the output shape
y = model(x_t, mask_t)
y.shape

torch.Size([16, 5])

## Training and evaluation

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Only the tokenizer of this pretrained checkpoint is loaded in this cell
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
from datasets import load_dataset
# Load the 'sst2' configuration of the 'glue' dataset
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [10]:
# Inspect the first raw training example
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [11]:
def tokenize_fn(batch):
    """Tokenize the 'sentence' field of a batch with truncation enabled."""
    return tokenizer(batch['sentence'], truncation=True)

# batched=True hands tokenize_fn a batch of examples per call
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [12]:
# Collator built from the tokenizer; it is later passed as collate_fn to the DataLoaders
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [13]:
# Remove obsolete columns from datasets
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [14]:
from torch.utils.data import DataLoader
# Training batches are drawn in shuffled order
train_loader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=32, 
    shuffle=True,
    collate_fn=data_collator
    )

# Validation batches keep the dataset order (no shuffle argument)
valid_loader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=32, 
    collate_fn=data_collator
    )

In [15]:
# Print the keys and tensor shapes of the first training batch only
for batch in train_loader:
    for k, v in batch.items():
        print(f'k: {k}, v.shape: {v.shape}')
    break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: labels, v.shape: torch.Size([32])
k: input_ids, v.shape: torch.Size([32, 38])
k: attention_mask, v.shape: torch.Size([32, 38])


In [16]:
# Distinct label values in the training split, plus the tokenizer limits
# that are used to size the model
print ('Nr classes:', set(tokenized_datasets['train']['labels']))
print ('Vocab size:', tokenizer.vocab_size)
# Can also be max_model_input_sizes to display sizes for all models
print ('Max length:', tokenizer.model_max_length)


Nr classes: {0, 1}
Vocab size: 28996
Max length: 512


In [17]:
# Re-create the model, this time sized from the tokenizer (vocabulary size and
# maximum sequence length) and with two output classes
model = Encoder(
    vocab_size=tokenizer.vocab_size,
    max_len = tokenizer.model_max_length,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    n_classes = 2,
    dropout = 0.1,
)

model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadAttention(
        (query): Linear(in_features=64, out_features=64, bias=True)
        (key): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (out): Linear(in_features=64, out_features=64, bias=True)
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (ff): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerBlock(
      (attention): MultiHeadAtt

In [18]:
# Cross-entropy loss on the model outputs, optimised with Adam (learning rate 0.001)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
def train(model, criterion, optimizer, 
          train_loader, valid_loader, 
          epochs, print_every = 1):
    """Train ``model`` and evaluate it on the validation data after every epoch.

    Each batch must be a dict of tensors with the keys ``"input_ids"``,
    ``"attention_mask"`` and ``"labels"``. Batches are moved to the device
    that holds the model parameters.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(outputs, labels)``; assumed to
            return a per-batch mean, since it is re-weighted by batch size.
        optimizer: Optimizer that updates the parameters of ``model``.
        train_loader: Iterable of training batches.
        valid_loader: Iterable of validation batches.
        epochs: Number of passes over ``train_loader``.
        print_every: Print a progress line every this many epochs;
            ``0`` disables printing.

    Returns:
        Tuple ``(train_losses, valid_losses)`` of NumPy arrays of length
        ``epochs`` holding the per-sample average loss of each epoch.
    """
    # Follow the model's own device rather than a notebook-level global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = np.zeros(epochs)
    valid_losses = np.zeros(epochs)

    for it in range(epochs):
        model.train()
        t0 = datetime.now()
        train_loss = 0
        n_train = 0

        for batch in train_loader:
            batch = {k: v.to(run_device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(batch["input_ids"], batch["attention_mask"])
            loss = criterion(outputs, batch["labels"])

            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so that the epoch
            # average stays correct when the last batch is smaller
            n_items = batch["input_ids"].size(0)
            train_loss += loss.item() * n_items
            n_train += n_items

        train_loss /= n_train

        model.eval()
        valid_loss = 0
        n_valid = 0
        # No gradients are needed for validation; skipping the autograd
        # bookkeeping saves memory and time
        with torch.no_grad():
            for batch in valid_loader:
                batch = {k: v.to(run_device) for k, v in batch.items()}
                outputs = model(batch["input_ids"], batch["attention_mask"])
                loss = criterion(outputs, batch["labels"])
                n_items = batch["input_ids"].size(0)
                valid_loss += loss.item() * n_items
                n_valid += n_items

        valid_loss /= n_valid

        train_losses[it] = train_loss
        valid_losses[it] = valid_loss

        # The truthiness check avoids a modulo-by-zero when print_every == 0
        if print_every and it % print_every == 0:
            print (f'Epoch: {it}: Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Duration: {datetime.now() - t0}')

    return train_losses, valid_losses
        

In [20]:
# Train for 4 epochs and print a progress line after every epoch
train_losses, valid_losses = train(
    model, criterion, optimizer, 
    train_loader, valid_loader,
    epochs=4, print_every=1)

Epoch: 0: Train loss: 0.5365, Valid loss: 0.5034, Duration: 0:00:12.726300
Epoch: 1: Train loss: 0.3734, Valid loss: 0.4685, Duration: 0:00:12.342296
Epoch: 2: Train loss: 0.3005, Valid loss: 0.5104, Duration: 0:00:12.266042
Epoch: 3: Train loss: 0.2590, Valid loss: 0.5378, Duration: 0:00:12.239693


In [21]:
def _safe_percent(numerator, denominator):
    """Return ``100 * numerator / denominator``, or 0.0 when the denominator is zero."""
    return 100 * numerator / denominator if denominator else 0.0


def get_metrics(model, loader):
    """Compute accuracy, precision and recall (in percent) over ``loader``.

    Labels and predictions are treated as binary (0 / 1), with class 1 being
    the positive class. Each batch must be a dict of tensors with the keys
    ``'input_ids'``, ``'attention_mask'`` and ``'labels'``. The model's
    train/eval mode is not changed here; the caller is expected to call
    ``model.eval()`` beforehand.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one score per class.
        loader: Iterable of batches.

    Returns:
        Tuple ``(accuracy, precision, recall)``. A metric whose denominator
        is zero (e.g. precision when nothing was predicted positive, or any
        metric for an empty loader) is reported as 0.0.
    """
    # Follow the model's own device rather than a notebook-level global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    n_correct = 0.
    true_positives = 0.
    true_negatives = 0.
    false_positives = 0.
    false_negatives = 0.
    n_total = 0.

    # Evaluation only: no autograd graph is needed
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(run_device) for k, v in batch.items()}
            output = model(batch['input_ids'], batch['attention_mask'])
            labels = batch['labels']

            # Predicted class = index of the largest score
            _, pred = torch.max(output, 1)
            n_correct += (pred == labels).sum().item()

            # With 0/1 values, products act as logical AND and (1 - x) as NOT
            true_positives += (pred * labels).sum().item()
            true_negatives += ((1 - pred) * (1 - labels)).sum().item()
            false_positives += (pred * (1 - labels)).sum().item()
            false_negatives += ((1 - pred) * labels).sum().item()

            n_total += labels.shape[0]

    # Metrics are derived once from the accumulated counts, after the loop,
    # so an early batch without positive predictions cannot divide by zero
    accuracy = _safe_percent(n_correct, n_total)
    precision = _safe_percent(true_positives, true_positives + false_positives)
    recall = _safe_percent(true_positives, true_positives + false_negatives)

    return  accuracy, precision, recall

# Switch the model to evaluation mode before computing the metrics
model.eval()
train_acc, train_prec, train_rec = get_metrics(model, train_loader)
valid_acc, valid_prec, valid_rec = get_metrics(model, valid_loader)

print (f'Train acc: {train_acc:.2f}%, valid acc: {valid_acc:.2f}%')
print (f'Train precision: {train_prec:.2f}%, valid prec: {valid_prec:.2f}%')
print (f'Train recall: {train_rec:.2f}%, valid recall: {valid_rec:.2f}%')

Train acc: 93.06%, valid acc: 79.36%
Train precision: 94.16%, valid prec: 81.88%
Train recall: 93.35%, valid recall: 76.35%
