In [1]:
# code borrowed from :: https://github.com/sjchoi86/upstage-basic-deeplearning/blob/main/notebook/mha.ipynb
#  https://github.com/Huffon/pytorch-transformer-kor-eng/tree/18ac0f71d8d4f6eab1720da65bd497d17e8604a9
import math
from IPython.display import clear_output
import matplotlib.pyplot as plt
import numpy as np
import cv2
import io
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from collections import OrderedDict, Counter

### Dataset Preparation

korean to english (from :: https://github.com/Huffon/pytorch-transformer-kor-eng/tree/18ac0f71d8d4f6eab1720da65bd497d17e8604a9)

In [3]:
BATCH_SIZE = 32

from utils import load_dataset, make_iter, Params

train_data, valid_data = load_dataset('train')
train_iter, valid_iter = make_iter(BATCH_SIZE, 'train',
                                           train_data=train_data, valid_data=valid_data)


Loading AI Hub Kor-Eng translation dataset and converting it to pandas DataFrame . . .
Number of training examples: 92000
Number of validation examples: 11500


OSError: [E941] Can't find model 'en'. It looks like you're trying to load a model from a shortcut, which is deprecated as of spaCy v3.0. To load the model, use its full name instead:

nlp = spacy.load("en_core_web_sm")

For more details on the available models, see the models directory: https://spacy.io/models. If you want to create a blank model, use spacy.blank: nlp = spacy.blank("en")

In [3]:
train_data[0]

KeyError: 0

## Transformer 

### Self-Attention

+ Data $X \in \mathbb{R}^{n \times d}$ 
+ $Q, K \in \mathbb{R}^{n \times d_K}$
+ $V \in \mathbb{R}^{n \times d_V}$

$\text{Attention}(Q,K,V) = \text{softmax}(\frac{QK^T}{\sqrt{d_K}})V  \in \mathbb{R}^{n \times d_V}$

### MultiHeadAttention

$\text{head}_i = \text{Attention}(QW^Q_i , KW^K_i, VW^V_i)$

In [40]:
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch import einsum

class MultiHeadAttention(nn.Module):
    def __init__(self, indim, heads = 8, dim_head = 64, dropout=0.1,USE_BIAS=False):
        super(MultiHeadAttention, self).__init__()
        self.heads = heads
        self.USE_BIAS = USE_BIAS
        self.scale = dim_head ** -0.5
        self.qkv = nn.Linear(indim, heads * dim_head * 3, bias = self.USE_BIAS)
        self.lin = nn.Linear(dim_head * heads , indim , bias = self.USE_BIAS)
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self,x,mask=None):
        b, n, _ ,  h = *x.shape, self.heads
        qkv = self.qkv(x).chunk(3, dim = -1)
        q,k,v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv) #batch, heads, seq, dim
        
        scores = einsum('b h i d, b h j d -> b h i j',q,k) * self.scale  # batch, heads, seq, seq
        if mask is not None:
            scores = scores.masked_fill(mask==0,-1e9)
        attn = scores.softmax(dim=-1)
        
        out = einsum('b h i n, b h n d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.lin(out)
        out = self.dropout(out)
        
        return out, attn
        

# Self-Attention Layer
n_batch = 128
d_feat  = 32
dim_head = 64
n_words = 12
x = torch.rand(n_batch,n_words,d_feat)
self_attention = MultiHeadAttention(indim=d_feat, dim_head=dim_head)
out, attn = self_attention(x)
print(x.shape , ' -> ', out.shape , attn.shape)


torch.Size([128, 12, 32])  ->  torch.Size([128, 12, 32]) torch.Size([128, 8, 12, 12])


## Encoder

**Feed Forward Network**

In [41]:
class FeedForward(nn.Module):
    def __init__(self, indim, hdim, dropout = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(indim, hdim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hdim, indim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [60]:
class EncoderLayer(nn.Module):
    def __init__(self, hdim, heads=8, dim_head=64):
        super(EncoderLayer, self).__init__()
        self.LayerNorm = nn.LayerNorm(hdim, eps=1e-6)
        self.mha = MultiHeadAttention(hdim, heads, dim_head)
        self.ffn = FeedForward(hdim, hdim)

    def forward(self, x, mask=None):
        # x [b x n x d]

        x = self.LayerNorm(x)
        out ,attn = self.mha(x, mask=mask) # out [b x n x d] attn [b x h x n x n]
        
        out = self.LayerNorm(out)
        out = out + self.ffn(out)  # [b x n x d]

        return out, attn


In [81]:
class Encoder(nn.Module):
    def __init__(self, indim, hdim, heads=8, dim_head=64, depth=4):
        super().__init__()
        self.depth = 4
        self.layers = nn.ModuleList([])
        self.embedding = nn.Embedding(indim, hdim)
        for _ in range(depth):
            self.layers.append(EncoderLayer(hdim=hdim, heads=heads, dim_head=dim_head))
        self.layers = nn.Sequential(*self.layers)
    def forward(self, x, mask = None):
        x = self.embedding(x)
        attn_ = list()
        for layer in self.layers:
            x, attn = layer(x, mask = mask)
            attn_.append(attn)
        return x, attn_
    
# Self-Attention Layer
n_batch = 128
d_feat  = 32
dim_head = 64
x = iter(train_iter).next()[0]
encoder = Encoder(indim=1000 ,hdim=32, heads=8, dim_head=dim_head , depth=4)
out , attn_ = encoder(x)
print(x.shape , ' -> ', out.shape, attn_[0].shape)


IndexError: index out of range in self