# Transformer

## Multi-Headed Attention

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


### Scaled Dot-Product Attention
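- Data $X \in \mathbb{R}^{n\times d}$, where $n$ is the number of data points (tokens) and $d$ is the data dimension (typically 512).
- $Q,K\in \mathbb{R}^{n\times d_k}$, where $d_k$ is the dimension of the keys $K$.
- $V\in \mathbb{R}^{n\times d_v}$, where $d_v$ is the dimension of the values $V$.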

##### $\text{Attention}(Q,K,V) = \text{softmax}\big(\frac{QK^T}{\sqrt{d_k}}\big)V \in \mathbb{R}^{n\times d_v}$

In [8]:
# self attention
class SDPA(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V."""

    def forward(self,Q,K,V,mask=None):
        """
        Q : [..., n_Q, d_k] queries
        K : [..., n_K, d_k] keys
        V : [..., n_K, d_v] values
        mask : optional tensor broadcastable to [..., n_Q, n_K];
               positions where mask == 0 are excluded from attention

        Returns (out, attention):
            out       : [..., n_Q, d_v]
            attention : [..., n_Q, n_K], each row sums to 1
        """
        d_k = K.size()[-1] # key dimension
        scores = Q.matmul(K.transpose(-2,-1))/np.sqrt(d_k)
        if mask is not None:
            # A very large negative score becomes ~0 weight after softmax.
            scores = scores.masked_fill(mask==0, -1e9)
        # Softmax over the last dim: each query row is a distribution over keys.
        attention = F.softmax(scores,dim=-1)
        out = attention.matmul(V)
        return out, attention
    
# Demo run: single-head inputs shaped [n_batch, seq_len, dim].
sdpa = SDPA()
n_batch,d_k,d_v = 3,128,258
Q_n, K_n, V_n = 30, 30, 30 # sequence lengths; V_n must equal K_n
Q = torch.rand(n_batch,Q_n,d_k)
K = torch.rand(n_batch,K_n,d_k)
V = torch.rand(n_batch,V_n,d_v) # values pair with keys, so use V_n (== K_n)
out, attention = sdpa(Q,K,V)

def sh(x):
    """Return the shape of tensor x formatted like '[3, 30, 128]'."""
    return str(list(x.shape))

print ("SDPA: Q%s K%s V%s => out%s attention%s"%
       (sh(Q),sh(K),sh(V),sh(out),sh(attention)))

# SDPA also works on multi-head inputs shaped [n_batch, head, seq_len, dim],
# because matmul/softmax only act on the last two dimensions.
head = 5
Q = torch.rand(n_batch,head,Q_n,d_k)
K = torch.rand(n_batch,head,K_n,d_k)
V = torch.rand(n_batch,head,V_n,d_v)
out, attention = sdpa(Q,K,V)
print ("SDPA: Q%s K%s V%s => out%s attention%s"%
       (sh(Q),sh(K),sh(V),sh(out),sh(attention)))

# The batch dimension comes before the head dimension:
# each of the 3 batch items is processed with 5 heads.


SDPA: Q[3, 30, 128] K[3, 30, 128] V[3, 30, 258] => out[3, 30, 258] attention[3, 30, 30]
SDPA: Q[3, 5, 30, 128] K[3, 5, 30, 128] V[3, 5, 30, 258] => out[3, 5, 30, 258] attention[3, 5, 30, 30]


### Multi-Headed Attention (MHA)
- Here the embedding (feature) dimension is split into `n_head` equal chunks, and each chunk is assigned to one head.

In [37]:
class MHA(nn.Module):
    """Multi-headed attention that splits the feature dimension across heads."""

    def __init__(self, d_feat=128,n_head=5,actv=F.relu,USE_BIAS=True,dropout_p=0.1,devise=None):
        """
        d_feat    : feature dimension (identical for every input token)
        n_head    : number of heads; must divide d_feat evenly
        actv      : activation function; stored on the instance only
        USE_BIAS  : whether the linear layers carry a bias term
        dropout_p : dropout probability applied to the attention weights
        devise    : accepted but unused (name kept for compatibility;
                    presumably meant to be "device")

        Raises ValueError when d_feat is not divisible by n_head.
        """
        super().__init__()
        if d_feat % n_head:
            raise ValueError(f"d_feat({d_feat}) should be divisible by ({n_head})")
        self.d_feat, self.n_head = d_feat, n_head
        self.d_head = d_feat // n_head  # per-head feature width
        self.actv = actv
        self.USE_BIAS = USE_BIAS
        self.dropout_p = dropout_p

        # Inputs arrive as already-embedded latent vectors, so every
        # projection maps d_feat -> d_feat.
        self.lin_Q = nn.Linear(d_feat, d_feat, bias=USE_BIAS)
        self.lin_K = nn.Linear(d_feat, d_feat, bias=USE_BIAS)
        self.lin_V = nn.Linear(d_feat, d_feat, bias=USE_BIAS)
        self.lin_O = nn.Linear(d_feat, d_feat, bias=USE_BIAS)
        # dropout_p is given as a probability and handed to nn.Dropout here.
        self.dropout = nn.Dropout(p=dropout_p)

    def _split_heads(self, feat, n_batch):
        """Reshape [n_batch, n, d_feat] into [n_batch, n_head, n, d_head]."""
        # Split the last (feature) axis into (n_head, d_head) first, because
        # those values are adjacent in memory; only then swap the token and
        # head axes. Viewing straight to [n_batch, n_head, n, d_head] would
        # mix features from different tokens.
        per_head = feat.view(n_batch, -1, self.n_head, self.d_head)
        return per_head.transpose(1, 2)

    def forward(self,Q,K,V,mask=None):
        """
        Q:[n_batch,n_Q,d_feat]
        K:[n_batch,n_K,d_feat]
        V:[n_batch,n_V,d_feat], where n_K == n_V
        mask: optional; positions where mask == 0 get a score of -1e9

        Returns a dict of every intermediate tensor; key 'x' holds the
        final output of shape [n_batch, n_Q, d_feat].
        """
        n_batch = Q.size(0)  # leading dimension of Q is the batch size

        # Linear projections, each [n_batch, n, d_feat].
        Q_feat, K_feat, V_feat = self.lin_Q(Q), self.lin_K(K), self.lin_V(V)

        # Per-head views, each [n_batch, n_head, n, d_head].
        Q_split = self._split_heads(Q_feat, n_batch)
        K_split = self._split_heads(K_feat, n_batch)
        V_split = self._split_heads(V_feat, n_batch)
        print("K_shape:",K_split.shape)

        d_K = K_split.shape[-1]  # per-head key dimension
        scores = torch.matmul(Q_split, K_split.transpose(-2, -1)) / np.sqrt(d_K)
        if mask is not None:
            scores = scores.masked_fill(mask==0,-1e9)
        # Normalise over the key axis: [n_batch, n_head, n_Q, n_K].
        attention = F.softmax(scores, dim=-1)
        print("attention shape:",attention.shape)

        # Weighted sum of values: [n_batch, n_head, n_Q, d_head].
        x_raw = self.dropout(attention).matmul(V_split)

        # Merge the heads back. .contiguous() is needed because .view()
        # cannot operate on the transposed (non-contiguous) layout.
        x_rsh1 = x_raw.transpose(1, 2).contiguous()  # [n_batch, n_Q, n_head, d_head]
        x_rsh2 = x_rsh1.view(n_batch, -1, self.d_feat)  # [n_batch, n_Q, d_feat]

        # Output projection: [n_batch, n_Q, d_feat].
        x = self.lin_O(x_rsh2)

        return {
            'Q_feat': Q_feat, 'K_feat': K_feat, 'V_feat': V_feat,
            'Q_split': Q_split, 'K_split': K_split, 'V_split': V_split,
            'scores': scores, 'attention': attention,
            'x_raw': x_raw, 'x_rsh1': x_rsh1, 'x_rsh2': x_rsh2,
            'x': x,  # x is the output of MHA
        }


In [38]:
# Self-Attention Layer: queries, keys and values all come from the same tensor.
n_batch = 128
n_src   = 32
d_feat  = 200
n_head  = 5
src = torch.rand(n_batch,n_src,d_feat)
self_attention = MHA(
    d_feat=d_feat,n_head=n_head,actv=F.relu,USE_BIAS=True,dropout_p=0.1)
# Call the module itself rather than .forward() so that any registered
# nn.Module hooks run as well.
out = self_attention(src,src,src,mask=None)

K_shape: torch.Size([128, 5, 32, 40])
attention shape: torch.Size([128, 5, 32, 32])


### Positional Embedding
$PE(pos,2i) = \sin(pos/10000^{2i/d_{model}})$

$PE(pos,2i+1) = \cos(pos/10000^{2i/d_{model}})$

In [None]:
import torch
import torch.nn as nn
import math

class PE(nn.Module):
    """Positional-embedding module (skeleton).

    NOTE(review): in the code shown here the constructor only initialises
    nn.Module; d_model and max_len are accepted but not used yet.
    """
    def __init__(self,d_model,max_len=5000):
        """
        d_model : model/embedding dimension
        max_len : presumably the maximum sequence length to support
                  (default 5000) -- TODO confirm once implemented
        """
        super(PE,self).__init__()
        
        