In [1]:
import sys
import os
from pathlib import Path
sys.path.append(str(Path(os.getcwd()).parent))
import math,torch
from torch import nn
from activations import *

### Source: https://github.com/fastai/course22p2/blob/master/nbs/27_attention.ipynb

In [2]:
import matplotlib.pyplot as plt

In [3]:
from diffusers.models.attention import AttentionBlock

In [4]:
# `set_seed` is not defined in this notebook — presumably it comes from the
# `activations` star import and seeds the RNGs (TODO confirm).
set_seed(42)
# Random stand-in for a batch of feature maps, shape (64, 32, 16, 16).
x = torch.randn(64,32,16,16)

In [5]:
# Collapse the two trailing dims into one axis, then swap it with dim 1:
# (64, 32, 16, 16) -> (64, 32, 256) -> (64, 256, 32).
t = x.view(*x.shape[:2], -1).transpose(1, 2)
t.shape

torch.Size([64, 256, 32])

In [6]:
# Feature width used by the linear layers below; matches the last dim of `t` (32).
ni = 32

In [7]:
# Three independent linear projections (key, query, value), each ni -> ni.
sk = nn.Linear(ni, ni)
sq = nn.Linear(ni, ni)
sv = nn.Linear(ni, ni)

In [8]:
# Apply each projection to the flattened tensor `t`; the layers map ni -> ni,
# so each result has the same shape as `t`.
k = sk(t)
q = sq(t)
v = sv(t)

In [9]:
# Batched query/key dot products: one 256x256 score matrix per batch item.
(q@k.transpose(1,2)).shape

torch.Size([64, 256, 256])

In [10]:
class SelfAttention(nn.Module):
    """Single-head self-attention across the spatial positions of a feature map.

    The input is group-normalised (a single group), flattened so that every
    spatial position becomes a token with `ni` features, attended over,
    projected, and finally added back onto the untouched input (residual).

    Args:
        ni: number of input channels; also the width of every linear projection.
    """
    def __init__(self, ni):
        super().__init__()
        self.scale = math.sqrt(ni)
        self.norm = nn.GroupNorm(1, ni)
        # Query, key, value and output projections, created in that order.
        self.q, self.k, self.v, self.proj = (nn.Linear(ni, ni) for _ in range(4))

    def forward(self, x):
        """Return `x` plus its attention output; `x` must be 4-D (n, c, h, w)."""
        n, c, h, w = x.shape
        # (n, c, h, w) -> (n, h*w, c): one token per spatial position.
        tokens = self.norm(x).view(n, c, -1).transpose(1, 2)
        # Token-to-token similarity, scaled by sqrt(ni) before the softmax.
        scores = (self.q(tokens) @ self.k(tokens).transpose(1, 2)) / self.scale
        mixed = self.proj(scores.softmax(dim=-1) @ self.v(tokens))
        # Back to (n, c, h, w), then the residual connection.
        return mixed.transpose(1, 2).reshape(n, c, h, w) + x

In [12]:
# Instantiate the attention module for 32-channel inputs.
sa = SelfAttention(32)

In [13]:
# Run the module on `x` and display the output shape.
ra = sa(x)
ra.shape

torch.Size([64, 32, 16, 16])

In [14]:
# First row of the first channel of the first sample, shown as reference values.
ra[0,0,0]

tensor([ 1.9104,  1.4186,  0.8385, -2.1584,  0.6318, -1.2443, -0.0789, -1.6844,
        -0.7939,  1.6117, -0.3852, -1.4307, -0.7494, -0.6010, -0.8335,  0.7477],
       grad_fn=<SelectBackward0>)

In [15]:
def cp_parms(a,b):
    """Make `b` use the very same `weight` and `bias` objects as `a`.

    Nothing is cloned: after the call both objects reference the same
    parameters, so a change to one is visible through the other.
    """
    for attr in ("weight", "bias"):
        setattr(b, attr, getattr(a, attr))

In [16]:
# Build the diffusers block with a single norm group, matching nn.GroupNorm(1, ni)
# in SelfAttention, then point each of its layers at the parameters of the
# corresponding SelfAttention layer so both modules use identical weights.
# NOTE(review): `AttentionBlock` may not be importable from recent diffusers
# releases — confirm against the installed version.
at = AttentionBlock(32, norm_num_groups=1)
src = sa.q,sa.k,sa.v,sa.proj,sa.norm
dst = at.query,at.key,at.value,at.proj_attn,at.group_norm
for s,d in zip(src,dst): cp_parms(s,d)

In [17]:
# Same slice as displayed for `ra`; identical values show that the two
# implementations agree on this input.
rb = at(x)
rb[0,0,0]

tensor([ 1.9104,  1.4186,  0.8385, -2.1584,  0.6318, -1.2443, -0.0789, -1.6844,
        -0.7939,  1.6117, -0.3852, -1.4307, -0.7494, -0.6010, -0.8335,  0.7477],
       grad_fn=<SelectBackward0>)

In [18]:
# A single linear layer emitting 3*ni features per token, to be split into q, k and v.
sqkv = nn.Linear(ni, ni*3)
st = sqkv(t)
st.shape

torch.Size([64, 256, 96])

In [19]:
# Split the last axis (96 features) into three equal parts of 32 features each.
q,k,v = torch.chunk(st, 3, dim=-1)
q.shape

torch.Size([64, 256, 32])

In [20]:
# Shape check only. Note the operand order: k@q^T is the transpose of the
# q@k^T scores used in the attention classes, but has the same shape.
(k@q.transpose(1,2)).shape

torch.Size([64, 256, 256])

In [21]:
class SelfAttention(nn.Module):
    """Single-head self-attention using one fused query/key/value projection.

    The input is batch-normalised, flattened so that each spatial position is a
    token with `ni` features, attended over, projected, and added back onto the
    original input as a residual.

    Args:
        ni: number of input channels.
    """
    def __init__(self, ni):
        super().__init__()
        self.scale = math.sqrt(ni)
        self.norm = nn.BatchNorm2d(ni)
        # One layer produces q, k and v side by side (3*ni output features).
        self.qkv = nn.Linear(ni, ni*3)
        self.proj = nn.Linear(ni, ni)

    def forward(self, inp):
        """Return `inp` plus its attention output; `inp` must be 4-D (n, c, h, w)."""
        bs, ch, height, width = inp.shape
        # (n, c, h, w) -> (n, h*w, c)
        tokens = self.norm(inp).view(bs, ch, -1).transpose(1, 2)
        query, key, value = self.qkv(tokens).chunk(3, dim=-1)
        weights = ((query @ key.transpose(1, 2)) / self.scale).softmax(dim=-1)
        attended = self.proj(weights @ value)
        # Restore the (n, c, h, w) layout and apply the residual connection.
        return attended.transpose(1, 2).reshape(bs, ch, height, width) + inp

In [22]:
# Rebind `sa` to a fresh SelfAttention (in run order, the fused-qkv definition)
# and display the output shape for `x`.
sa = SelfAttention(32)
sa(x).shape

torch.Size([64, 32, 16, 16])

In [23]:
# Standard deviation of the output over all elements.
sa(x).std()

tensor(1.0047, grad_fn=<StdBackward0>)

In [24]:
def heads_to_batch(x, heads):
    """Fold attention heads into the batch axis.

    Takes a 3-D tensor (n, sl, d), splits the last axis into `heads` equal
    groups and returns a tensor of shape (n*heads, sl, d/heads) in which the
    heads of one sample occupy consecutive batch entries.
    """
    bs, seq_len, _ = x.shape
    per_head = x.reshape(bs, seq_len, heads, -1)
    # (n, sl, heads, dh) -> (n, heads, sl, dh)
    per_head = per_head.permute(0, 2, 1, 3)
    return per_head.reshape(bs*heads, seq_len, -1)

def batch_to_heads(x, heads):
    """Unfold attention heads from the batch axis back into the feature axis.

    Takes a 3-D tensor (n*heads, sl, d) in which the heads of one sample occupy
    consecutive batch entries and returns a tensor of shape (n, sl, d*heads).
    """
    _, seq_len, head_dim = x.shape
    # (n*heads, sl, d) -> (n, heads, sl, d) -> (n, sl, heads, d)
    split = x.reshape(-1, heads, seq_len, head_dim).permute(0, 2, 1, 3)
    return split.reshape(-1, seq_len, head_dim*heads)

In [25]:
from einops import rearrange

In [26]:
# Split the last axis into h=8 groups of d features and fold h into the batch axis.
t2 = rearrange(t , 'n s (h d) -> (n h) s d', h=8)
t.shape, t2.shape

(torch.Size([64, 256, 32]), torch.Size([512, 256, 4]))

In [27]:
# Inverse pattern: unfold the 8 heads from the batch axis back into the feature axis.
t3 = rearrange(t2, '(n h) s d -> n s (h d)', h=8)

In [28]:
# Shapes before and after the inverse rearrange.
t2.shape,t3.shape

(torch.Size([512, 256, 4]), torch.Size([64, 256, 32]))

In [29]:
# Round-trip check: every element of t3 equals the corresponding element of t.
(t==t3).all()

tensor(True)

In [30]:
class SelfAttentionMultiHead(nn.Module):
    """Multi-head self-attention across the spatial positions of a feature map.

    The input is batch-normalised and flattened so that each spatial position
    is a token. A fused linear layer produces 3*ni features per token; these
    are first split into `nheads` groups (folded into the batch axis) and each
    group is then chunked into that head's query, key and value. Attention is
    computed independently per head, the heads are merged back, projected, and
    added onto the original input as a residual.

    Args:
        ni: number of input channels.
        nheads: number of attention heads; must be a positive divisor of `ni`.

    Raises:
        ValueError: if `nheads` is not a positive divisor of `ni`.
    """
    def __init__(self, ni, nheads):
        super().__init__()
        # Fail fast: if ni is not a multiple of nheads, a head's slice of the
        # fused qkv output cannot be chunked into three equal parts and
        # forward() would only fail later with an opaque shape error.
        if nheads < 1 or ni % nheads != 0:
            raise ValueError(f"ni ({ni}) must be divisible by nheads ({nheads})")
        self.nheads = nheads
        # Scores are scaled by the square root of the per-head feature width.
        self.scale = math.sqrt(ni/nheads)
        self.norm = nn.BatchNorm2d(ni)
        self.qkv = nn.Linear(ni, ni*3)
        self.proj = nn.Linear(ni, ni)

    def forward(self, inp):
        """Return `inp` plus its attention output; `inp` must be 4-D (n, c, h, w)."""
        n,c,h,w = inp.shape
        # (n, c, h, w) -> (n, h*w, c): one token per spatial position.
        x = self.norm(inp).view(n, c, -1).transpose(1, 2)
        x = self.qkv(x)
        # Fold the heads into the batch axis so that each head attends only
        # over its own slice of the features, independently of the other heads.
        x = rearrange(x, 'n s (h d) -> (n h) s d', h=self.nheads)
        q,k,v = torch.chunk(x, 3, dim=-1)
        s = (q@k.transpose(1,2))/self.scale
        x = s.softmax(dim=-1)@v
        # Undo the previous rearrange: merge the heads back into the feature axis.
        x = rearrange(x, '(n h) s d -> n s (h d)', h=self.nheads)
        x = self.proj(x).transpose(1,2).reshape(n,c,h,w)
        return x+inp

In [31]:
# 32 channels split across 4 heads (8 features per head); display the output shape.
sa = SelfAttentionMultiHead(32, 4)
sx = sa(x)
sx.shape

torch.Size([64, 32, 16, 16])

In [32]:
# Mean and standard deviation of the multi-head output over all elements.
sx.mean(),sx.std()

(tensor(0.0248, grad_fn=<MeanBackward0>),
 tensor(1.0069, grad_fn=<StdBackward0>))

In [33]:
# PyTorch's built-in multi-head attention, for comparison. batch_first=True makes
# it take (batch, seq, feature) inputs, which is the layout of `t`.
nm = nn.MultiheadAttention(32, num_heads=8, batch_first=True)
# Self-attention: `t` is passed as query, key and value. The second return
# value holds the attention weights.
nmx,nmw = nm(t,t,t)
# Add the residual connection by hand.
nmx = nmx+t

In [34]:
# Mean and standard deviation of the built-in attention output (plus residual).
nmx.mean(),nmx.std()

(tensor(-0.0008, grad_fn=<MeanBackward0>),
 tensor(1.0019, grad_fn=<StdBackward0>))