Scaled Dot product atention

In [11]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
seq_len = 4
batch_size = 1
input_dim = 512
d_model = 512
x = torch.randn((batch_size,seq_len,input_dim))

In [13]:
x.shape

torch.Size([1, 4, 512])

In [15]:
qkv_layer = nn.Linear(input_dim,3*d_model)

In [18]:
qkv = qkv_layer(x)

In [20]:
num_heads = 8
head_dim = d_model//num_heads
qkv = qkv.reshape(batch_size,seq_len,num_heads,3*head_dim)

In [22]:
qkv = qkv.permute(0,2,1,3)
qkv.shape

torch.Size([1, 8, 4, 192])

In [25]:
q, k,v = qkv.chunk(3,dim=-1)

In [30]:
d_k = q.size()[-1]
scaled = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(d_k)
scaled.shape

torch.Size([1, 8, 4, 4])

In [35]:
mask = torch.full(scaled.size(),float('-inf'))
mask = torch.triu(mask,diagonal=1)
mask

tensor([[[[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0., 0.]],

         [[0., -inf, -inf, -inf],
          [0., 0., -inf, -inf],
          [0., 0., 0., -inf],
          [0., 0., 0.,

In [39]:
attention = F.softmax(scaled+mask,dim=-1)

In [41]:
value = torch.matmul(attention,v)
value.shape

torch.Size([1, 8, 4, 64])

<h1>Multi Head Attention</h1>

In [60]:
def scaled_dot_product(q,k,v,mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(d_k)
    if mask is not None:
        scaled+=mask
    attention = F.softmax(scaled,dim=-1)
    values = torch.matmul(attention,v)
    return values,attention
    

class multiheadAttention(nn.Module):

    def __init__(self,input_dim,d_model,num_heads):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dims = d_model//num_heads
        self.qkv_layer = nn.Linear(input_dim,3*d_model)
        self.liner_layer = nn.Linear(d_model,d_model)

    
    def forward(self,x,mask=None):
        batch_size,seq_len,input_dim=x.size()    
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size,seq_len,num_heads,3*head_dim)
        qkv = qkv.permute(0,2,1,3)
        q,k,v = qkv.chunk(3,dim=-1)
        values,attention = scaled_dot_product(q,k,v,mask)
        values = values.reshape(batch_size,seq_len,self.head_dims*self.num_heads)
        out = self.liner_layer(values)
        return out
        


In [61]:
batch_size=30
seq_len=5
x = torch.randn((batch_size,seq_len,input_dim))
model = multiheadAttention(input_dim,512,8)
out = model.forward(x)

In [62]:
out.shape

torch.Size([30, 5, 512])

<h1>Positonal encodidng</h1>>

In [63]:
import torch 
import torch.nn as nn

max_seq_len=10
d_model = 512

In [69]:
x[0][1][2] = 10
x[0][1][2]

tensor(10.)

In [65]:
x = torch.randn((1,max_seq_len,d_model))

In [70]:
for pos in range(max_seq_len):
    for i in range(0,d_model,2):
        x[0][pos][i] = math.sin(pos/10000**(i/d_model))
    for i in range(1,d_model,2):    
        x[0][pos][i] = math.cos(pos/10000**(i/d_model))

In [71]:
x

tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.5522e-01,  8.2186e-01,  ...,  1.0000e+00,
           1.0366e-04,  1.0000e+00],
         [ 9.0930e-01, -3.8347e-01,  9.3641e-01,  ...,  1.0000e+00,
           2.0733e-04,  1.0000e+00],
         ...,
         [ 6.5699e-01,  8.2982e-01,  4.5239e-01,  ...,  1.0000e+00,
           7.2564e-04,  1.0000e+00],
         [ 9.8936e-01, -3.3935e-03,  9.9067e-01,  ...,  1.0000e+00,
           8.2931e-04,  1.0000e+00],
         [ 4.1212e-01, -8.3358e-01,  6.7637e-01,  ...,  1.0000e+00,
           9.3297e-04,  1.0000e+00]]])

In [77]:
def postional_encoding(max_seq_len,d_model):
    even_i = torch.arange(0,d_model,2).float()
    denom = torch.pow(10000,even_i/d_model)
    pos = torch.arange(max_seq_len).reshape(-1,1)
    even_pe = torch.sin(pos/denom)
    odd_pe = torch.cos(pos/denom)
    stacked = torch.stack([even_pe,odd_pe],dim=2)
    PE = torch.flatten(stacked,start_dim=1,end_dim=2)
    return PE

In [78]:
postional_encoding(max_seq_len,d_model)

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 6.5699e-01,  7.5390e-01,  4.5239e-01,  ...,  1.0000e+00,
          7.2564e-04,  1.0000e+00],
        [ 9.8936e-01, -1.4550e-01,  9.9067e-01,  ...,  1.0000e+00,
          8.2931e-04,  1.0000e+00],
        [ 4.1212e-01, -9.1113e-01,  6.7637e-01,  ...,  1.0000e+00,
          9.3297e-04,  1.0000e+00]])

<h1>layer normalization</h1>