In [1]:
import torch

In [2]:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [3]:
input_size = inputs.shape[0]
masked_mat = torch.tril(torch.ones(input_size,input_size))

In [4]:
from torch import nn
class causualAttentionb_v1(nn.Module) : 
    def __init__(self, d_in, d_out, qkv_bias=False) : 
        super().__init__()
        # nn.Linear also creates a trainable matrix of in_dim X out_dim
        self.W_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qkv_bias)
        
    def forward(self, x) :
#         x is input : 
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
        
        input_size = inputs.shape[0]
        masked_mat = torch.tril(torch.ones(input_size,input_size))
        print(masked_mat)
        
        masked_attn_weights = attn_weights * masked_mat
        row_sums = masked_attn_weights.sum(dim = 1, keepdim = True)
        masked_attn_weights = masked_attn_weights / row_sums
#         attn_weights = torch.softmax(attn_weights, dim = -1)
        print(masked_attn_weights)
        
        context_vectors = attn_weights @ values
        return context_vectors


In [5]:
c_attn = causualAttentionb_v1(3,2)
c_attn(inputs)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4724, 0.5276, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3089, 0.3450, 0.3462, 0.0000, 0.0000, 0.0000],
        [0.2390, 0.2542, 0.2547, 0.2521, 0.0000, 0.0000],
        [0.1881, 0.2040, 0.2044, 0.1960, 0.2075, 0.0000],
        [0.1570, 0.1676, 0.1679, 0.1682, 0.1747, 0.1646]],
       grad_fn=<DivBackward0>)


tensor([[ 0.2194, -0.0643],
        [ 0.2214, -0.0649],
        [ 0.2214, -0.0649],
        [ 0.2200, -0.0645],
        [ 0.2206, -0.0648],
        [ 0.2200, -0.0645]], grad_fn=<MmBackward0>)

In [6]:
from torch import nn
torch.manual_seed(123)
class causualAttentionb_v2(nn.Module) : 
    def __init__(self, d_in, d_out, qkv_bias=False) : 
        super().__init__()
        # nn.Linear also creates a trainable matrix of in_dim X out_dim
        self.W_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qkv_bias)
        
    def forward(self, x) :
#         x is input : 
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
#         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
        print(attn_scores)
        input_size = inputs.shape[0]
        mask = torch.triu(torch.ones(input_size,input_size), diagonal = 1)
        print(mask)
        masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
        print(masked)
        
        masked_attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim = -1)
        print(masked_attn_weights)
        
        context_vectors = masked_attn_weights @ values
        return context_vectors

In [7]:
c_attn = causualAttentionb_v2(3,2)
c_attn(inputs)

tensor([[0.3111, 0.3479, 0.3471, 0.1714, 0.2350, 0.1928],
        [0.1655, 0.2602, 0.2576, 0.1445, 0.1384, 0.1790],
        [0.1667, 0.2602, 0.2577, 0.1443, 0.1391, 0.1784],
        [0.0510, 0.1080, 0.1064, 0.0643, 0.0476, 0.0835],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121, 0.1174],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MmBackward0>)
tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])
tensor([[0.3111,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1655, 0.2602,   -inf,   -inf,   -inf,   -inf],
        [0.1667, 0.2602, 0.2577,   -inf,   -inf,   -inf],
        [0.0510, 0.1080, 0.1064, 0.0643,   -inf,   -inf],
        [0.1415, 0.1875, 0.1863, 0.0987, 0.1121,   -inf],
        [0.0476, 0.1192, 0.1171, 0.0731, 0.0477, 0.0966]],
       grad_fn=<MaskedFillBackward0>)
tensor([[1.0000, 0.0000, 0.00

tensor([[-0.4519,  0.2216],
        [-0.5874,  0.0058],
        [-0.6300, -0.0632],
        [-0.5675, -0.0843],
        [-0.5526, -0.0981],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)

## Creating dropouts

In [8]:
ones_mat = torch.ones(6,6)
print(ones_mat)
dropout = torch.nn.Dropout(0.5)
print(dropout(ones_mat))

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])
tensor([[0., 0., 2., 2., 0., 2.],
        [2., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 2., 0.],
        [0., 2., 2., 2., 2., 0.],
        [0., 0., 0., 0., 0., 2.],
        [2., 2., 0., 0., 2., 2.]])


In [9]:
## with dropouts
from torch import nn
torch.manual_seed(123)
class causualAttentionb_v3(nn.Module) : 
    def __init__(self, d_in, d_out, qkv_bias=False) : 
        super().__init__()
        # nn.Linear also creates a trainable matrix of in_dim X out_dim
        self.W_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qkv_bias)
        
    def forward(self, x) :
#         x is input : 
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)
        
        attn_scores = queries @ keys.T
#         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
        input_size = inputs.shape[0]
        mask = torch.triu(torch.ones(input_size,input_size), diagonal = 1)
#         print(mask)
        masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
#         print(masked)
        
        masked_attn_weights = torch.softmax(masked / keys.shape[-1]**0.5, dim = -1)
        print(masked_attn_weights)
        masked_attn_weights = dropout(masked_attn_weights)
        print(masked_attn_weights)
        
        context_vectors = masked_attn_weights @ values
        return context_vectors

In [10]:
c_attn = causualAttentionb_v3(3,2)
c_attn(inputs)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4833, 0.5167, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3190, 0.3408, 0.3402, 0.0000, 0.0000, 0.0000],
        [0.2445, 0.2545, 0.2542, 0.2468, 0.0000, 0.0000],
        [0.1994, 0.2060, 0.2058, 0.1935, 0.1953, 0.0000],
        [0.1624, 0.1709, 0.1706, 0.1654, 0.1625, 0.1682]],
       grad_fn=<SoftmaxBackward0>)
tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9665, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6380, 0.6816, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5090, 0.5085, 0.4936, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3249, 0.3418, 0.0000, 0.0000, 0.3249, 0.3363]],
       grad_fn=<MulBackward0>)


tensor([[ 0.0000,  0.0000],
        [-0.4368,  0.2142],
        [-0.7751,  0.0077],
        [-0.9140, -0.2769],
        [ 0.0000,  0.0000],
        [-0.6906, -0.0974]], grad_fn=<MmBackward0>)

In [11]:
inputs = torch.rand(2,6,3)
inputs.shape
context_size = inputs.shape[1]

In [32]:
## Creating class which handles multiple batches

# input (x) --> [batch_size,tokens, token(dim)]
# no of batches and batch_size are different
# batch_size denotes the number of inputs in a single batch
import torch
from torch import nn

class causualAttention_v4(nn.Module) :
    
    def __init__(self,d_in, d_out,context_length,dropout,qky_bias=False) :
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias = qky_bias)
        self.W_key = nn.Linear(d_in, d_out, bias = qky_bias)
        self.W_value = nn.Linear(d_in, d_out, bias = qky_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal = 1))
        
    def forward(self,x) :
        # x : [batch_size, tokens, dim]
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        # keys : [batch_size, tokens , d_out]
        # queries : [batch_size, tokens, d_out]
        # values : [batch_size, tokens, d_out]
        
#         print(keys.shape, queries.shape, values.shape)
        
        attn_scores = queries @ keys.transpose(1,2)
        # attn_scores : [batch_size, token_size, token_size]
#         print(attn_scores)
        
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
#         print(attn_weights)
        
        context_vec = attn_weights @ values
        # context_vec --> [batch_size, tokens, dim_out]
        print(context_vec.shape)
        return context_vec

        

In [33]:
c_attn = causualAttention_v4(3,2,6,0.5)
c_attn(inputs)

torch.Size([2, 6, 2])


tensor([[[0.3495, 0.0982],
         [0.3668, 0.1288],
         [0.3236, 0.1003],
         [0.3116, 0.1005],
         [0.3158, 0.0991],
         [0.3270, 0.1292]],

        [[0.1181, 0.0051],
         [0.1367, 0.0219],
         [0.2159, 0.0822],
         [0.2124, 0.0461],
         [0.2421, 0.0647],
         [0.2548, 0.0744]]], grad_fn=<UnsafeViewBackward0>)

In [49]:
import torch
from torch import nn

class MultiHeadAttensionWrapper(nn.Module): 
    
    def __init__(self, d_in, d_out, context_length, dropout,num_heads, qkv_bias=False) :
        # d_in : input vector embedding size | 3 here
        # d_out : hyperparam | 2 here
        super().__init__()
        self.heads = nn.ModuleList(
            [causualAttention_v4(d_in, d_out, context_length, dropout,qkv_bias) for _ in range(num_heads)]
        )
        
    def forward(self,x) :
        return torch.cat([head(x) for head in self.heads], dim = -1)
        # head(X) dim --> [6,dim_out]
        # torch.cat --> [6, dim_out * num_heads]
        

In [50]:
multi_head = MultiHeadAttensionWrapper(3,2,6,0.5,2)
multi_head(inputs)

torch.Size([2, 6, 2])
torch.Size([2, 6, 2])


tensor([[[-0.1827,  0.3134,  0.4914,  0.1591],
         [-0.1631,  0.2361,  0.4388,  0.2684],
         [-0.2089,  0.3010,  0.4037,  0.0925],
         [-0.2761,  0.3381,  0.3294, -0.0102],
         [-0.2722,  0.3446,  0.3584,  0.0198],
         [-0.2925,  0.2896,  0.2787,  0.0847]],

        [[-0.1627,  0.2677,  0.2235, -0.1434],
         [-0.1674,  0.2483,  0.2118, -0.0999],
         [-0.2549,  0.2496,  0.1786, -0.0239],
         [-0.2192,  0.3143,  0.2975, -0.0682],
         [-0.2348,  0.3123,  0.3053, -0.0321],
         [-0.2277,  0.2986,  0.3099,  0.0035]]], grad_fn=<CatBackward0>)

In [51]:
## Torch cat
## torch.cat(tensors, dimension : <dim>)
# tensors must have same shape except the cat dimension
# Example : 
x = torch.randn(3,2)
print(x)
x1 = torch.cat((x,x), dim = 0)
x2 = torch.cat((x,x), dim = 1)
print(x1.shape, x2.shape)
print(x1,x2)

tensor([[-1.0821,  0.3351],
        [ 2.2381, -0.5132],
        [ 0.2942,  2.0296]])
torch.Size([6, 2]) torch.Size([3, 4])
tensor([[-1.0821,  0.3351],
        [ 2.2381, -0.5132],
        [ 0.2942,  2.0296],
        [-1.0821,  0.3351],
        [ 2.2381, -0.5132],
        [ 0.2942,  2.0296]]) tensor([[-1.0821,  0.3351, -1.0821,  0.3351],
        [ 2.2381, -0.5132,  2.2381, -0.5132],
        [ 0.2942,  2.0296,  0.2942,  2.0296]])
