In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Args:
        d_model: Number of features per token encoding. Each of the three
            bias-free linear projections maps ``d_model -> d_model``.
        row_dim: Index of the token (row) dimension of the inputs.
        col_dim: Index of the feature (column) dimension of the inputs.
    """

    def __init__(self, d_model =2,
                 row_dim =0,
                 col_dim =1):
        super().__init__()
        # Learned projections that turn token encodings into queries, keys and values.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, encodings_for_q,encodings_for_v,encodings_for_k,mask = None):
        """Compute ``softmax(Q K^T / sqrt(d)) V`` for the given encodings.

        NOTE: the positional order is (query, VALUE, KEY) -- value comes
        before key, which is the reverse of the usual (q, k, v) convention.
        Pass the arguments by keyword to avoid silently swapping keys and values.

        Args:
            encodings_for_q: Encodings projected into queries.
            encodings_for_v: Encodings projected into values.
            encodings_for_k: Encodings projected into keys.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix; positions that are True receive no attention.

        Returns:
            The attention-weighted combination of the projected values.
        """
        q = self.W_q(encodings_for_q)  # query matrix
        k = self.W_k(encodings_for_k)  # key matrix
        v = self.W_v(encodings_for_v)  # value matrix

        # Similarity of every query with every key.
        sims = torch.matmul(q, k.transpose(self.row_dim, self.col_dim))

        # Divide by sqrt(key width). A plain Python float keeps full precision
        # for any tensor dtype and avoids allocating a float32 tensor per call.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # Fill with the most negative finite value of the working dtype so
            # the fill is representable in reduced precision as well; softmax
            # maps it to zero weight.
            fill_value = torch.finfo(scaled_sims.dtype).min
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=fill_value)

        # Normalise each query's similarities into attention weights.
        attention_percentage = F.softmax(scaled_sims, dim=self.col_dim)
        return torch.matmul(attention_percentage, v)
                 

#### Calculate Encoder-Decoder Attention

In [7]:
## create matrices of token encodings (3 tokens, 2 features each);
## the same values are used for the queries, keys and values
encodings_for_q = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

encodings_for_k = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

encodings_for_v = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

## set the seed so the randomly initialised projection weights are reproducible
torch.manual_seed(42)

## create an attention object
attention = Attention(d_model=2,
                      row_dim=0,
                      col_dim=1)

## calculate encoder-decoder attention
## (pass by keyword: Attention.forward declares its parameters in the order
##  q, v, k, so a positional q, k, v call would swap the keys and values)
attention(encodings_for_q=encodings_for_q,
          encodings_for_k=encodings_for_k,
          encodings_for_v=encodings_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

### Calculate Multi-Head Attention

In [10]:
class MultiHeadAttention(nn.Module):
    """Runs several independent ``Attention`` heads and concatenates their outputs.

    Args:
        d_model: Number of features per token encoding, passed to every head.
        row_dim: Index of the token (row) dimension, passed to every head.
        col_dim: Index of the feature (column) dimension; also the dimension
            along which the head outputs are concatenated.
        number_heads: How many attention heads to create.
    """

    def __init__(self,
                 d_model =2,
                 row_dim =0,
                 col_dim =1,
                 number_heads = 1):
        super().__init__()

        self.col_dim = col_dim
        # Every head gets the same hyperparameters but builds its own,
        # separately initialised projection weights.
        self.heads = nn.ModuleList(
            [Attention(d_model, row_dim, col_dim)
             for _ in range(number_heads)]
        )

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Apply every head to the same inputs and concatenate along ``col_dim``.

        Args:
            encodings_for_q: Encodings used to build the queries.
            encodings_for_k: Encodings used to build the keys.
            encodings_for_v: Encodings used to build the values.
            mask: Optional boolean mask handed unchanged to every head.

        Returns:
            The head outputs joined along ``col_dim``.
        """
        # Heads are called by keyword so each tensor reaches the projection it
        # is named for, whatever positional order the head's forward() declares.
        head_outputs = [
            head(encodings_for_q=encodings_for_q,
                 encodings_for_k=encodings_for_k,
                 encodings_for_v=encodings_for_v,
                 mask=mask)
            for head in self.heads
        ]
        return torch.cat(head_outputs, dim=self.col_dim)
        
                 

In [14]:
## fix the RNG seed so the heads' random initial weights are reproducible
torch.manual_seed(42)

## build a multi-head attention module with two heads
multiHeadAttention = MultiHeadAttention(d_model=2, row_dim=0, col_dim=1, number_heads=2)

## run the encodings through every head; the head outputs are concatenated along col_dim
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
        [ 0.2040,  0.7057, -0.7417, -0.9193],
        [ 3.4989,  2.2427, -0.7190, -0.8447]], grad_fn=<CatBackward0>)

In [15]:
## fix the RNG seed so the head's random initial weights are reproducible
torch.manual_seed(42)

## build a multi-head attention module with a single head
multiHeadAttention = MultiHeadAttention(d_model=2, row_dim=0, col_dim=1, number_heads=1)

## run the encodings through the single head
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)