In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
class MaskedSelfAttention(nn.Module):

    def __init__(self, d_model = 2,
                 row_dim = 0, col_dim = 1):
        # here d_model means word embedding size
        # and row_dim and col_dim are used to access the row and column of the matrix
        
        super().__init__()
        
        
        # generated the weights for the query, key and value matrices
        self.W_q = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)
        self.W_k = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)
        self.W_v = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)
        self.row_dim = row_dim
        self.col_dim = col_dim
        # row in vertical direction and column in horizontal direction
        # [[1,2,3],  |
        #  [4,5,6],  |dim0 means row count ----> this means col
        #  [7,8,9]]  |
        
    def forward(self,token_encodings,mask = None):
        # created teh query, key and value matrices from the token encodings X weights
        q = self.W_q(token_encodings)
        k = self.W_k(token_encodings)
        v = self.W_v(token_encodings)
        
        # calculate the similarity between the query and key matrices
        sims = torch.matmul(q,k.transpose(dim0=self.row_dim,dim1=self.col_dim))
        
        scaled_sims = sims / torch.tensor(k.size(self.col_dim)**0.5)
        
        # we need to add the mask to the similarity matrix
        # # this is done to avoid the attention to the upcoming tokens in the sequence
        
        if mask is not None:
            # here we are adding a very large negative number to the similarity matrix
            # so that when we apply softmax, it will ignore the masked tokens
            scaled_sims = scaled_sims.masked_fill(mask == mask,value=-1e9)
        
        
        # apply softmax to the similarity matrix to get the attention percentages
        # here self.col_dim is used to access the column of the matrix which is in the horizontal direction
        attention_percentages = F.softmax(scaled_sims, dim = self.col_dim)
        
        # now calculate the attention weighted sum of the value matrix
        attention_score = torch.matmul(attention_percentages,v)
        
        return attention_score
        
        
        
        

In [4]:
## create a matrix of token encodings...
encodings_matrix = torch.tensor([[1.16, 0.23],
                                 [0.57, 1.36],
                                 [4.41, -2.16]])
maskedSelfAttention = MaskedSelfAttention(d_model=2,
                               row_dim=0,
                               col_dim=1)
mask = torch.tril(torch.ones(3,3))
mask = mask == 0
print(maskedSelfAttention(encodings_matrix,mask))

tensor([[ 0.4239, -0.8639],
        [ 0.4239, -0.8639],
        [ 0.4239, -0.8639]], grad_fn=<MmBackward0>)
