In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [10]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with bias-free projections.

    Queries, keys and values are all derived from the same token encodings
    through three independent ``nn.Linear(d_model, d_model, bias=False)``
    layers.

    Args:
        d_model: Word-embedding size; used as both the input and output
            feature count of the query/key/value projections.
        row_dim: Index of the dimension treated as the rows of the encoding
            matrix (one row per token in the 2-D case).
        col_dim: Index of the dimension treated as the columns of the
            encoding matrix; the softmax is taken along this dimension.
    """

    def __init__(self, d_model = 2,
                 row_dim = 0, col_dim = 1):
        super().__init__()

        # Weight matrices that turn token encodings into queries, keys and values.
        self.W_q = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)
        self.W_k = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)
        self.W_v = nn.Linear(in_features = d_model,
                             out_features = d_model,
                             bias = False)

        # For a 2-D matrix such as
        #   [[1, 2, 3],
        #    [4, 5, 6],
        #    [7, 8, 9]]
        # dim 0 walks down the rows and dim 1 walks across the columns.
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encodings):
        """Compute the attention-weighted values for ``token_encodings``.

        Args:
            token_encodings: Tensor whose last dimension has size ``d_model``
                (required by the linear projections). With the default
                ``row_dim=0, col_dim=1`` this is a 2-D matrix with one row
                per token.

        Returns:
            Tensor of the same shape as ``token_encodings`` in which every
            row is a softmax-weighted mix of the value vectors.
        """
        # Project the token encodings into query, key and value matrices.
        q = self.W_q(token_encodings)
        k = self.W_k(token_encodings)
        v = self.W_v(token_encodings)

        # Similarity of every query with every key: Q @ K^T.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale by sqrt(d_k). A plain Python float is used instead of wrapping
        # the value in torch.tensor(...): the scalar adopts the dtype and
        # device of `sims`, and no new tensor is allocated on every call.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        # Softmax along the column dimension turns each row of similarities
        # into attention percentages that sum to 1.
        attention_percentages = F.softmax(scaled_sims, dim=self.col_dim)

        # Attention output: percentage-weighted sum of the value vectors.
        return torch.matmul(attention_percentages, v)
        
        
        
        
        
        
        

In [11]:
# Four example tokens, each encoded as a 2-dimensional vector (one row per token).
encodings_matrix = torch.tensor([[1.11, 0.23],
                                 [2.34, 0.45],
                                 [3.56, 0.67],
                                 [4.78, 0.89]])

# Seed right before constructing the module so its randomly initialised
# weights (and therefore the printed result) are reproducible.
torch.manual_seed(0)
attention = SelfAttention(d_model=2,
                          row_dim=0,
                          col_dim=1)

# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks before and after dispatching to forward().
print(attention(encodings_matrix))

tensor([[-0.0769, -0.6806],
        [-0.0734, -0.6524],
        [-0.0701, -0.6249],
        [-0.0668, -0.5982]], grad_fn=<MmBackward0>)


In [None]:
# Show the query, key and value weight matrices. nn.Linear stores its weight
# as (out_features, in_features), so each one is transposed before printing.
for projection in (attention.W_q, attention.W_k, attention.W_v):
    print(projection.weight.transpose(0, 1))

tensor([[-0.0053, -0.5820],
        [ 0.3793, -0.5204]], grad_fn=<TransposeBackward0>)
tensor([[-0.2723, -0.0140],
        [ 0.1896,  0.5607]], grad_fn=<TransposeBackward0>)
tensor([[-0.0628, -0.2137],
        [ 0.1871, -0.1390]], grad_fn=<TransposeBackward0>)
