In [2]:
import torch ## torch let's us create a tensor
import torch.nn as nn ## torch.nn gives us nn.module() and nn.Linear()
import torch.nn.functional as f ## that gives us the softmax

#### nn.Module is the base class for all neural network modules that you make with PyTorch.
#### d_model is the dimension of the model, i.e. the number of word-embedding values per token. We will use d_model to define the size of all the weight matrices that we'll use to create the queries, keys and values.


In [9]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The token encodings are projected into queries, keys and values by three
    bias-free linear layers, and the module returns
    ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        d_model: Number of embedding values per token. Each of the three
            weight matrices is ``d_model x d_model``.
        row_dim: Index of the token (row) dimension of the input.
        col_dim: Index of the embedding (column) dimension of the input.
            For a single ``(tokens, d_model)`` matrix use ``0`` and ``1``;
            for a batched ``(batch, tokens, d_model)`` input use ``1`` and ``2``.
    """

    def __init__(self, d_model=2,
                 row_dim = 0,
                 col_dim = 1):
        super().__init__()
        # nn.Linear both stores the (initially untrained) weights and performs
        # the matrix multiplication when the layer is called.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encodings):
        """Compute self-attention for ``token_encodings``.

        Args:
            token_encodings: Tensor whose ``row_dim`` indexes tokens and whose
                ``col_dim`` indexes the ``d_model`` embedding values.

        Returns:
            Tensor with the same shape as ``token_encodings``: for every token,
            the attention-weighted sum of the value vectors.
        """
        q = self.W_q(token_encodings)
        k = self.W_k(token_encodings)
        v = self.W_v(token_encodings)

        # Similarity of every query with every key: (tokens x d) @ (d x tokens).
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        # Scale by sqrt(d_k). A plain Python float is used so that no tensor is
        # allocated per call and the divisor is not rounded to float32 when the
        # module runs in float64.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        # Each row becomes a set of weights that sum to 1 across the keys.
        attention_percents = f.softmax(scaled_sims, dim=self.col_dim)

        return torch.matmul(attention_percents, v)



In [10]:
# Fix the RNG so the randomly initialised weights (and the output) are reproducible.
torch.manual_seed(42)

# Three tokens, two embedding values each.
encodings_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

# Bare last expression: display the self-attention values for the three tokens.
selfAttention(encodings_matrix)


tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [12]:
# Full shape of the encodings: (number of tokens, embedding values per token).
encodings_matrix.shape

torch.Size([3, 2])

In [13]:
# Length of dimension 1, i.e. the number of embedding values per token.
encodings_matrix.shape[1]

2

In [17]:
# Untrained (randomly initialised) key weights, transposed.
# transpose(0, 1) swaps dimension 0 (rows) with dimension 1 (columns). The
# encodings are multiplied by this transposed matrix, not by the weight as
# stored: matmul(encodings_matrix, weight.transpose(0, 1)) reproduces
# selfAttention.W_k(encodings_matrix).
selfAttention.W_k.weight.transpose(0,1)

tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)

In [24]:
# Swapping dims (1, 0) is the same operation as swapping (0, 1): the order of
# the two arguments does not matter, so the result is the same transposed matrix.
selfAttention.W_k.weight.transpose(1,0)

tensor([[-0.1549, -0.3443],
        [ 0.1427,  0.4153]], grad_fn=<TransposeBackward0>)

In [20]:
# Key values for each token, computed with the (still untrained) W_k weights
selfAttention.W_k(encodings_matrix)

tensor([[-0.1469, -0.3038],
        [ 0.1057,  0.3685],
        [-0.9914, -2.4152]], grad_fn=<MmBackward0>)

In [23]:
# Manual equivalent of selfAttention.W_k(encodings_matrix): encodings @ W_k^T.
# NOTE: the result holds the *key* values even though the variable is named `q`.
q = torch.matmul(encodings_matrix, selfAttention.W_k.weight.transpose(0, 1))
q  # bare last expression so the cell actually displays the tensor

tensor([[-0.1469, -0.3038],
        [ 0.1057,  0.3685],
        [-0.9914, -2.4152]], grad_fn=<MmBackward0>)

In [16]:
# Raw key weight parameter exactly as nn.Linear stores it (not transposed)
selfAttention.W_k.weight

Parameter containing:
tensor([[-0.1549,  0.1427],
        [-0.3443,  0.4153]], requires_grad=True)