In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [9]:
class MaskedAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an optional mask.

    Queries, keys and values are produced from the same input by three
    bias-free ``nn.Linear`` layers of size ``d_model x d_model``.

    Args:
        d_model: Size of each token encoding (input and output feature size).
        row_dim: Index of the token (row) dimension of the input.
        col_dim: Index of the feature (column) dimension of the input. It is
            used as the second axis of the key transpose, as the axis whose
            size gives the scaling factor, and as the softmax axis.

    With the defaults (``row_dim=0``, ``col_dim=1``) the input is a 2-D
    ``(tokens, d_model)`` matrix. For an input with a leading batch dimension
    use ``row_dim=1, col_dim=2``.
    """

    def __init__(self, d_model = 2,
                 row_dim = 0,
                 col_dim = 1):
        super().__init__()
        # Each bias-free linear layer owns a randomly initialised weight
        # matrix and performs the matrix multiplication with the input.
        # Creation order (query, key, value) is kept so that seeding the RNG
        # before construction yields the same weights as before.
        self.W_q = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_k = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.W_v = nn.Linear(in_features=d_model,
                             out_features=d_model,
                             bias=False)
        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, tokenencoddings, mask = None):
        """Compute (optionally masked) self-attention for the given encodings.

        Args:
            tokenencoddings: Floating-point tensor of token encodings whose
                ``col_dim`` axis has size ``d_model``.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix. ``True`` marks positions that must NOT be attended
                to; they receive a very large negative score before softmax.

        Returns:
            Tensor with the same shape as ``tokenencoddings`` holding the
            attention-weighted sums of the value vectors.
        """
        q = self.W_q(tokenencoddings)
        k = self.W_k(tokenencoddings)
        v = self.W_v(tokenencoddings)

        # Similarity of every query with every key.
        sims = torch.matmul(q, k.transpose(self.row_dim, self.col_dim))

        # Scale by sqrt(d_k). A plain Python float is enough here; wrapping it
        # in a tensor only adds an allocation.
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            # -1e9 is not representable in float16, so clamp the fill value to
            # the most negative finite number of the score dtype. For float32
            # and float64 this is still exactly -1e9.
            fill_value = max(-1e9, torch.finfo(scaled_sims.dtype).min)
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=fill_value)

        # Normalise each row of scores into attention weights.
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)

        return torch.matmul(attention_percents, v)
        
        

In [10]:
## Token encodings for the demo: three tokens, two features each
## (the feature count matches d_model=2 below).
encodings_matrix = torch.tensor(
    [
        [1.16, 0.23],
        [0.57, 1.36],
        [4.41, -2.16],
    ]
)

## Seed the RNG *before* building the module so that its randomly
## initialised weights are reproducible.
torch.manual_seed(42)

## Build the masked self-attention module.
maskedSelfAttention = MaskedAttention(d_model=2, row_dim=0, col_dim=1)

## Mask out the tokens that come after the token of interest:
## True marks the entries above the diagonal, which must be ignored.
mask = torch.ones(3, 3).tril().eq(0)
mask  # last expression of the cell, so the mask is displayed

tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])

In [11]:
## Run masked self-attention over the token encodings. The call is the last
## expression of the cell, so the resulting tensor is displayed.
maskedSelfAttention(encodings_matrix, mask=mask)

tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

In [12]:
# Build the mask step by step, printing every intermediate tensor.

# Step 1: a 3x3 matrix filled with ones.
m = torch.ones((3, 3))
print(f"Unit matrix: \n{m}")

# Step 2: keep the lower triangle (including the diagonal), zero the rest.
n = m.tril()
print(f" lower triangular matrix: \n {n}")

# Step 3: the zeroed entries become True, everything else False.
mask = n.eq(0)
print(f"Representation with true and false \n{mask}")

Unit matrix: 
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
 lower triangular matrix: 
 tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
Representation with true and false 
tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])


In [13]:
# Interactive documentation lookup: prints the built-in help text for
# torch.tensor.
help(torch.tensor)

Help on built-in function tensor in module torch:

tensor(...)
    tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
    
    Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`.
    
    
        When working with tensors prefer using :func:`torch.Tensor.clone`,
        :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for
        readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to
        ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)``
        is equivalent to ``t.clone().detach().requires_grad_(True)``.
    
    .. seealso::
    
        :func:`torch.as_tensor` preserves autograd history and avoids copies where possible.
        :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array.
    
    Args:
        data (array_like): Initial data for the tensor. Can be a list, tuple,
      

#### Some Basic Assessment of Torch

In [12]:
# A 2x2x3 integer tensor holding the values 1..12.
a = torch.arange(1, 13).reshape(2, 2, 3)
print(a)

# A 3x3 integer tensor holding the values 1..9.
b = torch.arange(1, 10).reshape(3, 3)
print(b)

print(a.shape)

# The bare expressions below are evaluated, but a notebook only displays the
# value of the last expression in a cell, so they produce no output.
a.shape[2]

torch.rand(2, 2)  # still draws from (and advances) the global RNG

o = torch.ones((3, 3))
o

# Lower-triangular part of the all-ones matrix; shown as the cell's result.
f = o.tril()
f

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
torch.Size([2, 2, 3])


tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])