In [1]:
import os

# Export the CUDA allocator option *before* importing torch so that it is
# already present in the environment whenever PyTorch reads it.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch ## torch lets us create tensors and also provides helper functions
import torch.nn as nn ## torch.nn gives us nn.Module and nn.Linear
import torch.nn.functional as F ## this gives us softmax()


In [2]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected by three bias-free linear layers into queries,
    keys and values. Every query is scored against every key, the scores are
    divided by the square root of the key width, a softmax is taken along
    ``col_dim`` and the result is used to form a weighted sum of the values.

    Args:
        dm: Width of each input row; all three projections map ``dm -> dm``.
        row_dim: Index of the dimension that enumerates rows (positions).
        col_dim: Index of the dimension that enumerates columns (features).

    The defaults (``row_dim=0, col_dim=1``) fit a single 2-D input of shape
    ``(rows, dm)``. For a batched ``(batch, rows, dm)`` input construct the
    layer with ``row_dim=1, col_dim=2``.
    """

    def __init__(self, dm=10,row_dim=0, col_dim=1):
        super().__init__()

        # Bias-free projections producing queries, keys and values.
        self.WQ = nn.Linear(in_features=dm, out_features=dm, bias=False)
        self.WK = nn.Linear(in_features=dm, out_features=dm, bias=False)
        self.WV = nn.Linear(in_features=dm, out_features=dm, bias=False)

        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, encoding):
        """Return the attention-weighted values for ``encoding``.

        Args:
            encoding: Tensor whose ``col_dim`` dimension has size ``dm``.

        Returns:
            Tensor with the same shape as ``encoding``.
        """
        Q = self.WQ(encoding)
        K = self.WK(encoding)
        V = self.WV(encoding)

        # Keep the scale a plain Python float: it then adopts the dtype and
        # device of the scores, instead of allocating a float32 CPU tensor on
        # every call (which also rounded the scale for float64 inputs).
        scale = K.size(self.col_dim) ** 0.5
        similarity = Q.matmul(K.transpose(dim0=self.row_dim, dim1=self.col_dim)) / scale

        # Normalise each query's scores over the keys.
        similarity = F.softmax(similarity, dim=self.col_dim)

        return similarity.matmul(V)





In [3]:
# Side length of the square demo input: n rows, each n features wide.
n = 5000

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fixed seed so the random encodings are reproducible from run to run.
torch.manual_seed(42)
encodings = torch.rand((n, n))
print(encodings.shape)

torch.Size([5000, 5000])


In [4]:
# dm = n so the projections match the width of `encodings`. The layer and its
# input are both placed on `device`, so the GPU is actually used when one was
# detected (on a CPU-only machine `.to(device)` leaves them where they are).
self_attention = SelfAttention(dm=n).to(device)
self_attention(encodings.to(device))

tensor([[ 0.4454,  0.2774, -0.3229,  ..., -0.2133, -0.1296, -0.8072],
        [ 0.4455,  0.2774, -0.3231,  ..., -0.2133, -0.1296, -0.8069],
        [ 0.4454,  0.2775, -0.3231,  ..., -0.2134, -0.1297, -0.8070],
        ...,
        [ 0.4454,  0.2776, -0.3229,  ..., -0.2134, -0.1296, -0.8069],
        [ 0.4456,  0.2774, -0.3230,  ..., -0.2135, -0.1297, -0.8072],
        [ 0.4455,  0.2773, -0.3231,  ..., -0.2134, -0.1297, -0.8070]],
       grad_fn=<MmBackward0>)