In [1]:
import torch

# Number of positions in the toy sequence
T = 4

# Stand-in for raw attention scores: a T x T matrix of standard-normal samples
wei = torch.randn(T, T)
print("Original Attention Scores (wei):\n", wei)

# Causal mask: ones on and below the main diagonal, zeros strictly above it
tril = torch.ones(T, T).tril()
print("\nLower-triangular Mask (tril):\n", tril)

# Boolean view of the mask: True exactly where tril holds a zero,
# i.e. strictly above the diagonal
is_future = tril.eq(0)

# Overwrite every flagged score with -inf. masked_fill is out-of-place,
# so wei keeps its original values and the result is a new tensor.
wei_masked = wei.masked_fill(is_future, float("-inf"))

print("\nMasked Attention Scores (wei after masking):\n", wei_masked)

# Softmax over the last dimension (each row). exp(-inf) is 0, so masked
# entries receive zero weight and the remaining entries of a row sum to 1.
attention_weights = wei_masked.softmax(dim=-1)
print("\nNormalized Attention Weights (after softmax):\n", attention_weights)

Original Attention Scores (wei):
 tensor([[-0.9529,  0.0129, -0.1987,  0.5916],
        [ 2.0206, -0.0556, -1.2152,  0.4703],
        [-0.6061, -1.1618, -0.6499,  0.3146],
        [ 0.4922,  0.2849, -1.0224, -0.6005]])

Lower-triangular Mask (tril):
 tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])

Masked Attention Scores (wei after masking):
 tensor([[-0.9529,    -inf,    -inf,    -inf],
        [ 2.0206, -0.0556,    -inf,    -inf],
        [-0.6061, -1.1618, -0.6499,    -inf],
        [ 0.4922,  0.2849, -1.0224, -0.6005]])

Normalized Attention Weights (after softmax):
 tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.8886, 0.1114, 0.0000, 0.0000],
        [0.3951, 0.2267, 0.3782, 0.0000],
        [0.4223, 0.3432, 0.0929, 0.1416]])
