In [1]:
import torch

# Define sequence length and vocab size
seq_len = 6
vocab_size = 10  # Assume tokens are integers from 0 to 9
pad_token = 0  # Padding token is represented as 0

# Step 1: Create random encoder input with some padding
encoder_input = torch.randint(1, vocab_size, (seq_len,))  # Random tokens (excluding pad_token)
encoder_input[2] = pad_token  # Manually add padding at index 2
encoder_input[4] = pad_token  # Manually add padding at index 4

print("Encoder Input:", encoder_input.tolist())

# Step 2: Create encoder mask
encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).int()  # (1, 1, seq_len)

print("Encoder Mask:", encoder_mask.tolist())

# Step 3: Generate dummy attention scores (seq_len x seq_len)
attn_scores = torch.randn(seq_len, seq_len)  # Random attention scores

print("Original Attention Scores:")
print(attn_scores)

# Step 4: Apply mask (set masked positions to a very negative number)
masked_attn_scores = attn_scores.masked_fill(encoder_mask.squeeze(0).squeeze(0) == 0, -1e9)

print("Masked Attention Scores:")
print(masked_attn_scores)


Encoder Input: [4, 9, 0, 8, 0, 6]
Encoder Mask: [[[1, 1, 0, 1, 0, 1]]]
Original Attention Scores:
tensor([[ 1.4993,  0.1686, -1.1113, -1.0623, -0.3536, -1.9742],
        [-0.9570,  0.5242, -0.6316, -0.0038,  1.1087, -0.1498],
        [-0.1731,  1.6809, -0.0410,  0.0450, -0.2899,  0.1282],
        [-1.8678, -0.0308, -0.7427,  0.4536, -0.7229, -1.9170],
        [-1.0089, -0.8133,  1.4476, -1.2438, -0.1860,  0.3591],
        [-1.1489,  2.2726, -0.1667, -1.9493, -1.5931, -0.9043]])
Masked Attention Scores:
tensor([[ 1.4993e+00,  1.6858e-01, -1.0000e+09, -1.0623e+00, -1.0000e+09,
         -1.9742e+00],
        [-9.5704e-01,  5.2424e-01, -1.0000e+09, -3.8383e-03, -1.0000e+09,
         -1.4978e-01],
        [-1.7306e-01,  1.6809e+00, -1.0000e+09,  4.4996e-02, -1.0000e+09,
          1.2824e-01],
        [-1.8678e+00, -3.0787e-02, -1.0000e+09,  4.5358e-01, -1.0000e+09,
         -1.9170e+00],
        [-1.0089e+00, -8.1331e-01, -1.0000e+09, -1.2438e+00, -1.0000e+09,
          3.5905e-01],
       

In [3]:
import torch

def causal_mask(seq_len):
    """Creates a causal mask (upper triangular mask)"""
    mask = torch.tril(torch.ones(seq_len, seq_len))  # Lower triangular matrix (float by default)
    return mask.unsqueeze(0).int()  # Convert to int

# Example decoder input (with padding tokens)
seq_len = 5
pad_token = 0
decoder_input = torch.tensor([3, 7, 0, 5, 9])  # Random sequence with a padding token at index 2

# Padding mask: (1, seq_len)
padding_mask = (decoder_input != pad_token).unsqueeze(0).int()

# Causal mask: (1, seq_len, seq_len) (converted to int)
causal_mask_tensor = causal_mask(seq_len)

# Apply bitwise AND to combine padding mask and causal mask
decoder_mask = padding_mask.unsqueeze(-1) & causal_mask_tensor  # Shape (1, seq_len, seq_len)

print("Decoder Input:", decoder_input.tolist())
print("Padding Mask:", padding_mask.tolist())
print("Causal Mask:\n", causal_mask_tensor.squeeze(0))
print("Final Decoder Mask:\n", decoder_mask.squeeze(0))


Decoder Input: [3, 7, 0, 5, 9]
Padding Mask: [[1, 1, 0, 1, 1]]
Causal Mask:
 tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
Final Decoder Mask:
 tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
