In [1]:
import torch

# Define sequence length and vocab size
seq_len = 6
vocab_size = 10  # Assume tokens are integers from 0 to 9
pad_token = 0  # Padding token is represented as 0

# Step 1: Create random encoder input with some padding
encoder_input = torch.randint(1, vocab_size, (seq_len,))  # Random tokens (excluding pad_token)
encoder_input[2] = pad_token  # Manually add padding at index 2
encoder_input[4] = pad_token  # Manually add padding at index 4

print("Encoder Input:", encoder_input.tolist())

# Step 2: Create encoder mask
encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).int()  # (1, 1, seq_len)

print("Encoder Mask:", encoder_mask.tolist())

# Step 3: Generate dummy attention scores (seq_len x seq_len)
attn_scores = torch.randn(seq_len, seq_len)  # Random attention scores

print("Original Attention Scores:")
print(attn_scores)

# Step 4: Apply mask (set masked positions to a very negative number)
masked_attn_scores = attn_scores.masked_fill(encoder_mask.squeeze(0).squeeze(0) == 0, -1e9)

print("Masked Attention Scores:")
print(masked_attn_scores)


Encoder Input: [4, 9, 0, 8, 0, 6]
Encoder Mask: [[[1, 1, 0, 1, 0, 1]]]
Original Attention Scores:
tensor([[ 1.4993,  0.1686, -1.1113, -1.0623, -0.3536, -1.9742],
        [-0.9570,  0.5242, -0.6316, -0.0038,  1.1087, -0.1498],
        [-0.1731,  1.6809, -0.0410,  0.0450, -0.2899,  0.1282],
        [-1.8678, -0.0308, -0.7427,  0.4536, -0.7229, -1.9170],
        [-1.0089, -0.8133,  1.4476, -1.2438, -0.1860,  0.3591],
        [-1.1489,  2.2726, -0.1667, -1.9493, -1.5931, -0.9043]])
Masked Attention Scores:
tensor([[ 1.4993e+00,  1.6858e-01, -1.0000e+09, -1.0623e+00, -1.0000e+09,
         -1.9742e+00],
        [-9.5704e-01,  5.2424e-01, -1.0000e+09, -3.8383e-03, -1.0000e+09,
         -1.4978e-01],
        [-1.7306e-01,  1.6809e+00, -1.0000e+09,  4.4996e-02, -1.0000e+09,
          1.2824e-01],
        [-1.8678e+00, -3.0787e-02, -1.0000e+09,  4.5358e-01, -1.0000e+09,
         -1.9170e+00],
        [-1.0089e+00, -8.1331e-01, -1.0000e+09, -1.2438e+00, -1.0000e+09,
          3.5905e-01],
       

In [4]:
import torch

def causal_mask(seq_len):
    """Creates a causal mask (upper triangular mask)"""
    mask = torch.tril(torch.ones(seq_len, seq_len))  # Lower triangular matrix (float by default)
    return mask.unsqueeze(0).int()  # Convert to int

# Example decoder input (with padding tokens)
seq_len = 5
pad_token = 0
decoder_input = torch.tensor([3, 7, 0, 5, 9])  # Random sequence with a padding token at index 2

# Padding mask: (1, seq_len)
padding_mask = (decoder_input != pad_token).unsqueeze(0).int()

# Causal mask: (1, seq_len, seq_len) (converted to int)
causal_mask_tensor = causal_mask(seq_len)

# Final decoder mask (1, seq_len, seq_len)
decoder_mask = padding_mask.unsqueeze(-1) & causal_mask_tensor  # Shape (1, seq_len, seq_len)

# Step 1: Generate random attention scores (seq_len x seq_len)
attn_scores = torch.randn(seq_len, seq_len)  # Random values

# Step 2: Apply decoder mask (set masked positions to a very negative number)
masked_attn_scores = attn_scores.masked_fill(decoder_mask.squeeze(0) == 0, -1e9)

# Print outputs
print("Decoder Input:", decoder_input.tolist())
print("Padding Mask:", padding_mask.tolist())
print("Causal Mask:\n", causal_mask_tensor.squeeze(0))
print("Final Decoder Mask:\n", decoder_mask.squeeze(0))
print("\nOriginal Attention Scores:\n", attn_scores)
print("\nMasked Attention Scores:\n", masked_attn_scores)


Decoder Input: [3, 7, 0, 5, 9]
Padding Mask: [[1, 1, 0, 1, 1]]
Causal Mask:
 tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
Final Decoder Mask:
 tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)

Original Attention Scores:
 tensor([[-2.0673,  1.2917,  0.9502,  1.6841,  0.3131],
        [-1.6142,  0.4656,  0.3090, -0.8188, -0.0746],
        [ 0.0351, -0.0609,  0.9087, -0.2348, -0.5654],
        [ 0.0091,  2.3085,  0.2298,  0.4479,  0.9507],
        [ 0.9271, -0.7019, -0.8267,  0.8278, -0.1082]])

Masked Attention Scores:
 tensor([[-2.0673e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.6142e+00,  4.6559e-01, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 9.0505e-03,  2.3085e+00,  2.2975e-01,  4.4795

In [1]:
import torch
import torch.nn.functional as F

# Set seed for reproducibility
torch.manual_seed(42)

# Dummy attention scores: shape (batch, heads, seq_len, seq_len)
batch_size, heads, seq_len = 1, 1, 5
attention_scores = torch.randn(batch_size, heads, seq_len, seq_len)

# Create a binary mask where 1 means valid token and 0 means masked token.
# For example, let's mask out the last two tokens (columns).
mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.bool)  # shape: (seq_len,)
mask = mask.unsqueeze(0).unsqueeze(0)  # shape: (batch, heads, seq_len)

# Expand mask to match attention_scores shape along the last dimension
mask = mask.expand(batch_size, heads, seq_len, seq_len)

# --- Case 1: Using -1e9 for masked values ---
attn_scores_neg_inf = attention_scores.clone()
attn_scores_neg_inf.masked_fill_(mask == 0, -1e9)
softmax_neg_inf = F.softmax(attn_scores_neg_inf, dim=-1)

# --- Case 2: Using 0 for masked values ---
attn_scores_zero = attention_scores.clone()
attn_scores_zero.masked_fill_(mask == 0, 0)
softmax_zero = F.softmax(attn_scores_zero, dim=-1)

print("Original Attention Scores:")
print(attention_scores)
print("\nMask:")
print(mask[0, 0])  # print one example mask for clarity

print("\nSoftmax with -1e9 for masked values:")
print(softmax_neg_inf)

print("\nSoftmax with 0 for masked values:")
print(softmax_zero)


Original Attention Scores:
tensor([[[[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784],
          [-1.2345, -0.0431, -1.6047, -0.7521, -0.6866],
          [-0.4934,  0.2415, -1.1109,  0.0915, -2.3169],
          [-0.2168, -1.3847, -0.3957,  0.8034, -0.6216],
          [-0.5920, -0.0631, -0.8286,  0.3309, -1.5576]]]])

Mask:
tensor([[ True,  True,  True, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True, False, False]])

Softmax with -1e9 for masked values:
tensor([[[[0.4993, 0.3217, 0.1789, 0.0000, 0.0000],
          [0.2007, 0.6607, 0.1386, 0.0000, 0.0000],
          [0.2759, 0.5753, 0.1488, 0.0000, 0.0000],
          [0.4657, 0.1449, 0.3894, 0.0000, 0.0000],
          [0.2868, 0.4868, 0.2264, 0.0000, 0.0000]]]])

Softmax with 0 for masked values:
tensor([[[[0.4360, 0.2809, 0.1562, 0.0635, 0.0635],
          [0.0843, 0.2777, 0.0583, 0.2899, 0.2899],
          [0

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define parameters
batch_size = 2  # Example batch size
seq_length = 512  # Given sequence length
vocab_size = 3000  # Given vocabulary size

# Simulate model output (logits) and labels
proj_output = torch.randn(batch_size, seq_length, vocab_size)  # Model logits
label = torch.randint(0, vocab_size, (batch_size, seq_length))  # Ground truth labels

# Implement CrossEntropyLoss from scratch
def custom_cross_entropy_loss(logits, targets):
    log_probs = F.log_softmax(logits, dim=-1)  # Apply log softmax
    nll_loss = -log_probs[range(log_probs.shape[0]), targets]  # Negative log likelihood loss
    return nll_loss.mean()  # Mean loss

# Reshape for loss computation
proj_output_reshaped = proj_output.view(-1, vocab_size)  # Shape: (batch_size * seq_length, vocab_size)
label_reshaped = label.view(-1)  # Shape: (batch_size * seq_length)

# Compute loss using custom CrossEntropyLoss
loss = custom_cross_entropy_loss(proj_output_reshaped, label_reshaped)

print("Loss:", loss.item())


Loss: 8.514948844909668
