In [1]:
# Standard library
import json
import sys

# Make the parent directory importable so that the `src` package resolves.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

# Third-party
import torch
import torch.nn as nn

# Load model components from src/
from src.model import Transformer, PositionalEncoding
from src.layers import EncoderLayer, DecoderLayer, FeedForward
from src.attention import MultiHeadAttention, scaled_dot_product_attention

print("✅ Successfully imported all model components!")

✅ Successfully imported all model components!


## 1. Scaled Dot-Product Attention

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

In [2]:
# Smoke-test scaled_dot_product_attention on random inputs, with and without a
# mask, and verify the results instead of only printing them.
print("Testing scaled_dot_product_attention...")

# Create dummy inputs
batch_size = 2
num_heads = 8
seq_len = 10
d_k = 64

Q = torch.randn(batch_size, num_heads, seq_len, d_k)
K = torch.randn(batch_size, num_heads, seq_len, d_k)
V = torch.randn(batch_size, num_heads, seq_len, d_k)

# Test without mask
output, attn_weights = scaled_dot_product_attention(Q, K, V)
print(f"✅ Output shape: {output.shape}")
print(f"✅ Attention weights shape: {attn_weights.shape}")

# The output keeps the shape of V; the weights hold one score per (query, key) pair.
assert output.shape == V.shape, f"unexpected output shape {tuple(output.shape)}"
assert attn_weights.shape == (batch_size, num_heads, seq_len, seq_len), (
    f"unexpected attention weights shape {tuple(attn_weights.shape)}"
)
# Per the formula above, the weights are a softmax over the key axis, so every
# row must sum to 1.
row_sums = attn_weights.sum(dim=-1)
assert torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-5), (
    "attention weights are not normalised over the key dimension"
)

# Test with mask
# NOTE(review): an all-True mask only checks that the mask argument is accepted
# and broadcast; it cannot show that masked positions are actually ignored.
# Confirm the mask convention in src/attention.py before adding a stronger check.
mask = torch.ones(batch_size, 1, 1, seq_len, dtype=torch.bool)
output_masked, attn_weights_masked = scaled_dot_product_attention(Q, K, V, mask)
print(f"✅ Masked output shape: {output_masked.shape}")

assert output_masked.shape == V.shape, (
    f"unexpected masked output shape {tuple(output_masked.shape)}"
)

Testing scaled_dot_product_attention...
✅ Output shape: torch.Size([2, 8, 10, 64])
✅ Attention weights shape: torch.Size([2, 8, 10, 10])
✅ Masked output shape: torch.Size([2, 8, 10, 64])


## 2. Multi-Head Attention

In [3]:
# Smoke-test MultiHeadAttention as self-attention (query = key = value) and
# verify that the input shape is preserved.
print("Testing MultiHeadAttention...")

d_model = 512
num_heads = 8
batch_size = 2
seq_len = 10

mha = MultiHeadAttention(d_model, num_heads)

# Create dummy inputs
x = torch.randn(batch_size, seq_len, d_model)
output, attn_weights = mha(x, x, x)

print(f"✅ Input shape: {x.shape}")
print(f"✅ Output shape: {output.shape}")
print(f"✅ Parameters: {sum(p.numel() for p in mha.parameters()):,}")

# Fail loudly instead of relying on a visual comparison of the printed shapes.
assert output.shape == x.shape, (
    f"MultiHeadAttention changed the shape: {tuple(x.shape)} -> {tuple(output.shape)}"
)

Testing MultiHeadAttention...
✅ Input shape: torch.Size([2, 10, 512])
✅ Output shape: torch.Size([2, 10, 512])
✅ Parameters: 1,050,624


## 3. Positional Encoding

$$PE_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$
$$PE_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

In [4]:
# Smoke-test PositionalEncoding: apply it to a random batch, verify that the
# shape is preserved, and preview a corner of the encoding table.
print("Testing PositionalEncoding...")

d_model = 512
max_len = 128
batch_size = 2
seq_len = 10

pe = PositionalEncoding(d_model, max_len)

# Create dummy input
x = torch.randn(batch_size, seq_len, d_model)
output = pe(x)

print(f"✅ Input shape: {x.shape}")
print(f"✅ Output shape: {output.shape}")

# Fail loudly instead of relying on a visual comparison of the printed shapes.
assert output.shape == x.shape, (
    f"PositionalEncoding changed the shape: {tuple(x.shape)} -> {tuple(output.shape)}"
)

# Preview a 5 x 5 slice of the module's `pe` table (index 0 of its leading axis).
print("✅ First 5 positional encoding values:")
print(pe.pe[0, :5, :5])

Testing PositionalEncoding...
✅ Input shape: torch.Size([2, 10, 512])
✅ Output shape: torch.Size([2, 10, 512])
✅ First 5 positional encoding values:
tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.8219,  0.5697,  0.8020],
        [ 0.9093, -0.4161,  0.9364, -0.3509,  0.9581],
        [ 0.1411, -0.9900,  0.2451, -0.9695,  0.3428],
        [-0.7568, -0.6536, -0.6572, -0.7537, -0.5486]])


## 4. Feed-Forward Network

In [5]:
# Smoke-test the FeedForward block and verify that the input shape is preserved.
print("Testing FeedForward Network...")

d_model = 512
d_ff = 2048
batch_size = 2
seq_len = 10

ffn = FeedForward(d_model, d_ff)

# Create dummy input
x = torch.randn(batch_size, seq_len, d_model)
output = ffn(x)

print(f"✅ Input shape: {x.shape}")
print(f"✅ Output shape: {output.shape}")
print(f"✅ Parameters: {sum(p.numel() for p in ffn.parameters()):,}")

# Fail loudly instead of relying on a visual comparison of the printed shapes.
assert output.shape == x.shape, (
    f"FeedForward changed the shape: {tuple(x.shape)} -> {tuple(output.shape)}"
)

Testing FeedForward Network...
✅ Input shape: torch.Size([2, 10, 512])
✅ Output shape: torch.Size([2, 10, 512])
✅ Parameters: 2,099,712


## 5. Encoder Layer

In [6]:
# Smoke-test a single EncoderLayer and verify that the input shape is preserved.
print("Testing EncoderLayer...")

d_model = 512
num_heads = 8
d_ff = 2048
batch_size = 2
seq_len = 10

encoder_layer = EncoderLayer(d_model, num_heads, d_ff)

# Create dummy input
x = torch.randn(batch_size, seq_len, d_model)
output = encoder_layer(x)

print(f"✅ Input shape: {x.shape}")
print(f"✅ Output shape: {output.shape}")
print(f"✅ Parameters: {sum(p.numel() for p in encoder_layer.parameters()):,}")

# Fail loudly instead of relying on a visual comparison of the printed shapes.
assert output.shape == x.shape, (
    f"EncoderLayer changed the shape: {tuple(x.shape)} -> {tuple(output.shape)}"
)

Testing EncoderLayer...
✅ Input shape: torch.Size([2, 10, 512])
✅ Output shape: torch.Size([2, 10, 512])
✅ Parameters: 3,152,384


## 6. Decoder Layer

In [7]:
# Smoke-test a single DecoderLayer with different source and target lengths and
# verify that the output follows the target shape.
print("Testing DecoderLayer...")

d_model = 512
num_heads = 8
d_ff = 2048
batch_size = 2
src_len = 10
tgt_len = 8

decoder_layer = DecoderLayer(d_model, num_heads, d_ff)

# Create dummy inputs
tgt = torch.randn(batch_size, tgt_len, d_model)
encoder_output = torch.randn(batch_size, src_len, d_model)

output = decoder_layer(tgt, encoder_output)

print(f"✅ Target shape: {tgt.shape}")
print(f"✅ Encoder output shape: {encoder_output.shape}")
print(f"✅ Decoder output shape: {output.shape}")
print(f"✅ Parameters: {sum(p.numel() for p in decoder_layer.parameters()):,}")

# The decoder output must follow the target sequence, not the (longer) source.
assert output.shape == tgt.shape, (
    f"DecoderLayer output shape {tuple(output.shape)} does not match "
    f"target shape {tuple(tgt.shape)}"
)

Testing DecoderLayer...
✅ Target shape: torch.Size([2, 8, 512])
✅ Encoder output shape: torch.Size([2, 10, 512])
✅ Decoder output shape: torch.Size([2, 8, 512])
✅ Parameters: 4,204,032


## 7. Full Transformer Model

In [8]:
# Load tokenizer config
print("Loading tokenizer configuration...")
# Pin the encoding: JSON files are UTF-8, and the platform default used by
# open() is not UTF-8 everywhere.
with open('../data/processed/tokenizer_info.json', 'r', encoding='utf-8') as f:
    tokenizer_info = json.load(f)

# Vocabulary sizes and maximum sequence length come from the tokenizer config
# (source vocabulary = 'vi_*' key, target vocabulary = 'en_*' key).
src_vocab_size = tokenizer_info['vi_vocab_size']
tgt_vocab_size = tokenizer_info['en_vocab_size']
max_len = tokenizer_info['max_length']

print(f"✅ Source vocab size: {src_vocab_size}")
print(f"✅ Target vocab size: {tgt_vocab_size}")
print(f"✅ Max length: {max_len}")

# Initialize Transformer
print("\nInitializing Transformer model...")
model = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=512,
    num_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    d_ff=2048,
    max_len=max_len,
    dropout=0.1,
    pad_idx=tokenizer_info['pad_id']
)

print("✅ Model created successfully!")

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("\n📊 Model Statistics:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
# Size estimate: 4 bytes per parameter (fp32), divided by 1024**2.
print(f"   Model size: ~{total_params * 4 / (1024**2):.2f} MB (fp32)")

Loading tokenizer configuration...
✅ Source vocab size: 32000
✅ Target vocab size: 32000
✅ Max length: 128

Initializing Transformer model...
✅ Model created successfully!

📊 Model Statistics:
   Total parameters: 93,324,544
   Trainable parameters: 93,324,544
   Model size: ~356.00 MB (fp32)


## 8. Model Testing

In [9]:
# Test full model forward pass
print("Testing full Transformer forward pass...")

batch_size = 4
src_len = 20
tgt_len = 15

# Create dummy inputs (random token IDs)
src = torch.randint(0, src_vocab_size, (batch_size, src_len))
tgt = torch.randint(0, tgt_vocab_size, (batch_size, tgt_len))

print(f"Source shape: {src.shape}")
print(f"Target shape: {tgt.shape}")

# Forward pass
# Only the output shape is inspected, so skip autograd bookkeeping for this
# pass through the full model.
with torch.no_grad():
    output = model(src, tgt)

print(f"\n✅ Output shape: {output.shape}")
print(f"   Expected: [batch_size={batch_size}, tgt_len={tgt_len}, vocab_size={tgt_vocab_size}]")

# Enforce the expectation so that the success banner below is only reached
# when the shape really matches.
expected_shape = (batch_size, tgt_len, tgt_vocab_size)
assert output.shape == expected_shape, (
    f"unexpected output shape {tuple(output.shape)}, expected {expected_shape}"
)

# Test with masks (should fail, for demonstration)
# The TypeError is the point of this demonstration, so it is caught and
# displayed rather than left to abort the notebook.
print("\nTesting with masks (should fail, do not use extra mask arguments)...")
try:
    src_mask = (src != tokenizer_info['pad_id'])
    tgt_mask = (tgt != tokenizer_info['pad_id'])
    with torch.no_grad():
        output_masked = model(src, tgt, src_mask, tgt_mask)
    print(f"✅ Masked output shape: {output_masked.shape}")
except TypeError as e:
    print(f"❌ Error: {e}")
    print("Do NOT pass masks to model; only pass src and tgt. Masks are handled internally.")

print("\n" + "=" * 60)
print("MODEL BUILDING COMPLETE!")
print("=" * 60)
print("✅ All components working correctly")
print("✅ Model ready for training")
print(f"✅ Total parameters: {total_params:,}")
print("\n📌 Next step: Open 04_training.ipynb to train the model")

Testing full Transformer forward pass...
Source shape: torch.Size([4, 20])
Target shape: torch.Size([4, 15])

✅ Output shape: torch.Size([4, 15, 32000])
   Expected: [batch_size=4, tgt_len=15, vocab_size=32000]

Testing with masks (should fail, do not use extra mask arguments)...
❌ Error: Transformer.forward() takes 3 positional arguments but 5 were given
Do NOT pass masks to model; only pass src and tgt. Masks are handled internally.

============================================================
MODEL BUILDING COMPLETE!
============================================================
✅ All components working correctly
✅ Model ready for training
✅ Total parameters: 93,324,544

📌 Next step: Open 04_training.ipynb to train the model
