In [None]:
%pip install torch numpy tqdm sentencepiece matplotlib

import sys
import os
from pathlib import Path

# Add parent directory to path for imports.
# Assumes the kernel's working directory is the notebook's own folder, so its
# parent is the project root (expected to hold the `transformer` package).
notebook_path = Path.cwd()
project_root = notebook_path.parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from transformer.config import TransformerConfig
from transformer.model import Transformer
from transformer.loss import masked_loss


In [None]:
class ToyDataset(Dataset):
    """Toy dataset of random integer token sequences for demonstration.

    Source and target sequences are drawn independently of each other, so the
    data is only useful for exercising the training pipeline end to end.

    Args:
        num_samples: Number of (source, target) pairs to generate.
        seq_len: Length of every generated sequence.
        vocab_size: Exclusive upper bound for the generated token ids.
        seed: Optional integer seed. When given, the sequences come from a
            private ``torch.Generator`` seeded with it, so two datasets built
            with the same arguments hold identical data. When ``None`` (the
            default) the global torch RNG is used.
    """

    # Generated ids start here; ids 0..3 are reserved for special tokens.
    _FIRST_TOKEN_ID = 4

    def __init__(self, num_samples=1000, seq_len=10, vocab_size=1000, seed=None):
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.vocab_size = vocab_size

        # A private generator keeps seeded runs reproducible without touching
        # the global RNG state; None makes torch.randint use the global RNG.
        generator = None
        if seed is not None:
            generator = torch.Generator().manual_seed(seed)

        # Generate random sequences (source first, then target)
        shape = (num_samples, seq_len)
        self.src_data = torch.randint(
            self._FIRST_TOKEN_ID, vocab_size, shape, generator=generator)
        self.tgt_data = torch.randint(
            self._FIRST_TOKEN_ID, vocab_size, shape, generator=generator)

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensors stored at position ``idx``."""
        return self.src_data[idx], self.tgt_data[idx]

# Build the toy dataset and wrap it in a shuffling mini-batch loader
train_dataset = ToyDataset()
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print("Dataset size:", len(train_dataset))
print("Number of batches:", len(train_dataloader))

# Pull a single batch from a fresh iterator to inspect the tensor shapes
first_batch = next(iter(train_dataloader))
src_batch, tgt_batch = first_batch
print("\nSample batch shapes:")
print("Source:", src_batch.shape)
print("Target:", tgt_batch.shape)


In [None]:
# Create model configuration.
# vocab_size and max_seq_len are read from the dataset instead of being
# repeated as literals, so the model cannot silently disagree with the data
# it is trained on (the resulting values are still 1000 and 10 by default).
config = TransformerConfig(
    vocab_size=train_dataset.vocab_size,
    max_seq_len=train_dataset.seq_len,
    d_model=128,
    num_heads=4,
    num_layers=3,
    d_ff=512,
    dropout=0.1,
    label_smoothing=0.1,
    max_lr=1e-3,
    # NOTE: with this notebook's defaults (1000 samples, batch size 32,
    # 5 epochs = 160 optimizer steps) training ends inside the warmup phase.
    warmup_steps=1000
)

# Dump every attribute stored on the config instance
print("Model configuration:")
for key, value in config.__dict__.items():
    print(f"{key}: {value}")


In [None]:
# Set device: use CUDA when torch reports it as available, otherwise the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

# Create model and move it to the selected device
model = Transformer(config).to(device)

# Create optimizer; config.max_lr is the base rate the LR scheduler multiplies
adam_kwargs = dict(lr=config.max_lr, betas=(0.9, 0.98), eps=1e-9)
optimizer = optim.Adam(model.parameters(), **adam_kwargs)

# Create learning rate scheduler
def lr_lambda(step, warmup_steps=None):
    """Return the multiplier applied to the optimizer's base learning rate.

    The schedule is a linear warmup followed by inverse square root decay.
    Steps are counted from 1 internally: the scheduler evaluates this function
    with ``step == 0`` before the first optimizer update, and a 0-based count
    would make that first update run with a learning rate of exactly zero.

    Args:
        step: 0-based step index supplied by the scheduler.
        warmup_steps: Length of the warmup phase. Defaults to
            ``config.warmup_steps`` when omitted. Values below 1 are treated
            as 1, which avoids the 0/0 division a zero-length warmup would
            otherwise hit at step 0.

    Returns:
        A float in (0, 1]; it reaches 1.0 on the last warmup step.
    """
    if warmup_steps is None:
        warmup_steps = config.warmup_steps
    warmup = max(1, warmup_steps)
    current = step + 1
    if current < warmup:
        # Linear ramp towards 1.0
        return float(current) / float(warmup)
    # Inverse square root decay, continuous with the ramp at current == warmup
    return (float(warmup) / float(current)) ** 0.5

# Scale the optimizer's base learning rate by lr_lambda(step) on every step
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

# Print model summary: walk the parameters once, then derive both counts
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, trainable in param_sizes if trainable)

print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


In [None]:
# Training settings
num_epochs = 5
log_interval = 10  # NOTE: defined here but not referenced by this cell

# Training history: one entry per optimizer step, accumulated across epochs
history = {
    'loss': [],
    'lr': []
}

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    with tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}') as pbar:
        for src, tgt in pbar:
            src = src.to(device)
            tgt = tgt.to(device)

            # Shift the target by one position: the decoder is fed every token
            # but the last and is scored against every token but the first.
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            # Forward pass (the second value returned by the model is unused)
            logits, _ = model(src, tgt_input)

            # Calculate loss
            loss = masked_loss(
                logits,
                tgt_output,
                config.pad_token_id,
                config.label_smoothing
            )

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping: cap the total gradient norm at 1.0
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Read the scalar loss once; every .item() call copies the value
            # from the device to the host.
            loss_value = loss.item()

            # Update history. The LR is read after scheduler.step(), so it is
            # the rate that will be applied to the next batch.
            current_lr = scheduler.get_last_lr()[0]
            history['loss'].append(loss_value)
            history['lr'].append(current_lr)

            # Update progress bar
            pbar.set_postfix({
                'loss': f'{loss_value:.4f}',
                'lr': f'{current_lr:.6f}'
            })

            total_loss += loss_value

    # Mean of the per-batch losses over this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1} - Average loss: {avg_loss:.4f}')


In [None]:
# Two stacked panels: per-batch training loss on top, learning rate below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

# (axes, history key, panel title, y-axis label) for each panel
panels = (
    (ax1, 'loss', 'Training Loss', 'Loss'),
    (ax2, 'lr', 'Learning Rate', 'Learning Rate'),
)
for axis, key, title, y_label in panels:
    axis.plot(history[key])
    axis.set_title(title)
    axis.set_xlabel('Batch')
    axis.set_ylabel(y_label)
    axis.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Checkpoints are written to <project_root>/checkpoints
output_dir = project_root / 'checkpoints'
output_dir.mkdir(exist_ok=True)

# Gather everything that goes into the checkpoint file
checkpoint_path = output_dir / 'transformer_model.pt'
checkpoint = {
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'config': config.__dict__,
    # Loss of the last recorded batch, not an epoch average
    'loss': history['loss'][-1],
}
torch.save(checkpoint, checkpoint_path)

print(f"Model saved to {checkpoint_path}")
