# Transformers: Implementation and Examples

This notebook provides a comprehensive guide to understanding and implementing transformers from scratch, as well as using pre-trained models.

## Table of Contents
1. [Setup and Imports](#setup)
2. [Scaled Dot-Product Attention](#attention)
3. [Multi-Head Attention](#multihead)
4. [Positional Encoding](#positional)
5. [Transformer Building Blocks](#blocks)
6. [Using Pre-trained Models](#pretrained)
7. [Fine-tuning Example](#finetuning)

## 1. Setup and Imports <a name="setup"></a>

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel

# Seed both PyTorch and NumPy so that random tensors are repeatable across runs
torch.manual_seed(42)
np.random.seed(42)

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 2. Scaled Dot-Product Attention <a name="attention"></a>

The fundamental building block of transformers:

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """
    Compute scaled dot-product attention.

    Implements ``softmax(Q K^T / sqrt(d_k)) V``. Only the last two dimensions
    take part in the matrix products, so any number of leading dimensions
    (for example ``(batch_size, num_heads)``) is handled by broadcasting.

    Args:
        query: Query tensor of shape (..., seq_len_q, d_k)
        key: Key tensor of shape (..., seq_len_k, d_k)
        value: Value tensor of shape (..., seq_len_k, d_v)
        mask: Optional tensor broadcastable to (..., seq_len_q, seq_len_k).
            Positions where ``mask == 0`` are excluded from attention.

    Returns:
        output: Attention output of shape (..., seq_len_q, d_v)
        attention_weights: Softmax-normalised weights of shape
            (..., seq_len_q, seq_len_k), returned for visualization
    """
    d_k = query.size(-1)

    # Compute attention scores, scaled by sqrt(d_k) as in the formula above
    scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(d_k)

    # Apply mask if provided
    if mask is not None:
        # Fill with the most negative finite value of the scores' dtype rather
        # than a hard-coded -1e9: -1e9 is not representable in float16, which
        # makes masked_fill raise when running in half precision.
        fill_value = torch.finfo(scores.dtype).min
        scores = scores.masked_fill(mask == 0, fill_value)

    # Normalise over the key dimension so every query's weights sum to 1
    attention_weights = F.softmax(scores, dim=-1)

    # Weighted sum of the values
    output = torch.matmul(attention_weights, value)

    return output, attention_weights

# Demo: run attention on a small random batch (Q, K and V share one shape)
batch_size, seq_len, d_k = 2, 4, 8
Q, K, V = (torch.randn(batch_size, seq_len, d_k) for _ in range(3))

output, attn_weights = scaled_dot_product_attention(query=Q, key=K, value=V)
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attn_weights.shape}")

# Heatmap of the first sample's weights: one row per query, one column per key
plt.figure(figsize=(8, 6))
sns.heatmap(
    attn_weights[0].detach().numpy(),
    annot=True,
    fmt='.2f',
    cmap='viridis',
)
plt.title('Attention Weights Visualization')
plt.ylabel('Query Position')
plt.xlabel('Key Position')
plt.show()

## 3. Multi-Head Attention <a name="multihead"></a>

Multiple attention heads allow the model to attend to different aspects of the input.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    The ``d_model``-wide projections of query, key and value are each cut into
    ``num_heads`` slices of width ``d_k = d_model // num_heads``. Attention is
    applied to every slice, the per-head results are concatenated back to
    ``d_model`` and passed through a final linear layer.
    """

    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Width of the input and output features
            num_heads: Number of attention heads; must divide ``d_model``
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model, self.num_heads = d_model, num_heads
        self.d_k = d_model // num_heads

        # Input projections for Q/K/V followed by the output projection
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, d_k)."""
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query, key, value: Tensors of shape (batch_size, seq_len, d_model)
            mask: Optional mask, forwarded unchanged to
                ``scaled_dot_product_attention``

        Returns:
            output: Tensor of shape (batch_size, seq_len_q, d_model)
            attn_weights: Per-head attention weights as returned by
                ``scaled_dot_product_attention``
        """
        batch_size = query.size(0)

        # Project each input, then move the head axis ahead of the sequence axis
        Q, K, V = (
            self.split_heads(project(tensor), batch_size)
            for project, tensor in (
                (self.W_q, query),
                (self.W_k, key),
                (self.W_v, value),
            )
        )

        per_head_output, attn_weights = scaled_dot_product_attention(Q, K, V, mask)

        # Undo the head split: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        # transpose() yields a non-contiguous tensor, so contiguous() is needed
        # before view().
        merged = per_head_output.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.d_model)

        return self.W_o(merged), attn_weights

# Demo: self-attention (query = key = value) over a random batch
d_model, num_heads = 512, 8
batch_size, seq_len = 2, 10

mha = MultiHeadAttention(d_model, num_heads)
x = torch.randn(batch_size, seq_len, d_model)
output, attn_weights = mha(query=x, key=x, value=x)

print(f"Multi-head attention output shape: {output.shape}")
print(f"Attention weights shape: {attn_weights.shape}")

## 4. Positional Encoding <a name="positional"></a>

Since transformers have no inherent notion of token order, we add positional encodings to the input embeddings.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    For position ``pos`` and pair index ``i`` the table holds

        PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    The table is stored as the non-trainable buffer ``pe`` of shape
    (1, max_len, d_model), so it moves with the module across devices and is
    saved in the state dict.
    """

    def __init__(self, d_model, max_len=5000):
        """
        Args:
            d_model: Embedding width; both even and odd values are supported
            max_len: Number of positions to precompute
        """
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # Create positional encoding matrix: sine on even columns, cosine on odd
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis of size 1 so the table broadcasts over the batch
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model); ``seq_len`` is
                expected not to exceed ``max_len``
        """
        return x + self.pe[:, :x.size(1), :]

# Demo: build a small encoding table and plot it
d_model, max_len = 128, 100
pos_encoding = PositionalEncoding(d_model, max_len=max_len)

# Show the first 50 positions: one row per position, one column per dimension
plt.figure(figsize=(12, 8))
plt.imshow(pos_encoding.pe[0, :50].numpy(), aspect='auto', cmap='RdBu')
plt.colorbar()
plt.title('Positional Encoding Visualization')
plt.ylabel('Position')
plt.xlabel('Dimension')
plt.show()

## 5. Transformer Building Blocks <a name="blocks"></a>

A position-wise feed-forward network and a complete encoder layer.

In [None]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    The first layer widens the last dimension from ``d_model`` to ``d_ff`` and
    the second maps it back to ``d_model``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: Input and output feature width
            d_ff: Hidden feature width
            dropout: Dropout probability applied after the ReLU
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """Single Transformer encoder layer.

    Runs self-attention and then a feed-forward network. Each sub-layer's
    output goes through dropout, is added to its input (residual connection)
    and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        """
        Args:
            d_model: Feature width of the layer's input and output
            num_heads: Number of attention heads
            d_ff: Hidden width of the feed-forward network
            dropout: Dropout probability used on both sub-layer outputs and
                inside the feed-forward network
        """
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor; used as query, key and value of the self-attention
            mask: Optional attention mask passed through to the attention module

        Returns:
            Tensor with the same shape as ``x``
        """
        # Sub-layer 1: self-attention (the attention weights are discarded)
        attended, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward network
        return self.norm2(x + self.dropout(self.feed_forward(x)))

# Demo: push a random batch through one encoder layer
d_model, num_heads, d_ff = 512, 8, 2048
batch_size, seq_len = 2, 10

encoder_layer = EncoderLayer(d_model, num_heads, d_ff)
x = torch.randn(batch_size, seq_len, d_model)
output = encoder_layer(x)

print(f"Encoder layer output shape: {output.shape}")

## 6. Using Pre-trained Models <a name="pretrained"></a>

Let's use the Hugging Face `transformers` library to load and run a pre-trained model.

In [None]:
# Fetch the pre-trained BERT checkpoint together with its matching tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Two sentences of different length to show padding in action
sentences = [
    "Transformers are powerful neural network architectures.",
    "Attention mechanisms enable parallel processing of sequences."
]

# Tokenize into PyTorch tensors, padding the batch to a common length
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    padding=True,
    truncation=True,
)

# Forward pass only; no gradients are needed for feature extraction
with torch.no_grad():
    outputs = model(**encoded_input)

# One hidden vector per token, per sentence
last_hidden_states = outputs.last_hidden_state
print(f"Output shape: {last_hidden_states.shape}")
print(f"Shape: (batch_size={last_hidden_states.shape[0]}, seq_len={last_hidden_states.shape[1]}, hidden_size={last_hidden_states.shape[2]})")

# The first token of every sequence is [CLS]; use its vector as a sentence embedding
cls_embeddings = last_hidden_states[:, 0]
print(f"\nCLS embeddings shape: {cls_embeddings.shape}")

## 7. Fine-tuning Example <a name="finetuning"></a>

Example of how to fine-tune a pre-trained transformer for a specific task.

In [None]:
from transformers import AutoModelForSequenceClassification

# Create a classification model: the pre-trained encoder plus a freshly
# initialised classification head with `num_labels` outputs
num_labels = 2  # Binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Example training data
train_texts = [
    "This movie was fantastic!",
    "I didn't enjoy this film at all.",
    "Great storyline and acting.",
    "Boring and predictable."
]
train_labels = [1, 0, 1, 0]  # 1: positive, 0: negative

# Tokenize
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
# Build the label tensor once instead of re-creating it on every epoch
train_label_tensor = torch.tensor(train_labels)

# Create simple training loop (simplified for demonstration).
# `transformers.AdamW` was deprecated and has been removed from recent
# releases of the library, so use the PyTorch implementation instead.
# weight_decay=0.0 matches the default of the old transformers optimizer
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

model.train()
for epoch in range(3):
    # The whole toy dataset is a single batch, so one epoch is one update step
    optimizer.zero_grad()

    # Forward pass; passing `labels` makes the model compute the loss itself
    outputs = model(**train_encodings, labels=train_label_tensor)
    loss = outputs.loss

    # Backward pass
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

# Inference: switch to eval mode to disable dropout
model.eval()
test_text = ["This is an amazing movie!"]
test_encoding = tokenizer(test_text, return_tensors='pt', truncation=True, padding=True)

with torch.no_grad():
    outputs = model(**test_encoding)
    # Highest-scoring class index for the single test sentence
    predictions = torch.argmax(outputs.logits, dim=-1)

print(f"\nPrediction: {'Positive' if predictions.item() == 1 else 'Negative'}")

## Summary

In this notebook, we've covered:

1. **Attention Mechanism**: The core building block of transformers
2. **Multi-Head Attention**: Parallel attention mechanisms for richer representations
3. **Positional Encoding**: Adding sequence order information
4. **Transformer Layers**: Feed-forward network and encoder layer implementations
5. **Pre-trained Models**: Using BERT and other models from Hugging Face
6. **Fine-tuning**: Adapting pre-trained models for specific tasks

## Next Steps

- Experiment with different architectures (GPT, T5, etc.)
- Try fine-tuning on your own datasets
- Explore Vision Transformers for image tasks
- Study optimization techniques and training strategies
- Implement efficient attention variants (e.g., Linear Attention, Performer)