# Gradient Accumulation, Clipping, and Model Compilation

**File Location:** `notebooks/04_performance_and_scaling/09_grad_accum_clip_compile.ipynb`

## Introduction

This notebook covers advanced gradient techniques and model compilation for optimized training. You will learn how to use gradient accumulation to reach large effective batch sizes, gradient clipping to keep training stable, and `torch.compile` to speed up training.

## Gradient Accumulation

### Understanding Gradient Accumulation

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time

class GradientAccumulationDemo(pl.LightningModule):
    """MLP classifier instrumented to demonstrate gradient accumulation.

    The network is three linear layers with batch norm, ReLU and dropout
    between them. While training, the module records the effective batch
    size seen by every ``training_step`` and the ``global_step`` at which
    each optimizer step is taken, so runs with different
    ``accumulate_grad_batches`` settings can be compared afterwards.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the two hidden layers.
        num_classes: Number of output classes.
        learning_rate: Learning rate passed to AdamW.
    """

    def __init__(self, input_size=256, hidden_size=512, num_classes=10, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()

        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(hidden_size, num_classes)
        )

        # Imported locally because the file does not import torchmetrics at the top.
        from torchmetrics import Accuracy
        self.train_acc = Accuracy(task="multiclass", num_classes=num_classes)
        self.val_acc = Accuracy(task="multiclass", num_classes=num_classes)

        # Track gradient accumulation stats
        self.accum_steps_taken = []       # global_step at each optimizer step
        self.effective_batch_sizes = []   # one entry per training_step call

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        return self.model(x)

    def _total_grad_norm(self):
        """Return the global L2 norm over all parameter gradients as a float.

        Returns 0.0 when no parameter currently holds a gradient.
        """
        norms = [p.grad.detach().norm(2) for p in self.parameters() if p.grad is not None]
        if not norms:
            return 0.0
        # One stacked reduction instead of one .item() call per parameter.
        return torch.stack(norms).norm(2).item()

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss and log accumulation statistics."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        self.train_acc(preds, y)

        # Effective batch size = samples in this micro-batch times the number
        # of micro-batches accumulated per optimizer step.
        accumulate_grad_batches = self.trainer.accumulate_grad_batches
        current_batch_size = x.size(0)
        effective_batch_size = current_batch_size * accumulate_grad_batches

        self.effective_batch_sizes.append(effective_batch_size)

        # Log information. The two counters are cast to float so the logger
        # is not handed integer values.
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', self.train_acc, on_epoch=True)
        self.log('effective_batch_size', float(effective_batch_size), on_step=False, on_epoch=True)
        self.log('accumulate_steps', float(accumulate_grad_batches), on_step=False, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        self.val_acc(preds, y)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_epoch=True, prog_bar=True)

        return loss

    def on_before_optimizer_step(self, optimizer, optimizer_idx=None):
        """Record that an optimizer step is about to happen and log the grad norm.

        ``optimizer_idx`` is optional: Lightning 2.x calls this hook with only
        ``optimizer``, while 1.x also passed ``optimizer_idx``. A required
        second argument raises ``TypeError`` on 2.x.
        """
        # Track when optimizer steps are actually taken
        self.accum_steps_taken.append(self.global_step)

        # Log gradient norms
        self.log('grad_norm', self._total_grad_norm(), on_step=True, on_epoch=False)

    def configure_optimizers(self):
        """Return AdamW over all parameters with the configured learning rate."""
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=1e-4)

# Create dataset
def create_dataset(num_samples=5000, input_size=256, num_classes=10):
    """Build a deterministic synthetic classification dataset.

    Random features are projected onto a random weight vector, and the
    resulting scores are split into ``num_classes`` equal-width bins that
    serve as labels. The global torch seed is set to 42 on every call.

    Args:
        num_samples: Number of rows to generate.
        input_size: Number of features per row.
        num_classes: Number of label bins.

    Returns:
        Tuple ``(X, y)`` of a float feature tensor and an int64 label tensor.
    """
    torch.manual_seed(42)
    features = torch.randn(num_samples, input_size)
    projection = torch.randn(input_size)
    scores = features @ projection

    lowest, highest = scores.min(), scores.max()
    bin_width = (highest - lowest) / (num_classes - 1)
    labels = torch.div(scores - lowest, bin_width, rounding_mode='floor').long()

    # Guard against rounding pushing a label outside [0, num_classes - 1].
    return features, labels.clamp(0, num_classes - 1)

# Build the synthetic dataset once; the loaders below are module-level names.
X, y = create_dataset()
dataset = TensorDataset(X, y)
# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# NOTE(review): no generator is passed here, so the split presumably depends on
# the global RNG state left by create_dataset's torch.manual_seed(42) — confirm.
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Different batch sizes to compare with gradient accumulation
small_batch_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Small batches
# Validation loader: fixed batch size, no shuffling.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("✓ Dataset and model setup completed")
```

### Comparing Different Accumulation Strategies

```python
def compare_gradient_accumulation():
    """Train the demo model under several accumulation settings and compare them.

    Every configuration has the same effective batch size
    (``batch_size * accumulate``). The per-epoch batch limit is scaled by the
    accumulation factor, so every configuration also takes the same number of
    optimizer steps and sees the same number of samples. With a fixed limit of
    50 micro-batches, the 8x configuration would take roughly 7 optimizer
    steps per epoch against 50 for the baseline, and the accuracy and time
    columns would not be comparable.

    Returns:
        Dict mapping configuration name to a dict with keys ``'time'``,
        ``'accuracy'``, ``'loss'``, ``'effective_batch_size'`` and
        ``'optimizer_steps'``. Accuracy and loss are stored as plain floats.
    """
    # Optimizer steps per epoch, identical for every configuration (demo-sized).
    optimizer_steps_per_epoch = 50

    configs = [
        {"name": "No Accumulation", "batch_size": 64, "accumulate": 1},
        {"name": "2x Accumulation", "batch_size": 32, "accumulate": 2},
        {"name": "4x Accumulation", "batch_size": 16, "accumulate": 4},
        {"name": "8x Accumulation", "batch_size": 8, "accumulate": 8},
    ]

    results = {}

    for config in configs:
        batch_size = config['batch_size']
        accumulate = config['accumulate']
        effective_batch_size = batch_size * accumulate

        print(f"\n=== {config['name']} ===")
        print(f"Batch size: {batch_size}, Accumulate: {accumulate}")
        print(f"Effective batch size: {effective_batch_size}")

        # Create dataloader with specific batch size
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        model = GradientAccumulationDemo()

        trainer = pl.Trainer(
            max_epochs=3,
            accumulate_grad_batches=accumulate,
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
            # Scale micro-batches so each config performs the same number of
            # optimizer steps over the same number of samples.
            limit_train_batches=optimizer_steps_per_epoch * accumulate,
            limit_val_batches=20
        )

        start_time = time.time()
        trainer.fit(model, train_loader, val_loader)
        training_time = time.time() - start_time

        # callback_metrics holds tensors; convert so results are plain numbers.
        final_acc = float(trainer.callback_metrics.get('val_acc', 0))
        final_loss = float(trainer.callback_metrics.get('val_loss', float('inf')))
        optimizer_steps = len(model.accum_steps_taken)

        results[config['name']] = {
            'time': training_time,
            'accuracy': final_acc,
            'loss': final_loss,
            'effective_batch_size': effective_batch_size,
            'optimizer_steps': optimizer_steps
        }

        print(f"Training time: {training_time:.2f}s")
        print(f"Final accuracy: {final_acc:.4f}")
        print(f"Optimizer steps taken: {optimizer_steps}")

    return results

# Execute the accumulation experiment, then print one summary row per config.
print("=== Gradient Accumulation Comparison ===")
accumulation_results = compare_gradient_accumulation()

print("\n=== Results Summary ===")
for name, result in accumulation_results.items():
    columns = [
        f"{name:15}",
        f"Time: {result['time']:5.2f}s",
        f"Acc: {result['accuracy']:.4f}",
        f"Steps: {result['optimizer_steps']:3d}",
        f"Eff.BS: {result['effective_batch_size']:2d}",
    ]
    print(" | ".join(columns))
```

## Gradient Clipping Techniques

### Understanding Gradient Clipping

```python
class GradientClippingDemo(pl.LightningModule):
    """Deep MLP instrumented to demonstrate gradient clipping.

    The module records the gradient norm before clipping, the gradient norm
    after clipping, and a per-step clipping ratio (values below 1.0 mean the
    step was clipped).

    Args:
        **kwargs: Stored through ``save_hyperparameters``; not otherwise used.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        # Deeper model more prone to gradient issues
        layers = []
        input_size = 256
        for i in range(6):  # 6 hidden layers
            layers.extend([
                nn.Linear(input_size if i == 0 else 512, 512),
                nn.ReLU(),
                nn.Dropout(0.1)
            ])
        layers.append(nn.Linear(512, 10))

        self.model = nn.Sequential(*layers)

        # Imported locally because the file does not import torchmetrics at the top.
        from torchmetrics import Accuracy
        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.val_acc = Accuracy(task="multiclass", num_classes=10)

        # Track gradient statistics
        self.grad_norms = []           # global L2 norm before clipping
        self.clipped_grad_norms = []   # global L2 norm after clipping
        self.clipping_ratios = []      # < 1.0 means the step was clipped

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        return self.model(x)

    def _grad_stats(self):
        """Return ``(global L2 norm, largest absolute element)`` of all grads.

        Both values are 0.0 when no parameter currently holds a gradient.
        """
        grads = [p.grad.detach() for p in self.parameters() if p.grad is not None]
        if not grads:
            return 0.0, 0.0
        total_norm = torch.stack([g.norm(2) for g in grads]).norm(2).item()
        max_abs = torch.stack([g.abs().max() for g in grads]).max().item()
        return total_norm, max_abs

    def training_step(self, batch, batch_idx):
        """Compute and log training loss and accuracy."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        self.train_acc(preds, y)

        self.log('train_loss', loss, on_step=True, on_epoch=True)
        self.log('train_acc', self.train_acc, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        self.val_acc(preds, y)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_epoch=True, prog_bar=True)

        return loss

    def on_before_optimizer_step(self, optimizer, optimizer_idx=None):
        """Record the pre-clipping gradient norm and the clipping ratio.

        ``optimizer_idx`` is optional: Lightning 2.x calls this hook with only
        ``optimizer``, while 1.x also passed ``optimizer_idx``.
        """
        # Calculate gradient norm before clipping
        total_norm, max_abs = self._grad_stats()

        self.grad_norms.append(total_norm)
        self.log('grad_norm_before_clip', total_norm, on_step=True)

        # If gradient clipping is enabled, calculate clipping ratio
        clip_val = getattr(self.trainer, 'gradient_clip_val', None)
        if clip_val is None:
            return

        # Norm clipping triggers on the global norm; value clipping triggers
        # when any single element exceeds the threshold, so compare against
        # the largest absolute element in that case.
        algorithm = str(getattr(self.trainer, 'gradient_clip_algorithm', None) or 'norm').lower()
        reference = max_abs if algorithm.endswith('value') else total_norm
        clipping_ratio = min(1.0, clip_val / (reference + 1e-6))
        self.clipping_ratios.append(clipping_ratio)

        self.log('clipping_ratio', clipping_ratio, on_step=True)
        self.log('gradient_clipped', float(clipping_ratio < 1.0), on_step=True)

    def configure_gradient_clipping(self, optimizer, *args, gradient_clip_val=None, gradient_clip_algorithm=None):
        """Clip gradients as the Trainer requested, then record the resulting norm.

        Lightning invokes this hook to perform clipping, which makes it the
        place to observe post-clip gradients. ``*args`` absorbs the extra
        positional ``optimizer_idx`` that Lightning 1.x passed.
        """
        self.clip_gradients(
            optimizer,
            gradient_clip_val=gradient_clip_val,
            gradient_clip_algorithm=gradient_clip_algorithm,
        )
        self.on_after_optimizer_step(optimizer)

    def on_after_optimizer_step(self, optimizer, optimizer_idx=None):
        """Record the gradient norm after clipping.

        Lightning has no hook with this name, so nothing called it before and
        ``clipped_grad_norms`` stayed empty. It is now invoked from
        ``configure_gradient_clipping`` right after the gradients are clipped.
        """
        total_norm, _ = self._grad_stats()
        # NOTE(review): no self.log here — Lightning appears to disallow logging
        # from configure_gradient_clipping; confirm against the installed version.
        self.clipped_grad_norms.append(total_norm)

    def configure_optimizers(self):
        """Return Adam over all parameters with a fixed learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

def compare_gradient_clipping():
    """Train the clipping demo under several clipping settings and report stats.

    Returns:
        Dict mapping configuration name to a dict with keys ``'time'``,
        ``'accuracy'``, ``'loss'``, ``'avg_grad_norm'``, ``'max_grad_norm'``
        and ``'clipping_freq'``.
    """
    # (label, clip value, clip algorithm)
    variants = (
        ("No Clipping", None, None),
        ("Norm Clip 0.5", 0.5, "norm"),
        ("Norm Clip 1.0", 1.0, "norm"),
        ("Norm Clip 2.0", 2.0, "norm"),
        ("Value Clip 0.1", 0.1, "value"),
    )

    loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    summary = {}

    for label, clip_val, clip_algo in variants:
        print(f"\n=== {label} ===")

        demo = GradientClippingDemo()
        trainer = pl.Trainer(
            max_epochs=3,
            gradient_clip_val=clip_val,
            gradient_clip_algorithm=clip_algo,
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
            limit_train_batches=50,
            limit_val_batches=20
        )

        began = time.time()
        trainer.fit(demo, loader, val_loader)
        elapsed = time.time() - began

        metrics = trainer.callback_metrics
        final_acc = metrics.get('val_acc', 0)
        final_loss = metrics.get('val_loss', float('inf'))

        # Fall back to 0 when a run recorded no statistics.
        if demo.grad_norms:
            avg_grad_norm = np.mean(demo.grad_norms)
            max_grad_norm = np.max(demo.grad_norms)
        else:
            avg_grad_norm = 0
            max_grad_norm = 0

        if demo.clipping_ratios:
            clipping_frequency = np.mean([ratio < 1.0 for ratio in demo.clipping_ratios])
        else:
            clipping_frequency = 0

        summary[label] = {
            'time': elapsed,
            'accuracy': final_acc,
            'loss': final_loss,
            'avg_grad_norm': avg_grad_norm,
            'max_grad_norm': max_grad_norm,
            'clipping_freq': clipping_frequency
        }

        print(f"Final accuracy: {final_acc:.4f}")
        print(f"Average gradient norm: {avg_grad_norm:.4f}")
        print(f"Max gradient norm: {max_grad_norm:.4f}")
        print(f"Clipping frequency: {clipping_frequency:.2%}")

    return summary

# Execute the clipping experiment, then print one summary row per config.
print("\n=== Gradient Clipping Comparison ===")
clipping_results = compare_gradient_clipping()

print("\n=== Clipping Results Summary ===")
for name, result in clipping_results.items():
    columns = [
        f"{name:15}",
        f"Acc: {result['accuracy']:.4f}",
        f"AvgGrad: {result['avg_grad_norm']:6.3f}",
        f"MaxGrad: {result['max_grad_norm']:6.3f}",
        f"ClipFreq: {result['clipping_freq']:5.1%}",
    ]
    print(" | ".join(columns))
```

## Model Compilation with torch.compile

### Basic Model Compilation

```python
class CompilationDemo(pl.LightningModule):
    """Plain four-layer MLP classifier used to benchmark torch.compile.

    Args:
        **kwargs: Stored through ``save_hyperparameters``; not otherwise used.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        # Hidden stack 256 -> 512 -> 512 -> 256 with ReLU after each layer,
        # followed by a 10-way output layer.
        widths = [256, 512, 512, 256]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], 10))
        self.model = nn.Sequential(*stack)

        from torchmetrics import Accuracy
        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.val_acc = Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        return self.model(x)

    def _loss_and_predictions(self, batch):
        """Run the forward pass and return ``(loss, predictions, targets)``."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        return loss, torch.argmax(logits, dim=1), targets

    def training_step(self, batch, batch_idx):
        """Compute and log training loss and accuracy."""
        loss, predictions, targets = self._loss_and_predictions(batch)
        self.train_acc(predictions, targets)

        self.log('train_loss', loss, on_step=True, on_epoch=True)
        self.log('train_acc', self.train_acc, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy."""
        loss, predictions, targets = self._loss_and_predictions(batch)
        self.val_acc(predictions, targets)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return AdamW over all parameters with a fixed learning rate of 1e-3."""
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

def compare_compilation():
    """Train the demo model uncompiled and under each torch.compile mode.

    The measured time covers the whole ``trainer.fit`` call, so for compiled
    runs it includes the compilation itself.

    Returns:
        Dict mapping configuration name to a dict with keys ``'time'``,
        ``'accuracy'`` and ``'loss'``, or ``None`` when ``torch.compile`` is
        not available.
    """
    # torch.compile only exists in PyTorch 2.0+.
    if not hasattr(torch, 'compile'):
        print("⚠️ torch.compile not available. Please use PyTorch 2.0+")
        return

    loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    # (label, compile mode); a mode of None means "do not compile".
    variants = (
        ("No Compilation", None),
        ("Default Compile", "default"),
        ("Reduce Overhead", "reduce-overhead"),
        ("Max Autotune", "max-autotune"),
    )

    outcomes = {}

    for label, mode in variants:
        print(f"\n=== {label} ===")

        candidate = CompilationDemo()
        if mode is not None:
            print(f"Compiling model with mode: {mode}")
            candidate = torch.compile(candidate, mode=mode)

        trainer = pl.Trainer(
            max_epochs=2,  # Reduced for demo
            enable_checkpointing=False,
            logger=False,
            enable_progress_bar=False,
            limit_train_batches=100,  # More batches to see compilation benefits
            limit_val_batches=30
        )

        began = time.time()
        trainer.fit(candidate, loader, val_loader)
        elapsed = time.time() - began

        metrics = trainer.callback_metrics
        final_acc = metrics.get('val_acc', 0)
        final_loss = metrics.get('val_loss', float('inf'))

        outcomes[label] = {
            'time': elapsed,
            'accuracy': final_acc,
            'loss': final_loss
        }

        print(f"Training time: {elapsed:.2f}s")
        print(f"Final accuracy: {final_acc:.4f}")

    return outcomes

# Run the compilation benchmark; any failure is reported instead of raised.
print("\n=== Model Compilation Comparison ===")
try:
    compilation_results = compare_compilation()
    if compilation_results:
        print("\n=== Compilation Results Summary ===")
        # Speedups are relative to the uncompiled run (1 if it is missing).
        baseline_entry = compilation_results.get("No Compilation", {})
        baseline_time = baseline_entry.get('time', 1)

        for name, result in compilation_results.items():
            elapsed = result['time']
            if elapsed > 0:
                speedup = baseline_time / elapsed
            else:
                speedup = 1
            columns = [
                f"{name:18}",
                f"Time: {elapsed:5.2f}s",
                f"Speedup: {speedup:.2f}x",
                f"Acc: {result['accuracy']:.4f}",
            ]
            print(" | ".join(columns))
except Exception as exc:
    print(f"Compilation test failed: {exc}")
```

## Advanced Optimization Techniques

### Combined Optimizations

```python
class OptimizedModel(pl.LightningModule):
    """MLP classifier combining the optimization techniques from this notebook.

    The module tracks gradient norms, forward-pass times and (when CUDA is
    available) allocated memory in ``optimization_stats``.

    Args:
        **kwargs: Stored through ``save_hyperparameters``; not otherwise used.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        # Efficient model architecture
        self.model = nn.Sequential(
            nn.Linear(256, 512),
            nn.LayerNorm(512),  # Better for mixed precision
            nn.GELU(),          # Efficient activation
            nn.Dropout(0.1),

            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.1),

            nn.Linear(512, 10)
        )

        # Imported locally because the file does not import torchmetrics at the top.
        from torchmetrics import Accuracy, MetricCollection
        metrics = MetricCollection({
            'accuracy': Accuracy(task="multiclass", num_classes=10),
        })

        # Cloning with a prefix yields the logged keys 'train_accuracy' / 'val_accuracy'.
        self.train_metrics = metrics.clone(prefix='train_')
        self.val_metrics = metrics.clone(prefix='val_')

        # Optimization tracking
        self.optimization_stats = {
            'grad_norms': [],       # global grad L2 norm per optimizer step
            'training_times': [],   # forward-pass wall time (seconds) per training step
            'memory_usage': []      # allocated CUDA memory in MB, sampled every 50 batches
        }

    def forward(self, x):
        """Return class logits for the input batch ``x``."""
        return self.model(x)

    def _total_grad_norm(self):
        """Return the global L2 norm over all parameter gradients as a float.

        Returns 0.0 when no parameter currently holds a gradient.
        """
        norms = [p.grad.detach().norm(2) for p in self.parameters() if p.grad is not None]
        if not norms:
            return 0.0
        # One stacked reduction instead of one .item() call per parameter.
        return torch.stack(norms).norm(2).item()

    def training_step(self, batch, batch_idx):
        """Compute the loss and log timing, accuracy and memory statistics."""
        x, y = batch

        # Time the forward pass. perf_counter is a monotonic clock intended
        # for measuring intervals.
        # NOTE(review): on CUDA, kernels may still be running when the second
        # timestamp is taken, so this can undercount — confirm if it matters.
        start_time = time.perf_counter()
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        forward_time = time.perf_counter() - start_time
        self.optimization_stats['training_times'].append(forward_time)

        preds = torch.argmax(logits, dim=1)
        self.train_metrics(preds, y)

        # Log performance metrics
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        self.log('forward_time_ms', forward_time * 1000, on_step=True)
        self.log_dict(self.train_metrics, on_epoch=True)

        # Memory tracking
        if torch.cuda.is_available() and batch_idx % 50 == 0:
            memory_mb = torch.cuda.memory_allocated() / 1024**2
            self.optimization_stats['memory_usage'].append(memory_mb)
            self.log('memory_mb', memory_mb, on_step=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and metrics."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        self.val_metrics(preds, y)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log_dict(self.val_metrics, on_epoch=True, prog_bar=True)

        return loss

    def on_before_optimizer_step(self, optimizer, optimizer_idx=None):
        """Record and log the global gradient norm before each optimizer step.

        ``optimizer_idx`` is optional: Lightning 2.x calls this hook with only
        ``optimizer``, while 1.x also passed ``optimizer_idx``. A required
        second argument raises ``TypeError`` on 2.x.
        """
        # Track gradient norms
        total_norm = self._total_grad_norm()

        self.optimization_stats['grad_norms'].append(total_norm)
        self.log('grad_norm', total_norm, on_step=True)

    def configure_optimizers(self):
        """Return AdamW with a per-step OneCycleLR schedule."""
        # Optimized optimizer settings
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=1e-3,
            weight_decay=1e-4,
            eps=1e-4,  # Larger eps for mixed precision stability
            betas=(0.9, 0.95)  # Slightly adjusted betas
        )

        # Efficient learning rate schedule
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=1e-3,
            total_steps=self.trainer.estimated_stepping_batches,
            pct_start=0.05,  # Short warmup
            div_factor=25,   # Initial lr = max_lr / div_factor
            final_div_factor=1000  # Final lr = initial_lr / final_div_factor
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step'  # step the schedule every optimizer step
            }
        }

def create_optimized_trainer():
    """Build a Trainer that enables mixed precision, accumulation and clipping.

    Returns:
        A ``pl.Trainer`` configured for 5 epochs with 16-bit mixed precision,
        2x gradient accumulation and norm clipping at 1.0. Checkpointing and
        the logger are disabled.
    """
    performance_options = {
        'precision': "16-mixed",            # Mixed precision
        'accumulate_grad_batches': 2,       # Gradient accumulation
        'gradient_clip_val': 1.0,           # Gradient clipping
        'gradient_clip_algorithm': "norm",
    }
    run_options = {
        'max_epochs': 5,
        'enable_checkpointing': False,
        'logger': False,
        'enable_progress_bar': True,
        'log_every_n_steps': 25,
    }
    return pl.Trainer(**performance_options, **run_options)

print("\n=== Fully Optimized Training ===")

# Build the model first, then wrap it with torch.compile when it exists.
optimized_model = OptimizedModel()

if hasattr(torch, 'compile'):
    print("🚀 Applying torch.compile...")
    optimized_model = torch.compile(optimized_model, mode="reduce-overhead")

optimized_trainer = create_optimized_trainer()
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

print("Training fully optimized model...")
start_time = time.time()
optimized_trainer.fit(optimized_model, train_loader, val_loader)
total_time = time.time() - start_time

# Collect the final metric and the statistics gathered by the module.
final_acc = optimized_trainer.callback_metrics.get('val_accuracy', 0)
memory_samples = optimized_model.optimization_stats['memory_usage']
if memory_samples:
    peak_memory = max(memory_samples)
else:
    peak_memory = 0

print(f"✓ Optimized training completed in {total_time:.2f}s")
print(f"Final accuracy: {final_acc:.4f}")
print(f"Peak memory usage: {peak_memory:.1f}MB")
print(f"Average gradient norm: {np.mean(optimized_model.optimization_stats['grad_norms']):.4f}")
```

## Best Practices Summary

### Optimization Guidelines

```python
def print_optimization_guidelines():
    """Print comprehensive optimization guidelines"""
    
    guidelines = {
        "Gradient Accumulation": {
            "✅ Use when": [
                "GPU memory is limited",
                "Want larger effective batch sizes",
                "Training very large models"
            ],
            "⚠️ Consider": [
                "Batch normalization behavior changes",
                "Optimizer steps happen less frequently", 
                "May need to adjust learning rates"
            ],
            "🔧 Best practices": [
                "accumulate_grad_batches=2-8 typically",
                "Monitor effective batch size",
                "Adjust LR schedule accordingly"
            ]
        },
        
        "Gradient Clipping": {
            "✅ Use when": [
                "Training deep networks (>10 layers)",
                "Gradient exploding problems",
                "Using RNNs or transformers"
            ],
            "⚠️ Consider": [
                "May slow convergence if too aggressive",
                "Monitor clipping frequency",
                "Different values for different optimizers"
            ],
            "🔧 Best practices": [
                "Start with clip_val=1.0",
                "Use 'norm' clipping usually",
                "Monitor gradient statistics"
            ]
        },
        
        "Model Compilation": {
            "✅ Use when": [
                "PyTorch 2.0+ available",
                "Model has many repeated operations",
                "Training for many epochs"
            ],
            "⚠️ Consider": [
                "First few iterations are slower",
                "May increase memory usage",
                "Debug mode disables optimizations"
            ],
            "🔧 Best practices": [
                "Use 'reduce-overhead' mode for training",
                "Compile after model is finalized",
                "Warmup with several batches"
            ]
        },
        
        "Combined Optimizations": {
            "✅ Recommended stack": [
                "Mixed precision (16-mixed)",
                "Gradient accumulation (2-4x)",
                "Gradient clipping (norm, 1.0)",
                "Model compilation (reduce-overhead)",
                "AdamW optimizer with appropriate eps"
            ],
            "⚠️ Testing protocol": [
                "Compare with baseline (FP32, no opts)",
                "Monitor training stability",
                "Verify final model accuracy",
                "Profile memory usage"
            ]
        }
    }
    
    print("=" * 60)
    print("🚀 OPTIMIZATION BEST PRACTICES GUIDE")
    print("=" * 60)
    
    for category, info in guidelines.items():
        print(f"\n📊 {category.upper()}")
        print("-" * 50)
        
        for section, items in info.items():
            print(f"\n{section}")
            for item in items:
                print(f"  • {item}")
    
    print("\n" + "=" * 60)
    print("💡 Remember: Always measure performance improvements!")
    print("🧪 Test each optimization individually before combining")
    print("📈 Monitor both speed AND accuracy when optimizing")
    print("=" * 60)

# Print comprehensive guidelines
print_optimization_guidelines()
```

## Summary

This notebook covered advanced gradient techniques and model compilation:

1. **Gradient Accumulation**: Simulate larger batch sizes by accumulating gradients across multiple mini-batches
2. **Gradient Clipping**: Prevent gradient explosion using norm-based or value-based clipping
3. **Model Compilation**: Use torch.compile for significant training speedups
4. **Combined Optimizations**: Stack multiple techniques for maximum performance
5. **Best Practices**: Guidelines for applying optimizations effectively

Key optimization techniques:
- **Gradient Accumulation**: 2-8x accumulation typical, monitor effective batch size
- **Gradient Clipping**: Start with norm clipping at 1.0, adjust based on gradient statistics  
- **Model Compilation**: Use "reduce-overhead" mode, expect warmup overhead
- **Mixed Precision**: Combine with other optimizations for maximum benefit

Performance benefits:
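- **Memory Efficiency**: Gradient accumulation reaches large effective batch sizes without the memory cost of a large per-step batch, which frees memory for training larger models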
- **Training Stability**: Gradient clipping prevents divergence in deep networks
- **Speed Improvements**: Model compilation can provide 1.2-2x speedups
- **Resource Utilization**: Better GPU utilization through optimized operations

Practical considerations:
- Test optimizations individually before combining
- Monitor both training speed and model accuracy
- Profile memory usage to avoid OOM errors
- Adjust hyperparameters when changing batch dynamics
- Use proper warmup periods for compiled models

Next notebook: We'll explore profiling and performance tuning techniques.