# Profiler and Performance Tuning

**File Location:** `notebooks/04_performance_and_scaling/10_profiler_and_perf_tuning.ipynb`

## Introduction

This notebook covers comprehensive performance profiling and tuning in PyTorch Lightning. Learn to identify bottlenecks, optimize data loading, tune hyperparameters for performance, and create production-ready training pipelines.

## Lightning Profilers

### Built-in Profiler Usage

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.profilers import SimpleProfiler, AdvancedProfiler, PyTorchProfiler
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time
import os
from pathlib import Path

class ProfilingDemoModel(pl.LightningModule):
    """1-D CNN classifier with deliberate hot spots for the profilers to surface.

    The network applies two Conv1d layers, pools to a fixed length of 32, and
    classifies into 10 classes through a three-layer MLP head. The training
    step periodically performs an extra CPU round-trip and an entropy
    computation so that the profiler output contains visible outliers.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        # Feature extractor: 1 -> 64 -> 128 channels, pooled to length 32
        conv_stack = [
            nn.Conv1d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(32),
        ]
        self.conv_layers = nn.Sequential(*conv_stack)

        # Classifier head over the flattened 128 x 32 feature map
        head_stack = [
            nn.Linear(128 * 32, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 10),
        ]
        self.fc_layers = nn.Sequential(*head_stack)

        from torchmetrics import Accuracy
        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.val_acc = Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        """Return class logits for ``x`` after adding a channel dimension at dim 1."""
        features = self.conv_layers(x.unsqueeze(1))
        flat = features.flatten(start_dim=1)
        return self.fc_layers(flat)

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss, accuracy and periodic entropy."""
        inputs, targets = batch

        # Every 10th batch: deliberate device -> CPU -> device round-trip
        # whose result is discarded, purely to create a profiling hot spot.
        if batch_idx % 10 == 0:
            _ = torch.sin(torch.cos(inputs.cpu())).to(inputs.device)

        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        self.train_acc(logits.argmax(dim=1), targets)

        # Every 5th batch: log the mean predictive entropy
        if batch_idx % 5 == 0:
            probs = F.softmax(logits, dim=1)
            entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1).mean()
            self.log('entropy', entropy, on_step=True)

        self.log('train_loss', loss, on_step=True, on_epoch=True)
        self.log('train_acc', self.train_acc, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy for one batch."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)

        self.val_acc(logits.argmax(dim=1), targets)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with lr=1e-3."""
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

# Create dataset with some processing overhead
def create_profiling_dataset(num_samples=2000, input_size=256, num_classes=10):
    """Generate a reproducible synthetic classification dataset.

    Features are Gaussian noise plus a random linear mixture of themselves.
    Labels come from projecting the features onto a random vector and
    bucketing the resulting scores into ``num_classes`` equal-width bins.

    Note: this reseeds PyTorch's global RNG with 42 on every call.

    Returns:
        Tuple ``(X, y)``: a ``(num_samples, input_size)`` float tensor and a
        ``(num_samples,)`` int64 label tensor with values in
        ``[0, num_classes - 1]``.
    """
    torch.manual_seed(42)

    # Correlated features: noise plus a small random mixing of the noise
    noise = torch.randn(num_samples, input_size)
    mixing = torch.randn(input_size, input_size) * 0.1
    X = noise + torch.mm(noise, mixing)

    # Scalar score per sample, then equal-width binning into class ids
    projection = torch.randn(input_size)
    scores = X @ projection
    lowest, highest = scores.min(), scores.max()
    bin_width = (highest - lowest) / (num_classes - 1)
    y = torch.div(scores - lowest, bin_width, rounding_mode='floor').long()

    return X, torch.clamp(y, 0, num_classes - 1)

# Materialize the synthetic data and wrap it for DataLoader consumption
X, y = create_profiling_dataset()
dataset = TensorDataset(X, y)

# 80/20 train/validation split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, val_size]
)

# Both loaders share batch size and worker count; only the training loader shuffles
loader_kwargs = {"batch_size": 32, "num_workers": 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print("✓ Profiling demo setup completed")
```

### Simple Profiler

```python
print("=== Simple Profiler ===")

# Simple profiler for basic timing information
simple_profiler = SimpleProfiler(
    dirpath="./profiling_results",
    filename="simple_profile"
)

model = ProfilingDemoModel()
trainer_simple = pl.Trainer(
    max_epochs=2,
    profiler=simple_profiler,
    enable_checkpointing=False,
    logger=False,
    enable_progress_bar=False,
    limit_train_batches=30,
    limit_val_batches=10
)

print("Training with Simple Profiler...")
trainer_simple.fit(model, train_loader, val_loader)

# Read and display profiling results.
# Lightning prefixes the report with the stage name (e.g.
# "fit-simple_profile.txt"), so search for the file instead of assuming the
# bare "<filename>.txt" path exists.
profile_dir = Path("./profiling_results")
profile_candidates = sorted(profile_dir.glob("*simple_profile*.txt"))
profile_file = profile_candidates[0] if profile_candidates else profile_dir / "simple_profile.txt"
if profile_file.exists():
    print("\n📊 Simple Profiler Results:")
    content = profile_file.read_text(encoding="utf-8")
    # Only the first 1000 characters are shown to keep notebook output short
    print(content[:1000] + "..." if len(content) > 1000 else content)
else:
    print("Profile file not found")

print("✓ Simple profiling completed")
```

### PyTorch Profiler (Advanced)

```python
print("\n=== PyTorch Profiler ===")

# CPU activity is always recorded; CUDA activity is added only when a GPU is present
profiled_activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiled_activities.append(torch.profiler.ProfilerActivity.CUDA)

# Skip 1 step, warm up for 1, record 3, and repeat that cycle twice
profiling_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2)

# Advanced PyTorch profiler with detailed GPU/CPU analysis
pytorch_profiler = PyTorchProfiler(
    dirpath="./profiling_results",
    filename="pytorch_profile",
    activities=profiled_activities,
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
    schedule=profiling_schedule,
)

model = ProfilingDemoModel()
trainer_pytorch = pl.Trainer(
    max_epochs=1,
    profiler=pytorch_profiler,
    enable_checkpointing=False,
    logger=False,
    enable_progress_bar=False,
    limit_train_batches=20,
)

print("Training with PyTorch Profiler...")
trainer_pytorch.fit(model, train_loader, val_loader)
print("✓ PyTorch profiling completed - check tensorboard for detailed results")
```

## Data Loading Optimization

### DataLoader Performance Tuning

```python
import multiprocessing
from torch.utils.data import Dataset

class SlowDataset(Dataset):
    """Map-style dataset that sleeps on every access to mimic slow preprocessing.

    Args:
        data: Indexable collection of feature tensors.
        targets: Indexable collection of labels, aligned with ``data``.
        processing_delay: Seconds to sleep inside each ``__getitem__`` call.
    """

    def __init__(self, data, targets, processing_delay=0.001):
        self.data, self.targets = data, targets
        self.delay = processing_delay

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Stand-in for expensive per-sample preprocessing
        time.sleep(self.delay)

        sample, label = self.data[idx], self.targets[idx]

        # Roughly half of the accesses receive small Gaussian noise as augmentation
        augment = torch.rand(1) > 0.5
        if not augment:
            return sample, label

        return sample + torch.randn_like(sample) * 0.01, label

def benchmark_dataloader_configs():
    """Benchmark several DataLoader configurations on a deliberately slow dataset.

    Each configuration iterates at most 21 batches of 32 samples drawn from
    the first 800 rows of the notebook-level ``X``/``y`` tensors. Per-batch
    time is measured from the end of the previous batch to the end of the
    current one, so it includes the time spent waiting for the loader; that
    wait is the quantity the configurations are meant to change.

    Returns:
        dict mapping ``"workers_{n}_pin_{bool}_persist_{bool}"`` to a dict
        with ``total_time`` (s), ``avg_batch_time`` (s) and
        ``batches_per_sec``.
    """

    # Create dataset with processing overhead.
    # NOTE: ``X`` and ``y`` are notebook globals. The loop below must not bind
    # a local called ``y``, otherwise this line raises UnboundLocalError.
    slow_dataset = SlowDataset(X[:800], y[:800], processing_delay=0.002)

    configs = [
        {"num_workers": 0, "pin_memory": False, "persistent_workers": False},
        {"num_workers": 2, "pin_memory": False, "persistent_workers": False},
        {"num_workers": 4, "pin_memory": False, "persistent_workers": False},
        {"num_workers": 2, "pin_memory": True, "persistent_workers": False},
        {"num_workers": 2, "pin_memory": True, "persistent_workers": True},
    ]

    results = {}

    for config in configs:
        print(f"\n--- Config: {config} ---")

        loader = DataLoader(
            slow_dataset,
            batch_size=32,
            shuffle=True,
            **config
        )

        # Benchmark loading time
        batch_times = []
        start_time = time.perf_counter()
        batch_start = start_time

        for i, (features, _targets) in enumerate(loader):
            # Simulate some processing
            _ = features.mean()
            batch_end = time.perf_counter()

            # Includes the fetch that preceded this iteration
            batch_times.append(batch_end - batch_start)
            batch_start = batch_end

            if i >= 20:  # Limit batches for demo
                break

        total_time = time.perf_counter() - start_time
        avg_batch_time = np.mean(batch_times)

        config_name = f"workers_{config['num_workers']}_pin_{config['pin_memory']}_persist_{config['persistent_workers']}"
        results[config_name] = {
            'total_time': total_time,
            'avg_batch_time': avg_batch_time,
            # Guard against a zero average on very coarse clocks
            'batches_per_sec': 1 / avg_batch_time if avg_batch_time > 0 else float('inf')
        }

        print(f"Total time: {total_time:.2f}s, Avg batch: {avg_batch_time*1000:.1f}ms")

    return results

print("=== DataLoader Optimization ===")
dataloader_results = benchmark_dataloader_configs()

# One table row per benchmarked configuration
print("\n📊 DataLoader Benchmark Results:")
for config_name, stats in dataloader_results.items():
    batch_ms = stats['avg_batch_time'] * 1000
    print(f"{config_name:25} | Total: {stats['total_time']:5.2f}s | Batch: {batch_ms:5.1f}ms | Rate: {stats['batches_per_sec']:5.1f} batch/s")
```

### Memory and I/O Optimization

```python
class OptimizedDataset(Dataset):
    """Dataset that caches processed samples with least-frequently-used eviction.

    Args:
        data: Indexable collection of feature tensors.
        targets: Indexable collection of labels, aligned with ``data``.
        cache_size: Maximum number of processed samples kept in memory.
            A value of 0 or less disables caching entirely.

    Notes:
        * A cached sample keeps the noise drawn on its first access, so
          repeated accesses to a cached index return the same tensor.
        * ``cache`` and ``access_count`` are plain per-instance dicts.
    """

    def __init__(self, data, targets, cache_size=100):
        self.data = data
        self.targets = targets
        self.cache = {}            # idx -> (processed_x, y)
        self.cache_size = cache_size
        self.access_count = {}     # idx -> number of accesses while cached

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Check cache first
        cached = self.cache.get(idx)
        if cached is not None:
            self.access_count[idx] = self.access_count.get(idx, 0) + 1
            return cached

        # Process data
        x = self.data[idx]
        y = self.targets[idx]

        # Apply transformations
        x_processed = x + torch.randn_like(x) * 0.01
        sample = (x_processed, y)

        # Caching disabled: nothing to store and nothing to evict
        # (evicting from an empty cache would raise ValueError in min()).
        if self.cache_size <= 0:
            return sample

        # Cache full: drop the least accessed entry before inserting
        if len(self.cache) >= self.cache_size:
            least_accessed = min(self.cache, key=lambda k: self.access_count.get(k, 0))
            del self.cache[least_accessed]
            self.access_count.pop(least_accessed, None)

        self.cache[idx] = sample
        self.access_count[idx] = 1
        return sample

def compare_dataset_implementations():
    """Time how long each dataset implementation takes to yield 15 batches.

    Builds a plain ``TensorDataset``, a ``SlowDataset`` and an
    ``OptimizedDataset`` over the first 1000 rows of the notebook-level
    ``X``/``y`` tensors and loads exactly 15 batches of 32 samples from each
    with two worker processes. The measured time includes worker start-up.

    Returns:
        dict mapping the display name of each dataset to elapsed seconds.
    """

    max_batches = 15

    # Test datasets
    regular_dataset = TensorDataset(X[:1000], y[:1000])
    slow_dataset = SlowDataset(X[:1000], y[:1000], processing_delay=0.001)
    optimized_dataset = OptimizedDataset(X[:1000], y[:1000], cache_size=200)

    datasets = [
        ("Regular TensorDataset", regular_dataset),
        ("Slow Dataset", slow_dataset),
        ("Optimized Dataset", optimized_dataset)
    ]

    results = {}

    for name, dataset in datasets:
        print(f"\n--- Testing {name} ---")

        loader = DataLoader(
            dataset,
            batch_size=32,
            shuffle=True,
            num_workers=2,
            # Pinned memory only matters when batches are copied to a GPU
            pin_memory=torch.cuda.is_available()
        )

        start_time = time.time()
        # Count from 1 so the loop stops after exactly ``max_batches`` batches
        # have been fetched, matching the number reported below.
        for batch_count, _batch in enumerate(loader, start=1):
            if batch_count >= max_batches:
                break

        total_time = time.time() - start_time
        results[name] = total_time

        print(f"Time for {max_batches} batches: {total_time:.3f}s")

    return results

print("\n=== Dataset Implementation Comparison ===")
dataset_results = compare_dataset_implementations()

# Express every timing relative to the fastest implementation (fastest == 1.00x)
fastest_time = min(dataset_results.values())
print("\n📊 Dataset Performance Comparison:")
for label, elapsed in dataset_results.items():
    relative_speed = fastest_time / elapsed
    print(f"{label:20} | Time: {elapsed:.3f}s | Relative speed: {relative_speed:.2f}x")
```

## Performance Monitoring and Metrics

### Custom Performance Callbacks

```python
from pytorch_lightning.callbacks import Callback

class PerformanceMonitorCallback(Callback):
    """Record batch/epoch wall-clock timings and GPU memory, and log them.

    Args:
        batch_size: Samples per training batch. Used only to convert the
            average batch time into a samples-per-second throughput.
            Defaults to 32.

    Collected history (plain lists that grow for the whole run):
        batch_times: Seconds between ``on_train_batch_start`` and
            ``on_train_batch_end`` for every training batch.
        epoch_times: Seconds per training epoch.
        memory_usage: ``torch.cuda.memory_allocated()`` in MiB, sampled on
            every 20th batch when CUDA is available.
        gpu_utilization: Heuristic percentage derived from batch time alone
            (see ``on_train_batch_end``); not a hardware measurement.
    """

    def __init__(self, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self.batch_times = []
        self.epoch_times = []
        self.memory_usage = []
        self.gpu_utilization = []
        self.step_times = {}
        # Timer start marks; None until the matching *_start hook has fired
        self.epoch_start_time = None
        self.batch_start_time = None
        self.val_start_time = None

    def on_train_epoch_start(self, trainer, pl_module):
        self.epoch_start_time = time.perf_counter()

    def on_train_epoch_end(self, trainer, pl_module):
        if self.epoch_start_time is None:
            return
        epoch_time = time.perf_counter() - self.epoch_start_time
        self.epoch_times.append(epoch_time)
        pl_module.log('epoch_duration', epoch_time)

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
        self.batch_start_time = time.perf_counter()

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        if self.batch_start_time is None:
            return
        batch_time = time.perf_counter() - self.batch_start_time
        self.batch_times.append(batch_time)

        # Log batch timing every N steps
        if batch_idx % 20 == 0:
            pl_module.log('batch_time_ms', batch_time * 1000, on_step=True)

            # Memory tracking
            if torch.cuda.is_available():
                memory_mb = torch.cuda.memory_allocated() / 1024**2
                self.memory_usage.append(memory_mb)
                pl_module.log('gpu_memory_mb', memory_mb, on_step=True)

                # Rough proxy only: scales batch time against a 50 ms
                # reference and caps at 100. It does not query the GPU.
                utilization = min(100, batch_time * 100 / 0.05)
                self.gpu_utilization.append(utilization)
                pl_module.log('gpu_utilization_pct', utilization, on_step=True)

    def on_validation_start(self, trainer, pl_module):
        self.val_start_time = time.perf_counter()

    def on_validation_epoch_end(self, trainer, pl_module):
        # Lightning rejects ``log()`` calls made from ``on_validation_end``,
        # so the validation summary is logged from this hook instead.
        if self.val_start_time is not None:
            val_time = time.perf_counter() - self.val_start_time
            pl_module.log('validation_duration', val_time)

        # Performance summary
        if self.batch_times:
            avg_batch_time = np.mean(self.batch_times[-100:])  # Last 100 batches
            pl_module.log('avg_batch_time_ms', avg_batch_time * 1000)
            if avg_batch_time > 0:
                # Samples per second
                throughput = self.batch_size / avg_batch_time
                pl_module.log('throughput_samples_per_sec', throughput)

        if self.memory_usage:
            peak_memory = max(self.memory_usage[-100:])
            avg_memory = np.mean(self.memory_usage[-100:])

            pl_module.log('peak_memory_mb', peak_memory)
            pl_module.log('avg_memory_mb', avg_memory)

    def get_performance_summary(self):
        """Return aggregate statistics over everything recorded so far.

        Returns:
            dict that contains only the groups for which data exists:
            batch timing (``avg_batch_time``, ``batch_time_std``,
            ``min_batch_time``, ``max_batch_time``), epoch timing
            (``avg_epoch_time``, ``total_training_time``), memory
            (``peak_memory_mb``, ``avg_memory_mb``) and
            ``avg_gpu_utilization``.
        """
        summary = {}

        if self.batch_times:
            summary['avg_batch_time'] = np.mean(self.batch_times)
            summary['batch_time_std'] = np.std(self.batch_times)
            summary['min_batch_time'] = np.min(self.batch_times)
            summary['max_batch_time'] = np.max(self.batch_times)

        if self.epoch_times:
            summary['avg_epoch_time'] = np.mean(self.epoch_times)
            summary['total_training_time'] = sum(self.epoch_times)

        if self.memory_usage:
            summary['peak_memory_mb'] = np.max(self.memory_usage)
            summary['avg_memory_mb'] = np.mean(self.memory_usage)

        if self.gpu_utilization:
            summary['avg_gpu_utilization'] = np.mean(self.gpu_utilization)

        return summary

class ResourceUsageCallback(Callback):
    """Sample system CPU and RAM usage during training via ``psutil``.

    If ``psutil`` cannot be imported, the callback prints a warning, records
    nothing, and ``get_resource_summary`` returns an empty dict.
    """

    def __init__(self):
        super().__init__()
        # Always present so the attributes exist even without psutil
        self.cpu_percentages = []
        self.memory_percentages = []
        try:
            import psutil
        except ImportError:
            print("⚠️ psutil not available for system monitoring")
            self.psutil = None
        else:
            self.psutil = psutil
            # With interval=None the first reading is a meaningless 0.0 that
            # only sets the baseline for later calls; take and discard it here.
            psutil.cpu_percent(interval=None)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        # Sample every 50th batch
        if self.psutil and batch_idx % 50 == 0:
            cpu_percent = self.psutil.cpu_percent(interval=None)
            memory_info = self.psutil.virtual_memory()

            self.cpu_percentages.append(cpu_percent)
            self.memory_percentages.append(memory_info.percent)

            pl_module.log('cpu_usage_pct', cpu_percent, on_step=True)
            pl_module.log('system_memory_pct', memory_info.percent, on_step=True)

    def get_resource_summary(self):
        """Return average and peak CPU/RAM percentages.

        Returns:
            ``{}`` when psutil is unavailable; otherwise a dict with
            ``avg_cpu_usage``, ``peak_cpu_usage``, ``avg_memory_usage`` and
            ``peak_memory_usage``, each 0 when no sample was taken.
        """
        if not self.psutil:
            return {}

        return {
            'avg_cpu_usage': np.mean(self.cpu_percentages) if self.cpu_percentages else 0,
            'peak_cpu_usage': np.max(self.cpu_percentages) if self.cpu_percentages else 0,
            'avg_memory_usage': np.mean(self.memory_percentages) if self.memory_percentages else 0,
            'peak_memory_usage': np.max(self.memory_percentages) if self.memory_percentages else 0
        }

# Exercise both monitoring callbacks on a short training run
print("\n=== Performance Monitoring ===")

perf_callback = PerformanceMonitorCallback()
resource_callback = ResourceUsageCallback()
monitoring_callbacks = [perf_callback, resource_callback]

model = ProfilingDemoModel()
trainer_perf = pl.Trainer(
    max_epochs=2,
    callbacks=monitoring_callbacks,
    enable_checkpointing=False,
    logger=False,
    enable_progress_bar=False,
    limit_train_batches=50,
    limit_val_batches=15,
)

print("Training with performance monitoring...")
trainer_perf.fit(model, train_loader, val_loader)

# Collect the aggregates gathered by each callback
perf_summary = perf_callback.get_performance_summary()
resource_summary = resource_callback.get_resource_summary()

print("\n📊 Performance Summary:")
for metric, value in perf_summary.items():
    # The unit suffix is chosen from the metric name
    if 'time' in metric:
        line = f"{metric}: {value:.3f}s"
    elif 'memory' in metric:
        line = f"{metric}: {value:.1f}MB"
    else:
        line = f"{metric}: {value:.2f}"
    print(line)

print("\n🖥️ Resource Usage Summary:")
for metric, percent in resource_summary.items():
    print(f"{metric}: {percent:.1f}%")
```

## Production Performance Optimization

### Comprehensive Optimization Pipeline

```python
class ProductionOptimizedModel(pl.LightningModule):
    """MLP classifier (256 -> 512 -> 512 -> 10) configured for fast training.

    Uses LayerNorm + GELU blocks, sparse step-level loss logging, and AdamW
    with a per-step OneCycleLR schedule.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        # Two LayerNorm/GELU/Dropout blocks followed by the output projection
        blocks = [
            nn.Linear(256, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(512, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(512, 10),
        ]
        self.model = nn.Sequential(*blocks)

        # Accuracy metrics for the two phases
        from torchmetrics import Accuracy
        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.val_acc = Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss; logs loss on every 25th batch only."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)

        # The metric update needs no gradient tracking
        with torch.no_grad():
            self.train_acc(logits.argmax(dim=1), targets)

        # Sparse step logging keeps per-batch overhead low
        if batch_idx % 25 == 0:
            self.log('train_loss', loss, on_step=True, prog_bar=True)

        self.log('train_acc', self.train_acc, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy for one batch."""
        inputs, targets = batch
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)

        self.val_acc(logits.argmax(dim=1), targets)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Return AdamW plus a OneCycleLR schedule that is stepped every batch."""
        peak_lr = 2e-3  # used both as the base LR and as the one-cycle maximum

        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=peak_lr,
            weight_decay=1e-4,
            eps=1e-4,  # larger epsilon, chosen with mixed precision in mind
            betas=(0.9, 0.95),
        )

        # The schedule spans the whole run as estimated by the trainer
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=peak_lr,
            total_steps=self.trainer.estimated_stepping_batches,
            pct_start=0.05,
            div_factor=25,
            final_div_factor=1000,
        )

        scheduler_config = {'scheduler': scheduler, 'interval': 'step'}
        return {'optimizer': optimizer, 'lr_scheduler': scheduler_config}

def create_production_trainer():
    """Build a ``pl.Trainer`` with the notebook's production-oriented settings.

    The trainer uses 16-bit mixed precision, accumulates gradients over two
    batches, clips gradients by norm at 1.0, runs five epochs, and attaches a
    fresh ``PerformanceMonitorCallback``. Checkpointing and the logger are
    switched off for the demo.
    """

    # Core performance settings
    performance_settings = dict(
        precision="16-mixed",
        accumulate_grad_batches=2,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
    )

    # Training and validation cadence
    cadence_settings = dict(
        max_epochs=5,
        log_every_n_steps=50,  # fewer log writes
        check_val_every_n_epoch=1,
        num_sanity_val_steps=2,  # short sanity check
    )

    # Console output and persistence (the latter disabled for the demo)
    runtime_settings = dict(
        enable_progress_bar=True,
        enable_model_summary=False,
        enable_checkpointing=False,
        logger=False,
    )

    monitor = PerformanceMonitorCallback()
    return pl.Trainer(
        callbacks=[monitor],
        **performance_settings,
        **cadence_settings,
        **runtime_settings,
    )

def create_optimized_dataloader(dataset, batch_size=64, is_training=True, num_workers=None):
    """Create a DataLoader tuned for throughput.

    Args:
        dataset: Dataset to load from.
        batch_size: Samples per batch.
        is_training: When True the loader shuffles and drops the last
            incomplete batch; when False it does neither.
        num_workers: Number of worker processes. ``None`` (default) uses
            ``min(4, cpu_count)``. ``0`` loads in the main process, in which
            case the worker-only options below are not passed.

    Returns:
        A configured ``torch.utils.data.DataLoader``.
    """

    if num_workers is None:
        num_workers = min(4, multiprocessing.cpu_count())

    loader_kwargs = {
        "batch_size": batch_size,
        "shuffle": is_training,
        "num_workers": num_workers,
        # Pinned host memory only helps when batches are copied to a GPU
        "pin_memory": torch.cuda.is_available(),
        "drop_last": is_training,  # For training stability
    }

    # These two options are only valid with at least one worker process
    if num_workers > 0:
        loader_kwargs["persistent_workers"] = True
        loader_kwargs["prefetch_factor"] = 2  # batches prefetched per worker

    return DataLoader(dataset, **loader_kwargs)

print("\n=== Production Optimization Pipeline ===")

# Instantiate the model and compile it when this PyTorch build offers torch.compile
prod_model = ProductionOptimizedModel()
if hasattr(torch, 'compile'):
    print("🚀 Applying torch.compile for production...")
    prod_model = torch.compile(prod_model, mode="reduce-overhead")

# Loaders and trainer come from the factory helpers defined above
prod_train_loader = create_optimized_dataloader(train_dataset, batch_size=64, is_training=True)
prod_val_loader = create_optimized_dataloader(val_dataset, batch_size=64, is_training=False)
prod_trainer = create_production_trainer()

# Time the whole fit() call, validation included
print("🏭 Starting production-optimized training...")
start_time = time.time()
prod_trainer.fit(prod_model, prod_train_loader, prod_val_loader)
total_time = time.time() - start_time

# Pull the final metrics from the trainer and from its monitoring callback
final_acc = prod_trainer.callback_metrics.get('val_acc', 0)
monitors = [cb for cb in prod_trainer.callbacks if isinstance(cb, PerformanceMonitorCallback)]
perf_callback = monitors[0]
perf_summary = perf_callback.get_performance_summary()

avg_batch_ms = perf_summary.get('avg_batch_time', 0) * 1000
peak_memory_mb = perf_summary.get('peak_memory_mb', 0)

print("\n🎯 Production Training Results:")
print(f"Total training time: {total_time:.2f}s")
print(f"Final validation accuracy: {final_acc:.4f}")
print(f"Average batch time: {avg_batch_ms:.1f}ms")
print(f"Peak memory usage: {peak_memory_mb:.1f}MB")

# Throughput: samples per epoch times epochs run, over total wall-clock time
total_samples = len(train_dataset) * prod_trainer.current_epoch
throughput = total_samples / total_time
print(f"Training throughput: {throughput:.0f} samples/second")
```

## Performance Best Practices

### Optimization Checklist

```python
def print_performance_checklist():
    """Print the performance checklist and the key metrics to monitor.

    Output goes to stdout: a banner, six categories of five items each, and a
    closing list of five target metrics.
    """

    banner = "=" * 70
    rule = "-" * 50

    # (category heading, checklist entries) in display order
    sections = (
        ("🚀 Model Architecture", (
            "Use LayerNorm instead of BatchNorm for mixed precision",
            "Prefer GELU over ReLU for better numerical properties",
            "Minimize model complexity without sacrificing accuracy",
            "Use efficient attention mechanisms (if applicable)",
            "Avoid unnecessary model.eval()/model.train() switches",
        )),
        ("⚡ Training Configuration", (
            "Enable mixed precision (precision='16-mixed')",
            "Use gradient accumulation for effective larger batches",
            "Apply gradient clipping (clip_val=1.0 typically)",
            "Optimize learning rate schedule (OneCycleLR often best)",
            "Use AdamW with appropriate eps for mixed precision",
        )),
        ("💾 Data Loading", (
            "Use multiple workers (2-8 typically optimal)",
            "Enable pin_memory for GPU training",
            "Use persistent_workers=True for repeated epochs",
            "Prefetch data with prefetch_factor=2-4",
            "Optimize batch size for your GPU memory",
        )),
        ("🔧 System Optimization", (
            "Compile model with torch.compile (PyTorch 2.0+)",
            "Use appropriate CUDA versions and drivers",
            "Optimize CPU/GPU data transfer",
            "Monitor and eliminate CPU/GPU bottlenecks",
            "Use NVMe SSDs for fast data loading",
        )),
        ("📊 Monitoring", (
            "Profile regularly with PyTorchProfiler",
            "Monitor GPU utilization and memory",
            "Track batch processing times",
            "Log performance metrics during training",
            "Set up alerts for performance degradation",
        )),
        ("🎯 Production Deployment", (
            "Reduce logging frequency in production",
            "Minimize validation frequency if appropriate",
            "Use efficient checkpointing strategies",
            "Implement proper error handling and recovery",
            "Plan for scaling and distributed training",
        )),
    )

    key_metrics = (
        "Batch processing time (target: <100ms for small models)",
        "GPU utilization (target: >80%)",
        "Memory usage (target: <90% of available)",
        "Training throughput (samples/second)",
        "Time to convergence",
    )

    # Header
    print(banner)
    print("🚀 PERFORMANCE OPTIMIZATION CHECKLIST")
    print(banner)

    # One titled group of unchecked boxes per category
    for heading, entries in sections:
        print(f"\n{heading}")
        print(rule)
        print("\n".join(f"  □ {entry}" for entry in entries))

    # Footer with the headline metrics
    print("\n" + banner)
    print("💡 Key Performance Metrics to Monitor:")
    for metric in key_metrics:
        print(f"  • {metric}")
    print(banner)

print_performance_checklist()

# Performance optimization summary: headline takeaways, one check mark each
print("\n📈 Performance Optimization Summary:")
for takeaway in (
    "Mixed precision training can provide 1.5-2x speedup",
    "Proper data loading can improve throughput by 2-5x",
    "Model compilation can provide 1.2-1.8x speedup",
    "Gradient accumulation enables training larger models",
    "Combined optimizations can provide 3-10x overall speedup",
):
    print(f"✓ {takeaway}")
print("\n🎯 Always measure performance improvements systematically!")
```

## Summary

This notebook covered comprehensive performance profiling and optimization:

1. **Lightning Profilers**: `SimpleProfiler` and `PyTorchProfiler` for identifying bottlenecks (`AdvancedProfiler` is imported but not demonstrated in this notebook)
2. **Data Loading Optimization**: num_workers, pin_memory, persistent_workers tuning
3. **Performance Monitoring**: Custom callbacks for tracking training metrics
4. **Resource Usage**: CPU, memory, and GPU utilization monitoring
5. **Production Optimization**: Complete optimization pipeline for deployment
6. **Best Practices**: Comprehensive checklist for performance optimization

Key profiling tools:
- **SimpleProfiler**: Basic timing information for quick analysis
- **PyTorchProfiler**: Detailed GPU/CPU profiling with TensorBoard integration
- **Custom Callbacks**: Real-time performance monitoring during training

Data loading optimizations:
- **num_workers**: 2-8 workers typically optimal
- **pin_memory**: Speeds up host-to-GPU transfers during GPU training; it provides no benefit on CPU-only runs
- **persistent_workers**: Reduces worker initialization overhead
- **prefetch_factor**: 2-4 for better pipeline utilization

Performance monitoring metrics:
- **Batch Time**: Target <100ms for efficient training
- **GPU Utilization**: Aim for >80% utilization
- **Memory Usage**: Keep <90% of available memory
- **Throughput**: Track samples processed per second

Production considerations:
- Combine multiple optimizations for maximum benefit
- Profile regularly to identify new bottlenecks
- Monitor performance degradation over time
- Plan for scaling and distributed training
- Implement proper error handling and recovery

Expected performance gains:
- Mixed precision: 1.5-2x speedup
- Data loading optimization: 2-5x improvement
- Model compilation: 1.2-1.8x speedup
- Combined optimizations: 3-10x overall improvement

Next notebook: We'll explore multi-GPU strategies and distributed training.