# Lab 1.3.5: Profiling Workshop - SOLUTIONS

This notebook contains complete solutions to the exercises in Lab 1.3.5.

---

## 🎯 Learning Objectives Checklist

By completing this lab, you should now be able to:
- [x] Profile GPU code using PyTorch Profiler
- [x] Understand Nsight Systems timeline analysis
- [x] Identify common bottlenecks (data loading, CPU↔GPU sync)
- [x] Apply optimizations based on profiling results

---

In [None]:
import gc
import time
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.profiler import profile, record_function, ProfilerActivity

# Pick the compute device once for the whole notebook: the GPU when PyTorch
# reports CUDA as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

## Solution: CNN Training Loop with Profiling

In [None]:
class SimpleCNN(nn.Module):
    """
    SOLUTION: small convolutional classifier for 3-channel images.

    Three Conv -> BatchNorm -> ReLU -> MaxPool stages (32, 64 and 128 output
    channels) feed a two-layer fully connected head. The first linear layer
    expects 128 * 4 * 4 features, which is what three 2x poolings leave of a
    32x32 input (32 -> 16 -> 8 -> 4).

    Args:
        num_classes: Size of the output (logit) dimension.
    """

    @staticmethod
    def _conv_stage(in_channels: int, out_channels: int) -> list:
        """Layers of one stage: 3x3 same-padded conv, batch norm, ReLU, 2x2 max-pool."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),  # halves height and width
        ]

    def __init__(self, num_classes: int = 10):
        super().__init__()

        # Stages are created in channel order and flattened into a single
        # Sequential, so sub-module indices (features.0 ... features.11) stay flat.
        stage_layers = []
        for in_ch, out_ch in ((3, 32), (32, 64), (64, 128)):
            stage_layers.extend(self._conv_stage(in_ch, out_ch))
        self.features = nn.Sequential(*stage_layers)

        head_layers = [
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the feature extractor and then the classifier head; returns the head's output."""
        return self.classifier(self.features(x))


# Smoke test: push one random 32x32 RGB image through a freshly built model
model = SimpleCNN()
model = model.to(device)
dummy_input = torch.randn(1, 3, 32, 32).to(device)
output = model(dummy_input)
print(f"Model output shape: {output.shape}")

# Model size: total element count over every parameter tensor
num_params = sum(param.numel() for param in model.parameters())
print(f"Total parameters: {num_params:,}")

In [None]:
# Create synthetic CIFAR-like data
def create_synthetic_image_data(n_samples: int = 10000,
                                 img_size: int = 32,
                                 n_classes: int = 10,
                                 seed: Optional[int] = None):
    """
    Create synthetic image classification data.

    Images are drawn from a standard normal distribution and labels uniformly
    from ``[0, n_classes)``; there is no relationship between the two, so the
    data is only useful for exercising and timing a training pipeline.

    Args:
        n_samples: Number of (image, label) pairs to generate.
        img_size: Height and width of each square, 3-channel image.
        n_classes: Number of distinct integer labels.
        seed: If given, images and labels are drawn from a private
            ``torch.Generator`` seeded with this value, so the dataset is
            reproducible and the global RNG state is left untouched. If
            ``None`` (default), the global RNG is used.

    Returns:
        TensorDataset holding a tensor of shape
        ``(n_samples, 3, img_size, img_size)`` and a tensor of shape
        ``(n_samples,)`` with the integer labels.
    """
    # generator=None makes torch fall back to the global default generator.
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    X = torch.randn(n_samples, 3, img_size, img_size, generator=generator)
    y = torch.randint(0, n_classes, (n_samples,), generator=generator)
    return TensorDataset(X, y)


# Build the in-memory dataset that the DataLoaders in the cells below iterate over
dataset = create_synthetic_image_data(10_000)
print(f"Dataset: {len(dataset)} samples")

In [None]:
def train_epoch_slow(model, dataloader, criterion, optimizer, device):
    """
    Baseline single-epoch training loop that deliberately keeps its bottlenecks.

    This is the "before" version used for profiling: do not optimize it.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Sized iterable yielding (data, target) batches.
        criterion: Callable mapping (output, target) to a scalar loss tensor.
        optimizer: Optimizer over the model's parameters.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Mean of the per-batch loss values as a Python float.
    """
    model.train()
    running_loss = 0.0

    for batch in dataloader:
        inputs, labels = batch

        # Bottleneck: blocking transfer
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)

        # Bottleneck: .item() pulls the scalar back to Python on every step,
        # and it happens before the backward pass is issued
        running_loss = running_loss + batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    num_batches = len(dataloader)
    return running_loss / num_batches


def train_epoch_optimized(model, dataloader, criterion, optimizer, device):
    """
    SOLUTION: single-epoch training loop with the profiling fixes applied.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Sized iterable yielding (data, target) batches.
        criterion: Callable mapping (output, target) to a scalar loss tensor.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches and the loss accumulator live on.

    Returns:
        Mean of the per-batch loss values as a Python float, read back from
        the device once after the loop.
    """
    model.train()
    # The running sum is a tensor on `device`, so adding to it needs no .item()
    running_loss = torch.tensor(0.0, device=device)

    for batch in dataloader:
        inputs, labels = batch

        # Fix 1: non_blocking transfer
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Fix 2: set_to_none is faster
        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), labels)

        # Fix 3: accumulate on GPU (detached, so no graph is kept alive)
        running_loss.add_(batch_loss.detach())

        batch_loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(dataloader)
    return mean_loss.item()

In [None]:
# Profile slow version
print("üìä Profiling SLOW training loop")
print("="*60)

model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Slow dataloader (no optimizations): batches are loaded in the main process
dataloader_slow = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0)

# `device` may have fallen back to the CPU. Only request CUDA activity (and
# only sort by CUDA time) when a GPU is actually in use; otherwise the
# profiler warns and the sort column would be all zeros.
use_cuda = device.type == 'cuda'
profiler_activities = [ProfilerActivity.CPU]
if use_cuda:
    profiler_activities.append(ProfilerActivity.CUDA)
sort_key = "cuda_time_total" if use_cuda else "cpu_time_total"

with profile(activities=profiler_activities, record_shapes=True) as prof:
    # The returned mean loss is not needed here; only the recorded trace is.
    train_epoch_slow(model, dataloader_slow, criterion, optimizer, device)

print(prof.key_averages().table(sort_by=sort_key, row_limit=10))

In [None]:
# Profile optimized version
print("\nüìä Profiling OPTIMIZED training loop")
print("="*60)

model = SimpleCNN().to(device)
# Defined here so this cell does not depend on the loss object created by
# the slow-profiling cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# `device` may have fallen back to the CPU; pinned memory and CUDA profiling
# only make sense when a GPU is actually in use.
use_cuda = device.type == 'cuda'

# Optimized dataloader
dataloader_fast = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2,             # load batches in background worker processes
    pin_memory=use_cuda,       # page-locked host buffers; skipped on CPU to avoid a warning
    persistent_workers=True    # keep the workers alive between epochs
)

# Warm up: one untimed epoch so worker start-up and other one-time costs
# are not part of the recorded trace
train_epoch_optimized(model, dataloader_fast, criterion, optimizer, device)

profiler_activities = [ProfilerActivity.CPU]
if use_cuda:
    profiler_activities.append(ProfilerActivity.CUDA)
sort_key = "cuda_time_total" if use_cuda else "cpu_time_total"

with profile(activities=profiler_activities, record_shapes=True) as prof_opt:
    train_epoch_optimized(model, dataloader_fast, criterion, optimizer, device)

print(prof_opt.key_averages().table(sort_by=sort_key, row_limit=10))

In [None]:
# Benchmark comparison
print("\n‚è±Ô∏è Benchmark: Slow vs Optimized")
print("="*60)

N_BENCH_EPOCHS = 3  # timed epochs per variant (plus one untimed warm-up epoch each)


def _sync_device():
    """Wait for queued GPU work to finish; does nothing when running on the CPU."""
    # torch.cuda.synchronize() raises on machines without CUDA, and `device`
    # is allowed to fall back to the CPU, so the call must be guarded.
    if device.type == 'cuda':
        torch.cuda.synchronize()


def time_training(train_fn, dataloader, n_epochs=N_BENCH_EPOCHS):
    """
    Return the mean wall-clock seconds per epoch of `train_fn`.

    A fresh model, optimizer and loss are built for each measurement, and one
    untimed warm-up epoch is run first so both variants are measured under
    the same conditions.

    Args:
        train_fn: Function with the signature
            (model, dataloader, criterion, optimizer, device).
        dataloader: Batches to train on.
        n_epochs: Number of timed epochs to average over.
    """
    bench_model = SimpleCNN().to(device)
    bench_optimizer = optim.Adam(bench_model.parameters())
    bench_criterion = nn.CrossEntropyLoss()

    # Warm up (untimed)
    train_fn(bench_model, dataloader, bench_criterion, bench_optimizer, device)

    # Synchronize on both sides of the timed region so pending GPU work from
    # before is excluded and work launched inside is fully counted.
    _sync_device()
    start = time.perf_counter()
    for _ in range(n_epochs):
        train_fn(bench_model, dataloader, bench_criterion, bench_optimizer, device)
    _sync_device()
    return (time.perf_counter() - start) / n_epochs


time_slow = time_training(train_epoch_slow, dataloader_slow)
time_optimized = time_training(train_epoch_optimized, dataloader_fast)

print(f"Slow version:      {time_slow:.3f} seconds/epoch")
print(f"Optimized version: {time_optimized:.3f} seconds/epoch")
print(f"\nüöÄ Speedup: {time_slow/time_optimized:.2f}x")

## Key Optimizations Applied:

1. **DataLoader optimizations:**
   - `num_workers=2`: Parallel data loading
   - `pin_memory=True`: Faster CPU→GPU transfers
   - `persistent_workers=True`: Don't restart workers each epoch

2. **Transfer optimizations:**
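   - `non_blocking=True`: Async data transfer (only truly asynchronous when the source tensor is in pinned memory, hence `pin_memory=True` above)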

3. **Training loop optimizations:**
   - `zero_grad(set_to_none=True)`: Faster than setting to zero
   - Accumulate loss on GPU: Avoid `.item()` sync

4. **Memory optimizations:**
   - `loss.detach()`: Don't keep computation graph for loss accumulation

## Cleanup

In [None]:
# Drop every notebook-level name created above that can keep tensors, worker
# processes or recorded profiler traces alive. Names are removed by lookup
# rather than `del`, so the cell can be re-run (or run after a partial
# execution) without raising NameError.
for _name in (
    "model", "optimizer", "dataset", "dataloader_slow", "dataloader_fast",
    "dummy_input", "output",   # tensors left over from the model smoke test
    "prof", "prof_opt",        # profiler results from the two profiling cells
):
    globals().pop(_name, None)
del _name

gc.collect()
# Hand cached, now-unreferenced GPU blocks back to the driver
torch.cuda.empty_cache()

print("‚úÖ Cleanup complete")
print("\nüéì Module 1.3: CUDA Python & GPU Programming - COMPLETE!")