# Task 6.5: Profiling Workshop - SOLUTIONS

Complete solutions for the Profiling Workshop exercises.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import time

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## Exercise Solution: Finding and Fixing Bottlenecks

The inefficient training loop has several issues. Let's identify and fix them.

In [None]:
# Setup
class SimpleModel(nn.Module):
    """Small CNN classifier: two conv/ReLU/max-pool stages plus a two-layer MLP.

    Both convolutions use 3x3 kernels with padding=1 and each stage ends in
    a 2x2 max-pool. The first linear layer expects 64 * 8 * 8 features, so
    inputs must be 3-channel 32x32 images. The model emits 10 logits per
    sample.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 3 -> 32 -> 64 channels, spatial size halved twice.
        self.conv = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # Classifier head operating on the flattened feature map.
        self.fc = nn.Sequential(
            nn.Linear(64 * 8 * 8, 256),
            nn.ReLU(),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        """Return a (batch, 10) tensor of logits for the image batch ``x``."""
        x = self.conv(x)
        # flatten() falls back to a copy when the activations are not
        # contiguous (e.g. channels_last inputs), where view() would raise.
        x = x.flatten(1)
        return self.fc(x)

# Preprocessing: convert each image to a tensor, then normalise every RGB
# channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 training split, downloaded into ./data when it is not already there.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)

In [None]:
# ORIGINAL INEFFICIENT VERSION
def inefficient_training(model, trainset, epochs=1, max_batches=100):
    """
    Deliberately slow baseline training loop; returns the elapsed seconds.

    This function has FOUR performance issues, kept on purpose so they can
    be profiled and then fixed:

    Issue 1: num_workers=0 - Data loading is single-threaded
    Issue 2: No pin_memory / non_blocking - Slower CPU-GPU transfer
    Issue 3: Using set_to_none=False - Slower gradient zeroing
    Issue 4: loss.item() every iteration - Forces CPU-GPU sync

    Args:
        model: Module to train. It is not moved by this function; the
            caller is responsible for placing it on ``device``.
        trainset: Dataset yielding ``(inputs, labels)`` pairs.
        epochs: Number of passes over the loader.
        max_batches: Zero-based index of the last batch trained in each
            epoch. The limit is checked after a batch has been processed,
            so up to ``max_batches + 1`` batches run per epoch.

    Returns:
        Wall-clock seconds spent in the training loop (float).
    """
    # Issue 1: num_workers=0 means all data loading happens on main thread
    loader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=0)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    start = time.time()
    for epoch in range(epochs):
        for i, (inputs, labels) in enumerate(loader):
            # Issue 2: Blocking transfer from pageable (non-pinned) memory
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Issue 3: Zero the gradients in place instead of dropping them.
            # Passed explicitly because PyTorch >= 2.0 defaults to
            # set_to_none=True, which would silently remove this issue.
            optimizer.zero_grad(set_to_none=False)
            loss.backward()
            optimizer.step()

            # Issue 4: .item() forces CPU-GPU synchronization EVERY iteration
            if i % 1 == 0:  # Every iteration!
                print(f"Loss: {loss.item():.4f}", end='\r')

            # Checked after the batch ran, hence max_batches + 1 batches.
            if i >= max_batches:
                break

    # The .item() call above already synchronised after the final batch's
    # forward pass; the final backward/step may still be queued on the GPU.
    return time.time() - start

# Benchmark inefficient version on a freshly initialised model, using the
# function defaults (epochs=1, max_batches=100).
# NOTE(review): this is the first training run in the notebook, so any
# one-time start-up cost (e.g. CUDA initialisation) presumably lands in this
# measurement - consider a short warm-up run before timing; confirm on target HW.
model = SimpleModel().to(device)
inefficient_time = inefficient_training(model, trainset)
print(f"\nInefficient time: {inefficient_time:.2f}s")

In [None]:
# OPTIMIZED VERSION
def efficient_training(model, trainset, epochs=1, max_batches=100):
    """
    Optimized training loop; returns the elapsed wall-clock seconds.

    Fixed version with all issues resolved:

    Fix 1: num_workers=4 - Parallel data loading
    Fix 2: pin_memory=True + non_blocking transfer (pinning only on CUDA)
    Fix 3: set_to_none=True - Faster gradient zeroing
    Fix 4: Reduced logging frequency, accumulate loss

    Args:
        model: Module to train. It is not moved by this function; the
            caller is responsible for placing it on ``device``.
        trainset: Dataset yielding ``(inputs, labels)`` pairs.
        epochs: Number of passes over the loader.
        max_batches: Zero-based index of the last batch trained in each
            epoch. The limit is checked after a batch has been processed,
            so up to ``max_batches + 1`` batches run per epoch (the same
            limit semantics as ``inefficient_training``).

    Returns:
        Wall-clock seconds spent in the training loop (float), measured
        after all queued GPU work has finished.
    """
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only run is useless and makes the DataLoader emit a warning.
    use_cuda = device.type == 'cuda'
    log_interval = 20  # Batches between loss printouts (one sync each)

    # Fix 1 & 2: Multi-worker loading with pinned memory
    loader = DataLoader(
        trainset,
        batch_size=32,
        shuffle=True,
        num_workers=4,           # Fix 1: Parallel data loading
        pin_memory=use_cuda,     # Fix 2: Pinned memory for faster transfer
        persistent_workers=True  # Bonus: Keep workers alive
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    start = time.time()
    for epoch in range(epochs):
        running_loss = 0.0  # Accumulate without sync

        for i, (inputs, labels) in enumerate(loader):
            # Fix 2: Non-blocking transfer
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Fix 3: set_to_none is faster than zeroing
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

            # Fix 4: Accumulate loss as tensor, only sync occasionally
            running_loss += loss.detach()  # detach() keeps it on GPU

            if (i + 1) % log_interval == 0:  # Log every 20 batches, not every 1
                avg_loss = running_loss.item() / log_interval  # Single sync point
                print(f"Batch {i+1}: Loss: {avg_loss:.4f}", end='\r')
                running_loss = 0.0

            # Checked after the batch ran, hence max_batches + 1 batches.
            if i >= max_batches:
                break

    # CUDA kernels run asynchronously: without this barrier the timer could
    # stop while batches issued after the last .item() are still executing,
    # making the optimized loop look faster than it really is.
    if use_cuda:
        torch.cuda.synchronize()
    return time.time() - start

# Benchmark efficient version on a new, freshly initialised model so both
# runs start from untrained weights.
model = SimpleModel().to(device)
efficient_time = efficient_training(model, trainset)
print(f"\nEfficient time: {efficient_time:.2f}s")
# Ratio of the two timings; relies on inefficient_time from the baseline
# benchmark cell. A value above 1 means the optimized loop was faster.
print(f"Speedup: {inefficient_time/efficient_time:.2f}x")

## Summary of Fixes

| Issue | Problem | Fix | Impact |
|-------|---------|-----|--------|
| 1 | num_workers=0 | num_workers=4 | Data loading parallelized |
| 2 | No pin_memory | pin_memory=True + non_blocking | Faster CPU-GPU transfer |
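| 3 | zero_grad() zeroing gradients in place (`set_to_none=False`; note that `set_to_none=True` is already the default in PyTorch 2.0+) | set_to_none=True | ~5-10% speedup (workload-dependent) |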
| 4 | .item() every iteration | Accumulate, sync occasionally | Reduced CPU-GPU sync |

### Additional Optimizations

- `persistent_workers=True`: Workers stay alive between epochs
- `prefetch_factor`: Increase to prefetch more batches
- Larger batch size: Better GPU utilization

In [None]:
# Cleanup
import gc

# Collect unreachable Python objects first so the tensors they still hold
# are released, then hand the allocator's now-unused cached blocks back.
# Emptying the cache before collecting would leave those blocks cached.
gc.collect()
torch.cuda.empty_cache()
print("Cleanup complete!")