# Lab 1.4.6 Solution: GPU Acceleration Exercises

This notebook contains solutions to the exercises from Lab 1.4.6.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
import os

# Check PyTorch availability
# PYTORCH_AVAILABLE records whether the torch imports succeeded; the cells
# below test it before touching any torch name.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    PYTORCH_AVAILABLE = True
    print(f"PyTorch version: {torch.__version__}")
except ImportError:
    PYTORCH_AVAILABLE = False
    print("PyTorch not available")

# Report the first CUDA device (index 0) and its total memory in GB (bytes / 1e9).
if PYTORCH_AVAILABLE and torch.cuda.is_available():
    print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Seed NumPy's and (when present) PyTorch's global RNGs with the same value.
np.random.seed(42)
if PYTORCH_AVAILABLE:
    torch.manual_seed(42)

%matplotlib inline

In [None]:
# Load MNIST data
import gzip
import urllib.request

def load_mnist(path='../data'):
    """
    Load the MNIST train/test splits as NumPy arrays.

    Each of the four gzip-compressed IDX files is read from `path`; a file
    that is not there yet is downloaded first.

    Args:
        path: Directory holding (or receiving) the `.gz` files. It is created
            if it does not exist.

    Returns:
        Tuple `(train_images, train_labels, test_images, test_labels)`.
        Images are float32 arrays of shape (N, 784) scaled to [0, 1];
        labels are uint8 arrays of shape (N,).

    Raises:
        OSError: If a missing file cannot be downloaded from any mirror.
        ValueError: If a file does not start with the expected IDX magic
            number, or its payload is not a whole number of 784-pixel images.
    """
    os.makedirs(path, exist_ok=True)
    # Mirrors are tried in order. The original host has been unreliable for
    # scripted downloads, so it is kept only as a fallback.
    base_urls = (
        'https://ossci-datasets.s3.amazonaws.com/mnist/',
        'http://yann.lecun.com/exdb/mnist/',
    )
    files = {
        'train_images': 'train-images-idx3-ubyte.gz',
        'train_labels': 'train-labels-idx1-ubyte.gz',
        'test_images': 't10k-images-idx3-ubyte.gz',
        'test_labels': 't10k-labels-idx1-ubyte.gz'
    }
    # First four header bytes (big-endian) of IDX image / label files.
    image_magic, label_magic = 2051, 2049

    def download(filename):
        """Return the local path of `filename`, fetching it if absent."""
        filepath = os.path.join(path, filename)
        if os.path.exists(filepath):
            return filepath

        print(f"Downloading {filename}...")
        # Download to a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete file.
        partial = filepath + '.part'
        failures = []
        for base_url in base_urls:
            try:
                urllib.request.urlretrieve(base_url + filename, partial)
            except OSError as exc:  # URLError/HTTPError are OSError subclasses
                failures.append(f"{base_url}: {exc}")
                if os.path.exists(partial):
                    os.remove(partial)
                continue
            os.replace(partial, filepath)
            return filepath
        raise OSError(f"Could not download {filename}: " + "; ".join(failures))

    def check_magic(header, expected, fp):
        """Raise ValueError unless `header` starts with the `expected` magic."""
        magic = int.from_bytes(header[:4], 'big')
        if magic != expected:
            raise ValueError(
                f"{fp}: unexpected IDX magic number {magic} (expected {expected})"
            )

    def load_images(fp):
        with gzip.open(fp, 'rb') as f:
            # 16-byte header: magic, image count, rows, cols.
            check_magic(f.read(16), image_magic, fp)
            return np.frombuffer(f.read(), dtype=np.uint8).reshape(-1, 784).astype(np.float32) / 255.0

    def load_labels(fp):
        with gzip.open(fp, 'rb') as f:
            # 8-byte header: magic, label count.
            check_magic(f.read(8), label_magic, fp)
            return np.frombuffer(f.read(), dtype=np.uint8)

    return (load_images(download(files['train_images'])),
            load_labels(download(files['train_labels'])),
            load_images(download(files['test_images'])),
            load_labels(download(files['test_labels'])))

# Fetch (or reuse cached) MNIST and unpack the four arrays.
mnist_arrays = load_mnist()
X_train_np, y_train_np, X_test_np, y_test_np = mnist_arrays
print(f"Loaded {len(X_train_np)} training samples")

## Exercise 1 Solution: Train a Larger Model

Train a model with architecture `[784, 1024, 512, 256, 128, 10]` and compare CPU vs GPU times.

In [None]:
if PYTORCH_AVAILABLE:
    class LargerMLP(nn.Module):
        """
        Fully connected network with ReLU between consecutive linear layers.

        The default architecture is [784, 1024, 512, 256, 128, 10], which has
        1,494,154 (~1.5M) parameters vs ~250K in the smaller version.

        Args:
            layer_sizes: Widths of every layer from input to output. Needs at
                least two entries (input size and output size). The output
                layer has no activation, so the network returns raw logits.

        Raises:
            ValueError: If fewer than two layer sizes are given.
        """

        def __init__(self, layer_sizes=(784, 1024, 512, 256, 128, 10)):
            super().__init__()

            layer_sizes = list(layer_sizes)
            if len(layer_sizes) < 2:
                raise ValueError(
                    "layer_sizes needs at least an input size and an output size"
                )

            # Linear -> ReLU pairs, then drop the trailing ReLU so the last
            # layer emits logits. All layers are constructed before any is
            # re-initialised, which keeps the RNG consumption order (and so
            # the seeded weights) the same as an explicit layer list.
            layers = []
            for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
                layers.append(nn.Linear(in_features, out_features))
                layers.append(nn.ReLU())
            layers.pop()

            self.model = nn.Sequential(*layers)

            # He initialization
            for m in self.modules():
                if isinstance(m, nn.Linear):
                    nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                    nn.init.zeros_(m.bias)

        def forward(self, x):
            """Return the logits for a batch `x` whose last dimension is the input size."""
            return self.model(x)

    # Count parameters
    model = LargerMLP()
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

In [None]:
if PYTORCH_AVAILABLE:
    def train_model(model, train_loader, epochs, lr, device):
        """
        Train `model` with plain SGD and cross-entropy loss; return the elapsed time.

        Args:
            model: nn.Module that maps a batch of inputs to class logits.
                It is moved to `device` and trained in place.
            train_loader: Iterable yielding `(inputs, targets)` batches.
            epochs: Number of full passes over `train_loader`.
            lr: SGD learning rate.
            device: `torch.device` (or a device string) to train on.

        Returns:
            float: Seconds spent in the timed training loop. The CUDA warm-up
            step is excluded from the measurement.
        """
        device = torch.device(device)
        model = model.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=lr)

        def train_step(X_batch, y_batch):
            """Run one forward / backward / SGD update on a single batch."""
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Warmup for GPU
        if device.type == 'cuda':
            # The warm-up exists only to absorb one-time CUDA start-up cost.
            # Snapshot the weights and the CPU RNG state first and restore
            # them afterwards, so a GPU run trains for exactly as many steps,
            # on the same shuffled batch order, as a CPU run with the same seed.
            initial_weights = {
                name: tensor.clone() for name, tensor in model.state_dict().items()
            }
            rng_state = torch.get_rng_state()
            for X_batch, y_batch in train_loader:
                train_step(X_batch, y_batch)
                break
            model.load_state_dict(initial_weights)
            torch.set_rng_state(rng_state)
            optimizer.zero_grad()
            torch.cuda.synchronize()

        # perf_counter is monotonic, unlike time.time, so clock adjustments
        # cannot distort the measurement.
        start_time = time.perf_counter()

        for _ in range(epochs):
            for X_batch, y_batch in train_loader:
                train_step(X_batch, y_batch)

        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        if device.type == 'cuda':
            torch.cuda.synchronize()

        return time.perf_counter() - start_time

In [None]:
if PYTORCH_AVAILABLE:
    print("Exercise 1: Training Larger Model [784, 1024, 512, 256, 128, 10]")
    print("=" * 70)

    # Wrap the NumPy arrays as tensors: float inputs, integer class targets.
    X_train_torch, X_test_torch = torch.FloatTensor(X_train_np), torch.FloatTensor(X_test_np)
    y_train_torch, y_test_torch = torch.LongTensor(y_train_np), torch.LongTensor(y_test_np)

    train_dataset = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    EPOCHS, LR = 5, 0.1

    # ---- CPU run ----
    print(f"\nTraining on CPU...")
    torch.manual_seed(42)  # same seed before each model so initial weights match
    model_cpu = LargerMLP()
    time_cpu = train_model(model_cpu, train_loader, EPOCHS, LR, torch.device('cpu'))

    # Test-set accuracy: fraction of argmax predictions equal to the labels.
    model_cpu.eval()
    with torch.no_grad():
        preds_cpu = model_cpu(X_test_torch).argmax(dim=1)
        acc_cpu = preds_cpu.eq(y_test_torch).float().mean().item()

    print(f"CPU Time: {time_cpu:.2f}s | Accuracy: {acc_cpu:.2%}")

    # ---- GPU run (only when CUDA is present) ----
    if not torch.cuda.is_available():
        print("\nGPU not available for comparison.")
    else:
        print(f"\nTraining on GPU...")
        torch.manual_seed(42)
        model_gpu = LargerMLP()
        time_gpu = train_model(model_gpu, train_loader, EPOCHS, LR, torch.device('cuda'))

        # Predict on the GPU, then bring predictions back to compare with CPU labels.
        model_gpu.eval()
        with torch.no_grad():
            preds_gpu = model_gpu(X_test_torch.cuda()).argmax(dim=1).cpu()
            acc_gpu = preds_gpu.eq(y_test_torch).float().mean().item()

        print(f"GPU Time: {time_gpu:.2f}s | Accuracy: {acc_gpu:.2%}")

        # Summary
        print(f"\n" + "=" * 70)
        print(f"SPEEDUP: {time_cpu / time_gpu:.1f}x faster on GPU!")
        print(f"\nWith the larger model, GPU advantage is more pronounced")
        print(f"because there's more computation to parallelize.")

## Exercise 2 Solution: Mixed Precision Training

Use `torch.amp` (the device-agnostic successor to the deprecated `torch.cuda.amp` namespace) for automatic mixed precision training to see additional speedup.

In [None]:
if PYTORCH_AVAILABLE and torch.cuda.is_available():
    def train_mixed_precision(model, train_loader, epochs, lr):
        """
        Train with Automatic Mixed Precision (AMP) and return the elapsed time.

        The autocast dtype is chosen from what the GPU reports:
        - `torch.cuda.is_bf16_supported()` true (e.g. DGX Spark / Blackwell):
          bfloat16, no scaler needed
        - otherwise: float16 with GradScaler

        Key insight: bfloat16 has same dynamic range as float32, so it's
        more stable and doesn't need gradient scaling.

        Args:
            model: nn.Module producing class logits; moved to CUDA and
                trained in place.
            train_loader: Iterable yielding `(inputs, targets)` batches.
            epochs: Number of full passes over `train_loader`.
            lr: SGD learning rate.

        Returns:
            float: Seconds spent in the timed training loop. The warm-up
            step is excluded from the measurement.
        """
        device = torch.device('cuda')
        model = model.to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=lr)

        # Detect best precision for this GPU
        use_bfloat16 = torch.cuda.is_bf16_supported()
        dtype = torch.bfloat16 if use_bfloat16 else torch.float16

        def make_scaler():
            """GradScaler only needed for float16, NOT for bfloat16."""
            return None if use_bfloat16 else torch.amp.GradScaler('cuda')

        print(f"   Using {'bfloat16 (optimal for DGX Spark)' if use_bfloat16 else 'float16 with GradScaler'}")

        def train_step(X_batch, y_batch, scaler):
            """Run one autocast forward pass, backward pass and optimizer update."""
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            # Forward pass with autocast (uses mixed precision)
            with torch.amp.autocast(device_type='cuda', dtype=dtype):
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

            # Backward pass (with scaling only for float16)
            if scaler is not None:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

        # Warmup
        # The warm-up only absorbs one-time CUDA start-up cost. It runs with a
        # throwaway scaler, and the weights and CPU RNG state are restored
        # afterwards, so the timed run starts from the seeded initial weights
        # and sees the same shuffled batch order as a run without warm-up.
        initial_weights = {
            name: tensor.clone() for name, tensor in model.state_dict().items()
        }
        rng_state = torch.get_rng_state()
        for X_batch, y_batch in train_loader:
            train_step(X_batch, y_batch, make_scaler())
            break
        model.load_state_dict(initial_weights)
        torch.set_rng_state(rng_state)
        optimizer.zero_grad()
        torch.cuda.synchronize()

        scaler = make_scaler()
        # perf_counter is monotonic, unlike time.time.
        start_time = time.perf_counter()

        for _ in range(epochs):
            for X_batch, y_batch in train_loader:
                train_step(X_batch, y_batch, scaler)

        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize()
        return time.perf_counter() - start_time

    print("Mixed precision training function defined.")

In [None]:
if not (PYTORCH_AVAILABLE and torch.cuda.is_available()):
    print("GPU not available. Mixed precision training requires CUDA.")
else:
    print("Exercise 2: Mixed Precision Training Comparison")
    print("=" * 70)

    EPOCHS, LR = 5, 0.1

    # Baseline: full float32 training on the GPU.
    print("\n1. Standard float32 training on GPU...")
    torch.manual_seed(42)  # same seed before each model so initial weights match
    model_fp32 = LargerMLP()
    time_fp32 = train_model(model_fp32, train_loader, EPOCHS, LR, torch.device('cuda'))

    model_fp32.eval()
    with torch.no_grad():
        preds_fp32 = model_fp32(X_test_torch.cuda()).argmax(dim=1).cpu()
        acc_fp32 = preds_fp32.eq(y_test_torch).float().mean().item()

    print(f"   Time: {time_fp32:.2f}s | Accuracy: {acc_fp32:.2%}")

    # AMP run: train_mixed_precision picks bfloat16 or float16 for this GPU.
    print("\n2. Mixed precision (AMP) training on GPU...")
    torch.manual_seed(42)
    model_amp = LargerMLP()
    time_amp = train_mixed_precision(model_amp, train_loader, EPOCHS, LR)

    model_amp.eval()
    with torch.no_grad():
        preds_amp = model_amp(X_test_torch.cuda()).argmax(dim=1).cpu()
        acc_amp = preds_amp.eq(y_test_torch).float().mean().item()

    print(f"   Time: {time_amp:.2f}s | Accuracy: {acc_amp:.2%}")

    # AMP speedup relative to the float32 GPU baseline.
    speedup = time_fp32 / time_amp

    # Summary table
    heavy_rule = "=" * 70
    print(f"\n" + heavy_rule)
    print("RESULTS SUMMARY")
    print(heavy_rule)
    print(f"{'Method':<25} {'Time (s)':<12} {'Speedup':<15} {'Accuracy'}")
    print("-" * 70)
    print(f"{'GPU (float32)':<25} {time_fp32:<12.2f} {'1.0x (baseline)':<15} {acc_fp32:.2%}")
    print(f"{'GPU (mixed precision)':<25} {time_amp:<12.2f} {f'{speedup:.2f}x':<15} {acc_amp:.2%}")
    print(heavy_rule)

    print(f"\nMixed precision speedup: {speedup:.2f}x")
    for note in (
        "\nNote: Speedup varies by GPU architecture:",
        "  - Tensor Cores (Volta+): 2-3x speedup",
        "  - Without Tensor Cores: ~1.2-1.5x speedup",
        "  - DGX Spark (Blackwell): Excellent AMP support with 192 Tensor Cores",
    ):
        print(note)

In [None]:
# Visualize comparison
# Two bar charts over the CPU / GPU float32 / GPU AMP runs: wall-clock
# training time (left) and speedup relative to the CPU run (right).
if PYTORCH_AVAILABLE and torch.cuda.is_available():
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    methods = ['CPU', 'GPU (FP32)', 'GPU (AMP)']
    times = [time_cpu, time_fp32, time_amp]
    accs = [acc_cpu, acc_fp32, acc_amp]
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

    # Time comparison
    bars = axes[0].bar(methods, times, color=colors, edgecolor='black', linewidth=1.5)
    axes[0].set_ylabel('Time (seconds)', fontsize=12)
    # Take the epoch count from EPOCHS so the title cannot drift from the runs.
    axes[0].set_title(f'Training Time Comparison ({EPOCHS} epochs)', fontsize=14)
    # bar_label pads in points, so labels sit just above the bars whatever
    # the magnitude of the data (a fixed data-unit offset does not scale).
    axes[0].bar_label(bars, labels=[f'{t:.2f}s' for t in times], padding=3, fontsize=11)
    axes[0].margins(y=0.15)  # headroom so the labels stay inside the axes
    axes[0].grid(True, alpha=0.3, axis='y')

    # Speedup comparison
    speedups = [1.0, time_cpu/time_fp32, time_cpu/time_amp]
    bars = axes[1].bar(methods, speedups, color=colors, edgecolor='black', linewidth=1.5)
    axes[1].set_ylabel('Speedup vs CPU', fontsize=12)
    axes[1].set_title('Speedup Factor', fontsize=14)
    # Reference line at 1x (CPU parity).
    axes[1].axhline(y=1, color='red', linestyle='--', alpha=0.5)
    axes[1].bar_label(bars, labels=[f'{s:.1f}x' for s in speedups], padding=3, fontsize=11)
    axes[1].margins(y=0.15)
    axes[1].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

## Bonus: BFloat16 Training (DGX Spark Optimal)

DGX Spark's Blackwell GPU has native bfloat16 support, which provides:
- Same dynamic range as float32 (better stability than float16)
- Same speed benefits as float16
- No need for gradient scaling in most cases

In [None]:
if PYTORCH_AVAILABLE and torch.cuda.is_available():
    # Check if bfloat16 is supported
    if torch.cuda.is_bf16_supported():
        print("BFloat16 Training (Optimal for DGX Spark)")
        print("=" * 70)

        def train_bfloat16(model, train_loader, epochs, lr):
            """
            Train with bfloat16 precision using the modern PyTorch API.

            This is the OPTIMAL approach for DGX Spark's Blackwell GPU!

            Args:
                model: nn.Module producing class logits; moved to CUDA and
                    trained in place.
                train_loader: Iterable yielding `(inputs, targets)` batches.
                epochs: Number of full passes over `train_loader`.
                lr: SGD learning rate.

            Returns:
                float: Seconds spent in the timed training loop. The warm-up
                step is excluded from the measurement.
            """
            device = torch.device('cuda')
            model = model.to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(model.parameters(), lr=lr)

            def train_step(X_batch, y_batch):
                """Run one bfloat16-autocast forward pass, backward pass and SGD update."""
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()

                # Use modern torch.amp.autocast API with explicit dtype.
                # No GradScaler needed for bfloat16!
                with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16):
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)

                loss.backward()
                optimizer.step()

            # Warmup
            # The warm-up only absorbs one-time CUDA start-up cost. Restore
            # the weights and CPU RNG state afterwards so the timed run starts
            # from the seeded initial weights and sees the same shuffled batch
            # order as a run without warm-up.
            initial_weights = {
                name: tensor.clone() for name, tensor in model.state_dict().items()
            }
            rng_state = torch.get_rng_state()
            for X_batch, y_batch in train_loader:
                train_step(X_batch, y_batch)
                break
            model.load_state_dict(initial_weights)
            torch.set_rng_state(rng_state)
            optimizer.zero_grad()
            torch.cuda.synchronize()

            # perf_counter is monotonic, unlike time.time.
            start_time = time.perf_counter()

            for _ in range(epochs):
                for X_batch, y_batch in train_loader:
                    train_step(X_batch, y_batch)

            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()
            return time.perf_counter() - start_time

        torch.manual_seed(42)
        model_bf16 = LargerMLP()
        time_bf16 = train_bfloat16(model_bf16, train_loader, EPOCHS, LR)

        # Test-set accuracy: predict on the GPU, compare on the CPU.
        model_bf16.eval()
        with torch.no_grad():
            preds_bf16 = model_bf16(X_test_torch.cuda()).argmax(dim=1).cpu()
            acc_bf16 = (preds_bf16 == y_test_torch).float().mean().item()

        print(f"BFloat16 Time: {time_bf16:.2f}s | Accuracy: {acc_bf16:.2%}")
        print(f"\nBFloat16 advantages:")
        print(f"  - Same dynamic range as float32 (more stable than float16)")
        print(f"  - No gradient scaling needed (simpler code)")
        print(f"  - Native support on Blackwell GPUs (DGX Spark)")
        print(f"  - Uses modern torch.amp.autocast() API")
    else:
        print("BFloat16 not supported on this GPU.")
        print("Note: DGX Spark's Blackwell GPU has native bfloat16 support.")

---

## Key Takeaways

1. **Larger models benefit more from GPU** - More computation = more parallelism
2. **Mixed precision (AMP) provides additional speedup** - Often 1.5-3x on top of GPU acceleration
3. **BFloat16 is ideal for training** - Better stability than float16, same speed
4. **DGX Spark advantages**:
   - 128GB unified memory allows huge batch sizes
   - 192 Tensor Cores accelerate mixed precision
   - Native bfloat16 support on Blackwell

### When to Use Each Precision:
- **float32**: Default, maximum compatibility
- **float16 (AMP)**: General GPU training speedup
- **bfloat16**: Best for training (if supported) - DGX Spark optimal
- **int8/FP4**: Inference only (quantized models)

In [None]:
# Cleanup
import gc

# Run the garbage collector first: tensors held only by unreachable Python
# objects are handed back to PyTorch's caching allocator at this point.
gc.collect()

if PYTORCH_AVAILABLE and torch.cuda.is_available():
    # Only now release the allocator's unused cached blocks to the GPU.
    # Doing this before gc.collect() would miss whatever the collector frees.
    torch.cuda.empty_cache()

print("Cleanup complete!")