# MNIST CNN Profiling & Optimization
- Step 1: Baseline Model (No Optimizations)
- Step 2: Profiling to Find Bottlenecks
- Step 3: Optimized Model
- Step 4: Performance Comparison

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
from torch.profiler import profile, record_function, ProfilerActivity
import matplotlib.pyplot as plt

In [2]:
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Step 1: Baseline Model
- Simple CNN for MNIST
- No optimizations (default settings)

In [3]:
transform = transforms.ToTensor()
train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
test_data = datasets.MNIST(root="data", train=False, download=True, transform=transform)

# DataLoader (baseline)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000)

In [4]:
# Define model
class BaselineCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.Linear(1600, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.max_pool2d(x, 2)
        x = torch.relu(self.conv2(x))
        x = torch.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
model = BaselineCNN().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

### Training/Inference Timing (Baseline)

In [6]:
def train(model, loader, epochs=2):
    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    return time.time() - start_time

def inference(model, loader):
    model.eval()
    correct = 0
    start_time = time.time()
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    acc = correct / len(loader.dataset)
    return acc, time.time() - start_time

In [7]:
baseline_train_time = train(model, train_loader)
baseline_acc, baseline_inference_time = inference(model, test_loader)

print(f"Baseline Training Time: {baseline_train_time:.2f}s")
print(f"Baseline Inference Time: {baseline_inference_time:.4f}s | Accuracy: {baseline_acc*100:.2f}%")

Baseline Training Time: 4.90s
Baseline Inference Time: 0.2392s | Accuracy: 98.89%


## Step 2: Profiling the Baseline
- Identify bottlenecks with PyTorch Profiler

In [8]:
def profile_model():
    model.train()
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        with_stack=True,
    ) as prof:
        with record_function("train_batch"):
            data, target = next(iter(train_loader))
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))

profile_model()

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            train_batch         0.00%       0.000us         0.00%       0.000us       0.000us       2.642ms       160.68%       2.642ms       2.642ms             1  
                                            train_batch        20.22%       2.796ms        92.04%      12.727ms      12.727ms       0.000us         0.00%       1.027ms       1.027ms             1  
         

### **Critical Bottlenecks Identified**
1. **Adadelta Optimizer Overhead** (39.8% CUDA time)
   - `Optimizer.step` dominates GPU usage
2. **Convolution Backward Pass** (27.3% CUDA time)
   - `aten::convolution_backward` is expensive
3. **Data Transfer (`aten::to`, `aten::copy_`)** (17% CPU time)
   - Slow host-to-device transfers
4. **Multi-Tensor Ops** (`_foreach_mul`, `_foreach_add`)
   - Significant CUDA time (17.8% + 14.4%)

## Step 3: Optimized Model with

* Adam optimizer
* Mixed precision training (`torch.cuda.amp`)
* `pin_memory=True` and `num_workers=4` in DataLoader
* `torch.backends.cudnn.benchmark=True` enabled


In [10]:
import torch.backends.cudnn

# Enable cudnn benchmark for better conv performance
torch.backends.cudnn.benchmark = True

# Updated DataLoader with pin_memory, num_workers (you already have this, repeated for clarity)
train_loader = DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True
)
test_loader = DataLoader(
    test_data,
    batch_size=1000,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True
)

# Switch optimizer to Adam
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Use GradScaler for mixed precision
scaler = torch.cuda.amp.GradScaler()

def train_optimized(model, loader, epochs=2):
    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(loader):
            # Move data to device with non_blocking=True for async H2D copies
            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    return time.time() - start_time

def inference_optimized(model, loader):
    model.eval()
    correct = 0
    start_time = time.time()
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
            with torch.cuda.amp.autocast():
                output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    acc = correct / len(loader.dataset)
    return acc, time.time() - start_time

# Run optimized training & inference
optimized_train_time = train_optimized(model, train_loader)
optimized_acc, optimized_inference_time = inference_optimized(model, test_loader)

train_speedup = baseline_train_time / optimized_train_time
inference_speedup = baseline_inference_time / optimized_inference_time

print(f"Optimized Training Time: {optimized_train_time:.2f}s")
print(f"Optimized Inference Time: {optimized_inference_time:.4f}s | Accuracy: {optimized_acc*100:.2f}%\n")

print(f"Baseline Training Time: {baseline_train_time:.2f}s")
print(f"Baseline Inference Time: {baseline_inference_time:.4f}s | Accuracy: {baseline_acc*100:.2f}%\n")

print(f"Training time improved by: {train_speedup:.2f}x")
print(f"Inference time improved by: {inference_speedup:.2f}x")



  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Optimized Training Time: 4.03s
Optimized Inference Time: 0.1124s | Accuracy: 99.08%

Baseline Training Time: 4.90s
Baseline Inference Time: 0.2392s | Accuracy: 98.89%

Training time improved by: 1.21x
Inference time improved by: 2.13x


  with torch.cuda.amp.autocast():
