# GPU Performance (Module 2 — Deep Learning & NLP)

Goal: benchmark CPU vs accelerator (CUDA/MPS) for a few common ops used in DL.


## What this notebook checks
- Python & OS info
- PyTorch installation and version
- GPU/accelerator availability (CUDA / MPS)
- A tiny forward pass to verify basic functionality
- Reproducibility seed
- CPU vs. accelerator timings for matrix multiplication, Conv2D, and a small training step


In [1]:
# If you see "No module named torch", install requirements first:
# pip install -r requirements.txt
import os, sys, platform, time
import torch
# Report the interpreter, OS, and PyTorch versions so results can be tied to an environment.
for label, value in (
    ('Python:', sys.version.split()[0]),
    ('Platform:', platform.platform()),
    ('PyTorch:', torch.__version__),
):
    print(label, value)


Python: 3.13.9
Platform: Windows-11-10.0.26200-SP0
PyTorch: 2.9.1+cpu


In [2]:
def get_device():
    """Return the preferred torch device.

    Preference order: CUDA, then MPS (Apple Silicon), then CPU.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        kind = 'cuda'
    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        # The hasattr guard covers PyTorch builds without the MPS backend module.
        kind = 'mps'
    else:
        kind = 'cpu'
    return torch.device(kind)

# Resolve the device once; the benchmark helpers in later cells read this global.
device = get_device()
print('Selected device:', device)
if device.type == 'cuda':
    # Extra diagnostics that only apply to the CUDA backend (device index 0).
    gpu_index = 0
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print('CUDA device:', gpu_name)
    gpu_capability = torch.cuda.get_device_capability(gpu_index)
    print('CUDA capability:', gpu_capability)
    cuda_runtime = torch.version.cuda
    print('CUDA version (runtime):', cuda_runtime)
    cudnn_on = torch.backends.cudnn.enabled
    print('cuDNN enabled:', cudnn_on)


Selected device: cpu


In [3]:
def seed_everything(seed: int = 42):
    """Seed Python's ``random``, NumPy, and PyTorch RNGs with the same value.

    Args:
        seed: Seed applied to every generator.
    """
    import random
    import numpy as np

    # Same order as before: stdlib, NumPy, then torch.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    # Also seed every CUDA device explicitly when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed all RNGs once up front so the random tensors created below are repeatable across runs.
seed_everything(42)
print('Seed set ✅')


Seed set ✅


## Benchmark helpers

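Accelerator kernels are launched asynchronously, so the timer must wait for all queued work to finish before reading the clock. We use `torch.cuda.synchronize()` when on CUDA to get accurate timings.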


In [4]:
import torch
from time import perf_counter

def sync():
    """Block until all work queued on the selected accelerator has finished.

    Reads the notebook-level ``device``. CUDA and MPS both execute kernels
    asynchronously, so without this wait a timer would measure only the
    kernel launches. On CPU this is a no-op.
    """
    if device.type == 'cuda':
        torch.cuda.synchronize()
    elif device.type == 'mps':
        # torch.mps.synchronize() is missing from older PyTorch builds that
        # already had the MPS backend, so look it up defensively.
        mps_module = getattr(torch, 'mps', None)
        mps_synchronize = getattr(mps_module, 'synchronize', None)
        if mps_synchronize is not None:
            mps_synchronize()

def timeit(fn, warmup=10, iters=50):
    """Return the average wall-clock seconds per call of ``fn``.

    Args:
        fn: Zero-argument callable to measure.
        warmup: Number of untimed calls made before measuring.
        iters: Number of timed calls; the total is divided by this value.

    Returns:
        float: Mean seconds per call over the timed calls.
    """
    def _run_batch(count):
        # Call fn `count` times, then wait for any queued device work.
        for _ in range(count):
            fn()
        sync()

    _run_batch(warmup)  # untimed warm-up
    started = perf_counter()
    _run_batch(iters)
    elapsed = perf_counter() - started
    return elapsed / iters

# Confirm that the benchmark helper cell ran.
print('Ready ✅')


Ready ✅


## 1) Matrix multiplication (GEMM)

Matrix multiplication is a core operation behind many layers (for example, `nn.Linear`).


In [5]:
import torch
import math

def bench_matmul(n=2048, dtype=torch.float32):
    """Time an ``n`` x ``n`` matrix product on CPU and on the selected device.

    Uses the notebook-level ``device`` and ``timeit``.

    Args:
        n: Side length of the two square random matrices.
        dtype: Element type of the matrices.

    Returns:
        tuple: ``(cpu_seconds, device_seconds)``; ``device_seconds`` is
        ``None`` when the selected device is neither CUDA nor MPS.
    """
    lhs_host = torch.randn(n, n, dtype=dtype)
    rhs_host = torch.randn(n, n, dtype=dtype)

    host_seconds = timeit(
        lambda: torch.matmul(lhs_host, rhs_host), warmup=3, iters=10
    )

    # No accelerator selected: report the CPU number only.
    if device.type not in ('cuda', 'mps'):
        return host_seconds, None

    # Copy the same operands to the accelerator so both runs multiply identical data.
    lhs_dev = lhs_host.to(device)
    rhs_dev = rhs_host.to(device)
    dev_seconds = timeit(
        lambda: torch.matmul(lhs_dev, rhs_dev), warmup=10, iters=50
    )
    return host_seconds, dev_seconds

# Run the matmul benchmark on 1024x1024 matrices and report per-call averages in milliseconds.
cpu_s, dev_s = bench_matmul(n=1024)
print(f'CPU matmul avg: {cpu_s*1000:.2f} ms')
# dev_s is None when no CUDA/MPS device was selected, so there is nothing to compare against.
if dev_s is not None:
    print(f'{device.type.upper()} matmul avg: {dev_s*1000:.2f} ms')
    print(f'Speedup: {cpu_s/dev_s:.2f}x')


CPU matmul avg: 13.42 ms


## 2) Convolution (Conv2D)

Common in CNNs and also used in some NLP/CV hybrids.


In [6]:
import torch
import torch.nn as nn

def bench_conv(batch=32, channels=3, h=224, w=224):
    """Time a Conv2d forward pass on CPU and on the selected device.

    The layer maps ``channels`` input channels to 32 output channels with a
    3x3 kernel and padding 1; the input is a random tensor of shape
    ``(batch, channels, h, w)``. Uses the notebook-level ``device`` and
    ``timeit``.

    Returns:
        tuple: ``(cpu_seconds, device_seconds)``; ``device_seconds`` is
        ``None`` when the selected device is neither CUDA nor MPS.
    """
    layer = nn.Conv2d(channels, 32, kernel_size=3, padding=1)
    images = torch.randn(batch, channels, h, w)

    host_seconds = timeit(lambda: layer(images), warmup=3, iters=10)

    # No accelerator selected: report the CPU number only.
    if device.type not in ('cuda', 'mps'):
        return host_seconds, None

    # The CPU timing is already taken, so the same layer can be moved to the device.
    layer_dev = layer.to(device)
    images_dev = images.to(device)
    dev_seconds = timeit(lambda: layer_dev(images_dev), warmup=10, iters=50)
    return host_seconds, dev_seconds

# Run the convolution benchmark on a batch of 8 images of 128x128 and report per-call averages in milliseconds.
cpu_s, dev_s = bench_conv(batch=8, h=128, w=128)
print(f'CPU conv avg: {cpu_s*1000:.2f} ms')
# dev_s is None when no CUDA/MPS device was selected, so there is nothing to compare against.
if dev_s is not None:
    print(f'{device.type.upper()} conv avg: {dev_s*1000:.2f} ms')
    print(f'Speedup: {cpu_s/dev_s:.2f}x')


CPU conv avg: 6.26 ms


## 3) Small training step

A minimal training step (forward + backward + optimizer update) to reflect real training.


In [7]:
import torch
import torch.nn as nn

def train_step(device_to_use):
    """Build a small MLP training setup and return a callable that runs one step.

    The model is Linear(512, 1024) -> ReLU -> Linear(1024, 10), trained with
    Adam (lr=1e-3) and cross-entropy loss on one fixed random batch of 256
    samples. Model and batch are created on ``device_to_use``.

    Args:
        device_to_use: Device that holds the model and the batch.

    Returns:
        Zero-argument callable; each call runs zero_grad, forward, loss,
        backward, and an optimizer update on the same batch.
    """
    layers = [nn.Linear(512, 1024), nn.ReLU(), nn.Linear(1024, 10)]
    net = nn.Sequential(*layers).to(device_to_use)
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    # One fixed batch, reused by every step so only compute is measured.
    inputs = torch.randn(256, 512, device=device_to_use)
    targets = torch.randint(0, 10, (256,), device=device_to_use)

    def step():
        optimizer.zero_grad(set_to_none=True)
        logits = net(inputs)
        loss_fn(logits, targets).backward()
        optimizer.step()

    return step

# Always time the training step on CPU as the baseline.
cpu_step = train_step(torch.device('cpu'))
cpu_s = timeit(cpu_step, warmup=2, iters=5)
print(f'CPU train-step avg: {cpu_s*1000:.2f} ms')

# Time it again on the accelerator only when one was selected (CUDA or MPS).
if device.type in ('cuda','mps'):
    dev_step = train_step(device)
    dev_s = timeit(dev_step, warmup=5, iters=20)
    print(f'{device.type.upper()} train-step avg: {dev_s*1000:.2f} ms')
    print(f'Speedup: {cpu_s/dev_s:.2f}x')


CPU train-step avg: 29.99 ms


## Interpretation tips
- If speedup is small, that can be normal for tiny models/batches.
- Bigger batches and heavier models usually benefit more from GPU.
- For stable results, close other heavy apps and run the notebook twice.
