# Step 1: Baseline Model (AlexNet adapted for MNIST)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
from torch.profiler import profile, record_function, ProfilerActivity
import matplotlib.pyplot as plt

# Set seed and device
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Data loading
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize MNIST to match AlexNet input size
    transforms.ToTensor()
])

train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
test_data = datasets.MNIST(root="data", train=False, download=True, transform=transform)

# DataLoader (baseline)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000)

# Define model: Modified AlexNet for grayscale MNIST images
class AlexNetMNIST(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 10),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

model = AlexNetMNIST().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

# Timing functions
def train(model, loader, epochs=5):
    model.train()
    start_time = time.time()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    return time.time() - start_time

def inference(model, loader):
    model.eval()
    correct = 0
    start_time = time.time()
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    acc = correct / len(loader.dataset)
    return acc, time.time() - start_time

# Baseline Training/Inference
baseline_train_time = train(model, train_loader)
baseline_acc, baseline_inference_time = inference(model, test_loader)


Using device: cuda


# Baseline Training/Inference


In [3]:
print(f"Baseline Training Time: {baseline_train_time:.2f}s")
print(f"Baseline Inference Time: {baseline_inference_time:.4f}s | Accuracy: {baseline_acc*100:.2f}%")

Baseline Training Time: 185.06s
Baseline Inference Time: 8.4124s | Accuracy: 98.97%


# Step 2: Profiling the Baseline


In [9]:
from torch.profiler import tensorboard_trace_handler
torch.backends.cudnn.benchmark = False

def profile_model():
    model.train()
    with profile(
        activities=[ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
        on_trace_ready=tensorboard_trace_handler("./profiler_output")
    ) as prof:
        for step, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            with record_function("train_step"):
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
            prof.step()
            
print("\nProfiling Baseline Model...")
profile_model()


Profiling Baseline Model...




# Step 3: Analyze the Trace Using HTA - HolisticTraceAnalysis Model


In [15]:
# !pip install HolisticTraceAnalysis

In [2]:
from hta.trace_analysis import TraceAnalysis
analyzer = TraceAnalysis(trace_dir = "./profiler_output")
kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown(num_kernels=5, visualize=False)

If the trace file does not have the rank specified in it, then add the following snippet key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple traces files, then each file should have a unique rank value.For now we will default to rank = 0.
If the trace file does not have the rank specified in it, then add the following snippet key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple traces files, then each file should have a unique rank value.For now we will default to rank = 0.
If the trace file does not have the rank specified in it, then add the following snippet key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multiple traces files, then each file should have a unique rank value.For now we will default to rank = 0.
If the trace file does not have the rank specified in it, then add the following snippet key to the json files to use HTA; "distributedInfo": {"rank": 0}. If there are multipl

# Kery observations

void cutlass::Kernel2<cutlass_80_simt_sgemm_12...	11057.0 µs - This is a GEMM (General Matrix Multiply) kernel. Likely comes from a linear layer or convolution in your AlexNet model. High total time suggests it's a performance bottleneck.

Memcpy HtoD (Pageable -> Device)	2500.0 µs - Involves copying data from CPU to GPU. Relatively small compared to compute ops, but still worth monitoring.

multi_tensor_apply(...), multi_tensor_axpby(...) - These are internal PyTorch functions used in optimizers like Adam.
They're involved in parameter updates and gradient operations.
The high cumulative time here suggests that optimizer steps are contributing significantly to overall runtime.

In [3]:
kernel_metrics_df

Unnamed: 0,name,sum (us),max (us),min (us),stddev,mean (us),kernel_type,rank
0,others,49483.0,4912.0,0.0,1442.389059,1124.613636,COMPUTATION,0
1,void at::native::(anonymous namespace)::multi_...,4947.0,4947.0,4947.0,0.0,4947.0,COMPUTATION,0
2,void at::native::(anonymous namespace)::multi_...,4989.0,4989.0,4989.0,0.0,4989.0,COMPUTATION,0
3,void at::native::(anonymous namespace)::multi_...,7594.0,7594.0,7594.0,0.0,7594.0,COMPUTATION,0
4,void cutlass::Kernel2<cutlass_80_simt_sgemm_12...,6345.0,6345.0,6345.0,0.0,6345.0,COMPUTATION,0
5,void cutlass::Kernel2<cutlass_80_simt_sgemm_12...,11057.0,11057.0,11057.0,0.0,11057.0,COMPUTATION,0
6,Memcpy HtoD (Pageable -> Device),2500.0,894.0,-1.0,461.199595,416.666667,MEMORY,0
7,Memset (Device),-1.0,3.0,-1.0,0.961563,-0.035714,MEMORY,0


In [4]:
kernel_type_metrics_df

Unnamed: 0,kernel_type,sum,percentage
0,COMPUTATION,84415,97.1
1,MEMORY,2511,2.9


This is the GPU kernel breakdown from our trace and it shows how much GPU time was spent on different types of operations.

As you see **COMPUTATION (97.1%)** - Refers to compute-bound kernels , such as:

* Matrix multiplications (GEMM)

* Convolution operations (Conv2D)

* Activation functions like ReLU, Softmax, etc.

These are arithmetic-heavy operations that utilize CUDA cores heavily.

**MEMORY (2.9%)** - Refers to memory-related operations , such as:

* Memory copies (HtoD, DtoH, or DtoD)

* Memory set operations

These operations are typically not compute-intensive but involve moving data around in memory.

**Our model is heavily compute-bound — over 97% of the GPU time is spent doing actual computation. We should Look into optimizing compute-heavy ops (e.g., mixed precision training, kernel fusion)**


To explore more about the features HTA provides refer to: https://hta.readthedocs.io/en/latest/source/intro/using_hta.html