In [1]:
import warnings
warnings.filterwarnings("ignore") 

In [2]:
import os
os.environ['OMP_NUM_THREADS'] = "128"
os.environ['KMP_AFFINITY'] = "granularity=fine,compact,1,0"
os.environ['KMP_BLOCKTIME'] = "0"
#os.environ['TORCH_USE_CUDA_DSA'] = "1"
#os.environ['TORCHDYNAMO_REPRO_AFTER'] ="aot" 
#os.environ['TORCHDYNAMO_REPRO_LEVEL'] = "4"

In [3]:
import time
import logging
import numpy as np
import torch
import torch.nn as nn
import torch._dynamo
import torch._inductor
from torchvision import datasets
from torchvision import transforms
from torchvision import models
from torch.utils.data.sampler import SubsetRandomSampler
from torch.profiler import profile, record_function, ProfilerActivity

In [4]:
#torch.backends.cuda.matmul.allow_tf32 = True
#torch._dynamo.config.cache_size_limit = 2

In [5]:
def build_data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):

    transform = transforms.Compose([transforms.ToTensor()])
    transform.crop_size=224
    transform.resize_size=224
    
    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)

    return (train_loader, test_loader)

In [6]:
def train(model, data_loader, num_epochs, criterion, optimizer, device):
    total_steps = len(train_loader)
    epoch_time = []
    for epoch in range(num_epochs):
        start = time.time()
        for step, (images, labels) in enumerate(train_loader):  
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)
        
            # Forward pass
            outputs = model(images)
            outputs.to(device)
            loss = criterion(outputs, labels)
        
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print ('Step [{}/{}], Loss: {:.4f}'.format(step+1, total_steps, loss.item()))
        end = time.time()
        elapsed_time = int(end - start)
        epoch_time.append(elapsed_time)
               
        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, loss.item(), elapsed_time))
    return epoch_time            

In [7]:
def test(model, test_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    accuracy = 100 * (correct / total)
    print('Accuracy of the network on the {} test images: {} %'.format(10000, accuracy))
    return accuracy

In [8]:
import torch
import warnings

gpu_ok = False
if torch.cuda.is_available():
    device_cap = torch.cuda.get_device_capability()
    if device_cap in ((7, 0), (8, 0), (9, 0)):
        gpu_ok = True

if not gpu_ok:
    warnings.warn("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")

In [9]:
# General parameters
data_dir = '/tmp'
device = "cuda"

# Hyperparameters
lr = 0.00001
weight_decay = 0.005
batch_size = 64
num_epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

# FashionMNIST dataset 
train_loader, test_loader = build_data_loader(data_dir=data_dir, batch_size=batch_size)

# Model
weights = models.DenseNet121_Weights.DEFAULT
net = models.densenet121(weights=weights)
#net = net.train()
net = net.to(device)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
#backends = torch._dynamo.list_backends()
backends = ['aot_ts_nvfuser', 'cudagraphs', 'inductor']
results = []
for backend in backends:
    #del compiled_model
    torch._dynamo.reset()
    print('Compiling model with backend {}'.format(backend))
    mode = None
    if backend == "inductor":
        mode = "reduce-overhead"
    model = torch.compile(net, backend=backend, mode=mode)
    opt = optimizer(model.parameters(), lr, weight_decay=weight_decay)
    start = time.time()
    epoch_time = train(model, train_loader, num_epochs, criterion, opt, device)
    end = time.time()
    training_time = int(end-start)
    print('Training time of {}: {} seconds'.format(backend, training_time))
    accuracy = test(model, test_loader, device)
    result = [backend, training_time, epoch_time, accuracy]
    results.append(result)

Compiling model with backend aot_ts_nvfuser
Epoch [1/50], Loss: 2.2836, time: 142 seconds
Epoch [2/50], Loss: 1.7009, time: 108 seconds
Epoch [3/50], Loss: 0.9180, time: 46 seconds
Epoch [4/50], Loss: 0.4004, time: 46 seconds
Epoch [5/50], Loss: 0.2509, time: 46 seconds
Epoch [6/50], Loss: 0.1606, time: 46 seconds
Epoch [7/50], Loss: 0.0944, time: 46 seconds
Epoch [8/50], Loss: 0.0646, time: 45 seconds
Epoch [9/50], Loss: 0.0440, time: 46 seconds
Epoch [10/50], Loss: 0.0290, time: 46 seconds
Epoch [11/50], Loss: 0.0202, time: 46 seconds
Epoch [12/50], Loss: 0.0129, time: 46 seconds
Epoch [13/50], Loss: 0.0120, time: 46 seconds
Epoch [14/50], Loss: 0.0135, time: 46 seconds
Epoch [15/50], Loss: 0.0062, time: 46 seconds
Epoch [16/50], Loss: 0.0040, time: 46 seconds
Epoch [17/50], Loss: 0.0035, time: 46 seconds
Epoch [18/50], Loss: 0.0044, time: 46 seconds
Epoch [19/50], Loss: 0.0069, time: 46 seconds
Epoch [20/50], Loss: 0.0038, time: 46 seconds
Epoch [21/50], Loss: 0.0040, time: 46 secon

skipping cudagraphs due to input mutation
skipping cudagraphs due to input mutation


Accuracy of the network on the 10000 test images: 79.9 %


In [11]:
results

[['aot_ts_nvfuser',
  2474,
  [142,
   108,
   46,
   46,
   46,
   46,
   46,
   45,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46],
  74.68],
 ['cudagraphs',
  2290,
  [86,
   45,
   44,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   44,
   44,
   44,
   44,
   44,
   44,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   44,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   45,
   44,
   44,
   44,
   44,
   45,
   45,
   45,
   44,
   44,
   44,
   44,
   44,
   44,
   44],
  77.57],
 ['inductor',
  1407,
  [140,
   28,
   26,
   26,
   25,
   25,
   26,
   25,
   26,
   25,
   26,
   25,
   26,
   25,
   26,
   25,
   26,
   25,
   25,
   26,
   26,
   25,
   25,
   26,
   26,
   25,
   25,
   26,
   25

In [12]:
torch._dynamo.list_backends()

['aot_ts_nvfuser',
 'cudagraphs',
 'inductor',
 'ipex',
 'nvprims_nvfuser',
 'onnxrt',
 'tvm']