In [1]:
import os
# OpenMP / Intel OpenMP runtime settings, exported in the first cell so they
# are already present in the environment when the next cell imports torch.
os.environ.update({
    "OMP_NUM_THREADS": "128",
    "KMP_AFFINITY": "granularity=fine,compact,1,0",
    "KMP_BLOCKTIME": "0",
})
#os.environ['TORCH_USE_CUDA_DSA'] = "1"
#os.environ['TORCHDYNAMO_REPRO_AFTER'] ="aot" 
#os.environ['TORCHDYNAMO_REPRO_LEVEL'] = "4"

In [2]:
import time
import logging
import numpy as np
import torch
import torch.nn as nn
import torch._dynamo
import torch._inductor
from torchvision import datasets
from torchvision import transforms
from torchvision import models
from torch.utils.data.sampler import SubsetRandomSampler
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
#torch.backends.cuda.matmul.allow_tf32 = True
#torch._dynamo.config.cache_size_limit = 2

In [4]:
def build_data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):
    """Build the CIFAR-10 training and test data loaders.

    Args:
        data_dir: Root directory passed to torchvision; the dataset is
            downloaded there when it is not already present.
        batch_size: Number of samples per batch, used for both loaders.
        random_seed: Seed of the generator that drives the training shuffle.
        valid_size: Kept for interface compatibility; currently unused
            (no validation split is created).
        shuffle: Whether the training set is reshuffled every epoch.
        test: Kept for interface compatibility; currently unused.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    # Only tensor conversion is applied. There is no Resize/Crop step, so the
    # images keep the dataset's native resolution (setting ad-hoc attributes
    # on the Compose object would not change that).
    transform = transforms.Compose([transforms.ToTensor()])

    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)

    # A dedicated, seeded generator makes the shuffled training order reproducible.
    shuffle_generator = torch.Generator()
    shuffle_generator.manual_seed(random_seed)

    # Shuffling belongs to the training loader; evaluation order is irrelevant
    # for accuracy, so the test loader iterates sequentially.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=shuffle, generator=shuffle_generator
    )
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return (train_loader, test_loader)

In [5]:
def train(model, data_loader, num_epochs, criterion, optimizer, device):
    """Run a supervised training loop and print one summary line per epoch.

    Args:
        model: Module invoked as ``model(images)``. It is not moved by this
            function, so it must already be on ``device``.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        num_epochs: Number of passes over ``data_loader``.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.

    The loss printed for an epoch is the loss of its last batch, or NaN when
    ``data_loader`` yields no batches.
    """
    for epoch in range(num_epochs):
        start = time.time()
        loss = None
        # Iterate the loader that was passed in, not a notebook global.
        for images, labels in data_loader:
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Read the loss before stopping the clock: .item() copies the scalar
        # to the host, which waits for work still queued on an asynchronous
        # device, so the reported epoch time includes it.
        last_loss = loss.item() if loss is not None else float("nan")
        end = time.time()

        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, last_loss, int(end-start)))
            

In [6]:
def test(model, test_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))  

In [7]:
class CNN(nn.Module):
    """Small convolutional classifier.

    Two convolution/ReLU/max-pool stages are followed by two linear layers.
    ``fc1`` expects 4096 flattened features per sample, and no activation is
    applied between ``fc1`` and ``fc2``.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Sub-modules are created in forward order: layer1, layer2, fc1, fc2.
        self.layer1 = self._conv_stage(3, 32)
        self.layer2 = self._conv_stage(32, 64)
        self.fc1 = nn.Linear(4096, 512)
        self.fc2 = nn.Linear(512, num_classes)

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return a 3x3 convolution (padding 1) followed by ReLU and 2x2 max-pooling."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Map a batch of images to ``num_classes`` raw scores per sample."""
        features = self.layer2(self.layer1(x))
        # Collapse everything except the batch dimension before the linear layers.
        flat = features.reshape(features.size(0), -1)
        return self.fc2(self.fc1(flat))

In [8]:
import torch
import warnings

# Only CUDA devices reporting compute capability 7.0, 8.0 or 9.0 count as "ok";
# anything else, including no CUDA device at all, triggers the warning below.
if torch.cuda.is_available():
    device_cap = torch.cuda.get_device_capability()
    gpu_ok = device_cap in ((7, 0), (8, 0), (9, 0))
else:
    gpu_ok = False

if not gpu_ok:
    warnings.warn("GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower than expected.")

In [9]:
# General parameters
data_dir = '/tmp'
# Fall back to the CPU so the notebook still runs on hosts without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters
lr = 0.00001
weight_decay = 0.005
batch_size = 64
num_epochs = 50
criterion = nn.CrossEntropyLoss()
# Keep the optimizer class under its own name so `optimizer` only ever refers
# to the constructed instance.
optimizer_cls = torch.optim.Adam

# CIFAR-10 dataset (build_data_loader reads datasets.CIFAR10)
train_loader, test_loader = build_data_loader(data_dir=data_dir, batch_size=batch_size)

# Models
torch._dynamo.reset()
weights = models.DenseNet121_Weights.DEFAULT
net = models.densenet121(weights=weights)
net.train()
# Module.to() moves the module in place and returns it, so eager_model is the
# same object as net and compiled_model wraps that same module: both variants
# share one set of parameters.
eager_model = net.to(device)
compiled_model = torch.compile(eager_model, backend="inductor", mode="reduce-overhead")

# Model selection
model = eager_model

# Optimizer
optimizer = optimizer_cls(model.parameters(), lr, weight_decay=weight_decay)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
#torch._inductor.config.fallback_random = True
#torch._inductor.config.triton.cuda_graphs = False
#torch._inductor.config.debug = True
# Wall-clock time of the complete training run, followed by the accuracy report.
start = time.time()
train(model, train_loader, num_epochs, criterion, optimizer, device)
end = time.time()
print(f'Training time: {int(end - start)} seconds')
test(model, test_loader, device)

Epoch [1/50], Loss: 2.2007, time: 47 seconds
Epoch [2/50], Loss: 1.5061, time: 44 seconds
Epoch [3/50], Loss: 0.8362, time: 44 seconds
Epoch [4/50], Loss: 0.4706, time: 44 seconds
Epoch [5/50], Loss: 0.2696, time: 45 seconds
Epoch [6/50], Loss: 0.1660, time: 45 seconds
Epoch [7/50], Loss: 0.1110, time: 45 seconds
Epoch [8/50], Loss: 0.0678, time: 45 seconds
Epoch [9/50], Loss: 0.0482, time: 45 seconds
Epoch [10/50], Loss: 0.0284, time: 45 seconds
Epoch [11/50], Loss: 0.0272, time: 45 seconds
Epoch [12/50], Loss: 0.0221, time: 45 seconds
Epoch [13/50], Loss: 0.0862, time: 45 seconds
Epoch [14/50], Loss: 0.0099, time: 45 seconds
Epoch [15/50], Loss: 0.0125, time: 45 seconds
Epoch [16/50], Loss: 0.0051, time: 45 seconds
Epoch [17/50], Loss: 0.0033, time: 45 seconds
Epoch [18/50], Loss: 0.0086, time: 45 seconds
Epoch [19/50], Loss: 0.0027, time: 45 seconds
Epoch [20/50], Loss: 0.0023, time: 45 seconds
Epoch [21/50], Loss: 0.0037, time: 45 seconds
Epoch [22/50], Loss: 0.0031, time: 45 secon

In [None]:
#schedule = torch.profiler.schedule(wait=1, warmup=3, active=5, repeat=5)
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

# One batch fetched up front; it is not used by the rest of this cell.
input_sample, _ = next(iter(train_loader))

# The context manager starts the profiler on entry and stops it on exit, so
# collection is shut down even when training raises. Without a schedule every
# step of all `num_epochs` epochs is recorded.
with profile(activities=activities) as prof:
    train(model, train_loader, num_epochs, criterion, optimizer, device)

#prof.export_chrome_trace("./cnn_trace.json")
print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=10))

STAGE:2023-06-11 18:41:18 1616255:1616255 ActivityProfilerController.cpp:312] Completed Stage: Warm Up


Epoch [1/50], Loss: 0.0016, time: 83 seconds
Epoch [2/50], Loss: 0.0200, time: 83 seconds
Epoch [3/50], Loss: 0.0037, time: 83 seconds
Epoch [4/50], Loss: 0.0029, time: 83 seconds
Epoch [5/50], Loss: 0.0023, time: 83 seconds
Epoch [6/50], Loss: 0.0020, time: 83 seconds
Epoch [7/50], Loss: 0.0020, time: 83 seconds
Epoch [8/50], Loss: 0.0024, time: 83 seconds
Epoch [9/50], Loss: 0.0027, time: 83 seconds
Epoch [10/50], Loss: 0.0033, time: 83 seconds
Epoch [11/50], Loss: 0.2236, time: 83 seconds
Epoch [12/50], Loss: 0.0928, time: 83 seconds
Epoch [13/50], Loss: 0.1257, time: 83 seconds
Epoch [14/50], Loss: 0.1975, time: 83 seconds
Epoch [15/50], Loss: 0.0043, time: 83 seconds
Epoch [16/50], Loss: 0.0040, time: 83 seconds
Epoch [17/50], Loss: 0.0041, time: 83 seconds
Epoch [18/50], Loss: 0.0045, time: 83 seconds
Epoch [19/50], Loss: 0.0053, time: 83 seconds
Epoch [20/50], Loss: 0.0164, time: 83 seconds
Epoch [21/50], Loss: 0.0086, time: 83 seconds
Epoch [22/50], Loss: 0.0076, time: 83 secon

STAGE:2023-06-11 20:04:30 1616255:1616255 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-06-11 20:12:02 1616255:1616255 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
