In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "16"
os.environ["MKL_NUM_THREADS"] = "16"
#os.environ["OMP_DISPLAY_ENV"] = "TRUE"
os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
os.environ["KMP_BLOCKTIME"] = "1"

In [2]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torchvision import models
from torch.utils.data.sampler import SubsetRandomSampler

print(torch.__version__)

2.0.0+cpu


In [11]:
from torch.profiler import profile, record_function, ProfilerActivity

In [12]:
def build_data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):

    transform = transforms.Compose([transforms.ToTensor()])
    
    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    valid_dataset = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)
  
    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, sampler=valid_sampler)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)

    return (train_loader, valid_loader, test_loader)

In [13]:
def train(model, data_loader, valid_loader, num_epochs, criterion, optimizer, device):
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):  
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)
        
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
        
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        validate(model, valid_loader, device)
        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
            

In [14]:
def validate(model, valid_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
    print('Accuracy of the network on the {} validation images: {:.2f} %'.format(5000, 100 * correct / total)) 

In [15]:
def test(model, test_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))  

In [16]:
def count_parameters(model):
    parameters = list(model.parameters())
    total_parms = sum([np.prod(p.size()) for p in parameters if p.requires_grad])
    return total_parms


In [17]:
# General parameters
data_dir = '/tmp'
device = 'cpu'
num_classes = 10

# Hyperparameters
max_lr = 0.00001
weight_decay = 0.005
batch_size = 64
learning_rate = 0.0001
num_epochs = 1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

# CIFAR10 dataset 
train_loader, valid_loader, test_loader = build_data_loader(data_dir=data_dir, batch_size=batch_size)

# Model definition
#model = models.vgg11().to(device)
model = models.vgg16().to(device)

# Optimizer
optimizer = optimizer(model.parameters(), max_lr, weight_decay=weight_decay)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [18]:
print(count_parameters(model))

138357544


In [20]:
%%time
# Train the model
with profile(activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof:
    with record_function("training"):
        train(model, train_loader, valid_loader, num_epochs, criterion, optimizer, device)
        
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
prof.export_chrome_trace("trace.json")

STAGE:2023-04-05 16:27:15 995074:995074 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
[W CPUAllocator.cpp:235] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


Accuracy of the network on the 5000 validation images: 22.60 %
Epoch [1/1], Loss: 1.7761


STAGE:2023-04-05 16:30:39 995074:995074 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-04-05 16:30:39 995074:995074 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               training         0.50%        1.008s       100.00%      200.741s      200.741s     527.79 Mb      -8.45 Gb             1  
                               Optimizer.step#Adam.step         3.88%        7.783s        37.15%       74.573s     105.928ms           0 b   -1088.36 Gb           704  
autograd::engine::evaluate_function: ConvolutionBack...         0.45%     909.688ms        25.84%       51.881s       5.669ms      17.65 Gb     -51.50

In [None]:
test(model, test_loader, device)