In [1]:
import os
os.environ['OMP_NUM_THREADS'] = "16"
os.environ['OMP_DISPLAY_ENV'] = "True"
os.environ['KMP_AFFINITY'] = "granularity=fine,compact,1,0"
#os.environ['KMP_BLOCKTIME'] = "0"
#os.environ['MALLOC_CONF'] = "background_thread:false,metadata_thp:no,dirty_decay_ms:2000,muzzy_decay_ms:2000"

In [2]:
import time
import logging
import numpy as np
import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex
from torchvision import datasets
from torchvision import transforms
from torchvision import models
from torch.utils.data.sampler import SubsetRandomSampler
from torch.profiler import profile, record_function, ProfilerActivity


OPENMP DISPLAY ENVIRONMENT BEGIN
  _OPENMP = '201511'
  OMP_DYNAMIC = 'FALSE'
  OMP_NESTED = 'FALSE'
  OMP_NUM_THREADS = '16'
  OMP_SCHEDULE = 'DYNAMIC'
  OMP_PROC_BIND = 'FALSE'
  OMP_PLACES = ''
  OMP_STACKSIZE = '39006274'
  OMP_WAIT_POLICY = 'PASSIVE'
  OMP_THREAD_LIMIT = '4294967295'
  OMP_MAX_ACTIVE_LEVELS = '2147483647'
  OMP_CANCELLATION = 'FALSE'
  OMP_DEFAULT_DEVICE = '0'
  OMP_MAX_TASK_PRIORITY = '0'
OPENMP DISPLAY ENVIRONMENT END

OPENMP DISPLAY ENVIRONMENT BEGIN
   _OPENMP='201611'
  [host] OMP_AFFINITY_FORMAT='OMP: pid %P tid %i thread %n bound to OS proc set {%A}'
  [host] OMP_ALLOCATOR='omp_default_mem_alloc'
  [host] OMP_CANCELLATION='FALSE'
  [host] OMP_DEBUG='disabled'
  [host] OMP_DEFAULT_DEVICE='0'
  [host] OMP_DISPLAY_AFFINITY='FALSE'
  [host] OMP_DISPLAY_ENV='TRUE'
  [host] OMP_DYNAMIC='FALSE'
  [host] OMP_MAX_ACTIVE_LEVELS='1'
  [host] OMP_MAX_TASK_PRIORITY='0'
  [host] OMP_NESTED: deprecated; max-active-levels-var=1
  [host] OMP_NUM_TEAMS='0'
  [host] OMP_NUM_

In [3]:
def build_data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):
    transform = transforms.Compose([transforms.ToTensor()])
    transform.crop_size=224
    transform.resize_size=224
    
    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)

    return (train_loader, test_loader)

In [4]:
def train(model, data_loader, num_epochs, criterion, optimizer, device):
    total_steps = len(train_loader)
    for epoch in range(num_epochs):
        start = time.time()
        for step, (images, labels) in enumerate(train_loader):  
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)
        
            # Forward pass
            outputs = model(images)
            outputs.to(device)
            loss = criterion(outputs, labels)
        
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print ('Step [{}/{}], Loss: {:.4f}'.format(step+1, total_steps, loss.item()))
        end = time.time()
               
        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, loss.item(), int(end-start)))
            

In [5]:
def test(model, test_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))  

In [6]:
# General parameters
data_dir = '/tmp'
device = "cpu"

# Hyperparameters
lr = 0.00001
weight_decay = 0.005
batch_size = 64
num_epochs = 1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

# FashionMNIST dataset 
train_loader, test_loader = build_data_loader(data_dir=data_dir, batch_size=batch_size)

# Models
weights = models.DenseNet121_Weights.DEFAULT
net = models.densenet121(weights=weights)
net.train()
model = net.to(device)

# Optimizer
optimizer = optimizer(model.parameters(), lr, weight_decay=weight_decay)

#model, optimizer = ipex.optimize(model, optimizer=optimizer)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
start = time.time()
train(model, train_loader, num_epochs, criterion, optimizer, device)
end = time.time()
print('Training time: {} seconds'.format(int(end-start)))
test(model, test_loader, device)

Epoch [1/1], Loss: 2.2360, time: 108 seconds
Training time: 108 seconds
Accuracy of the network on the 10000 test images: 42.55 %


In [8]:
#schedule = torch.profiler.schedule(wait=1, warmup=3, active=5, repeat=5)
#activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
activities = [ProfilerActivity.CPU]
prof = profile(activities=activities)
        
input_sample, _ = next(iter(train_loader))

prof.start()
train(model, train_loader, num_epochs, criterion, optimizer, device)
prof.stop()

#prof.export_chrome_trace("./cnn_trace.json")
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=20))

STAGE:2023-06-28 15:52:29 1408624:1408624 ActivityProfilerController.cpp:311] Completed Stage: Warm Up


Epoch [1/1], Loss: 2.0365, time: 132 seconds


STAGE:2023-06-28 15:54:54 1408624:1408624 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-06-28 15:54:56 1408624:1408624 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             aten::convolution_backward        27.01%       33.697s        27.62%       34.454s     367.156us         93840  
                               aten::mkldnn_convolution        12.44%       15.517s        12.75%       15.899s     169.427us         93840  
                       aten::native_batch_norm_backward         8.85%       11.041s         9.43%       11.758s     124.263us         94622  
                                aten::native_batch_norm         7.06%        8.803s         7.45%        9.298s      98.262us         94622  
      