In [1]:
import os
# Cap the OpenMP and MKL thread pools at 32 threads. This cell runs before
# numpy / torch are imported in the next cell.
os.environ.update({'OMP_NUM_THREADS': "32", 'MKL_NUM_THREADS': "32"})

In [2]:
import time
import logging
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torchvision import models
from torch.utils.data.sampler import SubsetRandomSampler
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
# CPU thread pools: 32 intra-op threads and a single inter-op thread.
torch.set_num_threads(32)
torch.set_num_interop_threads(1)

# cuDNN benchmark mode on (the "benchmark=True" configuration in the notes
# at the end of the notebook).
torch.backends.cudnn.benchmark = True

In [4]:
# Permit reduced-precision reductions inside fp16 / bf16 CUDA matmuls.
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True

# Permit TF32 for cuDNN and for CUDA matmuls.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [5]:
def build_data_loader(data_dir, batch_size, random_seed=42, valid_size=0.1, shuffle=True, test=False):
    """Build the CIFAR-10 train and test data loaders.

    Args:
        data_dir: Root directory of an already-downloaded CIFAR-10 copy
            (``download=False`` is used, so nothing is fetched).
        batch_size: Batch size used by both loaders.
        random_seed: Seed of the generator that drives the training shuffle.
        valid_size: Currently unused; kept so existing callers keep working.
        shuffle: Whether the *training* loader reshuffles every epoch.
        test: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    # Images are only converted to tensors; no resize or crop is applied.
    # (Setting ``crop_size`` / ``resize_size`` attributes on the Compose
    # object, as was done before, does not add any transform to the list.)
    transform = transforms.Compose([transforms.ToTensor()])

    train_dataset = datasets.CIFAR10(root=data_dir, train=True, download=False, transform=transform)
    test_dataset = datasets.CIFAR10(root=data_dir, train=False, download=False, transform=transform)

    # Dedicated, seeded generator so the shuffle order is reproducible.
    shuffle_rng = torch.Generator()
    shuffle_rng.manual_seed(random_seed)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        generator=shuffle_rng,
        num_workers=4,
        pin_memory=True,
    )
    # Evaluation order does not affect accuracy, so the test set is read in
    # its stored order.
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return (train_loader, test_loader)

In [6]:
def train(model, data_loader, num_epochs, criterion, optimizer, device):
    """Train ``model`` in default precision for ``num_epochs`` passes over ``data_loader``.

    Args:
        model: Module being optimised; called as ``model(images)``.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        num_epochs: Number of full passes over ``data_loader``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer whose ``zero_grad`` / ``step`` are called per batch.
        device: Device every batch is moved to before the forward pass.

    Prints one line per epoch with the loss of the last batch and the elapsed
    wall-clock time in whole seconds. The loss is printed as ``nan`` when the
    loader yields no batches.
    """
    for epoch in range(num_epochs):
        loss = None
        start = time.time()
        for images, labels in data_loader:
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Read the loss back before stopping the clock: fetching the value
        # makes the host wait for the device work that produced it, so the
        # reported time covers the whole epoch.
        last_loss = float('nan') if loss is None else loss.item()
        end = time.time()

        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, last_loss, int(end-start)))
            

In [7]:
def train_amp(model, data_loader, num_epochs, criterion, optimizer, device):
    scaler = torch.cuda.amp.GradScaler()
    total_steps = len(train_loader)
    for epoch in range(num_epochs):
        start = time.time()
        for step, (images, labels) in enumerate(train_loader): 
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            with torch.autocast(device_type=device, dtype=torch.float16):
                output = model(images).to(device)
                loss = criterion(output, labels)

            # Backward and optimize
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            #print ('Step [{}/{}], Loss: {:.4f}'.format(step+1, total_steps, loss.item()))
        end = time.time()
               
        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, loss.item(), int(end-start)))
            

In [8]:
def train_amp_without_scaler(model, data_loader, num_epochs, criterion, optimizer, device):
    """Train ``model`` under bfloat16 autocast, without a gradient scaler.

    Args:
        model: Module being optimised; called as ``model(images)``.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        num_epochs: Number of full passes over ``data_loader``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer whose ``zero_grad`` / ``step`` are called per batch.
        device: Device every batch is moved to. May be a string such as
            ``"cuda"`` / ``"cuda:0"`` or a ``torch.device``.

    Prints one line per epoch with the loss of the last batch and the elapsed
    wall-clock time in whole seconds. The loss is printed as ``nan`` when the
    loader yields no batches.
    """
    # autocast wants the bare device type ("cuda", "cpu"), not an indexed
    # device string or a torch.device object.
    device_type = torch.device(device).type

    for epoch in range(num_epochs):
        loss = None
        start = time.time()
        for images, labels in data_loader:
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass and loss run inside the bfloat16 autocast region
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                output = model(images)
                loss = criterion(output, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Read the loss back before stopping the clock: fetching the value
        # makes the host wait for the device work that produced it.
        last_loss = float('nan') if loss is None else loss.item()
        end = time.time()

        print ('Epoch [{}/{}], Loss: {:.4f}, time: {} seconds'.format(epoch+1, num_epochs, last_loss, int(end-start)))
            

In [9]:
def test(model, test_loader, device):
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))  

In [10]:
# --- Run configuration ---
data_dir = '/tmp'   # dataset root handed to build_data_loader
device = "cuda"     # device the model and every batch are moved to

# --- Hyperparameters ---
lr = 0.001
weight_decay = 0.005
batch_size = 2048
num_epochs = 50

# --- Data ---
train_loader, test_loader = build_data_loader(data_dir=data_dir, batch_size=batch_size)

# --- Model ---
# EfficientNet-B7 with no weights argument passed, put in training mode and
# moved to the target device.
# NOTE(review): the model is built with its default output size while the
# data is CIFAR-10 - confirm that is intended for this benchmark.
net = models.efficientnet_b7().train()
model = net.to(device)

# --- Loss and optimizer ---
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=weight_decay)

In [11]:
# Time the complete bfloat16-autocast training run, then report test accuracy.
start = time.time()
train_amp_without_scaler(model, train_loader, num_epochs, criterion, optimizer, device)
end = time.time()
print(f'Training time: {int(end - start)} seconds')
test(model, test_loader, device)

Epoch [1/50], Loss: 3.8119, time: 19 seconds
Epoch [2/50], Loss: 2.5468, time: 14 seconds
Epoch [3/50], Loss: 2.3518, time: 14 seconds
Epoch [4/50], Loss: 2.2284, time: 14 seconds
Epoch [5/50], Loss: 2.0671, time: 14 seconds
Epoch [6/50], Loss: 2.0159, time: 14 seconds
Epoch [7/50], Loss: 1.9560, time: 14 seconds
Epoch [8/50], Loss: 1.9813, time: 14 seconds
Epoch [9/50], Loss: 1.9518, time: 14 seconds
Epoch [10/50], Loss: 1.8865, time: 14 seconds
Epoch [11/50], Loss: 1.9012, time: 14 seconds
Epoch [12/50], Loss: 1.8329, time: 14 seconds
Epoch [13/50], Loss: 1.8781, time: 14 seconds
Epoch [14/50], Loss: 1.8450, time: 14 seconds
Epoch [15/50], Loss: 1.8052, time: 14 seconds
Epoch [16/50], Loss: 1.8072, time: 14 seconds
Epoch [17/50], Loss: 1.8629, time: 14 seconds
Epoch [18/50], Loss: 1.7391, time: 14 seconds
Epoch [19/50], Loss: 1.6643, time: 14 seconds
Epoch [20/50], Loss: 1.6310, time: 14 seconds
Epoch [21/50], Loss: 1.6280, time: 14 seconds
Epoch [22/50], Loss: 1.5911, time: 14 secon

Sem AMP (benchmark=False): 1404 (não usou TC)

Com AMP FP16 (benchmark=False): 1224 (não usou TC)

Com AMP BF16 (benchmark=False): 1390 (não usou TC)

Sem AMP (benchmark=True): 811 (usou 1.80% de TC)

Com AMP FP16 (benchmark=True): 486 (usou 1.58% de TC)

Com AMP BF16 (benchmark=True): 754 (usou 1.00% de TC)
