<a href="https://colab.research.google.com/github/Tensor-Reloaded/Advanced-Topics-in-Neural-Networks-Template-2024/blob/main/Lab02/CIFAR10/CIFAR_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install timed-decorator

Collecting timed-decorator
  Downloading timed_decorator-1.5.2-py3-none-any.whl.metadata (18 kB)
Downloading timed_decorator-1.5.2-py3-none-any.whl (12 kB)
Installing collected packages: timed-decorator
Successfully installed timed-decorator-1.5.2


In [2]:
%pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


In [3]:
import os
from multiprocessing import freeze_support
from typing import Optional

import torch
import torch.nn.functional as F
from timed_decorator.simple_timed import timed
from torch import Tensor, nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import v2
from tqdm import tqdm

In [4]:
def get_default_device():
    """Pick the preferred torch device available on this machine.

    CUDA is tried first, then the MPS backend; the CPU is the fallback.

    Returns:
        A ``torch.device`` of type ``'cuda'``, ``'mps'`` or ``'cpu'``.
    """
    if torch.cuda.is_available():
        backend = 'cuda'
    elif torch.backends.mps.is_available():
        backend = 'mps'
    else:
        backend = 'cpu'
    return torch.device(backend)

In [5]:
class PreActBlock(nn.Module):
    """Two-convolution residual block in pre-activation order (BN -> ReLU -> conv)."""

    # Output channels are `expansion * planes`.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        """Build the block.

        Args:
            in_planes: Number of input channels.
            planes: Number of channels produced by both 3x3 convolutions.
            stride: Stride of the first convolution (and of the projection shortcut).
        """
        super().__init__()
        out_planes = self.expansion * planes

        # Sub-modules are registered in the order bn1, conv1, bn2, conv2, shortcut,
        # which fixes the order of the state_dict entries.
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        # A 1x1 projection is only created when the residual cannot be added as-is.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``conv2(relu(bn2(conv1(relu(bn1(x)))))) + shortcut``."""
        pre_activated = F.relu(self.bn1(x), inplace=True)
        # The projection (when present) consumes the pre-activated tensor, otherwise
        # the untouched input is used as the residual.
        if hasattr(self, "shortcut"):
            residual = self.shortcut(pre_activated)
        else:
            residual = x
        hidden = self.conv1(pre_activated)
        hidden = F.relu(self.bn2(hidden), inplace=True)
        out = self.conv2(hidden)
        out += residual
        return out


class PreActBottleneck(nn.Module):
    """Three-convolution (1x1 -> 3x3 -> 1x1) bottleneck block in pre-activation order."""

    # Output channels are `expansion * planes`.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        """Build the block.

        Args:
            in_planes: Number of input channels.
            planes: Width of the bottleneck (channels of the first two convolutions).
            stride: Stride of the 3x3 convolution (and of the projection shortcut).
        """
        super().__init__()
        out_planes = self.expansion * planes

        # Sub-modules are registered in the order bn1, conv1, bn2, conv2, bn3, conv3,
        # shortcut, which fixes the order of the state_dict entries.
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)

        # A 1x1 projection is only created when the residual cannot be added as-is.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the three BN -> ReLU -> conv stages and add the (projected) residual."""
        pre_activated = F.relu(self.bn1(x), inplace=True)
        # The projection (when present) consumes the pre-activated tensor, otherwise
        # the untouched input is used as the residual.
        if hasattr(self, "shortcut"):
            residual = self.shortcut(pre_activated)
        else:
            residual = x
        hidden = self.conv1(pre_activated)
        hidden = self.conv2(F.relu(self.bn2(hidden), inplace=True))
        out = self.conv3(F.relu(self.bn3(hidden), inplace=True))
        out += residual
        return out


class PreActResNet_C10(nn.Module):
    """Pre-activation ResNet: a 3x3 stem, four residual stages and a linear classifier."""

    def __init__(self, block, num_blocks, num_classes):
        """Assemble the network.

        Args:
            block: Residual block class; it must expose an ``expansion`` attribute
                and accept ``(in_planes, planes, stride)``.
            num_blocks: Indexable with the number of blocks for each of the four stages.
            num_classes: Size of the classifier output.
        """
        super().__init__()
        # Running channel count; `_make_layer` reads it and advances it per block.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        # Stages 2-4 each start with a stride-2 block.
        self.layer1 = self._make_layer(block, planes=64, num_blocks=num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, planes=128, num_blocks=num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, planes=256, num_blocks=num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, planes=512, num_blocks=num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack blocks for one stage; only the first block uses `stride`, the rest use 1."""
        stage = []
        for block_stride in (stride, *([1] * (num_blocks - 1))):
            stage.append(block(self.in_planes, planes, block_stride))
            # The next block consumes what this one produced.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x: Tensor) -> Tensor:
        """Map a batch of images to class logits."""
        features = self.conv1(x)
        features = self.layer1(features)
        features = self.layer2(features)
        features = self.layer3(features)
        features = self.layer4(features)
        pooled = F.avg_pool2d(features, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def PreActResNet18_C10(num_classes):
    """Build a `PreActResNet_C10` from `PreActBlock` with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return PreActResNet_C10(block=PreActBlock, num_blocks=blocks_per_stage, num_classes=num_classes)


def get_model():
    """Return the network used throughout this notebook: `PreActResNet18_C10` with 10 outputs."""
    num_classes = 10
    return PreActResNet18_C10(num_classes)


In [6]:
class CachedDataset(Dataset):
    """Wrap a dataset, optionally materialising it in memory, and apply per-access transforms."""

    def __init__(self, dataset: Dataset, runtime_transforms: Optional[v2.Transform], cache: bool):
        """Store the dataset (or an in-memory copy of it) and the transform.

        Args:
            dataset: Source of ``(image, label)`` pairs.
            runtime_transforms: Callable applied to the image on every access,
                or ``None`` to return samples unchanged.
            cache: When true, iterate `dataset` once now and keep all samples in a tuple,
                so any transform attached to `dataset` itself runs only once per sample.
        """
        self.dataset = tuple(dataset) if cache else dataset
        self.runtime_transforms = runtime_transforms

    def __len__(self):
        """Number of samples in the wrapped (or cached) dataset."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(image, label)`` for index `i`, transforming the image if configured."""
        image, label = self.dataset[i]
        transform = self.runtime_transforms
        if transform is not None:
            image = transform(image)
        return image, label

def get_dataset(data_path: str, is_train: bool):
    """Create the cached CIFAR-10 split used for training or evaluation.

    Conversion to a scaled float32 image and per-channel normalisation are attached
    to the CIFAR10 dataset itself. The random augmentations (training split only)
    are handed to `CachedDataset` as runtime transforms.

    Args:
        data_path: Directory where CIFAR-10 is stored (it is downloaded if needed).
        is_train: Select the training split (with augmentations) or the test split.

    Returns:
        A `CachedDataset` built with caching enabled.
    """
    normalize = v2.Normalize(
        mean=(0.491, 0.482, 0.446),
        std=(0.247, 0.243, 0.261),
    )
    to_normalized_image = v2.Compose([
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        normalize,
    ])
    base_split = CIFAR10(root=data_path, train=is_train, transform=to_normalized_image, download=True)

    if is_train:
        augmentations = v2.Compose([
            v2.RandomCrop(size=32, padding=4),
            v2.RandomHorizontalFlip(),
            v2.RandomVerticalFlip(),
            v2.RandomErasing(),
        ])
    else:
        augmentations = None
    return CachedDataset(base_split, augmentations, True)

In [7]:
@torch.jit.script
def accuracy(output: Tensor, labels: Tensor):
    """Return the fraction of positions at which `output` equals `labels`."""
    n_total = output.size(0)
    n_correct = torch.eq(output, labels).sum().item()
    return n_correct / n_total


def train(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over `train_loader` and report the training accuracy.

    Args:
        model: Network to optimise; it is switched to training mode.
        train_loader: Iterable of ``(data, labels)`` batches.
        criterion: Loss called as ``criterion(output, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        Fraction of samples whose arg-max prediction matched the label, rounded to
        4 decimals. Each prediction comes from the forward pass made before the
        optimizer step of its batch.
    """
    model.train()

    batch_predictions = []
    batch_labels = []

    for data, labels in train_loader:
        data = data.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        output = model(data)
        loss = criterion(output, labels)

        loss.backward()

        # Optional gradient clipping: torch.nn.utils.clip_grad_norm_(model.parameters(), 5)

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # Softmax is monotonic, so the arg-max is taken directly on the logits.
        # No squeeze() here: it would drop the batch dimension of a single-sample
        # batch and break the concatenation below.
        batch_predictions.append(output.detach().argmax(dim=1).cpu())
        batch_labels.append(labels.reshape(-1).cpu())

    all_predictions = torch.cat(batch_predictions)
    all_labels = torch.cat(batch_labels)

    return round(accuracy(all_predictions, all_labels), 4)


@torch.inference_mode()
def val(model, val_loader, device):
    model.eval()

    all_outputs = []
    all_labels = []

    for data, labels in val_loader:
        data = data.to(device, non_blocking=True)
        output = model(data)

        output = output.softmax(dim=1).cpu().squeeze()
        labels = labels.squeeze()
        all_outputs.append(output)
        all_labels.append(labels)

    all_outputs = torch.cat(all_outputs).argmax(dim=1)
    all_labels = torch.cat(all_labels)

    return round(accuracy(all_outputs, all_labels), 4)


def do_epoch(model, train_loader, val_loader, criterion, optimizer, device):
    """Train for one epoch, then validate.

    Returns:
        ``(train_accuracy, val_accuracy)`` as produced by `train` and `val`.
    """
    train_accuracy = train(model, train_loader, criterion, optimizer, device)
    val_accuracy = val(model, val_loader, device)
    return train_accuracy, val_accuracy


In [8]:
def main(device: torch.device = get_default_device(), data_path: str = './data',
         checkpoint_path: str = "./checkpoints"):
    """Train the model on CIFAR-10 for 200 epochs and keep the best-validation weights.

    Args:
        device: Device to train on. The default is resolved once, when this
            function is defined.
        data_path: Directory holding (or receiving) the CIFAR-10 download.
        checkpoint_path: Directory (created if missing) where ``best.pth`` is written
            every time the validation accuracy improves.
    """
    print(f"Using {device}")
    os.makedirs(checkpoint_path, exist_ok=True)
    on_cuda = device.type == 'cuda'
    if on_cuda:
        torch.backends.cudnn.benchmark = True
        torch.backends.cuda.matmul.allow_tf32 = True

    train_dataset = get_dataset(data_path, is_train=True)
    val_dataset = get_dataset(data_path, is_train=False)

    model = get_model()
    model = model.to(device)
    model = torch.jit.script(model)
    # The fused kernel is requested only on CUDA. The device may also be cpu or mps,
    # where fused SGD is not available in every PyTorch version; passing None lets
    # PyTorch choose its default implementation there.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, nesterov=True, weight_decay=0.00001,
                                fused=True if on_cuda else None)
    # NOTE(review): the scheduler is driven by the *training* accuracy (see the loop
    # below) - confirm this is intended rather than the validation accuracy.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=10,
                                                           threshold=0.001, threshold_mode='rel')
    criterion = torch.nn.CrossEntropyLoss()

    batch_size = 50
    val_batch_size = 500
    num_workers = 0
    # Persistent workers are deliberately off (they only matter when num_workers > 0).
    persistent_workers = False
    # Pinned host memory only helps host-to-GPU copies, so both loaders use the same rule.
    train_loader = DataLoader(train_dataset, shuffle=True, pin_memory=on_cuda, num_workers=num_workers,
                              batch_size=batch_size, drop_last=True, persistent_workers=persistent_workers)
    val_loader = DataLoader(val_dataset, shuffle=False, pin_memory=on_cuda, num_workers=0,
                            batch_size=val_batch_size, drop_last=False)

    epochs = tuple(range(200))
    best_val = 0.0
    with tqdm(epochs) as tbar:
        for _ in tbar:
            acc, acc_val = do_epoch(model, train_loader, val_loader, criterion, optimizer, device)
            scheduler.step(acc)

            # Checkpoint only on a strict improvement of the validation accuracy.
            if acc_val > best_val:
                torch.save(model.state_dict(), os.path.join(checkpoint_path, "best.pth"))
                best_val = acc_val
            tbar.set_description(f"Acc: {acc}, Acc_val: {acc_val}, Best_val: {best_val}")

In [9]:
@timed(stdout=False, return_time=True)
def infer(model, val_loader, device, tta, dtype, inference_mode):
    """Evaluate `model` on `val_loader`, optionally with flip-based test-time augmentation.

    Args:
        model: Model to evaluate; it is switched to evaluation mode.
        val_loader: Iterable of ``(data, labels)`` batches.
        device: Device the input batches are moved to.
        tta: When true, the logits of the original batch and of its horizontal,
            vertical and horizontal+vertical flips are summed before the arg-max.
        dtype: Autocast dtype; autocast is only enabled off-CPU and for dtypes
            other than float32.
        inference_mode: Use ``torch.inference_mode`` when true, ``torch.no_grad`` otherwise.

    Returns:
        The accuracy rounded to 4 decimals. Because of the ``timed`` decorator,
        callers in this notebook unpack the call as ``(accuracy, elapsed)`` and
        divide ``elapsed`` by 1e9 to print seconds.
    """
    model.eval()
    batch_predictions = []
    batch_labels = []

    grad_free_context = torch.inference_mode if inference_mode else torch.no_grad

    enable_autocast = device.type != 'cpu' and dtype != torch.float32
    # Autocast is slow for cpu, so we disable it.
    # Also, if the device type is mps, autocast might not work (?) and disabling it might also not work (?)
    with torch.autocast(device_type=device.type, dtype=dtype, enabled=enable_autocast), grad_free_context():
        for data, labels in val_loader:
            data = data.to(device, non_blocking=True)

            output = model(data)
            if tta:
                # Horizontal flip:
                output += model(v2.functional.hflip(data))
                # Vertical flip:
                output += model(v2.functional.vflip(data))
                # Horizontal flip + vertical flip:
                output += model(v2.functional.hflip(v2.functional.vflip(data)))

            # Softmax is monotonic, so the arg-max is taken directly on the logits.
            # No squeeze() here: a batch with a single sample would lose its batch
            # dimension and break the concatenation below.
            batch_predictions.append(output.argmax(dim=1).cpu())
            batch_labels.append(labels.reshape(-1))

    all_predictions = torch.cat(batch_predictions)
    all_labels = torch.cat(batch_labels)

    return round(accuracy(all_predictions, all_labels), 4)

In [10]:
def create_model(device: torch.device, checkpoint_path: str, model_type: str):
    """Load the trained weights and wrap the model in the requested execution form.

    Args:
        device: Device the model and the checkpoint tensors are placed on.
        checkpoint_path: Directory containing ``best.pth``.
        model_type: One of ``'raw model'``, ``'scripted model'``, ``'traced model'``,
            ``'frozen model'``, ``'optimized for inference'`` or ``'compiled model'``.

    Returns:
        The model in evaluation mode, in the requested form.

    Raises:
        RuntimeError: If ``'compiled model'`` is requested on Windows.
        ValueError: If `model_type` is not one of the names above (previously this
            fell through and returned ``None``).
    """
    model = get_model()
    model = model.to(device)
    state_dict = torch.load(os.path.join(checkpoint_path, "best.pth"), map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    if model_type == 'raw model':
        return model
    if model_type == 'scripted model':
        return torch.jit.script(model)
    if model_type == 'traced model':
        # Tracing needs an example input; a random batch of five 3x32x32 images is used.
        return torch.jit.trace(model, torch.rand((5, 3, 32, 32), device=device))
    if model_type == 'frozen model':
        return torch.jit.freeze(torch.jit.script(model))
    if model_type == 'optimized for inference':
        return torch.jit.optimize_for_inference(torch.jit.script(model))
    if model_type == 'compiled model':
        if os.name == 'nt':
            print("torch.compile is not supported on Windows. Try Linux or WSL instead.")
            raise RuntimeError('windows')
        return torch.compile(model)
    raise ValueError(f"Unknown model type: {model_type!r}")

In [11]:
def predict(device: torch.device = get_default_device(), data_path: str = './data',
            checkpoint_path: str = "./checkpoints"):
    """Benchmark every model variant on the CIFAR-10 test split and print one line per run.

    Every combination of test-time augmentation (off/on), autocast dtype and model
    type is evaluated with a freshly created model. A failing combination is
    reported and skipped so the remaining ones still run.

    Args:
        device: Device to run on. The default is resolved once, when this
            function is defined.
        data_path: Directory holding (or receiving) the CIFAR-10 download.
        checkpoint_path: Directory containing ``best.pth``.
    """
    on_cuda = device.type == 'cuda'
    if on_cuda:
        torch.backends.cudnn.benchmark = True
        torch.backends.cuda.matmul.allow_tf32 = True

    val_dataset = get_dataset(data_path, is_train=False)
    val_batch_size = 500
    # Pinned host memory only helps host-to-GPU copies, so it is requested on CUDA only.
    val_loader = DataLoader(val_dataset, shuffle=False, pin_memory=on_cuda, num_workers=0,
                            batch_size=val_batch_size, drop_last=False)

    use_tta = (False, True)
    # Reduced-precision dtypes are only benchmarked on CUDA.
    dtypes = (torch.bfloat16, torch.half, torch.float32) if on_cuda else (torch.float32,)
    model_types = (
        'raw model', 'scripted model', 'traced model', 'frozen model', 'optimized for inference', 'compiled model')

    for tta in use_tta:
        for dtype in dtypes:
            for model_type in model_types:
                inference_mode = True
                if model_type == 'compiled model':
                    # On google colab, torch.compile might not like torch.inference_mode and wants torch.no_grad instead
                    inference_mode = False
                try:
                    model = create_model(device, checkpoint_path, model_type)
                    acc_val, elapsed = infer(
                        model, val_loader, device, tta=tta, dtype=dtype, inference_mode=inference_mode)

                    print(f"Device {device.type}, val acc: {acc_val}, tta: {tta}, dtype: {dtype}, model type: {model_type}, "
                          f"took: {elapsed / 1e9}s")
                except Exception as exc:
                    # Best effort: keep benchmarking the other combinations, but name
                    # the exception so the cause is not lost.
                    # For a full traceback while debugging: import traceback; traceback.print_exc()
                    print(f"Model type {model_type} failed on {dtype} on {device.type} "
                          f"({type(exc).__name__})")
            print()

In [12]:
!wget https://github.com/Tensor-Reloaded/Advanced-Topics-in-Neural-Networks-Template-2024/raw/refs/heads/main/Lab02/CIFAR10/checkpoints/best.pth -O best.pth

--2024-10-09 08:44:54--  https://github.com/Tensor-Reloaded/Advanced-Topics-in-Neural-Networks-Template-2024/raw/refs/heads/main/Lab02/CIFAR10/checkpoints/best.pth
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Tensor-Reloaded/Advanced-Topics-in-Neural-Networks-Template-2024/refs/heads/main/Lab02/CIFAR10/checkpoints/best.pth [following]
--2024-10-09 08:44:55--  https://raw.githubusercontent.com/Tensor-Reloaded/Advanced-Topics-in-Neural-Networks-Template-2024/refs/heads/main/Lab02/CIFAR10/checkpoints/best.pth
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44739432 (43M) [application/octet-strea

## Results 1 (Google Colab): Tesla T4, Intel(R) Xeon(R) CPU @ 2.20GHz (1 core)

In [13]:
predict(checkpoint_path='./', device=torch.device('cpu'))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:04<00:00, 40081480.08it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: raw model, took: 155.261570115s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: scripted model, took: 141.854798342s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: traced model, took: 137.268522927s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: frozen model, took: 139.033478704s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: optimized for inference, took: 122.189013241s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: compiled model, took: 155.985327564s

Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: raw model, took: 604.946253624s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: scripted model, took: 605.447234843s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, mod

In [14]:
predict(checkpoint_path='./', device=torch.device('cuda:0'))

Files already downloaded and verified
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: raw model, took: 35.6871717s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: scripted model, took: 35.490419012s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: traced model, took: 35.465361561s
Device cuda, val acc: 0.9366, tta: False, dtype: torch.bfloat16, model type: frozen model, took: 35.363827883s
Model type optimized for inference failed on torch.bfloat16 on cuda




Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: compiled model, took: 38.592140568s

Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: raw model, took: 1.63877895s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float16, model type: scripted model, took: 1.056358059s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: traced model, took: 1.106966217s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: frozen model, took: 1.096997849s
Model type optimized for inference failed on torch.float16 on cuda
Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: compiled model, took: 1.058264296s

Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: raw model, took: 3.504440625s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: scripted model, took: 2.057086411s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32,

## Results 2: NVIDIA A40, Intel(R) Xeon(R) Silver 4310T CPU @ 2.30GHz


In [None]:
# CPU

Files already downloaded and verified
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: raw model, took: 29.807860215s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: scripted model, took: 30.61111084s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: traced model, took: 29.401212288s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: frozen model, took: 27.164150199s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: optimized for inference, took: 19.921779904s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: compiled model, took: 18.938307475s

Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: raw model, took: 118.477152989s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: scripted model, took: 118.600493398s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: traced model, took: 119.661402172s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: frozen model, took: 109.021421908s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: optimized for inference, took: 80.80285353s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: compiled model, took: 62.447971997s

In [None]:
# GPU

Files already downloaded and verified
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: raw model, took: 1.662712012s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: scripted model, took: 0.555742949s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: traced model, took: 0.473243101s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: frozen model, took: 0.430498959s
Model type optimized for inference failed on torch.bfloat16 on cuda
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: compiled model, took: 7.265515185s

Device cuda, val acc: 0.9367, tta: False, dtype: torch.float16, model type: raw model, took: 0.802255959s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float16, model type: scripted model, took: 0.367570652s
Device cuda, val acc: 0.9367, tta: False, dtype: torch.float16, model type: traced model, took: 0.438119956s
Device cuda, val acc: 0.9368, tta: False, dtype: torch.float16, model type: frozen model, took: 0.411527476s
Model type optimized for inference failed on torch.float16 on cuda
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float16, model type: compiled model, took: 3.105313892s

Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: raw model, took: 0.788350018s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: scripted model, took: 0.642222682s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: traced model, took: 0.582183428s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: frozen model, took: 0.63175085s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float32, model type: optimized for inference, took: 0.860065195s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: compiled model, took: 3.26706575s

Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: raw model, took: 1.29290948s
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: scripted model, took: 1.310474388s
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: traced model, took: 1.329242116s
Device cuda, val acc: 0.9424, tta: True, dtype: torch.bfloat16, model type: frozen model, took: 1.313594689s
Model type optimized for inference failed on torch.bfloat16 on cuda
Device cuda, val acc: 0.9422, tta: True, dtype: torch.bfloat16, model type: compiled model, took: 3.165473544s

Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: raw model, took: 1.289959265s
Device cuda, val acc: 0.9425, tta: True, dtype: torch.float16, model type: scripted model, took: 1.29710779s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: traced model, took: 1.320457467s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: frozen model, took: 1.312074838s
Model type optimized for inference failed on torch.float16 on cuda
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: compiled model, took: 3.2982492s

Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: raw model, took: 2.130355637s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: scripted model, took: 2.137523323s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: traced model, took: 2.161863139s
Device cuda, val acc: 0.9425, tta: True, dtype: torch.float32, model type: frozen model, took: 2.148259623s
Device cuda, val acc: 0.9425, tta: True, dtype: torch.float32, model type: optimized for inference, took: 1.885170125s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: compiled model, took: 3.971305783s

## Results 3: NVIDIA GeForce RTX 3090, AMD Ryzen Threadripper 1920X 12-Core Processor

In [None]:
# CPU

Files already downloaded and verified
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: raw model, took: 35.64374242s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: scripted model, took: 34.561748777s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: traced model, took: 34.939105964s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: frozen model, took: 34.126268667s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: optimized for inference, took: 26.412186696s
Device cpu, val acc: 0.937, tta: False, dtype: torch.float32, model type: compiled model, took: 31.803454048s

Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: raw model, took: 141.637965004s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: scripted model, took: 140.86321628s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: traced model, took: 140.037531754s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: frozen model, took: 133.869568961s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: optimized for inference, took: 106.60755021s
Device cpu, val acc: 0.9425, tta: True, dtype: torch.float32, model type: compiled model, took: 95.432965662s

In [None]:
# GPU

Files already downloaded and verified
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: raw model, took: 1.045950173s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: scripted model, took: 0.507382365s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: traced model, took: 0.483161688s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: frozen model, took: 0.487306687s
Model type optimized for inference failed on torch.bfloat16 on cuda
Device cuda, val acc: 0.9369, tta: False, dtype: torch.bfloat16, model type: compiled model, took: 8.428094218s

Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: raw model, took: 0.755911188s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float16, model type: scripted model, took: 0.437476219s
Device cuda, val acc: 0.937, tta: False, dtype: torch.float16, model type: traced model, took: 0.466745924s
Device cuda, val acc: 0.9368, tta: False, dtype: torch.float16, model type: frozen model, took: 0.453104697s
Model type optimized for inference failed on torch.float16 on cuda
Device cuda, val acc: 0.9368, tta: False, dtype: torch.float16, model type: compiled model, took: 5.717307314s

Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: raw model, took: 0.930242167s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: scripted model, took: 0.646624205s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: traced model, took: 0.66108906s
Device cuda, val acc: 0.9368, tta: False, dtype: torch.float32, model type: frozen model, took: 0.672550878s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: optimized for inference, took: 1.047787336s
Device cuda, val acc: 0.9369, tta: False, dtype: torch.float32, model type: compiled model, took: 5.533834633s

Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: raw model, took: 1.381747827s
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: scripted model, took: 1.381004199s
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: traced model, took: 1.419552829s
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: frozen model, took: 1.388475839s
Model type optimized for inference failed on torch.bfloat16 on cuda
Device cuda, val acc: 0.9423, tta: True, dtype: torch.bfloat16, model type: compiled model, took: 3.802048509s

Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: raw model, took: 1.389953422s
Device cuda, val acc: 0.9425, tta: True, dtype: torch.float16, model type: scripted model, took: 1.405247425s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: traced model, took: 1.424549875s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: frozen model, took: 1.395574333s
Model type optimized for inference failed on torch.float16 on cuda
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float16, model type: compiled model, took: 3.865965502s

Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: raw model, took: 2.163474659s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: scripted model, took: 2.210117595s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: traced model, took: 2.17419933s
Device cuda, val acc: 0.9425, tta: True, dtype: torch.float32, model type: frozen model, took: 2.234084868s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: optimized for inference, took: 2.144272799s
Device cuda, val acc: 0.9426, tta: True, dtype: torch.float32, model type: compiled model, took: 4.714134742s