# TensorRT

## Установка зависимостей

In [None]:

# List the installed packages whose name contains "torch".
# NOTE(review): the torch-tensorrt install in the next cell is pinned to 2.8.0, the same
# version number this listing reports for torch.
!pip list | grep torch

torch                                    2.8.0+cu126
torchao                                  0.10.0
torchaudio                               2.8.0+cu126
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.23.0+cu126


In [None]:
!pip3 install torch-tensorrt==2.8.0 -f https://github.com/pytorch/TensorRT/releases/expanded_assets/2.8.0

## Датасет

In [None]:
import torchvision
from torchvision import transforms
import torch
from torch import nn
import torch_tensorrt

# Preprocessing: convert each image to a tensor, then normalise it per channel
# with the mean / std tuples given to Normalize.
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# CIFAR-10 with train=False, stored under ./data (download=True).
testing_dataset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=True, transform=eval_transform
)

# Batches of one image, no shuffling, one loader worker.
testing_dataloader = torch.utils.data.DataLoader(
    testing_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=1,
)


In [None]:
# The PTQ helpers are imported from torch_tensorrt.ts.ptq, the same module path the
# calibration cell in the seminar section of this notebook imports them from.
from torch_tensorrt.ts.ptq import CalibrationAlgo, DataLoaderCalibrator

# Calibrator fed by the test dataloader; the calibration cache file is ./calibration.cache.
calibrator = DataLoaderCalibrator(
    testing_dataloader,
    cache_file="./calibration.cache",
    use_cache=True,
    algo_type=CalibrationAlgo.ENTROPY_CALIBRATION_2,
    device=torch.device("cuda:0"),
)



In [None]:
# Compile `model` with TensorRT for a fixed (1, 3, 32, 32) input; enabled_precisions
# lists float32, float16 and int8, and the calibrator built above is passed along.
# NOTE(review): `model` is not assigned in any cell above this one; the only assignment
# visible in the notebook is in the seminar section further down — confirm where it is
# expected to come from before running this cell on a fresh kernel.
# NOTE(review): `calibrator=` and the dict form of `device=` are presumably only honoured
# by the TorchScript frontend — verify against the torch_tensorrt.compile documentation.
trt_mod = torch_tensorrt.compile(model, inputs=[torch_tensorrt.Input((1, 3, 32, 32))],
                                    enabled_precisions={torch.float, torch.half, torch.int8},
                                    calibrator=calibrator,
                                    device={
                                         "device_type": torch_tensorrt.DeviceType.GPU,
                                         "gpu_id": 0,
                                         "dla_core": 0,
                                         "allow_gpu_fallback": False,
                                         "disable_tf32": False
                                     })

Если модель нужно квантизировать повторно, можно использовать кэш калибратора, сохранённый при предыдущей калибровке.

In [None]:
# Imported from torch_tensorrt.ts.ptq, the module path the seminar section of this
# notebook uses for the PTQ helpers.
from torch_tensorrt.ts.ptq import CacheCalibrator

# Calibrator backed by the existing cache file instead of a dataloader.
calibrator = CacheCalibrator("./calibration.cache")

trt_mod = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input([1, 3, 32, 32])],
    enabled_precisions={torch.float, torch.half, torch.int8},
    calibrator=calibrator,
)

# Задание на семинар

Нужно квантизировать при помощи TensorRT любую модель из torchvision или timm (Hugging Face) до int8 и до float16. Затем нужно сравнить скорость работы и размер получившихся вариантов модели (float32 — исходная, float16 и int8). Результаты привести в блокноте.

In [None]:
!pip install torch torchvision

In [None]:
!pip install tensorrt-cu12  torch-tensorrt==2.8.0 -f https://github.com/pytorch/TensorRT/releases/expanded_assets/2.8.0

In [None]:
from typing import Tuple
from torchao.utils import (
    benchmark_model,
    unwrap_tensor_subclass,
)
from torch import nn
import torch
import tensorrt

import os

def benchmark_speed(model_orig: nn.Module,
                    model_quant: nn.Module,
                    example_inputs: torch.Tensor,
                    num_runs: int = 100):
    """Time both models on the same inputs and print mean latency and speed-up.

    Args:
        model_orig: Reference model.
        model_quant: Model compared against the reference.
        example_inputs: Inputs for the forward call. ``benchmark_model`` unpacks
            its ``args`` argument as ``model(*args)``, so a bare tensor is wrapped
            into a one-element tuple here; tuples and lists are forwarded as given.
        num_runs: Number of runs handed to ``benchmark_model`` for each model.

    Returns:
        None. The two timings and their ratio (orig / quant) are printed.
    """
    # Without this, a bare tensor would be star-unpacked along its first dimension
    # and the model would be called with one positional argument per batch element.
    if isinstance(example_inputs, torch.Tensor):
        example_inputs = (example_inputs,)

    # Reset dynamo state before and after so the measurement starts from a clean
    # slate and leaves one behind.
    torch._dynamo.reset()
    orig_time = benchmark_model(model_orig, num_runs, example_inputs)
    quant_time = benchmark_model(model_quant, num_runs, example_inputs)

    print("orig mean time: %0.3f ms" % orig_time)
    print("quant mean time: %0.3f ms" % quant_time)
    print("speedup: %0.1fx" % (orig_time / quant_time))
    torch._dynamo.reset()


def benchmark_size(model_orig: nn.Module, model_quant: nn.Module):
    """Serialize both models with ``torch.save`` and print the file sizes in MB.

    The files are written into a temporary directory that is removed when the
    function returns, so no checkpoint files are left behind and the function
    does not rely on a ``/tmp`` directory being present.

    Args:
        model_orig: Reference model.
        model_quant: Model compared against the reference.

    Returns:
        None. The two sizes are printed, quantized model first.
    """
    import tempfile  # local import: keeps the helper self-contained

    with tempfile.TemporaryDirectory() as workdir:
        orig_path = os.path.join(workdir, "orig_model.pt")
        quant_path = os.path.join(workdir, "quant_model.pt")

        # The module objects themselves are saved, not only their state_dict.
        torch.save(model_orig, orig_path)
        torch.save(model_quant, quant_path)

        quant_model_size_mb = os.path.getsize(quant_path) / 1024 / 1024
        orig_model_size_mb = os.path.getsize(orig_path) / 1024 / 1024

    print("quant model size: %.2f MB" % quant_model_size_mb)

    print("original model size: %.2f MB" % orig_model_size_mb)


In [None]:
import timm

# Put the model into eval mode before it is compiled and benchmarked, so layers that
# behave differently during training run in their inference configuration.
model = timm.create_model('vit_base_patch14_dinov2.lvd142m').cuda().eval()
model

In [None]:
import torchvision
from torchvision import transforms

# Preprocessing steps: tensor conversion, per-channel normalisation, then a resize to
# 518x518 — the spatial size given to torch_tensorrt.Input in the compile cell below.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    transforms.Resize((518, 518)),
]

# CIFAR-10 with train=False, stored under ./data (download=True).
testing_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(preprocessing_steps),
)

# Batches of one image, no shuffling, one loader worker.
testing_dataloader = torch.utils.data.DataLoader(
    dataset=testing_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=1,
)

In [None]:
from torch_tensorrt.ts.ptq import DataLoaderCalibrator, CalibrationAlgo

# Keyword options for the calibrator: cache file location, use_cache switched off,
# the ENTROPY_CALIBRATION_2 algorithm and the first CUDA device.
calibrator_options = dict(
    cache_file="./calibration.cache",
    use_cache=False,
    algo_type=CalibrationAlgo.ENTROPY_CALIBRATION_2,
    device=torch.device("cuda:0"),
)

# Calibrator fed by the resized test dataloader.
calibrator = DataLoaderCalibrator(testing_dataloader, **calibrator_options)


  calibrator = DataLoaderCalibrator(



In [None]:
# Compile `model` with TensorRT for a fixed (1, 3, 518, 518) input on cuda:0;
# enabled_precisions lists float32, float16 and int8.
# NOTE(review): the calibrator argument is commented out, so no calibration data is handed
# to this call even though torch.int8 is enabled — confirm the result is actually
# int8-quantized before reporting it as such.
quantized_model = torch_tensorrt.compile(model, inputs=[torch_tensorrt.Input((1, 3, 518, 518))],
                                    enabled_precisions={torch.float, torch.half, torch.int8},
                                    # calibrator=calibrator,
                                    device='cuda:0')

In [None]:
# Compare the on-disk size of the original and the compiled model.
# NOTE(review): in the recorded output the compiled model file (937.36 MB) is larger than
# the original one (330.39 MB) — worth explaining in the write-up.
benchmark_size(model, quantized_model)

quant model size: 937.36 MB
original model size: 330.39 MB
