# TensorRT

## Установка зависимостей

In [None]:
# !pip3 install torch-tensorrt==2.8.0 -f https://github.com/pytorch/TensorRT/releases/expanded_assets/2.8.0
# !pip install -U "nvidia-modelopt[all]"

## Датасет

In [1]:
import torchvision
from torchvision import transforms
import torch
import torch.utils.cpp_extension
from torch import nn

import torch_tensorrt
import modelopt.torch.quantization as mtq

df: /home/ubuntu/.triton/autotune: No such file or directory
/home/ubuntu/miniconda3/envs/sem10/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/ubuntu/miniconda3/envs/sem10/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

testing_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    ),
)

testing_dataloader = torch.utils.data.DataLoader(
    testing_dataset, batch_size=1, shuffle=False, num_workers=1
)


In [3]:
device

device(type='cuda')

### Функция калибровки

In [4]:
def forward_loop(model):
    for img, _ in testing_dataloader:
        img = img.to(device)
        model(img)


### Модель

In [5]:
model = torchvision.models.resnet18().to(device)

### Квантизация при помощи TensorRT Model Optimiser

In [6]:
from functools import partial

config = mtq.INT8_DEFAULT_CFG

quantized_model = mtq.quantize(model, config, forward_loop)

Inserted 107 quantizers


In [8]:
mtq.print_quant_summary(quantized_model)

conv1.input_quantizer                                                            TensorQuantizer(8 bit fake per-tensor amax=2.7537 calibrator=MaxCalibrator quant)
conv1.output_quantizer                                                           TensorQuantizer(disabled)
conv1.weight_quantizer                                                           TensorQuantizer(8 bit fake axis=0 amax=[0.0550, 0.0980](64) calibrator=MaxCalibrator quant)
bn1.input_quantizer                                                              TensorQuantizer(disabled)
bn1.output_quantizer                                                             TensorQuantizer(disabled)
maxpool.input_quantizer                                                          TensorQuantizer(8 bit fake per-tensor amax=2.3091 calibrator=MaxCalibrator quant)
maxpool.output_quantizer                                                         TensorQuantizer(disabled)
layer1.0.conv1.input_quantizer                                           

# Задание на семинар

Нужно квантизировать при помощи TensorRT любую модель из torchvision или timm (hugging-face) до int8 и до float16. Затем нужно проверить скорость работы получившихся вариаций модели (float32 - исходная, float16 и int8) и их размер. Результаты привести в блокноте.

## Решение

In [1]:
import timm
import torch
import torch_tensorrt
import torchvision
from torchvision import transforms
from torchao.utils import benchmark_model
import os

from typing import Tuple
from torch import nn

/home/ubuntu/miniconda3/envs/sem10/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/ubuntu/miniconda3/envs/sem10/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def benchmark_speed(model_orig: nn.Module,
                    model_quant: nn.Module,
                    example_inputs: torch.Tensor,
                    num_runs: int = 100):
    torch._dynamo.reset()
    orig_time = benchmark_model(model_orig, num_runs, example_inputs)
    quant_time = benchmark_model(model_quant, num_runs, example_inputs)

    print("orig mean time: %0.3f ms" % orig_time)
    print("quant mean time: %0.3f ms" % quant_time)
    print("speedup: %0.1fx" % (orig_time / quant_time))
    torch._dynamo.reset()


def benchmark_size(model_orig: nn.Module, model_quant: nn.Module):
    """
    re-implemented function - i use modelopt and those models can't be pickled
    so i calculate size another way
    """
    def get_model_size_mb(model: nn.Module):
        param_size = 0
        buffer_size = 0

        for param in model.parameters():
            param_size += param.nelement() * param.element_size()

        for buffer in model.buffers():
            buffer_size += buffer.nelement() * buffer.element_size()

        size_all_mb = (param_size + buffer_size) / 1024.0 / 1024.0
        return size_all_mb

    # Try to save original model normally
    try:
        torch.save(model_orig, "/tmp/orig_model.pt")
        orig_model_size_mb = os.path.getsize("/tmp/orig_model.pt") / 1024 / 1024
        print("original model using save")
    except Exception:
        # Fallback to parameter-based calculation
        orig_model_size_mb = get_model_size_mb(model_orig)
        print("original model using parameter-based calculation")

    try:
        # Try saving state_dict instead of full model
        torch.save(model_quant.state_dict(), "/tmp/quant_model_state.pt")
        quant_model_size_mb = os.path.getsize("/tmp/quant_model_state.pt") / 1024 / 1024
        print("quant model using save")
    except Exception:
        # Fallback to parameter-based calculation
        quant_model_size_mb = get_model_size_mb(model_quant)
        print("quant model using parameter-based calculation")

    print("quant model size: %.2f MB" % quant_model_size_mb)
    print("original model size: %.2f MB" % orig_model_size_mb)


Using device: cuda


In [3]:
from copy import deepcopy

In [4]:
model = timm.create_model('vit_base_patch14_dinov2.lvd142m').cuda()
model.eval()
;

''

In [5]:
dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Resize((518, 518))
        ]
    ),
)

dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=False, num_workers=1
)

In [6]:
def forward_loop(model):
    for img, _ in dataloader:
        img = img.to(device)
        model(img)

In [7]:
import modelopt.torch.quantization as mtq

original_model = deepcopy(model)
fp8_model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop)

Inserted 147 quantizers


In [8]:
original_model2 = deepcopy(original_model)
int8_model = mtq.quantize(original_model, mtq.INT8_DEFAULT_CFG, forward_loop)

Inserted 147 quantizers


In [9]:
print("=" * 50)
print("BENCHMARKING SPEED")
print("=" * 50)

example_input = torch.rand(1, 3, 518, 518).cuda()

print("\n1. Original model (float32) vs Float8:")
benchmark_speed(original_model2, fp8_model, (example_input, ))

print("\n2. Original model (float32) vs Int8:")
benchmark_speed(original_model2, int8_model, (example_input, ))


BENCHMARKING SPEED

1. Original model (float32) vs Float8:
Loading extension modelopt_cuda_ext_fp8...
Loaded extension modelopt_cuda_ext_fp8 in 0.0 seconds
orig mean time: 12.764 ms
quant mean time: 23.147 ms
speedup: 0.6x

2. Original model (float32) vs Int8:
Loading extension modelopt_cuda_ext...
Loaded extension modelopt_cuda_ext in 0.0 seconds
orig mean time: 12.729 ms
quant mean time: 19.448 ms
speedup: 0.7x


In [15]:
print("=" * 50)
print("BENCHMARKING SIZE")
print("=" * 50)

print("\n1. Original model (float32) vs Float16:")
benchmark_size(original_model2, fp8_model)

print("\n2. Original model (float32) vs Int8:")
benchmark_size(original_model2, int8_model)


BENCHMARKING SIZE

1. Original model (float32) vs Float16:
original model using save
quant model using save
quant model size: 330.38 MB
original model size: 330.39 MB

2. Original model (float32) vs Int8:
original model using save
quant model using save
quant model size: 330.70 MB
original model size: 330.39 MB
