## Create a model and export it as ONNX

In [None]:
import torch

# Notebook-wide settings: `batch_size` sizes the dummy input used for the ONNX
# export further down, and `device` is where the model and that input live.
batch_size, device = 32, 'cuda'

# Fetch the `yolov5s` entry point from the `ultralytics/yolov5` hub repo,
# then move the resulting module onto `device`.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
model = model.to(device)

After training the model, save its weights:

In [None]:
import os

# torch.save does not create missing parent directories, so make sure
# ./models exists before writing the checkpoint.
os.makedirs('./models', exist_ok=True)

# Persist only the weights (state dict), not the whole module object.
torch.save(model.state_dict(), './models/obj.pt')

In [None]:
# Put the model in inference mode first: after a training step it may still be
# in train mode, and the sanity-check forward pass below should run in the same
# mode as the exported graph.
model.eval()

# Dummy batch that drives the export; allocated directly on `device`.
dummy_input = torch.randn(batch_size, 3, 256, 256, device=device)

# Sanity check: a single forward pass without autograd, printing the output shape.
with torch.no_grad():
    print(model(dummy_input).shape)

# No dynamic_axes are given, so the exported graph is tied to the dummy input's shape.
torch.onnx.export(model, dummy_input, './models/obj.onnx')

## Convert the ONNX model to a TensorRT engine

In [None]:
# Restart the notebook kernel by terminating its process; os._exit ends the
# process immediately with the given status code.
# NOTE(review): presumably done to release GPU memory held by PyTorch before
# building the TensorRT engines — confirm.
import os
os._exit(0)

- Convert to an FP32 engine

In [None]:
# Build a TensorRT engine from the exported ONNX file and save it as obj_32.engine.
# No precision flag is passed here, so trtexec uses its default precision.
!trtexec --onnx=./models/obj.onnx --saveEngine=./models/obj_32.engine 

- Convert to an FP16 engine

In [None]:
# Same conversion from the same ONNX file, with --fp16 added; the result is
# written to a separate engine file (obj_16.engine).
!trtexec --onnx=./models/obj.onnx --saveEngine=./models/obj_16.engine --fp16 

## Benchmark

In [1]:
from benchmark import NativeTorchBenchmark, TensorRTBehcnmark
import numpy as np
import torch

# Benchmark settings shared by every benchmark object created below.
n_infers = 100    # forwarded as `n_infers` to each benchmark
batch_size = 32   # leading dimension of the sample batch
SEED = 0          # fixed seed so the sample batch is identical across runs

# Draw the sample batch from a seeded generator so that re-running the notebook
# feeds the same data to every backend.
# NOTE(review): normal() returns float64; presumably the benchmark classes cast
# to the dtype the model/engine expects — confirm.
rng = np.random.default_rng(SEED)
input_image = rng.normal(size=[batch_size, 3, 256, 256])

- Native PyTorch

In [2]:
# Architecture for the baseline run, loaded through torch.hub
# (unlike the export section, it is not moved to a device here).
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Native PyTorch benchmark: the hub architecture plus the checkpoint path
# written by the torch.save cell earlier in this notebook.
pt_bm = NativeTorchBenchmark(
    n_infers=n_infers,
    batch_size=batch_size,
    samples=input_image,
    model_arch=model,
    model_ckpt='./models/obj.pt',
)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-5-2 Python-3.8.10 torch-2.1.0a0+fe05266 CUDA:0 (NVIDIA GeForce RTX 3060, 12042MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


requirements: /root/.cache/torch/hub/requirements.txt not found, check failed.


In [3]:
# Run the native PyTorch benchmark; the recorded output shows a progress bar
# over the inference iterations followed by a "Throughputs (Native PyTorch)" line.
# NOTE(review): the reported figure is close to it/s × batch_size, so it is
# presumably samples per second — confirm against the benchmark module.
pt_bm.benchmark()

100%|████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 21.06it/s]

Throughputs (Native PyTorch): 673.2782





- TensorRT FP32

In [4]:
# TensorRT benchmark over the engine built without a precision flag (obj_32.engine),
# using the same iteration count, batch size and sample batch as the other runs.
# (The class name's spelling is the one exported by the `benchmark` module.)
trt_bm_fp32 = TensorRTBehcnmark(
    n_infers=n_infers,
    batch_size=batch_size,
    samples=input_image,
    engine_path='./models/obj_32.engine',
)

In [5]:
# Run the benchmark for the obj_32.engine TensorRT engine; the recorded output
# shows a progress bar followed by a "Throughputs" line.
trt_bm_fp32.benchmark()

100%|████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 36.23it/s]

Throughputs: 1158.5222





- TensorRT FP16

In [6]:
# TensorRT benchmark over the engine built with --fp16 (obj_16.engine),
# using the same iteration count, batch size and sample batch as the other runs.
# (The class name's spelling is the one exported by the `benchmark` module.)
trt_bm_fp16 = TensorRTBehcnmark(
    n_infers=n_infers,
    batch_size=batch_size,
    samples=input_image,
    engine_path='./models/obj_16.engine',
)

In [7]:
# Run the benchmark for the obj_16.engine TensorRT engine; the recorded output
# shows a progress bar followed by a "Throughputs" line.
trt_bm_fp16.benchmark()

100%|████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 53.86it/s]

Throughputs: 1722.2369



