In [9]:
import onnxruntime as ort
import onnx
import torch
import openvino as ov
import tensorrt as trt
from timm import create_model
import numpy as np

In [2]:
# Report whether PyTorch can see a usable CUDA device; later cells move the
# model and the dummy input to DEVICE.
torch.cuda.is_available()

True

In [11]:
# Notebook-wide configuration.
# DEVICE: prefer the GPU, but fall back to the CPU so the notebook still runs
# top-to-bottom on a machine without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# File the ONNX export is written to and later loaded from.
ONNX_MODEL_NAME = 'onnx_model.onnx'
# NOTE(review): BATCH_SIZE is not referenced by any of the cells visible here
# (the dummy input uses a batch of 1) — confirm whether later cells use it.
BATCH_SIZE = 8

In [4]:
# Build a ResNet-50 via timm, switch it to inference mode and move it to DEVICE.
# No `pretrained` argument is passed, so timm's default for it applies.
# `eval()` and `to()` both return the module itself, so the calls are chained
# rather than assigned to `_`: rebinding `_` in a notebook shadows IPython's
# "last output" variable.
torch_model = create_model(model_name='resnet50').eval().to(DEVICE)

In [34]:
# Random single-image batch (1 x 3 x 224 x 224) created directly on DEVICE; it is
# reused as the benchmark input and as the example input for the ONNX export.
dummy_input = torch.rand((1, 3, 224, 224), device=DEVICE)

In [36]:
%%timeit
# Time one PyTorch forward pass on `dummy_input` with autograd disabled.
# The result is copied to host memory (`.cpu()`) and converted to NumPy inside
# the timed body, so the measurement includes that device-to-host transfer.
# NOTE(review): names assigned inside a %%timeit cell are normally not kept in
# the notebook namespace, so `torch_output` is presumably unavailable afterwards.
with torch.no_grad():
    torch_output = torch_model(dummy_input).cpu().detach().numpy()

2.36 ms ± 27.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Export the PyTorch model to ONNX_MODEL_NAME, using `dummy_input` as the
# example input. The graph's input/output tensors are named 'input' and
# 'output'; the ONNX Runtime cells rely on these names.
torch.onnx.export(
    model=torch_model,
    args=(dummy_input,),
    f=ONNX_MODEL_NAME,
    input_names=['input'],
    output_names=['output'],
)

In [6]:
# Load the exported file back from disk and run the ONNX checker on it
# (check_model raises if the model is found to be invalid).
onnx_model = onnx.load(ONNX_MODEL_NAME)
onnx.checker.check_model(onnx_model)
# Uncomment the next line to dump a textual view of the graph.
# print(onnx.printer.to_text(onnx_model.graph))

In [7]:
# Show which execution providers this onnxruntime installation reports as available.
print(ort.get_available_providers())

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [8]:
# Execution providers requested for the session. Only the CUDA provider is
# listed; append 'CPUExecutionProvider' to also request the CPU provider.
providers = ['CUDAExecutionProvider']

# Create an inference session for the exported ONNX file.
ort_session = ort.InferenceSession(ONNX_MODEL_NAME, providers=providers)

# Print the graph's input and output tensor names as reported by the session.
print([node.name for node in ort_session.get_inputs()])
print([node.name for node in ort_session.get_outputs()])

['input']
['output']


In [31]:
# Host-side NumPy copy of the dummy tensor, fed to ONNX Runtime.
onnx_input = dummy_input.detach().cpu().numpy()

# Key the feed dict by the session's first (here: only) input name.
first_input_name = ort_session.get_inputs()[0].name
ort_inputs = {first_input_name: onnx_input}

# Passing None as output_names requests every model output; run() returns a list.
ort_outputs = ort_session.run(output_names=None, input_feed=ort_inputs)

In [33]:
%%timeit
# Time one ONNX Runtime inference call and take the first output.
# `ort_inputs` was built in an earlier cell from a host-side NumPy array.
# NOTE(review): with a GPU execution provider each call presumably also pays
# for copying the input to the device and the output back — confirm when
# comparing against the PyTorch timing.
ort_outputs = ort_session.run(None, ort_inputs)[0]

1.05 ms ± 1.32 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
