## TensorRT Tool
- Built-in package in the PyTorch container
- Simple one-line command!

#### Convert to TensorRT

In [None]:
# Build a TensorRT engine from the exported ONNX model and save it as model.engine.
!trtexec --onnx=../MONAICore/model.onnx --saveEngine=model.engine 

#### Convert to TensorRT with FP16

In [None]:
# Same conversion with the --fp16 flag; the engine is saved separately as model_fp16.engine.
!trtexec --onnx=../MONAICore/model.onnx --saveEngine=model_fp16.engine --fp16 

## Benchmark

In [None]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time
import monai
import torch
import matplotlib.pyplot as plt

# Benchmark configuration shared by every backend below.
n_infers = 100   # number of timed forward passes per backend
batch_size = 32  # images per forward pass

# Seeded so that every run (and every backend) sees the same synthetic input.
# Generated directly as float32: the default float64 would double the host/device
# copy size and would not match a float32 input binding when the raw bytes are
# copied to the GPU for TensorRT.
input_image = np.random.default_rng(0).standard_normal(
    size=(batch_size, 3, 256, 256), dtype=np.float32
)

- PyTorch

In [None]:
# Rebuild the SegResNet architecture on the GPU and restore the trained weights.
device = torch.device("cuda")
model = monai.networks.nets.SegResNet(
    spatial_dims=2,
    in_channels=3,
    out_channels=1,
    dropout_prob=.5
).to(device)
# map_location pins the checkpoint tensors to `device`, so loading does not depend
# on which device the checkpoint happened to be saved from.
state_dict = torch.load('../MONAICore/checkpoints/best.pt', map_location=device)
model.load_state_dict(state_dict)
# Inference mode for layers such as dropout.
model.eval()
# Host-side input tensor; it is moved to the GPU inside the benchmark loop.
img_tensor = torch.Tensor(input_image)

In [None]:
# Untimed warm-up pass so one-off start-up cost is not counted in the measurement.
with torch.no_grad():
    model(img_tensor.to(device))
# CUDA work is queued asynchronously; wait until the GPU is idle before starting the clock.
torch.cuda.synchronize()

t1 = time.perf_counter()
with torch.no_grad():
    for i in range(n_infers):
        # The host-to-device copy is deliberately part of the timed loop.
        model(img_tensor.to(device))
# Wait for all queued GPU work to finish; otherwise the elapsed time could
# exclude kernels that are still running when the clock is read.
torch.cuda.synchronize()
t2 = time.perf_counter()
throughputs_torch = batch_size*n_infers/(t2-t1)  # images per second
print('Throughputs:', round(throughputs_torch, 4))

- TensorRT FP32

In [None]:
def TRT_setup(engine_path='./model.engine', input_array=None):
    """Deserialize a TensorRT engine and allocate its host and device buffers.

    Assumes the engine exposes one input and one output binding; if there are
    several, only the buffers of the last input/output seen are returned.

    Args:
        engine_path: Path to a serialized TensorRT engine file.
        input_array: Array to stage as the engine input. When omitted, the
            notebook-level ``input_image`` is used.

    Returns:
        Tuple ``(context, input_buffer, input_memory, output_buffer,
        output_memory, bindings)``: the execution context, the page-locked
        host input, its device allocation, the page-locked host output, its
        device allocation, and the list of device pointers in binding order.
    """
    if input_array is None:
        input_array = input_image
    host_input = np.asarray(input_array)

    TRT_LOGGER = trt.Logger()
    trt.init_libnvinfer_plugins(None, '')
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    context = engine.create_execution_context()
    bindings = []
    for binding in engine:
        binding_idx = engine.get_binding_index(binding)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        if engine.binding_is_input(binding):
            # Stage the input in page-locked memory (needed for the async copy in
            # infer) and convert it to the binding dtype, so the bytes copied to
            # the device are what the engine expects (e.g. float64 -> float32).
            # NOTE(review): the input volume is not checked against the binding
            # shape - confirm the engine was built for this batch size.
            input_buffer = cuda.pagelocked_empty(host_input.shape, dtype)
            input_buffer[...] = host_input
            input_memory = cuda.mem_alloc(input_buffer.nbytes)
            bindings.append(int(input_memory))
        else:
            # Flat page-locked host buffer sized from the binding shape.
            size = trt.volume(context.get_binding_shape(binding_idx))
            output_buffer = cuda.pagelocked_empty(size, dtype)
            output_memory = cuda.mem_alloc(output_buffer.nbytes)
            bindings.append(int(output_memory))
    return context, input_buffer, input_memory, output_buffer, output_memory, bindings
    
def infer(context, input_buffer, input_memory, output_buffer, output_memory, bindings):
    """Run one inference pass through a TensorRT execution context.

    Copies ``input_buffer`` to ``input_memory``, executes ``context`` with
    ``bindings``, copies ``output_memory`` back into ``output_buffer`` and
    waits for the stream to finish.

    Returns:
        ``output_buffer``, filled in place with the prediction.
    """
    cuda_stream = cuda.Stream()

    # Host -> device: upload the input.
    cuda.memcpy_htod_async(input_memory, input_buffer, cuda_stream)

    # Enqueue the engine execution on the same stream.
    context.execute_async_v2(bindings=bindings, stream_handle=cuda_stream.handle)

    # Device -> host: download the prediction.
    cuda.memcpy_dtoh_async(output_buffer, output_memory, cuda_stream)

    # Block until the copies and the execution have completed.
    cuda_stream.synchronize()
    return output_buffer

In [None]:
# Load the FP32 engine and allocate the buffers used by the benchmark below.
(
    context,
    input_buffer,
    input_memory,
    output_buffer,
    output_memory,
    bindings,
) = TRT_setup('./model.engine')

In [None]:
# Untimed warm-up pass so first-call overhead is not counted in the measurement.
pred = infer(context, input_buffer, input_memory, output_buffer, output_memory, bindings)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
# infer() synchronizes its stream before returning, so each iteration is fully timed.
t1 = time.perf_counter()
for i in range(n_infers):
    pred = infer(context, input_buffer, input_memory, output_buffer, output_memory, bindings)
t2 = time.perf_counter()
throughputs_trtfp32 = batch_size*n_infers/(t2-t1)  # images per second
print('Throughputs:', round(throughputs_trtfp32, 4))

- TensorRT FP16

In [None]:
# Load the FP16 engine; this rebinds the context and buffers used by the FP32 run.
(
    context,
    input_buffer,
    input_memory,
    output_buffer,
    output_memory,
    bindings,
) = TRT_setup('./model_fp16.engine')

In [None]:
# Untimed warm-up pass so first-call overhead is not counted in the measurement.
pred = infer(context, input_buffer, input_memory, output_buffer, output_memory, bindings)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
# infer() synchronizes its stream before returning, so each iteration is fully timed.
t1 = time.perf_counter()
for i in range(n_infers):
    pred = infer(context, input_buffer, input_memory, output_buffer, output_memory, bindings)
t2 = time.perf_counter()
throughputs_trtfp16 = batch_size*n_infers/(t2-t1)  # images per second
print('Throughputs:', round(throughputs_trtfp16, 4))

## Exercise
Try `trtexec` yourself and explore the tool's configuration options.

In [None]:
# Print trtexec's help text to see the available options.
!trtexec -h