In [1]:

from random import randint
from PIL import Image
import numpy as np
import pickle
import pycuda.driver as cuda
import pycuda.autoinit
import uff
import tensorrt as trt
import ctypes

from matplotlib import pyplot as plt
from six.moves import cPickle
import time

In [2]:
def do_inference(context, h_input, d_input, h_output, d_output, stream):
    """Run one asynchronous inference pass and wait for it to complete.

    The host input is copied to the device, the execution context is asked to
    enqueue inference on ``stream``, the device output is copied back into
    ``h_output``, and the stream is synchronized before returning.

    Args:
        context: execution context exposing ``execute_async(bindings=...,
            stream_handle=...)``.
        h_input: host buffer holding the input data.
        d_input: device allocation receiving the input; passed as ``int(d_input)``
            in the bindings list.
        h_output: host buffer that receives the predictions (filled in place).
        d_output: device allocation holding the output; passed as
            ``int(d_output)`` in the bindings list.
        stream: stream object providing ``handle`` and ``synchronize()``.

    Returns:
        None. Results are written into ``h_output``.

    Raises:
        RuntimeError: if ``execute_async`` reports that inference could not be
            enqueued. ``h_output`` is left untouched in that case.
    """
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    inference_success = context.execute_async(
        bindings=[int(d_input), int(d_output)],
        stream_handle=stream.handle,
    )
    if not inference_success:
        # Drain the already-queued upload so the stream is idle before failing;
        # skipping the download keeps stale device data out of h_output.
        stream.synchronize()
        raise RuntimeError("execute_async failed to enqueue inference")
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Block until the copies and the inference have all finished.
    stream.synchronize()


In [3]:
# Allocate host and device buffers, and create a stream.
def allocate_buffers(engine):
    """Create the host/device buffer pairs and the stream used for inference.

    Binding 0 of ``engine`` is treated as the input and binding 1 as the
    output. Each host buffer is a flat page-locked array with one element per
    entry of the corresponding binding shape, using the numpy dtype that
    ``trt.nptype(trt.float32)`` maps to.

    Args:
        engine: object exposing ``get_binding_shape(index)``.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    def _pinned_host_buffer(binding_index):
        # Page-locked memory (i.e. won't be swapped to disk), sized from the binding shape.
        element_count = trt.volume(engine.get_binding_shape(binding_index))
        return cuda.pagelocked_empty(element_count, dtype=trt.nptype(trt.float32))

    h_input, h_output = _pinned_host_buffer(0), _pinned_host_buffer(1)

    # Device allocations mirror the byte size of their host counterparts.
    d_input, d_output = [cuda.mem_alloc(host_buf.nbytes) for host_buf in (h_input, h_output)]

    # Stream on which copies and inference are queued.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream

In [4]:
def load_CIFAR10_dataset(file_name):
    """Load one pickled dataset batch and scale its pixel data to [0, 1].

    Works on both Python 2 and Python 3. On Python 3 the pickle is read with
    ``encoding='bytes'`` so that batches written by Python 2 (byte-string keys
    and binary payloads) load without ``UnicodeDecodeError``; byte-string keys
    are then decoded as UTF-8, while keys that are already text are kept as is.

    Args:
        file_name: path of the pickled batch file. The unpickled object must
            be a dict containing ``data`` and ``labels`` entries.

    Returns:
        Tuple ``(raw_float_data, labels)`` where ``raw_float_data`` is the
        ``data`` entry converted to a float numpy array and divided by 255.0,
        and ``labels`` is the ``labels`` entry unchanged.

    Raises:
        KeyError: if the batch has no ``data`` or ``labels`` entry.
    """
    with open(file_name, 'rb') as f:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # batch files obtained from a trusted source.
        if bytes is str:
            # Python 2: load() takes no encoding argument and yields str keys.
            batch = cPickle.load(f)
        else:
            # Python 3: keep Python 2 byte strings as bytes instead of trying
            # to decode them as ASCII.
            batch = cPickle.load(f, encoding='bytes')

    # Normalize keys to text so lookups work regardless of how the file was written.
    batch = {
        (key.decode('utf8') if isinstance(key, bytes) else key): value
        for key, value in batch.items()
    }

    raw_float_data = np.array(batch['data'], dtype=float) / 255.0
    return raw_float_data, batch['labels']


In [16]:

# Benchmark: time repeated single-image inference with a serialized TensorRT engine.

serialized_engine = "fp32.engine"
file_name = "/home/vtpc/Documents/Alvils/tensorrt/cifar-10-batches-py/test_batch"
IMAGE_INDEX = 1           # test image fed to the network on every iteration
NUM_ITERATIONS = 1000000  # number of timed copy + inference rounds

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)

# Load the test batch and reshape the array to 4 dimensions (N, 3, 32, 32).
imgs, labels = load_CIFAR10_dataset(file_name)
processed_imgs = imgs.reshape([-1, 3, 32, 32]).astype(trt.nptype(trt.float32))

# Flatten only the selected image. Flattening the whole dataset first would
# make processed_imgs[IMAGE_INDEX] a single pixel value, which np.copyto would
# then broadcast across the entire input buffer.
input_image = processed_imgs[IMAGE_INDEX].ravel()

# A single runtime is enough to deserialize the engine.
with open(serialized_engine, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)
    with engine.create_execution_context() as context:
        t0 = time.time()
        for iteration in range(NUM_ITERATIONS):
            # The host-side copy stays inside the loop so that it remains part
            # of the measured time; copyto raises if the image size does not
            # match the engine's input buffer.
            np.copyto(h_input, input_image)
            do_inference(context, h_input, d_input, h_output, d_output, stream)
        t1 = time.time()
        total_t = t1 - t0

        # Total wall-clock seconds for NUM_ITERATIONS rounds.
        print(total_t)


324.187404871
