In [1]:
from random import randint
from PIL import Image
import numpy as np
import pickle
import pycuda.driver as cuda
import pycuda.autoinit
import uff
import tensorrt as trt
import ctypes

from matplotlib import pyplot as plt
from six.moves import cPickle
import time
import tensorflow as tf
import logging

import caffe
import skimage.transform



In [2]:
def prep_image(im, mean_val):
    h, w, _ = im.shape
    if h < w:
        im = skimage.transform.resize(im, (224, w*224/h), preserve_range=True)
    else:
        im = skimage.transform.resize(im, (h*224/w, 224), preserve_range=True)
    # Central crop to 224x224
    h, w, _ = im.shape
    im = im[h//2-112:h//2+112, w//2-112:w//2+112]      
    #shuffle axes
    im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)
    # convert to BGR
    im = im[::-1, :, :]
    # scales back to 0 ... 255 (caffe loads images as 0 ... 1)
    im = im * 255.0
    im = im - mean_val
    return im

In [15]:
def inference(engine, imgs):
    with engine.create_execution_context() as context:
        batch_size = engine.max_batch_size; 
        nb_of_imgs = len(imgs); 
        nb_of_batches = calculateNbOfBatches(batch_size, nb_of_imgs); 
        nb_of_predictions = 1000; # predictions per img
        results = []
        stream = cuda.Stream()
        for i in range(0, nb_of_batches):
            imgs_index = calculateImgIndicesToUseForInference(batch_size, nb_of_batches, i, nb_of_imgs);
            current_batch_size = imgs_index[1] - imgs_index[0];
            raveled_imgs = imgs[imgs_index[0] : imgs_index[1]].ravel();
            
            # Preallocated host and device data
            output = np.empty(nb_of_predictions * current_batch_size, dtype = np.float32)
            d_input = cuda.mem_alloc(1 * raveled_imgs.nbytes)
            d_output = cuda.mem_alloc(1 * output.nbytes)
            
            # creates device binding to assign to inference
            bindings=[d_input, d_output]
            
            #copies host image data to device array
            cuda.memcpy_htod_async(d_input, raveled_imgs, stream)
            
            # inference
            context.execute_async(bindings = bindings, stream_handle=stream.handle, batch_size = current_batch_size)
            
            # copies device output back to host output
            cuda.memcpy_dtoh_async(output, d_output, stream)
            
            #synchronizes stream before appending results
            stream.synchronize()
            
            results = np.append(results, output)
        return results.reshape(-1, nb_of_predictions)
 

In [6]:
# img loading
img_load_count = 2000
img_path = "/home/vtpc/Documents/Alvils/tensorrt/data/ilsvrc12/imgs/"
img_names_and_labels_path = '/home/vtpc/Documents/Alvils/tensorrt/data/ilsvrc12/val.txt'
imgs_file_names_and_labels = np.loadtxt(img_names_and_labels_path,  dtype=str)
# img loading
imgs = []
labels = []
for i in range(0, img_load_count):
    imgs.append(caffe.io.load_image(img_path + imgs_file_names_and_labels[i][0]))
    labels.append(imgs_file_names_and_labels[i][1])



  warn('`as_grey` has been deprecated in favor of `as_gray`')


In [9]:
# preprocess
mean_values = np.array([104, 117, 123]).reshape((3,1,1))
imgs_transformed = [];
for img in imgs:
    imgs_transformed.append(prep_image(img, mean_values))
    
imgs_raveled = np.zeros((img_load_count, 224 * 224 * 3), dtype= np.float32)
for i in range(0, len(imgs_transformed)):
    imgs_raveled[i] = imgs_transformed[i].ravel()
    



In [14]:
trt_engine_path = "int8.engine"
engine = loadEngine(trt_engine_path)
inference(engine, imgs_raveled, labels)


150
Time without transfer: 0.0322749614716
Time without transfer: 0.031848192215
Time without transfer: 0.0318028926849
Time without transfer: 0.0305640697479
Time without transfer: 0.0290699005127
Time without transfer: 0.0291080474854
Time without transfer: 0.0290269851685
Time without transfer: 0.0285038948059
Time without transfer: 0.0281829833984
Time without transfer: 0.0282139778137
Time without transfer: 0.0289361476898
Time without transfer: 0.0278789997101
Time without transfer: 0.0277390480042
Time without transfer: 0.00987792015076
Time with transfer: 0.601088047028
0.6755
