## 1. Imports and Create Team

In [1]:
import sys
import os

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
from PIL import Image
from matplotlib import pyplot
import cv2
from datetime import datetime

# import pynq
import dac_sdc
from IPython.display import display

# Create the contest Team object under this team's name.
team_name = 'PCCC'
# NOTE(review): presumably the number of images the dac_sdc harness hands to
# the callback per batch -- confirm against dac_sdc before changing it.
dac_sdc.BATCH_SIZE = 2
team = dac_sdc.Team(team_name)

## 2. Preparing the library and model

In [2]:
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda 
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import tensorrt as trt

# CUDA source for `resizeImage`: each thread handles one output pixel (x, y).
# It picks the source pixel by truncating x * (input_width / output_width) and
# y * (input_height / output_height), i.e. nearest-neighbour sampling without
# interpolation, and copies the three interleaved channel values from the
# unsigned-char input to the float output. Values are converted to float but
# not rescaled. Threads that fall outside the output are skipped by the bounds
# check inside the kernel.
kernel_code = """
    __global__ void resizeImage(const unsigned char* input, float* output, int input_width, int input_height, int output_width, int output_height)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < output_width && y < output_height)
        {
            float scale_x = (float)input_width / output_width;
            float scale_y = (float)input_height / output_height;

            int src_x = (int)(x * scale_x);
            int src_y = (int)(y * scale_y);

            int src_index = (src_y * input_width + src_x) * 3;
            int dst_index = (y * output_width + x) * 3;

            output[dst_index] = (float)input[src_index];
            output[dst_index + 1] = (float)input[src_index + 1];
            output[dst_index + 2] = (float)input[src_index + 2];
        }
    }
"""
# Compile the kernel and keep a handle to its entry point.
mod = SourceModule(kernel_code)
resize_func = mod.get_function("resizeImage")
# Launch geometry: 16x16 threads per block, and enough blocks (ceil division)
# to cover a 640x384 output.
# NOTE(review): this grid is hard-coded for 640x384; an output larger than
# that in either dimension is only partially written when launched with it.
block_size = (16, 16, 1)
grid_size = ((640 - 1) // block_size[0] + 1, (384 - 1) // block_size[1] + 1, 1)

# 0606: inference with batch_size = 1 -- done
class RUNNER(object):
    """Batched TensorRT runner that resizes raw images on the GPU.

    Each image is copied into a shared device staging buffer, resized by the
    `resizeImage` kernel straight into its slot of the engine's input tensor,
    and the whole batch is then executed in one call.
    """

    # Size in bytes of the device staging buffer that receives one raw image
    # before it is resized (room for a 4096x4096 3-channel uint8 image).
    _STAGING_BYTES = 4096 * 4096 * 3

    def __init__(self, engine, batch_size):
        """Allocate host/device buffers for every binding of `engine`.

        Args:
            engine: deserialized TensorRT engine. Binding 0 is treated as the
                image input, and dims 1:3 of its shape are used as
                (height, width).
            batch_size: number of image slots allocated in the input tensor.
        """
        logger = trt.Logger(trt.Logger.WARNING)
        logger.min_severity = trt.Logger.Severity.ERROR
        trt.init_libnvinfer_plugins(logger, '')

        self.batch_size = batch_size
        self.context = engine.create_execution_context()
        self.imgsz = engine.get_binding_shape(0)[1:3]
        # Bytes of one resized float32 image (h * w * 3 channels * 4 bytes).
        self.inp_img_size = self.imgsz[0]*self.imgsz[1]*3*4
        self.inputs, self.outputs, self.bindings = [], [], []
        self.stream = cuda.Stream()

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            if engine.binding_is_input(binding):
                self.inp_size = size
                # A GPUArray is used so that one image slot can be addressed
                # by indexing the first axis.
                device_mem = gpuarray.GPUArray((batch_size, self.imgsz[0], self.imgsz[1], 3), np.float32)
                self.bindings.append(device_mem.ptr)
                self.inputs.append({'host': host_mem, 'device': device_mem})
            else:
                device_mem = cuda.mem_alloc(host_mem.nbytes)
                self.bindings.append(int(device_mem))
                self.outputs.append({'host': host_mem, 'device': device_mem})

        # Device staging buffer for the raw image, reused for every image.
        self.inp_temp = cuda.mem_alloc(self._STAGING_BYTES)

    def _infer(self, imgs, out_w, out_h):
        """Resize `imgs` on the GPU, run the engine once and fetch the outputs.

        Args:
            imgs: at most `batch_size` images as uint8 arrays whose first two
                dims are (height, width); the resize kernel reads three
                interleaved channels per pixel.
            out_w: width the images are resized to.
            out_h: height the images are resized to.

        Returns:
            (infer_num, data, w_ratio_list, h_ratio_list) where `infer_num` is
            len(imgs), `data` holds the host buffer of each output binding,
            and the ratio lists hold out_w / in_w and out_h / in_h per image.
            When fewer than `batch_size` images are given, the remaining input
            slots are not rewritten, so only the first `infer_num` results are
            meaningful.

        Raises:
            ValueError: if more images than `batch_size` are given, or an
                image does not fit into the staging buffer.
        """
        infer_num = len(imgs)
        if infer_num > self.batch_size:
            raise ValueError(
                "got %d images but the runner was built for batch_size=%d"
                % (infer_num, self.batch_size))

        # Derive the launch grid from the requested output size so that every
        # output pixel is covered whatever the engine's input resolution is.
        grid = (
            (int(out_w) + block_size[0] - 1) // block_size[0],
            (int(out_h) + block_size[1] - 1) // block_size[1],
            1,
        )

        h_ratio_list, w_ratio_list = [], []
        staged = []  # keeps the host arrays alive until the stream is drained

        for idx, img in enumerate(imgs):
            # The copy below needs one contiguous host buffer.
            img = np.ascontiguousarray(img)
            if img.nbytes > self._STAGING_BYTES:
                raise ValueError(
                    "image of %d bytes does not fit the %d byte staging buffer"
                    % (img.nbytes, self._STAGING_BYTES))
            staged.append(img)

            d_out = self.inputs[0]['device'][idx]
            inp_h, inp_w = img.shape[:2]
            h_ratio_list.append(out_h / inp_h)
            w_ratio_list.append(out_w / inp_w)

            # Copy and kernel are queued on the same stream, so the staging
            # buffer is never overwritten before the previous resize read it.
            cuda.memcpy_htod_async(self.inp_temp, img, self.stream)
            resize_func(self.inp_temp, d_out,
                        np.int32(inp_w), np.int32(inp_h),
                        np.int32(out_w), np.int32(out_h),
                        block=block_size, grid=grid, stream=self.stream)

        self.stream.synchronize()

        # Run inference.
        self.context.execute_async_v2(
            bindings=self.bindings,
            stream_handle=self.stream.handle
            )

        # Fetch outputs from the GPU and wait for the copies to finish.
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
        self.stream.synchronize()

        data = [out['host'] for out in self.outputs]
        return infer_num, data, w_ratio_list, h_ratio_list


def _get_engine(engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    logger.min_severity = trt.Logger.Severity.ERROR
    runtime = trt.Runtime(logger)
    trt.init_libnvinfer_plugins(logger,'') # initialize TensorRT plugins
    with open(engine_path, "rb") as f:
        serialized_engine = f.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    return engine

# Load the serialized engine and wrap it in a runner.
batch_size = 2
engine_path = './PCCC.trt'
engine = _get_engine(engine_path)
runner = RUNNER(engine, batch_size)
size_h, size_w = runner.imgsz
print("input size is w:%d, h:%d" % (size_w, size_h))
dtype = np.float32

# Warm up: push an all-zero batch at the engine's input size through the
# runner 50 times before the real run starts.
print("warmuping")
dummy_inp = []
for _ in range(batch_size):
    dummy_inp.append(np.zeros(shape=(size_h, size_w, 3), dtype=np.uint8))
for _ in range(50):
    runner._infer(dummy_inp, size_w, size_h)
print("warmup done")


input...
6266880
<class 'numpy.float32'>
output...
913920
<class 'numpy.float32'>
output...
522240
<class 'numpy.float32'>
output...
130560
<class 'numpy.float32'>
output...
228480
<class 'numpy.float32'>
output...
130560
<class 'numpy.float32'>
output...
32640
<class 'numpy.float32'>
output...
57120
<class 'numpy.float32'>
output...
32640
<class 'numpy.float32'>
output...
8160
<class 'numpy.float32'>
output...
14280
<class 'numpy.float32'>
output...
8160
<class 'numpy.float32'>
output...
2040
<class 'numpy.float32'>


## 3. Python Callback Function and Helper Functions

In [5]:
def my_callback(rgb_imgs):
    """Run detection on a list of images and format the boxes per image.

    Args:
        rgb_imgs: sequence of (path, image) pairs; `path.name` becomes the key
            of the image in the result.

    Returns:
        dict mapping each image name to a list of detections, each a dict with
        "type", "x", "y", "width" and "height" (integers, in the coordinates
        of the original image).
    """
    # Network class index (as a string) -> reported type id.
    type_mapping = {"0": 1, "1": 2, "2": 3, "3": 7, "4": 6, "5":4, "6": 5}
    results = {}
    total = len(rgb_imgs)

    # Walk the images in chunks of at most `batch_size`.
    for chunk_start in range(0, total, batch_size):
        chunk_stop = min(total, chunk_start + batch_size)
        chunk = [rgb_imgs[i] for i in range(chunk_start, chunk_stop)]
        chunk_paths = [path for path, _ in chunk]
        chunk_imgs = [frame for _, frame in chunk]

        infer_num, res, ratio_w_list, ratio_h_list = runner._infer(chunk_imgs, size_w, size_h)
        num, raw_boxes, _raw_scores, raw_cls = res

        # The engine outputs are flat; view them as 100 candidates per slot.
        boxes = raw_boxes.reshape(batch_size, 100, 4)
        classes = raw_cls.reshape(batch_size, 100)

        for slot in range(infer_num):
            count = num[slot]
            rw, rh = ratio_w_list[slot], ratio_h_list[slot]
            # Dividing by the resize ratios maps corner coordinates back to
            # the original image.
            scale = np.asarray([rw, rh, rw, rh]).reshape(-1, 4)
            scaled = boxes[slot, :count] / scale
            labels = classes[slot, :count]

            results[chunk_paths[slot].name] = [
                {
                    "type": type_mapping[str(int(labels[k]))],
                    "x": int(scaled[k, 0]),
                    "y": int(scaled[k, 1]),
                    "width": int((scaled[k, 2] - scaled[k, 0])),
                    "height": int((scaled[k, 3] - scaled[k, 1]))
                }
                for k in range(count)
            ]

    return results

## 4. Running Object Detection

In [6]:
# Pass the detection callback to the team object with debug output enabled.
# NOTE(review): presumably team.run feeds the image batches to my_callback and
# records its predictions and timing -- confirm against dac_sdc.
team.run(my_callback, debug=True)

Batch 1 starting. 4 images.
Batch 1 done. Runtime = 1.9253458976745605 seconds.
Batch 2 starting. 1 images.
Batch 2 done. Runtime = 1.3250751495361328 seconds.
Done all batches. Total runtime = 3.2504210472106934 seconds. Total energy = 0 J.
{'00001.jpg': [{'type': 0, 'x': 1174, 'y': 642, 'width': 455, 'height': 249}, {'type': 0, 'x': 828, 'y': 625, 'width': 73, 'height': 60}, {'type': 1, 'x': 755, 'y': 625, 'width': 42, 'height': 54}, {'type': 0, 'x': 110, 'y': 553, 'width': 168, 'height': 104}, {'type': 0, 'x': 807, 'y': 623, 'width': 40, 'height': 49}, {'type': 1, 'x': 730, 'y': 620, 'width': 24, 'height': 33}, {'type': 0, 'x': 1282, 'y': 611, 'width': 34, 'height': 41}, {'type': 0, 'x': 612, 'y': 618, 'width': 27, 'height': 21}], '00002.jpg': [{'type': 0, 'x': 771, 'y': 490, 'width': 168, 'height': 157}, {'type': 0, 'x': 386, 'y': 477, 'width': 222, 'height': 152}, {'type': 0, 'x': 1150, 'y': 55, 'width': 747, 'height': 831}, {'type': 0, 'x': 738, 'y': 553, 'width': 42, 'height': 7