In [None]:
#PIP安装
!python3 -m pip install --upgrade setuptools pip
!python3 -m pip install nvidia-pyindex
!pip install polygraphy
!python3 -m pip install --upgrade nvidia-tensorrt

!python3 -m pip install colored
!pip install onnx-graphsurgeon
!pip install onnxruntime
#!pip cuda-python
#!pip install pycuda --default-timeout=100 -i https://pypi.tuna.tsinghua.edu.cn/simple/

# 自定义网络

In [None]:
# Imports
import tensorrt as trt
#print(trt.__version__)
# Sanity check: constructing a Builder with a default Logger must yield a truthy object.
assert trt.Builder(trt.Logger())
from PIL import Image
import numpy as np

### 过程
    1.创建Builder
    2.创建Network
    3.创建Parser
    4.绑定输入、输出及自定义组件
    5.序列化或者反序列化
 
    6.传输计算数据
    7.执行计算
    8.传输计算结果


In [None]:
# Logger and builder
logger     = trt.Logger(trt.Logger.INFO)
builder    = trt.Builder(logger)

# The three core objects: network definition, builder config, optimization profile
network    = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))  # dynamic-shape (explicit batch) mode
config     = builder.create_builder_config()
profile    = builder.create_optimization_profile()

# Builder settings
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)     # workspace TensorRT may use while trying optimizations, in bytes
# FP16 mode
#config.flags= 1<<int(trt.BuilderFlag.FP16)

# Precision can also be set for a specific layer
#layer.precision = trt.float32
# Hand-written network
# Declare the input; dynamic dimensions (-1) are declared here
inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1]) 

#                                   min shape  opt shape  max shape
profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10]) 
config.add_optimization_profile(profile)

# Layers; the single layer here is an identity layer fed by the input tensor
identityLayer  = network.add_identity(inputTensor)

# Mark the layer's first output as the network output
network.mark_output(identityLayer.get_output(0))

# INFERENCE

In [None]:
# Inference with the tensor-name based API (set_input_shape / set_tensor_address /
# execute_async_v3). The index-based "binding" calls are not used anywhere in
# this cell, so one consistent API drives the whole flow.
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- imported for its side effect (CUDA context setup)

# Build a serialized engine from the network defined above, then deserialize it.
# engine = builder.build_engine(network, config)   # legacy one-step build
engineString = builder.build_serialized_network(network, config)
if engineString is None:
    raise RuntimeError("build_serialized_network failed; check the TensorRT log output")
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
if engine is None:
    raise RuntimeError("deserialize_cuda_engine failed; check the TensorRT log output")

# Execution context (holds per-inference state) and the CUDA stream used for copies/launch.
context = engine.create_execution_context()
stream = cuda.Stream()

# Host-side input data. torch tensors could be used instead of numpy arrays.
data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)

# Enumerate the engine's I/O tensors and count the inputs.
# The buffers below assume inputs are listed before outputs: [in0, in1, ..., out0, out1, ...].
tensorNames = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
nInput = sum(engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT for name in tensorNames)
nOutput = len(tensorNames) - nInput

# A dynamic-shape input needs its real shape set before output shapes can be queried.
# The name comes from the engine ('inputT0' for the network above) and the shape
# from the data that is actually fed in.
context.set_input_shape(tensorNames[0], data.shape)

# Host buffers: the input first, then one empty array per output.
bufferH = [np.ascontiguousarray(data.reshape(-1))]
for name in tensorNames[nInput:]:
    bufferH.append(np.empty(tuple(context.get_tensor_shape(name)),
                            dtype=trt.nptype(engine.get_tensor_dtype(name))))

# Device buffers, one per host buffer.
bufferD = [cuda.mem_alloc(hostBuffer.nbytes) for hostBuffer in bufferH]

# Host -> device copies for the inputs.
for i in range(nInput):
    cuda.memcpy_htod_async(bufferD[i], bufferH[i], stream)

# Bind every I/O tensor to its device address and launch on the same stream.
for name, deviceBuffer in zip(tensorNames, bufferD):
    context.set_tensor_address(name, int(deviceBuffer))
context.execute_async_v3(stream.handle)

# Device -> host copies for the outputs, then wait for all queued work to finish.
for i in range(nInput, nInput + nOutput):
    cuda.memcpy_dtoh_async(bufferH[i], bufferD[i], stream)
stream.synchronize()

# Release device memory now that the results are on the host.
for deviceBuffer in bufferD:
    deviceBuffer.free()

# bufferH already holds numpy arrays, so the outputs are simply the tail of the list.
outputs = bufferH[nInput:]
outputH0 = outputs[0]
print('data:',data.shape,'\n',data)
print('outputH0',outputH0.shape,'\n',outputH0)

In [None]:
# Inference for TensorRT 10.2 and later, using torch CUDA tensors as I/O buffers.
# NOTE(review): the shapes below (32x3x224x224 in, 32x1000 out) presumably target an
# image-classification engine rather than the identity engine built earlier —
# confirm that `engine`/`context` refer to the intended model.
import torch

stream = cuda.Stream()
a = torch.rand(32, 3, 224, 224).cuda()  # input buffer on the GPU
b = torch.rand(32, 1000).cuda()         # output buffer on the GPU (overwritten by the engine)

# engine.num_io_tensors gives the number of I/O tensors; index 0 is used as the
# input and index 1 as the output here.
inputName = engine.get_tensor_name(0)
outputName = engine.get_tensor_name(1)

# The input shape is taken from the buffer that is actually bound, so the two
# cannot disagree.
context.set_input_shape(inputName, tuple(a.shape))
context.set_tensor_address(inputName, a.data_ptr())    # input
context.set_tensor_address(outputName, b.data_ptr())   # output
context.execute_async_v3(stream.handle)
# The launch is asynchronous; wait before reading `b`.
stream.synchronize()

# 细化网络处理

In [None]:
# Logger used to inspect TensorRT messages; it can be shared between objects.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Build a TensorRT network by hand (conv -> pool -> conv -> pool -> fc -> relu -> fc).
#   * the builder is the entry point for constructing the network
#   * the network is created from the builder
#   * build parameters are all stored on the builder config
# The three context managers are joined with backslash continuations; comments
# cannot sit between the continued lines, so they are listed above instead.
# NOTE(review): add_convolution / add_pooling / add_fully_connected are legacy layer
# APIs — confirm they exist in the installed TensorRT version.
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network() as network, \
        builder.create_builder_config() as config:
    # Workspace limit, set the same way as in the other cells of this notebook.
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)

    # Declare the input; the same name is needed again once the engine is built.
    input_tensor = network.add_input(name=INPUT_NAME,    # input name
                                     dtype=trt.float32,  # input dtype
                                     shape=INPUT_SHAPE)  # input shape

    # --------------------------- Add a convolution layer ----------------------------
    conv1_w = weights['conv1.weight'].numpy()  # pull the parameters out of the weights
    conv1_b = weights['conv1.bias'].numpy()
    conv1 = network.add_convolution(input=input_tensor,  # output of the previous stage
                                    num_output_maps=20,
                                    kernel_shape=(5, 5),
                                    kernel=conv1_w,
                                    bias=conv1_b)  # 5x5 convolution
    conv1.stride = (1, 1)

    # ---------------------------------------------------------------------------------
    # conv1.get_output(0) is the output tensor of the previous layer.
    pool1 = network.add_pooling(input=conv1.get_output(0),
                                type=trt.PoolingType.MAX,
                                window_size=(2, 2))
    pool1.stride = (2, 2)

    # ---------------------------------------------------------------------------------
    conv2_w = weights['conv2.weight'].numpy()
    conv2_b = weights['conv2.bias'].numpy()
    conv2 = network.add_convolution(pool1.get_output(0),
                                    50, (5, 5),
                                    conv2_w,
                                    conv2_b)
    conv2.stride = (1, 1)

    # ---------------------------------------------------------------------------------
    pool2 = network.add_pooling(conv2.get_output(0), trt.PoolingType.MAX, (2, 2))
    pool2.stride = (2, 2)

    # ---------------------------------------------------------------------------------
    fc1_w = weights['fc1.weight'].numpy()
    fc1_b = weights['fc1.bias'].numpy()
    fc1 = network.add_fully_connected(input=pool2.get_output(0),
                                      num_outputs=500,
                                      kernel=fc1_w,
                                      bias=fc1_b)

    # ---------------------------------------------------------------------------------
    relu1 = network.add_activation(fc1.get_output(0),
                                   trt.ActivationType.RELU)

    # ---------------------------------------------------------------------------------
    fc2_w = weights['fc2.weight'].numpy()
    fc2_b = weights['fc2.bias'].numpy()
    fc2 = network.add_fully_connected(relu1.get_output(0),
                                      OUTPUT_SIZE, fc2_w, fc2_b)
    fc2.get_output(0).name = OUTPUT_NAME

    # ---------------------------------------------------------------------------------
    # Mark the network output; this tensor is also needed later at inference time.
    network.mark_output(fc2.get_output(0))

In [None]:
# Print per-layer information for the network definition above
# (the network as authored, not the layers of the serialized network).
for i in range(network.num_layers):
    layer = network.get_layer(i)
    # The [10:] slice drops the first 10 characters of the enum's string form
    # (presumably the "LayerType." prefix).
    print(i, "%s,in=%d,out=%d,%s" % (str(layer.type)[10:], layer.num_inputs, layer.num_outputs, layer.name))
    for j in range(layer.num_inputs):
        tensor = layer.get_input(j)
        # Identity check: a missing input slot is reported as None.
        if tensor is None:
            print("\tInput  %2d:" % j, "None")
        else:
            # The [9:] slice drops the first 9 characters (presumably "DataType.").
            print("\tInput  %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
    for j in range(layer.num_outputs):
        tensor = layer.get_output(j)
        if tensor is None:
            print("\tOutput %2d:" % j, "None")
        else:
            print("\tOutput %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))

# 动态输入

In [None]:
# Dynamic-shape recipe: declare -1 dimensions, give the builder a shape range,
# then set the concrete shape on the execution context before running.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profile = builder.create_optimization_profile()
# Workspace limit, set the same way as in the first build cell.
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])  # declare the input tensor
#                                   min,       typical,   max
profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10])  # dynamic-shape range of the input
config.add_optimization_profile(profile)

# In dynamic-shape mode the real data shape must be set on the context; the tensor
# is addressed by name, matching the inference cell.
# NOTE(review): `context` must belong to an engine built from a network with this input.
context.set_input_shape(inputTensor.name, [3, 4, 5])


# 快速解析onnx格式模型

In [None]:
# Create a parser bound to the network, then parse the ONNX model into it.
parser = trt.OnnxParser(network, logger)
with open(onnxFile, 'rb') as model:
    # parse() reports failure through its return value; surface the parser's
    # errors instead of silently building from a partially populated network.
    if not parser.parse(model.read()):
        for errorIndex in range(parser.num_errors):
            print(parser.get_error(errorIndex))
        raise RuntimeError("Failed to parse ONNX file: %s" % onnxFile)

# Build and save the result as an engine file.
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError("build_serialized_network failed; check the TensorRT log output")
with open('sample.engine', 'wb') as f:
    # A .plan file extension can be used as well.
    f.write(serialized_engine)

In [None]:
# Register a shape range for the network's first input on the optimization profile.
# The three tuples follow the min / opt / max order used in the earlier cells.
inputTensor = network.get_input(0)
profile.set_shape(inputTensor.name, (1, 1, 28, 28), (4, 1, 28, 28), (16, 1, 28, 28))
config.add_optimization_profile(profile)

network.unmark_output(network.get_output(0))  # remove the output tensor 'y'

# 可以保存的模型

In [None]:
# Save the serialized engine to disk (binary write mode).
with open('tmp.plan', 'wb') as f:
    f.write(engineString)

# Deserialize: read the bytes back and let a Runtime create the engine from them.
with open('tmp.plan', 'rb') as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# 自定义网络层
    这个只是使用Python进行模块添加的工作，函数还是要C++实现，需要先编译好op的动态库so

In [None]:
import ctypes
TRT_LOGGER = trt.Logger()
soFile = "./AddScalarPlugin.so"
# This line matters: it is needed when a model is deserialized ("decoded")
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
# Load the compiled plugin shared library given by soFile
ctypes.cdll.LoadLibrary(soFile)
#ctypes.CDLL(osp.join(dir_path, 'libamirstan_plugin.so'))

# List of plugin creators read from the plugin registry
PLUGIN_CREATORS = trt.get_plugin_registry().plugin_creator_list

# Look up a registered plugin creator by name and instantiate the plugin from it.
def get_trt_plugin(plugin_name):
    """Create a plugin from the registry entry whose name equals ``plugin_name``.

    Every creator in the plugin registry is checked. For each one whose name
    matches, a field collection holding a single float32 field ``neg_slope``
    (value 0.1) is built and passed to the creator's ``create_plugin``.

    Returns the plugin produced by the last matching creator, or ``None`` when
    no creator has that name.
    """
    created = None
    matching_creators = (
        creator
        for creator in trt.get_plugin_registry().plugin_creator_list
        if creator.name == plugin_name
    )
    for creator in matching_creators:
        slope_value = np.array([0.1], dtype=np.float32)
        slope_field = trt.PluginField("neg_slope", slope_value, trt.PluginFieldType.FLOAT32)
        # Plugin attributes are handed over as a list-backed collection.
        fields = trt.PluginFieldCollection([slope_field])
        created = creator.create_plugin(name=plugin_name, field_collection=fields)
    return created
    
# Add the plugin as a new layer; look in the plugin creator for the name the plugin is registered under
# NOTE(review): `input_layer` is not assigned in the cells shown here — confirm where it is defined
lrelu = network.add_plugin_v2(inputs=[input_layer], plugin=get_trt_plugin("LReLU_TRT"))


# 量化模型

In [None]:
# Turn on the INT8 builder flag
config.set_flag(trt.BuilderFlag.INT8)
# Random CUDA tensor handed to the calibration dataset
# NOTE(review): `torch`, `TensorBatchDataset` and `DatasetCalibrator` are not imported in
# the cells shown here — confirm which package provides them
inputs=torch.rand(1,2,2,2).cuda()
int8_calib_batch_size=2  # not referenced again within this cell
int8_calib_dataset = TensorBatchDataset(inputs)
# Calibrator over the dataset, using the ENTROPY_CALIBRATION_2 algorithm
calibrator = DatasetCalibrator(
    int8_calib_dataset, 
    algorithm=trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2
)

# Attach the calibrator to the builder config
config.int8_calibrator = calibrator

# 可视化所有信息

In [None]:
# Dump engine details with the engine inspector.
inspector = engine.create_engine_inspector()
# Attach the execution context to the inspector before querying it.
inspector.execution_context = context
# Information for layer 0, then for the whole engine, both as JSON.
# The format enum is referenced through the `trt` module, the only TensorRT name imported.
print(inspector.get_layer_information(0, trt.LayerInformationFormat.JSON))
print(inspector.get_engine_information(trt.LayerInformationFormat.JSON))

# 量化模型

In [None]:
# Batch stream built from NUM_IMAGES_PER_BATCH and calibration_files, then wrapped in a calibrator
# NOTE(review): `ImageBatchStream`, `EntropyCalibrator` and `calibration_files` are not
# defined in the cells shown here — confirm where they come from
NUM_IMAGES_PER_BATCH = 5
batchstream = ImageBatchStream(NUM_IMAGES_PER_BATCH, calibration_files)
# NOTE(review): "input_node_name" looks like a placeholder for the real input tensor name — confirm
Int8_calibrator = EntropyCalibrator(["input_node_name"], batchstream)


In [None]:
# Turn on the INT8 builder flag and attach Int8_calibrator to the builder config
config.set_flag(trt.BuilderFlag.INT8)
config.int8_calibrator = Int8_calibrator


# polygraphy
     转化模型，并且评估
     !pip install colored

In [None]:
#run模式
#fold节点，topk
!polygraphy surgeon sanitize /kaggle/input/testonnx/tmp.onnx \
    --fold-constants \
    -o /kaggle/working/folded.onnx

#运行,评估onnxruntime和tensorrt分别的执行效率，并且保存引擎文件
!polygraphy run /kaggle/working/folded.onnx \
    #选用多种后端
    --trt  --onnxrt \
    #输入的名字和形状
    --input-shapes input:[1,3,1333,800] \
    --save-engine temp.plane
    #标记所有层，对比输出，可能有些层合并后不能用了
    --onnx-outputs mark all \
    --trt-outputs mark all 
'''
[I]         PASSED | Difference is within tolerance (rel=1e-05, abs=1e-05)
[I]     PASSED | All outputs matched | Outputs: ['labels', 'dets']
[I] PASSED | Command: /opt/conda/bin/polygraphy run /kaggle/working/folded.onnx --trt --onnxrt --input-shapes input:[1,3,1333,800] --save-engine temp.plan

'''

In [None]:
# `inspect` mode: can display all layers
# Files that can be inspected include .onnx, .plan, .engine, .pb, .json
!polygraphy inspect model folded.onnx \
    --mode=basic --display-as=trt

In [None]:
# `surgeon` mode: can modify the graph, e.g. split the graph or fold the graph


# 错误处理

In [None]:
RuntimeError: CUDA error: an illegal memory access was encountered CUDA kernel errors might be 
asynchronously  reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1. Compile with `TORCH_USE_CUDA_DSA` to enable 
device-side assertions.

解决方法：检查数据——传入的张量必须先放到 CUDA 设备上（例如 tensor.cuda()）。

In [None]:
nvprof 已被 nsys 取代，
nsys 同样位于 cuda/bin 目录下。

nsys profile XXX

# 最新格式使用

In [None]:
from onnx_helper import ONNXClassifierWrapper

# Defined before first use so this cell also runs on a fresh kernel.
BATCH_SIZE = 32         # same value the following cell assigns
PRECISION = np.float32  # NOTE(review): must match the dtype the engine was built for — confirm
N_CLASSES = 1000 # Our ResNet-50 is trained on a 1000 class ImageNet task
# Wrap the engine file; the list gives the output shape as [batch, classes].
trt_model = ONNXClassifierWrapper("resnet_engine.trt", [BATCH_SIZE, N_CLASSES],
                                  target_dtype=PRECISION)


In [None]:
# All-zero dummy input batch of shape (BATCH_SIZE, 224, 224, 3)
# NOTE(review): PRECISION must already be defined when this cell runs
BATCH_SIZE=32
dummy_input_batch = np.zeros((BATCH_SIZE, 224, 224, 3), dtype = PRECISION)

In [None]:
# Run the wrapped model on the dummy batch and keep the result
predictions = trt_model.predict(dummy_input_batch)
