In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.pardir))
from argparse import ArgumentParser

import torch
from voltaml.compile import VoltaGPUCompiler
from voltaml.inference import gpu_performance
import torchvision
import segmentation_models_pytorch as smp

  from .autonotebook import tqdm as notebook_tqdm


### Load Model 

In [2]:
# Build the DeepLabV3+ segmentation network that the cells below compile and benchmark.
model = smp.DeepLabV3Plus(
    encoder_name="resnet50",     # backbone encoder; alternatives include mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # initialise the encoder from `imagenet` pre-trained weights
    in_channels=3,               # input channels: 3 for RGB (1 would be gray-scale)
    classes=1,                   # output channels, i.e. the number of classes in the dataset
)

### Set parameters for FP16

In [3]:
# FP16 build settings consumed by the compile and benchmark cells that follow.
precision = "fp16"
input_shape = (1, 3, 224, 224)  # presumably (batch, channels, height, width) — confirm against the model
compiled_model_dir = "deeplab.engine"  # passed as the compiler's `output_dir`; the build log shows the engine written here
throughput_batch_size = 1  # batch size handed to the throughput benchmark

### Compile Model (FP16)

In [4]:
# Configure the GPU compiler with the model and the FP16 settings defined above.
compiler = VoltaGPUCompiler(
    model=model,
    input_shape=input_shape,
    precision=precision,
    output_dir=compiled_model_dir,
    simplify=True,  # NOTE(review): presumably enables ONNX graph simplification — confirm in VoltaML docs
)

# Run the build; the log below shows the engine being serialized to `compiled_model_dir`.
compiled_model = compiler.compile()



-------- Loading ONNX ---------------


INFO:EngineBuilder:Network Description
Network Description
Network Description
INFO:EngineBuilder:Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
INFO:EngineBuilder:Output '614' with shape (1, 1, 224, 224) and dtype DataType.FLOAT
Output '614' with shape (1, 1, 224, 224) and dtype DataType.FLOAT
Output '614' with shape (1, 1, 224, 224) and dtype DataType.FLOAT
INFO:EngineBuilder:Building fp16 Engine in /workspace/voltaML/demo/deeplab.engine
Building fp16 Engine in /workspace/voltaML/demo/deeplab.engine
Building fp16 Engine in /workspace/voltaML/demo/deeplab.engine


[08/29/2022-18:40:16] [TRT] [I] [MemUsageChange] Init CUDA: CPU +313, GPU +0, now: CPU 1116, GPU 781 (MiB)
[08/29/2022-18:40:16] [TRT] [I] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 1116 MiB, GPU 781 MiB
[08/29/2022-18:40:16] [TRT] [I] [MemUsageSnapshot] End constructing builder kernel library: CPU 1251 MiB, GPU 815 MiB
[08/29/2022-18:40:16] [TRT] [W] parsers/onnx/onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
-------------------------------------
Precision :  fp16
[08/29/2022-18:40:17] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +513, GPU +224, now: CPU 1867, GPU 1039 (MiB)
[08/29/2022-18:40:17] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +115, GPU +52, now: CPU 1982, GPU 1091 (MiB)
[08/29/2022-18:40:17] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.


INFO:EngineBuilder:Serializing engine to file: /workspace/voltaML/demo/deeplab.engine
Serializing engine to file: /workspace/voltaML/demo/deeplab.engine
Serializing engine to file: /workspace/voltaML/demo/deeplab.engine


[08/29/2022-18:40:41] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[08/29/2022-18:40:41] [TRT] [I] Total Host Persistent Memory: 144912
[08/29/2022-18:40:41] [TRT] [I] Total Device Persistent Memory: 52366336
[08/29/2022-18:40:41] [TRT] [I] Total Scratch Memory: 0
[08/29/2022-18:40:41] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 51 MiB, GPU 2526 MiB
[08/29/2022-18:40:41] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 3.81615ms to assign 7 blocks to 78 nodes requiring 7028736 bytes.
[08/29/2022-18:40:41] [TRT] [I] Total Activation Memory: 7028736
[08/29/2022-18:40:41] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 2548, GPU 1371 (MiB)
[08/29/2022-18:40:41] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 2548, GPU 1379 (MiB)
[08/29/2022-18:40:41] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +51, GPU +52, now: CPU 51, GPU 52 (MiB)


In [5]:
# Benchmark the compiled FP16 engine against the original PyTorch model
# (the output below reports latency and throughput for both).
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
    is_yolo=False,
)

calculating latency...: 100%|█████████████| 1000/1000 [00:06<00:00, 165.70it/s]
calculating throughput: 100%|███████████████| 100/100 [00:00<00:00, 166.73it/s]


Latency:
--------------------------------------------------
VoltaML GPU Inference Latency: 1.08 ms / sample
PyTorch Inference Latency: 6.04 ms / sample


Throughput:
--------------------------------------------------
VoltaML GPU Inference Throughput: 978.84 samples / s
PyTorch Inference Throughput: 167.57 samples / s


### Set parameters for INT8

In [None]:
# INT8 build settings. The empty strings are placeholders to fill in before running the cells below.
precision = "int8"
input_shape = (1, 3, 224, 224)
throughput_batch_size = 1
compiled_model_dir = ""  # compiled model directory

# Calibration inputs forwarded to the compiler.
calib_input = ""  # path to the calibration input images
calib_cache = ""  # cache name
calib_num_images = 25000
calib_batch_size = 8
calib_preprocessor = "V2"

### Compile Model (INT8)

In [None]:
# Configure the GPU compiler for INT8, forwarding the calibration settings defined above.
compiler = VoltaGPUCompiler(
    model=model,
    input_shape=input_shape,
    precision=precision,
    output_dir=compiled_model_dir,
    # calibration options
    calib_input=calib_input,
    calib_cache=calib_cache,
    calib_num_images=calib_num_images,
    calib_batch_size=calib_batch_size,
    calib_preprocessor=calib_preprocessor,
)

# Run the INT8 build.
compiled_model = compiler.compile()

In [None]:
# Benchmark the compiled INT8 engine against the original PyTorch model.
# `is_yolo=False` is passed explicitly to match the FP16 benchmark call:
# the model here is DeepLabV3+, not a YOLO detector, so the call should not
# depend on the library's default for that flag.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
    is_yolo=False,
)