In [3]:
import os
import sys
sys.path.append(os.path.abspath(os.path.pardir))
from argparse import ArgumentParser

import torch
from voltaml.compile import VoltaGPUCompiler
from voltaml.inference import gpu_performance
import torchvision

### Load Model 

In [4]:
# Load a pretrained ResNet-34 and put it in inference mode. eval() makes the
# BatchNorm layers use their stored running statistics instead of per-batch
# statistics, which is the behaviour wanted for both the engine export and
# the PyTorch baseline benchmark.
# NOTE(review): torchvision 0.13+ prefers `weights=...` over `pretrained=True`;
# kept as-is so the cell still runs on older installs -- confirm target version.
model = torchvision.models.resnet34(pretrained=True).eval()

# Alternatives -- swap in a different model here:
# model = torch.hub.load('pytorch/vision:v0.6.0', 'fcn_resnet101', pretrained=True).eval()
# model = torch.load('').eval()  # path to a pickled nn.Module

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|████████████████████████████████████████████████████████████| 83.3M/83.3M [00:00<00:00, 87.8MB/s]


## Set parameters for FP16

In [5]:
# FP16 build configuration.
precision = "fp16"
input_shape = (1, 3, 224, 224)
throughput_batch_size = 1

# Despite the name, this is the path of the engine *file* the compiler writes
# (the build log reports "Serializing engine to file: .../r34.engine").
compiled_model_dir = "r34.engine"

### Compile Model (FP16)

In [6]:
# Collect the FP16 build options in one place so they are easy to inspect,
# then hand them to the compiler as keyword arguments.
build_options = dict(
    model=model,
    output_dir=compiled_model_dir,
    input_shape=input_shape,
    precision=precision,
)
compiler = VoltaGPUCompiler(**build_options)

# Build the engine; the result is kept in `compiled_model`.
compiled_model = compiler.compile()

INFO:EngineBuilder:Network Description
Network Description
INFO:EngineBuilder:Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
INFO:EngineBuilder:Output '343' with shape (1, 1000) and dtype DataType.FLOAT
Output '343' with shape (1, 1000) and dtype DataType.FLOAT
INFO:EngineBuilder:Building fp16 Engine in /workspace/voltav0.3/voltaML/demo/r34.engine
Building fp16 Engine in /workspace/voltav0.3/voltaML/demo/r34.engine


[10/14/2022-17:07:54] [TRT] [I] [MemUsageChange] Init CUDA: CPU +313, GPU +0, now: CPU 515, GPU 5991 (MiB)
[10/14/2022-17:07:54] [TRT] [I] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 515 MiB, GPU 5991 MiB
[10/14/2022-17:07:54] [TRT] [I] [MemUsageSnapshot] End constructing builder kernel library: CPU 650 MiB, GPU 6025 MiB
[10/14/2022-17:07:55] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +513, GPU +224, now: CPU 1246, GPU 6249 (MiB)
[10/14/2022-17:07:55] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +116, GPU +52, now: CPU 1362, GPU 6301 (MiB)
[10/14/2022-17:07:55] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.


INFO:EngineBuilder:Serializing engine to file: /workspace/voltav0.3/voltaML/demo/r34.engine
Serializing engine to file: /workspace/voltav0.3/voltaML/demo/r34.engine


[10/14/2022-17:08:06] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[10/14/2022-17:08:06] [TRT] [I] Total Host Persistent Memory: 88032
[10/14/2022-17:08:06] [TRT] [I] Total Device Persistent Memory: 44731392
[10/14/2022-17:08:06] [TRT] [I] Total Scratch Memory: 0
[10/14/2022-17:08:06] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 45 MiB, GPU 1131 MiB
[10/14/2022-17:08:06] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 0.643163ms to assign 3 blocks to 42 nodes requiring 2408448 bytes.
[10/14/2022-17:08:06] [TRT] [I] Total Activation Memory: 2408448
[10/14/2022-17:08:06] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1916, GPU 6578 (MiB)
[10/14/2022-17:08:06] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1916, GPU 6586 (MiB)
[10/14/2022-17:08:06] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +40, GPU +43, now: CPU 40, GPU 43 (MiB)


In [7]:
# Benchmark the compiled FP16 engine against the original PyTorch model;
# the call prints latency and throughput for both.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
    is_yolo=False,
)

calculating latency...: 100%|████████████████████████████████████| 1000/1000 [00:03<00:00, 309.17it/s]
calculating throughput: 100%|██████████████████████████████████████| 100/100 [00:00<00:00, 312.79it/s]


Latency:
--------------------------------------------------
VoltaML GPU Inference Latency: 0.76 ms / sample
PyTorch Inference Latency: 3.24 ms / sample


Throughput:
--------------------------------------------------
VoltaML GPU Inference Throughput: 1464.33 samples / s
PyTorch Inference Throughput: 314.62 samples / s


## Set parameters for INT8

In [None]:
# INT8 build configuration. The empty strings below are placeholders:
# fill them in before running the INT8 compile and benchmark cells.
precision = "int8"
input_shape = (1, 3, 224, 224)
throughput_batch_size = 1
compiled_model_dir = ""  # TODO: output path for the compiled INT8 model

# Calibration settings, forwarded to VoltaGPUCompiler as its calib_* arguments.
calib_input = ""  # TODO: path to the calibration input images
calib_cache = ""  # TODO: calibration cache name
calib_num_images = 25000
calib_batch_size = 8
calib_preprocessor = "V2"

### Compile Model (INT8)

In [None]:
# Collect the INT8 build options (model, output path, and the calibration
# settings) in one place, then hand them to the compiler as keyword arguments.
build_options = dict(
    model=model,
    output_dir=compiled_model_dir,
    input_shape=input_shape,
    precision=precision,
    calib_input=calib_input,
    calib_cache=calib_cache,
    calib_num_images=calib_num_images,
    calib_batch_size=calib_batch_size,
    calib_preprocessor=calib_preprocessor,
)
compiler = VoltaGPUCompiler(**build_options)

# Build the engine; the result is kept in `compiled_model`.
compiled_model = compiler.compile()

In [None]:
# Benchmark the compiled INT8 engine against the original PyTorch model.
# is_yolo=False is passed explicitly, as in the FP16 benchmark call: the
# model under test is the same ResNet classifier, not a YOLO detector.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
    is_yolo=False,
)