In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.pardir))
from argparse import ArgumentParser

import torch
from voltaml.compile import VoltaGPUCompiler
from voltaml.inference import gpu_performance
import torchvision

### Load Model

In [2]:
# Load torchvision's ResNet-50 with pretrained weights as the model to compile.
# A freshly constructed nn.Module is in training mode; `.eval()` switches it to
# inference mode (normalization layers use their stored running statistics) and
# returns the module itself, so `model` is still the ResNet instance.
model = torchvision.models.resnet50(pretrained=True).eval()
# Alternative: load your own serialized model instead (fill in the path).
# model = torch.load('')

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

### Set parameters for FP16

In [5]:
# FP16 build configuration read by the compile and benchmark cells that follow.
precision = "fp16"

# Input shape handed to the compiler; the build log reports the same
# (1, 3, 224, 224) shape for the network input.
input_shape = (1, 3, 224, 224)

# Set Model dir -- in the recorded run the engine is serialized to this path
# (resolved against the working directory).
compiled_model_dir = "resnet50.engine"

# Forwarded to gpu_performance as throughput_batch_size.
throughput_batch_size = 1

### Compile Model (FP16)

In [6]:
# Gather the FP16 build settings first, then pass them to the compiler in one go.
fp16_build_kwargs = dict(
    model=model,
    output_dir=compiled_model_dir,
    input_shape=input_shape,
    precision=precision,
)
compiler = VoltaGPUCompiler(**fp16_build_kwargs)

# Run the build; per the recorded log this builds the fp16 engine and
# serializes it to the output_dir path.
compiled_model = compiler.compile()

INFO:EngineBuilder:Network Description
Network Description
Network Description
INFO:EngineBuilder:Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
Input 'onnx::Conv_0' with shape (1, 3, 224, 224) and dtype DataType.FLOAT
INFO:EngineBuilder:Output '495' with shape (1, 1000) and dtype DataType.FLOAT
Output '495' with shape (1, 1000) and dtype DataType.FLOAT
Output '495' with shape (1, 1000) and dtype DataType.FLOAT
INFO:EngineBuilder:Building fp16 Engine in /workspace/voltaML/demo/resnet50.engine
Building fp16 Engine in /workspace/voltaML/demo/resnet50.engine
Building fp16 Engine in /workspace/voltaML/demo/resnet50.engine


[07/27/2022-09:39:42] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[07/27/2022-09:39:42] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 1980, GPU 1938 (MiB)
[07/27/2022-09:39:43] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 2078, GPU 1950 (MiB)
[07/27/2022-09:39:43] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 2078, GPU 1958 (MiB)
[07/27/2022-09:39:43] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.


INFO:EngineBuilder:Serializing engine to file: /workspace/voltaML/demo/resnet50.engine
Serializing engine to file: /workspace/voltaML/demo/resnet50.engine
Serializing engine to file: /workspace/voltaML/demo/resnet50.engine


[07/27/2022-09:40:03] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[07/27/2022-09:40:04] [TRT] [I] Total Host Persistent Memory: 127232
[07/27/2022-09:40:04] [TRT] [I] Total Device Persistent Memory: 47153664
[07/27/2022-09:40:04] [TRT] [I] Total Scratch Memory: 4194304
[07/27/2022-09:40:04] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 94 MiB, GPU 2385 MiB
[07/27/2022-09:40:04] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 2.04392ms to assign 4 blocks to 62 nodes requiring 6201345 bytes.
[07/27/2022-09:40:04] [TRT] [I] Total Activation Memory: 6201345
[07/27/2022-09:40:04] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +1, GPU +8, now: CPU 2125, GPU 2021 (MiB)
[07/27/2022-09:40:04] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 2125, GPU 2029 (MiB)
[07/27/2022-09:40:04] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +45, GPU +64, now: CPU 90, GPU 128 (MiB)


In [7]:
# Run voltaml's gpu_performance on the engine path and the source model.
# NOTE(review): the saved output of this cell finishes the latency loop and then
# ends with "LogicError: cuMemcpyHtoD failed: invalid argument" -- the cause is
# not visible from this notebook; re-run and confirm before relying on it.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
)

calculating latency...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 214.66it/s]


LogicError: cuMemcpyHtoD failed: invalid argument

### Set parameters for INT8

In [None]:
# INT8 build configuration. The empty strings below are placeholders that must
# be filled in before the INT8 compile and benchmark cells are run.
precision = "int8"
input_shape = (1, 3, 224, 224)
throughput_batch_size = 1

compiled_model_dir = ""  # Compiled model directory

# Calibration settings, forwarded to the compiler as calib_* keyword arguments.
calib_input = ""  # Calib input images path
calib_cache = ""  # Cache name
calib_num_images = 25000
calib_batch_size = 8
calib_preprocessor = "V2"

### Compile Model (INT8)

In [None]:
# Gather the INT8 build settings, including the calibration options, and pass
# them to the compiler together.
int8_build_kwargs = dict(
    model=model,
    output_dir=compiled_model_dir,
    input_shape=input_shape,
    precision=precision,
    calib_input=calib_input,
    calib_cache=calib_cache,
    calib_num_images=calib_num_images,
    calib_batch_size=calib_batch_size,
    calib_preprocessor=calib_preprocessor,
)
compiler = VoltaGPUCompiler(**int8_build_kwargs)

# Run the build and keep whatever compile() returns.
compiled_model = compiler.compile()

In [None]:
# Run voltaml's gpu_performance on the INT8 engine path and the source model,
# using the INT8 parameter values set earlier in the notebook.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
)