In [1]:
import os
import multiprocessing
num_cpus = multiprocessing.cpu_count()

print("Number of CPUs:", num_cpus)

Number of CPUs: 16


In [2]:
from importlib import reload
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet.contrib import amp
import time
from tqdm.notebook import tqdm

import gluoncv as gcv

In [3]:
# Dataset Loading & Transforming
image_size = 480

input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

# Pre-processing in GPU, with transforms
# then copying back to CPU memory space

# Pre-processing in GPU, with transforms
# Unfortunately, we cannot copy directly into GPU the labels
# Not supported ty ADE20KSegmentation class

to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_cpu = gcv.data.ADE20KSegmentation(split='val')

# Limit to 100 samples
max_samples = 100
samples = range(0, max_samples)

ade20k_val_cpu_pre = mx.gluon.data.SimpleDataset([(ade20k_val_cpu[i][0], ade20k_val_cpu[i][1]) for i in tqdm(samples)])
ade20k_val_gpu_cpu = ade20k_val_cpu_pre.transform_first(input_transform_fn_gpu_cpu, lazy=False)

# Single sample for forward pass (AMP & Quantization requirement)
original_shape = ade20k_val_gpu_cpu[0][0].shape[1:]
single_sample_cpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape)
single_sample_gpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape).as_in_context(mx.gpu())

  0%|          | 0/100 [00:00<?, ?it/s]



In [5]:
num_workers = 0
batch_size = 4

# DataLoader for data processed in GPU, loaded in CPU
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="discard")

## Hybridize (CPU)

In [5]:
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu.predict(data)

mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 112.51623106002808


In [6]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid.predict(data)

mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 68.06627893447876


In [7]:
deeplab_pt_cpu_hybrid_mkldnn = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_mkldnn.hybridize(backend = "MKLDNN")

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_mkldnn.predict(data)
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 64.86776876449585


In [24]:
deeplab_pt_cpu_hybrid_static_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_static_alloc.hybridize(backend = "MKLDNN", static_alloc=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_static_alloc(data)
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 54.037999868392944


In [25]:
deeplab_pt_cpu_hybrid_static_alloc_shape = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_static_alloc_shape.hybridize(backend = "MKLDNN", static_alloc=True, static_shape=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_static_alloc(data)
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 54.02296566963196


## Hybridize (GPU)

In [4]:
# We can work with a larger batch size in GPU
num_workers = 0
batch_size = 16

# DataLoader for data processed in GPU, loaded in CPU
ade20k_val_loader_gpu_cpu_bs16 = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="discard")

In [5]:
deeplab_pt_gpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu_bs16):
    deeplab_pt_gpu.predict(data.as_in_context(mx.gpu()))
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/6 [00:00<?, ?it/s]

[09:44:01] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


Time (s): 30.928452253341675


In [6]:
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu_bs16):
    deeplab_pt_gpu_hybrid.predict(data.as_in_context(mx.gpu()))
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/6 [00:00<?, ?it/s]

Time (s): 11.506299495697021


In [7]:
deeplab_pt_gpu_hybrid_static_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid_static_alloc.hybridize(static_alloc=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu_bs16):
    deeplab_pt_gpu_hybrid_static_alloc.predict(data.as_in_context(mx.gpu()))
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/6 [00:00<?, ?it/s]

Time (s): 11.254050970077515


In [8]:
deeplab_pt_gpu_hybrid_static_alloc_shape = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid_static_alloc_shape.hybridize(static_alloc=True, static_shape=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu_bs16):
    deeplab_pt_gpu_hybrid_static_alloc_shape.predict(data.as_in_context(mx.gpu()))
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/6 [00:00<?, ?it/s]

Time (s): 11.182417869567871


I can talk about symbolic and imperative programming
HybridSequential / HybridBlock / Hybrid_forward

https://github.com/awslabs/dynamic-training-with-apache-mxnet-on-aws/blob/master/docs/tutorials/gluon/custom_layer.md

## Automatic Mixed Precision (AMP)

In [26]:
from mxnet.contrib import amp

In [28]:
# Single sample for forward pass (AMP requirement)
original_shape = ade20k_val_gpu_cpu[0][0].shape[1:]
single_sample_cpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape)
single_sample_gpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape).as_in_context(mx.gpu())

### AMP CPU

In [32]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize(backend="MKLDNN", static_alloc=True, static_shape=True)
deeplab_pt_cpu_hybrid(single_sample_cpu);

In [72]:
deeplab_pt_cpu_hybrid_amp = amp.convert_hybrid_block(deeplab_pt_cpu_hybrid, ctx=mx.cpu())

In [74]:
start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_amp(data)
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 55.70594525337219


### AMP GPU

In [75]:
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize(static_alloc=True, static_shape=True)
deeplab_pt_gpu_hybrid(single_sample_gpu);

In [76]:
deeplab_pt_gpu_hybrid_amp = amp.convert_hybrid_block(deeplab_pt_gpu_hybrid, ctx=mx.gpu())

In [77]:
start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu_bs16):
    deeplab_pt_gpu_hybrid_amp(data.as_in_context(mx.gpu()))
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/6 [00:00<?, ?it/s]

Time (s): 3.168325662612915


## Quantisation (INT8) - CPU Only

In [10]:
from mxnet.contrib import quantization

### Calibration Dataset

In [11]:
# Dataset Loading & Transforming
# Limit to 10 samples (last ones)
max_samples = 10
samples = range(0, max_samples)

ade20k_cal_cpu_pre = mx.gluon.data.SimpleDataset([(ade20k_val_cpu[-i][0], ade20k_val_cpu[-i][1]) for i in tqdm(samples)])
ade20k_cal_gpu     = ade20k_cal_cpu_pre.transform_first(input_transform_fn_gpu, lazy=False)
ade20k_cal_gpu_cpu = ade20k_cal_gpu.transform_first(to_cpu_fn, lazy=False)

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# DataLoader for Calibration
# For CPU, Pre-processed in GPU, copied back to CPU memory space)
num_workers = 0
batch_size = 4

# DataLoader for calibration data processed in GPU, loaded in CPU
ade20k_cal_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_cal_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="discard")

# For GPU, All done in GPU
num_workers = 0
batch_size = 16

# DataLoader for calibration data processed in GPU, loaded in CPU
ade20k_cal_loader_gpu_bs16 = mx.gluon.data.DataLoader(
    ade20k_cal_gpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="discard")

In [13]:
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu(single_sample_cpu);

In [18]:
# Log Quantization info
import logging
logging.basicConfig()
logger = logging.getLogger('logger')
logger.setLevel(logging.INFO)

In [19]:
deeplab_pt_cpu_q_hybrid = quantization.quantize_net_v2(
    deeplab_pt_cpu,
    quantized_dtype='auto',
    exclude_layers=None,
    exclude_layers_match=None,
    calib_data=ade20k_cal_loader_gpu_cpu,
    calib_mode='entropy',
    logger=logger,
    ctx=mx.cpu())

INFO:logger:Export HybridBlock
INFO:logger:These layers have been excluded []
INFO:logger:Quantizing graph
INFO:logger:Create a layer output collector for entropy calibration.
INFO:logger:Collector created, please use set_monitor_callback to collect calibration information.
INFO:logger:Quantizing parameters
[10:37:50] /work/mxnet/src/executor/graph_executor.cc:1991: Subgraph backend MKLDNN is activated.
INFO:logger:Collected statistics from 2 batches with batch_size=4
INFO:logger:Collected layer output values from FP32 model using 8 examples
INFO:logger:Calculating optimal thresholds for quantization
INFO:logger:Calculating optimal thresholds for quantization using KL divergence with num_quantized_bins=255
INFO:logger:Quantizing parameters


In [20]:
deeplab_pt_cpu_q_hybrid.hybridize(backend="MKLDNN", static_alloc=True, static_shape=True)

In [21]:
start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_q_hybrid(data)
    
mx.nd.waitall()
print("Time (s):", time.time() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 36.085768938064575


## MXNet Profiler

### CPU Profiling

In [5]:
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())

In [6]:
mx.profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True,
    filename='profile_output_cpu.json')

In [7]:
mx.profiler.set_state('run')

deeplab_pt_cpu(single_sample_cpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

In [5]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize(backend="MKLDNN", static_alloc=True, static_shape=True)

In [6]:
mx.profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True,
    filename='profile_output_cpu_hybrid.json')

In [7]:
mx.profiler.set_state('run')

deeplab_pt_cpu_hybrid(single_sample_cpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

### GPU Profiling

In [5]:
deeplab_pt_gpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())

In [7]:
mx.profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True,
    filename='profile_output_gpu.json')

In [8]:
mx.profiler.set_state('run')

deeplab_pt_gpu(single_sample_gpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

In [5]:
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize(static_alloc=True, static_shape=True)

In [6]:
mx.profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True,
    filename='profile_output_gpu_hybrid.json')

In [7]:
mx.profiler.set_state('run')

deeplab_pt_gpu_hybrid(single_sample_gpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

[11:00:02] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


## ONNX Export

In [6]:
# ONNX library must be installed for this
# !python3 -m pip install onnx

In [10]:
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize(static_alloc=True, static_shape=True)
deeplab_pt_gpu_hybrid(single_sample_gpu)

# Need to be exported externally for the symbols to be loaded
deeplab_pt_gpu_hybrid_filename = "deeplab_resnet101_coco_pt_gpu_hybrid"
deeplab_pt_gpu_hybrid.export(deeplab_pt_gpu_hybrid_filename)

# Files exported
sym_filename = deeplab_pt_gpu_hybrid_filename + "-symbol.json"
params_filename = deeplab_pt_gpu_hybrid_filename + "-0000.params"

# Verify generated files
assert os.path.exists(sym_filename)
assert os.path.exists(params_filename)

[11:03:27] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


In [11]:
onnx_file_name = "deeplab_resnet101_coco_pt_gpu_hybrid.onnx"
in_shapes = [single_sample_gpu.shape]
in_types = [mx.np.float32]

onnx_model_path = mx.onnx.export_model(
    sym_filename,
    params_filename,
    in_shapes,
    in_types,
    onnx_file_name)

onnx_model_path

'deeplab_resnet101_coco_pt_gpu_hybrid.onnx'

In [13]:
# Model Verification
import onnx

# Load the ONNX model
onnx_model = onnx.load_model(onnx_model_path)

# Check the ONNX graph
onnx.checker.check_graph(onnx_model.graph)

## TensorRT Export

In [14]:
import tensorrt as trt

In [27]:
trt_file_name = "deeplab_resnet101_coco_pt_gpu_hybrid.trt"

In [37]:
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(TRT_LOGGER)
config = builder.create_builder_config()

explicit_batch = 1 << (int) (trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
deeplab_pt_gpu_hybrid_trt = builder.create_network(explicit_batch)

[10/21/2023-11:12:58] [TRT] [I] The logger passed into createInferBuilder differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[10/21/2023-11:12:58] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 6398, GPU 3186 (MiB)
[10/21/2023-11:12:58] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


In [38]:
with open(onnx_file_name, 'rb') as model:
    with trt.OnnxParser(deeplab_pt_gpu_hybrid_trt, TRT_LOGGER) as parser:
        assert parser.parse(model.read()) == True
    deeplab_pt_gpu_hybrid_engine = builder.build_engine(deeplab_pt_gpu_hybrid_trt, config=config)

  deeplab_pt_gpu_hybrid_engine = builder.build_engine(deeplab_pt_gpu_hybrid_trt, config=config)


[10/21/2023-11:13:01] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 6868, GPU 3194 (MiB)
[10/21/2023-11:13:01] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 6868, GPU 3202 (MiB)
[10/21/2023-11:13:01] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[10/21/2023-11:13:39] [TRT] [I] Total Activation Memory: 16775153152
[10/21/2023-11:13:39] [TRT] [I] Detected 1 inputs and 2 output network tensors.
[10/21/2023-11:13:39] [TRT] [I] Total Host Persistent Memory: 338096
[10/21/2023-11:13:39] [TRT] [I] Total Device Persistent Memory: 2997760
[10/21/2023-11:13:39] [TRT] [I] Total Scratch Memory: 134217728
[10/21/2023-11:13:39] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 490 MiB, GPU 10873 MiB
[10/21/2023-11:13:39] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 124 steps to complete.
[10/21/2023-11:13:39] [TRT] [I] [BlockAssignment] Algorit

In [39]:
with open(trt_file_name, 'wb') as f:
    f.write(bytearray(deeplab_pt_gpu_hybrid_engine.serialize()))