In [2]:
import os
import multiprocessing
# Ask the multiprocessing module how many CPUs it reports for this machine
# and show the value.
num_cpus = multiprocessing.cpu_count()
print("Number of CPUs:", num_cpus)

Number of CPUs: 16


In [3]:
from importlib import reload
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet.contrib import amp
import time
from tqdm.notebook import tqdm

import gluoncv as gcv

In [3]:
# Dataset Loading & Transforming
image_size = 480

# Image pre-processing pipeline: resize, center-crop to image_size,
# convert to tensor and normalize per channel.
input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

# Pre-processing in GPU: move the image to the GPU context first,
# then apply the transforms there.
input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

# Pre-processing in GPU, with transforms, then copying back to CPU memory space.
# Unfortunately, we cannot copy the labels directly into GPU:
# not supported by the ADE20KSegmentation class.
to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_cpu = gcv.data.ADE20KSegmentation(split='val')

# Limit to 100 samples
max_samples = 100
samples = range(0, max_samples)

# Fetch every sample ONCE and keep its first two fields (image, mask).
# Indexing the dataset separately for [0] and [1] fetched each sample twice.
ade20k_val_cpu_pre = mx.gluon.data.SimpleDataset(
    [tuple(ade20k_val_cpu[i][:2]) for i in tqdm(samples)])
ade20k_val_gpu_cpu = ade20k_val_cpu_pre.transform_first(input_transform_fn_gpu_cpu, lazy=False)

# Single sample for forward pass (AMP & Quantization requirement)
# A leading batch axis of size 1 is added in front of the 3 channels.
original_shape = ade20k_val_gpu_cpu[0][0].shape[1:]
single_sample_cpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape)
# The GPU sample is a copy of the CPU one; no need to rebuild it from the dataset.
single_sample_gpu = single_sample_cpu.as_in_context(mx.gpu())

  0%|          | 0/100 [00:00<?, ?it/s]



In [4]:
# Loader settings shared by the benchmark cells.
num_workers = 0
batch_size = 4

# DataLoader for data processed in GPU, loaded in CPU.
# 100 samples with batch_size 4 give 25 full batches, so last_batch="discard"
# does not drop anything here.
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    last_batch="discard",
    num_workers=num_workers,
    batch_size=batch_size)

## Hybridize (CPU)

In [5]:
# Baseline: non-hybridized model on CPU.
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 115.22693085670471


In [6]:
# CPU model with default hybridization.
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize()

# NOTE(review): no warm-up pass is run, so any one-time first-call cost is
# part of the measurement -- confirm this is intended.
# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 64.75840330123901


In [7]:
# CPU model hybridized with the MKLDNN backend.
deeplab_pt_cpu_hybrid_mkldnn = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_mkldnn.hybridize(backend="MKLDNN")

# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_mkldnn(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 55.860424757003784


In [8]:
# CPU model hybridized with the MKLDNN backend and static_alloc enabled.
deeplab_pt_cpu_hybrid_mkldnn_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_mkldnn_alloc.hybridize(backend="MKLDNN", static_alloc=True)

# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_mkldnn_alloc(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 53.905478715896606


In [10]:
# CPU model hybridized with the MKLDNN backend, static_alloc and static_shape.
deeplab_pt_cpu_hybrid_mkldnn_alloc_shape = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid_mkldnn_alloc_shape.hybridize(backend="MKLDNN", static_alloc=True, static_shape=True)

# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_mkldnn_alloc_shape(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 52.464826822280884


## Hybridize (GPU)

In [19]:
# Baseline: non-hybridized model on GPU.
deeplab_pt_gpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    # Batches come from CPU memory; the copy to GPU is part of the timing.
    deeplab_pt_gpu(data.as_in_context(mx.gpu()))

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 13.315197944641113


In [18]:
# GPU model with default hybridization.
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize()

# NOTE(review): no warm-up pass is run, so any one-time first-call cost is
# part of the measurement -- confirm this is intended.
# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    # Batches come from CPU memory; the copy to GPU is part of the timing.
    deeplab_pt_gpu_hybrid(data.as_in_context(mx.gpu()))

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 12.873461246490479


In [22]:
# GPU model hybridized with static_alloc enabled.
deeplab_pt_gpu_hybrid_static_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid_static_alloc.hybridize(static_alloc=True)

# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    # Batches come from CPU memory; the copy to GPU is part of the timing.
    deeplab_pt_gpu_hybrid_static_alloc(data.as_in_context(mx.gpu()))

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 12.752988815307617


In [24]:
# GPU model hybridized with static_alloc and static_shape enabled.
deeplab_pt_gpu_hybrid_static_alloc_shape = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_gpu_hybrid_static_alloc_shape.hybridize(static_alloc=True, static_shape=True)

# perf_counter() is the interval clock; time.time() is a wall clock.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    # Batches come from CPU memory; the copy to GPU is part of the timing.
    deeplab_pt_gpu_hybrid_static_alloc_shape(data.as_in_context(mx.gpu()))

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 12.583650827407837


## Automatic Mixed Precision (AMP) - Float16

In [5]:
from mxnet.contrib import amp

### AMP CPU - Only with MKLDNN

In [40]:
# Hybridized CPU network (MKLDNN backend, static allocation and shape).
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize(
    backend="MKLDNN",
    static_alloc=True,
    static_shape=True)
# One forward pass on a single sample before the AMP conversion;
# the trailing ';' suppresses the cell output.
deeplab_pt_cpu_hybrid(single_sample_cpu);

In [41]:
# AMP conversion of the hybridized CPU network.
# NOTE(review): the target dtype is left at the library default -- confirm
# that this is the dtype intended for CPU execution.
deeplab_pt_cpu_hybrid_amp = amp.convert_hybrid_block(
    deeplab_pt_cpu_hybrid,
    ctx=mx.cpu())

In [42]:
# Time the AMP-converted CPU network over the validation loader.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid_amp(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 56.16465926170349


### AMP GPU

In [6]:
# Hybridized GPU network (static allocation and shape).
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize(
    static_alloc=True,
    static_shape=True)
# One forward pass on a single sample before the AMP conversion;
# the trailing ';' suppresses the cell output.
deeplab_pt_gpu_hybrid(single_sample_gpu);

[13:11:24] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


In [7]:
# AMP conversion of the hybridized GPU network.
deeplab_pt_gpu_hybrid_amp = amp.convert_hybrid_block(
    deeplab_pt_gpu_hybrid,
    ctx=mx.gpu())

	data: None
  input_sym_arg_type = in_param.infer_type()[0]


In [10]:
# Time the AMP-converted GPU network over the validation loader.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    # Batches come from CPU memory; the copy to GPU is part of the timing.
    deeplab_pt_gpu_hybrid_amp(data.as_in_context(mx.gpu()))

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 3.371366024017334


## Quantization (INT8) - CPU Only

In [5]:
from mxnet.contrib import quantization

### Example

In [11]:
# 1/3 stored as a float32 NDArray.
a = mx.nd.array([1/3], dtype=mx.np.float32)

# The same value written as an integer over a fixed scaling factor.
# The division below is plain Python float division.
int_value = 85
scaling_factor = 255
b = int_value / scaling_factor

# Print both with 30 decimals to expose the representation error.
print("1/3 as 0.333... (Float32): {0:.30f}".format(a.asscalar()))
print("1/3 as 85/255   (Int8)   : {0:.30f}".format(b))

1/3 as 0.333... (Float32): 0.333333343267440795898437500000
1/3 as 85/255   (Int8)   : 0.333333333333333314829616256247


### Calibration Dataset

In [6]:
# Dataset Loading & Transforming
# Limit to 10 samples (last ones)
max_samples = 10
samples = range(0, max_samples)

# -(i + 1) maps i = 0..9 onto indices -1..-10, i.e. the last ten samples.
# Plain -i is wrong for i == 0: -0 == 0 selects the FIRST validation sample,
# which is also part of the evaluation subset (indices 0..99).
# Each sample is fetched once and its first two fields (image, mask) are kept.
ade20k_cal_cpu_pre = mx.gluon.data.SimpleDataset(
    [tuple(ade20k_val_cpu[-(i + 1)][:2]) for i in tqdm(samples)])
# Pre-process in GPU, then copy the results back to CPU memory space.
ade20k_cal_gpu     = ade20k_cal_cpu_pre.transform_first(input_transform_fn_gpu, lazy=False)
ade20k_cal_gpu_cpu = ade20k_cal_gpu.transform_first(to_cpu_fn, lazy=False)

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# DataLoader for Calibration
# (for CPU: pre-processed in GPU, copied back to CPU memory space)
num_workers = 0
batch_size = 4

# DataLoader for calibration data processed in GPU, loaded in CPU.
# NOTE(review): with 10 calibration samples and batch_size 4,
# last_batch="discard" drops the final 2 samples -- confirm this is intended.
ade20k_cal_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_cal_gpu_cpu,
    last_batch="discard",
    num_workers=num_workers,
    batch_size=batch_size)

In [8]:
# CPU network that will be quantized.
deeplab_pt_cpu = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.cpu())
# One forward pass on a single sample before quantization;
# the trailing ';' suppresses the cell output.
deeplab_pt_cpu(single_sample_cpu);

In [9]:
# Log Quantization info
import logging

# Install the default root handler, then create a dedicated logger at INFO
# level to hand to the quantization call.
logging.basicConfig()

logger = logging.getLogger('logger')
logger.setLevel(logging.INFO)

In [10]:
# Quantize the CPU network, calibrating with the calibration loader in
# 'entropy' mode. No layers are excluded.
deeplab_pt_cpu_q_hybrid = quantization.quantize_net_v2(
    deeplab_pt_cpu,
    ctx=mx.cpu(),
    quantized_dtype='auto',
    calib_mode='entropy',
    calib_data=ade20k_cal_loader_gpu_cpu,
    exclude_layers=None,
    exclude_layers_match=None,
    logger=logger)

INFO:logger:Export HybridBlock
INFO:logger:These layers have been excluded []
INFO:logger:Quantizing graph
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_act_0 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_act_1 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_act_2 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: deeplabv30_resnetv1s_pool0_fwd is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_act_3 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_act_4 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldnn_conv_bn_6 is quantized.
[12:50:56] /work/mxnet/src/operator/quantization/quantize_graph_pass.cc:302: sg_mkldn

In [11]:
# Hybridize the quantized network with the same options as the best CPU run.
deeplab_pt_cpu_q_hybrid.hybridize(
    backend="MKLDNN",
    static_alloc=True,
    static_shape=True)

In [65]:
# Time the quantized CPU network over the validation loader.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_q_hybrid(data)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

  0%|          | 0/25 [00:00<?, ?it/s]

Time (s): 36.10324692726135


## MXNet Profiler

### CPU Profiling

In [5]:
# Non-hybridized CPU network to profile.
deeplab_pt_cpu = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.cpu())

In [6]:
# Profiler configuration for the non-hybridized CPU run.
mx.profiler.set_config(
    filename='profile_output_cpu.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [7]:
# Warm-up pass OUTSIDE the profiled region, so that one-time first-call
# overhead (if any) does not end up in the recorded profile.
deeplab_pt_cpu(single_sample_cpu)
mx.nd.waitall()

# Start recording
mx.profiler.set_state('run')

deeplab_pt_cpu(single_sample_cpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

In [5]:
# Hybridized CPU network to profile (MKLDNN backend, static allocation and shape).
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize(
    backend="MKLDNN",
    static_alloc=True,
    static_shape=True)

In [6]:
# Profiler configuration for the hybridized CPU run.
mx.profiler.set_config(
    filename='profile_output_cpu_hybrid.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [7]:
# Warm-up pass OUTSIDE the profiled region: this is the first call of the
# freshly hybridized network, and its one-time overhead should not end up
# in the recorded profile.
deeplab_pt_cpu_hybrid(single_sample_cpu)
mx.nd.waitall()

# Start recording
mx.profiler.set_state('run')

deeplab_pt_cpu_hybrid(single_sample_cpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

In [12]:
# Profiler configuration for the quantized, hybridized CPU run.
mx.profiler.set_config(
    filename='profile_output_cpu_q_hybrid.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [13]:
# Warm-up pass OUTSIDE the profiled region, so that one-time first-call
# overhead (if any) for this input shape does not end up in the profile.
deeplab_pt_cpu_q_hybrid(single_sample_cpu)
mx.nd.waitall()

# Start recording
mx.profiler.set_state('run')

deeplab_pt_cpu_q_hybrid(single_sample_cpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

### GPU Profiling

In [5]:
# Non-hybridized GPU network to profile.
deeplab_pt_gpu = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.gpu())

In [7]:
# Profiler configuration for the non-hybridized GPU run.
mx.profiler.set_config(
    filename='profile_output_gpu.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [8]:
# Warm-up pass OUTSIDE the profiled region, so that one-time first-call
# overhead (if any) does not end up in the recorded profile.
deeplab_pt_gpu(single_sample_gpu)
mx.nd.waitall()

# Start recording
mx.profiler.set_state('run')

deeplab_pt_gpu(single_sample_gpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

In [5]:
# Hybridized GPU network to profile (static allocation and shape).
deeplab_pt_gpu_hybrid = gcv.model_zoo.get_model(
    'deeplab_resnet101_coco',
    pretrained=True,
    ctx=mx.gpu())
deeplab_pt_gpu_hybrid.hybridize(
    static_alloc=True,
    static_shape=True)

In [6]:
# Profiler configuration for the hybridized GPU run.
mx.profiler.set_config(
    filename='profile_output_gpu_hybrid.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [7]:
# Warm-up pass OUTSIDE the profiled region. The first call of this network
# triggers the cuDNN convolution-algorithm performance tests (see the log
# message printed by this cell), which would otherwise be recorded in the profile.
deeplab_pt_gpu_hybrid(single_sample_gpu)
mx.nd.waitall()

# Start recording
mx.profiler.set_state('run')

deeplab_pt_gpu_hybrid(single_sample_gpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()

[11:00:02] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


In [17]:
# AMP Warm-up: run the AMP network once before profiling it.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock that can jump if the system time is adjusted.
start_time = time.perf_counter()

deeplab_pt_gpu_hybrid_amp(single_sample_gpu)

# Wait until all operations have completed before reading the clock.
mx.nd.waitall()
print("Time (s):", time.perf_counter() - start_time)

Time (s): 0.06405162811279297


In [18]:
# Profiler configuration for the AMP-converted, hybridized GPU run.
mx.profiler.set_config(
    filename='profile_output_gpu_hybrid_amp.json',
    profile_all=True,
    aggregate_stats=True,
    continuous_dump=True)

In [19]:
# Profile a single forward pass of the AMP-converted GPU network.
# Start recording
mx.profiler.set_state('run')

deeplab_pt_gpu_hybrid_amp(single_sample_gpu)

# Wait until all operations have completed
mx.nd.waitall()
# Stop recording
mx.profiler.set_state('stop')
# Log results
mx.profiler.dump()