In [1]:
import os
import multiprocessing
multiprocessing.cpu_count()

4

In [2]:
from importlib import reload
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet.contrib import amp
import time
from tqdm.notebook import tqdm

import gluoncv as gcv

In [3]:
# Dataset Loading & Transforming
image_size = 640

input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

# Pre-processing in GPU, with transforms
# then copying back to CPU memory space

# Pre-processing in GPU, with transforms
# Unfortunately, we cannot copy directly into GPU the labels
# Not supported ty ADE20KSegmentation class

to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_cpu = gcv.data.ADE20KSegmentation(split='val')

# Limit to 500 samples
max_samples = 500
samples = range(0, max_samples)

ade20k_val_cpu_pre = mx.gluon.data.SimpleDataset([(ade20k_val_cpu[i][0], ade20k_val_cpu[i][1]) for i in tqdm(samples)])
ade20k_val_gpu_cpu = ade20k_val_cpu_pre.transform_first(input_transform_fn_gpu_cpu, lazy=False)

  0%|          | 0/500 [00:00<?, ?it/s]



In [4]:
num_workers = 0
batch_size = 4

# DataLoader for data processed in GPU, loaded in CPU
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="rollover")

## Hybridize (CPU)

In [5]:
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu.predict(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/125 [00:00<?, ?it/s]

Time (s): 3.9511704444885254


In [6]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid.predict(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/125 [00:00<?, ?it/s]

Time (s): 0.6758739948272705


In [7]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize(backend = "MKLDNN")

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid.predict(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/125 [00:00<?, ?it/s]

Time (s): 0.6158411502838135


## Hybridize (GPU)

In [4]:
num_workers = 0
batch_size = 16

# DataLoader for data processed in GPU, loaded in CPU
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_cpu_pre,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="rollover")

In [5]:
deeplab_pt_gpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_gpu.predict(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

[18:27:05] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


Time (s): 1.6065938472747803


In [7]:
deeplab_pt_hybrid_default = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_default.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_default.predict(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

Time (s): 0.3877408504486084


In [6]:
deeplab_pt_hybrid_static_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_static_alloc.hybridize(static_alloc=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_static_alloc.predict(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

Time (s): 0.29736924171447754


In [16]:
deeplab_pt_hybrid_static_alloc_shape = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_static_alloc_shape.hybridize(static_alloc=True, static_shape=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_static_alloc_shape.predict(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

Time (s): 0.2136681079864502


I can talk about symbolic and imperative programming
HybridSequential / HybridBlock / Hybrid_forward

https://github.com/awslabs/dynamic-training-with-apache-mxnet-on-aws/blob/master/docs/tutorials/gluon/custom_layer.md

## Automatic Mixed Precision (AMP)

In [5]:
from mxnet.contrib import amp

In [6]:
# Dataset Loading & Transforming
image_size = 1280

input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

# Pre-processing in GPU, with transforms
# then copying back to CPU memory space
to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_cpu = gcv.data.ADE20KSegmentation(split='val')

# Limit to 500 samples
max_samples = 500
samples = range(0, max_samples)

ade20k_val_cpu_pre = mx.gluon.data.SimpleDataset([(ade20k_val_cpu[i][0], ade20k_val_cpu[i][1]) for i in tqdm(samples)])
ade20k_val_gpu_cpu = ade20k_val_cpu_pre.transform_first(input_transform_fn_gpu_cpu, lazy=False)

# Single sample for forward pass (AMP requirement)
original_shape = ade20k_val_gpu_cpu[0][0].shape[1:]
single_sample_gpu = ade20k_val_gpu_cpu[0][0].reshape((1, 3) + original_shape).as_in_context(mx.gpu())

  0%|          | 0/500 [00:00<?, ?it/s]



In [19]:
num_workers = 0
batch_size = 16

# DataLoader for data processed in GPU, loaded in CPU
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="rollover")

In [8]:
deeplab_pt_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid.predict(single_sample_gpu)
deeplab_pt_hybrid.hybridize(static_alloc=True, static_shape=True)
deeplab_pt_hybrid.forward(single_sample_gpu);

[20:19:59] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


In [9]:
deeplab_pt_hybrid_amp = amp.convert_hybrid_block(deeplab_pt_hybrid)

	data: None
  input_sym_arg_type = in_param.infer_type()[0]


In [10]:
start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_amp.forward(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

Time (s): 0.29921650886535645


## Quantisation (INT8)

In [4]:
from mxnet.contrib import quantization

### Calibration Dataset

In [5]:
# Dataset Loading & Transforming
# Limit to last 10 samples
max_samples = 10
samples = range(0, max_samples)

ade20k_cal_cpu_pre = mx.gluon.data.SimpleDataset([(ade20k_val_cpu[-i][0], ade20k_val_cpu[-i][1]) for i in tqdm(samples)])
ade20k_cal_gpu_cpu = ade20k_cal_cpu_pre.transform_first(input_transform_fn_gpu_cpu, lazy=False)

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
num_workers = 0
batch_size = 2

# DataLoader for data processed in GPU, loaded in CPU
ade20k_cal_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_cal_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    last_batch="rollover")

In [11]:
# Single sample for forward pass (Quantization requirement)
original_shape = ade20k_cal_gpu_cpu[0][0].shape[1:]
single_sample_gpu = ade20k_cal_gpu_cpu[0][0].reshape((1, 3) + original_shape).as_in_context(mx.cpu())

In [13]:
deeplab_pt_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_hybrid.predict(single_sample_gpu);

In [14]:
deeplab_pt_hybrid_q = quantization.quantize_net(
    deeplab_pt_hybrid,
    quantized_dtype='auto',
    exclude_layers=None,
    exclude_layers_match=None,
    calib_data=ade20k_cal_loader_gpu_cpu,
    calib_mode='entropy',
    num_calib_examples=None,
    ctx=mx.cpu())

[20:39:23] /work/mxnet/src/executor/graph_executor.cc:1991: Subgraph backend MKLDNN is activated.
	data: None
  input_sym_arg_type = in_param.infer_type()[0]


In [21]:
start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_q.forward(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/31 [00:00<?, ?it/s]

Time (s): 0.13666963577270508
