In [1]:
import os
import multiprocessing
multiprocessing.cpu_count()

4

In [2]:
# Reference: https://mxnet.apache.org/versions/1.9.1/api/faq/env_var

# os.environ["MXNET_CPU_WORKER_NTHREADS"] = "1"
# os.environ["MXNET_CPU_PRIORITY_NTHREADS"] = "1"
# os.environ["MXNET_CPU_NNPACK_NTHREADS"] = "1"
# os.environ["MXNET_EXEC_NUM_TEMP"] = "1"
# os.environ["MXNET_ENGINE_TYPE"] = "NaiveEngine"

# Default Mode: "ThreadedEnginePerDevice"
# os.environ["MXNET_CPU_WORKER_NTHREADS"] = "4"
# os.environ["MXNET_CPU_PRIORITY_NTHREADS"] = "4"
# os.environ["MXNET_CPU_NNPACK_NTHREADS"] = "4"
# os.environ["MXNET_EXEC_NUM_TEMP"] = "4"
# os.environ["MXNET_ENGINE_TYPE"] = "ThreadedEnginePerDevice"

In [3]:
from importlib import reload
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet.contrib import amp
import time
import timeit
from tqdm.notebook import tqdm

import gluoncv as gcv

In [None]:
# Dataset Loading & Transforming
image_size = 479

input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

# Pre-processing in GPU, with transforms
# then copying back to CPU memory space

# Pre-processing in GPU, with transforms
# Unfortunately, we cannot copy directly into GPU the labels
# Not supported ty ADE20KSegmentation class

to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_gpu_cpu = gcv.data.ADE20KSegmentation(split='val', transform=input_transform_fn_gpu_cpu)

num_workers = 4
batch_size = 4

# DataLoader all in GPU, no copies necessary
ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
    ade20k_val_gpu_cpu,
    batch_size=batch_size,
    num_workers=num_workers,
    thread_pool=True)

## Hybridize

In [None]:
deeplab_pt_cpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu(data)
    
print("Time (s):", time.time() - start_time)

In [5]:
deeplab_pt_cpu_hybrid = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.cpu())
deeplab_pt_cpu_hybrid.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_cpu_hybrid(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/1000 [00:00<?, ?it/s]



KeyboardInterrupt: 

In [5]:
deeplab_pt_gpu = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_gpu(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)



  0%|          | 0/500 [00:00<?, ?it/s]

[20:53:59] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


Time (s): 45.90499544143677


Check memory footprint

In [6]:
deeplab_pt_hybrid_default = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_default.hybridize()

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_default(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/500 [00:00<?, ?it/s]

Time (s): 35.678776025772095


Check memory footprint

In [4]:
deeplab_pt_hybrid_static_alloc = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_static_alloc.hybridize(static_alloc=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_hybrid_static_alloc(data.as_in_context(mx.gpu()))
    
print("Time (s):", time.time() - start_time)



  0%|          | 0/500 [00:00<?, ?it/s]

[20:56:38] /work/mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:96: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)


Time (s): 33.17140793800354


Check memory footprint

In [5]:
deeplab_pt_hybrid_static_alloc_space = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=mx.gpu())
deeplab_pt_hybrid_static_alloc_space.hybridize(static_alloc=True, static_space=True)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    data = data.as_in_context(mx.gpu())
    deeplab_pt_hybrid_static_alloc_space(data)
    
print("Time (s):", time.time() - start_time)

  0%|          | 0/500 [00:00<?, ?it/s]

Time (s): 38.07335376739502


In [None]:
Check memory footprint and example with a different shape

In [None]:
deeplab_pt_hybrid_static_alloc_space.hybridize(active=False)

start_time = time.time()

for data, _ in tqdm(ade20k_val_loader_gpu_cpu):
    deeplab_pt_gpu(data)
    
print("Time (s):", time.time() - start_time)

net = Net()
net.collect_params().initialize()
x = nd.random_normal(shape=(1, 512))
print('=== 1st forward ===')
y = net(x)
print('=== 2nd forward ===')
y = net(x)
=== 1st forward ===
type(x): NDArray, F: mxnet.ndarray
=== 2nd forward ===
type(x): NDArray, F: mxnet.ndarray

net.hybridize()
print('=== 1st forward ===')
y = net(x)
print('=== 2nd forward ===')
y = net(x)
=== 1st forward ===
type(x): Symbol, F: mxnet.symbol
=== 2nd forward ===

I can talk about symbolic and imperative programming
HybridSequential / HybridBlock / Hybrid_forward

https://github.com/awslabs/dynamic-training-with-apache-mxnet-on-aws/blob/master/docs/tutorials/gluon/custom_layer.md

## Multiplication of Matrices

In [None]:
wait_for_operations = True
compute_results = False

start_time = time.time()

for _ in tqdm(range(10)):
    a = mx.nd.random.normal(shape=(10000, 10000))
    if wait_for_operations:
        a.wait_to_read()
    b = mx.nd.random.normal(shape=(10000, 10000))
    if wait_for_operations:
        b.wait_to_read()
    c = mx.nd.random.normal(shape=(10000, 10000))
    if wait_for_operations:
        c.wait_to_read()
    d = mx.nd.random.normal(shape=(10000, 10000))
    if wait_for_operations:
        d.wait_to_read()

    s1 = a * a * a * a * b * b * b * b
    if wait_for_operations:
        s1.wait_to_read()
    s2 = c * c * c * c * d * d * d * d
    if wait_for_operations:
        s2.wait_to_read()
    s1 = s1 * s1 * s1 * s1 * s1 * s1 * s1 * s1
    if wait_for_operations:
        s1.wait_to_read()
    s2 = s2 * s2 * s2 * s2 * s2 * s2 * s2 * s2
    if wait_for_operations:
        s2.wait_to_read()
    s_total = s1 * s2
    s_total = s_total * s_total * s_total * s_total
    if wait_for_operations:
        s_total.wait_to_read()
        
    if compute_results:
        s_total.wait_to_read()
        
print("Time (s):", time.time() - start_time)

In [None]:
start_time = time.time()

s_total = []

for _ in tqdm(range(10)):
    a = mx.nd.random.normal(shape=(10000, 10000))
    b = mx.nd.random.normal(shape=(10000, 10000))
    c = mx.nd.random.normal(shape=(10000, 10000))
    d = mx.nd.random.normal(shape=(10000, 10000))

    s1 = a * a * a * a * b * b * b * b
    s2 = c * c * c * c * d * d * d * d
    s1 = s1 * s1 * s1 * s1 * s1 * s1 * s1 * s1 
    s2 = s2 * s2 * s2 * s2 * s2 * s2 * s2 * s2
    s_total = s1 * s2
    s_total = s_total * s_total * s_total * s_total

mx.nd.waitall()
        
print("Time (s):", time.time() - start_time)

## DataLoaders

In [None]:
import gluoncv as gcv

# ADE20K Preliminary steps
# Needs source code from Gluon-CV, run (in your desired code folder):
# gh repo clone dmlc/gluon-cv
# in the gluon-cv/scripts/datasets folder, there is a script called
# ade20k.py, run it with:
# !python3 ade20k.py 

In [None]:
# All in CPU, no transforms
ade20k_val = gcv.data.ADE20KSegmentation(split='val')

data_shape = []
labels_shape = []

start_time = time.time()

for data, label in tqdm(ade20k_val):
    data_shape.append(data.shape)
    labels_shape.append(label.shape)
    
print("Time (s):", time.time() - start_time)

In [None]:
image_size = 1200

input_transform_fn = mx.gluon.data.vision.transforms.Compose([
    mx.gluon.data.vision.transforms.Resize(image_size, keep_ratio=True),
    mx.gluon.data.vision.transforms.CenterCrop(image_size),
    mx.gluon.data.vision.transforms.ToTensor(),
    mx.gluon.data.vision.transforms.Normalize([.485, .456, .406], [.229, .224, .225])
])

to_gpu_fn = lambda x: x.as_in_context(mx.gpu())

input_transform_fn_gpu = mx.gluon.data.vision.transforms.Compose([
    to_gpu_fn,
    input_transform_fn
])

In [None]:
# Pre-processing in CPU, with transforms
ade20k_val_cpu = gcv.data.ADE20KSegmentation(split='val', transform=input_transform_fn)

data_shape = []
labels_shape = []

start_time = time.time()

for data, label in tqdm(ade20k_val_cpu):
    data_shape.append(data.shape)
    labels_shape.append(label.shape)
    
print("Time (s):", time.time() - start_time)

In [None]:
# Pre-processing in GPU, with transforms
# Unfortunately, we cannot copy directly into GPU the labels
# Not supported ty ADE20KSegmentation class
ade20k_val_gpu = gcv.data.ADE20KSegmentation(split='val', transform=input_transform_fn_gpu)

_mask_transform_fn = mx.gluon.data.vision.transforms.Compose([
    ade20k_val_gpu._mask_transform,
    to_gpu_fn
])

ade20k_val_gpu._mask_transform = _mask_transform_fn

data_shape = []
labels_shape = []

start_time = time.time()

for data, label in tqdm(ade20k_val_gpu):
    data_shape.append(data.shape)
    labels_shape.append(label.shape)
    
print("Time (s):", time.time() - start_time)

In [None]:
# Pre-processing in GPU, with transforms
# then copying back to CPU memory space

# Pre-processing in GPU, with transforms
# Unfortunately, we cannot copy directly into GPU the labels
# Not supported ty ADE20KSegmentation class

to_cpu_fn = lambda x: x.as_in_context(mx.cpu())

input_transform_fn_gpu_cpu = mx.gluon.data.vision.transforms.Compose([
    input_transform_fn_gpu,
    to_cpu_fn
])

# No need for mask transform changes
ade20k_val_gpu_cpu = gcv.data.ADE20KSegmentation(split='val', transform=input_transform_fn_gpu_cpu)

data_shape = []
labels_shape = []

start_time = time.time()

for data, label in tqdm(ade20k_val_gpu_cpu):
    data_shape.append(data.shape)
    labels_shape.append(label.shape)
    
print("Time (s):", time.time() - start_time)

In [None]:
def process_data_loader_cpu(num_workers, batch_size):
    # DataLoader all in CPU, copied to GPU (for model processing)
    ade20k_val_loader_cpu = mx.gluon.data.DataLoader(
        ade20k_val_cpu,
        batch_size=batch_size,
        num_workers=num_workers)

    for data, label in tqdm(ade20k_val_loader_cpu):
        data = data.as_in_context(mx.gpu())
        label = label.as_in_context(mx.gpu())
        
def process_data_loader_gpu(num_workers, batch_size):
    # DataLoader all in GPU, no copies necessary
    ade20k_val_loader_gpu = mx.gluon.data.DataLoader(
        ade20k_val_gpu,
        batch_size=batch_size,
        num_workers=num_workers,
        thread_pool=False)

    for data, label in tqdm(ade20k_val_loader_gpu):
        pass
    
def process_data_loader_gpu_cpu(num_workers, batch_size):
    # DataLoader all in GPU, no copies necessary
    ade20k_val_loader_gpu_cpu = mx.gluon.data.DataLoader(
        ade20k_val_gpu_cpu,
        batch_size=batch_size,
        num_workers=num_workers,
        thread_pool=False)

    for data, label in tqdm(ade20k_val_loader_gpu_cpu):
        data = data.as_in_context(mx.gpu())
        label = label.as_in_context(mx.gpu())

In [None]:
start_time = time.time()

process_data_loader_cpu(0, 4)

print("Time (s):", time.time() - start_time)

In [None]:
start_time = time.time()

process_data_loader_gpu(0, 4)

print("Time (s):", time.time() - start_time)

In [None]:
start_time = time.time()

process_data_loader_gpu_cpu(0, 4)

print("Time (s):", time.time() - start_time)

## DataLoader Chart

In [None]:
num_workers_list = [0, 1, 2, 4, 8]
batch_sizes = [1, 2, 4, 8, 16]

In [None]:
results_gpu = []

for batch_size in batch_sizes:
    num_workers_gpu = 0
    result = %timeit  -n 3 -r 1 -o process_data_loader_gpu(num_workers_gpu, batch_size)
    results_gpu.append(result.average)

In [None]:
results_gpu_cpu = []

for batch_size in batch_sizes:
    num_workers_gpu = 0
    result = %timeit  -n 3 -r 1 -o process_data_loader_gpu_cpu(num_workers_gpu, batch_size)
    results_gpu_cpu.append(result.average)

In [None]:
results_cpu = []
    
for num_workers_cpu in num_workers_list:
    temp_list = []
    for batch_size in batch_sizes:
        result = %timeit -n 3 -r 1 -o process_data_loader_cpu(num_workers_cpu, batch_size)
        temp_list.append(result.average)
    results_cpu.append(temp_list)

In [None]:
fig = plt.figure()
plt.plot(batch_sizes, results_cpu[0], color='blue', marker='o')
plt.plot(batch_sizes, results_cpu[1], color='green', marker='p')
plt.plot(batch_sizes, results_cpu[2], color='yellow', marker='^')
plt.plot(batch_sizes, results_cpu[3], color='orange', marker='*')
plt.plot(batch_sizes, results_cpu[4], color='purple', marker='x')
plt.plot(batch_sizes, results_gpu_cpu, color='pink', marker='x')
plt.plot(batch_sizes, results_gpu, color='red', marker='s')
plt.title("DataLoader Times", fontsize=14)
plt.xlabel("Batch Size", fontsize=14)
plt.ylabel("Runtime (s)", fontsize=14)
plt.grid(True)
plt.legend(["CPU (workers: 0)", "CPU (workers: 1)", "CPU (workers: 2)", "CPU (workers: 4)", "CPU (workers: 8)", "GPU+CPU (workers: 0)", "GPU (workers: 0)"])
plt.show()

## Automatic Mixed Precision (AMP)

In [None]:
# Global Libraries
import collections
import gluoncv as gcv
from gluoncv.utils.metrics.segmentation import SegmentationMetric
from mxnet.gluon.data.vision import transforms
from importlib import reload
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import os
from sklearn import metrics
import time
from tqdm.notebook import tqdm

# Local Libraries
import pedestrian
import seg_model

# GPU mode
ctx = mx.gpu()

In [None]:
# Penn-Fudan Pedestrian Dataset
# https://www.cis.upenn.edu/~jshi/ped_html/
reload(pedestrian)

if not os.path.exists(pedestrian.PEDESTRIAN_FILE):
    !wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip

pedestrian_path = os.getcwd()

# Datasets
pedestrian_train_dataset = pedestrian.PedestrianDataset(
    pedestrian_path,
    split="train",
    is_segmentation_task=True,
    invert_masks=False)

pedestrian_val_dataset = pedestrian.PedestrianDataset(
    pedestrian_path,
    split="val",
    is_segmentation_task=True,
    invert_masks=False)

pedestrian_test_dataset = pedestrian.PedestrianDataset(
    pedestrian_path,
    split="test",
    is_segmentation_task=True,
    invert_masks=False)


# Further pre-processing
# Training pre-processing optimized for speed
# Evaluation pre-processing optimized for visualizations
image_size = 480

imagenet_transform = transforms.Compose([
    transforms.Resize(image_size, keep_ratio=True),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
])

size_transform = transforms.Compose([
    transforms.Resize(image_size, keep_ratio=True),
    transforms.CenterCrop(image_size)
])

train_val_transform = lambda data, output: (imagenet_transform(data), size_transform(output))
test_transform = lambda data, output: (size_transform(data), mx.nd.moveaxis(size_transform(output), -1, 0))

p_train = pedestrian_train_dataset.transform(train_val_transform)
p_val   = pedestrian_val_dataset.transform(train_val_transform)
p_test  = pedestrian_test_dataset.transform(test_transform)

In [None]:
deeplab_pt = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

In [None]:
print("Input data type:", p_val[0][0].dtype, "Model Parameters data type:", deeplab_pt.conv1[0].params["deeplabv337_resnetv1s_conv0_weight"].dtype)

In [None]:
a = mx.nd.array([1/3], dtype=np.float32)
b = a.astype(np.float16)

print("1/3 as Float32: {0:.30f}".format(a.asscalar()))
print("1/3 as Float16: {0:.30f}".format(b.asscalar()))

### Float32 Training

In [None]:
# Model
deeplab_ft_direct_f32 = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_f32.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_f32.head.initialize(ctx=ctx)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_f32.aux = False
for param in deeplab_ft_direct_f32.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_f32.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_f32 = "deeplab_resnet101_coco_ft_direct_f32.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size = 4

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_f32.collect_params(), "sgd", {"learning_rate": 1.0})

start_time = time.time()

training_loss, validation_loss = seg_model.training_loop(
    deeplab_ft_direct_f32,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_f32, 
    ctx)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

### Float16 Training

In [None]:
# Model
deeplab_ft_direct_f16 = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_f16.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_f16.head.initialize(ctx=ctx)

# Float16 model
deeplab_ft_direct_f16.cast('float16')

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_f16.aux = False
for param in deeplab_ft_direct_f16.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_f16.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_f16 = "deeplab_resnet101_coco_ft_direct_f16.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size = 4

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_f16.collect_params(), "sgd", {
    "learning_rate": 1e-6, 
    "multi_precision": True
})

start_time = time.time()

training_loss, validation_loss = seg_model.training_loop(
    deeplab_ft_direct_f16,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_f16, 
    ctx,
    half_precision=True)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

In [None]:
a = mx.nd.array([65519], dtype=np.float16)
b = mx.nd.array([65520], dtype=np.float16)

print("65519 as Float16: {0:.30f}".format(a.asscalar()))
print("65520 as Float16: {0:.30f}".format(b.asscalar()))

a = mx.nd.array([1e-7], dtype=np.float16)
b = mx.nd.array([1e-8], dtype=np.float16)

print("65519 as Float16: {0:.30f}".format(a.asscalar()))
print("65520 as Float16: {0:.30f}".format(b.asscalar()))

### Automatic Mixed Precision (AMP) Training

In [None]:
# AMP
amp.init()

# Model
deeplab_ft_direct_amp = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_amp.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_amp.head.initialize(ctx=ctx)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_amp.aux = False
for param in deeplab_ft_direct_amp.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_amp.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_amp = "deeplab_resnet101_coco_ft_direct_amp.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size = 4

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_amp.collect_params(), "sgd", {"learning_rate": 1.0})

start_time = time.time()

training_loss, validation_loss = seg_model.training_loop(
    deeplab_ft_direct_amp,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_amp, 
    ctx,
    amp_enabled=True)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

#### 2x BatchSize

In [None]:
# AMP
amp.init()

# Model
deeplab_ft_direct_amp = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_amp.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_amp.head.initialize(ctx=ctx)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_amp.aux = False
for param in deeplab_ft_direct_amp.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_amp.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_amp = "deeplab_resnet101_coco_ft_direct_amp.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size = 8

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_amp.collect_params(), "sgd", {"learning_rate": 1.0})

start_time = time.time()

training_loss, validation_loss = seg_model.training_loop(
    deeplab_ft_direct_amp,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_amp, 
    ctx,
    amp_enabled=True)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

#### 2x Batchsize + 3x epochs

In [None]:
# AMP
amp.init()

# Model
deeplab_ft_direct_amp = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_amp.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_amp.head.initialize(ctx=ctx)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_amp.aux = False
for param in deeplab_ft_direct_amp.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_amp.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_amp = "deeplab_resnet101_coco_ft_direct_amp.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 30
batch_size = 8

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_amp.collect_params(), "sgd", {"learning_rate": 1.0})

start_time = time.time()

training_loss, validation_loss = seg_model.training_loop(
    deeplab_ft_direct_amp,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_amp, 
    ctx,
    amp_enabled=True)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

### Multi-GPU Training

In [None]:
# Context variable is now a list,
# with each element corresponding to a GPU device
ctx_list = [mx.gpu(0)]

In [None]:
# Model
deeplab_ft_direct_f32 = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx_list)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_f32.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_f32.head.initialize(ctx=ctx_list)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_f32.aux = False
for param in deeplab_ft_direct_f32.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_f32.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_f32 = "deeplab_resnet101_coco_ft_direct_f32.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size = 4

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_f32.collect_params(), "sgd", {"learning_rate": 0.1})

start_time = time.time()

training_loss, validation_loss = seg_model.multi_training_loop(
    deeplab_ft_direct_f32,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_f32, 
    ctx_list)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))

In [None]:
# Context variable is now a list,
# with each element corresponding to a GPU device
ctx_list = [mx.gpu(0), mx.gpu(1), mx.gpu(2), mx.gpu(3)]
num_gpus = len(ctx_list)

In [None]:
# Model
deeplab_ft_direct_f32 = gcv.model_zoo.get_model('deeplab_resnet101_coco', pretrained=True, ctx=ctx_list)

# FT-Direct, no freezing layers
# # Freeze Layers (keeping track of the updated parameters)
# updated_params = []
# for param in deeplab_ft_direct_f32.collect_params().values():
#     if param.grad_req == "write":
#         param.grad_req = "null"
#         updated_params += [param.name]

# Replace the last layers
deeplab_ft_direct_f32.head = gcv.model_zoo.deeplabv3._DeepLabHead(2)
deeplab_ft_direct_f32.head.initialize(ctx=ctx_list)

# DeepLab v3 has an additional auxiliary output for training/loss
# Not required for our person detector
deeplab_ft_direct_f32.aux = False
for param in deeplab_ft_direct_f32.auxlayer.collect_params().values():
    if param.grad_req == "write":
        param.grad_req = "null"

deeplab_ft_direct_f32.hybridize(static_alloc=True, static_shape=True)

model_filename_ft_direct_f32 = "deeplab_resnet101_coco_ft_direct_f32.params"

reload(seg_model)

loss_fn = gcv.loss.SoftmaxCrossEntropyLoss()

# Epochs & Batch Size
epochs = 10
batch_size_per_gpu = 4
batch_size = len(ctx_list) * batch_size_per_gpu

# Define Optimizer and Hyper Parameters
trainer = mx.gluon.Trainer(deeplab_ft_direct_f32.collect_params(), "sgd", {"learning_rate": 0.5})

start_time = time.time()

training_loss, validation_loss = seg_model.multi_training_loop(
    deeplab_ft_direct_f32,
    loss_fn, 
    trainer, 
    epochs, 
    batch_size, 
    p_train, 
    p_val, 
    model_filename_ft_direct_f32, 
    ctx_list)

print("Training time for 10 epochs:", time.time() - start_time, "/ Best validation loss:", min(validation_loss))