In [1]:
import mxnet as mx
import numpy as np
from mxnet import nd, autograd, gluon
import multiprocessing

print(mx.__version__)
print(np.__version__)

1.3.0
1.14.5


In [2]:
mx.random.seed(1)
ctx = mx.gpu(0)
batch_size = 64
max_batches = 300
max_batches_infer = -1 # for whole batch
# We delay concatening arrays and calculating accuracy until after the inference
# to make sure benchmarking is consistent
acc = lambda res_list : np.mean([np.mean(np.argmax(x.softmax().asnumpy(), axis=1) == y.asnumpy()) for x, y in res_list])

def transformer(data, label):
        data = mx.image.imresize(data, 224, 224)
        data = mx.nd.transpose(data, (2,0,1))
        return data, label

def train_model(dtype, name, **sgd_kwargs):
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        #  First convolutional layer
        net.add(gluon.nn.Conv2D(channels=96, kernel_size=11, strides=(4,4), activation='relu'))
        net.add(gluon.nn.MaxPool2D(pool_size=3, strides=2))
        #  Second convolutional layer
        net.add(gluon.nn.Conv2D(channels=192, kernel_size=5, activation='relu'))
        net.add(gluon.nn.MaxPool2D(pool_size=3, strides=(2,2)))
        # Third convolutional layer
        net.add(gluon.nn.Conv2D(channels=384, kernel_size=3, activation='relu'))
        # Fourth convolutional layer
        net.add(gluon.nn.Conv2D(channels=384, kernel_size=3, activation='relu'))
        # Fifth convolutional layer
        net.add(gluon.nn.Conv2D(channels=256, kernel_size=3, activation='relu'))
        net.add(gluon.nn.MaxPool2D(pool_size=3, strides=2))
        # Flatten and apply fullly connected layers
        net.add(gluon.nn.Flatten())
        net.add(gluon.nn.Dense(4096, activation="relu"))
        net.add(gluon.nn.Dense(4096, activation="relu"))
        net.add(gluon.nn.Dense(10))
    ### CAST NET TO CORRECT DTYPE ###
    net.cast(dtype)
    ###
    net.hybridize()
    # Parameter initialization
    net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .001, **sgd_kwargs})
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

    data_iter = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10('./data', train=True, transform=transformer),
        batch_size=batch_size, shuffle=True, last_batch='discard')
    for i, batch in enumerate(data_iter):
        if max_batches != -1 and i >= max_batches:
            print("Reached {} batches. Saving model.".format(max_batches))
            break
        d, l = batch
        data = d.as_in_context(ctx).astype(dtype, copy=False)
        label = l.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        
    net.export(name)
    return net

def get_model(name, ctx):
    return mx.gluon.block.SymbolBlock.imports(name+'-symbol.json', ['data'], name+'-0000.params', ctx=ctx)

# With normal Gluon block

In [3]:
def fp32_transformer(data, label):
    data = mx.image.imresize(data, 224, 224)
    data = mx.nd.transpose(data, (2,0,1))
    data = data.astype(np.float32, copy=False)
    return data, label

train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10('./data', train=True, transform=fp32_transformer),
    batch_size=32, shuffle=True, last_batch='discard')
net = train_model(np.float32, 'fp32')
# net = get_model('fp32')

res_list = []
import time
start = time.time()
for i, batch in enumerate(train_data):
    if max_batches_infer != -1 and i >= max_batches_infer:
        print("Reached {} batches.".format(max_batches_infer))
        break
    d, l = batch
    data = d.as_in_context(ctx)
    res = net(data)
    res_list.append((res, l))
print(time.time() - start)

Reached 300 batches. Saving model.
48.82793569564819


In [4]:
print(acc(res_list))

0.25358114596670933


# With fp16 Gluon block

In [5]:
def fp16_transformer(data, label):
    data = mx.image.imresize(data, 224, 224)
    data = mx.nd.transpose(data, (2,0,1))
    data = data.astype(np.float16, copy=False)
    return data, label

train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10('./data', train=True, transform=fp16_transformer),
    batch_size=32, shuffle=True, last_batch='discard')

net = train_model(np.float16, 'fp16', multi_precision=True)
# net = get_model('fp16')

res_list = []
import time
start = time.time()
for i, batch in enumerate(train_data):
    if max_batches_infer != -1 and i >= max_batches_infer:
        print("Reached {} batches.".format(max_batches_infer))
        break
    d, l = batch
    data = d.as_in_context(ctx)
    res = net(data)
    res_list.append((res, l))
print(time.time() - start)

Reached 300 batches. Saving model.
57.90255570411682


In [6]:
print(acc(res_list))

0.28026968629961585


In [7]:
# Double checking that all params are indeed fp16.
net.cast(np.float16)
params = net.collect_params()
for k in params:
    print(params[k].dtype)

<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
<class 'numpy.float16'>
