# MXNet-Gluon-SyncBN
Example on training MNIST, adapted from the [tutorial](http://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-gluon.html).

### Import SyncBN and other dependencies

In [2]:
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from syncbn import BatchNorm, ModelDataParallel
mx.random.seed(1)

### Set the contexts (suppose using 4 GPUs)

In [3]:
nGPUs = 4
ctx_list = [mx.gpu(i) for i in range(nGPUs)]

### Grab the MNIST dataset

In [4]:
batch_size = 128
num_outputs = 10
def transform(data, label):
    return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)
train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True, transform=transform),
                                   batch_size, shuffle=True, last_batch='rollover',
                                   num_workers=4)
test_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=False, transform=transform),
                                     batch_size, shuffle=False, num_workers=4)

### Define a convolutional neural network

In [5]:
num_fc = 512
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(gluon.nn.Conv2D(in_channels=1, channels=20, kernel_size=5))
    net.add(BatchNorm(in_channels=20, nGPUs=nGPUs))
    net.add(gluon.nn.Activation('relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
    net.add(gluon.nn.Conv2D(in_channels=20, channels=50, kernel_size=5))
    net.add(BatchNorm(in_channels=50, nGPUs=nGPUs))
    net.add(gluon.nn.Activation('relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
    # The Flatten layer collapses all axis, except the first one, into one axis.
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(num_fc,in_units=800))
    net.add(gluon.nn.Activation('relu'))
    net.add(gluon.nn.Dense(num_outputs, in_units=num_fc))

### Initializae the model wieghts and get Parallel mode

In [6]:
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24))
net = ModelDataParallel(net, ctx_list)
print(net)

ModelDataParallel(
  (module): Sequential(
    (0): Conv2D(1 -> 20, kernel_size=(5, 5), stride=(1, 1))
    (1): BatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=20)
    (2): Activation(relu)
    (3): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    (4): Conv2D(20 -> 50, kernel_size=(5, 5), stride=(1, 1))
    (5): BatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, in_channels=50)
    (6): Activation(relu)
    (7): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False)
    (8): Flatten
    (9): Dense(800 -> 512, linear)
    (10): Activation(relu)
    (11): Dense(512 -> 10, linear)
  )
)


### Softmax cross-entropy Loss and Optimizer

In [7]:
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .01})

### Write evaluation loop to calculate accuracy

In [8]:
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = gluon.utils.split_and_load(data, ctx_list=ctx_list)
        label = gluon.utils.split_and_load(label, ctx_list=ctx_list)
        outputs = net(data)
        predictions = []
        for i, output in enumerate(outputs):
            pred = nd.argmax(output, axis=1)
            acc.update(preds=pred, labels=label[i])
    return acc.get()[1]

### Training Loop

In [9]:
epochs = 1
smoothing_constant = .01

for e in range(epochs):
    for i, (data, label) in enumerate(train_data):
        data = gluon.utils.split_and_load(data, ctx_list=ctx_list)
        label = gluon.utils.split_and_load(label, ctx_list=ctx_list)
        with autograd.record():
            output = net(data)
            losses = [softmax_cross_entropy(yhat, y) for yhat, y in zip(output, label)]
            autograd.backward(losses)
        loss = 0
        for l in losses:
            loss += l.as_in_context(mx.gpu(0))
        trainer.step(len(data)*data[0].shape[0])
        ##########################
        #  Keep a moving average of the losses
        ##########################
        curr_loss = nd.mean(loss).asscalar()
        moving_loss = (curr_loss if ((i == 0) and (e == 0))
                       else (1 - smoothing_constant) * moving_loss + smoothing_constant * curr_loss)

    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))

Epoch 0. Loss: 2.046051726989087, Train_acc 0.8688199626865671, Test_acc 0.8723
