# Training with multiple GPUs from scratch
from: https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-scratch.html
Installation tips:
Make sure install version of mxnet to matcg CUDA, e.g.
 pip install mxnet-cu92 - for CUDA 9.2 - check with AISE platform, Software versions TensorFlow 1.8.0, Python 3.6.3, CUDA 9.1.85.3, cuDNN 7.1.3 (as of Aug 2018)

In [1]:
from mxnet import nd
from time import time

start = time()
x = nd.random_uniform(shape=(2000,2000))
y = nd.dot(x, x)
print('=== workloads are pushed into the backend engine ===\n%f sec' % (time() - start))
z = y.asnumpy()
print('=== workloads are finished ===\n%f sec' % (time() - start))

=== workloads are pushed into the backend engine ===
0.001441 sec
=== workloads are finished ===
0.164171 sec


In [3]:

from mxnet import gpu

def run(x):
    """push 10 matrix-matrix multiplications"""
    return [nd.dot(x,x) for i in range(10)]

def wait(x):
    """explicitly wait until all results are ready"""
    for y in x:
        y.wait_to_read()

x0 = nd.random_uniform(shape=(4000, 4000), ctx=gpu(0))
x1 = x0.copyto(gpu(1))
# modified to use 4 GPUs (K80), on GCP AISE server (AISE TensorFlow NVidia GPU Notebook)
x2 = x0.copyto(gpu(2))
x3 = x0.copyto(gpu(3))

print('=== Run on GPU 0, 1, 2 and 3 in sequential ===')
start = time()
wait(run(x0))
wait(run(x1))
wait(run(x2))
wait(run(x3))
print('time: %f sec' %(time() - start))

print('=== Run on GPU 0, 1, 2 and 3 in parallel ===')
start = time()
y0 = run(x0)
y1 = run(x1)
y2 = run(x2)
y3 = run(x3)
wait(y0)
wait(y1)
wait(y2)
wait(y3)
print('time: %f sec' %(time() - start))

=== Run on GPU 0, 1, 2 and 3 in sequential ===
time: 5.020850 sec
=== Run on GPU 0, 1, 2 and 3 in parallel ===
time: 0.543288 sec


## Results

__2 GPUs__

=== Run on GPU 0 and 1 in sequential ===

time: 2.391721 sec

=== Run on GPU 0 and 1 in parallel ===

time: 0.544099 sec

__4 GPUs__

=== Run on GPU 0, 1, 2 and 3 in sequential ===

time: 5.020850 sec

=== Run on GPU 0, 1, 2 and 3 in parallel ===

time: 0.543288 sec


In [16]:
from mxnet import gluon
# initialize parameters
scale = .01
W1 = nd.random_normal(shape=(20,1,3,3))*scale
b1 = nd.zeros(shape=20)
W2 = nd.random_normal(shape=(50,20,5,5))*scale
b2 = nd.zeros(shape=50)
W3 = nd.random_normal(shape=(800,128))*scale
b3 = nd.zeros(shape=128)
W4 = nd.random_normal(shape=(128,10))*scale
b4 = nd.zeros(shape=10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]

# network and loss
def lenet(X, params):
    # first conv
    h1_conv = nd.Convolution(data=X, weight=params[0], bias=params[1], kernel=(3,3), num_filter=20)
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(data=h1_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    # second conv
    h2_conv = nd.Convolution(data=h1, weight=params[2], bias=params[3], kernel=(5,5), num_filter=50)
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data=h2_activation, pool_type="max", kernel=(2,2), stride=(2,2))
    h2 = nd.flatten(h2)
    # first fullc
    h3_linear = nd.dot(h2, params[4]) + params[5]
    h3 = nd.relu(h3_linear)
    # second fullc
    yhat = nd.dot(h3, params[6]) + params[7]
    return yhat

loss = gluon.loss.SoftmaxCrossEntropyLoss()

# plain SGD
def SGD(params, lr):
    for p in params:
        p[:] = p - lr * p.grad

In [5]:
def get_params(params, ctx):
    new_params = [p.copyto(ctx) for p in params]
    for p in new_params:
        p.attach_grad()
    return new_params

new_params = get_params(params, gpu(0))
print('=== copy b1 to GPU(0) ===\nweight = {}\ngrad = {}'.format(
    new_params[1], new_params[1].grad))

=== copy b1 to GPU(0) ===
weight = 
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
<NDArray 20 @gpu(0)>
grad = 
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
<NDArray 20 @gpu(0)>


In [7]:
def allreduce(data):
    # sum on data[0].context, and then broadcast
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])

data = [nd.ones((1,2), ctx=gpu(i))*(i+1) for i in range(4)] # 4 GPUs
print("=== before allreduce ===\n {}".format(data))
allreduce(data)
print("\n=== after allreduce ===\n {}".format(data))

=== before allreduce ===
 [
[[ 1.  1.]]
<NDArray 1x2 @gpu(0)>, 
[[ 2.  2.]]
<NDArray 1x2 @gpu(1)>, 
[[ 3.  3.]]
<NDArray 1x2 @gpu(2)>, 
[[ 4.  4.]]
<NDArray 1x2 @gpu(3)>]

=== after allreduce ===
 [
[[ 10.  10.]]
<NDArray 1x2 @gpu(0)>, 
[[ 10.  10.]]
<NDArray 1x2 @gpu(1)>, 
[[ 10.  10.]]
<NDArray 1x2 @gpu(2)>, 
[[ 10.  10.]]
<NDArray 1x2 @gpu(3)>]


In [9]:
def split_and_load(data, ctx):
    n, k = data.shape[0], len(ctx)
    assert (n//k)*k == n, '# examples is not divided by # devices'
    idx = list(range(0, n+1, n//k))
    return [data[idx[i]:idx[i+1]].as_in_context(ctx[i]) for i in range(k)]

batch = nd.arange(16).reshape((4,4))
print('=== original data ==={}'.format(batch))
ctx = [gpu(0), gpu(1), gpu(2), gpu(3)]
splitted = split_and_load(batch, ctx)
print('\n=== split into {} ==={}\n{}'.format(ctx, splitted[0], splitted[1]))

=== original data ===
[[  0.   1.   2.   3.]
 [  4.   5.   6.   7.]
 [  8.   9.  10.  11.]
 [ 12.  13.  14.  15.]]
<NDArray 4x4 @cpu(0)>

=== splitted into [gpu(0), gpu(1), gpu(2), gpu(3)] ===
[[ 0.  1.  2.  3.]]
<NDArray 1x4 @gpu(0)>

[[ 4.  5.  6.  7.]]
<NDArray 1x4 @gpu(1)>


In [32]:
def train_batch(batch, params, ctx, lr):
    # split the data batch and load them on GPUs
    data = split_and_load(batch.data[0], ctx)
    label = split_and_load(batch.label[0], ctx)
    # run forward on each GPU
    with gluon.autograd.record():
        losses = [loss(lenet(X, W), Y)
                  for X, Y, W in zip(data, label, params)]
    # run backward on each gpu
    for l in losses:
        l.backward()
    # aggregate gradient over GPUs
    for i in range(len(params[0])):
        allreduce([params[c][i].grad for c in range(len(ctx))])
    # update parameters with SGD on each GPU
    for p in params:
        SGD(p, lr/batch.data[0].shape[0])

In [33]:
def valid_batch(batch, params, ctx):
    data = batch.data[0].as_in_context(ctx[0])
    pred = nd.argmax(lenet(data, params[0]), axis=1)
    return nd.sum(pred == batch.label[0].as_in_context(ctx[0])).asscalar()

In [34]:
from mxnet.test_utils import get_mnist
from mxnet.io import NDArrayIter
import mxnet as mx
from mxnet import autograd

def run(num_gpus, batch_size, lr):
    # the list of GPUs will be used
    ctx = [gpu(i) for i in range(num_gpus)]
    print('Running on {}'.format(ctx))

    # data iterator
    mnist = get_mnist()
    train_data = NDArrayIter(mnist["train_data"], mnist["train_label"], batch_size)
    valid_data = NDArrayIter(mnist["test_data"], mnist["test_label"], batch_size)
    print('Batch size is {}'.format(batch_size))

    # copy parameters to all GPUs
    dev_params = [get_params(params, c) for c in ctx]
    for epoch in range(5):
        # train
        start = time()
        train_data.reset()
        for batch in train_data:
            train_batch(batch, dev_params, ctx, lr)
        nd.waitall()  # wait all computations are finished to benchmark the time
        print('Epoch %d, training time = %.1f sec'%(epoch, time()-start))

        # validating
        valid_data.reset()
        correct, num = 0.0, 0.0
        for batch in valid_data:
            correct += valid_batch(batch, dev_params, ctx)
            num += batch.data[0].shape[0]
        print('         validation accuracy = %.4f'%(correct/num))

In [35]:
run(1, 64, 0.3)

Running on [gpu(0)]
Batch size is 64


AttributeError: module 'mxnet.gluon' has no attribute 'autograd'