# Training with multiple GPUs from scratch
from: https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-scratch.html
Installation tips:
Make sure install version of mxnet to matcg CUDA, e.g.
 pip install mxnet-cu92 - for CUDA 9.2 - check with AISE platform, Software versions TensorFlow 1.8.0, Python 3.6.3, CUDA 9.1.85.3, cuDNN 7.1.3 (as of Aug 2018)

In [1]:
from mxnet import nd
from time import time

start = time()
x = nd.random_uniform(shape=(2000,2000))
y = nd.dot(x, x)
print('=== workloads are pushed into the backend engine ===\n%f sec' % (time() - start))
z = y.asnumpy()
print('=== workloads are finished ===\n%f sec' % (time() - start))

=== workloads are pushed into the backend engine ===
0.001441 sec
=== workloads are finished ===
0.164171 sec


In [3]:

from mxnet import gpu

def run(x):
    """push 10 matrix-matrix multiplications"""
    return [nd.dot(x,x) for i in range(10)]

def wait(x):
    """explicitly wait until all results are ready"""
    for y in x:
        y.wait_to_read()

x0 = nd.random_uniform(shape=(4000, 4000), ctx=gpu(0))
x1 = x0.copyto(gpu(1))
# modified to use 4 GPUs (K80), on GCP AISE server (AISE TensorFlow NVidia GPU Notebook)
x2 = x0.copyto(gpu(2))
x3 = x0.copyto(gpu(3))

print('=== Run on GPU 0, 1, 2 and 3 in sequential ===')
start = time()
wait(run(x0))
wait(run(x1))
wait(run(x2))
wait(run(x3))
print('time: %f sec' %(time() - start))

print('=== Run on GPU 0, 1, 2 and 3 in parallel ===')
start = time()
y0 = run(x0)
y1 = run(x1)
y2 = run(x2)
y3 = run(x3)
wait(y0)
wait(y1)
wait(y2)
wait(y3)
print('time: %f sec' %(time() - start))

=== Run on GPU 0, 1, 2 and 3 in sequential ===
time: 5.020850 sec
=== Run on GPU 0, 1, 2 and 3 in parallel ===
time: 0.543288 sec


## Results

__2 GPUs__

=== Run on GPU 0 and 1 in sequential ===

time: 2.391721 sec

=== Run on GPU 0 and 1 in parallel ===

time: 0.544099 sec

__4 GPUs__

=== Run on GPU 0, 1, 2 and 3 in sequential ===

time: 5.020850 sec

=== Run on GPU 0, 1, 2 and 3 in parallel ===

time: 0.543288 sec
