http://preview.d2l.ai/d2l-en/master/chapter_computational-performance/auto-parallelism.html

https://mxnet.apache.org/versions/1.6/api/python/docs/tutorials/getting-started/crash-course/6-use_gpus.html

In [1]:
from d2l import mxnet as d2l
from mxnet import np, npx
npx.set_np()

In [2]:
def run(x, repeats=10):
    """Multiply ``x`` by itself several times and collect every product.

    Args:
        x: Any object exposing a ``dot`` method that accepts ``x`` itself
            (in this notebook, the matrices built in the following cell).
        repeats: How many separate ``x.dot(x)`` calls to issue. Defaults to
            10, the count this function previously hard-coded, so existing
            ``run(x)`` calls behave exactly as before.

    Returns:
        A list with one ``x.dot(x)`` result per repetition; empty when
        ``repeats`` is zero or negative.
    """
    # The product is intentionally recomputed on every iteration rather than
    # hoisted: the benchmark workload is `repeats` separate dot calls.
    return [x.dot(x) for _ in range(repeats)]

# Benchmark inputs: two square matrices filled with uniform random values.
# No explicit ctx is passed here, so this one is created on the default
# context (presumably the CPU — confirm).
x_cpu = np.random.uniform(size=(2000, 2000))
# This one is created on whatever context d2l.try_gpu() returns.
# NOTE(review): it is 6000x6000 vs. 2000x2000 above (9x the elements), so the
# "CPU" and "GPU" timings measure different amounts of work and are not
# directly comparable. Presumably try_gpu() falls back to the CPU when no GPU
# is available — confirm; in that case both matrices live on the same device.
x_gpu = np.random.uniform(size=(6000, 6000), ctx=d2l.try_gpu())

In [3]:
# Warm-up pass: run the workload once on each input before timing anything,
# then wait, so that any one-time start-up cost (presumably lazy
# initialisation — confirm) is not attributed to the timed runs below.
run(x_cpu)  # Warm-up both devices
run(x_gpu)
npx.waitall()

# Time the CPU workload on its own. d2l.Benchmark prints "<label>: <t> sec"
# when the with-block exits. waitall() is kept inside the timed region so the
# measurement covers the computation itself and not merely the issuing of the
# calls (assumes MXNet executes operations asynchronously — see MXNet docs).
with d2l.Benchmark('CPU time'):
    run(x_cpu)
    npx.waitall()

# Time the x_gpu workload on its own, using the same pattern.
with d2l.Benchmark('GPU time'):
    run(x_gpu)
    npx.waitall()

CPU time: 4.7144 sec
GPU time: 98.0142 sec


In [4]:
# Issue both workloads back to back and wait only once at the end. With no
# waitall() between the two run() calls, the backend is free to overlap them
# if it can (presumably only when x_cpu and x_gpu live on different devices —
# confirm). Compare the printed time with the sum of the two separate timings.
with d2l.Benchmark('CPU & GPU'):
    run(x_cpu)
    run(x_gpu)
    npx.waitall()

CPU & GPU: 97.4487 sec


In [5]:
def copy_to_cpu(x):
    """Copy every array in ``x`` to the CPU context.

    Args:
        x: Iterable of objects exposing ``copyto(ctx)``.

    Returns:
        A new list holding the result of ``copyto(npx.cpu())`` for each
        element of ``x``, in the same order.
    """
    cpu_copies = []
    for arr in x:
        # A fresh npx.cpu() context is requested for every element.
        cpu_copies.append(arr.copyto(npx.cpu()))
    return cpu_copies

# Step 1: time the computation alone. The results are kept in `y` so the next
# step has something to copy; waitall() keeps the wait for completion inside
# the timed region.
with d2l.Benchmark('Run on GPU'):
    y = run(x_gpu)
    npx.waitall()

# Step 2: time only the copy of the already-computed results to the CPU
# context. Because step 1 ended with waitall(), this measurement starts from
# finished results rather than from pending computation.
with d2l.Benchmark('Copy to CPU'):
    y_cpu = copy_to_cpu(y)
    npx.waitall()

Run on GPU: 91.6302 sec
Copy to CPU: 0.6711 sec


In [6]:
# Same two steps as the previous cell, but with a single waitall() at the end.
# With no wait between computing and copying, the backend may start copying
# early results while later products are still being computed (presumably —
# confirm against MXNet's scheduling behaviour). Compare the printed time with
# the sum of the two separate timings from the previous cell.
with d2l.Benchmark('Run on GPU and copy to CPU'):
    y = run(x_gpu)
    y_cpu = copy_to_cpu(y)
    npx.waitall()

Run on GPU and copy to CPU: 89.1339 sec
