In [1]:
!nvidia-smi

Thu Jun 27 01:09:10 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.14       Driver Version: 430.14       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce 920MX       Off  | 00000000:03:00.0 Off |                  N/A |
| N/A   45C    P8    N/A /  N/A |      5MiB /  2004MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
!glxinfo|egrep "OpenGL vendor|OpenGL renderer*"

OpenGL vendor string: NVIDIA Corporation
OpenGL renderer string: GeForce 920MX/PCIe/SSE2


In [1]:
from numba import cuda
import numpy as np
import math

In [2]:
print(cuda.gpus)

<Managed Device 0>


In [3]:
cuda.select_device(0)

<weakproxy at 0x7f5a953a99f8 to Device at 0x7f5a69196390>

In [4]:
# CUDA kernel
@cuda.jit
def matmul(A, B, C):
    """Perform matrix multiplication of C = A * B
    """
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp

In [5]:
%%time
# Initialize the data arrays
A = np.random.randn(2400, 1200) # matrix containing all 3's
B = np.random.randn(1200, 2200) # matrix containing all 4's

CPU times: user 171 ms, sys: 100 µs, total: 172 ms
Wall time: 171 ms


In [6]:
(A.nbytes+B.nbytes+np.dot(A,B).nbytes)/1024/1024

82.3974609375

In [7]:
%%time
# Copy the arrays to the device
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# Allocate memory on the device for the result
C_global_mem = cuda.device_array((2400, 2200))

CPU times: user 21.6 ms, sys: 13.2 ms, total: 34.8 ms
Wall time: 33.2 ms


In [8]:
%%time
# Configure the blocks
threadsperblock = (30, 30)
blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

CPU times: user 14 µs, sys: 1 µs, total: 15 µs
Wall time: 20 µs


In [9]:
%%time
np.dot(A,B).shape

CPU times: user 774 ms, sys: 75.2 ms, total: 849 ms
Wall time: 240 ms


(2400, 2200)

In [11]:
%%time
matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)

CPU times: user 708 µs, sys: 65 µs, total: 773 µs
Wall time: 708 µs


In [12]:
%%time
C = C_global_mem.copy_to_host()
print(C.shape)

(2400, 2200)
CPU times: user 24.3 ms, sys: 7.09 ms, total: 31.4 ms
Wall time: 29.9 ms
