In [2]:
from numba import cuda
import numpy as np
import math

In [3]:
print(cuda.gpus)

<Managed Device 0>


In [4]:
cuda.select_device(0)

<weakproxy at 0x7f71d966ab88 to Device at 0x7f71d45d52b0>

In [5]:
# CUDA kernel
@cuda.jit
def matmul(A, B, C):
    """Perform matrix multiplication of C = A * B
    """
    row, col = cuda.grid(2)
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp

In [72]:
# %%time
# Initialize the data arrays
A = np.random.randn(2400, 1200*3) # matrix containing all 3's
B = np.random.randn(3*1200, 2200) # matrix containing all 4's

In [75]:
%%time
# Copy the arrays to the device
A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)

# Allocate memory on the device for the result
C_global_mem = cuda.device_array((2400, 2200))

CPU times: user 61.9 ms, sys: 21 ms, total: 82.8 ms
Wall time: 81.5 ms


In [86]:
%%time
# Configure the blocks
threadsperblock = (32,32)
blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

CPU times: user 11 µs, sys: 4 µs, total: 15 µs
Wall time: 20 µs


In [87]:
blockspergrid

(75, 69)

In [73]:
(A.nbytes+B.nbytes+np.dot(A,B).nbytes)/1024/1024

166.6259765625

In [79]:
%%time
matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)

CPU times: user 655 µs, sys: 280 µs, total: 935 µs
Wall time: 677 µs


In [81]:
%%time
C = C_global_mem.copy_to_host()
print(C.shape)

(2400, 2200)
CPU times: user 17.2 ms, sys: 10.8 ms, total: 28 ms
Wall time: 25.7 ms


In [82]:
%%time
np.dot(A,B).shape

CPU times: user 6.92 s, sys: 162 ms, total: 7.08 s
Wall time: 1.88 s


(2400, 2200)

In [83]:
import ctypes

In [84]:
cudlib=ctypes.CDLL('libcuda.so')

In [85]:
cudlib.cuInit(0)

0

In [120]:
%time
@cuda.jit
def my_kernel_2D(io_array):
    x, y = cuda.grid(2)
    ### YOUR SOLUTION HERE
    if x<io_array.shape[0] and y<io_array.shape[1]:
        io_array[x,y]*=2

data = np.ones((16, 16))
data_glob=cuda.device_array((16,16))
threadsperblock = (16, 16)
blockspergrid_x = math.ceil(data.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(data.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)
my_kernel_2D[blockspergrid, threadsperblock](data_glob)
print(data_glob.copy_to_host())

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.11 µs
[[4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
 [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]]
