### CUDA with Numba

In [16]:
from numba import cuda
import numpy as np 
import math

In [17]:
# Host-side input for the CPU baseline: 65,536 ones.
x_host = np.ones(65536)

In [31]:
def host_increment_by_one(arr):
    """Add 1 to every element of ``arr`` in place, one index at a time.

    This is a plain Python loop executed on the host; the notebook uses it
    as the CPU baseline for the GPU kernels.

    Parameters
    ----------
    arr : mutable sequence supporting ``len()`` and item assignment
        Modified in place.

    Returns
    -------
    None
    """
    length = len(arr)
    for idx in range(length):
        arr[idx] += 1

In [19]:
%%timeit
# CPU baseline timing. Every timed loop increments x_host in place again,
# so x_host no longer holds ones after this cell has run.
host_increment_by_one(x_host)

10.9 ms ± 112 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [41]:
@cuda.jit
def device_increment_by_one(arr):
    """CUDA kernel: each thread adds 1 to a single element of a 1D array.

    The thread's index into ``arr`` comes from ``cuda.grid(1)``. Threads
    whose index is not below ``arr.size`` return without touching memory,
    so the launch grid may contain more threads than there are elements.
    """
    idx = cuda.grid(1)
    if idx >= arr.size:
        # Surplus thread from rounding the grid up; nothing to update.
        return
    arr[idx] += 1

In [42]:
# Fresh host data (the host benchmark incremented the previous x_host in place),
# then transfer it to the device.
x_host = np.ones(65536)
x_device = cuda.to_device(x_host)

# Launch configuration: round up so the blocks cover every element.
threadsperblock = 256
blockspergrid = math.ceil(x_device.size / threadsperblock)

In [43]:
%%timeit
device_increment_by_one[blockspergrid, threadsperblock](x_device)

32.7 µs ± 511 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [35]:
# Copy the device array back to the host and display it.
# NOTE: every kernel launch increments the array in place, so the values shown
# depend on how many launches ran before this cell.
x_device.copy_to_host()

array([2., 2., 2., ..., 2., 2., 2.])

In [36]:
# 2D input for the next kernel: a 256x256 array of ones transferred to the device.
x_device = cuda.to_device(np.ones((256, 256)))

In [47]:
@cuda.jit
def device_increment_a_2D_arr(arr):
    """CUDA kernel: each thread adds 1 to a single element of a 2D array.

    ``cuda.grid(2)`` supplies the thread's index pair, used as
    ``arr[row, col]``. Threads whose indices fall outside ``arr.shape``
    return without touching memory, so the launch grid may be larger than
    the array.
    """
    row, col = cuda.grid(2)
    if row >= arr.shape[0] or col >= arr.shape[1]:
        # Surplus thread from rounding the grid up; nothing to update.
        return
    arr[row, col] += 1

In [48]:
# 2D launch configuration: 16x16 threads per block, with the block count
# rounded up along each axis so the grid covers the whole array.
threadsperblock = (16, 16)
blockspergrid_x, blockspergrid_y = (
    math.ceil(extent / threads)
    for extent, threads in zip(x_device.shape, threadsperblock)
)
blockspergrid = (blockspergrid_x, blockspergrid_y)

In [49]:
%%timeit
device_increment_a_2D_arr[blockspergrid, threadsperblock](x_device)

265 µs ± 6.85 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


CuPy arrays implement `__cuda_array_interface__`, so they can be passed directly to `numba.cuda` kernels.

In [50]:
import cupy as cp

In [51]:
# Build a 256x256 CuPy array of ones and launch the Numba kernel on it,
# reusing the 2D launch configuration computed earlier.
x_device = cp.ones((256, 256))
device_increment_a_2D_arr[blockspergrid, threadsperblock](x_device)

In [52]:
# Print the array's type, then display its contents after the kernel launch.
print(type(x_device))
x_device

<class 'cupy.ndarray'>


array([[2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       ...,
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.]])