In [1]:
from numba import cuda
import numpy as np 
import math



In [23]:
# Host-side input: a 1-D float64 array of 2**20 - 1 ones.
x_host = np.ones(1048575)

In [24]:
def host_inc_one(arr):
    """Add 1 in place to every entry along the first axis of ``arr``.

    This is the CPU baseline: a plain Python loop with one indexed update
    per step, with no vectorization.

    Parameters
    ----------
    arr : array-like exposing ``.shape`` and integer item assignment.
        Modified in place.

    Returns
    -------
    None
    """
    length = arr.shape[0]
    idx = 0
    while idx < length:
        arr[idx] += 1
        idx += 1

In [25]:
%%timeit
# CPU baseline timing. host_inc_one updates x_host in place, so every timed
# loop increments the array again; x_host no longer holds ones afterwards.
host_inc_one(x_host)

157 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
@cuda.jit
def device_inc_one(arr):
    """CUDA kernel: add 1 in place to each element of a 1-D array.

    Each thread updates the single element at its global x index, i.e.
    thread index within the block plus block index times block width.

    Parameters
    ----------
    arr : array indexed by one integer; modified in place.
    """
    idx = cuda.grid(1)
    # The launch may contain more threads than elements; the extras do nothing.
    if idx < arr.size:
        arr[idx] += 1

In [34]:
# Copy the current contents of x_host to the GPU and report the element count.
x_device = cuda.to_device(x_host)
print(x_device.size)

# One thread per element: round the block count up so a partial final block
# still covers the tail of the array.
threadsperblock = 256
blockspergrid = -(-x_device.size // threadsperblock)  # ceiling division

1048575


In [33]:
%%timeit
device_inc_one[blockspergrid,threadsperblock](x_device)

39.5 μs ± 1.5 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [30]:
# Copy the device array back to host memory and display it. The values reflect
# every kernel launch applied to x_device so far, including each %%timeit loop.
x_device.copy_to_host()

array([81193., 81193., 81193., ..., 81193., 81193., 81193.])

In [35]:
# Rebind x_device to a 256x256 device array of ones for the 2-D kernel.
x_device = cuda.to_device(np.ones((256, 256)))

In [36]:
@cuda.jit
def device_inc_one_2d(arr):
    """CUDA kernel: add 1 in place to each element of a 2-D array.

    Each thread takes its global (x, y) grid position and uses it as the
    (first-axis, second-axis) index into ``arr``.

    Parameters
    ----------
    arr : array indexed by two integers; modified in place.
    """
    row, col = cuda.grid(2)
    # Threads that fall outside the array on either axis have nothing to do.
    if row >= arr.shape[0] or col >= arr.shape[1]:
        return
    arr[row, col] += 1

In [46]:
# 8x8 threads per block; round the block count up on each axis so the grid
# covers the whole array even when an extent is not a multiple of 8.
threadsperblock = (8, 8)
blockspergrid_x = (x_device.shape[0] + threadsperblock[0] - 1) // threadsperblock[0]
blockspergrid_y = (x_device.shape[1] + threadsperblock[1] - 1) // threadsperblock[1]
blockspergrid = (blockspergrid_x, blockspergrid_y)



In [47]:
%%timeit
device_inc_one_2d[blockspergrid,threadsperblock](x_device)

20.3 μs ± 500 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
