In [None]:
from numba import cuda

@cuda.jit
def add_kernel(x, y, out):
    """Element-wise addition kernel: ``out[i] = x[i] + y[i]``.

    Each thread handles the single element located at its flattened
    one-dimensional grid index.

    Parameters
    ----------
    x, y : array
        Input operands, read at the thread's grid index (assumed 1D and at
        least as long as ``out``).
    out : array
        Destination array, written in place.
    """
    idx = cuda.grid(1)
    # A launch can contain more threads than there are elements (the grid is
    # sized in whole blocks), so surplus threads must not touch memory.
    if idx < out.size:
        out[idx] = x[idx] + y[idx]

In [None]:
import numpy as np

# Problem size and host-side inputs.
n = 4096
x = np.arange(n, dtype=np.int32)  # [0, 1, ..., n-1]
y = np.ones_like(x)               # all ones, same shape and dtype as x

# Copy the inputs to the GPU and allocate an output buffer with the same
# shape and dtype as d_x.
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_out = cuda.device_array_like(d_x)

# Launch configuration: derive the block count from n (ceiling division)
# instead of hard-coding it, so the grid always covers every element.
# NOTE: when n is not a multiple of threads_per_block the last block holds
# surplus threads, so the kernel must bounds-check its index.
threads_per_block = 128
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

In [None]:
# Launch the kernel: the [grid size, block size] pair precedes the arguments.
add_kernel[blocks_per_grid, threads_per_block](d_x, d_y, d_out)
cuda.synchronize()  # wait for the kernel to finish before reading results

# Copy the result back once, verify it against the NumPy reference, then show it.
h_out = d_out.copy_to_host()
assert np.array_equal(h_out, x + y), "add_kernel result differs from x + y"
print(h_out) # Should be [1...4096]

In [None]:
@cuda.jit
def square_device(a, out):
    """Element-wise square kernel: ``out[i] = a[i] ** 2``.

    Each thread handles the single element located at its flattened
    one-dimensional grid index.

    Parameters
    ----------
    a : array
        Input values, read at the thread's grid index (assumed 1D and at
        least as long as ``out``).
    out : array
        Destination array, written in place.
    """
    idx = cuda.grid(1)
    # The grid is sized in whole blocks and may overshoot the array length;
    # threads past the end must do nothing.
    if idx < out.size:
        out[idx] = a[idx]**2

In [None]:
# Number of elements to square.
n = 4096

# Host input [0, 1, ..., n-1] in NumPy's default integer dtype, plus the
# CPU-computed squares used as the reference result.
a = np.arange(n)
out = np.square(a)

In [None]:
# Move the input to the GPU and allocate a float32 output buffer of length n.
d_a = cuda.to_device(a)
d_out = cuda.device_array(shape=(n,), dtype=np.float32)

# Launch configuration: derive the block count from n (ceiling division) so
# the grid always covers every element rather than relying on a fixed count.
threads = 32
blocks = (n + threads - 1) // threads

square_device[blocks, threads](d_a, d_out)
# Kernel launches are asynchronous; wait here so the work is complete (and any
# launch error surfaces) before the result is used.
cuda.synchronize()
# d_out remains a device array; call d_out.copy_to_host() where host values
# are needed.

In [None]:
from numpy import testing
# Copy the GPU result to the host explicitly before comparing it with the
# NumPy reference, rather than relying on implicit device-to-host conversion.
testing.assert_almost_equal(d_out.copy_to_host(), out)
print()  # emits a blank line; only reached when the assertion above passes