See [Cuda intro](https://lulaoshi.info/gpu/python-cuda/cuda-intro)

In [11]:
from numba import cuda

import numpy as np
from time import time

In [2]:
# check available gpu device in this computer
print(cuda.gpus)

<Managed Device 0>


# GPU print function

In [21]:
@cuda.jit
def gpu_add(a, b, result, n):
    """
    The inputs should be copied to_device or initialized on device, such as:    
        x_device = cuda.to_device(x)
        y_device = cuda.to_device(y)
        gpu_result = cuda.device_array(n)
    """
    idx = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if idx < n :
        result[idx] = a[idx] + b[idx]

def main():
    n = 20000000
    x = np.arange(n).astype(np.int32)
    y = 2 * x

    # copy data to device
    x_device = cuda.to_device(x)
    y_device = cuda.to_device(y)

    # initialize a space on the device to save the results get by GPU
    gpu_result = cuda.device_array(n)
    cpu_result = np.empty(n)

    threads_per_block = 512
    blocks_per_grid = int(n / threads_per_block) + 1

    start = time()
    gpu_add[blocks_per_grid, threads_per_block](x_device, y_device, gpu_result, n)
    cuda.synchronize()
    print("gpu vector add time " + str(time() - start))

    start = time()
    cpu_result = np.add(x, y)
    print("cpu vector add time " + str(time() - start))

    if (np.array_equal(cpu_result, gpu_result.copy_to_host())):
        print("result correct!")

if __name__ == "__main__":
    main()

gpu vector add time 0.11353397369384766
cpu vector add time 0.03344368934631348
result correct!


About grid, block and thread: \
<img src=http://aixingqiu-1258949597.cos.ap-beijing.myqcloud.com/2019-11-21-071231.png width='500'>