In [1]:
# Step 1: Install required libraries
!pip install numba
!pip install numpy




In [2]:

# Step 2: Import necessary libraries
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import math
from numba import jit, njit, vectorize, cuda


In [3]:

# Step 3: Setting up the environment for GPU
# Find the locations of the necessary libraries
!find / -iname 'libdevice'
!find / -iname 'libnvvm.so'



/usr/local/lib/python3.10/dist-packages/jaxlib/cuda/nvvm/libdevice
/usr/local/cuda-12.2/nvvm/libdevice
find: ‘/proc/59/task/59/net’: Invalid argument
find: ‘/proc/59/net’: Invalid argument
/usr/local/cuda-12.2/nvvm/lib64/libnvvm.so
find: ‘/proc/59/task/59/net’: Invalid argument
find: ‘/proc/59/net’: Invalid argument


In [4]:
# Set the environment variables (adjust these paths to match the output of the
# find commands above).
import os

# Root of the CUDA toolkit installation reported by the search.
CUDA_TOOLKIT_DIR = "/usr/local/cuda-12.2"

# NVVM must name the libnvvm shared library itself, while LIBDEVICE names the
# libdevice directory. Both are taken from the same toolkit so their versions
# match.
os.environ['NUMBAPRO_NVVM'] = CUDA_TOOLKIT_DIR + "/nvvm/lib64/libnvvm.so"
os.environ['NUMBAPRO_LIBDEVICE'] = CUDA_TOOLKIT_DIR + "/nvvm/libdevice"

# Current Numba releases locate the toolkit through CUDA_HOME; the NUMBAPRO_*
# names above are legacy and are kept only for older installations.
# setdefault leaves an already-configured CUDA_HOME untouched.
os.environ.setdefault('CUDA_HOME', CUDA_TOOLKIT_DIR)



In [5]:
# Step 4: Vector Addition on GPUs using Numba
@vectorize(['int64(int64, int64)'], target='cuda')
def add_ufunc_gpu(x, y):
    """Return the sum of two int64 scalars; Numba compiles this as a CUDA ufunc."""
    total = x + y
    return total

# Demo inputs: x counts 0..9 and y is twice x, so each output element is 3 * x.
x = np.arange(10)
y = x * 2
print("GPU Vector Addition Output: ", add_ufunc_gpu(x, y))



GPU Vector Addition Output:  [ 0  3  6  9 12 15 18 21 24 27]




In [6]:
# Step 5: Timing the GPU function
# x and y are still the 10-element host NumPy arrays created in the previous
# step. NOTE(review): per Numba's documentation a CUDA ufunc called with host
# arrays copies the inputs to the device and the result back on every call, so
# this figure presumably includes transfer and launch overhead, not only the
# addition itself.
%timeit add_ufunc_gpu(x, y)



1.53 ms ± 649 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# Step 6: Compare with CPU version
@vectorize(['int64(int64, int64)'], target='cpu')
def add_ufunc_cpu(x, y):
    return x + y

%timeit add_ufunc_cpu(x, y)



1.56 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [8]:
# Step 7: Writing CUDA Kernels
@cuda.jit
def add_kernel(x, y, out):
    """Store x[i] + y[i] in out[i] for every index i below x.shape[0].

    Each thread starts at its own global index and then advances by the total
    number of threads in the grid, so the launch configuration does not have
    to match the array length.
    """
    # Global index of this thread: offset of its block plus its position in the block.
    first = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Total number of threads across the whole 1D grid.
    step = cuda.gridDim.x * cuda.blockDim.x

    for idx in range(first, x.shape[0], step):
        out[idx] = x[idx] + y[idx]

# Host-side data: x = 0..n-1 as float32, y = 2 * x, and an uninitialised output buffer.
n = 100000
x = np.arange(n).astype(np.float32)
y = x * 2
out = np.empty_like(x)

# Launch configuration: 30 blocks of 128 threads each.
blocks_per_grid = 30
threads_per_block = 128

add_kernel[blocks_per_grid, threads_per_block](x, y, out)
print(out[:10])



[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27.]




In [9]:
# Step 8: Using Numba helper functions for CUDA Kernels
@cuda.jit
def add_kernel_simplified(x, y, out):
    """Store x[i] + y[i] in out[i] for every index i below x.shape[0].

    cuda.grid(1) supplies this thread's starting index and cuda.gridsize(1)
    the step between the indices it handles, replacing the manual
    threadIdx/blockIdx/blockDim/gridDim arithmetic.
    """
    for idx in range(cuda.grid(1), x.shape[0], cuda.gridsize(1)):
        out[idx] = x[idx] + y[idx]



In [10]:
# Step 9: Memory Management
x_device = cuda.to_device(x)
y_device = cuda.to_device(y)
out_device = cuda.device_array(shape=(n), dtype=np.float32)

%timeit add_kernel_simplified[blocks_per_grid, threads_per_block](x_device, y_device, out_device)
%timeit add_kernel_simplified[blocks_per_grid, threads_per_block](x, y, out)

out = out_device.copy_to_host()
print("Output from device array: ", out[:10])





57.1 µs ± 9.66 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)




2.39 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Output from device array:  [ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27.]


In [11]:
# Step 10: Scaling up the problem size
n = 100000
x = np.arange(n).astype(np.float32)
y = 2 * x
x_device = cuda.to_device(x)
y_device = cuda.to_device(y)
out_device = cuda.device_array(shape=(n), dtype=np.float32)

%timeit add_kernel_simplified[blocks_per_grid, threads_per_block](x_device, y_device, out_device)
%timeit add_kernel_simplified[blocks_per_grid, threads_per_block](x, y, out)


53.6 µs ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.99 ms ± 142 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
