In [8]:
import numpy as np
from numba import cuda, njit

# CUDA kernel for matrix multiplication
@cuda.jit
def matrix_multi_kernel(A, B, C):
    """Compute one element of C = A @ B per thread.

    Each thread reads its (row, col) pair from the 2-D launch grid and stores
    the dot product of row ``row`` of A with column ``col`` of B into
    ``C[row, col]``. A thread whose pair lies outside C returns without
    touching memory, so the launch grid is allowed to overshoot the output.
    """
    row, col = cuda.grid(2)

    # Guard clause: the grid need not divide C's shape evenly.
    if row >= C.shape[0] or col >= C.shape[1]:
        return

    inner = A.shape[1]
    acc = 0.0
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

def cuda_setup(A, B, num_threads_per_block=(16, 16)):
    """Choose a 2-D launch configuration that covers the product A @ B.

    Parameters
    ----------
    A, B : objects exposing ``shape``
        Left and right operands. Only ``A.shape[0]`` (rows of the result) and
        ``B.shape[1]`` (columns of the result) are read.
    num_threads_per_block : sequence of two positive ints, optional
        Threads per block along x (rows) and y (columns). Defaults to
        16 in each direction, i.e. 256 threads per block.

    Returns
    -------
    tuple
        ``(num_blocks_per_grid, num_threads_per_block)``, each a 2-tuple of
        ints. The grid is the smallest one whose threads cover every element
        of the result; it may overshoot when the shape is not a multiple of
        the block size.

    Raises
    ------
    ValueError
        If either block dimension is not a positive integer.
    """
    threads_x, threads_y = (int(t) for t in num_threads_per_block)
    if threads_x <= 0 or threads_y <= 0:
        raise ValueError(
            "num_threads_per_block must contain positive integers, got "
            "{!r}".format(tuple(num_threads_per_block))
        )

    # Integer ceiling division: -(-n // t) == ceil(n / t) without going
    # through floating point.
    # number of blocks along x to cover the rows of A
    num_blocks_x = -(-A.shape[0] // threads_x)
    # number of blocks along y to cover the columns of B
    num_blocks_y = -(-B.shape[1] // threads_y)

    # total number of blocks per grid
    num_blocks_per_grid = (num_blocks_x, num_blocks_y)

    return num_blocks_per_grid, (threads_x, threads_y)


def cuda_allocate(A, B):
    """Stage both operands on the GPU and reserve a buffer for their product.

    A and B are copied to the device in that order, then a device array of
    shape ``(A.shape[0], B.shape[1])`` is allocated for the result. No dtype
    is passed for the result buffer, so it uses ``cuda.device_array``'s
    default.

    Returns the three device arrays as ``(d_A, d_B, d_C)``.
    """
    # host -> device copies of the inputs
    device_arrays = [cuda.to_device(host_array) for host_array in (A, B)]

    # device-side buffer for the result
    out_shape = (A.shape[0], B.shape[1])
    device_arrays.append(cuda.device_array(out_shape))

    return tuple(device_arrays)


def cuda_matrix_mult(A, B):
    """Multiply two 2-D matrices on the GPU and return the product on the host.

    Parameters
    ----------
    A : 2-D array, shape (m, k)
    B : 2-D array, shape (k, n)

    Returns
    -------
    C : host array of shape (m, n), copied back from the device.

    Raises
    ------
    ValueError
        If either operand is not 2-D, or if ``A.shape[1] != B.shape[0]``.
    """
    # Validate before touching the GPU. The kernel only guards the output
    # indices (i, j); its inner loop runs k over A.shape[1] and indexes
    # B[k, j] unchecked, so mismatched shapes would index past B's rows or
    # silently ignore some of them.
    if len(A.shape) != 2 or len(B.shape) != 2:
        raise ValueError(
            "cuda_matrix_mult expects 2-D operands, got shapes "
            "{} and {}".format(tuple(A.shape), tuple(B.shape))
        )
    if A.shape[1] != B.shape[0]:
        raise ValueError(
            "shapes {} and {} not aligned: {} (dim 1) != {} (dim 0)".format(
                tuple(A.shape), tuple(B.shape), A.shape[1], B.shape[0]
            )
        )

    # setting up dimensions of grid and block
    num_blocks_per_grid, num_threads_per_block = cuda_setup(A, B)

    # memory allocation on GPU
    d_A, d_B, d_C = cuda_allocate(A, B)

    # launching CUDA kernel
    matrix_multi_kernel[num_blocks_per_grid, num_threads_per_block](d_A, d_B, d_C)

    # Copying result back to the host
    return d_C.copy_to_host()

In [9]:
import numpy as np
# Seeded generator so the operands, and therefore any GPU/CPU mismatch found
# later, are the same on every Restart & Run All.
rng = np.random.default_rng(42)
A = rng.random((32, 32))
B = rng.random((32, 32))
print(np.__version__)

1.26.2


In [10]:
import time

# NOTE(review): the saved run of this cell stopped with NvvmSupportError (see
# the recorded output below). That message points at the CUDA toolkit
# installation, not at this code, so it is not addressed here.

# GPU
# Warm-up launch: Numba compiles a @cuda.jit kernel the first time it is
# called, so an untimed call keeps one-off JIT cost out of the measurement.
cuda_matrix_mult(A, B)

# perf_counter is a monotonic, high-resolution clock meant for benchmarking;
# time.time() follows the wall clock and can jump.
start_time_gpu = time.perf_counter()
result_gpu = cuda_matrix_mult(A, B)
end_time_gpu = time.perf_counter()
gpu_time = end_time_gpu - start_time_gpu
print("GPU: {} \n time: {}".format(result_gpu, gpu_time))

# CPU
start_time_cpu = time.perf_counter()
result_cpu = np.dot(A, B)
end_time_cpu = time.perf_counter()
cpu_time = end_time_cpu - start_time_cpu
print("CPU: {} \n time: {}".format(result_cpu, cpu_time))


# Check if results match
np.testing.assert_allclose(result_gpu, result_cpu, rtol=1e-5)

NvvmSupportError: No supported GPU compute capabilities found. Please check your cudatoolkit version matches your CUDA version.

In [5]:
import numba
# Record the installed Numba version next to the GPU details reported below.
print(numba.__version__)
import torch
def gpu_info():
    """Summarise the CUDA devices visible to PyTorch.

    Returns
    -------
    dict
        ``"Number of Devices: "``  -> ``torch.cuda.device_count()``
        ``"Device name: "``        -> name of the current device
        ``"Device properties: "``  -> properties of device 0

        When no device is visible the count is 0 and the other two entries
        are ``None``, so the function can be called on a CPU-only machine.
    """
    device_count = torch.cuda.device_count()
    if device_count == 0:
        # Querying a name or properties without any device raises, so report
        # the absence explicitly instead.
        return {"Number of Devices: ": 0,
                "Device name: ": None,
                "Device properties: ": None}

    # A plain device index is enough here; no torch.cuda.device context
    # manager object is needed just to name device 0.
    return {"Number of Devices: ": device_count,
            "Device name: ": torch.cuda.get_device_name(),
            "Device properties: ": torch.cuda.get_device_properties(0)}

# Bare last expression: the returned dict is displayed as the cell's output.
gpu_info()

0.58.1


{'Number of Devices: ': 1,
 'Device name: ': 'NVIDIA GeForce RTX 3060 Laptop GPU',
 'Device properties: ': _CudaDeviceProperties(name='NVIDIA GeForce RTX 3060 Laptop GPU', major=8, minor=6, total_memory=6143MB, multi_processor_count=30)}

In [6]:
from numba import cuda 
# Ask Numba which CUDA device is current and print its repr.
gpu = cuda.get_current_device()
print(gpu)

<CUDA device 0 'b'NVIDIA GeForce RTX 3060 Laptop GPU''>


In [7]:
import numba.cuda

# List every CUDA device Numba reports, with its name and compute capability.
gpus = numba.cuda.list_devices()
for gpu in gpus:
    print(
        f"GPU Name: {gpu.name}",
        f"Compute Capability: {gpu.compute_capability}",
        sep="\n",
    )

GPU Name: b'NVIDIA GeForce RTX 3060 Laptop GPU'
Compute Capability: (8, 6)
