In [2]:
from numba import cuda, float32
import numpy as np
import time

In [4]:
# Seeded generator so the input (and every comparison that follows) is
# reproducible after a kernel restart.
rng = np.random.default_rng(0)

# Input signal: 1024**3 integers drawn from [1, 5000), stored as float32.
# Drawing them as int32 instead of the default integer type keeps the temporary
# integer array no larger than the float32 result it is converted into.
N = rng.integers(1, 5000, size=1024**3, dtype=np.int32).astype(np.float32)

# 13-tap mask.
M = np.array([0.2, 0.5, 0.3, 0.2, 0.4, 0.1, 0.3, 0.5,0.6,0.7,0.8,0.1,0.2], dtype=np.float32)

# CPU reference result. The mask is flipped before np.convolve so that the
# result is the sliding dot product of N with M in its original order, with the
# output trimmed to len(N) by mode='same'.
# perf_counter is the clock intended for measuring short elapsed intervals.
t0 = time.perf_counter()
P_true = np.convolve(N, np.flip(M), mode='same')
t1 = time.perf_counter()
print(t1-t0)

14.082029819488525


In [5]:
# Problem dimensions, plus a zero-filled host buffer that will receive the GPU result
Width = N.shape[0]
MaskWidth = M.shape[0]
P_cuda = np.zeros(N.shape, dtype=N.dtype)

In [6]:
# Naive 1D convolution kernel: one thread per output element, no shared-memory staging
@cuda.jit
def cuda_1D_convolution(N, M, P):
    """Write into each P[i] the mask-weighted sum of a window of N.

    The window for output ``i`` starts at ``i - len(M) // 2`` and spans
    ``len(M)`` elements. Window positions that fall outside ``N`` are skipped,
    so they contribute nothing to the sum.

    N -- input signal
    M -- mask (weights), applied in index order
    P -- output array; threads whose index is past its end do nothing
    """
    out_idx = cuda.grid(1)
    # Surplus threads in the last block have no output element to produce.
    if out_idx >= len(P):
        return

    window_start = out_idx - (len(M) // 2)
    acc = 0.0
    for k in range(len(M)):
        src = window_start + k
        # Skip taps that would read before the start or past the end of N.
        if src < 0 or src >= len(N):
            continue
        acc += N[src] * M[k]
    P[out_idx] = acc

In [24]:


cuda.select_device(0)  # Change to the device ID you want to use

# Copy the inputs to the GPU
d_N = cuda.to_device(N)
d_M = cuda.to_device(M)
# The kernel assigns every element of its output, so the output buffer is
# allocated directly on the device (same shape and dtype as P_cuda) instead of
# uploading a host array full of zeros.
d_P = cuda.device_array_like(P_cuda)

# Launch configuration: one thread per element of N, rounded up to whole blocks
threads_per_block = 1024
blocks_per_grid = (N.size + (threads_per_block - 1)) // threads_per_block
print(blocks_per_grid)

1048576


In [26]:
# Time one launch of the naive kernel. The copy of the result back into P_cuda
# is inside the timed region, so the printed figure is kernel time plus
# device-to-host transfer time.
# NOTE(review): the first launch of a @cuda.jit kernel presumably also pays for
# JIT compilation -- run the cell twice (or warm up) if that should be excluded.
# perf_counter is the clock intended for measuring short elapsed intervals.
t0 = time.perf_counter()
cuda_1D_convolution[blocks_per_grid, threads_per_block](d_N, d_M, d_P)
d_P.copy_to_host(P_cuda)
t1 = time.perf_counter()
print(t1-t0)



0.3680393695831299


In [18]:
# Compare the result copied back into P_cuda against the NumPy reference
verdict = "correct!" if np.allclose(P_true, P_cuda) else "incorrect."
print("\nYour result is {}".format(verdict))


Your result is correct!


In [12]:
### Second version: 1D convolution with shared-memory tiling

In [27]:
# Fresh host buffer for the result of the shared-memory kernel
P_cuda = np.zeros_like(N)
# NOTE(review): an earlier cell selects device 0; switching to another device in
# the same session may require cuda.close() first, and timings taken on a
# different GPU are not directly comparable -- confirm this ID is intended.
cuda.select_device(1)  # Change to the device ID you want to use

# Copy the inputs to the GPU
d_N = cuda.to_device(N)
# faster_1D_convolution takes only (N, P) and reads the mask from the
# module-level `mask`, so d_M is not passed to it.
d_M = cuda.to_device(M)
# The kernel assigns every element of its output, so the output buffer is
# allocated directly on the device (same shape and dtype as P_cuda) instead of
# uploading a host array full of zeros.
d_P = cuda.device_array_like(P_cuda)

# Launch configuration: one thread per element of N, rounded up to whole blocks
threads_per_block = 1024
blocks_per_grid = (N.size + (threads_per_block - 1)) // threads_per_block
print(blocks_per_grid)

1048576


In [29]:
# Independent host copy of the mask; the shared-memory kernel picks it up as a
# global through cuda.const.array_like(mask)
mask = np.copy(M)

In [30]:

# Tile geometry, fixed when the kernel is compiled: one slot per thread in the
# block plus a halo of `radius` elements on each side. The kernel indexes the
# tile up to tid + 2 * radius, so a tile of only threads_per_block elements
# would be written and read out of bounds.
_MASK_RADIUS = len(mask) // 2
_TILE_SIZE = threads_per_block + 2 * _MASK_RADIUS


# Define the 1D convolution kernel with shared memory optimization
@cuda.jit
def faster_1D_convolution(N, P):
    """Tiled 1D convolution that stages each block's input window in shared memory.

    Every block copies its ``blockDim.x`` elements of ``N`` into a shared tile,
    together with ``radius = len(mask) // 2`` halo elements on each side; slots
    whose source index lies outside ``N`` are set to zero. After a block-wide
    barrier each thread writes ``P[i] = sum_j tile[tid + j] * M[j]``, where
    ``M`` is a constant-array copy of the module-level ``mask``.

    Launch requirements: the block size must equal ``threads_per_block`` (the
    tile is sized from it) and must be at least ``radius`` so that the first and
    last ``radius`` threads can load the whole halo.

    N -- input signal
    P -- output array; threads whose index is past its end write nothing
    """
    M = cuda.const.array_like(mask)
    radius = len(M) // 2
    # threads_per_block interior slots + radius halo slots on each side.
    tile = cuda.shared.array(_TILE_SIZE, dtype=float32)

    # Calculate global index and start position for this thread
    tid = cuda.threadIdx.x
    i = cuda.blockIdx.x * cuda.blockDim.x + tid

    # Tile slot k holds N[block_start + k - radius], so this thread's own
    # element goes to slot tid + radius.
    if i < len(N):
        tile[tid + radius] = N[i]
    else:
        tile[tid + radius] = 0  # Padding with zero for out-of-bounds

    # Left halo: the first `radius` threads fetch the elements just before the block
    if tid < radius:
        if i >= radius:
            tile[tid] = N[i - radius]
        else:
            tile[tid] = 0  # Zero padding for left edge

    # Right halo: the last `radius` threads fetch the elements just after the block
    if tid >= cuda.blockDim.x - radius:
        if i + radius < len(N):
            tile[tid + 2 * radius] = N[i + radius]
        else:
            tile[tid + 2 * radius] = 0  # Zero padding for right edge

    # Every thread must reach this barrier (including those past the end of P)
    # so the tile is complete before anyone reads it.
    cuda.syncthreads()

    # Perform the convolution
    if i < len(P):  # Ensure we don't exceed the output length
        pValue = 0.0
        for j in range(len(M)):
            pValue += tile[tid + j] * M[j]
        P[i] = pValue

In [39]:
# Time one launch of the shared-memory kernel. The copy of the result back into
# P_cuda is inside the timed region, so the printed figure is kernel time plus
# device-to-host transfer time.
# NOTE(review): the first launch of a @cuda.jit kernel presumably also pays for
# JIT compilation -- run the cell twice (or warm up) if that should be excluded.
# perf_counter is the clock intended for measuring short elapsed intervals.
t0 = time.perf_counter()
faster_1D_convolution[blocks_per_grid, threads_per_block](d_N, d_P)
d_P.copy_to_host(P_cuda)
t1 = time.perf_counter()
print(t1-t0)

0.37580442428588867


In [37]:
# Compare the result copied back into P_cuda against the NumPy reference
verdict = "correct!" if np.allclose(P_true, P_cuda) else "incorrect."
print("\nYour result is {}".format(verdict))


Your result is correct!


In [3]:
from numba import cuda

# Handle to the GPU that is currently active
device = cuda.get_current_device()

# Launch limits reported by the device, plus its per-block shared-memory limit
max_threads_per_block = device.MAX_THREADS_PER_BLOCK
max_block_dim_x, max_block_dim_y, max_block_dim_z = (
    device.MAX_BLOCK_DIM_X,
    device.MAX_BLOCK_DIM_Y,
    device.MAX_BLOCK_DIM_Z,
)
max_grid_dim_x, max_grid_dim_y, max_grid_dim_z = (
    device.MAX_GRID_DIM_X,
    device.MAX_GRID_DIM_Y,
    device.MAX_GRID_DIM_Z,
)
total_shared_memory = device.MAX_SHARED_MEMORY_PER_BLOCK

# One report line per limit
for report_line in (
    f"Max threads per block: {max_threads_per_block}",
    f"Max block dimensions: {max_block_dim_x}, {max_block_dim_y}, {max_block_dim_z}",
    f"Max grid dimensions: {max_grid_dim_x}, {max_grid_dim_y}, {max_grid_dim_z}",
    f"Total shared memory per block: {total_shared_memory} bytes",
):
    print(report_line)


Max threads per block: 1024
Max block dimensions: 1024, 1024, 64
Max grid dimensions: 2147483647, 65535, 65535
Total shared memory per block: 49152 bytes
