In [45]:
import numpy as np
from numba import cuda, types as numba_types

In [60]:
#INITIALIZER
x = 1024
#x = 20480
n = x * x #total elements in an integer array

original_array = np.arange(n).reshape((x, x)).astype(np.int32)
cpu_res = np.zeros_like(original_array)

In [59]:
threads_per_block = 64
blocks = int(x / threads_per_block)

arr = cuda.to_device(original_array)
gpu_res = cuda.to_device(np.zeros_like(original_array))

In [33]:
def cpu_func_(sol, arr, m, n):
    for i in range(m): #no. of rows
        for j in range(n): #no. of half columns
                       sol[i][j] = arr[i][n - j - 1]

In [66]:
@cuda.jit
def gpu_func_(sol, arr, n):
    share = cuda.shared.array((1, 65), numba_types.int32) #shared memory bank conflict handled
    j, i = cuda.grid(2)
    share[1][cuda.threadIdx.x] = arr[i][j]
    cuda.syncthreads()
    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.x
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.y
    sol[i][j] = share[n - cuda.threadIdx.x]

In [67]:
cpu_func_(cpu_res, original_array, x, x)

In [68]:
gpu_func_[blocks, threads_per_block](gpu_res, arr, x)
cuda.synchronize()

sum_from_gpu = res[0]

CudaAPIError: [1] Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE

# FOR CPU

In [69]:
%timeit cpu_func_(cpu_res, original_array, x, x)

537 ms ± 6.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# FOR GPU

In [73]:
%timeit gpu_func_[blocks, threads_per_block](gpu_res, arr, x)

CudaAPIError: [1] Call to cuLaunchKernel results in CUDA_ERROR_INVALID_VALUE

# COMPARING RESULTS

In [74]:
cpu_res == gpu_res

array([[False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [77]:
print("CPU swapped array = \n", cpu_res)
print("GPU swapped array = \n", gpu_res.copy_to_host())

CPU swapped array = 
 [[   1023    1022    1021 ...       2       1       0]
 [   2047    2046    2045 ...    1026    1025    1024]
 [   3071    3070    3069 ...    2050    2049    2048]
 ...
 [1046527 1046526 1046525 ... 1045506 1045505 1045504]
 [1047551 1047550 1047549 ... 1046530 1046529 1046528]
 [1048575 1048574 1048573 ... 1047554 1047553 1047552]]
GPU swapped array = 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
