In [23]:
import numpy as np
from numba import cuda

In [24]:
#INITIALIZER
# Fixed seed so the generated data -- and every output derived from it --
# is identical on each Restart & Run All.
SEED = 0
n = 40006 #total elements in an integer array

a = range(1000)  # candidate values: the integers 0..999
rng = np.random.default_rng(SEED)
# Draw n values (with replacement) from `a` and store them as 32-bit ints.
original_array = rng.choice(a, size=n).astype('int32')

print(original_array)

[695 169 204 ... 200 514  18]


In [25]:
# Launch configuration: 10 blocks of 64 threads each (640 threads in total).
threads_per_block = 64
blocks = 10

# Device copy of the input data.
arr = cuda.to_device(original_array)

# Two-element device buffer, both slots pre-loaded with the first input
# element, which the kernel's atomic max (slot 0) and min (slot 1) fold into.
first_value = original_array[0]
res = cuda.to_device(np.array([first_value] * 2, dtype=np.int32))

In [26]:
def cpu_func_(arr, n):
    """Return the maximum and minimum of the first ``n`` elements of ``arr``.

    Parameters
    ----------
    arr : indexable sequence of mutually comparable values
        For example a list or a 1-D NumPy array.
    n : int
        Number of leading elements to scan; must be at least 1.

    Returns
    -------
    list
        ``[maximum, minimum]`` -- the maximum comes first.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there is nothing to take extrema of).
    IndexError
        If ``n`` exceeds the number of elements available in ``arr``.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Seed both extrema from the first scanned element. Seeding the minimum
    # from arr[1] breaks on a single-element input and, when n == 1, lets an
    # element outside the scanned range leak into the result.
    lowest = highest = arr[0]
    for i in range(1, n):
        value = arr[i]
        if value < lowest:
            lowest = value
        elif value > highest:
            # Safe as an elif: lowest <= highest always holds here, so a value
            # below `lowest` can never also exceed `highest`.
            highest = value
    return [highest, lowest]

In [27]:
@cuda.jit
def gpu_func_(sol, arr, n):
    """CUDA kernel: fold the max/min of the first ``n`` elements of ``arr`` into ``sol``.

    Each thread scans the elements ``idx, idx + stride, idx + 2*stride, ...``
    (``idx`` = its global thread index, ``stride`` = total threads in the
    grid), then merges its local extrema into ``sol`` with atomics.

    Parameters
    ----------
    sol : device array with at least two elements
        ``sol[0]`` receives the maximum and ``sol[1]`` the minimum. The
        atomics combine with whatever ``sol`` already holds, so the caller
        must pre-load both slots (e.g. with an element of ``arr``).
    arr : device array holding the input values.
    n : int
        Number of leading elements of ``arr`` to scan.
    """
    idx = cuda.grid(1)
    stride = cuda.gridsize(1)

    # Threads beyond the data have nothing to scan. Without this guard they
    # would read arr[idx] out of bounds and push that value into the atomics
    # whenever the launch has more threads than elements.
    if idx >= n:
        return

    lowest = arr[idx]
    highest = arr[idx]
    # arr[idx] is already accounted for, so continue one stride further on.
    for i in range(idx + stride, n, stride):
        value = arr[i]
        if value < lowest:
            lowest = value
        elif value > highest:
            highest = value

    cuda.atomic.max(sol, 0, highest)  # sol[0] holds the global maximum
    cuda.atomic.min(sol, 1, lowest)   # sol[1] holds the global minimum

In [28]:
# CPU reference result; cpu_func_ returns [max, min].
max1_, min1_ = cpu_func_(original_array, original_array.size)

# Run the kernel on the device copy and wait for it to finish before the
# result buffer is read back.
gpu_func_[blocks, threads_per_block](res, arr, original_array.size)
cuda.synchronize()

# The result buffer holds the maximum in slot 0 and the minimum in slot 1.
max2_, min2_ = res[0], res[1]

# FOR CPU

In [29]:
%timeit cpu_func_(original_array, len(original_array))

8.96 ms ± 52.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# FOR GPU

In [30]:
%timeit gpu_func_[blocks, threads_per_block](res, arr, len(original_array))

133 µs ± 482 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# COMPARING RESULTS

In [31]:
# The CPU and GPU maxima should agree.
max1_ == max2_

True

In [32]:
# The CPU and GPU minima should agree.
min1_ == min2_

True

In [33]:
# Show each extremum as "<label> <CPU value> <GPU value>".
for label, cpu_value, gpu_value in (
    ("MAXIMUM = ", max1_, max2_),
    ("MINIMUM = ", min1_, min2_),
):
    print(label, cpu_value, gpu_value)

MAXIMUM =  999 999
MINIMUM =  0 0
