In [1]:
from numba import cuda
import numpy as np

@cuda.jit
def histogram_kernel(image, histogram):
    """
    CUDA kernel for calculating the histogram of an image.
    Each thread calculates the histogram for one pixel and atomically increments the histogram bin.
    """
    # Calculate the position of the current thread in the grid
    x, y = cuda.grid(2)
    if x < image.shape[0] and y < image.shape[1]:
        # Get the pixel value
        pixel_value = image[x, y]
        # Atomically increment the corresponding histogram bin
        cuda.atomic.add(histogram, pixel_value, 1)

def compute_histogram(image):
    """
    Compute the histogram of an image using CUDA.
    """
    # Assuming image pixel values range from 0 to 255
    histogram = np.zeros(256, dtype=np.uint32)
    image = np.asarray(image)
    
    # Allocate memory on the device
    d_image = cuda.to_device(image)
    d_histogram = cuda.to_device(histogram)
    
    # Configure the blocks
    threadsperblock = (16, 16)
    blockspergrid_x = int(np.ceil(image.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(np.ceil(image.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    
    # Launch the kernel
    histogram_kernel[blockspergrid, threadsperblock](d_image, d_histogram)
    
    # # Copy the histogram back to the host
    histogram = d_histogram.copy_to_host()
    
    return histogram

from PIL import Image
image = Image.open("/home/rjia/Pictures/activate_call_button.png")
# image.show()
compute_histogram(image)

TypingError: Failed in cuda mode pipeline (step: native lowering)
[1mFailed in nopython mode pipeline (step: nopython frontend)
[1m[1m[1m[1m- Resolution failure for literal arguments:
[1m[1mNo implementation of function Function(<function numpy_take at 0x7fb561f458b0>) found for signature:

 >>> numpy_take(array(uint8, 1d, C), Literal[int](0))

There are 2 candidate implementations:
[1m    - Of which 2 did not match due to:
    Overload in function 'numpy_take': File: numba/np/arrayobj.py: Line 4638.
      With argument(s): '(array(uint8, 1d, C), int64)':[0m
[1m     Rejected as the implementation raised a specific error:
       NumbaTypeError: Failed in cuda mode pipeline (step: native lowering)
     [1mFailed in nopython mode pipeline (step: nopython frontend)
     [1mOnly accept returning of array passed into the function as argument[0m
     [0m[1mDuring: lowering "$40call_method.2 = call $38load_method.1(func=$38load_method.1, args=[], kws=(), vararg=None, varkwarg=None, target=None)" at /home/rjia/.local/lib/python3.8/site-packages/numba/np/arrayobj.py (4646)[0m[0m
  raised from /home/rjia/.local/lib/python3.8/site-packages/numba/core/typed_passes.py:155
[0m[0m
[0m[1m- Resolution failure for non-literal arguments:
[1m[1mNo implementation of function Function(<function numpy_take at 0x7fb561f458b0>) found for signature:

 >>> numpy_take(array(uint8, 1d, C), int64)

There are 2 candidate implementations:
[1m   - Of which 2 did not match due to:
   Overload in function 'numpy_take': File: numba/np/arrayobj.py: Line 4638.
     With argument(s): '(array(uint8, 1d, C), int64)':[0m
[1m    Rejected as the implementation raised a specific error:
      NumbaTypeError: Failed in cuda mode pipeline (step: native lowering)
    [1mFailed in nopython mode pipeline (step: nopython frontend)
    [1mOnly accept returning of array passed into the function as argument[0m
    [0m[1mDuring: lowering "$40call_method.2 = call $38load_method.1(func=$38load_method.1, args=[], kws=(), vararg=None, varkwarg=None, target=None)" at /home/rjia/.local/lib/python3.8/site-packages/numba/np/arrayobj.py (4646)[0m[0m
  raised from /home/rjia/.local/lib/python3.8/site-packages/numba/core/typed_passes.py:155
[0m[0m
[0m[0m
[0m[1mDuring: resolving callee type: BoundFunction((<class 'numba.core.types.npytypes.Array'>, 'take') for array(uint8, 1d, C))[0m
[0m[1mDuring: typing of call at /home/rjia/.local/lib/python3.8/site-packages/numba/np/arrayobj.py (6411)
[0m
[1m
File "../../../../../.local/lib/python3.8/site-packages/numba/np/arrayobj.py", line 6411:[0m
[1m    def impl(a):
        <source elided>
        # a is an array(T, 0d, O), T is type, O is order
[1m        return a.take(0)
[0m        [1m^[0m[0m

[0m[1mDuring: lowering "$68call_method.11 = call $60load_method.7(histogram, pixel_value, $const66.10, func=$60load_method.7, args=[Var(histogram, 3549984451.py:4), Var(pixel_value, 3549984451.py:14), Var($const66.10, 3549984451.py:16)], kws=(), vararg=None, varkwarg=None, target=None)" at /tmp/ipykernel_235869/3549984451.py (16)[0m

In [3]:
from __future__ import division
from numba import cuda
import numpy
import math
import time

@cuda.jit
def my_kernel(io_array):
    pos = cuda.grid(1)
    if pos < io_array.size:
        io_array[pos] *= 4 # do the computation

# Host code   
data = numpy.ones(256)
threadsperblock = 256
blockspergrid = math.ceil(data.shape[0] / threadsperblock)
#TODO Remember to remove
start = time.time()
my_kernel[blockspergrid, threadsperblock](data)
data *= 4
print(time.time() - start)
print(data)




LinkerError: [222] Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION
ptxas application ptx input, line 9; fatal   : Unsupported .version 8.4; current version is '8.2'

In [2]:
from __future__ import division
from numba import cuda
import numpy as np
import math
import time

@cuda.jit
def my_kernel(io_array):
    pos = cuda.grid(1)
    if pos < io_array.size:
        io_array[pos] = 4 * np.cos(io_array[pos])  # Perform the computation

# Host code   
data = np.ones(512 * 512, dtype=np.float32)  # Use float32 for better GPU performance
threadsperblock = 256
blockspergrid = math.ceil(512)

# Allocate memory on device
device_data = cuda.to_device(data)

# Run kernel
my_kernel[blockspergrid, threadsperblock](device_data)


# Copy result back to host
start = time.time()
cuda.synchronize()
# Synchronize device to ensure computation is finished before stopping timer
print("GPU time:", time.time() - start)
data = device_data.copy_to_host()
print(data)

# Compare with CPU performance
start = time.time()
data = np.ones(512 * 512, dtype=np.float32)  # Use float32 for better GPU performance
data = 4 * np.cos(data)
print("CPU time:", time.time() - start)
print(data)

LinkerError: [222] Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION
ptxas application ptx input, line 9; fatal   : Unsupported .version 8.4; current version is '8.2'