In [None]:
import numba
import numba.cuda

In [None]:
from numba import cuda
import numpy as np
from PIL import Image
import time
import matplotlib.pyplot as plt


img_path = '../images/8k.jpg'
img = np.array(Image.open(img_path))
img_flat = img.flatten().astype(np.uint8)
src = cuda.to_device(img_flat)

@cuda.jit
def rgb2gray(src, dest):
    idx = cuda.grid(1)
    if idx < dest.size: 
        r = src[idx * 3]
        g = src[idx * 3 + 1]
        b = src[idx * 3 + 2]
        gray = (r + g + b) // 3
        dest[idx] = gray

times = []

for threads_per_block in range (1, 1025):
    dest = cuda.device_array(img_flat.size // 3, dtype=np.uint8)
    blocks_per_grid = (dest.size + (threads_per_block - 1)) // threads_per_block
    average_time = 0
    for _ in range(10):  # Warm-up
        start = time.time()
        rgb2gray[blocks_per_grid, threads_per_block](src, dest)
        end = time.time()
        elapsed = end - start
        average_time += elapsed
    average_time /= 10
    times.append(average_time)

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(range(1, 1025), times)
plt.title("CUDA rgb2gray Performance vs Threads per Block")
plt.xlabel("Threads per Block")
plt.ylabel("Execution Time (s)")
plt.grid()
plt.show()

In [64]:
print(max(times), min(times))
print(times.index(max(times))+1, times.index(min(times))+1)

0.8098772525787353 6.318092346191406e-05
178 476


In [None]:
from numba import cuda
import numpy as np
from PIL import Image
import time

img_path = '../images/8k.jpg'

img = np.array(Image.open(img_path))
h, w, c = img.shape
img_flat = img.reshape(-1, 3)

@cuda.jit
def grayscale_flat(src, dat):
    idx = cuda.grid(1)  # 1D index
    if idx < src.shape[0]:
        r = src[idx, 0]
        g = src[idx, 1]
        b = src[idx, 2]
        dat[idx] = (r + g + b) // 3


threadsperblock = 256
blockspergrid = (img_flat.shape[0] + (threadsperblock - 1)) // threadsperblock

start = time.time()
img_gpu = cuda.to_device(img_flat)
dat_gpu = cuda.device_array(img_flat.shape[0], dtype=np.uint8)
grayscale_flat[blockspergrid, threadsperblock](img_gpu, dat_gpu)
dat = dat_gpu.copy_to_host()
dat_2d = dat.reshape(h, w)  # back to 2D grayscale image
end = time.time() - start


img_out = Image.fromarray(dat_2d)
img_out.save('../images/8k_gray_flat.png')
print(f"Time taken (1D kernel): {end} seconds")

Time taken (1D kernel): 0.12628698348999023 seconds


In [None]:
from PIL import Image 

img_path = '../images/8k.jpg'
img = np.array(Image.open(img_path))

start = time.time()
width = img.shape[1]
height = img.shape[0]

red = img[:, :, 0]
green = img[:, :, 1]
blue = img[:, :, 2]

gray = (red.astype(np.int8) + green.astype(np.int8) + blue.astype(np.int8)) / 3
target = np.zeros((height, width), dtype=np.uint8)
target[:, :] = gray

img_out = Image.fromarray(target)
img_out.save('../images/8k_gray_cpu.png')
end = time.time() - start

print(f"Time taken: {end} seconds")


Time taken: 1.7936859130859375 seconds
