# In [None]:
import cv2
import numpy as np
import torch
import cupy as cp
import time
import matplotlib.pyplot as plt
from scipy.signal import convolve2d
from google.colab import files
# Ask the user for an input image via the Colab upload helper and use the
# first uploaded entry as the image path.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; an input image is required.")
image_path = next(iter(uploaded))
image = cv2.imread(image_path)
# cv2.imread reports an unreadable/undecodable file by returning None instead
# of raising, so fail here with a clear message rather than inside cvtColor.
if image is None:
    raise ValueError(f"Could not read {image_path!r} as an image.")
# Reorder the channels from BGR to RGB so matplotlib shows the right colours.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
height, width = image_rgb.shape[:2]
# Thread count handed to PyTorch; also the divisor of the CPU efficiency
# figure computed in the benchmark.
CPU_CORES = 2
torch.set_num_threads(CPU_CORES)
def gaussian_kernel(size=21, sigma=5.0):
    """Build a normalised 2-D Gaussian kernel.

    Args:
        size: Side length of the square kernel in pixels; must be >= 1.
        sigma: Standard deviation of the Gaussian in pixels; must be > 0.

    Returns:
        A ``(size, size)`` float64 array whose entries sum to 1.

    Raises:
        ValueError: If ``size`` is smaller than 1 or ``sigma`` is not positive.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")
    # Offsets of each tap from the kernel centre, always one pixel apart.
    # For odd sizes this is -(size // 2) .. size // 2; for even sizes the
    # centre falls between two taps (..., -0.5, 0.5, ...) instead of the taps
    # being stretched to a non-unit spacing.
    ax = np.arange(size, dtype=np.float64) - (size - 1) / 2.0
    xx, yy = np.meshgrid(ax, ax)
    kernel = np.exp(-(xx**2 + yy**2) / (2.0 * sigma**2))
    # Normalise so that blurring preserves overall brightness.
    return kernel / np.sum(kernel)
def gaussian_blur_serial(image, kernel):
    """Blur an image channel by channel with SciPy's ``convolve2d``.

    Args:
        image: Array indexed as ``[row, column, channel]``.
        kernel: 2-D convolution kernel.

    Returns:
        A ``uint8`` array with the same shape as ``image``; values are
        clipped to ``[0, 255]`` before the cast.
    """
    blurred = np.zeros_like(image, dtype=np.float32)
    # Iterate over however many channels the image has rather than assuming
    # exactly three.
    for c in range(image.shape[2]):
        # mode='same' keeps the output the size of the input; boundary='symm'
        # selects SciPy's symmetric boundary handling for the borders.
        blurred[..., c] = convolve2d(image[..., c], kernel, mode='same', boundary='symm')
    return np.clip(blurred, 0, 255).astype(np.uint8)
def gaussian_blur_pytorch_cpu(image, kernel, kernel_size):
    """Blur an image channel by channel with ``torch.nn.functional.conv2d``.

    Args:
        image: Array indexed as ``[row, column, channel]``.
        kernel: 2-D kernel applied to every channel.
        kernel_size: Side length of ``kernel``; ``kernel_size // 2`` pixels of
            reflect padding are added on every side before the convolution.

    Returns:
        A ``uint8`` array indexed as ``[row, column, channel]``; values are
        clipped to ``[0, 255]`` before the cast.

    NOTE(review): ``conv2d`` does not flip the kernel and ``'reflect'``
    padding is not the same boundary rule as SciPy's ``'symm'``, so border
    pixels may differ from ``gaussian_blur_serial`` -- confirm whether exact
    agreement matters for this benchmark.
    """
    functional = torch.nn.functional
    # (rows, cols, channels) -> (1, channels, rows, cols), the layout that
    # pad/conv2d expect.
    image_tensor = torch.tensor(image.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)
    kernel_tensor = torch.tensor(kernel, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    pad = kernel_size // 2
    if pad:
        # Pad all channels in one call instead of once per channel.
        image_tensor = functional.pad(image_tensor, (pad,) * 4, mode='reflect')
    blurred = []
    for c in range(image_tensor.shape[1]):
        out = functional.conv2d(image_tensor[:, c:c + 1], kernel_tensor)
        # Index away only the batch and channel axes; a bare squeeze() would
        # also drop a spatial axis of length 1.
        blurred.append(out[0, 0].numpy())
    blurred_img = np.stack(blurred, axis=2)
    return np.clip(blurred_img, 0, 255).astype(np.uint8)
# CUDA C source of the blur kernel. Each thread derives one (x, y) pixel
# position from its block/thread indices and returns early when that position
# lies outside the width x height image. For every channel it accumulates the
# kernel-weighted sum over a ksize x ksize neighbourhood; neighbour
# coordinates are clamped into the image, so border pixels are reused for
# out-of-range taps. `img` and `out` are indexed as
# (row * width + column) * channels + channel, and `kernel` as
# row * ksize + column.
raw_kernel_code = r'''
extern "C" __global__
void gaussian_blur(const float* img, float* out, const float* kernel,
                   int width, int height, int channels, int ksize) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int half_k = ksize / 2;
    if (x >= width || y >= height) return;
    for (int c = 0; c < channels; ++c) {
        float val = 0.0;
        for (int ky = -half_k; ky <= half_k; ++ky) {
            for (int kx = -half_k; kx <= half_k; ++kx) {
                int ix = min(max(x + kx, 0), width - 1);
                int iy = min(max(y + ky, 0), height - 1);
                int img_idx = (iy * width + ix) * channels + c;
                int k_idx = (ky + half_k) * ksize + (kx + half_k);
                val += img[img_idx] * kernel[k_idx];
            }
        }
        int out_idx = (y * width + x) * channels + c;
        out[out_idx] = val;
    }
}
'''
# Wrap the source in a CuPy RawModule and look up the kernel by name so it can
# be launched from Python.
module = cp.RawModule(code=raw_kernel_code)
raw_gaussian_blur = module.get_function("gaussian_blur")
def gaussian_blur_cupy_raw(image, kernel, kernel_size):
    """Blur an image on the GPU with the ``raw_gaussian_blur`` CUDA kernel.

    Args:
        image: Array indexed as ``[row, column, channel]``.
        kernel: 2-D kernel, passed to the device as float32.
        kernel_size: Side length of ``kernel``, passed as the kernel's
            ``ksize`` argument.

    Returns:
        A ``uint8`` NumPy array with the same shape as ``image``; values are
        clipped to ``[0, 255]`` before the cast.
    """
    height, width, channels = image.shape
    # The CUDA kernel indexes its buffers linearly, so make the device arrays
    # explicitly C-contiguous instead of relying on ravel() returning a view.
    img_cp = cp.ascontiguousarray(cp.asarray(image, dtype=cp.float32))
    out_cp = cp.zeros(img_cp.shape, dtype=cp.float32)
    kernel_cp = cp.ascontiguousarray(cp.asarray(kernel, dtype=cp.float32))
    # One thread per pixel in 16x16 blocks; round the grid up so partially
    # filled blocks still cover the right and bottom edges.
    block = (16, 16)
    grid = ((width + block[0] - 1) // block[0],
            (height + block[1] - 1) // block[1])
    raw_gaussian_blur(grid, block,
                      (img_cp, out_cp, kernel_cp,
                       np.int32(width), np.int32(height),
                       np.int32(channels), np.int32(kernel_size)))
    # Wait on the stream the kernel was launched on rather than on a
    # hard-coded device index.
    cp.cuda.get_current_stream().synchronize()
    result = cp.asnumpy(cp.clip(out_cp, 0, 255)).astype(np.uint8)
    # Drop the device buffers first: free_all_blocks() only releases blocks
    # that are no longer in use.
    del img_cp, out_cp, kernel_cp
    cp.get_default_memory_pool().free_all_blocks()
    return result
# Kernel side lengths to benchmark; sigma is set to half the side length.
kernel_sizes = [7, 13, 15, 17, 31]
results = []
# GPU figures: GPU_SMS is the divisor of the GPU efficiency metric, GPU_CORES
# is only reported in the summary.
GPU_SMS = 40
GPU_CORES = 2560
for size in kernel_sizes:
    sigma = size / 2.0
    kernel_np = gaussian_kernel(size, sigma)

    # Untimed warm-up runs so one-off start-up costs do not land in the
    # measurements below.
    _ = gaussian_blur_pytorch_cpu(image_rgb, kernel_np, size)
    _ = gaussian_blur_cupy_raw(image_rgb, kernel_np, size)

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() can have coarse resolution and can jump if the system clock
    # is adjusted.
    start = time.perf_counter()
    blur_serial = gaussian_blur_serial(image_rgb, kernel_np)
    time_serial = time.perf_counter() - start

    start = time.perf_counter()
    blur_pytorch = gaussian_blur_pytorch_cpu(image_rgb, kernel_np, size)
    time_pytorch = time.perf_counter() - start

    start = time.perf_counter()
    blur_gpu = gaussian_blur_cupy_raw(image_rgb, kernel_np, size)
    time_gpu = time.perf_counter() - start

    # Speed-ups are relative to the serial SciPy run; the efficiencies divide
    # the speed-up by the number of processing units.
    speedup_pytorch = time_serial / time_pytorch
    speedup_cupy = time_serial / time_gpu
    cpu_eff = speedup_pytorch / CPU_CORES
    gpu_eff_sm = speedup_cupy / GPU_SMS
    cpu_gpu_ratio = time_pytorch / time_gpu
    results.append({
        'kernel_size': f"{size}x{size}",
        'serial': time_serial,
        'pytorch': time_pytorch,
        'cupy': time_gpu,
        'speedup_pytorch': speedup_pytorch,
        'speedup_cupy': speedup_cupy,
        'cpu_eff': cpu_eff,
        'gpu_eff_sm': gpu_eff_sm,
        'cpu_gpu_ratio': cpu_gpu_ratio,
        'blurred_serial': blur_serial,
        'blurred_pytorch': blur_pytorch,
        'blurred_gpu': blur_gpu
    })
# Summary header: image size and the processing units used in the run.
print("=" * 90)
print("Gaussian Blur ")
print(f"Image Size: {width} x {height}")
print("=" * 90)
print("Processing Units Used")
print(f"CPU Cores: {CPU_CORES}")
print("GPU: NVIDIA Tesla T4")
# Report the same SM count that the GPU efficiency metric divides by, so the
# printed value cannot drift from the one used in the calculation.
print(f"GPU Streaming Multiprocessors (SMs): {GPU_SMS}")
print(f"Estimated CUDA Cores: {GPU_CORES}")
print("GPU Memory: 16.0 GB")
# Results table: one left-aligned, 15-character column per metric.
print("=" * 135)
print(f"{'Kernel Size':<15}{'Serial(s)':<15}{'PyTorch(s)':<15}{'CuPy(s)':<15}"
      f"{'CPU SpdUp':<15}{'GPU SpdUp':<15}{'CPU Eff':<15}{'GPU Eff(SM)':<15}{'CPU:GPU Ratio':<15}")
print("-" * 135)
for r in results:
    print(f"{r['kernel_size']:<15}{r['serial']:<15.6f}{r['pytorch']:<15.6f}"
          f"{r['cupy']:<15.6f}{r['speedup_pytorch']:<15.2f}"
          f"{r['speedup_cupy']:<15.2f}{r['cpu_eff']:<15.4f}"
          f"{r['gpu_eff_sm']:<15.4f}{r['cpu_gpu_ratio']:<15.4f}")
# Show the original next to the three blurred versions produced with the last
# (largest-index) kernel size in the benchmark.
plt.figure(figsize=(15, 5))
last_result = results[-1]
panels = [
    (image_rgb, "Original"),
    # The line break inside the title must be the "\n" escape; a literal
    # newline inside the quoted string is a syntax error.
    (last_result['blurred_serial'], f"Serial\n{last_result['kernel_size']}"),
    (last_result['blurred_pytorch'], "PyTorch CPU"),
    (last_result['blurred_gpu'], "CuPy GPU"),
]
for position, (panel_image, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 4, position)
    plt.imshow(panel_image)
    plt.title(panel_title)
    plt.axis('off')
plt.tight_layout()
plt.show()