In [None]:
import cv2
import numpy as np
import torch
import cupy as cp
import time
import matplotlib.pyplot as plt
from scipy.signal import convolve2d
from google.colab import files
# Ask for an input image through the Colab upload widget. Iterating
# `uploaded` yields the uploaded file names; the first one is read back
# from disk by name below.
uploaded = files.upload()
if not uploaded:
    # Without this guard next(iter(...)) would raise a bare StopIteration.
    raise ValueError("No file was uploaded; an input image is required.")
image_path = next(iter(uploaded))
image = cv2.imread(image_path)
# cv2.imread reports an unreadable/unsupported file by returning None rather
# than raising, which would otherwise surface as an opaque cvtColor error.
if image is None:
    raise ValueError(f"Could not read {image_path!r} as an image.")
# All three Sobel implementations operate on the single-channel version.
image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
height, width = image_gray.shape
# Thread count handed to PyTorch; also echoed in the benchmark report.
CPU_CORES = 2
torch.set_num_threads(CPU_CORES)
def sobel_serial(image):
    """Return the Sobel gradient magnitude of ``image`` computed with SciPy.

    Args:
        image: 2-D array of pixel intensities (``convolve2d`` only accepts
            2-D inputs).

    Returns:
        ``np.uint8`` array with the same shape as ``image``; magnitudes are
        saturated to the range 0..255 before the cast.
    """
    sobel_x = np.array([[-1, 0, 1],
                        [-2, 0, 2],
                        [-1, 0, 1]])
    sobel_y = np.array([[1, 2, 1],
                        [0, 0, 0],
                        [-1, -2, -1]])
    # mode='same' keeps the input shape; boundary='symm' mirrors the image at
    # its edges so border pixels still see a full 3x3 neighbourhood.
    grad_x, grad_y = (
        convolve2d(image, kernel, mode='same', boundary='symm')
        for kernel in (sobel_x, sobel_y)
    )
    magnitude = np.hypot(grad_x, grad_y)
    return magnitude.clip(0, 255).astype(np.uint8)
def sobel_pytorch_cpu(image):
    """Return the Sobel gradient magnitude of ``image`` computed with PyTorch.

    The image is converted to a float32 tensor, reflect-padded by one pixel
    on every side, and filtered with the two 3x3 Sobel kernels via
    ``conv2d``.

    Args:
        image: array-like of pixel intensities; expected to be 2-D so that
            two added leading axes give the 4-D input ``conv2d`` requires.

    Returns:
        ``np.uint8`` NumPy array of magnitudes saturated to 0..255.
    """
    F = torch.nn.functional

    def as_filter(rows):
        # conv2d weights are laid out as (out_channels, in_channels, kH, kW).
        return torch.tensor(rows, dtype=torch.float32).reshape(1, 1, 3, 3)

    weight_x = as_filter([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
    weight_y = as_filter([[1, 2, 1], [0, 0, 0], [-1, -2, -1]])

    # Prepend batch and channel axes, then pad so the unpadded ("valid")
    # convolution below yields an output the same size as the input.
    batch = torch.tensor(image, dtype=torch.float32)[None, None]
    batch = F.pad(batch, (1, 1, 1, 1), mode='reflect')

    grad_x = F.conv2d(batch, weight_x)
    grad_y = F.conv2d(batch, weight_y)
    magnitude = torch.sqrt(grad_x.pow(2) + grad_y.pow(2))
    # squeeze() drops the size-1 batch/channel axes again.
    result = magnitude.squeeze().numpy()
    return np.clip(result, 0, 255).astype(np.uint8)
# CUDA C source of the Sobel kernel, handed to CuPy for compilation below.
# The qualifier must be spelled `__global__` (double underscores on both
# sides); `_global_` is not a CUDA keyword and the source fails to compile.
raw_sobel_code = r'''
extern "C" __global__
void sobel_edge(const float* img, float* out, int width, int height) {
    // One thread per pixel: (x, y) is the pixel this thread is responsible for.
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    // 3x3 Sobel coefficients for the horizontal and vertical gradients.
    const float Kx[3][3] = {{-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1}};
    const float Ky[3][3] = {{1, 2, 1}, {0, 0, 0}, {-1, -2, -1}};

    // Only interior pixels are processed, so every neighbour read stays in
    // bounds. Threads outside the image and pixels on the one-pixel border
    // write nothing; `out` keeps the caller's initial values there.
    if (x >= 1 && x < width - 1 && y >= 1 && y < height - 1) {
        float gx = 0.0f;
        float gy = 0.0f;
        for (int ky = -1; ky <= 1; ky++) {
            for (int kx = -1; kx <= 1; kx++) {
                // Buffers are indexed row-major: row * width + column.
                float pixel = img[(y + ky) * width + (x + kx)];
                gx += pixel * Kx[ky + 1][kx + 1];
                gy += pixel * Ky[ky + 1][kx + 1];
            }
        }
        float val = sqrtf(gx * gx + gy * gy);
        out[y * width + x] = val;
    }
}
'''
# Build the module from source and look up the kernel by its extern "C" name.
sobel_module = cp.RawModule(code=raw_sobel_code)
sobel_kernel = sobel_module.get_function("sobel_edge")

def sobel_cupy(image):
    """Run the ``sobel_edge`` CUDA kernel on ``image`` and return the result.

    Args:
        image: 2-D NumPy array of pixel intensities (unpacked as ``h, w``).

    Returns:
        ``np.uint8`` NumPy array of the same shape holding the gradient
        magnitude saturated to 0..255. The one-pixel border is 0 because the
        kernel only writes interior pixels into a zero-filled output buffer.
    """
    h, w = image.shape
    # The kernel addresses pixels as row-major ``y * width + x``; force a
    # C-contiguous host array so transposed or sliced views upload correctly.
    img_cp = cp.asarray(np.ascontiguousarray(image), dtype=cp.float32)
    out_cp = cp.zeros_like(img_cp)
    block = (16, 16)
    # Ceil-divide so partially filled tiles at the right/bottom are launched.
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    sobel_kernel(grid, block, (img_cp, out_cp, np.int32(w), np.int32(h)))
    # Wait for the launch to finish on the *current* device rather than a
    # hard-coded device 0, so the call also works when another GPU is active.
    cp.cuda.Device().synchronize()
    # Release cached, unused blocks through the public pool accessor instead
    # of the private ``cp._default_memory_pool`` attribute.
    cp.get_default_memory_pool().free_all_blocks()
    return cp.asnumpy(cp.clip(out_cp, 0, 255)).astype(np.uint8)
# Untimed warm-up calls, presumably so one-time initialisation (e.g. kernel
# compilation, thread-pool start-up) does not inflate the timed runs below.
_ = sobel_pytorch_cpu(image_gray)
_ = sobel_cupy(image_gray)

# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring short intervals; time.time() is a wall clock that can be coarse
# or jump when the system time is adjusted.
start = time.perf_counter()
edge_serial = sobel_serial(image_gray)
t_serial = time.perf_counter() - start

start = time.perf_counter()
edge_pytorch = sobel_pytorch_cpu(image_gray)
t_pytorch = time.perf_counter() - start

start = time.perf_counter()
edge_gpu = sobel_cupy(image_gray)
t_gpu = time.perf_counter() - start

# Speedups are relative to the SciPy implementation.
speedup_pytorch = t_serial / t_pytorch
speedup_cupy = t_serial / t_gpu

# Report
print("="*80)
print("Sobel Edge Detection Benchmark")
print(f"Image Size: {width} x {height}")
print(f"CPU Cores Used: {CPU_CORES}")
# NOTE(review): the GPU label is hard-coded, not queried from the device —
# update it when running on different hardware.
print("GPU: NVIDIA Tesla T4 (40 SMs)")
print("="*80)
print(f"{'Method':<20}{'Time (s)':<15}{'Speedup':<15}")
print("-"*50)
print(f"{'Serial CPU':<20}{t_serial:<15.6f}{'1.00':<15}")
print(f"{'PyTorch CPU':<20}{t_pytorch:<15.6f}{speedup_pytorch:<15.2f}")
print(f"{'CuPy GPU':<20}{t_gpu:<15.6f}{speedup_cupy:<15.2f}")
# Show the input next to each implementation's edge map in a single row.
panels = (
    (image_gray, "Original"),
    (edge_serial, "Serial CPU"),
    (edge_pytorch, "PyTorch CPU"),
    (edge_gpu, "CuPy GPU"),
)
plt.figure(figsize=(15, 4))
for slot, (picture, caption) in enumerate(panels, start=1):
    plt.subplot(1, 4, slot)
    plt.imshow(picture, cmap='gray')
    plt.title(caption)
    plt.axis('off')
plt.tight_layout()
plt.show()