In [None]:
# Install the ninja build tool (presumably required by the
# torch.utils.cpp_extension build used in the next cell -- confirm).
!pip install ninja --quiet
# Print the status of the attached NVIDIA GPU, if any.
!nvidia-smi
# Mount Google Drive under /content/drive; the kernel source and images
# used below are read from /content/drive/MyDrive/Cuda_Learning.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
import torch
from torchvision.io import read_image, write_png
from torch.utils.cpp_extension import load_inline
import os


def compile_extension():
    """Compile the blur CUDA kernel into an inline PyTorch extension.

    Reads the CUDA source from Google Drive and builds it with
    ``torch.utils.cpp_extension.load_inline``, keeping build artifacts in
    ``./cuda_build`` (created if it does not exist yet).

    Returns:
        The loaded extension module, exposing ``blur_image``.

    Raises:
        FileNotFoundError: If the CUDA source file is missing (for example
            when Google Drive has not been mounted).
    """
    cuda_source = Path(
        "/content/drive/MyDrive/Cuda_Learning/kernels/blur.cu"
    ).read_text(encoding="utf-8")
    # C++ declaration of the function listed in ``functions`` below; its
    # definition is expected to live in the CUDA source.
    cpp_source = "torch::Tensor blur_image(torch::Tensor image);"

    build_directory = './cuda_build'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(build_directory, exist_ok=True)

    # Load the CUDA kernel as a PyTorch extension
    blur_extension = load_inline(
        name="blur",
        cpp_sources=cpp_source,
        cuda_sources=cuda_source,
        functions=["blur_image"],
        with_cuda=True,
        extra_cuda_cflags=["-O2"],
        verbose=True,
        build_directory=build_directory,
    )
    return blur_extension


def _main():
    """
    Compile the blur kernel, apply it to a test image and save the result.

    Uses the torch cpp inline extension built by ``compile_extension``.
    Reads the input PNG from Google Drive, moves it to the GPU with the
    channel dimension last, runs ``blur_image`` on it and writes the
    output out as a PNG.

    Raises:
        TypeError: If the loaded image is not of dtype ``torch.uint8``.
    """
    ext = compile_extension()

    # permute() only changes strides; .contiguous() makes the memory dense
    # in the permuted order before the upload, because .cuda() otherwise
    # keeps the permuted strides.
    # NOTE(review): presumably blur.cu indexes raw memory in this layout --
    # confirm against the kernel.
    x = (
        read_image("/content/drive/MyDrive/Cuda_Learning/imgs/test.png")
        .permute(1, 2, 0)
        .contiguous()
        .cuda()
    )
    print("mean:", x.float().mean())
    print("Input image:", x.shape, x.dtype)

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if x.dtype != torch.uint8:
        raise TypeError(f"expected a torch.uint8 image, got {x.dtype}")

    y = ext.blur_image(x)

    print("Output image:", y.shape, y.dtype)
    print("mean", y.float().mean())

    # Create the output folder first so the write cannot fail on a missing
    # directory after the kernel has already run.
    output_path = Path("/content/drive/MyDrive/Cuda_Learning/imgs_out/output.png")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    write_png(y.permute(2, 0, 1).cpu(), str(output_path))

# Run the whole pipeline as soon as this cell is executed.
_main()