In [1]:
pip install ninja



In [2]:
%%writefile blur.cu

#include<stdio.h>
#include<math.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAStream.h>
#include<torch/torch.h>



#define BLUR 1 //3x3 blur filter

__global__
void blurKernel(unsigned char* output, unsigned char* input, int width, int height)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int channel = threadIdx.z;

    int baseOffset = channel * height * width;
    if (col < width && row < height) {

        int pixVal = 0;
        int pixels = 0;

        for (int blurRow=-BLUR; blurRow <= BLUR; blurRow += 1) {
            for (int blurCol=-BLUR; blurCol <= BLUR; blurCol += 1) {
                int curRow = row + blurRow;
                int curCol = col + blurCol;
                if (curRow >= 0 && curRow < height && curCol >=0 && curCol < width) {
                    pixVal += input[baseOffset + curRow * width + curCol];
                    pixels += 1;
                }
            }
        }

        output[baseOffset + row * width + col] = (unsigned char)(pixVal / pixels);
    }
    return;
}

torch::Tensor blur(torch::Tensor image){
  const auto channels = image.size(0);
  const auto height = image.size(1);
  const auto width = image.size(2);
  //Create output tensor, set dtype as unsigned int 8 bits and set device as image's device
  auto result = torch::empty_like(image);
  dim3 threads_per_block(16, 16, channels);
  dim3 number_of_blocks(ceil(width/ 16.0),ceil(height/ 16.0));
  //launch the kernel, 0 is the shared memory size per block and getCurrentCUDAStream() is the stream to use for the kernel ensuring kernel executes in current stream
  blurKernel<<<number_of_blocks, threads_per_block, 0, at::cuda::getCurrentCUDAStream()>>>(
        result.data_ptr<unsigned char>(),
        image.data_ptr<unsigned char>(),
        width,
        height
    );
  //Macro for cuda error checks
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return result;
}

Overwriting blur.cu


In [3]:
!nvcc -o blur_extension.so blur.cu -I/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -I/usr/local/lib/python3.10/dist-packages/torch/include -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart -lc10 -ltorch -ltorch_cpu -ltorch_cuda -shared -std=c++11 -Xcompiler -fPIC -O2

In file included from [01m[K/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include/torch/torch.h:3[m[K,
                 from [01m[Kblur.cu:6[m[K:
[01m[K/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include/torch/all.h:4:2:[m[K [01;31m[Kerror: [m[K#error C++17 or later compatible compiler is required to use PyTorch.
    4 | #[01;31m[Kerror[m[K C++17 or later compatible compiler is required to use PyTorch.
      |  [01;31m[K^~~~~[m[K
In file included from [01m[K/usr/local/cuda/include/thrust/detail/config/config.h:27[m[K,
                 from [01m[K/usr/local/cuda/include/thrust/detail/config.h:23[m[K,
                 from [01m[K/usr/local/cuda/include/thrust/complex.h:24[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/complex.h:8[m[K,
                 from [01m[K/usr/local/lib/python3.10/dist-packages/torch/include/c10/util/Half.h:15[m[K,
          

In [4]:
from pathlib import Path
import torch
from torchvision.io import read_image, write_png
from torch.utils.cpp_extension import load_inline

def compile_extension():
    #this is the source for cuda kernel code(runs on gpu)
    cuda_source = Path("blur.cu").read_text()
    #this is the source for non cuda kernel code(runs on host) that is the wrapper function
    cpp_source = "torch::Tensor blur(torch::Tensor image);"

    # Load the CUDA kernel as a PyTorch extension
    blur_extension = load_inline(
        name="blur_extension",
        cpp_sources=cpp_source,
        cuda_sources=cuda_source,
        # this is the wrapper function calling the CUDA kernel
        functions=["blur"],
        with_cuda=True,
        extra_cuda_cflags=["-O2"],
        #build_directory='./cuda_build'
    )
    return blur_extension

In [9]:
def main():
    # Load the extension
    ext = compile_extension()
    x = read_image("test.jpg").contiguous().cuda()
    y = ext.blur(x)
    write_png(y.cpu(), "output.png")

if __name__ == "__main__":
    main()

torch.Size([3, 800, 1200])
