In [1]:
from common_lib import *

In [2]:
import sys
sys.path

['/home/bunspeader/.pyenv/versions/3.9.19/lib/python39.zip',
 '/home/bunspeader/.pyenv/versions/3.9.19/lib/python3.9',
 '/home/bunspeader/.pyenv/versions/3.9.19/lib/python3.9/lib-dynload',
 '',
 '/home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages',
 '/home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages/setuptools/_vendor',
 '/tmp/tmp0u62eh87']

In [5]:
# Request synchronous CUDA kernel launches so that a failing kernel is reported
# at the call that launched it rather than at some later CUDA call.
# NOTE(review): `os` is not imported in any visible cell — presumably it arrives
# via `from common_lib import *`; confirm. The variable presumably has to be set
# before the first CUDA call in this process to take effect — verify.
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [6]:
# ninja is the build tool the extension build further down reports using.
# wurlitzer is presumably here to surface C-level stdout (e.g. printf) inside
# the notebook — confirm. NOTE(review): neither package version is pinned.
%pip install -q wurlitzer ninja


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Vector Addition

In [11]:
def load_cuda(cuda_srcs, cpp_srcs, funcs, opt=False, verbose=False):
    """Build and load the inline extension named ``inline_ext`` via ``load_inline``.

    Args:
        cuda_srcs: forwarded unchanged as ``cuda_sources``.
        cpp_srcs: forwarded unchanged as ``cpp_sources``.
        funcs: forwarded unchanged as ``functions``.
        opt: when truthy, ``-O2`` is the single extra CUDA compiler flag;
            otherwise no extra CUDA flags are passed.
        verbose: forwarded unchanged as ``verbose``.

    Returns:
        Whatever ``load_inline`` returns.
    """
    if opt:
        nvcc_flags = ["-O2"]
    else:
        nvcc_flags = []
    return load_inline(
        name="inline_ext",
        cuda_sources=cuda_srcs,
        cpp_sources=cpp_srcs,
        functions=funcs,
        extra_cuda_cflags=nvcc_flags,
        verbose=verbose,
    )

In [8]:
# Shared C++ preamble that the CUDA source string below is concatenated onto.
# It pulls in the torch extension header, stdio and the c10 CUDA exception
# header, then defines:
#   CHECK_CUDA(x)       - TORCH_CHECK that x lives on a CUDA device
#   CHECK_CONTIGUOUS(x) - TORCH_CHECK that x is contiguous
#   CHECK_INPUT(x)      - both of the above; it expands to two statements, so
#                         it must not be the body of an unbraced `if`
#   cdiv(a, b)          - ceiling division on unsigned ints
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

In [98]:
# CUDA source for the extension: two element-wise unsigned-char kernels plus the
# host wrappers that validate their arguments and launch them with 256 threads
# per block. The wrappers reject sizes that would make a kernel index past the
# end of a tensor and return early when there is nothing to compute.
cuda_srcs = cuda_begin + r'''
// One thread per pixel. x is read as three consecutive planes of n bytes each
// (offsets i, i+n, i+2*n); out receives n bytes. The weights are double
// literals, so the sum is evaluated in double and narrowed to unsigned char
// when it is stored.
__global__ void rgb_to_grayscale_kernel(const unsigned char* x, unsigned char* out, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i<n) out[i] = 0.2989*x[i] + 0.5870*x[i+n] + 0.1140*x[i+2*n];
}

// One thread per element: C[i] = A[i] + B[i] for every i < n. The sum is
// narrowed back to unsigned char when it is stored.
__global__ void vecAddKernel(const unsigned char* A, const unsigned char* B, unsigned char* C, int n) {
  int i = threadIdx.x + blockDim.x * blockIdx.x;
  if (i < n) {
    C[i] = A[i] + B[i];
  }
}

// Converts a contiguous CUDA tensor laid out as (C>=3, H, W) into an (H, W)
// tensor with the same options, using the first three planes.
// Raises (via TORCH_CHECK) if the input is not CUDA, not contiguous, not 3-D,
// or has fewer than three planes.
torch::Tensor rgb_to_grayscale(torch::Tensor input) {
    CHECK_INPUT(input);
    // The kernel reads planes 0, 1 and 2; fewer planes would read out of bounds.
    TORCH_CHECK(input.dim() == 3 && input.size(0) >= 3,
                "input must have shape (C>=3, H, W), got ", input.sizes());
    int h = input.size(1);
    int w = input.size(2);
    printf("h*w: %d*%d\n", h, w);
    auto output = torch::empty({h,w}, input.options());
    // Nothing to convert for an empty image, and a grid of zero blocks is not
    // a valid launch configuration.
    if (h == 0 || w == 0) return output;
    int threads = 256;
    rgb_to_grayscale_kernel<<<cdiv(w*h,threads), threads>>>(
        input.data_ptr<unsigned char>(), output.data_ptr<unsigned char>(), w*h);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Writes A[i] + B[i] into C[i] for the first n elements.
// A, B and C must be contiguous CUDA tensors on the same device, each holding
// at least n elements. Raises (via TORCH_CHECK) otherwise. n == 0 is a no-op.
void vec_add(torch::Tensor A, torch::Tensor B, torch::Tensor C, int n) {
  CHECK_INPUT(A);
  CHECK_INPUT(B);
  CHECK_INPUT(C);
  TORCH_CHECK(A.device() == B.device() && A.device() == C.device(),
              "A, B and C must be on the same device");
  // n comes straight from the caller: a negative value would turn into a huge
  // unsigned grid size, and a value larger than any tensor would make the
  // kernel access memory past its end.
  TORCH_CHECK(n >= 0, "n must be non-negative, got ", n);
  TORCH_CHECK(A.numel() >= n && B.numel() >= n && C.numel() >= n,
              "n (", n, ") exceeds the number of elements in A, B or C");
  // A grid of zero blocks is not a valid launch configuration.
  if (n == 0) return;
  int threads = 256;
  vecAddKernel<<<cdiv(n, threads), threads>>>(A.data_ptr<unsigned char>(), B.data_ptr<unsigned char>(), C.data_ptr<unsigned char>(), n);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}'''

In [99]:
# C++ forward declarations, one per host function defined in the CUDA source.
cpp_srcs = [
    "torch::Tensor rgb_to_grayscale(torch::Tensor input);",
    "void vec_add(torch::Tensor A, torch::Tensor B, torch::Tensor C, int n);",
]

In [100]:
# Build the extension (with the build log shown) and expose both host functions
# on the returned module.
module = load_cuda(
    cuda_srcs,
    cpp_srcs,
    funcs=["rgb_to_grayscale", "vec_add"],
    verbose=True,
)

Using /home/bunspeader/.cache/torch_extensions/py39_cu124 as PyTorch extensions root...
The input conditions for extension module inline_ext have changed. Bumping to version 6 and re-building as inline_ext_v6...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/bunspeader/.cache/torch_extensions/py39_cu124/inline_ext/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module inline_ext_v6...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v6 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages/torch/include -isystem /home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages/torch/include/TH -isystem /home/bunspeader/.pyenv/versions/3.9.19/envs/pt39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-12.8/include -isystem /home/bunspeader/.pyenv/versions/3.9.19/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /home/bunspeader/.cache/torch_extensions/py39_cu124/inline_ext/main.cpp -o main.o 
[2/3] /usr/local/cuda-12.8/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=

Loading extension module inline_ext_v6...


In [101]:
# List the loaded module's attributes to confirm both requested functions are bound.
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'rgb_to_grayscale',
 'vec_add']

In [102]:
# Benchmark inputs: two random uint8 vectors with values in [0, 100), so every
# element-wise sum is at most 198 and fits in uint8 without wrapping.
n = 100_000_000
a = torch.randint(0, 100, (n,), dtype=torch.uint8)
a_cuda = a.to("cuda")
b = torch.randint(0, 100, (n,), dtype=torch.uint8)
b_cuda = b.to("cuda")
# Allocate the output buffer directly on the GPU (same shape, dtype and device
# as a_cuda) instead of creating an uninitialised host tensor and copying its
# arbitrary contents across.
c = torch.empty_like(a_cuda)

In [121]:
%%time
# Baseline: 1000 element-wise additions of `a` and `b` (the tensors created
# before the copy to "cuda"); each result is discarded immediately.
for _ in range(1000):
    a + b

CPU times: user 1min 40s, sys: 32.8 s, total: 2min 13s
Wall time: 13.4 s


In [122]:
%%time
for _ in range(1000):
    module.vec_add(a_cuda, b_cuda, c, n)

CPU times: user 1.81 s, sys: 9.09 ms, total: 1.82 s
Wall time: 1.81 s
