<a href="https://colab.research.google.com/github/PuppyQ08/CUDA-in-notebook/blob/main/vecadd_pytorch_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hi, I am QQY. Here I used the pytorch load inline to customize the vecadd kernel and do profiling. Let's how it works!

In [3]:
!pip install ninja

Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0


In [11]:
import torch
from torch.utils.cpp_extension import load_inline

# Define the CUDA kernel and C++ wrapper
cuda_source = '''
__global__ void square_matrix_kernel(const float* matrix, float* result, int width, int height) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width) {
        int idx = row * width + col;
        result[idx] = matrix[idx] * matrix[idx];
    }
}

torch::Tensor square_matrix(torch::Tensor matrix) {
    const auto height = matrix.size(0);
    const auto width = matrix.size(1);

    auto result = torch::empty_like(matrix);

    dim3 threads_per_block(16, 16);
    dim3 number_of_blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                          (height + threads_per_block.y - 1) / threads_per_block.y);

    square_matrix_kernel<<<number_of_blocks, threads_per_block>>>(
        matrix.data_ptr<float>(), result.data_ptr<float>(), width, height);

    return result;
    }
'''

cpp_source = "torch::Tensor square_matrix(torch::Tensor matrix);"

# Load the CUDA kernel as a PyTorch extension
square_matrix_ext = load_inline(
    name='square_matrix_ext',
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=['square_matrix'],
    with_cuda=True,
    extra_cuda_cflags=["-O2", "-gencode=arch=compute_75,code=sm_75"],
    build_directory='./',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device='cuda')

# Profiling the execution time
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

start_event.record()
result = square_matrix_ext.square_matrix(a)
end_event.record()
torch.cuda.synchronize() # Wait for the events to complete
elapsed_time_ms = start_event.elapsed_time(end_event)

print(result)
print(f"Execution time of square_matrix_ext.square_matrix: {elapsed_time_ms:.3f} ms")

tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')
Execution time of square_matrix_ext.square_matrix: 0.356 ms


In [12]:
import torch
from torch.utils.cpp_extension import load
import os

# Define the CUDA kernel source code
cuda_source_content = '''
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void square_matrix_kernel(const float* matrix, float* result, int width, int height) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width) {
        int idx = row * width + col;
        result[idx] = matrix[idx] * matrix[idx];
    }
}

// C++ wrapper for the CUDA kernel
torch::Tensor square_matrix_cuda(torch::Tensor matrix) {
    const auto height = matrix.size(0);
    const auto width = matrix.size(1);

    auto result = torch::empty_like(matrix);

    dim3 threads_per_block(16, 16);
    dim3 number_of_blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                          (height + threads_per_block.y - 1) / threads_per_block.y);

    square_matrix_kernel<<<number_of_blocks, threads_per_block>>>(
        matrix.data_ptr<float>(), result.data_ptr<float>(), width, height);

    return result;
}
'''

# Define the C++ source code with Pybind11 bindings
cpp_source_content = '''
#include <torch/extension.h>

torch::Tensor square_matrix_cuda(torch::Tensor matrix);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("square_matrix", &square_matrix_cuda, "Square matrix CUDA kernel");
}
'''

# Write the source codes to files
with open('square_matrix_cuda.cu', 'w') as f:
    f.write(cuda_source_content)

with open('square_matrix_cpp.cpp', 'w') as f:
    f.write(cpp_source_content)

# Load the CUDA kernel as a PyTorch extension
square_matrix_ext_load = load(
    name='square_matrix_ext_load',
    sources=['square_matrix_cpp.cpp', 'square_matrix_cuda.cu'],
    extra_cuda_cflags=["-O2", "-gencode=arch=compute_75,code=sm_75"],
    is_python_module=True,
    verbose=True
)

a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device='cuda')

# Profiling the execution time
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

start_event.record()
result_load = square_matrix_ext_load.square_matrix(a)
end_event.record()
torch.cuda.synchronize() # Wait for the events to complete
elapsed_time_ms_load = start_event.elapsed_time(end_event)

print("Result using load style:")
print(result_load)
print(f"Execution time of square_matrix_ext_load.square_matrix: {elapsed_time_ms_load:.3f} ms")

# Clean up the generated files (optional)
# os.remove('square_matrix_cuda.cu')
# os.remove('square_matrix_cpp.cpp')


Result using load style:
tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')
Execution time of square_matrix_ext_load.square_matrix: 0.604 ms


Jesus, it is super slow....It talke 1 min to compile. It lost the point for quick showing.