<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_1d_naive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpowqfzzw1".


In [4]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

__global__ void convolve(
    int *d_vec, int *d_msk, int *d_res,
    int m_numElementsVec, int m_numElementsMsk
) {
    // Step 0: Get the thread index.
    int g_ti = blockIdx.x * blockDim.x + threadIdx.x;
    // if (g_ti >= m_numElementsVec) return;

    // Step 1: Compute the convolved result of the current grid.
    int sum = 0;
    int radius = m_numElementsMsk / 2;

    for (int mi = 0; mi < m_numElementsMsk; mi++) {
        int vi = g_ti - radius + mi;
        if (vi >= 0 && vi < m_numElementsVec) {
            sum += d_vec[vi] * d_msk[mi];
        }
    }

    // Step 2: Register the result back to the vector.
    d_res[g_ti] = sum;
}

void m_init(int *m_array, int m_size) {
    for (int i = 0; i < m_size; i++) {
        m_array[i] = rand() % 100;
    }
}

void verify(
    int *h_vec, int *h_msk, int *h_res,
    int m_numElementsVec, int m_numElementsMsk
) {
    int radius = m_numElementsMsk / 2;

    for (int ri = 0; ri < m_numElementsVec; ri++) {
        int sum = 0;
        for (int mi = 0; mi < m_numElementsMsk; mi++) {
            int vi = ri - radius + mi;
            if (vi >= 0 && vi < m_numElementsVec) {
                sum += h_vec[vi] * h_msk[mi];
            }
        }
        assert(sum == h_res[ri]);
    }
}

int main() {
    // Step 0: Set up parameters.
    int m_numElementsVec = 1 << 20;
    int m_numElementsMsk = 7;
    size_t m_numBytesVec = sizeof(int) * m_numElementsVec;
    size_t m_numBytesMsk = sizeof(int) * m_numElementsMsk;

    int d_blockDimX = 1 << 8;
    int d_gridDimX = (int) ceil(1.0 * m_numElementsVec / d_blockDimX);

    // Step 1: Init memories on both cpu and gpu.
    int *h_vec = new int[m_numElementsVec];
    int *h_msk = new int[m_numElementsMsk];
    int *h_res = new int[m_numElementsVec];
    m_init(h_vec, m_numElementsVec);
    m_init(h_msk, m_numElementsMsk);

    int *d_vec, *d_msk, *d_res;
    cudaMalloc(&d_vec, m_numBytesVec);
    cudaMalloc(&d_msk, m_numBytesMsk);
    cudaMalloc(&d_res, m_numBytesVec);

    // Step 2: Launch the kernel function to convolve the vector with the mask.
    cudaMemcpy(d_vec, h_vec, m_numElementsVec, cudaMemcpyHostToDevice);
    cudaMemcpy(d_msk, h_msk, m_numElementsMsk, cudaMemcpyHostToDevice);
    convolve<<<d_gridDimX, d_blockDimX>>>(
        d_vec, d_msk, d_res,
        m_numElementsVec, m_numElementsMsk
    );
    cudaMemcpy(h_res, d_res, m_numElementsVec, cudaMemcpyDeviceToHost);

    verify(h_vec, h_msk, h_res, m_numElementsVec, m_numElementsMsk);

    // Step 3: Clear memories.
    delete[] h_vec;
    delete[] h_msk;
    delete[] h_res;
    cudaFree(d_vec);
    cudaFree(d_msk);
    cudaFree(d_res);

    printf("Success!");
    return 0;
}

cuda_exec.out: /tmp/tmpowqfzzw1/43099496-155f-4d0e-af94-e1ffe3b00020/single_file.cu:52: void verify(int*, int*, int*, int, int): Assertion `sum == h_res[ri]' failed.

