<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_1d_naive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpiyig94bm".


# Chapter 1: Naive 1-D Convolution


## Attempt 1

In [None]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

__global__ void convolve(
    int *d_vec, int *d_msk, int *d_res,
    int m_numElementsVec, int m_numElementsMsk
) {
    // Step 0: Get the thread index.
    int g_ti = blockIdx.x * blockDim.x + threadIdx.x;
    // if (g_ti >= m_numElementsVec) return;

    // Step 1: Compute the convolved result of the current grid.
    int sum = 0;
    int radius = m_numElementsMsk / 2;

    for (int mi = 0; mi < m_numElementsMsk; mi++) {
        int vi = g_ti - radius + mi;
        if (vi >= 0 && vi < m_numElementsVec) {
            sum += d_vec[vi] * d_msk[mi];
        }
    }

    // Step 2: Register the result back to the vector.
    d_res[g_ti] = sum;
}

void m_init(int *m_array, int m_size) {
    for (int i = 0; i < m_size; i++) {
        m_array[i] = rand() % 100;
    }
}

void verify(
    int *h_vec, int *h_msk, int *h_res,
    int m_numElementsVec, int m_numElementsMsk
) {
    int radius = m_numElementsMsk / 2;

    for (int ri = 0; ri < m_numElementsVec; ri++) {
        int sum = 0;
        for (int mi = 0; mi < m_numElementsMsk; mi++) {
            int vi = ri - radius + mi;
            if (vi >= 0 && vi < m_numElementsVec) {
                sum += h_vec[vi] * h_msk[mi];
            }
        }
        assert(sum == h_res[ri]);
    }
}

int main() {
    // Step 0: Set up parameters.
    int m_numElementsVec = 1 << 20;
    int m_numElementsMsk = 7;
    size_t m_numBytesVec = sizeof(int) * m_numElementsVec;
    size_t m_numBytesMsk = sizeof(int) * m_numElementsMsk;

    int d_blockDimX = 1 << 8;
    int d_gridDimX = (int) ceil(1.0 * m_numElementsVec / d_blockDimX);

    // Step 1: Init memories on both cpu and gpu.
    int *h_vec = new int[m_numElementsVec];
    int *h_msk = new int[m_numElementsMsk];
    int *h_res = new int[m_numElementsVec];
    m_init(h_vec, m_numElementsVec);
    m_init(h_msk, m_numElementsMsk);

    int *d_vec, *d_msk, *d_res;
    cudaMalloc(&d_vec, m_numBytesVec);
    cudaMalloc(&d_msk, m_numBytesMsk);
    cudaMalloc(&d_res, m_numBytesVec);

    // Step 2: Launch the kernel function to convolve the vector with the mask.
    cudaMemcpy(d_vec, h_vec, m_numElementsVec, cudaMemcpyHostToDevice);
    cudaMemcpy(d_msk, h_msk, m_numElementsMsk, cudaMemcpyHostToDevice);
    convolve<<<d_gridDimX, d_blockDimX>>>(
        d_vec, d_msk, d_res,
        m_numElementsVec, m_numElementsMsk
    );
    cudaMemcpy(h_res, d_res, m_numElementsVec, cudaMemcpyDeviceToHost);

    verify(h_vec, h_msk, h_res, m_numElementsVec, m_numElementsMsk);

    // Step 3: Clear memories.
    delete[] h_vec;
    delete[] h_msk;
    delete[] h_res;
    cudaFree(d_vec);
    cudaFree(d_msk);
    cudaFree(d_res);

    printf("Success!");
    return 0;
}

cuda_exec.out: /tmp/tmpdd2962sg/34018522-89d7-47b9-bbf7-641209b6eae9/single_file.cu:51: void verify(int*, int*, int*, int, int): Assertion `sum == h_res[ri]' failed.



## Attempt 2

In [None]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <iostream>

__global__ void convolve_1d(int *d_vec, int *d_msk, int *d_res, int n, int m) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= n) return;

    int tmp = 0;

    int lo = tid - m / 2;
    int hi = tid + m / 2;

    for (int i = lo; i <= hi; i++) {
        if (i >= 0 && i < n) {
            tmp += d_vec[i] * d_msk[i-lo];
        }
    }

    d_res[tid] = tmp;
}

void verify_result(int *h_vec, int *h_msk, int *h_res, int n, int m) {
    for (int i = 0; i < n; i++) {
        int tmp = 0;

        int lo = i - m / 2;
        int hi = i + m / 2;

        for (int j = lo; j <= hi; j++) {
            if (j >= 0 && j < n) {
                tmp += h_vec[j] * h_msk[j-lo];
            }
        }

        assert(tmp == h_res[i]);
    }
}

int main() {
    int n = 1 << 20;
    int m = 7;
    int bytes_n = sizeof(int) * n;
    int bytes_m = sizeof(int) * m;

    int *h_vec = new int[n];
    int *h_msk = new int[m];
    int *h_res = new int[n];

    for (int i = 0; i < n; i++) {
        h_vec[i] = rand() % 100;
        h_msk[i] = 0;
    }
    for (int i = 0; i < m; i++) {
        h_msk[i] = rand() % 10;
    }

    int *d_vec, *d_msk, *d_res;
    cudaMalloc(&d_vec, bytes_n);
    cudaMalloc(&d_msk, bytes_m);
    cudaMalloc(&d_res, bytes_n);

    cudaMemcpy(d_vec, h_vec, bytes_n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_msk, h_msk, bytes_m, cudaMemcpyHostToDevice);

    int size_blck = 256;
    int size_grid = (n + size_blck - 1) / size_blck;

    // convolve_1d<<<size_grid, size_blck>>>(d_vec, d_msk, d_res, n, m);

    cudaMemcpy(h_res, d_res, bytes_n, cudaMemcpyDeviceToHost);

    // verify_result(h_vec, h_msk, h_res, n, m);

    cudaFree(d_vec);
    cudaFree(d_msk);
    cudaFree(d_res);

    delete h_vec;
    delete h_msk;
    delete h_res;

    printf("Success!");
    std::cout << "Success!" << std::endl;
    return 0;
}




## Attempt 3

In [2]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

__global__ void convolve_1d(int *d_vec, int *d_msk, int *d_res, int n, int m) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= n) return;

    int tmp = 0;
    int lo = tid - m / 2;
    int hi = tid + m / 2;

    for (int i = lo; i <= hi; i++) {
        if (i < 0 || i >= n) continue;
        tmp += d_vec[i] * d_msk[i-lo];
    }

    d_res[tid] = tmp;
}

void verify_result(int *h_vec, int *h_msk, int *h_res, int n, int m) {
    for (int i = 0; i < n; i++) {
        int tmp = 0;
        int lo = i - m / 2;
        int hi = i + m / 2;

        for (int j = lo; j <= hi; j++) {
            if (j < 0 || j >= n) continue;
            tmp += h_vec[j] * h_msk[j-lo];
        }

        if (tmp != h_res[i]) {
            // printf("Incorrect: (tmp) %d != %d\n", tmp, h_res[i]);
        }
        assert(tmp == h_res[i]);
    }

    printf("All pass!\n");
}

int main() {
    int n = 1 << 20;
    int m = 7;
    int n_bytes = sizeof(int) * n;
    int m_bytes = sizeof(int) * m;

    int *h_vec = (int*) malloc(n_bytes);
    int *h_msk = (int*) malloc(m_bytes);
    int *h_res = (int*) malloc(n_bytes);

    int *d_vec, *d_msk, *d_res;
    cudaMalloc(&d_vec, n_bytes);
    cudaMalloc(&d_msk, m_bytes);
    cudaMalloc(&d_res, n_bytes);

    for (int i = 0; i < n; i++) {
        h_vec[i] = rand() % 100;
        h_res[i] = 0;
    }
    for (int i = 0; i < m; i++) {
        h_msk[i] = rand() % 10;
    }

    cudaMemcpy(d_vec, h_vec, n_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_msk, h_msk, m_bytes, cudaMemcpyHostToDevice);

    int num_threads = 1 << 8;
    dim3 blck_size(num_threads);
    dim3 grid_size((n + num_threads - 1) / num_threads);

    convolve_1d<<<grid_size, blck_size>>>(d_vec, d_msk, d_res, n, m);
    cudaMemcpy(h_res, d_res, n_bytes, cudaMemcpyDeviceToHost);
    verify_result(h_vec, h_msk, h_res, n, m);

    cudaFree(d_vec);
    cudaFree(d_msk);
    cudaFree(d_res);

    free(h_vec);
    free(h_msk);
    free(h_res);

    printf("Success!");
    return 0;
}

All pass!
Success!


# Chapter 2: 1D-Convolution with Constant Memory

## Attempt 1