<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_1d_tiled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!python3 -m pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp85qaatc3".


In [2]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Length of the convolution mask. The kernel and the host check both walk
// 2 * (M / 2) + 1 taps, which equals M only when M is odd.
#define M 7

// Convolution mask held in constant memory; main() fills it from the host
// with cudaMemcpyToSymbol before launching the kernel.
__constant__ int d_msk[M];

/**
 * Tiled 1D convolution of a zero-padded vector with the mask in d_msk.
 *
 * Each block first stages the slice of d_vec it needs (blockDim.x elements
 * plus a 2 * (M / 2) element halo) into shared memory, then every thread
 * computes one output element from that shared tile.
 *
 * @param d_vec  Padded input; must hold n + 2 * (M / 2) ints, so that output
 *               element i reads d_vec[i] .. d_vec[i + 2 * (M / 2)].
 * @param d_res  Output buffer of n ints.
 * @param n      Number of output elements.
 *
 * Launch requirement: dynamic shared memory of at least
 * (blockDim.x + 2 * (M / 2)) * sizeof(int) bytes.
 * The tap count 2 * (M / 2) + 1 matches M only for odd M.
 */
__global__ void convolve_1d(int *d_vec, int *d_res, int n) {
    extern __shared__ int s_vec[];

    const int r = M / 2;
    const int padded_len = n + 2 * r;
    const int tile_len = blockDim.x + 2 * r;
    const int tile_start = blockIdx.x * blockDim.x;
    const int l_tid = threadIdx.x;
    const int g_tid = tile_start + l_tid;

    // Cooperative load of the tile and its halo. Striding by blockDim.x
    // covers the halo even when blockDim.x < 2 * r, and the bounds check
    // keeps a partial last block from reading past the padded input.
    for (int i = l_tid; i < tile_len; i += blockDim.x) {
        const int src = tile_start + i;
        s_vec[i] = (src < padded_len) ? d_vec[src] : 0;
    }

    // Every thread of the block must reach the barrier, so the out-of-range
    // exit happens only after it.
    __syncthreads();

    if (g_tid >= n) return;

    // Accumulate from the shared tile rather than from global memory.
    int tmp = 0;
    for (int j = 0; j <= 2 * r; j++) {
        tmp += s_vec[l_tid + j] * d_msk[j];
    }
    d_res[g_tid] = tmp;
}

// Host-side reference check: recomputes each of the n outputs from the
// padded input h_vec and the mask h_msk, and asserts that it equals the
// corresponding entry of h_res. Reads h_vec[i] .. h_vec[i + 2 * (M / 2)].
void verify_result(int *h_vec, int *h_msk, int *h_res, int n) {
    const int span = 2 * (M / 2);

    int idx = 0;
    while (idx < n) {
        int expected = 0;
        for (int k = 0; k <= span; k++) {
            expected += h_vec[idx + k] * h_msk[k];
        }

        assert(expected == h_res[idx]);
        idx++;
    }
}

// Prints a diagnostic and exits when a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds a zero-padded random vector and a random mask, runs convolve_1d on
// the device, and checks the result against the host reference.
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main() {
    int n = 1 << 20;
    int r = M / 2;
    int p = n + r * 2;  // padded length: r zeros on each side of the data

    size_t n_bytes = sizeof(int) * n;
    size_t m_bytes = sizeof(int) * M;
    size_t p_bytes = sizeof(int) * p;

    int *h_vec = (int*) malloc(p_bytes);
    int *h_msk = (int*) malloc(m_bytes);
    int *h_res = (int*) malloc(n_bytes);
    if (h_vec == NULL || h_msk == NULL || h_res == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_vec);
        free(h_msk);
        free(h_res);
        return EXIT_FAILURE;
    }

    int *d_vec, *d_res;
    check_cuda(cudaMalloc(&d_vec, p_bytes), "cudaMalloc d_vec");
    check_cuda(cudaMalloc(&d_res, n_bytes), "cudaMalloc d_res");

    // Zero the r-element borders; fill the interior with random data.
    for (int i = 0; i < p; i++) {
        if (i < r || i >= n + r) {
            h_vec[i] = 0;
        } else {
            h_vec[i] = rand() % 100;
        }
    }
    for (int i = 0; i < M; i++) {
        h_msk[i] = rand() % 10;
    }
    for (int i = 0; i < n; i++) {
        h_res[i] = 0;
    }

    check_cuda(cudaMemcpy(d_vec, h_vec, p_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy h_vec -> d_vec");
    check_cuda(cudaMemcpyToSymbol(d_msk, h_msk, m_bytes),
               "cudaMemcpyToSymbol d_msk");

    int num_threads = 1 << 8;
    dim3 blck_size(num_threads);
    dim3 grid_size((n + num_threads - 1) / num_threads);
    // One tile per block: num_threads elements plus a 2 * r element halo.
    size_t smem_size = (num_threads + r * 2) * sizeof(int);

    convolve_1d<<<grid_size, blck_size, smem_size>>>(d_vec, d_res, n);
    // A kernel launch returns no status itself; query it explicitly.
    check_cuda(cudaGetLastError(), "convolve_1d launch");
    // This blocking copy also surfaces errors raised while the kernel ran.
    check_cuda(cudaMemcpy(h_res, d_res, n_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy d_res -> h_res");

    verify_result(h_vec, h_msk, h_res, n);

    check_cuda(cudaFree(d_vec), "cudaFree d_vec");
    check_cuda(cudaFree(d_res), "cudaFree d_res");

    free(h_vec);
    free(h_msk);
    free(h_res);

    printf("Succees!");
    return 0;
}

Succees!


# Profiler

In [3]:
# Mount Google Drive at /content/gdrive to use a persistent directory
# structure; later cells read and write under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# Subfolder of MyDrive used as the working directory for this notebook.
FOLDER = "cuda"
# GitHub organization and repository that the clone cell checks out.
ORG = "TechDailyNotes"
REPO = "study-notes-cuda"

In [20]:
%mkdir /content/gdrive/MyDrive/{FOLDER}
%cd /content/gdrive/MyDrive/{FOLDER}
!echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
!echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt

mkdir: cannot create directory ‘/content/gdrive/MyDrive/cuda’: File exists
/content/gdrive/MyDrive/cuda


In [22]:
with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
# Use another file to store your github username
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git
!git pull

Cloning into 'study-notes-cuda'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 72 (delta 36), reused 16 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (72/72), 2.43 MiB | 3.33 MiB/s, done.
Resolving deltas: 100% (36/36), done.
fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [7]:
# Change into the repository folder on Drive and list its contents.
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!ls

/content/gdrive/MyDrive/cuda/study-notes-cuda
basics					   cuda_convolution_1d_tiled.ipynb
cuda_basics.ipynb			   cuda_cublas.ipynb
cuda_convolution_1d_constant_memory.ipynb  cuda.ipynb
cuda_convolution_1d_naive.ipynb		   cuda_parallel_reduction.ipynb
cuda_convolution_1d_tiled		   README.md
cuda_convolution_1d_tiled.cu


In [9]:
# Compile the standalone .cu source into an executable named
# cuda_convolution_1d_tiled in the current directory.
!nvcc -o cuda_convolution_1d_tiled cuda_convolution_1d_tiled.cu

In [10]:
# Run the compiled binary under nvprof to report GPU activity and CUDA API
# call timings.
!nvprof ./cuda_convolution_1d_tiled

==2002== NVPROF is profiling process 2002, command: ./cuda_convolution_1d_tiled
==2002== Profiling application: ./cuda_convolution_1d_tiled
Succees!==2002== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   55.02%  785.13us         2  392.57us     640ns  784.49us  [CUDA memcpy HtoD]
                   40.96%  584.50us         1  584.50us  584.50us  584.50us  [CUDA memcpy DtoH]
                    4.02%  57.375us         1  57.375us  57.375us  57.375us  convolve_1d(int*, int*, int)
      API calls:   96.12%  101.79ms         2  50.897ms  134.00us  101.66ms  cudaMalloc
                    1.73%  1.8364ms         2  918.21us  905.01us  931.41us  cudaMemcpy
                    1.52%  1.6096ms         1  1.6096ms  1.6096ms  1.6096ms  cudaMemcpyToSymbol
                    0.41%  434.84us         2  217.42us  135.71us  299.13us  cudaFree
                    0.16%  166.73us       114  1.4620us     162ns  60.500us  cuDeviceGe