<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_1d_tiled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!python3 -m pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp85qaatc3".


In [2]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Number of elements in the convolution mask.
#define M 7

// Convolution mask stored in constant memory; main() fills it with
// cudaMemcpyToSymbol before launching the kernel.
__constant__ int d_msk[M];

// Tiled 1D convolution of a zero-padded input with the mask in d_msk.
//
// d_vec : padded input of n + 2 * (M / 2) elements (as allocated by main()).
// d_res : output of n elements; d_res[i] = sum_k d_vec[i + k] * d_msk[k].
// n     : number of output elements.
//
// Launch requirements:
//   - dynamic shared memory of (blockDim.x + 2 * (M / 2)) * sizeof(int) bytes;
//   - blockDim.x >= 2 * (M / 2), because the first 2 * r threads of each block
//     also load the halo that follows the block's main tile.
__global__ void convolve_1d(int *d_vec, int *d_res, int n) {
    extern __shared__ int s_vec[];
    int g_tid = threadIdx.x + blockIdx.x * blockDim.x;
    int l_tid = threadIdx.x;

    int r = M / 2;
    int p = n + 2 * r;  // Length of the padded input.

    // d_vec is already padded, so output g_tid reads d_vec[g_tid .. g_tid + 2r].
    // The tile therefore starts at the block's first global index (no "- r"
    // shift, which would read before the start of the buffer).
    s_vec[l_tid] = (g_tid < p) ? d_vec[g_tid] : 0;

    // The first 2r threads additionally fetch the halo past the main tile.
    if (l_tid < 2 * r) {
        int halo = g_tid + blockDim.x;
        s_vec[l_tid + blockDim.x] = (halo < p) ? d_vec[halo] : 0;
    }

    // Every thread of the block must reach the barrier, so the bounds check
    // on the output index happens only after it.
    __syncthreads();

    if (g_tid >= n) return;

    // Accumulate from the shared tile instead of re-reading global memory.
    int tmp = 0;
    for (int i = 0; i <= 2 * r; i++) {
        tmp += s_vec[l_tid + i] * d_msk[i];
    }
    d_res[g_tid] = tmp;
}

// Host-side reference check: recomputes every output element from the padded
// input and the mask, and asserts that it equals the value in h_res.
//
// h_vec : padded input (at least n + 2 * (M / 2) elements are read).
// h_msk : mask coefficients.
// h_res : results to validate, n elements.
// n     : number of output elements.
void verify_result(int *h_vec, int *h_msk, int *h_res, int n) {
    const int radius = M / 2;
    const int taps = 2 * radius + 1;

    for (int out = 0; out < n; ++out) {
        // The window for output `out` starts at the same index in the padded input.
        const int *window = h_vec + out;

        int expected = 0;
        for (int k = 0; k < taps; ++k) {
            expected += window[k] * h_msk[k];
        }

        assert(expected == h_res[out]);
    }
}

int main() {
    int n = 1 << 20;
    int r = M / 2;
    int p = n + r * 2;

    int n_bytes = sizeof(int) * n;
    int m_bytes = sizeof(int) * M;
    int p_bytes = sizeof(int) * p;

    int *h_vec = (int*) malloc(p_bytes);
    int *h_msk = (int*) malloc(m_bytes);
    int *h_res = (int*) malloc(n_bytes);

    int *d_vec, *d_res;
    cudaMalloc(&d_vec, p_bytes);
    cudaMalloc(&d_res, n_bytes);

    for (int i = 0; i < p; i++) {
        if (i < r || i >= n + r) {
            h_vec[i] = 0;
        } else {
            h_vec[i] = rand() % 100;
        }
    }
    for (int i = 0; i < M; i++) {
        h_msk[i] = rand() % 10;
    }
    for (int i = 0; i < n; i++) {
        h_res[i] = 0;
    }

    cudaMemcpy(d_vec, h_vec, p_bytes, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_msk, h_msk, m_bytes);

    int num_threads = 1 << 8;
    dim3 blck_size(num_threads);
    dim3 grid_size((n + num_threads - 1) / num_threads);
    size_t smem_size = (num_threads + r * 2) * sizeof(int);

    convolve_1d<<<grid_size, blck_size, smem_size>>>(d_vec, d_res, n);
    cudaMemcpy(h_res, d_res, n_bytes, cudaMemcpyDeviceToHost);

    verify_result(h_vec, h_msk, h_res, n);

    cudaFree(d_vec);
    cudaFree(d_res);

    free(h_vec);
    free(h_msk);
    free(h_res);

    printf("Succees!");
    return 0;
}

Succees!


# Profiler

In [3]:
# Mount Google Drive at /content/gdrive so files written below persist
# in a directory structure that outlives this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# Configuration interpolated into the shell/magic commands in later cells.
FOLDER = "cuda"  # Working directory under /content/gdrive/MyDrive
ORG = "TechDailyNotes"  # GitHub organization that owns the repository
REPO = "study-notes-cuda"  # Repository name

In [20]:
%mkdir /content/gdrive/MyDrive/{FOLDER}
%cd /content/gdrive/MyDrive/{FOLDER}
!echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
!echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt

mkdir: cannot create directory ‘/content/gdrive/MyDrive/cuda’: File exists
/content/gdrive/MyDrive/cuda


In [17]:
%cd /content/gdrive/MyDrive/{FOLDER}

with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
# Use another file to store your github username
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git

%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!git pull

/content/gdrive/MyDrive/cuda
fatal: destination path 'study-notes-cuda' already exists and is not an empty directory.
/content/gdrive/MyDrive/cuda/study-notes-cuda
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 977 bytes | 20.00 KiB/s, done.
From https://github.com/TechDailyNotes/study-notes-cuda
   bd81ae0..3c99fc9  main       -> origin/main
Updating bd81ae0..3c99fc9
Fast-forward
 cuda_convolution_1d_naive.cu | 134 [32m+++++++++++++++++++++++++++++[m[31m----------------------------------[m
 1 file changed, 62 insertions(+), 72 deletions(-)


In [18]:
# Enter the repository checkout and list its contents to confirm the
# .cu sources compiled in the next cell are present.
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!ls

/content/gdrive/MyDrive/cuda/study-notes-cuda
basics					   cuda_convolution_1d_tiled
cuda_basics.ipynb			   cuda_convolution_1d_tiled.cu
cuda_convolution_1d_constant_memory	   cuda_convolution_1d_tiled.ipynb
cuda_convolution_1d_constant_memory.cu	   cuda_cublas.ipynb
cuda_convolution_1d_constant_memory.ipynb  cuda.ipynb
cuda_convolution_1d_naive		   cuda_parallel_reduction.ipynb
cuda_convolution_1d_naive.cu		   README.md
cuda_convolution_1d_naive.ipynb


In [19]:
# Compile the three convolution variants into standalone executables
# (run from the repository directory) so each one can be profiled below.
!nvcc -o cuda_convolution_1d_naive cuda_convolution_1d_naive.cu
!nvcc -o cuda_convolution_1d_constant_memory cuda_convolution_1d_constant_memory.cu
!nvcc -o cuda_convolution_1d_tiled cuda_convolution_1d_tiled.cu

In [20]:
# Profile the naive variant with nvprof.
!nvprof ./cuda_convolution_1d_naive

==3872== NVPROF is profiling process 3872, command: ./cuda_convolution_1d_naive
All pass!
Success!==3872== Profiling application: ./cuda_convolution_1d_naive
==3872== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   49.04%  729.33us         2  364.66us     672ns  728.65us  [CUDA memcpy HtoD]
                   44.95%  668.46us         1  668.46us  668.46us  668.46us  [CUDA memcpy DtoH]
                    6.01%  89.374us         1  89.374us  89.374us  89.374us  convolve_1d(int*, int*, int*, int, int)
      API calls:   64.52%  103.21ms         3  34.403ms  79.511us  103.03ms  cudaMalloc
                   33.78%  54.035ms         1  54.035ms  54.035ms  54.035ms  cudaLaunchKernel
                    1.29%  2.0662ms         3  688.73us  88.452us  1.0962ms  cudaMemcpy
                    0.30%  485.78us         3  161.93us  131.77us  201.56us  cudaFree
                    0.09%  141.31us       114  1.2390us     161ns  5

In [22]:
# Profile the constant-memory variant with nvprof.
!nvprof ./cuda_convolution_1d_constant_memory

==3926== NVPROF is profiling process 3926, command: ./cuda_convolution_1d_constant_memory
Success!
==3926== Profiling application: ./cuda_convolution_1d_constant_memory
==3926== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   52.86%  783.28us         2  391.64us     640ns  782.64us  [CUDA memcpy HtoD]
                   43.89%  650.48us         1  650.48us  650.48us  650.48us  [CUDA memcpy DtoH]
                    3.25%  48.159us         1  48.159us  48.159us  48.159us  convolve_1d(int*, int*, int)
      API calls:   97.15%  116.13ms         2  58.067ms  86.377us  116.05ms  cudaMalloc
                    1.72%  2.0611ms         2  1.0306ms  981.38us  1.0797ms  cudaMemcpy
                    0.47%  563.27us         1  563.27us  563.27us  563.27us  cudaMemcpyToSymbol
                    0.45%  541.11us         2  270.55us  253.52us  287.59us  cudaFree
                    0.15%  184.95us       114  1.6220us     157ns 

In [23]:
# Profile the tiled (shared-memory) variant with nvprof.
!nvprof ./cuda_convolution_1d_tiled

==3947== NVPROF is profiling process 3947, command: ./cuda_convolution_1d_tiled
Succees!==3947== Profiling application: ./cuda_convolution_1d_tiled
==3947== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   53.17%  766.92us         2  383.46us     672ns  766.25us  [CUDA memcpy HtoD]
                   42.86%  618.26us         1  618.26us  618.26us  618.26us  [CUDA memcpy DtoH]
                    3.97%  57.246us         1  57.246us  57.246us  57.246us  convolve_1d(int*, int*, int)
      API calls:   96.50%  86.547ms         2  43.273ms  109.58us  86.437ms  cudaMalloc
                    2.16%  1.9380ms         2  969.02us  955.15us  982.89us  cudaMemcpy
                    0.64%  577.89us         1  577.89us  577.89us  577.89us  cudaMemcpyToSymbol
                    0.47%  420.71us         2  210.35us  128.90us  291.81us  cudaFree
                    0.15%  137.93us       114  1.2090us     147ns  53.785us  cuDeviceGe