<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_1d_cache.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%sh
# Print the installed CUDA compiler version, then install the nvcc4jupyter package with pip.
nvcc --version
python3 -m pip install nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [6]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


# Chapter 1: 1D Convolution Completely Using SRAM/Shared On-Chip Memory

In [15]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Length of the convolution mask; the code in this cell derives the radius as M / 2.
#define M 7

// Convolution mask stored in constant memory; the host fills it with cudaMemcpyToSymbol.
__constant__ int d_msk[M];

/*
 * 1D convolution where every input element a block needs (its tile plus the
 * right-hand halo) is first staged in dynamic shared memory.
 *
 * Output: d_res[g] = sum over k in [0, 2r] of d_vec[g + k] * d_msk[k], r = M / 2.
 *
 * Launch layout : 1D grid of 1D blocks providing at least n threads.
 * Shared memory : (blockDim.x + 2 * r) * sizeof(int) bytes of dynamic shared memory.
 * Preconditions : d_vec holds n + 2 * r ints (input already padded),
 *                 d_res holds n ints, M is odd (the loop uses 2 * r + 1 taps).
 */
__global__ void convolve_1d(int *d_vec, int *d_res, int n) {
    extern __shared__ int s_mem[];
    const int r = M / 2;
    const int padded_n = n + 2 * r;  // number of valid elements in d_vec

    const int l_tid = threadIdx.x;
    const int block_start = blockIdx.x * blockDim.x;
    const int g_tid = block_start + l_tid;
    const int tile_len = blockDim.x + 2 * r;

    // Cooperative, bounds-checked tile load. The stride loop covers the halo even
    // when blockDim.x < 2 * r, and threads whose g_tid is past n still take part,
    // because in-range neighbours read the slots those threads fill.
    for (int i = l_tid; i < tile_len; i += blockDim.x) {
        const int src = block_start + i;
        if (src < padded_n) s_mem[i] = d_vec[src];
    }

    // Every thread of the block must reach the barrier, so the range guard
    // comes only after it.
    __syncthreads();

    if (g_tid >= n) return;

    int tmp = 0;
    for (int k = 0; k <= 2 * r; k++) {
        tmp += s_mem[l_tid + k] * d_msk[k];
    }

    d_res[g_tid] = tmp;
}

/*
 * CPU reference check for the convolution.
 *
 * For every output index i in [0, n) the sum of h_vec[i + k] * h_msk[k] over
 * the 2 * (M / 2) + 1 mask taps is recomputed on the host and asserted to be
 * equal to h_res[i].
 *
 * h_vec : input vector; indices up to n - 1 + 2 * (M / 2) are read
 * h_msk : mask values
 * h_res : results to validate
 * n     : number of outputs to check
 */
void verify_result(int *h_vec, int *h_msk, int *h_res, int n) {
    const int taps = 2 * (M / 2) + 1;

    for (int i = 0; i < n; ++i) {
        // The window for output i starts at element i of the input.
        const int *window = h_vec + i;

        int tmp = 0;
        for (int k = 0; k < taps; ++k) {
            tmp += window[k] * h_msk[k];
        }

        assert(tmp == h_res[i]);
    }
}

/* Print a readable message and exit when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a random padded input and mask, runs convolve_1d on the GPU
 * and checks the result against the CPU reference in verify_result.
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main() {
    const int n = 1 << 20;
    const int r = M / 2;
    const int p = n + r * 2;  // input length including r extra elements per side

    // size_t keeps the byte counts from overflowing int for larger n.
    const size_t n_bytes = sizeof(int) * n;
    const size_t m_bytes = sizeof(int) * M;
    const size_t p_bytes = sizeof(int) * p;

    int *h_vec = (int*) malloc(p_bytes);
    int *h_msk = (int*) malloc(m_bytes);
    int *h_res = (int*) malloc(n_bytes);
    if (h_vec == NULL || h_msk == NULL || h_res == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_vec);
        free(h_msk);
        free(h_res);
        return EXIT_FAILURE;
    }

    // Every element of the padded input, halo included, gets a random value.
    for (int i = 0; i < p; i++) h_vec[i] = rand() % 100;
    for (int i = 0; i < M; i++) h_msk[i] = rand() % 10;

    int *d_vec = NULL, *d_res = NULL;
    check_cuda(cudaMalloc(&d_vec, p_bytes), "cudaMalloc d_vec");
    check_cuda(cudaMalloc(&d_res, n_bytes), "cudaMalloc d_res");

    check_cuda(cudaMemcpy(d_vec, h_vec, p_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy h_vec -> d_vec");
    check_cuda(cudaMemcpyToSymbol(d_msk, h_msk, m_bytes),
               "cudaMemcpyToSymbol d_msk");

    int num_threads = 1 << 8;
    dim3 blck_size(num_threads);
    dim3 grid_size((n + num_threads - 1) / num_threads);
    // One tile per block plus the 2 * r halo elements to its right.
    size_t smem_size = sizeof(int) * (num_threads + 2 * r);

    convolve_1d<<<grid_size, blck_size, smem_size>>>(d_vec, d_res, n);
    // A launch does not return a status itself; query it explicitly.
    check_cuda(cudaGetLastError(), "convolve_1d launch");
    // The blocking copy waits for the kernel and reports execution errors.
    check_cuda(cudaMemcpy(h_res, d_res, n_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy d_res -> h_res");
    verify_result(h_vec, h_msk, h_res, n);

    check_cuda(cudaFree(d_vec), "cudaFree d_vec");
    check_cuda(cudaFree(d_res), "cudaFree d_res");

    free(h_vec);
    free(h_msk);
    free(h_res);

    printf("Success!");
    return 0;
}

Success!


# Chapter 2: 1D Convolution Partially Using L1 Cache

In [26]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Length of the convolution mask; the code in this cell derives the radius as M / 2.
#define M 7

// Convolution mask stored in constant memory; the host fills it with cudaMemcpyToSymbol.
__constant__ int d_msk[M];

/*
 * 1D convolution that stages only the block's own tile in shared memory.
 * Window elements that fall past the end of the tile are read straight from
 * d_vec in global memory instead of being copied into a halo region.
 *
 * Output: d_res[g] = sum over k in [0, 2r] of d_vec[g + k] * d_msk[k], r = M / 2.
 *
 * Launch layout : 1D grid of 1D blocks providing at least n threads.
 * Shared memory : blockDim.x * sizeof(int) bytes of dynamic shared memory.
 * Preconditions : d_vec holds n + 2 * r ints (input already padded),
 *                 d_res holds n ints, M is odd (the loop uses 2 * r + 1 taps).
 */
__global__ void convolve_1d(int *d_vec, int *d_res, int n) {
    extern __shared__ int s_mem[];
    const int r = M / 2;

    const int l_tid = threadIdx.x;
    const int g_tid = threadIdx.x + blockIdx.x * blockDim.x;
    const int tile_len = blockDim.x;

    // Stage the tile. Threads past n still load while their element lies inside
    // the padded input, because in-range neighbours read those slots.
    if (g_tid < n + 2 * r) s_mem[l_tid] = d_vec[g_tid];

    // Every thread of the block must reach the barrier, so the range guard
    // comes only after it.
    __syncthreads();

    if (g_tid >= n) return;

    int tmp = 0;
    for (int k = 0; k <= 2 * r; k++) {
        const int s_idx = l_tid + k;
        if (s_idx < tile_len) tmp += s_mem[s_idx] * d_msk[k];  // inside the tile
        else tmp += d_vec[g_tid + k] * d_msk[k];               // beyond the tile: global read
    }

    d_res[g_tid] = tmp;
}

/*
 * Host-side validation of the GPU output.
 *
 * Recomputes, for each i in [0, n), the dot product of the mask with the
 * input window starting at h_vec[i] (2 * (M / 2) + 1 taps) and asserts that it
 * equals h_res[i].
 *
 * h_vec : input vector; indices up to n - 1 + 2 * (M / 2) are read
 * h_msk : mask values
 * h_res : results to validate
 * n     : number of outputs to check
 */
void verify_result(int *h_vec, int *h_msk, int *h_res, int n) {
    const int last_tap = 2 * (M / 2);

    int i = 0;
    while (i < n) {
        int tmp = 0;

        int tap = 0;
        while (tap <= last_tap) {
            tmp += h_msk[tap] * h_vec[i + tap];
            ++tap;
        }

        assert(tmp == h_res[i]);
        ++i;
    }
}

/* Print a readable message and exit when a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a zero-padded random input and a random mask, runs
 * convolve_1d on the GPU and checks the result against the CPU reference in
 * verify_result.
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main() {
    const int n = 1 << 20;
    const int r = M / 2;
    const int p = n + 2 * r;  // input length including r padding elements per side

    // size_t keeps the byte counts from overflowing int for larger n.
    const size_t n_bytes = sizeof(int) * n;
    const size_t m_bytes = sizeof(int) * M;
    const size_t p_bytes = sizeof(int) * p;

    int *h_vec = (int *) malloc(p_bytes);
    int *h_msk = (int *) malloc(m_bytes);
    int *h_res = (int *) malloc(n_bytes);
    if (h_vec == NULL || h_msk == NULL || h_res == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_vec);
        free(h_msk);
        free(h_res);
        return EXIT_FAILURE;
    }

    // The first and last r elements are zero padding; the rest is random data.
    for (int i = 0; i < p; i++) {
        if (i < r || i >= n + r) h_vec[i] = 0;
        else h_vec[i] = rand() % 100;
    }
    for (int i = 0; i < M; i++) {
        h_msk[i] = rand() % 10;
    }

    int *d_vec = NULL, *d_res = NULL;
    check_cuda(cudaMalloc(&d_vec, p_bytes), "cudaMalloc d_vec");
    check_cuda(cudaMalloc(&d_res, n_bytes), "cudaMalloc d_res");

    check_cuda(cudaMemcpy(d_vec, h_vec, p_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy h_vec -> d_vec");
    check_cuda(cudaMemcpyToSymbol(d_msk, h_msk, m_bytes),
               "cudaMemcpyToSymbol d_msk");

    int num_threads = 1 << 8;
    dim3 blck_size(num_threads);
    dim3 grid_size((n + num_threads - 1) / num_threads);
    // Only the block's own tile is staged in shared memory (no halo).
    size_t smem_size = sizeof(int) * num_threads;
    convolve_1d<<<grid_size, blck_size, smem_size>>>(d_vec, d_res, n);
    // A launch does not return a status itself; query it explicitly.
    check_cuda(cudaGetLastError(), "convolve_1d launch");

    // The blocking copy waits for the kernel and reports execution errors.
    check_cuda(cudaMemcpy(h_res, d_res, n_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy d_res -> h_res");
    verify_result(h_vec, h_msk, h_res, n);

    // Release the device buffers.
    check_cuda(cudaFree(d_vec), "cudaFree d_vec");
    check_cuda(cudaFree(d_res), "cudaFree d_res");

    free(h_vec);
    free(h_msk);
    free(h_res);

    printf("Succees!");
    return 0;
}

Succees!


# Profiler

In [27]:
# Mount google drive to use a persistent directory structure
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [28]:
# Google Drive sub-folder name, and the GitHub organisation and repository
# that the later cells substitute into paths and the clone URL.
FOLDER = "cuda"
ORG = "TechDailyNotes"
REPO = "study-notes-cuda"

In [28]:
# Create the working folder on the mounted Drive and switch into it.
%mkdir /content/gdrive/MyDrive/{FOLDER}
%cd /content/gdrive/MyDrive/{FOLDER}
# Write the token and username files read by the clone cell. The quoted values
# are presumably placeholders to replace with real credentials; whatever is
# written here is stored as plain text on Drive.
!echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
!echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt

In [34]:
%cd /content/gdrive/MyDrive/{FOLDER}

# Read the GitHub token from the first line of token.txt (whitespace stripped).
with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
# The GitHub username is kept in a separate file; it is read here but not used
# by any command in this cell.
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

# Clone over HTTPS with the token embedded in the URL.
# NOTE(review): git normally records the remote URL in the checkout's config,
# so the token is probably persisted there as well; confirm.
!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git

# Enter the checkout and update it; this also covers the case where the clone
# above fails because the directory already exists.
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!git pull

/content/gdrive/MyDrive/cuda
fatal: destination path 'study-notes-cuda' already exists and is not an empty directory.
/content/gdrive/MyDrive/cuda/study-notes-cuda
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 3 (delta 2), reused 3 (delta 2), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 286 bytes | 5.00 KiB/s, done.
From https://github.com/TechDailyNotes/study-notes-cuda
   ab942c6..bdaf760  main       -> origin/main
Updating ab942c6..bdaf760
Fast-forward
 cuda_convolution_1d_cache.cu | 2 [31m--[m
 1 file changed, 2 deletions(-)


In [35]:
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!ls

/content/gdrive/MyDrive/cuda/study-notes-cuda
basics					   cuda_convolution_1d_naive.ipynb
cuda_basics.ipynb			   cuda_convolution_1d_tiled
cuda_convolution_1d_cache.cu		   cuda_convolution_1d_tiled.cu
cuda_convolution_1d_cache.ipynb		   cuda_convolution_1d_tiled.ipynb
cuda_convolution_1d_constant_memory	   cuda_cublas.ipynb
cuda_convolution_1d_constant_memory.cu	   cuda.ipynb
cuda_convolution_1d_constant_memory.ipynb  cuda_parallel_reduction.ipynb
cuda_convolution_1d_naive		   README.md
cuda_convolution_1d_naive.cu


In [36]:
%%sh
# Compile each convolution variant into an executable named after its source file.
nvcc -o cuda_convolution_1d_naive cuda_convolution_1d_naive.cu
nvcc -o cuda_convolution_1d_constant_memory cuda_convolution_1d_constant_memory.cu
nvcc -o cuda_convolution_1d_tiled cuda_convolution_1d_tiled.cu
nvcc -o cuda_convolution_1d_cache cuda_convolution_1d_cache.cu

In [38]:
!nvprof ./cuda_convolution_1d_naive

==22181== NVPROF is profiling process 22181, command: ./cuda_convolution_1d_naive
All pass!
Success!==22181== Profiling application: ./cuda_convolution_1d_naive
==22181== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   51.45%  733.50us         2  366.75us     672ns  732.83us  [CUDA memcpy HtoD]
                   42.27%  602.56us         1  602.56us  602.56us  602.56us  [CUDA memcpy DtoH]
                    6.28%  89.536us         1  89.536us  89.536us  89.536us  convolve_1d(int*, int*, int*, int, int)
      API calls:   63.41%  94.589ms         3  31.530ms  69.734us  94.447ms  cudaMalloc
                   34.77%  51.859ms         1  51.859ms  51.859ms  51.859ms  cudaLaunchKernel
                    1.37%  2.0419ms         3  680.65us  88.021us  1.0644ms  cudaMemcpy
                    0.33%  494.05us         3  164.68us  116.57us  203.86us  cudaFree
                    0.10%  149.13us       114  1.3080us     149n

In [39]:
!nvprof ./cuda_convolution_1d_constant_memory

==22220== NVPROF is profiling process 22220, command: ./cuda_convolution_1d_constant_memory
Success!
==22220== Profiling application: ./cuda_convolution_1d_constant_memory
==22220== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   52.81%  784.73us         2  392.36us     672ns  784.06us  [CUDA memcpy HtoD]
                   43.95%  653.15us         1  653.15us  653.15us  653.15us  [CUDA memcpy DtoH]
                    3.24%  48.191us         1  48.191us  48.191us  48.191us  convolve_1d(int*, int*, int)
      API calls:   64.40%  119.32ms         2  59.660ms  97.677us  119.22ms  cudaMalloc
                   34.16%  63.297ms         1  63.297ms  63.297ms  63.297ms  cudaMemcpyToSymbol
                    1.10%  2.0318ms         2  1.0159ms  956.38us  1.0754ms  cudaMemcpy
                    0.21%  385.62us         2  192.81us  166.23us  219.39us  cudaFree
                    0.10%  188.37us       114  1.6520us     23

In [40]:
!nvprof ./cuda_convolution_1d_tiled

==22303== NVPROF is profiling process 22303, command: ./cuda_convolution_1d_tiled
Succees!==22303== Profiling application: ./cuda_convolution_1d_tiled
==22303== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   51.68%  715.55us         2  357.77us     672ns  714.88us  [CUDA memcpy HtoD]
                   44.18%  611.71us         1  611.71us  611.71us  611.71us  [CUDA memcpy DtoH]
                    4.14%  57.280us         1  57.280us  57.280us  57.280us  convolve_1d(int*, int*, int)
      API calls:   69.38%  91.911ms         2  45.956ms  99.528us  91.812ms  cudaMalloc
                   28.73%  38.061ms         1  38.061ms  38.061ms  38.061ms  cudaMemcpyToSymbol
                    1.43%  1.8958ms         2  947.91us  901.41us  994.40us  cudaMemcpy
                    0.32%  426.43us         2  213.21us  141.54us  284.89us  cudaFree
                    0.10%  132.14us       114  1.1590us     137ns  52.480us  cuDevi

In [41]:
!nvprof ./cuda_convolution_1d_cache

==22354== NVPROF is profiling process 22354, command: ./cuda_convolution_1d_cache
Succees!==22354== Profiling application: ./cuda_convolution_1d_cache
==22354== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   67.42%  1.6561ms         1  1.6561ms  1.6561ms  1.6561ms  [CUDA memcpy DtoH]
                   29.69%  729.15us         2  364.57us     640ns  728.51us  [CUDA memcpy HtoD]
                    2.89%  70.975us         1  70.975us  70.975us  70.975us  convolve_1d(int*, int*, int)
      API calls:   95.04%  91.574ms         2  45.787ms  111.03us  91.463ms  cudaMalloc
                    4.21%  4.0591ms         2  2.0296ms  928.12us  3.1310ms  cudaMemcpy
                    0.53%  511.97us         1  511.97us  511.97us  511.97us  cudaMemcpyToSymbol
                    0.15%  146.20us       114  1.2820us     141ns  52.870us  cuDeviceGetAttribute
                    0.03%  29.642us         1  29.642us  29.642us  29.6