<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_convolution_2d_tiled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%sh
# Show the CUDA toolkit version available in this runtime.
nvcc --version
# Install the notebook extension that is loaded in the next cell. The version is
# pinned so that a fresh runtime reproduces the same environment.
python3 -m pip install nvcc4jupyter==1.2.1

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
# Load the nvcc4jupyter extension installed in the previous cell.
# NOTE(review): the %%cuda cell magic used below is expected to come from this extension.
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmphaqfpnf8".


In [24]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Side length of the square convolution kernel (KERNEL_DIM x KERNEL_DIM coefficients).
#define KERNEL_DIM 7
// Integer half-width of the kernel; the input matrix carries this many extra
// rows/columns on every side (matrix_dim = result_dim + 2 * KERNEL_PAD).
#define KERNEL_PAD (KERNEL_DIM / 2)

// Convolution coefficients in constant memory, indexed row-major as
// [row * KERNEL_DIM + col]; filled from the host with cudaMemcpyToSymbol in main().
__constant__ int d_kernel[KERNEL_DIM * KERNEL_DIM];

/*
 * Tiled 2D convolution of a pre-padded square matrix.
 *
 * d_matrix   : input of size matrix_dim x matrix_dim, row-major, where
 *              matrix_dim = result_dim + 2 * KERNEL_PAD.
 * d_result   : output of size result_dim x result_dim, row-major.
 * result_dim : side length of the output.
 *
 * Output element (r, c) is the sum over the (2 * KERNEL_PAD + 1)^2 window of
 * d_matrix whose top-left corner is (r, c), weighted by d_kernel. The window
 * loops use 2 * KERNEL_PAD as their inclusive bound, so KERNEL_DIM is expected
 * to be odd (2 * KERNEL_PAD + 1 == KERNEL_DIM).
 *
 * Launch requirement: dynamic shared memory of at least
 *   sizeof(int) * (blockDim.y + 2 * KERNEL_PAD) * (blockDim.x + 2 * KERNEL_PAD)
 * bytes. For a square block this equals the size the original code expected.
 */
__global__ void convolve_2d(int *d_matrix, int *d_result, int result_dim) {
    extern __shared__ int s_memory[];

    // Tile = the block's output footprint plus the window overhang on each axis.
    const int tile_cols = blockDim.x + 2 * KERNEL_PAD;
    const int tile_rows = blockDim.y + 2 * KERNEL_PAD;
    const int matrix_dim = result_dim + 2 * KERNEL_PAD;

    const int l_tid_c = threadIdx.x;
    const int l_tid_r = threadIdx.y;
    // Top-left corner of this block's tile in d_matrix coordinates.
    const int block_c = blockIdx.x * blockDim.x;
    const int block_r = blockIdx.y * blockDim.y;
    const int g_tid_c = block_c + l_tid_c;
    const int g_tid_r = block_r + l_tid_r;

    // Cooperative tile load. Every thread of the block takes part, including
    // threads whose output coordinate is out of range, so that (a) the whole
    // tile is populated and (b) all threads reach the barrier below.
    for (int smem_r = l_tid_r; smem_r < tile_rows; smem_r += blockDim.y) {
        const int src_r = block_r + smem_r;
        for (int smem_c = l_tid_c; smem_c < tile_cols; smem_c += blockDim.x) {
            const int src_c = block_c + smem_c;
            // Partial blocks at the grid edge may address past the matrix;
            // store 0 there instead of reading out of bounds.
            int value = 0;
            if (src_r < matrix_dim && src_c < matrix_dim) {
                value = d_matrix[src_r * matrix_dim + src_c];
            }
            s_memory[smem_r * tile_cols + smem_c] = value;
        }
    }

    // The barrier must be executed by all threads of the block, so the
    // out-of-range guard comes only after it.
    __syncthreads();

    if (g_tid_r >= result_dim || g_tid_c >= result_dim) return;

    int tmp = 0;
    for (int kr = 0; kr <= 2 * KERNEL_PAD; kr++) {
        for (int kc = 0; kc <= 2 * KERNEL_PAD; kc++) {
            tmp += (
                s_memory[(l_tid_r + kr) * tile_cols + (l_tid_c + kc)] *
                d_kernel[kr * KERNEL_DIM + kc]
            );
        }
    }

    d_result[g_tid_r * result_dim + g_tid_c] = tmp;
}

/*
 * Host-side reference check: recompute every output element from the padded
 * input and the kernel coefficients, and assert that it equals the value the
 * GPU produced.
 *
 * h_matrix   : padded input, (result_dim + 2 * KERNEL_PAD)^2 ints, row-major.
 * h_kernel   : KERNEL_DIM x KERNEL_DIM coefficients, row-major.
 * h_result   : GPU output, result_dim x result_dim ints, row-major.
 * result_dim : side length of the output.
 */
void verify_result(int *h_matrix, int *h_kernel, int *h_result, int result_dim) {
    const int padded_dim = result_dim + 2 * KERNEL_PAD;
    const int last_tap = 2 * KERNEL_PAD;  // inclusive index of the final window row/column

    for (int rowi = 0; rowi < result_dim; ++rowi) {
        for (int coli = 0; coli < result_dim; ++coli) {
            int tmp = 0;

            // Walk the window whose top-left corner sits at (rowi, coli).
            for (int dr = 0; dr <= last_tap; ++dr) {
                const int *matrix_row = h_matrix + (rowi + dr) * padded_dim + coli;
                const int *kernel_row = h_kernel + dr * KERNEL_DIM;

                for (int dc = 0; dc <= last_tap; ++dc) {
                    tmp += matrix_row[dc] * kernel_row[dc];
                }
            }

            assert(tmp == h_result[rowi * result_dim + coli]);
        }
    }
}

// Abort with a diagnostic when a CUDA runtime call reports anything other than
// cudaSuccess. `what` names the operation for the error message.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver: builds a zero-padded random matrix and a random kernel, runs
 * convolve_2d on the GPU, and checks the result against the host reference.
 * Prints "Success!" and returns 0 when verification passes; exits with a
 * failure status if a host allocation or any CUDA call fails.
 */
int main() {
    int result_dim = 1 << 10;
    int matrix_dim = result_dim + KERNEL_PAD * 2;

    // size_t keeps the byte counts from overflowing int for larger dimensions.
    size_t result_bytes = sizeof(int) * result_dim * result_dim;
    size_t matrix_bytes = sizeof(int) * matrix_dim * matrix_dim;
    size_t kernel_bytes = sizeof(int) * KERNEL_DIM * KERNEL_DIM;

    int *h_matrix = (int *) malloc(matrix_bytes);
    int *h_kernel = (int *) malloc(kernel_bytes);
    int *h_result = (int *) malloc(result_bytes);
    if (h_matrix == NULL || h_kernel == NULL || h_result == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_matrix);
        free(h_kernel);
        free(h_result);
        return EXIT_FAILURE;
    }

    // Interior cells get random values; the KERNEL_PAD-wide border is zero.
    for (int rowi = 0; rowi < matrix_dim; rowi++) {
        for (int coli = 0; coli < matrix_dim; coli++) {
            if (
                rowi < KERNEL_PAD || rowi >= result_dim + KERNEL_PAD ||
                coli < KERNEL_PAD || coli >= result_dim + KERNEL_PAD
            ) {
                h_matrix[rowi * matrix_dim + coli] = 0;
            } else {
                h_matrix[rowi * matrix_dim + coli] = rand() % 100;
            }
        }
    }
    for (int rowi = 0; rowi < KERNEL_DIM; rowi++) {
        for (int coli = 0; coli < KERNEL_DIM; coli++) {
            h_kernel[rowi * KERNEL_DIM + coli] = rand() % 10;
        }
    }

    int *d_matrix, *d_result;
    check_cuda(cudaMalloc(&d_matrix, matrix_bytes), "cudaMalloc(d_matrix)");
    check_cuda(cudaMalloc(&d_result, result_bytes), "cudaMalloc(d_result)");

    check_cuda(
        cudaMemcpy(d_matrix, h_matrix, matrix_bytes, cudaMemcpyHostToDevice),
        "cudaMemcpy(host to device)"
    );
    check_cuda(cudaMemcpyToSymbol(d_kernel, h_kernel, kernel_bytes), "cudaMemcpyToSymbol(d_kernel)");

    // One thread per output element, 16 x 16 threads per block, grid rounded up.
    int threads_per_block_dim = 1 << 4;
    int blocks_per_grid_dim = (result_dim + threads_per_block_dim - 1) / threads_per_block_dim;
    dim3 block_size(threads_per_block_dim, threads_per_block_dim);
    dim3 grid_size(blocks_per_grid_dim, blocks_per_grid_dim);

    // Dynamic shared memory: one block-sized tile plus 2 * KERNEL_PAD extra
    // rows and columns for the window overhang.
    int smem_per_block_dim = threads_per_block_dim + 2 * KERNEL_PAD;
    size_t smem_size = sizeof(int) * smem_per_block_dim * smem_per_block_dim;

    convolve_2d<<<grid_size, block_size, smem_size>>>(d_matrix, d_result, result_dim);
    // Launch-configuration errors surface via cudaGetLastError; errors raised
    // while the kernel runs surface at the next synchronizing call.
    check_cuda(cudaGetLastError(), "convolve_2d launch");
    check_cuda(cudaDeviceSynchronize(), "convolve_2d execution");

    check_cuda(
        cudaMemcpy(h_result, d_result, result_bytes, cudaMemcpyDeviceToHost),
        "cudaMemcpy(device to host)"
    );
    verify_result(h_matrix, h_kernel, h_result, result_dim);

    check_cuda(cudaFree(d_matrix), "cudaFree(d_matrix)");
    check_cuda(cudaFree(d_result), "cudaFree(d_result)");

    free(h_matrix);
    free(h_kernel);
    free(h_result);

    printf("Success!\n");
    return 0;
}

Success!



# Profiler

In [25]:
# Mount Google Drive at /content/gdrive to get a persistent directory structure;
# the cells below keep the credential files and the cloned repository under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [27]:
# Working directory under /content/gdrive/MyDrive used by the cells below.
FOLDER = "cuda"
# GitHub organization and repository that are cloned and pulled below.
ORG = "TechDailyNotes"
REPO = "study-notes-cuda"

In [None]:
%mkdir /content/gdrive/MyDrive/{FOLDER}
%cd /content/gdrive/MyDrive/{FOLDER}
!echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
!echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt

In [31]:
%cd /content/gdrive/MyDrive/{FOLDER}

with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
# Use another file to store your github username
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git

%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!git pull

/content/gdrive/MyDrive/cuda
fatal: destination path 'study-notes-cuda' already exists and is not an empty directory.
/content/gdrive/MyDrive/cuda/study-notes-cuda
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 1.23 KiB | 24.00 KiB/s, done.
From https://github.com/TechDailyNotes/study-notes-cuda
   fbbf646..583305d  main       -> origin/main
Updating fbbf646..583305d
Fast-forward
 cuda_convolution_2d_constant_memory.cu | 111 [32m+++++++++++++++++++++++++++++++++++++++++++++++++++++[m
 1 file changed, 111 insertions(+)
 create mode 100644 cuda_convolution_2d_constant_memory.cu


In [32]:
# Switch to the repository checkout and list its contents; the build cell below
# compiles two of the .cu files found here.
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!ls

/content/gdrive/MyDrive/cuda/study-notes-cuda
basics					   cuda_convolution_1d_tiled.cu
cuda_basics.ipynb			   cuda_convolution_1d_tiled.ipynb
cuda_convolution_1d_cache		   cuda_convolution_2d_constant_memory.cu
cuda_convolution_1d_cache.cu		   cuda_convolution_2d_constant_memory.ipynb
cuda_convolution_1d_cache.ipynb		   cuda_convolution_2d_tiled
cuda_convolution_1d_constant_memory	   cuda_convolution_2d_tiled.cu
cuda_convolution_1d_constant_memory.cu	   cuda_convolution_2d_tiled.ipynb
cuda_convolution_1d_constant_memory.ipynb  cuda_cublas.ipynb
cuda_convolution_1d_naive		   cuda.ipynb
cuda_convolution_1d_naive.cu		   cuda_parallel_reduction.ipynb
cuda_convolution_1d_naive.ipynb		   README.md
cuda_convolution_1d_tiled


In [33]:
%%sh
# Build both 2D convolution programs so the cells below can profile each one.
nvcc -o cuda_convolution_2d_constant_memory cuda_convolution_2d_constant_memory.cu
nvcc -o cuda_convolution_2d_tiled cuda_convolution_2d_tiled.cu

In [34]:
# Profile the constant-memory build; compare the convolve_2d row with the tiled run below.
!nvprof ./cuda_convolution_2d_constant_memory

==21658== NVPROF is profiling process 21658, command: ./cuda_convolution_2d_constant_memory
Success!
==21658== Profiling application: ./cuda_convolution_2d_constant_memory
==21658== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   65.43%  1.9924ms         1  1.9924ms  1.9924ms  1.9924ms  [CUDA memcpy DtoH]
                   24.22%  737.55us         2  368.78us     704ns  736.85us  [CUDA memcpy HtoD]
                   10.35%  315.03us         1  315.03us  315.03us  315.03us  convolve_2d(int*, int*, int)
      API calls:   63.83%  104.89ms         2  52.444ms  76.129us  104.81ms  cudaMalloc
                   32.79%  53.886ms         1  53.886ms  53.886ms  53.886ms  cudaMemcpyToSymbol
                    2.98%  4.8955ms         2  2.4478ms  909.86us  3.9857ms  cudaMemcpy
                    0.25%  410.98us         2  205.49us  200.19us  210.80us  cudaFree
                    0.09%  152.75us       114  1.3390us     15

In [35]:
# Profile the tiled build compiled from cuda_convolution_2d_tiled.cu.
!nvprof ./cuda_convolution_2d_tiled

==21679== NVPROF is profiling process 21679, command: ./cuda_convolution_2d_tiled
Success!
==21679== Profiling application: ./cuda_convolution_2d_tiled
==21679== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.01%  1.6611ms         1  1.6611ms  1.6611ms  1.6611ms  [CUDA memcpy DtoH]
                   28.02%  775.44us         2  387.72us     704ns  774.73us  [CUDA memcpy HtoD]
                   11.97%  331.42us         1  331.42us  331.42us  331.42us  convolve_2d(int*, int*, int)
      API calls:   94.21%  91.520ms         2  45.760ms  106.30us  91.414ms  cudaMalloc
                    4.56%  4.4282ms         2  2.2141ms  944.44us  3.4838ms  cudaMemcpy
                    0.59%  573.17us         1  573.17us  573.17us  573.17us  cudaMemcpyToSymbol
                    0.44%  429.81us         2  214.91us  141.27us  288.55us  cudaFree
                    0.14%  135.31us       114  1.1860us     140ns  53.923us  cuDev