<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_histogram_shared_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%sh
# Print the CUDA compiler version available in this runtime.
nvcc --version
# Install the Jupyter extension loaded in the next cell. The version is pinned
# to the one this notebook was last run with so that reruns are reproducible.
python3 -m pip install nvcc4jupyter==1.2.1

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
# Load the nvcc4jupyter extension installed by the previous cell.
# Presumably this is what registers the `%%cuda` cell magic used below -- confirm.
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpdl6a8uhm".


In [32]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

constexpr int NUM_CHARS = 26;

// Write every character of `vec` to `os`, one character per line, and return
// `os` so that calls can be chained.
// The vector is taken by const reference: the function never modifies it, and
// this lets const vectors and temporaries be printed as well.
ostream& operator<< (ostream& os, const vector<char>& vec) {
    for (char c: vec) os << c << endl;
    return os;
}

// Histogram kernel that accumulates per-block counts in shared memory before
// merging them into the global result.
//
//   d_elements   : input characters; element c is counted in bin (c - 'a') % num_bins.
//   d_bins       : global counters, one per bin. The kernel only adds to them
//                  (atomicAdd), so results accumulate on top of whatever they hold.
//   num_elements : number of entries in d_elements.
//   num_bins     : number of bins; s_memory is indexed from 0 to num_bins - 1,
//                  so the launch must provide dynamic shared memory for that many ints.
__global__ void histogram(char *d_elements, int *d_bins, int num_elements, int num_bins) {
    extern __shared__ int s_memory[];

    const int local_id = threadIdx.x;
    const int global_id = threadIdx.x + blockIdx.x * blockDim.x;
    const int block_stride = blockDim.x;
    const int grid_stride = gridDim.x * blockDim.x;

    // Phase 1: the threads of the block cooperatively zero the shared bins.
    int bin = local_id;
    while (bin < num_bins) {
        s_memory[bin] = 0;
        bin += block_stride;
    }
    __syncthreads();

    // Phase 2: grid-stride walk over the input; each element bumps its bin in
    //          this block's shared-memory histogram.
    int pos = global_id;
    while (pos < num_elements) {
        const int offset = d_elements[pos] - 'a';
        atomicAdd(s_memory + offset % num_bins, 1);
        pos += grid_stride;
    }
    __syncthreads();

    // Phase 3: fold this block's partial counts into the global bins.
    bin = local_id;
    while (bin < num_bins) {
        atomicAdd(d_bins + bin, s_memory[bin]);
        bin += block_stride;
    }
}

// Host driver: fills a buffer with pseudo-random lowercase letters, runs the
// shared-memory histogram kernel over it, checks that the bin counts add up to
// the number of elements, and writes the counts to "histogram.dat".
// Returns 0 on success; exits with EXIT_FAILURE if a CUDA runtime call fails.
int main() {
    int num_elements = 1 << 20;
    int num_bins = 7;

    size_t byte_elements = sizeof(char) * num_elements;
    size_t byte_bins = sizeof(int) * num_bins;

    // Stop with a readable message as soon as a CUDA runtime call reports an
    // error, instead of silently continuing with invalid device state.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr << "[int main()] " << what << " failed: "
                 << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    vector<char> h_elements(num_elements);
    vector<int> h_bins(num_bins);

    // Fixed seed so every run histograms the same input.
    srand(1);
    generate(begin(h_elements), end(h_elements), [](){return 'a' + rand() % NUM_CHARS;});
    // cout << h_elements;

    char *d_elements = nullptr;
    int *d_bins = nullptr;
    check(cudaMalloc(&d_elements, byte_elements), "cudaMalloc(d_elements)");
    check(cudaMalloc(&d_bins, byte_bins), "cudaMalloc(d_bins)");
    check(cudaMemcpy(d_elements, h_elements.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");
    // cudaMalloc does not clear the allocation, and the kernel only ever adds
    // to d_bins, so the counters must start from zero explicitly.
    check(cudaMemset(d_bins, 0, byte_bins), "cudaMemset(d_bins)");

    int num_threads = 1 << 8;
    int num_blocks = (num_elements + num_threads - 1) / num_threads;  // ceiling division
    int scale_grid = 1;  // divisor applied to the block count of the launch
    dim3 size_block(num_threads);
    dim3 size_grid(num_blocks / scale_grid);
    size_t size_cache = byte_bins;  // dynamic shared memory: one int per bin
    histogram<<<size_grid, size_block, size_cache>>>(d_elements, d_bins, num_elements, num_bins);
    // A kernel launch returns nothing; launch errors are reported here.
    check(cudaGetLastError(), "histogram launch");

    check(cudaMemcpy(h_bins.data(), d_bins, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");
    // Every element lands in exactly one bin, so the counts must sum to the input size.
    assert(num_elements == accumulate(begin(h_bins), end(h_bins), 0));

    ofstream f_bins;
    f_bins.open("histogram.dat", ios::out | ios::trunc);
    for (int i: h_bins) f_bins << i << endl;
    f_bins.close();

    check(cudaFree(d_elements), "cudaFree(d_elements)");
    check(cudaFree(d_bins), "cudaFree(d_bins)");

    cout << "[int main()] pass!" << endl;
    return 0;
}

[int main()] pass!



# Practice

# Profiler

In [33]:
# Names that later cells use to build the Drive working path and the GitHub URL.
FOLDER = "cuda"
ORG = "TechDailyNotes"
REPO = "study-notes-cuda"

# Attach Google Drive at /content/gdrive so the working directory persists.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [34]:
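# One-time setup: uncomment, replace the YOUR_TOKEN / YOUR_HANDLE placeholders,
# run once, then comment out again. It creates the working folder on Drive and
# stores the GitHub token and username in the files that the next cell reads.
# %mkdir /content/gdrive/MyDrive/{FOLDER}
# %cd /content/gdrive/MyDrive/{FOLDER}
# !echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
# !echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt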

In [47]:
%cd /content/gdrive/MyDrive/{FOLDER}
with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!git pull
!ls

/content/gdrive/MyDrive/cuda
fatal: destination path 'study-notes-cuda' already exists and is not an empty directory.
/content/gdrive/MyDrive/cuda/study-notes-cuda
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 1.20 KiB | 43.00 KiB/s, done.
From https://github.com/TechDailyNotes/study-notes-cuda
   7b6787a..ce1eb9a  main       -> origin/main
Updating 7b6787a..ce1eb9a
Fast-forward
 cuda_histogram_shared_memory_32.cu | 86 [32m++++++++++++++++++++++++++++++++++++++++++++++++++++++++++[m
 1 file changed, 86 insertions(+)
 create mode 100644 cuda_histogram_shared_memory_32.cu
basics					   cuda_convolution_2d_tiled.ipynb
cuda_basics.ipynb			   cuda_cublas.ipynb
cuda_convolution_1d_cache		   cuda_histogram
cuda_convolution_1d_cache.cu		   cuda_histogram.cu
cuda_convolution_1d_cache.ipynb		   cuda_h

In [48]:
%%sh
# Compile each histogram variant: source file <name>.cu becomes executable <name>.
for target in \
    cuda_histogram_global_memory \
    cuda_histogram_shared_memory \
    cuda_histogram_shared_memory_4 \
    cuda_histogram_shared_memory_32
do
    nvcc -o "$target" "$target.cu"
done

In [49]:
# Run the cuda_histogram_global_memory executable built above under nvprof.
!nvprof ./cuda_histogram_global_memory

==23047== NVPROF is profiling process 23047, command: ./cuda_histogram_global_memory
Success!
==23047== Profiling application: ./cuda_histogram_global_memory
==23047== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   85.39%  539.44us         1  539.44us  539.44us  539.44us  histogram(char*, int*, int, int)
                   14.28%  90.206us         1  90.206us  90.206us  90.206us  [CUDA memcpy HtoD]
                    0.33%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   99.21%  176.59ms         2  88.296ms  6.0060us  176.59ms  cudaMalloc
                    0.47%  833.40us         2  416.70us  282.80us  550.60us  cudaMemcpy
                    0.12%  215.45us         2  107.72us  17.700us  197.75us  cudaFree
                    0.11%  204.39us         1  204.39us  204.39us  204.39us  cudaLaunchKernel
                    0.08%  135.32us       114  1.1870us     145ns  53.149u

In [50]:
# Run the cuda_histogram_shared_memory executable built above under nvprof.
!nvprof ./cuda_histogram_shared_memory

==23062== NVPROF is profiling process 23062, command: ./cuda_histogram_shared_memory
[int main()] pass!
==23062== Profiling application: ./cuda_histogram_shared_memory
==23062== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   58.42%  88.030us         1  88.030us  88.030us  88.030us  [CUDA memcpy HtoD]
                   40.18%  60.543us         1  60.543us  60.543us  60.543us  histogram(char*, int*, int, int)
                    1.40%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   99.45%  162.83ms         2  81.414ms  3.7440us  162.83ms  cudaMalloc
                    0.20%  323.63us         2  161.81us  74.841us  248.79us  cudaMemcpy
                    0.12%  201.97us         2  100.99us  19.063us  182.91us  cudaFree
                    0.11%  181.60us         1  181.60us  181.60us  181.60us  cudaLaunchKernel
                    0.10%  163.59us       114  1.4340us     137n

In [51]:
# Run the cuda_histogram_shared_memory_4 executable built above under nvprof.
!nvprof ./cuda_histogram_shared_memory_4

==23077== NVPROF is profiling process 23077, command: ./cuda_histogram_shared_memory_4
[int main()] pass!
==23077== Profiling application: ./cuda_histogram_shared_memory_4
==23077== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   68.98%  88.030us         1  88.030us  88.030us  88.030us  [CUDA memcpy HtoD]
                   29.36%  37.471us         1  37.471us  37.471us  37.471us  histogram(char*, int*, int, int)
                    1.66%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   99.51%  159.26ms         2  79.630ms  3.9450us  159.26ms  cudaMalloc
                    0.19%  309.63us         2  154.82us  49.586us  260.04us  cudaMemcpy
                    0.12%  187.00us         1  187.00us  187.00us  187.00us  cudaLaunchKernel
                    0.09%  141.00us         2  70.502us  18.736us  122.27us  cudaFree
                    0.08%  124.40us       114  1.0910us     

In [53]:
!nvprof ./cuda_histogram_shared_memory_32 cuda_histogram_shared_memory_32.cu

==23132== NVPROF is profiling process 23132, command: ./cuda_histogram_shared_memory_32 cuda_histogram_shared_memory_32.cu
[int main()] pass!
==23132== Profiling application: ./cuda_histogram_shared_memory_32 cuda_histogram_shared_memory_32.cu
==23132== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   72.31%  89.854us         1  89.854us  89.854us  89.854us  [CUDA memcpy HtoD]
                   25.96%  32.255us         1  32.255us  32.255us  32.255us  histogram(char*, int*, int, int)
                    1.73%  2.1450us         1  2.1450us  2.1450us  2.1450us  [CUDA memcpy DtoH]
      API calls:   99.49%  184.39ms         2  92.196ms  6.1250us  184.39ms  cudaMalloc
                    0.18%  334.61us         2  167.31us  47.815us  286.80us  cudaMemcpy
                    0.11%  200.71us         1  200.71us  200.71us  200.71us  cudaLaunchKernel
                    0.10%  193.54us         2  96.770us  18.075us  175.47u