<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_histogram_global_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
%%sh
nvcc --version
python3 -m pip install nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [22]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [23]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

// Debug helper: writes each character of `vec` to `os` followed by std::endl
// (newline + flush), then hands the stream back so insertions can be chained.
std::ostream& operator<<(std::ostream& os, std::vector<char>& vec) {
    for (auto it = vec.begin(); it != vec.end(); ++it) {
        os << *it;
        os << std::endl;
    }
    return os;
}

// Counts input bytes into `num_bins` buckets in global memory.
//
// Element i contributes one count to bin ((d_input[i] - 'a') mod num_bins),
// with the result always mapped into [0, num_bins).
//
// Parameters:
//   d_input      - device pointer to `num_elements` bytes to classify.
//   d_output     - device pointer to at least `num_bins` ints. Counts are only
//                  ever added (atomicAdd), never reset, so the buffer must be
//                  zeroed before launch for the result to be a histogram.
//   num_elements - number of bytes in d_input; <= 0 makes the kernel a no-op.
//   num_bins     - number of buckets; <= 0 makes the kernel a no-op.
//
// Launch: 1D grid of 1D blocks, any size. Every thread starts at its global
// index and advances by the total thread count, so the grid does not have to
// cover the input exactly.
__global__ void histogram(char *d_input, int *d_output, int num_elements, int num_bins) {
    // Nothing to count, or no valid bin to count into (also avoids `% 0`).
    if (num_elements <= 0 || num_bins <= 0) return;

    // Index math is done in size_t so blockIdx.x * blockDim.x and the stride
    // cannot wrap around on large launches.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t count = static_cast<size_t>(num_elements);

    for (size_t i = first; i < count; i += stride) {
        // C++ `%` keeps the sign of the dividend: a byte below 'a' would give
        // a negative bin and an out-of-bounds atomicAdd, so fold it back.
        int idx_bin = (d_input[i] - 'a') % num_bins;
        if (idx_bin < 0) idx_bin += num_bins;
        atomicAdd(&d_output[idx_bin], 1);
    }
}

// Host driver for the histogram kernel.
//
// Fills a host buffer with pseudo-random lowercase letters (fixed seed), copies
// it to the device, runs `histogram`, checks that the bin counts add up to the
// number of input elements, and writes the counts to "histogram.dat".
//
// Returns 0 on success. Exits with EXIT_FAILURE if a CUDA runtime call fails,
// and returns EXIT_FAILURE if verification or the output file fails.
int main() {
    // Stops the program with a readable message when a CUDA call reports an
    // error, instead of letting a later call fail for an unrelated-looking reason.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    const int num_elements = 1 << 20;
    const int num_bins = 7;

    const size_t byte_elements = num_elements * sizeof(char);
    const size_t byte_bins = num_bins * sizeof(int);

    vector<char> h_input(num_elements);
    vector<int> h_output(num_bins);

    srand(1);  // fixed seed: the same input on every run
    generate(begin(h_input), end(h_input),
             []() { return static_cast<char>('a' + rand() % 26); });
    // cout << h_input;

    char *d_input = nullptr;
    int *d_output = nullptr;

    check(cudaMalloc(&d_input, byte_elements), "cudaMalloc(d_input)");
    check(cudaMalloc(&d_output, byte_bins), "cudaMalloc(d_output)");
    check(cudaMemcpy(d_input, h_input.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");
    // cudaMalloc does not clear memory and the kernel only adds to the bins,
    // so the counters have to start from zero explicitly.
    check(cudaMemset(d_output, 0, byte_bins), "cudaMemset(d_output)");

    const int num_threads = 1 << 8;
    dim3 size_block(num_threads);
    dim3 size_grid((num_elements + num_threads - 1) / num_threads);
    histogram<<<size_grid, size_block>>>(d_input, d_output, num_elements, num_bins);
    check(cudaGetLastError(), "histogram launch");          // bad launch configuration
    check(cudaDeviceSynchronize(), "histogram execution");  // faults raised while running

    check(cudaMemcpy(h_output.data(), d_output, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");

    // The results are on the host now; release the device buffers before any
    // early return below.
    check(cudaFree(d_input), "cudaFree(d_input)");
    check(cudaFree(d_output), "cudaFree(d_output)");

    // Explicit check instead of assert(): it must still run when NDEBUG is defined.
    const long long total = accumulate(begin(h_output), end(h_output), 0LL);
    if (total != num_elements) {
        cerr << "Verification failed: bins sum to " << total
             << ", expected " << num_elements << endl;
        return EXIT_FAILURE;
    }

    ofstream output_file("histogram.dat", ios::out | ios::trunc);
    if (!output_file) {
        cerr << "Could not open histogram.dat for writing" << endl;
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_bins; i++) {
        output_file << "h_output[" << i << "] = " << h_output[i] << "\n";
    }
    output_file.close();

    cout << "Success!" << endl;
    return 0;
}

Success!



# Practice

In [37]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

constexpr int NUM_CHARS = 26;

// Prints the characters of `vec` one per line on `os`; each line is terminated
// with std::endl, so the stream is flushed after every character.
// Returns `os` to allow chained insertions.
std::ostream& operator<<(std::ostream& os, std::vector<char>& vec) {
    const std::size_t total = vec.size();
    for (std::size_t pos = 0; pos < total; ++pos) {
        os << vec[pos] << std::endl;
    }
    return os;
}

// Counts input bytes into `num_bins` buckets in global memory.
//
// Element i adds one to bin ((d_elements[i] - 'a') mod num_bins), with the
// result always mapped into [0, num_bins).
//
// Parameters:
//   d_elements   - device pointer to `num_elements` bytes to classify.
//   d_bins       - device pointer to at least `num_bins` ints. The kernel only
//                  adds to it (atomicAdd), so it must be zeroed before launch
//                  for the result to be a histogram.
//   num_elements - number of bytes in d_elements; <= 0 makes the kernel a no-op.
//   num_bins     - number of buckets; <= 0 makes the kernel a no-op.
//
// Launch: 1D grid of 1D blocks, any size. Threads advance by the total thread
// count, so the grid need not match the input length.
__global__ void histogram(char *d_elements, int *d_bins, int num_elements, int num_bins) {
    // No input or no usable bin: return before any `% 0` or stray write.
    if (num_elements <= 0 || num_bins <= 0) return;

    // size_t arithmetic keeps blockIdx.x * blockDim.x and the stride from
    // wrapping on large launches.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t count = static_cast<size_t>(num_elements);

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        // `%` follows the sign of the dividend; a byte below 'a' would index
        // before the start of d_bins, so fold negative remainders back.
        int idx_bin = (d_elements[i] - 'a') % num_bins;
        if (idx_bin < 0) idx_bin += num_bins;
        atomicAdd(&d_bins[idx_bin], 1);
    }
}

// Host driver for the histogram kernel (practice version).
//
// Generates pseudo-random lowercase letters with a fixed seed, runs `histogram`
// on the device, verifies that the bins sum to the element count, and writes
// the per-bin counts to "histogram.dat".
//
// Returns 0 on success. Exits with EXIT_FAILURE if a CUDA runtime call fails,
// and returns EXIT_FAILURE if verification or the output file fails.
int main() {
    // Reports the failing CUDA call and stops; ignored error codes would
    // otherwise surface later as confusing failures of unrelated calls.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    const int num_elements = 1 << 20;
    const int num_bins = 7;

    const size_t byte_elements = sizeof(char) * num_elements;
    const size_t byte_bins = sizeof(int) * num_bins;

    vector<char> h_elements(num_elements);
    vector<int> h_bins(num_bins);

    srand(1);  // fixed seed: the same input on every run
    generate(
        begin(h_elements), end(h_elements),
        []() { return static_cast<char>('a' + rand() % NUM_CHARS); }
    );
    // cout << h_elements;

    char *d_elements = nullptr;
    int *d_bins = nullptr;

    check(cudaMalloc(&d_elements, byte_elements), "cudaMalloc(d_elements)");
    check(cudaMalloc(&d_bins, byte_bins), "cudaMalloc(d_bins)");
    check(cudaMemcpy(d_elements, h_elements.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");
    // The kernel only increments the bins and cudaMalloc leaves memory
    // uninitialized, so clear the counters before launching.
    check(cudaMemset(d_bins, 0, byte_bins), "cudaMemset(d_bins)");

    const int num_threads = 1 << 8;
    dim3 size_block(num_threads);
    dim3 size_grid((num_elements + num_threads - 1) / num_threads);
    histogram<<<size_grid, size_block>>>(d_elements, d_bins, num_elements, num_bins);
    check(cudaGetLastError(), "histogram launch");          // bad launch configuration
    check(cudaDeviceSynchronize(), "histogram execution");  // faults raised while running

    check(cudaMemcpy(h_bins.data(), d_bins, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");

    // Device buffers are no longer needed once the counts are on the host.
    check(cudaFree(d_elements), "cudaFree(d_elements)");
    check(cudaFree(d_bins), "cudaFree(d_bins)");

    // Explicit check rather than assert(): it must still run when NDEBUG is defined.
    const long long total = accumulate(begin(h_bins), end(h_bins), 0LL);
    if (total != num_elements) {
        cerr << "Verification failed: bins sum to " << total
             << ", expected " << num_elements << endl;
        return EXIT_FAILURE;
    }

    ofstream f_output("histogram.dat", ios::out | ios::trunc);
    if (!f_output) {
        cerr << "Could not open histogram.dat for writing" << endl;
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_bins; i++) {
        f_output << "h_output[" << i << "] = " << h_bins[i] << endl;
    }
    f_output.close();

    cout << "Success!" << endl;

    return 0;
}

Success!



# Profiler

In [38]:
# Mount google drive to use a persistent directory structure
from google.colab import drive
drive.mount('/content/gdrive')

# Names reused by the later cells to build the Drive path and the GitHub URL.
FOLDER = "cuda"  # sub-folder under /content/gdrive/MyDrive
ORG = "TechDailyNotes"  # GitHub organization in the clone URL
REPO = "study-notes-cuda"  # repository name in the clone URL and checkout path

Mounted at /content/gdrive


In [40]:
# %mkdir /content/gdrive/MyDrive/{FOLDER}
# %cd /content/gdrive/MyDrive/{FOLDER}
# !echo 'YOUR_TOKEN' > /content/gdrive/MyDrive/{FOLDER}/token.txt
# !echo 'YOUR_HANDLE' > /content/gdrive/MyDrive/{FOLDER}/git_username.txt

In [47]:
%cd /content/gdrive/MyDrive/{FOLDER}
with open(f'/content/gdrive/MyDrive/{FOLDER}/token.txt') as f:
    token = f.readline().strip()
with open(f'/content/gdrive/MyDrive/{FOLDER}/git_username.txt') as f:
    handle = f.readline().strip()

YOUR_TOKEN = token
YOUR_HANDLE = handle

!git clone https://{YOUR_TOKEN}@github.com/{ORG}/{REPO}.git
%cd /content/gdrive/MyDrive/{FOLDER}/{REPO}
!git pull
!ls

/content/gdrive/MyDrive/cuda
fatal: destination path 'study-notes-cuda' already exists and is not an empty directory.
/content/gdrive/MyDrive/cuda/study-notes-cuda
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 1010 bytes | 22.00 KiB/s, done.
From https://github.com/TechDailyNotes/study-notes-cuda
   d9662d1..27664ee  main       -> origin/main
Updating d9662d1..27664ee
Fast-forward
 cuda_histogram.cu | 10 [32m+++++[m[31m-----[m
 1 file changed, 5 insertions(+), 5 deletions(-)
basics					   cuda_convolution_1d_tiled.ipynb
cuda_basics.ipynb			   cuda_convolution_2d_constant_memory
cuda_convolution_1d_cache		   cuda_convolution_2d_constant_memory.cu
cuda_convolution_1d_cache.cu		   cuda_convolution_2d_constant_memory.ipynb
cuda_convolution_1d_cache.ipynb		   cuda_convolution_2d_tiled
cuda_conv

In [48]:
%%sh
nvcc -o cuda_histogram cuda_histogram.cu

In [54]:
!nvprof ./cuda_histogram

==19866== NVPROF is profiling process 19866, command: ./cuda_histogram
Success!
==19866== Profiling application: ./cuda_histogram
==19866== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   85.65%  539.54us         1  539.54us  539.54us  539.54us  histogram(char*, int*, int, int)
                   14.02%  88.350us         1  88.350us  88.350us  88.350us  [CUDA memcpy HtoD]
                    0.33%  2.0800us         1  2.0800us  2.0800us  2.0800us  [CUDA memcpy DtoH]
      API calls:   98.02%  68.976ms         2  34.488ms  7.5160us  68.969ms  cudaMalloc
                    1.17%  823.67us         2  411.83us  273.61us  550.05us  cudaMemcpy
                    0.32%  226.77us         2  113.39us  18.660us  208.11us  cudaFree
                    0.26%  186.10us         1  186.10us  186.10us  186.10us  cudaLaunchKernel
                    0.18%  128.48us       114  1.1260us     150ns  50.529us  cuDeviceGetAttribute
    