<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_histogram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%sh
nvcc --version
python3 -m pip install nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpcnsu9cdj".


In [20]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

// Debug helper: writes every character of `vec` to `os`, one per line, and
// returns `os` so that insertions can be chained.
//
// `vec` is taken by const reference, so const vectors and temporaries can be
// printed as well as mutable lvalues.
ostream& operator<< (ostream& os, const vector<char>& vec) {
    // '\n' emits the same bytes as endl but does not flush the stream after
    // every single character, which matters for large vectors.
    for (const char c : vec) os << c << '\n';
    return os;
}

// Counts the characters of d_input into num_bins bins.
//
// Each character c is mapped to bin ((c - 'a') mod num_bins) and that bin's
// counter in d_output is incremented with atomicAdd.
//
// Parameters:
//   d_input      - characters to count; num_elements entries are read.
//   d_output     - bin counters; num_bins entries may be incremented. The
//                  kernel only adds to them and never clears them, so counts
//                  accumulate on top of whatever the buffer already holds.
//   num_elements - number of characters in d_input; <= 0 makes the kernel a no-op.
//   num_bins     - number of counters in d_output; <= 0 makes the kernel a no-op.
//
// Launch layout: only the x dimension of the grid and block is used. The loop
// is a grid-stride loop, so any 1D launch configuration covers every element.
// No shared memory is used.
__global__ void histogram(char *d_input, int *d_output, int num_elements, int num_bins) {
    // Nothing to count, or no bin to count into (this also avoids "% 0").
    if (num_elements <= 0 || num_bins <= 0) return;

    // Index arithmetic is done in size_t so that neither the starting index nor
    // the stride increment can overflow a 32-bit int for large inputs/grids.
    const size_t total = static_cast<size_t>(num_elements);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < total; i += stride) {
        int idx_bin = (d_input[i] - 'a') % num_bins;
        // '%' keeps the sign of the dividend: a character below 'a' yields a
        // negative remainder, which would index before the start of d_output.
        // Wrap it back into [0, num_bins) so every element lands in a valid bin.
        if (idx_bin < 0) idx_bin += num_bins;
        atomicAdd(&d_output[idx_bin], 1);
    }
}

// Host driver: fills a buffer with 2^20 pseudo-random lowercase letters (fixed
// seed), counts them into 7 bins on the GPU with the `histogram` kernel, checks
// that the bin counts add up to the number of elements, and writes the counts
// to "histogram.dat".
//
// Returns 0 on success. Exits/returns with EXIT_FAILURE if a CUDA call fails,
// the counts do not add up, or the output file cannot be written.
int main() {
    const int num_elements = 1 << 20;
    const int num_bins = 7;

    const size_t byte_elements = num_elements * sizeof(char);
    const size_t byte_bins = num_bins * sizeof(int);

    // Stop at the first failing CUDA call and say which step it was; carrying
    // on after an error would only produce misleading follow-up failures.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr << "CUDA error during " << what << ": "
                 << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    vector<char> h_input(num_elements);
    vector<int> h_output(num_bins);

    // Fixed seed so every run produces the same input.
    srand(1);
    generate(begin(h_input), end(h_input), [](){return static_cast<char>('a' + rand() % 26);});
    // cout << h_input;

    char *d_input = nullptr;
    int *d_output = nullptr;

    check(cudaMalloc(&d_input, byte_elements), "cudaMalloc(d_input)");
    check(cudaMalloc(&d_output, byte_bins), "cudaMalloc(d_output)");
    check(cudaMemcpy(d_input, h_input.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> d_input)");
    // cudaMalloc does not clear the allocation and the kernel only increments
    // the bins, so they have to start from zero.
    check(cudaMemset(d_output, 0, byte_bins), "cudaMemset(d_output)");

    const int num_threads = 1 << 8;
    dim3 size_block(num_threads);
    // Ceiling division: enough blocks to give every element its own thread.
    dim3 size_grid((num_elements + num_threads - 1) / num_threads);
    histogram<<<size_grid, size_block>>>(d_input, d_output, num_elements, num_bins);
    // A kernel launch returns nothing; bad launch configurations show up here.
    check(cudaGetLastError(), "histogram kernel launch");

    // Blocking copy: waits for the kernel to finish and also reports any error
    // that occurred while it was running.
    check(cudaMemcpy(h_output.data(), d_output, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(d_output -> host)");

    check(cudaFree(d_input), "cudaFree(d_input)");
    check(cudaFree(d_output), "cudaFree(d_output)");

    // Every element must have been counted exactly once. This is a runtime
    // check rather than an assert so it is not compiled out under NDEBUG.
    const long long counted = accumulate(begin(h_output), end(h_output), 0LL);
    if (counted != num_elements) {
        cerr << "Histogram mismatch: counted " << counted
             << " elements, expected " << num_elements << endl;
        return EXIT_FAILURE;
    }

    ofstream output_file;
    output_file.open("histogram.dat", ios::out | ios::trunc);
    if (!output_file) {
        cerr << "Could not open histogram.dat for writing" << endl;
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_bins; i++) {
        output_file << "h_output[" << i << "] = " << h_output[i] << "\n";
    }
    output_file.close();
    if (!output_file) {
        cerr << "Failed while writing histogram.dat" << endl;
        return EXIT_FAILURE;
    }

    cout << "Success!" << endl;
    return 0;
}

Success!



# Practice

# Profiler