<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_histogram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
%%sh
nvcc --version
python3 -m pip install nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [22]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [23]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

// Streams every character of `vec` to `os`, one character per line, and
// returns `os` so calls can be chained.
//
// The vector is taken by const reference so that const vectors and
// temporaries can be printed as well; existing callers passing a non-const
// lvalue are unaffected.
ostream& operator<< (ostream& os, const vector<char>& vec) {
    // '\n' instead of endl: same bytes are written, but the stream is not
    // flushed after every single character.
    for (char c: vec) os << c << '\n';
    return os;
}

/**
 * Builds a histogram of the characters in `d_input`.
 *
 * Each element is mapped to bin `(d_input[i] - 'a') mod num_bins` and the
 * matching counter in `d_output` is incremented with `atomicAdd`. Counters
 * are only incremented, never reset, so the caller must initialise
 * `d_output` (e.g. to zero) before the launch.
 *
 * The kernel uses a grid-stride loop over a 1D launch, so it produces the
 * same counts for any grid/block size.
 *
 * @param d_input       device pointer to `num_elements` characters
 * @param d_output      device pointer to `num_bins` counters
 * @param num_elements  number of characters to process
 * @param num_bins      number of histogram bins; nothing is done if <= 0
 */
__global__ void histogram(char *d_input, int *d_output, int num_elements, int num_bins) {
    // `%` by zero is undefined and there is nowhere to count into.
    if (num_bins <= 0) return;

    // 64-bit index/stride so `i += stride` cannot overflow when
    // num_elements is close to INT_MAX.
    long long tid = threadIdx.x + (long long)blockIdx.x * blockDim.x;
    long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = tid; i < num_elements; i += stride) {
        int idx_char = d_input[i] - 'a';
        int idx_bin = idx_char % num_bins;
        // `%` keeps the sign of the dividend: a byte below 'a' would yield a
        // negative index and an out-of-bounds atomicAdd. Wrap it into range.
        if (idx_bin < 0) idx_bin += num_bins;
        atomicAdd(&d_output[idx_bin], 1);
    }
}

/**
 * Host driver for the histogram kernel.
 *
 * Generates 2^20 pseudo-random lowercase letters (fixed seed), counts them
 * into 7 bins on the GPU, verifies that the bin counts add up to the number
 * of input elements and writes the counts to "histogram.dat".
 *
 * Returns 0 on success; exits with a failure status on any CUDA error, on a
 * count mismatch, or if the output file cannot be opened.
 */
int main() {
    // Stop with a readable message as soon as any CUDA call fails; ignoring
    // a return code makes every later failure hard to attribute.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    int num_elements = 1 << 20;
    int num_bins = 7;

    size_t byte_elements = num_elements * sizeof(char);
    size_t byte_bins = num_bins * sizeof(int);

    vector<char> h_input(num_elements);
    vector<int> h_output(num_bins);

    // Fixed seed keeps the generated input reproducible between runs.
    srand(1);
    generate(begin(h_input), end(h_input), [](){return 'a' + rand() % 26;});
    // cout << h_input;

    char *d_input = nullptr;
    int *d_output = nullptr;

    check(cudaMalloc(&d_input, byte_elements), "cudaMalloc(d_input)");
    check(cudaMalloc(&d_output, byte_bins), "cudaMalloc(d_output)");
    check(cudaMemcpy(d_input, h_input.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");
    // cudaMalloc does not clear memory and the kernel only increments the
    // counters, so the bins have to start at zero.
    check(cudaMemset(d_output, 0, byte_bins), "cudaMemset(d_output)");

    int num_threads = 1 << 8;
    dim3 size_block(num_threads);
    dim3 size_grid((num_elements + num_threads - 1) / num_threads);  // ceil-div
    histogram<<<size_grid, size_block>>>(d_input, d_output, num_elements, num_bins);
    // A launch does not return a status itself; fetch it explicitly.
    check(cudaGetLastError(), "histogram launch");

    // Blocking copy: waits for the kernel and reports errors raised while it ran.
    check(cudaMemcpy(h_output.data(), d_output, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");

    // Explicit check instead of assert so the validation survives -DNDEBUG.
    int total = accumulate(begin(h_output), end(h_output), 0);
    if (total != num_elements) {
        cerr << "Histogram mismatch: counted " << total
             << " elements, expected " << num_elements << endl;
        cudaFree(d_input);
        cudaFree(d_output);
        return EXIT_FAILURE;
    }

    ofstream output_file;
    output_file.open("histogram.dat", ios::out | ios::trunc);
    if (!output_file) {
        cerr << "Could not open histogram.dat for writing" << endl;
        cudaFree(d_input);
        cudaFree(d_output);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_bins; i++) {
        output_file << "h_output[" << i << "] = " << h_output[i] << "\n";
    }
    output_file.close();

    check(cudaFree(d_input), "cudaFree(d_input)");
    check(cudaFree(d_output), "cudaFree(d_output)");

    cout << "Success!" << endl;
    return 0;
}

Success!



# Practice

In [37]:
%%cuda

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <vector>

using namespace std;

constexpr int NUM_CHARS = 26;

// Writes each character of `vec` to `os` on its own line and returns `os`
// so the call can be chained with further insertions.
//
// Accepting a const reference lets const vectors and temporaries be
// streamed too; callers that pass a non-const lvalue keep working.
ostream& operator<< (ostream& os, const vector<char>& vec) {
    // Plain '\n' produces the same output as endl without forcing a flush
    // after every character.
    for (char c: vec) os << c << '\n';
    return os;
}

/**
 * Counts the characters of `d_elements` into `num_bins` bins.
 *
 * Element i contributes to bin `(d_elements[i] - 'a') mod num_bins`; the
 * counter is bumped with `atomicAdd`. The kernel never resets `d_bins`, so
 * the caller has to initialise the counters before launching.
 *
 * Implemented as a grid-stride loop over a 1D launch: the result does not
 * depend on the grid/block size chosen by the caller.
 *
 * @param d_elements    device pointer to `num_elements` characters
 * @param d_bins        device pointer to `num_bins` counters
 * @param num_elements  number of characters to process
 * @param num_bins      number of bins; the kernel returns early if <= 0
 */
__global__ void histogram(char *d_elements, int *d_bins, int num_elements, int num_bins) {
    // Modulo by zero is undefined, and there would be no bin to update.
    if (num_bins <= 0) return;

    // 64-bit arithmetic so adding the stride cannot overflow for inputs
    // close to INT_MAX elements.
    long long stride = (long long)gridDim.x * blockDim.x;
    long long tid = threadIdx.x + (long long)blockIdx.x * blockDim.x;

    for (long long i = tid; i < num_elements; i += stride) {
        int idx_bin = (d_elements[i] - 'a') % num_bins;
        // The sign of `%` follows the dividend, so bytes below 'a' would
        // index before the start of d_bins; shift them back into range.
        if (idx_bin < 0) idx_bin += num_bins;
        atomicAdd(&d_bins[idx_bin], 1);
    }
}

/**
 * Host driver for the histogram kernel.
 *
 * Fills a buffer of 2^20 characters with pseudo-random lowercase letters
 * (fixed seed), bins them on the GPU into 7 bins, checks that the bin counts
 * sum to the input size and writes the counts to "histogram.dat".
 *
 * Returns 0 on success; exits with a failure status on any CUDA error, on a
 * count mismatch, or if the output file cannot be opened.
 */
int main() {
    // Fail fast with a readable message when a CUDA call reports an error,
    // instead of silently continuing with a broken state.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    int num_elements = 1 << 20;
    int num_bins = 7;

    size_t byte_elements = sizeof(char) * num_elements;
    size_t byte_bins = sizeof(int) * num_bins;

    vector<char> h_elements(num_elements);
    vector<int> h_bins(num_bins);

    // Fixed seed so every run sees the same input.
    srand(1);
    generate(
        begin(h_elements), end(h_elements),
        [](){return 'a' + rand() % NUM_CHARS;}
    );
    // cout << h_elements;

    char *d_elements = nullptr;
    int *d_bins = nullptr;

    check(cudaMalloc(&d_elements, byte_elements), "cudaMalloc(d_elements)");
    check(cudaMalloc(&d_bins, byte_bins), "cudaMalloc(d_bins)");
    check(cudaMemcpy(d_elements, h_elements.data(), byte_elements, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");
    // Memory returned by cudaMalloc is not cleared and the kernel only adds
    // to the counters, so zero them before the launch.
    check(cudaMemset(d_bins, 0, byte_bins), "cudaMemset(d_bins)");

    int num_threads = 1 << 8;
    dim3 size_block(num_threads);
    dim3 size_grid((num_elements + num_threads - 1) / num_threads);  // ceil-div
    histogram<<<size_grid, size_block>>>(d_elements, d_bins, num_elements, num_bins);
    // Kernel launches return nothing; query the launch status explicitly.
    check(cudaGetLastError(), "histogram launch");

    // Blocking copy: waits for the kernel and reports errors raised while it ran.
    check(cudaMemcpy(h_bins.data(), d_bins, byte_bins, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");

    // Explicit check rather than assert so it is not compiled out by -DNDEBUG.
    int total = accumulate(begin(h_bins), end(h_bins), 0);
    if (total != num_elements) {
        cerr << "Histogram mismatch: counted " << total
             << " elements, expected " << num_elements << endl;
        cudaFree(d_elements);
        cudaFree(d_bins);
        return EXIT_FAILURE;
    }

    ofstream f_output;
    f_output.open("histogram.dat", ios::out | ios::trunc);
    if (!f_output) {
        cerr << "Could not open histogram.dat for writing" << endl;
        cudaFree(d_elements);
        cudaFree(d_bins);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < num_bins; i++) {
        // '\n' writes the same bytes as endl without flushing per line.
        f_output << "h_output[" << i << "] = " << h_bins[i] << '\n';
    }
    f_output.close();

    check(cudaFree(d_elements), "cudaFree(d_elements)");
    check(cudaFree(d_bins), "cudaFree(d_bins)");

    cout << "Success!" << endl;

    return 0;
}

Success!



# Profiler