<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_parallel_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp_qkkp8qv".


# Chapter 1: CUDA Parallel Reduction Part 1

In [10]:
%%cuda

#include <assert.h>
#include <cuda_runtime.h>
#include <stdio.h>

const int GRID_DIM_X = 1 << 8;
const int BLOCK_DIM_X = 1 << 8;

__global__ void sumReduce(int *vector, int *vectorSum) {
    // Step 0: Get the current thread's index.
    int ti = blockIdx.x * blockDim.x + threadIdx.x;

    // Step 1: Move elements from memory to cache.
    __shared__ int partialSum[BLOCK_DIM_X];
    partialSum[threadIdx.x] = vector[ti];
    __syncthreads();

    // Step 2: Divide and conquer the sum in one block.
    for (int si = 1; si < BLOCK_DIM_X; si *= 2) {
        if (threadIdx.x % (si * 2) == 0) {
            partialSum[threadIdx.x] += partialSum[threadIdx.x + si];
        }
        __syncthreads();
    }

    // Step 3: Move the sum from cache to memory.
    if (threadIdx.x == 0) {
        vectorSum[blockIdx.x] = partialSum[0];
    }
}

void vectorInit(int *h_vector, int numElements) {
    for (int i = 0; i < numElements; i++) {
        h_vector[i] = 1;
    }
}

int main() {
    // Step 0: Set the number and bytes of the vector.
    int numElements = GRID_DIM_X * BLOCK_DIM_X;
    size_t numBytes = sizeof(int) * numElements;

    // Step 1: Initialize the host and device memories.
    int *h_vector = (int*) malloc(numBytes);
    int *h_vectorSum = (int*) malloc(numBytes);
    vectorInit(h_vector, numElements);

    int *d_vector, *d_vectorSum;
    cudaMalloc(&d_vector, numBytes);
    cudaMalloc(&d_vectorSum, numBytes);

    // Step 2: Launch the kernel function to sum up the vector.
    cudaMemcpy(d_vector, h_vector, numBytes, cudaMemcpyHostToDevice);
    sumReduce<<<GRID_DIM_X, BLOCK_DIM_X>>>(d_vector, d_vectorSum);
    sumReduce<<<1, BLOCK_DIM_X>>>(d_vectorSum, d_vectorSum);
    cudaMemcpy(h_vectorSum, d_vectorSum, numBytes, cudaMemcpyDeviceToHost);

    printf("h_vectorSum[0] == %d\n", h_vectorSum[0]);
    assert(h_vectorSum[0] == 65536);

    // Step 3: Clear the allocated memories.
    free(h_vector);
    free(h_vectorSum);
    cudaFree(d_vector);
    cudaFree(d_vectorSum);

    printf("Success!");
    return 0;
}

h_vectorSum[0] == 65536
Success!


## Practice

In [13]:
%%cuda

#include <algorithm>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>

using namespace std;

const int GRID_DIM_X = 1 << 8;
const int BLOCK_DIM_X = 1 << 8;

__global__ void sumReduce(int *d_vector, int *d_vectorSum) {
    // Step 0: Get the thread id and element id.
    int vi = blockDim.x * blockIdx.x + threadIdx.x;
    int ti = threadIdx.x;

    // Step 1: Move elements from the vector to the cache.
    __shared__ int partialSum[BLOCK_DIM_X];
    partialSum[ti] = d_vector[vi];
    __syncthreads();

    // Step 2: Sum all elements in the same block.
    for (int si = 1; si < BLOCK_DIM_X; si *= 2) {
        if (ti % (2 * si) == 0) {
            partialSum[ti] += partialSum[ti + si];
        }
        __syncthreads();
    }

    // Step 3: Move the sum value to the vector.
    if (threadIdx.x == 0) {
        d_vectorSum[blockIdx.x] = partialSum[0];
    }
}

void vectorInit(int *h_vector, int numElements) {
    fill_n(h_vector, numElements, 1);
    // memset(h_vector, 1, numElements);
}

int main() {
    // Step 0: Set the hyperparameters of vectors.
    int numElements = GRID_DIM_X * BLOCK_DIM_X;
    size_t numBytes = sizeof(int) * numElements;

    // Step 1: Initialize memories for vectors in both the host and device.
    int *h_vector = (int*) malloc(numBytes);
    int *h_vectorSum = (int*) malloc(numBytes);
    vectorInit(h_vector, numElements);

    int *d_vector, *d_vectorSum;
    cudaMalloc(&d_vector, numBytes);
    cudaMalloc(&d_vectorSum, numBytes);

    // Step 2: Launch the kernel function to sum up all elements.
    cudaMemcpy(d_vector, h_vector, numBytes, cudaMemcpyHostToDevice);

    time_t start = time(NULL);
    sumReduce<<<GRID_DIM_X, BLOCK_DIM_X>>>(d_vector, d_vectorSum);
    sumReduce<<<1, BLOCK_DIM_X>>>(d_vectorSum, d_vectorSum);
    time_t end = time(NULL);
    printf("Time taken is %f seconds.\n", difftime(end, start));

    cudaMemcpy(h_vectorSum, d_vectorSum, numBytes, cudaMemcpyDeviceToHost);
    printf("h_vectorSum[0] == %d\n", h_vectorSum[0]);

    // Step 3: Clear allocated memories.
    free(h_vector);
    free(h_vectorSum);
    cudaFree(d_vector);
    cudaFree(d_vectorSum);

    printf("Success!");
    return 0;
}

Time taken is 0.000000 seconds.
h_vectorSum[0] == 65536
Success!


# Chapter 2: CUDA Parallel Reduction Part 2

## Optimizations

1. Get rid of the wrap divergence.
2. Get rid of the modulo operation.

In [23]:
%%cuda

#include <algorithm>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>

const int GRID_DIM_X = 1 << 8;
const int BLOCK_DIM_X = 1 << 8;

__global__ void reduceSum(int *vector, int *vectorSum) {
    // Step 0: Get the vector index and the thread index.
    int ti = threadIdx.x;
    int vi = blockIdx.x * blockDim.x + threadIdx.x;

    // Step 1: Move elements from the vector to the cache.
    __shared__ int partialSum[BLOCK_DIM_X];
    partialSum[ti] = vector[vi];
    __syncthreads();

    // Step 2: Accumulate all elements.
    for (int si = 1; si < blockDim.x; si *= 2) {
        int index = 2 * si * ti;
        if (index < blockDim.x) {
            partialSum[index] += partialSum[index + si];
        }
        __syncthreads();
    }

    // Step 3: Move the sum to the vector.
    if (ti == 0) {
        vectorSum[blockIdx.x] = partialSum[0];
    }
}

int main() {
    // Step 0: Setup the parameters.
    int numElements = GRID_DIM_X * BLOCK_DIM_X;
    size_t numBytes = sizeof(int) * numElements;

    // Step 1: Initialize both the cpu and gpu memories.
    int *h_vector = (int*) malloc(numBytes);
    int *h_vectorSum = (int*) malloc(numBytes);
    std::fill_n(h_vector, numElements, 1);

    int *d_vector, *d_vectorSum;
    cudaMalloc(&d_vector, numBytes);
    cudaMalloc(&d_vectorSum, numBytes);

    // Step 2: Launch the reduce sum kernel function.
    cudaMemcpy(d_vector, h_vector, numBytes, cudaMemcpyHostToDevice);

    time_t start, end;
    time(&start);
    reduceSum<<<GRID_DIM_X, BLOCK_DIM_X>>>(d_vector, d_vectorSum);
    reduceSum<<<1, BLOCK_DIM_X>>>(d_vectorSum, d_vectorSum);
    time(&end);
    printf("Time taken is %f seconds.\n", difftime(end, start));

    cudaMemcpy(h_vectorSum, d_vectorSum, numBytes, cudaMemcpyDeviceToHost);

    printf("Accumulated result is %d.\n", h_vectorSum[0]);

    // Step 3: Clear allocated memories.
    free(h_vector);
    free(h_vectorSum);
    cudaFree(d_vector);
    cudaFree(d_vectorSum);

    printf("Success!");
    return 0;
}

Time taken is 0.000000 seconds.
Accumulated result is 65536.
Success!
