Reference: https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda

## Hillis-Steele Prefix Scan Algorithm

### Initial State

Before any operations, the output array is initialized to:

| Index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|-------|---|---|---|---|---|---|---|---|
| Value | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |

### Step 1: s = 1

Each element at index **i ≥ 1** adds the value of its immediate left neighbor (**i - 1**); index 0 has no left neighbor and is left unchanged:

| Index  | 0      | 1      | 2      | 3      | 4      | 5      | 6      | 7      |
|--------|--------|--------|--------|--------|--------|--------|--------|--------|
| Operation | -      | 0 + 1  | 1 + 2  | 2 + 3  | 3 + 4  | 4 + 5  | 5 + 6  | 6 + 7  |
| Result | 0      | 1      | 3      | 5      | 7      | 9      | 11     | 13     |


### Step 2: s = 2

Each element at index **i ≥ 2** adds the value two positions to its left (**i - 2**), taken from the result of Step 1; indices 0 and 1 are left unchanged:

| Index  | 0 | 1 | 2        | 3        | 4        | 5        | 6        | 7        |
|--------|---|---|----------|----------|----------|----------|----------|----------|
| Operation | - | - | 0 + (1 + 2) | (0 + 1) + (2 + 3) | (1 + 2) + (3 + 4) | (2 + 3) + (4 + 5) | (3 + 4) + (5 + 6) | (4 + 5) + (6 + 7) |
| Result | 0 | 1 | 3        | 6        | 10       | 14       | 18       | 22       |

### Step 3: s = 4

Each element at index **i ≥ 4** adds the value four positions to its left (**i - 4**), taken from the result of Step 2; indices 0 to 3 are left unchanged:

| Index  | 0 | 1 | 2 | 3 | 4                | 5                | 6                | 7                |
|--------|---|---|---|---|------------------|------------------|------------------|------------------|
| Operation | - | - | - | - | 0 + (1 + 2 + 3 + 4) | (0 + 1) + (2 + 3 + 4 + 5) | (0 + 1 + 2) + (3 + 4 + 5 + 6)| (0 + 1 + 2 + 3) + (4 + 5 + 6 + 7) |
| Result | 0 | 1 | 3 | 6 | 10               | 15               | 21               | 28               |


In [None]:
%%writefile prefix_scan.cu
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <type_traits>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>

// Element type of the host arrays and device vectors created in main().
#define TYPE int
// Number of elements scanned in main(); also the bound checked by warm_up().
#define N 2048
// Threads per block used for both kernel launches in main().
#define BLOCK_SIZE 1024
// NOTE(review): not referenced by any code in the visible source, other than
// the WARP_SIZE macro below — presumably reserved for a later kernel variant;
// confirm before removing.
#define NUM_PER_THREAD 8
// NOTE(review): despite the name this is not the hardware warp width (32);
// with the values above it evaluates to 1024 / 8 / 32 = 4, which looks like a
// warp *count*. Not referenced in the visible source — confirm intent.
#define WARP_SIZE (BLOCK_SIZE / NUM_PER_THREAD / 32)

// Throw-away kernel that main() launches once before the scan kernel.
// It takes no arguments and writes nothing to memory: threads whose global
// index falls below N perform a single float addition whose result is
// discarded. Presumably it exists to absorb one-time launch overhead before
// the real kernel runs (the name suggests so; nothing is timed in this file).
__global__ void  warm_up()
{
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the logical problem size have nothing to do.
    if (global_id >= N)
    {
        return;
    }

    const float lhs = 0.0f;
    const float rhs = 1.0f;
    float sum = lhs + rhs;  // result intentionally unused
}

// Sequential (host-side) inclusive prefix sum used as the reference result.
//
// After the call, output[i] == input[0] + input[1] + ... + input[i] for every
// 0 <= i < n.
//
// Parameters:
//   input  - array of at least n readable elements.
//   output - array of at least n writable elements.
//   n      - number of elements to scan. When n <= 0 the function returns
//            without touching either array.
//
// Only instantiable for arithmetic element types (enforced via enable_if).
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void prefix_scan(T* input, T* output, int n)
{
    // Guard the empty case: seeding the scan from element 0 unconditionally
    // would read input[0] and write output[0] even though neither exists.
    if (n <= 0)
    {
        return;
    }

    output[0] = input[0];
    for (int i = 1; i < n; ++i)
    {
        output[i] = output[i - 1] + input[i];
    }
}

// In-place inclusive Hillis-Steele scan over the first blockDim.x elements of
// a shared-memory array. Must be called by every thread of the block (it
// contains __syncthreads()). On return, sdata[t] holds the sum of the original
// sdata[0..t], and all writes are visible to the whole block.
template <typename T>
__device__ __forceinline__ void prefix_scan_0_block_scan(T* sdata, int tid)
{
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        T temp = 0;
        if (tid >= s)
        {
            temp = sdata[tid - s];
        }
        // Every thread must finish reading this step's operands before any
        // thread overwrites its slot ...
        __syncthreads();
        sdata[tid] += temp;
        // ... and every write must land before the next step (or the caller)
        // reads a neighbour's slot. Without this second barrier a fast thread
        // can read sdata[tid - 2s] before its owner has updated it.
        __syncthreads();
    }
}

// Inclusive prefix sum of input[0..n) into output[0..n).
//
// Launch requirements:
//   - 1-D grid and 1-D blocks with gridDim.x * blockDim.x >= n.
//   - Dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//
// How it works:
//   1. Each block loads its tile into shared memory (zero-padded past n) and
//      runs a Hillis-Steele inclusive scan on it.
//   2. Each block with blockIdx.x > 0 computes its own offset, i.e. the sum of
//      all input elements that precede its tile, directly from `input`.
//      Because `input` is never written by this kernel, this needs no
//      communication between blocks. (Blocks cannot safely read each other's
//      `output` here: __threadfence() orders memory operations but does not
//      wait for other blocks, and there is no grid-wide barrier in a regular
//      launch.)
//   3. Each thread writes its scanned value plus the block offset.
//
// Cost note: step 2 makes block b read b * blockDim.x input elements, so the
// total traffic grows with the square of the grid size. That is fine for the
// small grids used here; for large inputs prefer a multi-pass scan or
// thrust::inclusive_scan.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void prefix_scan_0(T* input, T* output, int n)
{
    // Declared as raw bytes so that instantiating the template for several
    // element types does not produce conflicting declarations of the same
    // extern shared array.
    extern __shared__ __align__(16) unsigned char prefix_scan_0_smem[];
    T* sdata = reinterpret_cast<T*>(prefix_scan_0_smem);

    const int tid = threadIdx.x;
    const int block_start = blockIdx.x * blockDim.x;
    const int idx = block_start + tid;

    // Stage this block's tile; out-of-range lanes contribute the additive
    // identity so they do not disturb the scan.
    sdata[tid] = (idx < n) ? input[idx] : T(0);
    __syncthreads();

    prefix_scan_0_block_scan(sdata, tid);
    const T local = sdata[tid];

    T offset = 0;
    // blockIdx.x is uniform across the block, so the barriers inside this
    // branch are reached by all threads of the block or by none.
    if (blockIdx.x > 0)
    {
        // Clamp so that surplus blocks (block_start >= n) never read past the
        // end of the input.
        const int prefix_end = (block_start < n) ? block_start : n;

        T partial = 0;
        for (int j = tid; j < prefix_end; j += blockDim.x)
        {
            partial += input[j];
        }

        // Safe to reuse the tile: each thread overwrites only its own slot,
        // and the scan above ended with a barrier after the last
        // neighbour read.
        sdata[tid] = partial;
        __syncthreads();

        // Scanning the per-thread partials leaves their total in the last
        // slot; this works for any block size, not just powers of two.
        prefix_scan_0_block_scan(sdata, tid);
        offset = sdata[blockDim.x - 1];
    }

    if (idx < n)
    {
        output[idx] = local + offset;
    }
}




// Driver: builds an all-ones input of N elements, computes the reference
// inclusive scan on the host, runs prefix_scan_0 on the device, and compares
// the two element by element.
//
// Returns EXIT_SUCCESS when every element matches; EXIT_FAILURE when a CUDA
// launch/execution error is reported or the results differ.
int main()
{
    TYPE h_input[N];
    for (int i = 0; i < N; ++i)
    {
        h_input[i] = 1;
    }

    // Host reference result.
    TYPE h_output[N];
    prefix_scan(h_input, h_output, N);

    thrust::device_vector<TYPE> d_input(h_input, h_input + N);
    thrust::device_vector<TYPE> d_output(N);

    const int threads_per_block = BLOCK_SIZE;
    const int no_of_blocks = (N + threads_per_block - 1) / threads_per_block;
    // One shared-memory slot per thread, as prefix_scan_0 indexes sdata[tid].
    const size_t shared_bytes = threads_per_block * sizeof(TYPE);

    // Kernel launches do not return a status; launch-configuration errors are
    // only visible through cudaGetLastError(), and faults during execution
    // surface at the next synchronising call.
    auto report_failure = [](const char* what, cudaError_t status) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    };

    warm_up<<<no_of_blocks, threads_per_block>>>();
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        report_failure("warm_up launch", status);
        return EXIT_FAILURE;
    }

    prefix_scan_0<<<no_of_blocks, threads_per_block, shared_bytes>>>(thrust::raw_pointer_cast(d_input.data()), thrust::raw_pointer_cast(d_output.data()), int(N));
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        report_failure("prefix_scan_0 launch", status);
        return EXIT_FAILURE;
    }

    // Wait for completion so that an execution error is reported here rather
    // than showing up as a confusing mismatch below.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        report_failure("prefix_scan_0 execution", status);
        return EXIT_FAILURE;
    }

    thrust::host_vector<TYPE> h_result = d_output;

    bool match = true;
    for (int i = 0; i < N; ++i)
    {
        if (h_output[i] != h_result[i])
        {
            // Print the first differing pair: expected (host) then actual (device).
            std::cout << h_output[i] << " " << h_result[i] << std::endl;
            match = false;
            break;
        }
    }

    if (match)
        std::cout << "Results match!" << std::endl;
    else
        std::cout << "Results do not match!" << std::endl;

    return match ? EXIT_SUCCESS : EXIT_FAILURE;
}

In [None]:
!nvcc -o prefix_scan -lineinfo prefix_scan.cu

In [None]:
!./prefix_scan