In [None]:
%%writefile reduction.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <type_traits>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#define TYPE int
#define N 2029
#define BLOCK_SIZE 1024
#define NUM_PER_THREAD 8

// Trivial kernel: every thread whose global 1D index is below N performs one
// throwaway float addition. Nothing is read from or written to memory.
__global__ void warm_up()
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the N-element range have nothing to do.
    if (gid >= N)
    {
        return;
    }

    const float lhs = 0.0f;
    const float rhs = 1.0f;
    float sum = lhs + rhs; // result is intentionally discarded
}

// Block-level sum reduction, interleaved addressing selected with a modulo test.
//
// Every block copies up to blockDim.x elements of `input` into dynamic shared
// memory (threads whose global index is >= size contribute 0), reduces them in
// place, and thread 0 stores the block's partial sum in output[blockIdx.x].
//
// Launch requirements:
//   * 1D grid and 1D blocks; any block size works (power of two not required).
//   * dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//   * `output` must have room for gridDim.x elements.
//
// Parameters:
//   input  - device-accessible array with at least `size` elements
//   output - device-accessible array receiving one partial sum per block
//   size   - number of valid elements in `input` (typed T, as in the original
//            signature)
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void reduce_0(T *input, T *output, T size)
{
    // Declaring `extern __shared__ T sdata[]` inside a template produces
    // conflicting declarations once the kernel is instantiated for a second
    // type, so the buffer is declared as raw bytes and viewed as T.
    extern __shared__ __align__(sizeof(T)) unsigned char reduce_0_smem[];
    T *sdata = reinterpret_cast<T *>(reduce_0_smem);

    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Pad the tail of the last block with zeros so it does not affect the sum.
    sdata[tid] = (index < size) ? input[index] : T(0);
    __syncthreads();

    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        // The `tid + s < blockDim.x` guard prevents reading past the shared
        // buffer when blockDim.x is not a power of two; the skipped partial
        // sum stays in place and is picked up by a later, wider step.
        if (tid % (2 * s) == 0 && tid + s < blockDim.x)
        {
            sdata[tid] += sdata[tid + s];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
    }

    if (tid == 0)
    {
        output[blockIdx.x] = sdata[0];
    }
}

// Block-level sum reduction, interleaved addressing with strided indexing
// (consecutive threads own the active slots, avoiding the modulo test).
//
// Every block copies up to blockDim.x elements of `input` into dynamic shared
// memory (threads whose global index is >= size contribute 0), reduces them in
// place, and thread 0 stores the block's partial sum in output[blockIdx.x].
//
// Launch requirements:
//   * 1D grid and 1D blocks; any block size works (power of two not required).
//   * dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//   * `output` must have room for gridDim.x elements.
//
// Parameters:
//   input  - device-accessible array with at least `size` elements
//   output - device-accessible array receiving one partial sum per block
//   size   - number of valid elements in `input` (typed T, as in the original
//            signature)
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void reduce_1(T *input, T *output, T size)
{
    // Declaring `extern __shared__ T sdata[]` inside a template produces
    // conflicting declarations once the kernel is instantiated for a second
    // type, so the buffer is declared as raw bytes and viewed as T.
    extern __shared__ __align__(sizeof(T)) unsigned char reduce_1_smem[];
    T *sdata = reinterpret_cast<T *>(reduce_1_smem);

    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Pad the tail of the last block with zeros so it does not affect the sum.
    sdata[tid] = (index < size) ? input[index] : T(0);
    __syncthreads();

    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        // Slot owned by this thread at the current stride.
        const unsigned int left = 2 * s * tid;

        // Checking `left + s` (not just `left`) keeps the right-hand read
        // inside the shared buffer when blockDim.x is not a power of two.
        if (left + s < blockDim.x)
        {
            sdata[left] += sdata[left + s];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
    }

    if (tid == 0)
    {
        output[blockIdx.x] = sdata[0];
    }
}

// Block-level sum reduction with sequential addressing: at each step the lower
// part of the active range accumulates the upper part.
//
// Every block copies up to blockDim.x elements of `input` into dynamic shared
// memory (threads whose global index is >= size contribute 0), reduces them in
// place, and thread 0 stores the block's partial sum in output[blockIdx.x].
//
// Launch requirements:
//   * 1D grid and 1D blocks; any block size works (power of two not required).
//   * dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//   * `output` must have room for gridDim.x elements.
//
// Parameters:
//   input  - device-accessible array with at least `size` elements
//   output - device-accessible array receiving one partial sum per block
//   size   - number of valid elements in `input` (typed T, as in the original
//            signature)
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void reduce_2(T *input, T *output, T size)
{
    // Declaring `extern __shared__ T sdata[]` inside a template produces
    // conflicting declarations once the kernel is instantiated for a second
    // type, so the buffer is declared as raw bytes and viewed as T.
    extern __shared__ __align__(sizeof(T)) unsigned char reduce_2_smem[];
    T *sdata = reinterpret_cast<T *>(reduce_2_smem);

    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Pad the tail of the last block with zeros so it does not affect the sum.
    sdata[tid] = (index < size) ? input[index] : T(0);
    __syncthreads();

    // `active` is the number of leading slots that still hold partial sums.
    // Rounding the fold offset up keeps the middle element of an odd-sized
    // range alive instead of dropping it; for power-of-two block sizes the
    // sequence of additions is the same as plain halving.
    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int offset = (active + 1) / 2;
        if (tid < active / 2)
        {
            sdata[tid] += sdata[tid + offset];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = offset;
    }

    if (tid == 0)
    {
        output[blockIdx.x] = sdata[0];
    }
}

// Block-level sum reduction with sequential addressing where each thread adds
// two input elements while loading, so one block covers 2 * blockDim.x
// consecutive elements of `input`.
//
// Thread 0 of every block stores the block's partial sum in
// output[blockIdx.x].
//
// Launch requirements:
//   * 1D grid and 1D blocks; the grid must cover `size` elements at
//     2 * blockDim.x elements per block.
//   * any block size works (power of two not required).
//   * dynamic shared memory of at least blockDim.x * sizeof(T) bytes.
//   * `output` must have room for gridDim.x elements.
//
// Parameters:
//   input  - device-accessible array with at least `size` elements
//   output - device-accessible array receiving one partial sum per block
//   size   - number of valid elements in `input` (typed T, as in the original
//            signature)
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void reduce_3(T *input, T *output, T size)
{
    // Declaring `extern __shared__ T sdata[]` inside a template produces
    // conflicting declarations once the kernel is instantiated for a second
    // type, so the buffer is declared as raw bytes and viewed as T.
    extern __shared__ __align__(sizeof(T)) unsigned char reduce_3_smem[];
    T *sdata = reinterpret_cast<T *>(reduce_3_smem);

    const unsigned int tid = threadIdx.x;
    const int index = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

    if (index + blockDim.x < size)
    {
        // Both elements assigned to this thread are in range.
        sdata[tid] = input[index] + input[index + blockDim.x];
    }
    else if (index < size)
    {
        // Only the first element is in range.
        sdata[tid] = input[index];
    }
    else
    {
        // Past the end of the input: contribute the additive identity.
        sdata[tid] = 0;
    }
    __syncthreads();

    // `active` is the number of leading slots that still hold partial sums.
    // Rounding the fold offset up keeps the middle element of an odd-sized
    // range alive instead of dropping it; for power-of-two block sizes the
    // sequence of additions is the same as plain halving.
    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int offset = (active + 1) / 2;
        if (tid < active / 2)
        {
            sdata[tid] += sdata[tid + offset];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = offset;
    }

    if (tid == 0)
    {
        output[blockIdx.x] = sdata[0];
    }
}



// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Runs the four reduction kernels over an N-element array of ones and prints
// the sum of the per-block partial results left in the output buffer.
// All kernels write to the same output buffer, so the printed value comes from
// the last launch (reduce_3). Returns EXIT_FAILURE if any launch or the device
// synchronisation reports a CUDA error.
int main()
{
    // Initialise the array to 1, so the expected total is N.
    thrust::host_vector<TYPE> h_input(N, static_cast<TYPE>(1));
    thrust::device_vector<TYPE> d_input = h_input;

    const int threads_per_block = BLOCK_SIZE;
    // Integer ceil-division: one block per BLOCK_SIZE elements.
    const int no_of_blocks = (N + threads_per_block - 1) / threads_per_block;
    const size_t smem_bytes = threads_per_block * sizeof(TYPE);

    // One partial sum per block; sized from the same block count as the grid.
    thrust::device_vector<TYPE> d_output(no_of_blocks, static_cast<TYPE>(0));

    TYPE *in_ptr = thrust::raw_pointer_cast(d_input.data());
    TYPE *out_ptr = thrust::raw_pointer_cast(d_output.data());

    // The kernels take the element count as T; converting explicitly keeps
    // template deduction working when TYPE is not int.
    const TYPE element_count = static_cast<TYPE>(N);

    reduce_0<<<no_of_blocks, threads_per_block, smem_bytes>>>(in_ptr, out_ptr, element_count);
    if (!cuda_ok(cudaGetLastError(), "reduce_0 launch"))
    {
        return EXIT_FAILURE;
    }

    reduce_1<<<no_of_blocks, threads_per_block, smem_bytes>>>(in_ptr, out_ptr, element_count);
    if (!cuda_ok(cudaGetLastError(), "reduce_1 launch"))
    {
        return EXIT_FAILURE;
    }

    reduce_2<<<no_of_blocks, threads_per_block, smem_bytes>>>(in_ptr, out_ptr, element_count);
    if (!cuda_ok(cudaGetLastError(), "reduce_2 launch"))
    {
        return EXIT_FAILURE;
    }

    // reduce_3 consumes two elements per thread, so half the threads (and half
    // the shared memory) cover the same BLOCK_SIZE elements per block.
    const int half_threads = threads_per_block / 2;
    reduce_3<<<no_of_blocks, half_threads, half_threads * sizeof(TYPE)>>>(in_ptr, out_ptr, element_count);
    if (!cuda_ok(cudaGetLastError(), "reduce_3 launch"))
    {
        return EXIT_FAILURE;
    }

    // Surface any asynchronous execution error before reading the results.
    if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution"))
    {
        return EXIT_FAILURE;
    }

    thrust::host_vector<TYPE> h_output = d_output;

    // int final_sum = thrust::reduce(d_input.begin(), d_input.end(), 0, thrust::plus<int>());

    // Combine the per-block partial sums on the host.
    TYPE final_sum = 0;
    for (size_t i = 0; i < h_output.size(); ++i)
    {
        final_sum += h_output[i];
    }

    std::cout << "Sum: " << final_sum << std::endl;

    return 0;
}

In [None]:
!nvcc -o reduction -lineinfo reduction.cu

In [None]:
!./reduction

In [None]:
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/nsight-systems-2024.2.3_2024.2.3.38-1_amd64.deb
!apt update
!apt install ./nsight-systems-2024.2.3_2024.2.3.38-1_amd64.deb
!apt --fix-broken install

In [None]:
# nsys options must come before the application; anything after ./reduction is passed to the program itself.
!nsys profile -o report_nsys_reduction --force-overwrite true ./reduction

In [None]:
!ncu --set full --replay-mode kernel --target-processes all -o report_ncu_reduction -f ./reduction