In [None]:
%%writefile reduction_operations.cu
#include <stdio.h>
#include <cuda.h>
#include <limits.h>

// CUDA kernel for Sum reduction.
//
// Every thread loads at most one element (input[blockIdx.x * blockDim.x +
// threadIdx.x]); the block then combines its elements in shared memory and
// thread 0 stores the block's partial sum in output[blockIdx.x].
//
// Launch requirements:
//   - 1D grid and 1D block, with blockDim.x <= 1024 (capacity of the static
//     shared buffer below). The block size does NOT have to be a power of two.
//   - output must have room for gridDim.x ints.
//
// Parameters:
//   input  - device-accessible array holding at least n ints
//   output - device-accessible array receiving one partial sum per block
//   n      - number of valid elements in input; threads past n contribute 0
__global__ void sumReduction(int *input, int *output, int n) {
    __shared__ int shared[1024];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Load data to shared memory; out-of-range threads use the additive identity.
    shared[tid] = (idx < n) ? input[idx] : 0;
    __syncthreads();

    // Tree reduction over the first `active` slots. `half` is rounded up so
    // that, when `active` is odd, the unpaired middle slot simply survives to
    // the next round instead of being dropped (a plain `s = blockDim.x / 2`
    // halving loop loses elements for block sizes that are not powers of two).
    for (int active = blockDim.x; active > 1; ) {
        int half = (active + 1) / 2;
        if (tid < active / 2) {
            // Reads come from [half, active), writes go to [0, active / 2):
            // the ranges never overlap, so one barrier per round is enough.
            shared[tid] += shared[tid + half];
        }
        // Loop bounds depend only on blockDim.x, so every thread of the block
        // reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // Write result for this block to global memory
    if (tid == 0) output[blockIdx.x] = shared[0];
}

// CUDA kernel for Min reduction.
//
// Every thread loads at most one element (input[blockIdx.x * blockDim.x +
// threadIdx.x]); the block then reduces its elements in shared memory and
// thread 0 stores the block's minimum in output[blockIdx.x].
//
// Launch requirements:
//   - 1D grid and 1D block, with blockDim.x <= 1024 (capacity of the static
//     shared buffer below). The block size does NOT have to be a power of two.
//   - output must have room for gridDim.x ints.
//
// Parameters:
//   input  - device-accessible array holding at least n ints
//   output - device-accessible array receiving one minimum per block
//   n      - number of valid elements in input; threads past n contribute
//            INT_MAX, so a block with no valid element reports INT_MAX
__global__ void minReduction(int *input, int *output, int n) {
    __shared__ int shared[1024];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Out-of-range threads use INT_MAX, the identity element for min.
    shared[tid] = (idx < n) ? input[idx] : INT_MAX;
    __syncthreads();

    // Tree reduction over the first `active` slots. `half` is rounded up so
    // that, when `active` is odd, the unpaired middle slot survives to the
    // next round instead of being dropped (a plain `s = blockDim.x / 2`
    // halving loop loses elements for block sizes that are not powers of two).
    for (int active = blockDim.x; active > 1; ) {
        int half = (active + 1) / 2;
        if (tid < active / 2) {
            // Reads come from [half, active), writes go to [0, active / 2):
            // the ranges never overlap, so one barrier per round is enough.
            if (shared[tid + half] < shared[tid])
                shared[tid] = shared[tid + half];
        }
        // Loop bounds depend only on blockDim.x, so every thread of the block
        // reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // Thread 0 publishes the block's minimum.
    if (tid == 0) output[blockIdx.x] = shared[0];
}

// CUDA kernel for Max reduction.
//
// Every thread loads at most one element (input[blockIdx.x * blockDim.x +
// threadIdx.x]); the block then reduces its elements in shared memory and
// thread 0 stores the block's maximum in output[blockIdx.x].
//
// Launch requirements:
//   - 1D grid and 1D block, with blockDim.x <= 1024 (capacity of the static
//     shared buffer below). The block size does NOT have to be a power of two.
//   - output must have room for gridDim.x ints.
//
// Parameters:
//   input  - device-accessible array holding at least n ints
//   output - device-accessible array receiving one maximum per block
//   n      - number of valid elements in input; threads past n contribute
//            INT_MIN, so a block with no valid element reports INT_MIN
__global__ void maxReduction(int *input, int *output, int n) {
    __shared__ int shared[1024];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Out-of-range threads use INT_MIN, the identity element for max.
    shared[tid] = (idx < n) ? input[idx] : INT_MIN;
    __syncthreads();

    // Tree reduction over the first `active` slots. `half` is rounded up so
    // that, when `active` is odd, the unpaired middle slot survives to the
    // next round instead of being dropped (a plain `s = blockDim.x / 2`
    // halving loop loses elements for block sizes that are not powers of two).
    for (int active = blockDim.x; active > 1; ) {
        int half = (active + 1) / 2;
        if (tid < active / 2) {
            // Reads come from [half, active), writes go to [0, active / 2):
            // the ranges never overlap, so one barrier per round is enough.
            if (shared[tid + half] > shared[tid])
                shared[tid] = shared[tid + half];
        }
        // Loop bounds depend only on blockDim.x, so every thread of the block
        // reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // Thread 0 publishes the block's maximum.
    if (tid == 0) output[blockIdx.x] = shared[0];
}

// Signature shared by the three reduction kernels in this file.
typedef void (*ReductionKernel)(int *, int *, int);

// Returns true when `err` is cudaSuccess; otherwise reports the failing step
// (`what`) and the runtime's error string on stderr and returns false.
static bool cudaSucceeded(cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Launches `kernel` as a single block of `threads` threads over the n ints in
// d_input, then copies the block's result (d_output[0]) into *h_result.
// Returns false if the launch or the copy-back fails.
static bool runReduction(ReductionKernel kernel, const char *label,
                         int *d_input, int *d_output, int n, int threads,
                         int *h_result) {
    kernel<<<1, threads>>>(d_input, d_output, n);
    // A kernel launch returns nothing; configuration errors only show up here.
    if (!cudaSucceeded(cudaGetLastError(), label)) return false;
    // The blocking copy waits for the kernel, so execution errors surface here.
    return cudaSucceeded(
        cudaMemcpy(h_result, d_output, sizeof(int), cudaMemcpyDeviceToHost),
        label);
}

// Computes the sum, average, minimum and maximum of the integers 1..N on the
// GPU and prints them. Returns 0 on success and 1 if any CUDA call fails, in
// which case no results are printed.
int main() {
    const int N = 1024;
    // One block must cover the whole input, because only output[0] is read back.
    const int threadsPerBlock = 1024;
    int h_input[N];
    // NULL-initialised so the cudaFree calls below are safe after an early failure.
    int *d_input = NULL, *d_output = NULL;
    int h_sum = 0, h_min = 0, h_max = 0;

    // Initialize input data
    for (int i = 0; i < N; i++)
        h_input[i] = i + 1;

    // Short-circuit chain: the first failing step stops everything after it.
    bool ok =
        cudaSucceeded(cudaMalloc((void**)&d_input, N * sizeof(int)),
                      "cudaMalloc(d_input)") &&
        cudaSucceeded(cudaMalloc((void**)&d_output, sizeof(int)),
                      "cudaMalloc(d_output)") &&
        cudaSucceeded(cudaMemcpy(d_input, h_input, N * sizeof(int),
                                 cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)") &&
        runReduction(sumReduction, "sumReduction", d_input, d_output, N,
                     threadsPerBlock, &h_sum) &&
        runReduction(minReduction, "minReduction", d_input, d_output, N,
                     threadsPerBlock, &h_min) &&
        runReduction(maxReduction, "maxReduction", d_input, d_output, N,
                     threadsPerBlock, &h_max);

    // Display Results (only when every step succeeded, never stale values)
    if (ok) {
        printf("Sum      : %d\n", h_sum);
        printf("Average  : %.2f\n", (float)h_sum / N);
        printf("Minimum  : %d\n", h_min);
        printf("Maximum  : %d\n", h_max);
    }

    // Free device memory
    cudaFree(d_input);
    cudaFree(d_output);

    return ok ? 0 : 1;
}


Writing reduction_operations.cu


In [None]:
!nvcc reduction_operations.cu -o reduction
!./reduction

Sum      : 2189552\nAverage  : 2138.23\nMinimum  : 0\nMaximum  : 1024\n