<a href="https://colab.research.google.com/github/Vishal13252/100-Days-GPU-Challenge/blob/master/Day_13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile parallel_reduction_shared_memory.cu

#include <iostream>
#include <cuda.h>

/**
 * Per-block sum reduction through dynamic shared memory.
 *
 * Each block sums the slice of `input` covered by its threads and writes a
 * single partial sum to `output[blockIdx.x]`; combining those partial sums
 * is left to the caller.
 *
 * Launch requirements:
 *   - 1-D grid and 1-D blocks (only the .x components are used).
 *   - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes
 *     (third launch parameter).
 *   - Any blockDim.x >= 1 is handled; it does not need to be a power of two.
 *
 * @param input   device array holding at least `n` ints
 * @param output  device array holding at least gridDim.x ints
 * @param n       number of valid elements in `input`; threads whose global
 *                index is >= n contribute 0
 */
__global__ void reduce_sum_shared(int *input, int *output, int n) {
    extern __shared__ int sdata[];

    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage one element per thread; out-of-range threads stage the additive
    // identity so the tail block needs no special handling later.
    sdata[tid] = (i < n) ? input[i] : 0;
    __syncthreads();

    // Tree reduction over the first `active` slots. Rounding the half UP keeps
    // the middle element of an odd-sized range in place for the next pass
    // instead of dropping it, which is what a plain `blockDim.x / 2` start
    // does for block sizes that are not a power of two. For power-of-two
    // sizes this performs exactly the same additions as the classic loop.
    for (int active = blockDim.x; active > 1;) {
        int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread in the
        // block reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // Thread 0 publishes the block's partial sum.
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

/**
 * Host driver: fills an array of N ones, reduces it on the GPU into one
 * partial sum per block, finishes the sum on the host and prints it.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
 * reports an error (a diagnostic is written to stderr).
 */
int main() {
    const int N = 1024;
    const int threadsPerBlock = 256;
    // Ceil-division so the grid still covers every element if N is changed to
    // a value that is not a multiple of the block size.
    const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    int h_input[N], h_output[numBlocks];

    for (int i = 0; i < N; i++) h_input[i] = 1;

    // Reports a failed CUDA call on stderr and tells the caller whether to
    // keep going.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) return true;
        std::cerr << "CUDA error in " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Initialised to nullptr so the cleanup below is safe on every path.
    int *d_input = nullptr, *d_output = nullptr;

    bool success =
        ok(cudaMalloc(&d_input, N * sizeof(int)), "cudaMalloc(d_input)") &&
        ok(cudaMalloc(&d_output, numBlocks * sizeof(int)), "cudaMalloc(d_output)") &&
        ok(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice),
           "cudaMemcpy(host -> device)");

    if (success) {
        reduce_sum_shared<<<numBlocks, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(
            d_input, d_output, N);

        // A launch does not return a status itself; configuration errors are
        // picked up by cudaGetLastError(), and errors raised while the kernel
        // runs are reported by the blocking copy that follows.
        success =
            ok(cudaGetLastError(), "reduce_sum_shared launch") &&
            ok(cudaMemcpy(h_output, d_output, numBlocks * sizeof(int), cudaMemcpyDeviceToHost),
               "cudaMemcpy(device -> host)");
    }

    if (success) {
        // Final sum from blocks
        int total = 0;
        for (int i = 0; i < numBlocks; i++) total += h_output[i];

        std::cout << "Total Sum: " << total << std::endl;
    }

    // cudaFree performs no operation on a null pointer, so this is safe even
    // when an allocation failed.
    cudaFree(d_input);
    cudaFree(d_output);

    return success ? 0 : 1;
}


Writing parallel_reduction_shared_memory.cu


In [2]:
# Compile for compute capability 7.5 (sm_75); change -gencode if your GPU differs
!nvcc parallel_reduction_shared_memory.cu -o parallel_reduction_shared_memory -gencode arch=compute_75,code=sm_75

# Run the executable
!./parallel_reduction_shared_memory

Total Sum: 1024
