In [1]:
%%writefile shared_memory.cu

#include <stdio.h>

#define N 16 //Size of the array

// Kernel: per-block sum of d_input, staged through shared memory.
//
// Launch layout: 1-D grid of 1-D blocks. blockDim.x may be any size up to
// 1024 (it does not have to be a power of two) and no dynamic shared memory
// argument is needed.
//
//   d_input  - device array with at least n readable ints
//   d_output - device array with at least gridDim.x ints; element b receives
//              the sum of the inputs handled by block b
//   n        - number of valid elements in d_input; threads whose global
//              index is >= n contribute 0

__global__ void sumWithSharedMemory(int *d_input, int *d_output, int n) {

    // One slot per thread. The buffer is sized for 1024 threads, the per-block
    // limit on current CUDA devices, so threadIdx.x can never index past the
    // end regardless of the block size chosen by the caller.
    const int kMaxThreadsPerBlock = 1024;
    __shared__ int sharedData[kMaxThreadsPerBlock];

    const unsigned int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread stages one element; out-of-range threads pad with 0 so the
    // reduction below needs no special case for a partially filled last block.
    sharedData[tid] = (idx < n) ? d_input[idx] : 0;

    __syncthreads();

    // Tree reduction over the blockDim.x live slots. Each step folds the upper
    // part of the active range onto the lower part; rounding `half` up keeps
    // the middle element when the range has odd length. The loop bounds depend
    // only on blockDim.x, so every thread reaches every barrier.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
        {
            sharedData[tid] += sharedData[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // Slot 0 now holds the sum for this block.
    if (tid == 0)
    {
        d_output[blockIdx.x] = sharedData[0];
    }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
   if (status == cudaSuccess) {
      return true;
   }
   fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
   return false;
}

// Fills an N-element array with 0..N-1, sums it on the GPU one block at a
// time, adds the per-block partial sums on the host and prints the total.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){

   // Launch geometry. The block count is rounded up so the whole array is
   // covered even when N is not a multiple of the block size.
   const int threadsPerBlock = 4;
   const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

   int h_input[N], h_output[numBlocks], finalSum=0;
   int *d_input = NULL, *d_output = NULL;

   // Initialize the input array: element i holds the value i
   for (int i=0; i<N; i++){
      h_input[i] = i;
   }

   // Allocate device memory and copy the input array to the device.
   // The && chain stops at the first failing call.
   bool ok = cudaOk(cudaMalloc((void**)&d_input, N * sizeof(int)), "cudaMalloc d_input")
          && cudaOk(cudaMalloc((void**)&d_output, numBlocks * sizeof(int)), "cudaMalloc d_output")
          && cudaOk(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice),
                    "cudaMemcpy host to device");

   if (ok) {
      // Launch the kernel with enough blocks to cover the array
      sumWithSharedMemory<<<numBlocks, threadsPerBlock>>>(d_input, d_output, N);

      // A launch does not return a status itself: cudaGetLastError reports
      // launch failures, and the blocking copy below reports errors raised
      // while the kernel was running.
      ok = cudaOk(cudaGetLastError(), "kernel launch")
        && cudaOk(cudaMemcpy(h_output, d_output, numBlocks * sizeof(int), cudaMemcpyDeviceToHost),
                  "cudaMemcpy device to host");
   }

   if (ok) {
      // Final reduction on the host to get the total sum
      for (int i=0; i<numBlocks; i++){
         finalSum += h_output[i];
      }

      printf("Total sum of array elements: %d\n", finalSum);
   }

   // Pointers that were never allocated are still NULL here, and cudaFree
   // performs no operation on a null pointer, so this is safe on every path.
   cudaFree(d_input);
   cudaFree(d_output);

   return ok ? 0 : 1;
}

Writing shared_memory.cu


In [2]:
!nvcc -o exe shared_memory.cu
!./exe

Total sum of array elements: 120
