In [2]:
%%writefile prog.cu

#include <stdio.h>

#define N 16


/**
 * Sequentially adds the first `blockSize` integers of `sharedData`.
 *
 * The routine is compiled for both host and device, so the same code can be
 * called from a kernel or used on the CPU to compute a reference value.
 * It performs no synchronization itself: when the array was filled by other
 * threads, the caller must synchronize before calling (the kernel in this
 * file issues __syncthreads() first).
 *
 * @param sharedData  array with at least `blockSize` readable elements
 * @param blockSize   number of leading elements to add; a value <= 0 yields 0
 * @return            sharedData[0] + ... + sharedData[blockSize - 1]
 */
__host__ __device__ int blockReduceSum(const int *sharedData, int blockSize)
{
  int blockSum = 0;
  for (int i = 0; i < blockSize; i++)
  {
    blockSum += sharedData[i];
  }
  return blockSum;
}

/**
 * Writes one partial sum per block: d_output[blockIdx.x] receives the sum of
 * the input elements covered by that block.
 *
 * Launch layout: indexing uses only the x dimension (blockIdx.x, blockDim.x,
 * threadIdx.x), so a 1D grid of 1D blocks is expected. Each thread stages one
 * element in shared memory; thread 0 of the block then adds them up.
 *
 * @param d_input   device array with at least `n` readable ints
 * @param d_output  device array with at least gridDim.x writable ints
 * @param n         number of valid elements in d_input; threads whose global
 *                  index is >= n contribute 0
 */
__global__ void sumWithSharedMemory(int *d_input, int *d_output, int n) {

    // The staging buffer is indexed by threadIdx.x and summed over blockDim.x
    // entries, so it must hold at least blockDim.x ints. A fixed size of 4
    // overflowed for any larger block; 1024 is the largest x-dimension a
    // block can have, so every 1D launch configuration fits (4 KB per block).
    const int kMaxBlockDimX = 1024;
    __shared__ int sharedData[kMaxBlockDimX];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads store 0 so the tail block still sums a full,
    // initialized buffer.
    sharedData[threadIdx.x] = (idx < n) ? d_input[idx] : 0;

    // All threads reach this barrier (it is outside any branch); it makes the
    // staged values visible to thread 0 before it reads them.
    __syncthreads();

    if (threadIdx.x == 0)
    {
        d_output[blockIdx.x] = blockReduceSum(sharedData, blockDim.x);
    }
}


/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param err   status returned by the CUDA runtime call
 * @param what  short description of the call, used in the message
 * @return      true when `err` is cudaSuccess, false otherwise
 */
static bool checkCuda(cudaError_t err, const char *what)
{
   if (err != cudaSuccess) {
      fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
      return false;
   }
   return true;
}

/**
 * Fills an N-element array with 0..N-1, sums it on the GPU in per-block
 * partial sums, adds the partials on the host and prints the total.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel launch failed
 */
int main(){

   // threadsPerBlock must not exceed the capacity of the kernel's shared
   // staging buffer. The block count is rounded up so that an N that is not
   // a multiple of the block size is still fully covered.
   const int threadsPerBlock = 4;
   const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

   int h_input[N], h_output[numBlocks], finalSum=0;
   int *d_input = NULL, *d_output = NULL;   // NULL so cleanup is safe on early failure
   int status = 1;

   for (int i=0; i<N; i++){
      h_input[i] = i;
   }

   if (checkCuda(cudaMalloc((void**)&d_input, N * sizeof(int)), "cudaMalloc(d_input)") &&
       checkCuda(cudaMalloc((void**)&d_output, numBlocks * sizeof(int)), "cudaMalloc(d_output)") &&
       checkCuda(cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice),
                 "cudaMemcpy(host to device)"))
   {
      sumWithSharedMemory<<<numBlocks, threadsPerBlock>>>(d_input, d_output, N);

      // A launch returns no status itself: cudaGetLastError() reports bad
      // launch configurations, and the blocking device-to-host copy below
      // reports faults raised while the kernel was running.
      if (checkCuda(cudaGetLastError(), "kernel launch") &&
          checkCuda(cudaMemcpy(h_output, d_output, numBlocks * sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)"))
      {
         for (int i=0; i<numBlocks; i++){
            finalSum += h_output[i];
         }

         printf("Total sum of array elements: %d\n", finalSum);
         status = 0;
      }
   }

   // Runs on every path; freeing a NULL pointer is a no-op.
   cudaFree(d_input);
   cudaFree(d_output);

   return status;
}

Writing prog.cu


In [3]:
!nvcc -o exe prog.cu
!./exe

Total sum of array elements: 120
