Adding two arrays A and B element-wise to get C using CUDA

In [None]:
%%writefile array2.cu

#include <stdio.h>
#include <cuda.h>

/*
 * Element-wise sum of two int arrays: c[i] = a[i] + b[i] for i in [0, size).
 *
 * Launch layout: 1D grid of 1D blocks; each thread handles the single element
 * whose index equals its flat global thread id. The launch must supply at
 * least `size` threads in total for every element to be written; elements
 * with no matching thread are left untouched.
 *
 * c    - output array, at least `size` ints, accessible from device code
 * a, b - input arrays, at least `size` ints each, accessible from device code
 * size - number of elements to process
 */
__global__ void add_arrays(int *c, const int *a, const int *b, int size){
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // A launch may contain more threads than there are elements; the surplus
    // threads must do nothing rather than read/write past the end of the arrays.
    if (i < size) {
        c[i] = a[i] + b[i];
    }
}


/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool cuda_ok(cudaError_t status, const char *what)
{
   if (status == cudaSuccess) {
       return true;
   }
   fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
   return false;
}

/*
 * Demo driver: copies two 5-element host arrays to the device, launches
 * add_arrays on them, copies the sums back and prints one line per element.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
   const int size = 5;
   const size_t bytes = size * sizeof(int);
   int a[size] = {1,2,3,4,5};
   int b[size] = {1,2,3,4,5};
   int c[size] = {0};   // host-side result buffer (plain stack array)

   // Device buffers start as NULL so the cleanup below is valid even when an
   // allocation fails part-way through (cudaFree on a null pointer is a no-op).
   int *d_a = NULL, *d_b = NULL, *d_c = NULL;

   // Allocate device memory for C, A and B, then upload the inputs.
   // The && chain stops at the first failing call.
   bool ok = cuda_ok(cudaMalloc((void **)&d_c, bytes), "cudaMalloc(d_c)")
          && cuda_ok(cudaMalloc((void **)&d_a, bytes), "cudaMalloc(d_a)")
          && cuda_ok(cudaMalloc((void **)&d_b, bytes), "cudaMalloc(d_b)")
          && cuda_ok(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device")
          && cuda_ok(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

   if (ok) {
       // Launch variants, all with a single block:
       //   add_arrays<<<1,5>>>(d_c, d_a, d_b, size);  // one thread per element
       //   add_arrays<<<1,3>>>(d_c, d_a, d_b, size);  // fewer threads than elements: no error is
       //                                              // reported, but c[3] and c[4] are never written
       //
       // Active variant: 8 threads for 5 elements. Threads 5..7 have no element
       // to process, so the kernel has to compare its index against `size`;
       // without that check they would access memory past the end of the
       // 5-element buffers, which is undefined behaviour even when no error
       // is reported.
       add_arrays<<<1,8>>>(d_c, d_a, d_b, size);

       // A launch returns no status itself: cudaGetLastError picks up launch
       // failures, and cudaDeviceSynchronize waits for the kernel and reports
       // errors raised while it was running.
       ok = cuda_ok(cudaGetLastError(), "kernel launch")
         && cuda_ok(cudaDeviceSynchronize(), "kernel execution")
         && cuda_ok(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");
   }

   // Print the result only when every step above succeeded.
   if (ok) {
       for(int i = 0; i < size; i++)
       {
           printf("%d + %d = %d \n", a[i], b[i], c[i]);
       }
   }

   // Free the device buffers. `c` is host memory and must not be passed to cudaFree.
   cudaFree(d_a);
   cudaFree(d_b);
   cudaFree(d_c);

   return ok ? 0 : 1;
}



Writing array2.cu


In [None]:
!nvcc -o exe array2.cu
!./exe

1 + 1 = 2 
2 + 2 = 4 
3 + 3 = 6 
4 + 4 = 8 
5 + 5 = 10 
