In [7]:
%%writefile prog.cu

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define M 4
#define N 5

__global__ void addArrays2D(int* d_result, int* d_array1, int* d_array2)
{

  int blockIdxX = blockIdx.x;
  int blockIdxY = blockIdx.y;
  int threadIdxX = threadIdx.x;
  int threadIdxY = threadIdx.y;

  int row = blockIdxY * blockDim.y + threadIdxY;

  int col = blockIdxX * blockDim.x + threadIdxX;

  int tid = row*blockDim.x + col;
  printf("taseng----->>row=%d col=%d \n",row,col);


  printf("Globally unique thread number = %d\n",tid);

  // check for valid element within the array bounds

  if (row < M && col < N)
  {
      int result = d_array1[row*N +col] + d_array2[row*N +col];
      d_result[row*N +col] = result;
  }

}



int main (){
// Host memory for the addArrays2D

int host_array1[M][N] =
{
{1,2,3,4,5},
{6,7,8,9,10},
{11,12,13,14,15},
{16,17,18,19,20}
};

int host_array2[M][N] =
{
{10,20,30,40,50},
{60,70,80,90,100},
{110,120,130,140,150},
{160,170,180,190,200}
};

int* d_array1,*d_array2,*d_result;

cudaMalloc(&d_array1, M*N*sizeof(int));
cudaMalloc(&d_array2, M*N*sizeof(int));
cudaMalloc(&d_result, M*N*sizeof(int));

cudaMemcpy(d_array1, host_array1, M*N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_array2, host_array2, M*N*sizeof(int), cudaMemcpyHostToDevice);

int threadsPerBlockX = N; //5
int threadsPerBlockY = M; //4

dim3 threadsPerBlock(threadsPerBlockX, threadsPerBlockY, 1);

int numBlocksX = (N + threadsPerBlockX - 1)/threadsPerBlockX;
int numBlocksY = (N + threadsPerBlockY - 1)/threadsPerBlockY;

printf("taseng---> %d, %d\n", numBlocksX, numBlocksY);

dim3 blocksPerGrid(numBlocksX, numBlocksY, 1);

addArrays2D<<<blocksPerGrid,threadsPerBlock>>>(d_result, d_array1, d_array2);

int host_result[M][N];

cudaMemcpy(host_result, d_result, M*N*sizeof(int), cudaMemcpyDeviceToHost);

printf("\n\nResultant Array is : \n");

for (int i=0;i<M;i++){
  for (int j=0;j<N;++j)
  {
    printf("%d ",host_result[i][j]);
  }
  printf("\n");
}


// free device memory
cudaFree(d_array1);
cudaFree(d_array2);
cudaFree(d_result);

return 0;

}

Overwriting prog.cu


In [6]:
!nvcc -o exe prog.cu
!./exe

taseng---> 1, 2
taseng row=4 col=0 
taseng row=4 col=1 
taseng row=4 col=2 
taseng row=4 col=3 
taseng row=4 col=4 
taseng row=5 col=0 
taseng row=5 col=1 
taseng row=5 col=2 
taseng row=5 col=3 
taseng row=5 col=4 
taseng row=6 col=0 
taseng row=6 col=1 
taseng row=6 col=2 
taseng row=6 col=3 
taseng row=6 col=4 
taseng row=7 col=0 
taseng row=7 col=1 
taseng row=7 col=2 
taseng row=7 col=3 
taseng row=7 col=4 
taseng row=0 col=0 
taseng row=0 col=1 
taseng row=0 col=2 
taseng row=0 col=3 
taseng row=0 col=4 
taseng row=1 col=0 
taseng row=1 col=1 
taseng row=1 col=2 
taseng row=1 col=3 
taseng row=1 col=4 
taseng row=2 col=0 
taseng row=2 col=1 
taseng row=2 col=2 
taseng row=2 col=3 
taseng row=2 col=4 
taseng row=3 col=0 
taseng row=3 col=1 
taseng row=3 col=2 
taseng row=3 col=3 
taseng row=3 col=4 
Globally unique thread number = 20
Globally unique thread number = 21
Globally unique thread number = 22
Globally unique thread number = 23
Globally unique thread number = 24
Globally 