In [None]:
%%writefile array.cu

#include<stdio.h>
#include<stdlib.h>

// Kernel: every launched thread stores the sum a + b into its own slot of `result`.
//
// The slot is the thread's flat global index for a 1D launch, built from the x
// components of blockIdx, blockDim and threadIdx. The write is not bounds-guarded,
// so for a 1D launch `result` must have room for at least gridDim.x * blockDim.x ints.
__global__ void add(int a, int b, int *result){
  const int sum = a + b;
  const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;

  // Debug trace: report this thread's launch coordinates and the slot it will write.
  printf("Block Dimension: %d ... Block id: %d ... Thread id: %d ... Thread index: %d\n", blockDim.x, blockIdx.x, threadIdx.x, globalIdx);

  result[globalIdx] = sum;
}


// Evaluates a CUDA runtime call and terminates the program with a file/line
// diagnostic on stderr if it did not return cudaSuccess. Without this, a failed
// allocation, launch or copy would go unnoticed and the program would print
// whatever happened to be in the host buffer.
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t cudaStatus_ = (call);                                         \
    if (cudaStatus_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(cudaStatus_));                               \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

// Runs one launch configuration of the add kernel and prints the results.
//
//   blocks, threads : grid and block size passed to add<<<blocks, threads>>>
//   a, b            : the two integers every thread adds
//   d_result        : device buffer holding n ints
//   h_result        : host buffer holding n ints, overwritten with the results
//   n               : number of ints in both buffers
//
// Exits the program on an invalid configuration or on any CUDA error.
static void runConfig(int blocks, int threads, int a, int b,
                      int *d_result, int *h_result, int n)
{
  // The kernel writes result[global index] without a bounds guard, so launching
  // more threads than the buffer has slots would write out of bounds.
  if (blocks <= 0 || threads <= 0 || (long long)blocks * threads > n) {
    fprintf(stderr, "Invalid launch configuration <<<%d,%d>>> for a buffer of %d ints\n",
            blocks, threads, n);
    exit(EXIT_FAILURE);
  }

  printf("Running with add<<<%d,%d>>>...\n", blocks, threads);

  // Clear the buffer first so values left over from a previous configuration
  // cannot be mistaken for results of this launch.
  CUDA_CHECK(cudaMemset(d_result, 0, n * sizeof(int)));

  add<<<blocks, threads>>>(a, b, d_result);
  CUDA_CHECK(cudaGetLastError());        // launch-time errors (bad configuration)
  CUDA_CHECK(cudaDeviceSynchronize());   // execution errors; also flushes device printf

  // Copy result back to Host
  CUDA_CHECK(cudaMemcpy(h_result, d_result, n * sizeof(int), cudaMemcpyDeviceToHost));

  // Print result for each thread
  for (int i = 0; i < n; ++i)
  {
    printf("Thread %d result: %d\n", i, h_result[i]);
  }
}

// Adds 5 + 10 on the GPU with three launch shapes that each cover N = 10 threads
// (1x10, 10x1 and 2x5) and prints the per-thread results after every launch.
int main(){
  const int N = 10;
  int a = 5, b = 10;
  int h_result[N];                       // Host result array
  int *d_result = NULL;                  // Device result array

  // Allocate memory on the Device
  CUDA_CHECK(cudaMalloc(&d_result, N*sizeof(int)));

  // --- Test 1: 1 block, 10 threads per block ---
  runConfig(1, 10, a, b, d_result, h_result, N);

  // --- Test 2: 10 blocks, 1 thread per block ---
  runConfig(10, 1, a, b, d_result, h_result, N);

  // --- Test 3: 2 blocks, 5 threads per block ---
  runConfig(2, 5, a, b, d_result, h_result, N);

  // Free device memory
  CUDA_CHECK(cudaFree(d_result));

  return 0;
}

Writing array.cu


In [None]:
!nvcc -o exe array.cu
!./exe

Running with add<<<1,10>>>...
Block Dimension: 10 ... Block id: 0 ... Thread id: 0 ... Thread index: 0
Block Dimension: 10 ... Block id: 0 ... Thread id: 1 ... Thread index: 1
Block Dimension: 10 ... Block id: 0 ... Thread id: 2 ... Thread index: 2
Block Dimension: 10 ... Block id: 0 ... Thread id: 3 ... Thread index: 3
Block Dimension: 10 ... Block id: 0 ... Thread id: 4 ... Thread index: 4
Block Dimension: 10 ... Block id: 0 ... Thread id: 5 ... Thread index: 5
Block Dimension: 10 ... Block id: 0 ... Thread id: 6 ... Thread index: 6
Block Dimension: 10 ... Block id: 0 ... Thread id: 7 ... Thread index: 7
Block Dimension: 10 ... Block id: 0 ... Thread id: 8 ... Thread index: 8
Block Dimension: 10 ... Block id: 0 ... Thread id: 9 ... Thread index: 9
Thread 0 result: 15
Thread 1 result: 15
Thread 2 result: 15
Thread 3 result: 15
Thread 4 result: 15
Thread 5 result: 15
Thread 6 result: 15
Thread 7 result: 15
Thread 8 result: 15
Thread 9 result: 15
Running with add<<<10,1>>>...
Block Dime