<a href="https://colab.research.google.com/github/RakshithSuresh2001/CUDA/blob/main/Addition_of_elements_in_two_arrays.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CUDA GPUs have many parallel processors grouped into Streaming Multiprocessors, or SMs. Each SM can run multiple concurrent thread blocks. As an example, a Tesla P100 GPU based on the Pascal GPU Architecture has 56 SMs, each capable of supporting up to 2048 active threads. To take full advantage of all these threads, I should launch the kernel with multiple thread blocks.

The first parameter of the execution configuration (the `<<<numBlocks, blockSize>>>` part of a kernel launch) specifies the number of thread blocks; the second specifies the number of threads per block. Together, the blocks of parallel threads make up what is known as the grid. Since I have N elements to process, and 256 threads per block, I just need to calculate the number of blocks to get at least N threads. I simply divide N by the block size, rounding up in case N is not a multiple of blockSize: `numBlocks = (N + blockSize - 1) / blockSize`.

A C++ program that adds the elements of two arrays, each with about a million (1<<20) elements

In [2]:
%%writefile add.cpp

#include <iostream>
#include <math.h>

// Element-wise accumulate on the CPU: y[k] becomes x[k] + y[k] for the
// first n elements. x is only read; y is updated in place.
void add(int n, float *x, float *y)
{
  int k = 0;
  while (k < n) {
    y[k] = x[k] + y[k];
    ++k;
  }
}

// Host-only reference run: fills two arrays of 2^20 floats with 1.0f and
// 2.0f, sums them with add(), and prints the largest deviation from 3.0f.
int main(void)
{
  const int count = 1 << 20; // 2^20, roughly one million elements

  float *x = new float[count];
  float *y = new float[count];

  // Populate both inputs on the host.
  for (int k = 0; k != count; ++k) {
    x[k] = 1.0f;
    y[k] = 2.0f;
  }

  // Accumulate x into y on the CPU.
  add(count, x, y);

  // Every element of y should now be 3.0f; track the worst absolute error.
  float worst = 0.0f;
  for (int k = 0; k != count; ++k) {
    worst = fmax(worst, fabs(y[k] - 3.0f));
  }
  std::cout << "Max error: " << worst << std::endl;

  // Release the host buffers.
  delete [] y;
  delete [] x;

  return 0;
}

Writing add.cpp


In [3]:
%%shell
g++ add.cpp -o add



In [4]:
%%shell
./add

Max error: 0




In [5]:
%%writefile add.cu

#include <iostream>
#include <math.h>
// Device kernel: accumulates x into y for the first n elements.
// The loop does not use threadIdx/blockIdx, so every launched thread walks
// the whole range on its own; the host code in this file launches it as
// <<<1, 1>>>, i.e. one thread processes all n elements serially.
__global__
void add(int n, float *x, float *y)
{
  int k = 0;
  while (k < n) {
    y[k] = x[k] + y[k];
    ++k;
  }
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two arrays of 1<<20 floats on the GPU with a single-thread launch
// and prints the maximum deviation from the expected value 3.0f.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x = nullptr, *y = nullptr;

  // Allocate Unified Memory - accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged(x)"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged(y)")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // A kernel launch returns no status: launch errors are reported by
  // cudaGetLastError(), execution errors by the next synchronizing call.
  // The synchronize also makes the results safe to read on the host.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Only inspect the results if the kernel actually ran to completion.
  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  ok = cudaOk(cudaFree(x), "cudaFree(x)") && ok;
  ok = cudaOk(cudaFree(y), "cudaFree(y)") && ok;

  return ok ? 0 : 1;
}

Writing add.cu


In [6]:
%%shell

nvcc add.cu -o add_cuda
./add_cuda

Max error: 0




In [7]:
%%shell

nvprof ./add_cuda


==3571== NVPROF is profiling process 3571, command: ./add_cuda
Max error: 0
==3571== Profiling application: ./add_cuda
==3571== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  108.54ms         1  108.54ms  108.54ms  108.54ms  add(int, float*, float*)
      API calls:   73.05%  305.27ms         2  152.63ms  48.854us  305.22ms  cudaMallocManaged
                   25.98%  108.54ms         1  108.54ms  108.54ms  108.54ms  cudaDeviceSynchronize
                    0.60%  2.5084ms       114  22.003us      83ns  1.3669ms  cuDeviceGetAttribute
                    0.25%  1.0384ms         1  1.0384ms  1.0384ms  1.0384ms  cudaLaunchKernel
                    0.12%  485.88us         2  242.94us  234.81us  251.07us  cudaFree
                    0.00%  10.909us         1  10.909us  10.909us  10.909us  cuDeviceGetName
                    0.00%  5.3710us         2  2.6850us     266ns  5.1050us  cuDeviceGet
                



In [8]:
%%shell

nvidia-smi

Tue Feb 10 19:42:09 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------



In [9]:
%%writefile add_block.cu

#include <iostream>
#include <math.h>

// Device kernel: accumulates x into y for the first n elements, splitting
// the work across the threads of one block. Each thread starts at its own
// threadIdx.x and advances by blockDim.x. blockIdx is not used, so the
// indexing only partitions the range within a single block; the host code
// in this file launches it as <<<1, 256>>>.
__global__
void add(int n, float *x, float *y)
{
  const int step = blockDim.x;
  int k = threadIdx.x;
  while (k < n) {
    y[k] = x[k] + y[k];
    k += step;
  }
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two arrays of 1<<20 floats on the GPU using one block of 256
// threads and prints the maximum deviation from the expected value 3.0f.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x = nullptr, *y = nullptr;

  // Allocate Unified Memory - accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged(x)"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged(y)")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 256>>>(N, x, y);

  // A kernel launch returns no status: launch errors are reported by
  // cudaGetLastError(), execution errors by the next synchronizing call.
  // The synchronize also makes the results safe to read on the host.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Only inspect the results if the kernel actually ran to completion.
  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  ok = cudaOk(cudaFree(x), "cudaFree(x)") && ok;
  ok = cudaOk(cudaFree(y), "cudaFree(y)") && ok;

  return ok ? 0 : 1;
}

Writing add_block.cu


In [10]:
%%shell

nvcc add_block.cu -o add_block
nvprof ./add_block

==3845== NVPROF is profiling process 3845, command: ./add_block
Max error: 0
==3845== Profiling application: ./add_block
==3845== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.4803ms         1  3.4803ms  3.4803ms  3.4803ms  add(int, float*, float*)
      API calls:   91.73%  282.59ms         2  141.29ms  53.286us  282.53ms  cudaMallocManaged
                    6.19%  19.058ms         1  19.058ms  19.058ms  19.058ms  cudaLaunchKernel
                    1.13%  3.4880ms         1  3.4880ms  3.4880ms  3.4880ms  cudaDeviceSynchronize
                    0.80%  2.4565ms       114  21.548us      87ns  1.3018ms  cuDeviceGetAttribute
                    0.15%  458.08us         2  229.04us  216.61us  241.46us  cudaFree
                    0.00%  12.155us         1  12.155us  12.155us  12.155us  cuDeviceGetName
                    0.00%  4.7330us         2  2.3660us     225ns  4.5080us  cuDeviceGet
              



In [11]:
%%writefile add_grid.cu

#include <iostream>
#include <math.h>

// Device kernel: accumulates x into y for the first n elements using a
// grid-stride loop. Each thread starts at its global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number
// of threads in the grid (blockDim.x * gridDim.x), so the loop bound i < n
// keeps every access in range for any 1D launch configuration.
__global__
void add(int n, float *x, float *y)
{
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;
  int k = first;
  while (k < n) {
    y[k] = x[k] + y[k];
    k += step;
  }
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two arrays of 1<<20 floats on the GPU using enough 256-thread
// blocks to cover every element, and prints the maximum deviation from the
// expected value 3.0f.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel fails.
int main(void)
{
  int N = 1<<20; // 1M elements
  float *x = nullptr, *y = nullptr;

  // Allocate Unified Memory - accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged(x)"))
    return 1;
  if (!cudaOk(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged(y)")) {
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  int blockSize = 256;
  // Ceiling division: round up so numBlocks * blockSize >= N.
  int numBlocks = (N + blockSize - 1) / blockSize;
  add<<<numBlocks, blockSize>>>(N, x, y);

  // A kernel launch returns no status: launch errors are reported by
  // cudaGetLastError(), execution errors by the next synchronizing call.
  // The synchronize also makes the results safe to read on the host.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
            cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Only inspect the results if the kernel actually ran to completion.
  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  ok = cudaOk(cudaFree(x), "cudaFree(x)") && ok;
  ok = cudaOk(cudaFree(y), "cudaFree(y)") && ok;

  return ok ? 0 : 1;
}

Writing add_grid.cu


In [12]:
%%shell

nvcc add_grid.cu -o add_grid
nvprof ./add_grid

==4368== NVPROF is profiling process 4368, command: ./add_grid
Max error: 0
==4368== Profiling application: ./add_grid
==4368== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.5771ms         1  2.5771ms  2.5771ms  2.5771ms  add(int, float*, float*)
      API calls:   92.87%  280.65ms         2  140.33ms  50.978us  280.60ms  cudaMallocManaged
                    5.26%  15.888ms         1  15.888ms  15.888ms  15.888ms  cudaLaunchKernel
                    0.86%  2.5933ms         1  2.5933ms  2.5933ms  2.5933ms  cudaDeviceSynchronize
                    0.85%  2.5812ms       114  22.641us      86ns  1.4344ms  cuDeviceGetAttribute
                    0.15%  464.91us         2  232.45us  211.38us  253.53us  cudaFree
                    0.00%  11.462us         1  11.462us  11.462us  11.462us  cuDeviceGetName
                    0.00%  4.8710us         2  2.4350us     176ns  4.6950us  cuDeviceGet
                

