<a href="https://colab.research.google.com/github/Sooraj9503/cuda/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
%%writefile vec_add.cu

#include<stdio.h>
#include<cuda_runtime.h>
#include<stdlib.h>
#include<device_launch_parameters.h>
#include<math.h>
#include<assert.h>


// Host-side verification of an element-wise vector addition.
//
// Compares c[i] against a[i] + b[i] for every i in [0, n). On the first
// mismatch, prints the offending index and values to stderr and aborts the
// process. The check is written as an explicit comparison rather than
// assert() so that it is still performed when the file is built with
// -DNDEBUG (which turns assert() into a no-op).
//
// a, b : input vectors (host pointers, at least n elements each)
// c    : result vector to validate (host pointer, at least n elements)
// n    : number of elements to check; n <= 0 checks nothing
void error_check(int* a, int* b, int* c, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (c[i] != a[i] + b[i])
        {
            fprintf(stderr,
                    "error_check: mismatch at index %d: %d + %d != %d\n",
                    i, a[i], b[i], c[i]);
            abort();
        }
    }
}

// Element-wise vector addition kernel: c[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every
// element is processed regardless of how many blocks/threads were launched
// (including a grid smaller than n). No shared memory is used.
//
// a, b : input vectors, at least n elements each
// c    : output vector, at least n elements
// n    : number of elements; n <= 0 makes the kernel a no-op
__global__ void addition(int* a, int* b, int *c, int n)
{
  // Index math is done in 64 bits: blockIdx.x * blockDim.x is a 32-bit
  // unsigned product that can wrap (and turn negative once narrowed to int)
  // on very large grids.
  const long long stride = (long long)gridDim.x * blockDim.x;
  for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
       idx < n;
       idx += stride)
  {
    c[idx] = a[idx] + b[idx];
  }
}

// Evaluate a CUDA runtime call and terminate with a readable message
// (file, line, error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t cuda_check_err_ = (call);                                 \
    if (cuda_check_err_ != cudaSuccess) {                                 \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(cuda_check_err_));                       \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Demo driver: fills two host vectors with random values, adds them with the
// `addition` kernel, validates the result with `error_check`, prints it, and
// releases all host and device buffers. Returns 0 on success; exits with
// EXIT_FAILURE if a host allocation or any CUDA runtime call fails.
int main()
{
  int *ha, *hb, *hc, *da, *db, *dc;
  int n = 8;
  size_t size = n * sizeof(int);

  // Allocate memory on host
  ha = (int*)malloc(size);
  hb = (int*)malloc(size);
  hc = (int*)malloc(size);
  if (ha == NULL || hb == NULL || hc == NULL)
  {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    free(ha);
    free(hb);
    free(hc);
    return EXIT_FAILURE;
  }

  // Allocate device memory
  CUDA_CHECK(cudaMalloc(&da, size));
  CUDA_CHECK(cudaMalloc(&db, size));
  CUDA_CHECK(cudaMalloc(&dc, size));

  // Initialize vectors with rand values 0 - 99
  for (int i = 0; i < n; i++)
  {
    ha[i] = rand() % 100;
    hb[i] = rand() % 100;
  }

  // Copy data from host to device
  CUDA_CHECK(cudaMemcpy(da, ha, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(db, hb, size, cudaMemcpyHostToDevice));

  // Thread block size
  int numberOfThreads = 256;

  // Grid size: ceiling division so the grid covers all n elements
  int numberOfBlcks = (n + numberOfThreads - 1) / numberOfThreads;

  addition<<<numberOfBlcks, numberOfThreads>>>(da, db, dc, n);
  // A kernel launch returns no status itself; a bad launch configuration is
  // only visible through cudaGetLastError().
  CUDA_CHECK(cudaGetLastError());

  // Copy the result from device to host. This blocking copy also waits for
  // the kernel, so a fault during kernel execution is reported here.
  CUDA_CHECK(cudaMemcpy(hc, dc, size, cudaMemcpyDeviceToHost));

  // Check result for errors
  error_check(ha, hb, hc, n);

  printf("COMPLETED SUCCESFULLY\n");

  // Display result
  for (int i = 0; i < n; i++)
  {
    printf("%d ", hc[i]);
  }
  printf("\n");

  // Free device memory
  CUDA_CHECK(cudaFree(da));
  CUDA_CHECK(cudaFree(db));
  CUDA_CHECK(cudaFree(dc));

  // Free host memory
  free(ha);
  free(hb);
  free(hc);
  return 0;
}

Overwriting vec_add.cu


In [24]:
!nvcc vec_add.cu

In [25]:
!./a.out


COMPLETED SUCCESFULLY
169 92 128 178 70 89 149 89 
