In [2]:
%%writefile vectoradd.cu

#include<stdio.h>
#include<cuda.h>
#include<math.h>

// Element-wise vector addition kernel: c[idx] = a[idx] + b[idx].
//
// Each thread computes at most one output element. The element index is
// derived from the x components of the block and thread indices only.
// Threads whose index is at or beyond n do nothing, so the launch may
// contain more threads than there are elements.
//
//   a, b : input arrays, read at index idx
//   c    : output array, written at index idx
//   n    : number of elements to process
__global__
void vecAddKernel(float *a, float *b, float *c, int n)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have no element to process.
    if(idx >= n)
    {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Terminates the program with a message if a CUDA runtime call failed.
// `what` describes the failed step; the runtime's own error string is
// appended so the actual cause is visible.
static void vecAddCheck(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host wrapper that computes c[i] = a[i] + b[i] for i in [0, n) on the GPU.
//
//   a, b : host input arrays holding at least n floats each
//   c    : host output array with room for at least n floats
//   n    : number of elements; values <= 0 are a no-op
//
// Allocates three device buffers, copies the inputs over, launches
// vecAddKernel with 256 threads per block, copies the result back and
// releases the device buffers. Any CUDA failure prints a diagnostic to
// stderr and exits with EXIT_FAILURE.
void vecAdd(float *a, float *b, float *c, int n){
    // Nothing to add; also avoids launching a grid with zero blocks,
    // which is an invalid launch configuration.
    if(n <= 0)
    {
        return;
    }

    // Compute the byte count in size_t: n * sizeof(float) does not fit in
    // an int once n exceeds INT_MAX / sizeof(float).
    const size_t size = (size_t)n * sizeof(float);

    float *a_dev = NULL, *b_dev = NULL, *c_dev = NULL;
    vecAddCheck(cudaMalloc((void **)&a_dev, size), "Memory allocation failed");
    vecAddCheck(cudaMalloc((void **)&b_dev, size), "Memory allocation failed");
    vecAddCheck(cudaMalloc((void **)&c_dev, size), "Memory allocation failed");

    vecAddCheck(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice),
                "Memory copy from host to device failed");
    vecAddCheck(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice),
                "Memory copy from host to device failed");

    // Integer ceil-div; written as (n - 1) / T + 1 so it cannot overflow
    // the way n + T - 1 could for n close to INT_MAX (n > 0 here).
    const int threadsPerBlock = 256;
    const int blocks = (n - 1) / threadsPerBlock + 1;
    dim3 block(threadsPerBlock);
    dim3 grid(blocks);
    vecAddKernel<<<grid,block>>>(a_dev, b_dev, c_dev, n);

    // A kernel launch returns no status itself; launch-configuration
    // errors have to be fetched explicitly.
    vecAddCheck(cudaGetLastError(), "Kernel launch failed");

    // This copy is issued after the kernel, so a failure during kernel
    // execution is reported here as well.
    vecAddCheck(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost),
                "Memory copy from device to host failed");

    vecAddCheck(cudaFree(a_dev), "Device memory free failed");
    vecAddCheck(cudaFree(b_dev), "Device memory free failed");
    vecAddCheck(cudaFree(c_dev), "Device memory free failed");
}

// Demo driver: fills two host vectors with 1.0f and 2.0f, adds them with
// vecAdd, and prints the first few results.
//
// Returns 0 on success, EXIT_FAILURE if host memory cannot be allocated.
int main(int argc,char **argv)
{
    const int n = 5000;
    // Never print more elements than exist.
    const int preview = n < 10 ? n : 10;
    const size_t bytes = (size_t)n * sizeof(float);

    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *c = (float *)malloc(bytes);
    if(a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for(int i = 0; i < n; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    vecAdd(a, b, c, n);

    for(int i = 0; i < preview; i++)
    {
        printf("%f + %f = %f\n", a[i], b[i], c[i]);
    }

    free(a);
    free(b);
    free(c);
    return 0;
}

Overwriting vectoradd.cu


In [3]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 vectoradd.cu -o vectoradd

In [4]:
!./vectoradd

1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
1.000000 + 2.000000 = 3.000000
