In [None]:
!pip install nvcc4jupyter

In [None]:
%load_ext nvcc4jupyter

In [None]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <type_traits>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>

// Number of elements in each of the vectors allocated, added and printed below.
#define N 512
// Element type of the vectors; the templates below accept arithmetic types only.
#define TYPE int

/**
 * Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Only instantiable for arithmetic T (enforced by the enable_if_t default
 * template argument).
 *
 * Launch layout: 1D grid of 1D blocks. The kernel walks the data with a
 * grid-stride loop, so any grid/block size produces a complete result; when
 * the launch supplies at least n threads, each thread handles at most one
 * element. No shared memory is used.
 *
 * @param a  first input array, at least n elements, dereferenced on the device
 * @param b  second input array, at least n elements, dereferenced on the device
 * @param c  output array, at least n elements, dereferenced on the device
 * @param n  number of elements to process; n <= 0 makes the kernel a no-op
 */
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void device_add(T *a, T *b, T *c, int n)
{
    // 64-bit index arithmetic so that id + stride cannot wrap for large n.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long id = first; id < n; id += stride)
    {
        c[id] = a[id] + b[id];
    }
}

/**
 * Prints the first N entries of three arrays as "a + b  = c", one per line
 * (each line is preceded by a newline rather than followed by one).
 *
 * This is host code that dereferences the pointers directly, so a, b and c
 * must be readable from the host and hold at least N elements each.
 *
 * The printf conversion is chosen from the category of T so that the format
 * always matches the argument actually passed:
 *   - floating-point types are printed with %g (as double),
 *   - unsigned integer types with %llu,
 *   - all other integer types with %lld.
 * For T = int the output is identical to printing with %d.
 *
 * @param a  first operand array
 * @param b  second operand array
 * @param c  result array
 */
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void print_output(T *a, T *b, T *c)
{
    for (int i = 0; i < N; ++i)
    {
        // The conditions are compile-time constants; every branch is valid for
        // any arithmetic T because only explicit casts are involved.
        if (std::is_floating_point<T>::value)
        {
            printf("\n %g + %g  = %g",
                   static_cast<double>(a[i]),
                   static_cast<double>(b[i]),
                   static_cast<double>(c[i]));
        }
        else if (std::is_unsigned<T>::value)
        {
            printf("\n %llu + %llu  = %llu",
                   static_cast<unsigned long long>(a[i]),
                   static_cast<unsigned long long>(b[i]),
                   static_cast<unsigned long long>(c[i]));
        }
        else
        {
            printf("\n %lld + %lld  = %lld",
                   static_cast<long long>(a[i]),
                   static_cast<long long>(b[i]),
                   static_cast<long long>(c[i]));
        }
    }
}

/**
 * Adds two N-element vectors on the GPU and prints every "a + b  = c" line.
 *
 * Steps: allocate three device vectors, fill both inputs with 0..N-1, launch
 * device_add, verify the launch and execution succeeded, copy all three
 * vectors back to the host and print them.
 *
 * @return 0 on success, EXIT_FAILURE if the kernel launch or execution fails.
 */
int main()
{
    thrust::device_vector<TYPE> a(N);
    thrust::device_vector<TYPE> b(N);
    thrust::device_vector<TYPE> c(N);

    // Initialise both inputs to 0, 1, ..., N-1 with one device-side call each
    // instead of assigning element by element from the host.
    thrust::sequence(a.begin(), a.end());
    thrust::sequence(b.begin(), b.end());

    const int threads_per_block = 256;
    // Ceiling division so a partial last block still covers the tail.
    const int no_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    // A zero-block grid is an invalid launch configuration, so skip the launch
    // when there is nothing to compute.
    if (no_of_blocks > 0)
    {
        device_add<<<no_of_blocks, threads_per_block>>>(thrust::raw_pointer_cast(a.data()),
               thrust::raw_pointer_cast(b.data()),
               thrust::raw_pointer_cast(c.data()),
               N);

        // A launch does not return an error itself: configuration errors are
        // reported by cudaGetLastError(), execution errors by the next
        // synchronizing call.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess)
        {
            fprintf(stderr, "device_add failed: %s\n", cudaGetErrorString(status));
            return EXIT_FAILURE;
        }
    }

    // Host-side copies; print_output dereferences its pointers on the host.
    std::vector<TYPE> host_a(N);
    std::vector<TYPE> host_b(N);
    std::vector<TYPE> host_c(N);

    thrust::copy(a.begin(), a.end(), host_a.begin());
    thrust::copy(b.begin(), b.end(), host_b.begin());
    thrust::copy(c.begin(), c.end(), host_c.begin());

    print_output(host_a.data(), host_b.data(), host_c.data());

    return 0;
}