In [None]:
%%writefile vector_addition_const_memory.cu

#include <stdio.h>
#include <stdlib.h>
#include <type_traits>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>

#define N 102400
#define TYPE int

// Throwaway kernel: each thread whose global index is below N performs one
// float addition and discards the result. Nothing is read from or written to
// memory. Presumably launched first so one-time start-up cost is not
// attributed to the kernels being measured (inferred from the name).
__global__ void warm_up() {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;
    }

    const float lhs = 0.0f;
    const float rhs = 1.0f;
    float sum = lhs + rhs;
    (void)sum;  // result is intentionally unused
}

// Neighbour-sum kernel that reads its inputs through __ldg.
//
// For every interior index i (1 <= i < n - 1) it stores
//   c[i] = (a[i] + a[i+1] + b[i] + b[i+1]) + (a[i] + a[i-1] + b[i] + b[i-1]).
// The first and last elements of c are never written.
//
// Despite the name, no __constant__ memory is involved; all input loads go
// through the __ldg intrinsic.
//
// Parameters:
//   a, b  input arrays, each indexed up to n - 1
//   c     output array, indexed from 1 to n - 2
//   n     number of elements in each array
// Launch layout: 1D grid of 1D blocks; one thread handles one index, so the
// grid must cover at least n threads for all interior elements to be written.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void device_add_const_memory(T * __restrict__ a, T * __restrict__ b, T * __restrict__ c, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Boundary threads (and any past the end) have no left/right neighbour.
    if (idx < 1 || idx >= n - 1) {
        return;
    }

    // Forward-neighbour contribution.
    T acc = __ldg(a + idx) + __ldg(a + idx + 1) + __ldg(b + idx) + __ldg(b + idx + 1);
    // Backward-neighbour contribution.
    acc += __ldg(a + idx) + __ldg(a + idx - 1) + __ldg(b + idx) + __ldg(b + idx - 1);

    c[idx] = acc;
}

// Neighbour-sum kernel using plain global-memory loads.
//
// For every interior index i (1 <= i < n - 1) it stores
//   c[i] = (a[i] + a[i+1] + b[i] + b[i+1]) + (a[i] + a[i-1] + b[i] + b[i-1]).
// The first and last elements of c are never written.
//
// This is the same computation as device_add_const_memory, so the two
// kernels can be compared like for like.
//
// Parameters:
//   a, b  input arrays, each indexed up to n - 1
//   c     output array, indexed from 1 to n - 2
//   n     number of elements in each array
// Launch layout: 1D grid of 1D blocks; one thread handles one index, so the
// grid must cover at least n threads for all interior elements to be written.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void device_add(T *a, T *b, T *c, int n)
{
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id >= 1 && id < n - 1) {
        // Forward-neighbour contribution.
        c[id] = a[id] + a[id + 1] + b[id] + b[id + 1];
        // Backward-neighbour contribution is accumulated; a plain assignment
        // here would discard the forward sum computed above.
        c[id] += a[id] + a[id - 1] + b[id] + b[id - 1];
    }
}

// Prints the first N elements of three host arrays, one "a + b  = c" entry
// per element, each preceded by a newline.
//
// The format is chosen from T so that the printf arguments always match
// their conversion specifiers: floating-point types are widened to double
// and printed with %f, every other arithmetic type is widened to long long
// and printed with %lld. For int the output is identical to a plain %d.
// Note: unsigned values above LLONG_MAX will not print correctly.
//
// Parameters:
//   a, b, c  host-accessible arrays with at least N elements each
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void print_output(T *a, T *b, T *c)
{
    for (int i = 0; i < N; ++i)
    {
        if (std::is_floating_point<T>::value)
        {
            printf("\n %f + %f  = %f",
                   static_cast<double>(a[i]),
                   static_cast<double>(b[i]),
                   static_cast<double>(c[i]));
        }
        else
        {
            printf("\n %lld + %lld  = %lld",
                   static_cast<long long>(a[i]),
                   static_cast<long long>(b[i]),
                   static_cast<long long>(c[i]));
        }
    }
}

// Program entry point.
//
// Builds two identical sets of device inputs (a/b and a1/b1) holding
// 0, 1, ..., N-1, launches warm_up, then runs device_add_const_memory on the
// first set and device_add on the second, and finally copies every vector
// back to host memory. Printing of the results is left commented out.
//
// Returns 0 on success, EXIT_FAILURE if a kernel launch or its execution
// reports a CUDA error.
int main()
{
    // Reports a failed CUDA runtime call on stderr; yields true on success.
    auto cuda_ok = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };

    thrust::device_vector<TYPE> a(N);
    thrust::device_vector<TYPE> b(N);
    thrust::device_vector<TYPE> c(N);

    thrust::device_vector<TYPE> a1(N);
    thrust::device_vector<TYPE> b1(N);
    thrust::device_vector<TYPE> c1(N);

    // Fill the inputs with 0, 1, ..., N-1 on the device. Assigning through
    // device_vector::operator[] from the host would issue one transfer per
    // element.
    thrust::sequence(a.begin(), a.end());
    thrust::sequence(b.begin(), b.end());

    thrust::sequence(a1.begin(), a1.end());
    thrust::sequence(b1.begin(), b1.end());

    int threads_per_block = 256;
    // Ceiling division so the grid covers all N elements.
    int no_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    warm_up<<<no_of_blocks, threads_per_block>>>();
    if (!cuda_ok(cudaGetLastError(), "warm_up launch"))
    {
        return EXIT_FAILURE;
    }

    device_add_const_memory<<<no_of_blocks, threads_per_block>>>(
            thrust::raw_pointer_cast(a.data()),
            thrust::raw_pointer_cast(b.data()),
            thrust::raw_pointer_cast(c.data()),
            N);
    if (!cuda_ok(cudaGetLastError(), "device_add_const_memory launch"))
    {
        return EXIT_FAILURE;
    }

    device_add<<<no_of_blocks, threads_per_block>>>(
            thrust::raw_pointer_cast(a1.data()),
            thrust::raw_pointer_cast(b1.data()),
            thrust::raw_pointer_cast(c1.data()),
            N);
    if (!cuda_ok(cudaGetLastError(), "device_add launch"))
    {
        return EXIT_FAILURE;
    }

    // Kernel launches are asynchronous; execution errors only surface at a
    // synchronizing call, so check once before reading results back.
    if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution"))
    {
        return EXIT_FAILURE;
    }

    std::vector<TYPE> host_a(N);
    std::vector<TYPE> host_b(N);
    std::vector<TYPE> host_c(N);

    std::vector<TYPE> host_a1(N);
    std::vector<TYPE> host_b1(N);
    std::vector<TYPE> host_c1(N);

    thrust::copy(a.begin(), a.end(), host_a.begin());
    thrust::copy(b.begin(), b.end(), host_b.begin());
    thrust::copy(c.begin(), c.end(), host_c.begin());

    thrust::copy(a1.begin(), a1.end(), host_a1.begin());
    thrust::copy(b1.begin(), b1.end(), host_b1.begin());
    thrust::copy(c1.begin(), c1.end(), host_c1.begin());

    // print_output(host_a1.data(), host_b1.data(), host_c1.data());
    // print_output(host_a.data(), host_b.data(), host_c.data());

    return 0;
}

In [None]:
!nvcc -o vector_addition_const_memory vector_addition_const_memory.cu

In [None]:
!./vector_addition_const_memory

In [None]:
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/nsight-systems-2024.2.3_2024.2.3.38-1_amd64.deb

In [None]:
!apt update
!apt install ./nsight-systems-2024.2.3_2024.2.3.38-1_amd64.deb
!apt --fix-broken install

In [None]:
# nsys options must come before the application path; anything after it is passed to the program itself.
!nsys profile -o report_nsys_vector_addition_const_memory --force-overwrite true ./vector_addition_const_memory

In [None]:
!ncu --set full --replay-mode kernel --target-processes all -o report_ncu_vector_addition_const_memory -f ./vector_addition_const_memory

In [None]:
!ncu --list-sets