In [None]:
%%writefile vector_addition_pinned_memory.cu
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>

#define TYPE int
#define N 512

/**
 * Element-wise vector addition kernel: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index falls
 * at or beyond n do nothing, so the grid may be larger than n.
 *
 * Only instantiable for arithmetic element types (enforced via enable_if).
 *
 * @param a  first input array (dereferenced on the device)
 * @param b  second input array (dereferenced on the device)
 * @param c  output array (written on the device)
 * @param n  number of elements to process
 */
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void device_add(T *a, T *b, T *c, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: surplus threads in the last block exit without touching memory.
    if (idx >= n) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Prints one "lhs + rhs = sum" line for integral element types.
// Values are widened to long long so one format string is valid for every
// integral T (unsigned 64-bit values above LLONG_MAX would wrap).
template <typename T>
static void print_sum_line(T lhs, T rhs, T sum, std::false_type /*is_floating_point*/)
{
    printf("\n %lld + %lld  = %lld", (long long)lhs, (long long)rhs, (long long)sum);
}

// Prints one "lhs + rhs = sum" line for floating-point element types.
template <typename T>
static void print_sum_line(T lhs, T rhs, T sum, std::true_type /*is_floating_point*/)
{
    printf("\n %g + %g  = %g", (double)lhs, (double)rhs, (double)sum);
}

/**
 * Prints "a[i] + b[i] = c[i]" for the first n elements, one per line.
 *
 * The original version always used "%d", which is only valid when T is int;
 * any other arithmetic T accepted by the template (float, double, long long,
 * ...) was undefined behaviour in printf. The format is now chosen from T.
 * For T = int the output is byte-for-byte the same as before.
 *
 * @param a  first operand array (read on the host)
 * @param b  second operand array (read on the host)
 * @param c  result array (read on the host)
 * @param n  number of elements to print; defaults to N, the previous fixed count
 */
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void print_output(T *a, T *b, T *c, int n = N)
{
    for (int i = 0; i < n; ++i)
    {
        // Tag dispatch picks the integral or floating-point formatter at compile time.
        print_sum_line(a[i], b[i], c[i], std::is_floating_point<T>{});
    }
}

// Evaluates a CUDA runtime call and, on failure, reports the file, line and
// CUDA error string on stderr and terminates the process.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cuda_check_err_));                     \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/**
 * Adds two N-element vectors on the GPU using pinned (page-locked) host
 * buffers, then prints every "a + b = c" line.
 *
 * Every CUDA runtime call is checked; previously all return codes were
 * ignored, so a failed allocation or launch would go unnoticed and the
 * program would print whatever happened to be in the output buffer.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any CUDA error.
 */
int main()
{
    TYPE *a, *b, *c;
    TYPE *d_a, *d_b, *d_c;
    int threads_per_block = 0, no_of_blocks = 0;

    // Byte count kept as size_t, the type the CUDA allocation/copy APIs take.
    const size_t size = (size_t)N * sizeof(TYPE);

    // Allocate pinned host copies of a, b, c and set up input values
    CUDA_CHECK(cudaHostAlloc(&a, size, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc(&b, size, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc(&c, size, cudaHostAllocDefault));

    for (int i = 0; i < N; ++i)
    {
        a[i] = i;
        b[i] = i;
    }

    // Allocate space for device copies of a, b, c
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_c, size));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // NOTE(review): 4 threads per block leaves most lanes of each 32-wide warp
    // idle; a multiple of 32 (e.g. 128-256) is the usual choice. Kept as-is in
    // case the small value is deliberate for this demo.
    threads_per_block = 4;
    // Ceiling division so a partial last block still covers the tail elements.
    no_of_blocks = (N + threads_per_block - 1) / threads_per_block;
    device_add<<<no_of_blocks, threads_per_block>>>(d_a, d_b, d_c, N);
    // A kernel launch returns nothing; bad launch configurations surface here.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back to host (blocking copy; also reports kernel execution errors)
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    print_output(a, b, c);

    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFreeHost(b));
    CUDA_CHECK(cudaFreeHost(c));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return 0;
}


In [None]:
# Compile vector_addition_pinned_memory.cu with nvcc into an executable of the same base name.
!nvcc -o vector_addition_pinned_memory vector_addition_pinned_memory.cu

In [None]:
# Run the compiled binary.
!./vector_addition_pinned_memory

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# NOTE(review): no visible cell reads from or writes to the mount — presumably used to save reports; confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the Nsight Systems 2023.2.3 .deb package from NVIDIA's Ubuntu 22.04 x86_64 CUDA repository.
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/nsight-systems-2023.2.3_2023.2.3.1001-1_amd64.deb

In [None]:
# Refresh the package index, install the downloaded .deb, then let apt repair any broken dependencies.
!apt update
!apt install ./nsight-systems-2023.2.3_2023.2.3.1001-1_amd64.deb
!apt --fix-broken install

In [None]:
# Profile the binary with Nsight Systems; -o sets the base name of the generated report.
!nsys profile -o report_vector_addition_pinned_memory ./vector_addition_pinned_memory

In [None]:
# Profile the same binary with nvprof.
# NOTE(review): nvprof is a legacy tool and is reportedly unsupported on newer GPU architectures — confirm for the target GPU.
!nvprof ./vector_addition_pinned_memory