In [None]:
%%writefile vector_addition_stream_thrust.cu
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#define TYPE int
#define N 51200

#define nStream 4

// Element-wise addition kernel: for every index k in [0, n) stores a[k] + b[k] into c[k].
// Each thread handles at most one element, using a 1D global index built from
// blockIdx.x, blockDim.x and threadIdx.x; threads whose index is >= n do nothing,
// so the grid may be rounded up past n.
// Only instantiable for arithmetic element types (enforced via enable_if).
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
__global__ void device_add(T *a, T *b, T *c, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block fall outside the data and exit early.
    if (idx >= n) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

// Prints the first N elements of three arrays as "\n <a> + <b>  = <c>" on stdout.
// The pointers are dereferenced by host code, so they must be readable from the host.
// N is the file-level element count; the arrays must hold at least N elements each.
//
// The element type is any arithmetic T, so the value is widened to a type that
// matches the printf conversion being used (passing e.g. a float or a 64-bit
// integer to "%d" is undefined behaviour). For T = int the output is unchanged.
template <typename T, typename = std::enable_if_t<std::is_arithmetic<T>::value>>
void print_output(T *a, T *b, T *c)
{
    for (int i = 0; i < N; ++i)
    {
        // The conditions are compile-time constants; only one branch survives per T.
        if (std::is_floating_point<T>::value)
        {
            printf("\n %g + %g  = %g",
                   static_cast<double>(a[i]),
                   static_cast<double>(b[i]),
                   static_cast<double>(c[i]));
        }
        else if (std::is_signed<T>::value)
        {
            printf("\n %lld + %lld  = %lld",
                   static_cast<long long>(a[i]),
                   static_cast<long long>(b[i]),
                   static_cast<long long>(c[i]));
        }
        else
        {
            // Unsigned integers (and bool) are printed without sign reinterpretation.
            printf("\n %llu + %llu  = %llu",
                   static_cast<unsigned long long>(a[i]),
                   static_cast<unsigned long long>(b[i]),
                   static_cast<unsigned long long>(c[i]));
        }
    }
}

// Adds two N-element vectors on the GPU, splitting the work across nStream CUDA
// streams. Each stream is given its own contiguous slice and is issued, in order:
// host-to-device copies of a and b, the device_add kernel, and the device-to-host
// copy of c. After all streams finish, the result is checked on the host.
// Returns 0 on success and EXIT_FAILURE on a CUDA error or a wrong result.
int main()
{
    // Abort with a readable message as soon as a CUDA runtime call reports an error;
    // unchecked failures would otherwise surface later as unrelated-looking errors.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    const int threads_per_block = 128;

    // Allocate space for host copies of a, b, c (pinned-memory allocator) and setup input values
    using pinned_allocator = thrust::mr::stateless_resource_allocator<TYPE, thrust::system::cuda::universal_host_pinned_memory_resource>;
    thrust::host_vector<TYPE, pinned_allocator> a(N);
    thrust::host_vector<TYPE, pinned_allocator> b(N);
    thrust::host_vector<TYPE, pinned_allocator> c(N);

    for (int i = 0; i < N; ++i)
    {
        a[i] = i;
        b[i] = i;
    }

    // Allocate space for device copies of a, b, c
    thrust::device_vector<TYPE> d_a(N);
    thrust::device_vector<TYPE> d_b(N);
    thrust::device_vector<TYPE> d_c(N);

    // Raw pointers used by the CUDA runtime calls and the kernel launch
    TYPE *h_a = thrust::raw_pointer_cast(a.data());
    TYPE *h_b = thrust::raw_pointer_cast(b.data());
    TYPE *h_c = thrust::raw_pointer_cast(c.data());
    TYPE *dev_a = thrust::raw_pointer_cast(d_a.data());
    TYPE *dev_b = thrust::raw_pointer_cast(d_b.data());
    TYPE *dev_c = thrust::raw_pointer_cast(d_c.data());

    // Create n streams (nStream is a compile-time constant, so no heap allocation is needed)
    cudaStream_t streams[nStream];
    for (int i = 0; i < nStream; ++i)
    {
        check(cudaStreamCreate(&streams[i]), "cudaStreamCreate");
    }

    // Divide the work between n streams
    int partSize = N / nStream;
    int parts[nStream * 2];
    for (int i = 0; i < nStream; ++i)
    {
        parts[i * 2] = i * partSize; // Initial index
        // Element number of this part; the last part also takes the remainder of N / nStream
        parts[i * 2 + 1] = (i == nStream - 1) ? (N - (nStream - 1) * partSize) : partSize;
    }

    // Alternative issue order (kept for reference): three separate loops over the
    // streams - first all host-to-device copies, then all kernels, then all
    // device-to-host copies.
    //
    // Copy data from host to device
    // for (int i = 0; i < nStream; ++i)
    // {
    //     cudaMemcpyAsync(dev_a + parts[i * 2], h_a + parts[i * 2], parts[i * 2 + 1] * sizeof(TYPE), cudaMemcpyHostToDevice, streams[i]);
    //     cudaMemcpyAsync(dev_b + parts[i * 2], h_b + parts[i * 2], parts[i * 2 + 1] * sizeof(TYPE), cudaMemcpyHostToDevice, streams[i]);
    // }
    //
    // Execute kernel
    // for (int i = 0; i < nStream; ++i)
    // {
    //     int no_of_blocks = (parts[i * 2 + 1] + threads_per_block - 1) / threads_per_block;
    //     device_add<<<no_of_blocks, threads_per_block, 0, streams[i]>>>(
    //         dev_a + parts[i * 2], dev_b + parts[i * 2], dev_c + parts[i * 2], parts[i * 2 + 1]);
    // }
    //
    // Copy result back to host
    // for (int i = 0; i < nStream; ++i)
    // {
    //     cudaMemcpyAsync(h_c + parts[i * 2], dev_c + parts[i * 2], parts[i * 2 + 1] * sizeof(TYPE), cudaMemcpyDeviceToHost, streams[i]);
    // }

    // Another way to achieve the overlapping: issue copy-in, kernel and copy-out
    // for one stream before moving on to the next stream
    for (int i = 0; i < nStream; ++i)
    {
        const int offset = parts[i * 2];
        const int count = parts[i * 2 + 1];

        // An empty slice (possible when nStream > N) would produce a zero-block
        // launch, which is an invalid configuration; there is nothing to do for it.
        if (count == 0)
        {
            continue;
        }

        const size_t bytes = static_cast<size_t>(count) * sizeof(TYPE);

        check(cudaMemcpyAsync(dev_a + offset, h_a + offset, bytes, cudaMemcpyHostToDevice, streams[i]),
              "host-to-device copy of a");
        check(cudaMemcpyAsync(dev_b + offset, h_b + offset, bytes, cudaMemcpyHostToDevice, streams[i]),
              "host-to-device copy of b");

        // Round the block count up so every element of the slice gets a thread
        const int no_of_blocks = (count + threads_per_block - 1) / threads_per_block;

        device_add<<<no_of_blocks, threads_per_block, 0, streams[i]>>>(
            dev_a + offset,
            dev_b + offset,
            dev_c + offset,
            count);
        // A kernel launch returns nothing; launch errors must be queried explicitly
        check(cudaGetLastError(), "device_add launch");

        check(cudaMemcpyAsync(h_c + offset, dev_c + offset, bytes, cudaMemcpyDeviceToHost, streams[i]),
              "device-to-host copy of c");
    }

    // Wait for every stream to finish; errors raised during execution are reported here
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Check the GPU result against a host-side reference
    int exit_code = 0;
    for (int i = 0; i < N; ++i)
    {
        if (h_c[i] != static_cast<TYPE>(h_a[i] + h_b[i]))
        {
            fprintf(stderr, "Verification failed at index %d\n", i);
            exit_code = EXIT_FAILURE;
            break;
        }
    }

    //  print_output(h_a, h_b, h_c);

    // Clean up
    for (int i = 0; i < nStream; ++i)
    {
        check(cudaStreamDestroy(streams[i]), "cudaStreamDestroy");
    }

    return exit_code;
}

In [22]:
!nvcc -o vector_addition_stream_thrust vector_addition_stream_thrust.cu

In [23]:
!./vector_addition_stream_thrust

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!apt update
!apt install ./drive/MyDrive/Nsight/nsight-systems-2023.2.3_2023.2.3.1001-1_amd64.deb
!apt --fix-broken install

In [None]:
!nsys profile -o report_vector_addition_stream_thrust ./vector_addition_stream_thrust