# CUDA on Colab
This Colab notebook shows how you can not only run your CUDA code on Colab but also profile it. It includes simple vector-add template code for both **CUDA events**, for quick execution-time metrics, and **Nsight Compute (`ncu`)**, for detailed profiling.
I wish you success in reaching SOL with your kernels faster!


In [None]:
%%shell
# Download the Nsight Systems CLI .deb package from NVIDIA, then install it with dpkg.
DEB_NAME="NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb"
wget "https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/${DEB_NAME}"
dpkg -i "${DEB_NAME}"

--2025-08-12 11:48:05--  https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb
Resolving developer.nvidia.com (developer.nvidia.com)... 23.45.207.91, 23.45.207.74
Connecting to developer.nvidia.com (developer.nvidia.com)|23.45.207.91|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb?__token__=exp=1754999885~hmac=e5ffcbddd1839a7b40c32bf05c84b76c932264148c1353e56a1372c34c214e77 [following]
--2025-08-12 11:48:05--  https://developer.download.nvidia.com/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb?__token__=exp=1754999885~hmac=e5ffcbddd1839a7b40c32bf05c84b76c932264148c1353e56a1372c34c214e77
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.215.7.31, 23.215.7



In [None]:
# Print the version of the installed CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
# Print the version of the installed Nsight Compute command-line profiler (ncu).
!ncu --version

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


# CUDA Events
In the code below, we use **CUDA events** to measure kernel execution time.
The kernel is run multiple times and the elapsed time is averaged to get a stable and accurate measurement; this is standard practice in GPU benchmarking.

CUDA events are GPU-side timestamps. We create two events (start, stop), record them before and after the kernel launch, then use cudaEventElapsedTime() to get the duration. Because the timing is done on the GPU, it excludes CPU scheduling delays and unrelated host code.

*NOTE:*
The kernel launch is in solve() instead of directly in main() only to match the LeetGPU submission blueprint.

In [None]:
code = r"""
#include <cuda_runtime.h>
#include <iostream>

#define BLOCK_DIM 256

// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
//
// Complete groups of four floats are processed through float4 loads/stores in
// a grid-stride loop; the remaining N % 4 elements are processed with scalar
// accesses in a second grid-stride loop. Both loops stride by the total thread
// count, so the result is correct for any 1D grid/block size.
//
// Preconditions:
//   - A, B and C are device-accessible and 16-byte aligned (needed for the
//     float4 reinterpretation; cudaMalloc allocations satisfy this).
//   - C does not overlap A or B (__restrict__ / read-only __ldg loads).
//   - 1D launch; no shared memory is used.
// N <= 0 is treated as an empty input and nothing is written.
__global__ void vector_add(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int N) {
    // A negative N would otherwise sign-convert into a huge unsigned trip count.
    if (N <= 0) {
        return;
    }

    // Widen before multiplying so the index arithmetic cannot wrap in 32 bits.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t n = static_cast<size_t>(N);
    const size_t vec_size = n >> 2;  // number of complete float4 groups

    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    float4* C4 = reinterpret_cast<float4*>(C);

    for (size_t i = tid; i < vec_size; i += stride) {
        const float4 va = __ldg(&A4[i]);
        const float4 vb = __ldg(&B4[i]);
        float4 vc;
        vc.x = va.x + vb.x;
        vc.y = va.y + vb.y;
        vc.z = va.z + vb.z;
        vc.w = va.w + vb.w;
        C4[i] = vc;
    }

    // Scalar tail: the last N % 4 elements that do not fill a float4.
    const size_t tail = n & 3;
    const size_t base = n - tail;

    // Strided like the main loop so launches with fewer than `tail` threads
    // still cover every leftover element.
    for (size_t i = tid; i < tail; i += stride) {
        C[base + i] = __ldg(&A[base + i]) + __ldg(&B[base + i]);
    }
}

// Host-side launcher for vector_add: computes C = A + B over N floats.
//
// A, B and C are passed straight to the kernel, so they must be device
// pointers. The launch uses BLOCK_DIM threads per block and one thread per
// group of four floats (at least one block, so inputs with N < 4 are still
// covered by the kernel's scalar tail).
//
// The launch is asynchronous and is not synchronized or error-checked here;
// callers should query cudaGetLastError() / synchronize as needed.
// N <= 0 is an empty input: no kernel is launched.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    if (N <= 0) {
        return;  // nothing to add; also keeps a negative N away from the kernel
    }

    const int threadsPerBlock = BLOCK_DIM;
    const int vecCount = N / 4;  // number of complete float4 groups
    const int blocksPerGrid = max(1, (vecCount + threadsPerBlock - 1) / threadsPerBlock);
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
}

// Reports a failed CUDA runtime call on stderr; returns true on cudaSuccess.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << "\n";
    return false;
}

// Benchmarks solve() with CUDA events.
//
// Fills two host vectors with 1.0f, copies them to the device, performs one
// untimed warm-up launch, then times `runs` back-to-back launches between two
// events and prints the first 10 results and the average time per launch.
// Returns 0 on success and 1 if any allocation or CUDA call failed.
int main() {
    const int total_elements = 10000;
    const size_t total_bytes = static_cast<size_t>(total_elements) * sizeof(float);
    const int runs = 10;  // timed launches averaged into the reported time

    float* h_A = (float*)malloc(total_bytes);
    float* h_B = (float*)malloc(total_bytes);
    float* h_C = (float*)malloc(total_bytes);

    // Initialized to null so the cleanup at the end is safe on every path.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;
    float total_ms = 0.0f;

    bool ok = (h_A != nullptr) && (h_B != nullptr) && (h_C != nullptr);
    if (!ok) {
        std::cerr << "Host allocation failed\n";
    }

    if (ok) {
        for (int i = 0; i < total_elements; ++i) {
            h_A[i] = 1.0f;
            h_B[i] = 1.0f;
        }
    }

    // Each step runs only if all previous steps succeeded (short-circuit &&).
    ok = ok
        && cudaOk(cudaMalloc(&d_A, total_bytes), "cudaMalloc d_A")
        && cudaOk(cudaMalloc(&d_B, total_bytes), "cudaMalloc d_B")
        && cudaOk(cudaMalloc(&d_C, total_bytes), "cudaMalloc d_C")
        && cudaOk(cudaMemcpy(d_A, h_A, total_bytes, cudaMemcpyHostToDevice), "cudaMemcpy A")
        && cudaOk(cudaMemcpy(d_B, h_B, total_bytes, cudaMemcpyHostToDevice), "cudaMemcpy B")
        && cudaOk(cudaEventCreate(&start), "cudaEventCreate start")
        && cudaOk(cudaEventCreate(&stop), "cudaEventCreate stop");

    // Untimed warm-up launch: keeps one-time first-launch overhead out of the
    // measured average and surfaces launch/execution errors early.
    if (ok) {
        solve(d_A, d_B, d_C, total_elements);
        ok = cudaOk(cudaGetLastError(), "warm-up launch")
            && cudaOk(cudaDeviceSynchronize(), "warm-up synchronize");
    }

    // -------------------------------------------------------------------
    // CUDA events for timing
    if (ok) {
        ok = cudaOk(cudaEventRecord(start), "cudaEventRecord start");
    }
    if (ok) {
        for (int i = 0; i < runs; i++) {
            solve(d_A, d_B, d_C, total_elements);
        }
        // The launch status is checked after recording `stop` so the check
        // itself is not part of the timed region.
        ok = cudaOk(cudaEventRecord(stop), "cudaEventRecord stop")
            && cudaOk(cudaGetLastError(), "timed launches")
            && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize stop")
            && cudaOk(cudaEventElapsedTime(&total_ms, start, stop), "cudaEventElapsedTime");
    }
    //------------------------------------------------------------------------

    ok = ok && cudaOk(cudaMemcpy(h_C, d_C, total_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy C");

    if (ok) {
        // Compute average per run
        const float avg_ms = total_ms / runs;

        std::cout << "First 10 results:\n";
        for (int i = 0; i < 10; i++) {
            std::cout << h_C[i] << " ";
        }
        std::cout << "\nAverage kernel time: " << avg_ms << " ms\n";
    }

    if (start != nullptr) {
        cudaEventDestroy(start);
    }
    if (stop != nullptr) {
        cudaEventDestroy(stop);
    }
    // cudaFree(nullptr) and free(nullptr) are no-ops, so no guards are needed.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return ok ? 0 : 1;
}
"""


In [None]:
# Save the CUDA source held in `code` as vec1.cu.
with open("vec1.cu", mode="w") as cu_file:
    cu_file.write(code)

In [None]:
# Compile vec1.cu into the executable vec1_ with nvcc.
# -lineinfo embeds source line information; -arch=sm_75 targets compute
# capability 7.5 (NOTE: confirm this matches the GPU attached to the runtime).
!nvcc -O2 -lineinfo -arch=sm_75 -o vec1_ vec1.cu

In [None]:
# Run the compiled binary; it prints the first 10 results and the average kernel time.
!./vec1_

First 10 results:
2 2 2 2 2 2 2 2 2 2 Average kernel time: 0.0139584 ms


# Nsight Compute

Below is the same code as above, except without CUDA events — we don’t need them here because **`ncu`** provides execution time and far more detailed metrics.

Compile with the `-lineinfo` flag. This allows Nsight Compute to show a side-by-side comparison of your source code and the generated low-level instructions in the **Source** section of the GUI.

I also recommend watching the GPU Mode Nsight Compute tutorial: [NVIDIA GPU Profiling](https://youtu.be/F_BazucyCMw?si=sMiZRy_erDidhQXT) for a detailed walkthrough on interpreting metrics, navigating the detail and source views, and optimizing kernels to the SOL (Speed of Light).


In [None]:
code2 = r"""
#include <cuda_runtime.h>
#include <iostream>

#define BLOCK_DIM 256

// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
//
// Complete groups of four floats are processed through float4 loads/stores in
// a grid-stride loop; the remaining N % 4 elements are processed with scalar
// accesses in a second grid-stride loop. Both loops stride by the total thread
// count, so the result is correct for any 1D grid/block size.
//
// Preconditions:
//   - A, B and C are device-accessible and 16-byte aligned (needed for the
//     float4 reinterpretation; cudaMalloc allocations satisfy this).
//   - C does not overlap A or B (__restrict__ / read-only __ldg loads).
//   - 1D launch; no shared memory is used.
// N <= 0 is treated as an empty input and nothing is written.
__global__ void vector_add(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int N) {
    // A negative N would otherwise sign-convert into a huge unsigned trip count.
    if (N <= 0) {
        return;
    }

    // Widen before multiplying so the index arithmetic cannot wrap in 32 bits.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t n = static_cast<size_t>(N);
    const size_t vec_size = n >> 2;  // number of complete float4 groups

    const float4* A4 = reinterpret_cast<const float4*>(A);
    const float4* B4 = reinterpret_cast<const float4*>(B);
    float4* C4 = reinterpret_cast<float4*>(C);

    for (size_t i = tid; i < vec_size; i += stride) {
        const float4 va = __ldg(&A4[i]);
        const float4 vb = __ldg(&B4[i]);
        float4 vc;
        vc.x = va.x + vb.x;
        vc.y = va.y + vb.y;
        vc.z = va.z + vb.z;
        vc.w = va.w + vb.w;
        C4[i] = vc;
    }

    // Scalar tail: the last N % 4 elements that do not fill a float4.
    const size_t tail = n & 3;
    const size_t base = n - tail;

    // Strided like the main loop so launches with fewer than `tail` threads
    // still cover every leftover element.
    for (size_t i = tid; i < tail; i += stride) {
        C[base + i] = __ldg(&A[base + i]) + __ldg(&B[base + i]);
    }
}

// Host-side launcher for vector_add: computes C = A + B over N floats.
//
// A, B and C are passed straight to the kernel, so they must be device
// pointers. The launch uses BLOCK_DIM threads per block and one thread per
// group of four floats (at least one block, so inputs with N < 4 are still
// covered by the kernel's scalar tail).
//
// The launch is asynchronous and is not synchronized or error-checked here;
// callers should query cudaGetLastError() / synchronize as needed.
// N <= 0 is an empty input: no kernel is launched.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    if (N <= 0) {
        return;  // nothing to add; also keeps a negative N away from the kernel
    }

    const int threadsPerBlock = BLOCK_DIM;
    const int vecCount = N / 4;  // number of complete float4 groups
    const int blocksPerGrid = max(1, (vecCount + threadsPerBlock - 1) / threadsPerBlock);
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
}

// Reports a failed CUDA runtime call on stderr; returns true on cudaSuccess.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(err) << "\n";
    return false;
}

// Runs solve() exactly once so a profiler sees a single vector_add launch.
//
// Fills two host vectors with 1.0f, copies them to the device, launches the
// kernel once, waits for it to finish, copies the result back and prints the
// first 10 values. Returns 0 on success and 1 if any allocation or CUDA call
// failed.
int main() {
    const int total_elements = 10000;
    const size_t total_bytes = static_cast<size_t>(total_elements) * sizeof(float);

    float* h_A = (float*)malloc(total_bytes);
    float* h_B = (float*)malloc(total_bytes);
    float* h_C = (float*)malloc(total_bytes);

    // Initialized to null so the cleanup at the end is safe on every path.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    bool ok = (h_A != nullptr) && (h_B != nullptr) && (h_C != nullptr);
    if (!ok) {
        std::cerr << "Host allocation failed\n";
    }

    if (ok) {
        for (int i = 0; i < total_elements; ++i) {
            h_A[i] = 1.0f;
            h_B[i] = 1.0f;
        }
    }

    // Each step runs only if all previous steps succeeded (short-circuit &&).
    ok = ok
        && cudaOk(cudaMalloc(&d_A, total_bytes), "cudaMalloc d_A")
        && cudaOk(cudaMalloc(&d_B, total_bytes), "cudaMalloc d_B")
        && cudaOk(cudaMalloc(&d_C, total_bytes), "cudaMalloc d_C")
        && cudaOk(cudaMemcpy(d_A, h_A, total_bytes, cudaMemcpyHostToDevice), "cudaMemcpy A")
        && cudaOk(cudaMemcpy(d_B, h_B, total_bytes, cudaMemcpyHostToDevice), "cudaMemcpy B");

    if (ok) {
        solve(d_A, d_B, d_C, total_elements);
        // cudaGetLastError reports a bad launch configuration; the synchronize
        // reports faults raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "vector_add launch")
            && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
            && cudaOk(cudaMemcpy(h_C, d_C, total_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy C");
    }

    if (ok) {
        std::cout << "First 10 results:\n";
        for (int i = 0; i < 10; i++) {
            std::cout << h_C[i] << " ";
        }
        // Terminate the line so later output does not get appended to the results.
        std::cout << "\n";
    }

    // cudaFree(nullptr) and free(nullptr) are no-ops, so no guards are needed.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return ok ? 0 : 1;
}
"""


In [None]:
# Save the CUDA source held in `code2` as vec2.cu.
with open("vec2.cu", mode="w") as cu_file:
    cu_file.write(code2)

In [None]:
# Compile vec2.cu into the executable vec2_ with nvcc.
# -lineinfo embeds source line information; -arch=sm_75 targets compute
# capability 7.5 (NOTE: confirm this matches the GPU attached to the runtime).
!nvcc -O2 -lineinfo -arch=sm_75 -o vec2_ vec2.cu

In [None]:
# Run the compiled binary directly (without the profiler); it prints the first 10 results.
!./vec2_

First 10 results:
2 2 2 2 2 2 2 2 2 2 

In [None]:
# Profile ./vec2_ with Nsight Compute.
# --set full selects the "full" section set, -o names the report
# (written as vecprofile2.ncu-rep), and -f overwrites an existing report file.
!ncu --set full -o vecprofile2 -f ./vec2_

==PROF== Connected to process 2116 (/content/vec2_)
==PROF== Profiling "vector_add" - 0: 0%....50%....100% - 30 passes
First 10 results:
2 2 2 2 2 2 2 2 2 2 ==PROF== Disconnected from process 2116
==PROF== Report: /content/vecprofile2.ncu-rep
