In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!ls /usr/local

bin    cuda	cuda-12.5	  etc	 include  libexec     man  sbin   src
colab  cuda-12	dist_metrics.pxd  games  lib	  LICENSE.md  opt  share


In [3]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [4]:
!nvidia-smi

Fri May 16 20:38:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
%%writefile matrix_mul.cu

#include <iostream>
#include <vector>
#include <cstdlib> // For rand(), srand()
#include <ctime>   // For time()
#include <cmath>   // For sqrtf()
#include <cuda_runtime.h> // For float4, cudaMalloc, etc.
#include <iomanip> // For std::fixed, std::setprecision
#include <chrono>  // For CPU timing
#include <numeric> // For std::iota (optional, for other types of initializations)
#include <algorithm> // For std::min

// --- CUDA Kernel: Vector Addition ---
// Computes c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size produces a correct
// result because the kernel uses a grid-stride loop: each thread starts at its
// global index and advances by the total number of launched threads.
// No shared memory is used.
//
// a, b : inputs, at least n elements each
// c    : output, at least n elements
// n    : number of elements; n <= 0 leaves c untouched
__global__ void vectorAddKernel(const float *a, const float *b, float *c, int n) {
    // Index math is done in 64 bits. With int, `i += stride` can step past
    // INT_MAX when n is large, which is undefined behavior for signed integers
    // and would turn into an out-of-bounds access.
    const long long count = n;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (; i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}

// --- CUDA Kernel: Normalize 4D Vectors ---
// float4 is a struct { float x, y, z, w; } defined in cuda_runtime.h
//
// Writes v_out[i] = v_in[i] / |v_in[i]| for every i in [0, n_vectors).
// Vectors whose squared length is not greater than 1e-9 (including NaN
// lengths, for which the comparison is false) are written as all zeros.
//
// Launch layout: 1D grid of 1D blocks; the grid-stride loop makes any
// grid/block size valid. No shared memory is used.
//
// v_in      : input vectors, at least n_vectors elements
// v_out     : output vectors, at least n_vectors elements
// n_vectors : number of vectors; n_vectors <= 0 leaves v_out untouched
__global__ void normalizeVectorsKernel(const float4 *v_in, float4 *v_out, int n_vectors) {
    // 64-bit index math: with int, `i += stride` can overflow (undefined
    // behavior) once n_vectors approaches INT_MAX.
    const long long count = n_vectors;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (; i < count; i += stride) {
        const float4 vec = v_in[i];
        const float len_sq = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w;

        // Build the result in a local float4 and store it once, instead of
        // writing the four components to global memory one at a time.
        float4 result;
        if (len_sq > 1e-9f) { // Epsilon to avoid division by zero / very small numbers
            const float inv_len = rsqrtf(len_sq); // CUDA intrinsic for 1/sqrt(x)
            result.x = vec.x * inv_len;
            result.y = vec.y * inv_len;
            result.z = vec.z * inv_len;
            result.w = vec.w * inv_len;
        } else {
            result.x = 0.0f;
            result.y = 0.0f;
            result.z = 0.0f;
            result.w = 0.0f;
        }
        v_out[i] = result;
    }
}

// --- Problem 1: Vector Addition ---

// Host reference for Problem 1: fills h_c_cpu with h_a[i] + h_b[i] for
// i in [0, n), reports the wall-clock time of the addition loop, echoes the
// first print_count results and re-checks every element against a fresh sum.
//
// n           : number of elements to add (h_a and h_b are indexed up to n-1)
// h_a, h_b    : input vectors
// h_c_cpu     : output; resized to n, previous contents are discarded
// print_count : how many leading elements are printed
void vector_addition_cpu(int n, const std::vector<float>& h_a, const std::vector<float>& h_b, std::vector<float>& h_c_cpu, int print_count) {
    typedef std::chrono::high_resolution_clock Clock;

    std::cout << "\n--- Vector Addition (CPU) ---\n";
    h_c_cpu.assign(n, 0.0f); // allocate and zero before the timed region

    // Only the addition loop itself sits between the two clock samples.
    const Clock::time_point tic = Clock::now();
    for (int k = 0; k < n; ++k) {
        h_c_cpu[k] = h_a[k] + h_b[k];
    }
    const Clock::time_point toc = Clock::now();

    const std::chrono::duration<double, std::milli> elapsed = toc - tic;
    std::cout << "CPU Execution time: " << elapsed.count() << " ms\n";

    std::cout << "CPU Result (first " << print_count << " elements):\n";
    const int shown = std::min(n, print_count);
    for (int k = 0; k < shown; ++k) {
        std::cout << "h_c_cpu[" << k << "]: " << h_c_cpu[k]
                  << " (Expected: " << h_a[k] + h_b[k] << ")" << std::endl;
    }

    // Scan until the first element that deviates from the recomputed sum.
    bool mismatch = false;
    for (int k = 0; k < n && !mismatch; ++k) {
        const float want = h_a[k] + h_b[k];
        mismatch = std::abs(h_c_cpu[k] - want) > 1e-5;
    }

    if (mismatch) {
        std::cout << "CPU Verification: FAILED\n";
    } else {
        std::cout << "CPU Verification: PASSED\n";
    }
}

// GPU path for Problem 1: copies h_a and h_b to the device, runs
// vectorAddKernel, times the kernel with CUDA events, copies the result into
// h_c_gpu and checks every element against a host-side recomputation.
//
// n           : number of elements to add; must be within [0, min(h_a.size(), h_b.size())]
// h_a, h_b    : host inputs
// h_c_gpu     : host output; resized to n (cleared when n is rejected)
// print_count : how many leading elements are printed
//
// Every CUDA runtime call is checked. On the first failure the error is
// written to stderr, all device buffers and events are released, and the
// function returns; h_c_gpu then holds zeros (or is empty for a rejected n).
void vector_addition_gpu(int n, const std::vector<float>& h_a, const std::vector<float>& h_b, std::vector<float>& h_c_gpu, int print_count) {
    std::cout << "\n--- Vector Addition (GPU) ---\n";

    // Reject sizes the host buffers cannot back; otherwise the H2D copies
    // below would read past the end of h_a / h_b.
    if (n < 0 || static_cast<size_t>(n) > h_a.size() || static_cast<size_t>(n) > h_b.size()) {
        std::cerr << "GPU Vector Addition error: n (" << n
                  << ") is negative or exceeds the input sizes" << std::endl;
        h_c_gpu.clear();
        return;
    }

    const size_t bytes = static_cast<size_t>(n) * sizeof(float);
    h_c_gpu.assign(static_cast<size_t>(n), 0.0f); // Resize and initialize

    // A zero-sized problem would produce a grid of 0 blocks, which is an
    // invalid launch configuration; there is nothing to compute anyway.
    if (n == 0) {
        std::cout << "GPU: n is 0, nothing to compute\n";
        return;
    }

    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    cudaEvent_t start_event = nullptr, stop_event = nullptr;

    // Releases whatever has been acquired so far; handles are nulled so a
    // repeated call is harmless.
    auto cleanup = [&]() {
        if (d_a) { cudaFree(d_a); d_a = nullptr; }
        if (d_b) { cudaFree(d_b); d_b = nullptr; }
        if (d_c) { cudaFree(d_c); d_c = nullptr; }
        if (start_event) { cudaEventDestroy(start_event); start_event = nullptr; }
        if (stop_event) { cudaEventDestroy(stop_event); stop_event = nullptr; }
    };

    // Returns true (after reporting and cleaning up) when `status` is an error.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) return false;
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        cleanup();
        return true;
    };

    if (failed(cudaMalloc(&d_a, bytes), "GPU CUDA Malloc error d_a")) return;
    if (failed(cudaMalloc(&d_b, bytes), "GPU CUDA Malloc error d_b")) return;
    if (failed(cudaMalloc(&d_c, bytes), "GPU CUDA Malloc error d_c")) return;

    if (failed(cudaEventCreate(&start_event), "GPU CUDA EventCreate error")) return;
    if (failed(cudaEventCreate(&stop_event), "GPU CUDA EventCreate error")) return;

    if (failed(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice), "GPU CUDA Memcpy H2D error d_a")) return;
    if (failed(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice), "GPU CUDA Memcpy H2D error d_b")) return;

    const int blockSize = 256;
    // Ceil-division in 64 bits so n + blockSize - 1 cannot overflow int.
    const int gridSize = static_cast<int>((static_cast<long long>(n) + blockSize - 1) / blockSize);

    // START GPU KERNEL TIMING
    if (failed(cudaEventRecord(start_event), "GPU CUDA EventRecord error (start)")) return;
    vectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    if (failed(cudaGetLastError(), "GPU Kernel launch error")) return;
    if (failed(cudaEventRecord(stop_event), "GPU CUDA EventRecord error (stop)")) return;
    // Waits for the kernel to finish. Errors raised while the kernel was
    // running surface here, so report them as execution errors rather than
    // letting the next memcpy take the blame.
    if (failed(cudaEventSynchronize(stop_event), "GPU Kernel execution error")) return;
    // END GPU KERNEL TIMING

    float ms = 0.0f;
    if (failed(cudaEventElapsedTime(&ms, start_event, stop_event), "GPU CUDA EventElapsedTime error")) return;
    std::cout << "GPU Kernel execution time: " << ms << " ms\n"; // PRINT GPU TIME

    if (failed(cudaMemcpy(h_c_gpu.data(), d_c, bytes, cudaMemcpyDeviceToHost), "GPU CUDA Memcpy D2H error d_c")) return;

    std::cout << "GPU Result (first " << print_count << " elements):\n";
    for (int i = 0; i < std::min(n, print_count); ++i) {
        std::cout << "h_c_gpu[" << i << "]: " << h_c_gpu[i] << " (Expected: " << h_a[i] + h_b[i] << ")" << std::endl;
    }

    // Compare every element against the sum recomputed on the host.
    bool ok = true;
    for (int i = 0; i < n; ++i) {
        const float expected = h_a[i] + h_b[i];
        // Written as !(diff <= tol) so a NaN result counts as a failure.
        if (!(std::abs(h_c_gpu[i] - expected) <= 1e-5f)) {
            ok = false;
            break;
        }
    }
    if (ok) std::cout << "GPU Verification: PASSED\n";
    else std::cout << "GPU Verification: FAILED\n";

    cleanup();
}

// Driver for Problem 1: builds two random input vectors of 2^24 floats in
// [0, 1], prints the first few entries, then runs the CPU and GPU versions
// of the addition on the same data.
void run_vector_addition_experiment() {
    std::cout << "\n===== Problem 1: Vector Addition =====\n";

    const int n = 1 << 24;
    const int print_count = 5;

    std::vector<float> h_a(n), h_b(n);
    std::vector<float> h_c_cpu, h_c_gpu; // sized by the callee

    // Seed from the wall clock, so the data differs from run to run.
    srand(static_cast<unsigned int>(time(0)));

    // One rand() draw scaled into [0, 1].
    auto unit_random = []() { return static_cast<float>(rand()) / RAND_MAX; };

    // Draws alternate a, b, a, b, ... so both vectors share one random stream.
    for (int k = 0; k < n; ++k) {
        h_a[k] = unit_random();
        h_b[k] = unit_random();
    }

    std::cout << "Initial Host Data for Vector Addition (first " << print_count << " elements):\n";
    const int shown = std::min(n, print_count);
    for (int k = 0; k < shown; ++k) {
        std::cout << "h_a[" << k << "]: " << h_a[k]
                  << ", h_b[" << k << "]: " << h_b[k] << std::endl;
    }

    vector_addition_cpu(n, h_a, h_b, h_c_cpu, print_count);
    vector_addition_gpu(n, h_a, h_b, h_c_gpu, print_count);
}


// --- Problem 2: Normalize 4D Vectors ---

// Host reference for Problem 2: scales each of the first n_vectors entries
// of h_v_in to unit length and stores the result in h_v_out_cpu. Inputs whose
// squared length is not greater than 1e-9 are written as all zeros.
//
// n_vectors   : number of vectors to process; must be within [0, h_v_in.size()]
// h_v_in      : input vectors
// h_v_out_cpu : output; resized to n_vectors (cleared when n_vectors is rejected)
// print_count : how many leading results are printed
//
// The length check runs over every output vector, not only the printed ones,
// so the PASSED/FAILED line describes the whole result.
void normalize_vectors_cpu(int n_vectors, const std::vector<float4>& h_v_in, std::vector<float4>& h_v_out_cpu, int print_count) {
    std::cout << "\n--- Normalize 4D Vectors (CPU) ---\n";

    // Reject counts the input cannot back; the loops below index h_v_in
    // up to n_vectors - 1.
    if (n_vectors < 0 || static_cast<size_t>(n_vectors) > h_v_in.size()) {
        std::cerr << "CPU Normalize error: n_vectors (" << n_vectors
                  << ") is negative or exceeds the input size" << std::endl;
        h_v_out_cpu.clear();
        return;
    }

    h_v_out_cpu.assign(static_cast<size_t>(n_vectors), {0.0f, 0.0f, 0.0f, 0.0f});

    // START CPU TIMING
    auto start_time = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < n_vectors; ++i) {
        const float4& vec_in = h_v_in[i];
        float len_sq = vec_in.x * vec_in.x + vec_in.y * vec_in.y + vec_in.z * vec_in.z + vec_in.w * vec_in.w;

        if (len_sq > 1e-9f) {
            float inv_len = 1.0f / sqrtf(len_sq);
            h_v_out_cpu[i].x = vec_in.x * inv_len;
            h_v_out_cpu[i].y = vec_in.y * inv_len;
            h_v_out_cpu[i].z = vec_in.z * inv_len;
            h_v_out_cpu[i].w = vec_in.w * inv_len;
        } else {
            h_v_out_cpu[i] = {0.0f, 0.0f, 0.0f, 0.0f};
        }
    }

    auto end_time = std::chrono::high_resolution_clock::now();
    // END CPU TIMING
    std::chrono::duration<double, std::milli> duration_ms = end_time - start_time;
    std::cout << "CPU Execution time: " << duration_ms.count() << " ms\n"; // PRINT CPU TIME

    std::cout << "CPU Result (first " << print_count << " normalized vectors):\n";
    const int shown = std::min(n_vectors, print_count);
    bool all_ok_cpu = true;
    for (int i = 0; i < n_vectors; ++i) {
        const float4& vec_out = h_v_out_cpu[i];
        const float len_sq_out = vec_out.x * vec_out.x + vec_out.y * vec_out.y +
                                 vec_out.z * vec_out.z + vec_out.w * vec_out.w;
        const float len_out = sqrtf(len_sq_out);

        const float4& vec_in = h_v_in[i];
        const float len_sq_in = vec_in.x * vec_in.x + vec_in.y * vec_in.y + vec_in.z * vec_in.z + vec_in.w * vec_in.w;

        // Same threshold as the normalization loop, so both agree on which
        // inputs count as "near zero".
        const bool input_nonzero = len_sq_in > 1e-9f;
        // Written as !(diff <= tol) so a NaN length counts as a failure.
        const bool bad_length = input_nonzero && !(std::abs(len_out - 1.0f) <= 1e-5f);
        const bool bad_zero = !input_nonzero && len_sq_out > 1e-9f;
        if (bad_length || bad_zero) {
            all_ok_cpu = false;
        }

        // Every vector is checked, but only the first `shown` are printed.
        if (i >= shown) {
            continue;
        }

        std::cout << "h_v_out_cpu[" << i << "]: (" << vec_out.x << ", " << vec_out.y
                  << ", " << vec_out.z << ", " << vec_out.w << ")"
                  << " Length: " << len_out;
        if (bad_length) {
            std::cout << " (VERIFICATION FAILED: length not 1.0)";
        } else if (bad_zero) {
            std::cout << " (VERIFICATION FAILED: original near zero, output non-zero)";
        } else if (!input_nonzero) {
            std::cout << " (original near zero)";
        }
        std::cout << std::endl;
    }
    if (all_ok_cpu) std::cout << "CPU Verification: PASSED\n";
    else std::cout << "CPU Verification: FAILED\n";
}


// GPU path for Problem 2: copies h_v_in to the device, runs
// normalizeVectorsKernel, times the kernel with CUDA events, copies the
// result into h_v_out_gpu and checks that every output vector has unit
// length (or is zero where the input was near zero).
//
// n_vectors   : number of vectors to process; must be within [0, h_v_in.size()]
// h_v_in      : host input vectors
// h_v_out_gpu : host output; resized to n_vectors (cleared when n_vectors is rejected)
// print_count : how many leading results are printed
//
// Every CUDA runtime call is checked. On the first failure the error is
// written to stderr, all device buffers and events are released, and the
// function returns; h_v_out_gpu then holds zeros (or is empty for a rejected
// n_vectors).
void normalize_vectors_gpu(int n_vectors, const std::vector<float4>& h_v_in, std::vector<float4>& h_v_out_gpu, int print_count) {
    std::cout << "\n--- Normalize 4D Vectors (GPU) ---\n";

    // Reject counts the input cannot back; otherwise the H2D copy below
    // would read past the end of h_v_in.
    if (n_vectors < 0 || static_cast<size_t>(n_vectors) > h_v_in.size()) {
        std::cerr << "GPU Normalize error: n_vectors (" << n_vectors
                  << ") is negative or exceeds the input size" << std::endl;
        h_v_out_gpu.clear();
        return;
    }

    const size_t bytes_float4 = static_cast<size_t>(n_vectors) * sizeof(float4);
    h_v_out_gpu.assign(static_cast<size_t>(n_vectors), {0.0f, 0.0f, 0.0f, 0.0f});

    // A zero-sized problem would produce a grid of 0 blocks, which is an
    // invalid launch configuration; there is nothing to compute anyway.
    if (n_vectors == 0) {
        std::cout << "GPU: n_vectors is 0, nothing to compute\n";
        return;
    }

    float4 *d_v_in = nullptr, *d_v_out = nullptr;
    cudaEvent_t start_event = nullptr, stop_event = nullptr;

    // Releases whatever has been acquired so far; handles are nulled so a
    // repeated call is harmless.
    auto cleanup = [&]() {
        if (d_v_in) { cudaFree(d_v_in); d_v_in = nullptr; }
        if (d_v_out) { cudaFree(d_v_out); d_v_out = nullptr; }
        if (start_event) { cudaEventDestroy(start_event); start_event = nullptr; }
        if (stop_event) { cudaEventDestroy(stop_event); stop_event = nullptr; }
    };

    // Returns true (after reporting and cleaning up) when `status` is an error.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) return false;
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        cleanup();
        return true;
    };

    if (failed(cudaMalloc(&d_v_in, bytes_float4), "GPU CUDA Malloc error d_v_in")) return;
    if (failed(cudaMalloc(&d_v_out, bytes_float4), "GPU CUDA Malloc error d_v_out")) return;

    if (failed(cudaEventCreate(&start_event), "GPU CUDA EventCreate error")) return;
    if (failed(cudaEventCreate(&stop_event), "GPU CUDA EventCreate error")) return;

    if (failed(cudaMemcpy(d_v_in, h_v_in.data(), bytes_float4, cudaMemcpyHostToDevice), "GPU CUDA Memcpy H2D error d_v_in")) return;

    const int blockSize = 256;
    // Ceil-division in 64 bits so n_vectors + blockSize - 1 cannot overflow int.
    const int gridSize = static_cast<int>((static_cast<long long>(n_vectors) + blockSize - 1) / blockSize);

    // START GPU KERNEL TIMING
    if (failed(cudaEventRecord(start_event), "GPU CUDA EventRecord error (start)")) return;
    normalizeVectorsKernel<<<gridSize, blockSize>>>(d_v_in, d_v_out, n_vectors);
    if (failed(cudaGetLastError(), "GPU Kernel launch error")) return;
    if (failed(cudaEventRecord(stop_event), "GPU CUDA EventRecord error (stop)")) return;
    // Waits for the kernel to finish. Errors raised while the kernel was
    // running surface here, so report them as execution errors rather than
    // letting the next memcpy take the blame.
    if (failed(cudaEventSynchronize(stop_event), "GPU Kernel execution error")) return;
    // END GPU KERNEL TIMING

    float ms = 0.0f;
    if (failed(cudaEventElapsedTime(&ms, start_event, stop_event), "GPU CUDA EventElapsedTime error")) return;
    std::cout << "GPU Kernel execution time: " << ms << " ms\n"; // PRINT GPU TIME

    if (failed(cudaMemcpy(h_v_out_gpu.data(), d_v_out, bytes_float4, cudaMemcpyDeviceToHost), "GPU CUDA Memcpy D2H error d_v_out")) return;

    std::cout << "GPU Result (first " << print_count << " normalized vectors):\n";
    const int shown = std::min(n_vectors, print_count);
    bool all_ok_gpu = true;
    for (int i = 0; i < n_vectors; ++i) {
        const float4& vec_out = h_v_out_gpu[i];
        const float len_sq_out = vec_out.x * vec_out.x + vec_out.y * vec_out.y +
                                 vec_out.z * vec_out.z + vec_out.w * vec_out.w;
        const float len_out = sqrtf(len_sq_out);

        const float4& vec_in = h_v_in[i];
        const float len_sq_in = vec_in.x * vec_in.x + vec_in.y * vec_in.y + vec_in.z * vec_in.z + vec_in.w * vec_in.w;

        // Same threshold the kernel uses to decide which inputs are "near zero".
        const bool input_nonzero = len_sq_in > 1e-9f;
        // Written as !(diff <= tol) so a NaN length counts as a failure.
        const bool bad_length = input_nonzero && !(std::abs(len_out - 1.0f) <= 1e-5f);
        const bool bad_zero = !input_nonzero && len_sq_out > 1e-9f;
        if (bad_length || bad_zero) {
            all_ok_gpu = false;
        }

        // Every vector is checked, but only the first `shown` are printed.
        if (i >= shown) {
            continue;
        }

        std::cout << "h_v_out_gpu[" << i << "]: (" << vec_out.x << ", " << vec_out.y
                  << ", " << vec_out.z << ", " << vec_out.w << ")"
                  << " Length: " << len_out;
        if (bad_length) {
            std::cout << " (VERIFICATION FAILED: length not 1.0)";
        } else if (bad_zero) {
            std::cout << " (VERIFICATION FAILED: original near zero, output non-zero)";
        } else if (!input_nonzero) {
            std::cout << " (original near zero)";
        }
        std::cout << std::endl;
    }
    if (all_ok_gpu) std::cout << "GPU Verification: PASSED\n";
    else std::cout << "GPU Verification: FAILED\n";

    cleanup();
}


// Driver for Problem 2: builds 2^22 random 4D vectors with components in
// [-1, 1], prints the first few, then normalizes the same data on the CPU
// and on the GPU.
void run_normalize_vectors_experiment() {
    std::cout << "\n===== Problem 2: Normalize 4D Vectors =====\n";

    const int n_vectors = 1 << 22;
    const int print_count = 3;

    std::vector<float4> h_v_in(n_vectors);
    std::vector<float4> h_v_out_cpu, h_v_out_gpu; // sized by the callee

    // Clock-based seed, offset by one from the seed used for Problem 1.
    srand(static_cast<unsigned int>(time(0)) + 1);

    // One rand() draw mapped from [0, 1] onto [-1, 1].
    auto signed_unit_random = []() {
        return (static_cast<float>(rand()) / RAND_MAX) * 2.0f - 1.0f;
    };

    // Components are drawn in x, y, z, w order for each vector.
    for (int k = 0; k < n_vectors; ++k) {
        float4& v = h_v_in[k];
        v.x = signed_unit_random();
        v.y = signed_unit_random();
        v.z = signed_unit_random();
        v.w = signed_unit_random();
    }

    std::cout << "Initial Host Data for 4D Vector Normalization (first " << print_count << " vectors):\n";
    const int shown = std::min(n_vectors, print_count);
    for (int k = 0; k < shown; ++k) {
        const float4& v = h_v_in[k];
        std::cout << "h_v_in[" << k << "]: (" << v.x << ", " << v.y
                  << ", " << v.z << ", " << v.w << ")" << std::endl;
    }

    normalize_vectors_cpu(n_vectors, h_v_in, h_v_out_cpu, print_count);
    normalize_vectors_gpu(n_vectors, h_v_in, h_v_out_gpu, print_count);
}


// Entry point: runs both experiments with fixed-point, six-decimal output,
// then resets the CUDA device before exiting.
int main() {
    // Same effect as streaming std::fixed and std::setprecision(6) into cout.
    std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
    std::cout.precision(6);

    run_vector_addition_experiment();
    run_normalize_vectors_experiment();

    cudaDeviceReset();

    return 0;
}


Writing matrix_mul.cu


In [6]:
!nvcc -arch=sm_75 -o matrix_mul matrix_mul.cu -std=c++11

In [7]:
!./matrix_mul



===== Problem 1: Vector Addition =====
Initial Host Data for Vector Addition (first 5 elements):
h_a[0]: 0.437488, h_b[0]: 0.501753
h_a[1]: 0.861631, h_b[1]: 0.011961
h_a[2]: 0.809351, h_b[2]: 0.789040
h_a[3]: 0.985795, h_b[3]: 0.411478
h_a[4]: 0.536040, h_b[4]: 0.723452

--- Vector Addition (CPU) ---
CPU Execution time: 216.277727 ms
CPU Result (first 5 elements):
h_c_cpu[0]: 0.939241 (Expected: 0.939241)
h_c_cpu[1]: 0.873593 (Expected: 0.873593)
h_c_cpu[2]: 1.598390 (Expected: 1.598390)
h_c_cpu[3]: 1.397273 (Expected: 1.397273)
h_c_cpu[4]: 1.259491 (Expected: 1.259491)
CPU Verification: PASSED

--- Vector Addition (GPU) ---
GPU Kernel execution time: 0.946336 ms
GPU Result (first 5 elements):
h_c_gpu[0]: 0.939241 (Expected: 0.939241)
h_c_gpu[1]: 0.873593 (Expected: 0.873593)
h_c_gpu[2]: 1.598390 (Expected: 1.598390)
h_c_gpu[3]: 1.397273 (Expected: 1.397273)
h_c_gpu[4]: 1.259491 (Expected: 1.259491)
GPU Verification: PASSED

===== Problem 2: Normalize 4D Vectors =====
Initial Host Da