In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!ls /usr/local

bin    cuda	cuda-12.5	  etc	 include  libexec     man  sbin   src
colab  cuda-12	dist_metrics.pxd  games  lib	  LICENSE.md  opt  share


In [3]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [4]:
!nvidia-smi

Mon May 12 08:59:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [20]:
%%writefile matrix_mul.cu

#include <algorithm> // For std::min
#include <cmath>   // For sqrtf()
#include <cstdlib> // For rand(), srand()
#include <ctime>   // For time()
#include <iomanip> // For std::fixed, std::setprecision
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// --- Vector Addition ---
// Element-wise sum: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread starts at its global index and advances by the total number of
// launched threads (gridDim.x * blockDim.x) until it passes n. No shared
// memory is used.
//
// a, b : device-accessible inputs holding at least n floats.
// c    : device-accessible output holding at least n floats.
// n    : element count; n <= 0 makes the kernel a no-op.
__global__ void vectorAddKernel(const float *a, const float *b, float *c, int n) {
    if (n <= 0) {
        return;
    }

    // Index math is done in size_t: blockIdx.x * blockDim.x can wrap in 32-bit
    // arithmetic on large grids, and adding the stride to a signed int index
    // close to INT_MAX overflows.
    const size_t count = static_cast<size_t>(n);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}

// Host driver for the vector-addition demo.
//
// Fills two host arrays of 2^24 floats with rand()-derived values in [0, 1],
// copies them to the device, runs vectorAddKernel, times the kernel with CUDA
// events, copies the result back, prints the first few elements and compares
// every element against the sum computed on the host.
//
// Every CUDA runtime call is checked. On the first failure the error is
// written to std::cerr, all device buffers and events acquired so far are
// released, and the function returns without running the remaining steps.
void vector_addition() {
    std::cout << "\n--- Vector Addition ---\n";
    const int n = 1 << 24; // 2^24 elements
    const size_t bytes = n * sizeof(float);
    const int print_count = 5; // Number of elements to print

    std::vector<float> h_a(n);
    std::vector<float> h_b(n);
    std::vector<float> h_c(n); // For GPU result

    srand(static_cast<unsigned int>(time(0)));
    for (int i = 0; i < n; ++i) {
        h_a[i] = static_cast<float>(rand()) / RAND_MAX;
        h_b[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    std::cout << "Initial Host Data (first " << print_count << " elements):\n";
    for (int i = 0; i < std::min(n, print_count); ++i) {
        std::cout << "h_a[" << i << "]: " << h_a[i] << ", h_b[" << i << "]: " << h_b[i] << std::endl;
    }

    // Null-initialised so the cleanup below is safe no matter how far we got.
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;

    // Releases every device resource this function may own. cudaFree accepts a
    // null pointer; events are only destroyed once they have been created.
    auto release = [&]() {
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
    };

    // Returns true on success; otherwise reports the error and cleans up so
    // the caller can simply `return`.
    auto succeeded = [&](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        release();
        return false;
    };

    if (!succeeded(cudaMalloc(&d_a, bytes), "CUDA Malloc error d_a")) return;
    if (!succeeded(cudaMalloc(&d_b, bytes), "CUDA Malloc error d_b")) return;
    if (!succeeded(cudaMalloc(&d_c, bytes), "CUDA Malloc error d_c")) return;

    if (!succeeded(cudaEventCreate(&start), "CUDA event create error start")) return;
    if (!succeeded(cudaEventCreate(&stop), "CUDA event create error stop")) return;

    // Copy H2D
    if (!succeeded(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice),
                   "CUDA Memcpy H2D error d_a")) return;
    if (!succeeded(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice),
                   "CUDA Memcpy H2D error d_b")) return;

    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)

    // Kernel launch, bracketed by events so only the kernel is timed.
    if (!succeeded(cudaEventRecord(start), "CUDA event record error start")) return;
    vectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    if (!succeeded(cudaGetLastError(), "Kernel launch error")) return;
    if (!succeeded(cudaEventRecord(stop), "CUDA event record error stop")) return;
    // Wait for the kernel to finish so the elapsed time is meaningful.
    if (!succeeded(cudaEventSynchronize(stop), "CUDA event synchronize error")) return;

    float ms = 0;
    if (!succeeded(cudaEventElapsedTime(&ms, start, stop), "CUDA event elapsed time error")) return;
    std::cout << "GPU Kernel execution time: " << ms << " ms\n";

    // Copy D2H
    if (!succeeded(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost),
                   "CUDA Memcpy D2H error d_c")) return;

    std::cout << "GPU Result (first " << print_count << " elements):\n";
    for (int i = 0; i < std::min(n, print_count); ++i) {
        std::cout << "h_c[" << i << "]: " << h_c[i] << " (Expected: " << h_a[i] + h_b[i] << ")" << std::endl;
    }

    // Full verification: stop at the first element that differs from the host sum.
    bool ok = true;
    for (int i = 0; i < n; ++i) {
        const float expected = h_a[i] + h_b[i];
        if (std::abs(h_c[i] - expected) > 1e-5) {
            ok = false;
            break;
        }
    }
    if (ok) std::cout << "Basic Verification: PASSED\n";
    else std::cout << "Basic Verification: FAILED\n";

    release();
}

// --- Normalize 4D Vectors ---
// Scales each 4-component vector to unit length: v_out[i] = v_in[i] / |v_in[i]|.
// Vectors whose squared length is not greater than 1e-9 are written as
// (0, 0, 0, 0) rather than divided by a near-zero length.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread starts at its global index and advances by the total number of
// launched threads (gridDim.x * blockDim.x). No shared memory is used.
//
// v_in      : device-accessible input holding at least n_vectors float4 values.
// v_out     : device-accessible output holding at least n_vectors float4 values.
// n_vectors : vector count; n_vectors <= 0 makes the kernel a no-op.
__global__ void normalizeVectorsKernel(const float4 *v_in, float4 *v_out, int n_vectors) {
    if (n_vectors <= 0) {
        return;
    }

    // Index math is done in size_t: blockIdx.x * blockDim.x can wrap in 32-bit
    // arithmetic on large grids, and adding the stride to a signed int index
    // close to INT_MAX overflows.
    const size_t count = static_cast<size_t>(n_vectors);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        const float4 vec = v_in[i];
        const float len_sq = vec.x * vec.x + vec.y * vec.y + vec.z * vec.z + vec.w * vec.w;

        // Build the result in registers and store it as one float4 instead of
        // four separate component writes.
        float4 result = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        if (len_sq > 1e-9f) {
            const float inv_len = rsqrtf(len_sq);
            result = make_float4(vec.x * inv_len, vec.y * inv_len,
                                 vec.z * inv_len, vec.w * inv_len);
        }
        v_out[i] = result;
    }
}

// Host driver for the 4D-vector normalisation demo.
//
// Fills 2^22 float4 vectors with rand()-derived components in [-1, 1], copies
// them to the device, runs normalizeVectorsKernel, times the kernel with CUDA
// events, copies the result back and prints the first few normalised vectors
// together with their lengths. Only the printed vectors are verified: each
// must have length 1.0 within 1e-5 unless its squared length is at most 1e-9.
//
// Every CUDA runtime call is checked. On the first failure the error is
// written to std::cerr, all device buffers and events acquired so far are
// released, and the function returns without running the remaining steps.
void normalize_vectors() {
    std::cout << "\n--- Normalize 4D Vectors ---\n";
    const int n_vectors = 1 << 22; // 2^22 4D vectors
    const size_t bytes_float4 = n_vectors * sizeof(float4);
    const int print_count = 3; // Number of vectors to print

    std::vector<float4> h_v_in(n_vectors);
    std::vector<float4> h_v_out(n_vectors);

    // Offset the seed by one so the data differs from a run seeded with
    // time(0) alone within the same second.
    srand(static_cast<unsigned int>(time(0)) + 1);
    for (int i = 0; i < n_vectors; ++i) {
        h_v_in[i].x = (static_cast<float>(rand()) / RAND_MAX) * 2.0f - 1.0f;
        h_v_in[i].y = (static_cast<float>(rand()) / RAND_MAX) * 2.0f - 1.0f;
        h_v_in[i].z = (static_cast<float>(rand()) / RAND_MAX) * 2.0f - 1.0f;
        h_v_in[i].w = (static_cast<float>(rand()) / RAND_MAX) * 2.0f - 1.0f;
    }

    std::cout << "Initial Host Data (first " << print_count << " vectors):\n";
    for (int i = 0; i < std::min(n_vectors, print_count); ++i) {
        std::cout << "h_v_in[" << i << "]: (" << h_v_in[i].x << ", " << h_v_in[i].y
                  << ", " << h_v_in[i].z << ", " << h_v_in[i].w << ")" << std::endl;
    }

    // Null-initialised so the cleanup below is safe no matter how far we got.
    float4 *d_v_in = nullptr, *d_v_out = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;

    // Releases every device resource this function may own. cudaFree accepts a
    // null pointer; events are only destroyed once they have been created.
    auto release = [&]() {
        cudaFree(d_v_in);
        cudaFree(d_v_out);
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
    };

    // Returns true on success; otherwise reports the error and cleans up so
    // the caller can simply `return`.
    auto succeeded = [&](cudaError_t status, const char *what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        release();
        return false;
    };

    if (!succeeded(cudaMalloc(&d_v_in, bytes_float4), "CUDA Malloc error d_v_in")) return;
    if (!succeeded(cudaMalloc(&d_v_out, bytes_float4), "CUDA Malloc error d_v_out")) return;

    if (!succeeded(cudaEventCreate(&start), "CUDA event create error start")) return;
    if (!succeeded(cudaEventCreate(&stop), "CUDA event create error stop")) return;

    // Copy H2D
    if (!succeeded(cudaMemcpy(d_v_in, h_v_in.data(), bytes_float4, cudaMemcpyHostToDevice),
                   "CUDA Memcpy H2D error d_v_in")) return;

    const int blockSize = 256;
    const int gridSize = (n_vectors + blockSize - 1) / blockSize; // ceil(n_vectors / blockSize)

    // Kernel launch, bracketed by events so only the kernel is timed.
    if (!succeeded(cudaEventRecord(start), "CUDA event record error start")) return;
    normalizeVectorsKernel<<<gridSize, blockSize>>>(d_v_in, d_v_out, n_vectors);
    if (!succeeded(cudaGetLastError(), "Kernel launch error")) return;
    if (!succeeded(cudaEventRecord(stop), "CUDA event record error stop")) return;
    // Wait for the kernel to finish so the elapsed time is meaningful.
    if (!succeeded(cudaEventSynchronize(stop), "CUDA event synchronize error")) return;

    float ms = 0;
    if (!succeeded(cudaEventElapsedTime(&ms, start, stop), "CUDA event elapsed time error")) return;
    std::cout << "GPU Kernel execution time: " << ms << " ms\n";

    // Copy D2H
    if (!succeeded(cudaMemcpy(h_v_out.data(), d_v_out, bytes_float4, cudaMemcpyDeviceToHost),
                   "CUDA Memcpy D2H error d_v_out")) return;

    std::cout << "GPU Result (first " << print_count << " normalized vectors):\n";
    bool all_ok = true;
    for (int i = 0; i < std::min(n_vectors, print_count); ++i) {
        const float4 out_vec = h_v_out[i];
        const float len_sq = out_vec.x * out_vec.x + out_vec.y * out_vec.y +
                             out_vec.z * out_vec.z + out_vec.w * out_vec.w;
        const float len = sqrtf(len_sq);
        std::cout << "h_v_out[" << i << "]: (" << out_vec.x << ", " << out_vec.y
                  << ", " << out_vec.z << ", " << out_vec.w << ")"
                  << " Length: " << len;
        if (len_sq > 1e-9f) {
            if (std::abs(len - 1.0f) > 1e-5) {
                std::cout << " (VERIFICATION FAILED: length not 1.0)";
                all_ok = false;
            }
        } else {
            // The kernel writes zeros for near-zero inputs, so a zero-length
            // output is not counted as a failure.
            std::cout << " (original likely zero)";
        }
        std::cout << std::endl;
    }
    if (all_ok) std::cout << "Basic Verification: PASSED\n";
    else std::cout << "Basic Verification: FAILED (at least one printed vector's length was not 1.0)\n";

    release();
}

// Program entry point: switches std::cout to fixed-point notation with six
// fractional digits (enough to read individual vector components), then runs
// the vector-addition demo followed by the normalisation demo.
int main() {
    std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
    std::cout.precision(6);

    vector_addition();
    normalize_vectors();

    return 0;
}

Overwriting matrix_mul.cu


In [21]:
!nvcc -arch=sm_75 -o matrix_mul matrix_mul.cu

In [22]:
!./matrix_mul



--- Problem 1: Vector Addition ---
Problem 1: Initial Host Data (first 5 elements):
h_a[0]: 0.074234, h_b[0]: 0.097864
h_a[1]: 0.825736, h_b[1]: 0.231200
h_a[2]: 0.782525, h_b[2]: 0.998622
h_a[3]: 0.394453, h_b[3]: 0.087803
h_a[4]: 0.187307, h_b[4]: 0.104967
GPU Kernel execution time: 0.845376 ms
Problem 1: GPU Result (first 5 elements):
h_c[0]: 0.172098 (Expected: 0.172098)
h_c[1]: 1.056937 (Expected: 1.056937)
h_c[2]: 1.781147 (Expected: 1.781147)
h_c[3]: 0.482257 (Expected: 0.482257)
h_c[4]: 0.292274 (Expected: 0.292274)
Basic Verification: PASSED

--- Problem 2: Normalize 4D Vectors ---
Problem 2: Initial Host Data (first 3 vectors):
h_v_in[0]: (0.590030, 0.856545, 0.872543, 0.752562)
h_v_in[1]: (-0.688787, -0.107003, 0.244758, 0.367581)
h_v_in[2]: (-0.388621, -0.437134, 0.303062, 0.543584)
GPU Kernel execution time: 0.590944 ms
Problem 2: GPU Result (first 3 normalized vectors):
h_v_out[0]: (0.380112, 0.551808, 0.562115, 0.484819) Length: 1.000000
h_v_out[1]: (-0.834725, -0.12967