In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
!nvidia-smi

Wed Apr 30 05:00:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile vectorAdd1.cu
#include <cstdlib>
#include <iostream>
#include <chrono>

using namespace std;
using namespace chrono;

// Element-wise vector addition kernel: result[i] = a[i] + b[i] for 0 <= i < n.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and then advances by the total number of launched threads, so every
// element is processed no matter how many blocks/threads are launched (even
// <<<1, 1>>>). No shared memory is used.
//
// a, b   : device pointers to at least n readable ints
// result : device pointer to at least n writable ints
// n      : number of elements; n <= 0 makes the kernel a no-op
__global__ void vectorAdd(const int *a, const int *b, int *result, int n) {
    // Guard first: a negative n converted to size_t would become a huge count.
    if (n <= 0) {
        return;
    }

    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x and the
    // stride cannot wrap around for very large grids.
    const size_t count = static_cast<size_t>(n);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride) {
        result[i] = a[i] + b[i];
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step is obvious in the log.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error during " << what << ": "
             << cudaGetErrorString(status) << "\n";
        exit(EXIT_FAILURE);
    }
}

// Adds two random integer vectors on the CPU and on the GPU, prints the first
// few inputs/results, and reports the time taken by each implementation.
// Returns 0 on success, EXIT_FAILURE if the GPU result differs from the CPU
// reference (CUDA API failures terminate the program via checkCuda).
int main() {
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    const int n = 1 << 20;  // Vector length: 2^20 = 1,048,576 elements
    const int preview = 10; // Only the first few elements are printed

    a = new int[n];
    b = new int[n];
    c = new int[n];          // GPU result copied back to the host
    int *d = new int[n];     // CPU reference result
    // Byte count kept in size_t so it cannot overflow for larger n.
    const size_t size = static_cast<size_t>(n) * sizeof(int);
    checkCuda(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    // Array initialization (rand() is left unseeded, so runs are repeatable)
    for (int i = 0; i < n; i++) {
        a[i] = rand() % 1000;
        b[i] = rand() % 1000;
    }

    // Print vectors A and B
    cout << "Vector A is =>\n";
    for (int i = 0; i < preview; i++) {
        cout << a[i] << ", ";
    }
    cout << "...\n\n";

    cout << "Vector B is =>\n";
    for (int i = 0; i < preview; i++) {
        cout << b[i] << ", ";
    }
    cout << "...\n\n";

    // CPU addition timing
    auto cpu_start = high_resolution_clock::now();
    for (int i = 0; i < n; i++) {
        d[i] = a[i] + b[i];
    }
    auto cpu_end = high_resolution_clock::now();
    auto cpu_time = duration_cast<microseconds>(cpu_end - cpu_start).count();

    // GPU setup
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "copy of B to device");

    const int threads = 1024;
    const int blocks = (n + threads - 1) / threads;  // ceil(n / threads)

    // The events bracket only the kernel, so the host<->device copies are
    // not part of the reported GPU time.
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    vectorAdd<<<blocks, threads>>>(a_dev, b_dev, c_dev, n);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float gpu_time = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime");  // GPU time in ms

    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "copy of result to host");

    // Compare the complete GPU result with the CPU reference, not just the
    // handful of values that get printed.
    int mismatch = -1;
    for (int i = 0; i < n; i++) {
        if (c[i] != d[i]) {
            mismatch = i;
            break;
        }
    }

    // Print sums
    cout << "CPU sum is =>\n";
    for (int i = 0; i < preview; i++) {
        cout << d[i] << ", ";
    }
    cout << "...\n\n";

    cout << "GPU sum is =>\n";
    for (int i = 0; i < preview; i++) {
        cout << c[i] << ", ";
    }
    cout << "...\n\n";

    cout << "CPU Time Elapsed: " << (cpu_time / 1000.0) << " milliseconds\n";

    cout << "GPU Time Elapsed: " << gpu_time << " miliseconds\n";

    if (mismatch >= 0) {
        cerr << "Result mismatch at index " << mismatch << ": CPU " << d[mismatch]
             << ", GPU " << c[mismatch] << "\n";
    }

    // Cleanup
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    checkCuda(cudaFree(a_dev), "cudaFree(a_dev)");
    checkCuda(cudaFree(b_dev), "cudaFree(b_dev)");
    checkCuda(cudaFree(c_dev), "cudaFree(c_dev)");

    return mismatch >= 0 ? EXIT_FAILURE : 0;
}


Writing vectorAdd1.cu


In [None]:
!nvcc -arch=sm_75 vectorAdd1.cu -o h

In [None]:
!./h

Vector A is =>
383, 777, 793, 386, 649, 362, 690, 763, 540, 172, ...

Vector B is =>
886, 915, 335, 492, 421, 27, 59, 926, 426, 736, ...

CPU sum is =>
1269, 1692, 1128, 878, 1070, 389, 749, 1689, 966, 908, ...

GPU sum is =>
1269, 1692, 1128, 878, 1070, 389, 749, 1689, 966, 908, ...

CPU Time Elapsed: 4.867 milliseconds
GPU Time Elapsed: 0.10208 miliseconds


In [None]:
%%writefile matrixMul1.cu
#include <cmath>
#include <chrono>
#include <cstdlib>
#include <iostream>

using namespace std;
using namespace chrono;

// Square matrix product kernel: c = a * b for n x n row-major int matrices.
//
// Launch layout: 2-D grid of 2-D blocks; x indexes columns, y indexes rows.
// Each thread starts at its global (row, col) and then steps by the total
// launched extent in each dimension, so every output element is produced even
// when the grid is smaller than the matrix. No shared memory is used.
//
// a, b : device pointers to n*n readable ints (row-major)
// c    : device pointer to n*n writable ints (row-major); it should not
//        overlap a or b, because other threads read those while c is written
// n    : matrix dimension; n <= 0 makes the kernel a no-op
__global__ void matrixMultiplication(const int *a, const int *b, int *c, int n) {
    // Guard first: a negative n converted to size_t would become a huge bound.
    if (n <= 0) {
        return;
    }

    // Offsets are computed in size_t: row * n + col overflows a 32-bit int
    // once n exceeds 46340.
    const size_t dim = static_cast<size_t>(n);
    const size_t rowStride = static_cast<size_t>(gridDim.y) * blockDim.y;
    const size_t colStride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t firstRow = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    const size_t firstCol = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t row = firstRow; row < dim; row += rowStride) {
        for (size_t col = firstCol; col < dim; col += colStride) {
            // Dot product of row `row` of a with column `col` of b.
            int sum = 0;
            for (size_t k = 0; k < dim; k++) {
                sum += a[row * dim + k] * b[k * dim + col];
            }
            c[row * dim + col] = sum;
        }
    }
}

// Stops the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step is obvious in the log.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error during " << what << ": "
             << cudaGetErrorString(status) << "\n";
        exit(EXIT_FAILURE);
    }
}

// Prints `title` on its own line, then the n x n row-major matrix `m` one row
// per line (each value followed by a space), then a blank line.
static void printMatrix(const char *title, const int *m, int n) {
    cout << title << "\n";
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            cout << m[row * n + col] << " ";
        }
        cout << "\n";
    }
    cout << "\n";
}

// Multiplies two random n x n integer matrices on the GPU and on the CPU,
// prints the inputs and both products, and reports the time of each
// implementation. Returns 0 on success, EXIT_FAILURE if the GPU product
// differs from the CPU reference (CUDA API failures terminate the program
// via checkCuda).
int main() {
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    const int n = 100;  // Matrix dimension (matrices are n x n)

    // Element and byte counts kept in size_t so they cannot overflow for larger n.
    const size_t count = static_cast<size_t>(n) * n;
    const size_t size = count * sizeof(int);

    a = new int[count];
    b = new int[count];
    c = new int[count];        // GPU product copied back to the host
    int *d = new int[count];   // CPU reference product

    checkCuda(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    // Initialize matrices (rand() is left unseeded, so runs are repeatable)
    for (size_t i = 0; i < count; i++) {
        a[i] = rand() % 10;
        b[i] = rand() % 10;
    }

    printMatrix("Given matrix A is =>", a, n);
    printMatrix("Given matrix B is =>", b, n);

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "copy of B to device");

    const int tile = 16;  // Threads per block along each dimension
    dim3 threadsPerBlock(tile, tile);
    // ceil(n / tile) blocks per dimension so the grid covers the whole matrix.
    dim3 blocksPerGrid((n + tile - 1) / tile, (n + tile - 1) / tile);

    // The events bracket only the kernel, so the host<->device copies are
    // not part of the reported GPU time.
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(a_dev, b_dev, c_dev, n);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrixMultiplication launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces faults raised while the kernel ran.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float gpu_time_ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpu_time_ms, start, stop), "cudaEventElapsedTime");
    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "copy of result to host");

    // CPU matrix multiplication
    auto cpu_start = high_resolution_clock::now();
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            int sum = 0;
            for (int k = 0; k < n; k++) {
                sum += a[row * n + k] * b[k * n + col];
            }
            d[row * n + col] = sum;
        }
    }
    auto cpu_end = high_resolution_clock::now();
    auto cpu_time = duration_cast<microseconds>(cpu_end - cpu_start).count();

    // Convert CPU time from microseconds to milliseconds
    float cpu_time_ms = cpu_time / 1000.0f;

    // Compare the complete GPU product with the CPU reference.
    bool matches = true;
    size_t mismatch = 0;
    for (size_t i = 0; i < count; i++) {
        if (c[i] != d[i]) {
            matches = false;
            mismatch = i;
            break;
        }
    }

    printMatrix("CPU product is =>", d, n);
    printMatrix("GPU product is =>", c, n);

    // Output CPU and GPU times
    cout << "CPU Time Elapsed: " << cpu_time_ms << " milliseconds\n";
    cout << "GPU Time Elapsed: " << gpu_time_ms << " milliseconds\n";

    if (!matches) {
        cerr << "Result mismatch at row " << (mismatch / n) << ", column " << (mismatch % n)
             << ": CPU " << d[mismatch] << ", GPU " << c[mismatch] << "\n";
    }

    // Cleanup
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    checkCuda(cudaFree(a_dev), "cudaFree(a_dev)");
    checkCuda(cudaFree(b_dev), "cudaFree(b_dev)");
    checkCuda(cudaFree(c_dev), "cudaFree(c_dev)");

    return matches ? 0 : EXIT_FAILURE;
}


Writing matrixMul1.cu


In [None]:
!nvcc -arch=sm_75 matrixMul1.cu -o h

In [None]:
!./h

Given matrix A is =>
3 7 3 6 9 2 0 3 0 2 1 7 2 2 7 9 2 9 3 1 9 1 4 8 5 3 1 6 2 6 5 4 6 6 3 4 2 4 4 3 7 6 8 3 4 2 6 9 6 4 5 4 7 7 7 2 1 6 5 4 0 1 7 1 9 7 7 6 6 9 8 2 3 0 8 0 6 8 6 1 9 4 1 3 4 4 7 3 7 9 2 7 5 4 8 9 5 8 3 8 
6 3 3 6 4 8 9 7 4 0 0 2 4 5 4 9 2 7 5 8 2 9 6 0 1 5 1 8 0 4 2 8 2 4 2 0 2 9 8 3 1 3 0 9 9 9 3 0 6 4 0 6 6 5 9 7 8 9 6 2 6 3 1 9 1 9 0 5 7 4 0 2 6 0 2 2 5 2 0 8 8 4 9 9 2 4 9 3 0 0 9 3 1 4 1 6 4 2 4 2 
8 2 8 6 3 3 3 0 7 8 0 8 9 3 3 3 6 2 5 7 6 4 0 8 0 6 4 9 9 8 0 7 9 5 9 5 4 9 5 3 7 8 9 7 2 3 9 2 1 6 1 0 3 1 0 6 7 0 4 4 5 2 0 6 6 8 6 7 1 1 7 2 4 2 2 0 9 5 0 7 8 0 6 6 9 5 7 5 3 3 9 7 7 1 0 8 5 4 7 3 
0 7 9 2 3 1 2 2 7 1 4 7 1 7 4 8 1 6 1 6 8 8 0 2 7 6 6 7 7 9 7 6 8 3 4 5 1 5 9 3 5 2 7 3 6 6 3 4 9 2 8 0 4 6 7 3 3 5 0 7 3 0 0 1 3 9 4 5 8 5 5 9 7 3 6 5 6 0 1 2 9 0 2 4 3 8 3 0 3 9 7 2 2 4 8 0 9 2 1 3 
2 4 1 5 1 9 1 3 7 8 7 4 4 1 8 2 9 6 6 9 0 9 1 8 6 7 7 2 1 0 0 0 3 4 1 0 2 7 6 4 2 7 4 6 7 5 2 3 4 9 2 1 3 2 5 5 0 4 6 2 8 5 6 8 7 2 0 8 5 7 8 3 7 7 9 1 0 9 8 3 0 9 1 7 7 2 1 8