<a href="https://colab.research.google.com/github/SiddhiNKabra/LP-5/blob/main/assign4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%writefile assign4.cu
#include <cuda_runtime.h>
#include <bits/stdc++.h>

using namespace std;
using namespace std::chrono;

// Kernel: C = A x B for square size-by-size int matrices stored row-major.
// Expects a 2D launch; each thread maps its (x, y) global coordinate to one
// (col, row) output element and computes that single dot product.
//   A, B : input matrices (size*size ints each)
//   C    : output matrix (size*size ints)
//   size : number of rows (== number of columns)
__global__ void multiply(int *A, int *B, int *C, int size) {
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // threads whose coordinate lies outside the matrix have nothing to do
    if (row >= size || col >= size) {
        return;
    }

    const int *aRow = A + row * size;  // first element of row `row` of A
    const int *bCol = B + col;         // first element of column `col` of B

    int acc = 0;
    for (int k = 0; k < size; ++k) {
        acc += aRow[k] * bCol[k * size];  // A[row][k] * B[k][col]
    }
    C[row * size + col] = acc;
}


// optional: sequential matrix multiplication
// CPU reference: C = A x B for square size-by-size int matrices (row-major).
//   A, B : input matrices (size*size ints each)
//   C    : output matrix, cleared and then filled element by element
//   size : number of rows (== number of columns)
void seqMultiply(int *A, int *B, int *C, int size) {
    // clear the whole result buffer up front
    std::fill_n(C, size * size, 0);

    for (int r = 0; r < size; ++r) {
        const int *aRow = A + r * size;  // row r of A
        int *outRow = C + r * size;      // row r of C
        for (int c = 0; c < size; ++c) {
            int acc = 0;
            for (int k = 0; k < size; ++k) {
                acc += aRow[k] * B[k * size + c];
            }
            outRow[c] = acc;
        }
    }
}


// Fills a size-by-size int matrix with values rand() % 10, one rand() call
// per element, in storage order.
//   matrix : buffer of at least size*size ints
//   size   : number of rows (== number of columns)
void initialize(int *matrix, int size) {
    int *cursor = matrix;
    int remaining = size * size;
    while (remaining-- > 0) {
        *cursor++ = rand() % 10;
    }
}


// Writes a size-by-size int matrix to stdout: one row per line, every element
// followed by a single space, and a blank line after the last row.
//   matrix : row-major buffer of size*size ints
//   size   : number of rows (== number of columns)
void print(int *matrix, int size) {
    const int *rowStart = matrix;
    for (int r = 0; r < size; ++r, rowStart += size) {
        for (int c = 0; c < size; ++c) {
            cout << rowStart[c] << " ";
        }
        cout << endl;
    }
    cout << endl;
}


// Demo driver: multiplies two random N x N int matrices on the GPU, prints
// the inputs and the product, and reports the elapsed wall-clock time of the
// kernel launch plus the device-to-host copy of the result.
int main() {

    int N = 3;
    int matrixSize = N * N;
    size_t matrixBytes = matrixSize * sizeof(int);

    // Abort with a readable message as soon as any CUDA runtime call fails.
    // Without this, a failed allocation/copy/launch goes unnoticed and the
    // program prints whatever happens to be in the host result buffer.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<endl;
            exit(EXIT_FAILURE);
        }
    };

    // host matrices
    int *A, *B, *C;
    A = new int[matrixSize];
    B = new int[matrixSize];
    C = new int[matrixSize];

    initialize(A, N);
    initialize(B, N);

    cout<<"Matrix A: \n";
    print(A, N);
    cout<<"Matrix B: \n";
    print(B, N);

    // device matrices: X and Y are the inputs, Z receives the product
    int *X, *Y, *Z;
    check(cudaMalloc(&X, matrixBytes), "cudaMalloc X");
    check(cudaMalloc(&Y, matrixBytes), "cudaMalloc Y");
    check(cudaMalloc(&Z, matrixBytes), "cudaMalloc Z");

    check(cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice), "copy A to device");
    check(cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice), "copy B to device");

    // threads per block dimension
    int THREADS = 3;

    // blocks per grid dimension, rounded up so the grid still covers the whole
    // matrix when N is not a multiple of THREADS (the kernel bounds-checks the
    // surplus threads)
    int BLOCKS = (N + THREADS - 1) / THREADS;

    dim3 threads(THREADS, THREADS);  // THREADS x THREADS threads per block
    dim3 blocks(BLOCKS, BLOCKS);     // BLOCKS x BLOCKS blocks in the grid

    // launch kernel; the launch itself returns immediately, so the timed region
    // ends only after the blocking copy of the result back to the host
    auto start = high_resolution_clock::now();
    multiply<<<blocks, threads>>>(X, Y, Z, N);
    check(cudaGetLastError(), "kernel launch");  // reports invalid launch configurations
    check(cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost), "copy result to host");
    auto stop = high_resolution_clock::now();

    auto duration = duration_cast<microseconds>(stop-start);

    cout<<"Multiplication A x B: \n";
    print(C, N);
    cout<<"Time: "<<duration.count()<<" microseconds"<<endl;

    // release host and device memory
    delete[] A; delete[] B; delete[] C;
    check(cudaFree(X), "cudaFree X");
    check(cudaFree(Y), "cudaFree Y");
    check(cudaFree(Z), "cudaFree Z");

    return 0;
}

Writing assign4.cu


In [3]:
!nvidia-smi

Mon May  5 16:01:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
!nvcc -arch=sm_75 assign4.cu

In [5]:
!./a.out

Matrix A: 
3 6 7 
5 3 5 
6 2 9 

Matrix B: 
1 2 7 
0 9 3 
6 0 6 

Multiplication A x B: 
45 60 81 
35 37 74 
60 30 102 

Time: 139 microseconds


In [6]:
%%writefile add_sub.cu
#include <cuda_runtime.h>
#include <bits/stdc++.h>

using namespace std;
using namespace std::chrono;

// a function that runs on the GPU is declared with __global__
// Kernel: element-wise vector sum, C[i] = A[i] + B[i].
// Expects a 1D launch; each thread handles the single element whose index
// equals its global thread id, and threads past the end of the vectors exit.
//   A, B : input vectors (size ints each)
//   C    : output vector (size ints)
//   size : number of elements
__global__ void add(int *A, int *B, int *C, int size) {
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    // surplus threads in the last block have no element to process
    if (id >= size) {
        return;
    }

    const int lhs = A[id];
    const int rhs = B[id];
    C[id] = lhs + rhs;
}

// optional: sequential vector addition
// CPU reference: element-wise vector sum, C[i] = A[i] + B[i] for i in [0, size).
//   A, B : input vectors (size ints each)
//   C    : output vector (size ints)
//   size : number of elements; nothing is written when size <= 0
void seqAdd(int *A, int *B, int *C, int size) {
    const int *lhs = A;
    const int *rhs = B;
    int *out = C;
    for (int remaining = size; remaining > 0; --remaining) {
        *out++ = *lhs++ + *rhs++;
    }
}

// fills vector with random integers (0 to 9).
// Stores rand() % 10 into each of the first `size` elements, calling rand()
// once per element in index order.
//   vector : buffer of at least `size` ints
//   size   : number of elements to fill
void initialize(int *vector, int size) {
    int *cursor = vector;
    int remaining = size;
    while (remaining-- > 0) {
        *cursor++ = rand() % 10;
    }
}

// Writes the first `size` elements to stdout on one line, each followed by a
// single space, then ends the line.
//   vector : buffer of at least `size` ints
//   size   : number of elements to print
void print(int *vector, int size) {
    const int *cursor = vector;
    int remaining = size;
    while (remaining-- > 0) {
        cout << *cursor++ << " ";
    }
    cout << endl;
}


// Demo driver: adds two random int vectors on the GPU, prints the inputs and
// the sum, and reports the elapsed wall-clock time of the kernel launch plus
// the device-to-host copy of the result.
int main() {

    int vectorSize = 4;
    size_t vectorBytes = vectorSize * sizeof(int);

    // Abort with a readable message as soon as any CUDA runtime call fails.
    // Without this, a failed allocation/copy/launch goes unnoticed and the
    // program prints whatever happens to be in the host result buffer.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<endl;
            exit(EXIT_FAILURE);
        }
    };

    // host vectors
    int *A, *B, *C;
    A = new int[vectorSize];
    B = new int[vectorSize];
    C = new int[vectorSize];

    initialize(A, vectorSize);
    initialize(B, vectorSize);

    cout<<"\nVector A: ";
    print(A, vectorSize);
    cout<<"\nVector B: ";
    print(B, vectorSize);

    // allocate memory on GPU (device) for vectors X, Y, and Z
    int *X, *Y, *Z;
    check(cudaMalloc(&X, vectorBytes), "cudaMalloc X");
    check(cudaMalloc(&Y, vectorBytes), "cudaMalloc Y");
    check(cudaMalloc(&Z, vectorBytes), "cudaMalloc Z");

    // copy host data (A, B) to device memory (X, Y)
    check(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "copy A to device");
    check(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "copy B to device");

    // round the block count up so every element is covered; the kernel
    // bounds-checks the surplus threads of the last block
    int threadsPerBlock = 256;
    int blocksPerGrid = (vectorSize + threadsPerBlock - 1) / threadsPerBlock;

    // launch add() on GPU
    // each thread adds one pair of elements from X & Y
    // the launch returns immediately, so the timed region ends only after the
    // blocking copy of the result back to the host
    auto start = high_resolution_clock::now();
    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, vectorSize);
    check(cudaGetLastError(), "kernel launch");  // reports invalid launch configurations
    check(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost), "copy result to host");  // transfer result vector Z from GPU to CPU (C)
    auto stop = high_resolution_clock::now();

    auto duration = duration_cast<microseconds>(stop-start);

    cout<<"\nAddition: ";
    print(C, vectorSize);
    cout<<"\nTime: "<<duration.count()<<" microseconds"<<endl;

    // free all memory on host and device
    // (the host buffers come from new[], so they must be released with delete[],
    // not free())
    delete[] A; delete[] B; delete[] C;
    check(cudaFree(X), "cudaFree X");
    check(cudaFree(Y), "cudaFree Y");
    check(cudaFree(Z), "cudaFree Z");

    return 0;
}

Writing add_sub.cu


In [7]:
!nvidia-smi

Mon May  5 16:05:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
!nvcc -arch=sm_75 add_sub.cu

In [10]:
!./a.out


Vector A: 3 6 7 5 

Vector B: 3 5 6 2 

Addition: 6 11 13 7 

Time: 105 microseconds
