<a href="https://colab.research.google.com/github/Tawfique07/Parallel-Processing-And-Distributed-System-Lab/blob/main/matrixCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ttp169ll
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ttp169ll
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4716 sha256=628dab7add33bbe3db241f27cae90d2e08fab6039c9c40938873f96265651feb
  Stored in directory: /tmp/pip-ephem-wheel-cache-1hho9y28/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content

In [2]:
%%cu
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>

#include <cuda_runtime.h>

const int M = 2;  // Rows of A
const int N = 3;  // Columns of A and Rows of B
const int P = 4;  // Columns of B
const int K = 3;  // Number of matrices

// Computes one element of C = A * B per thread, with all three matrices stored
// row-major in flat arrays: A is m x n, B is n x p, C is m x p.
//
// Launch layout: a 2D grid of 2D blocks where the x dimension indexes columns
// of C and the y dimension indexes rows of C. Threads that fall outside the
// m x p output do nothing, so the grid may overshoot the matrix size.
__global__ void matrixMultiplication(int *A, int *B, int *C, int m, int n, int p) {
    const int outCol = threadIdx.x + blockDim.x * blockIdx.x;
    const int outRow = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard against the partial blocks at the right/bottom edge of the grid.
    if (outRow >= m || outCol >= p) {
        return;
    }

    // Dot product of row `outRow` of A with column `outCol` of B.
    const int aRowBase = outRow * n;
    int acc = 0;
    for (int inner = 0; inner < n; ++inner) {
        acc += A[aRowBase + inner] * B[inner * p + outCol];
    }

    C[outRow * p + outCol] = acc;
}

// Writes a labelled, tab-separated dump of a row-major rows x cols matrix to
// std::cout: first "<name>:", then one line per row (each value followed by a
// tab), then a trailing blank line.
void printMatrix(int *matrix, int rows, int cols, const char *name) {
    std::cout << name << ":\n";

    for (int r = 0; r < rows; ++r) {
        // Offset of the first element of row r in the flat array.
        const int rowBase = r * cols;
        int c = 0;
        while (c < cols) {
            std::cout << matrix[rowBase + c] << "\t";
            ++c;
        }
        std::cout << "\n";
    }

    std::cout << "\n";
}

// Fills a row-major rows x cols matrix with pseudo-random digits in [0, 9],
// drawing one rand() value per element in row-major order. The caller is
// responsible for seeding the generator (srand) beforehand.
void initializeRandomMatrix(int *matrix, int rows, int cols) {
    int r = 0;
    while (r < rows) {
        const int rowBase = r * cols;
        for (int c = 0; c < cols; ++c) {
            const int digit = rand() % 10;  // Adjust the range as needed
            matrix[rowBase + c] = digit;
        }
        ++r;
    }
}

// Terminates the program with a diagnostic on stderr if a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) did not return cudaSuccess.
// `what` names the operation that was attempted.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Multiplies K independent pairs of random matrices (A_k: M x N, B_k: N x P)
// on the GPU and prints the inputs and the K resulting M x P products.
// All K matrices of each kind are packed back to back in one flat buffer.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main() {
    srand(static_cast<unsigned int>(time(NULL)));  // Seed for random number generation

    // Element counts of a single matrix of each kind, and total byte sizes of
    // the packed buffers (computed once, in size_t, instead of at every call).
    const int elemsA = M * N;
    const int elemsB = N * P;
    const int elemsC = M * P;
    const size_t bytesA = static_cast<size_t>(K) * elemsA * sizeof(int);
    const size_t bytesB = static_cast<size_t>(K) * elemsB * sizeof(int);
    const size_t bytesC = static_cast<size_t>(K) * elemsC * sizeof(int);

    // Allocate host memory
    int *h_A = static_cast<int *>(malloc(bytesA));
    int *h_B = static_cast<int *>(malloc(bytesB));
    int *h_C = static_cast<int *>(malloc(bytesC));
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        std::cerr << "Host memory allocation failed" << std::endl;
        free(h_A);  // free(NULL) is a no-op, so partial allocations are safe here
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize host matrices A and B with random data
    for (int k = 0; k < K; ++k) {
        initializeRandomMatrix(h_A + k * elemsA, M, N);
        initializeRandomMatrix(h_B + k * elemsB, N, P);
    }

    // Print input matrices
    for (int k = 0; k < K; ++k) {
        printMatrix(h_A + k * elemsA, M, N, ("Matrix A" + std::to_string(k)).c_str());
        printMatrix(h_B + k * elemsB, N, P, ("Matrix B" + std::to_string(k)).c_str());
    }

    // Allocate device memory
    int *d_A = NULL;
    int *d_B = NULL;
    int *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_A, bytesA), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void **)&d_B, bytesB), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void **)&d_C, bytesC), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice), "copy B to device");

    // Define thread block and grid dimensions. The grid is sized with a
    // ceiling division so it covers all P columns (x) and M rows (y); the
    // kernel's bounds check discards the surplus threads.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((P + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch the matrix multiplication kernel for each pair of matrices
    for (int k = 0; k < K; ++k) {
        matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(
            d_A + k * elemsA, d_B + k * elemsB, d_C + k * elemsC, M, N, P);
        // A launch does not return a status itself; configuration errors are
        // only visible through cudaGetLastError().
        checkCuda(cudaGetLastError(), "matrixMultiplication launch");
    }

    // Copy the result back to the host. This blocking copy also waits for the
    // kernels above, so errors raised while they executed are reported here.
    checkCuda(cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost), "copy C to host");

    // Print output matrices
    for (int k = 0; k < K; ++k) {
        printMatrix(h_C + k * elemsC, M, P, ("Result Matrix" + std::to_string(k)).c_str());
    }

    // Free device and host memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


Matrix A0:
8	3	3	
1	3	9	

Matrix B0:
0	5	3	8	
6	2	7	4	
9	6	6	6	

Matrix A1:
9	4	6	
8	7	7	

Matrix B1:
3	8	4	9	
8	6	6	6	
1	1	9	4	

Matrix A2:
1	9	1	
6	9	7	

Matrix B2:
8	9	4	7	
5	2	5	6	
6	3	6	3	

Result Matrix0:
45	64	63	94	
99	65	78	74	

Result Matrix1:
65	102	114	129	
87	113	137	142	

Result Matrix2:
59	30	55	64	
135	93	111	117	


