<a href="https://colab.research.google.com/github/RaoEhsanElahi/Parallel_Processing/blob/main/matrix_multiplication_cu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-9gl0_yev
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-9gl0_yev
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=acea591b9f66104dcde38286939e22293e34b4bc011673ab6bad8b5cee47a3bd
  Stored in directory: /tmp/pip-ephem-wheel-cache-nmioro7h/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


##Task1
Objective: Implement a CUDA kernel to perform matrix multiplication using shared memory to optimize memory access.
###Requirements:

1. Use a 2D grid of blocks and a 2D arrangement of threads within each block.
2. Utilize shared memory to cache portions of input matrices for faster access.
3. Handle matrices of arbitrary size (consider matrices A, B, and C).
5. Example: Consider two matrices A and B,
where:  
        Matrix A: [1 2 3] [4 5 6] [7 8 9]
        Matrix B: [91 512 52] [−5 21 5] [−21 52 21]
###Tasks :
1. Write a CUDA kernel to perform matrix multiplication using shared memory.
2. Use the provided matrices A and B for testing. (you have to do this on nxn matrix so that is the main task).
3. Compare the results obtained from the GPU with the CPU (serial) implementation to verify correctness.
4. Also compute the results of using global and shared memory and their time difference.
5. Measure and compare the execution time between the GPU and CPU implementations.

In [28]:
%%cu
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuda_profiler_api.h>

__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;

    float Cvalue = 0.0;
    for (int k = 0; k < N; ++k)
        Cvalue += A[Row * N + k] * B[k * N + Col];

    C[Row * N + Col] = Cvalue;
}

void matrixMulCPU(float* A, float* B, float* C, int N) {
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++)
                C[i * N + j] += A[i * N + k] * B[k * N + j];
}

int main() {
    const int N = 4; //size of nxn matrix
    const int size = N * N * sizeof(float);

    float *h_A, *h_B, *h_C_CPU, *h_C_GPU;
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C_CPU = (float*)malloc(size);
    h_C_GPU = (float*)malloc(size);

    // initialize the matrices h_A and h_B
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 100;
        h_B[i] = rand() % 100;
    }
    // Print Matrix A and B
    printf("\nMatrix A:\n");
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%f ", h_A[i * N + j]);
        }
        printf("\n");
    }
    std::cout << std::endl << "Matrix B:" << std::endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%f ", h_B[i * N + j]);
        }
        printf("\n");
    }

    float cpu_time_used = 0.0;
    clock_t start_cpu, end_cpu;
    start_cpu = clock();
    matrixMulCPU(h_A, h_B, h_C_CPU, N);
    end_cpu = clock();
    cpu_time_used = ((float) (end_cpu - start_cpu)) / CLOCKS_PER_SEC;

    std::cout << "\nMatrix C (BY CPU):\n";
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            std::cout << h_C_CPU[i * N + j] << " ";
        }
        std::cout << std::endl;
    }
    printf("\nCPU time: %f s\n", cpu_time_used);

    // Allocate memory on the GPU
    float *d_A, *d_B, *d_C, gpu_time;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 blockDim(16, 16);
    dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (N + blockDim.y - 1) / blockDim.y);

    cudaEventRecord(start);
    matrixMultiplicationKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
    cudaMemcpy(h_C_GPU, d_C, size, cudaMemcpyDeviceToHost);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    std::cout  << "\nMatrix C (BY CUDA):\n";
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            printf("%f ", h_C_GPU[i * N + j]);
        }
        printf("\n");
    }
    cudaEventElapsedTime(&gpu_time, start, stop);
    printf("\nGPU time: %f ms\n", gpu_time);



    // Compare CPU and GPU results
    int i, j, k;
    int difference = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            difference += fabs(h_C_CPU[i * N + j] - h_C_GPU[i * N + j]);
    if (difference > 1e-5) {
        printf("\nCPU and GPU results are different\n");
    } else {
        printf("\nMatrices are equal\n");
    }

    free(h_A);
    free(h_B);
    free(h_C_CPU);
    free(h_C_GPU);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}



Matrix A:
83.000000 77.000000 93.000000 86.000000 
49.000000 62.000000 90.000000 63.000000 
40.000000 72.000000 11.000000 67.000000 
82.000000 62.000000 67.000000 29.000000 

Matrix B:
86.000000 15.000000 35.000000 92.000000 
21.000000 27.000000 59.000000 26.000000 
26.000000 36.000000 68.000000 29.000000 
30.000000 23.000000 35.000000 2.000000 

Matrix C (BY CPU):
13753 8650 16782 12507 
9746 7098 13698 8856 
7248 4481 8741 6005 
10966 5983 12099 11157 

CPU time: 0.000001 s

Matrix C (BY CUDA):
13753.000000 8650.000000 16782.000000 12507.000000 
6535.000000 7152.000000 13388.000000 4577.000000 
4468.000000 4759.000000 8339.000000 2561.000000 
2490.000000 1909.000000 2905.000000 166.000000 

GPU time: 0.243904 ms

CPU and GPU results are different

