<a href="https://colab.research.google.com/github/WMadaraChamudini/PC_Assignment3/blob/main/CUDA/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Matrix Multiplication-IT23292154-PC Assignment 03**
##**CUDA Code**

In [1]:
%%writefile cuda_mat_mul.cu
// Compile: nvcc -O3 -arch=sm_75 cuda_mat_mul.cu -o cuda_mat_mul
// Run: ./cuda_mat_mul N_A_row N_A_col N_B_row N_B_col blockSize

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>      //for runtime API functions

//macro for CUDA error checking
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
void check(cudaError_t err, const char* const func, const char* const file, const int line){
  if (err!=cudaSuccess){
    fprintf (stderr, "CUDA Error at %s:%d in function %s: %s\n", file, line, func, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

float* alloc_mat(int rows, int cols){
  float *m = (float*) malloc((size_t)rows*cols*sizeof(float));
  if (!m){
    perror("malloc");
    exit(1);
  }
  return m;
}

void init_ex(float *A, float *B, int N_A_row, int N_A_col, int N_B_row, int N_B_col){
  //initialize Matrix A
  for (long i=0; i<(long)N_A_row*N_A_col; i++){
    A[i] = (float)((i/N_A_col)+(i%N_A_col)+1);
  }
  //initialize Matrix B
  for (long i=0; i<(long)N_B_row*N_B_col; i++){
    B[i] = (float)( ((i/N_B_col)+1) * ((i%N_B_col)+2));
  }
}

void print_mat(const char *name, float *M, int rows, int cols){
  printf("%s:\n", name);
  for (int i=0; i<rows; i++){
    for (int j=0; j<cols; j++){
      printf("%8.2f ", M[(long)i*cols+j]);
    }
    printf("\n");
  }
}

//CUDA kernel for matrix multiplication
__global__ void matmul_kernel(float *A, float *B, float *C, int N_A_row, int N_A_col, int N_B_row, int N_B_col){
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  if (row < N_A_row && col < N_B_col){
    float sum = 0.0f;
    for (int k=0; k<N_A_col; k++){
      sum += A[(long)row*N_A_col+k] * B[(long)k*N_B_col+col];
    }
    C[(long)row*N_B_col + col]=sum;
  }
}

int main(int argc, char **argv){
  if (argc<6){
    printf ("Usage: %s N_A_row N_A_col N_B_row N_B_col blockSize\n", argv[0]);
    return 1;
  }

  int N_A_row = atoi(argv[1]);
  int N_A_col = atoi(argv[2]);
  int N_B_row = atoi(argv[3]);
  int N_B_col = atoi(argv[4]);
  int blockSize = atoi(argv[5]);

  if (N_A_row<=0 || N_A_col<=0 || N_B_row<=0 || N_B_col<=0 || blockSize<=0){
    printf("Invalid arguments. All dimensions and block size must be positive.\n");
    return 1;
  }

  if (N_A_col != N_B_row){
    printf("Error: N_A_col must be equal to N_B_row for matrix multiplication.\n");
    return 1;
  }

  //host matrices allocation
  float *A = alloc_mat(N_A_row, N_A_col);
  float *B = alloc_mat(N_B_row, N_B_col);
  float *C = alloc_mat(N_A_row, N_B_col);   //Result matrix C will have dimensions N_A_row x N_B_col

  init_ex(A,B,N_A_row, N_A_col, N_B_row, N_B_col);

  //initialize host C to zeros(to debugging comparison in kernel failures)
  for (long i=0; i<(long)N_A_row*N_B_col; i++){
    C[i] = 0.0f;
  }

  //device memory allocation
  float *d_A, *d_B, *d_C;
  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_A, (size_t)N_A_row*N_A_col*sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_B, (size_t)N_B_row*N_B_col*sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void**)&d_C, (size_t)N_A_row*N_B_col*sizeof(float)));

  // Copy data -> device
  CHECK_CUDA_ERROR(cudaMemcpy(d_A, A, (size_t)N_A_row*N_A_col*sizeof(float), cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(d_B, B, (size_t)N_B_row*N_B_col*sizeof(float), cudaMemcpyHostToDevice));
  //didn't copy host C to device <-- d_C is written to device by kernel

  //configure grid and block
  dim3 dimBlock(blockSize, blockSize);
  dim3 dimGrid( (N_B_col+blockSize-1)/blockSize, (N_A_row+blockSize-1)/blockSize);   //grid dimensions based on C (N_A_row x N_B_col)

  //CUDA events for timing
  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));
  CHECK_CUDA_ERROR(cudaEventRecord(start));

  // Launch kernel
  matmul_kernel <<<dimGrid, dimBlock>>> (d_A, d_B, d_C, N_A_row, N_A_col, N_B_row, N_B_col);
  CHECK_CUDA_ERROR(cudaGetLastError());   //check for errors from kernel launch

  //wait for kernel to finish
  CHECK_CUDA_ERROR(cudaDeviceSynchronize());

  CHECK_CUDA_ERROR(cudaEventRecord(stop));
  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));

  float milliseconds = 0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime (&milliseconds, start, stop));
  float seconds = milliseconds/1000.0f; //convert milliseconds to seconds

  //copy result back to host
  CHECK_CUDA_ERROR(cudaMemcpy(C, d_C, (size_t)N_A_row*N_B_col*sizeof(float), cudaMemcpyDeviceToHost));

  //print matrices(when N_A_row and N_B_col are smaller than 10)
  if (N_A_row<=10 && N_A_col<=10 && N_B_row<=10 && N_B_col<=10){
    print_mat("Matrix A", A,N_A_row, N_A_col);
    printf("\n");
    print_mat("Matrix B", B,N_B_row, N_B_col);
    printf("\n");
    print_mat("Result Matrix C = A * B", C,N_A_row, N_B_col);
    printf("\n");
  }

  //compute checksum and row sums
  float checksum = 0.0f;
  printf("Summary:\n");
  printf("Matrix A size: %d x %d \n", N_A_row, N_A_col);
  printf("Matrix B size: %d x %d \n", N_B_row, N_B_col);
  printf("Result Matrix C size: %d x %d \n", N_A_row, N_B_col);
  printf("Execution time(seconds): %.6f\n", seconds);
  for(int i=0; i<N_A_row; i++){                //iterate over rows of C
    float rowSum = 0.0f;
    for (int j=0; j<N_B_col; j++){             //iterate over columns of C
      rowSum += C[(long)i*N_B_col + j];
    }
    if (N_A_row <= 50 && N_B_col <= 50) {   // Only print row sums if result matrix dimensions < 50
      printf(" Row %2d sum = %.2f\n", i,rowSum);
    }
    checksum += rowSum;
  }
  printf("Checksum (sum of all elements) = %.6e\n", checksum);

  //free memory
  free(A);
  free(B);
  free(C);
  CHECK_CUDA_ERROR(cudaFree(d_A));
  CHECK_CUDA_ERROR(cudaFree(d_B));
  CHECK_CUDA_ERROR(cudaFree(d_C));
  CHECK_CUDA_ERROR(cudaEventDestroy(start));
  CHECK_CUDA_ERROR(cudaEventDestroy(stop));

  return 0;
}

Writing cuda_mat_mul.cu


In [2]:
!nvcc -O3 -arch=sm_75 cuda_mat_mul.cu -o cuda_mat_mul

In [3]:
!./cuda_mat_mul 4 3 3 5 4

Matrix A:
    1.00     2.00     3.00 
    2.00     3.00     4.00 
    3.00     4.00     5.00 
    4.00     5.00     6.00 

Matrix B:
    2.00     3.00     4.00     5.00     6.00 
    4.00     6.00     8.00    10.00    12.00 
    6.00     9.00    12.00    15.00    18.00 

Result Matrix C = A * B:
   28.00    42.00    56.00    70.00    84.00 
   40.00    60.00    80.00   100.00   120.00 
   52.00    78.00   104.00   130.00   156.00 
   64.00    96.00   128.00   160.00   192.00 

Summary:
Matrix A size: 4 x 3 
Matrix B size: 3 x 5 
Result Matrix C size: 4 x 5 
Execution time(seconds): 0.000172
 Row  0 sum = 280.00
 Row  1 sum = 400.00
 Row  2 sum = 520.00
 Row  3 sum = 640.00
Checksum (sum of all elements) = 1.840000e+03


In [4]:
!./cuda_mat_mul 40 4000 4000 40 4

Summary:
Matrix A size: 40 x 4000 
Matrix B size: 4000 x 40 
Result Matrix C size: 40 x 40 
Execution time(seconds): 0.000300
 Row  0 sum = 18353544495104.00
 Row  1 sum = 18360423153664.00
 Row  2 sum = 18367306006528.00
 Row  3 sum = 18374188859392.00
 Row  4 sum = 18381069615104.00
 Row  5 sum = 18387950370816.00
 Row  6 sum = 18394833223680.00
 Row  7 sum = 18401713979392.00
 Row  8 sum = 18408594735104.00
 Row  9 sum = 18415479685120.00
 Row 10 sum = 18422360440832.00
 Row 11 sum = 18429243293696.00
 Row 12 sum = 18436124049408.00
 Row 13 sum = 18443004805120.00
 Row 14 sum = 18449891852288.00
 Row 15 sum = 18456770510848.00
 Row 16 sum = 18463655460864.00
 Row 17 sum = 18470529925120.00
 Row 18 sum = 18477416972288.00
 Row 19 sum = 18484295630848.00
 Row 20 sum = 18491182678016.00
 Row 21 sum = 18498061336576.00
 Row 22 sum = 18504946286592.00
 Row 23 sum = 18511824945152.00
 Row 24 sum = 18518711992320.00
 Row 25 sum = 18525586456576.00
 Row 26 sum = 18532473503744.00
 Row 27 su

In [5]:
!./cuda_mat_mul 40 4000 4000 40 8

Summary:
Matrix A size: 40 x 4000 
Matrix B size: 4000 x 40 
Result Matrix C size: 40 x 40 
Execution time(seconds): 0.000231
 Row  0 sum = 18353544495104.00
 Row  1 sum = 18360423153664.00
 Row  2 sum = 18367306006528.00
 Row  3 sum = 18374188859392.00
 Row  4 sum = 18381069615104.00
 Row  5 sum = 18387950370816.00
 Row  6 sum = 18394833223680.00
 Row  7 sum = 18401713979392.00
 Row  8 sum = 18408594735104.00
 Row  9 sum = 18415479685120.00
 Row 10 sum = 18422360440832.00
 Row 11 sum = 18429243293696.00
 Row 12 sum = 18436124049408.00
 Row 13 sum = 18443004805120.00
 Row 14 sum = 18449891852288.00
 Row 15 sum = 18456770510848.00
 Row 16 sum = 18463655460864.00
 Row 17 sum = 18470529925120.00
 Row 18 sum = 18477416972288.00
 Row 19 sum = 18484295630848.00
 Row 20 sum = 18491182678016.00
 Row 21 sum = 18498061336576.00
 Row 22 sum = 18504946286592.00
 Row 23 sum = 18511824945152.00
 Row 24 sum = 18518711992320.00
 Row 25 sum = 18525586456576.00
 Row 26 sum = 18532473503744.00
 Row 27 su

In [6]:
!./cuda_mat_mul 4000 400 400 4000 1

Summary:
Matrix A size: 4000 x 400 
Matrix B size: 400 x 4000 
Result Matrix C size: 4000 x 4000 
Execution time(seconds): 0.767903
Checksum (sum of all elements) = 5.821111e+18


In [7]:
!./cuda_mat_mul 4000 400 400 4000 2

Summary:
Matrix A size: 4000 x 400 
Matrix B size: 400 x 4000 
Result Matrix C size: 4000 x 4000 
Execution time(seconds): 0.305794
Checksum (sum of all elements) = 5.821111e+18


In [8]:
!./cuda_mat_mul 4000 400 400 4000 4

Summary:
Matrix A size: 4000 x 400 
Matrix B size: 400 x 4000 
Result Matrix C size: 4000 x 4000 
Execution time(seconds): 0.139503
Checksum (sum of all elements) = 5.821111e+18


In [9]:
!./cuda_mat_mul 4000 400 400 4000 8

Summary:
Matrix A size: 4000 x 400 
Matrix B size: 400 x 4000 
Result Matrix C size: 4000 x 4000 
Execution time(seconds): 0.071058
Checksum (sum of all elements) = 5.821111e+18
