<a href="https://colab.research.google.com/github/Shrutika-TechSavvy/Google-Colab-Codes/blob/main/Final_Matrix_Multiplication_CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
%%writefile matrix_multiplication.cu

#include<stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

//The kernel function
__global__ void matrix_mul(int *A, int *B, int *C, int N){
  int row = threadIdx.x;
  int col = threadIdx.y;
  int sum = 0;

  if(row < N && col < N){
    for(int i = 0; i < N; i++){
      sum += A[row * N + i] * B[i * N + col];
    }
    C[row * N + col] = sum;
  }
}

int main(){
  int N;
  printf("\nEnter the N : ");
  scanf("%d" , &N);

  // Get GPU device properties
  cudaDeviceProp prop;
  int device;
  cudaGetDevice(&device);
  cudaGetDeviceProperties(&prop, device);

  // Check if N*N exceeds max threads per block
  if (N * N > prop.maxThreadsPerBlock) {
    printf("Error: N*N (%d) exceeds max threads per block (%d)\n", N*N, prop.maxThreadsPerBlock);
    return -1;
  }

  int* h_A = (int*)malloc(N * N * sizeof(int));
  int* h_B = (int*)malloc(N * N * sizeof(int));
  int* h_C = (int*)malloc(N * N * sizeof(int)) ;

  printf("\nEnter the elements of the matrix A : \n");
  for(int i=0; i<N * N; i++){
    scanf("%d" , &h_A[i]);
  }

  printf("\nEnter the elements of the matrix B : \n");
  for(int i = 0; i < N * N; i++)
  {
    scanf("%d" , &h_B[i]);
  }

  //Device memory pointers
  int* d_A , *d_B , *d_C;

  //Allocate memory on the device
  cudaMalloc(&d_A , N * N * sizeof(int));
  cudaMalloc(&d_B , N * N * sizeof(int));
  cudaMalloc(&d_C , N * N * sizeof(int));

  //Transferring the data from host to the device
  cudaMemcpy(d_A , h_A , N * N * sizeof(int) , cudaMemcpyHostToDevice);
  cudaMemcpy(d_B , h_B , N * N * sizeof(int) , cudaMemcpyHostToDevice);
  cudaMemcpy(d_C , h_C , N * N * sizeof(int) , cudaMemcpyHostToDevice);

  //Launching the kernel
  dim3 dimGrid(1, 1);
  dim3 dimBlock(N, N); // Block size is N x N

  //Calculating the time
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  matrix_mul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);

  //Transfer the data from the device to the host
  cudaMemcpy(h_C , d_C , N * N * sizeof(int) , cudaMemcpyDeviceToHost);

  //Free device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  //PRINT THE RESULT
  printf("The matrix multiplication result is : \n");
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      printf("%d " , h_C[i * N + j]);
    }
    printf("\n");
  }

  printf("The time required for the execution: %f milliseconds\n", milliseconds);

  free(h_A);
  free(h_B);
  free(h_C);

  return 0;
}


Overwriting matrix_multiplication.cu


In [8]:
!nvcc -arch=sm_75 matrix_multiplication.cu -o ma
!./ma


Enter the N : 1078
Error: N*N (1162084) exceeds max threads per block (1024)
