<a href="https://colab.research.google.com/github/Shrutika-TechSavvy/Google-Colab-Codes/blob/main/Matrix_Multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile matrix_mul_colab.cu
#include <stdio.h>
#include <cuda.h>

#define N 3

// Computes C = A x B for n x n int matrices stored row-major in flat arrays.
// Intended for a 2D launch: the x dimension selects the output column and the
// y dimension selects the output row. Each thread whose (row, column) lies
// inside the matrix writes exactly one element of C; all other threads return
// without reading or writing anything.
__global__ void matMul(int *A, int *B, int *C, int n) {
    const int c = threadIdx.x + blockDim.x * blockIdx.x;
    const int r = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard: the launched grid may cover more threads than matrix elements.
    if (r >= n || c >= n)
        return;

    // Dot product of row r of A with column c of B.
    int acc = 0;
    int k = 0;
    while (k < n) {
        acc += A[r * n + k] * B[k * n + c];
        ++k;
    }
    C[r * n + c] = acc;
}

// Evaluates a CUDA runtime call and, on failure, prints the error string with
// the source location to stderr and makes main() return 1. Only usable inside
// main() because it expands to a `return`. Allocations still outstanding at
// that point are left for process exit to reclaim.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Multiplies two fixed N x N matrices on the GPU and prints the product.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main() {
    const size_t size = (size_t)N * N * sizeof(int);
    int h_A[N*N], h_B[N*N], h_C[N*N];

    // Initialize matrices: A = 1, 2, 3, ... and B = 2, 4, 6, ... (row-major)
    for (int i = 0; i < N*N; i++) {
        h_A[i] = i + 1;
        h_B[i] = (i + 1) * 2;
    }

    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Tile the matrix with 16x16 blocks and round the grid up so the launch
    // stays valid if N is raised beyond what a single block can hold; the
    // kernel's bounds check discards the surplus threads.
    dim3 threads(16, 16);
    dim3 blocks((N + threads.x - 1) / threads.x,
                (N + threads.y - 1) / threads.y);
    matMul<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    printf("Result matrix C:\n");
    for (int i = 0; i < N*N; ++i) {
        printf("%d ", h_C[i]);
        if ((i+1) % N == 0) printf("\n");   // end of a matrix row
    }

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}


Writing matrix_mul_colab.cu


In [None]:
!nvcc -arch=sm_75 matrix_mul_colab.cu -o matrix_mul_colab
!./matrix_mul_colab


Result matrix C:
60 72 84 
132 162 192 
204 252 300 


In [None]:
!./matmul

In [2]:
%%writefile matrix_mul_colab1.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

// GPU kernel for square matrix multiplication: C = A x B.
// A, B and C are n x n int matrices laid out row-major in flat arrays.
// Expects a 2D launch in which y indexes output rows and x indexes output
// columns; one thread produces one element of C, and threads that fall
// outside the n x n range do nothing.
__global__ void matMul(int *A, int *B, int *C, int n)
{
    const int outRow = threadIdx.y + blockIdx.y * blockDim.y;
    const int outCol = threadIdx.x + blockIdx.x * blockDim.x;

    const bool inside = (outRow < n) && (outCol < n);
    if (!inside)
        return;   // surplus thread from a rounded-up grid

    // Accumulate the dot product of A's row `outRow` and B's column `outCol`.
    int dot = 0;
    for (int step = 0; step < n; ++step) {
        const int lhs = A[outRow * n + step];
        const int rhs = B[step * n + outCol];
        dot += lhs * rhs;
    }

    C[outRow * n + outCol] = dot;
}

// Evaluates a CUDA runtime call and, on failure, prints the error string with
// the source location to stderr and makes main() return 1. Only usable inside
// main() because it expands to a `return`. Allocations still outstanding at
// that point are left for process exit to reclaim.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Reads `count` whitespace-separated integers from stdin into dst.
// Returns 1 on success, 0 if input ends early or a token is not an integer.
static int readMatrix(int *dst, int count)
{
    for (int i = 0; i < count; i++) {
        if (scanf("%d", &dst[i]) != 1)
            return 0;
    }
    return 1;
}

// Reads an N x N pair of int matrices from stdin, multiplies them on the CPU
// and on the GPU, prints the GPU result and both timings, and warns on stderr
// if the two results disagree.
// Returns 0 on success; 1 on invalid input, allocation failure, CUDA error,
// or a CPU/GPU mismatch.
int main()
{
    // Largest N for which N*N still fits in the int indices used by matMul
    // and by the loops below (46340^2 < 2^31 - 1).
    const int kMaxN = 46340;

    int N;
    printf("Enter matrix size N for NxN matrix: ");
    if (scanf("%d", &N) != 1 || N <= 0 || N > kMaxN) {
        fprintf(stderr, "Invalid matrix size: expected an integer in [1, %d]\n", kMaxN);
        return 1;
    }

    // Byte count in size_t: N*N*sizeof(int) can exceed INT_MAX well before N*N does.
    const size_t size = (size_t)N * N * sizeof(int);

    // Dynamic allocation on CPU
    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(size);
    int *h_C = (int *)malloc(size);
    int *h_C_CPU = (int *)malloc(size);
    if (!h_A || !h_B || !h_C || !h_C_CPU) {
        fprintf(stderr, "Host allocation failed (%zu bytes per matrix)\n", size);
        free(h_A); free(h_B); free(h_C); free(h_C_CPU);   // free(NULL) is a no-op
        return 1;
    }

    // User input
    printf("\nEnter %d elements for Matrix A:\n", N*N);
    int inputOk = readMatrix(h_A, N*N);

    if (inputOk) {
        printf("\nEnter %d elements for Matrix B:\n", N*N);
        inputOk = readMatrix(h_B, N*N);
    }

    if (!inputOk) {
        fprintf(stderr, "Failed to read matrix elements from input\n");
        free(h_A); free(h_B); free(h_C); free(h_C_CPU);
        return 1;
    }

    // Measure CPU execution time
    clock_t startCPU = clock();

    // CPU matrix multiplication (reference result)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            int sum = 0;
            for (int k = 0; k < N; k++) {
                sum += h_A[i*N + k] * h_B[k*N + j];
            }
            h_C_CPU[i*N + j] = sum;
        }
    }

    clock_t endCPU = clock();
    double cpu_time = ((double)(endCPU - startCPU) / CLOCKS_PER_SEC) * 1000.0;

    // GPU Memory allocation
    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Query GPU hardware specifications (device 0)
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("\nGPU Streaming Multiprocessors (SMs): %d\n", prop.multiProcessorCount);

    // Block and grid configuration: 16x16 tiles, grid rounded up to cover N
    dim3 threads(16, 16);
    dim3 blocks((N + threads.x - 1) / threads.x,
                (N + threads.y - 1) / threads.y);

    printf("Threads per Block: %d x %d = %d\n", threads.x, threads.y, threads.x*threads.y);
    printf("Blocks in Grid: %d x %d\n", blocks.x, blocks.y);

    // GPU timing variables
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Start recording GPU time
    CUDA_CHECK(cudaEventRecord(start));

    // Launch kernel
    matMul<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());   // launch-configuration errors

    // Stop GPU timing; synchronizing on `stop` also waits for the kernel
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float gpu_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start, stop));

    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Display output
    printf("\nResult Matrix from GPU (C = A x B):\n");
    for (int i = 0; i < N*N; i++) {
        printf("%d ", h_C[i]);
        if ((i+1) % N == 0) printf("\n");   // end of a matrix row
    }

    // Show timing results
    printf("\nCPU Execution Time: %.4f ms\n", cpu_time);
    printf("GPU Execution Time: %.4f ms\n", gpu_time);

    // Cross-check the GPU result against the CPU reference. Reported on
    // stderr only, so stdout is unchanged when the results agree.
    int mismatches = 0;
    for (int i = 0; i < N*N; i++) {
        if (h_C[i] != h_C_CPU[i])
            mismatches++;
    }
    if (mismatches != 0) {
        fprintf(stderr, "WARNING: GPU result differs from CPU reference in %d element(s)\n",
                mismatches);
    }

    // Release timing events, then host and device memory
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    free(h_A); free(h_B); free(h_C); free(h_C_CPU);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return mismatches == 0 ? 0 : 1;
}


Writing matrix_mul_colab1.cu


In [5]:
!nvcc -arch=sm_75 matrix_mul_colab1.cu -o matrix_mul_colab1
!./matrix_mul_colab1


Enter matrix size N for NxN matrix: 10

Enter 100 elements for Matrix A:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100

Enter 100 elements for Matrix B:
2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200

GPU Streaming Multiprocessors (SMs): 40
Threads per Block: 16 x 16 = 256
Blocks in Grid: 1 x 1

Result Matrix from GPU (C = A x B):
6710 6820 6930 7040 7150 7260 7370 7480 7590 7700 
15910 16220 16530 16840 17150 17460 17770 18080 18390 18700 
25110 256