<a href="https://colab.research.google.com/github/Shrutika-TechSavvy/Google-Colab-Codes/blob/main/Matrix_Multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile matrix_mul_colab.cu
#include <stdio.h>
#include <cuda.h>

#define N 3

// Computes C = A * B for square n x n int matrices stored row-major in
// flat arrays. Each thread derives one (row, column) position from its
// 2D block/thread indices and writes that single element of C; threads
// whose position lies outside the matrix exit without touching memory.
__global__ void matMul(int *A, int *B, int *C, int n) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Nothing to compute for positions past the matrix edge.
    if (r >= n || c >= n) return;

    const int *aRow = A + r * n;  // start of row r in A
    const int *bCol = B + c;      // top of column c in B (stride n)

    int acc = 0;
    int k = 0;
    while (k < n) {
        acc += aRow[k] * bCol[k * n];
        ++k;
    }
    C[r * n + c] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Fills two N x N int matrices on the host, multiplies them on the GPU with
// matMul, and prints the product row by row.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    const size_t size = N * N * sizeof(int);
    int h_A[N*N], h_B[N*N], h_C[N*N];

    // Initialize matrices
    for (int i = 0; i < N*N; i++) {
        h_A[i] = i + 1;
        h_B[i] = (i + 1) * 2;
    }

    // Start as null so the unconditional cudaFree calls below are safe even
    // when an allocation fails part-way through.
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    bool ok = cudaOk(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
           && cudaOk(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
           && cudaOk(cudaMalloc(&d_C, size), "cudaMalloc(d_C)")
           && cudaOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Fixed-size tiles with a ceil-divided grid: a single N x N block
        // would stop launching once N * N exceeds the per-block thread
        // limit. The kernel's own bounds check handles the partial tiles.
        const int tile = 16;
        dim3 threads(tile, tile);
        dim3 blocks((N + tile - 1) / tile, (N + tile - 1) / tile);
        matMul<<<blocks, threads>>>(d_A, d_B, d_C, N);

        // A launch does not return a status itself: configuration errors
        // show up in cudaGetLastError, execution faults at the next sync.
        ok = cudaOk(cudaGetLastError(), "matMul launch")
          && cudaOk(cudaDeviceSynchronize(), "matMul execution")
          && cudaOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Only print when h_C actually holds the result.
    if (ok) {
        printf("Result matrix C:\n");
        for (int i = 0; i < N*N; ++i) {
            printf("%d ", h_C[i]);
            if ((i+1) % N == 0) printf("\n");
        }
    }

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return ok ? 0 : 1;
}


Writing matrix_mul_colab.cu


In [None]:
!nvcc -arch=sm_75 matrix_mul_colab.cu -o matrix_mul_colab
!./matrix_mul_colab


Result matrix C:
60 72 84 
132 162 192 
204 252 300 


In [None]:
!./matmul

The program with hardcoded data is below.

In [1]:
%%writefile matrix_mul_colab1.cu
#include <stdio.h>
#include <cuda.h>

#define N 2   // Fixed matrix size 2x2

// Computes C = A * B for N x N int matrices stored row-major in flat
// arrays, where N is the file-level macro.
// Launch layout: one block of at least N x N threads; the thread at
// (threadIdx.y, threadIdx.x) writes element (row, col) of C. Only threadIdx
// is consulted, so additional blocks would just recompute the same values.
__global__ void matMulKernel(int *A, int *B, int *C) {
    int row = threadIdx.y;
    int col = threadIdx.x;

    // Guard against a block larger than the matrix: without it such a
    // launch would read and write past the end of the N*N buffers.
    if (row >= N || col >= N) return;

    int sum = 0;
    for (int k = 0; k < N; k++) {
        sum += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Multiplies two hardcoded N x N int matrices on the GPU with matMulKernel
// and prints the product row by row.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    // The kernel is launched as a single N x N block, so N * N must stay
    // within the 1024 threads-per-block limit.
    static_assert(N * N <= 1024, "N x N must fit in one thread block");

    int h_A[N*N] = {1, 2,
                    3, 4};  // Matrix A
    int h_B[N*N] = {5, 6,
                    7, 8};  // Matrix B
    int h_C[N*N];           // Output Matrix C

    const size_t size = N * N * sizeof(int);

    // Device memory pointers; null-initialised so the unconditional cudaFree
    // calls below are safe even when an allocation fails part-way through.
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    bool ok = cudaOk(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
           && cudaOk(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
           && cudaOk(cudaMalloc(&d_C, size), "cudaMalloc(d_C)")
           && cudaOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        dim3 threads(N, N);  // 2x2 threads -> 4 threads total
        matMulKernel<<<1, threads>>>(d_A, d_B, d_C);

        // A launch does not return a status itself: configuration errors
        // show up in cudaGetLastError; the blocking copy back to the host
        // then surfaces any fault raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matMulKernel launch")
          && cudaOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Only print when h_C actually holds the result.
    if (ok) {
        printf("Result Matrix C:\n");
        for (int i = 0; i < N*N; i++) {
            printf("%d ", h_C[i]);
            if ((i + 1) % N == 0) printf("\n");
        }
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}


Writing matrix_mul_colab1.cu


In [2]:
!nvcc -arch=sm_75 matrix_mul_colab1.cu -o matrix_mul_colab1
!./matrix_mul_colab1


Result Matrix C:
19 22 
43 50 
