# **ASSIGNMENT 4: MATRIX MULTIPLICATION**
**1] Install the NVIDIA CUDA toolkit**

In [None]:
# Refresh the apt package index so the install below sees current package lists.
!apt-get update
# Install the CUDA toolkit package non-interactively (-y), skipping the
# optional "recommended" packages.
!apt-get install -y --no-install-recommends nvidia-cuda-toolkit

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [673 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,065 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease

**2] Set environment variables**

In [None]:
import os

# A "!export ..." line runs in its own short-lived subshell, so the variable is
# gone as soon as that line finishes. Updating this process's environment
# instead makes the change visible to every later "!" command in the notebook.
os.environ["PATH"] = "/usr/local/cuda/bin/:" + os.environ.get("PATH", "")
os.environ["LD_LIBRARY_PATH"] = "/usr/local/cuda/lib64/:" + os.environ.get("LD_LIBRARY_PATH", "")

**3] Sequential code**

In [None]:
%%writefile matrix_multiplication.cpp
#include <iostream>
#include <cstdlib>
#include <ctime>

#define N 1000

// Sequential CPU matrix product: C = A * B.
// All three buffers are indexed as row-major N x N arrays; every element of C
// is overwritten with the dot product of one row of A and one column of B.
void matMulSequential(int* A, int* B, int* C) {
    for (int row = 0; row < N; ++row) {
        const int* aRow = A + row * N;   // start of the current row of A
        int* cRow = C + row * N;         // start of the matching row of C

        for (int col = 0; col < N; ++col) {
            const int* bCol = B + col;   // top of the current column of B
            int acc = 0;

            // Walk along the row of A and down the column of B (stride N).
            for (int k = 0; k < N; ++k) {
                acc += aRow[k] * bCol[k * N];
            }

            cRow[col] = acc;
        }
    }
}

// Fill a size x size row-major matrix with pseudo-random digits.
// Elements are written in row-major order, each taking the next rand() value
// reduced modulo 10 (so 0..9). Nothing is written when size <= 0.
void initMatrix(int* mat, int size) {
    for (int row = 0; row < size; ++row) {
        int* rowStart = mat + row * size;
        for (int col = 0; col < size; ++col) {
            rowStart[col] = rand() % 10;
        }
    }
}

// Placeholder for result verification. The body is empty, so calling it does
// nothing; main() in this file does not call it.
// Presumably A and B are the input matrices and C the product to verify
// (inferred from the parameter names only) -- TODO confirm when implementing.
void compareResults(int* A, int* B, int* C) {
    // TODO: implement the comparison logic here
}

int main() {
    // Number of elements in each N x N matrix.
    const int elementCount = N * N;

    // Host buffers: the two operands and the product.
    int* A = new int[elementCount];
    int* B = new int[elementCount];
    int* C = new int[elementCount];

    // Fill both operands with random digits.
    initMatrix(A, N);
    initMatrix(B, N);

    // Time only the multiplication itself, using processor-time ticks.
    const clock_t ticksBefore = clock();
    matMulSequential(A, B, C);
    const clock_t ticksAfter = clock();

    // Convert the tick difference to milliseconds.
    const double elapsedMs = double(ticksAfter - ticksBefore) / CLOCKS_PER_SEC * 1000.0;
    std::cout << "Time for sequential matrix multiplication: " << elapsedMs << " ms\n";

    // No comparison against the CUDA version is performed here;
    // compareResults is still an unimplemented stub.

    // Release the host buffers.
    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}


Writing matrix_multiplication.cpp


**4] Compile and Run**

In [None]:
# Build the sequential C++ program with g++ (no optimisation flags are passed).
!g++ matrix_multiplication.cpp -o matrix_multiplication
# Run it; the program prints the elapsed multiplication time in milliseconds.
!./matrix_multiplication

Time for sequential matrix multiplication: 3688.37 ms


**5] CUDA code**

In [None]:
%%writefile matrix_multiplication.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define N 1000
#define THREADS_PER_BLOCK 16

// Matrix multiplication kernel: C = A * B for row-major N x N int matrices.
//
// Launch layout: 2D grid of 2D blocks. The x dimension selects the column and
// the y dimension selects the row; each thread computes one element of C.
// The grid must supply at least N threads in x and in y; surplus threads are
// masked off by the bounds check, so N need not be a multiple of the block
// size. No shared memory is used.
//
// Preconditions: A, B and C must be device-accessible buffers of N * N ints.
// A and B are only read. C must not overlap A or B (an overlapping output was
// already a data race, since other threads read the inputs while C is
// written); the const/__restrict__ qualifiers state this so the compiler may
// route the input loads through the read-only data cache.
__global__ void matMul(const int* __restrict__ A, const int* __restrict__ B, int* __restrict__ C) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the launch rounds the grid up to whole blocks.
    if (row < N && col < N) {
        int sum = 0;
        // Dot product of row `row` of A with column `col` of B.
        for (int i = 0; i < N; ++i) {
            sum += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = sum;
    }
}

// Host helper: populate a size x size row-major matrix with random digits.
// Each element receives the next rand() value modulo 10 (0..9), written in
// row-major order. A size of zero or less leaves the buffer untouched.
void initMatrix(int* mat, int size) {
    int* cursor = mat;  // next element to write, advancing in row-major order
    for (int r = 0; r < size; ++r) {
        for (int c = 0; c < size; ++c) {
            *cursor++ = rand() % 10;
        }
    }
}

// Verify a computed product against a CPU reference.
//
// Recomputes every element of A * B on the host (row-major N x N, same
// indexing as the kernel) and compares it with the corresponding element of C.
// Prints the first few mismatches, if any, followed by a one-line verdict on
// stdout. The check is O(N^3) on the CPU, so it is much slower than the GPU
// kernel it validates.
//
// A, B: host input matrices (N * N ints each), only read.
// C:    host copy of the result to validate (N * N ints), only read.
void compareResults(int* A, int* B, int* C) {
    const int maxReported = 5;   // cap per-element output so a bad run stays readable
    long long mismatches = 0;

    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            int expected = 0;
            for (int k = 0; k < N; ++k) {
                expected += A[row * N + k] * B[k * N + col];
            }

            const int actual = C[row * N + col];
            if (actual != expected) {
                if (mismatches < maxReported) {
                    printf("Mismatch at (%d, %d): GPU = %d, CPU = %d\n",
                           row, col, actual, expected);
                }
                ++mismatches;
            }
        }
    }

    if (mismatches == 0) {
        printf("Results match: GPU output equals CPU reference\n");
    } else {
        printf("Results differ: %lld of %lld elements mismatch\n",
               mismatches, (long long)N * N);
    }
}

// Abort with file/line context when a CUDA runtime call reports an error.
// Without this, a failure (for example no usable device) is silently ignored
// and the program would go on to print a timing for work that never ran.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Multiply two random N x N matrices on the GPU, report the kernel time
// measured with CUDA events (host<->device copies are outside the timed
// region), then hand the result to compareResults.
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main() {
    // Compute the byte count in size_t so it cannot overflow int for larger N.
    const size_t bytes = (size_t)N * N * sizeof(int);

    // Allocate host memory
    int* A = (int*)malloc(bytes);
    int* B = (int*)malloc(bytes);
    int* C = (int*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Initialize matrices with random values
    initMatrix(A, N);
    initMatrix(B, N);

    // Allocate device memory
    int* d_A = NULL;
    int* d_B = NULL;
    int* d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Define grid and block dimensions; ceil-divide so the grid covers all of
    // N even when N is not a multiple of the block edge.
    const unsigned int blocksPerSide = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
    dim3 blocksPerGrid(blocksPerSide, blocksPerSide);

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Record start time
    CUDA_CHECK(cudaEventRecord(start));

    // Launch kernel
    matMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    // A launch does not return a status; bad launch configurations surface here.
    CUDA_CHECK(cudaGetLastError());

    // Record stop time. Waiting on the stop event also reports any error
    // raised while the kernel was executing.
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Calculate elapsed time
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    printf("Time for matrix multiplication: %f ms\n", milliseconds);

    // Copy result from device to host
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Compare results with CPU
    compareResults(A, B, C);

    // Free allocated memory
    free(A);
    free(B);
    free(C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Destroy CUDA events
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

Writing matrix_multiplication.cu


**6] Compile and Run**

In [None]:
# Build the CUDA program with nvcc. The output name matches the one used for
# the g++ build of matrix_multiplication.cpp, so this overwrites that CPU binary.
!nvcc matrix_multiplication.cu -o matrix_multiplication
# Run it; the program prints the kernel time measured with CUDA events.
!./matrix_multiplication

Time for matrix multiplication: 158.064224 ms
