<a href="https://colab.research.google.com/github/NineNineAFK/DC/blob/master/Copy_of_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile matrix_mul.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cstdlib>
#include <vector>

#define N 3               // Matrix dimension: every matrix in this program is N x N
#define BLOCK_SIZE 16     // Upper bound on threads along each block dimension (x and y)

using namespace std;

// CUDA kernel for square matrix multiplication: C = A * B.
//
// A, B and C are n x n matrices stored contiguously in row-major order
// (element (r, c) lives at index r * n + c).
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension covers
// columns and the y dimension covers rows. Each thread computes exactly one
// element of C. The grid may overshoot the matrix; threads that fall outside
// the n x n range exit without touching memory.
__global__ void matMulKernel(float* A, float* B, float* C, int n) {
    // Row and column of the output element owned by this thread
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check once, up front, instead of on every loop iteration
    if (row >= n || col >= n) {
        return;
    }

    // Index in size_t so that row * n / k * n cannot overflow int for large n
    const size_t width = static_cast<size_t>(n);
    const size_t rowBase = static_cast<size_t>(row) * width;

    // Dot product of row `row` of A with column `col` of B
    float sum = 0.0f;
    for (size_t k = 0; k < width; ++k) {
        sum += A[rowBase + k] * B[k * width + col];
    }

    // Store the result in matrix C
    C[rowBase + col] = sum;
}

// Writes an n x n row-major matrix to standard output.
// Output format: the label `name` followed by a colon on its own line, then
// one text line per matrix row with every element followed by a single space.
void printMatrix(float* mat, int n, const char* name) {
    cout << name << ":\n";

    const float* rowStart = mat;
    for (int r = 0; r < n; ++r, rowStart += n) {
        // Walk the n consecutive elements that make up this row
        for (const float* cell = rowStart; cell != rowStart + n; ++cell) {
            cout << *cell << " ";
        }
        cout << "\n";
    }
}

// Helper function to take matrix input from the user.
//
// Prompts on standard output for each element of an n x n matrix and reads
// the values from standard input into `mat` in row-major order.
//
// If a read fails (non-numeric token or end of input), an error is reported
// on standard error, the failed element and every element after it are set
// to 0, and the function returns early. Without this, the stream's error
// state would make all later reads no-ops and leave those elements holding
// whatever the caller's buffer contained before the call.
void takeInput(float* mat, int n) {
    cout << "Enter elements of the matrix (" << n << "x" << n << "):\n";
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            cout << "Element [" << i + 1 << "][" << j + 1 << "]: ";
            if (!(cin >> mat[i * n + j])) {
                cerr << "\nInvalid or missing input; remaining elements set to 0." << endl;
                // Zero-fill from the failed element to the end of the matrix
                const size_t total = static_cast<size_t>(n) * n;
                for (size_t idx = static_cast<size_t>(i) * n + j; idx < total; ++idx) {
                    mat[idx] = 0.0f;
                }
                return;
            }
        }
    }
}

// Reports a failed CUDA runtime call on standard error.
// `what` is a short label for the call, included in the message.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error: " << cudaGetErrorString(err) << " (" << what << ")" << endl;
    return false;
}

// Multiplies two N x N host matrices on the current CUDA device: h_C = h_A * h_B.
// `maxThreadsPerBlock` is the device limit used to size the square thread block.
// Every CUDA call is checked, and the device buffers are released on every
// path. Returns true on success, false if any CUDA call failed.
static bool multiplyOnDevice(const float* h_A, const float* h_B, float* h_C,
                             int maxThreadsPerBlock) {
    const size_t size = static_cast<size_t>(N) * N * sizeof(float); // Bytes per matrix

    // Null-initialised so the frees below are safe even if an allocation fails
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    // Allocate device memory and copy the inputs; && stops at the first failure
    bool ok = checkCuda(cudaMalloc(&d_A, size), "cudaMalloc d_A")
           && checkCuda(cudaMalloc(&d_B, size), "cudaMalloc d_B")
           && checkCuda(cudaMalloc(&d_C, size), "cudaMalloc d_C")
           && checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device")
           && checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // The block is blockSize x blockSize threads, so the *product* must
        // respect the device's per-block thread limit.
        int blockSize = BLOCK_SIZE;
        while (blockSize > 1 && blockSize * blockSize > maxThreadsPerBlock) {
            --blockSize;
        }
        dim3 threads(blockSize, blockSize); // Threads per block (blockDim)
        dim3 blocks((N + blockSize - 1) / blockSize, (N + blockSize - 1) / blockSize); // Grid size (ceil-div)

        matMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, N);

        // cudaGetLastError catches bad launch configurations; the synchronize
        // surfaces faults raised while the kernel was executing.
        ok = checkCuda(cudaGetLastError(), "kernel launch")
          && checkCuda(cudaDeviceSynchronize(), "kernel execution")
          && checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // Single cleanup point; cudaFree performs no operation on a null pointer
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok;
}

// Program entry point: reads two N x N matrices from the user, multiplies
// them on the first CUDA device and prints the operands and the product.
// Returns 0 on success and -1 if no device is available or a CUDA call fails.
int main() {
    // Fail fast, before asking the user for input or allocating anything.
    // deviceCount is initialised because a failing query may not write it.
    int deviceCount = 0;
    if (!checkCuda(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount") || deviceCount == 0) {
        cerr << "No CUDA-capable devices found!" << endl;
        return -1;
    }

    // Use the first CUDA device
    const int deviceID = 0;
    cudaDeviceProp deviceProp;
    if (!checkCuda(cudaSetDevice(deviceID), "cudaSetDevice") ||
        !checkCuda(cudaGetDeviceProperties(&deviceProp, deviceID), "cudaGetDeviceProperties")) {
        return -1;
    }

    // Host matrices A, B and C; vectors release their storage on every return path
    const size_t count = static_cast<size_t>(N) * N;
    vector<float> h_A(count), h_B(count), h_C(count);

    // Take input for matrices A and B from the user
    takeInput(h_A.data(), N);
    takeInput(h_B.data(), N);

    // Print the name of the GPU being used
    cout << "Using GPU: " << deviceProp.name << endl;

    if (!multiplyOnDevice(h_A.data(), h_B.data(), h_C.data(), deviceProp.maxThreadsPerBlock)) {
        return -1;
    }

    // Print matrices if N is small enough
    if (N <= 16) {
        printMatrix(h_A.data(), N, "Matrix A");
        printMatrix(h_B.data(), N, "Matrix B");
        printMatrix(h_C.data(), N, "Matrix C");
    }

    return 0;
}

Overwriting matrix_mul.cu


In [None]:
!nvcc matrix_mul.cu -o matrix_mul -arch=sm_75

!./matrix_mul

Enter elements of the matrix (3x3):
Element [1][1]: 1
Element [1][2]: 2
Element [1][3]: 3
Element [2][1]: 4
Element [2][2]: 5
Element [2][3]: 6
Element [3][1]: 7
Element [3][2]: 8
Element [3][3]: 9
Enter elements of the matrix (3x3):
Element [1][1]: 9
Element [1][2]: 8
Element [1][3]: 7
Element [2][1]: 6
Element [2][2]: 5
Element [2][3]: 4
Element [3][1]: 3
Element [3][2]: 2
Element [3][3]: 1
Using GPU: Tesla T4
Matrix A:
1 2 3 
4 5 6 
7 8 9 
Matrix B:
9 8 7 
6 5 4 
3 2 1 
Matrix C:
30 24 18 
84 69 54 
138 114 90 
