In [20]:
%%writefile matrix_mul.cu
//lab 7
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <cmath>
#include <iomanip>
using namespace std;

__global__ void matMulKernel(float* A, float* B, float* C, int N, int TILE_SIZE) {
    extern __shared__ float shared[];
    float* tileA = shared;
    float* tileB = &shared[TILE_SIZE * TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    for (int i = 0; i < (N + TILE_SIZE - 1) / TILE_SIZE; ++i) {
        int tiledRow = row;
        int tiledCol = i * TILE_SIZE + threadIdx.x;
        if (tiledRow < N && tiledCol < N)
            tileA[threadIdx.y * TILE_SIZE + threadIdx.x] = A[tiledRow * N + tiledCol];
        else
            tileA[threadIdx.y * TILE_SIZE + threadIdx.x] = 0.0f;

        tiledRow = i * TILE_SIZE + threadIdx.y;
        tiledCol = col;
        if (tiledRow < N && tiledCol < N)
            tileB[threadIdx.y * TILE_SIZE + threadIdx.x] = B[tiledRow * N + tiledCol];
        else
            tileB[threadIdx.y * TILE_SIZE + threadIdx.x] = 0.0f;

        __syncthreads();

        for (int j = 0; j < TILE_SIZE; ++j)
            sum += tileA[threadIdx.y * TILE_SIZE + j] * tileB[j * TILE_SIZE + threadIdx.x];

        __syncthreads();
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}

void printMatrix(float* M, int N, const string& name) {
    cout << name << ":\n";
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
            cout << setw(5) << M[i * N + j] << " ";
        cout << "\n";
    }
    cout << endl;
}

int main() {
    int N;
    cin >> N;

    size_t size = N * N * sizeof(float);
    float* A = new float[N * N];
    float* B = new float[N * N];
    float* C = new float[N * N];

    for (int i = 0; i < N * N; ++i) {
        A[i] = rand() % 5;
        B[i] = rand() % 5;
    }

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int maxThreads = prop.maxThreadsPerBlock;
    int TILE_SIZE = min(32, (int)sqrt((float)maxThreads));

    cout << "Using TILE_SIZE: " << TILE_SIZE << endl;

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
    size_t sharedMemSize = 2 * TILE_SIZE * TILE_SIZE * sizeof(float);

    matMulKernel<<<grid, block, sharedMemSize>>>(d_A, d_B, d_C, N, TILE_SIZE);
    cudaDeviceSynchronize();

    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    printMatrix(A, N, "Matrix A");
    printMatrix(B, N, "Matrix B");
    printMatrix(C, N, "Matrix C (A x B)");

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}


Overwriting matrix_mul.cu


In [21]:
!nvcc matrix_mul.cu -o matrix_mul


In [22]:
!echo 2 > input.txt  # You can change 512 to any N



In [23]:
!./matrix_mul < input.txt


Using TILE_SIZE: 32
Matrix A:
    3     2 
    3     1 

Matrix B:
    1     0 
    0     2 

Matrix C (A x B):
    0     0 
    0     0 

