<a href="https://colab.research.google.com/github/SamigullinRT/HPC/blob/main/lab_1(Matmul)/LAB_1(Matmul).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-d2ykw94d
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-d2ykw94d
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=bb9a84d6291f5c7c40b78cbeb09e80cbe6a53a840c0dd8bb735b64f461fc6289
  Stored in directory: /tmp/pip-ephem-wheel-cache-6arwt0xg/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [17]:
%%cu
#include <iostream>
#include <cstdlib>
#include <ctime>

// Reference matrix multiplication on the CPU.
//
// A, B and C are flat arrays indexed in row-major order:
//   A is rowsA x colsA, B is colsA x colsB, C is rowsA x colsB.
// Every element of C is overwritten with the corresponding dot product
// of a row of A and a column of B.
void matrixMultiplicationCPU(int *A, int *B, int *C, int rowsA, int colsA, int colsB) {
    for (int r = 0; r < rowsA; ++r) {
        // Start of the current row in A and in the output matrix.
        const int *aRow = A + r * colsA;
        int *cRow = C + r * colsB;

        for (int c = 0; c < colsB; ++c) {
            int acc = 0;
            for (int k = 0; k < colsA; ++k) {
                acc += aRow[k] * B[k * colsB + c];
            }
            cRow[c] = acc;
        }
    }
}

// Matrix multiplication kernel: each thread computes one element of C.
//
// The thread's x index selects the output column and its y index selects
// the output row, so the kernel expects a 2D launch whose grid covers at
// least colsB threads in x and rowsA threads in y.
// A (rowsA x colsA), B (colsA x colsB) and C (rowsA x colsB) are flat arrays
// indexed in row-major order and must be addressable from device code.
__global__
void matrixMultiplicationGPU(int *A, int *B, int *C, int rowsA, int colsA, int colsB) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the output matrix have nothing to do.
    if (row >= rowsA || col >= colsB) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    const int *aRow = A + row * colsA;
    int acc = 0;
    for (int k = 0; k < colsA; ++k) {
        acc += aRow[k] * B[k * colsB + col];
    }
    C[row * colsB + col] = acc;
}

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what, int line) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error at line " << line << " (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Wraps a CUDA runtime call so that its return code is never silently dropped.
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Multiplies two random integer matrices on the GPU and on the CPU,
// verifies that both results match, and prints the time taken by each.
int main() {
    // Matrix dimensions: A is rowsA x colsA, B is colsA x colsB.
    int rowsA = 100;
    int colsA = 100;
    int colsB = 100;

    // Number of elements in each matrix.
    int sizeA = rowsA * colsA;
    int sizeB = colsA * colsB;
    int sizeC = rowsA * colsB;

    // Allocate host (CPU) memory.
    int *h_A = new int[sizeA];
    int *h_B = new int[sizeB];
    int *h_C_CPU = new int[sizeC];
    int *h_C_GPU = new int[sizeC];

    // Fill A and B with random single-digit values.
    srand(static_cast<unsigned int>(time(NULL)));
    for (int i = 0; i < sizeA; ++i) {
        h_A[i] = rand() % 10;
    }
    for (int i = 0; i < sizeB; ++i) {
        h_B[i] = rand() % 10;
    }

    // Allocate device (GPU) memory.
    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void **)&d_A, sizeA * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_B, sizeB * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_C, sizeC * sizeof(int)));

    // Copy the input matrices from host to device.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, sizeA * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sizeB * sizeof(int), cudaMemcpyHostToDevice));

    // Launch configuration: 16x16 threads per block, with enough blocks
    // (ceiling division) to cover every element of the output matrix.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((colsB + threadsPerBlock.x - 1) / threadsPerBlock.x, (rowsA + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Run the multiplication on the GPU and time it with CUDA events.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
    matrixMultiplicationGPU<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, rowsA, colsA, colsB);
    // A kernel launch returns no status itself; launch errors must be queried explicitly.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Copy the result from device back to host.
    CUDA_CHECK(cudaMemcpy(h_C_GPU, d_C, sizeC * sizeof(int), cudaMemcpyDeviceToHost));

    // Run the same multiplication on the CPU and time it.
    clock_t cpu_start = clock();
    matrixMultiplicationCPU(h_A, h_B, h_C_CPU, rowsA, colsA, colsB);
    clock_t cpu_end = clock();
    double cpu_time = double(cpu_end - cpu_start) / CLOCKS_PER_SEC;

    // Verify that the GPU result matches the CPU reference exactly
    // (integer arithmetic, so no tolerance is needed).
    bool correctness = true;
    for (int i = 0; i < sizeC; ++i) {
        if (h_C_CPU[i] != h_C_GPU[i]) {
            correctness = false;
            break;
        }
    }

    // Report the results.
    std::cout << "Matrix multiplication correctness: " << (correctness ? "CORRECT" : "INCORRECT") << std::endl;
    std::cout << "CPU Time: " << cpu_time * 1000 << " milliseconds" << std::endl;
    std::cout << "GPU Time: " << milliseconds << " milliseconds" << std::endl;

    // Release host and device memory.
    delete[] h_A;
    delete[] h_B;
    delete[] h_C_CPU;
    delete[] h_C_GPU;
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}

Matrix multiplication correctness: CORRECT
CPU Time: 2.777 milliseconds
GPU Time: 0.030752 milliseconds

