In [4]:
!/usr/local/cuda/bin/nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin
!cuda-install-samples-11.2.sh ~ && cd /root/NVIDIA_CUDA-11.2_Samples/

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-tl17143y
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-tl17143y
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=796989245dabfaa40ac05192435ec6f71fd8b3cfd13ecff919db0b7ea79471a1
  Stored in directory: /tmp/pip-ephem-wheel-cache-9q46eapv/wheels/

In [217]:
%%cuda --name curand.cu
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
#include <curand.h>
#include <cublas_v2.h>
#include <time.h>
#include <cmath>

using namespace std;

// C(m,n) = A(m,k) * B(k,n)
//Умножение матриц на GPU
void gpu_matmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
    int lda=m,ldb=n,ldc=k;
    const float alf = 1;
    const float bet = 0;
    const float *alpha = &alf;
    const float *beta = &bet;
    // Create a handle for CUBLAS
    cublasHandle_t handle;
    cublasCreate(&handle);
    // Do the actual multiplication
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
    // Destroy the handle
    cublasDestroy(handle);
}
//Умножение матриц на CPU
void cpu_matmul(const float* A, const float* B, float* C, const int m, const int k, const int n) {
        for (int i = 0; i < m; ++i)
          for (int j = 0; j < n; ++j)
            for (int r = 0; r < k; ++r)
                C[i * n + j] += A[i * k + r] * B[r * n + j];
}
//Проверка корректности перемножения
void check_matmul(const float* C, const float* C2, const int m, const int n) {
    for (int i = 0; i < m * n; ++i)
        if (abs(C[i] - C2[i]) > 0.001) {
            cout << "The matrices are not equal\n";
            return;
        }
    cout << "The matrices are equal\n";
}
//Вывод матрицы
void print_matrix(float *h_C, int nr_rows_C, int nr_cols_C) {
    for (int i = 0; i < nr_rows_C; ++i){
      for (int j = 0; j < nr_cols_C; ++j)
        cout << *(h_C + i + j * nr_cols_C) << " ";
      cout << endl;
    }
}
//Транспонирование матрицы
void transpose(float* C, const int m, const int n) {
    float *C2 = (float *)malloc(m * n * sizeof(float));
    for (int i = 0; i < m; ++i)
          for (int j = 0; j < n; ++j)
              C2[i * n + j] = C[j * m + i];
    for (int i = 0; i < m * n; ++i)
        C[i] = C2[i];
    free(C2);
}

int main() {
    int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
    nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 2000;
    float *h_A = (float *)malloc(nr_rows_A * nr_cols_A * sizeof(float));
    float *h_B = (float *)malloc(nr_rows_B * nr_cols_B * sizeof(float));
    float *h_C = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));
    float *h_C2 = (float *)malloc(nr_rows_C * nr_cols_C * sizeof(float));
    float *d_A, *d_B, *d_C;
    printf("Размерность матриц с типом float: %d (c T, c T)\n", nr_rows_A);
    for (int i = 0; i < nr_rows_A * nr_rows_A; ++i) {
        h_A[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        h_B[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }
    cudaMalloc(&d_A,nr_rows_A * nr_cols_A * sizeof(float));
    cudaMalloc(&d_B,nr_rows_B * nr_cols_B * sizeof(float));
    cudaMalloc(&d_C,nr_rows_C * nr_cols_C * sizeof(float));
    cudaMemcpy(d_A,h_A,nr_rows_A * nr_cols_A * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_B,h_B,nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyHostToDevice);

    srand(time(0));
    clock_t start, end;
    start = clock();
    gpu_matmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);
    end = clock();
    double gpu_time = static_cast <double>(end - start) / static_cast <double>(CLOCKS_PER_SEC);

    cudaMemcpy(h_C,d_C,nr_rows_C * nr_cols_C * sizeof(float),cudaMemcpyDeviceToHost);
    //Транспонирование матрицы, тк cublas возвращает результат по столбцам
    transpose(h_C, nr_rows_C, nr_cols_C);
    //cout << "C =" << endl;
    //print_matrix(h_C, nr_rows_C, nr_cols_C);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    clock_t start1, end1;
    start1 = clock();
    cpu_matmul(h_A, h_B, h_C2, nr_rows_A, nr_cols_A, nr_cols_B);
    end1 = clock();
    double cpu_time = static_cast <double>(end1 - start1) / static_cast <double>(CLOCKS_PER_SEC);
    //cout << "C2 =" << endl;
    //print_matrix(h_C2, nr_rows_C, nr_cols_C);

    check_matmul(h_C, h_C2, nr_rows_C, nr_cols_C);

    cout << "CPU Time: " << cpu_time << endl;
    cout << "GPU Time: " << gpu_time << endl;

    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C2);

    return 0;
}


'File written in /content/src/curand.cu'

In [218]:
!nvcc -o /content/src/curand /content/src/curand.cu -lcurand -lcublas

In [219]:
!/content/src/curand

Размерность матриц с типом float: 2000 (c T, c T)
The matrices are equal
CPU Time: 93.7883
GPU Time: 0.58442
