In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-moc6ygzu
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-moc6ygzu
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=d06389d43bbcd7f6ff7a53a5c1dcf18f6ea9defaf2d4231a6a7ac2649ae96bbc
  Stored in directory: /tmp/pip-ephem-wheel-cache-46apjd_u/wheels/db/c1/1f/a2bb07bbb4a1ce3c43921252aeafaa6205f08637e292496f04
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [8]:
%%cu

#include <stdio.h>
#include <cuda_runtime.h>

#define N 5
#define BLOCK_SIZE 256

__global__ void matrixVectorMultiplication(float* A, float* B, float* C)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int idx = bid * blockDim.x + tid;
    for (int i = 0; i < N; i++) {
        float sum = 0.0;
        sum += A[idx * N + i] * B[i];
        C[idx] = sum;
    }
}

int main()
{
    float *A, *B, *C;
    float *dev_A, *dev_B, *dev_C;

    // Выделение памяти на хосте
    A = (float*)malloc(N * N * sizeof(float));
    B = (float*)malloc(N * sizeof(float));
    C = (float*)malloc(N * sizeof(float));

    // Инициализация матрицы и векторов
    for (int i = 0; i < N; i++) {
        B[i] = i;
        for (int j = 0; j < N; j++) {
            A[i * N + j] = i + j;
        }
    }

    // Вывод матрицы и векторов на экран
    printf("Matrix A:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", A[i * N + j]);
        }
        printf("\n");
    }
    printf("\nVector B:\n");
    for (int i = 0; i < N; i++) {
        printf("%f ", B[i]);
    }
    printf("\n\n");

    // Выделение памяти на устройстве
    cudaMalloc(&dev_A, N * N * sizeof(float));
    cudaMalloc(&dev_B, N * sizeof(float));
    cudaMalloc(&dev_C, N * sizeof(float));

    // Копирование данных с хоста на устройство
    cudaMemcpy(dev_A, A, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, N * sizeof(float), cudaMemcpyHostToDevice);

    // Вычисление количества блоков потоков
    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // Выполнение ядра CUDA на устройстве
    matrixVectorMultiplication<<<numBlocks, BLOCK_SIZE>>>(dev_A, dev_B, dev_C);

    // Копирование результата с устройства на хост
    cudaMemcpy(C, dev_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    // Вывод результата на экран
    printf("Result C:\n");
    for (int i = 0; i < N; i++) {
        printf("%f ", C[i]);
    }
    printf("\n");

    // Освобождение памяти на хосте и устройстве
    free(A);
    free(B);
    free(C);
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    return 0;
}


Matrix A:
0.000000 1.000000 2.000000 3.000000 4.000000 
1.000000 2.000000 3.000000 4.000000 5.000000 
2.000000 3.000000 4.000000 5.000000 6.000000 
3.000000 4.000000 5.000000 6.000000 7.000000 
4.000000 5.000000 6.000000 7.000000 8.000000 

Vector B:
0.000000 1.000000 2.000000 3.000000 4.000000 

Result C:
16.000000 20.000000 24.000000 28.000000 32.000000 

