<a href="https://colab.research.google.com/github/VetaAgafonova/HPC_labs/blob/VectorSum/Lab1_VectorSum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the version of the CUDA compiler installed in the runtime.
!/usr/local/cuda/bin/nvcc --version
# Install the nvcc4jupyter plugin straight from its GitHub repository.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# Load the plugin so that the %%cuda cell magic used below becomes available.
%load_ext nvcc_plugin
# Run the CUDA 11.2 samples installer into the home directory, then cd into the samples folder.
# NOTE(review): the script name says 11.2 while nvcc reports release 11.8 below — confirm the script exists.
# NOTE(review): a `cd` issued through `!` presumably does not persist past this line — verify if later cells rely on it.
!cuda-install-samples-11.2.sh ~ && cd /root/NVIDIA_CUDA-11.2_Samples/

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-z5el1wlv
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-z5el1wlv
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=d753a7d7ceccce19e12df72f9b7cc15a213d02a8bfa0ccad94d11533e37585db
  Stored in directory: /tmp/pip-ephem-wheel-cache-wtswyk86/wheels/

In [87]:
%%cuda --name curand.cu
#include <stdio.h>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#define BLOCK_DIM_X 1000

using namespace std;

/*
 * Sequential (host) reference sum of the first n elements of vec.
 *
 * vec - pointer to at least n readable floats (not dereferenced when n <= 0)
 * n   - number of elements to add; zero or negative yields 0
 *
 * Returns the sum rounded to float. The running total is kept in double so
 * that long inputs do not lose low-order contributions the way a float
 * accumulator does once the partial sum grows large.
 */
float vectorSum_cpu(float* vec, int n) {
    double total = 0.0;
    // Signed index: comparing an unsigned counter with a negative n would
    // turn the bound into a huge value and read far past the buffer.
    for (int i = 0; i < n; ++i)
        total += vec[i];
    return static_cast<float>(total);
}
/*
 * Kernel: adds the elements of vec into *res.
 *
 * Launch layout: 1-D grid of 1-D blocks, one input element per thread, so
 * gridDim.x * blockDim.x must be >= n. blockDim.x must not exceed
 * BLOCK_DIM_X, which is the size of the static shared buffer.
 *
 * vec - device pointer to n floats
 * n   - number of valid elements in vec
 * res - device pointer to a single float accumulator; the caller must
 *       initialise it (normally to 0) before the launch. Each block adds its
 *       partial sum to it with one atomicAdd.
 *
 * Partial sums are combined pairwise, so the floating-point summation order
 * differs from a sequential loop.
 */
__global__ void vectorSum_gpu(float* vec, int n, float* res)
{
    __shared__ float temp[BLOCK_DIM_X];
    const unsigned int tid = threadIdx.x;
    const int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the vector contribute 0; leaving their slot
    // unwritten would feed uninitialised shared memory into the block sum.
    temp[tid] = (index < n) ? vec[index] : 0.0f;
    __syncthreads();

    // Tree reduction that also works when blockDim.x is not a power of two:
    // every step folds the upper part of the active range onto the lower part.
    // The loop bounds depend only on blockDim.x, so all threads of the block
    // reach every __syncthreads().
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
            temp[tid] += temp[tid + half];
        __syncthreads();
        active = half;
    }

    // One atomic per block keeps contention on the accumulator low.
    if (tid == 0)
        atomicAdd(res, temp[0]);
}

// Terminates the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "\nCUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Sums a vector of ones on the CPU and on the GPU and prints both results
 * together with the time each computation took (in seconds).
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    const int n = 1000000; // number of elements in the vector

    // Host input: every element is 1, so the expected sum equals n.
    std::vector<float> host_vec(n, 1.0f);
    float* vec = host_vec.data();

    // CPU reference, timed with clock().
    const clock_t start = clock();
    const float sum_cpu = vectorSum_cpu(vec, n);
    const clock_t end = clock();
    const double time_cpu = static_cast<double>(end - start) / static_cast<double>(CLOCKS_PER_SEC);
    cout << "\nSum CPU = " << sum_cpu << "\tCPU time = " << time_cpu;

    float sum_gpu = 0.0f;
    float* vecdev = NULL;
    float* sumdev = NULL;
    // allocate device memory
    checkCuda(cudaMalloc(&vecdev, n * sizeof(float)), "cudaMalloc(vecdev)");
    checkCuda(cudaMalloc(&sumdev, sizeof(float)), "cudaMalloc(sumdev)");
    // copy the input and the zeroed accumulator from host to device
    checkCuda(cudaMemcpy(vecdev, vec, n * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(vecdev)");
    checkCuda(cudaMemcpy(sumdev, &sum_gpu, sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(sumdev)");

    // One thread per element; integer ceil-division avoids the rounding a
    // float-based ceil() suffers for large n.
    dim3 block_dim(BLOCK_DIM_X);
    dim3 grid_dim((n + BLOCK_DIM_X - 1) / BLOCK_DIM_X);

    // create the timing events
    cudaEvent_t begin, stop;
    checkCuda(cudaEventCreate(&begin), "cudaEventCreate(begin)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    // mark the start point
    checkCuda(cudaEventRecord(begin, 0), "cudaEventRecord(begin)");
    // launch the kernel
    vectorSum_gpu<<<grid_dim, block_dim>>>(vecdev, n, sumdev);
    // a bad launch configuration is only reported through cudaGetLastError()
    checkCuda(cudaGetLastError(), "vectorSum_gpu launch");
    // mark the end point
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // wait until the kernel and the stop event have completed
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    // elapsed time between the two events, in milliseconds
    float gpu_time = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpu_time, begin, stop), "cudaEventElapsedTime");
    // copy the result from device to host
    checkCuda(cudaMemcpy(&sum_gpu, sumdev, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(sum_gpu)");
    cout << "\nSum GPU = " << sum_gpu << "\tGPU time = " << gpu_time / 1000.;

    // release events and device memory (host memory is owned by host_vec)
    checkCuda(cudaEventDestroy(begin), "cudaEventDestroy(begin)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(vecdev), "cudaFree(vecdev)");
    checkCuda(cudaFree(sumdev), "cudaFree(sumdev)");

    return 0;
}

'File written in /content/src/curand.cu'

In [88]:
!nvcc -o /content/src/curand /content/src/curand.cu -lcurand -lcublas

In [89]:
!/content/src/curand


Sum CPU = 1e+06	CPU time = 0.003799
Sum GPU = 1e+06	GPU time = 0.0003256