# **Jednoduchy priklad vyuzitia grafickej karty pomocou CUDA**

In [None]:
%%bash
# Print the GPU, driver and CUDA driver-API version visible to this runtime.
nvidia-smi
# Print the version of the CUDA compiler that will build the examples below.
nvcc --version

Fri Jan  8 18:07:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

**Instalacia a nacitanie nvcc pluginu pre Jupyter notebook**

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-98ma2l77
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-98ma2l77
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4308 sha256=534162f6d7bb3770d552a63a76e57532add3e5ec01a2151f803f05429ca50619
  Stored in directory: /tmp/pip-ephem-wheel-cache-mmk_smsi/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
!git clone https://github.com/Wicwik/cuda-paralpr.git

fatal: destination path 'cuda-paralpr' already exists and is not an empty directory.


**Jednoduchy priklad pre test funkcnosti nvcc pluginu pre jupyter notebook**

(Priklad prevzany z [blogu](https://harshityadav95.medium.com/how-to-run-cuda-c-or-c-on-google-colab-or-azure-notebook-ea75a23a5962))

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Kernel: reads the two ints behind `a` and `b` and stores their sum in `*c`.
// No thread or block index is used, so every launched thread performs the
// same single addition.
__global__ void add(int *a, int *b, int *c) 
{
    const int lhs = *a;
    const int rhs = *b;
    *c = lhs + rhs;
}

// Prints a diagnostic when a CUDA runtime call did not succeed.
// `what` names the step that was attempted.
// Returns 1 when `status` is an error, 0 otherwise.
static int cuda_failed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return 0;
    }
    printf("CUDA error %s: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Smoke test for the nvcc plugin: adds 3 and 5 on the GPU with a single
// thread and prints the result.
// Returns 0 on success and 1 when any CUDA call fails (a diagnostic is
// printed and no result line is emitted in that case).
int main() 
{
    // host copies of variables a, b & c
    int a = 3;
    int b = 5;
    int c = 0;
 
    // device copies of variables a, b & c
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    int size = sizeof(int);
    int exit_code = 1;

    // Single-pass loop: `break` jumps to the shared cleanup below.
    do
    {
        // Allocate space for device copies of a, b, c
        if (cuda_failed(cudaMalloc((void **)&d_a, size), "allocating d_a")) break;
        if (cuda_failed(cudaMalloc((void **)&d_b, size), "allocating d_b")) break;
        if (cuda_failed(cudaMalloc((void **)&d_c, size), "allocating d_c")) break;

        // Copy inputs to device
        if (cuda_failed(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to Device")) break;
        if (cuda_failed(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to Device")) break;

        // Launch add() kernel on GPU; a launch reports its own errors only
        // through cudaGetLastError().
        add<<<1,1>>>(d_a, d_b, d_c);
        if (cuda_failed(cudaGetLastError(), "launching add()")) break;

        // Copy result back to host (blocking copy, so the kernel has
        // finished by the time it returns)
        if (cuda_failed(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host")) break;

        printf("result is %d\n", c);
        exit_code = 0;
    } while (0);

    // Cleanup (pointers that were never allocated are still NULL)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    
    return exit_code;
}

result is 8



**Viac ako 1M scitani cisel s desatinnou ciarkou na CPU**

(Inspiracia z Nvidia [blogu](https://developer.nvidia.com/blog/even-easier-introduction-cuda/))

In [None]:
%%cu
#include <iostream>
#include <cmath>
#include <chrono>

// Adds x[i] into y[i] for the first n elements; y is updated in place.
// Does nothing when n is zero or negative.
void add(int n, float *x, float *y)
{
    const float *src = x;
    float *dst = y;
    for (int remaining = n; remaining > 0; --remaining)
    {
        *dst += *src;
        ++dst;
        ++src;
    }
}

// CPU baseline: fills two arrays of 2^20 floats with 1.0f and 2.0f, times a
// single add() call and prints the elapsed microseconds followed by the
// largest absolute deviation of the result from the expected 3.0f.
int main()
{
    typedef std::chrono::high_resolution_clock hr_clock;

    const int count = 1 << 20;

    float *x = new float[count];
    float *y = new float[count];

    for (int k = 0; k != count; ++k)
    {
        x[k] = 1.0f;
        y[k] = 2.0f;
    }

    // Only the add() call itself is timed.
    const hr_clock::time_point begin = hr_clock::now();
    add(count, x, y);
    const hr_clock::time_point end = hr_clock::now();

    const auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();

    // Every element of y should now hold 1.0f + 2.0f.
    float worst = 0.0f;
    for (const float *p = y; p != y + count; ++p)
    {
        const float deviation = std::fabs(*p - 3.0f);
        worst = std::fmax(worst, deviation);
    }

    std::cout << elapsed_us << " microseconds" << std::endl; 
    std::cout << "Max error: " << worst << std::endl;

    delete[] x;
    delete[] y;

    return 0;
}

2844 microseconds
Max error: 0



**Viac ako 1M scitani cisel s desatinnou ciarkou na GPU (velkost bloku aj pocet blokov je 1)**

(Inspiracia z Nvidia [blogu](https://developer.nvidia.com/blog/even-easier-introduction-cuda/))

In [None]:
%%cu
#include <iostream>
#include <cmath>
#include <chrono>

// Kernel: y[i] += x[i] for every i in [0, n).
// No thread or block index is used, so each launched thread walks the whole
// range by itself; the cell launches it with exactly one thread.
__global__ void add(int n, float *x, float *y)
{
    int i = 0;
    while (i < n)
    {
        y[i] = y[i] + x[i];
        ++i;
    }
}

// Single-thread GPU version using unified (managed) memory: fills two arrays
// of 2^20 floats with 1.0f and 2.0f, times one add<<<1,1>>> launch (including
// the wait for completion) and prints the elapsed microseconds followed by
// the largest absolute deviation of the result from the expected 3.0f.
// Returns 0 on success and 1 when any CUDA call fails (reason on stderr).
int main()
{
    const int N = 1 << 20;
    const size_t bytes = N * sizeof(float);
    float *x = nullptr;
    float *y = nullptr;

    // Reports a failed CUDA step on stderr; true when `status` is an error.
    auto failed = [](cudaError_t status, const char *what) -> bool
    {
        if (status == cudaSuccess)
        {
            return false;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return true;
    };

    // unified memory allocation; the host loop below writes through these
    // pointers, so a failed allocation must stop the program here
    if (failed(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)"))
    {
        return 1;
    }
    if (failed(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)"))
    {
        cudaFree(x);
        return 1;
    }
 
    for (int i = 0; i < N; i++)
    {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
 
    auto start = std::chrono::high_resolution_clock::now();
    add<<<1,1>>>(N, x, y);
    // A launch reports configuration errors only through cudaGetLastError().
    const cudaError_t launch_status = cudaGetLastError();
 
    // The launch is asynchronous: wait for the kernel before stopping the
    // clock and before the host reads y.
    const cudaError_t sync_status = cudaDeviceSynchronize();
    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);

    if (failed(launch_status, "add kernel launch") ||
        failed(sync_status, "cudaDeviceSynchronize"))
    {
        cudaFree(x);
        cudaFree(y);
        return 1;
    }
 
    float max_err = 0.0f;
    for (int i = 0; i < N; i++)
    {
        max_err = std::fmax(max_err, std::fabs(y[i]-3.0f));
    }
    std::cout << duration.count() << " microseconds" << std::endl; 
    std::cout << "Max error: " << max_err << std::endl;
 
    cudaFree(x);
    cudaFree(y);

    return 0;
}

127381 microseconds
Max error: 0



In [None]:
# Build cuda_example0.cu from the cloned repository (nvcc writes ./a.out when
# no output name is given) and run the resulting binary under nvprof.
!nvcc cuda-paralpr/basics/src/cuda_example0.cu
!nvprof ./a.out

==423== NVPROF is profiling process 423, command: ./a.out
127130 microseconds
Max error: 0
==423== Profiling application: ./a.out
==423== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  127.05ms         1  127.05ms  127.05ms  127.05ms  add(int, float*, float*)
      API calls:   60.98%  200.41ms         2  100.20ms  34.934us  200.37ms  cudaMallocManaged
                   38.66%  127.07ms         1  127.07ms  127.07ms  127.07ms  cudaDeviceSynchronize
                    0.18%  581.40us         2  290.70us  286.15us  295.25us  cudaFree
                    0.11%  369.00us         1  369.00us  369.00us  369.00us  cuDeviceTotalMem
                    0.04%  137.32us        97  1.4150us     134ns  57.655us  cuDeviceGetAttribute
                    0.02%  51.236us         1  51.236us  51.236us  51.236us  cudaLaunchKernel
                    0.01%  28.624us         1  28.624us  28.624us  28.624us  cuDeviceGetName
 

In [None]:
%%cu
#include <iostream>
#include <algorithm>
#include <chrono>

// Kernel: z[i] = x[i] + y[i] for i in [0, size).
// Each thread starts at its threadIdx.x and advances by blockDim.x; the block
// index is not used, so the work is divided among the threads of one block
// only. The cell launches it as <<<1, 1>>>.
__global__ void add(float *x, float *y, float *z, int size)
{
    const int step = blockDim.x;
    int i = threadIdx.x;

    while (i < size)
    {
        z[i] = x[i] + y[i];
        i += step;
    }
}

cudaError_t cuda_add(float *x, float *y, float *z, int size);

// Fills two host arrays of 2^20 floats with 1.0f and 2.0f, adds them on the
// GPU through cuda_add() and prints the largest absolute deviation of the
// result from the expected 3.0f.
// Returns 0 on success, 1 when cuda_add() reports an error. The host buffers
// are released on both paths.
int main()
{
    const int N = 1 << 20;

    float *x = new float[N];
    float *y = new float[N];
    float *z = new float[N];
 
    std::fill_n(x, N, 1.0f);
    std::fill_n(y, N, 2.0f);
 
    int exit_code = 0;
    const cudaError_t cudaStatus = cuda_add(x, y, z, N);
 
    if (cudaStatus != cudaSuccess) 
    {
        // z was not (fully) written, so skip the verification.
        fprintf(stderr, "cuda_add failed!\n");
        exit_code = 1;
    }
    else
    {
        float max_err = 0.0f;
        for (int i = 0; i < N; i++)
        {
            max_err = std::fmax(max_err, std::fabs(z[i]-3.0f));
        }
        std::cout << "Max error: " << max_err << std::endl;
    }
 
    delete[] x;
    delete[] y;
    delete[] z;

    return exit_code;
}

/**
 * Adds two host arrays element-wise on the GPU: z[i] = x[i] + y[i].
 *
 * Copies x and y into freshly allocated device buffers, runs add() with a
 * single block of a single thread, prints the wall-clock time from the launch
 * until cudaDeviceSynchronize() returns (in microseconds) to stdout and
 * copies the result back into z.
 *
 * @param x     host input array, at least `size` floats are read
 * @param y     host input array, at least `size` floats are read
 * @param z     host output array, `size` floats are written on success
 * @param size  number of elements to add
 * @return cudaSuccess, or the status of the first CUDA call that failed; the
 *         failing step and the CUDA error string are written to stderr.
 *         Device buffers are released on every path.
 */
cudaError_t cuda_add(float *x, float *y, float *z, int size)
{
    const size_t bytes = size * sizeof(float);
    float *dev_x = nullptr;
    float *dev_y = nullptr;
    float *dev_z = nullptr;

    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed? (%s)\n",
                cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Reports the step named by `what` on stderr when the most recent status
    // is an error; returns true in that case.
    auto failed = [&cudaStatus](const char *what) -> bool
    {
        if (cudaStatus == cudaSuccess)
        {
            return false;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(cudaStatus));
        return true;
    };

    // Single-pass loop: `break` jumps to the shared cleanup below.
    do
    {
        cudaStatus = cudaMalloc((void**)&dev_x, bytes);
        if (failed("cudaMalloc(dev_x)")) break;

        cudaStatus = cudaMalloc((void**)&dev_y, bytes);
        if (failed("cudaMalloc(dev_y)")) break;

        cudaStatus = cudaMalloc((void**)&dev_z, bytes);
        if (failed("cudaMalloc(dev_z)")) break;

        cudaStatus = cudaMemcpy(dev_x, x, bytes, cudaMemcpyHostToDevice);
        if (failed("cudaMemcpy(dev_x <- x)")) break;

        cudaStatus = cudaMemcpy(dev_y, y, bytes, cudaMemcpyHostToDevice);
        if (failed("cudaMemcpy(dev_y <- y)")) break;

        const auto start = std::chrono::high_resolution_clock::now();
        add<<<1, 1>>>(dev_x, dev_y, dev_z, size);

        // A launch reports configuration errors only through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (failed("kernel launch")) break;

        // The launch is asynchronous: wait for the kernel before stopping the clock.
        cudaStatus = cudaDeviceSynchronize();
        const auto stop = std::chrono::high_resolution_clock::now();

        const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
        std::cout << duration.count() << " microseconds" << std::endl;
        if (failed("cudaDeviceSynchronize after launching add")) break;

        cudaStatus = cudaMemcpy(z, dev_z, bytes, cudaMemcpyDeviceToHost);
        if (failed("cudaMemcpy(z <- dev_z)")) break;
    } while (false);

    // Buffers that were never allocated are still null here.
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
 
    return cudaStatus;
}

180208 microseconds
Max error: 0



In [None]:
# Build cuda_example1.cu from the cloned repository (nvcc writes ./a.out when
# no output name is given) and run the resulting binary under nvprof.
!nvcc cuda-paralpr/basics/src/cuda_example1.cu
!nvprof ./a.out

==517== NVPROF is profiling process 517, command: ./a.out
180093 microseconds
Max error: 0
==517== Profiling application: ./a.out
==517== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.17%  179.93ms         1  179.93ms  179.93ms  179.93ms  add(float*, float*, float*, int)
                    0.96%  1.7565ms         1  1.7565ms  1.7565ms  1.7565ms  [CUDA memcpy DtoH]
                    0.88%  1.6049ms         2  802.43us  793.58us  811.27us  [CUDA memcpy HtoD]
      API calls:   49.53%  180.06ms         1  180.06ms  180.06ms  180.06ms  cudaDeviceSynchronize
                   48.79%  177.34ms         3  59.114ms  75.336us  177.19ms  cudaMalloc
                    1.38%  5.0027ms         3  1.6676ms  918.76us  3.0522ms  cudaMemcpy
                    0.16%  568.21us         3  189.40us  151.92us  211.31us  cudaFree
                    0.09%  336.30us         1  336.30us  336.30us  336.30us  cuDeviceTotalMem
     

**Viac ako 1M scitani cisel s desatinnou ciarkou na GPU (pridane osetrenia, velkost bloku je 256)**

In [None]:
%%cu
#include <iostream>
#include <algorithm>
#include <chrono>

// Kernel: z[i] = x[i] + y[i] for i in [0, size).
//
// Grid-stride loop: each thread starts at its global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads in the grid (gridDim.x * blockDim.x). Every element is therefore
// handled by exactly one thread for any 1-D launch configuration. Indexing by
// threadIdx.x alone would make every block of a multi-block launch recompute
// the whole array.
//
// x, y, z must be device-accessible arrays of at least `size` floats.
// A zero or negative `size` performs no work.
__global__ void add(float *x, float *y, float *z, int size)
{
    // size_t arithmetic keeps the index and stride from overflowing int.
    const size_t count = size > 0 ? (size_t)size : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
    {
        z[i] = x[i] + y[i];
    }
}

cudaError_t cuda_add(float *x, float *y, float *z, int size);

// Fills two host arrays of 2^20 floats with 1.0f and 2.0f, adds them on the
// GPU through cuda_add() and prints the largest absolute deviation of the
// result from the expected 3.0f.
// Returns 0 on success, 1 when cuda_add() reports an error. The host buffers
// are released on both paths.
int main()
{
    const int N = 1 << 20;

    float *x = new float[N];
    float *y = new float[N];
    float *z = new float[N];
 
    std::fill_n(x, N, 1.0f);
    std::fill_n(y, N, 2.0f);
 
    int exit_code = 0;
    const cudaError_t cudaStatus = cuda_add(x, y, z, N);
 
    if (cudaStatus != cudaSuccess) 
    {
        // z was not (fully) written, so skip the verification.
        fprintf(stderr, "cuda_add failed!\n");
        exit_code = 1;
    }
    else
    {
        float max_err = 0.0f;
        for (int i = 0; i < N; i++)
        {
            max_err = std::fmax(max_err, std::fabs(z[i]-3.0f));
        }
        std::cout << "Max error: " << max_err << std::endl;
    }
 
    delete[] x;
    delete[] y;
    delete[] z;

    return exit_code;
}

/**
 * Adds two host arrays element-wise on the GPU: z[i] = x[i] + y[i].
 *
 * Copies x and y into freshly allocated device buffers, launches add() with
 * 256 threads per block and ceil(size / 256) blocks, prints the block count
 * and the wall-clock time from the launch until cudaDeviceSynchronize()
 * returns (in microseconds) to stdout, and copies the result back into z.
 *
 * @param x     host input array, at least `size` floats are read
 * @param y     host input array, at least `size` floats are read
 * @param z     host output array, `size` floats are written on success
 * @param size  number of elements to add; 0 is a no-op, negative is rejected
 * @return cudaSuccess, cudaErrorInvalidValue for a negative size, or the
 *         status of the first CUDA call that failed; the failing step and the
 *         CUDA error string are written to stderr. Device buffers are
 *         released on every path.
 */
cudaError_t cuda_add(float *x, float *y, float *z, int size)
{
    // A zero-element request would otherwise produce a zero-block launch,
    // which the runtime rejects as an invalid configuration.
    if (size == 0)
    {
        return cudaSuccess;
    }
    if (size < 0)
    {
        fprintf(stderr, "cuda_add: size must not be negative (got %d)\n", size);
        return cudaErrorInvalidValue;
    }

    const size_t bytes = size * sizeof(float);
    float *dev_x = nullptr;
    float *dev_y = nullptr;
    float *dev_z = nullptr;

    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed? (%s)\n",
                cudaGetErrorString(cudaStatus));
        return cudaStatus;
    }

    // Reports the step named by `what` on stderr when the most recent status
    // is an error; returns true in that case.
    auto failed = [&cudaStatus](const char *what) -> bool
    {
        if (cudaStatus == cudaSuccess)
        {
            return false;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(cudaStatus));
        return true;
    };

    // Single-pass loop: `break` jumps to the shared cleanup below.
    do
    {
        cudaStatus = cudaMalloc((void**)&dev_x, bytes);
        if (failed("cudaMalloc(dev_x)")) break;

        cudaStatus = cudaMalloc((void**)&dev_y, bytes);
        if (failed("cudaMalloc(dev_y)")) break;

        cudaStatus = cudaMalloc((void**)&dev_z, bytes);
        if (failed("cudaMalloc(dev_z)")) break;

        cudaStatus = cudaMemcpy(dev_x, x, bytes, cudaMemcpyHostToDevice);
        if (failed("cudaMemcpy(dev_x <- x)")) break;

        cudaStatus = cudaMemcpy(dev_y, y, bytes, cudaMemcpyHostToDevice);
        if (failed("cudaMemcpy(dev_y <- y)")) break;

        // Ceiling division: enough 256-thread blocks for one thread per element.
        // NOTE(review): the blocks only share the work if add() derives its
        // index from blockIdx as well as threadIdx; otherwise every block
        // repeats the whole array.
        const int block_size = 256;
        const int number_of_blocks = (size + block_size - 1) / block_size;

        std::cout << number_of_blocks << std::endl;

        const auto start = std::chrono::high_resolution_clock::now();
        add<<<number_of_blocks, block_size>>>(dev_x, dev_y, dev_z, size);

        // A launch reports configuration errors only through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (failed("kernel launch")) break;

        // The launch is asynchronous: wait for the kernel before stopping the clock.
        cudaStatus = cudaDeviceSynchronize();
        const auto stop = std::chrono::high_resolution_clock::now();

        const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
        std::cout << duration.count() << " microseconds" << std::endl;
        if (failed("cudaDeviceSynchronize after launching add")) break;

        cudaStatus = cudaMemcpy(z, dev_z, bytes, cudaMemcpyDeviceToHost);
        if (failed("cudaMemcpy(z <- dev_z)")) break;
    } while (false);

    // Buffers that were never allocated are still null here.
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
 
    return cudaStatus;
}

4096
70690 microseconds
Max error: 0



In [None]:
# Build cuda_example2.cu from the cloned repository (nvcc writes ./a.out when
# no output name is given) and run the resulting binary under nvprof.
!nvcc cuda-paralpr/basics/src/cuda_example2.cu
!nvprof ./a.out

==613== NVPROF is profiling process 613, command: ./a.out
68846 microseconds
Max error: 0
==613== Profiling application: ./a.out
==613== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.57%  68.690ms         1  68.690ms  68.690ms  68.690ms  add(float*, float*, float*, int)
                    2.28%  1.6420ms         2  821.00us  815.08us  826.92us  [CUDA memcpy HtoD]
                    2.14%  1.5387ms         1  1.5387ms  1.5387ms  1.5387ms  [CUDA memcpy DtoH]
      API calls:   70.37%  177.34ms         3  59.112ms  77.078us  177.16ms  cudaMalloc
                   27.30%  68.807ms         1  68.807ms  68.807ms  68.807ms  cudaDeviceSynchronize
                    1.88%  4.7349ms         3  1.5783ms  932.16us  2.7633ms  cudaMemcpy
                    0.23%  573.16us         3  191.05us  164.06us  205.70us  cudaFree
                    0.14%  343.81us         1  343.81us  343.81us  343.81us  cuDeviceTotalMem
      