# CUDA Exercise 05
> Parallelized Vector add. 

This Jupyter notebook can also be opened in Google Colab, so you don't need to buy a PC with a graphics card to play with CUDA. To launch it in Google Colab, click the badge below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_05.ipynb)

## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
!pip install git+git://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

# Check the environment 
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-2r93udvh
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-2r93udvh
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4010fe33cb0bdc3a44bc6c4d10aea34076d9daf8c6daec21c1a1544f0ab1b3f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-y67t9ubh/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
Default out bin result.out
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:

## Vector Add with Single Thread

In [2]:
%%writefile verctor_add_signal_thread.cu

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define VECTOR_LENGTH 10000 
#define MAX_ERR 1e-4

// Element-wise vector addition: out[k] = a[k] + b[k] for every k in [0, n).
//
// The loop does not consult threadIdx/blockIdx, so each launched thread walks
// the whole range by itself; this file launches it as <<<1,1>>>, i.e. a single
// GPU thread performs all n additions one after another.
//
//   out  - destination array, at least n floats
//   a, b - input arrays, at least n floats each
//   n    - number of elements to process
__global__ void vector_add(float *out, float *a, float *b, int n) 
{
    int k = 0;
    while (k < n)
    {
        out[k] = a[k] + b[k];
        ++k;
    }
}

// Print a readable message and terminate if a CUDA runtime call failed.
// `what` names the operation so the failing step can be identified.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two VECTOR_LENGTH-element vectors on the GPU using a single thread,
// copies the result back and verifies it on the host.
//
// Returns EXIT_SUCCESS when every element is within MAX_ERR of a[i] + b[i],
// EXIT_FAILURE on a verification mismatch or a host allocation failure.
// Any failing CUDA runtime call terminates the process via check_cuda().
int main()
{
    const size_t bytes = sizeof(float) * VECTOR_LENGTH;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out; 

    // Allocate memory on CPU
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if(a == NULL || b == NULL || out == NULL)
    {
        fprintf(stderr, "Host memory allocation failed.\n");
        free(a);    // free(NULL) is a no-op, so this is safe for any subset
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // data initialization
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 3.0f;
        b[i] = 0.14f;
    }

    // Allocate memory on GPU
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Copy the operands to the GPU
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(a -> d_a)");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(b -> d_b)");

    // Queue the same kernel 100 times (presumably so the profiler has many
    // samples to average). Kernel launches are asynchronous: the host does
    // NOT wait here, it only waits in the blocking cudaMemcpy below.
    for(int i=0;i<100;i++)
    {
      vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    }
    // A launch returns no status itself; launch errors are read back here.
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Get results from the GPU. This blocking copy runs after the queued
    // kernels and also reports any error they raised while executing.
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_out -> out)");
 
    // Test the result explicitly rather than with assert(), which is compiled
    // out under NDEBUG. The negated '<' also rejects NaN results.
    int failed = 0;
    for(int i = 0; i < VECTOR_LENGTH; i++){
        if(!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)){
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n",
                    i, out[i], a[i] + b[i]);
            failed = 1;
            break;
        }
    }
    if(!failed)
    {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    }

    // Free the memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");
    free(a);
    free(b);
    free(out);

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

Writing verctor_add_signal_thread.cu


## Vector Add with Multiple Threads

In [3]:
%%writefile verctor_add_multi_thread.cu

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define VECTOR_LENGTH 10000
#define MAX_ERR 1e-4

// Element-wise vector addition shared among the threads of one block:
// out[k] = a[k] + b[k] for every k in [0, n).
//
// Each thread starts at its own threadIdx.x and then advances by blockDim.x,
// so the threads of a block together cover every index exactly once.
// blockIdx is not used: this file launches the kernel with a single block,
// and with more blocks each block would repeat the same work.
//
//   out  - destination array, at least n floats
//   a, b - input arrays, at least n floats each
//   n    - number of elements to process
__global__ void vector_add(float *out, float *a, float *b, int n) 
{
    const int step = blockDim.x;   // distance between two elements of one thread
    int pos = threadIdx.x;         // first element handled by this thread
    while (pos < n)
    {
        out[pos] = a[pos] + b[pos];
        pos += step;
    }
}

// Print a readable message and terminate if a CUDA runtime call failed.
// `what` names the operation so the failing step can be identified.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two VECTOR_LENGTH-element vectors on the GPU using one block of
// `block_size` threads, copies the result back and verifies it on the host.
//
// Usage: <program> [index]
//   index selects an entry of {1, 64, 128, 256, 512, 1024} as the block size.
//   With no argument (or too many) a message is printed and block size 1 is
//   used. An index that is not an integer, or is outside the table, is
//   rejected with EXIT_FAILURE instead of reading past the end of the table.
//
// Returns EXIT_SUCCESS when every element is within MAX_ERR of a[i] + b[i],
// EXIT_FAILURE on a bad index, a verification mismatch or a host allocation
// failure. Any failing CUDA runtime call terminates via check_cuda().
int main(int argc, char *argv[])
{
    const size_t bytes = sizeof(float) * VECTOR_LENGTH;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    int list_of_test_block_size[]={1,64,128,256,512,1024};
    const int num_block_sizes =
        (int)(sizeof(list_of_test_block_size) / sizeof(list_of_test_block_size[0]));
    int block_size = 1;
 
    if( argc == 2 ) {
      // argv[0] is the program name, argv[1] the table index.
      // strtol (unlike atoi) lets us detect non-numeric input via `end`.
      char *end = NULL;
      long arg1 = strtol(argv[1], &end, 10);
      if( end == argv[1] || *end != '\0' || arg1 < 0 || arg1 >= num_block_sizes ) {
        fprintf(stderr,
                "Invalid block size index '%s'; expected an integer in [0, %d].\n",
                argv[1], num_block_sizes - 1);
        return EXIT_FAILURE;
      }
      block_size = list_of_test_block_size[arg1];
    }
    else if( argc > 2 ) {
      printf("Too many arguments supplied.\n");
    }
    else {
      printf("One argument expected.\n");
    }
 
    printf("The Block size is %d.\n", block_size);

    // Allocate memory on CPU
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if(a == NULL || b == NULL || out == NULL)
    {
        fprintf(stderr, "Host memory allocation failed.\n");
        free(a);    // free(NULL) is a no-op, so this is safe for any subset
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // data initialization
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 3.0f;
        b[i] = 0.14f;
    }

    // Allocate memory on GPU
    check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    // Copy the operands to the GPU
    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(a -> d_a)");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(b -> d_b)");

    // Queue the same kernel 100 times (presumably so the profiler has many
    // samples to average). Kernel launches are asynchronous: the host does
    // NOT wait here, it only waits in the blocking cudaMemcpy below.
    for(int i=0;i<100;i++)
    {
      vector_add<<<1,block_size>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    }
    // A launch returns no status itself; launch errors (for example an
    // unsupported block size) are read back here.
    check_cuda(cudaGetLastError(), "vector_add launch");

    // Get results from the GPU. This blocking copy runs after the queued
    // kernels and also reports any error they raised while executing.
    check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_out -> out)");
 
    // Test the result explicitly rather than with assert(), which is compiled
    // out under NDEBUG. The negated '<' also rejects NaN results.
    int failed = 0;
    for(int i = 0; i < VECTOR_LENGTH; i++){
        if(!(fabs(out[i] - a[i] - b[i]) < MAX_ERR)){
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n",
                    i, out[i], a[i] + b[i]);
            failed = 1;
            break;
        }
    }
    if(!failed)
    {
        printf("out[0] = %f\n", out[0]);
        printf("PASSED\n");
    }

    // Free the memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");
    free(a);
    free(b);
    free(out);

    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

Writing verctor_add_multi_thread.cu


## Evaluation

Measuring the time cost of executing the CUDA function with a **single thread**

In [4]:
# Compile the single-thread source with nvcc into an executable of the same name,
# then run that executable under nvprof to collect GPU and API timings.
!nvcc -o verctor_add_signal_thread verctor_add_signal_thread.cu
!nvprof ./verctor_add_signal_thread

==166== NVPROF is profiling process 166, command: ./verctor_add_signal_thread
out[0] = 3.140000
PASSED
==166== Profiling application: ./verctor_add_signal_thread
==166== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.99%  118.76ms       100  1.1876ms  1.1875ms  1.1882ms  vector_add(float*, float*, float*, int)
                    0.01%  9.6960us         2  4.8480us  4.7040us  4.9920us  [CUDA memcpy HtoD]
                    0.00%  5.1840us         1  5.1840us  5.1840us  5.1840us  [CUDA memcpy DtoH]
      API calls:   72.18%  312.33ms         3  104.11ms  2.8630us  312.32ms  cudaMalloc
                   27.39%  118.53ms         3  39.510ms  27.121us  118.47ms  cudaMemcpy
                    0.14%  603.24us         1  603.24us  603.24us  603.24us  cuDeviceGetPCIBusId
                    0.11%  481.38us       100  4.8130us  3.4180us  35.589us  cudaLaunchKernel
                    0.08%  356.39us         1  356.39u

Measuring the time cost of executing the CUDA function with **multi-threads**

In [5]:
# Compile the multi-thread source with nvcc, then profile it once per block size.
# The argument is an index into the program's block-size table
# {1, 64, 128, 256, 512, 1024}: 0 -> 1 thread, 1 -> 64, ..., 5 -> 1024 threads.
!nvcc -o verctor_add_multi_thread verctor_add_multi_thread.cu
!nvprof ./verctor_add_multi_thread 0
!nvprof ./verctor_add_multi_thread 1
!nvprof ./verctor_add_multi_thread 2
!nvprof ./verctor_add_multi_thread 3
!nvprof ./verctor_add_multi_thread 4
!nvprof ./verctor_add_multi_thread 5

The Block size is 1.
==210== NVPROF is profiling process 210, command: ./verctor_add_multi_thread 0
out[0] = 3.140000
PASSED
==210== Profiling application: ./verctor_add_multi_thread 0
==210== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.99%  110.00ms       100  1.1000ms  1.0999ms  1.1004ms  vector_add(float*, float*, float*, int)
                    0.01%  9.4400us         2  4.7200us  4.5760us  4.8640us  [CUDA memcpy HtoD]
                    0.00%  5.1520us         1  5.1520us  5.1520us  5.1520us  [CUDA memcpy DtoH]
      API calls:   67.94%  235.85ms         3  78.615ms  2.9820us  235.84ms  cudaMalloc
                   31.64%  109.82ms         3  36.607ms  25.740us  109.77ms  cudaMemcpy
                    0.15%  533.49us         1  533.49us  533.49us  533.49us  cuDeviceTotalMem
                    0.13%  448.80us       100  4.4880us  3.4570us  33.193us  cudaLaunchKernel
                    0.07%  230.29u