# CUDA Exercise 06
> Another approach to parallelized vector addition.

This Jupyter Notebook can also be opened in Google Colab, so you don't have to buy a PC with a graphics card to play with CUDA. To launch Google Colab, please click the icon below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_06.ipynb)

## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
!pip install git+git://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

# Check the environment 
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-0h_on20m
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-0h_on20m
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=c3bbe482d2b7bd608c155ee0855393664aee1a212eb71f17e7c1d5c7be4d469c
  Stored in directory: /tmp/pip-ephem-wheel-cache-s37pn594/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
Default out bin result.out
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:

## Vector Add with Multiple Threads across Blocks

In [2]:
%%writefile verctor_add_multi_blocks_thread.cu

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_LENGTH 10000
#define MAX_ERR 1e-4

// Element-wise vector addition on the GPU: out[i] = a[i] + b[i] for i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread handles at most one
// element, chosen by its global index (block offset plus thread offset), so
// the launch must supply at least n threads in total for every element to be
// written. Threads whose index lands at or beyond n exit without touching
// memory, which makes it safe for the grid to overshoot n.
//
// out, a, b must be device pointers to at least n floats each.
__global__ void vector_add(float *out, float *a, float *b, int n) 
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block(s) have nothing to do.
    if (idx >= n)
    {
        return;
    }

    out[idx] = a[idx] + b[idx];
}

// Abort with a readable message when a CUDA runtime call reports an error.
// `what` names the operation that was attempted, for the diagnostic only.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver for the vector_add kernel.
//
// Usage: <program> <index>
//   <index> selects the threads-per-block value from list_of_test_block_size
//   (0 -> 1, 1 -> 64, 2 -> 128, 3 -> 256, 4 -> 512, 5 -> 1024).
//   With no argument (or too many) a notice is printed and the default block
//   size of 1 is used. An index that is not an integer in range is rejected.
//
// The program fills two host vectors of VECTOR_LENGTH floats, copies them to
// the GPU, launches vector_add 100 times (so a profiler sees a stable
// average), copies the result back, and compares it against the host sum.
//
// Returns 0 when every element matches within MAX_ERR, EXIT_FAILURE otherwise.
int main(int argc, char *argv[])
{
  float *a, *b, *out;
  float *d_a, *d_b, *d_out;
  const int list_of_test_block_size[] = {1, 64, 128, 256, 512, 1024};
  const int num_block_sizes =
      (int)(sizeof(list_of_test_block_size) / sizeof(list_of_test_block_size[0]));
  const size_t bytes = sizeof(float) * VECTOR_LENGTH;
  int block_size = 1;

  if( argc == 2 ) {
    // Parse and range-check the index before using it as an array subscript.
    char *end = NULL;
    long index = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || index < 0 || index >= num_block_sizes) {
      fprintf(stderr, "Block size index must be an integer in [0, %d], got \"%s\".\n",
              num_block_sizes - 1, argv[1]);
      return EXIT_FAILURE;
    }
    block_size = list_of_test_block_size[index];
  }
  else if( argc > 2 ) {
    printf("Too many arguments supplied.\n");
  }
  else {
    printf("One argument expected.\n");
  }

  printf("The Block size is %d.\n", block_size);

  // Allocate memory on CPU
  a = (float*)malloc(bytes);
  b = (float*)malloc(bytes);
  out = (float*)malloc(bytes);
  if (a == NULL || b == NULL || out == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed.\n", bytes);
    free(a);
    free(b);
    free(out);
    return EXIT_FAILURE;
  }

  // Data initialization
  for(int i = 0; i < VECTOR_LENGTH; i++)
  {
      a[i] = 3.0f;
      b[i] = 0.14f;
  }

  // Allocate memory on GPU
  check_cuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
  check_cuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
  check_cuda(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

  // Copy the operands to the GPU
  check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
  check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  // Ceiling division: just enough blocks to cover every element, with no
  // spare block when block_size divides VECTOR_LENGTH evenly.
  const int grid_size = (VECTOR_LENGTH + block_size - 1) / block_size;

  for(int i = 0; i < 100; i++)
  {
    // Launches are asynchronous; the host only queues the work here.
    vector_add<<<grid_size, block_size>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    // A bad launch configuration is only reported through cudaGetLastError().
    check_cuda(cudaGetLastError(), "vector_add launch");
  }

  // Get results from the GPU. This blocking copy on the default stream waits
  // for the queued kernels, and surfaces any error raised while they ran.
  check_cuda(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
             "cudaMemcpy(d_out -> out)");

  // Test the result explicitly rather than via assert(), so the check still
  // runs when the program is built with NDEBUG.
  int failed = 0;
  for(int i = 0; i < VECTOR_LENGTH; i++){
      if (!(fabsf(out[i] - a[i] - b[i]) < MAX_ERR)) {
        fprintf(stderr, "Mismatch at index %d: got %f, expected %f\n",
                i, out[i], a[i] + b[i]);
        failed = 1;
        break;
      }
  }
  printf("out[0] = %f\n", out[0]);
  printf(failed ? "FAILED\n" : "PASSED\n");

  // Free the memory
  check_cuda(cudaFree(d_a), "cudaFree(d_a)");
  check_cuda(cudaFree(d_b), "cudaFree(d_b)");
  check_cuda(cudaFree(d_out), "cudaFree(d_out)");
  free(a);
  free(b);
  free(out);

  return failed ? EXIT_FAILURE : 0;
}

Writing verctor_add_multi_blocks_thread.cu


## Evaluation

Measuring the time cost of executing the CUDA function

In [3]:
# Compile the CUDA source file written by the %%writefile cell above.
!nvcc -o verctor_add_multi_blocks_thread verctor_add_multi_blocks_thread.cu
# Profile one run per block size. The argument is an index into the program's
# list_of_test_block_size table, not the block size itself:
# 0 -> 1, 1 -> 64, 2 -> 128, 3 -> 256 threads per block.
!nvprof ./verctor_add_multi_blocks_thread 0
!nvprof ./verctor_add_multi_blocks_thread 1
!nvprof ./verctor_add_multi_blocks_thread 2
!nvprof ./verctor_add_multi_blocks_thread 3

The Block size is 1.
==165== NVPROF is profiling process 165, command: ./verctor_add_multi_blocks_thread 0
out[0] = 3.140000
PASSED
==165== Profiling application: ./verctor_add_multi_blocks_thread 0
==165== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.53%  3.0884ms       100  30.884us  30.785us  31.265us  vector_add(float*, float*, float*, int)
                    0.31%  9.5690us         2  4.7840us  4.6400us  4.9290us  [CUDA memcpy HtoD]
                    0.16%  5.1200us         1  5.1200us  5.1200us  5.1200us  [CUDA memcpy DtoH]
      API calls:   98.68%  329.13ms         3  109.71ms  3.1850us  329.12ms  cudaMalloc
                    0.86%  2.8551ms         3  951.69us  25.687us  2.7982ms  cudaMemcpy
                    0.14%  471.47us       100  4.7140us  3.2380us  32.273us  cudaLaunchKernel
                    0.12%  398.11us         1  398.11us  398.11us  398.11us  cuDeviceGetPCIBusId
                 