# CUDA Exercise 02
> A vector-addition example: the data is prepared on the CPU and added on the GPU using a single GPU thread.

This Jupyter notebook can also be opened in Google Colab, so you don't need to buy a PC with a graphics card to experiment with CUDA. To launch it in Google Colab, click the badge below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_02.ipynb)

## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
!pip install git+git://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

# Check the environment 
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-22k37xu7
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-22k37xu7
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=bc12d7017a71a934fd7d39e61241824922d949a1086f514170ffd209c2dc57b5
  Stored in directory: /tmp/pip-ephem-wheel-cache-4zyegsxi/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
Default out bin result.out
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:

## Hello World

In [2]:
%%cu

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_LENGTH 10000 
#define MAX_ERR 1e-4

/*
 * Element-wise vector addition: out[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Parameters:
 *   out  - buffer that receives the n sums
 *   a, b - buffers holding the n input values (only read)
 *   n    - number of elements; nothing is written when n <= 0
 *
 * Launch layout: 1-D grid of 1-D blocks. Every thread starts at its own
 * global index and advances by the total number of launched threads, so
 * each element is handled by exactly one thread whatever the launch
 * configuration is. With <<<1,1>>> the stride is 1 and the single thread
 * walks the whole vector sequentially.
 */
__global__ void vector_add(float *out, const float *a, const float *b, int n)
{
    // Treat a non-positive n as "no work" before switching to unsigned math.
    const size_t count = n > 0 ? (size_t)n : 0;
    // Widen before multiplying so the index arithmetic cannot wrap in 32 bits.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride)
    {
        out[i] = a[i] + b[i];
    }
}

/*
 * Report a failed CUDA runtime call on stderr.
 * Returns 1 when err is cudaSuccess, 0 otherwise.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return 1;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Adds two VECTOR_LENGTH-element vectors on the GPU using one thread
 * (<<<1,1>>>), copies the result back and compares it with the sum
 * computed on the host.
 *
 * Prints "out[0] is ..." and "PASSED" and returns 0 when every step
 * succeeds; otherwise prints a diagnostic on stderr and returns 1.
 * Once a step fails, the remaining work is skipped but all buffers
 * are still released.
 */
int main()
{
    const size_t bytes = sizeof(float) * VECTOR_LENGTH;
    float *a = NULL, *b = NULL, *out = NULL;
    float *d_a = NULL, *d_b = NULL, *d_out = NULL;
    int ok;

    //=================== Step 1 ===================
    // Allocate memory on the CPU and initialize the operands
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);

    ok = (a != NULL && b != NULL && out != NULL);
    if (!ok)
    {
        fprintf(stderr, "Host memory allocation failed\n");
    }
    else
    {
        for (int i = 0; i < VECTOR_LENGTH; i++)
        {
            a[i] = 3.0f;
            b[i] = 0.14f;
        }
    }

    //=================== Step 2 ===================
    // Allocate memory on the GPU (&& short-circuits after the first failure)
    ok = ok
        && cuda_ok(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)")
        && cuda_ok(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)")
        && cuda_ok(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)");

    //=================== Step 3 ===================
    // Copy the operands to the GPU
    ok = ok
        && cuda_ok(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_a <- a)")
        && cuda_ok(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_b <- b)");

    //=================== Step 4 ===================
    // Launch the kernel with a single block of a single thread. The launch
    // itself does not report errors, so query the launch status explicitly.
    if (ok)
    {
        vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);
        ok = cuda_ok(cudaGetLastError(), "vector_add launch");
    }

    //=================== Step 5 ===================
    // Get the results from the GPU. This blocking copy is where the host
    // waits for the kernel to finish; kernel execution errors surface here.
    ok = ok
        && cuda_ok(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(out <- d_out)");

    // Test the result with an explicit check (an assert would vanish under
    // NDEBUG). The negated '<' also rejects NaN.
    if (ok)
    {
        for (int i = 0; i < VECTOR_LENGTH; i++)
        {
            if (!(fabsf(out[i] - a[i] - b[i]) < MAX_ERR))
            {
                fprintf(stderr, "Mismatch at index %d: got %f, expected %f\n",
                        i, out[i], a[i] + b[i]);
                ok = 0;
                break;
            }
        }
    }
    if (ok)
    {
        printf("out[0] is %f\n", out[0]);
        printf("PASSED\n");
    }

    //=================== Step 6 ===================
    // Free the memory; cudaFree and free both accept null pointers, so this
    // is safe even when an earlier allocation failed.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    return ok ? 0 : 1;
}

'out[0] is 3.140000\nPASSED\n'