# CUDA Exercise 03
> Vector dot product (inner product) example on the GPU, computed with a single thread.

This Jupyter Notebook can also be opened in Google Colab, so you don't have to buy a PC with a graphics card to play with CUDA. To launch it in Google Colab, please click the icon below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_03.ipynb)

## Initialize the CUDA dev environment

In [1]:
# Install the nvcc4jupyter plugin straight from its Git repository, then load it.
# The URL uses https:// because GitHub no longer serves the unauthenticated
# git:// protocol, so the former git+git:// URL fails to clone.
!pip install git+https://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

# Check the environment: OS release, CUDA compiler version, and GPU status
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-dcn3mih6
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-dcn3mih6
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=502f57f1df304061f8b68db3c23567f7917f40794f6bdf2e09e21eef86af5570
  Stored in directory: /tmp/pip-ephem-wheel-cache-mk6amdyq/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
Default out bin result.out
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:

## Vector Dot Product

In [2]:
%%cu
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_LENGTH 10 
#define MAX_ERR 1e-5

/*
 * Serial dot product: accumulates a[i] * b[i] for i in [0, n) and stores the
 * scalar total in *out.
 *
 * The loop does not use threadIdx/blockIdx, so every thread that executes this
 * kernel walks all n elements and writes the same total. When n <= 0 the loop
 * body never runs and *out is set to 0.
 *
 * out : where the single float result is written
 * a,b : input vectors, each read at indices 0 .. n-1
 * n   : number of elements to multiply and accumulate
 */
__global__ void vector_dot_product(float *out, float *a, float *b, int n) 
{
    float acc = 0.0f;
    int idx = 0;

    while (idx < n)
    {
        acc += a[idx] * b[idx];
        ++idx;
    }

    out[0] = acc;
}

// Report a failed CUDA runtime call and terminate; `what` names the failed step.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * End-to-end check of vector_dot_product.
 *
 * Fills two host vectors of VECTOR_LENGTH floats (3.14f and 2.0f), copies them
 * to the GPU, runs the kernel with one block of one thread, copies the scalar
 * result back and compares it with a reference computed on the CPU.
 *
 * Prints the result and "PASSED" on success. On any allocation failure, CUDA
 * error or result mismatch it prints a message to stderr and exits with
 * EXIT_FAILURE. The check is an explicit `if`, not assert(), so it still runs
 * when NDEBUG is defined.
 */
void test_vector_dot_product(void)
{
    const size_t vector_bytes = sizeof(float) * VECTOR_LENGTH;
    float *a, *b, *out;
    float *d_a, *d_b, *d_out; 

    // Allocate memory on CPU
    a = (float*)malloc(vector_bytes);
    b = (float*)malloc(vector_bytes);
    out = (float*)malloc(sizeof(float));
    if (a == NULL || b == NULL || out == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // Data initialization; accumulate the CPU reference in the same order as
    // the kernel so the expected value follows VECTOR_LENGTH automatically.
    float expected = 0.0f;
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 3.14f;
        b[i] = 2.0f;
        expected += a[i] * b[i];
    }

    // Allocate memory on GPU
    check_cuda(cudaMalloc((void**)&d_a, vector_bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, vector_bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_out, sizeof(float)), "cudaMalloc(d_out)");

    // Copy operands to GPU
    check_cuda(cudaMemcpy(d_a, a, vector_bytes, cudaMemcpyHostToDevice), "copy a to device");
    check_cuda(cudaMemcpy(d_b, b, vector_bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Launch the kernel. The launch is asynchronous and reports nothing by
    // itself, so query the launch status explicitly.
    vector_dot_product<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    check_cuda(cudaGetLastError(), "kernel launch");

    // Get results from the GPU. This blocking copy waits for the kernel to
    // finish and also surfaces any error raised while it was running.
    check_cuda(cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost),
               "copy result to host");

    // Test the result with a tolerance relative to the expected magnitude;
    // float results on host and device need not match bit for bit.
    // The negated <= also rejects a NaN result.
    const float tolerance = (float)MAX_ERR * fmaxf(1.0f, fabsf(expected));
    if (!(fabsf(*out - expected) <= tolerance))
    {
        fprintf(stderr, "FAILED: out[0] = %f, expected %f\n", out[0], expected);
        exit(EXIT_FAILURE);
    }

    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");

    // Free the memory
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");
    free(a);
    free(b);
    free(out);
}

// Program entry point: runs the single dot-product test and reports success
// to the caller through a zero exit status.
int main(void)
{
    test_vector_dot_product();
    return 0;
}

'out[0] = 62.799995\nPASSED\n'