# CUDA Exercise 04
> Matrix summation (element-wise addition of two matrices) on the GPU, executed by a single GPU thread.

This Jupyter Notebook can also be opened in Google Colab, so you don't have to buy a PC with a graphics card to play with CUDA. To launch Google Colab, please click the icon below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_04.ipynb)

## Initialize the CUDA dev environment

In [4]:
# clone the code repo,
!pip install git+git://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

# Check the environment 
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-t778hzfn
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-t778hzfn
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=8873f16186676dbca7cd6b1588c46f86c101f2a8cdd29b38b813a2ca468ed8f7
  Stored in directory: /tmp/pip-ephem-wheel-cache-yr5jb27e/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda co

## Matrix Summation

In [5]:
%%cu
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Matrix dimensions: M rows by N columns, stored row-major (index = row * N + col).
#define M 10
#define N 10
// Largest absolute difference tolerated when the host verifies the GPU result.
#define MAX_ERR 1e-4

/*
 * Element-wise sum of two m-by-n matrices stored contiguously (row-major):
 *     out[k] = a[k] + b[k]   for every k in [0, m*n).
 *
 * out, a and b must be device pointers to at least m*n floats each.
 *
 * Launch layout: any 1D grid of 1D blocks. The elements are distributed over
 * all launched threads, each thread stepping by the total thread count, so
 * every element is written exactly once. With <<<1,1>>> the single thread
 * walks the whole matrix sequentially, exactly as in this exercise.
 * Only the x dimension of the grid/block is used for indexing.
 */
__global__ void matrix_summation(float* out, float *a, float *b, int m, int n) 
{
  // Empty or invalid shapes: nothing to do. This also keeps the unsigned
  // element count below from wrapping around on negative inputs.
  if (m <= 0 || n <= 0)
  {
      return;
  }

  // Compute in size_t so that m*n and the thread index cannot overflow int.
  const size_t total  = (size_t)m * (size_t)n;
  const size_t stride = (size_t)gridDim.x * blockDim.x;

  for (size_t k = (size_t)blockIdx.x * blockDim.x + threadIdx.x; k < total; k += stride)
  {
      out[k] = a[k] + b[k];
  }
}

// Abort with file/line context when a CUDA runtime call reports an error.
// Unchecked failures would otherwise surface later as unrelated-looking bugs.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Host driver: fills two M x N matrices, adds them on the GPU with a single
 * thread, copies the result back and verifies it against a host-side sum.
 *
 * Returns 0 when every element matches within MAX_ERR, EXIT_FAILURE otherwise
 * (or when a host allocation or any CUDA call fails).
 */
int main()
{
    const size_t bytes = sizeof(float) * (M * N);

    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
 
    a = (float*)malloc(bytes);
    b = (float*)malloc(bytes);
    out = (float*)malloc(bytes);
    if (a == NULL || b == NULL || out == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(out);
        return EXIT_FAILURE;
    }

    // Data initialization: a depends on the row only, b on the column only
    for(int i = 0; i < M; i++)
    {
        for(int j = 0; j < N; j++)
        {
            int index = i*N+j;
            a[index] = i*3.14f;
            b[index] = j;
        }
    }
    printf("a[12] = %f\n", a[12]);
    printf("b[12] = %f\n", b[12]);

    // Allocate memory on GPU
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, bytes));

    // Copy the operands to the GPU
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // The launch itself is asynchronous and returns no status:
    // cudaGetLastError() reports launch failures, and the synchronize makes
    // the CPU wait for the kernel and reports errors raised while it ran.
    matrix_summation<<<1,1>>>(d_out, d_a, d_b, M, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
 
    // Get results from the GPU
    CUDA_CHECK(cudaMemcpy(out, d_out, bytes, cudaMemcpyDeviceToHost));
 
    // Test the result. An explicit check is used instead of assert() so the
    // verification still runs when the program is compiled with NDEBUG.
    int mismatches = 0;
    for(int index = 0; index < M * N; index++)
    {
        // Written as !(x < tol) so that a NaN result is counted as a mismatch.
        if (!(fabsf(out[index] - a[index] - b[index]) < MAX_ERR))
        {
            fprintf(stderr, "FAILED: out[%d] = %f, expected %f\n",
                    index, out[index], a[index] + b[index]);
            mismatches++;
        }
    }
    if (mismatches == 0)
    {
        printf("out[12] = %f\n", out[12]);
        printf("PASSED\n");
    }
 
    // Free the memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_out));
    free(a);
    free(b);
    free(out);
    
    return mismatches == 0 ? 0 : EXIT_FAILURE;
}

'a[12] = 3.140000\nb[12] = 2.000000\nout[12] = 5.140000\nPASSED\n'