
# CUDA Exercise 08
> You should try to implement your own solution for matrix vector multiplication, and try to parallelize the computation.

This Jupyter Notebook can also be opened in Google Colab, so you don't have to buy a PC with a graphics card to play with CUDA. To launch Google Colab, please click the icon below.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_08.ipynb)


## Initialize the CUDA dev environment

In [1]:
# Install the nvcc4jupyter plugin straight from its GitHub repository and load
# its notebook extension. HTTPS is used because GitHub no longer serves the
# unauthenticated git:// protocol.
!pip install git+https://github.com/depctg/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-6ri04v_g
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-6ri04v_g
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4d14ae8e1b5d4553791c7785ff742a5ca7908444bfa86c9a7f151acbb55ff62c
  Stored in directory: /tmp/pip-ephem-wheel-cache-83ylvme0/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
Default out bin result.out


## Check the environment 

In [2]:
# Report the Linux distribution the notebook is running on
!lsb_release -a
# Report the version of the installed CUDA compiler driver (nvcc)
!nvcc --version
# Show the attached NVIDIA GPU, its driver version and current usage
!nvidia-smi

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 18.04.5 LTS
Release:	18.04
Codename:	bionic
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
Sun Apr 25 20:46:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|    

## Naive approach of matrix vector multiplication
Try to optimize it, you can do much better!

In [4]:
%%writefile matrix_vector_multiplication.cu
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define M 100
#define N 100
#define MAX_ERR 1e-4

/*
 * Computes vector_result = matrix_a * vector_b on the GPU.
 *
 * matrix_a is read as a row-major m_row x n_col matrix, vector_b as n_col
 * elements, and vector_result receives m_row elements.
 *
 * Launch requirements:
 *   - 1-D grid with one block per matrix row (gridDim.x >= m_row).
 *   - 1-D block of any size; threads stride over the columns of the row, so
 *     blockDim.x does not have to equal n_col.
 *   - Dynamic shared memory of at least blockDim.x * sizeof(float) bytes.
 *     Passing more than that is harmless.
 */
__global__ void matrix_vector_multiplication(float* vector_result, float *matrix_a, float *vector_b, int m_row, int n_col) 
{
    // Per-block scratch buffer: one partial sum per thread of this block.
    extern __shared__ float temp[];

    // blockIdx.x  => which row this block is responsible for
    // threadIdx.x => first column handled by this thread
    int row = blockIdx.x;

    // Surplus blocks have nothing to do. The condition depends only on
    // blockIdx.x, so the whole block leaves together and no thread is left
    // waiting at the barrier below.
    if (row >= m_row)
    {
        return;
    }

    // Widen before multiplying so the row offset cannot overflow int.
    size_t row_start = (size_t)row * (size_t)n_col;

    // Each thread accumulates the products of the columns it owns. With
    // blockDim.x == n_col every thread handles exactly one column.
    float partial = 0.0f;
    for (int col = threadIdx.x; col < n_col; col += blockDim.x)
    {
        partial += matrix_a[row_start + col] * vector_b[col];
    }

    // Shared memory is private to the block, so it is indexed with the
    // block-local thread id. Every thread writes its slot (possibly 0) so the
    // accumulation below never reads uninitialised memory.
    temp[threadIdx.x] = partial;

    __syncthreads(); // all partial sums must be visible before thread 0 reads them

    // The accumulation only needs to happen in thread 0. It is done in
    // ascending thread order, which matches a left-to-right host summation
    // when blockDim.x == n_col.
    if (threadIdx.x == 0)
    {
        float sum = 0.0f;
        for (int i = 0; i < blockDim.x; i++)
        {
            sum += temp[i];
        }
        vector_result[row] = sum;
    }
}

/* Terminates the program with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Builds an M x N test matrix and an N-element vector on the host, multiplies
 * them on the GPU with matrix_vector_multiplication, and compares every result
 * element against a host reference.
 *
 * Prints "PASSED" and returns 0 when all elements agree within a relative
 * tolerance of MAX_ERR; otherwise reports the first mismatch on stderr and
 * returns EXIT_FAILURE. Any failing allocation or CUDA call ends the program
 * with EXIT_FAILURE.
 */
int main()
{
    const size_t matrix_bytes = sizeof(float) * (M * N);
    const size_t vector_bytes = sizeof(float) * N;
    const size_t result_bytes = sizeof(float) * M;

    float *matrix_a, *vector_b, *vector_result;
    float *d_matrix_a, *d_vector_b, *d_vector_result;

    matrix_a = (float*)malloc(matrix_bytes);
    vector_b = (float*)malloc(vector_bytes);
    vector_result = (float*)malloc(result_bytes);
    if (matrix_a == NULL || vector_b == NULL || vector_result == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(matrix_a);
        free(vector_b);
        free(vector_result);
        return EXIT_FAILURE;
    }

    // Data initialization: row-major matrix, then the vector 1..N
    for(int row_num = 0; row_num < M; row_num++)
    {
        for(int col_num = 0; col_num < N; col_num++)
        {
            int index = row_num*N+col_num;
            matrix_a[index] = row_num*3.14f+col_num;
        }
    }

    for(int col_num = 0; col_num < N; col_num++)
    {
        vector_b[col_num] = col_num+1;
    }

    // Allocate memory on the GPU
    check_cuda(cudaMalloc((void**)&d_matrix_a, matrix_bytes), "cudaMalloc(matrix_a)");
    check_cuda(cudaMalloc((void**)&d_vector_b, vector_bytes), "cudaMalloc(vector_b)");
    check_cuda(cudaMalloc((void**)&d_vector_result, result_bytes), "cudaMalloc(vector_result)");

    // Copy the operands to the GPU
    check_cuda(cudaMemcpy(d_matrix_a, matrix_a, matrix_bytes, cudaMemcpyHostToDevice), "copy matrix_a to device");
    check_cuda(cudaMemcpy(d_vector_b, vector_b, vector_bytes, cudaMemcpyHostToDevice), "copy vector_b to device");

    // One block per row, one thread per column. The dynamic shared-memory
    // size is M*N floats, which covers every index the kernel can form from
    // a global element id.
    matrix_vector_multiplication<<<M,N,matrix_bytes>>>(d_vector_result, d_matrix_a, d_vector_b, M, N);
    // A launch does not return a status itself: fetch configuration errors
    // first, then wait for the kernel so execution errors surface here.
    check_cuda(cudaGetLastError(), "kernel launch");
    check_cuda(cudaDeviceSynchronize(), "kernel execution");

    // Get the results from the GPU
    check_cuda(cudaMemcpy(vector_result, d_vector_result, result_bytes, cudaMemcpyDeviceToHost), "copy result to host");

    // Test the result against a host reference. The sums are large, so the
    // tolerance is relative to the reference value (never below MAX_ERR).
    int mismatches = 0;
    for(int i = 0; i < M && mismatches == 0; i++)
    {
        float expected = 0.0f;
        for(int j = 0; j < N; j++)
        {
            expected = expected + matrix_a[i*N+j]*vector_b[j];
        }

        float allowed = MAX_ERR * fmaxf(1.0f, fabsf(expected));
        if (!(fabsf(vector_result[i] - expected) <= allowed))
        {
            fprintf(stderr, "FAILED at row %d: expected %f, got %f\n", i, expected, vector_result[i]);
            mismatches = 1;
        }
    }
    if (mismatches == 0)
    {
        printf("PASSED\n");
    }

    // Free the memory
    check_cuda(cudaFree(d_matrix_a), "cudaFree(matrix_a)");
    check_cuda(cudaFree(d_vector_b), "cudaFree(vector_b)");
    check_cuda(cudaFree(d_vector_result), "cudaFree(vector_result)");
    free(matrix_a);
    free(vector_b);
    free(vector_result);

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}

Overwriting matrix_vector_multiplication.cu


## Evaluation to collect enough information for the benchmark

In [5]:
# Compile the CUDA source written by the previous cell
!nvcc -o matrix_vector_multiplication matrix_vector_multiplication.cu
# Profile the binary five times with nvprof. main() declares no parameters, so
# the trailing numbers are not read by the program; they appear to serve only
# to tell the runs apart in the profiler output.
!nvprof ./matrix_vector_multiplication 0 0
!nvprof ./matrix_vector_multiplication 1 0
!nvprof ./matrix_vector_multiplication 2 0
!nvprof ./matrix_vector_multiplication 3 0
!nvprof ./matrix_vector_multiplication 4 0

==166== NVPROF is profiling process 166, command: ./matrix_vector_multiplication 0 0
PASSED
==166== Profiling application: ./matrix_vector_multiplication 0 0
==166== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   53.11%  9.5670us         1  9.5670us  9.5670us  9.5670us  matrix_vector_multiplication(float*, float*, float*, int, int)
                   35.17%  6.3360us         2  3.1680us  1.4080us  4.9280us  [CUDA memcpy HtoD]
                   11.72%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   98.77%  317.58ms         3  105.86ms  3.9820us  317.57ms  cudaMalloc
                    0.98%  3.1590ms         1  3.1590ms  3.1590ms  3.1590ms  cuDeviceGetPCIBusId
                    0.11%  360.55us         1  360.55us  360.55us  360.55us  cuDeviceTotalMem
                    0.05%  146.39us       101  1.4490us     140ns  62.080us  cuDeviceGetAttribute
                    0.04%