# GPU Programming with CUDA: Matrix Product

Execute the cells below to run the code in google colab environment.

In [3]:
! pip install git+git://github.com/frehseg/nvcc4jupyter.git

Collecting git+git://github.com/frehseg/nvcc4jupyter.git
  Cloning git://github.com/frehseg/nvcc4jupyter.git to /tmp/pip-req-build-fiow7rro
  Running command git clone -q git://github.com/frehseg/nvcc4jupyter.git /tmp/pip-req-build-fiow7rro
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.1-cp36-none-any.whl size=2095 sha256=c4c4c5d937f94d182d11c28e0ae597e69d6e2b955cc2d4f84849129226334380
  Stored in directory: /tmp/pip-ephem-wheel-cache-uy7hq78v/wheels/a4/a5/24/17a2b61f9a725a10155cc6fca753aae28436921df21fa16114
Successfully built NVCCPlugin


In [0]:
%load_ext nvcc_plugin

## Classic product version without shared memory  
  
In this naive version, we compute the product $C = A * B$ in global memory with the standard matrix multiplication algorithm.

* Each thread calculates an element of C
* Each thread accesses to a whole line of A and a whole column of B

**Downsides of this method**:
* The access to data is non-aligned and scattered
* Pb of coalescing
* Repeated data access

In [39]:
%%cu
#include <stdlib.h>
#include <stdio.h>

#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

#define TILE_WIDTH 32
#define SIZE 300
/********************** kernel **************************/
__global__
void matmul(float *A, float *B, float *C, int nb_ColA, int nb_ColB, int nb_RowA, int nb_RowB)
{
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nb_RowA && col < nb_RowB) {
      for(int i = 0; i < nb_ColA; i++){
          C[row * nb_ColB + col] += A[row * nb_ColA + i] * B[i * nb_ColB + col];
      }
  }
}

/********************** main **************************/
int main(void)
{
  float *A, *B, *C, *gpu_A, *gpu_B, *gpu_C;
  int nbRowA, nbRowB, nbColA, nbColB;

  
  nbRowA = TILE_WIDTH * SIZE;
  nbRowB = TILE_WIDTH * SIZE;
  nbColA = TILE_WIDTH * SIZE;
  nbColB = TILE_WIDTH * SIZE;

  A = (float*) malloc(nbRowA * nbColA * sizeof(float));
  B = (float*) malloc(nbRowB * nbColB * sizeof(float));
  C = (float*) malloc(nbRowA * nbColB * sizeof(float));

  /*Allocation de l'espace pour le GPU */
  cudaMalloc((void**) &gpu_A, nbRowA * nbColA * sizeof(float));
  cudaMalloc((void**) &gpu_B, nbRowB * nbColB * sizeof(float));
  cudaMalloc((void**) &gpu_C, nbRowA * nbColB * sizeof(float));

  /* Initialisation de A et B*/
  for (int i = 0; i < nbRowA * nbColA; i++) {
    A[i] = 1.0;
  }

  for (int i = 0; i < nbRowB * nbColB; i++) {
    B[i] = 2.0;
  }

  /* Copie de A et B sur le GPU */
  cudaMemcpy(gpu_A, A, nbRowA * nbColA * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_B, B, nbRowB * nbColB * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_C, C, nbRowA * nbColB * sizeof(float), cudaMemcpyHostToDevice);


  /* Lancement du kernel avec mesure du temps */
 dim3 threadsPerBlock (TILE_WIDTH, TILE_WIDTH);
 dim3 blocksPerGrid (nbRowA / threadsPerBlock.y, nbRowB / threadsPerBlock.x);

 /** Timer start**/
 clock_t start = clock();
 
 matmul<<<blocksPerGrid, threadsPerBlock>>>(gpu_A, gpu_B, gpu_C, nbColA, nbColB, nbRowA, nbRowB);

 clock_t end = clock();
 double time_taken = ((double) end-start)/CLOCKS_PER_SEC; // in seconds
 printf("\nkernel took %f seconds to execute \n\n", time_taken);
 
  cudaMemcpy(C, gpu_C, nbRowA * nbColB * sizeof(float), cudaMemcpyDeviceToHost);

  /* Vérification du résultat*/
  float maxError = 0.0f;
  for (int i = 0; i < nbRowA * nbColB; i++){
      maxError = max(maxError, abs(C[i]- 2*nbRowB));
  }
  printf("Max error: %f\n", maxError);

  /* Libération de la mémoire sur le GPU*/ 
  cudaFree(gpu_A);
  cudaFree(gpu_B);
  cudaFree(gpu_C);

  free(A);
  free(B);
  free(C);
 
 printf("%s\n", cudaGetErrorString(cudaGetLastError()));
}



kernel took 0.000033 seconds to execute 

Max error: 0.000000
no error



## Shared memory version

Optimization: we can divide the matrix in different tiles and compute the product tile after tile.

With this, we have:

* Cooperative loading of a data tile in shared memory with regular accesses to global memory + quick accesses once in shared memory.
* Some synchronization points to ensure that the data of partial results computed by threads are correctly loaded.

In [40]:
%%cu
#include <stdlib.h>
#include <stdio.h>

#include <time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

#define TILE_WIDTH 32
#define SIZE 300
/********************** kernel **************************/
__global__
void matmul(float *A, float *B, float *C, int nb_ColA, int nb_ColB, int nb_RowA, int nb_RowB)
{
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    // Tuile de la matrice C
    float* Csub = &C[nb_ColB * TILE_WIDTH * blockRow + TILE_WIDTH * blockCol];

    // Valeur de C calculée par un thread
    float Cvalue = 0;

    // Row et colonne dans une tuile de C
    int row = threadIdx.y;
    int col = threadIdx.x;

    // On parcoure les differentes tuiles de A et B
    // qui sont necessaires au calcul de la tuile C
    for (int m = 0; m < (nb_ColA / TILE_WIDTH); ++m) {

        // Tuile de A
        float* Asub = &A[nb_ColA * TILE_WIDTH * blockRow + TILE_WIDTH * m];

        // Tuile de B
        float* Bsub = &B[nb_ColB * TILE_WIDTH * m + TILE_WIDTH * blockCol];

        // Memoire partagee pour stocker les tuiles de A et B
        __shared__ float As[TILE_WIDTH][TILE_WIDTH];
        __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

        // Chargement de dans la memoire partagee
        // des valeurs de A et B par un thread
        As[row][col] = Asub[row * nb_ColA + col];
        Bs[row][col] = Bsub[row * nb_ColB + col];

        // Synchronisation des threads avant calcul pour C
        __syncthreads();
        // Multiply Asub and Bsub together
        for (int e = 0; e < TILE_WIDTH; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Autre Synchronisation avant de charger des nouvelles
        // tuiles pour A et B
        __syncthreads();
    }

    // Ecriture dans C par le thread de la valeur accumulee
    Csub[row * nb_ColA + col] = Cvalue;
}

/********************** main **************************/
int main(void)
{
  float *A, *B, *C, *gpu_A, *gpu_B, *gpu_C;
  int nbRowA, nbRowB, nbColA, nbColB;

  
  nbRowA = TILE_WIDTH * SIZE;
  nbRowB = TILE_WIDTH * SIZE;
  nbColA = TILE_WIDTH * SIZE;
  nbColB = TILE_WIDTH * SIZE;

  A = (float*) malloc(nbRowA * nbColA * sizeof(float));
  B = (float*) malloc(nbRowB * nbColB * sizeof(float));
  C = (float*) malloc(nbRowA * nbColB * sizeof(float));

  /*Allocation de l'espace pour le GPU */
  cudaMalloc((void**) &gpu_A, nbRowA * nbColA * sizeof(float));
  cudaMalloc((void**) &gpu_B, nbRowB * nbColB * sizeof(float));
  cudaMalloc((void**) &gpu_C, nbRowA * nbColB * sizeof(float));

  /* Initialisation de A et B*/
  for (int i = 0; i < nbRowA * nbColA; i++) {
    A[i] = 1.0;
  }

  for (int i = 0; i < nbRowB * nbColB; i++) {
    B[i] = 2.0;
  }

  /* Copie de A et B sur le GPU */
  cudaMemcpy(gpu_A, A, nbRowA * nbColA * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_B, B, nbRowB * nbColB * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_C, C, nbRowA * nbColB * sizeof(float), cudaMemcpyHostToDevice);


  /* Lancement du kernel avec mesure du temps */
 dim3 threadsPerBlock (TILE_WIDTH, TILE_WIDTH);
 dim3 blocksPerGrid (nbRowA / threadsPerBlock.y, nbRowB / threadsPerBlock.x);

 clock_t start = clock();

 matmul<<<blocksPerGrid, threadsPerBlock>>>(gpu_A, gpu_B, gpu_C, nbColA, nbColB, nbRowA, nbRowB);

 clock_t end = clock();
 double time_taken = ((double) end-start)/CLOCKS_PER_SEC; // in seconds
 printf("\n Kernel took %f seconds to execute\n\n", time_taken);

  cudaMemcpy(C, gpu_C, nbRowA * nbColB * sizeof(float), cudaMemcpyDeviceToHost);

  /* Vérification du résultat*/
  float maxError = 0.0f;
  for (int i = 0; i < nbRowA * nbColB; i++){
      maxError = max(maxError, abs(C[i]- 2*nbRowB));
  }
  printf("Max error: %f\n", maxError);

  /* Libération de la mémoire sur le GPU*/ 
  cudaFree(gpu_A);
  cudaFree(gpu_B);
  cudaFree(gpu_C);

  free(A);
  free(B);
  free(C);
 
 printf("%s\n", cudaGetErrorString(cudaGetLastError()));
}


 Kernel took 0.000030 seconds to execute

Max error: 0.000000
no error

