In [None]:
import numpy as np
from numba import cuda
from time import time
import math

In [None]:
# Print the 'Model name' line(s) from lscpu to record which CPU this run uses.
!lscpu |grep 'Model name'
# Show the nvidia-smi report (GPU model, driver and current utilisation).
!nvidia-smi

In [None]:
def simple_dot_cpu(matrix1, matrix2):
  """Multiply two 2-D arrays with an explicit triple loop and time the loops.

  Args:
    matrix1: left operand, a 2-D array of shape (rows, inner).
    matrix2: right operand, a 2-D array of shape (inner, cols).

  Returns:
    A ``(matrix_res, elapsed)`` tuple: the float64 product of shape
    (rows, cols) and the wall-clock seconds spent in the loops.

  Raises:
    ValueError: if the inner dimensions of the operands do not match.
  """
  rows, inner = matrix1.shape
  inner2, cols = matrix2.shape
  if inner != inner2:
    raise ValueError(
        'shapes %s and %s are not aligned' % (matrix1.shape, matrix2.shape))

  matrix_res = np.zeros((rows, cols))
  start = time()
  for i in range(rows):
    for j in range(cols):
      for k in range(inner):
        matrix_res[i, j] += matrix1[i, k] * matrix2[k, j]
  # Return only after all three loops have finished; returning from inside
  # the k loop would hand back a matrix with a single partial element.
  return matrix_res, time() - start


In [None]:
def numpy_dot_cpu(matrix1, matrix2):
  """Compute ``np.dot(matrix1, matrix2)`` and measure how long the call takes.

  Returns:
    A ``(product, elapsed)`` tuple, where ``elapsed`` is the difference
    between ``time()`` readings taken just after and just before the call.
  """
  began = time()
  product = np.dot(matrix1, matrix2)
  elapsed = time() - began
  return product, elapsed

In [None]:
@cuda.jit
def simple_dot_gpu(matrix1, matrix2, matrix_res):
  """CUDA kernel: the thread at grid position (i, j) computes matrix_res[i, j].

  Args:
    matrix1: left operand, indexed as ``matrix1[i, k]``.
    matrix2: right operand, indexed as ``matrix2[k, j]``.
    matrix_res: output array; its shape bounds which threads do work.
  """
  i, j = cuda.grid(2)
  # Threads whose grid position lies outside the result do nothing.
  if i < matrix_res.shape[0] and j < matrix_res.shape[1]:
    temp = 0
    for k in range(matrix1.shape[1]):
      temp += matrix1[i, k] * matrix2[k, j]
    # Store once, after the reduction: avoids a write to the output array on
    # every k iteration and still writes 0 when the inner dimension is empty.
    matrix_res[i, j] = temp

In [None]:
def gpu_dot_exec(matrix1, matrix2, matrix_res, N):
  """Run ``simple_dot_gpu`` on the device and time the kernel execution.

  Args:
    matrix1: left operand on the host; copied to the device.
    matrix2: right operand on the host; copied to the device.
    matrix_res: unused; kept so existing callers keep working.
    N: side length of the (N, N) result buffer allocated on the device.

  Returns:
    A ``(matrix_res_gpu, gpu_time)`` tuple: the result copied back to the
    host, and the wall-clock seconds from kernel launch to completion.
    Host/device transfers are not included in ``gpu_time``.
  """
  threads_per_side = 32
  matrix1_global = cuda.to_device(matrix1)
  matrix2_global = cuda.to_device(matrix2)
  matrix_res_global = cuda.device_array((N, N))

  threadsperblock = (threads_per_side, threads_per_side)
  # Grid axis 0 walks rows (i) and axis 1 walks columns (j), so each axis is
  # divided by the block extent along that same axis.
  blockspergrid_x = int(math.ceil(matrix1.shape[0] / threadsperblock[0]))
  blockspergrid_y = int(math.ceil(matrix2.shape[1] / threadsperblock[1]))
  blockspergrid = (blockspergrid_x, blockspergrid_y)

  start = time()
  simple_dot_gpu[blockspergrid, threadsperblock](matrix1_global, matrix2_global, matrix_res_global)
  # Kernel launches return before the kernel finishes; wait for completion
  # so the measured interval covers the actual computation.
  # NOTE(review): the first launch presumably also includes JIT compilation
  # of the kernel; confirm and warm up beforehand if that skews results.
  cuda.synchronize()
  gpu_time = time() - start
  matrix_res_gpu = matrix_res_global.copy_to_host()
  return matrix_res_gpu, gpu_time

In [None]:
def one_test(N):
  """Benchmark the CPU triple loop against the GPU kernel on random N x N data.

  Two matrices of integers in [0, 10) are generated as float64, multiplied on
  the GPU and on the CPU, and the timings are printed.

  Args:
    N: side length of the square test matrices.

  Returns:
    A ``(acceleration, matrix_res_cpu, matrix_res_gpu)`` tuple where
    ``acceleration`` is ``cpu_time / gpu_time`` (``inf`` if the GPU timer
    reported zero) and the other two items are the result matrices.
  """
  matrix1 = np.random.randint(0, 10, (N, N)).astype(np.float64)
  matrix2 = np.random.randint(0, 10, (N, N)).astype(np.float64)
  matrix_res = np.zeros((N, N), dtype=np.float64)

  matrix_res_gpu, gpu_time = gpu_dot_exec(matrix1, matrix2, matrix_res, N)
  # simple_dot_cpu returns (matrix, seconds); unpack both so that the matrix,
  # not the whole tuple, is what gets returned to the caller.
  matrix_res_cpu, cpu_time = simple_dot_cpu(matrix1, matrix2)

  # A coarse clock can report a zero interval; avoid dividing by it.
  acceleration = cpu_time / gpu_time if gpu_time > 0 else float('inf')

  print('Matrix size N =', N)
  print('Computation time by CPU =',cpu_time)
  print('Computation time by GPU =',gpu_time)
  print('Total Acceleration = ', acceleration)
  return acceleration, matrix_res_cpu, matrix_res_gpu

In [None]:
def check_correctly_dot(N):
  """Check the CPU-loop and GPU products against ``np.dot`` and print a verdict.

  Random N x N matrices of integers in [0, 10) are generated as float64 and
  multiplied three ways; each alternative result is compared to the
  ``numpy_dot_cpu`` result with ``np.array_equal``.
  """
  dims = (N, N)
  matrix1 = np.random.randint(0, 10, dims).astype(np.float64)
  matrix2 = np.random.randint(0, 10, dims).astype(np.float64)
  matrix_res = np.zeros(dims).astype(np.float64)

  # Each helper returns (matrix, seconds); only the matrix is needed here.
  reference = numpy_dot_cpu(matrix1, matrix2)[0]
  from_cpu = simple_dot_cpu(matrix1, matrix2)[0]
  from_gpu = gpu_dot_exec(matrix1, matrix2, matrix_res, N)[0]

  cpu_matches = np.array_equal(reference, from_cpu)
  print('Multiplication by cpu is correct' if cpu_matches
        else 'Multiplication by cpu is not correct')

  gpu_matches = np.array_equal(reference, from_gpu)
  print('Multiplication by GPU is correct' if gpu_matches
        else 'Multiplication by GPU is not correct')

In [None]:
# Sanity-check both implementations on a small matrix before benchmarking.
check_correctly_dot(128)

# Matrix side lengths to benchmark; `result` gets one row per size.
MATRIX_SIZES = (128, 256, 512, 1024, 2048)

# Column 0 holds N, column 1 holds the first value returned by one_test(N).
result = np.zeros((len(MATRIX_SIZES), 2), dtype = np.float64)
for row, size in enumerate(MATRIX_SIZES):
  result[row][0], result[row][1] = size, one_test(size)[0]