In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 1.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.1.5-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 4.1 MB/s 
[?25hCollecting pytools>=2011.2
  Downloading pytools-2021.2.8.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=628011 sha256=aee1151588b17a5f7d83c5c4b32305b833f157c00b55f3f8c3858febd950a272
  Stored in directory: /root/.cache/pip/wheels/c4/ef/49/dc6a5feb8d980b37c83d465ecab24949a6aa19458522a9e001
  Building wheel for pytools (setup.py) ... [?25l[?25hdo

In [3]:
import numpy as np
from timeit import default_timer as timer
from pycuda import compiler, gpuarray, tools
import pycuda.driver as drv
import pycuda.autoinit

In [4]:
MATRIX_SIZES = [128, 256, 512, 1024,2048]
BLOCK_SIZE = 16

In [5]:
kernel_code_template = """
__global__ void matrix_multiply(int matrixsize,float *a, float *b, float *c)
{
    // 2D Thread ID 
    int tx = blockDim.x*blockIdx.x + threadIdx.x; // Compute column index
    int ty = blockDim.y*blockIdx.y + threadIdx.y; // Compute row index
    // Each thread loads one row of M and one column of N, 
    //   to produce one element of P.
    if((ty <matrixsize) && (tx < matrixsize))
    {
    // Pvalue is used to store the element of the matrix
    // that is computed by the thread
    float Pvalue = 0;
    for(int k=0; k<matrixsize; ++k)
    {
    float Aelement = a[ty*matrixsize +k];
    float Belement = b[k*matrixsize +tx];
    Pvalue += Aelement * Belement;
    }
    c[ty * matrixsize + tx] = Pvalue;
    }
}
"""

# compile the kernel code
mod = compiler.SourceModule(kernel_code_template)

# get the kernel function from the compiled module
matrix_multiply = mod.get_function("matrix_multiply")




In [6]:
def multiply_with_cpu(a, b):
  return a.dot(b)

In [7]:
def multiply_with_gpu(a, b, MATRIX_SIZE):
  # transfer host (CPU) memory to device (GPU) memory
  a_gpu = gpuarray.to_gpu(a)
  b_gpu = gpuarray.to_gpu(b)

  # create empty gpu array for the result (C = A * B)
  c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
  # set grid size
  #if MATRIX_SIZE%BLOCK_SIZE != 0:
    #  grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
  #else:
  grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)

  # call the kernel on the card
  matrix_multiply(np.uint32(MATRIX_SIZE),
    # inputs
    a_gpu, b_gpu,
    # output
    c_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1),
    )
  return c_gpu  

In [8]:
def calculate(a, b, MATRIX_SIZE):
      start_cpu = timer()
      c_cpu = multiply_with_cpu(a, b)
      cpu_multiply_time = timer() - start_cpu

      start_gpu = timer()
      c_gpu = multiply_with_gpu(a, b, MATRIX_SIZE)
      gpu_multiply_time = timer() - start_gpu
  
      return cpu_multiply_time * 1000, gpu_multiply_time * 1000, np.allclose(c_cpu, c_gpu.get())

In [9]:
count = 15

print(" N \t CPU time \t GPU time \t Speedup")

for size in MATRIX_SIZES:
  cpu_time = 0
  gpu_time = 0

  for i in range(count):
    a = np.random.rand(size, size).astype(np.float32)
    b = np.random.rand(size, size).astype(np.float32)

    current_cpu_time, current_gpu_time, err = calculate (a, b, size)
    cpu_time += current_cpu_time
    gpu_time += current_gpu_time

  if err is False:
      print("N = {:d}: results not equals".format(size))

  print("{:4d} \t {:7.3f} \t {:7.3f} \t {:7.2f}".format(size, cpu_time / count, gpu_time / count, cpu_time / gpu_time))

 N 	 CPU time 	 GPU time 	 Speedup
 128 	   0.609 	   0.993 	    0.61
 256 	   0.636 	   1.266 	    0.50
 512 	   4.599 	   6.229 	    0.74
1024 	  33.211 	  36.059 	    0.92
2048 	 248.920 	 189.160 	    1.32
