In [1]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 10.5MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 9.8MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 8.4MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) ...

In [3]:
import pycuda.autoinit

from pycuda.tools import make_default_context
make_default_context().get_device().name()

'Tesla T4'

In [4]:
import numpy as np
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit
import time

m_size = 100

def matmul_GPU(a_gpu,b_gpu,m_size=m_size):
    kernel_code_template = """
    __global__ void MatrixMulKernel(float *A, float *B, float *C)
    {

      const uint wA = %(m_size)s;
      const uint wB = %(m_size)s;

      const uint bx = blockIdx.x;
      const uint by = blockIdx.y;

      const uint tx = threadIdx.x;
      const uint ty = threadIdx.y;

      const uint aBegin = wA * %(b_size)s * by;
      const uint aEnd = aBegin + wA - 1;

      const uint aStep = %(b_size)s;

      const uint bBegin = %(b_size)s * bx;
      const uint bStep = %(b_size)s * wB;

      float Csub = 0;

      for (int a = aBegin, b = bBegin;
           a <= aEnd;
           a += aStep, b += bStep)
        {

          __shared__ float As[%(b_size)s][%(b_size)s];

          __shared__ float Bs[%(b_size)s][%(b_size)s];

          As[ty][tx] = A[a + wA * ty + tx];
          Bs[ty][tx] = B[b + wB * ty + tx];

          __syncthreads();
          for (int k = 0; k < %(b_size)s; ++k)
            Csub += As[ty][k] * Bs[k][tx];
          __syncthreads();
        }

      const uint c = wB * %(b_size)s * by + %(b_size)s * bx;
      C[c + wB * ty + tx] = Csub;
    }
    """

    t_size = 2
    b_size = t_size

    kernel_code = kernel_code_template % {
        'm_size': m_size,
        'b_size': b_size,
        }

    mod = compiler.SourceModule(kernel_code)
    
    c_gpu = gpuarray.empty((m_size, m_size), np.float32)

    matrixmul = mod.get_function("MatrixMulKernel")

    matrixmul(
        a_gpu, b_gpu,
        c_gpu,
        grid = (m_size // t_size, m_size // t_size),
        block = (t_size, t_size, 1),
        )

    return c_gpu


def matmul_CPU(matrix1, matrix2):
    rmatrix = np.zeros(shape=(matrix1.shape[0], matrix2.shape[1]))
    for i in range(len(matrix1)):
        for j in range(len(matrix2[0])):
            for k in range(len(matrix2)):
                rmatrix[i][j] += matrix1[i][k] * matrix2[k][j]
    return rmatrix

In [5]:
cpu_time = []
gpu_time = []
diffs = []

for size in [128, 256, 512]:
    a_cpu = np.random.randn(size, size).astype(np.float32)
    b_cpu = np.random.randn(size, size).astype(np.float32)

    print("Размерность матрицы:", size)
    
    startCPU = time.time()
    c_cpu = matmul_CPU(a_cpu, b_cpu)
    endCPU = time.time()
    timeCPU = endCPU -startCPU

    print("время на CPU:", timeCPU)
    cpu_time.append(timeCPU)

    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    startGPU = time.time()
    c_gpu = matmul_GPU(a_gpu, b_gpu, size)
    endGPU = time.time()
    timeGPU = endGPU-startGPU

    print("время на GPU:", timeGPU)
    gpu_time.append(timeGPU)

    differensetime = timeCPU-timeGPU
    print("CPU-GPU:", differensetime)
    diffs.append(differensetime)
    
    print ("___________________________")

Размерность матрицы: 128
время на CPU: 2.354503631591797
время на GPU: 0.9202377796173096
CPU-GPU: 1.4342658519744873
___________________________
Размерность матрицы: 256
время на CPU: 19.08896255493164
время на GPU: 0.27205991744995117
CPU-GPU: 18.81690263748169
___________________________
Размерность матрицы: 512
время на CPU: 153.40163278579712
время на GPU: 0.29764509201049805
CPU-GPU: 153.10398769378662
___________________________
