In [1]:
import numpy as np
import time
import cupy as cp

In [8]:
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void matmul(const float* a, const float* b, float* c, int width) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    float value = 0.0, elem1 = 0.0, elem2 = 0.0;

    for(int i = 0; i < width; i++)
	{
		elem1 = a[y * width + i];
		elem2 = b[i * width + x];

		value += elem1 * elem2;
	}

    c[y * width + x] = value;
}
''',
"matmul")

In [40]:
heigth = 2000
width = 2000

In [41]:
A = cp.ones((width,heigth), dtype=cp.float32)
B = cp.ones((width,heigth), dtype=cp.float32)
res_C = cp.zeros((width,heigth), dtype=cp.float32)

GPU

In [42]:
gstart = time.perf_counter()
result = add_kernel((200, 200), (10, 10), (A, B, res_C, width))
gend = time.perf_counter()

print(gend - gstart)

0.00029392799979177653


 CPU



In [43]:
def cpu_matmul(a, b, n):
    c = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            c[i, j] = 0
            for k in range(n):
                c[i, j] += a[i, j]*b[i, j]
    return c


x = np.arange(width*heigth, dtype=np.float32).reshape(width,heigth)
y = np.arange(width*heigth, dtype=np.float32).reshape(width,heigth)

cstart = time.perf_counter()
cpu_matmul(x, y, width)
cend = time.perf_counter()

print(cend-cstart)

6325.471939096
