In [2]:
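# One-time environment setup (uncomment and run once, then restart the kernel):
# %pip install ipympl                                   # backend for `%matplotlib widget`
# %pip install black isort jupyterlab-code-formatter    # optional: code formatting in JupyterLab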

In [3]:
%matplotlib widget
import cupy as cp
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.linalg.blas as sblas
import sympy as sy

# Sanity check: show the current CUDA device, its compute capability string,
# and the stream CuPy is currently issuing work on.
cp.cuda.Device(), cp.cuda.device.get_compute_capability(), cp.cuda.get_current_stream()

(<CUDA Device 0>, '86', <Stream 0 (device -1)>)

In [4]:
# Two small host (NumPy) operands ...
x_cpu, y_cpu = np.array([1, 2, 3]), np.array([4, 5, 6])

# ... and a device (CuPy) copy of the first one.
x_gpu = cp.asarray(x_cpu)

In [5]:
# Bring the device array back to the host, then add with NumPy.
x_gpu.get() + y_cpu

array([5, 7, 9])

In [6]:
# Upload the host operand, then add on the device; the result is a CuPy array.
y_gpu = cp.asarray(y_cpu)
x_gpu + y_gpu

array([5, 7, 9])

In [7]:
# Round-trip an empty array: host list -> device array -> host NumPy array.
sg_all = cp.asarray([])
s_cpall = sg_all.get()

In [8]:
# plt.plot([1, 34, 23, 55, 8])
# plt.show()

In [9]:
import cupy

# Reference result, computed on whatever stream is current, then made visible
# by synchronizing the whole device.
x = cupy.array(range(9)).reshape((3, 3))
y = cupy.array(range(9, 18)).reshape((3, 3))
expected = cupy.matmul(x, y)
cupy.cuda.Device().synchronize()

# Variant 1: make a stream current only for the duration of a `with` block.
stream = cupy.cuda.stream.Stream()
with stream:
    for _ in range(10000):
        z = cupy.matmul(x, y)
stream.synchronize()
cupy.testing.assert_array_equal(z, expected)

# Variant 2: `Stream.use()` makes the stream current until another stream is
# selected, so it would silently stay current for every later cell. Restore the
# default (null) stream afterwards, even if the check fails.
stream = cupy.cuda.stream.Stream()
stream.use()
try:
    z = cupy.matmul(x, y)
    stream.synchronize()
    cupy.testing.assert_array_equal(z, expected)
finally:
    cupy.cuda.Stream.null.use()
z

array([[ 42,  45,  48],
       [150, 162, 174],
       [258, 279, 300]])

In [10]:
# Example adapted from the CuPy user guide:
# https://docs.cupy.dev/en/stable/user_guide/kernel.html
#
# Type-generic elementwise kernel: for each pair (x, y) it yields x * y when
# x - 2 > y and x + y otherwise.
kernel = cp.ElementwiseKernel(
    in_params="T x, T y",
    out_params="T z",
    operation="""
     if (x - 2 > y) {
       z = x * y;
     } else {
       z = x + y;
     }
     """,
    name="my_kernel",
)

# A (2, 3) matrix against a length-3 vector: the kernel broadcasts its inputs,
# so the result has shape (2, 3).
x = cp.arange(6, dtype=cp.float64).reshape((2, 3))
y = cp.arange(3, dtype=cp.float64)
kernel(x, y)

array([[ 0.,  2.,  4.],
       [ 0.,  4., 10.]])

In [24]:
# EXAMPLE: a raw CUDA kernel that adds two float32 buffers element by element.
# The kernel has no bounds check, so the launch below must cover exactly as many
# threads as there are elements.
add_kernel = cp.RawKernel(
    r"""
typedef float T;

extern "C" __global__
void my_add(const T* x1, const T* x2, T* y) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
""",
    "my_add",
)

side = 5
x1 = cp.arange(side * side, dtype=cp.float32).reshape(side, side)
x2 = x1.copy()
y = cp.zeros_like(x1)

# 5 blocks of 5 threads = 25 threads, one per element.
grid, block = (side,), (side,)
add_kernel(grid, block, (x1, x2, y))

y

array([[ 0.,  2.,  4.,  6.,  8.],
       [10., 12., 14., 16., 18.],
       [20., 22., 24., 26., 28.],
       [30., 32., 34., 36., 38.],
       [40., 42., 44., 46., 48.]], dtype=float32)

In [55]:
# cp.cuda.Device().attributes

In [104]:
powers = cp.RawKernel(
    r"""
typedef float T;

extern "C" __global__
void powers(const int p, const T* x, T* y) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    T c = x[tid];
    int n = p;
    int i = tid * p;
    while (true) {
        // printf("<%d (b=%d, t=%d)> [%d] at %d = %f # %d\n", tid, blockIdx.x, threadIdx.x, i, n, c, p);
        y[i++] = c;
        if (n <= 1) break;
        n--;
        c *= x[tid];
    }
}
""",
    "powers",
)
n = 5000
p = 4
yn = p * n
try:
    x = cp.arange(n, dtype=cp.float32) + 1
    print(x)
    y = cp.zeros((yn,), dtype=cp.float32)
    print(powers.attributes)

    bs = min(powers.max_threads_per_block, n)
    b = n // bs
    if n % bs != 0:
        b += 1
    print(b, bs)

    %timeit powers((b,), (bs,), (p, x, y))  # grid (number of blocks), block and arguments
    print(y)
finally:
    del x
    del y

[1.000e+00 2.000e+00 3.000e+00 ... 9.998e+03 9.999e+03 1.000e+04]
{'max_threads_per_block': 1024, 'shared_size_bytes': 0, 'const_size_bytes': 0, 'local_size_bytes': 0, 'num_regs': 18, 'ptx_version': 86, 'binary_version': 86, 'cache_mode_ca': 0, 'max_dynamic_shared_size_bytes': 49152, 'preferred_shared_memory_carveout': -1}
10 1024
3.46 µs ± 89.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
[1.e+00 1.e+00 1.e+00 ... 1.e+08 1.e+12 1.e+16]


In [98]:
# Reciprocal of ~3.5e-6 — presumably the ~3.5 µs per-loop %timeit figure from the
# `powers` cell expressed as calls per second (TODO confirm).
1 / 3.5e-6

285714.28571428574

In [99]:
# Reciprocal of a measured value of ~0.00605; presumably a duration in seconds
# converted to a rate per second. The measurement that produced this number is
# not shown in the notebook — TODO confirm its origin.
1 / 0.006051632634559937

165.24466377703678