In [1]:
import sympy as sy
import numpy as np
import scipy as sp
import scipy.linalg.blas as sblas
import cupy 
cupy.cuda.Device(), cupy.cuda.get_current_stream()

(<CUDA Device 0>, <Stream 0 (device -1)>)

In [2]:
A = np.array([[3, 4, 5], [6, 7, 8]], dtype=float)
A

array([[3., 4., 5.],
       [6., 7., 8.]])

In [108]:
np.matmul(A, A.T)

array([[14, 32],
       [32, 77]])

In [109]:
np.matmul(A.T, A)

array([[17, 22, 27],
       [22, 29, 36],
       [27, 36, 45]])

In [40]:
C0 = np.array([[10, 20], [30, 40]], dtype=float)
C0

array([[10., 20.],
       [30., 40.]])

In [110]:
C = np.copy(C0)
assert np.matmul(A, A.T).shape == C.shape
def dsyrk_full_ref(alfa, A, beta, C):
    return alfa * np.matmul(A, A.T) + beta * C
C_ref_full = dsyrk_full_ref(1.0, A, 1.0, C)
C_ref_full

array([[ 24.,  52.],
       [ 62., 117.]])

In [115]:
# Passing in row-major matrix, in trans=False, in [lower=False] 
C = np.copy(C0)
print('c in\n', C)
alpha=1.0
beta=1.0
print(np.isfortran(C))
# C2 = sblas.dsyrk(alpha, A, beta, C, trans=False, lower=False, overwrite_c=True)
c_out = sblas.dsyrk(alpha, A.T, beta, C, trans=True, lower=True, overwrite_c=True)
print('c out\n', c_out)
print('c after\n', C)
C == C_ref_full

c in
 [[10. 20.]
 [30. 40.]]
False
c out
 [[ 24.  20.]
 [ 62. 117.]]
c after
 [[10. 20.]
 [30. 40.]]


array([[False, False],
       [False, False]])

In [65]:
# Passing in row-major matrix, in trans=False, in [lower=True]
C = np.copy(C0)
print('c in\n', C)
alpha=1.0
beta=1.0
print(np.isfortran(C))
# C2 = sblas.dsyrk(alpha, A, beta, C, trans=False, lower=False, overwrite_c=True)
c_out = sblas.dsyrk(alpha, A.T, beta, C.T, trans=True, lower=False, overwrite_c=True)
print('c out\n', c_out)
print('c t after\n', C.T)
print('c after\n', C)
C == C_ref_full

c in
 [[10. 20.]
 [30. 40.]]
False
c out
 [[ 60. 116.]
 [ 20. 189.]]
c t after
 [[ 60. 116.]
 [ 20. 189.]]
c after
 [[ 60.  20.]
 [116. 189.]]


array([[ True, False],
       [ True,  True]])

In [103]:
# Different sizes
A = np.arange(1, 1 + 6).reshape((2,3))
C = 
# c_out = sblas.dsyrk(alpha, A.T, beta, C.T, trans=True, lower=False, overwrite_c=True)

In [89]:
# TRANSPOSING A
print(np.matmul(A.T, A))
C3 = np.arange(7, 7+3*3).reshape((3,3))
C3_ref_full = dsyrk_full_ref(1.0, A.T, 1.0, C3)
print(C3_ref_full)
C3

[[45. 54. 63.]
 [54. 65. 76.]
 [63. 76. 89.]]
[[ 52.  62.  72.]
 [ 64.  76.  88.]
 [ 76.  90. 104.]]


array([[ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15]])

In [100]:
# Passing in row-major matrix, in [trans=True], in [lower=False] ===========
C = np.copy(C3, order='F')
print('c in\n', C)
print(np.isfortran(C))
c_out = sblas.dsyrk(1.0, A, 1.0, C, trans=False, lower=False, overwrite_c=True)
print('c out\n', c_out)
print('c t after\n', C.T)
print('c after\n', C)
assert c_out is C
C == C3_ref_full

c in
 [[ 7  8  9]
 [10 11 12]
 [13 14 15]]
True


ValueError: failed in converting 2nd keyword `c' of _fblas.dsyrk to C/Fortran array

In [86]:
A = np.array([[3, 4, 5]], dtype=float)
C = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)
triangle_indices = np.triu_indices(C.shape[0], 0)
C[triangle_indices] = (2.0 * np.matmul(A.T, A) + 5.0 * C)[triangle_indices]
C

array([[23., 34., 45.],
       [ 4., 57., 70.],
       [ 7.,  8., 95.]])

In [89]:
rng = np.random.default_rng()
rng.random((3, 4)) 
rng.choice(['a', 'r'], size=(2,3))

array([['a', 'a', 'a'],
       ['a', 'r', 'r']], dtype='<U1')

In [100]:
a = np.arange(9).reshape((3,3)) + 1
iu = np.triu_indices_from(a, 0)
il = np.tril_indices_from(a, -1)
# np.triu(a, 0)
a[il] = -1
a

array([[ 1,  2,  3],
       [-1,  5,  6],
       [-1, -1,  9]])

In [3]:
import cupy

x = cupy.array(range(9)).reshape((3,3))
y = cupy.array(range(9, 18)).reshape((3,3))
expected = cupy.matmul(x, y)
cupy.cuda.Device().synchronize()

stream = cupy.cuda.stream.Stream()
with stream:
    for k in range(10000):
        z = cupy.matmul(x, y)
stream.synchronize()
cupy.testing.assert_array_equal(z, expected)

stream = cupy.cuda.stream.Stream()
stream.use()
z = cupy.matmul(x, y)
stream.synchronize()
cupy.testing.assert_array_equal(z, expected)
z

array([[ 42,  45,  48],
       [150, 162, 174],
       [258, 279, 300]])

In [3]:
import cupy
import cupy.cublas as cupy_blas
import numpy as np

x = cupy.arange(6, dtype='f').reshape(2, 3)
y = cupy.arange(3, dtype='f')
kernel = cupy.ElementwiseKernel(
     'float32 x, float32 y', 'float32 z',
     '''
     if (x - 2 > y) {
       z = x * y;
     } else {
       z = x + y;
     }
     ''', 'my_kernel')
kernel(x, y)

array([[ 0.,  2.,  4.],
       [ 0.,  4., 10.]], dtype=float32)

In [8]:
a = cupy.arange(9, dtype=float).reshape(3, 3)
c = cupy.zeros(9, dtype=float).reshape(3, 3)
print(a)
print(c)

cu_a = cupy.asarray(a)
cu_c = cupy.asarray(c)

cupy_blas.syrk(trans='N', a=cu_a, out=cu_c, alpha=1.0, beta=1.0)
print(cu_c)
cupy_blas.syrk(trans='N', a=cu_a, out=cu_c, beta=1.0)
cupy_blas.syrk(trans='N', a=cu_a, out=cu_c, beta=1.0)
#cupy_blas.syrk(trans='N', a=cu_a, out=cu_c, beta=1.0)
#cupy_blas.syrk(trans='N', a=cu_a, out=cu_c, beta=1.0)
print(cu_c)

[[0. 1. 2.]
 [3. 4. 5.]
 [6. 7. 8.]]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[  5.  14.  23.]
 [  0.  50.  86.]
 [  0.   0. 149.]]
[[ 15.  42.  69.]
 [  0. 150. 258.]
 [  0.   0. 447.]]


In [9]:
c = cupy.asnumpy(cu_c)
print(c)

[[ 15.  42.  69.]
 [  0. 150. 258.]
 [  0.   0. 447.]]
