In [None]:
import cupy as cp
import numpy as np
import cython
import timeit

In [None]:
dim = 128
# Draw the real part first and the imaginary part second, exactly as the
# one-line form would, then combine them into a complex matrix.
real_part = cp.random.rand(dim, dim, dtype=cp.float64)
imag_part = cp.random.rand(dim, dim, dtype=cp.float64)
rand_mat = real_part + 1j*imag_part
# Averaging a matrix with its conjugate transpose gives its Hermitian part.
conj_transpose = rand_mat.conj().T
randH = (rand_mat + conj_transpose)/2
randH[0,0]

In [None]:
%load_ext cython

In [None]:
from Cython.Build import cythonize
# Run Cython on matrix_multiply.pyx with Python-3 semantics (language_level=3)
# and request an annotation report (annotate=True).
# NOTE(review): this call on its own presumably does not build an importable
# extension module; the later `import matrix_multiply` likely needs a separate
# build step (e.g. a setup.py build_ext --inplace run) -- confirm.
cythonize("matrix_multiply.pyx", annotate=True, language_level=3)

In [None]:
# Time `multiply_matrices` on small random complex matrices.
# NOTE(review): `multiply_matrices` is defined in a later cell; run that cell first.
dim = 2
reps = 10
A = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
B = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
C = cp.zeros((dim, dim), dtype=cp.complex128)

# `multiply_matrices` takes host buffers (typed memoryviews / numpy arrays).
# CuPy arrays do not expose the buffer protocol, so stage C-contiguous host
# copies outside the timed region.
A_host = cp.asnumpy(A)
B_host = cp.asnumpy(B)
C_host = np.zeros((dim, dim), dtype=np.complex128)

t_start = timeit.default_timer()
for i in range(reps):
    multiply_matrices(A_host, B_host, C_host)
t_end = timeit.default_timer()

# Mirror the result into the device array so later cells that inspect `C` see it.
C.set(C_host)
print('Each calculation took an average of {} seconds'.format((t_end-t_start)/reps))

In [None]:
import matrix_multiply

In [None]:
dir(matrix_multiply)

In [None]:
%%cython -3 -a -l cublas -l cudart
# Build note: the CUDA toolkit's include/ and lib/ directories must be on the
# compiler's and linker's search paths (e.g. INCLUDE/LIB for MSVC,
# CPATH/LIBRARY_PATH for GCC), or be passed on the line above with -I / -L.
cimport cython
from libc.limits cimport INT_MAX

# Subset of the CUDA runtime API used to stage host buffers in device memory.
cdef extern from "cuda_runtime.h" nogil:
    ctypedef int cudaError_t
    cdef enum cudaMemcpyKind:
        cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost
    cudaError_t cudaMalloc(void** devPtr, size_t size)
    cudaError_t cudaFree(void* devPtr)
    cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)

# cuBLAS stores a double-precision complex number as a struct of two doubles.
cdef extern from "cuComplex.h" nogil:
    ctypedef struct cuDoubleComplex:
        double x
        double y

# Subset of the cuBLAS v2 API. The handle is an opaque pointer type, every
# call returns a status code (0 == success) and alpha/beta are passed by pointer.
cdef extern from "cublas_v2.h" nogil:
    ctypedef void* cublasHandle_t
    ctypedef int cublasStatus_t
    ctypedef enum cublasOperation_t:
        CUBLAS_OP_N
        CUBLAS_OP_T
        CUBLAS_OP_C
    cublasStatus_t cublasCreate(cublasHandle_t* handle)
    cublasStatus_t cublasDestroy(cublasHandle_t handle)
    cublasStatus_t cublasZgemm(cublasHandle_t handle,
                               cublasOperation_t transa, cublasOperation_t transb,
                               int m, int n, int k, const cuDoubleComplex* alpha,
                               const cuDoubleComplex* A, int lda,
                               const cuDoubleComplex* B, int ldb,
                               const cuDoubleComplex* beta,
                               cuDoubleComplex* C, int ldc)


cdef _zgemm_via_device(const double complex* host_a, const double complex* host_b,
                       double complex* host_c, int m, int n, int k):
    """Compute the row-major product C (m x n) = A (m x k) @ B (k x n) with cuBLAS.

    All three pointers refer to host memory; m, n and k must be positive.
    The inputs are copied to freshly allocated device buffers, multiplied there
    and the result is copied back. Device buffers and the cuBLAS handle are
    always released, also when an error is raised.
    """
    cdef size_t item = sizeof(cuDoubleComplex)
    cdef size_t bytes_a = <size_t>m * <size_t>k * item
    cdef size_t bytes_b = <size_t>k * <size_t>n * item
    cdef size_t bytes_c = <size_t>m * <size_t>n * item
    cdef void* dev_a = NULL
    cdef void* dev_b = NULL
    cdef void* dev_c = NULL
    cdef cublasHandle_t handle = NULL
    cdef cuDoubleComplex alpha
    cdef cuDoubleComplex beta

    alpha.x = 1.0
    alpha.y = 0.0
    beta.x = 0.0
    beta.y = 0.0

    try:
        if (cudaMalloc(&dev_a, bytes_a) != 0 or cudaMalloc(&dev_b, bytes_b) != 0
                or cudaMalloc(&dev_c, bytes_c) != 0):
            raise MemoryError("Failed to allocate device memory for the matrices.")
        if (cudaMemcpy(dev_a, host_a, bytes_a, cudaMemcpyHostToDevice) != 0
                or cudaMemcpy(dev_b, host_b, bytes_b, cudaMemcpyHostToDevice) != 0):
            raise RuntimeError("Failed to copy the input matrices to the device.")
        if cublasCreate(&handle) != 0:
            raise RuntimeError("Failed to create CuBLAS handle.")
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for C^T = B^T @ A^T (operands swapped, no
        # transposition flags) therefore yields the row-major C = A @ B.
        if cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                       <const cuDoubleComplex*>dev_b, n,
                       <const cuDoubleComplex*>dev_a, k, &beta,
                       <cuDoubleComplex*>dev_c, n) != 0:
            raise RuntimeError("Failed to perform matrix multiplication with CuBLAS.")
        # A blocking device-to-host copy also waits for the GEMM to finish.
        if cudaMemcpy(host_c, dev_c, bytes_c, cudaMemcpyDeviceToHost) != 0:
            raise RuntimeError("Failed to copy the result back to the host.")
    finally:
        if handle != NULL:
            cublasDestroy(handle)
        # cudaFree(NULL) is a no-op, so partially completed setups are fine.
        cudaFree(dev_a)
        cudaFree(dev_b)
        cudaFree(dev_c)


cpdef multiply_matrices(double complex[:, ::1] A, double complex[:, ::1] B, double complex[:, ::1] C):
    """
    Multiplies two complex double matrices A and B on the GPU and stores the result in C.

    The arguments are C-contiguous host buffers (e.g. numpy complex128 arrays);
    the data is staged on the device for the cuBLAS call.

    Parameters
    ----------
    A : ndarray of shape (M, K)
        The first matrix to multiply.
    B : ndarray of shape (K, N)
        The second matrix to multiply.
    C : ndarray of shape (M, N)
        The output matrix to store the result of the multiplication.

    Raises
    ------
    ValueError
        If the shapes of A, B and C are not compatible.
    OverflowError
        If a dimension does not fit the 32-bit integers used by cuBLAS.
    MemoryError, RuntimeError
        If a CUDA or cuBLAS call fails.
    """
    cdef Py_ssize_t m = A.shape[0]
    cdef Py_ssize_t k = A.shape[1]
    cdef Py_ssize_t n = B.shape[1]

    if B.shape[0] != k:
        raise ValueError("Matrices A and B are not compatible for multiplication.")
    if C.shape[0] != m or C.shape[1] != n:
        raise ValueError("C must have shape (A.shape[0], B.shape[1]).")
    if m > INT_MAX or n > INT_MAX or k > INT_MAX:
        raise OverflowError("Matrix dimensions exceed the cuBLAS integer range.")

    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        C[:, :] = 0  # an empty sum of products is the zero matrix
        return

    _zgemm_via_device(&A[0, 0], &B[0, 0], &C[0, 0], <int>m, <int>n, <int>k)


In [None]:
import os

# Locate the CUDA installation: prefer the CUDA_PATH environment variable and
# fall back to the default Windows install location only when it is unset or
# empty (previously the lookup was always overwritten by the hard-coded path).
DEFAULT_CUDA_PATH = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0'
cuda_path = os.environ.get('CUDA_PATH') or DEFAULT_CUDA_PATH
cuda_include = os.path.join(cuda_path, 'include')
# build the path to the cublas_v2.h header file
cublas_header_path = os.path.join(cuda_include, 'cublas_v2.h')
print(cublas_header_path)

In [None]:
%%cython -3 -a -l cublas -l cudart
# Build note: the CUDA toolkit's include/ and lib/ directories must be on the
# compiler's and linker's search paths (e.g. INCLUDE/LIB for MSVC,
# CPATH/LIBRARY_PATH for GCC), or be passed on the line above with -I / -L.
# Include paths cannot be added from inside the compiled module at run time.
cimport cython
cimport numpy as np
import numpy as np
from libc.limits cimport INT_MAX

# Subset of the CUDA runtime API used to stage host buffers in device memory.
cdef extern from "cuda_runtime.h" nogil:
    ctypedef int cudaError_t
    cdef enum cudaMemcpyKind:
        cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost
    cudaError_t cudaMalloc(void** devPtr, size_t size)
    cudaError_t cudaFree(void* devPtr)
    cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)

# cuBLAS stores a double-precision complex number as a struct of two doubles.
cdef extern from "cuComplex.h" nogil:
    ctypedef struct cuDoubleComplex:
        double x
        double y

# Subset of the cuBLAS v2 API. The handle is an opaque pointer type, every
# call returns a status code (0 == success) and alpha/beta are passed by pointer.
cdef extern from "cublas_v2.h" nogil:
    ctypedef void* cublasHandle_t
    ctypedef int cublasStatus_t
    ctypedef enum cublasOperation_t:
        CUBLAS_OP_N
        CUBLAS_OP_T
        CUBLAS_OP_C
    cublasStatus_t cublasCreate(cublasHandle_t* handle)
    cublasStatus_t cublasDestroy(cublasHandle_t handle)
    cublasStatus_t cublasZgemm(cublasHandle_t handle,
                               cublasOperation_t transa, cublasOperation_t transb,
                               int m, int n, int k, const cuDoubleComplex* alpha,
                               const cuDoubleComplex* A, int lda,
                               const cuDoubleComplex* B, int ldb,
                               const cuDoubleComplex* beta,
                               cuDoubleComplex* C, int ldc)


cdef _zgemm_via_device(const double complex* host_a, const double complex* host_b,
                       double complex* host_c, int m, int n, int k):
    """Compute the row-major product C (m x n) = A (m x k) @ B (k x n) with cuBLAS.

    All three pointers refer to host memory; m, n and k must be positive.
    The inputs are copied to freshly allocated device buffers, multiplied there
    and the result is copied back. Device buffers and the cuBLAS handle are
    always released, also when an error is raised.
    """
    cdef size_t item = sizeof(cuDoubleComplex)
    cdef size_t bytes_a = <size_t>m * <size_t>k * item
    cdef size_t bytes_b = <size_t>k * <size_t>n * item
    cdef size_t bytes_c = <size_t>m * <size_t>n * item
    cdef void* dev_a = NULL
    cdef void* dev_b = NULL
    cdef void* dev_c = NULL
    cdef cublasHandle_t handle = NULL
    cdef cuDoubleComplex alpha
    cdef cuDoubleComplex beta

    alpha.x = 1.0
    alpha.y = 0.0
    beta.x = 0.0
    beta.y = 0.0

    try:
        if (cudaMalloc(&dev_a, bytes_a) != 0 or cudaMalloc(&dev_b, bytes_b) != 0
                or cudaMalloc(&dev_c, bytes_c) != 0):
            raise MemoryError("Failed to allocate device memory for the matrices.")
        if (cudaMemcpy(dev_a, host_a, bytes_a, cudaMemcpyHostToDevice) != 0
                or cudaMemcpy(dev_b, host_b, bytes_b, cudaMemcpyHostToDevice) != 0):
            raise RuntimeError("Failed to copy the input matrices to the device.")
        if cublasCreate(&handle) != 0:
            raise RuntimeError("Failed to create CuBLAS handle.")
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for C^T = B^T @ A^T (operands swapped, no
        # transposition flags) therefore yields the row-major C = A @ B.
        if cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                       <const cuDoubleComplex*>dev_b, n,
                       <const cuDoubleComplex*>dev_a, k, &beta,
                       <cuDoubleComplex*>dev_c, n) != 0:
            raise RuntimeError("Failed to perform matrix multiplication with CuBLAS.")
        # A blocking device-to-host copy also waits for the GEMM to finish.
        if cudaMemcpy(host_c, dev_c, bytes_c, cudaMemcpyDeviceToHost) != 0:
            raise RuntimeError("Failed to copy the result back to the host.")
    finally:
        if handle != NULL:
            cublasDestroy(handle)
        # cudaFree(NULL) is a no-op, so partially completed setups are fine.
        cudaFree(dev_a)
        cudaFree(dev_b)
        cudaFree(dev_c)


def multiply_matrices(np.ndarray[np.complex128_t, ndim=2] A, np.ndarray[np.complex128_t, ndim=2] B,
                      np.ndarray[np.complex128_t, ndim=2] C_out):
    """
    Multiplies two matrices A and B on the GPU and stores the result in C_out.

    The arguments are host (numpy) arrays; contiguous copies are staged on the
    device for the cuBLAS call and the product is written back into C_out.

    Parameters
    ----------
    A : ndarray of shape (M, K)
        The first matrix to multiply.
    B : ndarray of shape (K, N)
        The second matrix to multiply.
    C_out : ndarray of shape (M, N)
        The output matrix to store the result of the multiplication.

    Raises
    ------
    ValueError
        If the shapes of A, B and C_out are not compatible.
    OverflowError
        If a dimension does not fit the 32-bit integers used by cuBLAS.
    MemoryError, RuntimeError
        If a CUDA or cuBLAS call fails.
    """
    cdef Py_ssize_t m = A.shape[0]
    cdef Py_ssize_t k = A.shape[1]
    cdef Py_ssize_t n = B.shape[1]
    cdef const double complex[:, ::1] a_host
    cdef const double complex[:, ::1] b_host
    cdef double complex[:, ::1] c_host

    if B.shape[0] != k:
        raise ValueError("Matrices A and B are not compatible for multiplication.")
    if C_out.shape[0] != m or C_out.shape[1] != n:
        raise ValueError("C_out must have shape (A.shape[0], B.shape[1]).")
    if m > INT_MAX or n > INT_MAX or k > INT_MAX:
        raise OverflowError("Matrix dimensions exceed the cuBLAS integer range.")

    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        C_out[:] = 0  # an empty sum of products is the zero matrix
        return

    # The buffer arguments may be strided; cuBLAS needs dense storage.
    a_host = np.ascontiguousarray(A)
    b_host = np.ascontiguousarray(B)
    c_host = np.empty((m, n), dtype=np.complex128)

    _zgemm_via_device(&a_host[0, 0], &b_host[0, 0], &c_host[0, 0], <int>m, <int>n, <int>k)

    C_out[:] = np.asarray(c_host)


In [None]:
%%cython -3 -a -l cublas -l cudart
# Build note: the CUDA toolkit's include/ and lib/ directories must be on the
# compiler's and linker's search paths (e.g. INCLUDE/LIB for MSVC,
# CPATH/LIBRARY_PATH for GCC), or be passed on the line above as
# "-I <include dir>" / "-L <lib dir>" (-I needs an argument).
cimport cython
from libc.limits cimport INT_MAX

# Subset of the CUDA runtime API used to stage host buffers in device memory.
cdef extern from "cuda_runtime.h" nogil:
    ctypedef int cudaError_t
    cdef enum cudaMemcpyKind:
        cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost
    cudaError_t cudaMalloc(void** devPtr, size_t size)
    cudaError_t cudaFree(void* devPtr)
    cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)

# cuBLAS stores a double-precision complex number as a struct of two doubles.
cdef extern from "cuComplex.h" nogil:
    ctypedef struct cuDoubleComplex:
        double x
        double y

# Subset of the cuBLAS v2 API. The handle is an opaque pointer type, every
# call returns a status code (0 == success) and alpha/beta are passed by pointer.
cdef extern from "cublas_v2.h" nogil:
    ctypedef void* cublasHandle_t
    ctypedef int cublasStatus_t
    ctypedef enum cublasOperation_t:
        CUBLAS_OP_N
        CUBLAS_OP_T
        CUBLAS_OP_C
    cublasStatus_t cublasCreate(cublasHandle_t* handle)
    cublasStatus_t cublasDestroy(cublasHandle_t handle)
    cublasStatus_t cublasZgemm(cublasHandle_t handle,
                               cublasOperation_t transa, cublasOperation_t transb,
                               int m, int n, int k, const cuDoubleComplex* alpha,
                               const cuDoubleComplex* A, int lda,
                               const cuDoubleComplex* B, int ldb,
                               const cuDoubleComplex* beta,
                               cuDoubleComplex* C, int ldc)


cdef _zgemm_via_device(const double complex* host_a, const double complex* host_b,
                       double complex* host_c, int m, int n, int k):
    """Compute the row-major product C (m x n) = A (m x k) @ B (k x n) with cuBLAS.

    All three pointers refer to host memory; m, n and k must be positive.
    The inputs are copied to freshly allocated device buffers, multiplied there
    and the result is copied back. Device buffers and the cuBLAS handle are
    always released, also when an error is raised.
    """
    cdef size_t item = sizeof(cuDoubleComplex)
    cdef size_t bytes_a = <size_t>m * <size_t>k * item
    cdef size_t bytes_b = <size_t>k * <size_t>n * item
    cdef size_t bytes_c = <size_t>m * <size_t>n * item
    cdef void* dev_a = NULL
    cdef void* dev_b = NULL
    cdef void* dev_c = NULL
    cdef cublasHandle_t handle = NULL
    cdef cuDoubleComplex alpha
    cdef cuDoubleComplex beta

    alpha.x = 1.0
    alpha.y = 0.0
    beta.x = 0.0
    beta.y = 0.0

    try:
        if (cudaMalloc(&dev_a, bytes_a) != 0 or cudaMalloc(&dev_b, bytes_b) != 0
                or cudaMalloc(&dev_c, bytes_c) != 0):
            raise MemoryError("Failed to allocate device memory for the matrices.")
        if (cudaMemcpy(dev_a, host_a, bytes_a, cudaMemcpyHostToDevice) != 0
                or cudaMemcpy(dev_b, host_b, bytes_b, cudaMemcpyHostToDevice) != 0):
            raise RuntimeError("Failed to copy the input matrices to the device.")
        if cublasCreate(&handle) != 0:
            raise RuntimeError("Failed to create CuBLAS handle.")
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for C^T = B^T @ A^T (operands swapped, no
        # transposition flags) therefore yields the row-major C = A @ B.
        if cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                       <const cuDoubleComplex*>dev_b, n,
                       <const cuDoubleComplex*>dev_a, k, &beta,
                       <cuDoubleComplex*>dev_c, n) != 0:
            raise RuntimeError("Failed to perform matrix multiplication with CuBLAS.")
        # A blocking device-to-host copy also waits for the GEMM to finish.
        if cudaMemcpy(host_c, dev_c, bytes_c, cudaMemcpyDeviceToHost) != 0:
            raise RuntimeError("Failed to copy the result back to the host.")
    finally:
        if handle != NULL:
            cublasDestroy(handle)
        # cudaFree(NULL) is a no-op, so partially completed setups are fine.
        cudaFree(dev_a)
        cudaFree(dev_b)
        cudaFree(dev_c)


cpdef multiply_matrices(double complex[:, ::1] A, double complex[:, ::1] B, double complex[:, ::1] C):
    """
    Multiplies two complex double matrices A and B on the GPU and stores the result in C.

    The arguments are C-contiguous host buffers (e.g. numpy complex128 arrays);
    the data is staged on the device for the cuBLAS call.

    Parameters
    ----------
    A : ndarray of shape (M, K)
        The first matrix to multiply.
    B : ndarray of shape (K, N)
        The second matrix to multiply.
    C : ndarray of shape (M, N)
        The output matrix to store the result of the multiplication.

    Raises
    ------
    ValueError
        If the shapes of A, B and C are not compatible.
    OverflowError
        If a dimension does not fit the 32-bit integers used by cuBLAS.
    MemoryError, RuntimeError
        If a CUDA or cuBLAS call fails.
    """
    cdef Py_ssize_t m = A.shape[0]
    cdef Py_ssize_t k = A.shape[1]
    cdef Py_ssize_t n = B.shape[1]

    if B.shape[0] != k:
        raise ValueError("Matrices A and B are not compatible for multiplication.")
    if C.shape[0] != m or C.shape[1] != n:
        raise ValueError("C must have shape (A.shape[0], B.shape[1]).")
    if m > INT_MAX or n > INT_MAX or k > INT_MAX:
        raise OverflowError("Matrix dimensions exceed the cuBLAS integer range.")

    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        C[:, :] = 0  # an empty sum of products is the zero matrix
        return

    _zgemm_via_device(&A[0, 0], &B[0, 0], &C[0, 0], <int>m, <int>n, <int>k)


In [None]:
%%cython -3 -a -l cublas -l cudart
# Build note: the CUDA toolkit's include/ and lib/ directories must be on the
# compiler's and linker's search paths (e.g. INCLUDE/LIB for MSVC,
# CPATH/LIBRARY_PATH for GCC), or be passed on the line above with -I / -L.
cimport cython
cimport numpy as np
import numpy as np
from libc.limits cimport INT_MAX

# Subset of the CUDA runtime API used to stage host buffers in device memory.
cdef extern from "cuda_runtime.h" nogil:
    ctypedef int cudaError_t
    cdef enum cudaMemcpyKind:
        cudaMemcpyHostToDevice
        cudaMemcpyDeviceToHost
    cudaError_t cudaMalloc(void** devPtr, size_t size)
    cudaError_t cudaFree(void* devPtr)
    cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)

# cuBLAS stores a double-precision complex number as a struct of two doubles.
cdef extern from "cuComplex.h" nogil:
    ctypedef struct cuDoubleComplex:
        double x
        double y

# Subset of the cuBLAS v2 API. The handle is an opaque pointer type, every
# call returns a status code (0 == success) and alpha/beta are passed by pointer.
cdef extern from "cublas_v2.h" nogil:
    ctypedef void* cublasHandle_t
    ctypedef int cublasStatus_t
    ctypedef enum cublasOperation_t:
        CUBLAS_OP_N
        CUBLAS_OP_T
        CUBLAS_OP_C
    cublasStatus_t cublasCreate(cublasHandle_t* handle)
    cublasStatus_t cublasDestroy(cublasHandle_t handle)
    cublasStatus_t cublasZgemm(cublasHandle_t handle,
                               cublasOperation_t transa, cublasOperation_t transb,
                               int m, int n, int k, const cuDoubleComplex* alpha,
                               const cuDoubleComplex* A, int lda,
                               const cuDoubleComplex* B, int ldb,
                               const cuDoubleComplex* beta,
                               cuDoubleComplex* C, int ldc)


cdef _zgemm_via_device(const double complex* host_a, const double complex* host_b,
                       double complex* host_c, int m, int n, int k):
    """Compute the row-major product C (m x n) = A (m x k) @ B (k x n) with cuBLAS.

    All three pointers refer to host memory; m, n and k must be positive.
    The inputs are copied to freshly allocated device buffers, multiplied there
    and the result is copied back. Device buffers and the cuBLAS handle are
    always released, also when an error is raised.
    """
    cdef size_t item = sizeof(cuDoubleComplex)
    cdef size_t bytes_a = <size_t>m * <size_t>k * item
    cdef size_t bytes_b = <size_t>k * <size_t>n * item
    cdef size_t bytes_c = <size_t>m * <size_t>n * item
    cdef void* dev_a = NULL
    cdef void* dev_b = NULL
    cdef void* dev_c = NULL
    cdef cublasHandle_t handle = NULL
    cdef cuDoubleComplex alpha
    cdef cuDoubleComplex beta

    alpha.x = 1.0
    alpha.y = 0.0
    beta.x = 0.0
    beta.y = 0.0

    try:
        if (cudaMalloc(&dev_a, bytes_a) != 0 or cudaMalloc(&dev_b, bytes_b) != 0
                or cudaMalloc(&dev_c, bytes_c) != 0):
            raise MemoryError("Failed to allocate device memory for the matrices.")
        if (cudaMemcpy(dev_a, host_a, bytes_a, cudaMemcpyHostToDevice) != 0
                or cudaMemcpy(dev_b, host_b, bytes_b, cudaMemcpyHostToDevice) != 0):
            raise RuntimeError("Failed to copy the input matrices to the device.")
        if cublasCreate(&handle) != 0:
            raise RuntimeError("Failed to create CuBLAS handle.")
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for C^T = B^T @ A^T (operands swapped, no
        # transposition flags) therefore yields the row-major C = A @ B.
        if cublasZgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                       <const cuDoubleComplex*>dev_b, n,
                       <const cuDoubleComplex*>dev_a, k, &beta,
                       <cuDoubleComplex*>dev_c, n) != 0:
            raise RuntimeError("Failed to perform matrix multiplication with CuBLAS.")
        # A blocking device-to-host copy also waits for the GEMM to finish.
        if cudaMemcpy(host_c, dev_c, bytes_c, cudaMemcpyDeviceToHost) != 0:
            raise RuntimeError("Failed to copy the result back to the host.")
    finally:
        if handle != NULL:
            cublasDestroy(handle)
        # cudaFree(NULL) is a no-op, so partially completed setups are fine.
        cudaFree(dev_a)
        cudaFree(dev_b)
        cudaFree(dev_c)


def multiply_matrices(np.ndarray[np.complex128_t, ndim=2] A, np.ndarray[np.complex128_t, ndim=2] B,
                      np.ndarray[np.complex128_t, ndim=2] C_out):
    """
    Multiplies two matrices A and B on the GPU and stores the result in C_out.

    The arguments are host (numpy) arrays; contiguous copies are staged on the
    device for the cuBLAS call and the product is written back into C_out.

    Parameters
    ----------
    A : ndarray of shape (M, K)
        The first matrix to multiply.
    B : ndarray of shape (K, N)
        The second matrix to multiply.
    C_out : ndarray of shape (M, N)
        The output matrix to store the result of the multiplication.

    Raises
    ------
    ValueError
        If the shapes of A, B and C_out are not compatible.
    OverflowError
        If a dimension does not fit the 32-bit integers used by cuBLAS.
    MemoryError, RuntimeError
        If a CUDA or cuBLAS call fails.
    """
    cdef Py_ssize_t m = A.shape[0]
    cdef Py_ssize_t k = A.shape[1]
    cdef Py_ssize_t n = B.shape[1]
    cdef const double complex[:, ::1] a_host
    cdef const double complex[:, ::1] b_host
    cdef double complex[:, ::1] c_host

    if B.shape[0] != k:
        raise ValueError("Matrices A and B are not compatible for multiplication.")
    if C_out.shape[0] != m or C_out.shape[1] != n:
        raise ValueError("C_out must have shape (A.shape[0], B.shape[1]).")
    if m > INT_MAX or n > INT_MAX or k > INT_MAX:
        raise OverflowError("Matrix dimensions exceed the cuBLAS integer range.")

    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        C_out[:] = 0  # an empty sum of products is the zero matrix
        return

    # The buffer arguments may be strided; cuBLAS needs dense storage.
    a_host = np.ascontiguousarray(A)
    b_host = np.ascontiguousarray(B)
    c_host = np.empty((m, n), dtype=np.complex128)

    _zgemm_via_device(&a_host[0, 0], &b_host[0, 0], &c_host[0, 0], <int>m, <int>n, <int>k)

    C_out[:] = np.asarray(c_host)


In [None]:
# Time `multiply_matrices` on small random complex matrices.
dim = 2
reps = 10
A = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
B = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
C = cp.zeros((dim, dim), dtype=cp.complex128)

# `multiply_matrices` takes host buffers (typed memoryviews / numpy arrays).
# CuPy arrays do not expose the buffer protocol, so stage C-contiguous host
# copies outside the timed region.
A_host = cp.asnumpy(A)
B_host = cp.asnumpy(B)
C_host = np.zeros((dim, dim), dtype=np.complex128)

t_start = timeit.default_timer()
for i in range(reps):
    multiply_matrices(A_host, B_host, C_host)
t_end = timeit.default_timer()

# Mirror the result into the device array so later cells that inspect `C` see it.
C.set(C_host)
print('Each calculation took an average of {} seconds'.format((t_end-t_start)/reps))

In [None]:
# CPU reference: bring both operands to host memory and multiply with numpy.
A_cpu, B_cpu = (cp.asnumpy(device_array) for device_array in (A, B))
C_cpu = np.matmul(A_cpu, B_cpu)
C_cpu

In [None]:
# MM_cdot_gpu takes exactly (a, b, out); the cuBLAS handle and the alpha/beta
# scalars are created inside the function.
MM_cdot_gpu(A, B, C)

In [None]:
C

In [None]:
%%cython -3 -a
cimport cython
import cupy as cp
from cupy_backends.cuda.libs import cublas
from cupy.cuda import device

def MM_cdot_gpu(a, b, out):
    """Compute ``out = a @ b`` on the GPU with cuBLAS ZGEMM.

    Parameters
    ----------
    a : cupy.ndarray, shape (M, K), complex128, C-contiguous
        Left operand.
    b : cupy.ndarray, shape (K, N), complex128, C-contiguous
        Right operand.
    out : cupy.ndarray, shape (M, N), complex128, C-contiguous
        Receives the product; its previous contents are overwritten.
        It should not share memory with ``a`` or ``b``.

    Raises
    ------
    TypeError
        If an argument is not a CuPy array.
    ValueError
        If dtype, layout or shapes are not as described above.
    """
    cdef int m, k, n

    # Host-side scalars for CUBLAS_POINTER_MODE_HOST are built with numpy.
    import numpy as np

    # The arguments are plain objects: CuPy arrays do not expose the buffer
    # protocol, so they cannot be bound to typed memoryviews.
    for label, arr in (("a", a), ("b", b), ("out", out)):
        if not isinstance(arr, cp.ndarray):
            raise TypeError(f"{label} must be a cupy.ndarray")
        if arr.ndim != 2 or arr.dtype != cp.complex128 or not arr.flags.c_contiguous:
            raise ValueError(f"{label} must be a 2-D C-contiguous complex128 array")
    if a.shape[1] != b.shape[0]:
        raise ValueError("Matrices a and b are not compatible for multiplication.")
    if out.shape != (a.shape[0], b.shape[1]):
        raise ValueError("out must have shape (a.shape[0], b.shape[1]).")

    m = a.shape[0]
    k = a.shape[1]
    n = b.shape[1]
    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        out.fill(0)  # an empty sum of products is the zero matrix
        return

    # In host pointer mode cuBLAS dereferences alpha/beta on the CPU, so they
    # must live in host memory, not in device arrays.
    alpha = np.array(1.0, dtype=np.complex128)
    beta = np.array(0.0, dtype=np.complex128)

    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)
    try:
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for out^T = b^T @ a^T (operands swapped, no
        # transposition flags) therefore yields the row-major out = a @ b.
        cublas.zgemm(handle, cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, m, k,
                     alpha.ctypes.data, b.data.ptr, n, a.data.ptr, k,
                     beta.ctypes.data, out.data.ptr, n)
    finally:
        # The handle is shared with the rest of CuPy; restore its pointer mode.
        cublas.setPointerMode(handle, orig_mode)

In [None]:
dim = 128
A = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
B = cp.random.rand(dim, dim, dtype=cp.float64) + 1j*cp.random.rand(dim, dim, dtype=cp.float64)
C = cp.zeros((dim, dim), dtype=cp.complex128)
%timeit MM_cdot_gpu(A, B, C)

In [None]:

#if trans == 'N':
#    trans = 1
#elif trans == 'T':
#    trans = 0

def gemm(a, b, out, alpha=1.0, beta=0.0):
    """Compute ``out = alpha * (a @ b) + beta * out`` on the GPU with cuBLAS ZGEMM.

    Parameters
    ----------
    a : cupy.ndarray, shape (M, K), complex128, C-contiguous
        Left operand.
    b : cupy.ndarray, shape (K, N), complex128, C-contiguous
        Right operand.
    out : cupy.ndarray, shape (M, N), complex128, C-contiguous
        Accumulator and result; it should not share memory with ``a`` or ``b``.
    alpha, beta : scalar, numpy.ndarray or cupy.ndarray, optional
        Scaling factors. If either is a CuPy array both are passed to cuBLAS
        as device scalars, otherwise both are passed as host scalars.

    Raises
    ------
    TypeError
        If ``a``, ``b`` or ``out`` is not a CuPy array.
    ValueError
        If dtype, layout or shapes are not as described above.
    """
    import cupy
    import numpy
    from cupy.cuda import device
    from cupy_backends.cuda.libs import cublas

    def _scalar_ptr(value):
        # Device scalars stay on the device; anything else becomes a host
        # 0-d array of the operand dtype. Returns (array, raw pointer).
        if isinstance(value, cupy.ndarray):
            if value.dtype != a.dtype:
                value = cupy.array(value, dtype=a.dtype)
            return value, value.data.ptr
        if not (isinstance(value, numpy.ndarray) and value.dtype == a.dtype):
            value = numpy.array(value, dtype=a.dtype)
        return value, value.ctypes.data

    for label, arr in (("a", a), ("b", b), ("out", out)):
        if not isinstance(arr, cupy.ndarray):
            raise TypeError(f"{label} must be a cupy.ndarray")
        if arr.ndim != 2 or arr.dtype != cupy.complex128 or not arr.flags.c_contiguous:
            raise ValueError(f"{label} must be a 2-D C-contiguous complex128 array")

    m, k = a.shape
    if b.shape[0] != k:
        raise ValueError("Matrices a and b are not compatible for multiplication.")
    n = b.shape[1]
    if out.shape != (m, n):
        raise ValueError("out must have shape (a.shape[0], b.shape[1]).")
    if m == 0 or n == 0:
        return  # nothing to compute

    alpha, alpha_ptr = _scalar_ptr(alpha)
    beta, beta_ptr = _scalar_ptr(beta)
    on_device = isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray)
    if on_device:
        # cuBLAS has one pointer mode for both scalars: move the host one over.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr

    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    cublas.setPointerMode(
        handle,
        cublas.CUBLAS_POINTER_MODE_DEVICE if on_device else cublas.CUBLAS_POINTER_MODE_HOST,
    )
    try:
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for out^T = b^T @ a^T (operands swapped, no
        # transposition flags) therefore yields the row-major out = a @ b.
        # Leading dimensions are the row lengths (clamped to cuBLAS' minimum of 1).
        cublas.zgemm(handle, cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, m, k,
                     alpha_ptr, b.data.ptr, n, a.data.ptr, max(1, k),
                     beta_ptr, out.data.ptr, n)
    finally:
        # The handle is shared with the rest of CuPy; restore its pointer mode.
        cublas.setPointerMode(handle, orig_mode)

In [None]:

#if trans == 'N':
#    trans = 1
#elif trans == 'T':
#    trans = 0

def gemm(a, b, out, alpha=1.0, beta=0.0):
    """Compute ``out = alpha * (a @ b) + beta * out`` on the GPU with cuBLAS ZGEMM.

    Parameters
    ----------
    a : cupy.ndarray, shape (M, K), complex128, C-contiguous
        Left operand.
    b : cupy.ndarray, shape (K, N), complex128, C-contiguous
        Right operand.
    out : cupy.ndarray, shape (M, N), complex128, C-contiguous
        Accumulator and result; it should not share memory with ``a`` or ``b``.
    alpha, beta : scalar, numpy.ndarray or cupy.ndarray, optional
        Scaling factors. If either is a CuPy array both are passed to cuBLAS
        as device scalars, otherwise both are passed as host scalars.

    Raises
    ------
    TypeError
        If ``a``, ``b`` or ``out`` is not a CuPy array.
    ValueError
        If dtype, layout or shapes are not as described above.
    """
    import cupy
    import numpy
    from cupy.cuda import device
    from cupy_backends.cuda.libs import cublas

    def _scalar_ptr(value):
        # Device scalars stay on the device; anything else becomes a host
        # 0-d array of the operand dtype. Returns (array, raw pointer).
        if isinstance(value, cupy.ndarray):
            if value.dtype != a.dtype:
                value = cupy.array(value, dtype=a.dtype)
            return value, value.data.ptr
        if not (isinstance(value, numpy.ndarray) and value.dtype == a.dtype):
            value = numpy.array(value, dtype=a.dtype)
        return value, value.ctypes.data

    for label, arr in (("a", a), ("b", b), ("out", out)):
        if not isinstance(arr, cupy.ndarray):
            raise TypeError(f"{label} must be a cupy.ndarray")
        if arr.ndim != 2 or arr.dtype != cupy.complex128 or not arr.flags.c_contiguous:
            raise ValueError(f"{label} must be a 2-D C-contiguous complex128 array")

    m, k = a.shape
    if b.shape[0] != k:
        raise ValueError("Matrices a and b are not compatible for multiplication.")
    n = b.shape[1]
    if out.shape != (m, n):
        raise ValueError("out must have shape (a.shape[0], b.shape[1]).")
    if m == 0 or n == 0:
        return  # nothing to compute

    alpha, alpha_ptr = _scalar_ptr(alpha)
    beta, beta_ptr = _scalar_ptr(beta)
    on_device = isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray)
    if on_device:
        # cuBLAS has one pointer mode for both scalars: move the host one over.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr

    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    cublas.setPointerMode(
        handle,
        cublas.CUBLAS_POINTER_MODE_DEVICE if on_device else cublas.CUBLAS_POINTER_MODE_HOST,
    )
    try:
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for out^T = b^T @ a^T (operands swapped, no
        # transposition flags) therefore yields the row-major out = a @ b.
        # Leading dimensions are the row lengths (clamped to cuBLAS' minimum of 1).
        cublas.zgemm(handle, cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, m, k,
                     alpha_ptr, b.data.ptr, n, a.data.ptr, max(1, k),
                     beta_ptr, out.data.ptr, n)
    finally:
        # The handle is shared with the rest of CuPy; restore its pointer mode.
        cublas.setPointerMode(handle, orig_mode)

In [None]:

#if trans == 'N':
#    trans = 1
#elif trans == 'T':
#    trans = 0

def gemm(a, b, out, alpha=1.0, beta=0.0):
    """Compute ``out = alpha * (a @ b) + beta * out`` on the GPU with cuBLAS ZGEMM.

    Parameters
    ----------
    a : cupy.ndarray, shape (M, K), complex128, C-contiguous
        Left operand.
    b : cupy.ndarray, shape (K, N), complex128, C-contiguous
        Right operand.
    out : cupy.ndarray, shape (M, N), complex128, C-contiguous
        Accumulator and result; it should not share memory with ``a`` or ``b``.
    alpha, beta : scalar, numpy.ndarray or cupy.ndarray, optional
        Scaling factors. If either is a CuPy array both are passed to cuBLAS
        as device scalars, otherwise both are passed as host scalars.

    Raises
    ------
    TypeError
        If ``a``, ``b`` or ``out`` is not a CuPy array.
    ValueError
        If dtype, layout or shapes are not as described above.
    """
    import cupy
    import numpy
    from cupy.cuda import device
    from cupy_backends.cuda.libs import cublas

    def _scalar_ptr(value):
        # Device scalars stay on the device; anything else becomes a host
        # 0-d array of the operand dtype. Returns (array, raw pointer).
        if isinstance(value, cupy.ndarray):
            if value.dtype != a.dtype:
                value = cupy.array(value, dtype=a.dtype)
            return value, value.data.ptr
        if not (isinstance(value, numpy.ndarray) and value.dtype == a.dtype):
            value = numpy.array(value, dtype=a.dtype)
        return value, value.ctypes.data

    for label, arr in (("a", a), ("b", b), ("out", out)):
        if not isinstance(arr, cupy.ndarray):
            raise TypeError(f"{label} must be a cupy.ndarray")
        if arr.ndim != 2 or arr.dtype != cupy.complex128 or not arr.flags.c_contiguous:
            raise ValueError(f"{label} must be a 2-D C-contiguous complex128 array")

    m, k = a.shape
    if b.shape[0] != k:
        raise ValueError("Matrices a and b are not compatible for multiplication.")
    n = b.shape[1]
    if out.shape != (m, n):
        raise ValueError("out must have shape (a.shape[0], b.shape[1]).")
    if m == 0 or n == 0:
        return  # nothing to compute

    alpha, alpha_ptr = _scalar_ptr(alpha)
    beta, beta_ptr = _scalar_ptr(beta)
    on_device = isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray)
    if on_device:
        # cuBLAS has one pointer mode for both scalars: move the host one over.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr

    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    cublas.setPointerMode(
        handle,
        cublas.CUBLAS_POINTER_MODE_DEVICE if on_device else cublas.CUBLAS_POINTER_MODE_HOST,
    )
    try:
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for out^T = b^T @ a^T (operands swapped, no
        # transposition flags) therefore yields the row-major out = a @ b.
        # Leading dimensions are the row lengths (clamped to cuBLAS' minimum of 1).
        cublas.zgemm(handle, cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, m, k,
                     alpha_ptr, b.data.ptr, n, a.data.ptr, max(1, k),
                     beta_ptr, out.data.ptr, n)
    finally:
        # The handle is shared with the rest of CuPy; restore its pointer mode.
        cublas.setPointerMode(handle, orig_mode)

In [None]:

#if trans == 'N':
#    trans = 1
#elif trans == 'T':
#    trans = 0

def _get_scalar_ptr(a, dtype):
    """Return ``(array, pointer)`` for a GEMM scalar.

    A CuPy array is kept on the device (converted to ``dtype`` if needed) and
    its device pointer is returned. Any other value is turned into a numpy
    array of ``dtype`` and the address of its host buffer is returned.
    """
    import cupy
    import numpy

    if isinstance(a, cupy.ndarray):
        if a.dtype != dtype:
            a = cupy.array(a, dtype=dtype)
        a_ptr = a.data.ptr
    else:
        if not (isinstance(a, numpy.ndarray) and a.dtype == dtype):
            a = numpy.array(a, dtype=dtype)
        a_ptr = a.ctypes.data
    return a, a_ptr

def gemm(a, b, out, alpha=1.0, beta=0.0):
    """Compute ``out = alpha * (a @ b) + beta * out`` on the GPU with cuBLAS ZGEMM.

    Parameters
    ----------
    a : cupy.ndarray, shape (M, K), complex128, C-contiguous
        Left operand.
    b : cupy.ndarray, shape (K, N), complex128, C-contiguous
        Right operand.
    out : cupy.ndarray, shape (M, N), complex128, C-contiguous
        Accumulator and result; it should not share memory with ``a`` or ``b``.
    alpha, beta : scalar, numpy.ndarray or cupy.ndarray, optional
        Scaling factors. If either is a CuPy array both are passed to cuBLAS
        as device scalars, otherwise both are passed as host scalars.

    Raises
    ------
    TypeError
        If ``a``, ``b`` or ``out`` is not a CuPy array.
    ValueError
        If dtype, layout or shapes are not as described above.
    """
    import cupy
    from cupy.cuda import device
    from cupy_backends.cuda.libs import cublas

    for label, arr in (("a", a), ("b", b), ("out", out)):
        if not isinstance(arr, cupy.ndarray):
            raise TypeError(f"{label} must be a cupy.ndarray")
        if arr.ndim != 2 or arr.dtype != cupy.complex128 or not arr.flags.c_contiguous:
            raise ValueError(f"{label} must be a 2-D C-contiguous complex128 array")

    m, k = a.shape
    if b.shape[0] != k:
        raise ValueError("Matrices a and b are not compatible for multiplication.")
    n = b.shape[1]
    if out.shape != (m, n):
        raise ValueError("out must have shape (a.shape[0], b.shape[1]).")
    if m == 0 or n == 0:
        return  # nothing to compute

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    on_device = isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray)
    if on_device:
        # cuBLAS has one pointer mode for both scalars: move the host one over.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr

    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    cublas.setPointerMode(
        handle,
        cublas.CUBLAS_POINTER_MODE_DEVICE if on_device else cublas.CUBLAS_POINTER_MODE_HOST,
    )
    try:
        # cuBLAS is column-major: a row-major X is, byte for byte, a
        # column-major X^T. Asking for out^T = b^T @ a^T (operands swapped, no
        # transposition flags) therefore yields the row-major out = a @ b.
        # Leading dimensions are the row lengths (clamped to cuBLAS' minimum of 1).
        cublas.zgemm(handle, cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, m, k,
                     alpha_ptr, b.data.ptr, n, a.data.ptr, max(1, k),
                     beta_ptr, out.data.ptr, n)
    finally:
        # The handle is shared with the rest of CuPy; restore its pointer mode.
        cublas.setPointerMode(handle, orig_mode)

In [None]:

%%cython -3
cimport cython
from libc.limits cimport INT_MAX
from scipy.linalg.cython_blas cimport zgemm

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def MM_cdot(double complex[:,::1] A, double complex[:,::1] B, double complex[:,::1] C):
    """Compute ``C = A @ B`` on the CPU with BLAS ZGEMM.

    Parameters
    ----------
    A : C-contiguous complex128 buffer of shape (M, K)
    B : C-contiguous complex128 buffer of shape (K, N)
    C : C-contiguous complex128 buffer of shape (M, N)
        Receives the product; its previous contents are overwritten.

    Raises
    ------
    ValueError
        If the shapes of A, B and C are not compatible.
    OverflowError
        If a dimension does not fit the 32-bit integers used by BLAS.
    """
    # Byte literals: under language level 3 a plain '...' literal is a str
    # and cannot be assigned to a char*.
    cdef char *orientA = b'n'
    cdef char *orientB = b'n'
    cdef double complex alpha = 1.0
    cdef double complex beta = 0.0
    cdef int m, n, k

    # Bounds checking is disabled, so shapes must be validated up front;
    # otherwise BLAS would read or write outside the buffers.
    if B.shape[0] != A.shape[1]:
        raise ValueError("Matrices A and B are not compatible for multiplication.")
    if C.shape[0] != A.shape[0] or C.shape[1] != B.shape[1]:
        raise ValueError("C must have shape (A.shape[0], B.shape[1]).")
    if A.shape[0] > INT_MAX or A.shape[1] > INT_MAX or B.shape[1] > INT_MAX:
        raise OverflowError("Matrix dimensions exceed the BLAS integer range.")

    m = <int>A.shape[0]
    k = <int>A.shape[1]
    n = <int>B.shape[1]
    if m == 0 or n == 0:
        return  # nothing to compute
    if k == 0:
        C[:, :] = 0  # an empty sum of products is the zero matrix
        return

    # BLAS is column-major (Fortran order): a row-major X is, byte for byte, a
    # column-major X^T. Asking for C^T = B^T @ A^T (operands swapped, 'n'
    # orientation) therefore yields the row-major C = A @ B.
    zgemm(orientA, orientB, &n, &m, &k, &alpha, &B[0,0], &n, &A[0,0], &k, &beta, &C[0,0], &n)

In [None]:
%%cython -3 -a
cimport cython
import cupy as cp
from cupy_backends.cuda.libs import cublas

cpdef matrix_multiply_cupy(a, b, c):
    """Compute ``c = a @ b`` on the GPU with ``cupy.matmul``.

    Parameters
    ----------
    a, b : cupy.ndarray
        The operands; their shapes must be compatible for matrix multiplication.
    c : cupy.ndarray
        Output array that receives the product.

    The arguments are plain objects: CuPy arrays do not expose the buffer
    protocol, so they cannot be bound to typed memoryviews. The function
    returns None; errors raised by CuPy propagate to the caller.
    """
    # perform matrix multiplication on GPU
    cp.matmul(a, b, out=c)


In [None]:
A = cp.random.rand(1000, 1000) + 1j * cp.random.rand(1000, 1000)
B = cp.random.rand(1000, 1000) + 1j * cp.random.rand(1000, 1000)
C = cp.zeros((1000, 1000), dtype=cp.complex128)

%timeit matrix_multiply_cupy(A, B, C)

In [None]:
# a cython compiled function that multiplies two matrices