In [1]:
import timeit
import numpy as np
from time import time

In [2]:
from numba import cuda
import numba as nb
import math

In [None]:
print(cuda.gpus)

In [None]:
cuda.select_device(0)

In [None]:
# %%time
# Host-side operands for the matmul experiments.  Each one is a float64
# matrix of standard-normal samples, stored column-major (Fortran order):
#   A is 64 x 96, B is 96 x 32 and C is 64 x 32 (the shape of A @ B).
A = np.array(np.random.standard_normal((64, 96)), order='F')
B = np.array(np.random.standard_normal((96, 32)), order='F')
C = np.array(np.random.standard_normal((64, 32)), order='F')

In [None]:
%%time
# Upload the three host matrices to the device, in the order A, B, C.
# C is uploaded too, so the device-side result buffer initially holds
# whatever the host C currently contains.
A_global_mem, B_global_mem, C_global_mem = (
    cuda.to_device(host_matrix) for host_matrix in (A, B, C)
)

# Alternative kept for reference: allocate the result buffer directly on the
# device instead of copying C over.
# C_global_mem = cuda.device_array((32*2, 32))

In [None]:
%%time
# Launch geometry: 16 x 16 threads per block, and enough blocks along each
# axis to cover every row of A (x axis) and every column of B (y axis).
threadsperblock = (16, 16)
blockspergrid_x = -(-A.shape[0] // threadsperblock[0])   # ceiling division
blockspergrid_y = -(-B.shape[1] // threadsperblock[1])   # ceiling division
blockspergrid = (blockspergrid_x, blockspergrid_y)

In [None]:
blockspergrid

In [None]:
(A.nbytes+B.nbytes+np.dot(A,B).nbytes)/1024/1024

In [None]:
# CUDA kernel: naive matrix product, one thread per element of C
@cuda.jit
def matmul(A, B, C):
    """Compute C = A * B on the device.

    Each thread takes its absolute 2-D grid position as an (i, j) index into
    C, accumulates the dot product of row i of A with column j of B, and
    stores it in C[i, j].  Threads whose position lies outside C do nothing,
    so the grid may be larger than the output.
    """
    i, j = cuda.grid(2)
    # Guard clause: surplus threads of a rounded-up grid exit immediately.
    if i >= C.shape[0] or j >= C.shape[1]:
        return

    acc = 0.
    for k in range(A.shape[1]):
        acc += A[i, k] * B[k, j]
    C[i, j] = acc

In [None]:
%%time
matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)

In [None]:
%%time
C = C_global_mem.copy_to_host()
print(C.shape)

In [None]:
%%time
np.allclose(C,np.dot(A,B))

In [None]:
np.dot(A,B)

In [None]:
C

In [None]:
%time
@cuda.jit
def my_kernel_2D(io_array):
    x, y = cuda.grid(2)
    if x<io_array.shape[0] and y<io_array.shape[1]:
        io_array[x,y]*=8

data = np.ones((16, 16))
data_glob=cuda.to_device(data)
threadsperblock = (32, 32)
blockspergrid_x = math.ceil(data.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(data.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)
my_kernel_2D[blockspergrid, threadsperblock](data_glob)
print(data_glob.copy_to_host())

In [None]:
A.shape,B.shape

In [None]:
TPB = 16  # tile edge length; also the required block size along each axis

@cuda.jit
def fast_matmul(A, B, C):
    """Tiled matrix multiplication C = A * B using shared memory.

    Must be launched with (TPB, TPB) threads per block: the shared tiles are
    TPB x TPB and are indexed directly by threadIdx.  Each block walks over
    the inner dimension in tiles of TPB columns of A / TPB rows of B, stages
    one tile of each operand in shared memory, and accumulates the partial
    products for its element of C.

    Shapes do not need to be multiples of TPB: tile entries that would fall
    outside A or B are staged as zero, and only in-range threads write to C.

    Note: the shared tiles are float32, so operand values are rounded to
    single precision when staged, whatever the dtype of A and B.
    """
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=nb.float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=nb.float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # No early return for out-of-range threads: every thread of the block has
    # to reach the barriers below, so bounds are checked per access instead.
    tmp = 0.
    n_tiles = (A.shape[1] + TPB - 1) // TPB   # ceiling: include a partial last tile
    for i in range(n_tiles):
        a_col = ty + i * TPB
        b_row = tx + i * TPB

        # Preload one tile of each operand, zero-filling anything out of range
        # so it contributes nothing to the sum.
        if x < A.shape[0] and a_col < A.shape[1]:
            sA[tx, ty] = A[x, a_col]
        else:
            sA[tx, ty] = 0.
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx, ty] = B[b_row, y]
        else:
            sB[tx, ty] = 0.

        cuda.syncthreads()            # wait until the whole tile is loaded
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()            # wait before the tile is overwritten

    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = tmp

In [None]:
%%time
# Build the launch geometry here from TPB and the output shape rather than
# reading the notebook-level `threadsperblock` / `blockspergrid`, which other
# cells may rebind.  fast_matmul indexes TPB x TPB shared tiles by threadIdx,
# so it has to run with exactly (TPB, TPB) threads per block.
fast_threadsperblock = (TPB, TPB)
fast_blockspergrid = (
    math.ceil(C_global_mem.shape[0] / TPB),
    math.ceil(C_global_mem.shape[1] / TPB),
)
fast_matmul[fast_blockspergrid, fast_threadsperblock](A_global_mem, B_global_mem, C_global_mem)

In [None]:
%%time
C = C_global_mem.copy_to_host()
print(C.shape)

In [None]:
C

In [6]:
import pycuda

In [4]:
# Imported here because, in file order, this cell runs before the notebook's
# own `import ctypes` cell; without it a fresh Run All raises NameError.
import ctypes

# Handle to the cuBLAS shared library, resolved through the dynamic loader's
# search path.
cudlib = ctypes.CDLL('libcublas.so')

In [5]:
N=128

In [None]:
# Deterministic float32 operands of order N.  Note that this rebinds the
# names A and B that the earlier matmul cells also use.
#   A - N x N matrix holding 0 .. N*N-1 filled row by row, stored column-major
#   B - length-N vector holding 10 .. N+9, with A's dtype
#   D - zero-filled, column-major array with A's shape and dtype
A = np.asfortranarray(np.arange(N * N, dtype=np.float32).reshape((N, N)))
B = (np.arange(N) + 10).astype(A.dtype)
D = np.zeros(A.shape, dtype=A.dtype, order='F')

In [None]:
np.asfortranarray(A).ctypes.data

In [None]:
D

In [None]:
cudlib.cublasSdot

In [None]:
A_global_mem.device_ctypes_pointer

In [None]:
# Attempt to run cuBLAS single-precision GEMM on the device buffers via ctypes.
# NOTE(review): no argtypes/restype are declared for `cudlib.cublasSgemm` in
# this notebook, so ctypes applies its default conversions ('N' is a Python
# str, N and 1 are Python ints) -- confirm these match the C prototype.
# NOTE(review): ten arguments are passed.  The legacy cuBLAS Sgemm prototype
# appears to be (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc),
# i.e. the leading dimensions look to be missing -- check the cuBLAS docs.
# NOTE(review): the *_global_mem buffers were uploaded from float64 arrays of
# shape (64, 96), (96, 32) and (64, 32), while this call passes N for every
# dimension and targets a float32 routine -- verify dtypes and sizes first.
cudlib.cublasSgemm('N', 'N', N, N, N, 1, A_global_mem.device_ctypes_pointer, B_global_mem.device_ctypes_pointer, 1, C_global_mem.device_ctypes_pointer)

In [1]:
import ctypes
import numpy as np

In [2]:
def init_kernel_bias(num_inp_channels, kernel_size, num_kernels, mean=0, std=0.01):
    """Draw Gaussian initial values for a bank of square convolution kernels.

    Samples come from NumPy's global random state (weights first, then
    biases), are scaled by `std` and shifted by `mean`.

    Returns:
        (weights, bias), both float32:
        weights has shape (num_inp_channels, kernel_size, kernel_size, num_kernels);
        bias has shape (1, num_kernels).
    """
    weight_noise = np.random.randn(num_inp_channels, kernel_size, kernel_size, num_kernels)
    bias_noise = np.random.randn(1, num_kernels)
    # Optional fan-in scaling, left disabled:
    # weights/=np.sqrt(num_inp_channels)
    weights = (std * weight_noise + mean).astype(np.float32)
    bias = (std * bias_noise + mean).astype(np.float32)
    return weights, bias

In [3]:
w0,b0=init_kernel_bias(num_inp_channels=32,kernel_size=3,num_kernels=128)

In [4]:
inp=np.random.randn(128,64,64,32).astype(np.float32)

In [5]:
# Layouts: inp[batches,row,col,d], w0[d,ksz,ksz,num_ker], b0[1,num_ker], stride[row,col]
padding = 0
stride = [1, 1]
ipp = inp.transpose(0, 3, 1, 2)  # channels-first view: ipp[batches,d,row,col]
output = []                      # placeholder; rebound by the convolution cell
ksz = w0.shape[1]
num_ker = w0.shape[3]
if not padding:  # take care of padding in backprop too
    # A falsy padding (including an explicit 0) selects (ksz-1)//2 per side.
    padding = (ksz - 1)//2  # currently don't give 'even' ksz

batches, d, row, col = ipp.shape
out_row = (row - ksz + 2*padding)//stride[0] + 1
out_col = (col - ksz + 2*padding)//stride[1] + 1

# Allocate the zero-padded input directly as float32 (no float64 temporary).
padded = np.zeros((batches, d, row + 2*padding, col + 2*padding), dtype=np.float32)
# Explicit end indices: `padding:-padding` would select nothing when
# padding == 0 (ksz == 1) and make the assignment fail.
padded[:, :, padding:padding + row, padding:padding + col] = ipp

# From here on `row` and `col` are the padded spatial sizes.
row += 2*padding
col += 2*padding

In [6]:
img=padded[0]

In [7]:
# %%timeit
# Flat gather indices into one padded image img[d,row,col].  `take` reads the
# image in C order, where element (c, r, k) sits at c*row*col + r*col + k, so
# the step between consecutive image rows is `col` (the row length), not `row`.
# Using `row` only gave the right answer while the padded image was square.
#
# window: offsets of one ksz x ksz patch for every channel -> shape (d, ksz*ksz)
window = (np.arange(ksz)[:, None]*col + np.arange(ksz)).ravel() + np.arange(d)[:, None]*row*col
# slider: offset of every candidate top-left corner; strides are applied by
# the slicing on the next line
slider = (np.arange(out_row*stride[0])[:, None]*col + np.arange(out_col*stride[1]))
# ind[p, q]: flat index of element q of the window anchored at output position p
ind = window.ravel() + slider[::stride[0], ::stride[1]].ravel()[:, None]
# bind= np.arange(batches)[:,None]*d*row*col+ind.ravel()
# Kernels flattened to (d*ksz*ksz, num_ker), matching the column order of `ind`.
kern = w0.reshape(-1, num_ker)
# output=(np.dot(np.take(padded, bind).reshape(-1,d*ksz*ksz), kern)).reshape(batches,out_row,out_col,num_ker)

In [10]:
%%time
# Convolution as one matrix product per image (im2col + GEMM).
# Result buffer: one (out_row*out_col, num_ker) float32 slab per image,
# allocated in Fortran order.
output=np.empty((batches,out_row*out_col,num_ker),dtype=np.float32,order='F')
# Note: the loop rebinds the notebook-level name `img` on every iteration.
for i,img in enumerate(padded):      #img[d,row,col]
    # Gather every receptive field with the precomputed flat indices, then
    # multiply: windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    output[i]=np.dot(img.take(ind), kern)
# b0 has shape (1, num_ker) and broadcasts over images and output positions.
output+=b0
# Present the result as [batches, out_row, out_col, num_ker].
ans2=output.reshape(batches,out_row,out_col,num_ker)

CPU times: user 2.96 s, sys: 125 ms, total: 3.09 s
Wall time: 845 ms


In [11]:
ans2.nbytes/1024/1024

256.0

In [61]:
%%time
# Materialise the gathered windows of every image:
# coled[i] holds the (out_row*out_col, d*ksz*ksz) window matrix of image i.
coled=np.empty((batches,*ind.shape),dtype=np.float32)#,order='F')
# Note: the loop rebinds the notebook-level name `img` on every iteration.
for i,img in enumerate(padded):      #img[d,row,col]
    # windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    coled[i]=img.take(ind)
# output+=b0
# ans2=output.reshape(batches,out_row,out_col,num_ker)
# Collapse images and output positions into a single row axis.
# NOTE(review): order='F' changes which element lands in which row, not just
# the memory layout: row r of the result is batch r % batches, window
# r // batches (batch index varies fastest).  A plain C-order reshape would
# give row = batch*n_windows + window -- confirm which mapping is intended.
coled=coled.reshape(-1,d*ksz*ksz,order='F')
# output=(np.dot(np.take(padded, bind), self.kern)+self.biases)
# Size of the column matrix in MiB.
print(coled.nbytes/1024/1024)

576.0
CPU times: user 5.58 s, sys: 149 ms, total: 5.72 s
Wall time: 5.74 s


In [55]:
padded=np.asfortranarray(padded)

In [64]:
np.isfortran(b)

False

In [13]:
ind.shape,coled.shape,kern.shape,padded.shape

((4096, 288), (524288, 288), (288, 128), (128, 32, 66, 66))

In [47]:
a=coled.reshape(-1,d*ksz*ksz)
b=kern.reshape(d*ksz*ksz,-1)
(a.nbytes+b.nbytes)/1024/1024

576.140625

In [48]:
c=np.empty((a.shape[0],b.shape[1]),dtype=np.float32,order='F')
c.nbytes/1024/1024

256.0

In [70]:
np.isfortran(b)

True

In [67]:
%%time
a=np.asfortranarray(a)
b=np.asfortranarray(b)
# c=np.asfortranarray(c)

CPU times: user 340 µs, sys: 9 µs, total: 349 µs
Wall time: 216 µs


In [28]:
# m x k test operand (6 x 5): the values 11..40 laid out column by column
# (Fortran order), as float32.
a = np.arange(11, 41, dtype=np.float32).reshape((6, 5), order='F')

In [29]:
# k x n test operand (5 x 4): the values 11..30 laid out column by column
# (Fortran order), as float32.
b = np.arange(11, 31, dtype=np.float32).reshape((5, 4), order='F')

In [30]:
# m x n result buffer (6 x 4): all zeros, float32, Fortran order.
c = np.zeros((6, 4), dtype=np.float32, order='F')

In [28]:
c

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [32]:
np.asfortranarray(a).ctypes.data

93866661018848

In [71]:
sgemm=ctypes.CDLL('libsgemm.so')

In [72]:
al,bet=1,0

In [74]:
%%time
# Call the `gemm` entry point of libsgemm.so on the raw host buffers of a, b
# and c.  The three size arguments are a's row count, b's row count and c's
# column count; al and bet are passed as C floats.
# NOTE(review): the library's prototype is not visible in this notebook --
# presumably gemm(A, B, C, m, k, n, alpha, beta) computing
# C = alpha*A*B + beta*C on column-major float32 data; confirm against the
# libsgemm source, and make sure a, b and c are contiguous before the call.
sgemm.gemm(
    ctypes.c_void_p(a.ctypes.data),
    ctypes.c_void_p(b.ctypes.data),
    ctypes.c_void_p(c.ctypes.data),
    a.shape[0],
    b.shape[0],
    c.shape[1],
    ctypes.c_float(al),
    ctypes.c_float(bet),
)

CPU times: user 471 ms, sys: 197 ms, total: 667 ms
Wall time: 666 ms


0

In [25]:
# al*np.dot(a,b)+bet*c

In [75]:
c

array([[ 0.09098308,  0.23124403,  0.00225085, ...,  0.03616476,
        -0.04796178, -0.06420715],
       [ 0.16357875, -0.2178955 ,  0.1399096 , ..., -0.00023655,
        -0.06357962,  0.11701868],
       [ 0.0320385 ,  0.13602817,  0.01417744, ..., -0.01406885,
        -0.11398181, -0.05260916],
       ...,
       [ 0.13534063,  0.02805037, -0.22902332, ...,  0.07264413,
        -0.08482627,  0.04021799],
       [-0.02503863, -0.10760994,  0.04979577, ..., -0.117056  ,
         0.02615827,  0.05085191],
       [ 0.07215384, -0.07748975, -0.05282587, ...,  0.17543285,
        -0.05621064,  0.10188872]], dtype=float32)

In [77]:
%%time
# NumPy product of the same two operands, timed by this cell's %%time.
# NOTE(review): this rebinds `d`, which the convolution cells use as the
# input channel count (unpacked from ipp.shape and used in d*ksz*ksz).
# Re-running those cells after this one will misbehave -- consider giving the
# reference product its own name.
d=np.dot(a,b)
d

CPU times: user 2.68 s, sys: 49.4 ms, total: 2.73 s
Wall time: 730 ms
