In [1]:
import ctypes
import numpy as np

In [2]:
def init_kernel_bias(num_inp_channels, kernel_size, num_kernels,mean=0,std=0.01):
    shape = [num_inp_channels, kernel_size, kernel_size, num_kernels]
    weights = std*np.random.randn(*shape) + mean
    # weights/=np.sqrt(num_inp_channels)
    bias = std*np.random.randn(1,num_kernels) + mean
    return weights.astype(np.float32), bias.astype(np.float32)

In [5]:
w0,b0=init_kernel_bias(num_inp_channels=64,kernel_size=3,num_kernels=128)

In [4]:
inp=np.random.randn(256,32,32,64).astype(np.float32)

In [5]:
#inp[batches,row,col,d],w0(d,ksz,ksz,num_ker),b0[1,num_ker],stride[row,col]
padding=0
stride=[1,1]
ipp=inp.transpose(0,3,1,2)  #ipp[batches,d,row,col]
output=[]
ksz=w0.shape[1]
num_ker=w0.shape[3]
if not padding: #take care of padding in backprop too
    padding=(ksz-1)//2  #currently don't give 'even' ksz
out_row,out_col=((ipp.shape[2]-ksz+2*padding)//stride[0]+1),((ipp.shape[3]-ksz+2*padding)//stride[1]+1)
batches,d,row,col=ipp.shape
row+=2*padding
col+=2*padding

In [6]:
%%time
padded=np.zeros((batches,d,row,col),dtype=np.float32)
padded[:,:,padding:-padding,padding:-padding]=ipp

CPU times: user 44.8 ms, sys: 20 ms, total: 64.8 ms
Wall time: 64.1 ms


In [7]:
# %%timeit
window=(np.arange(ksz)[:,None]*row+np.arange(ksz)).ravel()+np.arange(d)[:,None]*row*col
slider=(np.arange(out_row*stride[0])[:,None]*row+np.arange(out_col*stride[1]))
ind = window.ravel()+slider[::stride[0],::stride[1]].ravel()[:,None]
# bind= np.arange(batches)[:,None]*d*row*col+ind.ravel()
kern = w0.reshape(-1,num_ker)
# output=(np.dot(np.take(padded, bind).reshape(-1,d*ksz*ksz), kern)).reshape(batches,out_row,out_col,num_ker)

In [8]:
%%time
output=np.empty((batches,out_row*out_col,num_ker),dtype=np.float32)
for i,img in enumerate(padded):      #img[d,row,col]
    # windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    output[i]=np.dot(img.take(ind), kern)
# output+=b0
ans1=output.reshape(batches,out_row,out_col,num_ker)

CPU times: user 2.99 s, sys: 172 ms, total: 3.16 s
Wall time: 859 ms


In [9]:
%%time
checker=np.empty((batches,*ind.shape),dtype=np.float32)#,order='F')
for i,img in enumerate(padded):      #img[d,row,col]
    # windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    checker[i]=img.take(ind)

CPU times: user 329 ms, sys: 46.7 ms, total: 376 ms
Wall time: 334 ms


In [10]:
import concurrent

In [11]:
def doit(i,img,ind):
    checker[i]=img.take(ind)

In [12]:
%%time
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_shifting = {executor.submit(doit,i,img,ind):
                         img for i,img in enumerate(padded)}

ckr=checker.reshape(-1,d*ksz*ksz)
ckr=np.asfortranarray(ckr)

CPU times: user 2.04 s, sys: 38.5 ms, total: 2.08 s
Wall time: 1.59 s


In [15]:
# %%time
# for i,future in enumerate(future_shifting):
#     coled[i]=future.result()

In [13]:
%%time
coled=np.empty((batches,*ind.shape),dtype=np.float32,order='C').reshape(-1,d*ksz*ksz,order='A')

CPU times: user 36 µs, sys: 3 µs, total: 39 µs
Wall time: 41.7 µs


In [14]:
%%time
a=coled.reshape(-1,d*ksz*ksz,order='A')
# b=np.asfortranarray(w0).reshape(d*ksz*ksz,-1,order='A')
b=w0.reshape(d*ksz*ksz,-1,order='A')
(a.nbytes+b.nbytes)/1024/1024

CPU times: user 19 µs, sys: 1 µs, total: 20 µs
Wall time: 25 µs


In [15]:
%%time
c=np.empty((a.shape[0],b.shape[1]),dtype=np.float32,order='F')
c.nbytes/1024/1024

CPU times: user 41 µs, sys: 3 µs, total: 44 µs
Wall time: 45.8 µs


In [16]:
%%time
a=np.asfortranarray(a)
b=np.asfortranarray(b)
c=np.asfortranarray(c)

CPU times: user 387 ms, sys: 69.7 ms, total: 457 ms
Wall time: 456 ms


In [17]:
a.shape

(262144, 576)

In [18]:
ctake=ctypes.CDLL("libctake.so")

In [23]:
%%time
ctake.take(ctypes.c_void_p(padded.ctypes.data),ctypes.c_void_p(ind.ctypes.data),ctypes.c_void_p(a.ctypes.data),ctypes.c_int(batches),ctypes.c_int(padded[0].size),ctypes.c_int(ind.size),ctypes.c_int(a.shape[0]),ctypes.c_int(a.shape[1]),ord('C'),ctypes.c_int(4))

CPU times: user 390 ms, sys: 47 µs, total: 390 ms
Wall time: 117 ms


0

In [27]:
%%time
coled=np.empty((batches,*ind.shape),dtype=np.float32,order='C').reshape(-1,d*ksz*ksz,order='A')
ctake.take(ctypes.c_void_p(padded.ctypes.data),ctypes.c_void_p(ind.ctypes.data),ctypes.c_void_p(coled.ctypes.data),ctypes.c_int(batches),ctypes.c_int(padded[0].size),ctypes.c_int(ind.size),ctypes.c_int(a.shape[0]),ctypes.c_int(a.shape[1]),ord('C'),ctypes.c_int(4))
output=np.dot(coled,kern)
# output+=b0
ans2=output.reshape(batches,out_row,out_col,num_ker)

CPU times: user 1.97 s, sys: 817 ms, total: 2.79 s
Wall time: 911 ms


In [28]:
(ans1==ans2).all()

True

In [29]:
%%time
a=np.asfortranarray(coled)
b=np.asfortranarray(kern)

CPU times: user 1.48 s, sys: 428 ms, total: 1.91 s
Wall time: 1.93 s


In [30]:
np.isfortran(b)

True

In [31]:
# from numba import cuda

In [32]:
# %%time
# A = cuda.to_device(a)
# B = cuda.to_device(b)
# C = cuda.to_device(c)

In [6]:
sgemm=ctypes.CDLL('libsgemm.so')

In [7]:
al,bet=1,0

In [8]:
# %%time
# sgemm.gemm(A.device_ctypes_pointer,B.device_ctypes_pointer,C.device_ctypes_pointer,a.shape[0],b.shape[0],c.shape[1],ctypes.c_float(al),ctypes.c_float(bet),ctypes.c_void_p(b0[0].ctypes.data))

In [None]:
%%time
sgemm.gemm(ctypes.c_void_p(a.ctypes.data),ctypes.c_void_p(b.ctypes.data),ctypes.c_void_p(c.ctypes.data),a.shape[0],b.shape[0],c.shape[1],ctypes.c_float(al),ctypes.c_float(bet),ctypes.c_void_p(b0[0].ctypes.data))

In [56]:
a.shape[0],b.shape[0],c.shape[1]

(6, 5, 4)

In [40]:
# al*np.dot(a,b)+bet*c

In [41]:
# c=C.copy_to_host()

In [42]:
%%time
ans3=c.reshape(ans2.shape,order='C')

CPU times: user 13 µs, sys: 3 µs, total: 16 µs
Wall time: 19.1 µs


In [43]:
ans2.shape

(256, 32, 32, 128)

In [44]:
(c.reshape(ans2.shape)==ans2).all()

False

In [45]:
np.allclose(c.reshape(ans2.shape),ans2)

False

In [46]:
abs(c.reshape(ans2.shape)-ans2).max()

1.3709068e-06

In [47]:
abs(c.reshape(ans2.shape)-ans2).sum()

2.1510658

In [12]:
a=np.random.rand(2000,3000).astype(np.float32)  #mxk
a.nbytes/1024/1024

22.88818359375

In [None]:
b=np.random.rand(3000,4000).astype(np.float32)  #kxn
b.nbytes/1024/1024

In [None]:
ct=np.empty((4000, 2000),dtype=np.float32)  #mxn
ct.nbytes/1024/1024

In [1]:
import ctypes
import numpy as np

In [2]:
a=np.arange(6).reshape(3,2).astype(np.float32)
ad=a.T.copy().reshape(a.shape)

In [3]:
b=np.arange(8).reshape(2,4).astype(np.float32)
bd=b.T.copy().reshape(b.shape)

In [4]:
c=np.empty((a.shape[0],b.shape[1]),dtype=np.float32)
cd=c.T.copy().reshape(c.shape)

In [5]:
ad.shape,bd.shape,cd.shape

((3, 2), (2, 4), (3, 4))

In [6]:
sgemm=ctypes.CDLL('libsgemm.so')
al,bet,b0=1,0,np.arange(1,dtype=np.float32)

In [7]:
m=c.shape[0]
n=c.shape[1]
k=b.shape[0]
lda=a.shape[0]
ldb=b.shape[0]
ldc=c.shape[0]
m,n,k,lda,ldb,ldc

(3, 4, 2, 3, 2, 3)

In [8]:
%%time
sgemm.gemm(ord('N'),ord('N'),ctypes.c_void_p(ad.ctypes.data),ctypes.c_void_p(bd.ctypes.data),ctypes.c_void_p(cd.ctypes.data),ctypes.c_int(m),ctypes.c_int(n),ctypes.c_int(k),ctypes.c_int(lda),ctypes.c_int(ldb),ctypes.c_int(ldc),ctypes.c_float(al),ctypes.c_float(bet),ctypes.c_void_p(b0.ctypes.data))

CPU times: user 248 ms, sys: 193 ms, total: 441 ms
Wall time: 527 ms


0

In [9]:
%%time
d=a.dot(b)

CPU times: user 316 µs, sys: 49 µs, total: 365 µs
Wall time: 271 µs


In [10]:
d.ravel(order='F')

array([ 4., 12., 20.,  5., 17., 29.,  6., 22., 38.,  7., 27., 47.],
      dtype=float32)

In [11]:
c=cd.T
c.astype(int)

array([[ 4, 17,  0],
       [12, 29,  0],
       [20,  0,  0],
       [ 5,  0,  0]])

In [12]:
abs(d-c).sum(),abs(d-c).mean(),abs(d-c).max()

ValueError: operands could not be broadcast together with shapes (3,4) (4,3) 

In [11]:
import tensorflow as tf

In [28]:
with tf.device("/device:GPU:0"):
    product = tf.matmul(a, b)

In [30]:
sess = tf.Session()

In [None]:
sess.run(tf.global_variables_initializer())
sess.run(product)