In [1]:
import ctypes
import numpy as np

In [2]:
def init_kernel_bias(num_inp_channels, kernel_size, num_kernels,mean=0,std=0.01):
    shape = [num_inp_channels, kernel_size, kernel_size, num_kernels]
    weights = std*np.random.randn(*shape) + mean
    # weights/=np.sqrt(num_inp_channels)
    bias = std*np.random.randn(1,num_kernels) + mean
    return weights.astype(np.float32), bias.astype(np.float32)

In [3]:
w0,b0=init_kernel_bias(num_inp_channels=64,kernel_size=3,num_kernels=64)

In [4]:
inp=np.random.randn(128,32,32,64).astype(np.float32)

In [5]:
#inp[batches,row,col,d],w0(d,ksz,ksz,num_ker),b0[1,num_ker],stride[row,col]
padding=0
stride=[1,1]
ipp=inp.transpose(0,3,1,2)  #ipp[batches,d,row,col]
output=[]
ksz=w0.shape[1]
num_ker=w0.shape[3]
if not padding: #take care of padding in backprop too
    padding=(ksz-1)//2  #currently don't give 'even' ksz
out_row,out_col=((ipp.shape[2]-ksz+2*padding)//stride[0]+1),((ipp.shape[3]-ksz+2*padding)//stride[1]+1)
batches,d,row,col=ipp.shape
row+=2*padding
col+=2*padding

In [6]:
%%time
padded=np.zeros((batches,d,row,col),dtype=np.float32)
padded[:,:,padding:-padding,padding:-padding]=ipp

CPU times: user 20.8 ms, sys: 19.8 ms, total: 40.7 ms
Wall time: 38.9 ms


In [7]:
# %%timeit
window=(np.arange(ksz)[:,None]*row+np.arange(ksz)).ravel()+np.arange(d)[:,None]*row*col
slider=(np.arange(out_row*stride[0])[:,None]*row+np.arange(out_col*stride[1]))
ind = window.ravel()+slider[::stride[0],::stride[1]].ravel()[:,None]
# bind= np.arange(batches)[:,None]*d*row*col+ind.ravel()
kern = w0.reshape(-1,num_ker)
# output=(np.dot(np.take(padded, bind).reshape(-1,d*ksz*ksz), kern)).reshape(batches,out_row,out_col,num_ker)

In [10]:
%%time
output=np.empty((batches,out_row*out_col,num_ker),dtype=np.float32)
for i,img in enumerate(padded):      #img[d,row,col]
    # windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    output[i]=np.dot(img.take(ind), kern)
# output+=b0
ans1=output.reshape(batches,out_row,out_col,num_ker)

250 ms ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%time
checker=np.empty((batches,*ind.shape),dtype=np.float32)#,order='F')
for i,img in enumerate(padded):      #img[d,row,col]
    # windows(out_row*out_col, ksz*ksz*d) . kernels(d*ksz*ksz,num_ker)
    checker[i]=img.take(ind)

CPU times: user 136 ms, sys: 26.6 ms, total: 163 ms
Wall time: 162 ms


In [11]:
import concurrent

In [12]:
def doit(i,img,ind):
    checker[i]=img.take(ind)

In [13]:
%%time
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_shifting = {executor.submit(doit,i,img,ind):
                         img for i,img in enumerate(padded)}

ckr=checker.reshape(-1,d*ksz*ksz)
ckd=np.asfortranarray(ckr)

CPU times: user 964 ms, sys: 26.5 ms, total: 990 ms
Wall time: 752 ms


In [14]:
# %%time
# for i,future in enumerate(future_shifting):
#     coled[i]=future.result()

In [15]:
%%time
coled=np.empty((batches,*ind.shape),dtype=np.float32,order='C').reshape(-1,d*ksz*ksz,order='A')

CPU times: user 39 µs, sys: 0 ns, total: 39 µs
Wall time: 45.1 µs


In [16]:
%%time
a=coled.reshape(-1,d*ksz*ksz,order='A')
# b=np.asfortranarray(w0).reshape(d*ksz*ksz,-1,order='A')
b=w0.reshape(d*ksz*ksz,-1,order='A')
(a.nbytes+b.nbytes)/1024/1024

CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 23.6 µs


In [17]:
%%time
c=np.empty((a.shape[0],b.shape[1]),dtype=np.float32,order='F')
c.nbytes/1024/1024

CPU times: user 29 µs, sys: 1e+03 ns, total: 30 µs
Wall time: 32.9 µs


In [22]:
%%time
a=np.asfortranarray(a)
b=np.asfortranarray(b)
c=np.asfortranarray(c)

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 15.5 µs


In [19]:
a.shape

(131072, 576)

In [20]:
ctake=ctypes.CDLL("libctake.so")

In [21]:
%%time
ctake.take(ctypes.c_void_p(padded.ctypes.data),ctypes.c_void_p(ind.ctypes.data),ctypes.c_void_p(a.ctypes.data),ctypes.c_int(batches),ctypes.c_int(padded[0].size),ctypes.c_int(ind.size),ctypes.c_int(a.shape[0]),ctypes.c_int(a.shape[1]),ord('C'),ctypes.c_int(4))

CPU times: user 163 ms, sys: 3.52 ms, total: 167 ms
Wall time: 64.7 ms


0

In [30]:
%%time
coled=np.empty((batches,*ind.shape),dtype=np.float32,order='C').reshape(-1,d*ksz*ksz,order='A')
ctake.take(ctypes.c_void_p(padded.ctypes.data),ctypes.c_void_p(ind.ctypes.data),ctypes.c_void_p(coled.ctypes.data),ctypes.c_int(batches),ctypes.c_int(padded[0].size),ctypes.c_int(ind.size),ctypes.c_int(a.shape[0]),ctypes.c_int(a.shape[1]),ord('C'),ctypes.c_int(4))
output=np.dot(coled,kern)
# output+=b0
ans2=output.reshape(batches,out_row,out_col,num_ker)

CPU times: user 617 ms, sys: 59.8 ms, total: 677 ms
Wall time: 207 ms


In [32]:
(ans1==ans2).all()

True

In [96]:
(ckd==a).all()

True

In [101]:
%%timeit
np.asfortranarray(a)

630 ms ± 3.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [98]:
np.isfortran(a)

False

In [99]:
a

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.5933355 , -0.24140038],
       [ 0.        ,  0.        ,  0.        , ...,  1.5933355 ,
        -0.24140038,  1.4938748 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.24140038,
         1.4938748 ,  1.028721  ],
       ...,
       [-0.5295867 ,  0.57013875, -0.42779335, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.57013875, -0.42779335,  0.07762764, ...,  0.        ,
         0.        ,  0.        ],
       [-0.42779335,  0.07762764,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [57]:
# from numba import cuda

In [58]:
# %%time
# A = cuda.to_device(a)
# B = cuda.to_device(b)
# C = cuda.to_device(c)

In [59]:
sgemm=ctypes.CDLL('libsgemm.so')

In [60]:
al,bet=1,0

In [61]:
# %%time
# sgemm.gemm(A.device_ctypes_pointer,B.device_ctypes_pointer,C.device_ctypes_pointer,a.shape[0],b.shape[0],c.shape[1],ctypes.c_float(al),ctypes.c_float(bet),ctypes.c_void_p(b0[0].ctypes.data))

In [62]:
%%time
sgemm.gemm(ctypes.c_void_p(a.ctypes.data),ctypes.c_void_p(b.ctypes.data),ctypes.c_void_p(c.ctypes.data),a.shape[0],b.shape[0],c.shape[1],ctypes.c_float(al),ctypes.c_float(bet),ctypes.c_void_p(b0[0].ctypes.data))

CPU times: user 175 ms, sys: 86.2 ms, total: 261 ms
Wall time: 260 ms


0

In [63]:
a.shape[0],b.shape[0],c.shape[1]

(131072, 576, 64)

In [64]:
# al*np.dot(a,b)+bet*c

In [65]:
# c=C.copy_to_host()

In [66]:
ans2.shape

(128, 32, 32, 64)

In [71]:
np.asfortranarray(ans2)

array([[[[ 1.11248657e-01,  1.23698935e-01,  4.28952575e-02, ...,
           2.99073249e-01,  4.64981347e-02, -9.18098241e-02],
         [-3.34478110e-01,  6.38031587e-03, -2.08894134e-01, ...,
           2.23203003e-01,  2.53423631e-01, -3.54520738e-01],
         [ 2.17673123e-01, -6.67957291e-02,  4.35677916e-02, ...,
           1.62335023e-01, -1.72403812e-01,  3.29605520e-01],
         ...,
         [-2.15308443e-02,  3.48355353e-01, -7.59251043e-03, ...,
          -1.22283608e-01,  1.48752853e-01,  3.29008363e-02],
         [-1.85663462e-01, -4.83224243e-02,  1.86661586e-01, ...,
          -1.32136926e-01, -4.58692551e-01, -2.59265840e-01],
         [ 4.99134511e-03, -7.39155337e-02, -5.37355423e-01, ...,
           2.19304979e-01,  1.19999304e-01,  4.54932928e-01]],

        [[-1.53459534e-01, -1.33222938e-01,  2.21497238e-01, ...,
           1.59685463e-01, -4.28689003e-01,  1.54156908e-01],
         [-4.74078387e-01,  3.74567419e-01, -9.79259610e-03, ...,
           2.92164147e

In [72]:
c.reshape(ans2.shape)

array([[[[ 2.13725001e-01,  9.09664705e-02,  1.09683938e-01, ...,
           7.79242516e-02, -2.99185291e-02, -4.19293046e-02],
         [-8.55199546e-02, -1.64813027e-01,  1.59411326e-01, ...,
          -4.28122044e-01, -5.03792167e-02,  1.15887232e-01],
         [ 2.54085213e-02,  3.79411280e-02, -1.44441769e-01, ...,
          -2.81209834e-02,  6.53684139e-03,  8.60214531e-02],
         ...,
         [ 2.97912478e-01,  4.74449731e-02,  1.26503080e-01, ...,
           2.47407705e-02,  5.66060431e-02, -4.94419783e-02],
         [ 2.63344526e-01,  2.79668748e-01, -2.30232120e-01, ...,
          -6.39272779e-02,  5.07020354e-02,  1.10337943e-01],
         [ 6.56193197e-02, -1.22592270e-01, -1.91393867e-02, ...,
           5.60409054e-02,  8.63260329e-02, -2.09990349e-02]],

        [[-2.21537858e-01, -1.17533036e-01, -2.02655643e-02, ...,
           4.52387854e-02, -2.04234779e-01, -2.19843596e-01],
         [ 1.76210493e-01, -2.46660814e-01,  7.37771094e-02, ...,
           8.28964412e

In [73]:
(c.reshape(ans2.shape)==ans2).all()

False

In [74]:
np.allclose(c.reshape(ans2.shape),ans2)

False

In [75]:
(c.reshape(ans2.shape)-ans2).max()

1.7599666

In [23]:
# a=(np.arange(30)+11).reshape(6,5,order='F').astype(np.float32)  #mxk

In [24]:
# b=(np.arange(20)+11).reshape(5,4,order='F').astype(np.float32)  #kxn

In [25]:
# c=(np.arange(24)*0).reshape(6,4,order='F').astype(np.float32)  #mxn