In [1]:
from nnet.layers import Layer,conv2d,conv2dtranspose

Seed: 349


In [1]:
import cupy as cp
import numpy as np

In [3]:
def init_kernel_bias(num_inp_channels, kernel_size, num_kernels,mean=0,std=0.01):
    """Sample Gaussian-initialised convolution kernels and biases.

    Args:
        num_inp_channels: number of input channels (first kernel axis).
        kernel_size: side length of the square kernel window.
        num_kernels: number of kernels (last kernel axis, also the bias width).
        mean: value added to every scaled sample (kernels and biases alike).
        std: factor applied to the standard-normal samples.

    Returns:
        A ``(kernels, biases)`` tuple of float32 arrays with shapes
        ``(num_inp_channels, kernel_size, kernel_size, num_kernels)`` and
        ``(1, num_kernels)``.
    """
    kernel_dims = (num_inp_channels, kernel_size, kernel_size, num_kernels)
    # Kernels are drawn before biases; both come from NumPy's global RNG,
    # so the draw order determines the values under a fixed seed.
    kernels = np.random.randn(*kernel_dims)
    kernels = std * kernels + mean
    offsets = np.random.randn(1, num_kernels)
    offsets = std * offsets + mean
    return kernels.astype(np.float32), offsets.astype(np.float32)

In [4]:
# Random 3x3 kernels for 32 input channels and 64 kernels; w0 has shape
# (32, 3, 3, 64) and b0 has shape (1, 64), both float32.
w0,b0=init_kernel_bias(num_inp_channels=32,kernel_size=3,num_kernels=64)

In [5]:
# Random float32 input of shape (128, 60, 60, 32); the next cells move the last
# axis to position 1 and unpack the result as (batch, channels, height, width).
inp=np.random.randn(128,60,60,32).astype(np.float32)

In [6]:
inp.shape,w0.shape

((128, 60, 60, 32), (32, 3, 3, 64))

In [7]:
# Copy the input to a CuPy array with the last axis moved to position 1:
# (128, 60, 60, 32) -> (128, 32, 60, 60).
inpd=cp.asarray(inp.transpose(0,3,1,2))
# Kernels are copied as-is: (32, 3, 3, 64).
w0d=cp.asarray(w0)

In [8]:
# Kernel extents: (channels, kernel height, kernel width, number of kernels).
ch, kh, kw, nk = w0d.shape
# Input extents; `ch` is rebound here (same value as above for these arrays).
bt, ch, h, w = inpd.shape
# Stride, padding and dilation are all fixed to 1 for this experiment.
sy,sx = (1,1)
ph,pw = (1,1)
dy,dx = (1,1)
# Output spatial size is set equal to the input size.
# NOTE(review): this holds for the 3x3 kernel / stride 1 / padding 1 used here;
# other settings would presumably need the general output-size formula.
out_h,out_w = (h,w)
# Destination buffer: one element per (batch, channel, ky, kx, out_y, out_x).
col = cp.empty((bt, ch, kh, kw, out_h, out_w), dtype=inpd.dtype)
# Elementwise kernel that fills `coled` from the raw input `inp`.
# For each output element the flat index `i` is decoded into
# (c0, ky, kx, out_y, out_x), mapped to an input coordinate (in_y, in_x) using
# stride, dilation and padding, and the input pixel is copied; positions that
# fall outside the image are written as 0.
# `c0` is i divided by kh*kw*out_row*out_col, so with the (bt, ch, ...) buffer
# above it enumerates batch and channel together, and `inp` is indexed as a
# flat (c0, row, col) array. Inside the kernel source, `row`/`col` are the input
# height/width arguments, unrelated to the Python variable `col` above.
im2col = cp.ElementwiseKernel(
    'raw T inp, int32 row, int32 col, int32 out_row, int32 out_col,'
    'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
    'int32 dy, int32 dx',
    'T coled',
    '''
       int c0 = i / (kh * kw * out_row * out_col);   // select channel
       int ky = i / (kw * out_row * out_col) % kh;   // select kernel y
       int kx = i / (out_row * out_col) % kw;        // select kernel x
       int out_y = i / out_col % out_row;            // select output y
       int out_x = i % out_col;                    // select output x
       int in_y = ky * dy + out_y * sy - ph;
       int in_x = kx * dx + out_x * sx - pw;
       if (in_y >= 0 && in_y < row && in_x >= 0 && in_x < col) {    // if in image bounds
         coled = inp[col * (in_y + row * c0) + in_x]; // choose pixel
       } else {
         coled = 0;                                // pad with 0
       }
    ''',
    'im2col')

In [9]:
inpd.shape,col.shape,w0d.shape

((128, 32, 60, 60), (128, 32, 3, 3, 60, 60), (32, 3, 3, 64))

In [10]:
%%time
# Fill the (bt, ch, kh, kw, out_h, out_w) column buffer from the input.
col=im2col(inpd.reduced_view(),
              h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
# Contract col axes (1, 2, 3) = (ch, kh, kw) with kernel axes (0, 1, 2);
# the result has shape (bt, out_h, out_w, nk). The bias b0 is not added here.
outd=cp.tensordot(col, w0d, ((1, 2, 3), (0, 1, 2)))
# Synchronize the null stream before the cell ends so the timing is taken
# after the queued GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 742 ms, sys: 31.1 ms, total: 773 ms
Wall time: 776 ms


In [11]:
outd.shape,col.shape

((128, 60, 60, 64), (128, 32, 3, 3, 60, 60))

In [12]:
del col

In [13]:
# The forward output is reused under the name `grads`, with the last axis
# moved to position 1: (bt, out_h, out_w, nk) -> (bt, nk, out_h, out_w).
grads=outd.transpose(0,3,1,2)
# Kernels are used untransposed; the alternative layout is left commented out.
wtd=w0d#.transpose(3,0,1,2)

In [203]:
wtd.shape,grads.shape

((5, 3, 3, 3), (1, 3, 2, 2))

In [158]:
%%time
# Contract the kernel-count axis (wtd axis 3) with grads axis 1. The result
# keeps wtd axes (0, 1, 2) followed by grads axes (0, 2, 3), i.e.
# (ch, kh, kw, batch, h, w).
gcol=cp.tensordot(wtd,grads,(3,1))
# Synchronize so the timing is taken after the queued GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 0 ns, sys: 5.04 ms, total: 5.04 ms
Wall time: 4.14 ms


In [16]:
# NOTE(review): the next cell in document order reads `gcol`; running this cell
# first makes it fail unless the tensordot cell is re-run (the execution
# counts show the cells were run out of order).
del gcol

In [159]:
%%time
# Move the batch axis (position 3 after the tensordot) to the front, giving the
# (n, ch, kh, kw, h, w) layout that the col2im cell unpacks, and make the
# result contiguous.
gcol=cp.ascontiguousarray(cp.moveaxis(gcol,3,0))           # REMOVE THIS SOMEHOW
# gcol=gcol.transpose(3,0,1,2,4,5)
# gcol=cp.ascontiguousarray(gcol)
# Synchronize so the timing is taken after the queued GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 108 µs, sys: 22 µs, total: 130 µs
Wall time: 136 µs


In [207]:
gcol.shape

(1, 5, 3, 3, 2, 2)

In [226]:
# Column tensor extents: (batch, channels, kernel_h, kernel_w, col_h, col_w).
n, ch, kh, kw, h, w = gcol.shape
# Size of the image that col2im writes: rows scale with the vertical stride
# (sy) and columns with the horizontal stride (sx), matching how the kernel
# below pairs out_row with sy and out_col with sx.
out_h, out_w = h*sy, w*sx
# Destination image buffer, one element per (batch, channel, y, x).
img = cp.empty((n, ch, out_h, out_w), dtype=gcol.dtype)
# Elementwise kernel that accumulates column entries back into image pixels.
# Kernel arguments:
#   row, col          extents of the image being written (`inp`); the flat
#                     index `i` is decoded into (c0, y, x) with them.
#   out_row, out_col  extents of the last two axes of `coled`.
# For each pixel (y, x) and every kernel offset (ky, kx), the kernel computes
# the column position that reads this pixel, skips positions that are out of
# range or not aligned to the stride, and sums the matching `coled` entries.
col2im=cp.ElementwiseKernel(
    'raw T coled, int32 row, int32 col, int32 out_row, int32 out_col,'
    'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
    'int32 dy, int32 dx',
    'T inp',
    '''
       int c0 = i / (row * col);
       int y  = i / col % row;
       int x  = i % col;
       T val = 0;
       for (int ky = 0; ky < kh; ++ky) {
         int out_y = (y + ph - ky * dy);
         if (0 > out_y || out_y >= out_row * sy) continue;
         if (out_y % sy != 0) continue;
         out_y /= sy;
         for (int kx = 0; kx < kw; ++kx) {
           int out_x = (x + pw - kx * dx);
           if (0 > out_x || out_x >= out_col * sx) continue;
           if (out_x % sx != 0) continue;
           out_x /= sx;
           int k = out_y + out_row * (kx + kw * (ky + kh * c0));
           val = val + coled[out_x + out_col * k];
         }
       }
       inp = val;
    ''',
    'col2im')

In [228]:
%%time
img=col2im(gcol.reduced_view(),
              h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, img)
cp.cuda.Stream.null.synchronize()

CPU times: user 1.37 ms, sys: 3.64 ms, total: 5.01 ms
Wall time: 4.18 ms


In [25]:
%%time
# NOTE(review): transpose(0,2,3,1) followed by transpose(0,3,1,2) restores the
# original axis order, so this leaves img in its existing (n, ch, h, w) layout;
# confirm whether a single layout change was intended.
img=cp.ascontiguousarray(img.transpose(0,2,3,1).transpose(0,3,1,2))
# Synchronize so the timing is taken after the queued GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 161 µs, sys: 31 µs, total: 192 µs
Wall time: 149 µs


In [26]:
# Reference layer from nnet.layers used to check the hand-written GPU path.
# It receives w0 reversed along both kernel axes with its first and last axes
# swapped, i.e. shape (64, 3, 3, 32); biases are set to 0.
ct=conv2dtranspose(input_shape=(60,60,64),kernel_size=3,stride=[1,1],kernels=w0[:,::-1,::-1,:].transpose(3,1,2,0),biases=0)

In [27]:
cto=ct.forward(outd.get())

In [28]:
# Compare the col2im result (moved to channels-last and copied to the host)
# with the reference layer output, using atol=1e-07 and allclose's default rtol.
np.allclose(img.transpose(0,2,3,1).get(),cto,atol=1e-07)

True

In [2]:
from nnet_gpu.layers import Conv2Dtranspose as gc2dt
import numpy as np
import cupy as cp

In [3]:
# Small integer-valued float32 kernel (values 1..8) of shape (1, 3, 3, 1).
w0=np.random.randint(1,9,(1,3,3,1)).astype(np.float32)
# NOTE(review): b0 has shape (1, 2) and is never passed to a layer in the cells
# shown here (both layers are built with biases=0) — confirm it is still needed.
b0=np.random.randint(1,9,(1,2)).astype(np.float32)

In [4]:
inp=np.random.randint(1,9,(1,8,8,1)).astype(np.float32)

In [5]:
inp.shape,w0.shape

((1, 8, 8, 1), (1, 3, 3, 1))

In [6]:
# inp/=inp
# w0/=w0

In [7]:
inpd=cp.asarray(inp)
w0d=cp.asarray(w0)
b0d=cp.asarray(b0)

In [8]:
# CPU reference layer with stride 2; it is given w0 reversed along both kernel
# axes with its first and last axes swapped, and biases set to 0.
ct=conv2dtranspose(input_shape=(8,8,1),kernel_size=3,stride=[2,2],kernels=w0[:,::-1,::-1,:].transpose(3,1,2,0),biases=0)

In [9]:
ct.kernels.shape

(1, 3, 3, 1)

In [10]:
# GPU layer under test, same configuration as `ct` but given the kernel w0d
# without the flip/axis swap applied for the CPU layer.
td=gc2dt(input_shape=(8,8,1),kernel_size=3,stride=[2,2],kernels=w0d,biases=0)

In [11]:
# NOTE(review): do_init is defined in nnet_gpu and not visible here; presumably
# it finishes building the layer from the saved constructor arguments — confirm.
td.do_init(td.saved_parameters)

In [12]:
td.kernels.shape#,td.biases.shape

(1, 3, 3, 1)

In [13]:
od=td.forward(inpd)

In [14]:
o=ct.forward(inp)

In [15]:
# Contract kernel axis 3 with axis 1 of the layer's stored input (td.inp),
# the same contraction as the hand-written gcol step earlier in the notebook.
coled=cp.tensordot(td.kernels,td.inp,(3,1))

In [16]:
# Move axis 3 to the front; the resulting shape is (1, 1, 3, 3, 8, 8).
coled=cp.moveaxis(coled,3,0)

In [17]:
coled.shape

(1, 1, 3, 3, 8, 8)

In [18]:
w0.squeeze(),ct.kernels.squeeze()

(array([[6., 8., 3.],
        [8., 8., 7.],
        [4., 2., 3.]], dtype=float32),
 array([[3., 2., 4.],
        [7., 8., 8.],
        [3., 8., 6.]], dtype=float32))

In [19]:
inp.squeeze()

array([[3., 1., 7., 6., 8., 2., 1., 5.],
       [3., 5., 8., 2., 2., 6., 1., 5.],
       [7., 8., 8., 7., 4., 2., 8., 1.],
       [5., 3., 1., 5., 2., 3., 6., 5.],
       [4., 8., 4., 6., 1., 4., 8., 5.],
       [5., 2., 7., 4., 3., 2., 8., 7.],
       [1., 6., 8., 4., 7., 2., 1., 6.],
       [7., 3., 5., 2., 4., 6., 1., 6.]], dtype=float32)

In [20]:
# Exact element-wise equality of CPU and GPU forward outputs (inputs and
# kernel are small integers stored as float32).
(o.squeeze()==od.squeeze().get()).all()

True

In [21]:
# Feed each layer its own forward output as the incoming gradient and check
# that the values returned by backprop match exactly.
(td.backprop(od).get() == ct.backprop(o)).all()

True

In [22]:
# Compare the d_c_w attributes left by backprop; the CPU one is flipped and
# axis-swapped the same way its kernels were when the layer was built.
(td.d_c_w.get() == ct.d_c_w[:,::-1,::-1,:].transpose(3,1,2,0)).all()

True

In [23]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv2DTranspose

In [24]:
# Keras reference: a single bias-free 3x3 transposed convolution with stride 2
# and 'same' padding on an (8, 8, 1) input; the summary reports a
# (None, 16, 16, 1) output and 9 parameters.
model = Sequential()
model.add(Conv2DTranspose(1, kernel_size=(3,3), strides=(2,2), input_shape=(8, 8, 1),use_bias=False,padding='same'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_transpose (Conv2DTran (None, 16, 16, 1)         9         
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.layers[0].weights[0].shape

TensorShape([3, 3, 1, 1])

In [26]:
w0.transpose(1,2,0,3).shape

(3, 3, 1, 1)

In [27]:
# Reorder w0 from (1, 3, 3, 1) to (3, 3, 1, 1) to match the Keras weight shape.
model.set_weights([w0.transpose(1,2,0,3)])

In [28]:
model.compile(optimizer=tf.keras.optimizers.SGD(), loss='mse')

In [29]:
yhat = model.predict(inp)

In [30]:
# Run one training step against an all-zero target of the same shape as the
# prediction; the value displayed is what train_on_batch returns (the loss).
model.train_on_batch(inp, np.zeros_like(yhat))

3326.45703125

In [31]:
# Change in the Keras weights caused by the single training step, scaled by 100.
# NOTE(review): 100 is presumably 1 / learning-rate of the default SGD
# optimizer, which would turn the weight delta back into a gradient — confirm.
df = (w0.transpose(1,2,0,3).squeeze()-model.weights[0].numpy().squeeze())*100
df#/df.max()

array([[162.29688 , 120.718765, 137.10938 ],
       [161.85938 , 102.6875  , 145.33597 ],
       [145.75781 ,  95.046875, 131.73438 ]], dtype=float32)

In [32]:
# Kernel gradient from the GPU layer, reordered to the Keras (3, 3, 1, 1)
# layout and squeezed to 3x3.
# NOTE(review): the divisor 200 is an unexplained scaling constant — confirm
# its origin and consider naming it.
fd = td.d_c_w.transpose(1,2,0,3).squeeze()/200
fd#/fd.max()

array([[103.87 ,  77.26 ,  87.75 ],
       [103.59 ,  65.72 ,  93.015],
       [ 93.285,  60.83 ,  84.31 ]], dtype=float32)

In [33]:
# Residual between the Keras-derived gradient and the rescaled GPU gradient.
# NOTE(review): 1.5625 == 400/256; it looks like it reconciles the /200 above
# with a 2/N factor from the MSE over the 16*16 output — confirm.
df-fd.get()*1.5625

array([[0.0000000e+00, 1.5258789e-05, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 3.0517578e-05],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [34]:
(yhat==od.get()).all()

True

In [35]:
yhat.shape,o.shape

((1, 16, 16, 1), (1, 16, 16, 1))

In [36]:
td.kernels.shape,inp.transpose(0,3,1,2).shape

((1, 3, 3, 1), (1, 1, 8, 8))