In [1]:
import timeit
import numpy as np
import matplotlib.pyplot as plt
import pickle
from time import time

In [1]:
from nnet.layers import Layer,conv2d,conv2dtranspose

Seed: 764


In [2]:
import cupy as cp
import numpy as np

In [3]:
def init_kernel_bias(num_inp_channels, kernel_size, num_kernels,mean=0,std=0.01):
    """Draw normally distributed float32 convolution kernels and biases.

    Values come from NumPy's global random state (``np.random.randn``), so
    seed it beforehand if reproducible weights are needed.

    Parameters
    ----------
    num_inp_channels : int
        Number of input channels (first axis of the kernel tensor).
    kernel_size : int or pair of ints
        Spatial kernel size. A single int gives a square kernel; a
        ``(height, width)`` pair gives a rectangular one.
    num_kernels : int
        Number of kernels (last axis of the kernel tensor).
    mean : float, optional
        Value added to every drawn sample, for kernels and biases alike.
    std : float, optional
        Factor the standard-normal samples are scaled by.

    Returns
    -------
    weights : np.ndarray
        float32 array of shape
        ``(num_inp_channels, kernel_h, kernel_w, num_kernels)``.
    bias : np.ndarray
        float32 array of shape ``(1, num_kernels)``.
    """
    # Accept both a scalar (square kernel) and an explicit (height, width) pair.
    if np.ndim(kernel_size) == 0:
        kernel_h = kernel_w = kernel_size
    else:
        kernel_h, kernel_w = kernel_size
    shape = (num_inp_channels, kernel_h, kernel_w, num_kernels)
    # Kernels are drawn before biases; keep this order so a fixed seed keeps
    # producing the same values.
    weights = std*np.random.randn(*shape) + mean
    bias = std*np.random.randn(1, num_kernels) + mean
    return weights.astype(np.float32), bias.astype(np.float32)

In [11]:
w0,b0=init_kernel_bias(num_inp_channels=32,kernel_size=3,num_kernels=64)

In [12]:
inp=np.random.randn(128,60,60,32).astype(np.float32)

In [13]:
inp.shape,w0.shape

((128, 60, 60, 32), (32, 3, 3, 64))

In [14]:
# Copy the host arrays to the GPU. The input is permuted from
# (128, 60, 60, 32) to (128, 32, 60, 60) first, i.e. the channel axis moves
# from last to second position; the kernels keep their stored layout.
inpd=cp.asarray(inp.transpose(0,3,1,2))
w0d=cp.asarray(w0)

In [15]:
# Unpack the problem size. w0d is laid out as (channels, kh, kw, num_kernels)
# and inpd as (batch, channels, height, width).
ch, kh, kw, nk = w0d.shape
bt, ch, h, w = inpd.shape
sy,sx = (1,1)   # stride along y / x
ph,pw = (1,1)   # zero padding along y / x
dy,dx = (1,1)   # dilation along y / x
# Standard convolution output size. With the 3x3 kernel, stride 1, padding 1
# and dilation 1 used here this evaluates to (h, w), but unlike a hard-coded
# (h, w) it stays correct when any of those parameters is changed.
out_h = (h + 2*ph - dy*(kh - 1) - 1)//sy + 1
out_w = (w + 2*pw - dx*(kw - 1) - 1)//sx + 1
# Destination buffer for the unfolded patches.
col = cp.empty((bt, ch, kh, kw, out_h, out_w), dtype=inpd.dtype)
# Elementwise kernel that fills one element of `coled` per invocation, where
# `i` is the flat index into the output buffer. The input is passed as `raw`
# so the kernel can gather from an arbitrary position.
#   * `row`/`col` are the input height/width, `out_row`/`out_col` the output
#     height/width.
#   * `c0` is the leading index left after stripping (ky, kx, out_y, out_x)
#     from `i`. Because the buffer is (batch, channels, ...), it is the fused
#     batch*channels index rather than the channel alone, which is what the
#     flat input offset `col * (in_y + row * c0) + in_x` needs.
#   * Taps that land outside the image are written as 0 (zero padding).
im2col = cp.ElementwiseKernel(
    'raw T inp, int32 row, int32 col, int32 out_row, int32 out_col,'
    'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
    'int32 dy, int32 dx',
    'T coled',
    '''
       int c0 = i / (kh * kw * out_row * out_col);   // select channel
       int ky = i / (kw * out_row * out_col) % kh;   // select kernel y
       int kx = i / (out_row * out_col) % kw;        // select kernel x
       int out_y = i / out_col % out_row;            // select output y
       int out_x = i % out_col;                    // select output x
       int in_y = ky * dy + out_y * sy - ph;
       int in_x = kx * dx + out_x * sx - pw;
       if (in_y >= 0 && in_y < row && in_x >= 0 && in_x < col) {    // if in image bounds
         coled = inp[col * (in_y + row * c0) + in_x]; // choose pixel
       } else {
         coled = 0;                                // pad with 0
       }
    ''',
    'im2col')

In [16]:
inpd.shape,col.shape,w0d.shape

((128, 32, 60, 60), (128, 32, 3, 3, 60, 60), (32, 3, 3, 64))

In [17]:
%%time
# Forward convolution: unfold the input into patches with im2col, then
# contract the (channel, ky, kx) axes of the patches (axes 1, 2, 3) with the
# matching kernel axes (0, 1, 2). The result is (batch, out_h, out_w, num_kernels).
# NOTE(review): the recorded time presumably includes compiling the kernel on first use — confirm by re-running.
col=im2col(inpd.reduced_view(),
              h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
outd=cp.tensordot(col, w0d, ((1, 2, 3), (0, 1, 2)))
# Wait for the default stream so %%time measures the GPU work, not just the launch.
cp.cuda.Stream.null.synchronize()

CPU times: user 708 ms, sys: 67.7 ms, total: 775 ms
Wall time: 775 ms


In [18]:
outd.shape,col.shape

((128, 60, 60, 64), (128, 32, 3, 3, 60, 60))

In [19]:
del col

In [4]:
from nnet_gpu.layers import conv2dtranspose as gc2dt

In [5]:
inp=np.random.randn(128,14,14,32).astype(np.float32)

In [6]:
td=gc2dt(64,input_shape=(14,14,32),kernel_size=3,stride=(2,2))

In [7]:
t=conv2dtranspose(64,input_shape=(14,14,32),kernel_size=3,stride=(2,2))

In [8]:
inpd=cp.asarray(inp)

In [9]:
inp.shape,inpd.shape

((128, 14, 14, 32), (128, 14, 14, 32))

In [10]:
t.kernels.shape

(32, 3, 3, 64)

In [11]:
# Load the CPU layer's `flipped` kernels into the GPU layer (presumably so the
# two forward passes below are comparable — confirm); the extra transpose is
# left disabled.
td.kernels=cp.asarray(t.flipped)#.transpose(3,1,2,0))

In [12]:
td.biases=cp.asarray(t.biases)

In [14]:
od=td.forward(inpd)

In [15]:
o=t.forward(inp)

In [16]:
o.shape,od.shape

((128, 28, 28, 64), (128, 28, 28, 64))

In [18]:
# Compare the CPU and GPU conv2dtranspose outputs (GPU result copied to host).
# The recorded result is False with atol=1e-07 and rtol left at its default.
# NOTE(review): this may only be float32 rounding against a very tight tolerance rather than a logic error — inspect the maximum absolute difference to confirm.
np.allclose(o,od.get(),atol=1e-07)

False

In [29]:
# Use the forward output as the upstream gradient, viewed as
# (batch, num_kernels, height, width).
grads=outd.transpose(0,3,1,2)
# The kernels are used in their stored layout; the transpose stays disabled.
wtd=w0d#.transpose(3,0,1,2)

In [31]:
wtd.shape,grads.shape

((32, 3, 3, 64), (128, 64, 60, 60))

In [32]:
%%time
# Contract the num_kernels axis (axis 3 of wtd with axis 1 of grads). The
# result keeps wtd's remaining axes first, then those of grads:
# (channels, kh, kw, batch, height, width).
gcol=cp.tensordot(wtd,grads,(3,1))
# Wait for the default stream so %%time measures the GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 72.6 ms, sys: 3.69 ms, total: 76.3 ms
Wall time: 74.8 ms


In [46]:
# Drop the reference to the column-gradient buffer.
# NOTE(review): this cell was executed as In [46], i.e. after the cells that
# follow it; run top to bottom it removes gcol before the next cell reads it.
del gcol

In [33]:
%%time
# Move the batch axis (axis 3 of the tensordot result) to the front and make a
# C-contiguous copy, because col2im reads the buffer through flat indices in
# (batch, channels, kh, kw, out_h, out_w) order.
# NOTE(review): gcol is rebound from itself, so running this cell a second
# time permutes the axes again.
gcol=cp.ascontiguousarray(cp.moveaxis(gcol,3,0))           # REMOVE THIS SOMEHOW
# gcol=gcol.transpose(3,0,1,2,4,5)
# gcol=cp.ascontiguousarray(gcol)
# Wait for the default stream so %%time measures the copy itself.
cp.cuda.Stream.null.synchronize()

CPU times: user 139 ms, sys: 2.78 ms, total: 141 ms
Wall time: 141 ms


In [34]:
gcol.shape

(128, 32, 3, 3, 60, 60)

In [35]:
gcol.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True

In [36]:
# Fold a column buffer back into an image. gcol is unpacked as
# (batch, channels, kh, kw, out_h, out_w).
n, ch, kh, kw, out_h, out_w = gcol.shape
# The image gets the same spatial size as the column grid.
# NOTE(review): presumably only valid for the stride-1, pad-1, 3x3 setup used
# in this notebook — confirm before reusing with other parameters.
h, w = out_h, out_w
# Destination image buffer, (batch, channels, height, width).
img = cp.empty((n, ch, h, w), dtype=gcol.dtype)
# Elementwise kernel that computes one element of the output image `inp` per
# invocation, where `i` is the flat index into that image. The column buffer
# is passed as `raw` so the kernel can read arbitrary positions.
#   * `row`/`col` are the image height/width, `out_row`/`out_col` the column
#     grid height/width.
#   * `c0` is the fused batch*channels index; `y`/`x` are the pixel coordinates.
#   * For every kernel tap (ky, kx), the kernel works out which column-grid
#     position would have read this pixel, skips taps that fall outside the
#     grid or are not aligned with the stride, and sums the remaining entries.
# The stride, padding and dilation values are not set in this cell; the call
# site passes whatever sy/sx, ph/pw, dy/dx an earlier cell assigned.
col2im=cp.ElementwiseKernel(
    'raw T coled, int32 row, int32 col, int32 out_row, int32 out_col,'
    'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
    'int32 dy, int32 dx',
    'T inp',
    '''
       int c0 = i / (row * col);
       int y  = i / col % row;
       int x  = i % col;
       T val = 0;
       for (int ky = 0; ky < kh; ++ky) {
         int out_y = (y + ph - ky * dy);
         if (0 > out_y || out_y >= out_row * sy) continue;
         if (out_y % sy != 0) continue;
         out_y /= sy;
         for (int kx = 0; kx < kw; ++kx) {
           int out_x = (x + pw - kx * dx);
           if (0 > out_x || out_x >= out_col * sx) continue;
           if (out_x % sx != 0) continue;
           out_x /= sx;
           int k = out_y + out_row * (kx + kw * (ky + kh * c0));
           val = val + coled[out_x + out_col * k];
         }
       }
       inp = val;
    ''',
    'col2im')

In [37]:
%%time
# Accumulate the column gradients into the input-gradient image; img is passed
# as the output buffer and rebound to the returned array.
img=col2im(gcol.reduced_view(),
              h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, img)
# Wait for the default stream so %%time measures the kernel, not just the launch.
cp.cuda.Stream.null.synchronize()

CPU times: user 89 ms, sys: 2.76 ms, total: 91.7 ms
Wall time: 91 ms


In [38]:
img.shape

(128, 32, 60, 60)

In [40]:
bias = cp.random.randn(1,32,dtype=cp.float32)

In [249]:
%%time
# View the image as (batch, height, width, channels) so the (1, 32) bias
# broadcasts along the trailing channel axis.
imgg=img.transpose(0,2,3,1)
# imgg=imgg+bias
imgg=cp.add(imgg,bias)
# Wait for the default stream so %%time measures the addition.
cp.cuda.Stream.null.synchronize()

CPU times: user 18.3 ms, sys: 201 µs, total: 18.5 ms
Wall time: 16.6 ms


In [219]:
%%time
# The two permutations cancel: (0,2,3,1) followed by (0,3,1,2) restores the
# original axis order, so the argument has the same layout as img.
# NOTE(review): the ~164 µs recorded below suggests no copy is made here, so
# this cell does not measure a real layout conversion — confirm the intent.
img=cp.ascontiguousarray(img.transpose(0,2,3,1).transpose(0,3,1,2))
cp.cuda.Stream.null.synchronize()

CPU times: user 179 µs, sys: 24 µs, total: 203 µs
Wall time: 164 µs


In [74]:
# CPU reference layer for the input gradient: a stride-1 transposed convolution
# whose kernels are w0 reversed along both spatial axes with the first and last
# (channel / kernel-count) axes swapped, and with biases set to 0.
ct=conv2dtranspose(input_shape=(60,60,64),kernel_size=3,stride=[1,1],kernels=w0[:,::-1,::-1,:].transpose(3,1,2,0),biases=0)

In [75]:
cto=ct.forward(outd.get())

In [80]:
# Check the GPU col2im result against the CPU reference: permute it to
# (batch, height, width, channels), copy it to the host and compare with cto.
np.allclose(img.transpose(0,2,3,1).get(),cto,atol=1e-07)

True

In [145]:
gcol.nbytes/1024/1024

506.25

In [1]:
from nnet_gpu.layers import conv2d

In [2]:
c=conv2d(64,input_shape=(60,60,32),kernel_size=3,biases=0)

In [3]:
c.d_ker.shape

(None, 3, 3, 64)

In [4]:
c.d_inp.shape

(None, 60, 60, 32)

In [None]:
%%time
out=c.forward(inpd)

In [55]:
outd.shape,out.shape

((128, 60, 60, 64), (128, 60, 60, 64))

In [139]:
np.allclose(out,outd.get(),atol=1e-06/2)

True

In [57]:
(128,60,60,32,3,3),c.kernels.shape

((128, 60, 60, 32, 3, 3), (32, 3, 3, 64))

In [105]:
odip=c.d_inp.forward(outd.get())

In [106]:
cto.shape,odip.shape

((128, 60, 60, 32), (128, 60, 60, 32))

In [107]:
(cto==odip).all()

True

In [109]:
c.backprop(out)

(128, 60, 60, 32)

In [135]:
obb=c.backprop(outd.get())

CPU times: user 3.26 s, sys: 219 ms, total: 3.48 s
Wall time: 1.15 s


In [123]:
np.allclose(odip,obb)

True

In [61]:
c.d_inp.coled.reshape(128,60,60,64,3,3).shape

(128, 60, 60, 64, 3, 3)

In [None]:
grads=outd

In [166]:
inpd.shape

(128, 32, 60, 60)

In [164]:
# Swap the batch and channel axes: (batch, channels, h, w) -> (channels, batch, h, w).
kinp=inpd.transpose(1,0,2,3)

In [106]:
grads.shape,inpd.shape,kinp.shape

((128, 60, 60, 64), (128, 32, 60, 60), (32, 128, 60, 60))

In [110]:
# Weight-gradient setup. The roles are swapped relative to the forward pass:
# the upstream gradient is unpacked where the kernel shape was before, and the
# axis-swapped input kinp is unpacked where the input was.
# With the recorded shapes, grads is (128, 60, 60, 64), so kh = kw = 60.
ch, kh, kw, nk = grads.shape  # w0d
# kinp is (32, 128, 60, 60): its leading axis is the input-channel count.
bt, ch, h, w = kinp.shape    # inpd
sy,sx = (1,1)   # stride along y / x
ph,pw = (1,1)   # zero padding along y / x
dy,dx = (1,1)   # dilation along y / x
# The "output" grid of this im2col is the kernel's spatial size (3, 3).
out_h,out_w = w0d.shape[1:3]
# Patch buffer; (32, 128, 60, 60, 3, 3) for the recorded shapes.
col = cp.empty((bt, ch, kh, kw, out_h, out_w), dtype=kinp.dtype)

In [111]:
kinp.shape,col.shape,grads.shape

((32, 128, 60, 60), (32, 128, 60, 60, 3, 3), (128, 60, 60, 64))

In [165]:
%%time
# kinp is a permuted view; make it C-contiguous because im2col reads its input
# through flat offsets computed from (leading index, row, column).
kinp=cp.ascontiguousarray(kinp)
# Wait for the default stream so %%time measures the copy.
cp.cuda.Stream.null.synchronize()

CPU times: user 15 ms, sys: 3.57 ms, total: 18.5 ms
Wall time: 16.8 ms


In [163]:
%%time
# Kernel gradient: unfold kinp into patches, then contract patch axes 1, 2, 3
# with axes 0, 1, 2 of grads (batch, height, width). What remains is
# (channels, 3, 3, num_kernels), the same shape as the kernels.
col=im2col(kinp.reduced_view(),
              h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
d_ker=cp.tensordot(col, grads, ((1, 2, 3), (0, 1, 2)))
# Wait for the default stream so %%time measures the GPU work.
cp.cuda.Stream.null.synchronize()

CPU times: user 452 ms, sys: 2.03 ms, total: 455 ms
Wall time: 453 ms


In [114]:
d_ker.shape

(32, 3, 3, 64)