Each module is provided together with tests that show it is doing its job properly, and both are saved here.

In [1]:
# Acceptable imports for project
from torch import empty , cat , arange
from torch.nn.functional import fold , unfold
import math

## ReLU

In [2]:
class relu(object):
    """Element-wise rectified linear unit with a hand-written backward pass.

    ``forward`` caches its input and the boolean mask of strictly positive
    entries; ``backward`` reuses that mask and stores the resulting gradient
    on the cached input's ``.grad`` attribute.
    """

    def __init__(self):
        # Nothing to set up: all state is created by ``forward``.
        pass

    def forward(self, input):
        """Return ``input`` with every non-positive entry multiplied by zero."""
        keep = input > 0
        self.input, self.positif_mask = input, keep
        return keep * input

    def backward(self, gradwrtoutput):
        """Mask the upstream gradient, store it on the cached input, return it."""
        masked_grad = self.positif_mask.int() * gradwrtoutput
        self.input.grad = masked_grad
        return self.input.grad

    def param(self):
        """Return the (empty) list of trainable parameters."""
        return list()

In [5]:
# Smoke test for relu: run a forward and a backward pass on random data.
# Nothing is asserted; the cell only checks that both calls execute.
dummy_input = empty(10, 5, 7, 3).normal_()

our_M = relu();

# The forward output is reused as a stand-in for the upstream gradient.
out = our_M.forward(dummy_input);
our_M.backward(out);

## Sigmoid

In [6]:
class sigmoid(object):
    """Element-wise logistic sigmoid with a hand-written backward pass.

    ``forward`` caches both the input and the activation; ``backward`` uses
    the cached activation and stores the gradient on the input's ``.grad``.
    """

    def forward(self, input):
        """Return ``1 / (1 + e**(-input))`` and cache it as ``self.output``."""
        self.input = input
        decay = math.e ** (-input)
        self.output = 1 / (1 + decay)
        return self.output

    def backward(self, gradwrtoutput):
        """Return ``output * (1 - output) * gradwrtoutput``, also kept on ``input.grad``."""
        local_slope = self.output * (1 - self.output)
        self.input.grad = local_slope * gradwrtoutput
        return self.input.grad

    def param(self):
        """Return the (empty) list of trainable parameters."""
        return list()

In [12]:
# Random target and input of identical shape; `target` and `out` are read
# again by the MSE cell further down.
target = empty(10, 5, 7, 3).normal_()
# NOTE: `input` shadows the Python builtin of the same name from here on.
input = empty(10, 5, 7, 3).normal_()

model = sigmoid()

# Forward pass only; the sigmoid backward pass is not exercised in this cell.
out = model.forward(input)

## MSE

In [13]:
class mse(object):
    """Mean-squared-error loss with a hand-written backward pass.

    ``forward`` averages the squared error over *every* element of the input
    (batch dimension included), so ``backward`` divides by the total number
    of elements to stay consistent with it.
    """

    def forward(self, input, target):
        """Return the scalar ``mean((input - target) ** 2)``.

        Both tensors are cached because ``backward`` needs them.
        """
        self.input = input
        self.target = target
        return (input - target).pow(2).mean()

    def backward(self, gradwrtoutput):
        """Return d(loss)/d(input) and store it on ``input.grad``.

        ``gradwrtoutput`` is accepted for interface symmetry with the other
        modules but is not used: the loss is the last node of the graph.
        """
        # The mean in ``forward`` runs over all elements, so the normaliser
        # is numel(); this also works for inputs of any rank.
        n_elements = self.input.numel()
        self.input.grad = 2 * (self.input - self.target) / n_elements
        return self.input.grad

    def param(self):
        """Return the (empty) list of trainable parameters."""
        return []

In [14]:
# Smoke test for mse, using `out` and `target` produced by the sigmoid cell.
my_mse = mse()

loss = my_mse.forward(out, target)

# The argument is not used by mse.backward; the gradient ends up on `out.grad`.
my_mse.backward(loss)

## Convolution

In [26]:
class convolution(object):
    """2-D convolution built on ``unfold``/``fold`` with hand-written gradients.

    Parameters
    ----------
    in_ch, out_ch : int
        Number of input / output channels.
    kernel_size : tuple of two ints, or int
        Kernel height and width. A single int means a square kernel.
    padding, stride : int
        Zero padding and stride, applied identically to both spatial dims.
    use_bias : bool
        When False the bias is a zero tensor that is never added and has no
        gradient (``dL_dB`` stays ``None``).

    Attributes
    ----------
    kernel : tensor of shape (out_ch, in_ch, k_1, k_2)
    bias   : tensor of shape (out_ch,)
    dL_dF, dL_dB : gradients from the last ``backward`` call, ``None`` before.
    """

    def __init__(self, in_ch, out_ch, kernel_size = (3,3), padding = 0, stride = 1, use_bias = False):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.k_1 = self.kernel_size[0]
        self.k_2 = self.kernel_size[1]
        self.k = self.k_1  # legacy alias, kept for code that reads ``.k``
        self.use_bias = use_bias
        self.stride = stride
        self.padding = padding
        self.kernel = empty(out_ch, in_ch, self.k_1, self.k_2).normal_()
        # ``empty`` returns uninitialised memory, so zero it explicitly
        # (multiplying by 0 would keep any NaN/inf that happens to be there).
        self.bias = empty(out_ch).normal_() if use_bias else empty(out_ch).zero_()
        # Defined up front so ``param()`` is usable before any backward pass.
        self.dL_dF = None
        self.dL_dB = None

    def forward(self, x):
        """Convolve ``x`` of shape (B, in_ch, H, W); return (B, out_ch, H_out, W_out)."""
        self.batch_size = x.size(0)
        self.h_in = x.size(-2)
        self.w_in = x.size(-1)
        # floor((size + 2*padding - kernel) / stride) + 1 along each dim.
        self.h_out = (self.h_in + 2*self.padding - self.k_1)//self.stride + 1
        self.w_out = (self.w_in + 2*self.padding - self.k_2)//self.stride + 1
        # Legacy single-size attributes, same meaning as before.
        self.s_in = self.w_in
        self.s_out = self.h_out

        # Columns of X_unf are the flattened receptive fields:
        # (B x (IN_CH x K1 x K2) x (H_out x W_out)).
        X_unf = unfold(x, kernel_size=(self.k_1, self.k_2), padding = self.padding, stride = self.stride)

        self.x = x
        self.X_unf = X_unf

        K_expand = self.kernel.reshape(self.out_ch, -1)   # (OUT_CH x (IN_CH x K1 x K2))
        O_expand = K_expand @ X_unf                       # (B x OUT_CH x (H_out x W_out))

        O = O_expand.view(self.batch_size, self.out_ch, self.h_out, self.w_out)
        return O + self.bias.view(1, -1, 1, 1) if self.use_bias else O

    def backward(self, gradwrtoutput):
        """Back-propagate ``gradwrtoutput`` of shape (B, out_ch, H_out, W_out).

        Must be called after ``forward``. Returns ``(dL_dX, dL_dF, dL_dB)``;
        ``dL_dB`` is ``None`` when the layer has no bias. The parameter
        gradients are also stored on the instance for ``param()``.
        """
        dL_dO = gradwrtoutput

        # --- gradient wrt the input --------------------------------------
        dL_dO_flat = dL_dO.reshape(self.batch_size, self.out_ch, -1)      # (B x OUT_CH x L)
        K_t = self.kernel.reshape(self.out_ch, -1).transpose(0, 1)        # ((IN_CH x K1 x K2) x OUT_CH)
        dL_dX_unf = K_t @ dL_dO_flat                                      # (B x (IN_CH x K1 x K2) x L)
        # ``fold`` sums overlapping patches back into the input layout.
        dL_dX = fold(dL_dX_unf, kernel_size = (self.k_1, self.k_2), padding = self.padding,
                     stride = self.stride, output_size = (self.h_in, self.w_in))

        # --- gradient wrt the kernel -------------------------------------
        dL_dO_exp = dL_dO.transpose(0, 1).reshape(self.out_ch, -1)        # (OUT_CH x (B x L))
        X_cols = self.X_unf.transpose(-1, -2).reshape(-1, self.X_unf.size(1))  # ((B x L) x (IN_CH x K1 x K2))
        dL_dF_exp = dL_dO_exp @ X_cols                                    # (OUT_CH x (IN_CH x K1 x K2))
        self.dL_dF = dL_dF_exp.view(self.out_ch, self.in_ch, self.k_1, self.k_2)

        # --- gradient wrt the bias ---------------------------------------
        # The bias is broadcast over batch and space, so its gradient is the
        # plain sum of the upstream gradient over those positions.
        self.dL_dB = dL_dO_exp.sum(dim=1) if self.use_bias else None

        return dL_dX, self.dL_dF, self.dL_dB

    def param(self) :
        """Return ``((kernel, dL_dF), (bias, dL_dB))``; gradients are ``None`` until set."""
        return ((self.kernel, self.dL_dF), (self.bias, self.dL_dB))

In [39]:
# Initial parameters: 7x7 input, 3x3 kernel, batch of 2, 2 -> 4 channels.
# NOTE: `stride` and `padding` are module-level names that a later cell rebinds.
s_1, s_2 = 7,7
k_1, k_2 = 3,3
bs = 2
ch_in, ch_out = 2, 4
stride = 2
padding = 1

# input tensor
X = empty(bs, ch_in, s_1, s_2).normal_()

# initialize convolution module
conv = convolution(ch_in, ch_out, kernel_size = (k_1, k_2), padding = padding, use_bias=True, stride = stride)

# get weights and bias and flag them as requiring grad
F = conv.kernel
B = conv.bias
F.requires_grad_()
B.requires_grad_()

# forward
out = conv.forward(X)

# backward: out/out is used as the upstream gradient (a tensor of ones
# wherever `out` is finite and non-zero)
dL_dX, dL_dF, dL_dB = conv.backward(out/out)

# show each parameter next to the gradient computed above
print(conv.param())

((tensor([[[[ 0.0069,  0.0188,  0.9383],
          [ 1.1074, -1.1675,  0.0813],
          [-0.1009, -0.5582, -1.3208]],

         [[-1.6055, -0.7318,  0.2603],
          [ 0.8248,  0.8601, -0.8107],
          [-0.6488,  1.1049,  1.0795]]],


        [[[ 0.4236,  0.0909,  1.2867],
          [-0.8907, -1.8950,  0.9544],
          [-0.7525, -0.9782, -1.6160]],

         [[-0.6253, -0.4612, -0.1277],
          [ 1.2424,  0.8762, -1.6619],
          [-0.2318,  0.3806, -0.4159]]],


        [[[-0.8200,  1.2555, -1.0944],
          [ 0.4727, -0.3980,  1.6263],
          [-0.8485,  0.7906,  1.0962]],

         [[ 1.5212, -0.7339,  0.9833],
          [-1.2588,  0.6699,  0.0499],
          [ 1.3753,  0.3723,  0.8509]]],


        [[[-0.1410, -1.1216,  0.4658],
          [ 0.5245,  0.6062,  1.3332],
          [-0.3514,  0.4615,  0.3269]],

         [[ 1.2005, -1.0177,  0.5376],
          [-0.5855,  0.8866,  0.1924],
          [-0.4232,  0.6157, -0.8977]]]], requires_grad=True), tensor([[[[-3.6174

## Transposed Convolution

In [35]:
class transposed_convolution(object):
    """2-D transposed convolution built on ``fold``/``unfold`` with manual gradients.

    Parameters
    ----------
    in_ch, out_ch : int
        Number of input / output channels.
    kernel_size : tuple of two ints, or int
        Kernel height and width. A single int means a square kernel.
    padding, stride : int
        Padding and stride, applied identically to both spatial dims.
    use_bias : bool
        When False the bias is a zero tensor that is never added.

    Attributes
    ----------
    kernel : tensor of shape (in_ch, out_ch, k_1, k_2)
    bias   : tensor of shape (out_ch,)
    dL_dF, dL_dB : gradients from the last ``backward`` call, ``None`` before.
    """

    def __init__(self, in_ch, out_ch, kernel_size = (3,3), padding = 0, stride = 1, use_bias = False):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.k_1 = self.kernel_size[0]
        self.k_2 = self.kernel_size[1]
        self.use_bias = use_bias
        self.stride = stride
        self.padding = padding
        self.kernel = empty(in_ch, out_ch, self.k_1, self.k_2).normal_()
        # ``empty`` returns uninitialised memory; ``0 * empty(...)`` would keep
        # any NaN/inf found there, so zero the buffer explicitly instead.
        self.bias = empty(out_ch).normal_() if use_bias else empty(out_ch).zero_()
        # Defined up front so ``param()`` is usable before any backward pass.
        self.dL_dF = None
        self.dL_dB = None

    def forward(self, x):
        """Upsample ``x`` of shape (B, in_ch, S1, S2); return (B, out_ch, O1, O2)."""
        self.x = x
        self.batch_size = x.size(0)
        self.s1 = self.x.size(-2)
        self.s2 = self.x.size(-1)
        # (size - 1) * stride + kernel - 2 * padding along each dim.
        o1 = (self.s1 - 1)*self.stride + 1 + self.k_1 - 1 - self.padding *2
        o2 = (self.s2 - 1)*self.stride + 1 + self.k_2 - 1 - self.padding *2

        self.o1 = o1
        self.o2 = o2

        x_exp = x.reshape(self.batch_size, self.in_ch, -1)            # B x IN_CH x (S1 x S2)
        K_exp = self.kernel.reshape(self.in_ch,-1).transpose(0,1)     # (OUT_CH x K1 x K2) x IN_CH
        O_unf = K_exp @ x_exp                                         # B x (OUT_CH x K1 x K2) x (S1 x S2)
        # ``fold`` scatters every input position's patch into the output and
        # sums the overlaps.
        out = fold(O_unf, kernel_size = (self.k_1, self.k_2), padding = self.padding, stride = self.stride, output_size = (o1,o2))

        return out + self.bias.view(1, -1, 1, 1) if self.use_bias else out

    def backward(self, gradwrtoutput):
        """Back-propagate ``gradwrtoutput`` of shape (B, out_ch, O1, O2).

        Must be called after ``forward``. Returns ``(dL_dX, dL_dF, dL_dB)``
        and stores all three on the instance.
        """
        dL_dO = gradwrtoutput      # B x OUT_CH x O1 x O2
        dO_dX = self.kernel

        # --- gradient wrt the input --------------------------------------
        dL_dO_unf = unfold(dL_dO, kernel_size = (self.k_1, self.k_2), padding = self.padding, stride = self.stride)
                                   # B x (OUT_CH x K1 x K2) x (S1 x S2)
        dO_dX_exp = dO_dX.reshape(self.in_ch, -1)                     # IN_CH x (OUT_CH x K1 x K2)
        dL_dX_exp = dO_dX_exp @ dL_dO_unf                             # B x IN_CH x (S1 x S2)
        self.dL_dX = dL_dX_exp.view(self.batch_size, self.in_ch, self.s1, self.s2)

        # --- gradient wrt the kernel -------------------------------------
        self.dL_dO_unf_K = dL_dO_unf.transpose(0,1).reshape(self.out_ch * self.k_1 * self.k_2, -1).transpose(0,1)
                                                                      # (B x S1 x S2) x (OUT_CH x K1 x K2)
        self.dO_dF_exp = self.x.transpose(0,1).reshape(self.in_ch, -1)  # IN_CH x (B x S1 x S2)
        self.dL_dF_exp = self.dO_dF_exp @ self.dL_dO_unf_K              # IN_CH x (OUT_CH x K1 x K2)
        self.dL_dF = self.dL_dF_exp.view(self.in_ch, self.out_ch, self.k_1, self.k_2)  # IN_CH x OUT_CH x K1 x K2

        # --- gradient wrt the bias ---------------------------------------
        # The bias is broadcast over batch and space, so its gradient is the
        # plain sum of the upstream gradient over those positions.
        dL_dO_exp = dL_dO.transpose(0,1).reshape(self.out_ch, -1)     # OUT_CH x (B x O1 x O2)
        self.dL_dB = dL_dO_exp.sum(dim=1)

        return self.dL_dX, self.dL_dF, self.dL_dB

    def param(self) :
        """Return ``((kernel, dL_dF), (bias, dL_dB))``; gradients are ``None`` until set."""
        return ((self.kernel, self.dL_dF), (self.bias, self.dL_dB))

In [37]:
# Configuration for the transposed-convolution smoke test.
# NOTE: `padding` and `stride` rebind the module-level names set by the
# convolution cell above.
padding = 2
stride = 1
in_ch = 3
out_ch = 2
batch_size = 5
si = 5

test = transposed_convolution(in_ch,out_ch, kernel_size = (3,3), use_bias = True, padding = padding, stride = stride)
# Overwrite the randomly initialised parameters with fresh random tensors.
kernel_test = empty((in_ch,out_ch,3,3)).normal_() #torch.ones((1,1,3,3))
bias_test = empty((out_ch)).normal_()
test.kernel = kernel_test
test.bias = bias_test
# NOTE: `input` shadows the Python builtin of the same name.
input = empty((batch_size,in_ch,si,si)).normal_() #torch.ones((1,1,5,5))
output = test.forward(input)

# output/output is used as the upstream gradient (a tensor of ones wherever
# `output` is finite and non-zero).
dL_dX, dL_dF, dL_dB = test.backward(output/output)
# show each parameter next to the gradient computed above
print(test.param())

((tensor([[[[ 1.5161,  0.0515,  1.3460],
          [ 0.5884,  0.2913, -0.0336],
          [ 0.8021,  0.0519,  0.2712]],

         [[-0.0918, -1.6271,  0.1195],
          [-1.4657,  1.0647,  0.1597],
          [ 0.7131, -0.6451,  1.4112]]],


        [[[-0.3635,  0.4464,  0.7092],
          [-1.9496,  0.1352,  0.2133],
          [ 2.3279,  1.4274, -1.0688]],

         [[-0.6493, -1.6794,  1.2861],
          [-1.0077,  0.1461,  1.0481],
          [-1.0401, -0.1514, -1.1340]]],


        [[[-0.0303,  0.2018, -1.4957],
          [ 0.7967,  1.9318,  1.6917],
          [-1.1244, -0.2905, -0.9547]],

         [[ 0.6784,  0.2066, -0.1596],
          [ 1.2293,  1.6930,  0.9298],
          [-0.8403,  2.4258, -0.1050]]]]), tensor([[[[-6.0013,  2.7847, 13.7594],
          [-5.7279,  2.0224, 10.8056],
          [-5.0121,  0.6958,  3.7170]],

         [[-6.0013,  2.7847, 13.7594],
          [-5.7279,  2.0224, 10.8056],
          [-5.0121,  0.6958,  3.7170]]],


        [[[-5.4770,  3.6271,  2.9972],