Each module will be given with tests to prove it's doing it's job properly and to save it here.

In [1]:
# Acceptable imports for project
from torch import empty , cat , arange
from torch.nn.functional import fold , unfold
import math

# For testing need
import torch
import torch.nn as nn

## ReLU

In [2]:
class relu(object) :
    def __init__(self):
        pass
    def forward(self, input) :
        self.input = input
        self.positif_mask = (input > 0)
        return self.positif_mask*(input)
    def backward(self, gradwrtoutput) :
        self.input.grad = self.positif_mask.int()*gradwrtoutput
        return self.input.grad
    def param(self) :
        return []

In [3]:
dummy_input = torch.randn(10, 5, 7, 3)
dummy_input2 = dummy_input.detach()

our_M = relu()
true_M = nn.ReLU()

out = our_M.forward(dummy_input)
out2 = true_M(dummy_input2)

print('Difference in forward is:', (out-out2).abs().sum().item())

dummy_output = torch.randn(1, 1, 5, 5)
print('From this output:\n',dummy_output)
_ = our_M.forward(dummy_output)
grad = our_M.backward(dummy_output)
print('We get the following backward:\n',grad)

Difference in forward is: 0.0
From this output:
 tensor([[[[ 0.3278, -1.8736,  0.5739,  0.1985,  1.0140],
          [ 1.4832, -0.1517,  1.7375, -1.0986, -0.5361],
          [ 1.6531, -0.4029,  2.0077,  0.2771, -0.0073],
          [-0.3105,  0.9046,  1.1402, -0.2158,  0.8267],
          [-0.1793, -1.1292, -0.8403, -1.1898, -0.0120]]]])
We get the following backward:
 tensor([[[[0.3278, -0.0000, 0.5739, 0.1985, 1.0140],
          [1.4832, -0.0000, 1.7375, -0.0000, -0.0000],
          [1.6531, -0.0000, 2.0077, 0.2771, -0.0000],
          [-0.0000, 0.9046, 1.1402, -0.0000, 0.8267],
          [-0.0000, -0.0000, -0.0000, -0.0000, -0.0000]]]])


## Sigmoid

In [4]:
class sigmoid(object) :
    def forward(self, input) :
        self.input = input
        self.output = 1/(1 + math.e**(-input))
        return  self.output
    def backward(self, gradwrtoutput ) :
        self.input.grad = self.output * (1-self.output) * gradwrtoutput
        return self.input.grad
    def param(self):
        return []

In [5]:
target = torch.randn(1, 1, 3, 3)
print('Target is\n',target)

input = torch.randn(1, 1, 3, 3, requires_grad=True)
input2 = input.detach().requires_grad_(True)

print('Input is\n',input)
model = sigmoid()
model2 = torch.nn.Sigmoid()

out = model.forward(input)
out2 = model2.forward(input2)

print('Difference in output:', (out-out2).abs().sum().item())

Target is
 tensor([[[[-0.4183,  2.5018,  0.8250],
          [-0.9432, -0.8053, -0.5207],
          [ 0.4313, -0.8316, -1.6740]]]])
Input is
 tensor([[[[ 0.2239, -1.2218, -0.1611],
          [-0.7936, -0.5843, -0.5434],
          [-0.5601,  1.9230,  0.5945]]]], requires_grad=True)
Difference in output: 1.043081283569336e-07


## MSE

In [6]:
class mse(object):
    def forward(self, input, target):
        self.input = input
        self.target = target
        return (input - target).pow(2).mean()
    def backward(self, gradwrtoutput):
        self.input.grad = 2*(self.input-self.target)/(self.input.size(-3)*self.input.size(-2)*self.input.size(-1))
    def param(self):
        return []

In [7]:
MSE = nn.MSELoss()
my_mse = mse()

loss = my_mse.forward(out, target)
loss2 = MSE(out2, target)

print('Our Loss is',loss.item(), 'while true loss is', loss2.item())
my_mse.backward(loss)
model.backward(my_mse.input.grad)

loss2.backward()

print('Difference in backward is:', (input.grad-input2.grad).abs().sum().item())

Our Loss is 2.028151035308838 while true loss is 2.028151035308838
Difference in backward is: 2.9802322387695312e-08


## Convolution

In [8]:
class convolution(object):
    def __init__(self, in_ch, out_ch, kernel_size = (3,3), padding = 0, stride = 1, use_bias = False):
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.k = self.kernel_size[0]
        self.use_bias = use_bias
        self.stride = stride
        self.padding = padding
        self.kernel = empty(out_ch, in_ch, self.k, self.k).normal_()
        self.bias = empty(out_ch).normal_() if use_bias else torch.zeros(out_ch)
        
    def forward(self, x):   
        
        self.batch_size = x.size(0)
        self.s_in = x.size(-1)
        self.s_out = int(math.ceil((x.size(-2)-self.k+1+self.padding*2)/(self.stride)))
        
        X_unf = unfold(x, kernel_size=(self.k, self.k), padding = self.padding, stride = self.stride)
        
        self.x = x
        self.X_unf = X_unf
    
        K_expand = self.kernel.view(self.out_ch, -1)
        O_expand = K_expand @ X_unf

        
        O = O_expand.view(self.batch_size, self.out_ch, self.s_out, self.s_out)
        return O + self.bias.view(1, -1, 1, 1) if self.use_bias else O
    
    def backward(self, gradwrtoutput):
        dL_dO = gradwrtoutput                                       # (B x OUT_CH x SO x SO)
        dO_dX = self.kernel                                         # (OUT_CH x IN_CH x SI x SI)

        dL_dO_exp = dL_dO.reshape(self.batch_size, self.out_ch, -1) # (B x OUT_CH x (SO x SO))
        dO_dX_exp = dO_dX.reshape(self.out_ch,-1).transpose(0,1)    # (OUT_CH x (IN_CH x SI x SI))
        dL_dO_unf = dO_dX_exp @ dL_dO_exp                           # (B x (IN_CH x SI x SI) x (SO x SO))

        dL_dX = fold(dL_dO_unf, kernel_size = (self.k, self.k), padding = self.padding, stride = self.stride, output_size = (self.s_in, self.s_in))
        
        # backward wrt weights
        dL_dO_exp = dL_dO.transpose(0,1).reshape(self.out_ch, -1) # (OUT_CH x (B x SO x SO))
        dO_dF_exp = self.X_unf.transpose(-1, -2).reshape(self.batch_size*self.s_out*self.s_out, -1) # ((B x SO x SO) x (IN_CH x K x K))
        dL_dF_exp = dL_dO_exp @ dO_dF_exp # (OUT_CH x  (IN_CH x K x K))
        
        self.dL_dF = dL_dF_exp.view(self.out_ch, self.in_ch, self.k, self.k)
        
        # backward wrt bias
        if self.use_bias:
            dO_dB_exp = 1+0*empty(self.batch_size * (self.s_out) * (self.s_out))
            self.dL_dB = dL_dO_exp @ dO_dB_exp
        else:
            self.dL_dB = None
        
        return dL_dX, self.dL_dF, self.dL_dB
        
    def param(self) :
        return ((self.kernel, self.dL_dF), (self.bias, self.dL_dB))

In [9]:
# Initial parameters
s_1, s_2 = 7,7
k_1, k_2 = 3,3
bs = 2
ch_in, ch_out = 2, 4
stride = 2
padding = 1
# input tensor 
X = torch.empty(bs, ch_in, s_1, s_2).normal_().requires_grad_()
X_copy = X.clone().detach().requires_grad_()

# initialize convolution moduls
conv = convolution(ch_in, ch_out, kernel_size = (k_1, k_2), padding = padding, use_bias=True, stride = stride)

# get weigts and bias
F = conv.kernel
B = conv.bias
F.requires_grad_()
B.requires_grad_()

# forward
out = conv.forward(X)
out_compare = torch.nn.functional.conv2d(X_copy, F, bias = B, stride = stride, padding=padding)

# backward
dL_dX, dL_dF, dL_dB = conv.backward(out/out)
out_compare.backward(out_compare/out_compare)

print('same output of conv: ', (out_compare - out).abs().sum().item()) 
print('same input gradient: ', (X_copy.grad - dL_dX).abs().sum().item())
print('same weigth gradient: ',(F.grad-dL_dF).abs().sum().item())
print('same bias gradient: ',(B.grad-dL_dB).abs().sum().item())

same output of conv:  2.746284008026123e-05
same input gradient:  1.5974044799804688e-05
same weigth gradient:  2.0503997802734375e-05
same bias gradient:  0.0


## Transposed Convolution

In [10]:
class transposed_convolution(object):
    def __init__(self, in_ch, out_ch, kernel_size = (3,3), padding = 0, stride = 1, use_bias = False):
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.k_1 = self.kernel_size[0]
        self.k_2 = self.kernel_size[1]
        self.use_bias = use_bias
        self.stride = stride
        self.padding = padding
        self.kernel = empty(in_ch, out_ch, self.k_1, self.k_2).normal_()
        self.bias = empty(out_ch).normal_() if use_bias else 0*empty(out_ch)
    def forward(self, x):
        self.x = x
        self.batch_size = x.size(0)
        self.s1 = self.x.size(-2)
        self.s2 = self.x.size(-1)
        o1 = (self.s1 - 1)*self.stride + 1 + self.k_1 - 1 - self.padding *2
        o2 = (self.s2 - 1)*self.stride + 1 + self.k_2 - 1 - self.padding *2
        
        self.o1 = o1
        self.o2 = o2
        
        x_exp = x.reshape(self.batch_size, self.in_ch, -1)
        K_exp = self.kernel.reshape(self.in_ch,-1).transpose(0,1)
        O_unf = K_exp @ x_exp
        out = fold(O_unf, kernel_size = (self.k_1, self.k_2), padding = self.padding, stride = self.stride, output_size = (o1,o2))
        
        return out + self.bias.view(1, -1, 1, 1) if self.use_bias else out
    
    def backward(self, gradwrtoutput):
        dL_dO = gradwrtoutput      # B x OUT_CH x SO x SO
        dO_dX = self.kernel
        
        dL_dO_unf = unfold(dL_dO, kernel_size = (self.k_1, self.k_2), padding = self.padding, stride = self.stride)
                                   # B x (OUT_CH x K x K) x SI x SI
        dO_dX_exp = dO_dX.view(self.in_ch, -1)
        dL_dX_exp = dO_dX_exp @ dL_dO_unf
        self.dL_dX = dL_dX_exp.view(self.batch_size, self.in_ch, self.s1, self.s2)
        
        self.dL_dO_unf_K = dL_dO_unf.transpose(0,1).reshape(self.out_ch * self.k_1 * self.k_2, -1).transpose(0,1)
                                                                    # (B x SI x SI) x (OUT_CH x K x K)
        self.dO_dF_exp = self.x.transpose(0,1).reshape(self.in_ch, -1)   # IN_CH x (B x SI x SI)
        self.dL_dF_exp = self.dO_dF_exp @ self.dL_dO_unf_K                         # IN_CH x (OUT_CH x K x K)                                                                       
        self.dL_dF = self.dL_dF_exp.view(self.in_ch, self.out_ch, self.k_1, self.k_2)  # OUT_CH x IN_CH x K x K
        
        dL_dO_exp = dL_dO.transpose(0,1).reshape(self.out_ch, -1)
        dO_dB_exp = 1+0*empty(self.batch_size * (self.o1) * (self.o2))
        self.dL_dB = dL_dO_exp @ dO_dB_exp
        
        return self.dL_dX, self.dL_dF, self.dL_dB
        
    def param(self) :
        return ((self.kernel, self.dL_dF), (self.bias, self.dL_dB))

In [11]:
padding = 2
stride = 1
in_ch = 3
out_ch = 2
batch_size = 5
si = 5

test = transposed_convolution(in_ch,out_ch, kernel_size = (3,3), use_bias = True, padding = padding, stride = stride)
kernel_test = empty((in_ch,out_ch,3,3)).normal_() #torch.ones((1,1,3,3))
bias_test = empty((out_ch)).normal_()
test.kernel = kernel_test
test.bias = bias_test
kernel_comp = kernel_test.detach().requires_grad_(True)
bias_comp = bias_test.detach().requires_grad_(True)
input = empty((batch_size,in_ch,si,si)).normal_() #torch.ones((1,1,5,5))
input_comp = input.detach().requires_grad_(True)
output = test.forward(input)

valid = nn.functional.conv_transpose2d(input_comp, kernel_comp, bias = bias_comp, padding = padding, stride = stride)
print('Difference in forward is',(valid-output).abs().sum().item())

valid.backward(valid/valid)

dL_dX, dL_dF, dL_dB = test.backward(output/output)

print('Difference in backward is',(dL_dX-input_comp.grad).abs().sum().item())
print('Difference in kernel grad is',(dL_dF-kernel_comp.grad).abs().sum().item())
print('Difference in bias grad is',(dL_dB-bias_comp.grad).abs().sum().item())

Difference in forward is 1.5497207641601562e-06
Difference in backward is 0.0
Difference in kernel grad is 3.0040740966796875e-05
Difference in bias grad is 0.0
