Each module is provided together with tests that demonstrate it does its job properly; the modules and their test results are saved here.

In [1]:
# Acceptable imports for project
from torch import empty , cat , arange
from torch.nn.functional import fold , unfold
import math

# For testing need
import torch
import torch.nn as nn

## ReLU

In [2]:
class relu(object) :
    """Element-wise rectified linear unit, ``max(0, x)``.

    ``forward`` caches the input and the mask of strictly positive entries;
    ``backward`` uses that mask to route the upstream gradient.
    """
    def __init__(self):
        pass
    def forward(self, input) :
        """Return ``input`` with every non-positive entry replaced by 0.

        The input tensor and its positive mask are stored on the instance
        for the following ``backward`` call.
        """
        self.input = input
        self.positif_mask = (input > 0)
        # clamp instead of ``mask * input``: multiplying -inf by a zero mask
        # yields NaN, whereas ReLU(-inf) must be 0.
        return input.clamp(min=0)
    def backward(self, gradwrtoutput) :
        """Propagate ``gradwrtoutput`` through the ReLU.

        The gradient is kept where the cached input was > 0 and zeroed
        elsewhere. It is written to ``self.input.grad`` and returned.
        Must be called after ``forward``.
        """
        self.input.grad = self.positif_mask.int()*gradwrtoutput
        return self.input.grad
    def param(self) :
        """ReLU has no trainable parameters."""
        return []

In [3]:
# Forward check: compare our relu against torch's nn.ReLU on the same random data.
dummy_input = torch.randn(10, 5, 7, 3)
dummy_input2 = dummy_input.detach()

our_M = relu()
true_M = nn.ReLU()

out = our_M.forward(dummy_input)
out2 = true_M(dummy_input2)

# Sum of absolute differences between the two forward outputs.
print('Difference in forward is:', (out-out2).abs().sum().item())

# Backward check (visual only): the same tensor is used as the forward input
# and as the upstream gradient, so the printed result should keep its positive
# entries and zero out the others. No comparison against autograd is made here.
dummy_output = torch.randn(1, 1, 5, 5)
print('From this output:\n',dummy_output)
_ = our_M.forward(dummy_output)
grad = our_M.backward(dummy_output)
print('We get the following backward:\n',grad)

Difference in forward is: 0.0
From this output:
 tensor([[[[-0.0597, -1.4054, -0.0167,  0.7760, -1.0205],
          [ 0.9403,  1.4587,  0.4981, -0.2956, -0.6794],
          [ 0.2983,  0.7684,  0.1283,  0.4312,  0.3987],
          [-0.1909, -1.0144,  0.3050, -0.9059, -0.5573],
          [ 1.3927,  0.3099, -1.5761,  0.1546, -0.2531]]]])
We get the following backward:
 tensor([[[[-0.0000, -0.0000, -0.0000, 0.7760, -0.0000],
          [0.9403, 1.4587, 0.4981, -0.0000, -0.0000],
          [0.2983, 0.7684, 0.1283, 0.4312, 0.3987],
          [-0.0000, -0.0000, 0.3050, -0.0000, -0.0000],
          [1.3927, 0.3099, -0.0000, 0.1546, -0.0000]]]])


## Sigmoid

In [4]:
class sigmoid(object) :
    """Element-wise logistic function ``1 / (1 + e^(-x))``.

    The activation computed in ``forward`` is cached so that ``backward``
    can form the local derivative ``s * (1 - s)`` without recomputing it.
    """
    def forward(self, input) :
        """Apply the logistic function to ``input`` and cache input and output."""
        self.input = input
        denominator = 1 + math.e**(-input)
        self.output = 1/denominator
        return self.output
    def backward(self, gradwrtoutput ) :
        """Scale ``gradwrtoutput`` by the sigmoid derivative.

        The result is stored in ``self.input.grad`` and returned.
        """
        activation = self.output
        local_derivative = activation * (1-activation)
        self.input.grad = local_derivative * gradwrtoutput
        return self.input.grad
    def param(self) :
        """Sigmoid has no trainable parameters."""
        return list()

In [5]:
# `target` is not used in this cell; it is consumed by the MSE test cell further down.
target = torch.randn(1, 1, 3, 3)
print('Target is\n',target)

# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook; the MSE test cell reads `input.grad`, so renaming it
# requires touching both cells.
input = torch.randn(1, 1, 3, 3, requires_grad=True)
# Independent copy with its own autograd history for the torch reference path.
input2 = input.detach().requires_grad_(True)

print('Input is\n',input)
model = sigmoid()
model2 = torch.nn.Sigmoid()

# `out`, `out2`, `model`, `input` and `input2` are reused by the MSE test cell.
out = model.forward(input)
out2 = model2.forward(input2)

# Sum of absolute differences between our forward output and torch's.
print('Difference in output:', (out-out2).abs().sum().item())

Target is
 tensor([[[[-0.4141, -0.0572, -1.2947],
          [-0.1265,  0.2434,  0.5600],
          [ 0.7101,  0.0277, -1.2050]]]])
Input is
 tensor([[[[-0.7430,  1.7515,  0.9391],
          [-2.9050,  1.3406,  1.1136],
          [-0.9338,  0.9406,  1.3349]]]], requires_grad=True)
Difference in output: 9.313225746154785e-08


## MSE

In [6]:
class mse(object):
    """Mean squared error loss, averaged over every element of the input."""
    def forward(self, input, target):
        """Return the scalar ``mean((input - target)^2)``.

        ``input`` and ``target`` are cached for the following ``backward``.
        """
        self.input = input
        self.target = target
        return (input - target).pow(2).mean()
    def backward(self, gradwrtoutput=None):
        """Compute the gradient of the loss with respect to the cached input.

        ``gradwrtoutput`` is accepted for interface symmetry with the other
        modules but is not used: the loss is treated as the end of the chain.

        The gradient ``2 * (input - target) / N`` is written to
        ``self.input.grad`` and returned.
        """
        # N is the total element count, matching the ``.mean()`` in forward.
        # Multiplying only the last three sizes ignored the batch dimension
        # and failed on inputs with fewer than three dimensions.
        self.input.grad = 2*(self.input-self.target)/self.input.numel()
        return self.input.grad
    def param(self):
        """The loss has no trainable parameters."""
        return []

In [7]:
# NOTE: this cell relies on `out`, `out2`, `target`, `model`, `input` and
# `input2` defined in the sigmoid test cell above; run that cell first.
MSE = nn.MSELoss()
my_mse = mse()

loss = my_mse.forward(out, target)
loss2 = MSE(out2, target)

print('Our Loss is',loss.item(), 'while true loss is', loss2.item())
# The argument passed to mse.backward is not used by the implementation.
my_mse.backward(loss)
# Chain the loss gradient back through our sigmoid; this fills `input.grad`.
model.backward(my_mse.input.grad)

# Reference gradient through autograd; this fills `input2.grad`.
loss2.backward()

print('Difference in backward is:', (input.grad-input2.grad).abs().sum().item())

Our Loss is 1.1602716445922852 while true loss is 1.1602716445922852
Difference in backward is: 1.8975697457790375e-08


## Convolution

In [8]:
class convolution(object):
    """2-D convolution layer (cross-correlation) implemented with ``unfold``.

    Parameters
    ----------
    in_ch, out_ch : int
        Number of input / output channels.
    kernel_size : tuple of two ints
        Kernel height and width.
    padding : int
        Zero padding applied on every side of both spatial dimensions.
    stride : int
        Step of the kernel in both spatial dimensions.
    use_bias : bool
        When True a per-output-channel bias is added in ``forward`` and its
        gradient is computed in ``backward``.

    ``backward`` requires ``padding <= kernel_size - 1`` in each dimension,
    because the remaining amount is passed to ``unfold`` as a padding value.
    """
    def __init__(self, in_ch, out_ch, kernel_size = (3,3), padding = 0, stride = 1, use_bias = False):
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.kernel_size = kernel_size
        self.k_1 = self.kernel_size[0]
        self.k_2 = self.kernel_size[1]
        self.use_bias = use_bias
        self.stride = stride
        self.padding = padding
        self.kernel = empty(out_ch, in_ch, self.k_1, self.k_2).normal_()
        # zero_() instead of ``0 * empty(...)``: uninitialised memory may hold
        # NaN or inf, and 0 * NaN is NaN.
        self.bias = empty(out_ch).normal_() if use_bias else empty(out_ch).zero_()

    def forward(self, x):
        """Convolve ``x`` of shape (batch, in_ch, H, W) with the kernel.

        Returns a tensor of shape (batch, out_ch, H_out, W_out) and caches
        ``x`` and the batch size for ``backward``.
        """
        self.x = x
        self.batch_size = x.size(0)
        # Every receptive field becomes one column: (batch, in_ch*k1*k2, L).
        X_unf = unfold(x, kernel_size=(self.k_1, self.k_2), padding = self.padding, stride = self.stride)
        K_expand = self.kernel.view(self.out_ch, -1)
        O_expand = K_expand @ X_unf
        # Number of kernel positions along each spatial dimension.
        s1 = (x.size(-2) + 2*self.padding - self.k_1) // self.stride + 1
        s2 = (x.size(-1) + 2*self.padding - self.k_2) // self.stride + 1

        O = O_expand.view(self.batch_size, self.out_ch, s1, s2)
        return O + self.bias.view(1, -1, 1, 1) if self.use_bias else O

    def backward(self, gradwrtoutput):
        """Back-propagate ``gradwrtoutput`` (same shape as the forward output).

        Returns ``(dL_dX, dL_dF, dL_dB)``: gradients with respect to the
        input, the kernel and the bias. ``dL_dB`` is None when the layer was
        built with ``use_bias=False``. The three values are also stored on
        the instance under the same names. Must be called after ``forward``.
        """
        s1 = self.x.size(-2)
        s2 = self.x.size(-1)
        # Size of the stride-1 output grid along each dimension.
        n1 = s1 - self.k_1 + 1 + self.padding*2
        n2 = s2 - self.k_2 + 1 + self.padding*2

        # Spread the gradient over the stride-1 grid, leaving zeros at the
        # positions the strided forward pass skipped. Height and width are
        # handled separately so non-square inputs and kernels work.
        dL_dO = gradwrtoutput.new_zeros(self.batch_size, self.out_ch, n1, n2)
        dL_dO[:, :, ::self.stride, ::self.stride] = gradwrtoutput

        # backward wrt input: correlate the dilated gradient with the
        # spatially flipped kernel whose in/out channel axes are swapped.
        kernel_back = self.kernel.flip(-2, -1).transpose(0,1)
        dL_dO_unf = unfold(dL_dO, kernel_size=(self.k_1, self.k_2), padding = (self.k_1 - 1 - self.padding, self.k_2 - 1 - self.padding), stride = 1)
        dO_dX_exp = kernel_back.reshape(self.in_ch, -1)
        dL_dX_exp = dO_dX_exp @ dL_dO_unf
        dL_dX = dL_dX_exp.view(self.batch_size, self.in_ch, s1, s2)

        # backward wrt weights: convolve the input with the dilated gradient,
        # with the batch axis playing the role of the channel axis. The axes
        # must be swapped with transpose; a ``view`` would only relabel the
        # memory and is correct solely when batch_size == in_ch.
        self.dL_dO = dL_dO.transpose(0,1)                   # (out_ch, batch, n1, n2)
        self.dO_dF = self.x.transpose(0,1).contiguous()     # (in_ch, batch, H, W)
        dL_dO_unf_F = self.dL_dO.reshape(self.out_ch, -1)
        dO_dF_exp = unfold(self.dO_dF, kernel_size = (n1, n2), padding = self.padding, stride = 1)
        dL_dF_exp = dL_dO_unf_F @ dO_dF_exp                 # (in_ch, out_ch, k1*k2)
        dL_dF = dL_dF_exp.transpose(0,1).reshape(self.kernel.size())

        # backward wrt bias: sum the gradient over batch and spatial positions.
        if self.use_bias:
            dL_dB = dL_dO_unf_F.sum(dim=1)
        else:
            dL_dB = None
        self.dL_dX = dL_dX
        self.dL_dF = dL_dF
        self.dL_dB = dL_dB
        return dL_dX, dL_dF, dL_dB

    def get_M(self, N):
        """Return the rows ``0, stride, 2*stride, ...`` of the N x N identity.

        The result is a float tensor of shape (ceil(N / stride), N). It is
        built explicitly from zeros: comparing an uninitialised tensor with
        itself is not reliable because NaN != NaN.
        """
        picked = arange(0, N, self.stride)
        M = empty(picked.numel(), N).zero_()
        M[arange(picked.numel()), picked] = 1
        return M

    def param(self) :
        """Return the trainable tensors as ``[kernel, bias]``."""
        return [self.kernel, self.bias]

In [10]:
# Initial parameters
# NOTE(review): bs == ch_in and the input/kernel are square here, so this cell
# does not exercise batch_size != in_ch or non-square shapes.
s_1, s_2 = 7,7
k_1, k_2 = 3,3
bs = 2
ch_in, ch_out = 2, 4
stride = 2
padding = 1
# Input tensor, plus an independent copy for the torch reference path
X = torch.empty(bs, ch_in, s_1, s_2).normal_().requires_grad_()
X_copy = X.clone().detach().requires_grad_()

# Initialize the convolution module
conv = convolution(ch_in, ch_out, kernel_size = (k_1, k_2), padding = padding, use_bias=True, stride = stride)

# Get weights and bias; enabling grad lets autograd fill F.grad / B.grad below
F = conv.kernel
B = conv.bias
F.requires_grad_()
B.requires_grad_()

# Forward: our module versus torch's conv2d with the same weights and bias
out = conv.forward(X)
out_compare = torch.nn.functional.conv2d(X_copy, F, bias = B, stride = stride, padding=padding)

# Backward: `out/out` is used as the upstream gradient (all ones wherever
# `out` is non-zero), fed to both our backward and autograd
dL_dX, dL_dF, dL_dB = conv.backward(out/out)
out_compare.backward(out_compare/out_compare)

# Each line prints the summed absolute difference against the reference
print('same output of conv: ', (out_compare - out).abs().sum().item()) 
print('same input gradient: ', (X_copy.grad - dL_dX).abs().sum().item())
print('same weigth gradient: ',(F.grad-dL_dF).abs().sum().item())
print('same bias gradient: ',(B.grad-dL_dB).abs().sum().item())

same output of conv:  3.135204315185547e-05
same input gradient:  2.6702880859375e-05
same weigth gradient:  0.0
same bias gradient:  0.0


## Transposed Convolution