# Test my backpropagation calculus

In [19]:
import numpy as np
import torch
from torch import nn

IN_SIZE, OUT_SIZE = 784, 100
HIDDEN_SIZE = [512, 1024, 1024]

IN_SIZE, OUT_SIZE = 300, 200
HIDDEN_SIZE = []

In [20]:
x = torch.empty((1, IN_SIZE)).uniform_(-10, 10)
y = torch.empty((1, OUT_SIZE)).uniform_(-10, 10)

In [21]:
class TestNet(nn.Module):
    def __init__(self, n_input, n_hiddens, n_output):
        super(TestNet, self).__init__()

        n_layers = [n_input] + n_hiddens + [n_output]
        self.layers = nn.ModuleList()
        for i in range(len(n_layers) - 1):
            self.layers.append(nn.Linear(n_layers[i], n_layers[i+1]))
    
    def forward(self, X):
        self.z, self.a = [], [X]
        for layer in self.layers:
            X = layer(X)
            self.z.append(X)
            X = torch.sigmoid(X)
            self.a.append(X)
        return X

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.uniform_(m.weight, -10, 10)
        nn.init.uniform_(m.bias, -10, 10)

In [22]:
net = TestNet(IN_SIZE, HIDDEN_SIZE, OUT_SIZE)
net.apply(init_weights)

TestNet(
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=200, bias=True)
  )
)

In [23]:
optimizer = torch.optim.SGD(net.parameters(), lr=0.03)
optimizer.zero_grad()

loss = nn.MSELoss()
loss(net(x), y).backward()

In [24]:
params = [param for param in net.parameters()]
for param in params:
    print(param.shape)

torch.Size([200, 300])
torch.Size([200])


In [25]:
def D_sigmoid(x):
    return torch.sigmoid(x) * (1 - torch.sigmoid(x))

def calculate_gradients(num_layers=4):
    delta = [torch.empty(net.z[i].shape[1]) for i in range(num_layers - 1)]
    delta.append((net.a[-1] - y) * D_sigmoid(net.z[-1]))
    delta[-1] = delta[-1].reshape((-1, 1))

    for i in range(num_layers - 1, 0, -1):
        delta[i - 1] = (params[i * 2].T @ delta[i]) * D_sigmoid(net.z[i - 1]).T
    
    gradients = []
    for i in range(len(delta)):
        gradients.append(delta[i] @ net.a[i])
        gradients.append(delta[i])
    
    return gradients


In [26]:
gradients = calculate_gradients(len(HIDDEN_SIZE) + 1)

In [27]:
for i, param in enumerate(params):
    torch_grad = param.grad if len(param.shape) > 1 else param.grad.view(-1, 1)
    my_grad = gradients[i]
    diff = abs(torch_grad - my_grad)
    # print(torch_grad, my_grad)
    print(torch_grad[diff > 5], my_grad[diff > 5])
    print(diff.shape, torch.max(diff))

tensor([-0.1097, -0.0750,  0.0826, -0.0793, -0.0955,  0.0605, -0.1035,  0.0793,
        -0.1035, -0.0609, -0.0722, -0.1192,  0.0924,  0.0719,  0.0657,  0.1178,
         0.0769, -0.1124, -0.1201, -0.0601,  0.0521,  0.0636, -0.0570,  0.0899,
        -0.0570,  0.1144, -0.1021, -0.0752, -0.1197,  0.0942, -0.0834, -0.1220,
        -0.1042,  0.1179,  0.0602,  0.1148, -0.0976, -0.0770, -0.1066,  0.0783,
         0.0634, -0.0617, -0.0629, -0.1227, -0.1116, -0.0792,  0.0581,  0.0743,
        -0.1145,  0.0805, -0.0565, -0.0692,  0.1226,  0.0566,  0.1116, -0.0858,
        -0.0880, -0.1226,  0.0732,  0.0542,  0.0718,  0.1043,  0.0726,  0.0917,
         0.0560, -0.0909,  0.1184,  0.1093,  0.0787,  0.0739, -0.1194,  0.1043,
        -0.1186,  0.0528, -0.0605,  0.0812,  0.1115, -0.0915, -0.1183,  0.0903,
        -0.0793, -0.1109,  0.0959,  0.1005, -0.1215, -0.0757, -0.0656,  0.0860,
         0.1014,  0.0828, -0.0837,  0.0588, -0.1124, -0.1042, -0.0788, -0.0936,
        -0.1230, -0.1202, -0.0992,  0.06