In [2]:
import torch
import torch.nn as nn


class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0/torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [3]:
class ExampleDeepNeuralNetwork(nn.Module):

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0],layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1],layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2],layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3],layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4],layer_sizes[4]), GELU()),
        ])

    
    def forward(self ,x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x  = x + layer_output
            else:
                x = layer_output
        
        return x

In [4]:
layer_size = [3,3,3,3,3,1]

sample_input = torch.tensor([1., 0., -1.])
torch.manual_seed (123)

model_without_shortcut = ExampleDeepNeuralNetwork(
    layer_size , use_shortcut=False
)

In [7]:
def print_gradients(model ,x):
    output = model(x)
    target = torch.tensor([0.])

    loss = nn.MSELoss()
    loss = loss(output, target)

    loss.backward()

    for name, param in model.named_parameters():
        if "weight" in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [8]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.0005546716856770217
layers.1.0.weight has gradient mean of 0.00033483587321825325
layers.2.0.weight has gradient mean of 0.0020371272694319487
layers.3.0.weight has gradient mean of 0.004004639573395252
layers.4.0.weight has gradient mean of 0.004936343524605036


  return F.mse_loss(input, target, reduction=self.reduction)


In [9]:
model_without_shortcut = ExampleDeepNeuralNetwork(
    layer_size , use_shortcut=True
)

print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 1.0842158794403076
layers.1.0.weight has gradient mean of 0.4020997881889343
layers.2.0.weight has gradient mean of 0.7748615741729736
layers.3.0.weight has gradient mean of 0.9566847681999207
layers.4.0.weight has gradient mean of 0.6406673789024353


  return F.mse_loss(input, target, reduction=self.reduction)
