Import libraries

In [1]:
import torch
import torch.nn as nn
import tiktoken

GeLU Activation function

In [2]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * 
        (x + 0.0044715*torch.pow(x,3))))

Example to understand shortcut connections

In [9]:
class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU()),]
        )

    def forward(self,x):
        for layer in self.layers:

            layer_output = layer(x)

            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x



In [10]:
layer_sizes = [3,3,3,3,3,1]
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123) 
model_without_shortcut = ExampleDeepNeuralNetwork(
layer_sizes, use_shortcut=False
)

In [11]:
def print_gradient(model,x):

    output = model(x)
    target = torch.tensor([[0.]])

    loss = nn.MSELoss()
    loss = loss(output, target)


    loss.backward()

    for name,param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()} ")

In [12]:
print_gradient(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020146823953837156 
layers.1.0.weight has gradient mean of 0.00011980159615632147 
layers.2.0.weight has gradient mean of 0.0007132355822250247 
layers.3.0.weight has gradient mean of 0.0013999989023432136 
layers.4.0.weight has gradient mean of 0.005051811225712299 


In [15]:
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(
layer_sizes, use_shortcut=True)
print_gradient(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.20885638892650604 
layers.1.0.weight has gradient mean of 0.1954943686723709 
layers.2.0.weight has gradient mean of 0.30882930755615234 
layers.3.0.weight has gradient mean of 0.25369951128959656 
layers.4.0.weight has gradient mean of 1.2657285928726196 
