In [1]:
from torch import nn
import torch
from transformers import AutoConfig

In [2]:
# Checkpoint whose configuration supplies the layer sizes used in this notebook.
model_name = "bert-base-uncased"
# Load the configuration object for that checkpoint.
config = AutoConfig.from_pretrained(model_name)

Feed forward layer:

- Two layer fully connected NN
- Processes each embedding independently
- Position wise feed forward layer
- The intermediate (inner) size is usually 4 times the embedding size (here 3072 vs. 768), and a GELU activation is applied between the two layers
- Most of the model's memorization is thought to happen here; this layer gets scaled up when the model is scaled up

In [3]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Linear -> Dropout.

    Args:
        config: object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob`` attributes.
    """

    def __init__(self, config):
        super().__init__()
        model_dim = config.hidden_size
        inner_dim = config.intermediate_size
        # Sub-modules are assigned in the same order as before so that the
        # module repr and parameter ordering stay unchanged.
        self.linear_1 = nn.Linear(model_dim, inner_dim)
        self.linear_2 = nn.Linear(inner_dim, model_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Expand ``x`` with ``linear_1``, apply GELU, project back with
        ``linear_2`` and return the result after dropout."""
        expanded = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(expanded))

Read saved attention output

In [4]:
# Load the attention output tensor previously saved to disk.
# map_location='cpu' keeps the loaded data on the CPU, so the load also works
# on machines without the device the file was saved from, and the result
# matches the FeedForward module, which is never moved off the CPU here.
# NOTE: torch.load unpickles the file; only load files from a trusted source.
attn_output = torch.load('attn_output.pt', map_location='cpu')

In [5]:
# Build the feed-forward block from the loaded config.
feed_forward = FeedForward(config)
# Calling the module (rather than .forward) runs it on the attention output.
ff_outputs = feed_forward(attn_output)
# Show the output size as the cell result.
ff_outputs.shape

torch.Size([1, 7, 768])

In [6]:
# Display the module's repr (its registered sub-modules) as the cell output.
feed_forward

FeedForward(
  (linear_1): Linear(in_features=768, out_features=3072, bias=True)
  (linear_2): Linear(in_features=3072, out_features=768, bias=True)
  (gelu): GELU(approximate='none')
  (dropout): Dropout(p=0.1, inplace=False)
)

Info on nn.Module and forward method:

- In PyTorch, you always need to define a forward method for your neural network model. 
- But you never have to call `model.forward(x)` directly; call `model(x)` instead.
- The `super().__init__()` call (written `super(FeedForward, self).__init__()` in the explicit form) runs the constructor of the parent class: `FeedForward` is a subclass of `nn.Module` and inherits all of its methods.
- In the super class, `nn.Module`, there is a `__call__` method which obtains the `forward` function from the subclass and calls it.