# Playground for FeedForward Layer

## GELU Activation
* Use the Gaussian Error Linear Unit (GELU) instead of ReLU
    * Smoother than ReLU, for better performance

In [14]:
import torch
import torch.nn as nn

# Seed PyTorch's global RNG so the random weights and sample data below are reproducible.
torch.manual_seed(42)

class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    # sqrt(2 / pi) as a plain Python float. Using a scalar instead of building
    # a float32 tensor on every call avoids a per-forward allocation and keeps
    # the constant at full precision when ``x`` is float64.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: Input tensor.

        Returns:
            Tensor with the same shape as ``x``.
        """
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


In [15]:
import matplotlib.pyplot as plt

def plot_GELU_and_RELU():
    """Plot GELU and ReLU side by side over 100 points in [-3, 3].

    Each activation gets its own subplot with a title, axis labels and a
    grid. The figure is displayed with ``plt.show()``; nothing is returned.
    """
    inputs = torch.linspace(-3, 3, 100)  # sample data

    # Evaluate both activations on the same inputs, keyed by display label.
    curves = {
        "GELU": GELU()(inputs),
        "ReLU": nn.ReLU()(inputs),
    }

    fig, axes = plt.subplots(1, 2, figsize=(8, 3))
    for ax, (label, values) in zip(axes, curves.items()):
        ax.plot(inputs, values)
        ax.set_title(f"{label} activation function")
        ax.set_xlabel("x")
        ax.set_ylabel(f"{label}(x)")
        ax.grid(True)

    fig.tight_layout()
    plt.show()

# _test_plot = plot_GELU_and_RELU()

## FeedForward Layer

In [16]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Linear.

    The first linear layer expands ``emb_dim`` features to ``4 * emb_dim``
    and the second projects them back to ``emb_dim``.
    """

    def __init__(self, emb_dim, verbose=False):
        """Build the layer stack.

        Args:
            emb_dim: Number of input (and output) features.
            verbose: If true, print the chosen dimensions after construction.
        """
        super().__init__()

        # Hidden width of 4x the embedding size is a common convention.
        hidden_dim = emb_dim * 4

        # Create the linear layers in expand -> project order so that their
        # random initialisation draws from the RNG in that order.
        expand = nn.Linear(emb_dim, hidden_dim)
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, GELU(), project)

        if verbose:
            print("\n=== FeedForward Initialization ===")
            for caption, value in (
                ("    Input and output dimensions = ", emb_dim),
                ("    Hidden dimension = ", hidden_dim),
            ):
                print(caption, value)
            print("=== End Initialization ===\n")

    def forward(self, x, verbose=False):
        """Run ``x`` through the layer stack.

        Args:
            x: Input tensor passed to the first linear layer.
            verbose: Accepted but not used by this method.

        Returns:
            The output of the final linear layer.
        """
        result = self.layers(x)
        return result

## Test Run

In [17]:
def test_feedForward(verbose = False):
    """Smoke-test FeedForward on a small random batch.

    Builds a FeedForward with embedding size 6, feeds it a random tensor of
    shape (2, 3, 6), prints the input and output, and checks that the output
    shape matches the input shape.

    Args:
        verbose: Forwarded to both the FeedForward constructor and its
            forward call.

    Raises:
        AssertionError: If the output shape differs from the input shape.
    """
    embed_dim = 6
    print('Embbed_dim: ', embed_dim)

    ffn = FeedForward(embed_dim, verbose=verbose)

    x = torch.rand(2, 3, embed_dim)  # 2 batches, 3 context_length, embed_dim
    print("Sample data: ", x)

    out = ffn(x, verbose=verbose)

    print("\nOutput shape ", out.shape)
    print("Output data ", out )

    # FeedForward maps emb_dim -> hidden -> emb_dim, so the shape must be
    # preserved; without this check the "test" would verify nothing.
    assert out.shape == x.shape, (
        f"expected output shape {tuple(x.shape)}, got {tuple(out.shape)}"
    )


# Run the smoke test with verbose output; test_feedForward has no return statement, so _test_run is None.
_test_run = test_feedForward(True)

Embbed_dim:  6

=== FeedForward Initialization ===
    Input and output dimensions =  6
    Hidden dimension =  24
=== End Initialization ===

Sample data:  tensor([[[0.4234, 0.6038, 0.1525, 0.3970, 0.8703, 0.7563],
         [0.1836, 0.0991, 0.1583, 0.0066, 0.1142, 0.3764],
         [0.8374, 0.5837, 0.1197, 0.0989, 0.7487, 0.1281]],

        [[0.4384, 0.7399, 0.2686, 0.4455, 0.4565, 0.3817],
         [0.2465, 0.0543, 0.0958, 0.2323, 0.9829, 0.2585],
         [0.1642, 0.6212, 0.6378, 0.7740, 0.8801, 0.7784]]])

Output shape  torch.Size([2, 3, 6])
Output data  tensor([[[-0.2428,  0.2256,  0.1109, -0.0539,  0.3551,  0.1985],
         [-0.1200,  0.1231,  0.0985, -0.0327,  0.2840,  0.1089],
         [-0.2068,  0.2289,  0.0662, -0.0584,  0.3875,  0.1665]],

        [[-0.2002,  0.2058,  0.1277, -0.0006,  0.3472,  0.2213],
         [-0.1778,  0.2566,  0.1017, -0.0911,  0.3864,  0.1499],
         [-0.2514,  0.2692,  0.1715, -0.0407,  0.3557,  0.2312]]],
       grad_fn=<ViewBackward0>)
