### 1. ENCODER LAYER

In [10]:
import torch
import torch.nn as nn 

# Initialize the TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(
    d_model=3, 
    nhead=1, 
    dim_feedforward=4, 
    dropout=0.1, 
    activation="relu",
    batch_first=True
)


# run for a sample
src = torch.Tensor([[[ 0.69,  0.72, -1.41],
                     [ 0.21,  1.10, -1.31]]])

out = encoder_layer(src)
print(out)

tensor([[[ 0.5854,  0.8222, -1.4076],
         [ 0.1275,  1.1560, -1.2835]]], grad_fn=<NativeLayerNormBackward0>)


In [11]:
src.shape, out.shape 

(torch.Size([1, 2, 3]), torch.Size([1, 2, 3]))

In [12]:
# check

x = encoder_layer.self_attn(src, src, src)[0]
x = src + x
x1 = encoder_layer.norm1(x)
x = encoder_layer.linear2( torch.nn.ReLU()(encoder_layer.linear1(x1)) )
x = x + x1
x = encoder_layer.norm2(x)
print(x)

tensor([[[ 0.5983,  0.8106, -1.4089],
         [ 0.0828,  1.1812, -1.2640]]], grad_fn=<NativeLayerNormBackward0>)


### 2. MARK ENCODER LAYER

In [13]:
# Initialize the TransformerEncoderLayer
encoder_layer_mask = nn.TransformerEncoderLayer(
    d_model=3, 
    nhead=1, 
    dim_feedforward=4, 
    dropout=0.1, 
    activation="relu",
    batch_first=True
)

# run for a sample
src = torch.Tensor([[[ 0.69,  0.72, -1.41],
                     [ 0.21,  1.10, -1.31],
                     [-0.88,  0.60, -0.31]]])

mask = torch.triu(input=torch.ones(3, 3), diagonal=1).bool()

out = encoder_layer_mask(src, src_mask=mask)
print(out)

tensor([[[ 0.6199,  0.7909, -1.4108],
         [ 0.0297,  1.2096, -1.2393],
         [-0.7893,  1.4109, -0.6216]]], grad_fn=<NativeLayerNormBackward0>)


In [14]:
src.shape, out.shape

(torch.Size([1, 3, 3]), torch.Size([1, 3, 3]))

In [15]:
# check

x = encoder_layer.self_attn(src, src, src, attn_mask=mask)[0]
x = src + x
x1 = encoder_layer.norm1(x)
x = encoder_layer.linear2( torch.nn.ReLU()(encoder_layer.linear1(x1)) )
x = x + x1
x = encoder_layer.norm2(x)
print(x)

tensor([[[ 0.6375,  0.7745, -1.4120],
         [ 0.0828,  1.1812, -1.2640],
         [-1.1091,  1.3144, -0.2053]]], grad_fn=<NativeLayerNormBackward0>)


### 3. TRANSFORMER 

In [16]:
encoder_layer = nn.TransformerEncoderLayer(
    d_model=3, 
    nhead=1, 
    dim_feedforward=4, 
    dropout=0.1, 
    activation="relu",
    batch_first=True
)

src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])
context = encoder_layer(src)
print(context)

tensor([[[-0.2936, -1.0512,  1.3449],
         [ 1.1113,  0.2018, -1.3131]]], grad_fn=<NativeLayerNormBackward0>)


In [17]:
decoder_layer = nn.TransformerDecoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

# test
target = torch.Tensor( [[[0.3516, 0.9509, 0.2771],
                         [0.1993, 0.0177, 0.2628],
                         [0.0774, 0.5253, 0.6413],
                         [0.6749, 0.5501, 0.1641]]])
out = decoder_layer(target, context) # target: ~ Q, context: ~ K, V
print(out)

tensor([[[-0.0987,  1.2711, -1.1724],
         [-1.2769,  0.1121,  1.1648],
         [-1.2342,  1.2150,  0.0192],
         [ 0.6845,  0.7294, -1.4140]]], grad_fn=<NativeLayerNormBackward0>)


### 4. MASKED TRANSFORMER

In [18]:
encoder_layer = nn.TransformerEncoderLayer(
    d_model=3, 
    nhead=1, 
    dim_feedforward=4, 
    dropout=0.1, 
    activation="relu",
    batch_first=True
)

decoder_layer = nn.TransformerDecoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)


src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])

tgt = torch.Tensor([[[0.3516, 0.9509, 0.2771],
                     [0.1993, 0.0177, 0.2628],
                     [0.0774, 0.5253, 0.6413],
                     [0.6749, 0.5501, 0.1641]]])

context = encoder_layer(src)
mask = torch.triu(input=torch.ones((4, 4)), diagonal=1).bool()
out = decoder_layer(tgt, context, tgt_mask=mask)
print(out)

tensor([[[ 0.0172,  1.2160, -1.2333],
         [ 0.5286, -1.4003,  0.8716],
         [-1.3543,  0.3243,  1.0299],
         [ 1.2263, -0.0031, -1.2232]]], grad_fn=<NativeLayerNormBackward0>)


In [19]:
# check 

# encoder_layer
x = encoder_layer.self_attn(src, src, src)[0]
x = src + x
x1 = encoder_layer.norm1(x)
x = encoder_layer.linear2( torch.nn.ReLU()(encoder_layer.linear1(x1)) )
x = x + x1
context = encoder_layer.norm2(x)

# decoder_layer
mask = torch.triu(torch.ones(4, 4), diagonal=1).bool() 
x = decoder_layer.self_attn(target, target, target, attn_mask=mask)[0]
x = target + x
x1 = decoder_layer.norm1(x)
x = decoder_layer.multihead_attn(x1, context, context)[0]
x2 = decoder_layer.norm2(x + x1)
x = decoder_layer.linear2( torch.nn.ReLU()(decoder_layer.linear1(x2)) )
x = decoder_layer.norm3(x + x2)
print(x)



tensor([[[ 0.0461,  1.2011, -1.2471],
         [ 0.5188, -1.3988,  0.8799],
         [-1.3214,  0.2243,  1.0971],
         [ 1.2660, -0.0871, -1.1789]]], grad_fn=<NativeLayerNormBackward0>)
