In [1]:
import torch.nn as nn
import torch

# Initialize the TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True, 
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)
# run for a sample
src = torch.Tensor([[[ 0.69,  0.72, -1.41],
                     [ 0.21,  1.10, -1.31]]])
out = encoder_layer(src)
print(out)

tensor([[[ 0.9789,  0.3944, -1.3734],
         [ 0.2169,  1.1018, -1.3187]]], grad_fn=<NativeLayerNormBackward0>)


In [2]:
# check
# Re-derive the encoder layer's output by hand to confirm the order of
# operations: self-attention -> add & norm -> feed-forward -> add & norm.
# No dropout step is reproduced because the layer was built with dropout=0.0.

# Self-attention sub-block; [0] keeps the attention output and drops the weights.
attn_out = encoder_layer.self_attn(src, src, src)[0]
x1 = encoder_layer.norm1(src + attn_out)

# Feed-forward sub-block: linear1 -> ReLU -> linear2, then residual + norm.
ff_out = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))
x = encoder_layer.norm2(x1 + ff_out)

# Compare against the layer's own output programmatically instead of relying
# on a visual comparison of two printed tensors.
torch.testing.assert_close(x, out, rtol=1e-4, atol=1e-5)
print(x)

tensor([[[ 0.9789,  0.3944, -1.3734],
         [ 0.2169,  1.1018, -1.3187]]], grad_fn=<NativeLayerNormBackward0>)


## Masked Encoder

In [3]:
import torch.nn as nn
import torch

# Initialize the TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True, 
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)
mask = torch.triu(torch.ones(3, 3), diagonal=1).bool() 

In [4]:
# run for a sample
# One batch holding a 3-token sequence -> shape (batch=1, seq=3, d_model=3);
# the sequence length has to match the 3x3 causal mask.
# torch.tensor is the documented factory for creating a tensor from existing
# data; the legacy torch.Tensor constructor is avoided.
src = torch.tensor([[[0.69, 0.72, -1.41],
                     [0.21, 1.10, -1.31],
                     [-0.88, 0.60, -0.31]]])
out = encoder_layer(src, src_mask=mask)
print(out)

tensor([[[-0.3729,  1.3678, -0.9950],
         [-0.6216,  1.4109, -0.7893],
         [-1.3178,  1.1033,  0.2145]]], grad_fn=<NativeLayerNormBackward0>)


In [5]:
# check
# Re-derive the masked encoder output by hand: the same steps as the layer's
# forward pass, with the causal mask passed to self-attention as attn_mask.
# No dropout step is reproduced because the layer was built with dropout=0.0.

# Masked self-attention; [0] keeps the attention output and drops the weights.
attn_out = encoder_layer.self_attn(src, src, src, attn_mask=mask)[0]
x1 = encoder_layer.norm1(src + attn_out)

# Feed-forward sub-block: linear1 -> ReLU -> linear2, then residual + norm.
ff_out = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))
x = encoder_layer.norm2(x1 + ff_out)

# Compare against the layer's own output programmatically instead of relying
# on a visual comparison of two printed tensors.
torch.testing.assert_close(x, out, rtol=1e-4, atol=1e-5)
print(x)

tensor([[[-0.3729,  1.3678, -0.9950],
         [-0.6216,  1.4109, -0.7893],
         [-1.3178,  1.1033,  0.2145]]], grad_fn=<NativeLayerNormBackward0>)


## Transformer

In [6]:
import torch.nn as nn
import torch

# Encoder
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

# test
src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])
context = encoder_layer(src)
print(context)

tensor([[[ 0.0565, -1.2520,  1.1955],
         [-0.0252, -1.2120,  1.2371]]], grad_fn=<NativeLayerNormBackward0>)


In [7]:
# Decoder
# Single-head decoder layer with the same sizes as the encoder layer:
# 3-dim embeddings, 4-unit feed-forward, no dropout and no bias terms.
decoder_layer = nn.TransformerDecoderLayer(
    d_model=3,
    nhead=1,
    dim_feedforward=4,
    dropout=0.0,
    bias=False,
    batch_first=True,
)

# test
# One batch holding a 4-token target sequence -> shape (1, 4, 3).
# torch.tensor is the documented factory for creating a tensor from existing
# data; the legacy torch.Tensor constructor is avoided.
target = torch.tensor([[[0.3516, 0.9509, 0.2771],
                        [0.1993, 0.0177, 0.2628],
                        [0.0774, 0.5253, 0.6413],
                        [0.6749, 0.5501, 0.1641]]])

# `context` (the encoder output) is passed as the decoder's memory. No
# tgt_mask is given, so self-attention over the target is unmasked here.
out = decoder_layer(target, context)
print(out)

tensor([[[-0.9688,  1.3766, -0.4079],
         [-0.0033, -1.2231,  1.2264],
         [-1.3552,  0.3274,  1.0278],
         [ 1.1957,  0.0562, -1.2519]]], grad_fn=<NativeLayerNormBackward0>)


## Masked Transformer

In [9]:
# Encoder
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

decoder_layer = nn.TransformerDecoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)


# test
src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])

tgt = torch.Tensor([[[0.3516, 0.9509, 0.2771],
                     [0.1993, 0.0177, 0.2628],
                     [0.0774, 0.5253, 0.6413],
                     [0.6749, 0.5501, 0.1641]]])

context = encoder_layer(src)
mask = torch.triu(torch.ones(4, 4), diagonal=1).bool()  
out = decoder_layer(tgt, context, tgt_mask=mask)
print(out)

tensor([[[-1.3823,  0.9500,  0.4323],
         [-0.9651, -0.4127,  1.3778],
         [-1.2738,  0.1048,  1.1690],
         [-0.0791,  1.2624, -1.1833]]], grad_fn=<NativeLayerNormBackward0>)


In [10]:
# encoder_layer
# Manual encoder pass: self-attention -> add & norm -> feed-forward -> add & norm.
# No dropout step is reproduced because both layers were built with dropout=0.0.
enc_attn = encoder_layer.self_attn(src, src, src)[0]  # [0]: output, not weights
enc_x1 = encoder_layer.norm1(src + enc_attn)
enc_ff = encoder_layer.linear2(torch.relu(encoder_layer.linear1(enc_x1)))
context = encoder_layer.norm2(enc_x1 + enc_ff)

# decoder_layer
# Use `tgt` (the tensor that was actually fed to decoder_layer when `out` was
# computed) rather than a same-valued variable left over from another cell,
# and size the causal mask from it.
tgt_len = tgt.size(1)
mask = torch.triu(torch.ones(tgt_len, tgt_len), diagonal=1).bool()

# 1) masked self-attention over the target, residual + norm1
self_attn_out = decoder_layer.self_attn(tgt, tgt, tgt, attn_mask=mask)[0]
x1 = decoder_layer.norm1(tgt + self_attn_out)

# 2) cross-attention: queries from the target, keys/values from the encoder
#    output, residual + norm2
cross_attn_out = decoder_layer.multihead_attn(x1, context, context)[0]
x2 = decoder_layer.norm2(x1 + cross_attn_out)

# 3) feed-forward: linear1 -> ReLU -> linear2, residual + norm3
ff_out = decoder_layer.linear2(torch.relu(decoder_layer.linear1(x2)))
x = decoder_layer.norm3(x2 + ff_out)

# Compare against the decoder layer's own output programmatically instead of
# relying on a visual comparison of two printed tensors.
torch.testing.assert_close(x, out, rtol=1e-4, atol=1e-5)
print(x)

tensor([[[-1.3823,  0.9500,  0.4323],
         [-0.9651, -0.4127,  1.3778],
         [-1.2738,  0.1048,  1.1690],
         [-0.0791,  1.2624, -1.1833]]], grad_fn=<NativeLayerNormBackward0>)
