In [4]:
import torch.nn as nn
import torch

# Initialize the TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True, 
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

# ---------------------  self_attn.in_proj_weight  --------
in_proj_weight  = torch.tensor([[ 0.01,  0.33, -0.35],
                                [-0.53,  0.46, -0.22],
                                [-0.13, -0.67,  0.28],
                                [ 0.64, -0.32, -0.59],
                                [ 0.59, -0.43,  0.26],
                                [-0.41,  0.68,  0.22],
                                [ 0.52,  0.02, -0.57],
                                [ 0.60, -0.52,  0.56],
                                [-0.08, -0.29, -0.01]]).float()
encoder_layer.self_attn.in_proj_weight = nn.Parameter(in_proj_weight)

# ---------------------  self_attn.out_proj.weight  --------
out_proj_weight  = torch.tensor([[ 0.45, -0.14, -0.43],
                                 [ 0.41,  0.21,  0.25],
                                 [-0.45, -0.52,  0.04]]).float()
encoder_layer.self_attn.out_proj.weight = nn.Parameter(out_proj_weight)

# ---------------------  linear1.weight  --------
linear1_weight  = torch.tensor([[ 0.13, -0.38, -0.21],
                                [ 0.27,  0.16,  0.26],
                                [ 0.24, -0.01,  0.11],
                                [-0.15, -0.27, -0.36]]).float()
encoder_layer.linear1.weight = nn.Parameter(linear1_weight)

# ---------------------  linear2.weight  --------
linear2_weight  = torch.tensor([[ 0.36, -0.30,  0.29,  0.06],
                                [-0.20, -0.25, -0.02, -0.10],
                                [ 0.14,  0.03, -0.30, -0.23]]).float()
encoder_layer.linear2.weight = nn.Parameter(linear2_weight)

# ---------------------  norm1.weight  --------
norm1_weight  = torch.tensor([1., 1., 1.]).float()
encoder_layer.norm1.weight = nn.Parameter(norm1_weight)

# ---------------------  norm2.weight  --------
norm2_weight  = torch.tensor([1., 1., 1.]).float()
encoder_layer.norm2.weight = nn.Parameter(norm2_weight)


'''
# Iterate through the parameters 
for name, param in encoder_layer.named_parameters():
    print(f"{name}: {param.size()}")
    print(param.shape)
    print(param)
    print()
'''

# run for a sample
src = torch.Tensor([[[ 0.69,  0.72, -1.41],
                     [ 0.21,  1.10, -1.31]]])
out = encoder_layer(src)
print(out)

tensor([[[ 0.9753,  0.3992, -1.3745],
         [ 0.5835,  0.8239, -1.4074]]], grad_fn=<NativeLayerNormBackward0>)


In [7]:
# Recompute encoder_layer(src) one sub-block at a time.

# 1. self._sa_block(x): self-attention with query = key = value = src.
#    Index [0] keeps the attention output and drops the attention weights.
sa_out = encoder_layer.self_attn(src, src, src)[0]
print(sa_out)

# 2. self.norm1(x + self._sa_block(x)): residual connection, then LayerNorm.
x1 = encoder_layer.norm1(src + sa_out)
print(x1)

# 3. self._ff_block(x): linear1 -> ReLU -> linear2.
ff_out = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))

# 4. self.norm2(x + self._ff_block(x))
x = encoder_layer.norm2(ff_out + x1)

# The hand-rolled result must agree with the layer's own forward pass;
# checking it here avoids comparing printed tensors by eye.
torch.testing.assert_close(x, encoder_layer(src))
print(x)

tensor([[[ 0.7262,  0.1776, -0.0190],
         [ 0.7279,  0.1864, -0.0325]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.9063,  0.4871, -1.3933],
         [ 0.5523,  0.8513, -1.4036]]], grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 0.9753,  0.3992, -1.3745],
         [ 0.5835,  0.8239, -1.4074]]], grad_fn=<NativeLayerNormBackward0>)


## Masked Encoder

In [9]:
import torch.nn as nn
import torch

# Initialize the TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True, 
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

# ---------------------  self_attn.in_proj_weight  --------
in_proj_weight  = torch.tensor([[ 0.01,  0.33, -0.35],
                                [-0.53,  0.46, -0.22],
                                [-0.13, -0.67,  0.28],
                                [ 0.64, -0.32, -0.59],
                                [ 0.59, -0.43,  0.26],
                                [-0.41,  0.68,  0.22],
                                [ 0.52,  0.02, -0.57],
                                [ 0.60, -0.52,  0.56],
                                [-0.08, -0.29, -0.01]]).float()
encoder_layer.self_attn.in_proj_weight = nn.Parameter(in_proj_weight)

# ---------------------  self_attn.out_proj.weight  --------
out_proj_weight  = torch.tensor([[ 0.45, -0.14, -0.43],
                                 [ 0.41,  0.21,  0.25],
                                 [-0.45, -0.52,  0.04]]).float()
encoder_layer.self_attn.out_proj.weight = nn.Parameter(out_proj_weight)

# ---------------------  linear1.weight  --------
linear1_weight  = torch.tensor([[ 0.13, -0.38, -0.21],
                                [ 0.27,  0.16,  0.26],
                                [ 0.24, -0.01,  0.11],
                                [-0.15, -0.27, -0.36]]).float()
encoder_layer.linear1.weight = nn.Parameter(linear1_weight)

# ---------------------  linear2.weight  --------
linear2_weight  = torch.tensor([[ 0.36, -0.30,  0.29,  0.06],
                                [-0.20, -0.25, -0.02, -0.10],
                                [ 0.14,  0.03, -0.30, -0.23]]).float()
encoder_layer.linear2.weight = nn.Parameter(linear2_weight)

# ---------------------  norm1.weight  --------
norm1_weight  = torch.tensor([1., 1., 1.]).float()
encoder_layer.norm1.weight = nn.Parameter(norm1_weight)

# ---------------------  norm2.weight  --------
norm2_weight  = torch.tensor([1., 1., 1.]).float()
encoder_layer.norm2.weight = nn.Parameter(norm2_weight)

In [10]:
# Causal (look-ahead) mask for a length-3 sequence: entry [i, j] is True
# exactly when column j lies strictly to the right of row i.
positions = torch.arange(3)
mask = positions.unsqueeze(0) > positions.unsqueeze(1)
print(mask)

tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])


In [11]:
# Feed one three-token sample through the layer with the causal mask applied.
sample_rows = [[ 0.69,  0.72, -1.41],
               [ 0.21,  1.10, -1.31],
               [-0.88,  0.60, -0.31]]
src = torch.tensor([sample_rows])  # leading axis of size 1 is the batch
out = encoder_layer(src, src_mask=mask)
print(out)

tensor([[[ 0.9379,  0.4478, -1.3856],
         [ 0.5835,  0.8239, -1.4074],
         [-0.8576,  1.4026, -0.5450]]], grad_fn=<NativeLayerNormBackward0>)


In [12]:
# Recompute encoder_layer(src, src_mask=mask) one sub-block at a time.

# 1. self._sa_block(x): masked self-attention; [0] keeps the attention
#    output and drops the attention weights.
sa_out = encoder_layer.self_attn(src, src, src, attn_mask=mask)[0]
print(sa_out)

# 2. self.norm1(x + self._sa_block(x)): residual connection, then LayerNorm.
x1 = encoder_layer.norm1(src + sa_out)
print(x1)

# 3. self._ff_block(x): linear1 -> ReLU -> linear2.
ff_out = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))

# 4. self.norm2(x + self._ff_block(x))
x = encoder_layer.norm2(ff_out + x1)

# Guard the equivalence with an assertion rather than by comparing the
# printed tensors visually.
torch.testing.assert_close(x, encoder_layer(src, src_mask=mask))
print(x)

tensor([[[ 0.7421,  0.2626, -0.1496],
         [ 0.7279,  0.1864, -0.0325],
         [ 0.5771,  0.0535,  0.1394]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.8708,  0.5296, -1.4004],
         [ 0.5523,  0.8513, -1.4036],
         [-0.8576,  1.4026, -0.5450]]], grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 0.9379,  0.4478, -1.3856],
         [ 0.5835,  0.8239, -1.4074],
         [-0.8576,  1.4026, -0.5450]]], grad_fn=<NativeLayerNormBackward0>)


## Transformer

In [1]:
import torch.nn as nn
import torch

# Encoder
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

# test
src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])
context = encoder_layer(src)
print(context)

tensor([[[-0.8104, -0.5985,  1.4089],
         [-1.0062, -0.3575,  1.3637]]], grad_fn=<NativeLayerNormBackward0>)


In [3]:
# Decoder layer with the same tiny dimensions as the encoder layer.
decoder_config = dict(
    d_model=3,
    nhead=1,
    dim_feedforward=4,
    dropout=0.0,
    bias=False,
    batch_first=True,
)
decoder_layer = nn.TransformerDecoderLayer(**decoder_config)

# test: four target tokens attend to the encoder output `context`
target_rows = [[0.3516, 0.9509, 0.2771],
               [0.1993, 0.0177, 0.2628],
               [0.0774, 0.5253, 0.6413],
               [0.6749, 0.5501, 0.1641]]
target = torch.tensor([target_rows])
out = decoder_layer(target, context)
print(out)

tensor([[[ 0.6339,  0.7778, -1.4118],
         [ 1.3819, -0.9513, -0.4305],
         [-1.3955,  0.8962,  0.4993],
         [ 1.2710, -0.0984, -1.1726]]], grad_fn=<NativeLayerNormBackward0>)


In [4]:
# Recompute the encoder output `context` step by step.

# 1. self._sa_block(x): [0] keeps the attention output only.
sa_out = encoder_layer.self_attn(src, src, src)[0]

# 2. self.norm1(x + self._sa_block(x))
x1 = encoder_layer.norm1(src + sa_out)

# 3. self._ff_block(x): linear1 -> ReLU -> linear2.
ff_out = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))

# 4. self.norm2(x + self._ff_block(x))
x = encoder_layer.norm2(ff_out + x1)

# Fail loudly if the manual computation drifts from the layer's forward pass.
torch.testing.assert_close(x, encoder_layer(src))
print(x)

tensor([[[-0.8104, -0.5985,  1.4089],
         [-1.0062, -0.3575,  1.3637]]], grad_fn=<NativeLayerNormBackward0>)


In [None]:
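# Sub-blocks applied in order by the decoder layer (reproduced by hand in the next cell):
# x = self.norm1( x + self._sa_block(x) )            # self-attention over the target
# x = self.norm2( x + self._mha_block(x, memory) )   # attention over the encoder output (memory)
# x = self.norm3( x + self._ff_block(x) )            # feed-forward block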

In [5]:
# Recompute decoder_layer(target, context) one sub-block at a time.

# 1. self._sa_block(x): self-attention over the target. No mask is passed
#    here, so every target position can attend to every other position.
sa_out = decoder_layer.self_attn(target, target, target)[0]
print(sa_out.shape)

# 2. self.norm1(x + self._sa_block(x))
x1 = decoder_layer.norm1(target + sa_out)
print(x1.shape)

# 3. self.norm2(x + self._mha_block(x, memory)): queries come from the
#    decoder state, keys and values from the encoder output `context`.
cross_out = decoder_layer.multihead_attn(x1, context, context)[0]
x2 = decoder_layer.norm2(cross_out + x1)

# 4. self.norm3(x + self._ff_block(x)): linear1 -> ReLU -> linear2.
ff_out = decoder_layer.linear2(torch.relu(decoder_layer.linear1(x2)))
x = decoder_layer.norm3(ff_out + x2)

# The manual result must match the layer's own forward pass.
torch.testing.assert_close(x, decoder_layer(target, context))
print(x)

torch.Size([1, 4, 3])
torch.Size([1, 4, 3])
tensor([[[ 0.6339,  0.7778, -1.4118],
         [ 1.3819, -0.9513, -0.4305],
         [-1.3955,  0.8962,  0.4993],
         [ 1.2710, -0.0984, -1.1726]]], grad_fn=<NativeLayerNormBackward0>)


In [None]:
tensor([[[-0.3143,  1.3513, -1.0370],
         [ 1.0661, -1.3378,  0.2717],
         [-1.3103,  0.1943,  1.1160],
         [ 1.0749,  0.2584, -1.3333]]], grad_fn=<NativeLayerNormBackward0>)

## Masked Transformer

In [42]:
# Encoder
encoder_layer = nn.TransformerEncoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)

decoder_layer = nn.TransformerDecoderLayer(d_model=3, 
                                           nhead=1, 
                                           batch_first=True,
                                           dim_feedforward=4,
                                           dropout=0.0, 
                                           bias=False)


# test
src = torch.Tensor([[[0.48, 0.44, 0.71],
                     [0.65, 0.80, 0.79]]])

target = torch.Tensor( [[[0.3516, 0.9509, 0.2771],
                         [0.1993, 0.0177, 0.2628],
                         [0.0774, 0.5253, 0.6413],
                         [0.6749, 0.5501, 0.1641]]])

context = encoder_layer(src)
mask = torch.triu(torch.ones(4, 4), diagonal=1).bool()  
out = decoder_layer(tgt, context, tgt_mask=mask)
print(out)

tensor([[[-1.1026,  1.3182, -0.2156],
         [-0.8696, -0.5311,  1.4006],
         [-1.3835,  0.4381,  0.9454],
         [ 0.0779,  1.1839, -1.2618]]], grad_fn=<NativeLayerNormBackward0>)


In [44]:
# Recompute the masked encoder -> decoder pipeline by hand and check each half
# against the corresponding layer's own forward pass.

# encoder_layer: self-attention -> add & norm -> feed-forward -> add & norm
enc_sa = encoder_layer.self_attn(src, src, src)[0]
x1 = encoder_layer.norm1(src + enc_sa)
enc_ff = encoder_layer.linear2(torch.relu(encoder_layer.linear1(x1)))
context = encoder_layer.norm2(enc_ff + x1)
torch.testing.assert_close(context, encoder_layer(src))

# decoder_layer: masked self-attention -> attention over `context` -> feed-forward,
# each followed by a residual connection and LayerNorm.
mask = torch.triu(torch.ones(4, 4), diagonal=1).bool()
dec_sa = decoder_layer.self_attn(target, target, target, attn_mask=mask)[0]
x1 = decoder_layer.norm1(target + dec_sa)
dec_cross = decoder_layer.multihead_attn(x1, context, context)[0]
x2 = decoder_layer.norm2(dec_cross + x1)
dec_ff = decoder_layer.linear2(torch.relu(decoder_layer.linear1(x2)))
x = decoder_layer.norm3(dec_ff + x2)

# Compare with a freshly computed reference rather than a value left over
# from an earlier cell, so the check cannot pass or fail on stale state.
torch.testing.assert_close(x, decoder_layer(target, context, tgt_mask=mask))
print(x)

tensor([[[-1.1026,  1.3182, -0.2156],
         [-0.8696, -0.5311,  1.4006],
         [-1.3835,  0.4381,  0.9454],
         [ 0.0779,  1.1839, -1.2618]]], grad_fn=<NativeLayerNormBackward0>)


In [None]:
tensor([[[-1.1026,  1.3182, -0.2156],
         [-0.8696, -0.5311,  1.4006],
         [-1.3835,  0.4381,  0.9454],
         [ 0.0779,  1.1839, -1.2618]]], grad_fn=<NativeLayerNormBackward0>)