在PyTorch中使用Transformer变换单元

In [1]:
import torch
import torch.nn as nn

In [4]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Display which backend was selected.
device.type

'cuda'

In [3]:
# Build a single Transformer encoder layer: every input vector has 4 features
# (d_model) and the multi-head self-attention uses 2 heads (nhead).
encoder = nn.TransformerEncoderLayer(
    d_model=4,
    nhead=2,
    device=device,
)
encoder

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
  )
  (linear1): Linear(in_features=4, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=4, bias=True)
  (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

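通过观察结果可知`TransformerEncoderLayer`（单个编码器层）的结构为：  
- 1个多头自注意力层（`self_attn`）
- 两个线性层（`linear1`、`linear2`）组成前馈网络：先把4维向量映射到2048维，再映射回4维
- 两个层归一化层（`norm1`、`norm2`）：对每个4维向量做归一化（使其均值为0、方差为1，再做可学习的缩放和平移），输出仍然是4维向量
- 三个Dropout层：训练时随机丢弃一部分神经元的输出以防止过拟合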

使用`nn.TransformerEncoder`将`nn.TransformerEncoderLayer`堆叠成多层，从而创建完整的编码器

In [10]:
# Stack six layers patterned on `encoder` to form the full encoder.
encoders = nn.TransformerEncoder(
    encoder_layer=encoder,
    num_layers=6,
)
encoders

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
      )
      (linear1): Linear(in_features=4, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=4, bias=True)
      (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
      )
      (linear1): Linear(in_features=4, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=4, bias=True)


可以看到`encoders`变量的类型为`TransformerEncoder`，其内部的`layers`属性是一个`ModuleList`，即一个包含了多层编码器层的模块列表

In [11]:
# Create a random 2 x 3 x 4 input tensor on the selected device.
# NOTE(review): batch_first is left at its default above, so per the PyTorch
# docs the layout should be (seq_len, batch, d_model) -- confirm if it matters.
src = torch.rand((2, 3, 4), device=device)
src

tensor([[[0.4290, 0.6809, 0.2100, 0.5406],
         [0.3785, 0.8010, 0.8515, 0.6673],
         [0.4872, 0.2514, 0.2640, 0.5209]],

        [[0.3463, 0.2781, 0.8126, 0.1205],
         [0.1959, 0.1769, 0.4507, 0.8872],
         [0.2026, 0.6373, 0.6387, 0.8949]]], device='cuda:0')

In [12]:
# Run the input through the encoder stack; the result keeps the input's shape.
output = encoders(src=src)
output

tensor([[[-0.7686,  1.3061, -1.1516,  0.6141],
         [-1.4517,  0.1809, -0.0900,  1.3608],
         [-1.1507, -0.2531, -0.2000,  1.6037]],

        [[ 1.0709,  0.3352,  0.2330, -1.6391],
         [-0.8468, -1.0739,  0.5829,  1.3378],
         [-1.5669, -0.1624,  0.7569,  0.9724]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)

同理，我们可以使用`nn.TransformerDecoderLayer`和`nn.TransformerDecoder`来生成一个解码器

In [14]:
# Build a single decoder layer; its d_model / nhead settings should stay
# consistent with the encoder's.
decoder = nn.TransformerDecoderLayer(
    d_model=4,
    nhead=2,
    device=device,
)
decoder

TransformerDecoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
  )
  (linear1): Linear(in_features=4, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=4, bias=True)
  (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dropout3): Dropout(p=0.1, inplace=False)
)

In [15]:
# Stack six layers patterned on `decoder` to form the full decoder.
decoders = nn.TransformerDecoder(
    decoder_layer=decoder,
    num_layers=6,
)
decoders

TransformerDecoder(
  (layers): ModuleList(
    (0): TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=4, out_features=4, bias=True)
      )
      (linear1): Linear(in_features=4, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=4, bias=True)
      (norm1): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((4,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDyn

请注意：解码器需要两个输入——目标序列`tgt`（即解码器自身的输入）和编码器的输出`memory`。这里用一个与输入张量形状相同的随机张量来充当目标序列；它并不是用来存储解码器输出结果的缓冲区，解码结果由解码器的返回值给出

In [18]:
# Random 2 x 3 x 4 tensor, same shape as the encoder input; it is passed to
# the decoder as its first argument below.
outputs_decoder = torch.rand((2, 3, 4), device=device)
# Reuse the encoder stack's output as the decoder's memory.
memory = output
memory

tensor([[[-0.7686,  1.3061, -1.1516,  0.6141],
         [-1.4517,  0.1809, -0.0900,  1.3608],
         [-1.1507, -0.2531, -0.2000,  1.6037]],

        [[ 1.0709,  0.3352,  0.2330, -1.6391],
         [-0.8468, -1.0739,  0.5829,  1.3378],
         [-1.5669, -0.1624,  0.7569,  0.9724]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)

In [19]:
# Decode: `tgt` is the decoder's own input sequence, `memory` is the encoder
# output it attends to.
out = decoders(tgt=outputs_decoder, memory=memory)
out

tensor([[[-0.9508,  1.5854, -0.7538,  0.1191],
         [ 0.7385, -0.8961,  1.2276, -1.0699],
         [ 0.7304, -1.1609,  1.2218, -0.7913]],

        [[-1.2124,  1.4994, -0.4905,  0.2035],
         [ 0.7933, -1.3936,  1.0902, -0.4899],
         [ 0.9058, -1.1722,  1.0739, -0.8075]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)