# Embedding层

## Token Embedding

In [None]:
import torch
from torch import nn

class TockenEmbedding(nn.Module):
    """Map integer token ids to learned dense vectors.

    Args:
        vocab_size: Number of distinct token ids in the vocabulary.
        hidden_size: Width of the vector each token id is mapped to
            (the embedding dimension).
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # One trainable row of width ``hidden_size`` per vocabulary entry.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
        )

    def forward(self, x):
        """Return the embedding rows selected by the token ids in ``x``.

        The result has the shape of ``x`` with a trailing ``hidden_size``
        axis appended.
        """
        return self.embedding(x)

def test_tocken_embedding():
    """Smoke-test ``TockenEmbedding`` by printing its input and output shapes."""
    vocab_size, hidden_size = 10000, 512  # vocabulary size / embedding width
    batch_size, seq_len = 2, 4

    # torch.randint draws random integers in [0, vocab_size - 1] and returns a
    # tensor of shape (batch_size, seq_len).  (torch.randn, by contrast, samples
    # floats from the standard normal distribution: mean 0, variance 1.)
    token_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    embedder = TockenEmbedding(vocab_size, hidden_size)
    embedded = embedder(token_ids)

    print("Input shape:", token_ids.shape)
    print("Output shape:", embedded.shape)


if __name__ == "__main__":
    test_tocken_embedding()

Input shape: torch.Size([2, 4])
Output shape: torch.Size([2, 4, 512])


## Position Embedding

In [None]:
import math
import torch
from torch import nn

class PositionalEmbedding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_i)`` and odd feature columns hold
    ``cos(pos * w_i)``, where ``w_i = 10000 ** (-2i / hidden_size)``.

    Args:
        max_len: Number of positions for which encodings are precomputed.
        hidden_size: Feature width of the encodings; may be even or odd.
    """

    def __init__(self, max_len, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        position = torch.arange(0, max_len).unsqueeze(1).float()  # shape: (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2).float() * (-math.log(10000.0) / hidden_size)
        )  # shape: (ceil(hidden_size / 2),)
        angles = position * div_term  # shape: (max_len, ceil(hidden_size / 2))

        pe = torch.zeros(max_len, hidden_size)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd hidden_size there is one fewer odd column than even
        # columns, so trim the angles to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : hidden_size // 2])

        # Registered as a buffer (not a parameter): it is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, seq_len, hidden_size): dimension 1 is used
        as the sequence length and the encoding is broadcast over dimension 0.
        """
        seq_len = x.size(1)
        x = x + self.pe[:seq_len, :].unsqueeze(0)  # added term: (1, seq_len, hidden_size)
        return x
    
def test_positional_embedding():
    """Smoke-test ``PositionalEmbedding`` by printing its input and output shapes."""
    max_len, hidden_size = 5000, 512
    batch_size, seq_len = 2, 4

    # Random stand-in for a batch of token embeddings.
    embeddings = torch.randn(batch_size, seq_len, hidden_size)

    encoder = PositionalEmbedding(max_len, hidden_size)
    encoded = encoder(embeddings)

    print("Input shape:", embeddings.shape)
    print("Output shape:", encoded.shape)


if __name__ == "__main__":
    test_positional_embedding()

    

# Encoder 层

In [None]:
import torch
from torch import nn

class EncoderLayer(nn.Module):
    """One Transformer encoder layer: self-attention then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    layer normalization (post-norm).

    Args:
        hidden_size: Feature width of the layer's input and output.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        ff_size: Inner width of the position-wise feed-forward network.
        dropout_prob: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, hidden_size, num_heads, ff_size, dropout_prob = 0.1):
        super().__init__()
        # MultiHeadAttention is defined outside this block.
        self.multi_head_attention = MultiHeadAttention(hidden_size, num_heads)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.layer_norm1 = nn.LayerNorm(hidden_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            # Project back to hidden_size so the residual add and LayerNorm
            # below receive tensors of matching width.
            nn.Linear(ff_size, hidden_size)
        )

        self.dropout2 = nn.Dropout(dropout_prob)
        self.layer_norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x, attention_mask = None):
        """Apply the layer to ``x`` and return a tensor of the same shape.

        ``attention_mask`` is forwarded unchanged to the attention module.
        """
        # Multi-head attention sub-layer.
        attn_output = self.multi_head_attention(x, attention_mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layer_norm1(x + attn_output)  # residual connection

        # Feed-forward sub-layer.
        ff_output = self.feed_forward(out1)
        ff_output = self.dropout2(ff_output)
        out2 = self.layer_norm2(out1 + ff_output)  # residual connection

        return out2
    
def main():
    """Run one ``EncoderLayer`` on random input and print the tensor shapes."""
    batch_size, seq_len = 2, 4
    hidden_size, num_heads, ff_size = 512, 8, 2048

    # Random stand-in for a batch of embedded sequences.
    inputs = torch.randn(batch_size, seq_len, hidden_size)

    layer = EncoderLayer(hidden_size, num_heads, ff_size)
    result = layer(inputs)

    print("Input shape:", inputs.shape)
    print("Output shape:", result.shape)


if __name__ == "__main__":
    main()

# Decoder 层

In [None]:
import torch
from torch import nn

class DecoderLayer(nn.Module):
    """One Transformer decoder layer.

    Three sub-layers run in order: self-attention, encoder-decoder attention
    and a position-wise feed-forward network. Each is followed by dropout,
    a residual connection and layer normalization (post-norm).

    Args:
        hidden_size: Feature width of the layer's input and output.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        ff_size: Inner width of the position-wise feed-forward network.
        dropout_prob: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, hidden_size, num_heads, ff_size, dropout_prob = 0.1):
        super().__init__()
        # MultiHeadAttention is defined outside this block.
        # The attribute name keeps its original spelling so existing
        # state_dict keys stay valid.
        self.self_attenion = MultiHeadAttention(hidden_size, num_heads)
        self.dropout1 = nn.Dropout(dropout_prob)
        self.layer_norm1 = nn.LayerNorm(hidden_size)

        self.encoder_decoder_attention = MultiHeadAttention(hidden_size, num_heads)
        self.dropout2 = nn.Dropout(dropout_prob)
        self.layer_norm2 = nn.LayerNorm(hidden_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, ff_size),
            nn.ReLU(),
            # Project back to hidden_size so the residual add and LayerNorm
            # below receive tensors of matching width.
            nn.Linear(ff_size, hidden_size)
        )
        self.dropout3 = nn.Dropout(dropout_prob)
        self.layer_norm3 = nn.LayerNorm(hidden_size)

    def forward(self, x, encoder_output, self_attention_mask = None, encoder_attention_mask = None):
        """Apply the layer to ``x`` and return a tensor of the same shape.

        Args:
            x: Decoder-side input to this layer.
            encoder_output: Encoder result passed to the encoder-decoder attention.
            self_attention_mask: Forwarded to the self-attention module.
            encoder_attention_mask: Forwarded to the encoder-decoder attention module.
        """
        # Self-attention sub-layer.
        self_attn_output = self.self_attenion(x, self_attention_mask)
        self_attn_output = self.dropout1(self_attn_output)
        out1 = self.layer_norm1(x + self_attn_output)

        # Encoder-decoder attention sub-layer.
        # NOTE(review): self-attention above is called as (x, mask) while this
        # call passes (query, encoder_output, mask); confirm that
        # MultiHeadAttention.forward supports both call shapes.
        enc_dec_attn_output = self.encoder_decoder_attention(out1, encoder_output, encoder_attention_mask)
        enc_dec_attn_output = self.dropout2(enc_dec_attn_output)
        out2 = self.layer_norm2(out1 + enc_dec_attn_output)

        # Feed-forward sub-layer.
        ff_output = self.feed_forward(out2)
        ff_output = self.dropout3(ff_output)
        out3 = self.layer_norm3(out2 + ff_output)

        return out3
    
def main():
    """Run one ``DecoderLayer`` on random inputs and print the tensor shapes."""
    batch_size = 2
    seq_len = 4
    hidden_size = 512
    num_heads = 8
    ff_size = 2048

    # Random stand-ins for the decoder-side input and the encoder result.
    x = torch.randn(batch_size, seq_len, hidden_size)
    encoder_output = torch.randn(batch_size, seq_len, hidden_size)

    # This demo exercises the decoder layer, so build a DecoderLayer and pass
    # encoder_output as its second positional argument.
    decoder_layer = DecoderLayer(hidden_size, num_heads, ff_size)

    output = decoder_layer(x, encoder_output)

    print("Input shape:", x.shape)
    print("Encoder output shape:", encoder_output.shape)
    print("Output shape:", output.shape)


if __name__ == "__main__":
    main()

# 堆叠 Encoder

In [None]:
import torch
from torch import nn

class Encoder(nn.Module):
    """A stack of ``num_layers`` identically configured ``EncoderLayer`` modules.

    Args:
        hidden_size: Feature width passed to every layer.
        num_heads: Number of attention heads passed to every layer.
        ff_size: Feed-forward inner width passed to every layer.
        num_layers: How many layers to stack.
        dropout_prob: Dropout probability passed to every layer.
    """

    def __init__(self, hidden_size, num_heads, ff_size, num_layers, dropout_prob = 0.1):
        super().__init__()
        stack = nn.ModuleList()
        for _ in range(num_layers):
            stack.append(EncoderLayer(hidden_size, num_heads, ff_size, dropout_prob))
        self.layers = stack

    def forward(self, x, attention_mask = None):
        """Feed ``x`` through every layer in order, giving each the same mask."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, attention_mask)
        return hidden
    
def main():
    """Run a 6-layer ``Encoder`` on random input and print the tensor shapes."""
    batch_size, seq_len = 2, 4
    hidden_size, num_heads, ff_size, num_layers = 512, 8, 2048, 6

    # Random stand-in for a batch of embedded sequences.
    inputs = torch.randn(batch_size, seq_len, hidden_size)

    stack = Encoder(hidden_size, num_heads, ff_size, num_layers)
    result = stack(inputs)

    print("Input shape:", inputs.shape)
    print("Output shape:", result.shape)


if __name__ == "__main__":
    main()

# 堆叠 Decoder

In [None]:
import torch
from torch import nn

class Decoder(nn.Module):
    """A stack of ``num_layers`` identically configured ``DecoderLayer`` modules.

    Args:
        hidden_size: Feature width passed to every layer.
        num_heads: Number of attention heads passed to every layer.
        ff_size: Feed-forward inner width passed to every layer.
        num_layers: How many layers to stack.
        dropout_prob: Dropout probability passed to every layer. Defaults to
            0.1, matching ``Encoder``, so it may be omitted.
    """

    def __init__(self, hidden_size, num_heads, ff_size, num_layers, dropout_prob = 0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            DecoderLayer(hidden_size, num_heads, ff_size, dropout_prob)
            for _ in range(num_layers)
        ])

    def forward(self, x, encoder_output, self_attention_mask = None, encoder_decoder_mask = None):
        """Feed ``x`` through every layer in order.

        Every layer receives the same ``encoder_output`` and the same two masks.
        """
        for layer in self.layers:
            x = layer(x, encoder_output, self_attention_mask, encoder_decoder_mask)

        return x

def main():
    """Run a 6-layer ``Decoder`` on random inputs and print the tensor shapes."""
    batch_size, seq_len = 2, 4
    hidden_size, num_heads, ff_size, num_layers = 512, 8, 2048, 6

    # Random stand-ins for the decoder-side input and the encoder result.
    inputs = torch.randn(batch_size, seq_len, hidden_size)
    memory = torch.randn(batch_size, seq_len, hidden_size)

    # NOTE(review): no dropout_prob is passed here, so this call relies on
    # Decoder providing a default for that parameter.
    stack = Decoder(hidden_size, num_heads, ff_size, num_layers)
    result = stack(inputs, memory)

    print("Input shape:", inputs.shape)
    print("Encoder output shape:", memory.shape)
    print("Output shape:", result.shape)


if __name__ == "__main__":
    main()

# Transformer

In [None]:
import torch
from torch import nn

class Transformer(nn.Module):
    """Encoder-decoder Transformer mapping token ids to vocabulary logits.

    Source and target ids share one token-embedding table and one positional
    encoding. The decoder output is projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        hidden_size: Model feature width.
        num_heads: Number of attention heads in every layer.
        ff_size: Feed-forward inner width in every layer.
        num_layers: Number of layers in both the encoder and the decoder.
        max_seq_len: Number of positions the positional encoding precomputes.
        dropout_prob: Dropout probability passed to the encoder and decoder.
    """

    def __init__(self, vocab_size, hidden_size, num_heads, ff_size, num_layers, max_seq_len, dropout_prob = 0.1):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.token_embedding = TockenEmbedding(vocab_size, hidden_size)
        # PositionalEmbedding takes (max_len, hidden_size), in that order.
        self.positional_embedding = PositionalEmbedding(max_seq_len, hidden_size)

        self.encoder = Encoder(hidden_size, num_heads, ff_size, num_layers, dropout_prob)
        self.decoder = Decoder(hidden_size, num_heads, ff_size, num_layers, dropout_prob)

        self.output_linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt, src_mask = None, tgt_mask = None, src_tgt_mask = None):
        """Return logits over the vocabulary for every target position.

        Args:
            src: Source token ids.
            tgt: Target token ids.
            src_mask: Forwarded to the encoder.
            tgt_mask: Forwarded to the decoder as its self-attention mask.
            src_tgt_mask: Forwarded to the decoder as its encoder-decoder mask.
        """
        # PositionalEmbedding.forward returns its input plus the encoding, so
        # it is applied to the token embeddings rather than to the raw ids.
        src_emb = self.positional_embedding(self.token_embedding(src))
        tgt_emb = self.positional_embedding(self.token_embedding(tgt))

        encoder_output = self.encoder(src_emb, src_mask)
        decoder_output = self.decoder(tgt_emb, encoder_output, tgt_mask, src_tgt_mask)

        output = self.output_linear(decoder_output)

        return output

def main():
    """Run a full ``Transformer`` on random token ids and print the tensor shapes."""
    # Model configuration.
    vocab_size, hidden_size = 1000, 512
    num_heads, ff_size, num_layers = 8, 2048, 6
    max_seq_len, dropout_prob = 100, 0.1

    # Batch layout.
    batch_size = 2
    src_seq_len = tgt_seq_len = 10

    # Random token ids drawn from [0, vocab_size - 1].
    source_ids = torch.randint(0, vocab_size, (batch_size, src_seq_len))
    target_ids = torch.randint(0, vocab_size, (batch_size, tgt_seq_len))

    model = Transformer(
        vocab_size, hidden_size, num_heads, ff_size, num_layers, max_seq_len, dropout_prob
    )
    logits = model(source_ids, target_ids)

    print("Source shape:", source_ids.shape)
    print("Target shape:", target_ids.shape)
    print("Output shape:", logits.shape)


if __name__ == "__main__":
    main()