In [14]:

def get_tokens_and_segments(tokens_a, tokens_b = None):
    #获取输入序列的词元及其索引
    tokens = ['<cls>'] + tokens_a + ['<sep>']
    #0和1分别标记第一段和第二段
    segments = [0] * ( len(tokens_a) + 2)

    #如果输入的是句子对，则需要扩充segments
    if tokens_b is not None:
        tokens += tokens_b + ['<sep>']
        segments +=  [1] * ( len (tokens_b) + 1) 
    return tokens, segments

In [1]:
import torch
from torch import nn
from d2l import torch as d2l
import random

In [16]:

class BERTEncoder(nn.Module):
    # BERT编码器
    def __init__(
            self,
            vocab_size,
            num_hiddens,
            norm_shape,
            ffn_num_input,
            ffn_num_hiddens,
            num_heads,
            num_layers,
            dropout,
            max_len = 1000,
            key_size = 768,
            query_size = 768,
            value_size = 768,
            **kwargs
    ):
        super(BERTEncoder, self).__init__(**kwargs)
        ##token 和 segement
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        self.segement_embedding = nn.Embedding(2, num_hiddens)
        self.blks = nn.Sequential()


        for i in range(num_layers):
            self.blks.add_module(f"{i}", d2l.EncoderBlock(
                key_size,
                query_size,
                value_size,
                num_hiddens,
                norm_shape,
                ffn_num_input,
                ffn_num_hiddens,
                num_heads,
                dropout,
                True
            ))


        #因为BERT中的位置编码是通过学习得到的，所以我们需要创建一个足够长的位置嵌入参数
        #注意到这里初始化使用的是随机数
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, num_hiddens))
    
    def forward(self, tokens, segments, valid_lens):
        #注意X的尺寸大小，始终为(批量大小， 最大序列长度， num_hiddens)
        X = self.token_embedding(tokens) + self.segement_embedding(segments)
        X = X +self.pos_embedding.data[: , :X.shape[1],: ]
        for blk in self.blks:
            X = blk(X, valid_lens)
            return X


In [17]:
vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024 ,4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens, num_heads, num_layers,dropout
                      )

AttributeError: module 'd2l.torch' has no attribute 'EncoderBlock'

In [None]:
def _replace_mlm_tokens(
        tokens, candidate_pred_positions, num_mlm_preds, vocab
):
    
    #为掩蔽语言模型的输入创建新的词元副本，其中输入可能包含替换的"<mask> 词元和随机词元

    mlm_input_tokens = [token for token in tokens]
    pred_positions_and_labels = []
    #打乱后用于掩蔽语言模型任务中获取15%的随机词元进行预测
    random.shuffle(candidate_pred_positions)
    for mlm_pred_position in candidate_pred_positions:
        if len(pred_positions_and_labels) >= num_mlm_preds:
            break
        masked_token = None
        # 80%的概率，把原有词元替换为<'mask'>词元
        if random.random() < 0.8:
            masked_token = '<mask>'
        else:
            #10%的概率，原有词元不发生任何改变
            if random.random() < 0.5:
                masked_token = tokens[mlm_pred_position]
            else:
                #10%的概率，用随机词元来替换
                #这里的设计存在一个问题，vocab如果非常大，读取困难
                masked_token = random.choice(vocab.idx_to_token)
        
        mlm_input_tokens[mlm_pred_position] = masked_token
        #这里保存了idx和单词，但是是15%的都保存了，
        pred_positions_and_labels.append(
            (mlm_pred_position, tokens[mlm_pred_position])
            )

        return mlm_input_tokens, pred_positions_and_labels


In [5]:
name ="panbo"
age = 18
mlm=[]
mlm.append((name, age))
mlm

[('panbo', 18)]

In [None]:
def _get_mlm_data_from_tokens(tokens, vocab):
    candidate_pred_positions = []
    # tokens 是一个字符串列表
    for i,token in enumerate(tokens):
        # 首先屏蔽掉分隔符
        if token in ['<cls>','<sep>']:
            continue

        candidate_pred_positions.append(i)
    ##MLM 掩蔽语言模型任务中预测15%的随机词元
    num_mlm_preds = max(1, round(len(tokens) * 0.15))  # 保证要预测的数据不为空
    mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens(
        tokens, candidate_pred_positions, num_mlm_preds, vocab
    ) 

    pred_positions_and_labels = sorted(pred_positions_and_labels,
                                       key= lambda x:x[0])
    
    pred_positions = [v[0] for v in pred_positions_and_labels]
    mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
    return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels]

    

In [7]:
pred_positions_and_labels = [(3, 'A'), (1, 'B'), (2, 'C')]
pred_positions_and_labels = sorted(pred_positions_and_labels,
                                       key= lambda x:x[0])
pred_positions_and_labels

[(1, 'B'), (2, 'C'), (3, 'A')]

In [9]:
# Example input
pred_positions_and_labels = [(3, 'A'), (1, 'B'), (2, 'C')]

# Extracting pred_positions
pred_positions = [v[0] for v in pred_positions_and_labels]
# Result: [3, 1, 2]

# Extracting mlm_pred_labels
mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
# Result: ['A', 'B', 'C']

numlist = []
signallist = []

for A,B in pred_positions_and_labels:
    numlist.append(A)
    signallist.append(B)

numlist
signallist

['A', 'B', 'C']

In [11]:
numlist

[3, 1, 2]

In [12]:
def _pad_bert_inputs(examples, max_len, vocab):
    max_num_mlm_preds = round(max_len * 0.15)
    all_token_ids, all_segments, valid_lens,  = [], [], []
    all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], []
    nsp_labels = []
    for (token_ids, pred_positions, mlm_pred_label_ids, segments,
         is_next) in examples:
        all_token_ids.append(torch.tensor(token_ids + [vocab['<pad>']] * (
            max_len - len(token_ids)), dtype=torch.long))
        all_segments.append(torch.tensor(segments + [0] * (
            max_len - len(segments)), dtype=torch.long))
        # valid_lens不包括'<pad>'的计数
        valid_lens.append(torch.tensor(len(token_ids), dtype=torch.float32))
        all_pred_positions.append(torch.tensor(pred_positions + [0] * (
            max_num_mlm_preds - len(pred_positions)), dtype=torch.long))
        # 填充词元的预测将通过乘以0权重在损失中过滤掉
        all_mlm_weights.append(
            torch.tensor([1.0] * len(mlm_pred_label_ids) + [0.0] * (
                max_num_mlm_preds - len(pred_positions)),
                dtype=torch.float32))
        all_mlm_labels.append(torch.tensor(mlm_pred_label_ids + [0] * (
            max_num_mlm_preds - len(mlm_pred_label_ids)), dtype=torch.long))
        nsp_labels.append(torch.tensor(is_next, dtype=torch.long))
    return (all_token_ids, all_segments, valid_lens, all_pred_positions,
            all_mlm_weights, all_mlm_labels, nsp_labels)

In [None]:

class _WikiTextDataset(torch.utils.data.Dataset):
    def __init__(self, paragraphs, max_len):
        # 输入paragraphs[i]是代表段落的句子字符串列表
        # 而输出paragraphs[i]是代表段落的句子列表，其中每个句子都是词元列表
        paragraphs = [d2l.tokenize(paragraph, token = 'word' ) for paragraph in paragraphs]
        sentences = [sentence for paragraph in paragraphs for sentence in paragraph]
        self.vocab = d2l.Vocab(sentences, min_freq = 5, reserved_tokens= [
            '<pad>',
            '<mask>',
            '<cls>',
            '<sep>'
        ])

        # 获取下一句预测任务的数据
        # 
        examples = [
            (_get_mlm_data_from_tokens(tokens, self.vocab) + (segments, is_next))
            for tokens, segments, is_next in examples
        ]

        # 填充输入
        (
            self.all_token_ids, self.all_segments, self.valib_lens,
            self.all_pred_positions, self.all_mlm_weights,
            self.all_mlm_labels,
            self.nsp_labels
        ) = _pad_bert_inputs(examples, max_len, self.vocab)

    
    def __getitem__(self, idx):
        return (
            self.all_token_ids[idx], self.all_segments[idx],
            self.valib_lens[idx], self.all_pred_positions[idx],
            self.all_mlm_weights[idx], self.all_mlm_labels[idx],
            self.nsp_labels[idx]
        )
    
    def __len__(self):
        return len(self.all_token_ids)


In [None]:
batch_size, max_len = 512, 64
train_iter, vocab = load_data()


In [None]:
batch_size, max_len = 512,64

net = d2l.BERTModel(
    len(vocab),
    num_hiddens = 128,
    norm_shape = [128],
    ffn_num_input = 128,
    ffn_num_hiddens = 256,
    num_heads = 2,
    num_layers = 2,
    dropout = 0.2,
    key_size = 128,
    query_size = 128,
    value_size = 128,
    hid_in_features = 128,
    mlm_in_features = 128,
    nsp_in_features = 128 
)



In [None]:
loss = nn.CrossEntropyLoss()

In [None]:
def _get_batch_loss_bert(
        net,
        loss,
        vocab_size,
        tokens_X,
        segments_X,
        valid_lens_x,
        pred_positions_X,
        mlm_weights_X,
        mlm_Y,
        nsp_y
):
    ###前向传播
    _, mlm_Y_hat, nsp_Y_hat = net(
        tokens_X,
        segments_X,
        valid_lens_x.reshape(-1),
        pred_positions_X
    )

    ## 计算掩蔽语言模型损失
    mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_Y.reshape(-1)) * mlm_weights_X.reshap(-1,1)
    

In [2]:
import numpy as np

# 假设你有一个包含1000个元素的一维数组
original_array = np.arange(1000)

# 设置vocab_size
vocab_size = 50

# 使用reshape对数组进行重新形状
reshaped_array = original_array.reshape(-1, vocab_size)

# 打印结果
print("原始数组形状:", original_array.shape)
print("重新形状后的数组形状:", reshaped_array.shape)


原始数组形状: (1000,)
重新形状后的数组形状: (20, 50)
