In [1]:
import torch
from torch import nn
from dplearning_second_part.limu_dplearning.utils.useful_func import masked_softmax

In [2]:
import math
# 从0实现一个Encoderblock
#1、点积注意力
class DotProductAttention(nn.Module):
    def __init__(self, dropout=0.1,**kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
    def forward(self, q, k, v, valid_lens=None):
        #q.shape[-1]是静态维度值（整数）将其包装为张量是冗余操作
        # d_lens=torch.tensor(q.shape[-1],device=q.device)
        d_lens=q.shape[-1]
        #对于标量值，PyTorch会自动处理设备兼容性 所以不用显示todevice
        attention_scores=torch.matmul(q,k.transpose(-1,-2)) / math.sqrt(d_lens)
        self.attention_weights=masked_softmax(attention_scores, valid_lens)
        return torch.matmul(self.dropout(self.attention_weights),v)

class MultiHeadAttention(nn.Module):
    def __init__(self,key_size,query_size,value_size,hidden_size,num_heads,dropout=0.1,bias=False,**kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        assert hidden_size%num_heads==0,'整除条件不满足！'
        # 三个调整size的 全连接
        # 易错点 这里的全连接层都是没有偏置项 因为后续会有layer_normal 即使添加偏置项后续也会在减均值的过程中被吸收掉
        #         一个更广义的规则：
        # 如果一个线性层（或卷积层）的输出紧接着一个归一化层（Batch Norm, Layer Norm, Instance Norm, Group Norm），那么这个线性层/卷积层中的偏置项就是冗余的，通常会将其设置为 False。
        self.W_q=nn.Linear(query_size,hidden_size,bias=bias)
        self.W_k=nn.Linear(key_size,hidden_size,bias=bias)
        self.W_v=nn.Linear(value_size,hidden_size,bias=bias)
        # 最终输出用的全连接
        self.W_o=nn.Linear(hidden_size,hidden_size,bias=bias)
        # 注意力函数
        self.attention=DotProductAttention(dropout=dropout)
        # 头数
        self.num_heads=num_heads
        # 隐藏层数
        self.hidden_size=hidden_size


    def forward(self,q,k,v,valid_lens=None):
        #调整qkv最后一层
        # reshape出头数 并放在第二各维度 避免影响遮掩的softmax
        # 错了一个地方 self.hidden_size/self.num_heads结果默认是浮点即使结果是整数 reshape无法接受浮点 因此要用//
        # q_temp=self.W_q(q).reshape(q.shape[0],q.shape[1],self.num_heads,self.hidden_size/self.num_heads).permute(0,2,1,3)
        q_temp=self.W_q(q).reshape(q.shape[0],q.shape[1],self.num_heads,self.hidden_size//self.num_heads).permute(0,2,1,3)
        k_temp=self.W_k(k).reshape(k.shape[0],k.shape[1],self.num_heads,self.hidden_size//self.num_heads).permute(0,2,1,3)
        v_temp=self.W_v(v).reshape(v.shape[0],v.shape[1],self.num_heads,self.hidden_size//self.num_heads).permute(0,2,1,3)

        # 转为三维 将 1 2维度合并
        q_temp=q_temp.reshape(-1,q.shape[1],self.hidden_size//self.num_heads)
        k_temp=k_temp.reshape(-1,k.shape[1],self.hidden_size//self.num_heads)
        v_temp=v_temp.reshape(-1,v.shape[1],self.hidden_size//self.num_heads)

        if valid_lens is not None:
        # 这里很重要有一个知识点 看上面 其实是在batch_size 后增加了一个维度num_head 然后又reshape成batch_size*num_heads
        # 这跟torch和numpy的存储方式有关系 contiguous (行主序)  当然也正是这种存储方式才使得我们要把num_heads 挪到第二维
        # 由于每一个batch下增加的多个num_heads 其实都是归属在这个样本下的不同的注意力头的结果 对于这个样本其实他的valid_lens是不变的 也需要重复num_heads次
        # 所以对于valid_lens 最简单的做法就是复制num_head次就行 所以使用repeat_interleave
        # 当valid_lens 为2d明显要在batch_size维度进行复制，dim=0
        # 当valid_lens为1维时，维度大小=batch_size 这跟我们实现的masked_softmax函数有关 显然也是在batch_size维度复制 所以无论valid_lens为多少维度 都是在dim=0维复制
            valid_lens=valid_lens.repeat_interleave(self.num_heads,dim=0)


        attention_result_total=self.attention(q_temp,k_temp,v_temp,valid_lens)
        outputs=attention_result_total.reshape(q.shape[0],self.num_heads,q.shape[1],-1).permute(0,2,1,3).reshape(q.shape[0],q.shape[1],-1)
        return self.W_o(outputs)

class PositionalEncoding(nn.Module):
    def __init__(self,max_len,hidden_size,dropout=0.1,**kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.P=torch.zeros(1,max_len,hidden_size)
        # 易错点这里建议不用除法， 直接 ：：2 否则少一个
        self.temp=torch.arange(1,max_len+1).unsqueeze(1)/(torch.pow(10000,torch.arange(0,hidden_size,2)/hidden_size))
        #1,2 用 1位置  如果一共只有3个 那就是 只有
        self.P[:,:,0::2]=torch.sin(self.temp)
        self.P[:,:,1::2]=torch.cos(self.temp)

    def forward(self,x):
        # 注意p和x在第二个维度不一定一样,device也不一定一样
        x = x + self.P[:,:x.shape[1],:].to(x.device)
        return self.dropout(x)

class AddNorm(nn.Module):
    def __init__(self,norm_shape,dropout=0.1,**kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.norm=nn.LayerNorm(norm_shape)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x,y):
        return self.norm(x+self.dropout(y))

class PositionWiseFFN(nn.Module):
    def __init__(self,ffninput_size,ffnhidden_size,ffnoutput_size,**kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Linear(ffninput_size,ffnhidden_size)
        self.relu=nn.ReLU()
        self.dense2=nn.Linear(ffnhidden_size,ffnoutput_size)
    def forward(self,x):
        x_temp = self.relu(self.dense1(x))
        return self.dense2(x_temp)

class EncoderBlock(nn.Module):
    def __init__(self,key_size,query_size,value_size,hidden_size,num_heads,norm_shape,ffninput_size,ffnhidden_size,dropout=0.1,bias=False,**kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        # 位置编码 max=1000 hidden_size 和query的size一样 不是在块里完成的
        # self.position_enc = PositionalEncoding(1000,query_size,dropout=dropout)
        # 多头自注意力key_size,query_size,value_size,hidden_size这四个应该是全都相等
        self.attention=MultiHeadAttention(key_size,query_size,value_size,hidden_size,num_heads,dropout=dropout,bias=bias)
        #位置前馈 ffninput_size=ffnoutput_size=hidden_size
        self.position_ffn=PositionWiseFFN(ffninput_size,ffnhidden_size,hidden_size,**kwargs)
        # norm_shape = (l,hidden_size)
        self.add_norm=AddNorm(norm_shape,dropout=dropout)

    def forward(self,x_position,valid_lens=None):
        y_attention=self.attention(x_position,x_position,x_position,valid_lens=valid_lens)
        x_first=self.add_norm(x_position,y_attention)
        return self.add_norm(x_first,self.position_ffn(x_first))


In [3]:
x=torch.ones((2,100,24))
valid_lens=torch.tensor([3,2])

In [4]:
encoder_blk=EncoderBlock(key_size=24,query_size=24,value_size=24,hidden_size=24,num_heads=8,norm_shape=[100,24],ffninput_size=24,ffnhidden_size=48,dropout=0.5)
encoder_blk.eval()
encoder_blk(x,valid_lens)

tensor([[[-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         ...,
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350]],

        [[-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         ...,
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350],
         [-0.1752, -0.7221, -1.1955,  ..., -0.2713,  0.0357, -0.8350]]],
       grad_fn=<NativeLayerNormBackward0>)

In [5]:
class TransformerEncoder(nn.Module):
    def __init__(self,vocab_size,key_size,query_size,value_size,hidden_size,num_head,norm_shape,
                 num_layers,ffninput_size,ffnhidden_size,dropout=0.1,bias=False,*args):
        super(TransformerEncoder, self).__init__(*args)
        self.hidden_size=hidden_size
        self.embedding = nn.Embedding(vocab_size,hidden_size)
        self.position_embedding = PositionalEncoding(1000,hidden_size,dropout=dropout)
        self.blks=nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module(f'{i}'+'blk'
                                 ,EncoderBlock(hidden_size,hidden_size,hidden_size,hidden_size,num_head,norm_shape,ffninput_size,ffnhidden_size,dropout=dropout,bias=bias))
    def forward(self,x,valid_lens=None):
        x = self.embedding(x)
        # torch.sqrt的输入必须是tensor
        # 当一个 torch.Tensor 与一个 Python 标量进行算术运算（如加、减、乘、除）时，PyTorch 会自动将该标量广播 (broadcast) 到张量的所有元素上，并进行操作。
        x_position=self.position_embedding(x*torch.sqrt(torch.tensor(self.hidden_size)))
        self.attention_weights=[None]*len(self.blks)
        # 易错点 这个地方不能这么写因为 X valid_lens是两个参数 sequential只支持一个参数传递
        # return self.blks(x_position,valid_lens)
        for num,module in enumerate(self.blks):
            x_position=module(x_position,valid_lens=valid_lens)
            self.attention_weights[num]=module.attention.attention.attention_weights
        return x_position

In [6]:
encoder = TransformerEncoder(
    200, 24, 24, 24, 24,2, [100, 24],4, 24, 48, 0.5)
encoder.eval()
encoder(torch.ones((2, 100), dtype=torch.long), valid_lens).shape

torch.Size([2, 100, 24])

In [8]:
class BERTEncoder(nn.Module):
    """BERT编码器"""
    def __init__(self,vocab_size,hidden_size,num_head,norm_shape,ffninput_size
                 ,ffnhidden_size,num_layers,dropout=0.1,bias=False,max_lens=1000,key_size=768,query_size=768,value_size=768,**kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.hidden_size=hidden_size
        self.token_embedding = nn.Embedding(vocab_size,hidden_size)
        self.segment_embedding = nn.Embedding(2,hidden_size)
        self.blks=nn.ModuleList()
        for i in range(num_layers):
            self.blks.add_module(f'{i}'+'blk',EncoderBlock(key_size,query_size,value_size
                    ,hidden_size,num_head,norm_shape,ffninput_size,ffnhidden_size,dropout=dropout,bias=bias))
        # 可学习的位置参数
        # 在BERT中，位置嵌入是可学习的，因此我们创建一个足够长的位置嵌入参数
        self.position_embedding = nn.Parameter(torch.randn(1,max_lens,hidden_size))
    def forward(self,tokens,segments,valid_lens=None):
        tokens,segments=self.token_embedding(tokens),self.segment_embedding(segments)
        x=tokens+segments+self.position_embedding.repeat(tokens.shape[0],1,1)[:,:tokens.shape[1],:]
        for i,blk in enumerate(self.blks):
            x=blk(x,valid_lens=valid_lens)
        return x

In [9]:
vocab_size, hidden_size, ffnhidden_size, num_heads = 10000, 768, 1024, 4
norm_shape, ffninput_size, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, hidden_size, num_heads, norm_shape, ffninput_size,
                      ffnhidden_size, num_layers, dropout)

In [10]:
tokens = torch.randint(0, vocab_size, (2, 8))
segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 1]])
encoded_X = encoder(tokens, segments, None)
encoded_X.shape

torch.Size([2, 8, 768])

In [31]:
class MaskLM(nn.Module):
    """BERT的掩蔽语言模型任务"""
    def __init__(self,vocab_size,hidden_size,inputs_size=768,**kwargs):
        super(MaskLM, self).__init__(**kwargs)
        self.mlp=nn.Sequential(nn.Linear(inputs_size,hidden_size),
                               nn.ReLU(),
                               nn.LayerNorm(hidden_size),
                               nn.Linear(hidden_size,vocab_size),
                               )
    def forward(self, X, pred_positions):
        # 每个样本要预测的个数
        num_pred_positions = pred_positions.shape[1]
        
        # 把idx展平 以便作为第二维度
        pred_positions = pred_positions.reshape(-1)
        
        # 获取第一个维度
        batch_size = X.shape[0]
        batch_idx = torch.arange(batch_size)
        # 要给每个pred_postion索引配一个batch_size索引 这样二维索引可以筛选出所有的mask位置
        batch_idx=batch_idx.repeat_interleave(num_pred_positions)
        masked_X =X[batch_idx,pred_positions]
        masked_X=masked_X.reshape(batch_size,num_pred_positions,-1)
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat

In [35]:
mlm = MaskLM(vocab_size, hidden_size)
mlm_positions = torch.tensor([[1, 5, 2], [6, 1, 5]])
mlm_Y_hat = mlm(encoded_X, mlm_positions)
mlm_Y_hat.shape

torch.Size([2, 3, 10000])

In [33]:
class NextSentencePred(nn.Module):
    """bert的下一句预测任务"""
    def __init__(self,num_inputs,**kwargs):
        super(NextSentencePred,self).__init__(**kwargs)
        self.out=nn.Linear(num_inputs,2)
    def forward(self,x):
        # X的形状：(batchsize,num_hiddens)
        return self.out(x)

In [27]:
mask_x=x[[1,1,0,1],[1,2,2,1]]

In [28]:
mask_x

tensor([ 0.3522, -0.3542,  1.7753,  0.3522])

# 数据集

In [64]:
import os
import random
import torch
import requests
import pandas as pd
from d2l import torch as d2l
train_file=r'D:\code_file\dplearning_second_part\data\wikitext2\train-00000-of-00001.parquet'

In [65]:
data_dir=train_file

In [85]:
def _read_wiki(data_dir):
    file_name=data_dir
    lines=pd.read_parquet(file_name).to_numpy().tolist()
    paragraphs = [line[0].strip().lower().split('.') for line in lines if len(line[0].split('.'))>=2]
    random.shuffle(paragraphs)
    return paragraphs
d=_read_wiki(data_dir)

In [None]:
def get_tokens_and_segments(tokens_a,tokens_b=None):
    tokens=['<cls>']+tokens_a+['<sep>']
    segments=[0]*len(tokens)
    if tokens_b is not None:
        tokens=tokens+tokens_b+['<sep>']
        segments=segments+[1]*(len(tokens_b)+1)
    return tokens,segments
get_tokens_and_segments(tokens_a=['a'])

# 生成下一句预测任务的
def _get_next_sentence(sentence,next_sentence,paragraphs):
    if random.random()<0.5:
        is_next=True
    else:
        # paragraph 三种列表嵌套
        next_sentence=random.choice(random.choice(paragraphs))
        is_next=False
    return sentence,next_sentence,is_next

def _get_nsp_data_from_paragraph(paragraph,paragraphs,vocab,max_len):
    nsp_data_from_paragraph=[]
    for i in range(len(paragraph)-1):
        tokens_a, tokens_b, is_next = _get_next_sentence(paragraph[i], paragraph[i + 1], paragraphs)
        if len(tokens_a)+len(tokens_b)>max_len:
            continue
        tokens,segments = get_tokens_and_segments(tokens_a,tokens_b)
        nsp_data_from_paragraph.append((tokens,segments,is_next))
    return nsp_data_from_paragraph
        

In [87]:
def _replace_mlm_tokens(tokens,candidate_pred_positions,num_mlm_preds,vocab):
    # 为遮蔽语言模型的输入创建新的词元副本，其中输入可能包含替换的<mask> 或随机词元
    mlm_

['the band \'s set list was similar to that of most shows on the popmart tour , but with " sunday bloody sunday " in place of the edge \'s karaoke segment and the addition of " miss sarajevo " in the second encore ',
 ' the night was a celebration of the end of the war , with bono setting the tone by shouting out " viva sarajevo ! <unk> the past , kiss the future ! " at the beginning of " even better than the real thing " ',
 ' bono had struggled with his voice throughout the tour , and the morning of the concert he <unk> up " without a voice " ',
 ' there was no intent to cancel , and the show went ahead as planned ',
 ' though bono had few difficulties through the opening quartet of " <unk> " , " i will follow " , " gone " , and " even better than the real thing " , his voice gave out during " last night on earth " ',
 ' in 2006 , the edge suggested that bono \'s vocal troubles had been caused by <unk> or by the stress of the previous few months of touring , though he later remarked 