In [3]:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# 1.准备数据集


In [4]:
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)# 这是一段手动输入的对话
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') # 填充 '.', ',', '?', '!'，无用的东西
word_list = list(set(" ".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]这段代码的作用是处理字符串 sentences，首先将其中的句子合并为一个字符串，然后按空格分词，并返回一个不重复的单词集合，集合具有去重性
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}# 设置词典对照
for i, w in enumerate(word_list):# 为词表里的每一个词进行编码，enumerate() 是 Python 内建的一个函数，它将一个可迭代对象（比如列表、元组、字符串等）转换为一个可枚举的对象，返回的是一个 元组，其中包含了元素的索引和元素本身
    # w就是每个单词
    word2idx[w] = i + 4 # 为每个元素进行对应编码，+4是因为前面已经有4个值进行编码了
idx2word = {i: w for i, w in enumerate(word2idx)}# 构造键值对集合
vocab_size = len(word2idx)# 获取词表的长度

token_list = list()# 创建list
for sentence in sentences:# 遍历每一句话
    arr = [word2idx[s] for s in sentence.split()]# 对每一句话进行编码，即返回一句话中每个单词对应的索引
    token_list.append(arr)# 添加
print(token_list)# Embedding完成啦

[[10, 7, 14, 32, 38, 29, 21], [10, 21, 35, 9, 13, 22, 37, 12, 39, 32], [37, 39, 32, 5, 7, 14, 32, 30], [16, 35, 4, 11, 15, 26, 33], [19, 36, 22], [31, 32, 21], [8, 14, 32, 27, 30], [38, 29, 27, 18, 6, 20, 32], [38, 29, 27, 12, 28, 35, 34, 24, 13, 23, 25, 17]]


# 2.模型参数
maxlen 表示同一个 batch 中的所有句子都由 30 个 token 组成，不够的补 PAD（这里我实现的方式比较粗暴，直接固定所有 batch 中的所有句子都为 30）
max_pred 表示最多需要预测多少个单词，即 BERT 中的完形填空任务
n_layers 表示 Encoder Layer 的数量
d_model 表示 Token Embeddings、Segment Embeddings、Position Embeddings 的维度
d_ff 表示 Encoder Layer 中全连接层的维度
n_segments 表示 Decoder input 由几句话组成

In [5]:
# BERT Parameters
maxlen = 30
batch_size = 6
max_pred = 5 # max tokens of prediction
n_layers = 6
n_heads = 12
d_model = 768
d_ff = 768*4 # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

# 3.数据预处理
这里就运用了泛用的两个预训练任务

In [6]:
def make_data():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:# 一半处理一半不处理
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # 都是在句子列表中随机一个索引，即随机抽一个句子，即一共抽两个句子
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]# 获取这句话的Embedding
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]# 做一个拼接，其实就是下一句预测任务的输入
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)# 构造一个列表，第一句话的元素全是0，第二句话的元素全是1

        # MASK LM 掩码预训练任务
        n_pred =  min(max_pred, max(1, int(len(input_ids) * 0.15))) # 15 % of tokens in one sentence，限制预测数量的区间，基本上占输入的百分之15
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word2idx['[CLS]'] and token != word2idx['[SEP]']] # candidate masked position这段代码的目的是筛选出 input_ids 中除了 [CLS] 和 [SEP] 外的其他所有 token 的索引，并返回这些索引的列表。
        shuffle(cand_maked_pos)# 随机打乱这些索引
        masked_tokens, masked_pos = [], []# 创建两个空列表
        for pos in cand_maked_pos[:n_pred]:# 遍历这些编码，不超过最大数量
            masked_pos.append(pos)# 添加这个词的位置
            masked_tokens.append(input_ids[pos])# 添加位置对应的元素（词）的编码
            if random() < 0.8:  # 80%开始微操啦
                input_ids[pos] = word2idx['[MASK]'] # make mask掩码
            elif random() > 0.9:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary随机索引
                while index < 4: # can't involve 'CLS', 'SEP', 'PAD'不能是无意义的替换
                  index = randint(0, vocab_size - 1)
                input_ids[pos] = index # replace随机替换一个词

        # Zero Paddings
        n_pad = maxlen - len(input_ids)# 要填充的，不够的填0
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:# 分别填充
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:# 如果两个句子原文连续，才有效
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext，需要处理的数据
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:# 两个句子原文不连续，那就废了
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext，无需处理的数据
            negative += 1
    return batch
# Proprecessing Finished
# input_ids掩码加下一句标记结果
# segment_ids用于区分两句话
# masked_tokens每次处理一个位置对应的词编码
# masked_pos每次一个处理的词的位置
# isNext是否处理
# 上述这些参数都是对应的累积，即每随机抽的两个句子都有对应的这些参数
# 这样batch就是所有的每一次处理结果

batch = make_data()# 返回这样一个batch，进行了两个预训练任务的输入操作
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)# 解包赋值
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens),\
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)
# 这一行代码的功能是将 input_ids, segment_ids, masked_tokens, masked_pos, 和 isNext 五个 Python 数据结构（如列表、NumPy 数组等）转换为 PyTorch 的 LongTensor 类型的张量。
class MyDataSet(Data.Dataset):
  def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):# 构造函数没得说
    self.input_ids = input_ids
    self.segment_ids = segment_ids
    self.masked_tokens = masked_tokens
    self.masked_pos = masked_pos
    self.isNext = isNext

  def __len__(self):# 返回长度
    return len(self.input_ids)

  def __getitem__(self, idx):# 返回每一行数据
    return self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx], self.masked_pos[idx], self.isNext[idx]

loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size, True)# 创建数据集
# 输出看一下
for batch in loader:
    print("Input IDs:", batch[0])
    print("Segment IDs:", batch[1])
    print("Masked Tokens:", batch[2])
    print("Masked Positions:", batch[3])
    print("Is Next:", batch[4])
    print("---------------")

Input IDs: tensor([[ 1, 10, 21, 35,  9, 13, 22, 37, 12, 39, 32,  2, 31, 32, 21,  2,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  3,  7, 14, 32, 38, 29, 21,  2, 10, 21, 35,  9,  3, 22, 37, 12, 39,
         32,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  8, 14, 32, 27, 30,  2, 38, 29,  3, 18,  6, 20, 32,  2,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 10,  7,  3, 32, 38, 29, 21,  2, 10, 21,  3,  9,  3, 22, 37, 12, 39,
         32,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 19, 36, 22,  2, 37, 39, 32,  5,  3, 14, 32, 30,  2,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 19, 36, 22,  2, 38, 29, 27, 12, 28, 35, 34, 24, 13, 23,  3,  3,  2,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
Segment IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0,

# 4.模型构建
主要是Transformer的Encoder结构

In [7]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, seq_len = seq_q.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_q.data.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]
    return pad_attn_mask.expand(batch_size, seq_len, seq_len)  # [batch_size, seq_len, seq_len]

def gelu(x):# 激活函数
    """
      Implementation of the gelu activation function.
      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
      Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

class Embedding(nn.Module):# 编码
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] -> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)# 三个编码学习相加

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size, n_heads, seq_len, seq_len]注意力公式
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)# 归一化
        context = torch.matmul(attn, V)
        return context

class MultiHeadAttention(nn.Module):# 多头注意力
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size, seq_len, d_model], k: [batch_size, seq_len, d_model], v: [batch_size, seq_len, d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size, n_heads, seq_len, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size, n_heads, seq_len, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size, n_heads, seq_len, d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, seq_len, d_v], attn: [batch_size, n_heads, seq_len, seq_len]
        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size, seq_len, n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual) # output: [batch_size, seq_len, d_model]

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        return self.fc2(gelu(self.fc1(x)))# 激活函数

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, seq_len, d_model]
        return enc_outputs

class BERT(nn.Module):
    def __init__(self):# 构造函数，一些初始化配置
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])# Encoder layer
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )# 这段代码是一个前馈神经网络模块，由全连接层、Dropout 层和 Tanh 激活函数层组成
        self.classifier = nn.Linear(d_model, 2)# 二分类
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        # fc2 is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)
        self.fc2.weight = embed_weight

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids) # [bach_size, seq_len, d_model]
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids) # [batch_size, maxlen, maxlen]
        for layer in self.layers:# 前一个输出是后一个输入
            # output: [batch_size, max_len, d_model]
            output = layer(output, enc_self_attn_mask)
        # it will be decided by first token(CLS)
        h_pooled = self.fc(output[:, 0]) # [batch_size, d_model]进一个线性层
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2] predict isNext

        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]将掩码结果收集在一起
        h_masked = self.activ2(self.linear(h_masked)) # [batch_size, max_pred, d_model]掩码任务输出
        logits_lm = self.fc2(h_masked) # [batch_size, max_pred, vocab_size]两个句子是否为一句话的预测
        return logits_lm, logits_clsf
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)# 梯度下降

# 模型训练

In [8]:
for epoch in range(180):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
      logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
      loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1)) # for masked LM
      loss_lm = (loss_lm.float()).mean()
      loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
      loss = loss_lm + loss_clsf# 两个预训练任务的损失和
      if (epoch + 1) % 10 == 0:
          print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

Epoch: 0010 loss = 1.800519
Epoch: 0020 loss = 1.053355
Epoch: 0030 loss = 0.829274
Epoch: 0040 loss = 0.793613
Epoch: 0050 loss = 0.813628
Epoch: 0060 loss = 0.757033
Epoch: 0070 loss = 0.781994
Epoch: 0080 loss = 0.755440
Epoch: 0090 loss = 0.779933
Epoch: 0100 loss = 0.765899
Epoch: 0110 loss = 0.749583
Epoch: 0120 loss = 0.761493
Epoch: 0130 loss = 0.769707
Epoch: 0140 loss = 0.801867
Epoch: 0150 loss = 0.731075
Epoch: 0160 loss = 0.749771
Epoch: 0170 loss = 0.715138
Epoch: 0180 loss = 0.788864


# 测试

In [9]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[1]
print(text)
print('================================')
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])

logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
                 torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

ValueError: too many values to unpack (expected 5)

总体是做了两个预训练任务，让模型能够很好的进行填空以及对两个句子进行是否连续的分析，具有很强的泛用性，可以用在各种下游任务中