In [2]:
# 导入必要的库
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

In [3]:
# 拟造的一段对话
text = (
    'Hello, how are you? I am Romeo.\n' # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
    'Nice meet you too. How are you today?\n' # R
    'Great. My baseball team won the competition.\n' # J
    'Oh Congratulations, Juliet\n' # R
    'Thank you Romeo\n' # J
    'Where are you going today?\n' # R
    'I am going shopping. What about you?\n' # J
    'I am going to visit my grandmother. she is not very well' # R
)

In [11]:
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') # filter '.', ',', '?', '!'
print(sentences)
# set 去重， join联合， list转列表
word_list = list(set(" ".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]
print("\nwordlist: ")
print(word_list)
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
# 为每一个词元token创建编号
for i, w in enumerate(word_list):
    word2idx[w] = i + 4
print("\nupdated word2dix: ")
print(word2idx)
idx2word = {i: w for i, w in enumerate(word2idx)}
vocab_size = len(word2idx)

token_list = list()
for sentence in sentences:
    arr = [word2idx[s] for s in sentence.split()]
    token_list.append(arr)

print("\ntokenlist:")
print(token_list)

['hello how are you i am romeo', 'hello romeo my name is juliet nice to meet you', 'nice meet you too how are you today', 'great my baseball team won the competition', 'oh congratulations juliet', 'thank you romeo', 'where are you going today', 'i am going shopping what about you', 'i am going to visit my grandmother she is not very well']

wordlist: 
['won', 'hello', 'am', 'very', 'i', 'going', 'where', 'well', 'she', 'is', 'juliet', 'oh', 'congratulations', 'competition', 'meet', 'romeo', 'shopping', 'are', 'name', 'grandmother', 'today', 'about', 'too', 'you', 'the', 'to', 'not', 'visit', 'my', 'baseball', 'great', 'team', 'what', 'thank', 'nice', 'how']

updated word2dix: 
{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, 'won': 4, 'hello': 5, 'am': 6, 'very': 7, 'i': 8, 'going': 9, 'where': 10, 'well': 11, 'she': 12, 'is': 13, 'juliet': 14, 'oh': 15, 'congratulations': 16, 'competition': 17, 'meet': 18, 'romeo': 19, 'shopping': 20, 'are': 21, 'name': 22, 'grandmother': 23, 'today'

In [12]:
# BERT Parameters
maxlen = 30  # 表示同一个 batch 中的所有句子都由 30 个 token 组成，不够的补 PAD
batch_size = 6  # 
max_pred = 5 # max tokens of prediction 表示Bert最多需要预测多少个单词
n_layers = 6  
n_heads = 12
d_model = 768  # 特征维度
d_ff = 768*4 # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2  # 表示 Decoder input 由几句话组成

In [None]:
# sample IsNext and NotNext to be same in small batch size
def make_data():
    batch = []
    # pos表示Bert任务中上下文两句话是否相邻，如果是那么pos+1，否则neg+1
    positive = negative = 0 
    while positive != batch_size/2 or negative != batch_size/2: # 最后争取保证数量是1比1 while循环
        # 抽取的是随机某一句话的索引，然后将两句话拼成一句话
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        
        # 拼接起来 加 cls和 sep
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM  表示一共要做多少个mask
        n_pred =  min(max_pred, max(1, int(len(input_ids) * 0.15))) # 15 % of tokens in one sentence
        
        # 记录 所有不是 special token的位置，也就是说记录原句子中单词的位置索引的list
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word2idx['[CLS]'] and token != word2idx['[SEP]']] # candidate masked position
       
        shuffle(cand_maked_pos)  # shuffle进行随机排序
        # 也就是说可以去做mask的列表我进行随机打乱
        
        # 存储所有的mask的token词元 和索引
        masked_tokens, masked_pos = [], []
        
        # 已经随机过了，所以取前mask数量的坐标缩进就可以了
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            
            # 80%的概率变成mask， 10%的概率变成另一个单词
            if random() < 0.8:  # 80%
                input_ids[pos] = word2idx['[MASK]'] # make mask
            elif random() > 0.9:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                
                # 不能随意替换特殊的 token_map
                while index < 4: # can't involve 'CLS', 'SEP', 'PAD'
                    index = randint(0, vocab_size - 1)
                input_ids[pos] = index # replace

        # Zero Paddings 如果这句话不满足maxlen 那么需要padding
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        # 这一段主要是判断ab两句话是否为前后衔接的语言，并且给出true和false
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch
# Proprecessing Finished