In [1]:
import torch 
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import re
import os
import math
import time

# hugging face的分词器，github地址：https://github.com/huggingface/tokenizers
from transformers import AutoTokenizer
# 用于构建词典
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn.functional import pad, log_softmax
from pathlib import Path
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# #读取数据1000w行，这个数据太大收敛速度极慢，跑完一轮需要10小时
# def getEnZh(data='train',data_num=1000):
#     #data_dum:读取数据数量，单位10000
#     if data == 'train':
#         en_open = open('./dataset/train.en',encoding='utf-8')
#         zh_open = open('./dataset/train.zh',encoding='utf-8')
#     else:
#         print('待补充')
#         return
    
#     s = int(1000/data_num)
#     en = en_open.readlines()[::s]
#     zh = zh_open.readlines()[::s]
#     en_open.close()
#     zh_open.close()

#     print('数据读取成功，返回值(en:list,zh:list,length:int)')
#     return en,zh,len(en)

In [3]:
# Load the ~20k-line parallel corpus.
def getEnZh(data='train'):
    """Read tab-separated English/Chinese sentence pairs from disk.

    Args:
        data: 'train' reads ./transdata_2w/train.txt, 'test' reads
            ./transdata_2w/dev.txt.

    Returns:
        (en, zh, count): two parallel lists of sentences and the number of
        pairs. For any other ``data`` value an error message is printed and
        ``([], [], 0)`` is returned, so callers can always unpack three values.
    """
    if data == 'train':
        path = './transdata_2w/train.txt'
    elif data == 'test':
        path = './transdata_2w/dev.txt'
    else:
        print('getEnZh获得错误参数，请检查')
        # Same arity as the success path: callers do `en, zh, _ = getEnZh(...)`.
        return [], [], 0

    # The context manager closes the file even if reading fails.
    with open(path, encoding='utf-8') as file_open:
        file_reader = file_open.readlines()

    en = []
    zh = []
    for line in file_reader:
        # Each line looks like "<english>\t<chinese>\n".
        fields = re.split(r'(?:[\t\n])', line)
        if len(fields) < 2:
            # Neither a tab nor a newline (e.g. an unterminated last line):
            # there is no second field, so this is not a sentence pair.
            continue
        en.append(fields[0])
        zh.append(fields[1])

    if data == 'train':
        print('训练数据集加载成功。')
    elif data == 'test':
        print('测试数据集加载成功。')
    return en, zh, len(en)

In [None]:
# #tokenizer
# def mytokenizer(data,lantype='en',length=64):
#     # token = ['<bos>']
#     token = []
#     if lantype == 'en':
#         for i in re.split(r'([,.?!;:\'\"\s])',data):
#             if i not in [' ','']:
#                 token.append(i.lower())
#     elif lantype == 'zh':
#         zh = list(jieba.cut(data))
#         token.extend(zh)
#     else:
#         print('tokenizer获得错误参数，请检查。')
#         return
#     # token.append('<eos>')
#     # while(len(token)<length):
#     #     token.append('<pad>')
#     # if len(token)>=length:
#     #     token = token[:length]
#     #     token[length-1] = '<eos>'
#     return token

# print(mytokenizer("I'm a English tokenizer.",'en',16))
# print(mytokenizer("我是中文分词器！",'zh',16))
# print(mytokenizer("This is a sentence over the limit of the length.",'en',8))


In [4]:
# Tokenizers come from the `transformers` library and are BERT based.
# This class bundles one English and one Chinese tokenizer.
class Mytokenizer():
    """Holds an English and a Chinese pretrained tokenizer side by side."""

    def __init__(self) -> None:
        # Both tokenizers are resolved by checkpoint name through AutoTokenizer.
        self.entokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.zhtokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

    def en(self, s: str):
        """Return the tokens the English tokenizer produces for ``s``."""
        english_tokens = self.entokenizer.tokenize(s)
        return english_tokens

    def zh(self, s: str):
        """Return the tokens the Chinese tokenizer produces for ``s``."""
        chinese_tokens = self.zhtokenizer.tokenize(s)
        return chinese_tokens

In [5]:
# Instantiate the tokenizer wrapper and try it on one English and one Chinese sample sentence.
mytokenizer = Mytokenizer()
te = mytokenizer.en("I'm a English tokenizer basing on Bert.")
tz = mytokenizer.zh("我是中文分词器基于Bert！")
# Print the English and the Chinese token lists.
print(te,tz)

['i', "'", 'm', 'a', 'english', 'token', '##izer', 'basin', '##g', 'on', 'bert', '.'] ['我', '是', '中', '文', '分', '词', '器', '基', '于', '[UNK]', '！']


In [6]:
# Vocabulary helpers: build, save and load the vocabularies.
class Vocab:
    """Build, persist and reload the English and Chinese token vocabularies.

    A vocabulary is a list of token strings; a token's position in the list is
    its id. Every generated vocabulary starts with the special tokens
    '<bos>', '<eos>', '<pad>', '<unk>' (ids 0-3).
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _collect(sentences, tokenize):
        """Return the special tokens followed by each distinct token in first-seen order."""
        vocab = ['<bos>', '<eos>', '<pad>', '<unk>']
        # A set gives O(1) membership tests; scanning the growing list made
        # vocabulary construction quadratic in the vocabulary size.
        seen = set(vocab)
        for sentence in sentences:
            for token in tokenize(sentence):
                if token not in seen:
                    seen.add(token)
                    vocab.append(token)
        return vocab

    # Vocabulary generator
    def vocab_generator(self, en, zh, mytokenizer):
        """Build both vocabularies from parallel sentence lists.

        Args:
            en: iterable of English sentences.
            zh: iterable of Chinese sentences.
            mytokenizer: object exposing ``en(str)`` and ``zh(str)`` that
                return token lists.

        Returns:
            (vocab_en, vocab_zh) as lists of token strings.
        """
        vocab_en = self._collect(en, mytokenizer.en)
        print('英文词表构建完成')

        vocab_zh = self._collect(zh, mytokenizer.zh)
        print('中文词表构建完成')
        return vocab_en, vocab_zh

    # Vocabulary writer
    def vocab_save(self, vocab: list, name='vocab.txt'):
        """Write ``vocab`` to ./vocab/<name> as UTF-8, one token per line."""
        path = './vocab/' + name
        # Create the target directory on first use instead of failing in open().
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The context manager closes the file even if a write fails.
        with open(path, "w", encoding='utf-8') as f:
            for line in vocab:
                f.write(line + '\n')
        print('词表存储成功：' + path)

    # Vocabulary reader
    def vocab_loader(self, dir='./vocab/', name=['vocab_en.txt', 'vocab_zh.txt']):
        """Read the English and Chinese vocabularies back from disk.

        Args:
            dir: directory prefix, concatenated directly with the file names.
            name: two file names, English first and Chinese second. The default
                list is only read, never mutated.

        Returns:
            (vocab_en, vocab_zh) as lists of token strings without newlines.
        """
        # Read with the encoding vocab_save writes with; relying on the
        # platform default breaks on systems whose locale is not UTF-8.
        with open(dir + name[0], encoding='utf-8') as fen, \
                open(dir + name[1], encoding='utf-8') as fzh:
            ren = fen.readlines()
            rzh = fzh.readlines()

        vocab_en = [line.replace('\n', '') for line in ren]
        print('英文词表读取完成')

        vocab_zh = [line.replace('\n', '') for line in rzh]
        print('中文词表加载完成')

        return vocab_en, vocab_zh

In [7]:
# Build the vocabulary (the generation step takes a long time; it is kept commented out below).

# 10M-sentence dataset variant (disabled):
# en,zh,_ = getEnZh('train',20)
# 20k-sentence dataset: load the parallel training sentences; the returned pair count is discarded.
en,zh,_ = getEnZh()
# One-off vocabulary generation (disabled):
# vocab = Vocab()
# vocab_en,vocab_zh = vocab.vocab_generator(en,zh,mytokenizer)

# # Save the vocabularies
# vocab.vocab_save(vocab_en,'vocab_en_2.txt')
# vocab.vocab_save(vocab_zh,'vocab_zh_2.txt')

训练数据集加载成功。


In [8]:
# Load the previously saved vocabularies from disk.
vocab = Vocab()
# vocab_en,vocab_zh = vocab.vocab_loader()
# Read vocab_en_2.txt / vocab_zh_2.txt instead of the loader's default file names.
vocab_en,vocab_zh = vocab.vocab_loader(name=['vocab_en_2.txt','vocab_zh_2.txt'])

英文词表读取完成
中文词表加载完成


In [9]:
# Lookup tables that turn the vocabularies into id mappings.
# id -> token
dict_en = {position: token for position, token in enumerate(vocab_en)}
dict_zh = {position: token for position, token in enumerate(vocab_zh)}

# token -> id (if a token occurs more than once, its last position wins)
en_getindex = {token: position for position, token in dict_en.items()}
zh_getindex = {token: position for position, token in dict_zh.items()}

def get_index(data, lan):
    """Map a list of tokens to vocabulary ids.

    Args:
        data: iterable of token strings.
        lan: 'en' looks tokens up in ``en_getindex``, 'zh' in ``zh_getindex``.

    Returns:
        A list of ids. The tokenizer's '[UNK]' marker and any token missing
        from the table map to 3. If ``lan`` is neither 'en' nor 'zh', an error
        is printed and None is returned as soon as a token that needs a table
        lookup is reached.
    """
    # <unk> is the fourth vocabulary entry, so its id is 3.
    unk_id = 3

    if lan == 'en':
        table = en_getindex
    elif lan == 'zh':
        table = zh_getindex
    else:
        table = None

    idx = []
    for token in data:
        if token == '[UNK]':
            idx.append(unk_id)
            continue
        if table is None:
            print('get_index参数错误')
            return
        # dict.get covers exactly the "token not in vocabulary" case; the
        # previous bare `except:` also swallowed unrelated errors such as
        # KeyboardInterrupt.
        idx.append(table.get(token, unk_id))
    return idx

# Sanity check: tokenize one English sentence (with a trailing Chinese character appended) and print its ids.
print(get_index(mytokenizer.en("I am going to tell you my whole life, the life which did not really begin until the day I first saw you.字"),'en'))

[10, 138, 606, 457, 424, 65, 302, 1553, 306, 203, 383, 306, 1237, 338, 75, 30, 304, 1826, 383, 730, 10, 1742, 144, 65, 5, 3]


In [10]:
# Dataset definition.
class transDataset(Dataset):
    """Parallel English/Chinese dataset of token-id sequences.

    Sentences are loaded with ``getEnZh(datatype)`` and converted to id lists
    once, at construction time, using the notebook-level ``mytokenizer`` and
    ``get_index``.

    Args:
        datatype: split name forwarded to ``getEnZh`` ('train' or 'test').
    """

    def __init__(self, datatype='train') -> None:
        super().__init__()

        self.datatype = datatype
        self.en, self.zh, self.data_num = self.load_data()
        self.en_tokens = self.enloader()
        self.zh_tokens = self.zhloader()

    def __getitem__(self, index):
        """Return (english_ids, chinese_ids) for the pair at ``index``."""
        return self.en_tokens[index], self.zh_tokens[index]

    def __len__(self):
        """Number of sentence pairs."""
        return self.data_num

    def load_data(self):
        """Load the split selected by ``self.datatype``.

        Previously this returned the notebook globals ``en``/``zh`` and
        ignored ``datatype``, so ``transDataset('test')`` served training data.
        """
        return getEnZh(self.datatype)

    def enloader(self):
        """Tokenize every English sentence and map the tokens to ids."""
        return [get_index(mytokenizer.en(sentence), 'en') for sentence in self.en]

    def zhloader(self):
        """Tokenize every Chinese sentence and map the tokens to ids."""
        return [get_index(mytokenizer.zh(sentence), 'zh') for sentence in self.zh]


In [11]:
# Smoke test: build the dataset and inspect one sample.
dataset = transDataset()
# Raw sentence pair at index 2000 ...
print(en[2000],zh[2000])
# ... and the (english_ids, chinese_ids) tuple the dataset returns for the same index.
print(dataset.__getitem__(2000))
# Label below reads "total number of sentences".
print("句子总数：",len(dataset))

They like to sing. 他们都喜欢唱歌。
([108, 214, 457, 192, 5], [27, 45, 440, 292, 102, 258, 257, 5])
句子总数： 21033


In [12]:
# Collate function: frames and pads a batch, then returns src, tgt, tgt_y and the token count.
def collate_fn(batch, max_len=64, bos_idx=0, eos_idx=1, pad_idx=2):
    """Turn a list of (src_ids, tgt_ids) pairs into padded training tensors.

    Every sequence is clipped to ``max_len - 2`` tokens, wrapped as
    ``<bos> ... <eos>`` and right-padded with ``pad_idx`` to ``max_len``.

    Args:
        batch: list of (src_ids, tgt_ids) pairs of integer id lists.
        max_len: framed sequence length (default 64, i.e. at most 62 tokens).
        bos_idx, eos_idx, pad_idx: ids of the special tokens (defaults 0, 1, 2).

    Returns:
        src:      LongTensor (batch, max_len), encoder input.
        tgt:      LongTensor (batch, max_len - 1), target without its last token.
        tgt_y:    LongTensor (batch, max_len - 1), target without <bos>.
        n_tokens: 0-dim tensor, number of non-pad entries in ``tgt_y``.

    Raises:
        ValueError: if ``max_len`` leaves no room for <bos> and <eos>.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold <bos> and <eos>")

    def _frame(ids):
        # <bos> + clipped tokens + <eos>, then pad up to max_len.
        framed = [bos_idx] + list(ids[:max_len - 2]) + [eos_idx]
        return framed + [pad_idx] * (max_len - len(framed))

    src_list = [_frame(_src) for _src, _ in batch]
    tgt_list = [_frame(_tgt) for _, _tgt in batch]

    # Build int64 tensors directly (nn.Embedding needs long indices).
    # torch.Tensor(list) goes through float32 first, which cannot represent
    # every integer >= 2**24 exactly. The reshape keeps an empty batch 2-D.
    src = torch.tensor(src_list, dtype=torch.long).reshape(-1, max_len)
    tgt = torch.tensor(tgt_list, dtype=torch.long).reshape(-1, max_len)

    # tgt_y is the target without its first token (<bos>): what the model must predict.
    tgt_y = tgt[:, 1:]
    # tgt is the target without its last token: what the decoder is fed.
    tgt = tgt[:, :-1]

    # Number of tokens to predict in this batch (padding excluded).
    n_tokens = (tgt_y != pad_idx).sum()

    return src, tgt, tgt_y, n_tokens

In [13]:
# DataLoader setup, including the batch size.
batch_size = 128
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# dataset = transDataset()
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Pull a single batch and move its three tensors to the selected device (n_tokens is not moved).
src, tgt, tgt_y, n_tokens = next(iter(train_loader))
src, tgt, tgt_y = src.to(device), tgt.to(device), tgt_y.to(device)

# Show the data
print("src.size:", src.size())
print("tgt.size:", tgt.size())
print("tgt_y.size:", tgt_y.size())
print("n_tokens:", n_tokens)

src.size: torch.Size([128, 64])
tgt.size: torch.Size([128, 63])
tgt_y.size: torch.Size([128, 63])
n_tokens: tensor(1364)


In [14]:
# Model - positional encoding.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        d_model: embedding width (even or odd).
        dropout: dropout probability applied to the sum.
        max_len: longest sequence length the table covers.

    The table is stored in the buffer ``pe`` with shape (1, max_len, d_model),
    so it follows the module through ``.to(...)`` and is part of the state dict.
    """

    def __init__(self, d_model, dropout, max_len=64):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Built on the default device: a registered buffer moves together with
        # the module, so no notebook-global `device` is needed here.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + pe[:, :seq_len]); ``x`` is indexed as (batch, seq_len, d_model)."""
        # .to() is a no-op when the buffer already lives on x's device; it
        # keeps callers working that never moved this module themselves.
        x = x + self.pe[:, : x.size(1)].to(x.device)
        return self.dropout(x)
    

In [15]:
# Model - main translation network.
class TranslationModel(nn.Module):
    """Encoder-decoder Transformer over token ids.

    Args:
        d_model: embedding / hidden width.
        src_vocab: source vocabulary (only its length is used).
        tgt_vocab: target vocabulary (only its length is used).
        dropout: dropout probability for the positional encoding and transformer.

    ``forward`` returns the decoder hidden states; apply ``self.predictor``
    to them to obtain target-vocabulary logits. Token id 2 is treated as <pad>.
    """

    def __init__(self, d_model, src_vocab, tgt_vocab, dropout=0.1):
        super(TranslationModel, self).__init__()

        self.src_embedding = nn.Embedding(len(src_vocab), d_model, padding_idx=2)
        self.tgt_embedding = nn.Embedding(len(tgt_vocab), d_model, padding_idx=2)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model, dropout=dropout, batch_first=True)
        self.predictor = nn.Linear(d_model, len(tgt_vocab))

    def forward(self, src, tgt):
        """Run the transformer.

        Args:
            src: LongTensor (batch, src_len) of source ids.
            tgt: LongTensor (batch, tgt_len) of decoder-input ids.

        Returns:
            Tensor (batch, tgt_len, d_model) of decoder hidden states.
        """
        # Causal mask, created on the input's device rather than a global one.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size()[-1]).to(tgt.device)
        src_key_padding_mask = TranslationModel.get_key_padding_mask(src)
        tgt_key_padding_mask = TranslationModel.get_key_padding_mask(tgt)

        src = self.src_embedding(src)
        tgt = self.tgt_embedding(tgt)
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)

        out = self.transformer(src, tgt,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,
                               # The encoder output has one state per source
                               # position; without this mask the decoder's
                               # cross-attention also attends to <pad> states.
                               memory_key_padding_mask=src_key_padding_mask)

        return out

    @staticmethod
    def get_key_padding_mask(tokens):
        """Boolean mask that is True where ``tokens`` holds the <pad> id (2)."""
        return tokens == 2



In [16]:
# Optimizer and loss function.
# Build the model with d_model=256 over the English (source) and Chinese (target) vocabularies.
model = TranslationModel(256, vocab_en, vocab_zh)
model = model.to(device)
# Adam over all model parameters with learning rate 3e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
class TranslationLoss(nn.Module):
    """Summed KL divergence against one-hot targets, ignoring <pad> targets.

    ``forward`` accepts raw logits or log-probabilities of shape
    (n_positions, vocab_size) and targets of shape (n_positions,). Rows whose
    target is ``padding_idx`` (2) contribute nothing. The result is a sum;
    divide by the number of non-pad tokens to get a per-token loss.
    """

    def __init__(self):
        super(TranslationLoss, self).__init__()
        # KLDivLoss expects its input in log-space.
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = 2

    def forward(self, x, target):
        # Normalise to log-probabilities. KLDivLoss on raw logits reduces to
        # -logit[target], which is unbounded below. log_softmax leaves inputs
        # that are already log-probabilities unchanged, so both kinds of
        # caller are supported.
        log_probs = torch.log_softmax(x, dim=-1)

        # One-hot target distribution, created on x's device and dtype.
        true_dist = torch.zeros_like(log_probs)
        true_dist.scatter_(1, target.unsqueeze(1), 1.0)
        # Rows whose label is <pad> become all-zero so they add no loss.
        true_dist.masked_fill_((target == self.padding_idx).unsqueeze(1), 0.0)

        return self.criterion(log_probs, true_dist)

# Loss instance used by the training loop.
criteria = TranslationLoss()


In [17]:
# Inference function.
def translator(model, src: str):
    """Greedily translate one English sentence into Chinese.

    Args:
        model: callable as ``model(src_ids, tgt_ids)`` returning hidden states,
            with a ``predictor`` that maps the last hidden state to logits.
        src: English sentence.

    Returns:
        The decoded target tokens joined into one string, with the
        '<bos>' / '<eos>' markers removed.
    """
    # Run on the device the model lives on; fall back to the notebook-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    # Frame the source exactly as collate_fn does during training:
    # <bos> + at most 62 tokens + <eos>. The raw, unframed ids used before did
    # not match the training input, and over-long input overflowed the
    # 64-entry positional table.
    src_ids = get_index(mytokenizer.en(src), 'en')[:62]
    src_list = torch.tensor([[0] + src_ids + [1]], dtype=torch.long, device=run_device)
    tgt = torch.tensor([[0]], dtype=torch.long, device=run_device)

    # Inference only: no autograd graph needed.
    with torch.no_grad():
        for _ in range(64):
            out = model(src_list, tgt)
            # Only the last position matters for the next token, hence out[:, -1].
            predict = model.predictor(out[:, -1])
            # Greedy choice: index of the largest logit.
            y = torch.argmax(predict, dim=1)
            # Append it to the tokens decoded so far.
            tgt = torch.cat([tgt, y.unsqueeze(0)], dim=1)
            # <eos> ends the translation.
            if y.item() == 1:
                break

    # Join the predicted tokens.
    outputs = ''.join(vocab_zh[token_id] for token_id in tgt[0].tolist())

    return outputs.replace("<bos>", "").replace("<eos>", "")

In [None]:
# Trainer.
torch.cuda.empty_cache()

step = 0
epochs = 100
# A checkpoint is written every `save_after_step` optimizer steps.
save_after_step = 2000

model_dir = Path("./2w_model_dir")
model_dir.mkdir(parents=True, exist_ok=True)

model.train()
for epoch in range(epochs):
    # Let tqdm drive the iteration so the bar advances and closes by itself.
    # Before, it wrapped an iterator that was never consumed and was only
    # updated by hand.
    loop = tqdm(train_loader, total=len(train_loader))
    loop.set_description("Epoch {}/{}".format(epoch, epochs))
    for src, tgt, tgt_y, n_tokens in loop:
        # Move the batch to the training device.
        src, tgt, tgt_y = src.to(device), tgt.to(device), tgt_y.to(device)

        # Clear gradients from the previous step.
        optimizer.zero_grad()
        # Transformer forward pass.
        out = model(src, tgt)
        # Project to the target vocabulary and normalise to log-probabilities:
        # the KL-divergence criterion expects log-space input. log_softmax is
        # idempotent, so this is safe even if the criterion normalises again.
        out = log_softmax(model.predictor(out), dim=-1)

        # Loss. Every output position is predicted during training, so `out`,
        # shaped (batch_size, n_words, vocab_size), is flattened to
        # (batch_size * n_words, vocab_size). Only non-<pad> positions count,
        # so the summed loss is normalised by dividing by n_tokens.
        loss = criteria(out.contiguous().view(-1, out.size(-1)), tgt_y.contiguous().view(-1)) / n_tokens
        # Backpropagate.
        loss.backward()
        # Update the parameters.
        optimizer.step()

        loop.set_postfix(loss=loss.item())

        step += 1

        if step % save_after_step == 0:
            torch.save(model, model_dir / f"model_{step}.pt")


torch.save(model, model_dir / "model_done.pt")
print('训练完成')

In [None]:
# Evaluation setup.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Optional: reload the final checkpoint instead of using the in-memory model.
# NOTE(review): torch.load on a whole-model file unpickles arbitrary objects - only load checkpoints you trust.
# model = torch.load('./2w_model_dir/model_done.pt')
# Switch to evaluation mode, then make sure the model is on the selected device.
model = model.eval()
model = model.to(device)

In [None]:
# Spot check: translate three randomly chosen corpus sentences and print them next to the reference.
import random
for i in range(3):
    # Draw from the actual corpus size. The hard-coded randint(0, 20000)
    # skipped the tail of the data and would raise IndexError on a smaller corpus.
    j = random.randrange(len(en))
    out = translator(model,en[j])
    print('原文:',en[j])
    print('预测:',out)
    print('答案:',zh[j])
