In [1]:
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torchnet import meter
import tqdm, math, torch, os, opencc, re, io

# 真实开搞

## 准备

In [2]:
# def get_data():
#     if os.path.exists(data_path):
#         datas = np.load(data_path, allow_pickle=True)      # 加载数据
#         data = datas['data']  # numpy.ndarray
#         word2ix = datas['word2ix'].item()   # dic
#         ix2word = datas['ix2word'].item()  # dic
#         return data, word2ix, ix2word
    
# data_path = '../originalDataset/tang.npz'
# data, word2ix, ix2word = get_data()
# data = torch.from_numpy(data)
# dataloader = torch.utils.data.DataLoader(data, batch_size=10, shuffle=True, num_workers=1)  # shuffle=True随机打乱

In [3]:
cc = opencc.OpenCC('t2s')
def preprocess_sentence_cn(w):
  #将繁体字转换为简体字  
    w = cc.convert(w)
    w = ' '.join(list(w))
    w = re.sub(r'[" "]+', " ", w)
    w = w.strip().rstrip()
    w = '<START> ' + w + ' <EOP>'
#     new_list = []
#     for i in w.split():
#         new_list.append(
#             word2ix.get(i, 8292)
#         )
    return w

def create_dataset(path):
    lines = io.open(path,encoding='utf8').read().strip().split('\n')
    sentence_pairs = [[preprocess_sentence_cn(w) for w in line.replace("，","。").replace("。", " ").strip().split(' ')] for line in lines]
    return zip(*sentence_pairs)

a, b = create_dataset('../originalDataset/poem5.txt')

In [4]:
a, b = np.array(list(a)), np.array(list(b))
# dataloader_1 = torch.utils.data.DataLoader(a, batch_size=10, shuffle=True, num_workers=1)  # shuffle=True随机打乱
# dataloader_2 = torch.utils.data.DataLoader(b, batch_size=10, shuffle=True, num_workers=1)  # shuffle=True随机打乱

In [5]:
a.shape, b.shape

((31072,), (31072,))

文档的第17499行是重复的。

In [6]:
def get_idx2word_word2idx_data(np_data):
    words = set()
    for line in np_data:
        for word in line.split():
            words.add(word)
    words = list(words)
    word2idx = {
        k:v for k, v in zip(words, [i for i in range(len(words))])
    }
    idx2word = {
        k:v for k, v in zip([i for i in range(len(words))], words)
    }
    
    vals = []
    for line in np_data:
        vals.append(
            [word2idx[word] for word in line.split()]
        )
    vals = torch.Tensor(vals)
            
    return idx2word, word2idx, vals.long()

a_idx2word, a_word2idx, a_vals = get_idx2word_word2idx_data(a)
b_idx2word, b_word2idx, b_vals = get_idx2word_word2idx_data(b)

In [7]:
class MyDataSet(Data.Dataset):
    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super(MyDataSet, self).__init__()
        self.enc_inputs = enc_inputs
        self.dec_inputs = dec_inputs
        self.dec_outputs = dec_outputs
  
    def __len__(self):
        return self.enc_inputs.shape[0]
  
    def __getitem__(self, idx):
        return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_outputs[idx]

In [8]:
loader = Data.DataLoader(MyDataSet(a_vals[:, :], b_vals[:, :-1], b_vals[:, 1:]), 128, True)

In [9]:
# idx2word = ix2word
src_vocab_size = len(a_idx2word)
tgt_vocab_size = len(b_idx2word)

In [10]:
src_len = 7 # enc_input max sequence length
tgt_len = 6 # dec_input(=dec_output) max sequence length

# Transformer Parameters
d_model = 512  # Embedding Size
d_ff = 2048 # FeedForward dimension
d_k = 64 # dimension of K(=Q), 
d_v = 128  # dimension of V
n_layers = 6  # number of Encoder of Decoder Layer
n_heads = 8  # number of heads in Multi-Head Attention

## 模型

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model) ## 5000 * 512, full of zeros
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        '''
        position looks like: 
            tensor(
                [[   0],
                [   1],
                [   2],
                ...,
                [4997],
                [4998],
                [4999]]
            )
        '''
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 
        ## 上面那个div_term，我自己手推导了一下，结果发现：div_term = 1/(10000**(2i / d_model))
        ## 只不过为什么要这样搞，为什么要把10000转为exp(log10000)呢？
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        '''
        x: [seq_len, batch_size, d_model]
        '''
        x = x + self.pe[:x.size(0), :]
#         print(x.shape)
#         print(x)
        return self.dropout(x)

def get_attn_pad_mask(seq_q, seq_k):
    '''
    seq_q: [batch_size, seq_len]
    seq_k: [batch_size, seq_len]
    seq_len could be src_len or it could be tgt_len
    seq_len in seq_q and seq_len in seq_k maybe not equal
    '''
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k], False is masked
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # [batch_size, len_q, len_k]

def get_attn_subsequence_mask(seq):
    '''
    seq: [batch_size, tgt_len]
    '''
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequence_mask = np.triu(np.ones(attn_shape), k=1) # Upper triangular matrix
    subsequence_mask = torch.from_numpy(subsequence_mask).byte()
    return subsequence_mask # [batch_size, tgt_len, tgt_len]

In [12]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v] 
        attn_mask: [batch_size, n_heads, seq_len, seq_len]
        '''
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size, n_heads, len_q, len_k]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is True.
        
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V) # [batch_size, n_heads, len_q, d_v]
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)
    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v(=len_k), d_model]
        attn_mask: [batch_size, seq_len, seq_len]
        '''
        residual, batch_size = input_Q, input_Q.size(0)
        # (B, S, D) -proj-> (B, S, D_new) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # Q: [batch_size, n_heads, len_q, d_k]
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # K: [batch_size, n_heads, len_k, d_k]
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # V: [batch_size, n_heads, len_v(=len_k), d_v]

        # print(Q.shape, K.shape, V.shape)
        
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
        context, attn = ScaledDotProductAttention()(Q, K, V, attn_mask)
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v) # context: [batch_size, len_q, n_heads * d_v]
        ## transpose的功用和其与permute的差别：https://cloud.tencent.com/developer/article/1914024
        output = self.fc(context) # [batch_size, len_q, d_model]
        # return nn.LayerNorm(d_model).cuda()(output + residual), attn
        return nn.LayerNorm(d_model)(output + residual), attn

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff, bias=False),
            nn.ReLU(),
            nn.Linear(d_ff, d_model, bias=False)
        )
    def forward(self, inputs):
        '''
        inputs: [batch_size, seq_len, d_model]
        '''
        residual = inputs
        output = self.fc(inputs)
        # return nn.LayerNorm(d_model).cuda()(output + residual) # [batch_size, seq_len, d_model]
        return nn.LayerNorm(d_model)(output + residual) # [batch_size, seq_len, d_model]

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        '''
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]
        '''
        # enc_outputs: [batch_size, src_len, d_model], attn: [batch_size, n_heads, src_len, src_len]
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size, src_len, d_model]
        return enc_outputs, attn

class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        '''
        dec_inputs: [batch_size, tgt_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
        dec_enc_attn_mask: [batch_size, tgt_len, src_len]
        '''
        # dec_outputs: [batch_size, tgt_len, d_model], dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len]
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # dec_outputs: [batch_size, tgt_len, d_model], dec_enc_attn: [batch_size, h_heads, tgt_len, src_len]
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs) # [batch_size, tgt_len, d_model]
        return dec_outputs, dec_self_attn, dec_enc_attn

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):
        '''
        enc_inputs: [batch_size, src_len]
        '''
        enc_outputs = self.src_emb(enc_inputs) # [batch_size, src_len, d_model]
        enc_outputs = self.pos_emb(enc_outputs.transpose(0, 1)).transpose(0, 1) # [batch_size, src_len, d_model]
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs) # [batch_size, src_len, src_len]
        enc_self_attns = []
        for layer in self.layers:
            # enc_outputs: [batch_size, src_len, d_model], enc_self_attn: [batch_size, n_heads, src_len, src_len]
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        '''
        dec_inputs: [batch_size, tgt_len]
        enc_intpus: [batch_size, src_len]
        enc_outputs: [batsh_size, src_len, d_model]
        '''
        dec_outputs = self.tgt_emb(dec_inputs) # [batch_size, tgt_len, d_model]
        dec_outputs = self.pos_emb(dec_outputs.transpose(0, 1)).transpose(0, 1)#.cuda() # [batch_size, tgt_len, d_model]
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)#.cuda() # [batch_size, tgt_len, tgt_len]
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs)#.cuda() # [batch_size, tgt_len, tgt_len]
        # print(dec_self_attn_subsequence_mask)
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequence_mask), 0)#.cuda() # [batch_size, tgt_len, tgt_len]

        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs) # [batc_size, tgt_len, src_len]

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            # dec_outputs: [batch_size, tgt_len, d_model], dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len], dec_enc_attn: [batch_size, h_heads, tgt_len, src_len]
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns

class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder()#.cuda()
        self.decoder = Decoder()#.cuda()
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)#.cuda()
    def forward(self, enc_inputs, dec_inputs):
        '''
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        '''
        # tensor to store decoder outputs
        # outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        
        # enc_outputs: [batch_size, src_len, d_model], enc_self_attns: [n_layers, batch_size, n_heads, src_len, src_len]
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        
        # dec_outpus: [batch_size, tgt_len, d_model], dec_self_attns: [n_layers, batch_size, n_heads, tgt_len, tgt_len], dec_enc_attn: [n_layers, batch_size, tgt_len, src_len]
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        
        dec_logits = self.projection(dec_outputs) # dec_logits: [batch_size, tgt_len, tgt_vocab_size]
        
#         print(dec_outputs, dec_outputs.shape)
#         print(dec_logits, dec_logits.shape)
        
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns

## 训练

In [13]:
model = Transformer()#.cuda()
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)
# optimizer = torch.optim.Adam(modle.parameters(), lr=1e-3)  # 学习率1e-3

In [53]:
n_epoch = 1
for epoch in range(n_epoch):
    n_iter = 5
    for i, (enc_inputs, dec_inputs, dec_outputs) in enumerate(loader):
        '''
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        dec_outputs: [batch_size, tgt_len]
        '''
        enc_inputs, dec_inputs, dec_outputs = enc_inputs.long(), dec_inputs.long(), dec_outputs.long()
        # outputs: [batch_size * tgt_len, tgt_vocab_size]
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
        loss = criterion(outputs, dec_outputs.view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        torch.save(model.state_dict(), 'model_poet_2.pth')
        
        if i >= 10:
            break

Epoch: 0001 loss = 6.366102
Epoch: 0001 loss = 6.420018
Epoch: 0001 loss = 6.420385
Epoch: 0001 loss = 6.381531
Epoch: 0001 loss = 6.371104
Epoch: 0001 loss = 6.313852
Epoch: 0001 loss = 6.183871
Epoch: 0001 loss = 6.149397
Epoch: 0001 loss = 6.045694
Epoch: 0001 loss = 5.933379
Epoch: 0001 loss = 5.887478


## 试试效果

In [54]:
enc_num = 23
dec_num = 673

enc_sample = torch.Tensor([[enc_num]]).long()
dec_sample = torch.Tensor([[dec_num]]).long()

In [55]:
a_idx2word[enc_num], b_idx2word[dec_num]

('珠', '霄')

In [56]:
output = model(enc_sample, dec_sample)[0]
output_num = int(output.data[0].topk(1)[1][0])

In [57]:
b_idx2word[output_num]

'<EOP>'

In [58]:
def greedy_decoder(model, enc_input, start_symbol):
    """
    For simplicity, a Greedy Decoder is Beam search when K=1. This is necessary for inference as we don't know the
    target sequence input. Therefore we try to generate the target input word by word, then feed it into the transformer.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding
    :param model: Transformer Model
    :param enc_input: The encoder input
    :param start_symbol: The start symbol. In this example it is 'S' which corresponds to index 4
    :return: The target input
    """
    enc_outputs, enc_self_attns = model.encoder(enc_input)
    dec_input = torch.zeros(1, 0).type_as(enc_input.data)
    terminal = False
    next_symbol = start_symbol
    while not terminal:         
        dec_input = torch.cat([dec_input.detach(),torch.tensor([[next_symbol]],dtype=enc_input.dtype)],-1)
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        projected = model.projection(dec_outputs)
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[-1]
        next_symbol = next_word
        if next_symbol == b_word2idx["<EOP>"]: # tgt_vocab["."]:
            terminal = True
        print(next_word)            
    return dec_input

In [59]:
b_idx2word[623]

'<EOP>'

In [66]:
# Test
enc_inputs, _, _ = next(iter(loader))
enc_inputs = enc_inputs#.cuda()
for i in tqdm.tqdm(range(len(enc_inputs))):
    greedy_dec_input = greedy_decoder(model, enc_inputs[i].view(1, -1), start_symbol=b_word2idx["<START>"])
    predict, _, _, _ = model(enc_inputs[i].view(1, -1), greedy_dec_input)
    predict = predict.data.max(1, keepdim=True)[1]
#     print(predict.squeeze())
    print(enc_inputs[i], '->', [b_idx2word[n.item()] for n in predict.squeeze()])

  0%|                                                   | 0/128 [00:00<?, ?it/s]


tensor(623)


TypeError: iteration over a 0-d tensor

TypeError: iteration over a 0-d tensor 

爆这个错，说明只生成了一个字符。那就是说基本没用了。