In [1]:
'''
2014年7月4日人工智能实训作业。
Task 1: 基于下面的代码实现训练penn数据集(可以取部分数据实现即可) ，并计算测试集PPL值。(注意：需要更改Postion Emb部分)
Task 2: 实现Beam Search解码方法
完成之后，请把代码和训练日志提交到: https://send2me.cn/DtYAmnVi/QJ68hwswVdVgDg
'''


'\n2014年7月4日人工智能实训作业。\nTask 1: 基于下面的代码实现训练penn数据集(可以取部分数据实现即可) ，并计算测试集PPL值。(注意：需要更改Postion Emb部分)\nTask 2: 实现Beam Search解码方法\n完成之后，请把代码和训练日志提交到: https://send2me.cn/DtYAmnVi/QJ68hwswVdVgDg\n'

In [2]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


In [3]:
# Window sizes (in tokens) used when sentences are cut into training examples.
src_len, tgt_len = 5, 5

# Transformer hyper-parameters.
d_model = 128   # width of the token / positional embeddings
d_ff = 512      # hidden width of the position-wise feed-forward net
d_k = 64        # per-head width of queries and keys
d_v = 64        # per-head width of values
n_layers = 6    # number of layers in the encoder and in the decoder
n_heads = 8     # attention heads per multi-head attention block

# Number of examples per batch.
batch_size = 512

In [4]:

def make_dict(path):
    """Build word <-> index vocabularies from a space-separated text file.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<unk_word>``,
    ``<sos>`` and ``<eos>``. Every other distinct word of the file is numbered
    from 4 upwards in sorted order, so the mapping is the same on every run
    (iterating a raw ``set`` of strings is not stable across interpreter
    sessions).

    A corpus word that is spelled like a special token is folded into that
    special token instead of receiving a second id. This keeps the ids
    contiguous, i.e. ``max(id) == len(vocab) - 1``, which callers rely on when
    they size an embedding table with ``len(vocab)``.

    Args:
        path: path of a UTF-8 encoded text file.

    Returns:
        ``(word2number_dict, number2word_dict)``: the word -> id mapping and
        its exact inverse.
    """
    special_tokens = ["<pad>", "<unk_word>", "<sos>", "<eos>"]

    with open(path, 'r', encoding='utf-8') as fr:
        lines = [line.strip() for line in fr]
    tokens = ' '.join(lines).split(" ")

    # Deterministic order; special tokens are removed so they are not numbered twice.
    words = sorted(set(tokens) - set(special_tokens))

    word2number_dict = {}
    number2word_dict = {}
    for index, word in enumerate(special_tokens + words):
        word2number_dict[word] = index
        number2word_dict[index] = word

    return word2number_dict, number2word_dict

In [5]:
train_path = './data/train.txt'  # the path of the train dataset
valid_path = './data/valid.txt'  # the path of the validation dataset
test_path = './data/test.txt'    # the path of the test dataset
test_psth = test_path            # misspelled original name, kept so existing references still work

In [6]:
# Source and target sides use the same vocabulary built from the training text,
# so the file is read and tokenised once and the mapping is copied for the
# target side. (This also avoids assigning to `_`, which IPython uses for the
# last cell output.)
src_vocab, number_dict = make_dict(train_path)
src_vocab_size = len(src_vocab)
tgt_vocab = dict(src_vocab)  # separate dict object with the same word -> id mapping
tgt_vocab_size = len(tgt_vocab)

In [7]:
def make_batch(path,src_vocab,tgt_vocab,batch_size):
    """Cut every line of a text file into fixed-size batches of training examples.

    A window of ``src_len + tgt_len`` words (both are module-level settings)
    slides over each line one word at a time. For every window position:

    * the first ``src_len`` words become the encoder input,
    * the following ``tgt_len`` words, prefixed with ``<sos>``, become the
      decoder input,
    * the same ``tgt_len`` words, suffixed with ``<eos>``, become the target.

    Words missing from a vocabulary are mapped to id 1 (``<unk_word>``). Lines
    with fewer than ``src_len + tgt_len`` words produce no example, and
    trailing examples that do not fill a whole batch are dropped.

    Args:
        path: path of a UTF-8 text file with whitespace-separated words.
        src_vocab: word -> id mapping used for the encoder side.
        tgt_vocab: word -> id mapping used for the decoder side.
        batch_size: number of examples per batch.

    Returns:
        Three ``torch.LongTensor`` objects ``(enc_inputs, dec_inputs, targets)``.
        With at least one full batch their shapes are
        ``[n_batches, batch_size, src_len]``,
        ``[n_batches, batch_size, tgt_len + 1]`` and
        ``[n_batches, batch_size, tgt_len + 1]``.
    """
    unk_id = 1  # id used for out-of-vocabulary words

    def encode(vocab, tokens):
        # An explicit lookup default replaces the former bare ``except``, which
        # silently turned *any* error into the unknown-word id.
        return [vocab.get(token, unk_id) for token in tokens]

    sos_id = tgt_vocab.get('<sos>', unk_id)
    eos_id = tgt_vocab.get('<eos>', unk_id)
    window = src_len + tgt_len

    enc_rows, dec_rows, target_rows = [], [], []
    with open(path,'r',encoding='utf-8') as f:
        for line in f:
            words = line.split()
            # A line shorter than one window yields an empty range and is skipped.
            for start in range(len(words) - window + 1):
                source_ids = encode(src_vocab, words[start:start + src_len])
                future_ids = encode(tgt_vocab, words[start + src_len:start + window])
                enc_rows.append(source_ids)
                dec_rows.append([sos_id] + future_ids)
                target_rows.append(future_ids + [eos_id])

    input_batchs, output_batchs, target_batchs = [], [], []
    # Only complete batches are emitted; the incomplete tail is discarded.
    for start in range(0, len(enc_rows) - batch_size + 1, batch_size):
        stop = start + batch_size
        input_batchs.append(enc_rows[start:stop])
        output_batchs.append(dec_rows[start:stop])
        target_batchs.append(target_rows[start:stop])

    return torch.LongTensor(input_batchs), torch.LongTensor(output_batchs), torch.LongTensor(target_batchs)

In [8]:

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    The scaling factor is taken from the last dimension of ``K`` rather than
    from a module-level constant, so the layer works for any head width.
    """

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q: queries, ``[..., len_q, d_k]``.
            K: keys, ``[..., len_k, d_k]``.
            V: values, ``[..., len_k, d_v]``.
            attn_mask: mask broadcastable to ``[..., len_q, len_k]``; positions
                that are true/non-zero are excluded from attention.

        Returns:
            ``(context, attn)`` where ``context`` is ``[..., len_q, d_v]`` and
            ``attn`` holds the attention weights ``[..., len_q, len_k]``.
        """
        scale = math.sqrt(K.size(-1))
        scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        # Out-of-place fill; masked positions get a large negative score so
        # their softmax weight is effectively zero.
        scores = scores.masked_fill(attn_mask.to(torch.bool), -1e9)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadAttention(nn.Module):
    """Multi-head attention with a residual connection and layer normalisation.

    Queries, keys and values are projected to ``n_heads`` heads of width
    ``d_k`` / ``d_v``, attended per head, merged, projected back to
    ``d_model`` and added to the query input before normalisation.
    """

    def __init__(self):
        super().__init__()
        qk_width = d_k * n_heads
        v_width = d_v * n_heads
        self.W_Q = nn.Linear(d_model, qk_width)
        self.W_K = nn.Linear(d_model, qk_width)
        self.W_V = nn.Linear(d_model, v_width)
        self.linear = nn.Linear(v_width, d_model)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        """Run attention.

        Args:
            Q: ``[batch, len_q, d_model]``.
            K: ``[batch, len_k, d_model]``.
            V: ``[batch, len_k, d_model]``.
            attn_mask: ``[batch, len_q, len_k]``; it is repeated for every head.

        Returns:
            ``(output, attn)`` with ``output`` of shape
            ``[batch, len_q, d_model]`` and ``attn`` of shape
            ``[batch, n_heads, len_q, len_k]``.
        """
        n_batch = Q.size(0)

        def split_heads(projected, head_width):
            # [batch, len, heads * width] -> [batch, heads, len, width]
            return projected.view(n_batch, -1, n_heads, head_width).transpose(1, 2)

        queries = split_heads(self.W_Q(Q), d_k)
        keys = split_heads(self.W_K(K), d_k)
        values = split_heads(self.W_V(V), d_v)

        # One copy of the mask per head: [batch, n_heads, len_q, len_k]
        per_head_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        attention = ScaledDotProductAttention()
        per_head_context, attn = attention(queries, keys, values, per_head_mask)

        # [batch, heads, len_q, d_v] -> [batch, len_q, heads * d_v]
        merged = per_head_context.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v)
        projected = self.linear(merged)
        return self.layer_norm(projected + Q), attn

class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward sub-layer with residual connection and LayerNorm.

    Implemented with two kernel-size-1 convolutions
    (``d_model -> d_ff -> d_model``) and a ReLU in between.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(d_model, d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(d_ff, d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        """Transform ``inputs`` of shape ``[batch, len, d_model]``; the shape is preserved."""
        # Conv1d expects channels before length: [batch, d_model, len]
        channels_first = inputs.transpose(1, 2)
        hidden = nn.functional.relu(self.conv1(channels_first))
        # Back to [batch, len, d_model] before the residual add.
        transformed = self.conv2(hidden).transpose(1, 2)
        return self.layer_norm(transformed + inputs)

class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention followed by the feed-forward sub-layer."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """Apply the block.

        Args:
            enc_inputs: ``[batch, len, d_model]``; used as query, key and value.
            enc_self_attn_mask: mask passed through to the self-attention.

        Returns:
            ``(enc_outputs, attn)``: the transformed sequence and the
            self-attention weights.
        """
        attended, attn = self.enc_self_attn(
            enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask
        )
        return self.pos_ffn(attended), attn

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, then feed-forward."""

    def __init__(self):
        super().__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """Apply the block.

        Args:
            dec_inputs: decoder-side sequence; query, key and value of the
                self-attention.
            enc_outputs: encoder output; key and value of the second attention.
            dec_self_attn_mask: mask for the decoder self-attention.
            dec_enc_attn_mask: mask for the encoder-decoder attention.

        Returns:
            ``(dec_outputs, dec_self_attn, dec_enc_attn)``.
        """
        self_attended, dec_self_attn = self.dec_self_attn(
            dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask
        )
        # Queries come from the decoder, keys/values from the encoder output.
        cross_attended, dec_enc_attn = self.dec_enc_attn(
            self_attended, enc_outputs, enc_outputs, dec_enc_attn_mask
        )
        return self.pos_ffn(cross_attended), dec_self_attn, dec_enc_attn

class Encoder(nn.Module):
    """Stack of ``n_layers`` encoder blocks over token + positional embeddings."""

    def __init__(self):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        # Frozen sinusoidal table with src_len + 1 rows.
        position_table = get_sinusoid_encoding_table(src_len+1, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(position_table, freeze=True)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):
        """Encode a batch of token ids.

        Args:
            enc_inputs: ``[batch, source_len]`` token ids.

        Returns:
            ``(enc_outputs, enc_self_attns)``: the final hidden states and a
            list with the self-attention weights of every layer.
        """
        # The original author marked this positional-embedding line as failing
        # ("pos_emb error"). Position ids run 0 .. source_len - 1, so they only
        # fit the table while source_len <= src_len + 1.
        positions = abs_position_encode(enc_inputs)
        hidden = self.src_emb(enc_inputs) + self.pos_emb(positions)
        pad_mask = get_attn_pad_mask(enc_inputs, enc_inputs)

        enc_self_attns = []
        for block in self.layers:
            hidden, layer_attn = block(hidden, pad_mask)
            enc_self_attns.append(layer_attn)
        return hidden, enc_self_attns

def abs_position_encode(inputs):
    """Return absolute position ids ``0 .. seq_len - 1`` for every row of ``inputs``.

    The ids are created directly on the device of ``inputs`` so they can be
    fed to an embedding that lives on the same device, and no Python-level
    loop over the batch is needed.

    Args:
        inputs: 2-D tensor of shape ``[batch_size, seq_len]``; only its shape
            and device are used.

    Returns:
        ``torch.int64`` tensor of shape ``[batch_size, seq_len]`` whose every
        row is ``[0, 1, ..., seq_len - 1]``.
    """
    batch_size, seq_len = inputs.size()
    positions = torch.arange(seq_len, dtype=torch.long, device=inputs.device)
    # repeat() materialises one independent row per batch element.
    return positions.unsqueeze(0).repeat(batch_size, 1)

class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder blocks over token + positional embeddings."""

    def __init__(self):
        super().__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        # Frozen sinusoidal table with tgt_len + 1 rows.
        position_table = get_sinusoid_encoding_table(tgt_len+1, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(position_table, freeze=True)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        """Decode against the encoder output.

        Args:
            dec_inputs: ``[batch, target_len]`` decoder token ids.
            enc_inputs: ``[batch, source_len]`` encoder token ids; only used to
                locate padding for the encoder-decoder attention mask.
            enc_outputs: hidden states produced by the encoder.

        Returns:
            ``(dec_outputs, dec_self_attns, dec_enc_attns)``; the two lists
            hold one attention tensor per layer.
        """
        positions = abs_position_encode(dec_inputs)
        hidden = self.tgt_emb(dec_inputs) + self.pos_emb(positions)

        # A position is hidden from self-attention if it is padding OR lies in the future.
        pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        future_mask = get_attn_subsequent_mask(dec_inputs)
        self_attn_mask = (pad_mask + future_mask) > 0

        # Padding in the source is hidden from the encoder-decoder attention.
        cross_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        dec_self_attns = []
        dec_enc_attns = []
        for block in self.layers:
            hidden, self_attn, cross_attn = block(hidden, enc_outputs, self_attn_mask, cross_attn_mask)
            dec_self_attns.append(self_attn)
            dec_enc_attns.append(cross_attn)
        return hidden, dec_self_attns, dec_enc_attns

class Transformer(nn.Module):
    """Encoder-decoder Transformer with a bias-free projection onto the target vocabulary."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)

    def forward(self, enc_inputs, dec_inputs):
        """Compute vocabulary logits for every decoder position.

        Args:
            enc_inputs: ``[batch, source_len]`` token ids.
            dec_inputs: ``[batch, target_len]`` token ids.

        Returns:
            ``(logits, enc_self_attns, dec_self_attns, dec_enc_attns)`` where
            ``logits`` is flattened to ``[batch * target_len, tgt_vocab_size]``
            and the remaining items are the per-layer attention lists.
        """
        memory, enc_self_attns = self.encoder(enc_inputs)
        decoded, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, memory)
        logits = self.projection(decoded)  # [batch, target_len, tgt_vocab_size]
        flat_logits = logits.view(-1, logits.size(-1))
        return flat_logits, enc_self_attns, dec_self_attns, dec_enc_attns


In [9]:

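# Special symbols used in this notebook (defined in make_dict):
# S = <sos>: symbol that marks the start of the decoder input
# E = <eos>: symbol that marks the end of the decoder target output
# P = <pad>: symbol that fills blank positions when a sequence is shorter than the number of time steps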

def get_sinusoid_encoding_table(n_position, d_model):
    """Build the fixed sinusoidal position-encoding table.

    Entry ``(pos, j)`` starts as ``pos / 10000 ** (2 * (j // 2) / d_model)``;
    even columns are then replaced by the sine and odd columns by the cosine
    of that angle.

    Args:
        n_position: number of positions (rows).
        d_model: encoding width (columns).

    Returns:
        ``torch.FloatTensor`` of shape ``[n_position, d_model]``.
    """
    # Columns 2i and 2i + 1 share the same divisor, so compute one per column up front.
    divisors = [np.power(10000, 2 * (col // 2) / d_model) for col in range(d_model)]

    angles = np.array(
        [[pos / divisor for divisor in divisors] for pos in range(n_position)]
    )
    angles[:, 0::2] = np.sin(angles[:, 0::2])  # even columns
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd columns
    return torch.FloatTensor(angles)

def get_attn_pad_mask(seq_q, seq_k):
    """Mask key positions that hold the padding id 0.

    Args:
        seq_q: ``[batch, len_q]`` token ids; only its length is used.
        seq_k: ``[batch, len_k]`` token ids that are checked for padding.

    Returns:
        Tensor of shape ``[batch, len_q, len_k]`` that is true wherever the
        key token is padding; the same row is shared by every query position.
    """
    _, len_q = seq_q.size()
    n_batch, len_k = seq_k.size()
    key_is_pad = seq_k.data.eq(0)          # [batch, len_k]
    per_query = key_is_pad.unsqueeze(1)    # [batch, 1, len_k]
    return per_query.expand(n_batch, len_q, len_k)

def get_attn_subsequent_mask(seq):
    """Build the look-ahead mask that hides future positions.

    The mask is created with torch on the device of ``seq`` instead of going
    through a float64 numpy array on the CPU, so it can be combined with other
    masks on that device.

    Args:
        seq: tensor whose first two dimensions are ``[batch, seq_len]``.

    Returns:
        ``torch.uint8`` tensor of shape ``[batch, seq_len, seq_len]`` with 1
        strictly above the diagonal (query ``i`` may not see key ``j > i``)
        and 0 elsewhere.
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    ones = torch.ones(seq_len, seq_len, dtype=torch.uint8, device=seq.device)
    upper = torch.triu(ones, diagonal=1)
    return upper.unsqueeze(0).repeat(batch_size, 1, 1)

# def showgraph(attn):
#     attn = attn[-1].squeeze(0)[0]
#     attn = attn.squeeze(0).data.numpy()
#     fig = plt.figure(figsize=(n_heads, n_heads)) # [n_heads, n_heads]
#     ax = fig.add_subplot(1, 1, 1)
#     ax.matshow(attn, cmap='viridis')
#     ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)
#     ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})
#     plt.show()

In [27]:
# Untrained model and the cross-entropy loss applied to its logits.
model = Transformer()
criterion = nn.CrossEntropyLoss()

# Cut the training text into batches of encoder inputs, decoder inputs and targets.
enc_inputs_batchs, dec_input_batchs, target_batchs = make_batch(
    train_path,
    src_vocab,
    tgt_vocab,
    batch_size,
)

print("batch_num: ", len(enc_inputs_batchs))

batch_num:  103


In [28]:
enc_inputs = enc_inputs_batchs[0]
print(enc_inputs.shape)

torch.Size([512, 5])


In [29]:
print(enc_inputs[0])

tensor([4154, 3277, 3511, 5655, 1869])


In [30]:
target_batch = target_batchs[0]
print(target_batch.shape)

torch.Size([512, 6])


In [31]:
print(target_batch[0])

tensor([4240, 4435, 4425, 5476, 5209,    3])


In [32]:
dec_inputs = dec_input_batchs[0]
print(dec_inputs.shape)

torch.Size([512, 6])


In [33]:
print(dec_inputs[0])

tensor([   2, 4240, 4435, 4425, 5476, 5209])


In [17]:
enc_outputs,enc_self_attens = model.encoder(enc_inputs)


In [22]:
print(type(enc_self_attens))
print(type(enc_outputs))

<class 'list'>
<class 'torch.Tensor'>


In [25]:
print(len(enc_self_attens),len(enc_self_attens[0]),len(enc_self_attens[0][0]))

6 512 8


In [26]:
print("enc_inputs shape:",enc_inputs.shape,"\t enc_outputs shape:",enc_outputs.shape)

enc_inputs shape: torch.Size([512, 5]) 	 enc_outputs shape: torch.Size([512, 5, 128])


In [34]:
start_token = tgt_vocab['<sos>']
end_token = tgt_vocab['<eos>']
print(start_token,end_token)

2 3


In [59]:
# Decoder input for the first decoding step: a [batch_size, 1] tensor filled with start_token.
# The dtype is spelled out because these values are token ids fed to an embedding
# lookup; relying on torch.full to infer it from the fill value is fragile.
dec_inputs_copy = torch.full((enc_inputs.shape[0], 1), start_token, dtype=torch.long)


In [103]:
sequences  = [[[(start_token,1)] for _ in range(enc_inputs.shape[0])]for _ in range(3)]
print(len(sequences))

3


In [104]:
print(len(sequences[0]))

512


In [99]:
# One candidate sequence per batch row, each starting with the pair (start_token, 1).
# NOTE(review): the second field looks like a running score for beam search — confirm.
# The row count follows the actual batch instead of a hard-coded 512.
sequences = [[(start_token, 1)] for _ in range(enc_inputs.shape[0])]

# Most recent token id of every candidate, shaped [batch, 1].
dec_inputs = torch.tensor([[seq[-1][0]] for seq in sequences])

In [100]:
dec_inputs[:5]

tensor([[2],
        [2],
        [2],
        [2],
        [2]])

In [101]:
len(sequences)

512

In [102]:
dec_inputs.shape

torch.Size([512, 1])

In [60]:

print(dec_inputs_copy.shape)


torch.Size([512, 1])


In [61]:
dec_outputs,dec_self_attens,dec_enc_attens = model.decoder(dec_inputs_copy,enc_inputs,enc_outputs)

In [62]:
print(dec_outputs.shape)

torch.Size([512, 1, 128])


In [63]:
dec_logits = model.projection(dec_outputs)

In [64]:
print(dec_logits.shape)

torch.Size([512, 1, 7615])


In [65]:
print(dec_logits[-1][-1][:5])

tensor([-0.3115,  1.1620, -0.4799, -0.3663, -0.0946], grad_fn=<SliceBackward0>)


In [66]:
log_probs = torch.nn.functional.log_softmax(dec_logits,dim=2)

In [67]:
print(log_probs[-1][-1][:5])

tensor([-9.4195, -7.9460, -9.5879, -9.4743, -9.2026], grad_fn=<SliceBackward0>)


In [68]:
print(log_probs.shape)

torch.Size([512, 1, 7615])


In [69]:
top_k_probs,top_k_idx = log_probs.topk(k = 5,dim = -1)

In [70]:
top_k_probs.shape

torch.Size([512, 1, 5])

In [71]:
top_k_idx.shape

torch.Size([512, 1, 5])

In [72]:
top_k_idx[-1][-1]

tensor([7344, 1942, 6491, 1920, 1114])

In [73]:
top_k_probs[-1][-1]

tensor([-7.0585, -7.1436, -7.1496, -7.3540, -7.3543],
       grad_fn=<SelectBackward0>)