In [2]:
import paddle
import paddle.nn.functional as F
import string
import re
import numpy as np

# Disable Paddle's static-graph mode; the training loop further down calls
# loss.backward() directly on tensors it has just computed.
paddle.disable_static()
# Print the Paddle build (version string and git commit) used for this run.
print(paddle.__version__)
print(paddle.__git_commit__)


0.0.0
7f2aa2db3c69cb9ebb8bae9e19280e75f964e1d0


In [3]:
# Download the English-Mandarin sentence pairs (`-c` resumes a partial download)
# and extract the archive. `unzip -o` extracts every member and overwrites
# without an interactive prompt; `-f` would only freshen files that already
# exist on disk, so on a first run cmn.txt would never be created.
!wget -c https://www.manythings.org/anki/cmn-eng.zip && unzip -o cmn-eng.zip

--2020-09-03 14:14:55--  https://www.manythings.org/anki/cmn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 172.67.173.198, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

Archive:  cmn-eng.zip


In [4]:
# Sanity check: count the lines in the extracted corpus file.
!wc -l cmn.txt

   23610 cmn.txt


In [5]:
# Length cap used throughout the notebook: a sentence pair is kept only when both
# sides have fewer than MAX_LEN tokens, and sentences are padded to MAX_LEN.
MAX_LEN = 10

In [6]:

# Read the tab-separated corpus. The context manager closes the file handle as
# soon as the text has been read.
with open('cmn.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')
words_re = re.compile(r'\w+')

# Build (english_tokens, chinese_tokens) pairs:
# - English is lower-cased and split into \w+ runs, which drops punctuation.
# - Chinese is not word-segmented; it is split into single characters.
pairs = []
for l in lines:
    # Column 0 is the English sentence and column 1 the Chinese sentence; any
    # further columns are ignored. A row with fewer than two columns still
    # raises ValueError rather than being skipped silently.
    en_sent, cn_sent = l.split('\t')[:2]
    pairs.append((words_re.findall(en_sent.lower()), list(cn_sent)))

# create a smaller dataset to make the demo process faster
filtered_pairs = []

for x in pairs:
    # Keep short pairs whose English side starts with a subject pronoun.
    # The leading `x[0]` check skips sentences that produced no word tokens,
    # for which x[0][0] would raise IndexError.
    if (x[0]
            and len(x[0]) < MAX_LEN
            and len(x[1]) < MAX_LEN
            and x[0][0] in ('i', 'you', 'he', 'she', 'we', 'they')):
        filtered_pairs.append(x)


print(len(filtered_pairs))
for x in filtered_pairs[:10]: print(x)

5508
(['i', 'won'], ['我', '赢', '了', '。'])
(['he', 'ran'], ['他', '跑', '了', '。'])
(['i', 'quit'], ['我', '退', '出', '。'])
(['i', 'm', 'ok'], ['我', '沒', '事', '。'])
(['i', 'm', 'up'], ['我', '已', '经', '起', '来', '了', '。'])
(['we', 'try'], ['我', '们', '来', '试', '试', '。'])
(['he', 'came'], ['他', '来', '了', '。'])
(['he', 'runs'], ['他', '跑', '。'])
(['i', 'agree'], ['我', '同', '意', '。'])
(['i', 'm', 'ill'], ['我', '生', '病', '了', '。'])


#  创建词表

- 英文都变成了小写，去掉了标点符号。
- 中文未做分词，按照字做的切分。

In [7]:
# Both vocabularies start with the same special tokens: padding,
# begin-of-sentence and end-of-sentence take ids 0, 1 and 2.
en_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
cn_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2}

# Ordinary tokens get consecutive ids starting at 3, in first-seen order.
en_idx, cn_idx = 3, 3

for en_tokens, cn_tokens in filtered_pairs:
    for token in en_tokens:
        if token in en_vocab:
            continue
        en_vocab[token] = en_idx
        en_idx += 1
    for token in cn_tokens:
        if token in cn_vocab:
            continue
        cn_vocab[token] = cn_idx
        cn_idx += 1

# Vocabulary sizes, special tokens included.
print(len(en_vocab))
print(len(cn_vocab))

2539
2039


# 创建padding过的数据集

In [8]:
# create padded datasets
padded_en_sents = []
padded_cn_sents = []
padded_cn_label_sents = []
for en, cn in filtered_pairs:
    # Source sentence: pad to MAX_LEN, then reverse, so the padding comes first
    # and the words follow in reverse order.
    padded_en_sent = en + ['<pad>'] * (MAX_LEN - len(en))
    padded_en_sent.reverse()
    # Decoder input: <bos>, the sentence, then padding (MAX_LEN + 1 positions).
    padded_cn_sent = ['<bos>'] + cn +  ['<pad>'] * (MAX_LEN - len(cn))
    # Decoder target: the decoder input shifted left by one position. <eos> sits
    # directly after the last real character so that it marks the end of the
    # sentence; padding fills the remaining positions (MAX_LEN + 1 in total).
    padded_cn_label_sent = cn + ['<eos>'] + ['<pad>'] * (MAX_LEN - len(cn))

    # Map tokens to their vocabulary ids.
    padded_en_sents.append([en_vocab[w] for w in padded_en_sent])
    padded_cn_sents.append([cn_vocab[w] for w in padded_cn_sent])
    padded_cn_label_sents.append([cn_vocab[w] for w in padded_cn_label_sent])

train_en_sents = np.array(padded_en_sents)
train_cn_sents = np.array(padded_cn_sents)
train_cn_label_sents = np.array(padded_cn_label_sents)


print(train_en_sents.shape)
print(train_cn_sents.shape)
print(train_cn_label_sents.shape)

(5508, 10)
(5508, 11)
(5508, 11)


# 创建网络

In [9]:
# Model dimensions.
embedding_size = 128
hidden_size = 256
num_encoder_lstm_layers = 1

# Vocabulary sizes (special tokens included) size the embedding tables and the
# decoder's output layer.
en_vocab_size = len(en_vocab)
cn_vocab_size = len(cn_vocab)

# Training schedule.
epochs = 30
batch_size = 16

In [10]:
# encoder: simply learn representation of source sentence
class Encoder(paddle.nn.Layer):
    """Embed source token ids and run them through an LSTM.

    Layer sizes are taken from the module-level settings ``en_vocab_size``,
    ``embedding_size``, ``hidden_size`` and ``num_encoder_lstm_layers``.
    """

    def __init__(self):
        super().__init__()
        self.emb = paddle.nn.Embedding(size=[en_vocab_size, embedding_size])
        self.lstm = paddle.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_encoder_lstm_layers,
            dropout=0.5,
        )

    def forward(self, x):
        """Return the LSTM output sequence for token ids ``x``.

        The final hidden and cell states produced by the LSTM are discarded.
        """
        embedded = self.emb(x)
        outputs, (_, _) = self.lstm(embedded)
        return outputs

In [11]:
# only move one step of LSTM,
# the recurrent loop is implemented inside training loop
class AttentionDecoder(paddle.nn.Layer):
    """A single decoding step with additive attention over the encoder outputs.

    Each call embeds the current target token, scores every encoder position
    against the previous hidden state, builds a context vector from those
    scores, and advances a one-layer LSTM by one step. The caller owns the
    loop over target positions and threads ``hidden``/``cell`` between calls.
    """

    def __init__(self):
        super(AttentionDecoder, self).__init__()
        self.emb = paddle.nn.Embedding(size=[cn_vocab_size, embedding_size],)

        # the lstm layer used to generate the target sentence representation;
        # its input is the token embedding concatenated with the context vector
        self.lstm = paddle.nn.LSTM(input_size=embedding_size + hidden_size,
                                   hidden_size=hidden_size,
                                   dropout=0.5)

        # for computing attention weights
        self.attention_linear1 = paddle.nn.Linear(hidden_size * 2, hidden_size)
        self.attention_linear2 = paddle.nn.Linear(hidden_size, 1)

        # for computing output logits
        self.outlinear =paddle.nn.Linear(hidden_size, cn_vocab_size)


    def forward(self, x, previous_hidden, previous_cell, encoder_outputs):
        """Advance the decoder by one target position.

        Args:
            x: ids of the current input token; the loops in this notebook pass
                a [batch, 1] slice.
            previous_hidden: hidden state from the previous step, laid out as
                [batch, 1, hidden_size] by the callers in this notebook.
            previous_cell: cell state from the previous step, same layout.
            encoder_outputs: encoder output sequence, used here as
                [batch, source_len, hidden_size].

        Returns:
            ``(output, (hidden, cell))`` where ``output`` holds the vocabulary
            logits for this step with the step axis removed, and ``hidden`` /
            ``cell`` are the new states in the same layout as the inputs.
        """
        x = self.emb(x)

        # Repeat the previous hidden state once per encoder position. The count
        # is read from encoder_outputs so the layer does not depend on the
        # global MAX_LEN.
        source_len = encoder_outputs.shape[1]
        attention_inputs = paddle.concat((encoder_outputs,
                                      paddle.tile(previous_hidden, repeat_times=[1, source_len, 1])),
                                      axis=-1
                                     )

        attention_hidden = self.attention_linear1(attention_inputs)
        attention_hidden = F.tanh(attention_hidden)
        attention_logits = self.attention_linear2(attention_hidden)
        # Drop only the trailing size-1 score axis. An axis-less squeeze would
        # also remove the batch axis whenever the batch holds a single example.
        attention_logits = paddle.squeeze(attention_logits, axis=-1)


        # Normalise the scores over the encoder positions (last axis).
        attention_weights = F.softmax(attention_logits)
        attention_weights = paddle.expand_as(paddle.unsqueeze(attention_weights, -1),
                                             encoder_outputs)

        # Weighted sum of the encoder outputs over the position axis, then
        # restore a length-1 step axis so it can be concatenated with x.
        context_vector = paddle.multiply(encoder_outputs, attention_weights)
        context_vector = paddle.reduce_sum(context_vector, 1)
        context_vector = paddle.unsqueeze(context_vector, 1)

        lstm_input = paddle.concat((x, context_vector), axis=-1)


        # The LSTM takes its initial states with the first two axes swapped
        # relative to how the callers store them, so transpose before the call.
        previous_hidden = paddle.transpose(previous_hidden, [1, 0, 2])
        previous_cell = paddle.transpose(previous_cell, [1, 0, 2])

        x, (hidden, cell) = self.lstm(lstm_input, (previous_hidden, previous_cell))

        # change the returned states back to the callers' batch-first layout
        hidden = paddle.transpose(hidden, [1, 0, 2])
        cell = paddle.transpose(cell, [1, 0, 2])

        output = self.outlinear(hidden)
        # Remove only the length-1 middle axis; the batch axis is kept even
        # when the batch holds a single example.
        output = paddle.squeeze(output, axis=1)
        return output, (hidden, cell)

In [14]:
encoder = Encoder()
atten_decoder = AttentionDecoder()

# A single Adam optimizer updates the parameters of both networks.
opt = paddle.optimizer.Adam(learning_rate=0.001,
                            parameters=encoder.parameters()+atten_decoder.parameters())

# Only full batches are used; a trailing partial batch is left out each epoch.
num_batches = train_en_sents.shape[0] // batch_size
# Decoder inputs and labels hold MAX_LEN + 1 positions per sentence.
decode_steps = MAX_LEN + 1

for epoch in range(epochs):
    print("epoch:{}".format(epoch))

    # Apply one random permutation to all three arrays so rows stay aligned.
    perm = np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled = train_en_sents[perm]
    train_cn_sents_shuffled = train_cn_sents[perm]
    train_cn_label_sents_shuffled = train_cn_label_sents[perm]

    for iteration in range(num_batches):
        batch_rows = slice(batch_size * iteration, batch_size * (iteration + 1))

        # Encode the source sentences of this batch.
        x_data = train_en_sents_shuffled[batch_rows]
        sent = paddle.to_tensor(x_data)
        en_repr = encoder(sent)

        x_cn_data = train_cn_sents_shuffled[batch_rows]
        x_cn_label_data = train_cn_label_sents_shuffled[batch_rows]

        # Initial decoder states: batch * num_layer(=1 in this example) * hidden_size
        hidden = paddle.zeros([batch_size, 1, hidden_size])
        cell = paddle.zeros([batch_size, 1, hidden_size])

        # Feed the reference token at every position and accumulate the mean
        # cross-entropy of each step.
        loss = paddle.zeros([1])
        for i in range(decode_steps):
            cn_word = paddle.to_tensor(x_cn_data[:, i:i+1])
            cn_word_label = paddle.to_tensor(x_cn_label_data[:, i:i+1])

            logits, (hidden, cell) = atten_decoder(cn_word, hidden, cell, en_repr)
            loss += paddle.mean(F.softmax_with_cross_entropy(logits, cn_word_label))

        # Average over the decoding steps.
        loss = loss / decode_steps
        if iteration % 200 == 0:
            print("iter {}, loss:{}".format(iteration, loss.numpy()))

        # Backpropagate, apply the update, then reset gradients on both networks.
        loss.backward()
        opt.minimize(loss)
        encoder.clear_gradients()
        atten_decoder.clear_gradients()

epoch:0
iter 0, loss:[7.6200185]
iter 200, loss:[3.4169505]
epoch:1
iter 0, loss:[3.1581175]
iter 200, loss:[3.3032415]
epoch:2
iter 0, loss:[3.0002146]
iter 200, loss:[3.2185385]
epoch:3
iter 0, loss:[2.9653757]
iter 200, loss:[3.098806]
epoch:4
iter 0, loss:[2.6793027]
iter 200, loss:[2.5913079]
epoch:5
iter 0, loss:[2.6999655]
iter 200, loss:[2.379569]
epoch:6
iter 0, loss:[2.5435457]
iter 200, loss:[2.748782]
epoch:7
iter 0, loss:[2.2716467]
iter 200, loss:[2.608843]
epoch:8
iter 0, loss:[2.406243]
iter 200, loss:[2.0575686]
epoch:9
iter 0, loss:[1.8435733]
iter 200, loss:[2.2846653]
epoch:10
iter 0, loss:[1.7847126]
iter 200, loss:[1.9135032]
epoch:11
iter 0, loss:[1.7565953]
iter 200, loss:[1.8443459]
epoch:12
iter 0, loss:[1.4571258]
iter 200, loss:[1.7388061]
epoch:13
iter 0, loss:[1.4517817]
iter 200, loss:[1.6169605]
epoch:14
iter 0, loss:[1.4762214]
iter 200, loss:[1.4081928]
epoch:15
iter 0, loss:[1.5862186]
iter 200, loss:[1.2722157]
epoch:16
iter 0, loss:[1.2187248]
iter 

# try the model

In [16]:
# Put both networks in evaluation mode before decoding.
encoder.eval()
atten_decoder.eval()

num_of_exampels_to_evaluate = 10

# Pick distinct random training sentences and encode them.
indices = np.random.choice(len(train_en_sents),  num_of_exampels_to_evaluate, replace=False)
x_data = train_en_sents[indices]
sent = paddle.to_tensor(x_data)
en_repr = encoder(sent)

# Every sentence starts decoding from <bos>.
word = np.array(
    [[cn_vocab['<bos>']]] * num_of_exampels_to_evaluate
)
word = paddle.to_tensor(word)

hidden = paddle.zeros([num_of_exampels_to_evaluate, 1, hidden_size])
cell = paddle.zeros([num_of_exampels_to_evaluate, 1, hidden_size])

# Greedy decoding: at each step take the highest-scoring token and feed it back
# in as the next input.
decoded_sent = []
for i in range(MAX_LEN + 1):
    logits, (hidden, cell) = atten_decoder(word, hidden, cell, en_repr)

    word = paddle.argmax(logits, axis=1)
    decoded_sent.append(word.numpy())
    word = paddle.unsqueeze(word, axis=-1)

# One row of token ids per example, one column per decoding step.
results = np.stack(decoded_sent, axis=1)

# id -> token lookup, built once. Ids were assigned in insertion order, so the
# position in the key list equals the id.
cn_id_to_token = list(cn_vocab)

for i in range(num_of_exampels_to_evaluate):
    en_input = " ".join(filtered_pairs[indices[i]][0])
    ground_truth_translate = "".join(filtered_pairs[indices[i]][1])
    model_translate = ""
    for k in results[i]:
        w = cn_id_to_token[k]
        # <eos> ends the translation; anything generated after it is ignored.
        if w == '<eos>':
            break
        if w != '<pad>':
            model_translate += w
    print(en_input)
    print("true: {}".format(ground_truth_translate))
    print("pred: {}".format(model_translate))

i d like to go abroad one day
true: 我想有一天去国外。
pred: 我想要去國国去。
i can t believe it
true: 我没法相信！
pred: 我不能相信。
you were treated unfairly
true: 你被不公平地對待。
pred: 你被测会離開了。
you are a good person
true: 你是一個好人。
pred: 你是一個好人。
they re self sufficient
true: 他们自给自足。
pred: 他们自己很強壯。
i ll see you next wednesday
true: 我下星期三见你。
pred: 我下週去看你。
i am interested in english
true: 我对英语感兴趣。
pred: 我对英语感兴趣。
they were so young
true: 他們是那麼年輕。
pred: 他們都是艺术家。
you re quite right
true: 你是對的。
pred: 你是對的。
i have a cough
true: 我咳嗽。
pred: 我愛了。


# The End

Have fun with Paddle.