#  Sequence to Sequence 

In [8]:
import sys
sys.path.insert(0, '..')

import time
from mxnet import nd, init, gluon, autograd
from mxnet.gluon import nn, rnn, loss as gloss
import d2l

## Encoder

In [162]:
class TwoSeqEncoder(nn.Block):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(TwoSeqEncoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)
        
    def forward(self, left_input, right_input):
        print('in encoder')
        X_left = self.embedding(left_input) # X shape: (batch_size, seq_len, embed_size)
        X_left = X_left.swapaxes(0, 1)  # RNN needs first axes to be time
        left_state = self.rnn.begin_state(batch_size=X_left.shape[1], \
                                               ctx=X_left.context)
        left_out, _ = self.rnn(X_left, left_state)
        
        X_right = self.embedding(right_input) # X shape: (batch_size, seq_len, embed_size)
        X_right = X_right.swapaxes(0, 1)  # RNN needs first axes to be time
        right_state = self.rnn.begin_state(batch_size=X_right.shape[1], \
                                               ctx=X_right.context)
        right_out, _ = self.rnn(X_right, right_state)
        # The shape of out is (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell
        # of the last time step, the shape is (num_layers, batch_size, num_hiddens)
        return left_out[:][-1][:], right_out[:][-1][:]

Sanity check

In [166]:
encoder = TwoSeqEncoder(vocab_size=10, embed_size=8,
                         num_hiddens=16, num_layers=2)
encoder.initialize()
X = nd.zeros((4, 7))
Y = nd.ones((4, 10))
left_output, right_output = encoder(X, Y)
# after running the encoder, output and state are both a list
# which contains the whole output and state of each time state in history
left_output, right_output

in encoder


(
 [[-9.3526716e-05 -4.9678278e-05  1.4228432e-05 -1.6644204e-04
    9.7964104e-05 -1.9300998e-04  1.5927634e-04 -2.7911537e-05
    1.8589392e-04 -7.2337089e-06 -3.2081801e-04 -1.8444611e-05
   -1.9077418e-04 -1.6275543e-04 -1.9651403e-05 -5.4234817e-05]
  [-9.3526716e-05 -4.9678278e-05  1.4228432e-05 -1.6644204e-04
    9.7964104e-05 -1.9300998e-04  1.5927634e-04 -2.7911537e-05
    1.8589392e-04 -7.2337089e-06 -3.2081801e-04 -1.8444611e-05
   -1.9077418e-04 -1.6275543e-04 -1.9651403e-05 -5.4234817e-05]
  [-9.3526716e-05 -4.9678278e-05  1.4228432e-05 -1.6644204e-04
    9.7964104e-05 -1.9300998e-04  1.5927634e-04 -2.7911537e-05
    1.8589392e-04 -7.2337089e-06 -3.2081801e-04 -1.8444611e-05
   -1.9077418e-04 -1.6275543e-04 -1.9651403e-05 -5.4234817e-05]
  [-9.3526716e-05 -4.9678278e-05  1.4228432e-05 -1.6644204e-04
    9.7964104e-05 -1.9300998e-04  1.5927634e-04 -2.7911537e-05
    1.8589392e-04 -7.2337089e-06 -3.2081801e-04 -1.8444611e-05
   -1.9077418e-04 -1.6275543e-04 -1.9651403e-05 -5

In [158]:
def exp_neg_abs_sim(left_embed, right_embed):
    return nd.exp(-nd.sum(nd.abs(left_embed-right_embed), axis=1, keepdims=True))

In [159]:
ManhattanDistance = nn.Lambda(lambda x: exp_neg_abs_sim(x[0], x[1]))

In [160]:
ManhattanDistance([left_output, right_output])


[[0.99716175]
 [0.99716175]
 [0.99716175]
 [0.99716175]]
<NDArray 4x1 @cpu(0)>

## Model building

In [169]:
model = nn.Sequential()
model.add(TwoSeqEncoder(vocab_size=2, embed_size=10, num_hiddens=32, num_layers=16))
model.initialize()
print(model)
model(X, Y)

Sequential(
  (0): TwoSeqEncoder(
    (embedding): Embedding(2 -> 10, float32)
    (rnn): LSTM(None -> 32, TNC, num_layers=16)
  )
)


TypeError: forward() takes 2 positional arguments but 3 were given

## Training

In [11]:
def train_ch7(model, data_iter, lr, num_epochs, ctx):  # Saved in d2l
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.as_in_context(ctx) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping_gluon(model, 5, ctx)
            num_tokens = Y_vlen.sum().asscalar()
            trainer.step(num_tokens)
            l_sum += l.sum().asscalar()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch %d, loss %.3f, time %.1f sec" % (
                epoch, l_sum/num_tokens_sum, time.time()-tic))
            tic = time.time()
            

Train the model

In [12]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()

src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
    batch_size, max_len, num_examples)
encoder = Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)

epoch 50, loss 0.120, time 10.2 sec
epoch 100, loss 0.066, time 10.4 sec
epoch 150, loss 0.041, time 10.3 sec
epoch 200, loss 0.031, time 10.3 sec
epoch 250, loss 0.028, time 10.0 sec
epoch 300, loss 0.025, time 9.5 sec


## Predicting

In [13]:
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, ctx):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = nd.array(src_tokens, ctx=ctx)
    enc_valid_length = nd.array([src_len], ctx=ctx)
    # use expand_dim to add the batch_size dimension.
    enc_outputs = model.encoder(enc_X.expand_dims(axis=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = nd.array([tgt_vocab.bos], ctx=ctx).expand_dims(axis=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next time step input. 
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').asscalar()
        if py == tgt_vocab.eos:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))

Try several examples:

In [14]:
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx))

Go . => va !
Wow ! => <unk> !
I'm OK . => je vais bien .
I won ! => je l'ai emporté !
