#  Sequence to Sequence 

In [24]:
import sys
sys.path.insert(0, '..')

import time
from mxnet import nd, init, gluon, autograd
from mxnet.gluon import nn, rnn, loss as gloss
import d2l
import pandas as pd

## Encoder

In [14]:
class TwoSeqEncoder(nn.Block):
    """Siamese LSTM encoder that encodes two token sequences with shared weights.

    Both sequences go through the same embedding layer and the same LSTM, so
    their encodings live in the same space and can be compared directly.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimension of each token embedding.
        num_hiddens: number of hidden units of the LSTM.
        num_layers: number of stacked LSTM layers.
        dropout: dropout rate forwarded to ``rnn.LSTM`` (default 0).
        **kwargs: forwarded to ``nn.Block``.
    """
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(TwoSeqEncoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)

    def _encode_sequence(self, tokens):
        """Encode one batch of token indices and return the last-step RNN output.

        ``tokens`` is expected to be (batch_size, seq_len); the result is the
        RNN output at the final time step, i.e. (batch_size, num_hiddens).
        """
        # Embedding output is (batch_size, seq_len, embed_size); the RNN needs
        # time on the first axis, hence the swap to (seq_len, batch_size, embed_size).
        embedded = self.embedding(tokens).swapaxes(0, 1)
        state = self.rnn.begin_state(batch_size=embedded.shape[1],
                                     ctx=embedded.context)
        # outputs is (seq_len, batch_size, num_hiddens); the final state (hidden
        # state and memory cell of the last step) is not needed here.
        outputs, _ = self.rnn(embedded, state)
        return outputs[-1]

    def forward(self, X, *args):
        """Encode the pair ``X = [left, right]`` and return both encodings.

        Returns:
            A tuple ``(left_encoding, right_encoding)``, each being the
            last-time-step RNN output for the corresponding input.
        """
        left_input, right_input = X[0], X[1]
        return (self._encode_sequence(left_input),
                self._encode_sequence(right_input))

Sanity check

In [15]:
# Build a small encoder and run it on two dummy batches of 4 sequences each,
# with different sequence lengths (7 and 10).
encoder = TwoSeqEncoder(vocab_size=10, embed_size=8,
                         num_hiddens=16, num_layers=2)
encoder.initialize()
X = nd.zeros((4, 7))
Y = nd.ones((4, 10))
left_output, right_output = encoder([X, Y])
# The encoder returns one NDArray per input sequence: the RNN output at the
# last time step only (not the full history and not the RNN state).
left_output, right_output

(
 [[-2.2143868e-04  3.5002387e-05  2.7029600e-04  2.8731427e-05
   -5.2404794e-05  1.4110740e-05 -2.0850824e-04  1.0625856e-04
    1.5787718e-04  1.1166865e-04 -3.7516581e-04  1.7297745e-04
   -6.5742475e-05  6.8289250e-05 -2.2307977e-04 -5.6489898e-05]
  [-2.2143868e-04  3.5002387e-05  2.7029600e-04  2.8731427e-05
   -5.2404794e-05  1.4110740e-05 -2.0850824e-04  1.0625856e-04
    1.5787718e-04  1.1166865e-04 -3.7516581e-04  1.7297745e-04
   -6.5742475e-05  6.8289250e-05 -2.2307977e-04 -5.6489898e-05]
  [-2.2143868e-04  3.5002387e-05  2.7029600e-04  2.8731427e-05
   -5.2404794e-05  1.4110740e-05 -2.0850824e-04  1.0625856e-04
    1.5787718e-04  1.1166865e-04 -3.7516581e-04  1.7297745e-04
   -6.5742475e-05  6.8289250e-05 -2.2307977e-04 -5.6489898e-05]
  [-2.2143868e-04  3.5002387e-05  2.7029600e-04  2.8731427e-05
   -5.2404794e-05  1.4110740e-05 -2.0850824e-04  1.0625856e-04
    1.5787718e-04  1.1166865e-04 -3.7516581e-04  1.7297745e-04
   -6.5742475e-05  6.8289250e-05 -2.2307977e-04 -5

In [16]:
def exp_neg_abs_sim(left_embed, right_embed):
    """Return exp(-L1 distance) between the two inputs, reduced along axis 1.

    The absolute differences are summed over axis 1 with ``keepdims=True``,
    so the reduced axis is kept with size 1. Identical inputs give 1, and the
    value decays towards 0 as the L1 distance grows.
    """
    l1_distance = nd.sum(nd.abs(left_embed - right_embed),
                         axis=1, keepdims=True)
    return nd.exp(-l1_distance)

In [17]:
# Wraps exp_neg_abs_sim as a layer taking a pair [left, right]. Despite the
# name, the output is a similarity exp(-L1 distance), not the distance itself.
ManhattanDistance = nn.Lambda(
    lambda pair: exp_neg_abs_sim(pair[0], pair[1]))

In [18]:
# Similarity between the two sanity-check encodings: one value per batch row.
ManhattanDistance([left_output, right_output])


[[0.9971166]
 [0.9971166]
 [0.9971166]
 [0.9971166]]
<NDArray 4x1 @cpu(0)>

## Load data
This part is similar to the one in keras_model.

## Model building

In [19]:
# Siamese model: the shared two-sequence encoder followed by the similarity layer.
# NOTE(review): num_layers=16 is much deeper than the num_layers=2 used in the
# sanity check and in the training hyperparameters — confirm this is intended.
siamese_encoder = TwoSeqEncoder(vocab_size=1000, embed_size=16,
                                num_hiddens=32, num_layers=16)
model = nn.Sequential()
model.add(siamese_encoder, ManhattanDistance)
model.initialize()
print(model)
# Forward the dummy batches from the sanity check through the full model.
model([X, Y])

Sequential(
  (0): TwoSeqEncoder(
    (embedding): Embedding(1000 -> 16, float32)
    (rnn): LSTM(None -> 32, TNC, num_layers=16)
  )
  (1): Lambda(<lambda>)
)



[[1.]
 [1.]
 [1.]
 [1.]]
<NDArray 4x1 @cpu(0)>

## Training

In [22]:
def train_ch7(model, data_iter, lr, num_epochs, ctx):  # Saved in d2l
    """Train the two-sequence similarity model with Adam and an L1 loss.

    Args:
        model: Block called as ``model([left, right])`` that returns one
            prediction per example.
        data_iter: re-iterable of ``(left, right, label)`` batches; it is
            traversed once per epoch.
        lr: learning rate for the Adam optimizer.
        num_epochs: number of passes over ``data_iter``.
        ctx: device context the parameters and batches are placed on.

    Side effects:
        Re-initializes the model parameters (Xavier, ``force_reinit=True``),
        updates them in place, and prints the mean per-example loss and the
        elapsed time every 50 epochs.
    """
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {'learning_rate': lr})
    # L1Loss is a Block class: it has to be instantiated before it can be
    # called on (prediction, label).
    loss = gloss.L1Loss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_seen = 0.0, 0
        for batch in data_iter:
            left, right, label = [x.as_in_context(ctx) for x in batch]
            with autograd.record():
                predict = model([left, right])
                l = loss(predict, label)  # one loss value per example
            l.backward()
            d2l.grad_clipping_gluon(model, 5, ctx)
            # The loss is not averaged, so let the trainer normalise the
            # summed gradient by the actual number of examples in this batch.
            batch_examples = label.shape[0]
            trainer.step(batch_examples)
            # Accumulate a Python float so it can be formatted with %.3f.
            l_sum += l.sum().asscalar()
            num_seen += batch_examples
        if epoch % 50 == 0:
            # max(..., 1) guards against an empty data_iter.
            print("epoch %d, loss %.3f, time %.1f sec" % (
                epoch, l_sum / max(num_seen, 1), time.time()-tic))
            tic = time.time()
            

Train the model

In [26]:
# Model size
embed_size = 32
num_hiddens = 32
num_layers = 2
dropout = 0.0

# Data settings
batch_size = 64
num_examples = 1e3  # float literal; NOTE(review): cast to int if used as a count or index
max_len = 10

# Optimisation settings
lr = 0.005
num_epochs = 300
ctx = d2l.try_gpu()
# load data


Unnamed: 0,is_spoiler,movie_id,review_text,plot_summary
0,False,tt2404463,Terrible How can you watch this the whole way ...,"Sarah Ashburn, an FBI agent, is extremely ambi..."
1,False,tt0878804,A feel good movie without the mush.. I went t...,Based on the true story of Leigh Anne Tuohy an...
2,False,tt0116191,Woefully bad Reading through all these positiv...,Emma Woodhouse is a congenial young lady who d...
3,True,tt0773262,"This show is a serial killer..! All right, set...","Dexter Morgan, Miami Metro Police Department b..."
4,False,tt2226597,"Like Any ""Mountain"", It Just Feels Unmovable S...","Stranded after a tragic plane crash, two stran..."


## Predicting

In [13]:
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, ctx):
    """Greedily decode ``src_sentence`` with an encoder-decoder ``model``.

    The sentence is lower-cased, split on single spaces, mapped to indices via
    ``src_vocab`` and padded with ``src_vocab.pad`` up to ``max_len``. Decoding
    starts from ``tgt_vocab.bos`` and runs for at most ``max_len`` steps,
    stopping early when ``tgt_vocab.eos`` is predicted.

    NOTE(review): this requires ``model.encoder`` and ``model.decoder``
    (with ``init_state``); the ``nn.Sequential`` similarity model built
    earlier in this notebook does not define them — confirm which model this
    cell is meant for.

    Returns:
        The predicted target tokens joined by single spaces.
    """
    token_ids = src_vocab[src_sentence.lower().split(' ')]
    num_tokens = len(token_ids)
    pad_count = max_len - num_tokens
    if pad_count > 0:
        token_ids += [src_vocab.pad] * pad_count
    enc_X = nd.array(token_ids, ctx=ctx)
    enc_valid_length = nd.array([num_tokens], ctx=ctx)
    # expand_dims adds the batch_size dimension in front.
    enc_outputs = model.encoder(enc_X.expand_dims(axis=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = nd.array([tgt_vocab.bos], ctx=ctx).expand_dims(axis=0)
    predict_tokens = []
    for _ in range(max_len):
        scores, dec_state = model.decoder(dec_X, dec_state)
        # Greedy search: the highest-scoring token becomes the next input.
        dec_X = scores.argmax(axis=2)
        next_token = dec_X.squeeze(axis=0).astype('int32').asscalar()
        if next_token == tgt_vocab.eos:
            break
        predict_tokens.append(next_token)
    target_words = tgt_vocab.to_tokens(predict_tokens)
    return ' '.join(target_words)

Try several examples:

In [14]:
# Translate a few short sentences and print each one next to its prediction.
for sentence in ('Go .', 'Wow !', "I'm OK .", 'I won !'):
    translation = translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx)
    print(sentence + ' => ' + translation)

Go . => va !
Wow ! => <unk> !
I'm OK . => je vais bien .
I won ! => je l'ai emporté !
