In [1]:
import collections
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import d2l.torch as d2l

In [None]:
# Here we implement the encoder with a multilayer GRU recurrent neural network.

def init_seq2seq(module):
    """Apply Xavier-uniform initialization to the weights of one module.

    Only modules whose type is exactly ``nn.Linear`` or ``nn.GRU`` are
    touched; subclasses and every other module type are left unchanged.
    Bias parameters are never modified. The function takes a single module
    and returns nothing, so it can be passed to ``nn.Module.apply``.

    Args:
        module: The module whose weights should be (re)initialized in place.
    """
    module_type = type(module)
    if module_type is nn.Linear:
        nn.init.xavier_uniform_(module.weight)
    elif module_type is nn.GRU:
        # Walk the GRU's own parameters through the public API rather than
        # the private ``_flat_weights_names`` / ``_parameters`` attributes.
        for name, param in module.named_parameters(recurse=False):
            # Names containing "weight" (e.g. ``weight_ih_l0``) are the
            # weight matrices; the remaining parameters are biases.
            if "weight" in name:
                nn.init.xavier_uniform_(param)

class Seq2SeqEncoder(d2l.Encoder):
    """The RNN encoder for sequence-to-sequence learning.

    Token indices are mapped to embedding vectors, which are then fed
    through a GRU built with ``d2l.GRU``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each embedding vector (the GRU input size).
        num_hiddens: Number of hidden units passed to the GRU.
        num_layers: Number of layers passed to the GRU.
        dropout: Dropout value passed to the GRU (default 0).
        **kwargs: Forwarded unchanged to the ``d2l.Encoder`` constructor.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(embed_size, num_hiddens, num_layers, dropout)

    def forward(self, X, *args):
        """Encode a minibatch of token-index sequences.

        Args:
            X: Token indices of shape (batch_size, num_steps).
            *args: Accepted for interface compatibility; not used here.

        Returns:
            A pair ``(outputs, state)`` as produced by the GRU:
            ``outputs`` of shape (num_steps, batch_size, num_hiddens) and
            ``state`` of shape (num_layers, batch_size, num_hiddens).
        """
        # Transpose so that time is the leading axis, and cast to int64
        # because the embedding lookup expects integer indices.
        time_major_ids = X.t().long()
        # embedded shape: (num_steps, batch_size, embed_size)
        embedded = self.embedding(time_major_ids)
        enc_outputs, hidden_state = self.rnn(embedded)
        return enc_outputs, hidden_state

Below, we instantiate a two-layer GRU encoder with 16 hidden units. Given a minibatch of sequence inputs `X` (batch size $= 4$; number of time steps $= 9$), the hidden states of the final layer at all time steps (`enc_outputs`, returned by the encoder's recurrent layers) form a tensor of shape (number of time steps, batch size, number of hidden units).

In [None]:
# Model hyperparameters for the small encoder example.
vocab_size = 10
embed_size = 8
num_hiddens = 16  # hidden units per GRU layer
num_layers = 2    # two-layer GRU
# Dimensions of the example minibatch.
batch_size = 4
num_steps = 9