**The Baseline code with attention and LSTM instead of GRU**

In [1]:
!pip install d2l==1.0.0a0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting d2l==1.0.0a0
  Downloading d2l-1.0.0a0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting qtconsole
  Downloading qtconsole-5.4.2-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting qtpy>=2.0.1
  Downloading QtPy-2.3.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.9/84.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: qtpy, jed

In [2]:
!pip install matplotlib
%matplotlib inline
!pip install matplotlib-inline
import sys
!{sys.executable} -m pip install matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import collections
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch.optim as optim
import d2l
from d2l import torch as d2l
import torch
from torch import nn
from torch import optim

In [4]:
# Define the attention mechanism
class BahdanauAttention(nn.Module):
    def __init__(self, num_hiddens, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        self.W_a = nn.Linear(num_hiddens, num_hiddens, bias=False)
        self.W_b = nn.Linear(num_hiddens, num_hiddens)
        self.v = nn.Linear(num_hiddens, 1, bias=False)

    def forward(self, query, keys, values):
        query = query.unsqueeze(1)
        score = self.v(torch.tanh(self.W_a(keys) + self.W_b(query)))
        attention_weights = torch.softmax(score, dim=1)
        return torch.bmm(attention_weights.permute(0, 2, 1), values)

In [5]:
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 drop_prob=0.0):
        super(Seq2SeqEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers,
                           dropout=drop_prob)

    def forward(self, X, *args):
        X = self.embedding(X)
        X = X.permute(1, 0, 2)
        output, state = self.rnn(X)
        return output, state

class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 attention):
        super(Seq2SeqDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = attention
        self.rnn = nn.LSTM(num_hiddens + embed_size, num_hiddens, num_layers)
        self.out = nn.Linear(num_hiddens, vocab_size)

    def forward(self, X, state, enc_outputs):
        enc_outputs, hidden_state = enc_outputs
        X = self.embedding(X)
        X = X.permute(1, 0, 2)
        outputs = []
        for x in X:
            context = self.attention(hidden_state[0][-1], enc_outputs, enc_outputs)
            x = torch.cat((context, x.unsqueeze(0)), dim=2)
            out, state = self.rnn(x, state)
            outputs.append(out)
        outputs = self.out(torch.cat(outputs, dim=0))
        return outputs.permute(1, 0, 2), state        

In [6]:
class Seq2Seq(d2l.EncoderDecoder):
    def __init__(self, encoder, decoder, **kwargs):
        super(Seq2Seq, self).__init__(encoder, decoder, **kwargs)

In [25]:
def train(num_epochs, model, loss, optimizer, train_iter, tgt_vocab):
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch in train_iter:
            X, X_valid_len, Y, Y_valid_len = d2l.to_input_output(batch)
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0])
            bos = bos.reshape(-1, 1)
            bos_len = torch.tensor([1] * Y.shape[0])
            bos_e = model.decode(bos.to(d2l.try_gpu()), X, X_valid_len)
            Y_in = d2l.concat([bos, Y[:, :-1]], 1)
            Y_in_valid_len = Y_valid_len.copy()
            for i in range(len(Y_in_valid_len)):
                Y_in_valid_len[i] += 1
            Y_hat, attention_weight = model(X, Y_in.to(d2l.try_gpu()),
                                             X_valid_len, Y_in_valid_len)
            l = loss(Y_hat, Y.to(d2l.try_gpu()), Y_in_valid_len)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            epoch_loss += l.item()
        print("Epoch %d, loss %.4f" % (epoch + 1, epoch_loss / len(train_iter)))

In [30]:
data = d2l.MTFraEng(batch_size=128)
embed_size = 256
num_hiddens = 256
num_layers = 2
dropout = 0.2
lr = 0.005
encoder = Seq2SeqEncoder(len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'], lr=lr)

trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=1)
trainer.fit(model, data)

NotImplementedError: ignored