In [None]:
# NOTE: If you are running this notebook on Google Colab,
#       then uncomment the two lines below and then run this cell!

!pip install datasets evaluate --upgrade
!python -m spacy download de_core_news_sm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

We'll set all possible random seeds for deterministic results.


In [5]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [6]:
dataset = datasets.load_dataset("bentrevett/multi30k")

Downloading readme: 100%|██████████| 1.15k/1.15k [00:00<00:00, 9.76MB/s]
Downloading data: 100%|██████████| 4.60M/4.60M [00:00<00:00, 23.7MB/s]
Downloading data: 100%|██████████| 164k/164k [00:00<00:00, 2.90MB/s]
Downloading data: 100%|██████████| 156k/156k [00:00<00:00, 2.02MB/s]
Generating train split: 29000 examples [00:00, 522944.57 examples/s]
Generating validation split: 1014 examples [00:00, 159858.08 examples/s]
Generating test split: 1000 examples [00:00, 168601.68 examples/s]


In [7]:
print(dataset)
# Print example from the training set
print("Example from the training set:")
print(dataset["train"][0])

# Print example from the validation set
print("\nExample from the validation set:")
print(dataset["validation"][0])

# Print example from the test set
print("\nExample from the test set:")
print(dataset["test"][0])


DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})
Example from the training set:
{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

Example from the validation set:
{'en': 'A group of men are loading cotton onto a truck', 'de': 'Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen'}

Example from the test set:
{'en': 'A man in an orange hat starring at something.', 'de': 'Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.'}


In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

For convenience, we create a variable for each split. Each being a `Dataset` object.


In [9]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

We can index into each `Dataset` to view an individual example. Each example has two features: "en" and "de", which are the parallel English and German sentences.


In [10]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [12]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [13]:
string = "What a lovely day it is today!"

[token.text for token in en_nlp.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

In [15]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_length]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [16]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 29000/29000 [00:04<00:00, 6566.68 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 6965.74 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 7126.77 examples/s]


In [18]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [19]:
en_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man']

In [20]:
en_vocab.get_itos()[9]

'man'

In [21]:
de_vocab.get_itos()[:10]

['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in', 'eine', ',']

In [22]:
en_vocab.get_stoi()["the"]

7

In [23]:
en_vocab["the"]

7

In [24]:
len(en_vocab), len(de_vocab)

(5893, 7853)

In [25]:
"the" in en_vocab

True

In [26]:
"The" in en_vocab

False

In [27]:
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [28]:
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

In [29]:
en_vocab["The"]

0

In [30]:
en_vocab.get_itos()[0]

'<unk>'

In [31]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [32]:
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 0, 821]

In [33]:
en_vocab.lookup_tokens(en_vocab.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

In [34]:
def numericalize_example(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [35]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 29000/29000 [00:02<00:00, 10372.97 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 9628.20 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 9748.30 examples/s]


In [36]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]}

In [37]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

In [38]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [39]:
train_data[0]

{'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            3]),
 'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [40]:
type(train_data[0]["en_ids"])

torch.Tensor

In [41]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

In [42]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [43]:
batch_size = 128

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [44]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell

In [45]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # context = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

In [46]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [47]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.2
decoder_dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [48]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.2)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.2)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

We can also count the number of parameters in our model.


In [49]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


### Optimizer

We define our optimizer, which we use to update our parameters in the training loop. Check out [this](http://ruder.io/optimizing-gradient-descent/) post for information about different optimizers. Here, we'll use Adam.


In [50]:
optimizer = optim.Adam(model.parameters())

In [51]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [52]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [53]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [54]:
n_epochs = 50
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  2%|▏         | 1/50 [00:20<16:57, 20.76s/it]

	Train Loss:   5.014 | Train PPL: 150.499
	Valid Loss:   4.993 | Valid PPL: 147.447


  4%|▍         | 2/50 [00:37<14:46, 18.46s/it]

	Train Loss:   4.403 | Train PPL:  81.681
	Valid Loss:   4.761 | Valid PPL: 116.812


  6%|▌         | 3/50 [00:54<13:54, 17.76s/it]

	Train Loss:   4.100 | Train PPL:  60.364
	Valid Loss:   4.573 | Valid PPL:  96.873


  8%|▊         | 4/50 [01:11<13:23, 17.46s/it]

	Train Loss:   3.893 | Train PPL:  49.048
	Valid Loss:   4.352 | Valid PPL:  77.605


 10%|█         | 5/50 [01:28<12:55, 17.23s/it]

	Train Loss:   3.677 | Train PPL:  39.521
	Valid Loss:   4.256 | Valid PPL:  70.522


 12%|█▏        | 6/50 [01:45<12:33, 17.12s/it]

	Train Loss:   3.486 | Train PPL:  32.649
	Valid Loss:   4.074 | Valid PPL:  58.797


 14%|█▍        | 7/50 [02:02<12:12, 17.04s/it]

	Train Loss:   3.307 | Train PPL:  27.304
	Valid Loss:   4.007 | Valid PPL:  54.975


 16%|█▌        | 8/50 [02:19<11:53, 16.99s/it]

	Train Loss:   3.134 | Train PPL:  22.972
	Valid Loss:   3.942 | Valid PPL:  51.530


 18%|█▊        | 9/50 [02:41<12:45, 18.66s/it]

	Train Loss:   2.975 | Train PPL:  19.589
	Valid Loss:   3.880 | Valid PPL:  48.414


 20%|██        | 10/50 [02:58<12:04, 18.11s/it]

	Train Loss:   2.836 | Train PPL:  17.043
	Valid Loss:   3.766 | Valid PPL:  43.200


 22%|██▏       | 11/50 [03:14<11:28, 17.66s/it]

	Train Loss:   2.701 | Train PPL:  14.892
	Valid Loss:   3.688 | Valid PPL:  39.978


 24%|██▍       | 12/50 [03:31<11:01, 17.40s/it]

	Train Loss:   2.572 | Train PPL:  13.094
	Valid Loss:   3.707 | Valid PPL:  40.726


 26%|██▌       | 13/50 [03:48<10:37, 17.24s/it]

	Train Loss:   2.462 | Train PPL:  11.729
	Valid Loss:   3.694 | Valid PPL:  40.220


 28%|██▊       | 14/50 [04:05<10:17, 17.15s/it]

	Train Loss:   2.366 | Train PPL:  10.651
	Valid Loss:   3.639 | Valid PPL:  38.041


 30%|███       | 15/50 [04:22<09:55, 17.03s/it]

	Train Loss:   2.252 | Train PPL:   9.506
	Valid Loss:   3.714 | Valid PPL:  41.012


 32%|███▏      | 16/50 [04:39<09:38, 17.00s/it]

	Train Loss:   2.143 | Train PPL:   8.523
	Valid Loss:   3.700 | Valid PPL:  40.458


 34%|███▍      | 17/50 [04:55<09:19, 16.94s/it]

	Train Loss:   2.034 | Train PPL:   7.648
	Valid Loss:   3.664 | Valid PPL:  39.001


 36%|███▌      | 18/50 [05:12<09:01, 16.93s/it]

	Train Loss:   1.963 | Train PPL:   7.121
	Valid Loss:   3.729 | Valid PPL:  41.635


 38%|███▊      | 19/50 [05:29<08:44, 16.92s/it]

	Train Loss:   1.874 | Train PPL:   6.513
	Valid Loss:   3.792 | Valid PPL:  44.363


 40%|████      | 20/50 [05:46<08:26, 16.90s/it]

	Train Loss:   1.774 | Train PPL:   5.895
	Valid Loss:   3.796 | Valid PPL:  44.513


 42%|████▏     | 21/50 [06:03<08:10, 16.92s/it]

	Train Loss:   1.703 | Train PPL:   5.492
	Valid Loss:   3.861 | Valid PPL:  47.490


 44%|████▍     | 22/50 [06:20<07:54, 16.94s/it]

	Train Loss:   1.617 | Train PPL:   5.037
	Valid Loss:   3.890 | Valid PPL:  48.913


 46%|████▌     | 23/50 [06:37<07:36, 16.92s/it]

	Train Loss:   1.523 | Train PPL:   4.586
	Valid Loss:   3.877 | Valid PPL:  48.273


 48%|████▊     | 24/50 [06:54<07:19, 16.91s/it]

	Train Loss:   1.451 | Train PPL:   4.266
	Valid Loss:   3.972 | Valid PPL:  53.111


 50%|█████     | 25/50 [07:11<07:03, 16.94s/it]

	Train Loss:   1.370 | Train PPL:   3.935
	Valid Loss:   4.007 | Valid PPL:  54.990


 52%|█████▏    | 26/50 [07:28<06:46, 16.95s/it]

	Train Loss:   1.313 | Train PPL:   3.719
	Valid Loss:   4.123 | Valid PPL:  61.773


 54%|█████▍    | 27/50 [07:45<06:29, 16.93s/it]

	Train Loss:   1.251 | Train PPL:   3.495
	Valid Loss:   4.145 | Valid PPL:  63.114


 56%|█████▌    | 28/50 [08:02<06:12, 16.91s/it]

	Train Loss:   1.176 | Train PPL:   3.242
	Valid Loss:   4.144 | Valid PPL:  63.078


 58%|█████▊    | 29/50 [08:18<05:54, 16.89s/it]

	Train Loss:   1.120 | Train PPL:   3.065
	Valid Loss:   4.246 | Valid PPL:  69.831


 58%|█████▊    | 29/50 [08:24<06:05, 17.38s/it]


KeyboardInterrupt: 

We've now successfully trained a model that translates German into English! But how well does it perform?


In [55]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.606 | Test PPL:  36.829 |


In [56]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token] + tokens + [eos_token]
        ids = de_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens

In [57]:
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [59]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [60]:
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'orange',
 'shirt',
 'is',
 'a',
 'a',
 '.',
 '.',
 '<eos>']

In [61]:
sentence = "Ein Mann sitzt auf einer Bank."

In [62]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

And we receive our translation, which is reasonably close.


In [63]:
translation

['<sos>', 'a', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

In [64]:
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]

100%|██████████| 1000/1000 [00:05<00:00, 193.62it/s]


In [65]:
bleu = evaluate.load("bleu")

Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 29.3MB/s]
Downloading extra modules: 4.07kB [00:00, 14.4MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 18.4MB/s]


In [66]:
predictions = [" ".join(translation[1:-1]) for translation in translations]

references = [[example["en"]] for example in test_data]

In [67]:
predictions[0], references[0]

('a man in a orange shirt is a a . .',
 ['A man in an orange hat starring at something.'])

In [68]:
def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn

In [69]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [70]:
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

(['a', 'man', 'in', 'a', 'orange', 'shirt', 'is', 'a', 'a', '.', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [71]:
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [72]:
results

{'bleu': 0.18715119453990228,
 'precisions': [0.5284559166537086,
  0.250042151407857,
  0.133492911066102,
  0.07392009734333807],
 'brevity_penalty': 0.9848768336512208,
 'length_ratio': 0.9849900444172155,
 'translation_length': 12862,
 'reference_length': 13058}