In [1]:
!pip install datasets evaluate spacy
!python -m spacy download de_core_news_sm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import tqdm
import evaluate

In [2]:
seed = 1234

# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch CPU and PyTorch CUDA) with the same value, in that order.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
  seed_fn(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Loading the dataset

In [3]:
dataset = datasets.load_dataset("bentrevett/multi30k")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

val.jsonl:   0%|          | 0.00/164k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

# Creating a variable for each split

In [5]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"]
)

In [6]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

# Tokenizing the data

In [7]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [8]:
def tokenize_example(example, de_nlp, en_nlp, max_length, lower, sos_token, eos_token):
  """Tokenize the German and English sentences of one dataset row.

  Args:
    example: mapping with the raw sentences under the keys "de" and "en".
    de_nlp: object whose ``tokenizer(text)`` yields tokens with a ``.text`` attribute,
      used for the German sentence.
    en_nlp: same, used for the English sentence.
    max_length: maximum number of tokens kept per sentence; the cut is applied
      before the start/end markers are added.
    lower: when truthy, every token is lower-cased.
    sos_token: marker placed in front of each token list.
    eos_token: marker appended to each token list.

  Returns:
    dict with the keys "de_tokens" and "en_tokens", each a list of strings.
  """
  def _prepare(nlp, text):
    # Tokenize, truncate, optionally lower-case, then wrap with the markers.
    words = [tok.text for tok in nlp.tokenizer(text)][:max_length]
    if lower:
      words = [word.lower() for word in words]
    return [sos_token, *words, eos_token]

  return {
      "de_tokens": _prepare(de_nlp, example["de"]),
      "en_tokens": _prepare(en_nlp, example["en"]),
  }

In [9]:
max_length = 1000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword arguments forwarded to tokenize_example for every row of every split.
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Add the "de_tokens" / "en_tokens" columns to each split
# (processed in the order train, test, validation).
train_data, test_data, valid_data = (
    split.map(tokenize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, test_data, valid_data)
)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

In [10]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>']}

In [11]:
from collections import Counter

def build_vocab(token_lists, min_freq, specials):
  """Build a token -> index dictionary from tokenized sentences.

  Args:
    token_lists: iterable of token lists (one list per sentence).
    min_freq: a token is kept only if it occurs at least this many times
      across all lists.
    specials: special tokens (e.g. <unk>, <pad>, <sos>, <eos>); they receive
      the first indices, in the order given, regardless of their frequency.

  Returns:
    dict mapping each kept token to an integer index. Non-special tokens are
    numbered in order of first appearance.
  """
  frequencies = Counter()
  for sentence_tokens in token_lists:
    frequencies.update(sentence_tokens)

  # Special tokens occupy the lowest indices.
  vocab = dict(zip(specials, range(len(specials))))

  for word, count in frequencies.items():
    if count < min_freq or word in vocab:
      continue
    # The next free index always equals the current vocabulary size.
    vocab[word] = len(vocab)

  return vocab

In [19]:
class Vocab:
  """Two-way mapping between tokens and integer indices with an unknown-token fallback.

  Args:
    token_to_idx: dict mapping token strings to integer indices.
    unk_token: token whose index is returned for tokens missing from the mapping,
      and which is returned for indices missing from the reverse mapping.
  """

  def __init__(self, token_to_idx, unk_token="<unk>"):
    self.token_to_idx = token_to_idx
    # Reverse mapping, built once, for index -> token lookups.
    self.idx_to_token = {idx: token for token, idx in token_to_idx.items()}
    self.unk_token = unk_token

  def __getitem__(self, token):
    """Return the index of ``token``, or the index of ``unk_token`` if it is unknown.

    Raises:
      KeyError: if ``token`` is unknown and ``unk_token`` itself is not in the mapping.
    """
    # Resolve the fallback lazily: a known token must not fail just because
    # unk_token is absent from the mapping.
    if token in self.token_to_idx:
      return self.token_to_idx[token]
    return self.token_to_idx[self.unk_token]

  def lookup_token(self, indices):
    """Return the token stored at a single index, or ``unk_token`` if there is none.

    Despite the parameter name, ``indices`` is ONE index. It may be a plain int
    or a scalar wrapper exposing ``.item()`` (e.g. a 0-d torch tensor).
    """
    # 0-d torch tensors hash by identity, so they would never match an int key;
    # unwrap them to a Python number before the dictionary lookup.
    if hasattr(indices, "item"):
      indices = indices.item()
    return self.idx_to_token.get(indices, self.unk_token)

  def lookup_tokens(self, indices):
    """Return the tokens for an iterable of indices (see ``lookup_token``)."""
    return [self.lookup_token(index) for index in indices]

  def __len__(self):
    """Return the number of tokens in the vocabulary."""
    return len(self.token_to_idx)

  def get_itos(self):
    """Return the tokens as a list ordered by index.

    Requires the indices to be exactly 0 .. len-1; a gap raises KeyError.
    """
    return [self.idx_to_token[i] for i in range(len(self.idx_to_token))]

  def get_stoi(self):
    """Return the underlying token -> index dictionary (not a copy)."""
    return self.token_to_idx

  def lookup_indices(self, tokens):
    """Return the indices for an iterable of tokens, using the unknown-token fallback."""
    return [self[token] for token in tokens]

In [20]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# The position in this list is the index each special token receives.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# Vocabularies are built from the training split only (English first, then German).
en_vocab, de_vocab = (
    build_vocab(train_data[column], min_freq, special_tokens)
    for column in ("en_tokens", "de_tokens")
)

In [21]:
en_vocab_obj = Vocab(en_vocab, unk_token)
en_vocab_obj.get_itos()[:10]

['<unk>',
 '<pad>',
 '<sos>',
 '<eos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are']

In [22]:
de_vocab_obj = Vocab(de_vocab, unk_token)
de_vocab_obj.get_itos()[:10]

['<unk>',
 '<pad>',
 '<sos>',
 '<eos>',
 'zwei',
 'junge',
 'weiße',
 'männer',
 'sind',
 'im']

# Getting the index from the text

In [23]:
en_vocab_obj.get_stoi()["two"]

4

In [24]:
len(en_vocab), len(de_vocab)

(5893, 7853)

In [25]:
"The" in en_vocab

False

In [26]:
assert en_vocab_obj[unk_token] == de_vocab_obj[unk_token]
assert en_vocab_obj[pad_token] == de_vocab_obj[pad_token]

unk_index = en_vocab_obj[unk_token]
pad_index =  en_vocab_obj[pad_token]

In [31]:
en_vocab_obj["The"]

0

In [28]:
en_vocab_obj.get_itos()[0]

'<unk>'

In [29]:
tokens = ["i", "love", "watching", "crime", "shows"]

In [32]:
en_vocab_obj.lookup_indices(tokens)

[171, 4010, 225, 0, 1130]

In [33]:
en_vocab_obj.lookup_tokens(en_vocab_obj.lookup_indices(tokens))

['i', 'love', 'watching', '<unk>', 'shows']

# Converting tokens to their vocabulary indices

In [34]:
def numericalize_example(example, en_vocab_obj, de_vocab_obj):
  """Convert the token lists of one dataset row into vocabulary indices.

  Args:
    example: mapping holding the token lists under "en_tokens" and "de_tokens".
    en_vocab_obj: object with ``lookup_indices(tokens)`` for the English tokens.
    de_vocab_obj: object with ``lookup_indices(tokens)`` for the German tokens.

  Returns:
    dict with the keys "en_ids" and "de_ids" holding the looked-up indices.
  """
  return {
      "en_ids": en_vocab_obj.lookup_indices(example["en_tokens"]),
      "de_ids": de_vocab_obj.lookup_indices(example["de_tokens"]),
  }

In [35]:
# Keyword arguments forwarded to numericalize_example for every row.
fn_kwargs = dict(en_vocab_obj=en_vocab_obj, de_vocab_obj=de_vocab_obj)

# Add the "en_ids" / "de_ids" columns to each split
# (processed in the order train, test, validation).
train_data, test_data, valid_data = (
    split.map(numericalize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, test_data, valid_data)
)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

In [36]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'en_ids': [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3],
 'de_ids': [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3]}

In [37]:
en_vocab_obj.lookup_tokens(train_data[0]["en_ids"])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

# Converting the indices to PyTorch tensors to use in the model

In [38]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

# Apply the same format to all three splits: the id columns are returned in the
# "torch" format while the remaining columns are still included in each row.
train_data, valid_data, test_data = (
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, valid_data, test_data)
)

In [39]:
train_data[0]

{'en_ids': tensor([ 2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  3]),
 'de_ids': tensor([ 2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>']}

In [40]:
print("en_ids_type : " , type(train_data[0]["en_ids"]),"\n"
      "de_ids_type : ", type(train_data[0]["de_ids"]))

en_ids_type :  <class 'torch.Tensor'> 
de_ids_type :  <class 'torch.Tensor'>


# Data Loaders

In [41]:
def get_collate_fn(pad_index):
  """Create a collate function that pads the id sequences of a batch.

  Args:
    pad_index: value used to fill the shorter sequences.

  Returns:
    A function taking a list of examples (each with 1-D tensors under "en_ids"
    and "de_ids") and returning a dict with the same two keys. Each value is the
    result of ``pad_sequence`` with its default layout, i.e. sequence dimension
    first and batch dimension second.
  """
  def collate_fn(batch):
    padded = {}
    for key in ("en_ids", "de_ids"):
      sequences = [example[key] for example in batch]
      padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
    return padded

  return collate_fn

In [42]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
  """Wrap a dataset in a DataLoader that pads each batch with ``pad_index``.

  Args:
    dataset: dataset handed to ``torch.utils.data.DataLoader``.
    batch_size: number of examples per batch.
    pad_index: padding value passed on to ``get_collate_fn``.
    shuffle: whether the loader shuffles the examples (default: False).

  Returns:
    The configured ``torch.utils.data.DataLoader``.
  """
  return torch.utils.data.DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      collate_fn=get_collate_fn(pad_index),
  )

In [43]:
batch_size = 128

# Only the training loader shuffles; test and validation keep the dataset order.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
test_data_loader, valid_data_loader = (
    get_data_loader(split, batch_size, pad_index)
    for split in (test_data, valid_data)
)

# Building the Model

In [44]:
class Encoder(nn.Module):
  """LSTM encoder: embeds the source token ids and returns the final LSTM state.

  Args:
    input_dim: size of the source vocabulary (number of embedding rows).
    embedding_dim: size of each embedding vector.
    hidden_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, used both on the embeddings and inside the LSTM.
  """

  def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
    super().__init__()
    # Kept as attributes so Seq2Seq can check them against the decoder.
    self.hidden_dim, self.n_layers = hidden_dim, n_layers
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
    self.rnn = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        dropout=dropout,
    )
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, src):
    """Encode ``src`` and return the ``(hidden, cell)`` state of the LSTM.

    src = [src length, batch size]
    """
    token_vectors = self.embedding(src)
    # The per-step outputs of the LSTM are discarded; only the final state is used.
    _, final_state = self.rnn(self.dropout(token_vectors))
    hidden, cell = final_state
    return hidden, cell

In [45]:
class Decoder(nn.Module):
  """LSTM decoder that predicts the next target token one step at a time.

  Args:
    output_dim: size of the target vocabulary (embedding rows and output scores).
    embedding_dim: size of each embedding vector.
    hidden_dim: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, used both on the embeddings and inside the LSTM.
  """

  def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
    super().__init__()
    # Kept as attributes so Seq2Seq can size its output buffer and validate shapes.
    self.output_dim, self.hidden_dim, self.n_layers = output_dim, hidden_dim, n_layers
    self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
    self.rnn = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=n_layers,
        dropout=dropout,
    )
    self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input, hidden, cell):
    """Run a single decoding step.

    Args:
      input: token ids for the current step, one per batch element.
      hidden, cell: LSTM state from the previous step (or from the encoder).

    Returns:
      ``(prediction, hidden, cell)`` where ``prediction`` holds one score per
      target-vocabulary entry for each batch element.
    """
    # Add a leading sequence dimension of length 1 for the LSTM.
    step_input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_input))
    output, state = self.rnn(embedded, (hidden, cell))
    hidden, cell = state
    # Remove the length-1 sequence dimension again before the output projection.
    prediction = self.fc_out(output.squeeze(0))
    return prediction, hidden, cell

In [46]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence step by step.

  Args:
    encoder: module returning ``(hidden, cell)`` for a source batch; must expose
      ``hidden_dim`` and ``n_layers``.
    decoder: module mapping ``(input, hidden, cell)`` to
      ``(prediction, hidden, cell)``; must expose ``output_dim``, ``hidden_dim``
      and ``n_layers``.
    device: device on which the output buffer is placed.

  Raises:
    AssertionError: if encoder and decoder disagree on hidden size or layer count.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
    # The encoder state is fed straight into the decoder, so both must match.
    assert encoder.hidden_dim == decoder.hidden_dim, \
        "Hidden dimensions of encoder and decoder must be equal."
    assert encoder.n_layers == decoder.n_layers, \
        "Encoder and decoder must have equal number of layers."

  def forward(self, src, trg, teacher_forcing_ratio):
    """Return decoder scores for every target position.

    Args:
      src: source batch passed to the encoder.
      trg: target batch indexed as [trg length, batch size]; ``trg[0]`` is the
        first decoder input.
      teacher_forcing_ratio: probability, drawn independently at each step, of
        feeding the ground-truth token instead of the model's own prediction.

    Returns:
      Tensor of shape [trg length, batch size, decoder.output_dim]. Position 0
      is never written and stays zero.
    """
    trg_length, batch_size = trg.shape[0], trg.shape[1]
    outputs = torch.zeros(trg_length, batch_size, self.decoder.output_dim).to(self.device)

    # The encoder's final state initialises the decoder.
    hidden, cell = self.encoder(src)
    decoder_input = trg[0, :]

    for step in range(1, trg_length):
      step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
      outputs[step] = step_scores
      # One random draw per step decides between ground truth and prediction.
      if random.random() < teacher_forcing_ratio:
        decoder_input = trg[step]
      else:
        decoder_input = step_scores.argmax(1)

    return outputs

# Training the Model

In [47]:
input_dim = len(de_vocab_obj)
output_dim = len(en_vocab_obj)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [48]:
def init_weights(m):
  """Re-initialise every parameter of ``m`` uniformly in [-0.08, 0.08].

  All parameters reachable from ``m`` (including those of its sub-modules) are
  overwritten in place; nothing is returned.

  NOTE(review): when this is passed to ``Module.apply``, it is presumably invoked
  once per sub-module, so nested parameters are drawn more than once — confirm
  whether that is intended.
  """
  for weight in m.parameters():
    nn.init.uniform_(weight.data, a=-0.08, b=0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [49]:
def count_parameters(model):
  """Return the total number of elements in the trainable parameters of ``model``.

  Parameters with ``requires_grad == False`` are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


# Optimizer

In [50]:
optimizer = optim.Adam(model.parameters())

# Loss Function

In [51]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

# Training Loop

In [52]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
    model: callable as ``model(src, trg, teacher_forcing_ratio)`` returning scores
      whose last dimension is the vocabulary size.
    data_loader: sized iterable of batches holding "de_ids" (source) and
      "en_ids" (target) tensors.
    optimizer: optimizer updating the model parameters.
    criterion: loss taking flattened scores and flattened targets.
    clip: maximum gradient norm passed to ``clip_grad_norm_``.
    teacher_forcing_ratio: forwarded unchanged to the model.
    device: device the batches are moved to.

  Returns:
    The mean of the per-batch losses.
  """
  model.train()
  running_loss = 0
  for batch in data_loader:
    source = batch["de_ids"].to(device)
    target = batch["en_ids"].to(device)

    optimizer.zero_grad()
    scores = model(source, target, teacher_forcing_ratio)

    # Position 0 is skipped on both sides; the rest is flattened so that the
    # criterion sees one row of scores per target token.
    flat_scores = scores[1:].view(-1, scores.shape[-1])
    flat_target = target[1:].view(-1)

    batch_loss = criterion(flat_scores, flat_target)
    batch_loss.backward()
    # Clip the gradient norm before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    running_loss += batch_loss.item()

  return running_loss / len(data_loader)

# Evaluation Loop

In [53]:
def evaluate_fn(model, data_loader, criterion, device):
  """Compute the mean per-batch loss of ``model`` over ``data_loader`` without training.

  The model is switched to eval mode, gradients are disabled, and the model is
  called with a teacher-forcing ratio of 0.

  Args:
    model: callable as ``model(src, trg, teacher_forcing_ratio)``.
    data_loader: sized iterable of batches holding "de_ids" and "en_ids" tensors.
    criterion: loss taking flattened scores and flattened targets.
    device: device the batches are moved to.

  Returns:
    The mean of the per-batch losses.
  """
  model.eval()
  running_loss = 0
  with torch.no_grad():
    for batch in data_loader:
      source = batch["de_ids"].to(device)
      target = batch["en_ids"].to(device)

      scores = model(source, target, 0)

      # Position 0 is skipped on both sides before flattening for the criterion.
      flat_scores = scores[1:].view(-1, scores.shape[-1])
      flat_target = target[1:].view(-1)

      running_loss += criterion(flat_scores, flat_target).item()

  return running_loss / len(data_loader)

# Model Training

In [54]:
# Training hyper-parameters.
n_epochs = 10
clip = 1.0  # passed to train_fn as the maximum gradient norm
teacher_forcing_ratio = 0.5  # used for training only; evaluate_fn always passes 0
# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
  # One full pass over the training data, updating the model parameters.
  train_loss = train_fn(
      model,
      train_data_loader,
      optimizer,
      criterion,
      clip,
      teacher_forcing_ratio,
      device
  )

  # Loss on the validation split with the weights as they are after this epoch.
  valid_loss = evaluate_fn(
      model,
      valid_data_loader,
      criterion,
      device
  )

  # Keep only the weights of the best epoch (by validation loss) on disk;
  # the file is overwritten each time the validation loss improves.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), "tut3-model.pt")
  # PPL is reported as exp(loss).
  print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
  print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [00:46<07:00, 46.68s/it]

	Train Loss:   5.028 | Train PPL: 152.603
	Valid Loss:   4.873 | Valid PPL: 130.727


 20%|██        | 2/10 [01:32<06:09, 46.22s/it]

	Train Loss:   4.390 | Train PPL:  80.607
	Valid Loss:   4.656 | Valid PPL: 105.258


 30%|███       | 3/10 [02:18<05:22, 46.05s/it]

	Train Loss:   4.103 | Train PPL:  60.538
	Valid Loss:   4.510 | Valid PPL:  90.906


 40%|████      | 4/10 [03:04<04:36, 46.11s/it]

	Train Loss:   3.901 | Train PPL:  49.442
	Valid Loss:   4.328 | Valid PPL:  75.768


 50%|█████     | 5/10 [03:50<03:49, 45.95s/it]

	Train Loss:   3.714 | Train PPL:  41.005
	Valid Loss:   4.244 | Valid PPL:  69.676


 60%|██████    | 6/10 [04:36<03:03, 45.93s/it]

	Train Loss:   3.557 | Train PPL:  35.047
	Valid Loss:   4.124 | Valid PPL:  61.800


 70%|███████   | 7/10 [05:21<02:17, 45.85s/it]

	Train Loss:   3.431 | Train PPL:  30.920
	Valid Loss:   4.084 | Valid PPL:  59.372


 80%|████████  | 8/10 [06:07<01:31, 45.84s/it]

	Train Loss:   3.297 | Train PPL:  27.040
	Valid Loss:   4.050 | Valid PPL:  57.412


 90%|█████████ | 9/10 [06:53<00:45, 45.92s/it]

	Train Loss:   3.166 | Train PPL:  23.702
	Valid Loss:   3.950 | Valid PPL:  51.953


100%|██████████| 10/10 [07:39<00:00, 45.95s/it]

	Train Loss:   3.048 | Train PPL:  21.064
	Valid Loss:   3.867 | Valid PPL:  47.808





In [55]:
# Restore the weights of the epoch with the lowest validation loss.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load("tut3-model.pt", map_location=device))

# Loss of the restored model on the held-out test split.
test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f}")

| Test Loss: 3.846 | Test PPL:  46.804


In [76]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab_obj,
    de_vocab_obj,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length = 25
):
  """Greedily translate one German sentence into English tokens.

  Args:
    sentence: either a string (tokenized with ``de_nlp.tokenizer``) or an
      iterable of already-split tokens.
    model: object exposing ``eval()``, ``encoder(tensor)`` and
      ``decoder(input, hidden, cell)``.
    en_nlp: accepted for interface symmetry; not used by this function.
    de_nlp: tokenizer provider for string input.
    en_vocab_obj: target vocabulary (``lookup_indices``, ``lookup_tokens``, ``[]``).
    de_vocab_obj: source vocabulary (``lookup_indices``).
    lower: when truthy, the source tokens are lower-cased.
    sos_token: start marker added to the source and used as first decoder input.
    eos_token: end marker added to the source; decoding stops once it is predicted.
    device: device the tensors are moved to.
    max_output_length: maximum number of decoding steps.

  Returns:
    List of target tokens starting with ``sos_token``. It ends with ``eos_token``
    only if the model predicted it within ``max_output_length`` steps.
  """
  model.eval()
  with torch.no_grad():
    if isinstance(sentence, str):
      words = [tok.text for tok in de_nlp.tokenizer(sentence)]
    else:
      words = list(sentence)
    if lower:
      words = [word.lower() for word in words]

    # Source ids as a column: one sentence, i.e. a batch dimension of size 1 at the end.
    source_ids = de_vocab_obj.lookup_indices([sos_token, *words, eos_token])
    source = torch.LongTensor(source_ids).unsqueeze(-1).to(device)
    hidden, cell = model.encoder(source)

    # Greedy decoding: always feed back the highest-scoring token.
    decoded_ids = en_vocab_obj.lookup_indices([sos_token])
    for _ in range(max_output_length):
      previous = torch.LongTensor([decoded_ids[-1]]).to(device)
      scores, hidden, cell = model.decoder(previous, hidden, cell)
      best_id = scores.argmax(-1).item()
      decoded_ids.append(best_id)
      if best_id == en_vocab_obj[eos_token]:
        break

    return en_vocab_obj.lookup_tokens(decoded_ids)

In [77]:
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [78]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab_obj,
    de_vocab_obj,
    lower,
    sos_token,
    eos_token,
    device,
)

In [79]:
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'black',
 'hat',
 'is',
 'cutting',
 'food',
 '.',
 '<eos>']

In [80]:
sentence = "Ein Mann sitzt auf einer Bank."

In [81]:
# NOTE(review): ",ann" looks like a typo for "mann"; as written it is not in the
# vocabulary, which is why the second index in the printed output is 0 (<unk>).
print(de_vocab_obj.lookup_indices(["ein", ",ann", "sitzt", "auf", "einer", "bank", "."]))

[21, 0, 110, 33, 34, 115, 16]


In [82]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab_obj,
    de_vocab_obj,
    lower,
    sos_token,
    eos_token,
    device
)

In [83]:
translation

['<sos>', 'a', 'man', 'sitting', 'on', 'a', 'bench', '.', '<eos>']

In [87]:
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab_obj,
        de_vocab_obj,
        lower,
        sos_token,
        eos_token,
        device
    )
    for example in tqdm.tqdm(test_data)
]

100%|██████████| 1000/1000 [00:08<00:00, 113.34it/s]


In [88]:
bleu = evaluate.load("bleu")

In [89]:
# Build the hypothesis strings for BLEU. The leading <sos> is always dropped.
# The trailing token is dropped only when it really is <eos>: translate_sentence
# stops after max_output_length steps without appending <eos>, and a blind [1:-1]
# slice would then throw away the last real word of the translation.
predictions = [
    " ".join(tokens[1:-1] if tokens[-1] == eos_token else tokens[1:])
    for tokens in translations
]
# One reference translation per example, wrapped in a list as the metric expects
# a list of references for each prediction.
references = [[example["en"]] for example in test_data]

In [91]:
predictions[0], references[0]

('a man in a black hat is cutting food .',
 ['A man in an orange hat starring at something.'])

In [98]:
def get_tokenizer_fn(nlp, lower):
  """Create a string -> token-list function based on ``nlp.tokenizer``.

  Args:
    nlp: object whose ``tokenizer(text)`` yields tokens with a ``.text`` attribute.
    lower: when truthy, the returned function lower-cases every token.

  Returns:
    A function mapping a string to a list of token strings.
  """
  def tokenizer_fn(s):
    words = (token.text for token in nlp.tokenizer(s))
    return [word.lower() if lower else word for word in words]

  return tokenizer_fn

In [99]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [100]:
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

(['a', 'man', 'in', 'a', 'black', 'hat', 'is', 'cutting', 'food', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [101]:
results = bleu.compute(
    predictions = predictions,
    references = references,
    tokenizer = tokenizer_fn
)

In [102]:
results

{'bleu': 0.1323227862965373,
 'precisions': [0.4748099976494555,
  0.18660205729830825,
  0.08928737340890086,
  0.04250742599610775],
 'brevity_penalty': 0.9771513870670351,
 'length_ratio': 0.9774084852197886,
 'translation_length': 12763,
 'reference_length': 13058}