<a href="https://colab.research.google.com/github/SooinJung/NLP-/blob/main/GRU_2_0721.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- 논문 리뷰 - Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation
- 데이터셋 - https://huggingface.co/datasets/bentrevett/multi30k
  - 독일어를 영어로 번역하는 task

# 0. 라이브러리 설치

In [None]:
!pip install datasets



In [None]:
!pip install evaluate



In [None]:
#!pip install torchtext==0.6.0  # install the desired older version

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate

In [None]:
# Seed every random number generator used by the notebook so runs are repeatable.
seed = 1234

random.seed(seed)  # Python RNG (Seq2Seq.forward draws its teacher-forcing decisions from it)
np.random.seed(seed)  # NumPy global RNG
torch.manual_seed(seed)  # PyTorch RNG
torch.cuda.manual_seed(seed)  # PyTorch CUDA RNG
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN behaviour

# 1. 데이터 불러오기

In [None]:
# Load the Multi30k German-English dataset by its Hugging Face identifier.
dataset = datasets.load_dataset("bentrevett/multi30k")

In [None]:
# Unpack the train / validation / test splits into separate names.
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

# Show one sample
dataset["train"][0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succ

# 2. 데이터 전처리

In [None]:
# Load the spaCy pipelines whose tokenizers are used for English and German
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")



In [None]:
# Tokenization function applied to every dataset example
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentence of one example.

    Args:
        example: Mapping with the raw sentences under "en" and "de".
        en_nlp: Object whose ``tokenizer(text)`` yields tokens with a ``.text`` attribute (English).
        de_nlp: Same as ``en_nlp`` but for German.
        max_length: Maximum number of tokens kept per sentence (special tokens not counted).
        lower: When truthy, every token is lowercased.
        sos_token: Token prepended to each token list.
        eos_token: Token appended to each token list.

    Returns:
        Dict with the token lists under "en_tokens" and "de_tokens".
    """

    def prepare(text, nlp):
        # Tokenize first, then truncate, so max_length counts real tokens only.
        words = []
        for token in nlp.tokenizer(text):
            words.append(token.text)
        words = words[:max_length]
        if lower:
            words = [word.lower() for word in words]
        # Wrap the sentence in the start / end markers.
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": prepare(example["en"], en_nlp),
        "de_tokens": prepare(example["de"], de_nlp),
    }

In [None]:
max_length = 1_000  # per-sentence token limit handed to tokenize_example
lower = True  # lowercase every token
sos_token = "<sos>" # start of sentence
eos_token = "<eos>" # end of sentence

# Keyword arguments forwarded to the tokenization function
fn_kwargs = {
    "en_nlp": en_nlp, # spaCy language model used to tokenize English
    "de_nlp": de_nlp, # spaCy language model used to tokenize German
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
# Apply tokenize_example to the training, validation and test data:
# every sentence is tokenized and processed according to the settings above
train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

# Inspect the resulting dataset and its first example
print(train_data)
train_data[0]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['en', 'de', 'en_tokens', 'de_tokens'],
    num_rows: 29000
})


{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

In [None]:
from torchtext.vocab import build_vocab_from_iterator

# Minimum frequency (a word is included only if it occurs at least this often)
min_freq = 2
# Special token definitions
unk_token = "<unk>" # unknown token
pad_token = "<pad>" # padding

# Special tokens handed to the vocabulary builder, in this order
special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# Build a vocabulary from an iterator of token lists
def build_vocab(iterator, min_freq=2, specials=special_tokens):
    """Build a vocabulary with ``build_vocab_from_iterator``.

    Args:
        iterator: Iterable of token lists to build the vocabulary from.
        min_freq: Minimum frequency forwarded to ``build_vocab_from_iterator``.
        specials: Special tokens forwarded to ``build_vocab_from_iterator``.
            Defaults to the module-level ``special_tokens`` list.

    Returns:
        The object returned by ``build_vocab_from_iterator``.
    """
    # Forward the caller's ``specials``. Previously the global ``special_tokens``
    # was always passed, so a different ``specials`` argument was silently ignored.
    return build_vocab_from_iterator(iterator, min_freq=min_freq, specials=specials)

# Build the English vocabulary
en_vocab = build_vocab(
    train_data["en_tokens"],  # token lists of the English sentences
    min_freq=min_freq,        # minimum frequency
    specials=special_tokens,  # special tokens
)

# Build the German vocabulary
de_vocab = build_vocab(
    train_data["de_tokens"],  # token lists of the German sentences
    min_freq=min_freq,        # minimum frequency
    specials=special_tokens,  # special tokens
)

In [None]:
# Check that <unk> and <pad> have the same index in the English and German vocabularies
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

# Fetch the indices of the <unk> and <pad> tokens
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [None]:
# Use the <unk> index as the default for words that are not in the English / German vocabulary
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

# Convert tokens to integer indices
def numericalize_example(example, en_vocab, de_vocab):
    """Map the token lists of one example to vocabulary indices.

    Args:
        example: Mapping with token lists under "en_tokens" and "de_tokens".
        en_vocab: Vocabulary exposing ``lookup_indices`` for the English tokens.
        de_vocab: Vocabulary exposing ``lookup_indices`` for the German tokens.

    Returns:
        Dict with the index lists under "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

# Keyword arguments forwarded to numericalize_example
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}

# Apply numericalize_example to the training, validation and test data,
# converting the tokens of every split to indices
train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
data_type = "torch" # use the 'torch' data format
format_columns = ["en_ids", "de_ids"]

# Switch the train/validation/test data to the 'torch' format for the 'en_ids' and 'de_ids' columns
train_data = train_data.with_format(
    type=data_type, # format applied to the selected columns
    columns=format_columns, # the 'en_ids' and 'de_ids' columns
    output_all_columns=True # keep the remaining columns in the output as well
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [None]:
# Factory for the function that assembles a batch
def get_collate_fn(pad_index):
    """Create a collate function that pads the examples of a batch to equal length.

    Args:
        pad_index: Value written into the positions past the end of shorter sequences.

    Returns:
        A ``collate_fn(batch)`` callable, suitable for ``DataLoader(collate_fn=...)``.
    """

    def collate_fn(batch):
        """Pad and stack the "en_ids" / "de_ids" tensors of ``batch``.

        Args:
            batch: Sequence of examples, each holding 1-D tensors under "en_ids" and "de_ids".

        Returns:
            Dict with the padded "en_ids" and "de_ids" tensors, each shaped
            [longest sequence length, batch size].
        """
        # Pull the English and German index tensors out of every example
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]

        # Pad each list of tensors so all sequences share the same length
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)

        # Return the padded tensors keyed like the input columns
        return {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }

    # Hand the closure back to the caller. Without this return the factory
    # yielded None and the padding logic above was never used.
    return collate_fn

# Build the DataLoader that serves a dataset in batches
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``torch.utils.data.DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to ``get_collate_fn``.
        shuffle: Whether the loader shuffles the dataset (default ``False``).

    Returns:
        The configured ``DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),  # collate function built for this pad_index
        shuffle=shuffle,
    )

batch_size = 128

# Only the training loader shuffles; validation and test use the default shuffle=False.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

# 3. 모델 구현

In [None]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that returns the final hidden state of the source."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout):
        """Create the embedding, GRU and dropout layers.

        Args:
            input_dim: Number of rows of the embedding table (source vocabulary size).
            embedding_dim: Size of each embedding vector.
            hidden_dim: Size of the GRU hidden state.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` ([src length, batch size]) into the GRU's final hidden state.

        Returns:
            Tensor of shape [1, batch size, hidden dim].
        """
        # [src length, batch size] -> [src length, batch size, embedding dim]
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, final_hidden = self.rnn(token_vectors)
        return final_hidden

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that is fed the encoder context at every step."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, dropout):
        """Create the embedding, GRU, output projection and dropout layers.

        Args:
            output_dim: Target vocabulary size (embedding rows and number of logits).
            embedding_dim: Size of each embedding vector.
            hidden_dim: Size of the GRU hidden state (and of the context vector).
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # The GRU consumes the token embedding concatenated with the context.
        rnn_input_dim = embedding_dim + hidden_dim
        self.rnn = nn.GRU(rnn_input_dim, hidden_dim)
        # The projection sees embedding + new hidden state + context.
        projection_input_dim = embedding_dim + hidden_dim * 2
        self.fc_out = nn.Linear(projection_input_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        """Run one decoding step.

        Args:
            input: Current token indices, shape [batch size].
            hidden: Previous hidden state, shape [1, batch size, hidden dim].
            context: Encoder context, shape [1, batch size, hidden dim].

        Returns:
            Tuple ``(prediction, hidden)`` with prediction of shape
            [batch size, output dim] and the new hidden state of shape
            [1, batch size, hidden dim].
        """
        # [batch size] -> [1, batch size] -> [1, batch size, embedding dim]
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))
        # [1, batch size, embedding dim + hidden dim]
        rnn_input = torch.cat((embedded, context), dim=2)
        # With a single step, layer and direction the new hidden state carries
        # everything needed, so the GRU's output tensor is not used.
        _, hidden = self.rnn(rnn_input, hidden)
        # [batch size, embedding dim + hidden dim * 2]
        features = torch.cat(
            [part.squeeze(0) for part in (embedded, hidden, context)], dim=1
        )
        return self.fc_out(features), hidden

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, encoder, decoder, device):
        """Store the sub-modules and the device used for the output buffer.

        Raises:
            AssertionError: If encoder and decoder hidden sizes differ.
        """
        super().__init__()
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce decoder predictions for every target position after the first.

        Args:
            src: Source indices, shape [src length, batch size].
            trg: Target indices, shape [trg length, batch size].
            teacher_forcing_ratio: Probability of feeding the ground-truth token
                (instead of the model's own prediction) as the next input.

        Returns:
            Tensor of shape [trg length, batch size, trg vocab size]; row 0 stays zero.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        # Buffer for the per-step predictions.
        outputs = torch.zeros(
            trg_length, batch_size, self.decoder.output_dim, device=self.device
        )
        # The encoder's last hidden state is the context and also the
        # decoder's initial hidden state.
        context = self.encoder(src)
        hidden = context
        # Decoding starts from the first target row (the <sos> tokens).
        decoder_input = trg[0, :]
        for step in range(1, trg_length):
            prediction, hidden = self.decoder(decoder_input, hidden, context)
            outputs[step] = prediction
            # One random draw per step decides between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[step]
            else:
                decoder_input = prediction.argmax(1)
        return outputs

In [None]:
input_dim = len(de_vocab)  # source (German) vocabulary size
output_dim = len(en_vocab)  # target (English) vocabulary size
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512  # shared by encoder and decoder (Seq2Seq asserts they match)
encoder_dropout = 0.5
decoder_dropout = 0.5
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    decoder_dropout,
)

# Assemble the full model and move its parameters to the chosen device
model = Seq2Seq(encoder, decoder, device).to(device)

In [None]:
def init_weights(m):
    """Re-initialize every parameter of ``m`` from a normal distribution.

    All parameters reachable from ``m`` are overwritten in place with samples
    drawn with mean 0 and standard deviation 0.01.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)


# Run init_weights over the model; the call's return value is what the cell displays.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable parameter count with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 14,219,781 trainable parameters


In [None]:
# Adam over all model parameters, with the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Cross-entropy loss configured to ignore targets equal to the <pad> index.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

# 4. 모델 학습

In [None]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Sized iterable of batches with "de_ids" (source) and "en_ids" (target).
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking flattened logits and flattened targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: Forwarded to the model.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        # Both tensors are [sequence length, batch size].
        source = batch["de_ids"].to(device)
        target = batch["en_ids"].to(device)

        optimizer.zero_grad()
        # logits: [trg length, batch size, trg vocab size]
        logits = model(source, target, teacher_forcing_ratio)
        vocab_size = logits.shape[-1]
        # Position 0 (the <sos> slot) is never predicted, so drop it before
        # flattening logits and targets into matching 2-D / 1-D tensors.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(data_loader)

In [None]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss of ``model`` over ``data_loader`` without training.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Sized iterable of batches with "de_ids" (source) and "en_ids" (target).
        criterion: Loss taking flattened logits and flattened targets.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            # Both tensors are [sequence length, batch size].
            source = batch["de_ids"].to(device)
            target = batch["en_ids"].to(device)
            # A ratio of 0 turns teacher forcing off: the model feeds on its own predictions.
            logits = model(source, target, 0)
            vocab_size = logits.shape[-1]
            # Skip position 0 (the <sos> slot) and flatten for the criterion.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)
            running_loss += criterion(flat_logits, flat_target).item()
    return running_loss / len(data_loader)

In [None]:
n_epochs = 10
clip = 1.0  # maximum gradient norm used by train_fn
teacher_forcing_ratio = 0.5  # probability of feeding the ground-truth token during training

# Lowest validation loss seen so far; start at infinity so the first epoch always saves
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    # Checkpoint only when the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut2-model.pt")
    # Perplexity is reported as exp(loss)
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [00:47<07:05, 47.24s/it]

	Train Loss:   5.043 | Train PPL: 154.927
	Valid Loss:   5.078 | Valid PPL: 160.477


 20%|██        | 2/10 [01:34<06:16, 47.03s/it]

	Train Loss:   4.354 | Train PPL:  77.828
	Valid Loss:   4.806 | Valid PPL: 122.252


 30%|███       | 3/10 [02:21<05:29, 47.03s/it]

	Train Loss:   4.047 | Train PPL:  57.245
	Valid Loss:   4.584 | Valid PPL:  97.917


 40%|████      | 4/10 [03:08<04:42, 47.01s/it]

	Train Loss:   3.734 | Train PPL:  41.862
	Valid Loss:   4.291 | Valid PPL:  73.007


 50%|█████     | 5/10 [03:54<03:54, 46.95s/it]

	Train Loss:   3.378 | Train PPL:  29.306
	Valid Loss:   4.040 | Valid PPL:  56.811


 60%|██████    | 6/10 [04:42<03:08, 47.06s/it]

	Train Loss:   3.094 | Train PPL:  22.063
	Valid Loss:   3.848 | Valid PPL:  46.905


 70%|███████   | 7/10 [05:30<02:22, 47.40s/it]

	Train Loss:   2.845 | Train PPL:  17.204
	Valid Loss:   3.783 | Valid PPL:  43.953


 80%|████████  | 8/10 [06:18<01:35, 47.79s/it]

	Train Loss:   2.592 | Train PPL:  13.361
	Valid Loss:   3.724 | Valid PPL:  41.418


 90%|█████████ | 9/10 [07:07<00:48, 48.06s/it]

	Train Loss:   2.394 | Train PPL:  10.959
	Valid Loss:   3.685 | Valid PPL:  39.864


100%|██████████| 10/10 [07:56<00:00, 47.69s/it]

	Train Loss:   2.186 | Train PPL:   8.903
	Valid Loss:   3.676 | Valid PPL:  39.507





In [None]:
# Restore the checkpoint with the best validation loss. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU
# runtime still loads when the notebook is re-run on a CPU-only runtime.
model.load_state_dict(torch.load("tut2-model.pt", map_location=device))

# Evaluate the restored model on the held-out test split
test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.664 | Test PPL:  39.030 |


# 5. 검증

In [None]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one German sentence into English tokens.

    Args:
        sentence: Raw string (tokenized with ``de_nlp.tokenizer``) or an iterable of tokens.
        model: Model exposing ``encoder`` and ``decoder`` sub-modules.
        en_nlp: Not used by this function; kept in the signature for callers.
        de_nlp: Object whose ``tokenizer(text)`` yields tokens with ``.text``.
        en_vocab: Target vocabulary (``lookup_indices``, ``lookup_tokens``, item access).
        de_vocab: Source vocabulary (``lookup_indices``).
        lower: When truthy, source tokens are lowercased.
        sos_token: Start-of-sentence token.
        eos_token: End-of-sentence token.
        device: Device the input tensors are moved to.
        max_output_length: Maximum number of decoding steps.

    Returns:
        List of target tokens starting with ``sos_token``; it ends with
        ``eos_token`` only if the model produced it within the step limit.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            source_tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            source_tokens = list(sentence)
        if lower:
            source_tokens = [token.lower() for token in source_tokens]
        source_tokens = [sos_token, *source_tokens, eos_token]
        source_ids = de_vocab.lookup_indices(source_tokens)
        # [src length] -> [src length, 1]: a batch holding a single sentence
        source = torch.LongTensor(source_ids).unsqueeze(-1).to(device)
        context = model.encoder(source)
        hidden = context
        eos_id = en_vocab[eos_token]
        decoded_ids = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # Feed the most recently produced token back into the decoder.
            step_input = torch.LongTensor([decoded_ids[-1]]).to(device)
            logits, hidden = model.decoder(step_input, hidden, context)
            next_id = logits.argmax(-1).item()
            decoded_ids.append(next_id)
            if next_id == eos_id:
                break
        return en_vocab.lookup_tokens(decoded_ids)

In [None]:
# Take the first test example: German source and its English reference
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [None]:
# Translate the selected sentence with the trained model
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [None]:
# Display the decoded tokens, including the <sos> / <eos> markers
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'white',
 'hat',
 'is',
 'something',
 '.',
 '<eos>']

In [None]:
# A hand-written German sentence to translate next
sentence = "Ein Mann sieht sich einen Film an."

In [None]:
# Translate the hand-written sentence with the same settings as before
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [None]:
# Display the decoded tokens for the hand-written sentence
translation

['<sos>', 'a', 'man', 'looks', 'at', 'a', 'microscope', '.', '<eos>']

In [None]:
# Translate the German sentence of every test example (tqdm shows progress)
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]

100%|██████████| 1000/1000 [00:12<00:00, 79.94it/s]


In [None]:
# Load the BLEU metric through the evaluate library
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
def _strip_special_tokens(tokens):
    """Drop the leading <sos> and, only if present, the trailing <eos>.

    translate_sentence stops either on <eos> or after max_output_length steps.
    In the second case the last element is a real word, so slicing with
    ``[1:-1]`` unconditionally would cut a genuine token from the prediction.
    """
    tokens = tokens[1:]
    if tokens and tokens[-1] == eos_token:
        tokens = tokens[:-1]
    return tokens


# One space-joined prediction string per test example
predictions = [" ".join(_strip_special_tokens(translation)) for translation in translations]

# BLEU expects a list of references per prediction; each example has exactly one
references = [[example["en"]] for example in test_data]

In [None]:
def get_tokenizer_fn(nlp, lower):
    """Create a ``str -> list[str]`` tokenizer backed by ``nlp.tokenizer``.

    Args:
        nlp: Object whose ``tokenizer(text)`` yields tokens with a ``.text`` attribute.
        lower: When truthy, the returned tokens are lowercased.

    Returns:
        A function mapping a string to its list of token strings.
    """

    def tokenizer_fn(s):
        """Tokenize ``s`` and optionally lowercase every token."""
        return [
            token.text.lower() if lower else token.text
            for token in nlp.tokenizer(s)
        ]

    return tokenizer_fn

In [None]:
# English tokenizer using the same lowercase setting as preprocessing
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [None]:
# Score the predictions against the references, tokenizing both with tokenizer_fn
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)

In [None]:
# Display the metric dictionary returned by bleu.compute
results

{'bleu': 0.1936751802753128,
 'precisions': [0.5339046199701938,
  0.2631239935587762,
  0.13677758318739056,
  0.07322456813819578],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0277224689845306,
 'translation_length': 13420,
 'reference_length': 13058}