In [1]:
!pip install spacy
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, tok, sentence, german, english, device, max_length=50):
    """Greedily translate one source sentence with a seq2seq model.

    Args:
        model: Object exposing ``encoder(src)`` returning ``(hidden, cell)`` and
            ``decoder(prev_token, hidden, cell)`` returning
            ``(scores, hidden, cell)``.
        tok: Callable used to tokenize ``sentence`` when it is a string; every
            item it yields must have a ``.text`` attribute.
        sentence: Either a raw string or an iterable of string tokens.
        german: Source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: Target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: Device on which the input tensors are placed.
        max_length: Maximum number of decoding steps.

    Returns:
        list[str]: The predicted target tokens without the leading ``<sos>``.
        The list ends with ``<eos>`` only when the model emitted it within
        ``max_length`` steps.
    """
    # Everything is lower-cased because that is how the vocabulary is stored.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in tok(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the source field's start / end markers.
    tokens = [german.init_token, *tokens, german.eos_token]
    source_indices = [german.vocab.stoi[token] for token in tokens]

    # unsqueeze(1) turns the index list into a (seq_length, 1) column, i.e. a
    # batch holding this single sentence.
    sentence_tensor = torch.LongTensor(source_indices).unsqueeze(1).to(device)

    eos_index = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            # Feed the most recent prediction back in as the next input.
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)

            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_index:
                break

    # Skip index 0, which is the <sos> seed rather than a prediction.
    return [english.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, tok, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: Iterable of examples whose ``src`` and ``trg`` attributes hold
            the source and reference token lists.
        tok: Tokenizer forwarded to ``translate_sentence``.
        model: Seq2seq model forwarded to ``translate_sentence``.
        german: Source field forwarded to ``translate_sentence``.
        english: Target field forwarded to ``translate_sentence``.
        device: Device forwarded to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for the collected predictions,
        each paired with its single reference translation.
    """
    targets = []
    outputs = []

    # Decode in evaluation mode so that dropout cannot perturb the predictions,
    # then put the model back into the mode the caller left it in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        for example in data:
            src = vars(example)["src"]
            trg = vars(example)["trg"]

            prediction = translate_sentence(model, tok, src, german, english, device)

            # Only strip the last token when it really is <eos>. When decoding
            # stops at the length limit the last token is a real word and must
            # stay in the hypothesis.
            if prediction and prediction[-1] == "<eos>":
                prediction = prediction[:-1]

            targets.append([trg])
            outputs.append(prediction)
    finally:
        if was_training:
            model.train()

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialize ``state`` to ``filename``.

    Args:
        state: Object to persist (the notebook passes a dict of state dicts).
        filename: Destination path handed to ``torch.save``.
    """
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model weights, and optimizer state when available, from a checkpoint.

    Args:
        checkpoint: Mapping with a ``"state_dict"`` entry for the model and,
            optionally, an ``"optimizer"`` entry for the optimizer.
        model: Module whose parameters are overwritten in place.
        optimizer: Optimizer whose state is overwritten in place. May be
            ``None``, in which case only the model is restored.
    """
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])

    # Without the optimizer state a resumed run would restart with fresh
    # optimizer statistics. The guard keeps weights-only checkpoints loadable.
    if optimizer is not None and "optimizer" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer"])



# Sequence to Sequence Model using PyTorch

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

In [3]:
!python -m spacy download de_core_news_sm

Collecting de_core_news_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 10.7 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l- \ | / - \ | done
[?25h  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.3.0-py3-none-any.whl size=14907581 sha256=579eb667623d33d79cadc10aecf625d6219eef11cc7b23c3439e33fc882d8a0d
  Stored in directory: /tmp/pip-ephem-wheel-cache-5ts1d0v4/wheels/75/30/c3/ea1c6002eede7f49c8ab017ce62a2981a87b1cd39fab6e6a65
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [4]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
import de_core_news_sm
import en_core_web_sm

In [6]:
spacy_ger = de_core_news_sm.load()
spacy_eng = en_core_web_sm.load()

In [7]:
def tokenizer_ger(text):
    """Split ``text`` into a list of token strings with the ``spacy_ger`` tokenizer."""
    pieces = []
    for piece in spacy_ger.tokenizer(text):
        pieces.append(piece.text)
    return pieces

def tokenizer_eng(text):
    """Split ``text`` into a list of token strings with the ``spacy_eng`` tokenizer."""
    pieces = []
    for piece in spacy_eng.tokenizer(text):
        pieces.append(piece.text)
    return pieces

In [8]:
# One Field per language. Both lower-case their text and use the same
# '<sos>' / '<eos>' marker strings; they differ only in the tokenizer function.
german = Field(tokenize=tokenizer_ger, lower=True,
              init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True,
               init_token='<sos>', eos_token='<eos>')



In [9]:
# Load the three Multi30k splits. `exts` and `fields` are given in the same
# order, so '.de' is paired with `german` and '.en' with `english`
# (presumably source and target respectively - confirm against torchtext docs).
train_data, validation_data, test_data = Multi30k.splits(exts=('.de','.en'),
                                                        fields=(german, english))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 10.7MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.73MB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.73MB/s]


In [10]:
# Both vocabularies are built from the training split only, with the same
# limits: at most 10000 entries (max_size) and a minimum count of 2 (min_freq).
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

# Model

## Encoder

In [11]:
class Encoder(nn.Module):
    """LSTM encoder that turns source token indices into a final LSTM state."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, d_ratio):
        """Build the dropout, embedding and LSTM layers.

        Args:
            input_size: Number of rows in the embedding table.
            embedding_size: Width of each embedding vector.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            d_ratio: Dropout probability, used both on the embeddings and
                inside the LSTM.
        """
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=d_ratio)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=d_ratio,
        )

    def forward(self, x):
        """Encode ``x`` of shape (seq_length, batch_size).

        Returns:
            The ``(hidden, cell)`` pair produced by the LSTM; the per-step
            outputs are discarded.
        """
        embedded = self.embedding(x)        # (seq_length, batch_size, embedding_size)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

## Decoder

In [12]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token."""

    def __init__(self, input_size, embedding_size, hidden_size,
                output_size, num_layers, d_ratio):
        """Build the dropout, embedding, LSTM and output projection layers.

        Args:
            input_size: Number of rows in the embedding table.
            embedding_size: Width of each embedding vector.
            hidden_size: Width of the LSTM hidden state.
            output_size: Number of scores produced per step.
            num_layers: Number of stacked LSTM layers.
            d_ratio: Dropout probability, used both on the embeddings and
                inside the LSTM.
        """
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        # `num_layers` matches the attribute name used by Encoder; the
        # singular `num_layer` is kept so existing readers keep working.
        self.num_layers = num_layers
        self.num_layer = num_layers

        self.dropout = nn.Dropout(d_ratio)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = d_ratio)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: Previous token indices, shape (batch_size,).
            hidden: LSTM hidden state to continue from.
            cell: LSTM cell state to continue from.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (batch_size, output_size) and the states are the updated ones.
        """
        x = x.unsqueeze(0) # x shape: (N) => (1,N), a sequence of length one

        embedding = self.dropout(self.embedding(x)) # shape: (1, batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden,cell)) # outputs shape: (1, batch_size, hidden_size)
        predictions = self.fc(outputs) # predictions shape: (1, batch_size, output_size)

        # Drop the length-one sequence axis again.
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [13]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder):
        """Store the two sub-modules.

        Args:
            encoder: Module mapping a source batch to ``(hidden, cell)``.
            decoder: Module mapping ``(tokens, hidden, cell)`` to
                ``(scores, hidden, cell)``. It must expose its output
                projection as ``fc`` (an ``nn.Linear``).
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio = 0.5):
        """Score every target position after the first.

        Args:
            source: Source indices, shape (source_len, batch_size).
            target: Target indices, shape (target_len, batch_size); row 0 is
                used as the first decoder input.
            teacher_force_ratio: Probability of feeding the reference token
                rather than the model's own prediction at each step.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size). Row 0 is
            never written and stays zero.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the sizes from the module and its input instead of from
        # notebook-level globals, so the class does not depend on cell state.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                              device=source.device)

        hidden, cell = self.encoder(source)

        # start token
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)

            # Teacher forcing: sometimes feed the reference token, otherwise
            # feed back the model's own best guess.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

# Training

In [14]:
# Training Hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64

# Model Hyperparameters
load_model = False  # set to True to restore "my_checkpoint.pth.tar" before training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU when available
# The encoder embeds German tokens; the decoder both embeds and predicts
# English tokens, hence the same vocabulary length for its input and output.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # passed to both Encoder and Decoder
num_layers = 2  # passed to both Encoder and Decoder
enc_dropout = 0.5
dec_dropout = 0.5

In [15]:
# TensorBoard: training-loss scalars are written under runs/loss_plot.
writer = SummaryWriter('runs/loss_plot')
# Global step counter used as the x-axis value for every logged scalar.
step = 0

In [16]:
# One iterator per split, each yielding batches of `batch_size` examples placed
# on `device`. Examples are keyed by source length (`sort_key`) and sorted by
# that key inside each batch (`sort_within_batch`).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device)



In [17]:
# Build the encoder and decoder from the hyperparameters above, wrap them in
# Seq2Seq, and move everything to `device`.
encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                      hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, 
                      hidden_size, output_size, num_layers, dec_dropout).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
# Adam optimizes the parameters of both sub-networks through the wrapper.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
# Positions whose target is the English '<pad>' index are excluded from the
# loss via `ignore_index`.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [19]:
# Optionally resume from the checkpoint file written by save_checkpoint.
if load_model:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU can still be restored on a CPU-only machine.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )

In [20]:
# Fixed German example that the training loop translates at the start of every
# epoch, as a quick qualitative check of progress.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [21]:
# Main training loop. Each epoch: checkpoint the current weights, translate the
# fixed example `sentence` in evaluation mode, then train on every batch of
# `train_iterator`.
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # NOTE(review): the checkpoint is written before this epoch's training, so
    # the weights produced by the final epoch are never saved - confirm intended.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Evaluation mode for the sample translation.
    model.eval()

    translated_sentence = translate_sentence(
        model, spacy_ger, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    # Back to training mode for the optimisation steps below.
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and move them to `device`
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop; output is (target_len, batch_size, vocab_size)
        output = model(inp_data, target)

        # Drop time step 0 (Seq2Seq.forward never fills it, and the matching
        # target row is the start token), then flatten time and batch into one
        # axis so the loss sees (N, vocab_size) scores against (N,) targets.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard; `step` counts batches across all epochs
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1


# The training loop leaves the model in train mode, where dropout is still
# active; switch to evaluation mode before scoring.
model.eval()
# NOTE(review): the slice starts at index 1, so the first test example is
# skipped - confirm this is intended.
score = bleu(test_data[1:100], spacy_ger, model, german, english, device)
print(f"Bleu score {score*100:.2f}")

[Epoch 0 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['b', 'age', 'members', 'members', 'walked', 'walked', 'measuring', 'medieval', 'medieval', 'measuring', 'conductor', 'son', 'mouse', 'mouse', 'clothed', 'clothed', 'clothed', 'clothed', 'clothed', 'clothed', 'clothed', 'clothed', 'atlanta', 'atlanta', 'atlanta', 'purse', 'formation', 'paddling', 'bemused', 'cable', 'cable', 'trucks', 'trucks', 'pulls', 'pulls', 'pulls', 'dutch', 'dutch', 'heads', 'drawing', 'hokey', 'hokey', 'numerous', 'bib', 'restaurant', 'spike', 'laboratory', 'rainy', 'joy', 'joy']




[Epoch 1 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'black', 'and', 'a', 'black', 'and', 'a', 'black', 'and', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 2 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'baseball', 'player', 'with', 'a', '<unk>', 'is', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 3 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', '<unk>', 'with', 'with', 'a', '<unk>', '<unk>', '<unk>', 'to', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 4 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'street', 'with', 'with', 'a', '<unk>', 'to', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
[Epoch 5 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'street', 'with', 'with', 'a', '<unk>', 'with', 'a', '<unk>', '<unk>', 'a', 'a', 'a', '.', '<eos>']
[Epoch 6 / 20]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'street', 'with', 'with', 'a', 'large', 'of', 'a', 'is', 'a', 'a', 'a', 'a', '.', '<eos>'