In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator # preprocessing
import numpy as np 
import spacy 
import random
from torch.utils.tensorboard import SummaryWriter # to print to tensorboard
from torchtext.data.metrics import bleu_score

In [2]:
def _load_german_tokenizer():
    """Return the spaCy German pipeline, loading it only on the first call."""
    cached = getattr(_load_german_tokenizer, "_nlp", None)
    if cached is None:
        cached = spacy.load("de")
        _load_german_tokenizer._nlp = cached
    return cached


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: object exposing ``encoder(src)`` -> ``(hidden, cell)`` and
            ``decoder(prev_token, hidden, cell)`` -> ``(logits, hidden, cell)``.
            This function does not change the model's train/eval mode; the
            caller decides that.
        sentence: either a raw string (tokenized with spaCy) or an iterable of
            already-split token strings.
        german: source field; its ``init_token``, ``eos_token`` and
            ``vocab.stoi`` are used to index the input.
        english: target field; its ``vocab.stoi`` / ``vocab.itos`` are used to
            seed and decode the output.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<sos>``. The final element
        is ``<eos>`` only when the model emitted it before ``max_length``
        steps were used up.
    """
    # Lower-case everything because the vocab was built from lower-cased text.
    # The spaCy pipeline is only needed (and only loaded) for raw strings.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _load_german_tokenizer()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens = [german.init_token, *tokens, german.eos_token]

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Column vector: (seq_length, 1), i.e. a batch containing one sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        # Build encoder hidden, cell state
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_index:
                break

    # Map indices back to tokens and drop the start token
    return [english.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Each example's ``src`` tokens are translated with ``translate_sentence``
    and compared against its ``trg`` tokens using ``bleu_score``.

    The model is switched to eval mode for the duration of the call, so that
    dropout does not perturb the translations, and its previous train/eval
    mode is restored afterwards.

    Args:
        data: iterable of examples carrying ``src`` and ``trg`` attributes.
        model: the seq2seq model handed to ``translate_sentence``.
        german: source field passed through to ``translate_sentence``.
        english: target field passed through to ``translate_sentence``.
        device: device passed through to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score(outputs, targets)``.
    """
    targets = []
    outputs = []

    was_training = model.training
    model.eval()
    try:
        for example in data:
            src = vars(example)["src"]
            trg = vars(example)["trg"]

            prediction = translate_sentence(model, src, german, english, device)
            # Strip the end marker only when it is really there: a translation
            # cut off at max_length has no <eos>, and slicing blindly would
            # throw away a genuine token.
            if prediction and prediction[-1] == "<eos>":
                prediction = prediction[:-1]

            targets.append([trg])
            outputs.append(prediction)
    finally:
        model.train(was_training)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then write ``state`` to ``filename`` via ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` and ``optimizer`` from a checkpoint mapping.

    ``checkpoint["state_dict"]`` is loaded into the model first, then
    ``checkpoint["optimizer"]`` into the optimizer.
    """
    print("=> Loading checkpoint")
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])


In [3]:
# spaCy pipelines used by the tokenizer functions defined in this cell.
spacy_ger = spacy.load(name="de")
spacy_eng = spacy.load(name="en")

def tokenizer_ger(text):
    """Split ``text`` with the ``spacy_ger`` tokenizer and return the token strings."""
    pieces = []
    for piece in spacy_ger.tokenizer(text):
        pieces.append(piece.text)
    return pieces

def tokenizer_eng(text):
    """Split ``text`` with the ``spacy_eng`` tokenizer and return the token strings."""
    pieces = []
    for piece in spacy_eng.tokenizer(text):
        pieces.append(piece.text)
    return pieces

In [4]:
# One Field per language. Both lower-case their tokens and wrap every
# sequence in <sos> / <eos> markers.
german = Field(
    tokenize=tokenizer_ger,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)
english = Field(
    tokenize=tokenizer_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)



In [5]:
# Multi30k splits: ".de" files are processed by the `german` field and
# ".en" files by the `english` field.
train_data, validation_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
)



In [6]:
# Build both vocabularies from the training split only: at most 10000 entries
# each, keeping tokens that occur at least twice.
german.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)
english.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)

In [7]:
class Encoder(nn.Module):
    """Embeds a batch of token indices and runs it through a multi-layer LSTM.

    Args:
        input_size: number of entries in the embedding table (the vocab size).
        embedding_size: dimensionality each token index is mapped to.
        hidden_size: LSTM hidden-state size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        # Sub-modules
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for ``x``.

        ``x`` has shape (seq_length, N), where N is the batch size.
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.dropout(self.embedding(x))

        # The per-step outputs are not needed, only the final state.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state

        return hidden, cell
        
    
class Decoder(nn.Module):
    """Single-step LSTM decoder: one previous token in, one set of vocab scores out.

    Args:
        input_size: number of entries in the embedding table.
        embedding_size: dimensionality each token index is mapped to.
        hidden_size: LSTM hidden-state size.
        output_size: number of scores produced per step by the final linear layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to
            the LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        # Sub-modules
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one step.

        ``x`` has shape (N). Returns ``(predictions, hidden, cell)`` where
        ``predictions`` has shape (N, output_size).
        """
        # Add a length-1 sequence axis: (N) -> (1, N), one word at a time
        step_input = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        embedded = self.dropout(self.embedding(step_input))

        # rnn_out: (1, N, hidden_size)
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, output_size) -> (N, output_size): drop the sequence axis again
        scores = self.fc(rnn_out).squeeze(0)

        return scores, hidden, cell
        
        
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module mapping a source batch to ``(hidden, cell)``.
        decoder: module mapping ``(prev_token, hidden, cell)`` to
            ``(scores, hidden, cell)``. Its output layer must be reachable as
            ``decoder.fc`` (an ``nn.Linear``), which is how the size of the
            score vector is determined.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step scores for every target position.

        Args:
            source: source batch of shape (src_len, N).
            target: target batch of shape (trg_len, N); row 0 is fed to the
                decoder as the first input.
            teacher_force_ratio: probability, drawn independently at every
                step, of feeding the ground-truth token instead of the model's
                own best guess as the next decoder input.

        Returns:
            Tensor of shape (trg_len, N, vocab_size). Row 0 is never written
            and stays all zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size and the device from the model and its input
        # rather than from notebook-level globals, so the module works
        # regardless of what else is defined in the kernel.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab start token
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            # output: (N, vocab_size)
            outputs[t] = output

            best_guess = output.argmax(1)

            # Teacher forcing: sometimes feed the true token, sometimes the prediction
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [11]:
### Training setup: hyperparameters, logging and batch iterators

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 64

# Set to True to resume from the saved checkpoint
load_model = False

# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard: one scalar is logged per training step
writer = SummaryWriter('runs/loss_plot')
step = 0

# Batches are sorted internally by source length
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

In [12]:
# Assemble the encoder, the decoder and the wrapping seq2seq model on `device`.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=enc_dropout,
).to(device)
decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dec_dropout,
).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)

In [13]:
# Padding positions in the target are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

if load_model:
    # map_location lets a checkpoint written on a GPU be restored on whatever
    # device this session is using (including CPU-only machines).
    load_checkpoint(
        torch.load('my_checkpoint.pth.tar', map_location=device), model, optimizer
    )

# Fixed example translated at the start of every epoch to eyeball progress.
# Adjacent string literals are used instead of a backslash continuation,
# which kept the next line's indentation inside the sentence.
sentence = (
    "Ein Boot mit anderen Männern wird von einem großen "
    "Team von Pferden ans Ufer gezogen."
)

for epoch in range(num_epochs):
    print(f"Epoch [{epoch} / {num_epochs}]")

    # Translate the example in eval mode (dropout off), then switch back.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, german, english, device, max_length=50)
    print(f"Translated examples sentence \n {translated_sentence}")
    model.train()

    for batch in train_iterator:
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(inp_data, target)
        # output shape: (trg_len, batch_size, output_dim)

        # Drop position 0 (the model never writes a prediction there) and
        # flatten so the loss sees (tokens, vocab) against (tokens,).
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Clip the gradient norm to 1 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # Log a plain float rather than the graph-attached tensor
        writer.add_scalar('Training Loss', loss.item(), global_step=step)
        step += 1

    # Save once the epoch's updates are done, so the file always holds the
    # most recently trained weights (including those of the final epoch) and
    # a fresh run never overwrites an existing checkpoint with untrained ones.
    checkpoint = {"state_dict": model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

Epoch [0 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['sprinkling', 'frames', 'frames', 'terminals', 'ymca', 'smile', 'smile', 'smile', 'boutique', 'aviator', 'issue', 'issue', 'issue', 'citizen', 'issue', 'chase', 'chase', 'wounds', 'booth', 'airborne', 'airborne', 'cutting', 'cutting', 'bullet', 'cutting', 'grimacing', 'boston', 'boston', 'structures', 'structures', 'structures', 'ymca', 'watched', 'smile', 'smile', 'smile', 'smile', 'smile', 'smile', 'monks', 'monks', 'son', 'son', 'needed', 'murals', 'chase', 'chase', 'airborne', 'subway', 'subway']




Epoch [1 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', 'young', 'boy', 'in', 'a', 'blue', 'shirt', 'and', 'a', 'shirt', 'is', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch [2 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', '<unk>', 'player', 'is', 'in', 'a', '<unk>', 'with', 'a', '<unk>', '<unk>', 'to', 'a', 'a', '.', '<eos>']
Epoch [3 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', 'construction', 'player', 'with', 'a', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'a', 'a', '.', '<eos>']
Epoch [4 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', '<unk>', 'with', 'with', 'a', '<unk>', '<unk>', 'a', '<unk>', 'to', 'a', 'a', 'a', '.', '<eos>']
Epoch [5 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', '<unk>', 'with', 'with', 'a', 'a', 'a', 'a', 'to', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch [6 / 50]
=> Saving checkpoint
Translated examples sentence 
 ['a', 'rugby', 'player', 'with', 'w

In [None]:
# Corpus BLEU on the held-out test split, reported as a percentage.
score = bleu(
    data=test_data,
    model=model,
    german=german,
    english=english,
    device=device,
)
print(f"Bleu score {score*100:.2f}")

In [14]:
# Spot-check the trained model on a sentence written by hand; the returned
# token list is the cell's displayed output.
translate_sentence(
    model,
    "Zwei Leute gehen zum Laden, um sehr kalte Sahne zu kaufen",
    german,
    english,
    device,
    max_length=50,
)

['two',
 'people',
 'are',
 'waiting',
 'to',
 'be',
 'their',
 '<unk>',
 'to',
 'be',
 'end',
 '.',
 '<eos>']