In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import numpy as np

import random
import math
import time

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

In [2]:
# Load the spaCy pipelines by their full package names. The bare 'de' / 'en'
# shortcut links were deprecated in spaCy 2.x and removed in spaCy 3, where
# spacy.load('de') raises an error.
# Install once with:
#   python -m spacy download de_core_news_sm
#   python -m spacy download en_core_web_sm
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')

In [3]:
def tokenize_german(text):
    """Split a German string into a list of token strings using the spaCy tokenizer."""
    tokens = []
    for token in spacy_german.tokenizer(text):
        tokens.append(token.text)
    return tokens
def tokenize_english(text):
    """Split an English string into token strings and return them in reversed order."""
    tokens = [token.text for token in spacy_english.tokenizer(text)]
    # The token order is deliberately flipped (last token first).
    tokens.reverse()
    return tokens

In [4]:
# Both fields use the same start/end markers and lower-casing; they differ
# only in the tokenizer they are given.
shared_field_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
}

SOURCE = Field(tokenize = tokenize_english, **shared_field_options)

TARGET = Field(tokenize = tokenize_german, **shared_field_options)

In [5]:
# The file extensions are paired positionally with the fields:
# '.en' is read through SOURCE and '.de' through TARGET.
language_extensions = ('.en', '.de')
language_fields = (SOURCE, TARGET)

train_data, valid_data, test_data = Multi30k.splits(exts = language_extensions,
                                                    fields = language_fields)

In [334]:
# Show the first training pair: source tokens first, then target tokens.
first_example = train_data.examples[0]
print(first_example.src)
print(first_example.trg)

['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']
['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']


In [6]:
# Report how many examples each split contains.
for label, split in (("Training dataset size: ", train_data),
                     ("Validation dataset size: ", valid_data),
                     ("Test dataset size: ", test_data)):
    print(label + str(len(split.examples)))

Training dataset size: 29000
Validation dataset size: 1014
Test dataset size: 1000


In [7]:
# Build both vocabularies from the training split only, passing min_freq = 2
# as the minimum token frequency.
for field in (SOURCE, TARGET):
    field.build_vocab(train_data, min_freq = 2)

source_vocab_size = len(SOURCE.vocab)
target_vocab_size = len(TARGET.vocab)

print("English (Source) Vocabulary Size: " + str(source_vocab_size))
print("German (Target) Vocabulary Size: " + str(target_vocab_size))

English (Source) Vocabulary Size: 5893
German (Target) Vocabulary Size: 7855


In [9]:
# Seed every random number generator this notebook uses (Python's `random`
# drives the teacher-forcing decisions, torch drives weight initialisation and
# dropout) so that a fresh Restart & Run All is repeatable. Seeding happens
# here, before the iterators and the model are created.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 32

# One iterator per split, all with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = batch_size, 
    device = device)

In [20]:
class Encoder(nn.Module):
    """Embeds a sequence of token ids and runs it through a multi-layer LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_dims, emb_dims, hid_dims, n_layers, dropout):
        """
        Args:
            input_dims: number of entries in the embedding table.
            emb_dims: size of each embedding vector.
            hid_dims: hidden size of the LSTM.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, applied to the embeddings and also
                passed to the LSTM.
        """
        super().__init__()

        self.hid_dims, self.n_layers = hid_dims, n_layers

        self.embedding = nn.Embedding(num_embeddings = input_dims,
                                      embedding_dim = emb_dims)

        self.rnn = nn.LSTM(input_size = emb_dims,
                           hidden_size = hid_dims,
                           num_layers = n_layers,
                           dropout = dropout)

        self.dropout = nn.Dropout(p = dropout)

    def forward(self, src):
        """Return the LSTM's final (hidden, cell) states for the token ids in src."""
        token_vectors = self.embedding(src)
        lstm_input = self.dropout(token_vectors)

        # The per-step outputs are not needed, only the final state.
        _, final_state = self.rnn(lstm_input)
        hidden, cell = final_state

        return hidden, cell

In [21]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: embeds one token per sequence, advances the
    LSTM by one step and projects the result to vocabulary scores."""

    def __init__(self, output_dims, emb_dims, hid_dims, n_layers, dropout):
        """
        Args:
            output_dims: number of entries in the embedding table and size of
                the score vector produced by the output layer.
            emb_dims: size of each embedding vector.
            hid_dims: hidden size of the LSTM.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, applied to the embeddings and also
                passed to the LSTM.
        """
        super().__init__()

        self.output_dims = output_dims
        self.hid_dims, self.n_layers = hid_dims, n_layers

        self.embedding = nn.Embedding(num_embeddings = output_dims,
                                      embedding_dim = emb_dims)

        self.rnn = nn.LSTM(input_size = emb_dims,
                           hidden_size = hid_dims,
                           num_layers = n_layers,
                           dropout = dropout)

        self.fc_out = nn.Linear(in_features = hid_dims, out_features = output_dims)

        self.dropout = nn.Dropout(p = dropout)

    def forward(self, input, h, cell):
        """Advance the decoder by one step.

        Args:
            input: token ids for the current step; a leading axis of length 1
                is added before embedding.
            h: LSTM hidden state from the previous step.
            cell: LSTM cell state from the previous step.

        Returns:
            (pred, h, cell): the output-layer scores and the updated states.
        """
        # Add a length-1 leading axis so the LSTM sees a one-step sequence.
        step_tokens = input.unsqueeze(0)

        step_vectors = self.dropout(self.embedding(step_tokens))

        lstm_out, next_state = self.rnn(step_vectors, (h, cell))
        next_h, next_cell = next_state

        # Remove the length-1 leading axis again before the projection.
        scores = self.fc_out(lstm_out.squeeze(0))

        return scores, next_h, next_cell

In [30]:
class Seq2Seq(nn.Module):
    """Joins an encoder and a step-wise decoder into one sequence-to-sequence model."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: module called as encoder(src) that returns (h, cell).
            decoder: module called as decoder(input, h, cell) that returns
                (scores, h, cell) and exposes an output_dims attribute.
            device: device on which the output buffer is allocated.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_rate = 0.5):
        """Decode one step per target position and collect the decoder scores.

        Args:
            src: source batch handed to the encoder.
            trg: target batch indexed as [step, batch]; row 0 is the first
                decoder input.
            teacher_forcing_rate: probability, per step, of feeding the
                reference token trg[t] as the next input rather than the
                decoder's own highest-scoring token.

        Returns:
            Tensor of shape [target length, batch size, decoder.output_dims].
            Row 0 is never written and stays all zeros.
        """
        n_steps, n_sequences = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dims

        outputs = torch.zeros(n_steps, n_sequences, vocab_size, device = self.device)

        # The encoder's final states are the decoder's initial states.
        h, cell = self.encoder(src)

        decoder_input = trg[0]

        for t in range(1, n_steps):
            scores, h, cell = self.decoder(decoder_input, h, cell)
            outputs[t] = scores

            # One random draw per step decides what is fed in next.
            use_reference = random.random() < teacher_forcing_rate
            if use_reference:
                decoder_input = trg[t]
            else:
                decoder_input = scores.argmax(1)

        return outputs

In [95]:
# Vocabulary sizes determine the embedding tables and the output layer.
input_dimensions = len(SOURCE.vocab)
output_dimensions = len(TARGET.vocab)

# Settings shared by the encoder and the decoder.
hidden_layer_dimensions = 512
number_of_layers = 2

# Encoder-specific settings.
encoder_embedding_dimensions = 256
encoder_dropout = 0.5

# Decoder-specific settings.
decoder_embedding_dimensions = 256
decoder_dropout = 0.5

encod = Encoder(
    input_dimensions,
    encoder_embedding_dimensions,
    hidden_layer_dimensions,
    number_of_layers,
    encoder_dropout,
)
decod = Decoder(
    output_dimensions,
    decoder_embedding_dimensions,
    hidden_layer_dimensions,
    number_of_layers,
    decoder_dropout,
)

model = Seq2Seq(encod, decod, device).to(device)

In [96]:
def initialize_weights(m):
    """Overwrite every parameter reachable from m with uniform samples.

    Each parameter tensor is filled in place with values drawn between
    -0.1 and 0.1. Nothing is returned.
    """
    lower, upper = -0.1, 0.1
    for param in m.parameters():
        nn.init.uniform_(param.data, lower, upper)
        
# Run initialize_weights over the model via Module.apply. Being the last
# expression of the cell, the value returned by apply is what the cell displays.
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=7855, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [98]:
# Adam with its default settings over all model parameters.
optimizer = optim.Adam(params = model.parameters())

# Targets equal to the padding index are excluded from the loss.
pad_index = TARGET.vocab.stoi[TARGET.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = pad_index)

In [99]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over iterator and return the mean batch loss.

    Args:
        model: called as model(src, trg); its output is indexed as
            [step, batch, vocabulary].
        iterator: sized iterable of batches exposing .src and .trg.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(flat_scores, flat_targets).
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source_tokens, target_tokens = batch.src, batch.trg

        optimizer.zero_grad()

        scores = model(source_tokens, target_tokens)

        # Step 0 is skipped on both sides; the remaining steps are flattened
        # into one long list of (score vector, target id) pairs.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = target_tokens[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()

        # Rescale the gradients so their overall norm does not exceed clip.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [312]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over iterator without updating the model.

    Args:
        model: called as model(src, trg, 0), i.e. with a teacher forcing
            rate of 0; its output is indexed as [step, batch, vocabulary].
        iterator: sized iterable of batches exposing .src and .trg.
        criterion: loss called as criterion(flat_scores, flat_targets).

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.eval()

    running_loss = 0

    # Gradients are not needed here.
    with torch.no_grad():
        for batch in iterator:
            source_tokens, target_tokens = batch.src, batch.trg

            scores = model(source_tokens, target_tokens, 0)

            # Step 0 is skipped on both sides before flattening.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = target_tokens[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [101]:
epochs = 10
grad_clip = 1
checkpoint_path = 'seq2seq.pt'

# Start at infinity so that any finite validation loss counts as an improvement.
lowest_validation_loss = float('inf')

for epoch in range(epochs):

    # Time one full training pass plus one validation pass.
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, grad_clip)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    # Write a checkpoint only when the validation loss has improved.
    is_best_so_far = valid_loss < lowest_validation_loss
    if is_best_so_far:
        torch.save(model.state_dict(), checkpoint_path)
        lowest_validation_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Time: {np.round(end_time-start_time,0)}s')
    print(f'\tTrain Loss: {train_loss:.4f}')
    print(f'\t Val. Loss: {valid_loss:.4f}')

Epoch: 01 | Time: 2335.0s
	Train Loss: 4.7367
	 Val. Loss: 4.6161
Epoch: 02 | Time: 2294.0s
	Train Loss: 4.0204
	 Val. Loss: 4.2858
Epoch: 03 | Time: 2301.0s
	Train Loss: 3.6623
	 Val. Loss: 4.0794
Epoch: 04 | Time: 2315.0s
	Train Loss: 3.4008
	 Val. Loss: 3.9103
Epoch: 05 | Time: 2305.0s
	Train Loss: 3.1771
	 Val. Loss: 3.8092
Epoch: 06 | Time: 2306.0s
	Train Loss: 3.0029
	 Val. Loss: 3.7161
Epoch: 07 | Time: 2298.0s
	Train Loss: 2.8314
	 Val. Loss: 3.6566
Epoch: 08 | Time: 2303.0s
	Train Loss: 2.6819
	 Val. Loss: 3.6300
Epoch: 09 | Time: 2315.0s
	Train Loss: 2.5497
	 Val. Loss: 3.6144
Epoch: 10 | Time: 2328.0s
	Train Loss: 2.4193
	 Val. Loss: 3.5642


In [313]:
# Restore the weights saved in 'seq2seq.pt'. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU machine
# can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('seq2seq.pt', map_location = device))

test_loss = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.4f}')

Test Loss: 3.5179


In [327]:
def translate(model, iterator, limit = 4):
    """Print the source, reference and predicted tokens for the first batches.

    The model is run with a teacher forcing rate of 0 but is still given the
    reference target, so the prediction has exactly as many steps as the
    reference. Only the first sequence of each batch is printed, so the
    iterator is expected to use batch_size = 1.

    Args:
        model: called as model(src, trg, 0); must return scores indexed as
            [step, batch, vocabulary].
        iterator: yields batches exposing .src and .trg token-id tensors
            indexed as [step, batch].
        limit: maximum number of batches to print.

    Returns:
        None. The translations are printed.
    """
    model.eval()

    with torch.no_grad():

        for i, batch in enumerate(iterator):
            # Stop as soon as enough examples were printed instead of walking
            # through the rest of the iterator doing nothing.
            if i >= limit:
                break

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)

            # Highest-scoring vocabulary entry per step, taken along the
            # vocabulary axis so the result is correct for any batch size.
            src_ids = src[:, 0].tolist()
            trg_ids = trg[:, 0].tolist()
            pred_ids = output.argmax(-1)[:, 0].tolist()

            # [1:-1] drops the first and last positions of each sequence; the
            # source tokens are additionally flipped back with [::-1].
            print('English Input: ' + str([SOURCE.vocab.itos[x] for x in src_ids][1:-1][::-1]))
            print('Correct German Output: ' + str([TARGET.vocab.itos[x] for x in trg_ids][1:-1]))
            print('Predicted German Output: ' + str([TARGET.vocab.itos[x] for x in pred_ids][1:-1]))
            print('\n')

In [328]:
# Build only the iterator that is actually used: one example per batch over
# the test split. BucketIterator.splits would also construct train and
# validation iterators just to throw them away, and unpacking into `_`
# rebinds the variable IPython uses for the last cell output.
# train = False is what BucketIterator.splits passes for every split after
# the first one.
eval_iterator = BucketIterator(
    test_data,
    batch_size = 1,
    train = False,
    device = device)

In [329]:
# translate only prints and returns None, so its result is not bound to a name.
translate(model, eval_iterator)

English Input: ['two', 'men', 'wearing', 'hats', '.']
Correct German Output: ['zwei', 'männer', 'mit', 'mützen', '.']
Predicted German Output: ['zwei', 'männer', 'mit', 'hüten', '.']


English Input: ['young', 'woman', 'climbing', 'rock', 'face']
Correct German Output: ['junge', 'frau', 'klettert', 'auf', 'felswand']
Predicted German Output: ['eine', 'frau', 'klettert', 'durch', 'einen']


English Input: ['a', 'woman', 'is', 'playing', 'volleyball', '.']
Correct German Output: ['eine', 'frau', 'spielt', 'volleyball', '.']
Predicted German Output: ['eine', 'frau', 'spielt', 'volleyball', '.']


English Input: ['three', 'men', 'are', 'walking', 'up', 'hill', '.']
Correct German Output: ['drei', 'männer', 'gehen', 'bergauf', '.']
Predicted German Output: ['drei', 'männer', 'gehen', 'durch', 'den']


