In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import numpy as np

import random
import math
import time

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
spacy_german =  spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')



In [5]:
def tokenize_german(text):
    return [token.text  for token in  spacy_german.tokenizer(text)]

def tokenize_english(text):
    return [token.text for token in  spacy_english.tokenizer(text)]



In [6]:
SOURCE = Field(tokenize = tokenize_english,
                init_token = '<sos>',
                eos_token = '<eos>',
                lower = True 
                )

TARGET = Field(tokenize = tokenize_german, 

                init_token='<sos>',
                eos_token='<eos>',
                lower = True
                )

In [7]:
train_data , valid_data , test_data = Multi30k.splits(exts={'.en','.de'}, fields = (SOURCE, TARGET))

In [8]:
train_data.examples[0].src

['zwei',
 'junge',
 'weiße',
 'männer',
 'sind',
 'i',
 'm',
 'freien',
 'in',
 'der',
 'nähe',
 'vieler',
 'büsche',
 '.']

In [9]:
SOURCE.build_vocab(train_data, min_freq=2 )
TARGET.build_vocab(train_data,min_freq = 2)

In [10]:
print("English (Source) Vocabulary Size: " + str(len(SOURCE.vocab)))
print("German (Target) Vocabulary Size: " + str(len(TARGET.vocab)))


English (Source) Vocabulary Size: 7874
German (Target) Vocabulary Size: 5972


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size =32

train_iterator ,valid_iterator , test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    device = device
)


In [12]:
class Encoder(nn.Module):
    def __init__(self, input_dims, emb_dims, hid_dims, n_layers, dropout):
        super().__init__()

        self.hid_dims = hid_dims
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dims, emb_dims)
        self.rnn =  nn.LSTM(emb_dims, hid_dims, n_layers, dropout= dropout)
        self.dropout = nn.Dropout(dropout)


    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (h,cell) = self.rnn(embedded)
        return h, cell
        

In [13]:
class Decoder(nn.Module):
    def __init__(self, output_dims, emb_dims, hid_dims, n_layers, dropout):
        super().__init__()

        self.output_dims = output_dims
        self.hid_dims  = hid_dims
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dims, emb_dims)
        self.rnn = nn.LSTM(emb_dims, hid_dims, n_layers , dropout = dropout)

        self.fc_out = nn.Linear(hid_dims, output_dims)
        self.dropout = nn.Dropout(dropout)


    def forward(self, input, h, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output ,(h,cell) = self.rnn(embedded, (h,cell))
        pred = self.fc_out(output.squeeze(0))
        return pred , h,cell

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device ):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg,teacher_forcing_rate =0.5):

        batch_size = trg.shape[1]
        target_length = trg.shape[0]
        target_vocab_size = self.decoder.output_dims

        outputs = torch.zeros(target_length, batch_size, target_vocab_size).to(self.device)
        h, cell = self.encoder(src)
        input = trg[0,:]
        for t in range (1, target_length):
            output, h ,cell = self.decoder(input, h,cell)
            outputs[t] = output
            top = output.argmax(1)
            input = trg[t] if (random.random() < teacher_forcing_rate) else top

        return outputs
        



In [22]:
input_dimensions = len(SOURCE.vocab)
output_dimensions = len(TARGET.vocab)
encoder_embedding_dimensions = 256
decoder_embedding_dimensions = 256
hidden_layer_dimensions = 512
number_of_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

encod = Encoder(input_dimensions, encoder_embedding_dimensions, hidden_layer_dimensions, number_of_layers, encoder_dropout)

decod = Decoder(output_dimensions, decoder_embedding_dimensions, hidden_layer_dimensions,number_of_layers, decoder_dropout )

model = Seq2Seq(encod, decod, device).to(device)




In [23]:
def initialize_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.1, 0.1)
        
model.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7874, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5972, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5972, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [24]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index= TARGET.vocab.stoi[TARGET.pad_token])


In [25]:
def train( model, iterator, optimizer , criterion, clip ):
    model.train()

    epoch_loss = 0

    for i , batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        output = model(src, trg)
        output_dims = output.shape[-1]
        output = output[1:].view(-1,output_dims)
        trg= trg[1:].view(-1)
        loss = criterion(output,trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

        

In [26]:
def evaluate(model, iterator , criterion):
    model.eval()
    epoch_loss =0 
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg,0)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)

            trg = trg[1:].view(-1)
            loss = criterion(output,trg)
            epoch_loss += loss.item()

    return epoch_loss/ len(iterator)
    

In [27]:
epochs = 10 
grad_clip = 1

lowest_validation_loss = float('inf')
for epoch in range(epochs):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, grad_clip)

    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    if valid_loss < lowest_validation_loss:
        lowest_validation_loss = valid_loss
        torch.save(model.state_dict(),'seq2seq.pt')

    print(f'Epoch:{epoch+1:02} | Time : {np.round(end_time-start_time,0)}s')
    print(f'\t train loss: {train_loss:.4f}')
    print(f'\t val loss: {valid_loss:.4f}')

Epoch:01 | Time : 801.0s
	 train loss: 4.5648
	 val loss: 4.3993
Epoch:02 | Time : 760.0s
	 train loss: 3.8672
	 val loss: 4.1499
Epoch:03 | Time : 700.0s
	 train loss: 3.5723
	 val loss: 3.9506
Epoch:04 | Time : 673.0s
	 train loss: 3.3399
	 val loss: 3.7692
Epoch:05 | Time : 661.0s
	 train loss: 3.1289
	 val loss: 3.7095
Epoch:06 | Time : 665.0s
	 train loss: 2.9551
	 val loss: 3.6549
Epoch:07 | Time : 1673.0s
	 train loss: 2.8046
	 val loss: 3.5621
Epoch:08 | Time : 1667.0s
	 train loss: 2.6857
	 val loss: 3.5728
Epoch:09 | Time : 1620.0s
	 train loss: 2.5563
	 val loss: 3.5683
Epoch:10 | Time : 1638.0s
	 train loss: 2.4491
	 val loss: 3.5842


In [28]:
model.load_state_dict(torch.load('seq2seq.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss:{test_loss:.4f}')

Test Loss:3.6110


In [29]:
def translate(model, iterator, limit=4):
    model.eval()
    epoch_loss =0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            if  i < limit:
                src = batch.src
                trg = batch.trg

                output = model(src, trg, 0)
                preds = torch.tensor([[torch.argmax(x).item()] for x in output])
                
                print('English Input: ' + str([SOURCE.vocab.itos[x] for x in src][1:-1][::-1]))
                print('Correct German Output: ' + str([TARGET.vocab.itos[x] for x in trg][1:-1]))
                print('Predicted German Output: ' + str([TARGET.vocab.itos[x] for x in preds][1:-1]))
                print('\n')

In [30]:
_ , _ , eval_iterator= BucketIterator.splits(
        (train_data ,  valid_data , test_data),batch_size = 1, 
        device = device
    )

In [31]:
output = translate(model, eval_iterator)


English Input: ['.', 'mützen', 'mit', 'männer', 'zwei']
Correct German Output: ['two', 'men', 'wearing', 'hats', '.']
Predicted German Output: ['man', 'in', 'a', 'yellow', 'shirt']


English Input: ['felswand', 'auf', 'klettert', 'frau', 'junge']
Correct German Output: ['young', 'woman', 'climbing', 'rock', 'face']
Predicted German Output: ['young', 'woman', 'on', 'a', 'bicycle']


English Input: ['.', 'volleyball', 'spielt', 'frau', 'eine']
Correct German Output: ['a', 'woman', 'is', 'playing', 'volleyball', '.']
Predicted German Output: ['a', 'woman', 'playing', 'playing', 'the', '.']


English Input: ['.', 'bergauf', 'gehen', 'männer', 'drei']
Correct German Output: ['three', 'men', 'are', 'walking', 'up', 'hill', '.']
Predicted German Output: ['men', 'are', 'walking', 'together', '.', '<eos>', '<eos>']


