# Neural Machine Translation

Description of the dataset

In [None]:
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

# Basic Packages
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.datasets import TranslationDataset, WMT14
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os

import itertools
import allennlp

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re

In [None]:
from allennlp.modules.attention import LinearAttention, CosineAttention, BilinearAttention, DotProductAttention

# Global Parameters

In [None]:
# Seed shared by Python's `random` and torch (see the seeding calls below)
SEED = 1
BATCH_SIZE = 512
N_EPOCHS = 100
# Maximum gradient norm handed to train() for gradient clipping
CLIP = 10
# Embedding and GRU hidden sizes for the encoder / decoder
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
# Dropout probabilities applied to the encoder / decoder embeddings
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

# Directory that receives the model checkpoints and the metric dumps
SAVE_DIR = 'exp_small_data_100'

# NOTE(review): NumPy's RNG is not seeded here; only `random` and torch are.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the spaCy French and English pipelines; the visible code only uses
# their `.tokenizer` attribute.
# NOTE(review): the short names 'fr' / 'en' presumably rely on spaCy shortcut
# links being installed — confirm against the spaCy version in use.
spacy_fr = spacy.load('fr')
spacy_en = spacy.load('en')

In [None]:
# Plain spaCy tokenizers, used when creating and filtering the initial data splits
def tokenize_fr(text):
    """
    Split a French string into a list of token strings with the spaCy tokenizer
    """
    tokens = []
    for tok in spacy_fr.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings with the spaCy tokenizer
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

In [None]:
# Source (English) and target (French) fields: text is lower-cased and wrapped
# in <sos> / <eos> markers.
# NOTE(review): each Field receives the tokenizer function object that exists
# when this cell runs; redefining tokenize_en / tokenize_fr afterwards
# presumably does not change what these fields call — confirm the intended
# cell execution order.
SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_fr, init_token='<sos>', eos_token='<eos>', lower=True)

# 1. Exploratory Data Analysis

## 1.1 Loading and Preprocessing

In this example, spaCy's pretrained tokenizers are used to tokenize the English and French sentences.
These spaCy tokenizers are used in conjunction with torchtext, which turns the processed data into torch tensors and builds the dataset iterators used for training. 

### Filter data to smaller dataset

In [None]:
class FrenchTatoeba(TranslationDataset):
    """English-to-French sentence pairs from Tatoeba (PyTorch tutorial archive)."""

    urls = ['https://download.pytorch.org/tutorial/data.zip']
    name = 'FrenchTatoeba'
    dirname = ''

    @classmethod
    def format_data(cls, download_dir, lang1, lang2, reverse=False):
        """Split the tab-separated pair file into two parallel text files.

        Reads ``data/<lang1>-<lang2>.txt`` below ``download_dir`` and writes
        the first column of every line to ``all_data.en`` and the second
        column to ``all_data.fr`` (one sentence per line, UTF-8).

        Arguments:
            download_dir: Directory containing the extracted archive.
            lang1: Language code used in the input file name (first column).
            lang2: Language code used in the input file name (second column).
            reverse: If True, swap the two columns before writing.
        """
        # Side effect: re-seeds Python's global RNG. Nothing is sampled in
        # this method, but a shuffle performed afterwards by the caller
        # yields the same split every time.
        random.seed(1)
        print("Reading lines...")

        # Read the file and split into lines (closing the handle afterwards)
        pair_path = os.path.join(download_dir, 'data/%s-%s.txt' % (lang1, lang2))
        with open(pair_path, encoding='utf-8') as pair_file:
            lines = pair_file.read().strip().split('\n')

        # Split every line into its tab-separated columns
        pairs = [l.split('\t') for l in lines]

        # Reverse pairs
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]

        # Write with the same encoding the input was read with, instead of
        # the platform default, so accented characters survive everywhere.
        with open(os.path.join(download_dir, 'all_data.en'), 'w', encoding='utf-8') as lang1_file, \
                open(os.path.join(download_dir, 'all_data.fr'), 'w', encoding='utf-8') as lang2_file:
            for p in pairs:
                lang1_file.write(p[0] + '\n')
                lang2_file.write(p[1] + '\n')

    @classmethod
    def all_data(cls, exts, fields, root='.data',
               train='all_data', validation=None, test=None, **kwargs):
        """Create dataset objects from the full, unsplit Tatoeba data.

        Downloads the archive when no dataset folder is found, regenerates
        the ``all_data.*`` files via ``format_data`` and loads them.

        Arguments:
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            root: Root dataset storage directory. Default is '.data'.
            train: The prefix of the train data. Default: 'all_data'.
            validation: The prefix of the validation data, or None to skip
                it. Default: None.
            test: The prefix of the test data, or None to skip it.
                Default: None.
            Remaining keyword arguments: ``path`` (optional) overrides the
                dataset directory; everything else is passed to the dataset
                constructor.

        Returns:
            A tuple with the datasets that were requested (those whose
            prefix is not None), in train / validation / test order.
        """

        if 'path' not in kwargs:
            expected_folder = os.path.join(root, cls.name)
            path = expected_folder if os.path.exists(expected_folder) else None
        else:
            path = kwargs.pop('path')

        if path is None:
            path = cls.download(root)

        cls.format_data(path, 'eng', 'fra')

        train_data = None if train is None else cls(
            os.path.join(path, train), exts, fields, **kwargs)
        val_data = None if validation is None else cls(
            os.path.join(path, validation), exts, fields, **kwargs)
        test_data = None if test is None else cls(
            os.path.join(path, test), exts, fields, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data)
                     if d is not None)

    @classmethod
    def splits(cls, exts, fields, root='.data',
               train='train', validation='val', test='test', **kwargs):
        """Create dataset objects for splits of the Tatoeba dataset.
        Arguments:
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            root: Root dataset storage directory. Default is '.data'.
            train: The prefix of the train data. Default: 'train'.
            validation: The prefix of the validation data. Default: 'val'.
            test: The prefix of the test data. Default: 'test'.
            Remaining keyword arguments: ``path`` (optional) overrides the
                dataset directory; everything else is passed to the splits
                method of the parent class.
        """

        if 'path' not in kwargs:
            expected_folder = os.path.join(root, cls.name)
            path = expected_folder if os.path.exists(expected_folder) else None
        else:
            path = kwargs.pop('path')

        return super(FrenchTatoeba, cls).splits(
            exts, fields, path, root, train, validation, test, **kwargs)

In [None]:
# Strip accents from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` NFD-decomposed with every combining mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents and collapse every run of characters other
# than letters and . ! ? into a single space (punctuation is not padded)
def normalizeString(s):
    """Return a lower-cased, accent-free copy of `s` restricted to letters, '.', '!' and '?'."""
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

def tokenize_fr(text):
    """
    Normalizes French text with normalizeString, then tokenizes it into a list of strings
    """
    doc = spacy_fr.tokenizer(normalizeString(text))
    return [tok.text for tok in doc]

def tokenize_en(text):
    """
    Normalizes English text with normalizeString, then tokenizes it into a list of strings
    """
    doc = spacy_en.tokenizer(normalizeString(text))
    return [tok.text for tok in doc]

In [None]:
# Exclusive upper bound on the length of either side of a pair
MAX_LENGTH = 20

def filterPair(p):
    """Return True when both p[0] and p[1] are shorter than MAX_LENGTH."""
    if len(p[0]) >= MAX_LENGTH:
        return False
    return len(p[1]) < MAX_LENGTH

In [None]:
# Load the complete pair set as a single dataset (all_data returns a tuple of
# the requested datasets, hence the [0]).  filter_pred is forwarded through
# **kwargs to the dataset constructor and keeps only examples for which
# filterPair is true, i.e. both sides are shorter than MAX_LENGTH
# (presumably counted in tokens — confirm).
all_data = FrenchTatoeba.all_data(exts=('.en', '.fr'), 
                  fields=(SRC, TRG), 
                  filter_pred=lambda ex: filterPair([ex.src, ex.trg]))[0]

In [None]:
# Sanity check: print the source / target sides of the first ten examples
for e in all_data[:10]:
    print(e.src, e.trg)

In [None]:
download_dir = os.path.join('.data', FrenchTatoeba.name)
n_examples = len(all_data)

# Shuffle the example indices, then cut them 80% / 10% / 10%
# into train / validation / test
idx_array = list(range(n_examples))
random.shuffle(idx_array)
train_end = int(0.8*n_examples)
val_end = int(0.9*n_examples)
train_indexs, val_indexs, test_indexs = (
    idx_array[:train_end],
    idx_array[train_end:val_end],
    idx_array[val_end:],
)

### Save train, test, val files for future experiments 

In [None]:
# Write each split as a pair of parallel files (<split>.en / <split>.fr) with
# one space-joined token sequence per line.  UTF-8 is requested explicitly so
# the files do not depend on the platform's default encoding (the raw pair
# file is read as UTF-8 as well).
for split_name, split_indexs in (('train', train_indexs),
                                 ('val', val_indexs),
                                 ('test', test_indexs)):
    with open(os.path.join(download_dir, split_name + '.en'), 'w', encoding='utf-8') as lang1_file, \
            open(os.path.join(download_dir, split_name + '.fr'), 'w', encoding='utf-8') as lang2_file:
        for i in split_indexs:
            lang1_file.write(' '.join(all_data[i].src) + '\n')
            lang2_file.write(' '.join(all_data[i].trg) + '\n')


### Load individual datasets

In [None]:
# Load the train / val / test files from the dataset directory using the
# default prefixes of FrenchTatoeba.splits ('train', 'val', 'test')
train_data, valid_data, test_data = FrenchTatoeba.splits(path='./.data/FrenchTatoeba/', exts=('.en', '.fr'), fields=(SRC, TRG))

In [None]:
# Build both vocabularies from the training split only, with a minimum
# token frequency of 5
SRC.build_vocab(train_data, min_freq=5)
TRG.build_vocab(train_data, min_freq=5)

In [None]:
# Vocabulary sizes; used as the encoder input and decoder output dimensions
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

In [None]:
# One batch iterator per split, all using BATCH_SIZE and placing batches on `device`
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

## 1.2 Target distribution analysis

In [None]:
# Number of examples in each split
for size_label, split in (('Training set size: ', train_data),
                          ('Validation set size: ', valid_data),
                          ('Testing set size: ', test_data)):
    print(size_label, len(split))

# Number of entries in each vocabulary
for vocab_label, field in (('Size of English vocabulary: ', SRC),
                           ('Size of French vocabulary: ', TRG)):
    print(vocab_label, len(field.vocab))

In [None]:
def show_dual_hist(data, data_src_title, data_tgt_title, title):
    """Plot side-by-side histograms of source and target sentence lengths.

    Arguments:
        data: Iterable of examples exposing `src` and `trg` sequences.
        data_src_title: Legend label for the source-side lengths.
        data_tgt_title: Legend label for the target-side lengths.
        title: Title of the chart.

    The figure size is taken from the current matplotlib settings
    (e.g. plt.rcParams['figure.figsize']); no extra figure is created.
    """
    data_1 = np.asarray([len(x.src) for x in data])
    data_2 = np.asarray([len(x.trg) for x in data])
    max_len = int(max(max(data_1), max(data_2)))
    # Bin edges 1 .. max_len + 1 give every length its own unit-wide bin.
    # With a last edge of max_len, the closed final bin would merge the
    # counts of the two largest lengths.
    bins = range(1, max_len + 2)
    plt.hist([data_1, data_2], bins, label=[data_src_title, data_tgt_title], align='left')
    plt.legend(loc='upper right')

    plt.grid(True)
    # One tick per length, including the maximum
    plt.xticks(range(1, max_len + 1))
    plt.title(title)
    plt.ylabel("Number of examples")
    plt.xlabel("Sentence length (tokens)")
    plt.show()

In [None]:
plt.rcParams['figure.figsize'] = [15, 5]
# Same English / French length histogram for each of the three splits
for split_data, split_title in ((train_data, "Training Sentence Lengths"),
                                (valid_data, "Validation Sentence Lengths"),
                                (test_data, "Testing Sentence Lengths")):
    show_dual_hist(split_data, 'English', 'French', split_title)

In [None]:
def word_frequency_bar_plot(data, data_title, top_n=100):
    """Draw a horizontal bar chart of the `top_n` most frequent entries of `data.freqs`.

    `data` must expose a `freqs` mapping from word to count; `data_title`
    is used as the prefix of the chart title.
    """
    ranked_words = sorted(data.freqs, key=data.freqs.get, reverse=True)
    top_words = ranked_words[0:top_n]
    top_counts = [data.freqs[word] for word in top_words]
    positions = np.arange(len(top_words))

    plt.barh(positions, top_counts, align='center', alpha=0.5)
    plt.yticks(positions, top_words)
    # Most frequent word at the top of the chart
    plt.gca().invert_yaxis()
    plt.xlabel('Count')
    title_suffix = " Top " + str(top_n) + " word counts"
    plt.title(data_title + title_suffix)
    plt.show()

In [None]:
# Tall figure so that the 50 word labels of the English chart stay readable
plt.rcParams['figure.figsize'] = [7, 18]
word_frequency_bar_plot(SRC.vocab, 'English', top_n=50)

In [None]:
# Tall figure so that the 50 word labels of the French chart stay readable
plt.rcParams['figure.figsize'] = [7, 18]
word_frequency_bar_plot(TRG.vocab, 'French', top_n=50)

# 2. Model Building

## 2.1 Utilities for training and testing

In [None]:
# Allows us to optimize the encoder and decoder separately
class MultipleOptimizer(object):
    """Bundle several optimizers behind a single zero_grad() / step() interface."""

    def __init__(self, *op):
        # Tuple of wrapped optimizers, in the order they were given
        self.optimizers = op

    def _call_each(self, method_name):
        """Invoke the named no-argument method on every wrapped optimizer, in order."""
        for wrapped in self.optimizers:
            getattr(wrapped, method_name)()

    def zero_grad(self):
        """Reset the gradients of every wrapped optimizer."""
        self._call_each('zero_grad')

    def step(self):
        """Apply one update step with every wrapped optimizer."""
        self._call_each('step')

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train `model` for one epoch and return the mean loss per batch.

    Arguments:
        model: Module called as model(src, trg); its output is indexed as
            [trg len, batch size, vocab size].
        iterator: Sized iterable of batches exposing `.src` and `.trg`.
        optimizer: Object providing zero_grad() and step().
        criterion: Loss called with flattened logits and flattened targets.
        clip: Maximum gradient norm; a falsy value disables clipping.

    Returns:
        Sum of the batch losses divided by len(iterator).
    """
    model.train()
    epoch_loss = 0
    n_batches = len(iterator)
    with tqdm(total=n_batches) as t:
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            optimizer.zero_grad()

            output = model(src, trg)
            # Position 0 is excluded from the loss on both sides
            loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
            loss.backward()

            if clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
            epoch_loss += loss.item()

            # Show the mean over the batches processed so far; dividing by
            # the total batch count would under-report the loss until the
            # very last batch.
            t.set_postfix(loss='{:05.3f}'.format(epoch_loss / (i + 1)))
            t.update()

    return epoch_loss / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of `model` over `iterator` without updating it.

    The model is switched to eval mode and called with a teacher-forcing
    ratio of 0; gradients are disabled for the whole pass.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            # third argument 0: turn off teacher forcing
            logits = model(batch.src, batch.trg, 0)
            vocab_size = logits.shape[2]
            # Position 0 is excluded from the loss on both sides
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = batch.trg[1:].view(-1)
            batch_losses.append(criterion(flat_logits, flat_targets).item())

    return sum(batch_losses) / len(iterator)

In [None]:
def eval_metric(history, metric_name):
    """Plot a training metric and its validation counterpart against the epoch number.

    Arguments:
        history: Object whose `history` attribute maps metric names to
            per-epoch sequences (presumably a Keras-style History — confirm).
        metric_name: Key of the training metric; the validation series is
            read from 'val_' + metric_name.
    """
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    # Epoch axis derived from the recorded values; the previous code used an
    # undefined NB_EPOCHS constant and raised NameError.
    e = range(1, len(metric) + 1)

    plt.plot(e, metric, color='navy', label='Train ' + metric_name)
    plt.plot(e, val_metric, color='red', label='Validation ' + metric_name)
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

In [None]:
def plot_metrics(metrics_dict, title, chart_filter=''):
    """Plot the per-epoch metrics whose name matches `chart_filter`.

    `metrics_dict` maps an epoch to a dict of metric values; after the
    transpose each metric is a column.  Columns are kept when their name
    matches the regular expression '.*' + chart_filter + '.*'.
    """
    frame = pd.DataFrame.from_dict(metrics_dict).T
    pattern = re.compile('.*'+ chart_filter +'.*')
    wanted = [column for column in list(frame.columns.values)
              if pattern.match(column)]
    axes = frame[wanted].plot(figsize=(10,6), title=title)
    axes.legend(bbox_to_anchor=(1, 1))

In [None]:
def predict_text(model, text):
    """Greedily translate one source sentence and collect the attention weights.

    Arguments:
        model: Object exposing `encoder(src)` -> (encoder_outputs, hidden)
            and `decoder(input, hidden, encoder_outputs)` ->
            (output, hidden, attention).
        text: Source sentence as a string.

    Returns:
        A tuple (output_sentence, attentions): the decoded tokens joined by
        spaces (ending with '<eos>' when it was produced) and a tensor with
        one row of attention weights per decoding step.

    Decoding stops at '<eos>' or after 2 * len(src) steps.
    """
    model.eval()
    eos_idx = TRG.vocab.stoi['<eos>']
    with torch.no_grad():
        src = SRC.process([SRC.preprocess(text)]).to(device)
        max_len = 2 * len(src)
        encoder_outputs, hidden = model.encoder(src)

        #first input to the decoder is the <sos> token
        output = TRG.process([TRG.preprocess(' ')])[0,:].to(device)

        decoded_words = []
        decoder_attentions = torch.zeros(max_len, len(src))
        n_steps = 0

        for t in range(max_len):
            output, hidden, attn = model.decoder(output, hidden, encoder_outputs)
            decoder_attentions[t] = attn.data
            n_steps = t + 1

            # Greedy choice; it is also the next decoder input.  No ground
            # truth exists at prediction time, so there is no teacher forcing.
            top1 = output.max(1)[1]
            output = top1

            # Plain int so that it can index the vocabulary list
            token_idx = top1.item()
            if token_idx == eos_idx:
                decoded_words.append('<eos>')
                break
            decoded_words.append(TRG.vocab.itos[token_idx])

        output_sentence = ' '.join(decoded_words)
        return output_sentence, decoder_attentions[:n_steps]

In [None]:
def showAttention(input_sentence, output_sentence, attentions):
    """Display `attentions` as a heat map labelled with the input and output words.

    Both sentences are split on single spaces to build the tick labels; the
    input labels are wrapped in '<sos>' / '<eos>'.
    """
    # Heat map with its colour bar
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy())
    figure.colorbar(heatmap)

    # Tick labels (the leading '' entries are kept exactly as before)
    source_labels = ['', '<sos>'] + input_sentence.split(' ') + ['<eos>']
    target_labels = [''] + output_sentence.split(' ')
    axes.set_xticklabels(source_labels, rotation=90)
    axes.set_yticklabels(target_labels)

    # One tick per cell on both axes
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(model, input_sentence):
    """Translate `input_sentence` with `model`, print both sentences and plot the attention."""
    prediction = predict_text(model, input_sentence)
    output_words, attentions = prediction
    print('input =', input_sentence)
    print('output =', output_words)
    showAttention(input_sentence, output_words, attentions)

## 2.2 Experiments

### 2.2.2 Bahdanau Attention

In [None]:
class BahdanauEncoder(nn.Module):
    """Two-layer bidirectional GRU encoder that also produces the initial decoder state."""

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        # Hyper-parameters kept on the instance
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=2, bidirectional=True)
        # Maps the concatenated forward/backward state to the decoder size
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` ([src sent len, batch size]).

        Returns (outputs, hidden) with
            outputs = [src sent len, batch size, enc hid dim * 2]
            hidden  = [batch size, dec hid dim]
        """
        # embedded = [src sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))

        # outputs come from the last layer only;
        # hidden = [n layers * num directions, batch size, hid dim], stacked
        # as [forward_1, backward_1, forward_2, backward_2, ...]
        outputs, hidden = self.rnn(embedded)

        # Final states of the last layer: index -2 is the forward RNN,
        # index -1 the backward RNN
        last_forward, last_backward = hidden[-2], hidden[-1]

        # Initial decoder hidden: both final states through a linear layer + tanh
        summary = torch.cat((last_forward, last_backward), dim=1)
        hidden = torch.tanh(self.fc(summary))

        return outputs, hidden

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: scores each encoder position against the decoder state."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim

        # Projects [decoder state ; encoder output] to dec_hid_dim
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        # Learned vector that reduces each energy vector to a scalar score
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch size, src len].

        hidden          = [batch size, dec hid dim]
        encoder_outputs = [src sent len, batch size, enc hid dim * 2]
        """
        src_len, batch_size = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # Copy the decoder state once per source position
        # tiled_hidden = [batch size, src sent len, dec hid dim]
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # enc_batch_first = [batch size, src sent len, enc hid dim * 2]
        enc_batch_first = encoder_outputs.permute(1, 0, 2)

        # energy = [batch size, src sent len, dec hid dim]
        energy = torch.tanh(self.attn(torch.cat((tiled_hidden, enc_batch_first), dim=2)))

        # energy = [batch size, dec hid dim, src sent len]
        energy = energy.permute(0, 2, 1)

        # v = [batch size, 1, dec hid dim]
        v = self.v.repeat(batch_size, 1).unsqueeze(1)

        # scores = [batch size, src len]
        scores = torch.bmm(v, energy).squeeze(1)

        return F.softmax(scores, dim=1)

In [None]:
class BahdanauDecoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        # Width of the attention-weighted encoder summary
        context_dim = enc_hid_dim * 2

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(context_dim + emb_dim, dec_hid_dim)
        self.out = nn.Linear(context_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        input           = [batch size]
        hidden          = [batch size, dec hid dim]
        encoder_outputs = [src sent len, batch size, enc hid dim * 2]

        Returns (prediction, hidden, attns) with
            prediction = [batch size, output dim]
            hidden     = [batch size, dec hid dim]
            attns      = [batch size, src len]
        """
        # embedded = [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))

        # attns = [batch size, src len]
        attns = self.attention(hidden, encoder_outputs)

        # enc_batch_first = [batch size, src sent len, enc hid dim * 2]
        enc_batch_first = encoder_outputs.permute(1, 0, 2)

        # Attention-weighted sum of the encoder outputs
        # context = [1, batch size, enc hid dim * 2]
        context = torch.bmm(attns.unsqueeze(1), enc_batch_first).permute(1, 0, 2)

        # rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
        rnn_input = torch.cat((embedded, context), dim=2)

        # One time step, one layer, one direction, therefore
        # output = hidden = [1, batch size, dec hid dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim=1)
        prediction = self.out(features)

        return prediction, hidden.squeeze(0), attns

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return the decoder outputs, [trg sent len, batch size, trg vocab size].

        src = [src sent len, batch size]
        trg = [trg sent len, batch size]
        teacher_forcing_ratio is the probability of feeding the ground-truth
        token (rather than the model's own prediction) as the next input,
        e.g. 0.75 means teacher forcing is used 75% of the time.
        Row 0 of the result is left at zero.
        """
        max_len, batch_size = trg.shape[0], src.shape[1]

        # Buffer for the decoder outputs of every step
        outputs = torch.zeros(max_len, batch_size, self.decoder.output_dim).to(self.device)

        # encoder_outputs: all encoder states of the input sequence;
        # hidden: the initial decoder state produced by the encoder
        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the <sos> row of the target
        decoder_input = trg[0,:]

        for t in range(1, max_len):
            step_output, hidden, _ = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[t] = step_output
            use_teacher = random.random() < teacher_forcing_ratio
            greedy = step_output.max(1)[1]
            decoder_input = trg[t] if use_teacher else greedy

        return outputs

In [None]:
# Assemble the Bahdanau attention model and move it to the selected device
attn = BahdanauAttention(ENC_HID_DIM, DEC_HID_DIM)

enc = BahdanauEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = BahdanauDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

# optimizer = optim.Adam(model.parameters())
learning_rate = 0.001
# Multiplier applied to the learning rate of the decoder optimizer
decoder_learning_ratio = 1.0

# Initialize optimizers and criterion
# NOTE(review): the encoder uses momentum=0.99 while the decoder uses 0.9 —
# confirm that the difference is intentional.
encoder_optimizer = optim.SGD(enc.parameters(), lr=learning_rate, momentum=0.99)
decoder_optimizer = optim.SGD(dec.parameters(), lr=learning_rate * decoder_learning_ratio, momentum=0.9)
# Plateau schedulers in 'min' mode with a patience of 5; they only change the
# learning rate when their step() is called with the monitored value.
enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 'min', patience=5)
dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, 'min', patience=5)

# Single handle that drives both optimizers from train()
optimizer = MultipleOptimizer(encoder_optimizer, 
                        decoder_optimizer)

In [None]:
# Padding positions do not contribute to the loss
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# N_EPOCHS, CLIP and SAVE_DIR come from the global parameters
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'bahdanau_model.pt')
# epoch index -> {'train_loss', 'train_ppl', 'val_loss', 'val_ppl'}
bahdanau_training_metrics = {}

best_valid_loss = float('inf')

os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # The plateau schedulers only reduce the learning rate when they are
    # stepped with the monitored value; without these calls they are inert.
    enc_scheduler.step(valid_loss)
    dec_scheduler.step(valid_loss)

    # Checkpoint only the weights with the best validation loss so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} |')
    bahdanau_training_metrics[epoch] = {'train_loss':train_loss, 'train_ppl':math.exp(train_loss), 'val_loss':valid_loss, 'val_ppl':math.exp(valid_loss)}

In [None]:
# One chart for the loss curves and one for the perplexity curves
# (chart_filter selects the metric columns by name)
plot_metrics(bahdanau_training_metrics, "Bahdanau Attention", chart_filter='loss')
plot_metrics(bahdanau_training_metrics, "Bahdanau Attention", chart_filter='ppl')

In [None]:
# Restore the checkpoint saved during training (best validation loss) and
# measure it on the test split
model.load_state_dict(torch.load(os.path.join(SAVE_DIR, 'bahdanau_model.pt')))

test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
plt.rcParams['figure.figsize'] = [20, 5]

# Translate a handful of sample sentences and show their attention maps
sample_sentences = [
    'you should try to see it .',
    'i m shorter than you .',
    'it was difficult for me .',
    'i don t follow .',
    'the region is relatively rich in mineral resources .',
    'i took many photos and some of them were printed in black and white .',
]
for sample_sentence in sample_sentences:
    evaluateAndShowAttention(model, sample_sentence)

In [None]:
import json

# Persist the per-epoch metrics next to the model checkpoint.
# Note: JSON object keys are strings, so the integer epoch keys are written as strings.
with open(os.path.join(SAVE_DIR, 'bahdanau_model.json'), 'w') as fp:
    json.dump(bahdanau_training_metrics, fp)

### 2.2.3 Global Attention

In [None]:
class Encoder(nn.Module):
    """GRU encoder with a configurable number of layers and optional bidirectionality."""

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, 
                 dropout, num_layers=1, bidirectional=False):
        super().__init__()

        # Hyper-parameters kept on the instance
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.enc_hid_dim, self.dec_hid_dim = enc_hid_dim, dec_hid_dim
        self.num_layers, self.bidirectional = num_layers, bidirectional

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, num_layers=num_layers, bidirectional=bidirectional)

        # Only a bidirectional encoder needs to merge two final states
        if bidirectional:
            self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` ([src sent len, batch size]) and return (outputs, hidden).

        hidden is
          * bidirectional: the last layer's forward and backward states,
            concatenated and passed through fc + tanh;
          * unidirectional with several layers: the last layer's state;
          * unidirectional with one layer: the GRU hidden state unchanged.
        """
        # embedded = [src sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        if self.bidirectional:
            merged = torch.cat((hidden[-2], hidden[-1]), dim=1)
            hidden = torch.tanh(self.fc(merged))
        elif self.num_layers > 1:
            hidden = hidden[-1]

        return outputs, hidden

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder driven by a pluggable attention module."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, 
                 attention, bidirectional_input=False):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention
        self.bidirectional_input = bidirectional_input

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # The encoder summary is twice as wide when the encoder is bidirectional
        context_dim = (enc_hid_dim * 2) if bidirectional_input else enc_hid_dim
        self.rnn = nn.GRU(context_dim + emb_dim, dec_hid_dim)
        self.out = nn.Linear(context_dim + dec_hid_dim + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        input           = [batch size]
        hidden          = [batch size, dec hid dim] (a leading singleton
                          dimension is squeezed away if present)
        encoder_outputs = [src sent len, batch size, encoder output dim]

        Returns (prediction, hidden, attn) with
            prediction = [batch size, output dim]
            hidden     = [batch size, dec hid dim]
            attn       = [batch size, src len]
        """
        # embedded = [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))

        # batch_size=1 issue: drop a leading singleton dimension
        if hidden.dim() > 2:
            hidden = hidden.squeeze(0)

        # enc_batch_first = [batch size, src sent len, encoder output dim]
        enc_batch_first = encoder_outputs.permute(1, 0, 2)

        # Repeat the hidden state when its width differs from the encoder
        # outputs (attention on bidirectional outputs)
        if hidden.size(-1) != encoder_outputs.size(-1):
            query = hidden.repeat(1, 2)
        else:
            query = hidden

        # attn = [batch size, src len]
        attn = self.attention(query, enc_batch_first)

        # Attention-weighted sum of the encoder outputs
        # weighted = [1, batch size, encoder output dim]
        weighted = torch.bmm(attn.unsqueeze(1), enc_batch_first).permute(1, 0, 2)

        # rnn_input = [1, batch size, encoder output dim + emb dim]
        rnn_input = torch.cat((embedded, weighted), dim=2)

        # One time step, one layer, one direction, therefore
        # output = hidden = [1, batch size, dec hid dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()

        features = torch.cat(
            (output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)), dim=1)
        prediction = self.out(features)

        return prediction, hidden.squeeze(0), attn

In [None]:
class Seq2Seq(nn.Module):
    """Glue module that runs an encoder once and a decoder step by step.

    The encoder is called on the whole source batch; the decoder is then
    called once per target position (starting at position 1), receiving the
    previous token, the running hidden state and all encoder outputs.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        # Assigned left to right, so the encoder is still registered first.
        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return the decoder scores for every target position.

        Args:
            src: source token ids, indexed as [src sent len, batch size].
            trg: target token ids, indexed as [trg sent len, batch size].
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the decoder's own best guess.

        Returns:
            Tensor of shape [trg sent len, batch size, decoder.output_dim].
            Row 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer that collects the decoder scores of every time step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # Encode once; the singleton dimension 1 of the hidden state (if any)
        # is dropped before it is handed to the decoder.
        encoder_outputs, hidden = self.encoder(src)
        hidden = hidden.squeeze(1)

        # Decoding starts from the first target row (the <sos> tokens).
        step_input = trg[0, :]

        for t in range(1, trg_len):
            scores, hidden, _ = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = scores

            # Draw the coin for this step, then pick the next input token.
            use_teacher = random.random() < teacher_forcing_ratio
            _, best_guess = torch.max(scores, dim=1)
            step_input = trg[t] if use_teacher else best_guess

        return outputs

### 2.2.4 Dot Product Attention

In [None]:
# Encoder topology; the same bidirectional flag is passed to the decoder.
enc_bidirectional = True
enc_num_layers = 4

# Build attention, encoder and decoder in this order so that seeded
# parameter initialisation is reproducible.
attn = DotProductAttention()
enc = Encoder(
    INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT,
    num_layers=enc_num_layers,
    bidirectional=enc_bidirectional,
)
dec = Decoder(
    OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn,
    bidirectional_input=enc_bidirectional,
)

model = Seq2Seq(enc, dec, device)
model = model.to(device)

# The decoder learns with a rate decoder_learning_ratio times the encoder's.
learning_rate = 0.01
decoder_learning_ratio = 5.0

# One SGD optimizer and one plateau scheduler per sub-network.
encoder_optimizer = optim.SGD(enc.parameters(), lr=learning_rate, momentum=0.9)
decoder_optimizer = optim.SGD(
    dec.parameters(),
    lr=learning_rate * decoder_learning_ratio,
    momentum=0.9,
)
enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', patience=5)
dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode='min', patience=5)

# Single handle over both optimizers.
optimizer = MultipleOptimizer(encoder_optimizer, decoder_optimizer)

In [None]:
# Train the dot-product attention model, keeping the checkpoint with the
# lowest validation loss and recording per-epoch metrics.
pad_idx = TRG.vocab.stoi['<pad>']
# Target padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'dot_model.pt')
dot_training_metrics = {}

best_valid_loss = float('inf')

# exist_ok makes a separate isdir() check unnecessary.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # ReduceLROnPlateau only lowers a learning rate when it is fed the
    # monitored metric; without these calls the schedulers never act.
    enc_scheduler.step(valid_loss)
    dec_scheduler.step(valid_loss)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    # Perplexity is exp(loss); compute each once for both the log and the record.
    train_ppl = math.exp(train_loss)
    valid_ppl = math.exp(valid_loss)

    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {valid_ppl:7.3f} |')
    dot_training_metrics[epoch] = {'train_loss':train_loss, 'train_ppl':train_ppl, 'val_loss':valid_loss, 'val_ppl':valid_ppl}

In [None]:
# Draw the loss chart first, then the perplexity chart, for this run.
for chart in ('loss', 'ppl'):
    plot_metrics(dot_training_metrics, "Dot Product Attention", chart_filter=chart)

In [None]:
# Restore the best checkpoint before scoring the held-out test set.
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
model.load_state_dict(
    torch.load(os.path.join(SAVE_DIR, 'dot_model.pt'), map_location=device)
)

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
import json

# Persist the per-epoch metrics of this run as JSON inside SAVE_DIR.
with open(os.path.join(SAVE_DIR, 'dot_model.json'), 'w') as fp:
    fp.write(json.dumps(dot_training_metrics))

In [None]:
# Wide figures so the attention plots of long sentences stay readable.
plt.rcParams['figure.figsize'] = [20, 5]

# Show the attention of the current model on a fixed set of probe sentences.
for sentence in (
    'you should try to see it .',
    'i m shorter than you .',
    'it was difficult for me .',
    'i don t follow .',
    'the region is relatively rich in mineral resources .',
    'i took many photos and some of them were printed in black and white .',
):
    evaluateAndShowAttention(model, sentence)

### 2.2.5 Cosine Attention

In [None]:
# Encoder topology; the same bidirectional flag is passed to the decoder.
enc_bidirectional = True
enc_num_layers = 4

# Build attention, encoder and decoder in this order so that seeded
# parameter initialisation is reproducible.
attn = CosineAttention()
enc = Encoder(
    INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT,
    num_layers=enc_num_layers,
    bidirectional=enc_bidirectional,
)
dec = Decoder(
    OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn,
    bidirectional_input=enc_bidirectional,
)

model = Seq2Seq(enc, dec, device)
model = model.to(device)

# The decoder learns with a rate decoder_learning_ratio times the encoder's.
learning_rate = 0.01
decoder_learning_ratio = 5.0

# One SGD optimizer and one plateau scheduler per sub-network.
encoder_optimizer = optim.SGD(enc.parameters(), lr=learning_rate, momentum=0.9)
decoder_optimizer = optim.SGD(
    dec.parameters(),
    lr=learning_rate * decoder_learning_ratio,
    momentum=0.9,
)
enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', patience=5)
dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode='min', patience=5)

# Single handle over both optimizers.
optimizer = MultipleOptimizer(encoder_optimizer, decoder_optimizer)

In [None]:
# Train the cosine attention model, keeping the checkpoint with the lowest
# validation loss and recording per-epoch metrics.
pad_idx = TRG.vocab.stoi['<pad>']
# Target padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'cosine_model.pt')
cosine_training_metrics = {}

best_valid_loss = float('inf')

# exist_ok makes a separate isdir() check unnecessary.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # ReduceLROnPlateau only lowers a learning rate when it is fed the
    # monitored metric; without these calls the schedulers never act.
    enc_scheduler.step(valid_loss)
    dec_scheduler.step(valid_loss)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    # Perplexity is exp(loss); compute each once for both the log and the record.
    train_ppl = math.exp(train_loss)
    valid_ppl = math.exp(valid_loss)

    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {valid_ppl:7.3f} |')
    cosine_training_metrics[epoch] = {'train_loss':train_loss, 'train_ppl':train_ppl, 'val_loss':valid_loss, 'val_ppl':valid_ppl}

In [None]:
# Draw the loss chart first, then the perplexity chart, for this run.
for chart in ('loss', 'ppl'):
    plot_metrics(cosine_training_metrics, "Cosine Product Attention", chart_filter=chart)

In [None]:
# Restore the best checkpoint before scoring the held-out test set.
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
model.load_state_dict(
    torch.load(os.path.join(SAVE_DIR, 'cosine_model.pt'), map_location=device)
)

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
import json

# Persist the per-epoch metrics of this run as JSON inside SAVE_DIR.
with open(os.path.join(SAVE_DIR, 'cosine_model.json'), 'w') as fp:
    fp.write(json.dumps(cosine_training_metrics))

In [None]:
# Show the attention of the current model on a fixed set of probe sentences.
for sentence in (
    'you should try to see it .',
    'i m shorter than you .',
    'it was difficult for me .',
    'i don t follow .',
    'the region is relatively rich in mineral resources .',
    'i took many photos and some of them were printed in black and white .',
):
    evaluateAndShowAttention(model, sentence)

### 2.2.6 Bilinear Attention

In [None]:
# Encoder topology; the same bidirectional flag is passed to the decoder.
enc_bidirectional = True
enc_num_layers = 4

# Both attention inputs are sized ENC_HID_DIM * 2 for the bidirectional
# encoder. For a uni-directional encoder the sizes would be
# (ENC_HID_DIM, DEC_HID_DIM) instead.
attention_dim = ENC_HID_DIM * 2
attn = BilinearAttention(attention_dim, attention_dim)

enc = Encoder(
    INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT,
    num_layers=enc_num_layers,
    bidirectional=enc_bidirectional,
)
dec = Decoder(
    OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn,
    bidirectional_input=enc_bidirectional,
)

model = Seq2Seq(enc, dec, device)
model = model.to(device)

# The decoder learns with a rate decoder_learning_ratio times the encoder's.
learning_rate = 0.01
decoder_learning_ratio = 5.0

# One SGD optimizer and one plateau scheduler per sub-network.
encoder_optimizer = optim.SGD(enc.parameters(), lr=learning_rate, momentum=0.9)
decoder_optimizer = optim.SGD(
    dec.parameters(),
    lr=learning_rate * decoder_learning_ratio,
    momentum=0.9,
)
enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', patience=5)
dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode='min', patience=5)

# Single handle over both optimizers.
optimizer = MultipleOptimizer(encoder_optimizer, decoder_optimizer)

In [None]:
# Train the bilinear attention model, keeping the checkpoint with the lowest
# validation loss and recording per-epoch metrics.
pad_idx = TRG.vocab.stoi['<pad>']
# Target padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'bilinear_model_bidirectional.pt')
bilinear_training_metrics = {}

best_valid_loss = float('inf')

# exist_ok makes a separate isdir() check unnecessary.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # ReduceLROnPlateau only lowers a learning rate when it is fed the
    # monitored metric; without these calls the schedulers never act.
    enc_scheduler.step(valid_loss)
    dec_scheduler.step(valid_loss)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    # Perplexity is exp(loss); compute each once for both the log and the record.
    train_ppl = math.exp(train_loss)
    valid_ppl = math.exp(valid_loss)

    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {valid_ppl:7.3f} |')
    bilinear_training_metrics[epoch] = {'train_loss':train_loss, 'train_ppl':train_ppl, 'val_loss':valid_loss, 'val_ppl':valid_ppl}

In [None]:
# Draw the loss chart first, then the perplexity chart, for this run.
for chart in ('loss', 'ppl'):
    plot_metrics(bilinear_training_metrics, "Bilinear Product Attention", chart_filter=chart)

In [None]:
import json

# Persist the per-epoch metrics of this run as JSON inside SAVE_DIR.
with open(os.path.join(SAVE_DIR, 'bilinear_model.json'), 'w') as fp:
    fp.write(json.dumps(bilinear_training_metrics))

In [None]:
# Restore the best checkpoint before scoring the held-out test set.
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
model.load_state_dict(
    torch.load(
        os.path.join(SAVE_DIR, 'bilinear_model_bidirectional.pt'),
        map_location=device,
    )
)

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
# Show the attention of the current model on a fixed set of probe sentences.
for sentence in (
    'you should try to see it .',
    'i m shorter than you .',
    'it was difficult for me .',
    'i don t follow .',
    'the region is relatively rich in mineral resources .',
    'i took many photos and some of them were printed in black and white .',
):
    evaluateAndShowAttention(model, sentence)

### 2.2.7 Linear Attention

In [None]:
# Encoder topology; the same bidirectional flag is passed to the decoder.
enc_bidirectional = True
enc_num_layers = 4

# Both attention inputs are sized ENC_HID_DIM * 2 for the bidirectional
# encoder. For a uni-directional encoder the sizes would be
# (ENC_HID_DIM, DEC_HID_DIM) instead.
attention_dim = ENC_HID_DIM * 2
attn = LinearAttention(attention_dim, attention_dim)

enc = Encoder(
    INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT,
    num_layers=enc_num_layers,
    bidirectional=enc_bidirectional,
)
dec = Decoder(
    OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn,
    bidirectional_input=enc_bidirectional,
)

model = Seq2Seq(enc, dec, device)
model = model.to(device)

# The decoder learns with a rate decoder_learning_ratio times the encoder's.
learning_rate = 0.01
decoder_learning_ratio = 5.0

# One SGD optimizer and one plateau scheduler per sub-network.
encoder_optimizer = optim.SGD(enc.parameters(), lr=learning_rate, momentum=0.9)
decoder_optimizer = optim.SGD(
    dec.parameters(),
    lr=learning_rate * decoder_learning_ratio,
    momentum=0.9,
)
enc_scheduler = optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode='min', patience=5)
dec_scheduler = optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode='min', patience=5)

# Single handle over both optimizers.
optimizer = MultipleOptimizer(encoder_optimizer, decoder_optimizer)

In [None]:
# Train the linear attention model, keeping the checkpoint with the lowest
# validation loss and recording per-epoch metrics.
pad_idx = TRG.vocab.stoi['<pad>']
# Target padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'linear_model.pt')
linear_training_metrics = {}

best_valid_loss = float('inf')

# exist_ok makes a separate isdir() check unnecessary.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    # ReduceLROnPlateau only lowers a learning rate when it is fed the
    # monitored metric; without these calls the schedulers never act.
    enc_scheduler.step(valid_loss)
    dec_scheduler.step(valid_loss)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    # Perplexity is exp(loss); compute each once for both the log and the record.
    train_ppl = math.exp(train_loss)
    valid_ppl = math.exp(valid_loss)

    print(f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f} | Val. Loss: {valid_loss:.3f} | Val. PPL: {valid_ppl:7.3f} |')
    linear_training_metrics[epoch] = {'train_loss':train_loss, 'train_ppl':train_ppl, 'val_loss':valid_loss, 'val_ppl':valid_ppl}

In [None]:
# Draw the loss chart first, then the perplexity chart, for this run.
for chart in ('loss', 'ppl'):
    plot_metrics(linear_training_metrics, "Linear Product Attention", chart_filter=chart)

In [None]:
# Restore the best checkpoint before scoring the held-out test set.
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
model.load_state_dict(
    torch.load(os.path.join(SAVE_DIR, 'linear_model.pt'), map_location=device)
)

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

In [None]:
import json

# Persist the per-epoch metrics of this run as JSON inside SAVE_DIR.
with open(os.path.join(SAVE_DIR, 'linear_model.json'), 'w') as fp:
    fp.write(json.dumps(linear_training_metrics))

In [None]:
# Show the attention of the current model on a fixed set of probe sentences.
for sentence in (
    'you should try to see it .',
    'i m shorter than you .',
    'it was difficult for me .',
    'i don t follow .',
    'the region is relatively rich in mineral resources .',
    'i took many photos and some of them were printed in black and white .',
):
    evaluateAndShowAttention(model, sentence)

# 3. Comparison of Methods

In [None]:
def mergedicts(dict1, dict2):
    """Lazily yield the ``(key, value)`` pairs of a recursive merge.

    Keys present in both inputs are merged recursively when both values are
    dicts; otherwise the value from ``dict2`` wins. Keys present in only one
    input are passed through unchanged.

    Pairs are yielded in a deterministic order: every key of ``dict1`` in its
    insertion order, followed by the keys that only ``dict2`` has. (Iterating
    a set of the keys would make the order depend on string hashing.)

    Args:
        dict1: base mapping.
        dict2: mapping whose values take precedence on conflicts.

    Yields:
        ``(key, merged_value)`` tuples; wrap the call in ``dict(...)`` to get
        the merged mapping.
    """
    for key, left in dict1.items():
        if key not in dict2:
            yield (key, left)
            continue

        right = dict2[key]
        if isinstance(left, dict) and isinstance(right, dict):
            yield (key, dict(mergedicts(left, right)))
        else:
            # At least one side is not a dict, so merging cannot go deeper:
            # the value from the second dict overrides the first.
            yield (key, right)

    for key, right in dict2.items():
        if key not in dict1:
            yield (key, right)

In [None]:
import json

# Combine the per-epoch metrics of every attention variant into one mapping:
# epoch -> {"<variant>_<metric>": value, ...}.
all_results = {}
for variant in ('bahdanau', 'dot', 'cosine', 'bilinear', 'linear'):
    # Read from SAVE_DIR rather than a hard-coded copy of its value, so the
    # comparison keeps working when SAVE_DIR changes.
    with open(os.path.join(SAVE_DIR, f'{variant}_model.json')) as f:
        data = json.load(f)

    # Prefix each metric name with its variant so the merged rows don't collide.
    data = {
        e: {f'{variant}_{k}': v for k, v in l.items()}
        for e, l in data.items()
    }
    all_results = dict(mergedicts(all_results, data))

# JSON object keys are strings: convert them back to ints and order the
# mapping by epoch number (a plain int() cast alone would not sort it).
all_results = {
    int(k): v
    for k, v in sorted(all_results.items(), key=lambda item: int(item[0]))
}

In [None]:
# One comparison chart per metric: the two perplexities, then the two losses.
for title, metric in (
    ("Attention Comparison: Training PPL", 'train_ppl'),
    ("Attention Comparison: Validation PPL", 'val_ppl'),
    ("Attention Comparison: Training Loss", 'train_loss'),
    ("Attention Comparison: Validation Loss", 'val_loss'),
):
    plot_metrics(all_results, title, metric)

In [None]:
# Smoke test of the BLEU helper on a single hand-written sentence pair.
from torchnlp.metrics import get_moses_multi_bleu

# One hypothesis and its reference; both deliberately include a non-ASCII
# character.
hypotheses = ["The brown fox jumps over the dog 笑"]
references = ["The quick brown fox jumps over the lazy dog 笑"]

# Compute BLEU score with the official BLEU perl script
# (left as the last expression so the notebook displays the score).
get_moses_multi_bleu(hypotheses, references, lowercase=True)
