# Neural Machine Translation
In this project we are going to train sequence-to-sequence models for Spanish-to-English translation.

# Step 1: Download & Prepare the Data

In [None]:
import pandas as pd
import unicodedata
import re
from torch.utils.data import Dataset
import torch
import random
import os
import numpy as np
import random
from torch.utils.data import DataLoader
# Placeholders for the trained models. They are bound to None at import time;
# the RNN pair is assigned for real later under the `__main__` guard.
rnn_encoder, rnn_decoder, transformer_encoder, transformer_decoder = None, None, None, None

## Helper Functions
This cell contains helper functions for the dataloader.

In [None]:
# Strips accent marks from a unicode string
def unicode_to_ascii(s):
    """Return `s` with combining accent marks removed.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' (nonspacing mark) is dropped. Characters without
    a decomposition (e.g. '¿') are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def preprocess_sentence(w):
    '''
    Normalize one sentence and wrap it in '<start>' / '<end>' markers.

    The sentence is lower-cased, stripped of accents, its punctuation
    (? . ! , ¿) is separated from neighbouring words by spaces, and every
    other run of non-letter characters is replaced by a single space.
    '''
    text = unicode_to_ascii(w.lower().strip())
    # Put a space on both sides of each punctuation mark we keep.
    text = re.sub(r'([?.!,¿])', r' \1 ', text)
    # Collapse runs of spaces (and double quotes) into one space.
    text = re.sub(r'[" "]+', ' ', text)
    # Anything that is not a letter or kept punctuation becomes a space.
    text = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', text)

    return '<start> ' + text.strip() + ' <end>'


def pad_sequences(x, max_len):
    """Return `x` as an int64 array of length `max_len`.

    Sequences shorter than `max_len` are right-padded with zeros; longer
    ones are truncated to their first `max_len` items.
    """
    keep = min(len(x), max_len)
    padded = np.zeros((max_len), dtype=np.int64)
    padded[:keep] = x[:keep]
    return padded


def _sentences_to_ids(sentences, vocab):
    """Map each space-separated sentence to a list of vocabulary indices."""
    # Build the membership set once: `vocab.vocab` is a list, so testing
    # `token in vocab.vocab` for every token would be a linear scan each time.
    known = set(vocab.vocab)
    word2idx = vocab.word2idx
    return [
        [word2idx[token if token in known else '<unk>'] for token in sentence.split(' ')]
        for sentence in sentences
    ]


def preprocess_data_to_tensor(dataframe, src_vocab, trg_vocab):
    """Vectorize and pad the source ('es') and target ('eng') sentences.

    Args:
        dataframe: table with 'es' and 'eng' columns of space-separated tokens
        src_vocab: vocabulary object exposing `vocab` and `word2idx` for 'es'
        trg_vocab: vocabulary object exposing `vocab` and `word2idx` for 'eng'

    Returns:
        (src_tensor, trg_tensor, max_length_src, max_length_trg), where the
        tensors are lists of equal-length int64 arrays padded with zeros.
        An empty dataframe yields ([], [], 0, 0).
    """
    # Vectorize the input and target languages; unknown tokens map to '<unk>'
    src_tensor = _sentences_to_ids(dataframe['es'].values.tolist(), src_vocab)
    trg_tensor = _sentences_to_ids(dataframe['eng'].values.tolist(), trg_vocab)

    # Calculate the max_length of input and output tensor for padding
    # (default=0 keeps an empty dataframe from raising ValueError)
    max_length_src = max((len(t) for t in src_tensor), default=0)
    max_length_trg = max((len(t) for t in trg_tensor), default=0)
    print('max_length_src: {}, max_length_trg: {}'.format(max_length_src, max_length_trg))

    # Pad all the sentences in the dataset with the max_length
    src_tensor = [pad_sequences(x, max_length_src) for x in src_tensor]
    trg_tensor = [pad_sequences(x, max_length_trg) for x in trg_tensor]

    return src_tensor, trg_tensor, max_length_src, max_length_trg


def train_test_split(src_tensor, trg_tensor):
    '''
    Split both sequences into aligned training and test portions.

    Only the first 80% of the examples are used (the trailing 20% ends up in
    neither split); of those, the first 75% form the training set and the
    rest the test set. The split point is derived from len(src_tensor).
    '''
    n_total = len(src_tensor)
    usable = n_total - int(0.2*n_total)
    cut = int(0.75*usable)

    return (
        src_tensor[:cut],
        src_tensor[cut:usable],
        trg_tensor[:cut],
        trg_tensor[cut:usable],
    )

## Download and Visualize the Data

Here we will download the translation data. We will learn a model to translate Spanish to English.

In [None]:
if __name__ == '__main__':
    # Fetch the sentence-pair archive, then unpack it in the working
    # directory (-o overwrites files left by an earlier run).
    for command in (
        "wget http://www.manythings.org/anki/spa-eng.zip",
        "unzip -o spa-eng.zip",
    ):
        os.system(command)

Now we visualize the data.

In [None]:
if __name__ == '__main__':
    # Read the tab-separated sentence pairs; the context manager makes sure
    # the file handle is closed once the text is in memory.
    with open('spa.txt', encoding='UTF-8') as spa_file:
        lines = spa_file.read().strip().split('\n')
    total_num_examples = 50000
    # Keep only the first two tab-separated fields of each of the first
    # `total_num_examples` lines; any further fields are ignored.
    original_word_pairs = [l.split('\t')[:2] for l in lines[:total_num_examples]]
    random.shuffle(original_word_pairs)

    dat = pd.DataFrame(original_word_pairs, columns=['eng', 'es'])
    print(dat) # Visualize the data

                            eng                                  es
0         I led the discussion.                Moderé la discusión.
1        You know I have to go.          Sabes que me tengo que ir.
2          Give that cat to us.                     Danos ese gato.
3       She had white shoes on.  Ella llevaba unos zapatos blancos.
4         I should've told you.          Debería habértelo contado.
...                         ...                                 ...
49995  I hate Mary's boyfriend.              Odio al novio de Mary.
49996           Who wrote this?                ¿Quién escribió eso?
49997   You are not our friend.              No eres nuestra amiga.
49998               Try it now.                       Prueba ahora.
49999    Tom looks really hurt.         Tom se ve realmente herido.

[50000 rows x 2 columns]


Next we preprocess the data.

In [None]:
if __name__ == '__main__':
    # Work on a copy so the raw sentence pairs in `dat` stay untouched.
    data = dat.copy()
    for column in ('eng', 'es'):
        data[column] = dat[column].apply(preprocess_sentence)
    print(data) # Visualizing the data

                                           eng  \
0         <start> i led the discussion . <end>   
1        <start> you know i have to go . <end>   
2          <start> give that cat to us . <end>   
3       <start> she had white shoes on . <end>   
4         <start> i should ve told you . <end>   
...                                        ...   
49995  <start> i hate mary s boyfriend . <end>   
49996           <start> who wrote this ? <end>   
49997   <start> you are not our friend . <end>   
49998               <start> try it now . <end>   
49999    <start> tom looks really hurt . <end>   

                                                      es  
0                    <start> modere la discusion . <end>  
1              <start> sabes que me tengo que ir . <end>  
2                         <start> danos ese gato . <end>  
3      <start> ella llevaba unos zapatos blancos . <end>  
4              <start> deberia habertelo contado . <end>  
...                                          

## Vocabulary & Dataloader Classes

In [None]:
class Vocab_Lang():
    """Two-way mapping between tokens and integer indices.

    Index 0 is reserved for '<pad>' and index 1 for '<unk>'; the tokens of
    `vocab` receive indices 2, 3, ... in iteration order.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        self.word2idx = {}
        self.idx2word = {}

        # Reserved tokens first, so they always occupy indices 0 and 1.
        for idx, token in enumerate(('<pad>', '<unk>')):
            self.word2idx[token] = idx
            self.idx2word[idx] = token

        for idx, token in enumerate(vocab, start=2):
            self.word2idx[token] = idx
            self.idx2word[idx] = token

    def __len__(self):
        # Number of distinct tokens, reserved ones included.
        return len(self.word2idx)

class MyData(Dataset):
    """Dataset of (source, target) index sequences held as LongTensors."""

    def __init__(self, X, y):
        # Per-example count of non-zero entries in X (zero is the pad index).
        non_zero_counts = [(1 - np.equal(row, 0)).sum() for row in X]
        self.length = torch.LongTensor(non_zero_counts)
        self.data = torch.LongTensor(X)
        self.target = torch.LongTensor(y)

    def __getitem__(self, index):
        return self.data[index], self.target[index]

    def __len__(self):
        return len(self.data)

In [None]:
if __name__ == '__main__':
    # Mini-batch size for the data loaders and embedding width for the models.
    BATCH_SIZE, EMBEDDING_DIM = 64, 256

## Build Vocabulary

In [None]:
def build_vocabulary(pd_dataframe):
    """Return the distinct whitespace-separated tokens of all sentences.

    Tokens are listed in order of first appearance; `pd_dataframe` may be
    any iterable of sentence strings.
    """
    tokens = (word for sentence in pd_dataframe for word in sentence.split())
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(tokens))

if __name__ == '__main__':
    # Source language is Spanish ('es'); target language is English ('eng').
    src_vocab_list, trg_vocab_list = (
        build_vocabulary(data[column]) for column in ('es', 'eng')
    )

## Instantiate Datasets

We instantiate our training and validation datasets.

In [None]:
if __name__ == '__main__':
    src_vocab = Vocab_Lang(src_vocab_list)
    trg_vocab = Vocab_Lang(trg_vocab_list)

    # Index, pad and split the sentence pairs.
    src_tensor, trg_tensor, max_length_src, max_length_trg = preprocess_data_to_tensor(
        data, src_vocab, trg_vocab
    )
    src_tensor_train, src_tensor_val, trg_tensor_train, trg_tensor_val = train_test_split(
        src_tensor, trg_tensor
    )

    # Create train and val datasets
    # Note: after these statements both names refer to DataLoader objects;
    # the underlying MyData instance is reachable through `.dataset`.
    train_dataset = MyData(src_tensor_train, trg_tensor_train)
    train_dataset = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True,
    )

    test_dataset = MyData(src_tensor_val, trg_tensor_val)
    test_dataset = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=True,
    )

max_length_src: 16, max_length_trg: 12




In [None]:
if __name__ == '__main__':
    # Show five training examples drawn at random (with replacement).
    n_examples = len(train_dataset.dataset)
    idxes = random.choices(range(n_examples), k=5)
    src, trg =  train_dataset.dataset[idxes]
    for label, tensor in (('Source', src), ('Target', trg)):
        print(label + ':', tensor)
        print(label + ' Dimensions: ', tensor.size())

Source: tensor([[   2,   40,  178,  747,    6,    7,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2, 4864,   28,    7,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,   37,  907,   53,  271,  723,   41,    7,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,   44,  229, 3958,    6,    7,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0],
        [   2,   73, 3742,  291,    6,    7,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
Source Dimensions:  torch.Size([5, 16])
Target: tensor([[   2,   42,  162, 1680,    7,    8,    0,    0,    0,    0,    0,    0],
        [   2,  401,   31,    8,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,   41,   15,  204, 1842,   45,    8,    0,    0,    0,    0,    0],
        [   2,    3,   10,  208,    9, 1017,    7,    8,    0,    0,    0,    0],
        [   2,   72,   73,  188,  260,  

# Step 2: Training a Recurrent Neural Network (RNN)

In this module we write a recurrent model for machine translation, then train it and evaluate its results.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm.notebook import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu

## Encoder Model

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RnnEncoder(nn.Module):
    """Encoder: an embedding layer followed by a single-layer, unidirectional GRU."""

    def __init__(self, src_vocab, embedding_dim, hidden_units):
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        super().__init__()
        self.src_vocab = src_vocab
        vocab_size = len(src_vocab)

        self.hidden_size = hidden_units
        self.num_layers = 1

        # Initialize embedding layer
        # NOTE: the attribute name (spelled with three d's) is kept on purpose
        # so the parameter names exposed by state_dict() do not change.
        self.embeddding = nn.Embedding(vocab_size, embedding_dim)

        # Initialize a single directional GRU with 1 layer and batch_first=False
        self.gru = nn.GRU(embedding_dim, hidden_units, self.num_layers, batch_first=False)

    def forward(self, x):
        """
        Args:
            x: source texts, [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, hidden_units]
            hidden_state: [1, batch_size, hidden_units]
        """
        embedded = self.embeddding(x)
        # No initial hidden state is passed, so the GRU starts from zeros.
        output, hidden_state = self.gru(embedded)

        return output, hidden_state

## Sanity Check: RNN Encoder Model

The code below runs a sanity check for your `RnnEncoder` class.

In [None]:
def count_parameters(model):
    """Return the total number of elements in `model`'s trainable parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def sanityCheckModel(all_test_params, NN, expected_outputs, init_or_forward, data_loader):
    """Print PASSED/FAILED lines for a model class over several configurations.

    Args:
        all_test_params: list of dicts with the constructor keyword arguments
            of `NN`; a 'batch_size' key, if present, is not passed to the
            constructor and is only used to slice the test batch.
        NN: the model class under test.
        expected_outputs: one entry per configuration. For the forward test
            this is the expected output shape; otherwise the expected number
            of trainable parameters.
        init_or_forward: "forward" runs the output-shape test; any other
            value runs the parameter-count test.
        data_loader: iterable of (texts, labels) batches; only read when
            init_or_forward == "forward".
    """
    print('--- TEST: ' + ('Number of Model Parameters (tests __init__(...))' if init_or_forward=='init' else 'Output shape of forward(...)') + ' ---')
    if init_or_forward == "forward":
        # Reading the first batch of data for testing
        # NOTE(review): if the loader yields nothing, texts_batch stays unbound
        # and the loop below raises NameError.
        for texts_, labels_ in data_loader:
            texts_batch, labels_batch = texts_, labels_
            break

    for tp_idx, (test_params, expected_output) in enumerate(zip(all_test_params, expected_outputs)):       
        if init_or_forward == "forward":
            batch_size = test_params['batch_size']
            texts = texts_batch[:batch_size]
            # RnnEncoder is fed with dims 0 and 1 of the batch swapped.
            if NN.__name__ == "RnnEncoder":
                texts = texts.transpose(0,1)

        # Construct the student model
        tps = {k:v for k, v in test_params.items() if k != 'batch_size'}
        stu_nn = NN(**tps)

        input_rep = str({k:v for k,v in tps.items()})

        if init_or_forward == "forward":
            with torch.no_grad():
                if NN.__name__ == "TransformerEncoder":
                    stu_out = stu_nn(texts)
                else:
                    # Every other model is expected to return a pair; only the
                    # first element is checked.
                    stu_out, _ = stu_nn(texts)
                    # Swap dims 0 and 1 of the expected shape. A throwaway
                    # random tensor is built just to obtain that size.
                    expected_output = torch.rand(expected_output).transpose(0, 1).size()
            ref_out_shape = expected_output

            has_passed = torch.is_tensor(stu_out)
            if not has_passed: msg = 'Output must be a torch.Tensor; received ' + str(type(stu_out))
            else: 
                has_passed = stu_out.shape == ref_out_shape
                msg = 'Your Output Shape: ' + str(stu_out.shape)
            

            status = 'PASSED' if has_passed else 'FAILED'
            message = '\t' + status + "\t Init Input: " + input_rep + '\tForward Input Shape: ' + str(texts.shape) + '\tExpected Output Shape: ' + str(ref_out_shape) + '\t' + msg
            print(message)
        else:
            # Parameter-count test: compare trainable parameter totals.
            stu_num_params = count_parameters(stu_nn)
            ref_num_params = expected_output
            comparison_result = (stu_num_params == ref_num_params)

            status = 'PASSED' if comparison_result else 'FAILED'
            message = '\t' + status + "\tInput: " + input_rep + ('\tExpected Num. Params: ' + str(ref_num_params) + '\tYour Num. Params: '+ str(stu_num_params))
            print(message)

        # Drop the model before building the next configuration.
        del stu_nn

In [None]:
if __name__ == '__main__':
    # Set random seed
    torch.manual_seed(42)

    # ---- Test init: one configuration per (embedding_dim, hidden_units) pair
    embedding_dim = [2, 5, 8]
    hidden_units = [50, 100, 200]
    params = []
    inputs = [
        {'src_vocab': src_vocab, 'embedding_dim': ed, 'hidden_units': hu}
        for ed in embedding_dim
        for hu in hidden_units
    ]
    expected_outputs = [33770, 56870, 148070, 72725, 96275, 188375, 111680, 135680, 228680]

    sanityCheckModel(inputs, RnnEncoder, expected_outputs, "init", None)
    print()

    # ---- Test forward: one configuration per (hidden_units, batch_size) pair
    batch_sizes = [1, 2]
    inputs = [
        {'embedding_dim': EMBEDDING_DIM, 'src_vocab': src_vocab, 'batch_size': b, 'hidden_units': hu}
        for hu in hidden_units
        for b in batch_sizes
    ]
    # create sanity datasets
    sanity_dataset = MyData(src_tensor_train, trg_tensor_train)
    sanity_loader = torch.utils.data.DataLoader(
        sanity_dataset,
        batch_size=50,
        num_workers=2,
        drop_last=True,
        shuffle=True,
    )
    # Expected shapes are listed as [batch_size, 16, hidden_units].
    expected_outputs = [torch.Size([b, 16, hu]) for hu in hidden_units for b in batch_sizes]

    sanityCheckModel(inputs, RnnEncoder, expected_outputs, "forward", sanity_loader)

--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 2, 'hidden_units': 50}	Expected Num. Params: 33770	Your Num. Params: 33770
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 2, 'hidden_units': 100}	Expected Num. Params: 56870	Your Num. Params: 56870
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 2, 'hidden_units': 200}	Expected Num. Params: 148070	Your Num. Params: 148070
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 5, 'hidden_units': 50}	Expected Num. Params: 72725	Your Num. Params: 72725
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 5, 'hidden_units': 100}	Expected Num. Params: 96275	Your Num. Params: 96275
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding

## Decoder Model
We will implement a Decoder model that uses an attention mechanism, as provided in https://arxiv.org/pdf/1409.0473.pdf. We have broken this up into three functions that you need to implement: `__init__(self, ...)`, `compute_attention(self, dec_hs, enc_output)`, and `forward(self, x, dec_hs, enc_output)`:

* <b>`__init__(self, ...)`:</b> Instantiate the parameters of your model, and store them in `self` variables.

* <b>`compute_attention(self, dec_hs, enc_output)`</b>: Compute the <b>context vector</b>, which is a weighted sum of the encoder output states. Suppose the decoder hidden state at time $t$ is $\mathbf{h}_t$, and the encoder hidden state at time $s$ is $\mathbf{\bar h}_s$. The pseudocode is as follows:

  1. <b>Attention scores:</b> Compute real-valued scores for the decoder hidden state $\mathbf{h}_t$ and each encoder hidden state $\mathbf{\bar h}_s$: $$\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s)=
      \mathbf{v}_a^T \tanh(\mathbf{W}_1 \mathbf{h}_t +\mathbf{W}_2 \mathbf{\bar h}_s)
$$
   Here you should implement the scoring function. A higher score indicates a stronger "affinity" between the decoder state and a specific encoder state. 

   Note that in theory, $\mathbf{v}_a$ could have a different dimension than $\mathbf{h}_t$ and $\mathbf{\bar h}_s$, but you should use the same hidden size for this vector.

 2. <b>Attention weights:</b> Normalize the attention scores to obtain a valid probability distribution: $$\alpha_{ts} = \frac{\exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_s) \big)}{\sum_{s'=1}^S \exp \big (\mathrm{score}(\mathbf{h}_t, \mathbf{\bar h}_{s'}) \big)}$$ Notice that this is just the softmax function, and can be implemented with `F.softmax(...)` in Pytorch.

 3. <b>Context vector:</b> Compute a context vector $\mathbf{c}_t$ that is a weighted average of the encoder hidden states, where the weights are given by the attention weights you just computed: $$\mathbf{c}_t=\sum_{s=1}^S \alpha_{ts} \mathbf{\bar h}_s$$

 You should return this context vector, along with the attention weights.



* <b>`forward(self, x, dec_hs, enc_output)`:</b> Run a <b>single</b> decoding step, resulting in a distribution over the vocabulary for the next token in the sequence. Pseudocode can be found in the docstrings below.



In [None]:
class RnnDecoder(nn.Module):
    """Single-step GRU decoder with additive attention over the encoder outputs."""

    def __init__(self, trg_vocab, embedding_dim, hidden_units):
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: The dimension of the embedding
            hidden_units: The number of features in the GRU hidden state
        """
        super().__init__()
        self.trg_vocab = trg_vocab
        self.vocab_size = len(self.trg_vocab)
        self.hidden_unit = hidden_units

        # Initialize embedding layer
        self.embedding = nn.Embedding(self.vocab_size, embedding_dim)

        # Initialize layers to compute attention score:
        # score = va^T tanh(w1 h_t + w2 h_s)
        self.w1 = nn.Linear(self.hidden_unit, self.hidden_unit, bias=True)
        self.w2 = nn.Linear(self.hidden_unit, self.hidden_unit, bias=True)
        self.va = nn.Linear(self.hidden_unit, 1)

        # Initialize a single directional GRU with 1 layer and batch_first=True.
        # Its input is the context vector concatenated with the token embedding.
        self.gru = nn.GRU(self.hidden_unit + embedding_dim, self.hidden_unit, num_layers=1, batch_first=True)

        # Initialize fully connected layer
        self.fc = nn.Linear(self.hidden_unit, self.vocab_size)

    def compute_attention(self, dec_hs, enc_output):
        '''
        This function computes the context vector and attention weights.

        Args:
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            context_vector: Context vector, according to formula; [batch_size, hidden_units]
            attention_weights: The attention weights; [batch_size, max_len_src, 1]
        '''
        # Move the batch dimension first so the linear layers act on the
        # feature dimension and the two terms broadcast over max_len_src.
        enc_output = enc_output.permute(1, 0, 2)   # [batch_size, max_len_src, hidden_units]
        dec_hs = dec_hs.permute(1, 0, 2)           # [batch_size, 1, hidden_units]

        # score(h_t, h_s) = va^T tanh(W1 h_t + W2 h_s)
        attention_scores = self.va(torch.tanh(self.w1(dec_hs) + self.w2(enc_output)))
        # Normalize over the source positions.
        attention_weights = F.softmax(attention_scores, dim=1)
        # Weighted sum of the encoder states.
        context_vector = torch.sum(attention_weights * enc_output, dim=1)

        return context_vector, attention_weights

    def forward(self, x, dec_hs, enc_output):
        '''
        This function runs the decoder for a **single** time step.

        Args:
            x: Input token; [batch_size, 1]
            dec_hs: Decoder hidden state; [1, batch_size, hidden_units]
            enc_output: Encoder outputs; [max_len_src, batch_size, hidden_units]

        Returns:
            fc_out: (Unnormalized) output distribution [batch_size, vocab_size]
            dec_hs: New decoder hidden state; [1, batch_size, hidden_units]
            attention_weights: The attention weights; [batch_size, max_len_src, 1]
        '''
        context_vector, attention_weights = self.compute_attention(dec_hs, enc_output)

        embedded = self.embedding(x)   # [batch_size, 1, embedding_dim]

        rnn_input = torch.cat((context_vector.unsqueeze(1), embedded), dim=2)
        # Feed the current hidden state into the GRU and keep the updated one;
        # without this the decoder would restart from zeros at every step and
        # hand the stale state back to the caller.
        rnn_output, dec_hs = self.gru(rnn_input, dec_hs.contiguous())

        fc_out = self.fc(rnn_output).squeeze(1)

        return fc_out, dec_hs, attention_weights

## Sanity Check: RNN Decoder Model

The code below runs a sanity check for your `RnnDecoder` class. 

In [None]:
def sanityCheckDecoderModelForward(inputs, NN, expected_outputs):
    """Print PASSED/FAILED lines for the forward pass of a decoder class.

    Args:
        inputs: list of dicts with keys 'trg_vocab', 'embedding_dim',
            'hidden_units', 'batch_size' and 'encoder_outputs'.
        NN: the decoder class under test; it is constructed with the keyword
            arguments trg_vocab, embedding_dim and hidden_units.
        expected_outputs: three parallel sequences holding, per input, the
            expected shapes of the output distribution, the decoder hidden
            state and the attention weights.
    """
    print('--- TEST: Output shape of forward(...) ---\n')
    expected_fc_outs = expected_outputs[0]
    expected_dec_hs = expected_outputs[1]
    expected_attention_weights = expected_outputs[2]

    for i, inp in enumerate(inputs):
        # Human-readable summary of the configuration; tensors are shown by shape.
        input_rep = '{'
        for k, v in inp.items():
            shown = 'Tensor with shape ' + str(v.size()) if torch.is_tensor(v) else str(v)
            input_rep += str(k) + ': ' + shown + ', '
        input_rep += '}'
        # Part of every report line that sits between the status and the expectation.
        context = "\t Init Input: " + input_rep + '\tForward Input Shape: ' + str(inp['encoder_outputs'].shape)

        # Build the class that was passed in rather than a hard-coded one.
        dec = NN(trg_vocab=inp['trg_vocab'], embedding_dim=inp['embedding_dim'], hidden_units=inp['hidden_units'])
        dec_hs = torch.rand(1, inp["batch_size"], inp['hidden_units'])
        x = torch.randint(low=0, high=len(inp["trg_vocab"]), size=(inp["batch_size"], 1))
        with torch.no_grad():
            dec_out = dec(x=x, dec_hs=dec_hs, enc_output=inp['encoder_outputs'])
        del dec

        if not isinstance(dec_out, tuple):
            print('\tFAILED\tYour RnnDecoder.forward() output must be a tuple; received ' + str(type(dec_out)))
            continue
        if len(dec_out) != 3:
            print('\tFAILED\tYour RnnDecoder.forward() output must be a tuple of size 3; received tuple of size ' + str(len(dec_out)))
            continue
        stu_fc_out, stu_dec_hs, stu_attention_weights = dec_out

        # All three results must be tensors before their shapes can be compared.
        msg = ""
        for label, value in (
            ('Output', stu_fc_out),
            ('Decoder Hidden State', stu_dec_hs),
            ('Attention Weights', stu_attention_weights),
        ):
            if not torch.is_tensor(value):
                msg += '\tFAILED\t' + label + ' must be a torch.Tensor; received ' + str(type(value)) + " "
        if msg:
            print('\tFAILED' + context + '\tExpected Output Shape: ' + str(expected_fc_outs[i]) + '\t' + msg)
            continue

        shape_checks = (
            ('Output Shape', stu_fc_out, expected_fc_outs[i]),
            ('Hidden State Shape', stu_dec_hs, expected_dec_hs[i]),
            ('Attention Weights Shape', stu_attention_weights, expected_attention_weights[i]),
        )
        for label, tensor, expected in shape_checks:
            status = 'PASSED' if tensor.size() == expected else 'FAILED'
            print('\t' + status + context + '\tExpected ' + label + ': ' + str(expected) + '\tYour ' + label + ': ' + str(tensor.size()))

        # Attention weights must sum to 1 over dim 1.
        stu_sum = stu_attention_weights.sum(dim=1).squeeze()
        if torch.allclose(stu_sum, torch.ones_like(stu_sum), atol=1e-5):
            print('\tPASSED\t The sum of your attention_weights along dim 1 is 1.')
        else:
            print('\tFAILED\t The sum of your attention_weights along dim 1 is not 1.')
        print()

In [None]:
if __name__ == '__main__':
    # Set random seed
    torch.manual_seed(42)

    # ---- Test init: one configuration per (embedding_dim, hidden_units) pair
    embedding_dim = [2, 5, 8]
    hidden_units = [50, 100, 200]
    params = []
    inputs = [
        {'trg_vocab': trg_vocab, 'embedding_dim': ed, 'hidden_units': hu}
        for ed in embedding_dim
        for hu in hidden_units
    ]
    expected_outputs = [371028, 762228, 1664628, 391305, 782955, 1686255, 411582, 803682, 1707882]
    sanityCheckModel(inputs, RnnDecoder, expected_outputs, "init", None)
    print()

    # ---- Test forward: one configuration per (hidden_units, batch_size) pair
    batch_sizes = [1, 2, 4]
    grid = [(hu, b) for hu in hidden_units for b in batch_sizes]
    embedding_dims = iter([50,80,100,120,150,200,300,400,500])
    # The random encoder outputs are drawn here, in grid order, so the
    # sequence of random numbers consumed after the seed stays the same.
    encoder_outputs = iter([torch.rand([b, 16, hu]) for hu, b in grid])
    expected_fc_outs = [torch.Size([b, 6609]) for _, b in grid]
    expected_dec_hs = [torch.Size([1, b, hu]) for hu, b in grid]
    expected_attention_weights = [torch.Size([b, 16, 1]) for _, b in grid]
    expected_outputs = (expected_fc_outs, expected_dec_hs, expected_attention_weights)

    inputs = [
        {
            'embedding_dim': next(embedding_dims),
            'trg_vocab': trg_vocab,
            'batch_size': b,
            'hidden_units': hu,
            # Swap dims 0 and 1 of the [batch, 16, hidden] tensor before use.
            'encoder_outputs': next(encoder_outputs).transpose(0,1),
        }
        for hu, b in grid
    ]

    sanityCheckDecoderModelForward(inputs, RnnDecoder, expected_outputs)


--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 2, 'hidden_units': 50}	Expected Num. Params: 371028	Your Num. Params: 371028
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 2, 'hidden_units': 100}	Expected Num. Params: 762228	Your Num. Params: 762228
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 2, 'hidden_units': 200}	Expected Num. Params: 1664628	Your Num. Params: 1664628
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 5, 'hidden_units': 50}	Expected Num. Params: 391305	Your Num. Params: 391305
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 5, 'hidden_units': 100}	Expected Num. Params: 782955	Your Num. Params: 782955
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 



	PASSED	 Init Input: {embedding_dim: 500, trg_vocab: <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, batch_size: 4, hidden_units: 200, encoder_outputs: Tensor with shape torch.Size([16, 4, 200]), }	Forward Input Shape: torch.Size([16, 4, 200])	Expected Output Shape: torch.Size([4, 6609])	Your Output Shape: torch.Size([4, 6609])
	PASSED	 Init Input: {embedding_dim: 500, trg_vocab: <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, batch_size: 4, hidden_units: 200, encoder_outputs: Tensor with shape torch.Size([16, 4, 200]), }	Forward Input Shape: torch.Size([16, 4, 200])	Expected Hidden State Shape: torch.Size([1, 4, 200])	Your Hidden State Shape: torch.Size([1, 4, 200])
	PASSED	 Init Input: {embedding_dim: 500, trg_vocab: <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, batch_size: 4, hidden_units: 200, encoder_outputs: Tensor with shape torch.Size([16, 4, 200]), }	Forward Input Shape: torch.Size([16, 4, 200])	Expected Attention Weights Shape: torch.Size([4, 16, 1])	Your Attention Weights 

## Train RNN Model

We will train the encoder and decoder using cross-entropy loss.

In [None]:
def loss_function(real, pred):
    """Cross-entropy loss for one decoding step that ignores padded targets.

    Args:
        real: target token indices; [batch_size]. Index 0 is padding.
        pred: unnormalized scores; [batch_size, vocab_size]

    Returns:
        Scalar tensor: the per-example losses with padded positions zeroed,
        averaged over the whole batch.
    """
    mask = real.ge(1).float()

    # reduction='none' keeps one loss per example so the mask can zero out
    # padded positions; the default 'mean' would average padding into a
    # scalar before the mask is applied.
    per_example = F.cross_entropy(pred, real, reduction='none')
    return torch.mean(per_example * mask)

def train_rnn_model(encoder, decoder, dataset, optimizer, trg_vocab, device, n_epochs):
    """Train the RNN encoder and decoder with teacher forcing.

    Args:
        encoder: module called as encoder(src) with src of shape
            [max_src_length, batch_size]; returns (enc_output, enc_hidden).
        decoder: module called as decoder(dec_input, dec_hidden, enc_output);
            its first output must have shape [batch_size, vocab_size].
        dataset: iterable yielding (src, trg) batches, both batch-first.
        optimizer: optimizer over the encoder and decoder parameters.
        trg_vocab: target vocabulary exposing a word2idx mapping.
        device: device on which the forward pass and loss are computed.
        n_epochs: number of passes over dataset.

    Prints the mean batch loss and elapsed time after every epoch.
    """
    start_idx = trg_vocab.word2idx['<start>']
    for epoch in range(n_epochs):
        start = time.time()
        n_batch = 0
        total_loss = 0.0

        encoder.train()
        decoder.train()

        for src, trg in tqdm(dataset):
            n_batch += 1
            loss = 0

            enc_output, enc_hidden = encoder(src.transpose(0,1).to(device))
            dec_hidden = enc_hidden

            # Size the <start> column from the batch actually received, so a
            # smaller final batch cannot desynchronise decoder input and target.
            dec_input = torch.tensor([[start_idx]] * trg.size(0))

            # Run the decoder one target timestep at a time.
            for t in range(1, trg.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
                assert len(predictions.shape) == 2 and predictions.shape[0] == dec_input.shape[0] and predictions.shape[1] == len(trg_vocab.word2idx), "First output of decoder must have shape [batch_size, vocab_size], you returned shape " + str(predictions.shape)
                loss += loss_function(trg[:, t].to(device), predictions.to(device))
                # Teacher forcing: the ground-truth token is the next input.
                dec_input = trg[:, t].unsqueeze(1)

            batch_loss = (loss / int(trg.size(1)))
            # Accumulate a plain float; adding the tensor itself would keep
            # every batch's autograd history alive for the whole epoch.
            total_loss += batch_loss.item()

            optimizer.zero_grad()

            batch_loss.backward()

            ### update model parameters
            optimizer.step()

        # max(..., 1) avoids a division by zero when dataset yields no batches.
        print('Epoch:{:2d}/{}\t Loss: {:.4f} \t({:.2f}s)'.format(epoch + 1, n_epochs, total_loss / max(n_batch, 1), time.time() - start))

    print('Model trained!')

In [None]:
if __name__ == '__main__':
    # Hyperparameters for the RNN encoder/decoder pair
    LEARNING_RATE, HIDDEN_UNITS, N_EPOCHS = 0.001, 256, 10

    # Prefer the GPU whenever one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build both networks and place them on the working device
    rnn_encoder = RnnEncoder(src_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(device)
    rnn_decoder = RnnDecoder(trg_vocab, EMBEDDING_DIM, HIDDEN_UNITS).to(device)

    # A single optimizer updates the parameters of both networks
    rnn_model_params = [*rnn_encoder.parameters(), *rnn_decoder.parameters()]
    optimizer = torch.optim.Adam(rnn_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [None]:
if __name__ == '__main__':
    # Train the RNN encoder/decoder on the training batches
    train_rnn_model(
        encoder=rnn_encoder,
        decoder=rnn_decoder,
        dataset=train_dataset,
        optimizer=optimizer,
        trg_vocab=trg_vocab,
        device=device,
        n_epochs=N_EPOCHS,
    )

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/10	 Loss: 1.8021 	(14.39s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/10	 Loss: 1.1908 	(13.24s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/10	 Loss: 0.9115 	(13.18s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/10	 Loss: 0.7000 	(12.99s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/10	 Loss: 0.5290 	(13.71s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/10	 Loss: 0.3932 	(13.07s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/10	 Loss: 0.2886 	(13.32s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/10	 Loss: 0.2107 	(13.16s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/10	 Loss: 0.1550 	(13.40s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/10	 Loss: 0.1155 	(13.27s)
Model trained!


## Inference (Decoding) Function

Now that we have trained the model, we can use it on test data.

In [None]:
def decode_rnn_model(encoder, decoder, src, max_decode_len, device):
    """
    Greedily translate a batch of source sentences with the RNN model.

    Args:
        encoder: Your RnnEncoder object
        decoder: Your RnnDecoder object (its trg_vocab attribute is read here)
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        device: the device the hidden state, decoder input and encoder output are moved to

    Returns:
        curr_output: [batch_size, max_decode_len] containing your predicted translated sentences;
            column 0 always holds the <start> token
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized)
            scores of each token in your vocabulary at each time step; time step 0 stays all zeros

    Procedure:
    - Encode src once to obtain the encoder output and the initial hidden state
    - For every time step t from 1 to max_decode_len - 1:
        - Feed the previous best tokens, the hidden state and the encoder output to the decoder
        - Store the returned scores in curr_predictions at index t
        - Pick the highest-scoring token per sentence; store it in curr_output at index t
          and use it as the next decoder input
    """
    vocab = decoder.trg_vocab
    n_sentences = src.size(1)
    curr_output = torch.zeros((n_sentences, max_decode_len))
    curr_predictions = torch.zeros((n_sentences, max_decode_len, len(vocab.idx2word)))

    # Every sentence begins with the <start> token
    start_id = vocab.word2idx['<start>']
    dec_input = torch.tensor([[start_id]] * n_sentences)
    curr_output[:, 0] = dec_input.squeeze(1)

    # Encode once; the encoder's final hidden state seeds the decoder
    enc_output, hidden = encoder(src)
    hidden = hidden.to(device)
    dec_input = dec_input.to(device)
    enc_output = enc_output.to(device)

    for t in range(1, max_decode_len):
        scores, hidden, _ = decoder(dec_input, hidden, enc_output)
        curr_predictions[:, t, :] = scores

        # Greedy choice: the arg-max token becomes the next decoder input
        best_tokens = scores.argmax(dim=1)
        dec_input = best_tokens.unsqueeze(1)
        curr_output[:, t] = best_tokens
    return curr_output, curr_predictions

You can run the cell below to qualitatively compare some of the sentences your model generates with some of the correct translations.

In [None]:
if __name__ == '__main__':
    rnn_encoder.eval()
    rnn_decoder.eval()
    # Draw the random indices and the sentences from the SAME dataset (the
    # test split), so held-out sentences are shown and every index is valid.
    eval_data = test_dataset.dataset
    idxes = random.choices(range(len(eval_data)), k=5)
    src, trg = eval_data[idxes]
    # Inference only: no autograd graph is needed for decoding.
    with torch.no_grad():
        curr_output, _ = decode_rnn_model(rnn_encoder, rnn_decoder, src.transpose(0,1).to(device), trg.size(1), device)
    # Print each source / reference / prediction triple without padding tokens.
    for i in range(len(src)):
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")

Source sentence: <start> debes ser precavido . <end>
Target sentence: <start> you must be cautious . <end>
Predicted sentence: <start> you must be cautious . <end>
----------------
Source sentence: <start> de eso nada ! <end>
Target sentence: <start> no way ! <end>
Predicted sentence: <start> no way ! <end>
----------------
Source sentence: <start> hice una fortuna . <end>
Target sentence: <start> i made a fortune . <end>
Predicted sentence: <start> i made a fortune . <end>
----------------
Source sentence: <start> no bebo cerveza . <end>
Target sentence: <start> i don t drink beer . <end>
Predicted sentence: <start> i don t drink beer . <end>
----------------
Source sentence: <start> tu estas bajo juramento . <end>
Target sentence: <start> you are under oath . <end>
Predicted sentence: <start> you are under oath . <end>
----------------


## Evaluate RNN Model

We provide you with a function to run the test set through the model and calculate BLEU scores. We expect your BLEU scores to satisfy the following conditions:  

*   BLEU-1 > 0.290
*   BLEU-2 > 0.082
*   BLEU-3 > 0.060
*   BLEU-4 > 0.056

Read more about Bleu Score at :

1.   https://en.wikipedia.org/wiki/BLEU
2.   https://www.aclweb.org/anthology/P02-1040.pdf

In [None]:
def get_reference_candidate(target, pred, trg_vocab):
    """Convert a target and a predicted index sequence into token lists.

    Args:
        target: 1-D tensor of reference token indices.
        pred: 1-D tensor of predicted token indices.
        trg_vocab: target vocabulary exposing an idx2word mapping.

    Returns:
        (reference, candidate): two lists of token strings. In both, the
        first position is skipped and everything from the first "<end>"
        token onwards is dropped.
    """
    def _words_before_end(indices):
        words = []
        # Position 0 is skipped; conversion stops at the first "<end>".
        for idx in indices[1:]:
            word = trg_vocab.idx2word[idx]
            if word == "<end>":
                return words
            words.append(word)
        return words

    reference = _words_before_end(list(target.numpy()))
    candidate = _words_before_end(list(pred.numpy()))
    return reference, candidate

def compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab):
    """Compute and print average sentence-level BLEU-1 to BLEU-4 scores.

    Args:
        target_tensor_val: sized collection; only its length is used, as the
            number of sentences to score.
        target_output: indexable of reference index tensors, one per sentence.
        final_output: indexable of predicted index tensors, one per sentence.
        trg_vocab: target vocabulary passed on to get_reference_candidate.

    Returns:
        (save_candidate, scores): the list of candidate token lists and a dict
        with keys "bleu_1" .. "bleu_4" holding the averaged scores (0.0 each
        when there are no sentences).
    """
    # Uniform n-gram weights for BLEU-1 .. BLEU-4.
    weight_sets = {
        "bleu_1": (1,),
        "bleu_2": (1/2, 1/2),
        "bleu_3": (1/3, 1/3, 1/3),
        "bleu_4": (1/4, 1/4, 1/4, 1/4),
    }
    totals = dict.fromkeys(weight_sets, 0.0)

    smoother = SmoothingFunction()
    save_candidate = []
    n_sentences = len(target_tensor_val)
    for i in range(n_sentences):
        reference, candidate = get_reference_candidate(target_output[i], final_output[i], trg_vocab)

        for name, weights in weight_sets.items():
            # sentence_bleu expects a LIST of references, each a token list.
            # Passing the bare token list would make every token string count
            # as its own reference and be compared character by character.
            totals[name] += sentence_bleu([reference], candidate, weights=weights, smoothing_function=smoother.method1)

        save_candidate.append(candidate)

    # Average over all sentences; avoid dividing by zero on empty input.
    scores = {name: (total / n_sentences if n_sentences else 0.0) for name, total in totals.items()}
    print('BLEU 1-gram: %f' % (scores["bleu_1"]))
    print('BLEU 2-gram: %f' % (scores["bleu_2"]))
    print('BLEU 3-gram: %f' % (scores["bleu_3"]))
    print('BLEU 4-gram: %f' % (scores["bleu_4"]))

    return save_candidate, scores

def evaluate_rnn_model(encoder, decoder, test_dataset, target_tensor_val, device):
    """Decode the whole test set, print its mean loss and compute BLEU scores.

    Args:
        encoder: the RNN encoder.
        decoder: the RNN decoder (its trg_vocab attribute is read here).
        test_dataset: iterable of batch-first (src, trg) batches exposing a
            batch_size attribute.
        target_tensor_val: sized collection whose length sets the number of
            rows in the collected outputs.
        device: device used for decoding and for the loss computation.

    Returns:
        Whatever compute_bleu_scores returns for the collected outputs.
    """
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size
    n_batch, total_loss = 0, 0
    final_output = target_output = None

    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            n_batch += 1
            n_steps = trg.size(1)
            curr_output, curr_predictions = decode_rnn_model(encoder, decoder, src.transpose(0,1).to(device), n_steps, device)

            # Sum the per-timestep losses; step 0 (the start token) is skipped
            loss = 0
            for t in range(1, n_steps):
                loss += loss_function(trg[:, t].to(device), curr_predictions[:,t,:].to(device))

            # Allocate the result buffers lazily, once the target length is known
            if final_output is None:
                n_rows = len(target_tensor_val)
                final_output = torch.zeros((n_rows, n_steps))
                target_output = torch.zeros((n_rows, n_steps))

            # Rows belonging to this batch
            rows = slice(batch * batch_size, (batch + 1) * batch_size)
            final_output[rows] = curr_output
            target_output[rows] = trg

            total_loss += loss / int(n_steps)

        print('Loss {:.4f}'.format(total_loss / n_batch))

    # Compute BLEU scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [None]:
if __name__ == '__main__':
    # Evaluate the trained RNN model on the test batches
    rnn_save_candidate, rnn_scores = evaluate_rnn_model(
        encoder=rnn_encoder,
        decoder=rnn_decoder,
        test_dataset=test_dataset,
        target_tensor_val=trg_tensor_val,
        device=device,
    )

Loss 2.6865
BLEU 1-gram: 0.289119
BLEU 2-gram: 0.084423
BLEU 3-gram: 0.063367
BLEU 4-gram: 0.060701


# Step 3: Train a Transformer

Here you will write a transformer model for machine translation, and then train and evaluate its results. 

In [None]:
import math

## Positional Embeddings

Similar to the RNN, we start with the Encoder model. A key component of the encoder is the Positional Embedding. As we know, word embeddings encode words in such a way that words with similar meaning have similar vectors. Because there are no recurrences in a Transformer, we need a way to tell the transformer the relative position of words in a sentence, so we will add a positional embedding to the word embeddings. Now, two words with a similar embedding will both be close in meaning and occur near each other in the sentence.

You will create a positional embedding matrix of size $(max\_len, embed\_dim)$ using the following formulae:
<br>
$\begin{align*} pe[pos,2i] &= \sin \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big )\\pe[pos,2i+1] &= \cos \Big (\frac{pos}{10000^{2i/embed\_dim}}\Big ) \end{align*}$





In [None]:
def create_positional_embedding(max_len, embed_dim):
    '''
    Build the fixed sinusoidal positional embedding table.

    Args:
        max_len: The maximum length supported for positional embeddings
        embed_dim: The size of your embeddings (even or odd)
    Returns:
        pe: [max_len, 1, embed_dim] where even columns hold
            sin(pos / 10000^(2i/embed_dim)) and odd columns hold
            cos(pos / 10000^(2i/embed_dim))
    '''
    pe = torch.zeros(max_len, embed_dim)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    # exp(-2i * ln(10000) / embed_dim) == 1 / 10000^(2i/embed_dim)
    div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
    angles = position * div_term  # one column per even index 0, 2, 4, ...
    pe[:, 0::2] = torch.sin(angles)
    # With an odd embed_dim there is one fewer odd column than even columns,
    # so only the first embed_dim // 2 angle columns are used for the cosines.
    pe[:, 1::2] = torch.cos(angles[:, :embed_dim // 2])

    # Insert the singleton batch axis: [max_len, embed_dim] -> [max_len, 1, embed_dim]
    return pe.unsqueeze(1)

## Encoder Model 

Now you will create the Encoder model for the transformer.

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional embedding + a stack of Transformer encoder layers."""

    def __init__(self, src_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_src, device):
        """
        Args:
            src_vocab: Vocab_Lang, the source vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Encoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_src: maximum length of the source sentences
            device: the working device (the positional embedding and all sub-modules are moved to it)
        """
        super().__init__()
        self.device = device
        self.src_vocab = src_vocab

        # Positional table; registering it as a buffer keeps it out of the learnable parameters
        self.position_embedding = create_positional_embedding(max_len_src, embedding_dim).to(self.device)
        self.register_buffer('positional_embedding', self.position_embedding)

        # Word embedding over the source vocabulary
        self.embedding = nn.Embedding(len(src_vocab), embedding_dim).to(self.device)

        # Dropout applied to the summed word + positional embeddings
        self.dropout1 = nn.Dropout().to(self.device)

        # Stack of num_layers identical encoder layers
        layer = torch.nn.TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward).to(self.device)
        self.transformer_encoder = torch.nn.TransformerEncoder(layer, num_layers).to(self.device)

    def make_src_mask(self, src):
        """
        Args:
            src: [max_len, batch_size]
        Returns:
            Boolean matrix of size [batch_size, max_len] that is True wherever src holds the padding index 0
        """
        assert len(src.shape) == 2, 'src must have exactly 2 dimensions'
        is_padding = src.transpose(0, 1) == 0
        return is_padding.to(self.device)

    def forward(self, x):
        """
        Args:
            x: [max_len, batch_size]
        Returns:
            output: [max_len, batch_size, embed_dim]
        Steps:
        - Embed the tokens and add the positional embedding for the first x.size(0) positions
        - Apply dropout to the sum
        - Build the padding mask from x so self-attention ignores padded positions
        - Run the encoder stack with src_key_padding_mask set to that mask
        """
        tokens = x.to(self.device)
        positions = self.position_embedding[:tokens.size(0)].to(self.device)
        encoder_input = self.dropout1(self.embedding(tokens) + positions)
        padding_mask = self.make_src_mask(tokens)
        return self.transformer_encoder(encoder_input, src_key_padding_mask=padding_mask)

## Sanity Check: Transformer Encoder

The code below runs a sanity check for your `TransformerEncoder` class. 

In [None]:
if __name__=="__main__":
    # Working device and a fixed seed
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(42)

    # Per-case settings: the i-th (dim_feedforward, embedding_dim) pair uses
    # the i-th entry of each of the three lists below
    embedding_dim = [4, 8, 12]
    max_len = [10,20,30,40,50,60,70,80,90]
    num_layers = [1,1,1,2,2,2,3,3,3]
    nheads = [1, 1, 1, 1, 2, 2, 2, 4, 4]
    dimf = [50, 100, 150]
    params = []

    grid = [(df, ed) for df in dimf for ed in embedding_dim]
    inputs = [
        {
            'src_vocab': src_vocab,
            'embedding_dim': ed,
            'num_heads': heads,
            'dim_feedforward': df,
            'num_layers': layers,
            'max_len_src': length,
            'device': device,
        }
        for (df, ed), heads, layers, length in zip(grid, nheads, num_layers, max_len)
    ]

    # Expected parameter count for each configuration
    expected_outputs = [51890, 103858, 155954, 53340, 106736, 160388, 55690, 111314, 167322]

    sanityCheckModel(inputs, TransformerEncoder, expected_outputs, "init", None)

--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 4, 'num_heads': 1, 'dim_feedforward': 50, 'num_layers': 1, 'max_len_src': 10, 'device': device(type='cuda')}	Expected Num. Params: 51890	Your Num. Params: 51890
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 8, 'num_heads': 1, 'dim_feedforward': 50, 'num_layers': 1, 'max_len_src': 20, 'device': device(type='cuda')}	Expected Num. Params: 103858	Your Num. Params: 103858
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 12, 'num_heads': 1, 'dim_feedforward': 50, 'num_layers': 1, 'max_len_src': 30, 'device': device(type='cuda')}	Expected Num. Params: 155954	Your Num. Params: 155954
	PASSED	Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 4, 'num_heads': 1, 'dim_feedforward': 100, 'num_layers': 2, 'max_len_src'

In [None]:
if __name__=="__main__":
    # Working device and a fixed seed
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(42)

    # Forward test: one case per (embedding_dim, batch_size) pair; the i-th
    # pair uses the i-th entry of nheads, num_layers and max_len
    batch_sizes = [1, 2]
    dimf = 100
    embedding_dims = [32,64,128]
    nheads = [1, 1, 2, 2, 4, 4]
    num_layers = [1,1,2,2,3,3]
    max_len = [10,20,30,40,50,60]

    pairs = [(ed, b) for ed in embedding_dims for b in batch_sizes]
    inputs = [
        {
            'src_vocab': src_vocab,
            'embedding_dim': ed,
            'num_heads': heads,
            'dim_feedforward': dimf,
            'num_layers': layers,
            'max_len_src': length,
            'device': device,
            "batch_size": b,
        }
        for (ed, b), heads, layers, length in zip(pairs, nheads, num_layers, max_len)
    ]

    # Data loader feeding the forward pass
    sanity_dataset = MyData(src_tensor_train, trg_tensor_train)
    sanity_loader = torch.utils.data.DataLoader(sanity_dataset, batch_size=50, num_workers=2, drop_last=True, shuffle=True)
    expected_outputs = [torch.Size([1, 16, 32]), torch.Size([2, 16, 32]), torch.Size([1, 16, 64]), torch.Size([2, 16, 64]), torch.Size([1, 16, 128]), torch.Size([2, 16, 128])]

    sanityCheckModel(inputs, TransformerEncoder, expected_outputs, "forward", sanity_loader)

--- TEST: Output shape of forward(...) ---
	PASSED	 Init Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 32, 'num_heads': 1, 'dim_feedforward': 100, 'num_layers': 1, 'max_len_src': 10, 'device': device(type='cuda')}	Forward Input Shape: torch.Size([1, 16])	Expected Output Shape: torch.Size([1, 16, 32])	Your Output Shape: torch.Size([1, 16, 32])
	PASSED	 Init Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 32, 'num_heads': 1, 'dim_feedforward': 100, 'num_layers': 1, 'max_len_src': 20, 'device': device(type='cuda')}	Forward Input Shape: torch.Size([2, 16])	Expected Output Shape: torch.Size([2, 16, 32])	Your Output Shape: torch.Size([2, 16, 32])
	PASSED	 Init Input: {'src_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbbe50>, 'embedding_dim': 64, 'num_heads': 2, 'dim_feedforward': 100, 'num_layers': 2, 'max_len_src': 30, 'device': device(type='cuda')}	Forward Input Shape: torch.Size([1, 16])	Expected Output Shape

## <font color='red'>TODO:</font> Decoder Model [10 points]
Now we implement a Decoder model. Unlike the RNN, you do not need to explicitly compute inter-attention with the encoder; you will use the nn.TransformerDecoder model, which takes care of this for you.

In this cell, you should implement the `__init__(...)` and `forward(...)` functions, each of which is worth <b>5 points</b>.

In [None]:
class TransformerDecoder(nn.Module):
    """Token embedding + positional embedding + Transformer decoder stack + vocabulary projection."""

    def __init__(self, trg_vocab, embedding_dim, num_heads,
        num_layers, dim_feedforward, max_len_trg, device):
        """
        Args:
            trg_vocab: Vocab_Lang, the target vocabulary
            embedding_dim: the dimension of the embedding (also the number of expected features for the input of the Transformer)
            num_heads: The number of attention heads
            num_layers: the number of Transformer Decoder layers
            dim_feedforward: the dimension of the feedforward network models in the Transformer
            max_len_trg: maximum length of the target sentences
            device: the working device (the positional embedding and all sub-modules are moved to it)
        """
        super().__init__()
        self.device = device
        self.trg_vocab = trg_vocab # Do not change
        vocab_size = len(trg_vocab)

        # Positional table; registering it as a buffer keeps it out of the learnable parameters
        self.position_embedding = create_positional_embedding(max_len_trg, embedding_dim).to(device)
        self.register_buffer('positional_embedding', self.position_embedding)

        # Word embedding over the target vocabulary
        self.embedding = nn.Embedding(vocab_size, embedding_dim).to(self.device)

        # Dropout applied to the summed word + positional embeddings
        self.dropout1 = nn.Dropout().to(self.device)

        # Stack of num_layers identical decoder layers
        layer = nn.TransformerDecoderLayer(embedding_dim, num_heads, dim_feedforward).to(self.device)
        self.transformer_decoder = nn.TransformerDecoder(layer, num_layers).to(self.device)

        # Projection from the model dimension to one score per target word
        self.decoder = nn.Linear(embedding_dim, vocab_size).to(self.device)

    def generate_square_subsequent_mask(self, sz):
        """Generate a square [sz, sz] mask for the sequence. Entries above the diagonal
            (the masked positions) are float('-inf'); all other entries are float(0.0).
        """
        future = torch.triu(torch.ones(sz, sz), diagonal=1) == 1
        mask = torch.zeros(sz, sz).masked_fill(future, float('-inf'))
        return mask.to(self.device)

    def forward(self, dec_in, enc_out):
        """
        Args:
            dec_in: [sequence length, batch_size]
            enc_out: [max_len, batch_size, embed_dim]
        Returns:
            output: [sequence length, batch_size, trg_vocab_size]
        Steps:
        - Embed dec_in and add the positional embedding for the first dec_in.size(0) positions
        - Apply dropout to the sum
        - Build the square subsequent mask so that no position can attend to
          later positions in the target sequence
        - Run the decoder stack over the embedded input and enc_out with tgt_mask set to that mask
        - Project the result through the final fully-connected layer
        """
        tokens = dec_in.to(self.device)
        memory = enc_out.to(self.device)
        seq_len = tokens.size(0)

        positions = self.position_embedding[:seq_len].to(self.device)
        decoder_input = self.dropout1(self.embedding(tokens) + positions)

        causal_mask = self.generate_square_subsequent_mask(seq_len)
        hidden = self.transformer_decoder(decoder_input, memory, tgt_mask=causal_mask)
        return self.decoder(hidden)

## Sanity Check: Transformer Decoder

The code below runs a sanity check for your `TransformerDecoder` class.

In [None]:
def sanityCheckTransformerDecoderModelForward(inputs, NN, expected_outputs):
    """Check the output shape of a decoder's forward pass for several configurations.

    Args:
        inputs: list of dicts holding the constructor arguments plus
            'batch_size'; each dict gains an 'encoder_outputs' entry holding
            the random encoder output used for that case.
        NN: the decoder class to instantiate.
        expected_outputs: expected output size for each entry of inputs.

    Prints one PASSED/FAILED line per configuration.
    """
    print('--- TEST: Output shape of forward(...) ---\n')
    for i, inp in enumerate(inputs):
        # Describe the configuration; tensors are shown by shape only
        pieces = []
        for k, v in inp.items():
            shown = 'Tensor with shape ' + str(v.size()) if torch.is_tensor(v) else str(v)
            pieces.append(str(k) + ': ' + shown + ', ')
        input_rep = '{' + ''.join(pieces) + '}'

        # Build the model, then draw random decoder tokens and encoder output
        dec = NN(trg_vocab=inp['trg_vocab'],embedding_dim=inp['embedding_dim'],num_heads=inp['num_heads'],num_layers=inp['num_layers'],dim_feedforward=inp['dim_feedforward'],max_len_trg=inp['max_len_trg'],device=inp['device'])
        dec_in = torch.randint(low=0,high=20,size=(inp['max_len_trg'], inp['batch_size']))
        enc_out = torch.rand(inp['max_len_trg'], inp['batch_size'], inp['embedding_dim'])
        inp['encoder_outputs'] = enc_out
        with torch.no_grad():
            stu_out = dec(enc_out=enc_out, dec_in=dec_in)
        del dec

        # Shared part of both report lines
        details = "\t Init Input: " + input_rep + '\tForward Input Shape: ' + str(inp['encoder_outputs'].shape) + '\tExpected Output Shape: ' + str(expected_outputs[i]) + '\t'

        # A non-tensor result fails immediately
        if not torch.is_tensor(stu_out):
            print('\t' + 'FAILED' + details + 'Output must be a torch.Tensor; received ' + str(type(stu_out)))
            continue

        status = 'PASSED' if stu_out.size() == expected_outputs[i] else 'FAILED'
        print('\t' + status + details + 'Your Output Shape: ' + str(stu_out.size()))
        


In [None]:
if __name__ == '__main__':
    # Working device and a fixed seed
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(42)

    # ---- Init test: every combination of the four hyperparameter lists ----
    hidden_units = [50, 100, 200]
    embedding_dim = [8, 16]
    num_heads = [1, 2]
    dim_feedforward = [50, 100]
    num_layers = [1, 2]
    max_lens = 64
    params = []
    inputs = [
        {
            'trg_vocab': trg_vocab,
            'embedding_dim': ed,
            'num_heads': nh,
            'num_layers': nl,
            'dim_feedforward': df,
            'max_len_trg': max_lens,
            'device': device,
        }
        for ed in embedding_dim
        for df in dim_feedforward
        for nh in num_heads
        for nl in num_layers
    ]
    # Expected parameter count for each configuration
    expected_outputs = [113835, 115317, 113835, 115317, 114685, 117017, 114685, 117017]
    sanityCheckModel(inputs, TransformerDecoder, expected_outputs, "init", None)
    print()

    # ---- Forward test: one case per (dim_feedforward, batch_size) pair; the
    # i-th pair uses the i-th embedding dimension and target length ----
    batch_sizes = [1, 2, 4]
    num_heads = 2
    num_layers = 1
    embedding_dims = [100, 100, 200, 200, 200, 400, 400, 800, 800]
    expected_outputs = [torch.Size([16, 1, 6609]),torch.Size([16, 2, 6609]),torch.Size([16, 4, 6609]),torch.Size([32, 1, 6609]),torch.Size([32, 2, 6609]),torch.Size([32, 4, 6609]),torch.Size([64, 1, 6609]),torch.Size([64, 2, 6609]),torch.Size([128, 4, 6609])]
    max_lens = [16, 16, 16, 32, 32, 32, 64, 64, 128]

    cases = [(hu, b) for hu in hidden_units for b in batch_sizes]
    inputs = [
        {
            'embedding_dim': edim,
            'trg_vocab': trg_vocab,
            'num_heads': num_heads,
            'num_layers': num_layers,
            "batch_size": b,
            'dim_feedforward': hu,
            'max_len_trg': length,
            'device': device,
        }
        for (hu, b), edim, length in zip(cases, embedding_dims, max_lens)
    ]

    sanityCheckTransformerDecoderModelForward(inputs, TransformerDecoder, expected_outputs)


--- TEST: Number of Model Parameters (tests __init__(...)) ---
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 8, 'num_heads': 1, 'num_layers': 1, 'dim_feedforward': 50, 'max_len_trg': 64, 'device': device(type='cuda')}	Expected Num. Params: 113835	Your Num. Params: 113835
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 8, 'num_heads': 1, 'num_layers': 2, 'dim_feedforward': 50, 'max_len_trg': 64, 'device': device(type='cuda')}	Expected Num. Params: 115317	Your Num. Params: 115317
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 8, 'num_heads': 2, 'num_layers': 1, 'dim_feedforward': 50, 'max_len_trg': 64, 'device': device(type='cuda')}	Expected Num. Params: 113835	Your Num. Params: 113835
	PASSED	Input: {'trg_vocab': <__main__.Vocab_Lang object at 0x7fa5f9cbd0d0>, 'embedding_dim': 8, 'num_heads': 2, 'num_layers': 2, 'dim_feedforward': 50, 'max_len_trg'

## Train Transformer Model

Like the RNN, we train the encoder and decoder using cross-entropy loss.

In [None]:
def train_transformer_model(encoder, decoder, dataset, optimizer, device, n_epochs):
    """Train an encoder/decoder pair with teacher forcing and cross-entropy loss.

    Args:
        encoder: module called as ``encoder(src)``; put into train mode here.
        decoder: module called as ``decoder(trg_prefix, enc_out)``; its output
            is treated as a 3-D tensor whose last axis is the vocabulary.
        dataset: iterable yielding ``(src, trg)`` batch tensors. Each tensor is
            moved to ``device`` and has its first two axes swapped before use.
        optimizer: optimizer that updates the encoder and decoder parameters.
        device: device the batches are moved to.
        n_epochs: number of passes over ``dataset``.

    Returns:
        None. The mean loss and wall-clock time are printed once per epoch.
    """
    encoder.train()
    decoder.train()
    # Targets equal to 0 do not contribute to the loss; 0 is the value that
    # pad_sequences fills unused positions with.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    for epoch in range(n_epochs):
        start = time.time()
        losses = []

        # Iterate over the loader handed in by the caller.
        for src, trg in tqdm(dataset):

            src = src.to(device).transpose(0, 1)  # [max_src_length, batch_size]
            trg = trg.to(device).transpose(0, 1)  # [max_trg_length, batch_size]

            enc_out = encoder(src)
            # Teacher forcing: feed every target token except the last one...
            output = decoder(trg[:-1, :], enc_out)

            # ...and score each position against the token that follows it.
            output = output.reshape(-1, output.shape[2])
            trg = trg[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, trg)
            losses.append(loss.item())

            loss.backward()

            # Clip to avoid exploding gradient issues
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

            optimizer.step()

        # An empty loader yields no losses; report NaN instead of dividing by zero.
        mean_loss = sum(losses) / len(losses) if losses else float('nan')
        print('Epoch:{:2d}/{}\t Loss:{:.4f} ({:.2f}s)'.format(epoch + 1, n_epochs, mean_loss, time.time() - start))


In [None]:
if __name__ == '__main__':
    # HYPERPARAMETERS - feel free to change
    LEARNING_RATE = 0.001
    DIM_FEEDFORWARD = 512
    N_EPOCHS = 15
    N_HEADS = 2
    N_LAYERS = 2

    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the encoder first and the decoder second so that parameter
    # initialisation consumes the random state in the same order as before.
    transformer_encoder = TransformerEncoder(
        src_vocab, EMBEDDING_DIM, N_HEADS, N_LAYERS, DIM_FEEDFORWARD, max_length_src, device
    ).to(device)
    transformer_decoder = TransformerDecoder(
        trg_vocab, EMBEDDING_DIM, N_HEADS, N_LAYERS, DIM_FEEDFORWARD, max_length_trg, device
    ).to(device)

    # A single Adam optimizer updates both halves of the model.
    transformer_model_params = [
        *transformer_encoder.parameters(),
        *transformer_decoder.parameters(),
    ]
    optimizer = torch.optim.Adam(transformer_model_params, lr=LEARNING_RATE)

    print('Encoder and Decoder models initialized!')

Encoder and Decoder models initialized!


In [None]:
if __name__ == '__main__':
    # Fit the transformer encoder/decoder on the training loader.
    train_transformer_model(
        encoder=transformer_encoder,
        decoder=transformer_decoder,
        dataset=train_dataset,
        optimizer=optimizer,
        device=device,
        n_epochs=N_EPOCHS,
    )

  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 1/15	 Loss:3.4307 (10.33s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 2/15	 Loss:2.5699 (10.17s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 3/15	 Loss:2.2203 (10.17s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 4/15	 Loss:1.9504 (10.28s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 5/15	 Loss:1.7294 (10.26s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 6/15	 Loss:1.5467 (10.37s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 7/15	 Loss:1.3991 (10.28s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 8/15	 Loss:1.2774 (10.22s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch: 9/15	 Loss:1.1772 (10.30s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:10/15	 Loss:1.0974 (10.18s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:11/15	 Loss:1.0325 (10.21s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:12/15	 Loss:0.9718 (10.26s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:13/15	 Loss:0.9201 (10.29s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:14/15	 Loss:0.8780 (10.41s)


  0%|          | 0/468 [00:00<?, ?it/s]

Epoch:15/15	 Loss:0.8371 (10.18s)


## Inference (Decoding) Function

Now that we have trained the model, we can use it on test data.

In [None]:
def decode_transformer_model(encoder, decoder, src, max_decode_len, device):
    """
    Greedily decode translations for a batch of source sentences.

    Args:
        encoder: Your TransformerEncoder object
        decoder: Your TransformerDecoder object (must expose ``trg_vocab``)
        src: [max_src_length, batch_size] the source sentences you wish to translate
        max_decode_len: The maximum desired length (int) of your target translated sentences
        device: the device the model lives on; ``src`` and every decoder input are moved to it

    Returns:
        curr_output: [batch_size, max_decode_len] float tensor holding the predicted token ids;
            column 0 is always the ``<start>`` token
        curr_predictions: [batch_size, max_decode_len, trg_vocab_size] containing the (unnormalized)
            probabilities of each token in your vocabulary at each time step; index 0 along the
            time axis is left as zeros because no prediction is made for ``<start>``
        enc_output: the encoder output for ``src``, returned so callers can reuse it

    Pseudo-code:
    - Obtain encoder output by encoding src sentences
    - For 1 ≤ t < max_decode_len:
        - Obtain dec_input as the best words so far for previous time steps (you can get this from curr_output)
        - Obtain your (unnormalized) prediction probabilities by feeding dec_input and encoder output to decoder
        - Save your (unnormalized) prediction probabilities in curr_predictions at index t
        - Calculate the most likely (highest probability) token and save in curr_output at timestep t
    """
    # Initialize variables (both result buffers are default-dtype tensors on the default device)
    trg_vocab = decoder.trg_vocab
    batch_size = src.size(1)
    curr_output = torch.zeros((batch_size, max_decode_len))
    curr_predictions = torch.zeros((batch_size, max_decode_len, len(trg_vocab.idx2word)))

    enc_output = encoder(src.to(device))

    # We start the decoding with the start token for each example.
    # The guard keeps a zero-length request from indexing an empty column.
    if max_decode_len > 0:
        curr_output[:, 0] = trg_vocab.word2idx['<start>']

    for t in range(max_decode_len - 1):
        # Everything predicted so far, laid out as [t + 1, batch_size].
        dec_input = curr_output[:, :t + 1].transpose(0, 1)
        # Only the last time step of the decoder output is the new prediction.
        output = decoder(dec_input.int().to(device), enc_output)[-1]
        curr_predictions[:, t + 1, :] = output
        curr_output[:, t + 1] = torch.argmax(output, dim=1)
    return curr_output, curr_predictions, enc_output

You can run the cell below to qualitatively compare some of the sentences your model generates with some of the correct translations.

In [None]:
if __name__ == '__main__':
    # Qualitative check: translate a few random sentences and print them
    # next to the reference translations.
    transformer_encoder.eval()
    transformer_decoder.eval()
    # Sample indices and sentences from the same dataset, so the examples
    # really are held-out test sentences.
    idxes = random.choices(range(len(test_dataset.dataset)), k=5)
    src, trg = test_dataset.dataset[idxes]
    # Inference only: no gradients are needed.
    with torch.no_grad():
        curr_output, _, _ = decode_transformer_model(transformer_encoder, transformer_decoder, src.transpose(0,1).to(device), trg.size(1), device)
    for i in range(len(src)):
        # '<pad>' tokens are dropped from every printed sentence.
        print("Source sentence:", ' '.join([x for x in [src_vocab.idx2word[j.item()] for j in src[i]] if x != '<pad>']))
        print("Target sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in trg[i]] if x != '<pad>']))
        print("Predicted sentence:", ' '.join([x for x in [trg_vocab.idx2word[j.item()] for j in curr_output[i]] if x != '<pad>']))
        print("----------------")

Source sentence: <start> le gusta pescar . <end>
Target sentence: <start> he is fond of fishing . <end>
Predicted sentence: <start> he likes fishing . <end> . <end> . <end> . <end>
----------------
Source sentence: <start> la libertad no es gratis . <end>
Target sentence: <start> freedom is not free . <end>
Predicted sentence: <start> freedom isn t free . <end> . <end> . <end> .
----------------
Source sentence: <start> a el le gusta leer libros . <end>
Target sentence: <start> he likes to read books . <end>
Predicted sentence: <start> he likes reading books . <end> . <end> . <end> .
----------------
Source sentence: <start> mi casa es como la suya . <end>
Target sentence: <start> my house is like yours . <end>
Predicted sentence: <start> my house is like yours . <end> . <end> . <end>
----------------
Source sentence: <start> el vive en el lujo . <end>
Target sentence: <start> he lives in luxury . <end>
Predicted sentence: <start> he lives in luxury . <end> . <end> . <end> .
----------

## Evaluate Transformer Model

Now we can run the test set through the transformer model. We expect your BLEU scores to satisfy the following conditions: 

*   BLEU-1 > 0.290
*   BLEU-2 > 0.082
*   BLEU-3 > 0.060
*   BLEU-4 > 0.056


In [None]:
def evaluate_model(encoder, decoder, test_dataset, target_tensor_val, device):
    """Evaluate a trained encoder/decoder pair on a test loader.

    Two things are measured for every batch:
      * a teacher-forced cross-entropy loss (the decoder is fed the reference
        target shifted by one position), and
      * the greedy translations produced by ``decode_transformer_model``,
        which are collected for BLEU scoring.

    Args:
        encoder: trained encoder; put into eval mode here.
        decoder: trained decoder exposing ``trg_vocab``; put into eval mode here.
        test_dataset: loader exposing ``batch_size`` and yielding ``(src, trg)``
            batch tensors.
        target_tensor_val: reference targets; its length sizes the output
            buffers and it is passed through to ``compute_bleu_scores``.
        device: device the batches are moved to.

    Returns:
        Whatever ``compute_bleu_scores(target_tensor_val, target_output,
        final_output, trg_vocab)`` returns. The mean loss is printed.
    """
    trg_vocab = decoder.trg_vocab
    batch_size = test_dataset.batch_size

    encoder.eval()
    decoder.eval()
    # Targets equal to 0 (the fill value used for padding) are ignored.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    losses = []
    final_output, target_output = None, None

    with torch.no_grad():
        for batch, (src, trg) in enumerate(test_dataset):
            src, trg = src.transpose(0,1).to(device), trg.transpose(0,1).to(device)
            curr_output, _, enc_out = decode_transformer_model(encoder, decoder, src, trg.size(0), device)

            # The teacher-forced loss does not depend on the time step, so a
            # single decoder pass gives the per-batch value directly.
            output = decoder(trg[:-1, :], enc_out)
            output = output.reshape(-1, output.shape[2])
            loss = criterion(output, trg[1:].reshape(-1))

            # Allocate the result buffers lazily, once the padded target
            # length is known from the first batch.
            if final_output is None:
                final_output = torch.zeros((len(target_tensor_val), trg.size(0)))
                target_output = torch.zeros((len(target_tensor_val), trg.size(0)))

            rows = slice(batch * batch_size, (batch + 1) * batch_size)
            final_output[rows] = curr_output
            target_output[rows] = trg.transpose(0,1)
            losses.append(loss.item())

    mean_loss = sum(losses) / len(losses)
    print('Loss {:.4f}'.format(mean_loss))

    # Compute Bleu scores
    return compute_bleu_scores(target_tensor_val, target_output, final_output, trg_vocab)

In [None]:
if __name__ == '__main__':
    # Score the trained transformer on the held-out loader.
    transformer_save_candidate, transformer_scores = evaluate_model(
        encoder=transformer_encoder,
        decoder=transformer_decoder,
        test_dataset=test_dataset,
        target_tensor_val=trg_tensor_val,
        device=device,
    )

Loss 1.3600
BLEU 1-gram: 0.292146
BLEU 2-gram: 0.082395
BLEU 3-gram: 0.060457
BLEU 4-gram: 0.057649
