The data set taken is from a previous tutorial on Transation from French to English  which is available at https://download.pytorch.org/tutorial/data.zip
We are trying to use the same data and see how the transformer gives the output compared to the one that was available using the Seq2Seq network


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field,TabularDataset,BucketIterator  
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List
from spacy.lang.fr.examples import sentences 

import spacy
import numpy as np

import random
import math
import time
import pandas as pd

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
### this is the step to convert a single file to the training and testing. 
## then convert it to the form that it is being accepted by the torchtext

In [6]:
df=pd.read_csv('data/eng-fra.txt',sep="\t")
df=df.applymap(lambda x: x.replace('"', ''))
print(df)
df.columns=['TRG','SRC']
df.head()

# # Creating a dataframe with 75%
# # values of original dataframe
train_data = df.sample(frac = 0.75)
 
# Creating dataframe with
# rest of the 25% values
test_data = df.drop(train_data.index)

train_data.to_csv("data/single_train_data.csv")
test_data.to_csv("data/single_test_data.csv")


                                                      Go.  \
0                                                    Run!   
1                                                    Run!   
2                                                    Wow!   
3                                                   Fire!   
4                                                   Help!   
...                                                   ...   
135836  A carbon footprint is the amount of carbon dio...   
135837  Death is something that we're often discourage...   
135838  Since there are usually multiple websites on a...   
135839  If someone who doesn't know your background sa...   
135840  It may be impossible to get a completely error...   

                                                     Va !  
0                                                 Cours !  
1                                                Courez !  
2                                              Ça alors !  
3                          

In [7]:
spacy_fr = spacy.load('fr_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [8]:
def tokenize_fr(text):
    """
    Tokenizes German text from a string into a list of strings
    """
    return [tok.text for tok in spacy_fr.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [9]:
SRC = Field(tokenize = tokenize_fr, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [11]:
fields={'SRC':('SRC',SRC),'TRG':('TRG',TRG)}

train_data, test_data=TabularDataset.splits(
                                    path='data',
                                    train='single_train_data.csv',#this is the traing file
                                    test='single_test_data.csv',##this is the test file
                                    format='csv',
                                    fields=fields)## need to put zero if only one data is being returned https://github.com/pytorch/text/issues/474


In [12]:
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)##the vocabulary will be out of the trainig

In [13]:
len(TRG.vocab)

12096

In [14]:
#trainingData

In [22]:
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), 
     batch_size = 64,sort=False,
     device = device)


In [23]:
for i,batch in enumerate(train_iterator):
    print(batch.SRC)
    print(batch.TRG)
    break

tensor([[   2,    5,   13,  ...,    1,    1,    1],
        [   2,   69,   35,  ...,    1,    1,    1],
        [   2,   20, 5797,  ...,    1,    1,    1],
        ...,
        [   2,    5,   16,  ...,    1,    1,    1],
        [   2,    5,   37,  ...,    1,    1,    1],
        [   2,   29,    9,  ...,    1,    1,    1]], device='cuda:0')
tensor([[  2,   5, 254,  ...,   1,   1,   1],
        [  2,  19,   6,  ...,   1,   1,   1],
        [  2,   5,  35,  ...,   1,   1,   1],
        ...,
        [  2,   5,  53,  ...,   1,   1,   1],
        [  2,   5,  35,  ...,   1,   1,   1],
        [  2,  13,  21,  ...,   1,   1,   1]], device='cuda:0')


In [24]:
vars(train_data.examples[1])['SRC']

['on',
 'ne',
 'peut',
 'pas',
 'être',
 'à',
 'deux',
 'endroits',
 'au',
 'même',
 'moment',
 '.']

In [25]:
### now we will start building the encoder class

class Encoder(nn.Module):
    def __init__(self,
                 input_dim,#length of the vocabulary
                 hid_dim,#512
                 n_layers,#6
                 n_heads,#8
                 pf_dim,#output dimention
                 dropout,
                 device,#cuda
                 max_length=100):
        super().__init__()
        
        self.device=device
        
        #input embedding
        self.tok_embedding=nn.Embedding(input_dim,hid_dim)
        ## convert the indices of the words to the position embedding. since there can be just 
        ## 100 tokens in the sentence hence the embedding will be a matrix of 100*hid_dim
        self.pos_embedding=nn.Embedding(max_length,hid_dim) 
        
        ##create layers ..what happens in each layer is listed
        self.layers=nn.ModuleList([EncoderLayer(hid_dim, ## this is 512
                                                n_heads, ## MHA has n_heads
                                                pf_dim, ## the dimentions of output
                                                dropout,
                                                device
                                                )for _ in range(n_layers)])
        
        
        
        ##define dropout
        self.dropout=nn.Dropout(dropout)
        self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    
    ##src=it is a set of input values that are in the form of batch
    ##after the Encoder running will get the inputs for the decoder
    def forward(self,src,src_mask):
        batch_size=src.shape[0]
        src_len=src.shape[1]
        
        ##it will fill the pos vector with the src_len numbers which are repeated batch_size
        ## also for the combination to happen between the pos vector and the input they 
        ## must be of the same dimention
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        ##this is how the batch will look like after combining the scaling factor, positional embedding
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        
        ##pass it to all the layers...these layers is the MHA & feedforward network + residual connections
        ##after the operations also the same variable src is being updated
        for layer in self.layers:
            src = layer(src, src_mask)
            
        return src
        
        
        
        


In [26]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        # the two norm layers
        self.self_attn_layer_norm=nn.LayerNorm(hid_dim)
        self.ff_layer_norm=nn.LayerNorm(hid_dim)
        
        ## MultiHeadAttentionLayer is a class that we will call
        self.self_attention=MultiHeadAttentionLayer(hid_dim,n_heads,dropout,device)
        self.dropout = nn.Dropout(dropout)
        ## the feedforward
        ## PositionwiseFeedforwardLayer is a class that will be called
        self.positionwise_feedforward=PositionwiseFeedforwardLayer(hid_dim,pf_dim,dropout)
        
    ##src is the batch wise data
    def forward(self,src,src_mask):
        
        #src = [batch size, src len, hid dim]...basically each token will be of hid_dim
        ##[[.1,.1,.2,.3,.4,.4,...512 entries]
        ##[.1,.1,.2,.3,.4,.4,...512 entries]] two tokens whose embedding is 512 length and are in the form of a batch
        
        
        #self_attention...it will take the query,key,value matrix and then calculate the self attention. Also to use
        ## the same code for the decoder by passing a src_mask
        #_src,_=self.self_attention(query=src,key=src,value=src,mask=src_mask)
        
        _src,_=self.self_attention(src,src,src,src_mask)
        
        ##apply the dropout, residual and pass to norm
        src=self.self_attn_layer_norm(src+self.dropout(_src))
        
        
        _src=self.positionwise_feedforward(src)
        
        src=self.ff_layer_norm(src+self.dropout(_src))
        
        return src
        

In [27]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
    
        assert hid_dim % n_heads == 0
        
        self.hid_dim=hid_dim
        self.n_heads=n_heads
        self.head_dim=hid_dim // n_heads
        
        ##this is as good as definng a Wq...when ever we will have a linear layer there will be a W associated
        ## also instead of decalring it for the head_dim we are declaring it for the hidden dimention and then 
        ## we will divide the matrix into the n_heads.. this way the code is much general
        self.fc_q=nn.Linear(hid_dim,hid_dim)
        self.fc_k=nn.Linear(hid_dim,hid_dim)
        self.fc_v=nn.Linear(hid_dim,hid_dim)
        
        ## this is the last weight matrix that will be used at the time of concatenation of the outputs        
        self.fc_o=nn.Linear(hid_dim,hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        ##the scaling factor as in scale dot products...this will be the sqrt of the dimention that we are using
        self.scale=torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    
    
    ## there are lot of transformation happeing
    ## [batch,word,hid_dim] -> [batch,word,n_head,head_dim] (view)-> [batch,n_head,word,head_dim] (permute)
    ## ->[batch,words,number_heads,head_dim] (permute +contiguous) -> [batch,words,hid_dim] (view)
    ##
    ##
    def forward(self,query,key,value,mask=None):
        
        #assuming that the matrices have an additional batch dimention
        batch_size = query.shape[0]
        
        #Q=[batch,word,hid_dim]
        Q=self.fc_q(query)
        K=self.fc_k(key)
        V=self.fc_v(value)
        
        #Q is of the dimention [batch,word,hid_dim]
        # we want to create new Q which is [batch,num_head,words,head_dim]
        # it is a 2 step thing...
        # step1 [batch,word,hid_dim] -> [batch,word,n_head,head_dim] using views
        #step2 using permute change the axis from [batch,word,n_head,head_dim] -> [batch,n_head,word,head_dim]
        Q=Q.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        K=K.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        V=V.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        
        
        ## now compuet the energy ..we need to take a transpose of K
        ## this will give us energy dimentions as [batch_len,number_heads,number_words,number_words]
        energy=torch.matmul(Q,K.permute(0,1,3,2))/self.scale
        #0, 1, 3, 2
        ## this is how we will use the mask
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        #attentiox=[batch,number_heads,words,words]
        attention=torch.softmax(energy,dim=-1)
        
        ##this will give x=[batch,number_heads,words,head_dim]
        x=torch.matmul(self.dropout(attention),V)
        
        #batchnumber,words,heads,head_dim..this is some memory optimization operation
        # but the x is also changed from [batch,number_heads,words,head_dim] -> [batch,words,number_heads,head_dim]
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #now number_heads and head_dim are getting combined together to get the single continuous tensor
        #[batch,words,number_heads,head_dim] -> [batch,words,hid_dim]
        x = x.view(batch_size, -1, self.hid_dim)
        
        
        ##pass it through the linear layer
        x=self.fc_o(x)
        
        
        #x=x = [batch size, words, hid dim]
        return x,attention
        
        
        
        
        

In [28]:
##positionwise feed forward layer
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,x):
        #x = [batch size, seq len, hid_dim]->[batch size, seq len, pf_dim]    
        x=self.dropout(torch.relu(self.fc_1(x)))
        
        #x=[batch size, seq len, pf_dim]->[batch size, seq len, hid_dim]
        x=self.fc_2(x)
        
        return x
        
    

In [29]:
##decoder
##this should be simialr to the encoder class so I am strating by copying the encoder class
class Decoder(nn.Module):
    def __init__(self,
                 output_dim,#length of the vocabulary
                 hid_dim,#512
                 n_layers,#6
                 n_heads,#8
                 pf_dim,#output dimention
                 dropout,
                 device,#cuda
                 max_length=100):
        super().__init__()
        
        self.device=device
        
        #input embedding
        self.tok_embedding=nn.Embedding(output_dim,hid_dim)
        ## convert the indices of the words to the position embedding. since there can be just 
        ## 100 tokens in the sentence hence the embedding will be a matrix of 100*hid_dim
        self.pos_embedding=nn.Embedding(max_length,hid_dim) 
        
        ##create layers ..what happens in each layer is listed
        self.layers=nn.ModuleList([DecoderLayer(hid_dim, ## this is 512
                                                n_heads, ## MHA has n_heads
                                                pf_dim, ## the dimentions of output
                                                dropout,
                                                device
                                                )
                                    for _ in range(n_layers)])
        
        
        ##thiis is extra
        self.fc_out = nn.Linear(hid_dim, output_dim)
        ##define dropout
        self.dropout=nn.Dropout(dropout)
        self.scale=torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    
    ## the forward function will be little different than the Encoder
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        batch_size=trg.shape[0]
        trg_len=trg.shape[1]
        
        ##it will fill the pos vector with the src_len numbers which are repeated batch_size
        ## also for the combination to happen between the pos vector and the input they 
        ## must be of the same dimention
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        ##this is how the batch will look like after combining the scaling factor, positional embedding
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        
        ##pass it to all the layers...these layers is the MHA & feedforward network + residual connections
        ##after the operations also the same variable src is being updated
        for layer in self.layers:
            trg,attention = layer(trg, enc_src, trg_mask, src_mask)
            
        output=self.fc_out(trg)
        return output,attention
        
        
        
        


In [30]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        #norm for the self attention
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        
        #norm for the encoder _attentions
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        
        #the feedfoward layer normalization..this one is after the encoder-attentions
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        
        ## calculate the self attentions
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        
        # calculate the attention using the K,V coming from encoder
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        
        #feedforward
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
        
        
    def forward(self,trg,enc_src,trg_mask,src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #trg_mask = [batch size, trg len]
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
            
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        
        #encoder attention Q is coming from Decoder while K and V are coming from encoder
        # the src_mask is to stop the decoder from using the <PAD> values in case if any
        #enc_src=[batchsize,source_len,hid_dim]
        #src_mask=[batbatch_size,source_len]
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        _trg=self.positionwise_feedforward(trg)
        
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        return trg,attention
            
            

In [31]:
##this module will encapsulate the encoder -decoder piece . it will also take care of the maskings
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    #this is just to add the 0s at the places which are pad
    def make_src_mask(self, src):        
        #src = [batch size, src len] 
        ## we are chaecking if the word is not a <PAD>..
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        #src_mask = [batch size, 1, 1, src len]
        return src_mask

    def make_trg_mask(self, trg):
        #first check for the padding and make them 0
        ##this is also checking if the word is not a PAD
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        trg_len = trg.shape[1]

        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()

        #if the padding is giving some more locatiosn as 0 then include them as well
        trg_mask = trg_pad_mask & trg_sub_mask
        
        return trg_mask
         
        #src = [batch size, src len]
        #trg = [batch size, trg len]
    def forward(self,src,trg):
            
        #src_mask=[batch size,1,1,src len]
        src_mask = self.make_src_mask(src)

        #trg_mask=[batch size,1,1,trg len]
        trg_mask = self.make_trg_mask(trg)

        #enc_src=[batch size, src len, hid dim]
        enc_src = self.encoder(src, src_mask)

        ##output = [batch size, trg len, output dim]
        #attention = [batch size, n_heads, trg len, src len]  ..in case of the encoder it is bs,nhead,src_len,src_len
        ## but in the case of the decoder Q is coming from decoder(aka target) so trg_len and 
        ## K is coming from encoder so src_len
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        return output,attention

In [32]:

BATCH_SIZE = 64
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]


In [33]:
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)
#model

In [34]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 15,461,952 trainable parameters


In [35]:
##weight initialization
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [36]:
model.apply(initialize_weights);


In [37]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [38]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [39]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
#     train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
#     train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    #for src, tgt in train_dataloader:
    count=0
    for i,batch in enumerate(train_iterator):
        src = batch.SRC.to(device)
        tgt = batch.TRG.to(device)
        # print(src.shape)
        # print(tgt.shape)
        # print(tgt[:,:-1].shape)

        optimizer.zero_grad()
        output, _ = model(src, tgt[:,:-1]) #[:,:-1])

        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:,1:].contiguous().view(-1)

        # output = output[1:].view(-1, output.shape[-1])
        # tgt = tgt[1:].reshape(-1) #tgt[:,:-1][1:].reshape(-1)
        loss = loss_fn(output, tgt)
        loss.backward()
        clip = 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        losses += loss.item()
        count+=1

    return losses / count


In [40]:
def evaluate(model):
    model.eval()
    losses = 0

#     val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
#     val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    count=0
    for i,batch in enumerate(test_iterator):
        src = batch.SRC.to(device)
        tgt = batch.TRG.to(device)[:,:-1]

        output, _ = model(src, tgt[:,:-1])
            
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
        
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        tgt = tgt[:,1:].contiguous().view(-1)

        # output = output[1:].view(-1, output.shape[-1])
        # tgt = tgt[1:].reshape(-1)
        loss = loss_fn(output, tgt)
        losses += loss.item()
        count+=1
    return losses / count


In [41]:
from timeit import default_timer as timer
NUM_EPOCHS = 10
train_losses = []
val_losses = []

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Train PPL: {math.exp(train_loss):7.3f} | Val loss: {val_loss:.3f}, Val. PPL: {math.exp(val_loss):7.3f} | "f"Epoch time = {(end_time - start_time):.3f}s"))
    train_losses.append(train_loss)
    val_losses.append(val_loss)


Epoch: 1, Train loss: 2.864, Train PPL:  17.536 | Val loss: 1.701, Val. PPL:   5.480 | Epoch time = 54.723s
Epoch: 2, Train loss: 1.520, Train PPL:   4.573 | Val loss: 1.232, Val. PPL:   3.429 | Epoch time = 55.496s
Epoch: 3, Train loss: 1.095, Train PPL:   2.989 | Val loss: 1.094, Val. PPL:   2.985 | Epoch time = 55.959s
Epoch: 4, Train loss: 0.871, Train PPL:   2.389 | Val loss: 1.021, Val. PPL:   2.775 | Epoch time = 56.227s
Epoch: 5, Train loss: 0.727, Train PPL:   2.070 | Val loss: 0.995, Val. PPL:   2.704 | Epoch time = 57.041s
Epoch: 6, Train loss: 0.628, Train PPL:   1.874 | Val loss: 0.984, Val. PPL:   2.675 | Epoch time = 56.153s
Epoch: 7, Train loss: 0.554, Train PPL:   1.741 | Val loss: 0.979, Val. PPL:   2.662 | Epoch time = 56.122s
Epoch: 8, Train loss: 0.500, Train PPL:   1.648 | Val loss: 0.977, Val. PPL:   2.656 | Epoch time = 56.240s
Epoch: 9, Train loss: 0.455, Train PPL:   1.576 | Val loss: 0.977, Val. PPL:   2.657 | Epoch time = 56.316s
Epoch: 10, Train loss: 0.420

In [44]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
        
#     if isinstance(sentence, str):
#         src_tensor = text_transform[SRC_LANGUAGE](sentence).unsqueeze(0).to(device)
#     else:
#         src_tensor = text_transform[TGT_LANGUAGE](sentence).unsqueeze(0).to(device)
    
#     src_mask = model.make_src_mask(src_tensor)
    
    
    if isinstance(sentence, str):
        nlp = spacy.load('fr_core_news_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    #trg_indexes = [trg_field[init_token]]
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]


    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        #if pred_token == trg_field[eos_token]:
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
            
    
    #trg_tokens = [trg_field.vocab.get_itos()[i] for i in trg_indexes]
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [60]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [43]:
for example_idx in range(8,12):

    src = vars(test_data.examples[example_idx])['SRC']
    trg = vars(test_data.examples[example_idx])['TRG']

    print("src = ",' '.join(src))
    print("trg =" ,' '.join(trg))

    translation, attention = translate_sentence(src, SRC, TRG, model, device)
    print("predicted trg =" ,' '.join(translation))
    print("*"*40)
    #print(f'predicted trg = {' '.join(translation)}')

src =  vrai ?
trg = really ?
predicted trg = is it true ? <eos>
****************************************
src =  nous gagnâmes .
trg = we won .
predicted trg = we 're cooperating . <eos>
****************************************
src =  sois calme !
trg = be calm .
predicted trg = be quiet . <eos>
****************************************
src =  soyez gentille !
trg = be nice .
predicted trg = be nice . <eos>
****************************************


in the example we can see that the can not has been replaced by can't
duel is not present imn the vocab and hence it has been replaced by <Unk>

In [38]:
for example_idx in range(40,50):

    src = vars(train_data.examples[example_idx])['SRC']
    trg = vars(train_data.examples[example_idx])['TRG']

    print("src = ",' '.join(src))
    print("trg =" ,' '.join(trg))

    translation, attention = translate_sentence(src, SRC, TRG, model, device)
    print("predicted trg =" ,' '.join(translation))
    print("*"*40)
    #print(f'predicted trg = {' '.join(translation)}')

src =  j' ai agrippé la corde aussi longtemps que j' ai pu , mais j' ai finalement dû la lâcher .
trg = i held onto the rope for as long as i could , but i finally had to let go .
predicted trg = i held onto the rope for as long as i could , but i finally could at go let go . <eos>
****************************************
src =  j' espère que vous allez tous bien .
trg = i hope you are all well .
predicted trg = i hope you 're all right . <eos>
****************************************
src =  la musique numérique devient de plus en plus populaire .
trg = digital music is becoming more and more popular .
predicted trg = digital music is becoming more popular and more popular . <eos>
****************************************
src =  il ne sait pas jouer de la guitare .
trg = he can not play guitar .
predicted trg = he ca n't play guitar . <eos>
****************************************
src =  ça a été une bonne journée .
trg = it was a good day .
predicted trg = it was a good day . <eos>
***