1.Imports

In [55]:
# About the data:
#"The dataset we'll be using is the Multi30k dataset. This is a dataset with ~30,000 parallel English, German and French sentences, each with ~12 words per sentence"

In [56]:
# import the dependencies
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k                                  # Dataset 
from torchtext.legacy.data import Field, BucketIterator                         # For text preprocessing
import spacy                                                                    # For tokenization
import numpy as np
import random
import math
import time

2.Data Preprocessing using torchtext

In [57]:
!python -m spacy download de_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [58]:
import en_core_web_sm
import de_core_news_sm

In [59]:
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')


In [60]:
# Tokenize English and German text from a string into a list of strings (tokens):
def tokenize_de(text):
  """Split a German string into a list of token strings with the spaCy German tokenizer."""
  tokens = spacy_german.tokenizer(text)
  return [token.text for token in tokens]

def tokenize_en(text):
  """Split an English string into a list of token strings with the spaCy English tokenizer."""
  tokens = spacy_english.tokenizer(text)
  return [token.text for token in tokens]

In [61]:
# Fields describe how raw text is preprocessed before it is numericalized.

# Source (German) field.
SRC = Field(
    tokenize=tokenize_de,    # spaCy-based German tokenizer defined above
    init_token='<sos>',      # <sos> marks the start of a sentence
    eos_token='<eos>',       # <eos> marks the end of a sentence
    lower=True,              # lowercase the text
)

# Target (English) field, configured the same way as the source field.
TRG = Field(
    tokenize=tokenize_en,    # spaCy-based English tokenizer defined above
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [62]:
# Data split: load the train / validation / test splits of Multi30k.
# `exts` selects which language file is the source ('.de') and which is the target ('.en');
# `fields` pairs them with the SRC and TRG preprocessing fields, in the same order.
train_data, validation_data, test_data = Multi30k.splits(exts=('.de','.en'),
                                                         fields = (SRC,TRG))

In [63]:
# Report how many examples (source/target sentence pairs) each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(validation_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [64]:
# Sanity check: show one tokenized, lowercased training pair ('src' and 'trg' token lists).
print(vars(train_data.examples[1500]))

{'src': ['eine', 'person', 'in', 'einer', 'warnweste', ',', 'die', 'einen', 'lkw', 'schiebt', '.'], 'trg': ['a', 'person', 'in', 'a', 'safety', 'vest', 'pushing', 'a', 'truck', '.']}


In [66]:
# Building the vocabularies.
# The vocabularies must be built from the training set only, never from the
# validation/test sets, to avoid leaking information into the model.
# With min_freq = 2, only tokens that appear at least 2 times in the training set are kept.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [67]:
# Vocabulary sizes; these are later used as INPUT_DIM (encoder) and OUTPUT_DIM (decoder).
print(f"Unique tokens in source (German) vocabulary: {len(SRC.vocab)}")               # input_dim to encoder
print(f"Unique tokens in target (English) vocabulary: {len(TRG.vocab)}")              # output_dim to decoder

Unique tokens in source (German) vocabulary: 7855
Unique tokens in target (English) vocabulary: 5893


In [68]:
# Seed every random number generator the notebook uses so that the stochastic
# parts of a run (weight initialization, dropout, the teacher-forcing coin flip
# in Seq2Seq.forward) are repeatable after Restart Kernel -> Run All.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True      # prefer deterministic cuDNN kernels

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [69]:
# BucketIterator: the final step of preparing the data is to create the iterators.
# Iterating over them yields batches whose `src` and `trg` attributes are PyTorch
# tensors of numericalized sentences, i.e. sequences of readable tokens converted
# to sequences of the corresponding vocabulary indexes.
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_within_batch=True,
)

In [70]:
# Number of batches per split (train, validation, test).
len(train_iterator),len(valid_iterator),len(test_iterator)

(227, 8, 8)

In [71]:
# Confirm which iterator class BucketIterator.splits returned.
type(train_iterator)

torchtext.legacy.data.iterator.BucketIterator

3.Seq2Seq Model Building 

In [72]:
class Encoder(nn.Module):
  """Multi-layer LSTM encoder that summarizes a source sequence as its final states.

  Args:
    input_dim: size of the source vocabulary (number of embedding rows).
    emb_dim: dimensionality of the dense token embeddings.
    hid_dim: dimensionality of the LSTM hidden and cell states.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and between LSTM layers.
  """

  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    # Exposed so that Seq2Seq can check encoder/decoder compatibility.
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
    self.rnn = nn.LSTM(input_size=emb_dim,
                       hidden_size=hid_dim,
                       num_layers=n_layers,
                       dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, src):
    """Encode a batch of source token indices.

    Args:
      src: tensor of shape [src len, batch size].

    Returns:
      Tuple (hidden, cell), each of shape [n_layers, batch size, hid_dim].
    """
    token_vectors = self.embedding(src)          # [src len, batch size, emb_dim]
    rnn_input = self.dropout(token_vectors)

    # No initial state is passed, so the LSTM starts from all-zero states.
    # The per-step outputs are not needed; only the final states are returned.
    _, final_states = self.rnn(rnn_input)
    hidden, cell = final_states

    return hidden, cell
# In the forward method, we pass the source sentence, which is converted into dense vectors by the embedding layer, and then dropout is applied.
# These embeddings are then passed into the RNN. Because the whole sequence is passed to the RNN at once, it automatically performs the recurrent calculation of the hidden states over the whole sequence.
# Notice that we do not pass an initial hidden or cell state to the RNN; when none is passed, it automatically creates an initial hidden/cell state as a tensor of all zeros.

class Decoder(nn.Module):
  """Multi-layer LSTM decoder that predicts one target token per call.

  Args:
    output_dim: size of the target vocabulary (also the width of the prediction).
    emb_dim: dimensionality of the dense token embeddings.
    hid_dim: dimensionality of the LSTM hidden and cell states.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and between LSTM layers.
  """

  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    # Exposed so that Seq2Seq can size its output buffer and check compatibility.
    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
    self.rnn = nn.LSTM(input_size=emb_dim,
                       hidden_size=hid_dim,
                       num_layers=n_layers,
                       dropout=dropout)
    # Fully connected layer mapping the top-layer LSTM output to vocabulary scores.
    self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input, hidden, cell):
    """Decode a single time step.

    Args:
      input: tensor of shape [batch size] holding the current input token indices.
      hidden: previous hidden state, [n_layers, batch size, hid_dim].
      cell: previous cell state, [n_layers, batch size, hid_dim].

    Returns:
      Tuple (prediction, hidden, cell) where prediction is
      [batch size, output_dim] and hidden/cell are the updated states.
    """
    # Add a sequence-length dimension of 1: [batch size] -> [1, batch size].
    step_tokens = input.unsqueeze(0)

    step_vectors = self.dropout(self.embedding(step_tokens))   # [1, batch size, emb_dim]
    output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))

    # Drop the length-1 sequence dimension before projecting to the vocabulary.
    prediction = self.fc_out(output.squeeze(0))

    return prediction, hidden, cell

# Within the forward method, we accept a batch of input tokens, the previous hidden states and the previous cell states.
# As we are only decoding one token at a time, the input tokens will always have a sequence length of 1.
# We unsqueeze the input tokens to add a sentence-length dimension of 1. Then, as in the encoder, we pass them through an embedding layer and apply dropout.
# This batch of embedded tokens is then passed into the RNN together with the previous hidden and cell states (for the first step these are the encoder's final states, i.e. the context vector).
# This produces an output (the hidden state from the top layer of the RNN), a new hidden state (one per layer, stacked on top of each other) and a new cell state (also one per layer, stacked on top of each other).
# We then pass the output (after removing the sentence-length dimension) through the linear layer to obtain our prediction, and return the prediction, the new hidden state and the new cell state.
       
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence one token at a time.

  Args:
    encoder: module whose forward(src) returns (hidden, cell) and which exposes
      `hid_dim` and `n_layers`.
    decoder: module whose forward(input, hidden, cell) returns
      (prediction, hidden, cell) and which exposes `hid_dim`, `n_layers`
      and `output_dim`.
    device: device on which the tensor of decoder outputs is allocated.

  Raises:
    AssertionError: if the encoder and decoder disagree on hidden size or layer count.
  """

  def __init__(self,encoder,decoder,device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

    # The encoder's final states are fed straight into the decoder, so their shapes must match.
    assert encoder.hid_dim == decoder.hid_dim, \
        "Hidden dimensions of encoder and decoder must be equal!"
    assert encoder.n_layers == decoder.n_layers, \
        "Encoder and decoder must have equal number of layers!"

  def forward(self, src, trg, teacher_forcing_ratio = 0.5):
    """Run the encoder once, then unroll the decoder over the target length.

    Args:
      src: source token indices, [src len, batch size].
      trg: target token indices, [trg len, batch size]; trg[0] holds the <sos> tokens.
      teacher_forcing_ratio: probability of feeding the ground-truth token (rather
        than the model's own prediction) as the next decoder input, e.g. 0.75 uses
        ground-truth inputs 75% of the time.

    Returns:
      Tensor of shape [trg len, batch size, output_dim]. Row 0 is left as zeros
      because decoding starts from the <sos> token and predicts positions 1 onwards.
    """
    trg_len, batch_size = trg.shape[0], trg.shape[1]
    trg_vocab_size = self.decoder.output_dim

    # Allocate the output buffer directly on the target device instead of
    # creating it on the host and copying it over on every forward pass.
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

    # The encoder's final hidden/cell states initialize the decoder.
    hidden, cell = self.encoder(src)

    # The first decoder input is the row of <sos> tokens.
    input = trg[0,:]

    for t in range(1, trg_len):
      # One decoding step: previous token + previous states -> prediction + new states.
      output, hidden, cell = self.decoder(input, hidden, cell)

      # Store the predictions for position t.
      outputs[t] = output

      # Decide once per time step (for the whole batch) whether to teacher-force.
      teacher_force = random.random() < teacher_forcing_ratio

      # Highest-scoring token for each sentence in the batch.
      top1 = output.argmax(1)

      # Next input: the ground-truth token when teacher forcing, else the prediction.
      input = trg[t] if teacher_force else top1

    return outputs


4.Assigning  Parameters, loss function , optimizers

In [73]:
# Model hyperparameters.
INPUT_DIM = len(SRC.vocab)      # source (German) vocabulary size -> encoder embedding rows
OUTPUT_DIM = len(TRG.vocab)     # target (English) vocabulary size -> decoder embedding rows and output width
ENC_EMB_DIM = 256               # encoder embedding dimensionality
DEC_EMB_DIM = 256               # decoder embedding dimensionality
HID_DIM = 512                   # LSTM hidden/cell size; Seq2Seq asserts it is equal for encoder and decoder
N_LAYERS = 2                    # stacked LSTM layers; Seq2Seq asserts it is equal for encoder and decoder
ENC_DROPOUT = 0.5               # encoder dropout probability
DEC_DROPOUT = 0.5               # decoder dropout probability

# Build the encoder and decoder, wrap them in Seq2Seq and move the whole model to `device`.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

In [74]:
#Initialize the weights :
def init_weights(m):
  """Overwrite every parameter reachable from module `m` with samples from U(-0.08, 0.08)."""
  low, high = -0.08, 0.08
  for weight in m.parameters():
    nn.init.uniform_(weight.data, low, high)

model.apply(init_weights)
# Initialize all weights from a uniform distribution between -0.08 and +0.08.
# model.apply calls init_weights on every module and sub-module within our model. For each module, init_weights loops through its parameters and samples them from a uniform distribution with nn.init.uniform_.
# NOTE(review): named_parameters() recurses into sub-modules by default, so nested parameters appear to be re-sampled once per enclosing module - harmless for the final distribution, but redundant; confirm.


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [75]:
#function that will calculate the number of trainable parameters in the model.
def count_parameters(model):
    """Return the total number of elements across all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,899,013 trainable parameters


In [76]:
# Defining the optimizer: Adam, with its default hyperparameters, updates the
# model's parameters in the training loop.
optimizer = optim.Adam(model.parameters())

In [77]:
# Defining the loss function:
# CrossEntropyLoss averages the loss over tokens; passing the index of the <pad>
# token as ignore_index makes it skip every position whose target is padding.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

5.Define Training loop:

In [78]:
def train(model, iterator, optimizer, criterion, clip):
    """Train `model` for one epoch and return the mean loss per batch.

    Args:
        model: callable as model(src, trg) returning a tensor of shape
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg` tensors.
        optimizer: optimizer updating the model's parameters.
        criterion: loss taking 2-D predictions and 1-D targets.
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.train()                                   # training mode (enables dropout)

    epoch_loss = 0

    for batch in iterator:
        src = batch.src
        trg = batch.trg                             # [trg len, batch size]

        optimizer.zero_grad()                       # discard gradients from the previous batch

        output = model(src, trg)                    # [trg len, batch size, output dim]
        output_dim = output.shape[-1]

        # The loss needs 2-D predictions and 1-D targets. Position 0 (the <sos>
        # slot, never predicted) is sliced off, then both tensors are flattened.
        # reshape is used rather than view so that non-contiguous tensors
        # (e.g. a transposed batch-first target) are handled instead of raising.
        output = output[1:].reshape(-1, output_dim)  # [(trg len - 1) * batch size, output dim]
        trg = trg[1:].reshape(-1)                    # [(trg len - 1) * batch size]

        loss = criterion(output, trg)

        loss.backward()

        # Clip the gradient norm to guard against exploding gradients (a common issue in RNNs).
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()                   # running total of per-batch losses

    return epoch_loss / len(iterator)

6.Define Evaluation loop:


In [79]:
def evaluate(model, iterator, criterion):
    """Evaluate `model` on every batch of `iterator` and return the mean loss per batch.

    Args:
        model: callable as model(src, trg, teacher_forcing_ratio) returning a
            tensor of shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg` tensors.
        criterion: loss taking 2-D predictions and 1-D targets.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.eval()                                    # evaluation mode (turns off dropout)

    epoch_loss = 0

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():

        for batch in iterator:
            src = batch.src
            trg = batch.trg                         # [trg len, batch size]

            # Teacher forcing is switched off (ratio 0): the decoder consumes its own predictions.
            output = model(src, trg, 0)             # [trg len, batch size, output dim]
            output_dim = output.shape[-1]

            # Drop position 0 (the <sos> slot) and flatten for the loss. reshape is
            # used rather than view so non-contiguous tensors do not raise.
            output = output[1:].reshape(-1, output_dim)  # [(trg len - 1) * batch size, output dim]
            trg = trg[1:].reshape(-1)                    # [(trg len - 1) * batch size]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [80]:
# Helper that reports how long an epoch took, as whole minutes and seconds.
def epoch_time(start_time, end_time):
    """Convert a start/end timestamp pair (in seconds) to (whole minutes, remaining whole seconds)."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

7.Training the Model

In [81]:
#At each epoch, we'll be checking if our model has achieved the best validation loss so far. If it has, we'll update our best validation loss and save the parameters of our model (called state_dict in PyTorch).
#Then, when we come to test our model, we'll use the saved parameters used to achieve the best validation loss.

N_EPOCHS = 10       # number of full passes over the training iterator
CLIP = 1            # maximum gradient norm handed to train()

# Lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One pass over the training data, then a validation pass.
    train_loss = train(model,train_iterator,optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the parameters only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Pytorch Enc-Dec Model.pt')
    
    # Report the losses and their perplexities, exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Perplexity (PPL) is the exponential of the average per-token cross-entropy loss, which is why it is reported above as math.exp(loss). Equivalently, it is the multiplicative inverse of the probability the language model assigns to the evaluated set, normalized by the number of words in that set.
# A language model that predicts unseen words from the test set well assigns them higher probability; as a result, better language models have lower perplexity values (higher probability values) on a test set.

Epoch: 01 | Time: 12m 6s
	Train Loss: 5.055 | Train PPL: 156.858
	 Val. Loss: 4.854 |  Val. PPL: 128.253
Epoch: 02 | Time: 12m 5s
	Train Loss: 4.594 | Train PPL:  98.851
	 Val. Loss: 4.752 |  Val. PPL: 115.818
Epoch: 03 | Time: 12m 4s
	Train Loss: 4.238 | Train PPL:  69.272
	 Val. Loss: 4.586 |  Val. PPL:  98.061
Epoch: 04 | Time: 11m 59s
	Train Loss: 4.028 | Train PPL:  56.158
	 Val. Loss: 4.398 |  Val. PPL:  81.252
Epoch: 05 | Time: 12m 4s
	Train Loss: 3.831 | Train PPL:  46.114
	 Val. Loss: 4.256 |  Val. PPL:  70.534
Epoch: 06 | Time: 12m 4s
	Train Loss: 3.653 | Train PPL:  38.585
	 Val. Loss: 4.191 |  Val. PPL:  66.098
Epoch: 07 | Time: 12m 1s
	Train Loss: 3.480 | Train PPL:  32.455
	 Val. Loss: 4.037 |  Val. PPL:  56.660
Epoch: 08 | Time: 12m 1s
	Train Loss: 3.323 | Train PPL:  27.757
	 Val. Loss: 3.930 |  Val. PPL:  50.911
Epoch: 09 | Time: 12m 4s
	Train Loss: 3.179 | Train PPL:  24.030
	 Val. Loss: 3.830 |  Val. PPL:  46.056
Epoch: 10 | Time: 12m 4s
	Train Loss: 3.035 | Train PP

In [83]:
#We'll load the parameters (state_dict) that gave our model the best validation loss and run it the model on the test set.
# Restore the checkpoint with the best validation loss. map_location remaps the
# saved tensors onto the current `device`, so a checkpoint written on a GPU can
# still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('Pytorch Enc-Dec Model.pt', map_location=device))

# Loss of the restored model on the held-out test set.
test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.732 | Test PPL:  41.757 |
