<a href="https://colab.research.google.com/github/Sghosh32/Neural-Machine-Translation/blob/main/Sequence2sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

import spacy
import numpy as np

import random
import math
import time

In [None]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Notebook is running on", device)

Notebook is running on cuda


In [None]:
# Seed every random number generator the notebook touches, in a fixed order,
# and ask cuDNN for deterministic kernels.
seed = 3456
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)
torch.backends.cudnn.deterministic = True

In [None]:
!python -m spacy download de
!python -m spacy download en 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 5.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.0 MB/s 
[38;5;2m✔ Download and installation s

In [None]:
# spaCy pipelines loaded through the 'de' / 'en' shortcut links.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def de_tokenizer(text):
  """Split `text` into a list of token strings using the `spacy_de` tokenizer."""
  pieces = []
  for piece in spacy_de.tokenizer(text):
    pieces.append(piece.text)
  return pieces


def en_tokenizer(text):
  """Split `text` into a list of token strings using the `spacy_en` tokenizer."""
  pieces = []
  for piece in spacy_en.tokenizer(text):
    pieces.append(piece.text)
  return pieces

In [None]:
# Field for the source side: tokenized with `de_tokenizer`, lower-cased, and
# wrapped in the '<src_sos>' / '<src_eos>' marker tokens.
Source_Field = Field(eos_token = '<src_eos>', init_token = '<src_sos>', lower = True, tokenize = de_tokenizer)

# Field for the target side: tokenized with `en_tokenizer`, lower-cased, and
# wrapped in the '<trg_sos>' / '<trg_eos>' marker tokens.
Target_Field = Field(eos_token = '<trg_eos>', init_token = '<trg_sos>', lower = True, tokenize = en_tokenizer)

In [None]:
# Load the three Multi30k splits; '.de' files are parsed with Source_Field and
# '.en' files with Target_Field.
# NOTE(review): the output saved with this cell is `FileNotFoundError: ignored`,
# i.e. the dataset files could not be found in the recorded run - confirm the
# data is reachable before relying on the cells that follow.
training_data, validation_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (Source_Field, Target_Field)) 

FileNotFoundError: ignored

In [None]:
# Report how many examples each split holds.
for _summary_line in (
    f"Number of Training Examples: {len(training_data.examples)}",
    f"Number of Validation Examples: {len(validation_data.examples)}",
    f"Number of Testing Examples: {len(test_data.examples)}",
):
    print(_summary_line)

# Show the second example of every split as a plain attribute dictionary.
for _split in (training_data, validation_data, test_data):
    print(vars(_split.examples[1]))

In [None]:
# Build both vocabularies from the training split only; `min_freq = 2` is
# passed so that tokens seen fewer than two times are not given their own entry.
Source_Field.build_vocab(training_data, min_freq = 2)
Target_Field.build_vocab(training_data, min_freq = 2)

# Vocabulary sizes; the same lengths are used later as embedding / output sizes.
print(f"Source vocab size: {len(Source_Field.vocab)}")
print(f"Target vocab size: {len(Target_Field.vocab)}")

In [None]:
class Encoder(nn.Module):
    """Embeds token ids and runs them through a multi-layer LSTM.

    Only the final LSTM states are returned; the per-step outputs are discarded.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        encoder_dropout: dropout probability, used both on the embeddings and
            as the LSTM's `dropout` argument.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, encoder_dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=encoder_dropout,
        )
        self.dropout = nn.Dropout(encoder_dropout)

    def forward(self, input):
        """Return the LSTM's final states for a tensor of token ids.

        The LSTM is built with the default `batch_first=False`, so `input` is
        consumed with the sequence axis first.
        """
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        _, final_states = self.rnn(embedded)
        return final_states

In [None]:
class Decoder(nn.Module):
    """Embeds target token ids, advances an LSTM and projects to vocabulary scores.

    Args:
        vocab_size: size of the target vocabulary (also stored as `output_dim`).
        embedding_dim: size of each embedding vector (the LSTM input size).
        hidden_dim: LSTM hidden size and input width of the output projection.
        num_layers: number of stacked LSTM layers.
        decoder_dropout: dropout probability, used both on the embeddings and
            as the LSTM's `dropout` argument.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, decoder_dropout):
        super().__init__()
        self.output_dim = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=decoder_dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(decoder_dropout)

    def forward(self, input, states):
        """Run the LSTM from `states` over `input` and score every vocabulary entry.

        Returns:
            A pair `(scores, states)`. `scores` is the projection of the LSTM
            output with one extra leading dimension added by `unsqueeze(0)`;
            `states` are the updated LSTM states.
        """
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        rnn_output, next_states = self.rnn(embedded, states)
        scores = self.fc(rnn_output)
        return scores.unsqueeze(0), next_states


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target step at a time.

    Args:
        encoder: module called as `encoder(source)`; its return value is used
            as the decoder's initial states.
        decoder: module called as `decoder(step_input, states)` returning
            `(scores, states)`; it must expose `output_dim`.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio):
        """Decode `target.shape[0] - 1` steps and return the per-step scores.

        Args:
            source: tensor handed unchanged to the encoder.
            target: tensor indexed as `[step, batch]`; row 0 supplies the first
                decoder input.
            teacher_forcing_ratio: probability, drawn independently per step,
                of feeding the reference token instead of the model's own
                arg-max prediction as the next input.

        Returns:
            Tensor of shape `(target.shape[0] - 1, batch, decoder.output_dim)`.
        """
        decoder_states = self.encoder(source)
        sequence_length, batch_size = target.shape[0], target.shape[1]
        vocab_size = self.decoder.output_dim
        # Allocate next to the inputs instead of relying on a module-level device.
        predictions = torch.zeros(sequence_length, batch_size, vocab_size, device=target.device)
        decoder_input = target[0, :].unsqueeze(0)
        for t in range(1, sequence_length):
            output, decoder_states = self.decoder(decoder_input, decoder_states)
            # Reshape explicitly: a blanket squeeze() would also drop the batch
            # axis when batch_size == 1 and make argmax(1) fail.
            output = output.view(batch_size, vocab_size)
            predictions[t] = output
            if random.random() < teacher_forcing_ratio:
                decoder_input = target[t].unsqueeze(0)
            else:
                decoder_input = output.argmax(1).unsqueeze(0)
        # Row 0 was never written (it corresponds to the start token).
        return predictions[1:]
        

In [None]:
# Number of examples per batch for all three iterators.
Batch_size = 128
# One BucketIterator per split, each producing batches on `device`.
training_iterator, validation_iterator, test_iterator = BucketIterator.splits((training_data, validation_data, test_data), batch_size = Batch_size, device = device)
# Vocabulary sizes taken from the fields' built vocabularies.
SRC_VOCAB_SIZE = len(Source_Field.vocab)
TRG_VOCAB_SIZE = len(Target_Field.vocab)
# Model dimensions shared by the encoder and the decoder.
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5
# Optimisation settings.
LR = 0.001
CLIP = 1
NUM_EPOCHS = 20

In [None]:
# Index of the target field's padding token; positions holding it are excluded
# from the loss through `ignore_index`.
target_padding_index = Target_Field.vocab.stoi[Target_Field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = target_padding_index)


In [None]:
# Build the encoder, the decoder and the wrapping model on `device`.
encoder = Encoder(SRC_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, ENCODER_DROPOUT).to(device)
decoder = Decoder(TRG_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DECODER_DROPOUT).to(device)
seq2seq = Seq2Seq(encoder, decoder).to(device)
# Pass the configured learning rate explicitly so that editing LR actually
# changes training (previously Adam silently used its own default).
optimizer = optim.Adam(seq2seq.parameters(), lr = LR)

In [None]:
# Show the optimizer's configuration (parameter groups and their settings).
print(optimizer)

In [None]:
def weight_initialization(m):
    """Overwrite every parameter reachable from `m` with samples from U(-0.08, 0.08)."""
    for parameter in m.parameters():
        nn.init.uniform_(parameter.data, -0.08, 0.08)
# `Module.apply` calls the function on every sub-module and on the model itself,
# and the function walks parameters recursively, so nested parameters are
# redrawn more than once; every draw comes from the same uniform range.
seq2seq.apply(weight_initialization)



In [None]:
def Train(model, iterator, criterion, optimizer, clip = 1, teacher_forcing_ratio = 0.5):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: called as `model(source, target, teacher_forcing_ratio=...)` and
            expected to return scores whose last axis is the vocabulary.
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss called as `criterion(flat_scores, flat_targets)`.
        optimizer: optimizer stepping the model's parameters.
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        teacher_forcing_ratio: forwarded to the model; defaults to the value
            that used to be hard-coded (0.5).

    Returns:
        Sum of the per-batch losses divided by `len(iterator)`.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        model.zero_grad()
        source = batch.src
        target = batch.trg
        outputs = model(source, target, teacher_forcing_ratio = teacher_forcing_ratio)
        # Flatten to (steps * batch, vocab) vs. (steps * batch,); the first
        # target row is skipped because the model returns no score for it.
        outputs = outputs.view(-1, outputs.shape[-1])
        targets = target[1:].view(-1)
        # Follow the scores' device instead of depending on a global `device`.
        batch_loss = criterion(outputs, targets.to(outputs.device))
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += batch_loss.item()
    return epoch_loss / len(iterator)


In [None]:
def Evaluate(iterator, model, criterion):
    """Return the mean batch loss of `model` over `iterator` without training.

    The model is switched to eval mode and always decodes from its own
    predictions (teacher-forcing ratio 0). Gradients are not tracked.

    Args:
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        model: called as `model(source, target, 0)`.
        criterion: loss called as `criterion(flat_scores, flat_targets)`.

    Returns:
        Sum of the per-batch losses divided by `len(iterator)`.
    """
    model.eval()
    evaluation_loss = 0
    with torch.no_grad():
        for batch in iterator:
            source = batch.src
            target = batch.trg
            outputs = model(source, target, 0)
            outputs = outputs.view(-1, outputs.shape[-1])
            targets = target[1:].view(-1)
            batch_loss = criterion(outputs, targets.to(outputs.device))
            # Accumulate: assigning here would keep only the last batch's loss
            # and make the returned "average" meaningless.
            evaluation_loss += batch_loss.item()
    return evaluation_loss / len(iterator)
      

In [None]:
def Epoch_Time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple `(minutes, seconds)` of ints, both truncated towards zero.
    """
    # Subtract; the previous chained assignment (`= end_time = start_time`)
    # reported the raw start timestamp instead of the elapsed time.
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [None]:
# Train for NUM_EPOCHS epochs, validating after each one and reporting loss,
# perplexity (exp of the loss) and wall-clock time. Train() and Evaluate() set
# the train/eval mode themselves, so no mode switch is needed up front.
best_valid_loss = float('inf')
for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    # Forward the configured clipping threshold instead of relying on the
    # function's default.
    train_loss = Train(seq2seq, training_iterator, criterion, optimizer, CLIP)
    valid_loss = Evaluate(validation_iterator, seq2seq, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = Epoch_Time(start_time, end_time)

    # Track the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
def ipTensor(sentence, src_field, tensor_device = None):
    """Turn a sentence into a column tensor of vocabulary ids.

    Args:
        sentence: either a raw string, which is tokenized with
            `src_field.tokenize`, or an already tokenized sequence of strings.
        src_field: field providing `tokenize`, `lower`, `init_token`,
            `eos_token` and `vocab.stoi`.
        tensor_device: device for the result; when omitted the notebook's
            module-level `device` is used.

    Returns:
        LongTensor of shape `(len(tokens) + 2, 1)`: the init token, the
        sentence tokens and the eos token, mapped through `src_field.vocab.stoi`.
    """
    if isinstance(sentence, str):
        words = src_field.tokenize(sentence)
    else:
        words = list(sentence)
    # Apply the field's casing rule on both paths; raw strings used to skip
    # lower-casing and therefore missed the lower-cased vocabulary.
    if src_field.lower:
        words = [word.lower() for word in words]
    tokens = [src_field.init_token] + words + [src_field.eos_token]
    if tensor_device is None:
        tensor_device = device
    token_ids = [src_field.vocab.stoi[token] for token in tokens]
    ip_tensor = torch.tensor(token_ids, dtype=torch.long, device=tensor_device)
    return ip_tensor.view(len(tokens), 1)

In [None]:
def Translate(source_sentence, source_field, target_field, model):
    """Greedily decode a translation for one source sentence.

    Args:
        source_sentence: raw string or list of tokens, as accepted by `ipTensor`.
        source_field: field used to numericalize the source sentence.
        target_field: field whose vocabulary supplies the start/end token ids
            and maps predicted ids back to strings.
        model: object exposing `encoder`, `decoder` and `decoder.output_dim`.

    Returns:
        List of predicted target tokens without the start token. The end token
        is included as the last element when the model produced it; decoding
        otherwise stops once four times the source length (markers included)
        has been reached.
    """
    input_tensor = ipTensor(source_sentence, source_field)
    max_length = 4 * input_tensor.shape[0]
    # Look the markers up in the field that was passed in, not in a global.
    sos_id = target_field.vocab.stoi[target_field.init_token]
    eos_id = target_field.vocab.stoi[target_field.eos_token]

    # Decode with dropout disabled, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            decoder_states = model.encoder(input_tensor)
            predicts = [sos_id]
            while len(predicts) < max_length:
                step_input = torch.tensor(
                    [[predicts[-1]]], dtype=torch.long, device=input_tensor.device
                )
                output, decoder_states = model.decoder(step_input, decoder_states)
                output = output.view(-1, model.decoder.output_dim)
                predicts.append(output.argmax(-1).item())
                if predicts[-1] == eos_id:
                    break
    finally:
        model.train(was_training)

    return [target_field.vocab.itos[token_id] for token_id in predicts[1:]]


In [None]:
# Translate one randomly chosen test example and show it next to the reference.
ind = int(random.random() * len(test_data.examples))
example = test_data.examples[ind]
source_sentence = example.src
target_sentence = example.trg
print("German Sentence: ", ' '.join(source_sentence))
translation = Translate(source_sentence, Source_Field, Target_Field, seq2seq)
# Drop the trailing token only when it really is the end marker; decoding can
# also stop at the length cap, in which case the last token is a real word.
predicted_tokens = translation
if predicted_tokens and predicted_tokens[-1] == Target_Field.eos_token:
    predicted_tokens = predicted_tokens[:-1]
print("Predicted Translation: ", ' '.join(predicted_tokens))
print("Actual Translation: ", ' '.join(target_sentence))


In [None]:
def Calculate_BLEU(data, src_field, trg_field, model):
    """Translate every example in `data` and score the results with `bleu_score`.

    Args:
        data: dataset exposing `.examples`, each with `.src` and `.trg` token lists.
        src_field: field used to numericalize the source sentences.
        trg_field: field used to decode predictions; its `eos_token` is
            stripped from the end of a prediction when present.
        model: model handed to `Translate`.

    Returns:
        The value returned by `bleu_score(candidates, references)`, with one
        reference per candidate. Examples whose translation raises are left out
        of the score and reported through a single warning.
    """
    import warnings

    trgs = []
    predicted_trgs = []
    skipped = 0
    first_error = None
    for example in data.examples:
        try:
            predicted_trg = Translate(example.src, src_field, trg_field, model)
        except Exception as error:
            # Stay best-effort, but no longer hide failures (or swallow
            # KeyboardInterrupt, as the bare `except` did).
            skipped += 1
            if first_error is None:
                first_error = error
            continue
        # Only remove the last token when it is the end marker; a translation
        # cut off at the length cap ends in a real word.
        if predicted_trg and predicted_trg[-1] == trg_field.eos_token:
            predicted_trg = predicted_trg[:-1]
        predicted_trgs.append(predicted_trg)
        trgs.append([example.trg])
    if skipped:
        warnings.warn(
            f"Calculate_BLEU skipped {skipped} example(s); first error: {first_error!r}"
        )
    return bleu_score(predicted_trgs, trgs)

In [None]:
# Score the model on the test split and print the result multiplied by 100
# with two decimals.
bleu_score_test = Calculate_BLEU(test_data, Source_Field, Target_Field, seq2seq)
print(f"BLEU score on Testing Data: {bleu_score_test*100:.2f}")