<a href="https://colab.research.google.com/github/Sghosh32/Neural-Machine-Translation/blob/main/Seq2Seq_with_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin torchtext to 0.6.0: this notebook imports Field / BucketIterator from
# torchtext.data and Multi30k from torchtext.datasets. The captured output
# shows this command replacing the preinstalled torchtext 0.12.0.
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 19.1 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed sentencepiece-0.1.96 torchtext-0.6.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

import numpy as np
import math
import random
import spacy
import time

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Notebook is running on", device)

Notebook is running on cuda


In [None]:
# Seed every random number generator the notebook uses with the same value.
SEED = 1111
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
!python -m spacy download de
!python -m space download en

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 8.5 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=83656985da670f1ac08fb7ad13dfb40931ab1d18f285152abb699a3435a49eb8
  Stored in directory: /tmp/pip-ephem-wheel-cache-wcwfo0xb/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m

In [None]:
# Load the spaCy pipelines: German first, then English.
de_model, en_model = (spacy.load(language_code) for language_code in ('de', 'en'))

In [None]:
def de_tokenizer(sentence):
    """Split a sentence into a list of token strings using ``de_model``'s tokenizer."""
    pieces = []
    for piece in de_model.tokenizer(sentence):
        pieces.append(piece.text)
    return pieces

def en_tokenizer(sentence):
    """Split a sentence into a list of token strings using ``en_model``'s tokenizer."""
    pieces = []
    for piece in en_model.tokenizer(sentence):
        pieces.append(piece.text)
    return pieces

In [None]:
# Source (German) field: lower-cased, wrapped in start/end marker tokens, and
# configured with include_lengths = True (batches unpack as `source, source_length`).
Source_Field = Field(
    tokenize = de_tokenizer,
    init_token = 'src_sos',
    eos_token = 'src_eos',
    lower = True,
    include_lengths = True,
)
# Target (English) field: same marker tokens and lower-casing, without lengths.
Target_Field = Field(
    tokenize = en_tokenizer,
    init_token = 'src_sos',
    eos_token = 'src_eos',
    lower = True,
)

In [None]:
def ipTensor(sentence, source_field):
    """Convert one source sentence into a column tensor of vocabulary indices.

    Args:
        sentence: Either a list of token strings or a raw string. A raw
            string is tokenized with ``de_tokenizer``.
        source_field: Object providing ``init_token``, ``eos_token`` and
            ``vocab.stoi`` (token -> index mapping).

    Returns:
        A ``torch.long`` tensor of shape ``(sequence_length, 1)`` on ``device``
        holding the indices of ``init_token``, the sentence tokens and
        ``eos_token``, in that order.
    """
    raw_tokens = sentence if isinstance(sentence, list) else de_tokenizer(sentence)
    # Lower-case in BOTH branches. Previously only pre-tokenized input was
    # lower-cased, so raw strings kept their capitalisation and failed to match
    # the entries of a vocabulary built by a Field with lower = True.
    tokens = [source_field.init_token] + [token.lower() for token in raw_tokens] + [source_field.eos_token]
    indices = [source_field.vocab.stoi[token] for token in tokens]
    return torch.tensor(indices, dtype = torch.long, device = device).view(len(tokens), 1)

In [None]:
# Load the Multi30k German -> English splits, parsed with the two fields.
training_data, validation_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'),
    fields = (Source_Field, Target_Field),
)
# Build both vocabularies from the training split only (min_freq = 2).
for _field in (Source_Field, Target_Field):
    _field.build_vocab(training_data, min_freq = 2)
print(f"Source vocab size: {len(Source_Field.vocab)} | Target vocab size: {len(Target_Field.vocab)}")

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 611kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 174kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 166kB/s]


Source vocab size: 7855 | Target vocab size: 5893


In [None]:
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Embeds the source token indices, applies dropout, runs a bidirectional GRU
    and projects the two final directional states into one initial hidden
    state sized for the decoder.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, encoder_dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(encoder_dropout)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional = True)
        # Both directions are concatenated before the projection, hence the factor 2.
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)

    def forward(self, input):
        """Encode a batch of source sequences.

        Args:
            input: Tensor of token indices; the GRU treats the first axis as
                time and the second as batch.

        Returns:
            ``(output, hidden)``: the GRU outputs for every time step, and the
            tanh-squashed projection of the concatenated final states with
            shape ``(batch, decoder_hidden_dim)``.
        """
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        output, final_states = self.rnn(embedded)
        # The last two rows of the final-state tensor are the two directions.
        both_directions = torch.cat((final_states[-2], final_states[-1]), dim = 1)
        hidden = torch.tanh(self.fc(both_directions))
        return output, hidden

In [None]:
class Attention(nn.Module):
    """Concatenation-based attention over the encoder outputs.

    Each source position is scored by feeding the decoder hidden state joined
    with that position's encoder output through a linear layer, ``tanh`` and a
    bias-free projection to a scalar; scores are normalised with softmax.
    """

    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attention = nn.Linear(2 * encoder_hidden_dim + decoder_hidden_dim, decoder_hidden_dim)
        self.v = nn.Linear(decoder_hidden_dim, 1, bias = False)

    def forward(self, hidden, encoder_output):
        """Return attention weights of shape ``(batch, source_length)``.

        Args:
            hidden: Decoder hidden state, ``(batch, decoder_hidden_dim)``.
            encoder_output: Encoder outputs,
                ``(source_length, batch, 2 * encoder_hidden_dim)``.
        """
        steps = encoder_output.size(0)
        # Batch-first view of the encoder outputs: (batch, source_length, features).
        keys = encoder_output.transpose(0, 1)
        # Copy the hidden state once per source position: (batch, source_length, dec_hidden).
        query = hidden.unsqueeze(1).repeat(1, steps, 1)
        energy = torch.tanh(self.attention(torch.cat((query, keys), dim = 2)))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim = 1)

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    ``output_dim`` exposes the target vocabulary size to callers.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, decoder_dropout, attention):
        super().__init__()
        self.output_dim = vocab_size
        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(decoder_dropout)
        # Width of the attention-weighted encoder summary (two directions).
        context_dim = 2 * encoder_hidden_dim
        self.rnn = nn.GRU(context_dim + embedding_dim, decoder_hidden_dim)
        self.fc = nn.Linear(context_dim + decoder_hidden_dim + embedding_dim, vocab_size)

    def forward(self, input, hidden, encoder_output):
        """Run one decoding step.

        Args:
            input: 1-D tensor with the previous token index of each batch item.
            hidden: Previous decoder hidden state, ``(batch, decoder_hidden_dim)``.
            encoder_output: Encoder outputs,
                ``(source_length, batch, 2 * encoder_hidden_dim)``.

        Returns:
            ``(prediction, hidden)``: vocabulary scores ``(batch, vocab_size)``
            and the updated hidden state ``(batch, decoder_hidden_dim)``.
        """
        # A leading time axis of length 1 turns the token ids into a one-step sequence.
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))
        weights = self.attention(hidden, encoder_output).unsqueeze(1)
        # Weighted sum of encoder outputs, moved back to time-first layout.
        context = torch.bmm(weights, encoder_output.transpose(0, 1)).transpose(0, 1)
        output, next_hidden = self.rnn(torch.cat((embedded, context), dim = 2), hidden.unsqueeze(0))
        features = torch.cat((output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim = 1)
        return self.fc(features), next_hidden.squeeze(0)

In [None]:
class Seq2Seq(nn.Module):
    """Wires an encoder and a step-wise decoder into one translation model."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio = 0.5):
        """Decode ``target`` step by step and collect the per-step scores.

        Args:
            source: Source token indices, time-first (``source.shape[1]`` is
                the batch size).
            target: Target token indices, time-first; row 0 is fed to the
                decoder as its first input.
            teacher_forcing_ratio: Probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor ``(target_length - 1, batch, vocab)`` with the decoder
            scores for steps ``1 .. target_length - 1``.
        """
        batch_size, target_length = source.shape[1], target.shape[0]
        outputs = torch.zeros(target_length, batch_size, self.decoder.output_dim, device = self.device)
        encoder_output, hidden = self.encoder(source)
        input = target[0, :]
        for t in range(1, target_length):
            step_scores, hidden = self.decoder(input, hidden, encoder_output)
            outputs[t] = step_scores
            use_teacher = random.random() < teacher_forcing_ratio
            if use_teacher:
                input = target[t]
            else:
                input = step_scores.argmax(1)
        # Row 0 was never written to, so it is dropped.
        return outputs[1:]

In [None]:
def Train(iterator, model, criterion, optimizer, clip = 1):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        iterator: Sized iterable of batches; ``batch.src`` unpacks into two
            values (the source tensor comes first) and ``batch.trg`` is the
            target tensor.
        model: Called as ``model(source, target)``.
        criterion: Loss applied to the flattened scores and targets.
        optimizer: Optimizer stepped once per batch.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        model.zero_grad()
        source, _ = batch.src
        target = batch.trg
        scores = model(source, target)
        flat_scores = scores.view(-1, scores.shape[-1])
        # The first target row is skipped to line up with the model's output.
        flat_targets = target[1:].view(-1).to(device)
        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(iterator)

In [None]:
def Evaluate(iterator, model, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    The model is put in eval mode, gradients are disabled, and the model is
    called with a teacher-forcing ratio of 0.

    Args:
        iterator: Sized iterable of batches; ``batch.src`` unpacks into two
            values (the source tensor comes first) and ``batch.trg`` is the
            target tensor.
        model: Called as ``model(source, target, 0)``.
        criterion: Loss applied to the flattened scores and targets.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            source, _ = batch.src
            target = batch.trg
            scores = model(source, target, 0)
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_targets = target[1:].view(-1).to(device)
            running_loss += criterion(flat_scores, flat_targets).item()
    return running_loss / len(iterator)

In [None]:
def Epoch_Time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [None]:
BATCH_SIZE = 128
# One BucketIterator per split, all sharing the same batch size and device.
training_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (training_data, validation_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
)

In [None]:
# Report how many sentence pairs each split contains.
for _label, _split in (
    ("Number of Training sentences: ", training_data),
    ("Number of Validation sentences: ", validation_data),
    ("Number of Test sentences: ", test_data),
):
    print(_label, len(_split.examples))

Number of Training sentences:  29000
Number of Validation sentences:  1014
Number of Test sentences:  1000


In [None]:
# Optimisation settings. CLIP is presumably the gradient-norm threshold meant
# for Train(clip = ...) - NOTE(review): confirm the training cell passes it.
CLIP = 1
NUM_EPOCHS = 1

# Model sizes. HIDDEN_DIM is passed as both the encoder and decoder hidden size
# when the model is built. NUM_LAYERS is not referenced by the cells visible here.
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

# Vocabulary sizes come from the vocabularies built on the training split.
SOURCE_VOCAB_SIZE, TARGET_VOCAB_SIZE = len(Source_Field.vocab), len(Target_Field.vocab)

# Dropout probabilities for the encoder and decoder.
ENCODER_DROPOUT = DECODER_DROPOUT = 0.5

In [None]:
# Vocabulary index of the padding token in each field.
source_padding_index, target_padding_index = (
    field.vocab.stoi[field.pad_token] for field in (Source_Field, Target_Field)
)
# Target positions holding the padding index are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index = target_padding_index)

In [None]:
def initialize_weights(m):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); all other parameters are set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean = 0, std = 0.01)

In [None]:
# Build the model. HIDDEN_DIM is used as both the encoder and the decoder hidden size.
attention = Attention(HIDDEN_DIM, HIDDEN_DIM)
encoder = Encoder(
    SOURCE_VOCAB_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    HIDDEN_DIM,
    ENCODER_DROPOUT,
).to(device)
# The attention module is handed to the decoder, which stores it as an attribute.
decoder = Decoder(
    TARGET_VOCAB_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    HIDDEN_DIM,
    DECODER_DROPOUT,
    attention,
).to(device)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
# Adam with its default settings over every model parameter.
optimizer = optim.Adam(seq2seq.parameters())

In [None]:
# Re-initialise the model's parameters with initialize_weights. The call is the
# cell's final expression, so the notebook displays the model it returns.
seq2seq.apply(initialize_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attention): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (rnn): GRU(1280, 512)
    (fc): Linear(in_features=1792, out_features=5893, bias=True)
  )
)

In [None]:
# Train for NUM_EPOCHS epochs, validating after each one.
# No eval()/train() call is needed here: Train() and Evaluate() switch the
# model into the right mode themselves. (A stray seq2seq.eval() used to sit
# here and was immediately undone by Train().)
best_valid_loss = float('inf')
for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    # Use the CLIP constant from the configuration cell rather than a hard-coded
    # literal, so that changing CLIP actually changes the clipping threshold.
    train_loss = Train(training_iterator, seq2seq, criterion, optimizer, clip = CLIP)
    valid_loss = Evaluate(validation_iterator, seq2seq, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = Epoch_Time(start_time, end_time)

    # Track the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 19s
	Train Loss: 5.038 | Train PPL: 154.209
	 Val. Loss: 5.124 |  Val. PPL: 168.045


In [None]:
def Translate(source_sentence, source_field, target_field, model):
    """Greedily decode a translation for a single source sentence.

    Args:
        source_sentence: Token list or raw string; converted by ``ipTensor``.
        source_field: Field describing the source vocabulary (passed to
            ``ipTensor``).
        target_field: Object providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos`` for the target language.
        model: Module exposing ``encoder`` and ``decoder`` sub-modules with the
            call signatures of ``Encoder.forward`` / ``Decoder.forward``.

    Returns:
        List of predicted target tokens without the start token. The last
        element is the end token when the decoder produced one; when the
        length cap (4x the source tensor length) is reached first, the list
        does not end with the end token.
    """
    ip_tensor = ipTensor(source_sentence, source_field)
    maximum_length = 4 * ip_tensor.shape[0]
    sos_id = target_field.vocab.stoi[target_field.init_token]
    eos_id = target_field.vocab.stoi[target_field.eos_token]
    predicts = [sos_id]

    # Decode in eval mode so dropout does not randomise the translation, and
    # restore the caller's train/eval mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            encoder_output, decoder_states = model.encoder(ip_tensor)
            # At most maximum_length - 1 tokens are generated after the start token.
            for _ in range(maximum_length - 1):
                # The decoder expects a 1-D batch of token ids (batch size 1
                # here); it adds the time axis itself.
                step_input = torch.tensor([predicts[-1]], dtype = torch.long, device = ip_tensor.device)
                # The returned hidden state is already (batch, hidden) and is
                # fed back unchanged.
                output, decoder_states = model.decoder(step_input, decoder_states, encoder_output)
                predicts.append(output.argmax(-1).item())
                if predicts[-1] == eos_id:
                    break
    finally:
        model.train(was_training)
    return [target_field.vocab.itos[token_id] for token_id in predicts[1:]]


In [None]:
# Pick a random test example and compare the model's output with the reference.
ind = int(random.random() * len(test_data.examples))
example = test_data.examples[ind]
source_sentence, target_sentence = example.src, example.trg
print("German: ", ' '.join(source_sentence))
translation = Translate(source_sentence, Source_Field, Target_Field, seq2seq)
# The final predicted token is left out of the printed sentence.
print("English: ", ' '.join(translation[:-1]))
print("Actual Translation: ", ' '.join(target_sentence))

In [None]:
def Calculate_BLEU(data, source_field, target_field, model):
    """Translate every example in ``data`` and score the result with ``bleu_score``.

    Args:
        data: Dataset whose ``examples`` each expose ``src`` and ``trg`` token
            lists.
        source_field: Source-language field, forwarded to ``Translate``.
        target_field: Target-language field, forwarded to ``Translate``; its
            ``eos_token`` is stripped from the end of each prediction.
        model: Model forwarded to ``Translate``.

    Returns:
        The value of ``bleu_score(predictions, references)``, where every
        prediction has a single reference.
    """
    import warnings

    trgs = []
    predicted_trgs = []
    skipped = 0
    for example in data.examples:
        try:
            # Translate returns one list of tokens (not a pair), so it must not
            # be tuple-unpacked.
            predicted_trg = Translate(example.src, source_field, target_field, model)
        except RuntimeError:
            # Best effort: CUDA occasionally raises "device-side assert
            # triggered". Skip that example instead of aborting, but only for
            # runtime errors - programming errors must stay visible.
            skipped += 1
            continue
        # Strip the end token only when it is really there; a prediction cut
        # off by the length cap keeps all of its tokens.
        if predicted_trg and predicted_trg[-1] == target_field.eos_token:
            predicted_trg = predicted_trg[:-1]
        predicted_trgs.append(predicted_trg)
        trgs.append([example.trg])
    if skipped:
        warnings.warn(f"Calculate_BLEU skipped {skipped} of {len(data.examples)} examples after runtime errors")
    return bleu_score(predicted_trgs, trgs)