<a href="https://colab.research.google.com/github/Sghosh32/Neural-Machine-Translation/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin torchtext to 0.8.0: the import cell below relies on Field, BucketIterator and
# Multi30k.splits from torchtext.data / torchtext.datasets.
# NOTE(review): presumably those names are not importable from the same paths in the
# preinstalled torchtext 0.12.0 that this command replaces - confirm before bumping the pin.
!pip install torchtext==0.8.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 32.1 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed torchtext-0.8.0


In [None]:
import math
import numpy as np
import random
import spacy
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

In [None]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
# Later cells pass this object to the iterators and to the model's .to() calls.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Notebook is running on", device)

Notebook is running on cuda


In [None]:
# Seed every random number generator used by the notebook with the same value so
# that repeated runs start from the same RNG state.
SEED = 4444

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [None]:
!python -m spacy download de
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 510 kB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spac

In [None]:
# Load the German and English spaCy pipelines; the tokenizer helpers defined in the
# next cell read these two module-level objects.
de_model, en_model = (
    spacy.load(pipeline_name)
    for pipeline_name in ('de_core_news_sm', 'en_core_web_sm')
)

In [None]:
def de_tokenizer(sentence):
    """Tokenize ``sentence`` with the German pipeline's tokenizer.

    Only ``de_model.tokenizer`` is invoked (not the full pipeline).

    Returns:
        list of str: the ``.text`` of each token, in order.
    """
    token_texts = []
    for token in de_model.tokenizer(sentence):
        token_texts.append(token.text)
    return token_texts

def en_tokenizer(sentence):
    """Tokenize ``sentence`` with the English pipeline's tokenizer.

    Only ``en_model.tokenizer`` is invoked (not the full pipeline).

    Returns:
        list of str: the ``.text`` of each token, in order.
    """
    token_texts = []
    for token in en_model.tokenizer(sentence):
        token_texts.append(token.text)
    return token_texts

In [None]:
# Source-side field: tokenized with de_tokenizer, lower-cased, wrapped in dedicated
# start/end markers, and batched with the batch dimension first.
Source_Field = Field(
    tokenize=de_tokenizer,
    lower=True,
    init_token='<src_sos>',
    eos_token='<src_eos>',
    batch_first=True,
)
# Target-side field: same settings, but with en_tokenizer and its own start/end markers.
Target_Field = Field(
    tokenize=en_tokenizer,
    lower=True,
    init_token='<trg_sos>',
    eos_token='<trg_eos>',
    batch_first=True,
)



In [None]:
# Load the three Multi30k splits, pairing the '.de' files with Source_Field and the
# '.en' files with Target_Field; 'data' is used as the dataset root directory.
train_data, valid_data, test_data = Multi30k.splits(
    root='data',
    exts=('.de', '.en'),
    fields=(Source_Field, Target_Field),
)



In [None]:
# Build both vocabularies from the training split only, forwarding min_freq = 2 to
# build_vocab for each field, then report the resulting sizes.
for _field in (Source_Field, Target_Field):
    _field.build_vocab(train_data, min_freq=2)
print(f"Source vocab size: {len(Source_Field.vocab)} | Target vocab size: {len(Target_Field.vocab)}")

Source vocab size: 7853 | Target vocab size: 5893


In [None]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by their own linear layer, split
    into ``n_heads`` heads of size ``hidden_dimension // n_heads``, attended per
    head, merged back and passed through a final output projection.

    Args:
        hidden_dimension: size of the last axis of the inputs and of the output.
        n_heads: number of attention heads; must be positive and divide
            ``hidden_dimension`` exactly.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide
            ``hidden_dimension``. Without this check a bad configuration can
            silently reshape the projections into the wrong heads.
    """

    def __init__(self, hidden_dimension, n_heads, dropout):
        super(MultiHeadAttentionLayer, self).__init__()
        if n_heads <= 0 or hidden_dimension % n_heads != 0:
            raise ValueError(
                f"hidden_dimension ({hidden_dimension}) must be divisible by "
                f"a positive n_heads ({n_heads})"
            )
        self.hidden_dimension = hidden_dimension
        self.n_heads = n_heads
        self.head_dimension = hidden_dimension // n_heads
        self.fc_Q = nn.Linear(hidden_dimension, hidden_dimension)
        self.fc_K = nn.Linear(hidden_dimension, hidden_dimension)
        self.fc_V = nn.Linear(hidden_dimension, hidden_dimension)
        self.fc_O = nn.Linear(hidden_dimension, hidden_dimension)
        # Dot products are divided by sqrt(head size) before the softmax.
        self.scale = math.sqrt(self.head_dimension)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, length, hidden) into (batch, n_heads, length, head_dimension)."""
        return projected.view(batch_size, -1, self.n_heads, self.head_dimension).permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query, key, value: tensors of shape (batch, length, hidden_dimension);
                ``key`` and ``value`` share the same length.
            mask: optional tensor broadcastable to
                (batch, n_heads, query_length, key_length). Positions where
                ``mask == 0`` receive a score of -1e10 before the softmax.

        Returns:
            Tensor of shape (batch, query_length, hidden_dimension).
        """
        batch_size = query.shape[0]
        Q = self._split_heads(self.fc_Q(query), batch_size)
        K = self._split_heads(self.fc_K(key), batch_size)
        V = self._split_heads(self.fc_V(value), batch_size)
        # (batch, heads, q_len, head_dim) @ (batch, heads, head_dim, k_len)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(energy, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        # Merge the heads back: (batch, q_len, heads, head_dim) -> (batch, q_len, hidden)
        x = x.permute(0, 2, 1, 3).contiguous()
        output = self.fc_O(x.view(batch_size, -1, self.hidden_dimension))
        return output

In [None]:
class PositionFeedForwardLayer(nn.Module):
    """Feed-forward block applied to the last axis: Linear -> ReLU -> Dropout -> Linear.

    Args:
        hidden_dimension: input and output feature size.
        pff_dimension: feature size of the intermediate layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, hidden_dimension, pff_dimension, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hidden_dimension, pff_dimension)
        self.fc_2 = nn.Linear(pff_dimension, hidden_dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        """Return ``fc_2(dropout(relu(fc_1(input))))``."""
        expanded = self.fc_1(input).relu()
        regularised = self.dropout(expanded)
        return self.fc_2(regularised)

In [None]:
class Encoder_Layer(nn.Module):
    """One encoder block: self-attention and a feed-forward sub-layer.

    Each sub-layer output goes through dropout, is added to its input (residual
    connection) and is then layer-normalised, mirroring ``Decoder_Layer``.

    Args:
        hidden_dimension: feature size of the incoming and outgoing states.
        n_heads: number of self-attention heads.
        pff_dimension: inner size of the feed-forward sub-layer.
        dropout: dropout probability used inside the sub-layers and on the
            residual branches.
    """

    def __init__(self, hidden_dimension, n_heads, pff_dimension, dropout):
        super(Encoder_Layer, self).__init__()
        self.self_attention = MultiHeadAttentionLayer(hidden_dimension, n_heads, dropout)
        self.pff = PositionFeedForwardLayer(hidden_dimension, pff_dimension, dropout)
        self.attention_normalized = nn.LayerNorm(hidden_dimension)
        self.pff_normalized = nn.LayerNorm(hidden_dimension)
        # Must be an nn.Dropout *instance*: previously the class itself was stored,
        # so no dropout module was registered and none was applied in forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, source, source_mask):
        """Run the block on ``source`` using ``source_mask`` for self-attention.

        Returns:
            Tensor with the same shape as ``source``.
        """
        attention_output = self.self_attention(source, source, source, source_mask)
        inter_output = self.attention_normalized(self.dropout(attention_output) + source)
        pff_output = self.pff(inter_output)
        output = self.pff_normalized(self.dropout(pff_output) + inter_output)
        return output

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token + learned positional embeddings, then a stack of Encoder_Layer.

    Args:
        token_vocab_size: number of rows in the token embedding table.
        positional_vocab_size: number of rows in the positional embedding table,
            i.e. the longest sequence the encoder can embed.
        hidden_dimension: embedding and hidden-state size.
        encoder_heads: attention heads per layer.
        encoder_pff_dimension: inner size of each layer's feed-forward block.
        num_layers: number of stacked Encoder_Layer blocks.
        encoder_dropout: dropout probability for the embeddings and the layers.
    """

    def __init__(self, token_vocab_size, positional_vocab_size, hidden_dimension, encoder_heads, encoder_pff_dimension, num_layers, encoder_dropout):
        super(Encoder, self).__init__()
        self.token_embedding = nn.Embedding(token_vocab_size, hidden_dimension)
        self.positional_embedding = nn.Embedding(positional_vocab_size, hidden_dimension)
        self.encoder_layers = nn.ModuleList([
            Encoder_Layer(hidden_dimension, encoder_heads, encoder_pff_dimension, encoder_dropout)
            for _ in range(num_layers)
        ])
        # Token embeddings are multiplied by sqrt(hidden_dimension) before the
        # positional embeddings are added.
        self.scale = math.sqrt(hidden_dimension)
        self.dropout = nn.Dropout(encoder_dropout)

    def forward(self, source, source_mask):
        """Encode a batch of token indices.

        Args:
            source: integer tensor of shape (batch, source_length).
            source_mask: mask forwarded unchanged to every encoder layer.

        Returns:
            Tensor of shape (batch, source_length, hidden_dimension).

        Raises:
            ValueError: if ``source_length`` exceeds the positional embedding
                table (an out-of-range lookup would otherwise fail inside the
                embedding layer with a far less readable error).
        """
        batch_size = source.shape[0]
        source_length = source.shape[1]
        max_positions = self.positional_embedding.num_embeddings
        if source_length > max_positions:
            raise ValueError(
                f"source length {source_length} exceeds the positional embedding size {max_positions}"
            )
        token_embedding = self.token_embedding(source)
        # Build the position indices on the input's own device instead of a
        # notebook-level global, so the module works wherever it is placed.
        positional_tensor = torch.arange(0, source_length, device=source.device).unsqueeze(0).repeat(batch_size, 1)
        positional_embedding = self.positional_embedding(positional_tensor)
        encoder_state = self.dropout(token_embedding * self.scale + positional_embedding)
        for encoder_layer in self.encoder_layers:
            encoder_state = encoder_layer(encoder_state, source_mask)
        return encoder_state

In [None]:
class Decoder_Layer(nn.Module):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    After each of the three sub-layers the result goes through dropout, is added
    to the sub-layer's input and is layer-normalised.

    Args:
        hidden_dimension: feature size of the incoming and outgoing states.
        n_heads: number of heads in both attention sub-layers.
        pff_dimension: inner size of the feed-forward sub-layer.
        dropout: dropout probability shared by all sub-layers and residual branches.
    """

    def __init__(self, hidden_dimension, n_heads, pff_dimension, dropout):
        super().__init__()
        # Sub-modules are created in the same order as before so that parameter
        # registration order is unchanged.
        self.self_attention = MultiHeadAttentionLayer(hidden_dimension, n_heads, dropout)
        self.cross_attention = MultiHeadAttentionLayer(hidden_dimension, n_heads, dropout)
        self.pff = PositionFeedForwardLayer(hidden_dimension, pff_dimension, dropout)
        self.attention_norm1 = nn.LayerNorm(hidden_dimension)
        self.attention_norm2 = nn.LayerNorm(hidden_dimension)
        self.pff_normalized = nn.LayerNorm(hidden_dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, target, target_mask, encoder_output, source_mask):
        """Run the block.

        Args:
            target: decoder states entering the block.
            target_mask: mask for the self-attention over ``target``.
            encoder_output: encoder states used as keys and values for cross-attention.
            source_mask: mask for the cross-attention over ``encoder_output``.

        Returns:
            Tensor with the same shape as ``target``.
        """
        state = target

        # 1) self-attention over the target positions
        attended = self.self_attention(state, state, state, target_mask)
        state = self.attention_norm1(self.dropout(attended) + state)

        # 2) attention from target positions over the encoder output
        attended = self.cross_attention(state, encoder_output, encoder_output, source_mask)
        state = self.attention_norm2(self.dropout(attended) + state)

        # 3) position-wise feed-forward
        transformed = self.pff(state)
        return self.pff_normalized(self.dropout(transformed) + state)

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: token + learned positional embeddings, a stack of Decoder_Layer, and a vocabulary projection.

    Args:
        token_vocab_size: size of the target vocabulary (embedding rows and
            output logits).
        positional_vocab_size: number of rows in the positional embedding table,
            i.e. the longest target prefix the decoder can embed.
        hidden_dimension: embedding and hidden-state size.
        decoder_heads: attention heads per layer.
        decoder_pff_dimension: inner size of each layer's feed-forward block.
        num_layers: number of stacked Decoder_Layer blocks.
        decoder_dropout: dropout probability for the embeddings and the layers.
    """

    def __init__(self, token_vocab_size, positional_vocab_size, hidden_dimension, decoder_heads, decoder_pff_dimension, num_layers, decoder_dropout):
        super(Decoder, self).__init__()
        self.token_embedding = nn.Embedding(token_vocab_size, hidden_dimension)
        self.positional_embedding = nn.Embedding(positional_vocab_size, hidden_dimension)
        self.decoder_layers = nn.ModuleList([
            Decoder_Layer(hidden_dimension, decoder_heads, decoder_pff_dimension, decoder_dropout)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(hidden_dimension, token_vocab_size)
        # Token embeddings are multiplied by sqrt(hidden_dimension) before the
        # positional embeddings are added.
        self.scale = math.sqrt(hidden_dimension)
        self.dropout = nn.Dropout(decoder_dropout)

    def forward(self, target, target_mask, encoder_output, source_mask):
        """Decode a batch of target prefixes against the encoder output.

        Args:
            target: integer tensor of shape (batch, target_length).
            target_mask: mask forwarded to each layer's self-attention.
            encoder_output: encoder states forwarded to each layer's cross-attention.
            source_mask: mask forwarded to each layer's cross-attention.

        Returns:
            Logits of shape (batch, target_length, token_vocab_size).

        Raises:
            ValueError: if ``target_length`` exceeds the positional embedding
                table (an out-of-range lookup would otherwise fail inside the
                embedding layer with a far less readable error).
        """
        batch_size = target.shape[0]
        target_length = target.shape[1]
        max_positions = self.positional_embedding.num_embeddings
        if target_length > max_positions:
            raise ValueError(
                f"target length {target_length} exceeds the positional embedding size {max_positions}"
            )
        token_embedding = self.token_embedding(target)
        # Build the position indices on the input's own device instead of a
        # notebook-level global, so the module works wherever it is placed.
        positional_tensor = torch.arange(0, target_length, device=target.device).unsqueeze(0).repeat(batch_size, 1)
        positional_embedding = self.positional_embedding(positional_tensor)
        decoder_state = self.dropout(token_embedding * self.scale + positional_embedding)
        for decoder_layer in self.decoder_layers:
            decoder_state = decoder_layer(decoder_state, target_mask, encoder_output, source_mask)
        output = self.fc(decoder_state)
        return output

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder wrapper that builds the attention masks and chains both halves.

    Args:
        encoder: module called as ``encoder(source, source_mask)``.
        decoder: module called as
            ``decoder(target, target_mask, encoder_output, source_mask)``.
        source_padding_index: index of the padding token in source batches.
        target_padding_index: index of the padding token in target batches.
    """

    def __init__(self, encoder, decoder, source_padding_index, target_padding_index):
        super(Transformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_padding_index = source_padding_index
        self.target_padding_index = target_padding_index

    def make_source_mask(self, src):
        """Return a boolean mask of shape (batch, 1, 1, source_length), True at non-padding tokens.

        The mask is derived from ``src`` and therefore lives on the same device
        as ``src``; no notebook-level device global is consulted.
        """
        source_mask = (src != self.source_padding_index).unsqueeze(1).unsqueeze(2)
        return source_mask

    def make_target_mask(self, trg):
        """Return a boolean mask of shape (batch, 1, target_length, target_length).

        An entry is True when the key position is not padding AND is not later
        than the query position (lower-triangular constraint).
        """
        trg_length = trg.shape[1]
        pad_mask = (trg != self.target_padding_index).unsqueeze(1).unsqueeze(2)
        # Lower-triangular matrix created on the same device as ``trg``.
        sub_mask = torch.tril(torch.ones((trg_length, trg_length), device = trg.device)).bool()
        target_mask = pad_mask & sub_mask
        return target_mask

    def forward(self, source, target):
        """Encode ``source`` and decode ``target`` against it; returns the decoder's output."""
        source_mask = self.make_source_mask(source)
        target_mask = self.make_target_mask(target)
        encoder_output = self.encoder(source, source_mask)
        output = self.decoder(target, target_mask, encoder_output, source_mask)
        return output

In [None]:
def Train(iterator, model, criterion, optimizer, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``
            tensors with the batch dimension first.
        model: module called as ``model(source, target_prefix)``, returning
            logits whose last axis is the vocabulary.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.
        optimizer: optimizer over the model's parameters.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        float: sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        optimizer.zero_grad()
        source = batch.src
        target = batch.trg
        # Feed every target token except the last; the model is scored on
        # predicting every target token except the first.
        outputs = model(source, target[:, :-1])
        outputs = outputs.reshape(-1, outputs.shape[-1])
        # Follow the logits' device rather than a notebook-level global.
        targets = target[:, 1:].reshape(-1).to(outputs.device)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += batch_loss.item()
    return epoch_loss / len(iterator)

In [None]:
def Evaluate(iterator, model, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is switched to eval mode and all forward passes run under
    ``torch.no_grad()``.

    Args:
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``
            tensors with the batch dimension first.
        model: module called as ``model(source, target_prefix)``, returning
            logits whose last axis is the vocabulary.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.

    Returns:
        float: sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in iterator:
            source = batch.src
            target = batch.trg
            # Same input/target shift as in training.
            outputs = model(source, target[:, :-1])
            outputs = outputs.reshape(-1, outputs.shape[-1])
            # Follow the logits' device rather than a notebook-level global.
            targets = target[:, 1:].reshape(-1).to(outputs.device)
            eval_loss += criterion(outputs, targets).item()
    return eval_loss / len(iterator)

In [None]:
# Number of sentence pairs per batch, shared by all three splits.
BATCH_SIZE = 128

# One BucketIterator per split; `device` is handed to the iterators here.
# NOTE(review): presumably this makes the iterators yield batches already on that device - confirm.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)



In [None]:
# Report how many examples each split contains (same order and labels as before).
_split_reports = (
    ("Number of Training sequences: ", train_data),
    ("Number of Test sequences: ", test_data),
    ("Number of Validation sequences: ", valid_data),
)
for _label, _split in _split_reports:
    print(_label, len(_split.examples))

Number of Training sequences:  29000
Number of Test sequences:  1000
Number of Validation sequences:  1014


In [None]:
# --- Optimisation ---------------------------------------------------------
NUM_EPOCHS = 10      # passes over the training iterator
LR = 0.0005          # learning rate given to the optimizer
CLIP = 1             # gradient-norm clipping threshold given to Train()

# --- Vocabulary and sequence sizes ----------------------------------------
SOURCE_VOCAB_SIZE = len(Source_Field.vocab)
TARGET_VOCAB_SIZE = len(Target_Field.vocab)
MAX_LENGTH = 100     # rows in the positional embedding tables

# --- Width shared by encoder and decoder ----------------------------------
HIDDEN_DIMENSION = 256

# --- Encoder --------------------------------------------------------------
ENCODER_NUM_LAYERS = 3
ENCODER_HEADS = 8
ENCODER_PFF_DIMENSION = 512
ENCODER_DROPOUT = 0.1

# --- Decoder --------------------------------------------------------------
DECODER_NUM_LAYERS = 3
DECODER_HEADS = 8
DECODER_PFF_DIMENSION = 512
DECODER_DROPOUT = 0.1

In [None]:
# Look up the padding token's index in each vocabulary; the target one is also
# given to the loss as ignore_index.
source_padding_index, target_padding_index = (
    field.vocab.stoi[field.pad_token] for field in (Source_Field, Target_Field)
)
criterion = nn.CrossEntropyLoss(ignore_index=target_padding_index)

In [None]:
# Assemble the model from the hyperparameters above, move it to `device`, and
# create the Adam optimizer over all of its parameters.
encoder = Encoder(
    token_vocab_size=SOURCE_VOCAB_SIZE,
    positional_vocab_size=MAX_LENGTH,
    hidden_dimension=HIDDEN_DIMENSION,
    encoder_heads=ENCODER_HEADS,
    encoder_pff_dimension=ENCODER_PFF_DIMENSION,
    num_layers=ENCODER_NUM_LAYERS,
    encoder_dropout=ENCODER_DROPOUT,
).to(device)
decoder = Decoder(
    token_vocab_size=TARGET_VOCAB_SIZE,
    positional_vocab_size=MAX_LENGTH,
    hidden_dimension=HIDDEN_DIMENSION,
    decoder_heads=DECODER_HEADS,
    decoder_pff_dimension=DECODER_PFF_DIMENSION,
    num_layers=DECODER_NUM_LAYERS,
    decoder_dropout=DECODER_DROPOUT,
).to(device)
transformer = Transformer(encoder, decoder, source_padding_index, target_padding_index).to(device)
optimizer = optim.Adam(transformer.parameters(), lr=LR)

In [None]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place when it is a tensor with more than one dimension.

    Intended for use with ``nn.Module.apply``. Modules without a ``weight``
    attribute, with ``weight`` set to ``None`` (e.g. a normalisation layer
    created without affine parameters) or with a 1-D weight are left untouched.
    """
    weight = getattr(m, 'weight', None)
    # The isinstance check also rejects None, which hasattr() alone let through.
    if isinstance(weight, torch.Tensor) and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

In [None]:
# Run initialize_weights over the model via nn.Module.apply.
# The call is the cell's last expression, so the notebook displays the returned module's repr.
transformer.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (token_embedding): Embedding(7853, 256)
    (positional_embedding): Embedding(100, 256)
    (encoder_layers): ModuleList(
      (0): Encoder_Layer(
        (self_attention): MultiHeadAttentionLayer(
          (fc_Q): Linear(in_features=256, out_features=256, bias=True)
          (fc_K): Linear(in_features=256, out_features=256, bias=True)
          (fc_V): Linear(in_features=256, out_features=256, bias=True)
          (fc_O): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (pff): PositionFeedForwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attention_normalized): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (pff_normalized): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      )

In [None]:
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that have ``requires_grad`` set."""
    trainable_total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total

# Report the model size; the ',' format spec inserts thousands separators.
print(f'The model has {count_parameters(transformer):,} trainable parameters.')

The model has 9,038,341 trainable parameters.


In [None]:
# Train for NUM_EPOCHS epochs, recording the training and validation loss of each
# epoch and remembering the losses of the epoch with the lowest validation loss.
print(f"Learning Rate: {LR}, Hidden Dimmensions: {HIDDEN_DIMENSION}")
train_losses = []
valid_losses = []
# prev_epoch is written below but not read in the visible cells; it is kept so
# that any cell relying on it keeps working.
prev_epoch = 1
# min_losses[0]: best validation loss so far; min_losses[1]: training loss of that epoch.
min_losses = [float('inf'), float('inf')]
# Report roughly ten times per run; max() avoids a modulo-by-zero when NUM_EPOCHS < 10.
report_every = max(1, NUM_EPOCHS // 10)
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = Train(train_iterator, transformer, criterion, optimizer, CLIP)
    train_losses.append(train_loss)
    # Validate on the validation split. The test split was used here before, which
    # made the reported "validation" loss identical to the final test loss.
    valid_loss = Evaluate(valid_iterator, transformer, criterion)
    valid_losses.append(valid_loss)
    if valid_loss < min_losses[0]:
        min_losses[0] = valid_loss
        min_losses[1] = train_loss
    if epoch % report_every == 0:
        prev_epoch = epoch + 1
        print(f"Training Loss: {train_loss:.4f} | Validation Loss: {valid_loss:.4f}")
        print(f"Training PPL: {math.exp(train_loss):.4f} | Validation PPL: {math.exp(valid_loss):.4f}")

Learning Rate: 0.0005, Hidden Dimmensions: 256




Training Loss: 4.2062 | Validation Loss: 2.9314
Training PPL: 67.1029 | Validation PPL: 18.7544
Training Loss: 2.7188 | Validation Loss: 2.2458
Training PPL: 15.1623 | Validation PPL: 9.4475
Training Loss: 2.1585 | Validation Loss: 1.9441
Training PPL: 8.6580 | Validation PPL: 6.9876
Training Loss: 1.8189 | Validation Loss: 1.7976
Training PPL: 6.1650 | Validation PPL: 6.0352
Training Loss: 1.5761 | Validation Loss: 1.7211
Training PPL: 4.8359 | Validation PPL: 5.5909
Training Loss: 1.3935 | Validation Loss: 1.6957
Training PPL: 4.0290 | Validation PPL: 5.4505
Training Loss: 1.2421 | Validation Loss: 1.6733
Training PPL: 3.4629 | Validation PPL: 5.3295
Training Loss: 1.1158 | Validation Loss: 1.6909
Training PPL: 3.0521 | Validation PPL: 5.4244
Training Loss: 1.0095 | Validation Loss: 1.6842
Training PPL: 2.7441 | Validation PPL: 5.3883
Training Loss: 0.9163 | Validation Loss: 1.7041
Training PPL: 2.4999 | Validation PPL: 5.4964


In [None]:
# Final evaluation on the test split: mean loss and its exponential (perplexity).
# Evaluate() already switches the model to eval mode, so the explicit call is redundant but harmless.
transformer.eval()
test_loss = Evaluate(test_iterator, transformer, criterion)
print(f"Test Loss: {test_loss:.4f} | Test PPL: {math.exp(test_loss):.4f}")

Test Loss: 1.7041 | Test PPL: 5.4964




In [None]:
def sentence_translation(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily translate one sentence with ``model``.

    Args:
        sentence: either a raw string or an already tokenized sequence of
            strings. A string is tokenized with ``src_field.tokenize`` (the
            tokenizer the field was built with), so no spaCy pipeline is
            reloaded from disk on every call.
        src_field: source field providing ``init_token``, ``eos_token``,
            ``tokenize`` and ``vocab.stoi``.
        trg_field: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        model: object exposing ``eval()``, ``make_source_mask``,
            ``make_target_mask``, ``encoder`` and ``decoder``.
        device: device on which the index tensors are created.
        max_len: maximum number of tokens to generate.

    Returns:
        list of str: the generated target tokens without the start token. The
        last element is ``trg_field.eos_token`` only when the model emitted it
        within ``max_len`` steps.
    """
    model.eval()

    if isinstance(sentence, str):
        raw_tokens = src_field.tokenize(sentence)
    else:
        raw_tokens = sentence
    tokens = [src_field.init_token] + [token.lower() for token in raw_tokens] + [src_field.eos_token]

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(0)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    # Looked up once instead of on every decoding step.
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]

    with torch.no_grad():
        # The source is encoded once; only the decoder is re-run as the prefix grows.
        src_mask = model.make_source_mask(src_tensor)
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            trg_tensor = torch.tensor(trg_indexes, dtype=torch.long, device=device).unsqueeze(0)
            trg_mask = model.make_target_mask(trg_tensor)
            output = model.decoder(trg_tensor, trg_mask, enc_src, src_mask)
            # Greedy choice: most likely token at the last position.
            pred_token = output.argmax(2)[:, -1].item()
            trg_indexes.append(pred_token)
            if pred_token == eos_index:
                break

    # Skip the start token at position 0.
    return [trg_field.vocab.itos[index] for index in trg_indexes[1:]]

In [None]:
# Translate one randomly chosen training example and show it next to its reference.
# randrange() yields an index in [0, len): the previous `ind + 1` lookup could run
# past the end of the list and could never select the first example.
ind = random.randrange(len(train_data.examples))
example = train_data.examples[ind]
source_sentence = example.src
target_sentence = example.trg
print("German Sentence: ", ' '.join(source_sentence))
translation = sentence_translation(source_sentence, Source_Field, Target_Field, transformer, device)
# Hide the end-of-sentence marker only when decoding actually produced one; when
# generation stops at the length limit the last token is a real word.
if translation and translation[-1] == Target_Field.eos_token:
    predicted_tokens = translation[:-1]
else:
    predicted_tokens = translation
print("Predicted Translation: ", ' '.join(predicted_tokens))
print("Actual Translation: ", ' '.join(target_sentence))

German Sentence:  ein kleiner junge in einem blauen t-shirt wirft einen football .
Predicted Translation:  a young boy in a blue shirt throwing a football .
Actual Translation:  a little boy at camp , in a blue shirt , throwing a football .


In [None]:
def Calculate_BLEU(data, source_field, target_field, model):
    """Compute the corpus BLEU score of the model's greedy translations over ``data``.

    Each example's ``src`` tokens are translated with ``sentence_translation``
    (on the notebook-level ``device``) and compared against its ``trg`` tokens
    as the single reference.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists.
        source_field: field passed to ``sentence_translation`` for the source side.
        target_field: field passed to ``sentence_translation`` for the target
            side; its ``eos_token`` is stripped from the predictions.
        model: translation model passed to ``sentence_translation``.

    Returns:
        The value returned by ``bleu_score`` for the collected predictions and references.
    """
    eos_token = target_field.eos_token
    targets = []
    predicted_targets = []
    for datum in data:
        predicted_target = sentence_translation(datum.src, source_field, target_field, model, device)
        # Strip the end-of-sentence marker only if it is there: when decoding stops
        # at the length limit the last token is a real word and must be scored.
        if predicted_target and predicted_target[-1] == eos_token:
            predicted_target = predicted_target[:-1]
        predicted_targets.append(predicted_target)
        targets.append([datum.trg])
    return bleu_score(predicted_targets, targets)

In [None]:
# BLEU of the model's translations on the test split; the score is multiplied by 100 for display.
bleu_score_test = Calculate_BLEU(test_data, Source_Field, Target_Field, transformer)
print(f"BLEU score on Testing Data: {bleu_score_test*100:.2f}")

BLEU score on Testing Data: 36.31
