<a href="https://colab.research.google.com/github/VedantDere0104/Neural-Machine-Translation/blob/main/NMT_With_Transformer_Multi30k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
####

In [None]:
import torch
from torch import nn
import torchtext
from torchtext.data import Field  , BucketIterator
from torchtext.datasets import Multi30k
import spacy
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
import nltk
from nltk.translate.bleu_score import corpus_bleu
import math

In [None]:
! python -m spacy download de

In [None]:
# Load the German pipeline first, then the English one; only their
# tokenizers are used by the helper functions defined below.
spacy_ger, spacy_eng = (spacy.load(model_name) for model_name in ('de', 'en'))

In [None]:
def tokenizer_ger(text):
  """Split ``text`` into a list of token strings with the ``spacy_ger`` tokenizer."""
  pieces = []
  for tok in spacy_ger.tokenizer(text):
    pieces.append(tok.text)
  return pieces
def tokenizer_eng(text):
  """Split ``text`` into a list of token strings with the ``spacy_eng`` tokenizer."""
  pieces = []
  for tok in spacy_eng.tokenizer(text):
    pieces.append(tok.text)
  return pieces
  

In [None]:
# Source-side field: tokenised with tokenizer_ger, every example wrapped in
# <SOS> ... <EOS>.
german = Field(
    tokenize=tokenizer_ger,
    init_token='<SOS>',
    eos_token='<EOS>',
)
# Target-side field: same sentence markers, tokenised with tokenizer_eng.
english = Field(
    tokenize=tokenizer_eng,
    init_token='<SOS>',
    eos_token='<EOS>',
)

In [None]:
# Each example exposes `.src` (read through the `german` field from the .de
# file) and `.trg` (read through the `english` field from the .en file).
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

In [None]:
# Build both vocabularies from the training split only, so the validation
# and test splits contribute no tokens.
german.build_vocab(train_data)
english.build_vocab(train_data)

In [None]:
class PositionalEncoder(nn.Module):
  """Fixed sinusoidal position table.

  The module owns a constant tensor ``pe`` of shape
  ``(1, max_seq_len, d_model)`` where, for position ``pos`` and pair index
  ``k``::

      pe[0, pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
      pe[0, pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

  ``forward`` does NOT add the table to its input. It returns the first
  ``x.size(1)`` rows of the table, and the caller is expected to add that to
  its embeddings.

  Args:
    d_model: size of one encoding vector (odd values are supported; the last
      column is then a sine without a cosine partner).
    max_seq_len: number of positions held in the table.
  """

  def __init__(self, d_model, max_seq_len = 80):
    super().__init__()
    self.d_model = d_model

    # (max_seq_len, 1) column of positions 0, 1, 2, ...
    position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
    # One frequency per (sin, cos) pair. The even column index already equals
    # 2k, so the exponent is simply column / d_model; both members of a pair
    # share the same frequency.
    pair_exponent = torch.arange(0, d_model, 2, dtype=torch.float) / d_model
    angle = position / (10000.0 ** pair_exponent)

    pe = torch.zeros(max_seq_len, d_model)
    pe[:, 0::2] = torch.sin(angle)
    # For an odd d_model there is one fewer cosine column than sine columns.
    pe[:, 1::2] = torch.cos(angle[:, : d_model // 2])

    # A buffer follows the module through .to(device) / .cuda(). It is kept
    # out of state_dict so checkpoints saved before this change still load.
    self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

  def forward(self, x):
    """Return the position table rows for ``x.size(1)`` positions.

    Args:
      x: any tensor with at least two dimensions; only ``x.size(1)`` and
        ``x.device`` are read, the values are ignored.

    Returns:
      Tensor of shape ``(1, x.size(1), d_model)`` on ``x``'s device, detached
      from autograd.

    Raises:
      ValueError: if ``x.size(1)`` exceeds the number of stored positions.
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(1):
      raise ValueError(
          f'sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}'
      )
    # Follow the input's device instead of assuming CUDA is available.
    return self.pe[:, :seq_len].to(x.device)

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder translation model built on ``nn.Transformer``.

  Inputs are sequence-first index tensors: ``src`` is ``(src_len, N)`` and
  ``trg`` is ``(trg_len, N)``. The output has one score per target-vocabulary
  entry for every target position: ``(trg_len, N, trg_vocab_size)``.

  Args:
    embedding_size: width of the word embeddings and of the transformer.
    src_vocab_size: number of rows in the source embedding table.
    trg_vocab_size: number of rows in the target embedding table and size of
      the output layer.
    src_pad_idx: source index treated as padding by the attention masks.
    num_heads: attention heads passed to ``nn.Transformer``.
    num_encoder_layers: encoder depth passed to ``nn.Transformer``.
    num_decoder_layers: decoder depth passed to ``nn.Transformer``.
    forward_expansion: forwarded as ``nn.Transformer``'s fifth positional
      argument (see the note in ``__init__``).
    dropout: dropout probability for the embeddings and the transformer.
    max_len: number of positions held by the positional encoders.
    device: device on which masks and position tensors are created.
  """

  def __init__(
    self,
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
  ):
    super(Transformer, self).__init__()
    self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
    # max_len sizes the position tables; previously it was accepted but unused.
    self.src_position_embedding = PositionalEncoder(embedding_size, max_len)
    self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
    self.trg_position_embedding = PositionalEncoder(embedding_size, max_len)

    self.device = device
    # NOTE(review): the fifth positional argument of nn.Transformer is
    # dim_feedforward, so forward_expansion is used as the absolute
    # feed-forward width (4 in this notebook), not as a multiple of
    # embedding_size. Left as is because changing it alters parameter shapes
    # and would make existing checkpoints unloadable -- confirm the intent.
    self.transformer = nn.Transformer(
        embedding_size,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
    )
    self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx
    # Second projection applied on top of fc_out in forward().
    self.out = nn.Linear(trg_vocab_size , trg_vocab_size)
    # Not used by forward(); kept so the module's attributes are unchanged.
    self.relu = nn.ReLU(inplace = True)

  def make_src_mask(self, src):
    """Return a ``(N, src_len)`` boolean mask that is True at padding positions."""
    src_mask = src.transpose(0, 1) == self.src_pad_idx

    # (N, src_len)
    return src_mask.to(self.device)

  def _position_encoding(self, position_embedding, seq_length):
    """Return a ``(seq_length, 1, embedding_size)`` encoding that broadcasts over the batch.

    The positional encoder sizes its output from dimension 1 of its input, so
    it is given a ``(1, seq_length)`` tensor of positions; the
    ``(1, seq_length, E)`` result is then transposed to sequence-first layout.
    """
    positions = torch.arange(0, seq_length).unsqueeze(0).to(self.device)
    encoding = position_embedding(positions).transpose(0, 1).to(self.device)
    if encoding.size(0) != seq_length:
      raise ValueError(
          f'sequence length {seq_length} exceeds the positional table '
          f'({encoding.size(0)} positions)'
      )
    return encoding

  def forward(self, src, trg):
    """Score every target position.

    Args:
      src: ``(src_len, N)`` source token indices.
      trg: ``(trg_len, N)`` target token indices fed to the decoder.

    Returns:
      ``(trg_len, N, trg_vocab_size)`` tensor of unnormalised scores.
    """
    src_seq_length = src.shape[0]
    trg_seq_length = trg.shape[0]

    # Position t of every sentence in the batch receives the same encoding
    # row t (shape (seq_len, 1, E) broadcasts over the batch dimension).
    embed_src = self.dropout(
        self.src_word_embedding(src)
        + self._position_encoding(self.src_position_embedding, src_seq_length)
    )
    embed_trg = self.dropout(
        self.trg_word_embedding(trg)
        + self._position_encoding(self.trg_position_embedding, trg_seq_length)
    )

    src_padding_mask = self.make_src_mask(src)
    # Causal mask: target position i may only attend to positions <= i.
    trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
        self.device
    )

    out = self.transformer(
        embed_src,
        embed_trg,
        src_key_padding_mask=src_padding_mask,
        # Also hide padded source positions from the decoder's cross-attention.
        memory_key_padding_mask=src_padding_mask,
        tgt_mask=trg_mask,
    )
    out = self.out(self.fc_out(out))
    return out

In [None]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_epochs, batch_size = 300, 32
learning_rate = 1e-5


In [None]:
def save_checkpoint(state, filename="/content/drive/MyDrive/checkpoint_NMT_Multi30k.pth.tar"):
    """Announce the save on stdout, then serialise ``state`` to ``filename`` with ``torch.save``."""
    announcement = "=> Saving checkpoint"
    print(announcement)
    torch.save(state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore ``model`` from ``checkpoint["state_dict"]`` and ``optimizer`` from ``checkpoint["optimizer"]``."""
    print("=> Loading checkpoint")
    # Model first, optimizer second -- the same order as the saved keys below.
    restore_plan = ((model, "state_dict"), (optimizer, "optimizer"))
    for receiver, key in restore_plan:
        receiver.load_state_dict(checkpoint[key])

In [None]:
# Model and optimisation hyper-parameters.
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embed_size = 512 
heads = 8
num_encoder_layer = 6
num_decoder_layer = 6
dropout = 0.10
max_len = 100
forward_expansion = 4
# Look the padding index up through the field's own pad token. The literal
# '<PAD>' is not in the vocabulary (the field was built with its default pad
# token), so stoi silently returned the unknown-token index and the source
# padding mask hid <unk> tokens instead of the padding.
src_pad_index = german.vocab.stoi[german.pad_token]
betas = (0.9 , 0.98)
save_model = True
load_model = True


In [None]:
# Batches group examples by source length (sort_key) and are sorted within
# each batch; tensors are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
)

In [None]:
# Sanity check: show the source and target tensor shapes of the first batch only.
for batch in train_iterator:
  print(f'{batch.src.shape} {batch.trg.shape}')
  break

In [None]:
# Build the network from the hyper-parameters above and move it to `device`.
model = Transformer(
    embedding_size=embed_size,
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_index,
    num_heads=heads,
    num_encoder_layers=num_encoder_layer,
    num_decoder_layers=num_decoder_layer,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_len=max_len,
    device=device,
)
model = model.to(device)

In [None]:
optimizer = torch.optim.Adam(model.parameters() , lr = learning_rate , betas = betas )
# Use the field's own pad token for the lookup. The literal '<PAD>' is not in
# the vocabulary (the field was built with its default pad token), so stoi
# returned the unknown-token index: the loss ignored <unk> targets and was
# still computed on padding.
pad_index = english.vocab.stoi[english.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = pad_index)
# Cuts the learning rate by 10x after 10 epochs without improvement of the
# metric passed to scheduler.step().
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [None]:
# One pass per epoch: train on every training batch, measure the loss on the
# validation split, let the scheduler react to it, then checkpoint.
for epoch in range(num_epochs):
  print(f'[Epoch {epoch + 1} / {num_epochs}]')

  losses = []
  losses_valid = []

  # ---- training ----
  model.train()
  for batch_idx, batch in enumerate(tqdm(train_iterator)):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    # Teacher forcing: the decoder sees target[:-1] and must predict target[1:].
    output = model(inp_data, target[:-1, :])

    # Flatten to (trg_len * N, vocab) vs (trg_len * N,) for CrossEntropyLoss.
    output = output.reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)
    losses.append(loss.item())

    loss.backward()

    # Clip the global gradient norm before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.5)

    optimizer.step()

  # ---- validation ----
  model.eval()
  with torch.no_grad():
    for batch_idx , batch_valid in enumerate(valid_iterator):
      inp_data_valid =  batch_valid.src.to(device)
      target_data_valid = batch_valid.trg.to(device)
      model_prediction = model(inp_data_valid , target_data_valid[:-1 , :])
      model_prediction = model_prediction.reshape(-1, model_prediction.shape[2])
      target_data_valid = target_data_valid[1:].reshape(-1)

      loss_valid = criterion(model_prediction , target_data_valid)
      # Store a Python float, not a tensor, so the mean below is a plain
      # number and no per-batch tensors are kept alive.
      losses_valid.append(loss_valid.item())

  model.train()

  mean_loss = sum(losses) / len(losses)
  mean_loss_valid = sum(losses_valid) / len(losses_valid)

  # The plateau scheduler only acts when it is fed a metric each epoch.
  scheduler.step(mean_loss_valid)

  print(f'[Loss => {mean_loss} ------ Validation_loss => {mean_loss_valid} ]')

  # Checkpoint after the epoch's updates so the final epoch is saved as well.
  if save_model:
    checkpoint = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        }
    save_checkpoint(checkpoint)

In [None]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
  """Greedy-decode a translation of ``sentence``.

  Args:
    model: callable ``model(src, trg)`` taking ``(src_len, 1)`` and
      ``(trg_len, 1)`` index tensors and returning ``(trg_len, 1, vocab)``
      scores; must provide ``training``, ``eval()`` and ``train(mode)``.
    sentence: a raw string (tokenised with the module-level ``spacy_ger``) or
      an already tokenised list of strings.
    german: source field; supplies ``init_token``, ``eos_token``,
      ``vocab.stoi`` and, when present, the ``lower`` flag.
    english: target field; supplies ``init_token``, ``eos_token``,
      ``vocab.stoi`` and ``vocab.itos``.
    device: device on which the index tensors are created.
    max_length: maximum number of tokens to generate.

  Returns:
    List of predicted target tokens without the start token. The end token
    is included as the last element when decoding reached it.
  """
  if isinstance(sentence, str):
    tokens = [token.text for token in spacy_ger(sentence)]
  else:
    tokens = list(sentence)

  # Mirror the field's own preprocessing: lower-case only when the source
  # vocabulary was built lower-cased, otherwise cased words would be mapped
  # to the unknown token.
  if getattr(german, "lower", False):
    tokens = [token.lower() for token in tokens]

  # Add the sentence markers the model saw during training.
  tokens = [german.init_token] + tokens + [german.eos_token]

  # Go through each source token and convert it to an index.
  text_to_indices = [german.vocab.stoi[token] for token in tokens]
  sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

  # Resolve the markers through the field so their spelling and case always
  # match the vocabulary (a hard-coded "<sos>"/"<eos>" misses '<SOS>'/'<EOS>').
  sos_index = english.vocab.stoi[english.init_token]
  eos_index = english.vocab.stoi[english.eos_token]

  outputs = [sos_index]

  # Decode with dropout disabled, then restore whatever mode the caller had.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
        output = model(sentence_tensor, trg_tensor)

        # Highest-scoring token at the last target position.
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_index:
          break
  finally:
    model.train(was_training)

  # Drop the start token.
  return [english.vocab.itos[idx] for idx in outputs[1:]]

In [None]:
def bleu(data, model, german, english, device):
  """Corpus-level BLEU of greedy translations over ``data``.

  Args:
    data: iterable of examples exposing ``src`` and ``trg`` token lists.
    model: translation model handed to ``translate_sentence``.
    german: source field.
    english: target field.
    device: device used for decoding.

  Returns:
    The value returned by ``corpus_bleu`` with one reference per example.
  """
  targets = []
  outputs = []

  for example in tqdm(data):
    src = vars(example)["src"]
    trg = vars(example)["trg"]

    prediction = translate_sentence(model, src, german, english, device)
    # Strip the end marker only when it is really there: a translation cut
    # off at the length limit ends in a real word that must be kept.
    if prediction and prediction[-1] == english.eos_token:
      prediction = prediction[:-1]

    # corpus_bleu expects a list of references per hypothesis.
    targets.append([trg])
    outputs.append(prediction)

  return corpus_bleu(targets , outputs)

In [None]:
# Argument order follows bleu(data, model, german, english, device): the
# source (german) field comes before the target (english) field.
bleu(test_data , model , german , english , device)