<a href="https://colab.research.google.com/github/VasileiosKarapoulios/Machine-Translation/blob/main/PyTorch_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the spaCy English ("en") and Greek ("el") models used for tokenization.
!python -m spacy download en
!python -m spacy download el
# Pin torchtext to 0.6.0: the notebook imports Field, BucketIterator and TabularDataset from torchtext.data.
!pip install torchtext==0.6.0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.metrics import bleu_score
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import spacy
import os
import pandas as pd
import sys
import numpy as np
import random 

In [None]:
def translate(model, sentence, greek, english, device, max_length=50):
    """Greedily decode an English translation of one Greek sentence.

    Args:
        model: Callable invoked as ``model(src, tgt)`` with index tensors of
            shape (src_len, 1) and (tgt_len, 1); its result is indexed as
            (tgt_len, 1, vocab).
        sentence: Either a raw string (tokenized with the spaCy "el" model) or
            an iterable of already-tokenized strings.
        greek: Source field exposing ``init_token``, ``eos_token`` and ``vocab.stoi``.
        english: Target field exposing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        device: Device on which the index tensors are created.
        max_length: Maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading start token. The list ends
        with the end-of-sentence token only if the model produced it within
        ``max_length`` steps.
    """
    if isinstance(sentence, str):
        # Loading a spaCy model is expensive, so do it only for raw strings and
        # cache it on the function for subsequent calls.
        tokenizer = getattr(translate, "_spacy_el", None)
        if tokenizer is None:
            tokenizer = spacy.load("el")
            translate._spacy_el = tokenizer
        tokens = [token.text.lower() for token in tokenizer(sentence)]
    else:
        # Pre-tokenized input: only lower-case it (the vocab is lower-cased).
        tokens = [token.lower() for token in sentence]

    # Wrap the sentence in the source start/end markers and map to indices.
    tokens = [greek.init_token, *tokens, greek.eos_token]
    text_to_indices = [greek.vocab.stoi[token] for token in tokens]

    # Shape (src_len, 1): a batch containing this single sentence.
    sentence_tensor = torch.tensor(
        text_to_indices, dtype=torch.long, device=device
    ).unsqueeze(1)

    sos_idx = english.vocab.stoi[english.init_token]
    eos_idx = english.vocab.stoi[english.eos_token]

    outputs = [sos_idx]
    with torch.no_grad():
        for _ in range(max_length):
            tgt_tensor = torch.tensor(
                outputs, dtype=torch.long, device=device
            ).unsqueeze(1)
            output = model(sentence_tensor, tgt_tensor)

            # Greedy choice: best-scoring token at the last decoded position.
            prediction = output.argmax(2)[-1, :].item()
            outputs.append(prediction)

            if prediction == eos_idx:
                break

    # Drop the start token before mapping indices back to strings.
    return [english.vocab.itos[idx] for idx in outputs[1:]]

def bleu(data, model, greek, english, device):
    """Compute the corpus BLEU score of the model's translations over ``data``.

    Args:
        data: Iterable of examples whose attributes include "Greek" (source
            tokens) and "English" (reference tokens).
        model: Model passed through to ``translate``.
        greek: Source field passed through to ``translate``.
        english: Target field passed through to ``translate``; its
            ``eos_token`` is stripped from predictions.
        device: Device passed through to ``translate``.

    Returns:
        The value returned by ``bleu_score`` for the predictions against one
        reference per example.
    """
    targets = []
    outputs = []

    for example in data:
        example_fields = vars(example)
        src = example_fields["Greek"]
        tgt = example_fields["English"]

        prediction = translate(model, src, greek, english, device)

        # Strip the trailing end-of-sentence token only when it is present:
        # if decoding stopped at max_length the last token is a real word.
        if prediction and prediction[-1] == english.eos_token:
            prediction = prediction[:-1]

        targets.append([tgt])
        outputs.append(prediction)
    return bleu_score(outputs, targets)

In [None]:
# Mount Google Drive at /content/drive; the corpus file is read from there below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fix every random number generator in use so runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Read the tab-separated corpus (no header row) and keep only the first two
# columns, naming them after the language they hold.
df = (
    pd.read_csv("/content/drive/My Drive/EN-EL.txt", sep="\t", header=None)[[0, 1]]
    .rename(columns={0: "English", 1: "Greek"})
)

In [None]:
# Use the number of spaces in each sentence as a length proxy, then drop
# pairs that are very long, very short, or whose two sides differ in length
# by a factor of 1.5 or more.
df = (
    df.assign(
        eng_len=df['English'].str.count(' '),
        el_len=df['Greek'].str.count(' '),
    )
    .query('el_len < 100')
    .query('eng_len < 100')
    .query('el_len > 4 & eng_len > 4')
    .query('el_len < eng_len * 1.5 & el_len * 1.5 > eng_len')
)

In [None]:
# The length columns were only needed for filtering. Use the `columns=`
# keyword: passing the axis positionally is not accepted by pandas >= 2.0.
df = df.drop(columns=['eng_len', 'el_len'])

In [None]:
# Keep at most the first 42000 sentence pairs (positional slice).
df = df.iloc[:42000]

In [None]:
# English side of the pair whose index label is 2.
df.loc[2, "English"]

'fixing the minimum selling prices for butter for the 150th individual invitation to tender under the standing invitation to tender provided for in Regulation (EC) No 2571/97'

In [None]:
# Greek side of the pair whose index label is 2.
df.loc[2, "Greek"]

'για καθορισμό των ελαχίστων τιμών πώλησης βουτύρου για την 150η ειδική δημοπρασία που πραγματοποιείται στο πλαίσιο της διαρκούς δημοπρασίας που προβλέπεται από τον κανονισμό (ΕΚ) αριθ. 2571/97'

In [None]:
# Preview the first ten remaining sentence pairs.
df.head(10)

Unnamed: 0,English,Greek
2,fixing the minimum selling prices for butter f...,για καθορισμό των ελαχίστων τιμών πώλησης βουτ...
4,Having regard to the Treaty establishing the E...,Έχοντας υπόψη: τη συνθήκη για την ίδρυση της Ε...
5,Having regard to Council Regulation (EC) No 12...,τον κανονισμό (ΕΚ) αριθ. 1255/1999 του Συμβουλ...
7,"The intervention agencies are, pursuant to Com...",Σύμφωνα με τον κανονισμό (ΕΚ) αριθ. 2571/97 τη...
8,The amount(s) of the processing securities mus...,Πρέπει συνεπώς να καθοριστούν το ή τα ποσά των...
9,The measures provided for in this Regulation a...,Τα μέτρα που προβλέπονται στον παρόντα κανονισ...
12,The minimum selling prices of butter from inte...,Για την 150η ειδική δημοπρασία στο πλαίσιο της...
14,This Regulation shall enter into force on 16 O...,Ο παρών κανονισμός αρχίζει να ισχύει στις 16 Ο...
15,This Regulation shall be binding in its entire...,Ο παρών κανονισμός είναι δεσμευτικός ως προς ό...
20,to the Commission Regulation of 15 October 200...,"του κανονισμού της Επιτροπής, της 15ης Οκτωβρί..."


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set. random_state is passed explicitly so that re-running
# this cell yields the same split instead of depending on how much of the
# global NumPy RNG stream has already been consumed.
train, test = train_test_split(df, test_size=0.023, random_state=seed)

# Train/test sizes observed for different corpus sizes (test_size in parentheses):
#36075 train | 925 test (0.025)#41034 train | 966 test (0.023)#51064 train | 936 test (0.018)#70992 train | 1008 test (0.014)#91080 train | 920 test (0.01)
#31040 train | 960 test (0.03)#26055 train | 945 test (0.035)#21010 train | 990 test (0.045)#16150 train | 850 test (0.05)#11040 train | 960 test (0.08)
#6020 train | 980 test (0.14)

# Carve a validation set out of the remaining training data.
train, validation = train_test_split(train, test_size=0.023, random_state=seed)

# Train/validation sizes observed for different corpus sizes (test_size in parentheses):
#35173 train | 902 validation (0.025)#40090 train | 944 validation (0.023)#50145 train | 919 validation (0.018)#69998 train | 994 validation (0.014)#90169 train | 911 validation (0.01)
#30108 train | 932 validation (0.03)#25143 train | 912 validation (0.035)#20065 train | 945 validation (0.045)#15342 train | 808 validation (0.05)
#10046 train | 994 validation (0.09)#5057 train | 963 validation (0.16)

In [None]:
# Write each split to its own CSV file (no index column, UTF-8 with BOM).
for split_frame, split_path in (
    (train, "train.csv"),
    (test, "test.csv"),
    (validation, "validation.csv"),
):
    split_frame.to_csv(split_path, index=False, encoding="utf-8-sig")

In [None]:
# Load the spaCy models once; the tokenizer functions below use them.
spacy_eng = spacy.load("en")
spacy_el = spacy.load("el")

def tokenize_el(text):
    """Split Greek ``text`` into a list of token strings with the spaCy tokenizer."""
    doc = spacy_el.tokenizer(text)
    return [tok.text for tok in doc]
  
def tokenize_eng(text):
    """Split English ``text`` into a list of token strings with the spaCy tokenizer."""
    doc = spacy_eng.tokenizer(text)
    return [tok.text for tok in doc]

# One Field per language: tokenize with spaCy, lower-case, and wrap every
# sentence in <sos> ... <eos>.
english = Field(tokenize=tokenize_eng, init_token = "<sos>", eos_token = "<eos>", lower=True)
greek = Field(tokenize=tokenize_el, init_token = "<sos>", eos_token = "<eos>", lower=True)

# Order must match the column order of the CSV files (English, then Greek).
fields = [("English", english), ("Greek", greek)]

# The CSV files are written with a header row ("English,Greek"), so it has to
# be skipped; otherwise the header is parsed as the first example of each split.
train_data, test_data, valid_data = TabularDataset.splits(
    path="",
    train="train.csv",
    test="test.csv",
    validation="validation.csv",
    format="csv",
    fields=fields,
    skip_header=True,
)

# Vocabularies come from the training split only: at most 10000 tokens,
# each seen at least twice.
english.build_vocab(train_data, max_size=10000, min_freq=2)
greek.build_vocab(train_data, max_size=10000, min_freq=2)

In [None]:
class Transformer(nn.Module):
    """Translation model: learned token and position embeddings around ``nn.Transformer``.

    Inputs are sequence-first index tensors: ``src`` has shape (src_len, N)
    and ``tgt`` has shape (tgt_len, N).
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        tgt_vocab_size,
        src_pad_idx,
        nheads, #for the attention mechanism
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        dropout,
        max_len,
        device,
    ):
        """Build the embeddings, the transformer core and the output projection.

        Args:
            embedding_size: Width of the embeddings and of the transformer.
            src_vocab_size: Number of rows of the source token embedding.
            tgt_vocab_size: Number of rows of the target token embedding and
                width of the output projection.
            src_pad_idx: Source index treated as padding by ``make_src_mask``.
            nheads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            dim_feedforward: Hidden width of the feed-forward sub-layers.
            dropout: Dropout probability (embeddings and transformer).
            max_len: Number of rows of the position embeddings; sequences must
                be no longer than this.
            device: Device on which positions and masks are created.
        """
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.tgt_word_embedding = nn.Embedding(tgt_vocab_size, embedding_size)
        self.tgt_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            nheads,
            num_encoder_layers,
            num_decoder_layers,
            dim_feedforward,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean (N, src_len) mask that is True at padded source positions.

        ``src`` must be the raw token-index tensor of shape (src_len, N), not
        its embedding.
        """
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, tgt):
        """Score every target vocabulary entry for each target position.

        Args:
            src: Source token indices, shape (src_len, N).
            tgt: Target token indices fed to the decoder, shape (tgt_len, N).

        Returns:
            Tensor of shape (tgt_len, N, tgt_vocab_size).
        """
        src_seq_length, src_batch = src.shape
        tgt_seq_length, tgt_batch = tgt.shape

        # The padding mask must be computed from the token indices, so build
        # it before the indices are turned into embeddings.
        src_padding_mask = self.make_src_mask(src)

        # Position ids 0..len-1, repeated for every sequence in the batch.
        src_positions = (
            torch.arange(0, src_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(src_seq_length, src_batch)
        )
        tgt_positions = (
            torch.arange(0, tgt_seq_length, device=self.device)
            .unsqueeze(1)
            .expand(tgt_seq_length, tgt_batch)
        )

        # Token + position embeddings, so the network is aware of word order.
        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_tgt = self.dropout(
            self.tgt_word_embedding(tgt) + self.tgt_position_embedding(tgt_positions)
        )

        # Causal mask: decoder position i may only attend to positions <= i.
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_tgt,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=tgt_mask,
        )
        return self.fc_out(out)

In [None]:
# Hyperparameters
num_epochs = 50
learning_rate = 3e-4
batch_size = 32
src_vocab_size = len(greek.vocab)    # Greek is the source language
tgt_vocab_size = len(english.vocab)  # English is the target language
embedding_size = 512
nheads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 200  # size of the position-embedding tables
dim_feedforward = 2048 #number of nodes in the feedforward network model
# The source padding index must come from the source (Greek) vocabulary.
src_pad_idx = greek.vocab.stoi["<pad>"]

In [None]:
# Batch examples of similar Greek length together to limit padding.
# Use the selected `device` rather than a hard-coded "cuda" so the notebook
# also runs when no GPU is available.
train_iterator, test_iterator, valid_iterator = BucketIterator.splits(
    (train_data, test_data, valid_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.Greek),
    device=device
)

In [None]:
# Instantiate the network from the hyperparameters and move it to the device.
model = Transformer(
    embedding_size=embedding_size,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    src_pad_idx=src_pad_idx,
    nheads=nheads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    max_len=max_len,
    device=device,
)
model = model.to(device)

In [None]:
# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Cut the learning rate by 10x when the monitored value has not improved for
# 10 consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.1,
    patience=10,
    verbose=True,
)

In [None]:
# Padding positions in the English targets contribute nothing to the loss.
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(
    ignore_index=pad_idx,
)

In [None]:
# Best validation BLEU seen so far; starts below any attainable score so the
# first epoch always saves a checkpoint.
temp = -1
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")
    model.train()
    losses = []
    for batch in train_iterator:
        # Get input and targets to cuda
        inp_data = batch.Greek.to(device)
        target = batch.English.to(device)

        # Skip batches whose sequences do not fit the position-embedding
        # tables, which have max_len rows.
        if inp_data.shape[0] >= max_len or target.shape[0] >= max_len:
            continue

        # Forward propagation. The decoder input is the target without its
        # last token, so that the output at position i is trained to predict
        # target token i + 1.
        output = model(inp_data, target[:-1, :])

        # Output is (tgt_len, batch_size, output_dim) but CrossEntropyLoss
        # expects (tgt_len * batch_size, output_dim), so flatten it; the
        # target drops its start token and is flattened the same way.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)  # Use Crossentropy for gradient,
        losses.append(loss.item())        # but BLEU score for saving best model

        # Back propagation
        loss.backward()

        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

    # Evaluate on a fixed slice of the validation set to keep epochs fast.
    model.eval()
    score = bleu(valid_data[1:100], model, greek, english, device)
    print("Validation Bleu score", score*100)

    # Drive the learning-rate schedule with the mean training loss; skip the
    # step if every batch of the epoch was skipped.
    if losses:
        mean_loss = sum(losses) / len(losses)
        scheduler.step(mean_loss) # change learning rate

    # Keep only the checkpoint with the best validation BLEU.
    if score > temp:
        temp = score
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        print("Saving model...")
        torch.save(checkpoint, "my_checkpoint.pth.tar")

[Epoch 0 / 50]
Validation Bleu score 7.4042216021791205
Saving model...
[Epoch 1 / 50]
Validation Bleu score 12.95580277545326
Saving model...
[Epoch 2 / 50]
Validation Bleu score 18.19541147657579
Saving model...
[Epoch 3 / 50]
Validation Bleu score 21.896437359169344
Saving model...
[Epoch 4 / 50]
Validation Bleu score 27.998257114528464
Saving model...
[Epoch 5 / 50]
Validation Bleu score 31.205713748931885
Saving model...
[Epoch 6 / 50]
Validation Bleu score 34.2140008750644
Saving model...
[Epoch 7 / 50]
Validation Bleu score 36.4376308830772
Saving model...
[Epoch 8 / 50]
Validation Bleu score 35.63706784986916
[Epoch 9 / 50]
Validation Bleu score 38.02049667442805
Saving model...
[Epoch 10 / 50]
Validation Bleu score 39.97228164770609
Saving model...
[Epoch 11 / 50]
Validation Bleu score 39.55351930890009
[Epoch 12 / 50]
Validation Bleu score 39.29257191322627
[Epoch 13 / 50]
Validation Bleu score 40.32613886593779
Saving model...
[Epoch 14 / 50]
Validation Bleu score 43.2841603

In [None]:
# Restore the best checkpoint from the path the training loop saved it to.
# map_location lets a checkpoint written on a GPU be loaded on whichever
# device is currently in use.
saved_model = torch.load("my_checkpoint.pth.tar", map_location=device)
model.load_state_dict(saved_model["state_dict"])
optimizer.load_state_dict(saved_model["optimizer"])

In [None]:
# Scoring the whole test set is slow, so evaluate on a fixed slice of it.
score = bleu(
    data=test_data[1:100],
    model=model,
    greek=greek,
    english=english,
    device=device,
)
print("Bleu score", score*100)

Bleu score 40.11381957984589


Greek

Bleu score 45.06 | 90000

Translated example sentence: 
 ['main', 'network', 'of', 'the', 'network', '<.eos>']

Bleu score 43.55 | 70000

Translated example sentence: 
 ['main', 'network', '<.eos>']

Bleu score 24.52 | 50000

Translated example sentence: ['agreed', 'place', 'of', 'delivery', 'in', 'arriving', 'country', '<.eos>']

Bleu score 40.11 | 40000

Translated example sentence: 
 ['i.', 'network', '<.eos>']

Bleu score 50.99 | 35000

Translated example sentence: 
 ['the', 'network', 'of', 'the', 'network', 'of', 'services', '<.eos>']

Bleu score 51.69 | 29000

Translated example sentence: 
 ['central', 'network', '<.eos>']

Bleu score 47.33 | 25000

Translated example sentence: 
 ['vessels', '<.unk>', '<.eos>']

Bleu score 44.06 | 20000

Translated example sentence: 
 ['<.unk>', 'ban', 'on', 'area', 'payment', '<.eos>']

Bleu score 42.11 | 15000

Translated example sentence: 
 ['professional', 'secrecy', 'in', 'state', 'aid', 'decisions', '<.eos>']

Bleu score 33.11 | 10000

Translated example sentence: 
 ['regular', 'way', 'purchase', 'or', 'sale', 'of', 'wine', '<.eos>']

Bleu score 31.62 | 5000

Translated example sentence: 
 ['application', 'of', 'the', 'new', 'standards', 'should', 'remove', 'products', 'of', 'the', 'combined', 'nomenclature', '<.eos>']

In [None]:
# Translate one short Greek phrase with the model in inference mode.
model.eval()
sentence = "Κεντρικό δίκτυο"
translated = translate(
    model=model,
    sentence=sentence,
    greek=greek,
    english=english,
    device=device,
    max_length=50,
)
print("Translated example sentence: \n", translated)

Translated example sentence: 
 ['i.', 'network', '<eos>']
