<a href="https://colab.research.google.com/github/Raian-Rahman/Design-Project-Codes/blob/main/Transformer_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U torchtext==0.8.0

Requirement already up-to-date: torchtext==0.8.0 in /usr/local/lib/python3.7/dist-packages (0.8.0)


In [3]:
%cd /content/

import torch
import torch.nn as nn
import torch.optim as optim  
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

/content


In [4]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys

def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load german tokenizer
    spacy_ger = spacy.load("de")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

### Data Preprocessing

In [None]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 6.8MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907057 sha256=705f55f28a1221869f2c6f1d6cc195532c926b7350bc9c82a143a7231568a2d8
  Stored in directory: /tmp/pip-ephem-wheel-cache-l_bg8vcv/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [None]:
spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')

In [None]:
def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

In [None]:
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [None]:
"""
torchtext field defines how data should be processed in the text
"""
german = Field(tokenize=tokenize_ger, lower=True, init_token="<SOS>", eos_token="<eos>")
english = Field(tokenize=tokenize_eng, lower=True, init_token="<SOS>", eos_token="<eos>")



### Training data and Validation data

In [None]:
train_data, validation_data, test_data = Multi30k.splits(
    exts = (".de",".en"), fields = (german,english)
) 

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 594kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 164kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 165kB/s]


In [None]:
german.build_vocab(train_data,max_size = 20000, min_freq = 2)
english.build_vocab(train_data,max_size = 20000, min_freq = 2)

In [None]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out


In [None]:
#setup the training phase

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

load_model = False
save_model = True

num_epochs = 50
learning_rate = 3e-4
batch_size = 32

src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)

embedding_size = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
dropout = 0.10
max_len = 100 #maximum sentence length used for positional embedding
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

cuda


In [None]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data,validation_data, test_data), batch_size = batch_size, sort_within_batch = True, sort_key = lambda x:len(x.src)
)



In [None]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
).to(device)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"),model, optimizer)

In [None]:
sentence = "ich liebe dich so sehr"

In [None]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch}/{num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict" : model.state_dict(),
            "optimizer"  : optimizer.state_dict()
        }
        save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, german, english, device, 4
    )

    print(f"Translated example sentence\n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        #forward prop

        output = model(inp_data, target[:-1])
        """
        comment
        """

        output = output.reshape(-1, output.shape[2])

        target = target[1:].reshape(-1)
        optimizer.zero_grad()

        loss = criterion(output,target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1)
        optimizer.step()

[Epoch 0/50]
=> Saving checkpoint
Translated example sentence
 ['counter', 'counter', 'counter', 'counter']




[Epoch 1/50]
=> Saving checkpoint
Translated example sentence
 ['a', 'football', 'player', 'player']
[Epoch 2/50]
=> Saving checkpoint
Translated example sentence
 ['a', 'group', 'of', '<unk>']
[Epoch 3/50]
=> Saving checkpoint
Translated example sentence
 ['this', 'very', 'very', 'very']
[Epoch 4/50]
=> Saving checkpoint
Translated example sentence
 ['a', 'very', 'very', 'very']
[Epoch 5/50]
=> Saving checkpoint
Translated example sentence
 ['the', 'very', 'very', 'very']
[Epoch 6/50]
=> Saving checkpoint
Translated example sentence
 ['very', 'very', 'very', 'very']
[Epoch 7/50]
=> Saving checkpoint
Translated example sentence
 ['protesters', 'of', 'very', 'very']
[Epoch 8/50]
=> Saving checkpoint
Translated example sentence
 ['protesters', 'of', 'very', 'cause']
[Epoch 9/50]
=> Saving checkpoint
Translated example sentence
 ['very', 'very', 'very', 'very']
[Epoch 10/50]
=> Saving checkpoint
Translated example sentence
 ['very', 'very', 'very', 'very']
[Epoch 11/50]
=> Saving checkpoi

In [None]:
score = bleu(test_data, model, german, english, device)
print(f"Bleu Score {score*100:.2f}")

Bleu Score5.71


In [None]:
translated_sentence = translate_sentence(
        model, sentence, german, english, device, 3
)

print(f"Translated example sentence\n {translated_sentence}")

Translated example sentence
 ['i', '<unk>', '.']
