#### Installations

In [None]:
!pip install nltk
!pip install bert_score
!pip install datasets
!pip install -U  torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

#### Importing Libraries

In [1]:
# Standard library
import math
from timeit import default_timer as timer
from typing import Iterable, List

# Third-party
import bert_score
import evaluate
# Second handle on the Hugging Face `evaluate` package: a later cell defines
# `def evaluate(model, dataloader)`, which rebinds the bare name `evaluate`.
import evaluate as hf_evaluate
import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
import torch.nn as nn
import tqdm
import wandb
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from torch import Tensor
from torch.nn import Transformer
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)

  return torch._C._cuda_getDeviceCount() > 0


#### Loading data and data visualization

In [35]:
# WMT16 German-English corpus. Only the first 50,000 training rows are loaded
# (split="train[:50000]"); the validation and test splits are loaded in full.
train_data = load_dataset("wmt16", "de-en", split="train[:50000]")
val_data = load_dataset("wmt16", "de-en", split="validation")
test_data = load_dataset("wmt16", "de-en", split="test")

In [36]:
# Report the number of rows in each split and the container type returned by load_dataset.
print(f"Train data size: {len(train_data)}, type: {type(train_data)}")
print(f"Validation data size: {len(val_data)}, type: {type(val_data)}")
print(f"Test data size: {len(test_data)}, type: {type(test_data)}")

Train data size: 50000, type: <class 'datasets.arrow_dataset.Dataset'>
Validation data size: 2169, type: <class 'datasets.arrow_dataset.Dataset'>
Test data size: 2999, type: <class 'datasets.arrow_dataset.Dataset'>


In [37]:
# Preview the first five training rows. Each row is a dict whose "translation"
# entry maps the language codes "de" / "en" to the sentence text.
for i in range(5):
    data = train_data[i]
    german = data["translation"]["de"]
    english = data["translation"]["en"]
    print(f"German: {german}")
    print(f"English: {english}")

German: Wiederaufnahme der Sitzungsperiode
English: Resumption of the session
German: Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
English: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
German: Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.
English: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
German: Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sit

In [38]:
# Hyper-parameters and language pair for this run; the same dict is handed to
# wandb.init(config=...) so the run records them.
config = {
    "source_language": "de",
    "target_language": "en",
    "EMB_SIZE": 512,
    "NHEAD": 8,
    "FFN_HID_DIM": 512,
    "BATCH_SIZE": 32,
    "NUM_ENCODER_LAYERS": 3,
    "NUM_DECODER_LAYERS": 3,
    "NUM_EPOCHS": 4,
}

In [39]:
# Short aliases for the language codes stored in the run configuration.
source_language, target_language = config["source_language"], config["target_language"]

#### Initialising tokenizer

In [40]:
# Tokenization: one spaCy-backed tokenizer per language code.
token_transform = {
    source_language: get_tokenizer('spacy', language='de_core_news_sm'),
    target_language: get_tokenizer('spacy', language='en_core_web_sm'),
}

In [41]:
# Sanity check on a single example: show the raw row, both sentences and their
# tokenizations. (The previous `for ... break` loop only ever ran its first
# iteration, so the first row is read directly, and only once.)
data_pt = train_data[0]
print(data_pt)
data_src = data_pt['translation'][source_language]
data_tgt = data_pt['translation'][target_language]
print(f"German: {data_src}")
print(f"English: {data_tgt}")
print(f"Tokenized German: {token_transform[source_language](data_src)}")
print(f"Tokenized English: {token_transform[target_language](data_tgt)}")
    

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}
German: Wiederaufnahme der Sitzungsperiode
English: Resumption of the session
Tokenized German: ['Wiederaufnahme', 'der', 'Sitzungsperiode']
Tokenized English: ['Resumption', 'of', 'the', 'session']


In [42]:
# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize one language side of every sample in ``data_iter``.

    Args:
        data_iter: iterable of samples; each sample is indexed as
            ``sample['translation'][language]`` to obtain the sentence text.
        language: language code used both as the key into the sample and as
            the key into the module-level ``token_transform`` mapping.

    Yields:
        The token list produced by ``token_transform[language]`` for each
        sample, in iteration order. This is a generator, so nothing is
        tokenized until it is consumed.
    """
    for data_sample in data_iter:
        yield token_transform[language](data_sample['translation'][language])

In [43]:
# yield_tokens is a generator, so only the first item is actually tokenized here.
tokens = yield_tokens(train_data, source_language)
# Checking whether tokenization works or not
for token in tokens: 
    print(token)
    break

['Wiederaufnahme', 'der', 'Sitzungsperiode']


#### Text pre-processing

In [44]:
# Special tokens are passed to the vocabulary builder as `specials`; each
# *_IDX constant is the position of its token in this list.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

In [45]:
# One torchtext Vocab per language, built from the tokenized training split.
# min_freq=1 keeps every token that was seen; the special symbols are passed as `specials`.
vocab_transform = {
    ln: build_vocab_from_iterator(yield_tokens(train_data, ln), min_freq=1, specials=special_symbols)
    for ln in [source_language, target_language]
}

In [46]:
# Make lookups of tokens missing from a vocabulary return UNK_IDX.
for ln in (source_language, target_language):
    vocab = vocab_transform[ln]
    vocab.set_default_index(UNK_IDX)

In [47]:
# Vocabulary sizes; these set the embedding-table and output-layer dimensions of the model.
SRC_VOCAB_SIZE, TGT_VOCAB_SIZE = (
    len(vocab_transform[source_language]),
    len(vocab_transform[target_language]),
)

In [48]:
def generate_square_subsequent_mask(sz):
    """Build the (sz, sz) additive causal mask on DEVICE.

    Entry [i, j] is 0.0 when j <= i and -inf when j > i, so that position i
    cannot attend to later positions.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # Keep -inf strictly above the main diagonal; triu zero-fills everything else.
    return torch.triu(blocked, diagonal=1)

In [49]:
def create_mask(src, tgt):
    """Create the four masks the model's forward pass expects.

    Args:
        src: source token-id tensor; dimension 0 is used as the sequence length.
        tgt: target token-id tensor; dimension 0 is used as the sequence length.

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask) where
        * src_mask is an all-False boolean square matrix (nothing is hidden),
        * tgt_mask is the causal mask from generate_square_subsequent_mask,
        * the padding masks are True wherever the token equals PAD_IDX, with
          the first two dimensions swapped relative to the inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_is_pad = src == PAD_IDX
    tgt_is_pad = tgt == PAD_IDX
    return src_mask, tgt_mask, src_is_pad.transpose(0, 1), tgt_is_pad.transpose(0, 1)

In [50]:
def sequential_transforms(*transforms):
    """Compose ``transforms`` left to right into a single callable.

    The returned function feeds its argument to the first transform, passes
    that result to the second, and so on, returning the final result. With no
    transforms the input is returned unchanged.
    """
    def func(txt_input):
        value = txt_input
        for step in transforms:
            value = step(value)
        return value
    return func

In [51]:
def tensor_transform(token_ids: List[int]):
    """Wrap a list of token ids in BOS/EOS markers and return it as a 1-D tensor.

    Args:
        token_ids: vocabulary indices of one sentence (may be empty).

    Returns:
        An int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.

    The dtype is pinned explicitly: ``torch.tensor([])`` defaults to a float
    tensor, so building the pieces separately and concatenating them did not
    yield an integer tensor for an empty sentence.
    """
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)

In [52]:
# Per-language pipeline from raw string to tensor of token ids:
# tokenize -> map tokens to vocabulary indices -> add BOS/EOS and build the tensor.
text_transform = {
    ln: sequential_transforms(token_transform[ln], vocab_transform[ln], tensor_transform)
    for ln in [source_language, target_language]
}

#### Dataloaders

In [53]:
def collate_fn(batch):
    """Turn a list of raw dataset rows into padded source/target tensors.

    Each row is read as ``row["translation"][<language code>]``, converted to
    a tensor of token ids with ``text_transform`` and then padded with PAD_IDX
    to the length of the longest sentence in the batch.

    Returns:
        dict with keys 'src' and 'tgt' holding the padded batches (pad_sequence
        with its default layout, i.e. the sequence dimension comes first).
    """
    encode_src = text_transform[source_language]
    encode_tgt = text_transform[target_language]

    src_tensors = [encode_src(row["translation"][source_language]) for row in batch]
    tgt_tensors = [encode_tgt(row["translation"][target_language]) for row in batch]

    return {
        'src': nn.utils.rnn.pad_sequence(src_tensors, padding_value=PAD_IDX),
        'tgt': nn.utils.rnn.pad_sequence(tgt_tensors, padding_value=PAD_IDX),
    }

# Smoke test: pull a single batch through collate_fn and create_mask.
# The batch size comes from `config` (it was hard-coded to 32 here) so this
# check uses the same batch size as the rest of the notebook.
train_dataloader = DataLoader(train_data, batch_size=config["BATCH_SIZE"], collate_fn=collate_fn, shuffle=True)

batch = next(iter(train_dataloader))
src = batch["src"]
tgt = batch["tgt"]
# Decoder input drops the last position of every target sequence.
tgt_input = tgt[:-1, :]
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

In [54]:
# Loaders for the three splits; only the training loader shuffles.
_batch_size = config["BATCH_SIZE"]
train_dataloader = DataLoader(train_data, batch_size=_batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=_batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=_batch_size, collate_fn=collate_fn)

#### Model Architecture

In [55]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape (maxlen, 1, emb_size) is precomputed and stored as the
    non-trainable buffer ``pos_embedding``: even columns hold sines and odd
    columns hold cosines of ``position * frequency``.

    Args:
        emb_size: embedding width of the inputs.
        dropout: dropout probability applied to the summed result.
        maxlen: number of positions precomputed in the table.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super().__init__()
        # One frequency per (sin, cos) column pair.
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).unsqueeze(1)
        angles = positions * den

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Singleton middle dimension lets the table broadcast over dimension 1 of the input.
        self.register_buffer('pos_embedding', table.unsqueeze(1))

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The table is sliced to the first ``token_embedding.size(0)`` positions,
        i.e. dimension 0 of the input is treated as the position axis.
        """
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len])

In [56]:
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Cast ``tokens`` to int64, look them up and multiply by sqrt(emb_size)."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

In [57]:
# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built around ``torch.nn.Transformer``.

    Source and target tokens are embedded by separate TokenEmbedding tables,
    share one PositionalEncoding module, pass through the Transformer, and are
    projected to target-vocabulary logits by the linear ``generator``.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int, emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 512, dropout: float = 0.1):
        super().__init__()
        # Submodules are created in a fixed order; it determines the order of
        # model.parameters() and therefore seeded initialisation.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor, tgt_mask: Tensor, src_padding_mask: Tensor, tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder and return logits over the target vocabulary."""
        encoder_input = self.positional_encoding(self.src_tok_emb(src))
        decoder_input = self.positional_encoding(self.tgt_tok_emb(trg))
        hidden = self.transformer(
            encoder_input,
            decoder_input,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (the decoder's memory)."""
        encoder_input = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(encoder_input, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run the decoder against ``memory``; no vocabulary projection."""
        decoder_input = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(decoder_input, memory, tgt_mask=tgt_mask)

#### WandB Setup

In [58]:
import wandb

In [59]:
# relogin=True forces a fresh API-key prompt even if credentials are already stored.
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/mehar21541/.netrc


True

In [60]:
# Start the W&B run; the hyper-parameter dict is attached as the run's config.
wandb.init(project = "assignment-3", entity = "nlp-assignments", config = config)

#### Training Setup

In [61]:
# Seed first: both module construction and the Xavier re-initialisation below draw random numbers.
torch.manual_seed(0)

transformer = Seq2SeqTransformer(config["NUM_ENCODER_LAYERS"], config["NUM_DECODER_LAYERS"], config["EMB_SIZE"], config["NHEAD"], SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, config["FFN_HID_DIM"])

# Re-initialise every parameter with more than one dimension (weight matrices,
# embedding tables) with Xavier-uniform; 1-D parameters keep their defaults.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Positions whose target is PAD_IDX do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



In [62]:
def train_epoch(model, optimizer, dataloader):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Each batch is a dict with 'src' and 'tgt' tensors. The decoder is fed the
    target without its last position and is trained to predict the target
    without its first position (teacher forcing). Every minibatch loss is
    logged to W&B together with its index within the epoch.

    Args:
        model: network called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        optimizer: optimizer that updates ``model``'s parameters.
        dataloader: iterable of batches.

    Returns:
        Sum of the minibatch losses divided by the number of minibatches.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.train()
    wandb.define_metric('Minibatch_Epoch')
    wandb.define_metric('MiniBatch_Loss')

    losses = 0
    # Batches are counted while training. The previous `len(list(dataloader))`
    # iterated (and re-tokenized) the whole dataset a second time just to count.
    num_batches = 0
    for minibatch, batch in enumerate(dataloader):
        src = batch["src"].to(DEVICE)
        tgt = batch["tgt"].to(DEVICE)

        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # The source padding mask is also used as the memory key-padding mask.
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        losses += batch_loss
        num_batches += 1

        # logging
        wandb.log({"Minibatch_Epoch": minibatch, "MiniBatch_Loss": batch_loss})

    return losses / num_batches

In [63]:
def evaluate(model, dataloader):
    """Return the mean batch loss of ``model`` over ``dataloader`` without updating it.

    NOTE: defining this function rebinds the global name ``evaluate``, which the
    imports cell also binds to the Hugging Face ``evaluate`` package.

    Args:
        model: network called exactly as in training.
        dataloader: iterable of dict batches with 'src' and 'tgt' tensors.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.eval()
    losses = 0
    # Batches are counted during the pass instead of re-iterating the loader
    # with `len(list(dataloader))`.
    num_batches = 0

    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in dataloader:
            src = batch["src"].to(DEVICE)
            tgt = batch["tgt"].to(DEVICE)

            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
            num_batches += 1

    return losses / num_batches

#### Training Loop

In [64]:
# Train for NUM_EPOCHS epochs, validating and logging to W&B after each one.
for epoch in range(1, config["NUM_EPOCHS"]+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, train_dataloader)
    # end_time is taken before validation, so the printed epoch time covers training only.
    end_time = timer()
    val_loss = evaluate(transformer, val_dataloader)
    wandb.define_metric('Epoch')
    wandb.define_metric('Train_Loss')
    wandb.define_metric('Val_Loss')
    log = {}
    log["Epoch"] = epoch
    log["Train_Loss"] = train_loss
    log["Val_Loss"] = val_loss
    wandb.log(log)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))



In [None]:
# Close the W&B run started by wandb.init above.
wandb.finish()

0,1
MiniBatch_Loss,█▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁
Minibatch_Epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
MiniBatch_Loss,6.39236
Minibatch_Epoch,103.0


#### Model Evaluation

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence one token at a time, always taking the arg-max.

    Args:
        model: network exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids for a single sentence (moved to DEVICE here).
        src_mask: attention mask passed to the encoder.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: token id the output sequence begins with.

    Returns:
        int64 tensor with one column holding the generated ids. It starts with
        ``start_symbol`` and ends with EOS_IDX if that was produced before
        ``max_len`` was reached.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is computed
        # and placed on DEVICE once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len - 1):
            # Casting the additive mask to bool turns its -inf entries into True
            # and its zeros into False.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the newest position is scored.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            next_token = torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)
            ys = torch.cat([ys, next_token], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

In [None]:
# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one source-language sentence with greedy decoding.

    Args:
        model: trained translation model; it is switched to eval mode.
        src_sentence: raw text in ``config["source_language"]``.

    Returns:
        The decoded target tokens joined by single spaces, with the
        ``<bos>`` / ``<eos>`` markers removed and surrounding whitespace
        stripped (removing the markers used to leave a stray leading or
        trailing space).
    """
    model.eval()
    # Single-column tensor: one sentence, one token id per row.
    src = text_transform[config["source_language"]](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        # The output may be at most five tokens longer than the (BOS/EOS-wrapped) input.
        tgt_tokens = greedy_decode(
            model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()

    words = vocab_transform[config["target_language"]].lookup_tokens(tgt_tokens.cpu().tolist())
    return " ".join(words).replace("<bos>", "").replace("<eos>", "").strip()

In [None]:
# translate German to English
# Qualitative check on the first five *training* pairs: source, model output and reference.
for i in range(5):
    src = train_data[i]["translation"][config["source_language"]]
    print(f"German: {src}")
    print(f"Translated English: {translate(transformer, src)}")
    print(f"Actual English: {train_data[i]['translation'][config['target_language']]}")
    print()

German: Wiederaufnahme der Sitzungsperiode
Translated English:  the the the the the the the the the
Actual English: Resumption of the session

German: Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
Translated English:  the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Actual English: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

German: Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.
Translated English:  the the the t

#### Saving Model

In [None]:
# save the model
# Only the parameters/buffers (state_dict) are written, not the Python class itself.
torch.save(transformer.state_dict(), "2A_transformer.pth")

#### Evaluation Metrics Computation

In [None]:
# Translate the first validation sentences and collect model outputs together
# with the gold references for the metric cells that follow.
# (The deprecated, unused `datasets.load_metric("bleu")` call was dropped; it is
# no longer available in current `datasets` releases.)
NUM_EVAL_SAMPLES = 100  # how many validation sentences are scored
prediction = []
reference = []
for i in range(min(NUM_EVAL_SAMPLES, len(val_data))):
    pair = val_data[i]["translation"]
    prediction.append(translate(transformer, pair[config["source_language"]]))
    # One-element list per sample: the nested form is what the BLEU cell passes on.
    reference.append([pair[config["target_language"]]])
# Flat list of reference strings, used by the METEOR and BERTScore cells.
references = [r[0] for r in reference]

In [None]:
# By this point the bare name `evaluate` refers to the validation-loss function
# defined earlier in this notebook, so `evaluate.load` would raise AttributeError.
# The metrics package is reached through its `hf_evaluate` alias from the imports cell.
bleu = hf_evaluate.load("bleu")
results = bleu.compute(predictions=prediction, references=reference)
print(results)

In [None]:
# `evaluate` is shadowed by the validation-loss function defined earlier in this
# notebook; use the `hf_evaluate` alias of the metrics package instead.
meteor = hf_evaluate.load("meteor")
results = meteor.compute(predictions=prediction, references=references)
print(results)

In [None]:
# `evaluate` is shadowed by the validation-loss function defined earlier in this
# notebook; use the `hf_evaluate` alias of the metrics package instead.
bertScore = hf_evaluate.load("bertscore")
results = bertScore.compute(predictions=prediction, references=references, lang="en")
print(results)