In [1]:
# !conda install -c conda-forge spacy -y
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install datasets==2.18

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-ne

In [2]:
# importing required libraries
import warnings
warnings.simplefilter("ignore")
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc
print(torch.__version__)

import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.vocab import vocab
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from torch.nn import (TransformerEncoder, TransformerDecoder,TransformerEncoderLayer, TransformerDecoderLayer)
from torch import Tensor
import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


2.2.1+cu121


In [3]:
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")


def tokenizer_ger(text):
    """Split German ``text`` into a list of token strings with the ``spacy_ger`` tokenizer."""
    doc = spacy_ger.tokenizer(text)
    return [token.text for token in doc]

def tokenizer_eng(text):
    """Split English ``text`` into a list of token strings with the ``spacy_eng`` tokenizer."""
    doc = spacy_eng.tokenizer(text)
    return [token.text for token in doc]


# Load the "bentrevett/multi30k" dataset via `datasets.load_dataset`.
# Only the 'train' and 'test' splits are bound to names here; any other split
# in the returned object is not used by this cell.
multi30k = load_dataset("bentrevett/multi30k")
train, test = multi30k['train'], multi30k['test']

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Count token frequencies over the training split. Each sentence is lower-cased
# first and then tokenized ('de' = German source, 'en' = English target).
ger_counter = Counter()
eng_counter = Counter()
for data in tqdm(train):
    ger_counter.update(tokenizer_ger(data['de'].lower()))
    eng_counter.update(tokenizer_eng(data['en'].lower()))



# One vocabulary per language, built with min_freq=2 and the same four special
# tokens in the same order for both languages.
ger_vocab = vocab(ger_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
eng_vocab = vocab(eng_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
# Tokens missing from a vocabulary resolve to that vocabulary's "<unk>" index.
ger_vocab.set_default_index(ger_vocab["<unk>"])
eng_vocab.set_default_index(eng_vocab["<unk>"])
print(f"Size of German Vocab : {len(ger_vocab)}\n Size of English Vocab : {len(eng_vocab)}")


def text_transform_eng(x):
    """Tokenize English text ``x`` and return its vocabulary ids framed by <sos> ... <eos>."""
    start = [eng_vocab['<sos>']]
    body = [eng_vocab[token.lower()] for token in tokenizer_eng(x)]
    return start + body + [eng_vocab['<eos>']]


def text_transform_ger(x):
    """Tokenize German text ``x`` and return its vocabulary ids framed by <sos> ... <eos>."""
    start = [ger_vocab['<sos>']]
    body = [ger_vocab[token.lower()] for token in tokenizer_ger(x)]
    return start + body + [ger_vocab['<eos>']]


BATCH_SIZE = 128
# Special-token ids are read from the German vocabulary.
PAD_IDX, BOS_IDX, EOS_IDX = (ger_vocab[special] for special in ('<pad>', '<sos>', '<eos>'))


def collate_batch(batch):
    """Convert a list of dataset rows into padded source/target id tensors.

    Args:
        batch: iterable of rows, each providing a 'de' (source) and an 'en'
            (target) string.

    Returns:
        dict with keys "src" and "tgt". Each value is a 2-D tensor laid out as
        (longest sequence in the batch, number of rows), padded with the
        '<pad>' id of the German / English vocabulary respectively.
    """
    src_list, tgt_list = [], []
    for data in batch:
        src_list.append(torch.tensor(text_transform_ger(data['de'])))
        tgt_list.append(torch.tensor(text_transform_eng(data['en'])))

    # pad_sequence with its default batch_first=False already yields the
    # (seq_len, batch) layout. A `.T` followed by `.permute(-1, -2)` would
    # swap the two axes and swap them straight back, so neither is applied.
    return {
        "src": pad_sequence(src_list, padding_value=ger_vocab['<pad>']),
        "tgt": pad_sequence(tgt_list, padding_value=eng_vocab['<pad>']),
    }

# Training batches are reshuffled; collate_batch pads each batch to the length
# of its longest sequence.
train_dataloader = DataLoader(
    train,
    collate_fn = collate_batch,
    shuffle = True,
    batch_size = BATCH_SIZE,
    pin_memory = True
)

# Same batching for the test split, but in fixed order (shuffle = False).
test_dataloader = DataLoader(
    test,
    collate_fn = collate_batch,
    shuffle = False,
    batch_size = BATCH_SIZE,
    pin_memory = True
)


100%|██████████| 29000/29000 [00:12<00:00, 2304.36it/s]


Size of German Vocab : 7853
 Size of English Vocab : 5893


In [5]:
# transformer
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Token ids are embedded (``TokenEmbedding``), combined with positional
    encodings (``PositionalEncoding``), passed through a
    ``TransformerEncoder`` / ``TransformerDecoder`` stack and projected to
    target-vocabulary logits by ``generator``. The layers are created with the
    default ``batch_first=False``, so inputs are sequence-first.

    Args:
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        emb_size: embedding / model width (``d_model``).
        src_vocab_size: size of the source vocabulary.
        tgt_vocab_size: size of the target vocabulary.
        dim_feedforward: hidden width of the per-layer feed-forward network.
        dropout: dropout probability, applied in the positional encoding and
            in every encoder/decoder layer.
        nhead: number of attention heads. ``None`` (default) falls back to the
            module-level ``NHEAD`` constant, which must then be defined before
            the model is instantiated.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward: int = 512, dropout: float = 0.1,
                 nhead: int = None):
        super(Seq2SeqTransformer, self).__init__()
        if nhead is None:
            # Backward-compatible default: read the notebook-level constant.
            nhead = NHEAD
        # `dropout` is forwarded explicitly so that a non-default value affects
        # the attention/feed-forward layers and not only the positional encoding.
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=nhead,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Return target-vocabulary logits for every position of ``trg``.

        Args:
            src: source token ids.
            trg: target token ids fed to the decoder.
            src_mask: attention mask for the encoder self-attention.
            tgt_mask: attention mask for the decoder self-attention.
            src_padding_mask: key-padding mask for the source sequence.
            tgt_padding_mask: key-padding mask for the target sequence.
            memory_key_padding_mask: key-padding mask applied to the encoder
                output inside the decoder's cross-attention.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        # The 4th positional argument (memory_mask) is intentionally None.
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack; returns the encoder output."""
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack against ``memory``.

        The result is the decoder hidden state; ``generator`` is not applied.
        """
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [6]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings, then apply dropout.

    Args:
        emb_size: width of the embeddings (even or odd).
        dropout: dropout probability applied to the sum.
        maxlen: number of positions for which encodings are precomputed.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so the frequency vector is trimmed to fit; for even sizes the slice
        # is a no-op.
        pos_embedding[:, 1::2] = torch.cos(pos * den[:emb_size // 2])
        # Shape (maxlen, 1, emb_size): the singleton axis broadcasts over the
        # second dimension of the input in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)
        self.dropout = nn.Dropout(dropout)
        # Buffer, not parameter: follows .to(device) and is saved in the
        # state dict, but is not trained.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + encoding)``.

        The encoding is sliced by ``token_embedding.size(0)``, i.e. the first
        dimension of the input is treated as the position axis.
        """
        return self.dropout(token_embedding +
                            self.pos_embedding[:token_embedding.size(0),:])

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Cast ``tokens`` to int64, look them up and scale the vectors."""
        scale = math.sqrt(self.emb_size)
        token_ids = tokens.long()
        return self.embedding(token_ids) * scale

In [7]:
def generate_square_subsequent_mask(sz):
    """Build an additive causal mask of shape (sz, sz) on ``DEVICE``.

    Entries strictly above the main diagonal are ``-inf``; entries on and
    below it are ``0.0``.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # triu(diagonal=1) keeps only the strict upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1)
def create_mask(src, tgt):
    """Build the attention and padding masks for one (src, tgt) batch.

    The first dimension of ``src`` / ``tgt`` is taken as the sequence length.

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask) where
        src_mask is an all-False boolean square matrix, tgt_mask is the causal
        mask from ``generate_square_subsequent_mask`` and the padding masks are
        the transposed ``== PAD_IDX`` comparisons of the inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    causal_tgt_mask = generate_square_subsequent_mask(tgt_len)
    open_src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)

    src_is_pad = (src == PAD_IDX).transpose(0, 1)
    tgt_is_pad = (tgt == PAD_IDX).transpose(0, 1)
    return open_src_mask, causal_tgt_mask, src_is_pad, tgt_is_pad

In [8]:
# --- Model / training configuration ---------------------------------------
SRC_VOCAB_SIZE = len(ger_vocab)
TGT_VOCAB_SIZE = len(eng_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 50
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)

# Xavier-uniform initialisation for every parameter with more than one
# dimension (weight matrices); 1-D parameters keep their default init.
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Use the DEVICE defined in this cell: the mask helpers allocate on DEVICE,
# so the model is placed on the same device object.
transformer = transformer.to(DEVICE)

# Padding positions in the target do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)

In [9]:
def train_epoch(model, train_iter, optimizer):
    """Run one optimisation pass over ``train_iter``.

    Args:
        model: network called as ``model(src, tgt_input, *masks)``.
        train_iter: iterable of dicts with "src" and "tgt" tensors; must
            support ``len()``.
        optimizer: optimizer stepping ``model``'s parameters.

    Returns:
        Sum of the per-batch losses divided by ``len(train_iter)``.

    Side effects:
        After the pass the whole model object is written to
        "my_checkpoint.pth.tar" with ``torch.save``.
    """
    model.train()
    running_loss = 0
    for batch in train_iter:
        src = batch["src"].to(device)
        tgt = batch["tgt"].to(device)

        # Teacher forcing: the decoder sees tgt without its last row and is
        # scored against tgt without its first row.
        decoder_in, expected = tgt[:-1, :], tgt[1:, :]

        masks = create_mask(src, decoder_in)
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = masks

        logits = model(src, decoder_in, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        n_classes = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, n_classes), expected.reshape(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    torch.save(model, "my_checkpoint.pth.tar")

    return running_loss / len(train_iter)


def evaluate(model, val_iter):
    """Compute the mean per-batch loss of ``model`` over ``val_iter``.

    Args:
        model: network called as ``model(src, tgt_input, *masks)``.
        val_iter: iterable of dicts with "src" and "tgt" tensors; must
            support ``len()``.

    Returns:
        Sum of the per-batch losses divided by ``len(val_iter)``.
    """
    model.eval()
    losses = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        # Iterate the iterable that was passed in (not a module-level loader),
        # so the numerator and the len(val_iter) denominator always refer to
        # the same data.
        for data in val_iter:
            src = data["src"].to(device)
            tgt = data["tgt"].to(device)

            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)
            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
    return losses / len(val_iter)

In [10]:
# Train for 14 epochs (range(1, 15)). The value printed as "Val loss" is
# computed on test_dataloader. The reported epoch time covers train_epoch
# only; evaluation runs after end_time is taken.
for epoch in range(1, 15):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_dataloader, optimizer)
    end_time = time.time()
    val_loss = evaluate(transformer, test_dataloader)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
           f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 4.920, Val loss: 3.759, Epoch time = 34.240s
Epoch: 2, Train loss: 3.490, Val loss: 2.974, Epoch time = 32.953s
Epoch: 3, Train loss: 2.929, Val loss: 2.569, Epoch time = 32.601s
Epoch: 4, Train loss: 2.578, Val loss: 2.315, Epoch time = 32.599s
Epoch: 5, Train loss: 2.318, Val loss: 2.141, Epoch time = 32.745s
Epoch: 6, Train loss: 2.113, Val loss: 2.007, Epoch time = 32.715s
Epoch: 7, Train loss: 1.943, Val loss: 1.904, Epoch time = 33.026s
Epoch: 8, Train loss: 1.801, Val loss: 1.824, Epoch time = 33.022s
Epoch: 9, Train loss: 1.677, Val loss: 1.781, Epoch time = 33.121s
Epoch: 10, Train loss: 1.571, Val loss: 1.734, Epoch time = 33.016s
Epoch: 11, Train loss: 1.477, Val loss: 1.705, Epoch time = 33.031s
Epoch: 12, Train loss: 1.392, Val loss: 1.677, Epoch time = 34.328s
Epoch: 13, Train loss: 1.315, Val loss: 1.664, Epoch time = 33.257s
Epoch: 14, Train loss: 1.241, Val loss: 1.648, Epoch time = 33.503s


In [11]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a single source sequence.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids for one sentence, shaped (src_len, 1).
        src_mask: attention mask passed to ``model.encode``.
        max_len: maximum length of the returned sequence, start symbol
            included.
        start_symbol: id placed at position 0 of the output.

    Returns:
        Long tensor of shape (decoded_len, 1). Decoding stops early once the
        predicted id equals ``EOS_IDX`` (that id is kept in the output).
    """
    src = src.to(device)
    src_mask = src_mask.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is computed
        # and moved to the device once.
        memory = model.encode(src, src_mask).to(device)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that may not be attended to.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position.
            logits = model.generator(out[:, -1])
            _, next_word = torch.max(logits, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    """Translate one source sentence with greedy decoding.

    Args:
        model: trained model; it is switched to eval mode.
        src: source sentence as a string.
        src_vocab: source vocabulary, indexable by token and containing
            "<sos>" and "<eos>".
        tgt_vocab: target vocabulary, containing "<sos>" and providing
            ``get_itos()``.
        src_tokenizer: callable mapping the sentence to a list of tokens.

    Returns:
        The decoded tokens joined by single spaces, with the literal
        "<sos>" / "<eos>" markers removed.
    """
    model.eval()
    # Special-token ids come from the vocabularies passed in rather than from
    # module-level globals, so the function also works for other vocab pairs.
    tokens = ([src_vocab["<sos>"]]
              + [src_vocab[tok.lower()] for tok in src_tokenizer(src)]
              + [src_vocab["<eos>"]])
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(num_tokens, 1))
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    # Allow the translation to be up to 5 tokens longer than the source.
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5,
                               start_symbol=tgt_vocab["<sos>"]).flatten()
    print(tgt_tokens)
    # Fetch the index-to-token list once instead of once per decoded token.
    itos = tgt_vocab.get_itos()
    return " ".join([itos[tok] for tok in tgt_tokens]).replace("<sos>", "").replace("<eos>", "")

output = translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", ger_vocab, eng_vocab, tokenizer_ger)
print(output)

tensor([   2,   21,  243,   74,  118,   35,   17,  109,   74,  202, 3422,  672,
          14,    3], device='cuda:0')
 a group of people standing in front of an icy country . 
