# Seq2Seq Machine Translation with Attention

## 1. Data exploration and preprocessing

In [13]:
# Get datasets as lists
with open("./data/europarl-v7.de-en.de") as file:
    ger = [line.rstrip() for line in file]
with open("./data/europarl-v7.de-en.en") as file:
    eng = [line.rstrip() for line in file]

In [17]:
print(len(eng))

1920209


In [18]:
len(ger)

1920209

In [19]:
words = 0
for sent in eng:
    words += len(sent.split())
print(f"Number of english words: {words}")

words = 0
for sent in ger:
    words += len(sent.split())
print(f"Number of german words: {words}")

Number of english words: 47882343
Number of german words: 44614285


In [20]:
# Tokenization - use spacy
import spacy
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_ger(text):
    """
    Take german sentence and tokenize it using spacy. 
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_eng(text):
    """
    Take english sentence and tokenize it using spacy. 
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [225]:
from torchtext.data import Field, BucketIterator

src_field = Field(init_token = '<sos>', 
            eos_token = '<eos>',
            pad_token='<pad>', 
            unk_token='<unk>',
            lower = True, 
            include_lengths = True,
            sequential=True,
            batch_first=True)

trg_field = Field(init_token = '<sos>', 
            eos_token = '<eos>',
            pad_token='<pad>', 
            unk_token='<unk>',
            lower = True, 
            include_lengths = True,
            sequential=True,batch_first=True)

In [165]:
# Tokenize data
ger_token = [tokenize_ger(sent) for sent in ger]
eng_token = [tokenize_eng(sent) for sent in eng]

In [166]:
# Vocab size 32K
# https://jlibovicky.github.io/2021/07/24/MT-Weekly-The-Wisdom-of-the-WMT-Crowd.html

In [167]:
max_vocab_size = 32000
src_field.build_vocab(ger_token, min_freq=2, max_size=max_vocab_size)
trg_field.build_vocab(eng_token,  min_freq=2, max_size=max_vocab_size)

In [163]:
# Frequency hodls all frequencies
print(len(SRC.vocab.freqs))

# ITOS/
print(len(SRC.vocab.stoi))
print(len(SRC.vocab.itos))

397884
32004
32004


In [168]:
from torch.utils.data import Dataset
class TextDtaatset(Dataset):

  def __init__(self, data):
    self.text = data
    


  def __len__(self):

    return len(self.text)


  def __getitem__(self, idx):

    return self.text[idx]

In [169]:
ger_dataset = TextDtaatset(data=ger_token)

In [177]:
# Define iterator
from torchtext.data import BucketIterator

iterator = BucketIterator.splits(datasets=ger_dataset, batch_size=2, sort_key=lambda x: len(x))

In [195]:
from torchtext.data import TabularDataset

with open('./data/train_de') as src, open('./data/train_en') as tgt:
    with open('./data/train.csv','w') as file:
        for src_sentence, tgt_sentence in zip(src, tgt):
            line = f'{src_sentence.rstrip()} , {tgt_sentence.rstrip()}'
            file.write(line)
            file.write('\n')

with open('./data/val_de') as src, open('./data/val_en') as tgt:
    with open('./data/val.csv','w') as file:
        for src_sentence, tgt_sentence in zip(src, tgt):
            line = f'{src_sentence.rstrip()} , {tgt_sentence.rstrip()}'
            file.write(line)
            file.write('\n')

In [226]:
(train_obj, valid_obj) = TabularDataset.splits(
  path="",
  train='./data/val.csv',
  validation='./data/val.csv',
  format='csv',
  fields=[('src',src_field ), ('trg', trg_field)])

In [227]:
src_field.build_vocab(train_obj, min_freq=1, max_size=max_vocab_size)
trg_field.build_vocab(train_obj,  min_freq=1, max_size=max_vocab_size)

In [240]:
train_iter = BucketIterator(
  dataset=train_obj,
  batch_size = 2,
  sort_key=lambda x: len(x.src),
  shuffle=True,
  device="cpu",)

In [241]:
example=next(iter(train_iter))
src = example.src
trg = example.trg
print(src)
print(trg)

(tensor([[ 2,  4,  6,  3,  1,  1],
        [ 2, 14,  8,  7, 10,  3]]), tensor([4, 6]))
(tensor([[ 2,  5,  9,  3,  1],
        [ 2,  7,  4, 10,  3]]), tensor([4, 5]))


In [242]:
# itos is list of token strings with their idx 
for i in src[0][0]:
    print(src_field.vocab.itos[i])
print()
for i in trg[0][0]:
    print(trg_field.vocab.itos[i])

# The second element in the tuple is the real length that we pass to the packed_seq!

<sos>
bestens
danke
<eos>
<pad>
<pad>

<sos>
fine
thanks
<eos>
<pad>


In [205]:
trg