# Seq2Seq Machine Translation with Attention

## 1. Preprocessing

Dataset link: https://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz

In [27]:
# Check the size of both datasets: the German and English files are paired
# line by line further down, so both must contain the same number of lines.
# The encoding is passed explicitly; otherwise open() falls back to the
# platform default, which may fail or mis-decode the non-ASCII German text
# (the corpus is expected to be UTF-8).
with open("./data/unprocessed/europarl-v7.de-en.de", encoding="utf-8") as file:
    ger = [line.rstrip() for line in file]
with open("./data/unprocessed/europarl-v7.de-en.en", encoding="utf-8") as file:
    eng = [line.rstrip() for line in file]

print(len(eng))
print(len(ger))

1920209
1920209


In [12]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files take effect without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from seq2seq_attention.preprocess import get_parallel_csv

# ">" is used as the separator because it does not occur in the text, which
# makes it unambiguous for separating the two halves of each sentence pair.
# The 7 occurrences of ">" in the raw data were removed by hand beforehand.
get_parallel_csv(
    path_1="./data/unprocessed/europarl-v7.de-en.de",
    path_2="./data/unprocessed/europarl-v7.de-en.en",
    new_file_path="./data/processed/en_ger_full.csv",
    delimiter=">",
)

File successfully created.
11289 lines were removed.


In [20]:
# Filter out sentence pairs with too few words (threshold given by min_length)
from seq2seq_attention.preprocess import remove_short_sentences

remove_short_sentences(
    data_dir="./data/processed/en_ger_full.csv",
    min_length=4,
    delimiter=">",
    new_file_path="./data/processed/en_ger_full_removed_sent_len.csv",
)

File successfully created.
15555 sentence-pairs were removed.


In [22]:
from seq2seq_attention.preprocess import train_test_split

# Split the length-filtered corpus with a fixed seed; the resulting
# train/val/test CSV files are read back from ./data/processed afterwards.
train_test_split(
    file_path="./data/processed/en_ger_full_removed_sent_len.csv",
    sep=">",
    random_seed=118,
    dir="./data/processed",
)

All files successfully created.


In [25]:
# Check files: read the three splits back in as header-less, ">"-separated
# two-column frames (columns "ger" and "eng")
import pandas as pd

train, val, test = (
    pd.read_csv(csv_path, header=None, sep=">", names=["ger", "eng"])
    for csv_path in (
        "./data/processed/train.csv",
        "./data/processed/val.csv",
        "./data/processed/test.csv",
    )
)

In [26]:
# Number of sentence pairs in each split
train_len, val_len, test_len = len(train), len(val), len(test)

print(f"Train: {train_len}, Val: {val_len}, Test: {test_len}")
print(f"Total: {sum((train_len, val_len, test_len))}")
# 11289 and 15555 are the counts reported as removed by the two preprocessing
# steps above; adding them back should give the raw corpus line count.
print(f"Total + Removed: {sum((train_len, val_len, test_len, 11289, 15555))}")

Train: 1514692, Val: 189336, Test: 189337
Total: 1893365
Total + Removed: 1920209


In [253]:
# Preview the first rows of the training split (columns: ger, eng)
train.head()

Unnamed: 0,ger,eng
0,Gegenwärtig hat jeder fünfte Mensch auf der We...,"At present, one in five people across the glo..."
1,Leider entsprechen die Entscheidungen des Rate...,"Sadly, the Council's decisions are not always..."
2,Dies ist das Prinzip vom 11. März 2011.,This is the principle of 11 March 2011.
3,In Bezug auf den Inhalt des Dossiers möchte ic...,"Regarding the substance of the dossier, witho..."
4,Ich warne vor Vereinfachungen. Drei Vereinfach...,"I would warn against over-simplification, thr..."


In [254]:
# Preview the first rows of the validation split (columns: ger, eng)
val.head()

Unnamed: 0,ger,eng
0,Handelsbeziehungen und Gipfelkonferenz EU/USA,EU/USA summit and trade relations
1,"Die Versuche gehen ja nun schon lange, sie sin...",Efforts have been going on for a long time no...
2,"Ein Beispiel: für Fische wie den Seehecht, der...","To give you an example, fish such as hake 11 ..."
3,Wenn Sie mit der gleichen Energie und mit der ...,If you act at European level with the same en...
4,"Ich muss alle, die diesen Änderungsantrag vorg...",I have to ask those who proposed the amendmen...


In [121]:
# Preview the first rows of the test split (columns: ger, eng)
test.head()

Unnamed: 0,ger,eng
0,Möge er so weitermachen!,May he continue to do so.
1,"Es geht jedoch um einen neuen Ansatz, über den...","However, it is about a new approach that I re..."
2,Diese spontanen Beweise der Solidarität waren ...,This spontaneous show of solidarity has been ...
3,Die beiden schwierigeren Punkte betrafen die A...,The two more difficult points concerned the a...
4,Alle leben noch in Angst.,Everyone is still afraid.


## 2. Initialize Dataloaders

In [28]:
from seq2seq_attention.build_dataloaders import (
    build_bucket_iterator,
    build_fields,
    build_vocab,
    get_datasets,
)

# Batch size and device handed to the bucket iterators
BATCH_SIZE, DEVICE = 100, "cpu"

# Source/target fields first, then the three datasets read from the processed CSV splits
src_field, trg_field = build_fields()
train_set, val_set, test_set = get_datasets(
    train_path="./data/processed/train.csv",
    val_path="./data/processed/val.csv",
    test_path="./data/processed/test.csv",
    src_field=src_field,
    trg_field=trg_field,
)

# Only the training set is passed in for building the vocabularies
build_vocab(src_field=src_field, trg_field=trg_field, train_set=train_set)

In [None]:
# TODO: check vocabulary (this cell is still empty)


In [29]:
# One bucket iterator per split, all sharing the same batch size and device
train_loader, val_loader, test_loader = (
    build_bucket_iterator(dataset=split_set, batch_size=BATCH_SIZE, device=DEVICE)
    for split_set in (train_set, val_set, test_set)
)

In [30]:
# Create an iterator over the training loader; sample batches are drawn from it with next()
iterator = iter(train_loader)

In [44]:
# Draw the next batch; .src and .trg are each indexed as [0] and [1]
# (padded token indices and per-example lengths, judging by the printed shapes)
example = next(iterator)
src_batch, trg_batch = example.src, example.trg
print(*(part.shape for part in src_batch[:2]))
print(*(part.shape for part in trg_batch[:2]))

torch.Size([100, 6]) torch.Size([100])
torch.Size([100, 16]) torch.Size([100])


In [45]:
# Second element of the source tuple: the per-example sequence lengths
print(src_batch[1])

tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6])


In [46]:
# Second element of the target tuple: the per-example sequence lengths
print(trg_batch[1])

tensor([ 6,  6,  8,  8,  7,  7,  6,  5,  8,  6,  7,  6,  8,  8,  8, 10,  9,  7,
         4,  6,  6,  6,  4,  5, 14,  7,  6,  9,  6,  8,  4,  6,  6,  8,  8,  5,
         6,  6,  7,  6,  6,  6,  7,  4,  7, 10,  5,  6,  6,  5,  7,  6,  6,  4,
         6,  6,  7,  5,  6,  5,  6,  8,  6,  6,  6, 11, 16,  6,  7,  6,  9,  6,
         8,  6,  6,  4,  5,  6,  4,  4,  6,  6,  6,  7,  5,  5,  7,  6,  7,  9,
         9,  6,  6,  4,  6,  7,  6, 13, 11,  8])


In [51]:
# vocab.itos is indexed by token id and yields the corresponding token string
for j in range(5):
    # Every token is prefixed with a single space, so each printed line starts with one
    src = "".join(" " + src_field.vocab.itos[i] for i in src_batch[0][j])
    print(src)
    trg = "".join(" " + trg_field.vocab.itos[i] for i in trg_batch[0][j])
    print(trg)
    print()
# The second element of each batch tuple is the real (unpadded) length that we pass to the packed sequence!

 <sos> was steckt dahinter ? <eos>
 <sos> for what purpose ? <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

 <sos> was das ist ? <eos>
 <sos> what is that ? <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

 <sos> wollen wir das ? <eos>
 <sos> is that what we want ? <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

 <sos> mittelübertragungen : siehe protokoll <eos>
 <sos> transfers of appropriations : see minutes <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

 <sos> eine krise reicht . <eos>
 <sos> one crisis is enough . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

