In [63]:
import functools
import gzip
import os
import random
import re
import shutil
import unicodedata
import urllib.request
import zipfile
from collections import Counter
from typing import List, Dict, Tuple, Optional, Iterator

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

In [64]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [65]:
# Use one fixed seed for every random number generator seeded in this notebook.
seed = 42
# Python's built-in `random` module.
random.seed(seed)
# NumPy's global random state.
np.random.seed(seed)
# PyTorch; the call returns the generator object, which is what the cell displays.
torch.manual_seed(seed)

<torch._C.Generator at 0x127fda8d0>

## Data Preprocessing

In [12]:
# Base URL of the raw Multi30K files; a file name from the mappings below is appended to it.
MULTI30K_URL = "https://github.com/multi30k/dataset/raw/master/data/task1/raw"

# One mapping per split: language code -> gzipped file name.
TRAIN_FILES = dict(de="train.de.gz", en="train.en.gz")
VAL_FILES = dict(de="val.de.gz", en="val.en.gz")
TEST_FILES = dict(de="test_2016_flickr.de.gz", en="test_2016_flickr.en.gz")

In [13]:
# Directory the Multi30K files are downloaded into and read back from.
DEFAULT_DATA_DIR = "data/multi30k"
# Names of the spaCy pipelines handed to `spacy.load` (German, then English).
SPACY_DE_MODEL = "de_core_news_sm"
SPACY_EN_MODEL = "en_core_web_sm"

### 1. Downloading the datasets from Multi30K

In [15]:
def download_and_extract_data(data_dir):
    """Download every Multi30K split file and gunzip it into ``data_dir``.

    For each file name in TRAIN_FILES, VAL_FILES and TEST_FILES the archive is
    fetched from ``MULTI30K_URL``, decompressed next to it (same name without
    the trailing ``.gz``) and the archive is deleted. Files whose extracted
    version already exists are skipped.

    Args:
        data_dir: Target directory; created if it does not exist.

    Raises:
        OSError: If a download or decompression fails (``urllib`` network
            errors are ``OSError`` subclasses). No partial output is left
            behind in that case, so a later call retries the file.
    """
    os.makedirs(data_dir, exist_ok=True)

    for files in (TRAIN_FILES, VAL_FILES, TEST_FILES):
        for filename in files.values():
            url = f"{MULTI30K_URL}/{filename}"
            archive_path = os.path.join(data_dir, filename)
            # Drop only the final extension; a plain str.replace('.gz', '') on
            # the whole path would also rewrite a '.gz' inside `data_dir`.
            extracted_path, _ = os.path.splitext(archive_path)

            # Skip if file already exists
            if os.path.exists(extracted_path):
                print("File already exists")
                continue

            # Decompress into a temporary name and rename at the end, so an
            # interrupted run never leaves a truncated file that the
            # "already exists" check above would accept.
            partial_path = extracted_path + ".part"
            try:
                urllib.request.urlretrieve(url, archive_path)
                with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
                    # Stream in chunks instead of reading the whole archive into memory.
                    shutil.copyfileobj(f_in, f_out)
                os.replace(partial_path, extracted_path)
            finally:
                # Remove the .gz archive (and any partial output on failure).
                for leftover in (archive_path, partial_path):
                    if os.path.exists(leftover):
                        os.remove(leftover)

In [16]:
# Fetch the corpus into the default data directory.
download_and_extract_data(data_dir=DEFAULT_DATA_DIR)

### 2. Loading the spaCy models

In [17]:
def load_spacy_models():
    """Load the German and English spaCy pipelines.

    Returns:
        A ``(de_nlp, en_nlp)`` tuple with the pipelines named by
        ``SPACY_DE_MODEL`` and ``SPACY_EN_MODEL``, in that order.
    """
    model_names = (SPACY_DE_MODEL, SPACY_EN_MODEL)
    return tuple(spacy.load(model_name) for model_name in model_names)

In [18]:
# Load both spaCy pipelines once; the tokenization cell below reuses them for every split.
de_nlp, en_nlp = load_spacy_models()

### 3. Reading and tokenizing the data

In [20]:
# Special tokens: padding, start-of-sentence, end-of-sentence, unknown word.
PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN = "<pad>", "<sos>", "<eos>", "<unk>"

In [21]:
def preprocess_text(text):
    """Normalize a raw sentence before tokenization.

    The text is lowercased, NFKD-normalized, has every whitespace run
    collapsed to a single space, and is stripped at both ends.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned sentence string.
    """
    # NFKD is a *decomposing* form: e.g. 'ä' becomes 'a' followed by U+0308.
    decomposed = unicodedata.normalize('NFKD', text.lower())
    single_spaced = re.sub(r'\s+', ' ', decomposed)
    return single_spaced.strip()

In [22]:
def tokenize_sentence(sentence, spacy_model, max_length = None):
    """Preprocess and tokenize one sentence, wrapped in SOS/EOS markers.

    Args:
        sentence: Raw sentence string.
        spacy_model: Callable returning an iterable of objects with a
            ``.text`` attribute (a spaCy pipeline).
        max_length: Optional cap on the length of the returned list,
            *including* the two markers. ``None`` disables truncation.

    Returns:
        ``[SOS_TOKEN, *tokens, EOS_TOKEN]``. The markers are always present,
        so the result has at least two items even when ``max_length < 2``.
    """
    # Preprocess the sentence
    sentence = preprocess_text(sentence)

    # Tokenize
    tokens = [token.text for token in spacy_model(sentence)]

    # Truncate if necessary, leaving room for the two markers.
    if max_length is not None:
        # Clamp at zero: a negative budget would turn the slice below into
        # "drop the last N tokens" instead of "keep nothing".
        budget = max(max_length - 2, 0)
        tokens = tokens[:budget]

    # Add SOS and EOS tokens
    return [SOS_TOKEN] + tokens + [EOS_TOKEN]

In [23]:
def read_and_tokenize_data(data_dir, split, de_nlp, en_nlp, max_length = None):
    """Read one split of the parallel corpus and tokenize both sides.

    Args:
        data_dir: Directory holding the extracted (non-gzipped) files.
        split: One of ``'train'``, ``'val'`` or ``'test'``.
        de_nlp: Tokenizer passed to ``tokenize_sentence`` for German lines.
        en_nlp: Tokenizer passed to ``tokenize_sentence`` for English lines.
        max_length: Forwarded to ``tokenize_sentence``.

    Returns:
        ``(tokenized_de, tokenized_en)``: two lists of token lists, aligned
        line by line.

    Raises:
        ValueError: If ``split`` is unknown, or if the two files do not have
            the same number of lines (the pairs would be misaligned).
    """
    split_files = {'train': TRAIN_FILES, 'val': VAL_FILES, 'test': TEST_FILES}
    if split not in split_files:
        # Fail here with a clear message instead of printing and then
        # crashing later on an unassigned path variable.
        raise ValueError(f"Invalid split: {split}")
    files = split_files[split]

    def extracted_path(lang):
        # The files on disk are the archive names without the '.gz' suffix.
        name = files[lang]
        if name.endswith('.gz'):
            name = name[:-len('.gz')]
        return os.path.join(data_dir, name)

    # Read files
    with open(extracted_path('de'), 'r', encoding='utf-8') as f:
        de_sentences = f.readlines()

    with open(extracted_path('en'), 'r', encoding='utf-8') as f:
        en_sentences = f.readlines()

    # zip() would silently drop the surplus lines of the longer file.
    if len(de_sentences) != len(en_sentences):
        raise ValueError(
            f"Line count mismatch for split {split!r}: "
            f"{len(de_sentences)} German vs {len(en_sentences)} English lines"
        )

    # Tokenize sentences
    tokenized_de = []
    tokenized_en = []

    for de_sent, en_sent in tqdm(zip(de_sentences, en_sentences), total=len(de_sentences)):
        tokenized_de.append(tokenize_sentence(de_sent, de_nlp, max_length))
        tokenized_en.append(tokenize_sentence(en_sent, en_nlp, max_length))

    return tokenized_de, tokenized_en

In [26]:
# Tokenize all three splits, capping each sentence at 100 tokens (markers included).
tokenized_de_train, tokenized_en_train = read_and_tokenize_data(
    DEFAULT_DATA_DIR, 'train', de_nlp, en_nlp, max_length=100
)
tokenized_de_val, tokenized_en_val = read_and_tokenize_data(
    DEFAULT_DATA_DIR, 'val', de_nlp, en_nlp, max_length=100
)
tokenized_de_test, tokenized_en_test = read_and_tokenize_data(
    DEFAULT_DATA_DIR, 'test', de_nlp, en_nlp, max_length=100
)

100%|████████████████████████████████████| 29000/29000 [03:08<00:00, 153.87it/s]
100%|██████████████████████████████████████| 1014/1014 [00:06<00:00, 156.77it/s]
100%|██████████████████████████████████████| 1000/1000 [00:06<00:00, 154.58it/s]


### 4. Build Vocabularies

In [43]:
class Vocabulary:
    """Two-way mapping between tokens and integer indices for one language.

    The four special tokens are registered first, in the order PAD, SOS, EOS,
    UNK. Other tokens are counted with ``add_tokens`` and only enter the
    mapping when ``build`` is called and their count reaches ``min_freq``.
    """

    def __init__(self, language, min_freq = 2):
        self.language = language
        self.min_freq = min_freq
        self.word2idx = {}
        self.idx2word = {}
        self.word_freq = Counter()
        self.specials = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]

        for special in self.specials:
            self.add_token(special)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index."""
        if token in self.word2idx:
            return self.word2idx[token]

        self.word2idx[token] = len(self.word2idx)
        self.idx2word[len(self.idx2word)] = token
        return self.word2idx[token]

    def add_tokens(self, tokens):
        """Add the tokens of one sentence to the frequency counter."""
        self.word_freq.update(tokens)

    def build(self):
        """Register every counted token whose frequency is at least ``min_freq``."""
        for word, count in self.word_freq.items():
            if count >= self.min_freq:
                self.add_token(word)

    def __len__(self):
        return len(self.word2idx)

    def token_to_idx(self, token):
        """Return the index of ``token``, or the UNK index if it is not registered."""
        unk_index = self.word2idx[UNK_TOKEN]
        return self.word2idx.get(token, unk_index)

    def tokens_to_indices(self, tokens):
        """Map a sequence of tokens to a list of indices."""
        return list(map(self.token_to_idx, tokens))

    def idx_to_token(self, idx):
        """Return the token stored at ``idx``, or UNK_TOKEN if the index is unknown."""
        return self.idx2word.get(idx, UNK_TOKEN)

    def indices_to_tokens(self, indices):
        """Map a sequence of indices to a list of tokens."""
        return list(map(self.idx_to_token, indices))

In [44]:
def build_vocabularies(tokenized_de, tokenized_en, min_freq = 2):
    """Build the German and English vocabularies from tokenized sentences.

    Args:
        tokenized_de: Iterable of German token lists.
        tokenized_en: Iterable of English token lists.
        min_freq: Passed to each ``Vocabulary`` as its ``min_freq``.

    Returns:
        A ``(de_vocab, en_vocab)`` tuple of built ``Vocabulary`` objects.
    """
    def make_vocab(language, sentences):
        # Count one sentence at a time, then build the token/index mapping.
        vocab = Vocabulary(language=language, min_freq=min_freq)
        for sentence in sentences:
            vocab.add_tokens(sentence)
        vocab.build()
        return vocab

    return make_vocab('de', tokenized_de), make_vocab('en', tokenized_en)

In [45]:
# Vocabularies are built from the training split only.
de_vocab, en_vocab = build_vocabularies(
    tokenized_de=tokenized_de_train,
    tokenized_en=tokenized_en_train,
    min_freq=2,
)

### 5. Convert tokens to indices

In [46]:
def convert_to_indices(tokenized_sentences, vocab):
    """Encode each token list with ``vocab.tokens_to_indices``.

    Args:
        tokenized_sentences: Iterable of token lists.
        vocab: Object exposing ``tokens_to_indices(tokens)``.

    Returns:
        A list with one encoded sentence per input sentence, in order.
    """
    encoded_sentences = []
    for sentence_tokens in tokenized_sentences:
        encoded_sentences.append(vocab.tokens_to_indices(sentence_tokens))
    return encoded_sentences

In [47]:
# Encode every split; German uses de_vocab, English uses en_vocab.
de_indices_train, en_indices_train = (
    convert_to_indices(tokenized_de_train, de_vocab),
    convert_to_indices(tokenized_en_train, en_vocab),
)
de_indices_val, en_indices_val = (
    convert_to_indices(tokenized_de_val, de_vocab),
    convert_to_indices(tokenized_en_val, en_vocab),
)
de_indices_test, en_indices_test = (
    convert_to_indices(tokenized_de_test, de_vocab),
    convert_to_indices(tokenized_en_test, en_vocab),
)

### 6. Creating Data Loaders

In [55]:
class TranslationDataset(Dataset):
    """Map-style dataset of (source, target) index sequences.

    The vocabularies are stored on the instance but are not used by any
    method of this class. The dataset length is the number of source
    sentences.
    """

    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab):
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two ``torch.long`` tensors."""
        pair = (self.source_sentences[idx], self.target_sentences[idx])
        source, target = (torch.tensor(ids, dtype=torch.long) for ids in pair)
        return source, target

In [56]:
def collate_fn(batch, pad_idx):
    """Pad a batch of (source, target) tensor pairs to common lengths.

    Pairs are ordered by source length, longest first, and source and target
    are each padded with ``pad_idx`` up to the longest sequence on their side.

    Args:
        batch: Sequence of ``(source, target)`` 1-D tensor pairs. It is not
            modified.
        pad_idx: Fill value used for padding on both sides.

    Returns:
        ``(padded_source, padded_target, source_lengths, target_lengths)``.
        The padded tensors are batch-first; the length tensors are
        ``torch.long`` and follow the same (source-sorted) order, so
        ``target_lengths`` is not necessarily descending.
    """
    # sorted() builds a new list: the caller's batch keeps its order, and any
    # sequence type (not just list) is accepted. The sort is stable, so ties
    # keep their incoming order exactly as list.sort() would.
    ordered = sorted(batch, key=lambda pair: len(pair[0]), reverse=True)

    # Separate source and target sequences
    source_seqs, target_seqs = zip(*ordered)

    # Pad sequences
    padded_source = torch.nn.utils.rnn.pad_sequence(
        source_seqs, batch_first=True, padding_value=pad_idx
    )
    padded_target = torch.nn.utils.rnn.pad_sequence(
        target_seqs, batch_first=True, padding_value=pad_idx
    )

    # Unpadded lengths, as tensors
    source_lengths = torch.tensor([len(seq) for seq in source_seqs], dtype=torch.long)
    target_lengths = torch.tensor([len(seq) for seq in target_seqs], dtype=torch.long)

    return padded_source, padded_target, source_lengths, target_lengths

In [57]:
def create_data_loaders(de_indices_train, en_indices_train, de_indices_val,
                        en_indices_val, de_indices_test, en_indices_test,
                        de_vocab: "Vocabulary", en_vocab: "Vocabulary",
                        batch_size = 64, shuffle = True):
    """Wrap the three encoded splits in ``DataLoader`` objects.

    German sequences are the source side and English the target side.

    Args:
        de_indices_train, en_indices_train: Encoded training sentences.
        de_indices_val, en_indices_val: Encoded validation sentences.
        de_indices_test, en_indices_test: Encoded test sentences.
        de_vocab: German vocabulary; its PAD index is used to pad *both*
            sides of every batch.
        en_vocab: English vocabulary (stored on the datasets).
        batch_size: Batch size for all three loaders.
        shuffle: Whether the training loader shuffles. Validation and test
            loaders never shuffle.

    Returns:
        ``(train_loader, val_loader, test_loader)``.
    """
    # Resolve the padding index once rather than on every batch.
    pad_idx = de_vocab.token_to_idx(PAD_TOKEN)
    # A partial over the named collate function can be pickled, unlike a
    # lambda; that matters as soon as loader worker processes are enabled.
    collate = functools.partial(collate_fn, pad_idx=pad_idx)

    def make_loader(de_indices, en_indices, shuffle_batches):
        # One dataset + loader per split, all sharing the same collate function.
        dataset = TranslationDataset(de_indices, en_indices, de_vocab, en_vocab)
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle_batches,
            collate_fn=collate,
        )

    train_loader = make_loader(de_indices_train, en_indices_train, shuffle)
    val_loader = make_loader(de_indices_val, en_indices_val, False)
    test_loader = make_loader(de_indices_test, en_indices_test, False)

    return train_loader, val_loader, test_loader


In [59]:
# Batch all three splits; only the training loader shuffles (function default).
train_loader, val_loader, test_loader = create_data_loaders(
    de_indices_train=de_indices_train,
    en_indices_train=en_indices_train,
    de_indices_val=de_indices_val,
    en_indices_val=en_indices_val,
    de_indices_test=de_indices_test,
    en_indices_test=en_indices_test,
    de_vocab=de_vocab,
    en_vocab=en_vocab,
    batch_size=64,
)

### 7. Create Embeddings

In [61]:
def prepare_embeddings(de_vocab, en_vocab, embedding_dim = 300):
    """Create randomly initialized embedding matrices for both vocabularies.

    Each matrix has one row per vocabulary entry, drawn from a standard
    normal distribution, with the PAD token's row set to zeros.

    Args:
        de_vocab: German vocabulary (needs ``len()`` and ``token_to_idx``).
        en_vocab: English vocabulary (same requirements).
        embedding_dim: Number of columns in each matrix.

    Returns:
        ``(de_embeddings, en_embeddings)`` float tensors.
    """
    # Draw both matrices first, German then English, so the random stream is
    # consumed in a fixed order.
    tables = [torch.randn(len(vocab), embedding_dim) for vocab in (de_vocab, en_vocab)]

    # Zero the padding row of each matrix.
    for vocab, table in zip((de_vocab, en_vocab), tables):
        table[vocab.token_to_idx(PAD_TOKEN)] = 0.0

    de_embeddings, en_embeddings = tables
    return de_embeddings, en_embeddings

In [66]:
# 300-dimensional random embeddings for both vocabularies.
de_embeddings, en_embeddings = prepare_embeddings(
    de_vocab=de_vocab,
    en_vocab=en_vocab,
    embedding_dim=300,
)

In [67]:
# Display the German embedding matrix returned by prepare_embeddings.
de_embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0318,  0.1016,  1.3433,  ...,  0.3130,  0.8050, -1.1134],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        ...,
        [-0.1072, -2.1045, -1.8351,  ...,  0.3758, -0.1355, -0.7026],
        [-0.0394,  0.3005,  1.2208,  ...,  1.6449,  0.1213,  1.3730],
        [ 0.0093, -0.3865, -1.1337,  ...,  1.0761,  0.3917, -1.0943]])

### 8. Save processed data

In [72]:
def save_data(de_vocab, en_vocab, de_embeddings, en_embeddings, output_dir = "data/processed_data"):
    """Serialize both vocabularies and embedding matrices with ``torch.save``.

    Four files are written into ``output_dir``: ``de_vocab.pt``,
    ``en_vocab.pt``, ``de_embeddings.pt`` and ``en_embeddings.pt``.

    Args:
        de_vocab: German vocabulary object.
        en_vocab: English vocabulary object.
        de_embeddings: German embedding tensor.
        en_embeddings: English embedding tensor.
        output_dir: Destination directory, created if missing. The default is
            the directory this notebook saves to. The earlier default glued
            two path strings together with no separator, which produced
            ``data/multi30kdata/processed_data``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the vocabularies are pickled as whole Python objects, so
    # loading them needs their class to be importable, and recent PyTorch
    # versions presumably need ``torch.load(..., weights_only=False)`` -- confirm
    # against the PyTorch version in use.
    artifacts = {
        "de_vocab.pt": de_vocab,
        "en_vocab.pt": en_vocab,
        "de_embeddings.pt": de_embeddings,
        "en_embeddings.pt": en_embeddings,
    }
    for filename, obj in artifacts.items():
        torch.save(obj, os.path.join(output_dir, filename))

In [73]:
# Persist the vocabularies and embeddings for later notebooks.
save_data(
    de_vocab,
    en_vocab,
    de_embeddings,
    en_embeddings,
    output_dir="data/processed_data",
)