In [13]:
import torch 
from torch.utils.data import random_split,DataLoader
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path

In [12]:
def get_sentances(ds, lang):
    """Lazily yield the ``lang`` text of every record in ``ds``.

    Args:
        ds: Iterable of records; each record is indexed as
            ``record["translation"][lang]``.
        lang: Key selecting which side of the translation pair to emit.

    Yields:
        The value stored under ``record["translation"][lang]`` for each record,
        in iteration order.
    """
    yield from (record["translation"][lang] for record in ds)

In [11]:
def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, or train and cache it.

    Args:
        config: Mapping with a ``"tokenizer_path"`` entry, a format string whose
            positional placeholder is filled with ``lang``.
        ds: Iterable of records passed to ``get_sentances`` to supply the
            training sentences.
        lang: Language key; selects both the tokenizer file and the sentences.

    Returns:
        The ``Tokenizer`` loaded from ``tokenizer_path`` when that file exists,
        otherwise a newly trained one (which is also written to that path).
    """
    tokenizer_path = Path(config["tokenizer_path"].format(lang))
    if not tokenizer_path.exists():
        tokenizer = Tokenizer(model=WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(
            special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
            min_frequency=2,
        )
        tokenizer.train_from_iterator(iterator=get_sentances(ds, lang), trainer=trainer)
        # Persist the result; without this the `exists()` check above can never
        # succeed and the tokenizer would be retrained on every call.
        tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer
        

In [15]:
def get_ds(config):
    """Load the raw parallel corpus, prepare both tokenizers, and split it 90/10.

    Args:
        config: Mapping providing ``"lang_src"`` and ``"lang_tgt"`` (joined as
            ``"<src>-<tgt>"`` to pick the dataset configuration), plus whatever
            ``get_or_build_tokenizer`` reads from it.

    NOTE(review): as written, the tokenizers and the two splits are only bound
    to local names; nothing is returned from this function.
    """
    lang_src = config["lang_src"]
    lang_tgt = config["lang_tgt"]
    # `load_dataset` takes the dataset id as `path` and the configuration
    # (here the language pair) as `name`; it has no `subset` parameter.
    ds_raw = load_dataset(path="opus_books", name=f"{lang_src}-{lang_tgt}", split="train")

    # One tokenizer per language: source and target have different vocabularies.
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    train_size = int(0.9 * len(ds_raw))
    # Validation takes the remainder so the two lengths always sum to
    # len(ds_raw); truncating both 0.9*n and 0.1*n can drop a sample, which
    # makes `random_split` raise.
    val_size = len(ds_raw) - train_size
    # The second parameter of `random_split` is `lengths` (passed positionally).
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_size, val_size])

