# 1. Train The Model

## 1.1 Import the module

In [1]:
import torch 
import torch.nn as nn 


In [8]:
from datasets import load_dataset                    # we will use the dataset 
from tokenizers import Tokenizer                      # tokenizer library in huggingface
from tokenizers.models import WordLevel                # we will use the word lavel tokenizer
from tokenizers.trainers import WordLevelTrainer        # that class will train the tokenizer 
from tokenizers.pre_tokenizers import Whitespace         # we will split the word according to the white space


In [None]:
from pathlib import Path

## 1.2 Tokenizers
- Most of the code below is taken from the Hugging Face Tokenizers quicktour: https://huggingface.co/docs/tokenizers/quicktour

In [None]:
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``.

    Each item of ``ds`` is indexed as ``item['translation'][lang]``; only the
    sentence for the requested language is produced, one per item, in dataset
    order. This is the sentence stream the tokenizer for ``lang`` is trained on.
    """
    yield from (pair['translation'][lang] for pair in ds)

In [None]:
def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, or train and save it.

    Args:
        config: Model configuration. ``config['tokenizer_file']`` is a format
            string for the tokenizer path; its placeholder is filled with
            ``lang`` so every language gets its own file.
        ds: Dataset whose items are indexed as ``item['translation'][lang]``
            (it is passed straight to ``get_all_sentences``).
        lang: Language key the tokenizer is built for.

    Returns:
        The ``Tokenizer`` loaded from the file if it already exists, otherwise
        a newly trained one (which is also written to that file).
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously trained tokenizer when its file is already on disk.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Words missing from the vocabulary are mapped to the "[UNK]" token.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    # The pre-tokenizer must be an *instance*; assigning the bare class (as the
    # previous code did) does not configure whitespace splitting.
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        special_tokens=[
            "[UNK]",  # unknown word
            "[PAD]",  # padding
            "[SOS]",  # start of sentence
            "[EOS]",  # end of sentence
        ],
        # A word must occur at least twice to enter the vocabulary.
        min_frequency=2,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)

    # Make sure the target directory exists so that saving cannot fail on a
    # missing folder when 'tokenizer_file' points into a sub-directory.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer
        
        

## 1.3 Get Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
from dataset import BilingualDataset, causal_mask

- Note: the line
    ```train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])```

    creates the dataset object, `BilingualDataset`. It is defined in the `dataset.py` file; see that file for its implementation.


In [None]:
def get_ds(config):
    """Build the train/validation dataloaders and both tokenizers.

    Args:
        config: Model configuration. The keys read here are ``'lang_src'``,
            ``'lang_tgt'``, ``'seq_len'`` and ``'batch_size'``; ``config`` is
            also forwarded to ``get_or_build_tokenizer``.

    Returns:
        A tuple ``(train_dataloader, val_dataloader, tokenizer_src,
        tokenizer_tgt)``.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    # Only the 'train' split is requested from the hub; the train/validation
    # division is done below. The subset name is "<source>-<target>", so the
    # language pair can be changed purely through the configuration.
    ds_raw = load_dataset('opus_books', f"{lang_src}-{lang_tgt}", split='train')

    # Build (or load from disk) one tokenizer per language.
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    # Keep 90% for training and the remaining 10% for validation.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    # random_split takes the *sizes* of the parts. The previous code passed the
    # not-yet-assigned result names here, which raised before any split happened.
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # Wrap the raw splits in BilingualDataset (imported from dataset.py), which
    # is what the dataloaders below iterate over.
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    # Report the longest tokenized sentence on each side of the full dataset.
    max_len_src = 0
    max_len_tgt = 0
    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][lang_src]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][lang_tgt]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation is run one sentence at a time.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt


