# 1. Train The Model

## 1.1 Import the module

In [1]:
import torch 
import torch.nn as nn 


In [8]:
from datasets import load_dataset                    # we will use the dataset 
from tokenizers import Tokenizer                      # tokenizer library in huggingface
from tokenizers.models import WordLevel                # we will use the word lavel tokenizer
from tokenizers.trainers import WordLevelTrainer        # that class will train the tokenizer 
from tokenizers.pre_tokenizers import Whitespace         # we will split the word according to the white space


In [None]:
from pathlib import Path

## 1.2 Tokenizers
- Most of the code is taken from https://huggingface.co/docs/tokenizers/quicktour

In [None]:
# Iterate through the dataset and yield every sentence that belongs to the
# particular language for which we are creating the tokenizer.
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``.

    Each item of ``ds`` is indexed as ``item['translation'][lang]``, so only
    the sentence for the requested language is produced.
    """
    yield from (pair['translation'][lang] for pair in ds)

In [None]:
def get_or_build_tokenizer(config,     # configuration of our model
                           ds,          # dataset
                           lang):        # language the tokenizer is built for
    """Load the word-level tokenizer for ``lang``, training it first if needed.

    Args:
        config: Mapping with a ``'tokenizer_file'`` entry, a format-string
            path template that is filled in with ``lang``.
        ds: Iterable dataset whose items are indexed as
            ``item['translation'][lang]``.
        lang: Language key the tokenizer is built or loaded for.

    Returns:
        The ``Tokenizer`` loaded from the saved file when it exists; otherwise
        a newly trained tokenizer, which is also saved to that file.
    """
    # File where the tokenizer for this language is (or will be) stored.
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # A tokenizer was already trained and saved: reuse it.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Words not present in the vocabulary are mapped to the "[UNK]" token.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    # The pre-tokenizer must be an instance, not the class itself.
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        special_tokens=["[UNK]",     # unknown word
                        "[PAD]",     # padding
                        "[SOS]",     # start of the sentence
                        "[EOS]"],    # end of the sentence
        min_frequency=2)   # a word needs frequency >= 2 to enter the vocabulary

    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)

    # Make sure the destination directory exists before saving.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer
        
        