# 1. Train The Model

## 1.1 Import the module

In [1]:
import torch 
import torch.nn as nn 


In [3]:
from config import get_config, get_weights_file_path, latest_weights_file_path

ModuleNotFoundError: No module named 'config'

In [8]:
from datasets import load_dataset                    # we will use the dataset 
from tokenizers import Tokenizer                      # tokenizer library in huggingface
from tokenizers.models import WordLevel                # we will use the word lavel tokenizer
from tokenizers.trainers import WordLevelTrainer        # that class will train the tokenizer 
from tokenizers.pre_tokenizers import Whitespace         # we will split the word according to the white space


In [None]:
from pathlib import Path

## 1.2 Tokenizers
- Most of the code is taken from https://huggingface.co/docs/tokenizers/quicktour

In [None]:
# Yield every sentence of the dataset in one particular language, so that the
# tokenizer for that language can be trained by iterating over them.
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``.

    Args:
        ds: Iterable of items indexable as ``item['translation'][lang]``.
        lang: Key selecting which language of each pair to yield.

    Yields:
        ``item['translation'][lang]`` for each item, in dataset order.
    """
    # Each item holds a pair of sentences; keep only the requested language.
    yield from (pair['translation'][lang] for pair in ds)

In [None]:
def get_or_build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training and saving it if missing.

    The tokenizer file path is ``config['tokenizer_file'].format(lang)``. If that
    file exists it is loaded as-is; otherwise a new tokenizer is trained on the
    ``lang`` sentences of ``ds`` and written to that path.

    Args:
        config: Mapping with a ``'tokenizer_file'`` format string taking the
            language as its single positional field.
        ds: Dataset whose items expose ``item['translation'][lang]``.
        lang: Language the tokenizer is built for.

    Returns:
        The loaded or freshly trained ``Tokenizer``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when one is already on disk.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Words not present in the vocabulary are mapped to the "[UNK]" token.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    # The pre-tokenizer must be an *instance*; assigning the bare class
    # (``Whitespace``) does not configure whitespace splitting.
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        special_tokens=[
            "[UNK]",  # unknown word
            "[PAD]",  # padding
            "[SOS]",  # start of sentence
            "[EOS]",  # end of sentence
        ],
        min_frequency=2,  # a word must appear at least twice to enter the vocabulary
    )

    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer
        
        

## 1.3 Get Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
from dataset import BilingualDataset, causal_mask

- Note: in the line
    ```train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])```

    we create the `BilingualDataset`; see the `BilingualDataset` class in the `dataset.py` file for its implementation.


In [None]:
def get_ds(config):
    """Load the translation corpus and build tokenizers plus train/validation loaders.

    Args:
        config: Mapping read for ``'lang_src'``, ``'lang_tgt'``, ``'seq_len'``
            and ``'batch_size'`` (plus whatever ``get_or_build_tokenizer`` reads).

    Returns:
        Tuple ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
    """
    # Only the 'train' split is requested, so the validation set is carved out
    # of it below. The subset name is "<lang_src>-<lang_tgt>" so the language
    # pair can be changed from the configuration alone.
    ds_raw = load_dataset(
        'opus_books',
        f"{config['lang_src']}-{config['lang_tgt']}",
        split='train',
    )

    # Build (or load) one tokenizer per language.
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Keep 90% for training and the remaining 10% for validation.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    # random_split takes the split *sizes*; the previous code passed the
    # not-yet-assigned result names here, which raised NameError.
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # Wrap the raw splits in BilingualDataset, which produces the items the
    # model consumes.
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Report the longest tokenized sentence per language over the whole corpus,
    # to help choose a 'seq_len' that covers every sentence.
    max_len_src = 0
    max_len_tgt = 0
    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation uses a batch size of 1 (one sentence per batch).
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt




## 1.4 Get Model

In [None]:
from model import build_transformer

In [None]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build the transformer for the given vocabulary sizes.

    Args:
        config: Mapping read for ``"seq_len"`` and ``"d_model"``.
        vocab_src_len: Source vocabulary length.
        vocab_tgt_len: Target vocabulary length.

    Returns:
        Whatever ``build_transformer`` returns for these arguments.
    """
    # The same configured sequence length is used for both source and target.
    seq_len = config["seq_len"]
    return build_transformer(
        vocab_src_len,
        vocab_tgt_len,
        seq_len,
        seq_len,
        d_model=config["d_model"],
    )

## 1.5 Train Model

In [None]:
def train_model(config):
    """Train the transformer described by ``config``, validating and checkpointing every epoch.

    Steps: pick a device, make sure the weights folder exists, build the data
    loaders and model, optionally resume from a saved checkpoint, then run the
    epoch loop (forward, loss, backward, optimizer step), logging the training
    loss to TensorBoard and saving model + optimizer state after each epoch.

    Args:
        config: Mapping read here for ``'datasource'``, ``'model_folder'``,
            ``'experiment_name'``, ``'lr'``, ``'preload'``, ``'num_epochs'``
            and ``'seq_len'``, in addition to the keys used by ``get_ds``,
            ``get_model`` and the weight-path helpers.
    """
    # NOTE(review): `tqdm` and `run_validation` are used below but are not
    # imported/defined in the visible cells -- confirm they are provided elsewhere.
    from torch.utils.tensorboard import SummaryWriter  # not imported in the visible cells

    # Define the device. MPS is selected only when the backend is actually
    # usable; a build-time flag alone (`torch.has_mps`) does not guarantee that.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device_name = "cuda"
    elif mps_backend is not None and mps_backend.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    # Build the torch.device up front: the CUDA queries below need a device,
    # not `str.index` (which is what `device.index` was while `device` was a str).
    device = torch.device(device_name)
    print("Using device:", device_name)
    if device_name == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    elif device_name == 'mps':
        print("Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # Training can be resumed after a crash: if the user specified a model to
    # preload, restore the model state, the optimizer state and the counters.
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on one device be restored on another.
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    loss_fn = nn.CrossEntropyLoss(
        # Padding positions must not contribute to the loss. Labels are target
        # tokens, so the padding id is looked up in the target tokenizer.
        ignore_index=tokenizer_tgt.token_to_id('[PAD]'),
        # Label smoothing moves a small share of the probability mass from the
        # target class to the other classes, making the model less
        # over-confident in its predictions.
        label_smoothing=0.1,
    ).to(device)

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            torch.cuda.empty_cache()
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)  # (B, seq_len), B = batch size
                decoder_input = batch['decoder_input'].to(device)  # (B, seq_len)
                # The two masks differ: the encoder mask only hides padding
                # tokens, while the decoder mask also hides, for every
                # position, all the subsequent positions.
                encoder_mask = batch['encoder_mask'].to(device)  # (B, 1, 1, seq_len)
                decoder_mask = batch['decoder_mask'].to(device)  # (B, 1, seq_len, seq_len)

                # Run the tensors through the encoder, decoder and the projection layer
                encoder_output = model.encode(encoder_input, encoder_mask)  # (B, seq_len, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)  # (B, seq_len, d_model)
                proj_output = model.project(decoder_output)  # (B, seq_len, vocab_size)

                # Compare the output with the label: for each position, the
                # label holds the vocabulary index of the expected word.
                label = batch['label'].to(device)  # (B, seq_len)

                # Compute the loss using a simple cross entropy
                # (B, seq_len, tgt_vocab_size) --> (B * seq_len, tgt_vocab_size)
                loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Log the loss
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Backpropagate the loss
                loss.backward()

                # Update the weights, then clear gradients for the next batch
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1

            # Run validation at the end of every epoch
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

            # Save the model at the end of every epoch. The optimizer state is
            # saved too: without it a resumed run would restart the optimizer's
            # per-weight statistics from zero.
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Release the TensorBoard writer even if training is interrupted.
        writer.close()



if __name__ == '__main__':
    # `warnings` is not imported by any visible cell, so import it here to
    # avoid a NameError when the script entry point runs.
    import warnings

    # Silence library warnings so they do not clutter the training output.
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)
