In [3]:
import torch 
from torch.utils.data import random_split,DataLoader
import import_ipynb
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from pathlib import Path
from dataset_py import BillingualDataset,causal_mask
from model import build_transformer
from config import get_config , get_weights_file_path
from  tqdm import tqdm
import warnings

2025-10-20 13:54:18.533848: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-20 13:54:18.571865: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-20 13:54:19.831035: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
def get_sentances(ds,lang):
    for item in ds:
        yield item["translation"][lang]

In [5]:
def get_or_build_tokenizer(config,ds , lang):
    tokenizer_path = Path(config["tokenizer_path"].format(lang))
    if not tokenizer_path.exists():
        tokenizer = Tokenizer(model= WordLevel(unk_token= "[UNK]"))
        tokenizer.pre_tokenizer = Whitespace() 
        trainer = WordLevelTrainer(special_tokens=["[UNK]","[PAD]" , "[SOS]","[EOS]"],min_frequency=2)
        tokenizer.train_from_iterator(iterator=get_sentances(ds,lang),trainer=trainer)
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer
        

In [6]:
def get_ds(config):
    lang_src = config["lang_src"]
    lang_tgt = config["lang_tgt"]
    len_seq = config["seq_len"]
    
    ds_raw = load_dataset(name='opus_books', subset = f"{lang_src}-{lang_tgt}", split="train")
    
    #We define two tokenizer (each language has different tokens)
    tokenizer_src = get_or_build_tokenizer(config,ds_raw,lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config,ds_raw,lang_tgt)
    
    train_size = int(0.9 * len(ds_raw))
    val_size = int(0.1 * len(ds_raw))
    train_ds_raw , val_ds_raw = random_split(dataset=ds_raw,size = [train_size,val_size])

    train_ds = BillingualDataset(ds =train_ds_raw,src_lang= lang_src , tgt_lang= lang_tgt,src_tokenizer= tokenizer_src, tgt_tokenizer=tokenizer_tgt , max_len=len_seq)
    val_ds = BillingualDataset(ds =val_ds_raw,src_lang= lang_src , tgt_lang= lang_tgt,src_tokenizer= tokenizer_src, tgt_tokenizer=tokenizer_tgt , max_len=len_seq)

    #Find the max len sentance 
    max_len_src = 0
    max_len_tgt = 0 
    for item in ds_raw:
        src_idc = tokenizer_src.encode(item["translation"][lang_src]).ids
        tgt_idc = tokenizer_src.encode(item["translation"][lang_tgt]).ids
        
        max_len_src = max(max_len_src, len(src_idc))
        max_tgt_idc = max(max_len_tgt,len(tgt_idc))
        
        print(f'Max len source : {max_len_src}')
        print(f'Max len tgt : {max_len_tgt}')
        
    train_dataloader = DataLoader(train_ds,batch_size=config["batch_size"], shuffle=True)
    val_dataloader = DataLoader(val_ds,batch_size=config["batch_size"], shuffle=True)
    
    return train_dataloader , val_dataloader , tokenizer_src , tokenizer_tgt
        



In [7]:
def get_model(config, vocab_src_len , vocab_tgt_len):
    model = build_transformer(src_vocab_size=vocab_src_len, tgt_vocab_size=vocab_tgt_len,src_seq_len=config['seq_len'],tgt_seq_len= config['seq_len'],d_model=config['d_model'])
    return model

In [10]:
def train_model(config):
    #Define in which device the training will be
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    print(f" Using device:{device}")
    
    #Create the weights folder
    Path(config["model_folder"]).mkdir(parents=True, exist_ok=True)
    
    #Load the dataset
    train_dataloader , val_dataloader , tokenizer_src , tokenizer_tgt = get_ds(config=config)
    
    #Import the model
    model = get_model(config= config,vocab_src_len= tokenizer_src.get_vocab_size(), vocab_tgt_len= tokenizer_tgt.get_vocab_size()).to(device)
    
    # Start tensoboard to visualize the loss charts
    writer = SummaryWriter(config["experiment_name"])
    
    optimizer = torch.optim.Adam(model.parameters , lr=config['lr'], eps= 1e-9)
    
    initial_epoch = 0
    global_epoch = 0
    
    if config["preload"]:
        model_filename = get_weights_file_path(config=config,epoch=config["preload"])
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        initial_epoch = state["epoch"] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state["global_step"]
        
    loss_fn= nn.CrossEntropyLoss(ignore_index= tokenizer_src.token_to_id("[PAS]"),label_smoothing=0.1).to(device)
    
    for epoch in range(initial_epoch,config["num_epochs"]):
        model.train()
        batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:0.2d}')
        for batch in batch_iterator:
            encoder_input = batch["encoder_input"].to(device) #(B , seq_len)
            decoder_input = batch["decoder_input"].to(device) #(B , seq_len)

            encoder_mask = batch["encoder_mask"]
            decoder_mask = batch["decoder_mask"]
            
            encoder_output = model.encode(input=encoder_input, src_mask=encoder_mask)
            decoder_output = model.decode(encoder_output, decoder_input, decoder_mask)
            proj_output = model.projection_layer(decoder_output)
            label = batch["label"].to(device)
            
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            
            #Write the loss
            writer.add_scalar("train_loss", loss.item(), global_step=global_step)
            writer.flush()
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            global_step += 1
    model_filename = get_weights_file_path(config, f"{epoch:02d}")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step
    }, model_filename)


if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config=config)

 Using device:cpu


TypeError: load_dataset() missing 1 required positional argument: 'path'