 T5 google model

In [1]:

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, Trainer, TrainingArguments
from datasets import load_dataset
import torch
tokenizer = T5Tokenizer.from_pretrained('google/mt5-large')
model = T5ForConditionalGeneration.from_pretrained('google/mt5-large')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [2]:
dataset = load_dataset('csv', data_files={'translation': './dataset_for_training.csv'}, encoding='utf-8')


In [3]:
print(dataset['translation'][0])


{'Mot en fr': 'saisir', 'Daridja arabe': 'حكم'}


In [4]:
def tokenize_function(example):
    source_tokens = tokenizer.encode_plus(
        example["Mot en fr"],  
        max_length=20,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    target_tokens = tokenizer.encode_plus(
        example["Daridja arabe"], 
        max_length=20,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    return {
        "input_ids": source_tokens["input_ids"].flatten(),
        "attention_mask": source_tokens["attention_mask"].flatten(),
        "labels": target_tokens["input_ids"].flatten()
    }


In [5]:
example = dataset["translation"][0]  
tokenized_example = tokenize_function(example)
print(tokenized_example)


{'input_ids': tensor([  327, 45144,     1,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([  259, 18197,     1,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])}


In [6]:
tokenized_dataset = dataset["translation"].filter(lambda x: x["Mot en fr"] is not None and x["Daridja arabe"] is not None)
tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=False)

In [7]:
from datasets import Dataset

tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Mot en fr', 'Daridja arabe', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14954
    })
    test: Dataset({
        features: ['Mot en fr', 'Daridja arabe', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3739
    })
})

In [9]:


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    bleu_score = corpus_bleu([[ref] for ref in labels_str], pred_str)

    return {"bleu_score": bleu_score}


In [10]:
resume_checkpoint_dir = "./Modal1"

training_args = TrainingArguments(
    output_dir="./Modal1",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    logging_dir="./logs",
    save_steps=500,  
    overwrite_output_dir=False,  
    resume_from_checkpoint=resume_checkpoint_dir  
)


In [11]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,)

In [12]:


trainer.train()

  0%|          | 0/18700 [00:00<?, ?it/s]

{'loss': 7.6189, 'grad_norm': 2.1504368782043457, 'learning_rate': 4.8663101604278076e-05, 'epoch': 0.27}
{'loss': 0.7574, 'grad_norm': 0.910918653011322, 'learning_rate': 4.732620320855615e-05, 'epoch': 0.53}
{'loss': 0.681, 'grad_norm': 0.7778192162513733, 'learning_rate': 4.598930481283423e-05, 'epoch': 0.8}
{'loss': 0.6446, 'grad_norm': 0.9133714437484741, 'learning_rate': 4.4652406417112304e-05, 'epoch': 1.07}
{'loss': 0.5912, 'grad_norm': 1.6034926176071167, 'learning_rate': 4.331550802139038e-05, 'epoch': 1.34}
{'loss': 0.5784, 'grad_norm': 0.7972973585128784, 'learning_rate': 4.197860962566845e-05, 'epoch': 1.6}
{'loss': 0.5728, 'grad_norm': 1.0540673732757568, 'learning_rate': 4.0641711229946525e-05, 'epoch': 1.87}
{'loss': 0.5353, 'grad_norm': 1.4703896045684814, 'learning_rate': 3.93048128342246e-05, 'epoch': 2.14}
{'loss': 0.5068, 'grad_norm': 1.052030086517334, 'learning_rate': 3.796791443850268e-05, 'epoch': 2.41}
{'loss': 0.5069, 'grad_norm': 1.119246006011963, 'learning

TrainOutput(global_step=18700, training_loss=0.6244368425665054, metrics={'train_runtime': 133181.6806, 'train_samples_per_second': 1.123, 'train_steps_per_second': 0.14, 'total_flos': 1.746864549888e+16, 'train_loss': 0.6244368425665054, 'epoch': 10.0})