# Translation

## Preparing the data

In [1]:
from datasets import load_dataset

# Load the English/French pairs through the `kde4.py` loading script.
# `trust_remote_code=True` allows that script to execute, so only run this cell
# with a copy of the script you trust.
# NOTE(review): "kde4.py" is presumably a loading script next to this notebook — confirm.
raw_datasets = load_dataset("kde4.py", lang1="en", lang2="fr", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the loaded splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [3]:
# The loaded data only has a "train" split, so hold part of it out:
# 90% stays for training and the remainder lands under "test".
# The fixed seed keeps the split reproducible across runs.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [4]:
# `train_test_split` names the held-out portion "test"; the rest of this
# notebook refers to it as "validation".
# Guarded so that re-running the cell is a no-op instead of raising KeyError
# once "test" has already been moved.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

#### Checking the pretrained Helsinki-NLP/opus-mt-en-fr model

In [5]:
from transformers import pipeline

# Pretrained English-to-French checkpoint that is fine-tuned later in this notebook.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# No `device` argument is passed, so the pipeline runs on CPU even when a GPU is available.
translator = pipeline("translation", model=model_checkpoint)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
# Reference pair at index 1 of the training split (English source, French target).
split_datasets["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [7]:
translator("Default to expanded threads")

# In the dataset, "threads" is translated as "fils de discussion", whereas the pretrained "Helsinki-NLP/opus-mt-en-fr" model leaves it as "threads".

[{'translation_text': 'Par défaut pour les threads élargis'}]

In [8]:
# Reference pair at index 172 of the training split; its English side contains the word "plugin".
split_datasets["train"][172]["translation"]

{'en': 'Unable to import %1 using the OFX importer plugin. This file is not the correct format.',
 'fr': "Impossible d'importer %1 en utilisant le module d'extension d'importation OFX. Ce fichier n'a pas un format correct."}

In [9]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

# Another example of this behavior is the word “plugin”, which isn’t officially a French word but which most native speakers will understand and not bother to translate. In the KDE4 dataset it has been translated into the more official “module d’extension”, whereas the pretrained model keeps “plugin”.

[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]

## Processing the data

In [10]:
from transformers import AutoTokenizer

# Same checkpoint as the baseline pipeline, so the tokenizer matches the model being fine-tuned.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here. It had no effect on the encodings (they come back as
# plain Python lists); pass it when calling the tokenizer if tensors are needed.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# If you are using a multilingual tokenizer such as mBART, mBART-50, or M2M100, you will need to set the language codes of your inputs and targets in the tokenizer by setting tokenizer.src_lang and tokenizer.tgt_lang to the right values.

In [11]:
# Pull both sides of the training pair at index 1, then tokenize them in a single
# call: the English text is the model input and the French text, passed as
# `text_target`, ends up under "labels".
en_sentence, fr_sentence = (
    split_datasets["train"][1]["translation"][lang] for lang in ("en", "fr")
)

inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}

In [12]:
# Tokenize the French sentence as if it were an input (no `text_target`) and
# compare the pieces with the labels produced above. The first printed line
# shows the sentence split into many more pieces than the second.
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']
['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']


In [13]:
# Longest sequence length reported by the tokenizer; used as the truncation limit below.
max_length = tokenizer.model_max_length
print(f"max_length: {max_length}")


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts
            holding an "en" and a "fr" string each.

    Returns:
        The tokenizer output for the English texts, with the French texts
        passed as `text_target`, truncated to `max_length`.
    """
    pairs = examples["translation"]
    return tokenizer(
        [pair["en"] for pair in pairs],
        text_target=[pair["fr"] for pair in pairs],
        max_length=max_length,
        truncation=True,
    )

max_length: 512


In [14]:
# Tokenize every split in batches. The original columns are dropped so that only
# the fields returned by `preprocess_function` remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

## Fine-tuning the model with the Trainer API

In [15]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence weights that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### Data collation

In [16]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads inputs and labels per batch.
# NOTE(review): the model is presumably passed so the collator can also build
# `decoder_input_ids` (they appear in the collated batch below) — confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
# Collate the training examples at indices 1 and 2 into one batch and list the fields it contains.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [18]:
batch["labels"]
# The labels have been padded with -100 up to the longest label sequence in the batch.

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [19]:
batch["decoder_input_ids"]
# Decoder input IDs: the labels shifted right by one position, with a special
# token at the beginning. Padded positions hold that same ID instead of -100.

tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [20]:
# Unpadded labels of the training examples at indices 1 and 2 — the same two
# examples that were collated into `batch` above.
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]
[1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]


In [21]:
# Look up the tokenizer's padding token and its ID, to compare with the IDs seen in the collated batch.
print("Pad token ID:", tokenizer.pad_token_id)
print("Pad token:", tokenizer.pad_token)


Pad token ID: 59513
Pad token: <pad>


In [22]:
# Check the padding ID as seen through the collator's own tokenizer reference.
print("Collator padding token ID:", data_collator.tokenizer.pad_token_id)


Collator padding token ID: 59513


In [23]:
# Check what token ID 0 corresponds to (every label sequence printed above ends with 0).
token_id = 0
token = tokenizer.convert_ids_to_tokens(token_id)
print(f"Token ID {token_id} corresponds to token: '{token}'")


Token ID 0 corresponds to token: '</s>'


### Metrics

In [24]:
!pip install sacrebleu



In [25]:
import evaluate

# SacreBLEU metric object; `compute_metrics` below uses it to score translations.
metric = evaluate.load("sacrebleu")

In [26]:
# Sanity check of the metric on a close paraphrase.
# `predictions` is a list of sentences; `references` is a list of lists because
# each prediction may be scored against several acceptable references.
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [27]:
# Failure mode 1: a prediction that just repeats one word of the reference.
# Only a single unigram is counted as matching, so the score collapses.
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [28]:
# Failure mode 2: a correct but far too short prediction.
# It has no 3-grams or 4-grams at all and receives a very small brevity
# penalty factor (`bp`), so the score is 0.
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

In [29]:
import numpy as np


def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-ID arrays.
            ``predictions`` may also be a tuple whose first element holds
            the token IDs.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the "score" entry of
        the metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a vocabulary ID, so it cannot be decoded.
    # It is present in the labels and can also show up in the predictions when
    # generated batches of different lengths are gathered, so replace it on
    # both sides with the pad token (dropped below by skip_special_tokens).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: the metric expects one list of references
    # per prediction, hence the extra level of nesting on the labels.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

### Fine-tuning the model

In [30]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

# Training configuration. Checkpoints go to a directory separate from the final
# model, which is saved explicitly after training.
# No evaluation strategy is set: the default is "no", which is what we want here
# because the model is evaluated manually once before and once after training.
# Leaving the keyword out also avoids the deprecated `evaluation_strategy`
# spelling, which newer transformers releases no longer accept.
args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-fr-checkpoints",
    save_strategy="epoch",  # Save checkpoints at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,  # Limit the number of saved checkpoints
    num_train_epochs=10,
    predict_with_generate=True,  # needed to generate the translations
    fp16=True,
)

# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=`; switch once the installed version is pinned.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [31]:
# Baseline: score the pretrained model on the validation split before any fine-tuning.
trainer.evaluate(max_length=max_length)

100%|██████████| 329/329 [17:22<00:00,  3.17s/it]


{'eval_loss': 1.7000652551651,
 'eval_model_preparation_time': 0.002,
 'eval_bleu': 38.973943774115114,
 'eval_runtime': 1051.0115,
 'eval_samples_per_second': 19.998,
 'eval_steps_per_second': 0.313}

In [32]:
# Fine-tune the model. This is the long-running step of the notebook
# (about 2 h 15 min in the recorded run below).
trainer.train()

  1%|          | 502/59120 [01:12<1:22:01, 11.91it/s] 

{'loss': 1.3916, 'grad_norm': 7.633892059326172, 'learning_rate': 1.9831190798376187e-05, 'epoch': 0.08}


  2%|▏         | 1000/59120 [02:22<1:27:25, 11.08it/s]

{'loss': 1.2408, 'grad_norm': 4.5788655281066895, 'learning_rate': 1.9662043301759137e-05, 'epoch': 0.17}


  3%|▎         | 1500/59120 [03:24<2:37:07,  6.11it/s] 

{'loss': 1.2028, 'grad_norm': 5.243614673614502, 'learning_rate': 1.9492895805142083e-05, 'epoch': 0.25}


  3%|▎         | 2001/59120 [04:52<4:06:37,  3.86it/s] 

{'loss': 1.1534, 'grad_norm': 3.415882110595703, 'learning_rate': 1.9323748308525033e-05, 'epoch': 0.34}


  4%|▍         | 2501/59120 [05:46<1:33:43, 10.07it/s]

{'loss': 1.1093, 'grad_norm': 3.2337677478790283, 'learning_rate': 1.9154600811907986e-05, 'epoch': 0.42}


  5%|▌         | 3001/59120 [06:49<1:13:33, 12.72it/s]

{'loss': 1.0754, 'grad_norm': 3.4433956146240234, 'learning_rate': 1.8985453315290936e-05, 'epoch': 0.51}


  6%|▌         | 3501/59120 [07:53<1:26:35, 10.70it/s] 

{'loss': 1.0467, 'grad_norm': 4.641908168792725, 'learning_rate': 1.8816305818673886e-05, 'epoch': 0.59}


  7%|▋         | 4000/59120 [08:56<1:30:58, 10.10it/s]

{'loss': 1.0355, 'grad_norm': 3.7774765491485596, 'learning_rate': 1.8647158322056836e-05, 'epoch': 0.68}


  8%|▊         | 4503/59120 [10:07<1:13:28, 12.39it/s] 

{'loss': 1.0421, 'grad_norm': 3.130302906036377, 'learning_rate': 1.8478349120433018e-05, 'epoch': 0.76}


  8%|▊         | 5000/59120 [11:22<7:13:17,  2.08it/s] 

{'loss': 1.0158, 'grad_norm': 2.876955509185791, 'learning_rate': 1.8309201623815968e-05, 'epoch': 0.85}


  9%|▉         | 5502/59120 [12:45<1:48:37,  8.23it/s] 

{'loss': 1.0, 'grad_norm': 3.218794107437134, 'learning_rate': 1.8140054127198918e-05, 'epoch': 0.93}


 10%|█         | 6003/59120 [13:56<1:08:29, 12.93it/s] 

{'loss': 0.9665, 'grad_norm': 2.9098243713378906, 'learning_rate': 1.7971244925575103e-05, 'epoch': 1.01}


 11%|█         | 6502/59120 [15:05<1:07:14, 13.04it/s] 

{'loss': 0.8815, 'grad_norm': 2.203686475753784, 'learning_rate': 1.7802097428958052e-05, 'epoch': 1.1}


 12%|█▏        | 7002/59120 [16:01<1:08:45, 12.63it/s]

{'loss': 0.9049, 'grad_norm': 3.389420986175537, 'learning_rate': 1.7632949932341002e-05, 'epoch': 1.18}


 13%|█▎        | 7502/59120 [17:15<1:59:48,  7.18it/s] 

{'loss': 0.9059, 'grad_norm': 2.778923749923706, 'learning_rate': 1.7463802435723952e-05, 'epoch': 1.27}


 14%|█▎        | 8003/59120 [18:23<1:04:06, 13.29it/s] 

{'loss': 0.8858, 'grad_norm': 5.216675758361816, 'learning_rate': 1.7294993234100137e-05, 'epoch': 1.35}


 14%|█▍        | 8502/59120 [19:33<1:34:54,  8.89it/s] 

{'loss': 0.8881, 'grad_norm': 3.9376771450042725, 'learning_rate': 1.7125845737483087e-05, 'epoch': 1.44}


 15%|█▌        | 9001/59120 [20:40<1:15:00, 11.14it/s] 

{'loss': 0.8885, 'grad_norm': 3.8835389614105225, 'learning_rate': 1.6956698240866037e-05, 'epoch': 1.52}


 16%|█▌        | 9501/59120 [21:38<1:03:40, 12.99it/s] 

{'loss': 0.8774, 'grad_norm': 3.470211982727051, 'learning_rate': 1.6787550744248987e-05, 'epoch': 1.61}


 17%|█▋        | 10000/59120 [22:51<1:41:49,  8.04it/s]

{'loss': 0.872, 'grad_norm': 3.03437876701355, 'learning_rate': 1.6618403247631937e-05, 'epoch': 1.69}


 18%|█▊        | 10502/59120 [24:13<1:00:01, 13.50it/s] 

{'loss': 0.8883, 'grad_norm': 3.5217363834381104, 'learning_rate': 1.6449255751014887e-05, 'epoch': 1.78}


 19%|█▊        | 11001/59120 [25:21<1:26:20,  9.29it/s] 

{'loss': 0.8614, 'grad_norm': 3.965338706970215, 'learning_rate': 1.6280446549391072e-05, 'epoch': 1.86}


 19%|█▉        | 11502/59120 [26:30<1:04:53, 12.23it/s]

{'loss': 0.8682, 'grad_norm': 2.316436767578125, 'learning_rate': 1.6111299052774022e-05, 'epoch': 1.95}


 20%|██        | 12002/59120 [27:34<1:11:01, 11.06it/s]

{'loss': 0.8286, 'grad_norm': 3.9571282863616943, 'learning_rate': 1.5942151556156972e-05, 'epoch': 2.03}


 21%|██        | 12502/59120 [28:46<1:54:39,  6.78it/s]

{'loss': 0.7808, 'grad_norm': 3.676339864730835, 'learning_rate': 1.577300405953992e-05, 'epoch': 2.11}


 22%|██▏       | 13002/59120 [29:57<1:05:13, 11.78it/s] 

{'loss': 0.7811, 'grad_norm': 2.8704166412353516, 'learning_rate': 1.560385656292287e-05, 'epoch': 2.2}


 23%|██▎       | 13501/59120 [31:11<1:21:16,  9.36it/s] 

{'loss': 0.8, 'grad_norm': 4.323342800140381, 'learning_rate': 1.5434709066305818e-05, 'epoch': 2.28}


 24%|██▎       | 14001/59120 [32:15<1:16:14,  9.86it/s] 

{'loss': 0.79, 'grad_norm': 2.5053164958953857, 'learning_rate': 1.5265561569688768e-05, 'epoch': 2.37}


 25%|██▍       | 14502/59120 [33:16<2:52:13,  4.32it/s]

{'loss': 0.7849, 'grad_norm': 3.5477893352508545, 'learning_rate': 1.509641407307172e-05, 'epoch': 2.45}


 25%|██▌       | 15000/59120 [34:28<3:50:20,  3.19it/s] 

{'loss': 0.7839, 'grad_norm': 3.699144124984741, 'learning_rate': 1.4927604871447903e-05, 'epoch': 2.54}


 26%|██▌       | 15502/59120 [35:38<53:23, 13.62it/s]   

{'loss': 0.799, 'grad_norm': 2.969682455062866, 'learning_rate': 1.4758795669824088e-05, 'epoch': 2.62}


 27%|██▋       | 16002/59120 [36:45<1:19:35,  9.03it/s]

{'loss': 0.7799, 'grad_norm': 4.236715316772461, 'learning_rate': 1.4589648173207038e-05, 'epoch': 2.71}


 28%|██▊       | 16502/59120 [37:49<1:15:35,  9.40it/s] 

{'loss': 0.7756, 'grad_norm': 3.2528302669525146, 'learning_rate': 1.4420500676589988e-05, 'epoch': 2.79}


 29%|██▉       | 17001/59120 [38:54<1:03:44, 11.01it/s]

{'loss': 0.7837, 'grad_norm': 4.050159454345703, 'learning_rate': 1.4251353179972938e-05, 'epoch': 2.88}


 30%|██▉       | 17503/59120 [40:15<2:52:01,  4.03it/s]

{'loss': 0.7907, 'grad_norm': 3.53711199760437, 'learning_rate': 1.4082543978349121e-05, 'epoch': 2.96}


 30%|███       | 18000/59120 [41:22<48:35, 14.10it/s]  

{'loss': 0.7533, 'grad_norm': 4.171680927276611, 'learning_rate': 1.3913396481732071e-05, 'epoch': 3.04}


 31%|███▏      | 18500/59120 [42:27<49:41, 13.62it/s]  

{'loss': 0.724, 'grad_norm': 3.026613712310791, 'learning_rate': 1.3744248985115021e-05, 'epoch': 3.13}


 32%|███▏      | 19002/59120 [43:42<1:20:35,  8.30it/s]

{'loss': 0.7289, 'grad_norm': 3.7827978134155273, 'learning_rate': 1.357510148849797e-05, 'epoch': 3.21}


 33%|███▎      | 19501/59120 [44:36<52:06, 12.67it/s]  

{'loss': 0.7078, 'grad_norm': 2.7544713020324707, 'learning_rate': 1.3405953991880922e-05, 'epoch': 3.3}


 34%|███▍      | 20002/59120 [45:44<1:46:10,  6.14it/s] 

{'loss': 0.7154, 'grad_norm': 2.6823747158050537, 'learning_rate': 1.3236806495263872e-05, 'epoch': 3.38}


 35%|███▍      | 20501/59120 [46:49<58:19, 11.04it/s]  

{'loss': 0.7314, 'grad_norm': 3.3478825092315674, 'learning_rate': 1.3067658998646822e-05, 'epoch': 3.47}


 36%|███▌      | 21000/59120 [47:51<52:16, 12.15it/s]  

{'loss': 0.726, 'grad_norm': 3.5908212661743164, 'learning_rate': 1.289851150202977e-05, 'epoch': 3.55}


 36%|███▋      | 21500/59120 [49:07<8:32:00,  1.22it/s] 

{'loss': 0.7285, 'grad_norm': 3.1159133911132812, 'learning_rate': 1.2729702300405956e-05, 'epoch': 3.64}


 37%|███▋      | 22001/59120 [50:24<3:04:06,  3.36it/s]

{'loss': 0.7162, 'grad_norm': 3.3147387504577637, 'learning_rate': 1.2560554803788905e-05, 'epoch': 3.72}


 38%|███▊      | 22502/59120 [51:36<47:22, 12.88it/s]  

{'loss': 0.7233, 'grad_norm': 3.7307050228118896, 'learning_rate': 1.2391407307171854e-05, 'epoch': 3.81}


 39%|███▉      | 23001/59120 [52:44<1:59:45,  5.03it/s]

{'loss': 0.7159, 'grad_norm': 2.382382392883301, 'learning_rate': 1.2222259810554804e-05, 'epoch': 3.89}


 40%|███▉      | 23501/59120 [53:51<1:43:27,  5.74it/s]

{'loss': 0.722, 'grad_norm': 4.039222717285156, 'learning_rate': 1.2053112313937754e-05, 'epoch': 3.97}


 41%|████      | 24000/59120 [55:01<54:41, 10.70it/s]  

{'loss': 0.6786, 'grad_norm': 3.7908201217651367, 'learning_rate': 1.188430311231394e-05, 'epoch': 4.06}


 41%|████▏     | 24503/59120 [56:09<1:27:46,  6.57it/s]

{'loss': 0.6735, 'grad_norm': 3.7970995903015137, 'learning_rate': 1.171515561569689e-05, 'epoch': 4.14}


 42%|████▏     | 25001/59120 [57:17<1:12:19,  7.86it/s]

{'loss': 0.6736, 'grad_norm': 4.007111549377441, 'learning_rate': 1.1546008119079838e-05, 'epoch': 4.23}


 43%|████▎     | 25501/59120 [58:26<2:40:28,  3.49it/s]

{'loss': 0.664, 'grad_norm': 3.4510090351104736, 'learning_rate': 1.1376860622462788e-05, 'epoch': 4.31}


 44%|████▍     | 26003/59120 [59:34<42:52, 12.88it/s]  

{'loss': 0.6724, 'grad_norm': 3.279106378555298, 'learning_rate': 1.1207713125845738e-05, 'epoch': 4.4}


 45%|████▍     | 26501/59120 [1:00:40<46:06, 11.79it/s]  

{'loss': 0.6696, 'grad_norm': 2.7026331424713135, 'learning_rate': 1.1038565629228688e-05, 'epoch': 4.48}


 46%|████▌     | 27002/59120 [1:01:56<1:01:38,  8.68it/s]

{'loss': 0.6738, 'grad_norm': 3.247185230255127, 'learning_rate': 1.0869418132611638e-05, 'epoch': 4.57}


 47%|████▋     | 27502/59120 [1:03:15<46:11, 11.41it/s]  

{'loss': 0.6908, 'grad_norm': 3.6047909259796143, 'learning_rate': 1.0700608930987821e-05, 'epoch': 4.65}


 47%|████▋     | 28001/59120 [1:04:19<40:10, 12.91it/s]  

{'loss': 0.6687, 'grad_norm': 4.114670753479004, 'learning_rate': 1.0531461434370771e-05, 'epoch': 4.74}


 48%|████▊     | 28502/59120 [1:05:27<42:03, 12.13it/s]  

{'loss': 0.6632, 'grad_norm': 2.717122793197632, 'learning_rate': 1.0362313937753723e-05, 'epoch': 4.82}


 49%|████▉     | 29002/59120 [1:06:34<55:10,  9.10it/s]  

{'loss': 0.6842, 'grad_norm': 2.6075901985168457, 'learning_rate': 1.0193166441136673e-05, 'epoch': 4.91}


 50%|████▉     | 29501/59120 [1:07:32<39:42, 12.43it/s]  

{'loss': 0.6767, 'grad_norm': 3.6151123046875, 'learning_rate': 1.0024018944519623e-05, 'epoch': 4.99}


 51%|█████     | 30002/59120 [1:08:49<37:32, 12.93it/s]  

{'loss': 0.6441, 'grad_norm': 3.7179670333862305, 'learning_rate': 9.855209742895806e-06, 'epoch': 5.07}


 52%|█████▏    | 30502/59120 [1:10:07<47:26, 10.06it/s]  

{'loss': 0.6274, 'grad_norm': 3.9154396057128906, 'learning_rate': 9.686062246278756e-06, 'epoch': 5.16}


 52%|█████▏    | 31001/59120 [1:11:19<42:41, 10.98it/s]  

{'loss': 0.6214, 'grad_norm': 2.7849080562591553, 'learning_rate': 9.516914749661706e-06, 'epoch': 5.24}


 53%|█████▎    | 31501/59120 [1:12:23<35:01, 13.14it/s]  

{'loss': 0.6362, 'grad_norm': 3.1513593196868896, 'learning_rate': 9.347767253044656e-06, 'epoch': 5.33}


 54%|█████▍    | 32002/59120 [1:13:35<45:48,  9.87it/s]  

{'loss': 0.6366, 'grad_norm': 3.0636239051818848, 'learning_rate': 9.178619756427606e-06, 'epoch': 5.41}


 55%|█████▍    | 32502/59120 [1:14:46<50:12,  8.84it/s]  

{'loss': 0.635, 'grad_norm': 3.542881727218628, 'learning_rate': 9.009810554803789e-06, 'epoch': 5.5}


 56%|█████▌    | 33002/59120 [1:15:54<43:13, 10.07it/s]  

{'loss': 0.629, 'grad_norm': 2.9938108921051025, 'learning_rate': 8.840663058186739e-06, 'epoch': 5.58}


 57%|█████▋    | 33500/59120 [1:16:59<2:07:21,  3.35it/s]

{'loss': 0.6424, 'grad_norm': 3.608818769454956, 'learning_rate': 8.67151556156969e-06, 'epoch': 5.67}


 58%|█████▊    | 34001/59120 [1:18:11<39:55, 10.48it/s]  

{'loss': 0.6354, 'grad_norm': 4.858671188354492, 'learning_rate': 8.50236806495264e-06, 'epoch': 5.75}


 58%|█████▊    | 34503/59120 [1:19:17<35:51, 11.44it/s]  

{'loss': 0.6365, 'grad_norm': 3.254009246826172, 'learning_rate': 8.333220568335589e-06, 'epoch': 5.84}


 59%|█████▉    | 35000/59120 [1:20:20<32:14, 12.47it/s]  

{'loss': 0.6343, 'grad_norm': 2.389611005783081, 'learning_rate': 8.164073071718539e-06, 'epoch': 5.92}


 60%|██████    | 35500/59120 [1:21:21<42:35,  9.24it/s]  

{'loss': 0.6305, 'grad_norm': 3.2198381423950195, 'learning_rate': 7.994925575101489e-06, 'epoch': 6.0}


 61%|██████    | 36000/59120 [1:22:24<31:52, 12.09it/s]  

{'loss': 0.597, 'grad_norm': 2.834723711013794, 'learning_rate': 7.82577807848444e-06, 'epoch': 6.09}


 62%|██████▏   | 36501/59120 [1:23:29<1:16:18,  4.94it/s]

{'loss': 0.5914, 'grad_norm': 2.7054672241210938, 'learning_rate': 7.656630581867388e-06, 'epoch': 6.17}


 63%|██████▎   | 37003/59120 [1:24:39<28:58, 12.72it/s]  

{'loss': 0.6084, 'grad_norm': 2.8164889812469482, 'learning_rate': 7.487483085250339e-06, 'epoch': 6.26}


 63%|██████▎   | 37501/59120 [1:25:44<44:32,  8.09it/s]  

{'loss': 0.5947, 'grad_norm': 3.42501163482666, 'learning_rate': 7.3186738836265225e-06, 'epoch': 6.34}


 64%|██████▍   | 38002/59120 [1:26:51<32:48, 10.73it/s]  

{'loss': 0.604, 'grad_norm': 3.881469249725342, 'learning_rate': 7.149526387009473e-06, 'epoch': 6.43}


 65%|██████▌   | 38502/59120 [1:27:49<31:26, 10.93it/s]  

{'loss': 0.5896, 'grad_norm': 3.2387328147888184, 'learning_rate': 6.980378890392423e-06, 'epoch': 6.51}


 66%|██████▌   | 39002/59120 [1:28:52<26:15, 12.77it/s]  

{'loss': 0.5985, 'grad_norm': 3.245598316192627, 'learning_rate': 6.811231393775373e-06, 'epoch': 6.6}


 67%|██████▋   | 39500/59120 [1:30:16<36:12,  9.03it/s]  

{'loss': 0.6207, 'grad_norm': 4.7686448097229, 'learning_rate': 6.642422192151556e-06, 'epoch': 6.68}


 68%|██████▊   | 40001/59120 [1:31:39<30:03, 10.60it/s]  

{'loss': 0.6164, 'grad_norm': 3.3545920848846436, 'learning_rate': 6.473274695534507e-06, 'epoch': 6.77}


 69%|██████▊   | 40501/59120 [1:32:50<26:14, 11.83it/s]  

{'loss': 0.5994, 'grad_norm': 3.037534713745117, 'learning_rate': 6.304127198917457e-06, 'epoch': 6.85}


 69%|██████▉   | 41003/59120 [1:34:02<22:35, 13.36it/s]  

{'loss': 0.6249, 'grad_norm': 4.4626784324646, 'learning_rate': 6.134979702300406e-06, 'epoch': 6.94}


 70%|███████   | 41503/59120 [1:35:07<35:07,  8.36it/s]  

{'loss': 0.5985, 'grad_norm': 2.8611857891082764, 'learning_rate': 5.96617050067659e-06, 'epoch': 7.02}


 71%|███████   | 42002/59120 [1:36:31<27:00, 10.56it/s]  

{'loss': 0.5841, 'grad_norm': 3.028613805770874, 'learning_rate': 5.79702300405954e-06, 'epoch': 7.1}


 72%|███████▏  | 42501/59120 [1:37:48<41:33,  6.67it/s]  

{'loss': 0.586, 'grad_norm': 2.902698040008545, 'learning_rate': 5.627875507442491e-06, 'epoch': 7.19}


 73%|███████▎  | 43002/59120 [1:39:08<21:48, 12.32it/s]  

{'loss': 0.5869, 'grad_norm': 2.2707433700561523, 'learning_rate': 5.45872801082544e-06, 'epoch': 7.27}


 74%|███████▎  | 43501/59120 [1:40:25<2:30:41,  1.73it/s]

{'loss': 0.5888, 'grad_norm': 3.4720981121063232, 'learning_rate': 5.289918809201624e-06, 'epoch': 7.36}


 74%|███████▍  | 44002/59120 [1:41:26<30:41,  8.21it/s]  

{'loss': 0.576, 'grad_norm': 2.8364577293395996, 'learning_rate': 5.120771312584573e-06, 'epoch': 7.44}


 75%|███████▌  | 44501/59120 [1:42:28<21:59, 11.08it/s]  

{'loss': 0.5759, 'grad_norm': 3.178103446960449, 'learning_rate': 4.951623815967524e-06, 'epoch': 7.53}


 76%|███████▌  | 45003/59120 [1:43:28<20:20, 11.57it/s]  

{'loss': 0.5742, 'grad_norm': 3.5063467025756836, 'learning_rate': 4.782476319350474e-06, 'epoch': 7.61}


 77%|███████▋  | 45502/59120 [1:44:30<30:08,  7.53it/s]  

{'loss': 0.5912, 'grad_norm': 2.37205171585083, 'learning_rate': 4.613328822733424e-06, 'epoch': 7.7}


 78%|███████▊  | 46001/59120 [1:45:35<29:46,  7.34it/s]  

{'loss': 0.575, 'grad_norm': 3.2511661052703857, 'learning_rate': 4.444181326116374e-06, 'epoch': 7.78}


 79%|███████▊  | 46500/59120 [1:46:47<42:17,  4.97it/s]  

{'loss': 0.5724, 'grad_norm': 3.2974693775177, 'learning_rate': 4.275033829499324e-06, 'epoch': 7.87}


 80%|███████▉  | 47001/59120 [1:47:49<18:59, 10.63it/s]  

{'loss': 0.5745, 'grad_norm': 3.180819511413574, 'learning_rate': 4.105886332882274e-06, 'epoch': 7.95}


 80%|████████  | 47502/59120 [1:48:56<23:52,  8.11it/s]  

{'loss': 0.5697, 'grad_norm': 2.4791033267974854, 'learning_rate': 3.937077131258458e-06, 'epoch': 8.03}


 81%|████████  | 48001/59120 [1:50:12<16:07, 11.49it/s]  

{'loss': 0.556, 'grad_norm': 3.5898892879486084, 'learning_rate': 3.7679296346414073e-06, 'epoch': 8.12}


 82%|████████▏ | 48502/59120 [1:51:28<14:58, 11.81it/s]  

{'loss': 0.5726, 'grad_norm': 2.7320892810821533, 'learning_rate': 3.5987821380243577e-06, 'epoch': 8.2}


 83%|████████▎ | 49002/59120 [1:52:31<16:35, 10.17it/s]  

{'loss': 0.5603, 'grad_norm': 3.3177103996276855, 'learning_rate': 3.429634641407307e-06, 'epoch': 8.29}


 84%|████████▎ | 49501/59120 [1:53:43<13:35, 11.79it/s]  

{'loss': 0.558, 'grad_norm': 2.1732170581817627, 'learning_rate': 3.2604871447902575e-06, 'epoch': 8.37}


 85%|████████▍ | 50001/59120 [1:54:57<27:47,  5.47it/s]  

{'loss': 0.5631, 'grad_norm': 3.1351735591888428, 'learning_rate': 3.091339648173207e-06, 'epoch': 8.46}


 85%|████████▌ | 50502/59120 [1:56:01<11:48, 12.16it/s]  

{'loss': 0.5486, 'grad_norm': 2.601547956466675, 'learning_rate': 2.9221921515561573e-06, 'epoch': 8.54}


 86%|████████▋ | 51001/59120 [1:57:10<11:43, 11.54it/s]  

{'loss': 0.5706, 'grad_norm': 4.160942554473877, 'learning_rate': 2.753382949932341e-06, 'epoch': 8.63}


 87%|████████▋ | 51501/59120 [1:58:18<10:44, 11.81it/s]  

{'loss': 0.5683, 'grad_norm': 3.348295211791992, 'learning_rate': 2.5842354533152914e-06, 'epoch': 8.71}


 88%|████████▊ | 52002/59120 [1:59:19<16:01,  7.40it/s]

{'loss': 0.564, 'grad_norm': 3.148343563079834, 'learning_rate': 2.415087956698241e-06, 'epoch': 8.8}


 89%|████████▉ | 52502/59120 [2:00:31<11:08,  9.90it/s]  

{'loss': 0.5725, 'grad_norm': 3.285578489303589, 'learning_rate': 2.246278755074425e-06, 'epoch': 8.88}


 90%|████████▉ | 53002/59120 [2:01:29<09:39, 10.57it/s]  

{'loss': 0.567, 'grad_norm': 3.201730251312256, 'learning_rate': 2.077131258457375e-06, 'epoch': 8.96}


 90%|█████████ | 53503/59120 [2:02:37<07:08, 13.09it/s]  

{'loss': 0.553, 'grad_norm': 3.166001796722412, 'learning_rate': 1.907983761840325e-06, 'epoch': 9.05}


 91%|█████████▏| 54002/59120 [2:03:43<11:25,  7.46it/s]  

{'loss': 0.5614, 'grad_norm': 3.105032444000244, 'learning_rate': 1.7388362652232748e-06, 'epoch': 9.13}


 92%|█████████▏| 54500/59120 [2:04:45<1:02:38,  1.23it/s]

{'loss': 0.5553, 'grad_norm': 4.7028937339782715, 'learning_rate': 1.5696887686062248e-06, 'epoch': 9.22}


 93%|█████████▎| 55002/59120 [2:06:02<09:07,  7.52it/s]  

{'loss': 0.552, 'grad_norm': 3.5488646030426025, 'learning_rate': 1.4005412719891747e-06, 'epoch': 9.3}


 94%|█████████▍| 55500/59120 [2:07:16<09:43,  6.20it/s]

{'loss': 0.5483, 'grad_norm': 2.8257858753204346, 'learning_rate': 1.2313937753721246e-06, 'epoch': 9.39}


 95%|█████████▍| 56001/59120 [2:08:28<29:28,  1.76it/s]

{'loss': 0.5476, 'grad_norm': 4.221645355224609, 'learning_rate': 1.0622462787550745e-06, 'epoch': 9.47}


 96%|█████████▌| 56501/59120 [2:09:44<04:55,  8.88it/s]

{'loss': 0.5494, 'grad_norm': 3.1773674488067627, 'learning_rate': 8.930987821380243e-07, 'epoch': 9.56}


 96%|█████████▋| 57002/59120 [2:10:48<04:28,  7.90it/s]

{'loss': 0.5477, 'grad_norm': 2.9824230670928955, 'learning_rate': 7.242895805142085e-07, 'epoch': 9.64}


 97%|█████████▋| 57502/59120 [2:11:56<02:53,  9.33it/s]

{'loss': 0.5579, 'grad_norm': 2.51481294631958, 'learning_rate': 5.551420838971583e-07, 'epoch': 9.73}


 98%|█████████▊| 58003/59120 [2:12:54<01:30, 12.39it/s]

{'loss': 0.5392, 'grad_norm': 3.0290215015411377, 'learning_rate': 3.8599458728010834e-07, 'epoch': 9.81}


 99%|█████████▉| 58502/59120 [2:14:08<00:54, 11.24it/s]

{'loss': 0.5458, 'grad_norm': 2.9967031478881836, 'learning_rate': 2.168470906630582e-07, 'epoch': 9.9}


100%|█████████▉| 59002/59120 [2:15:08<00:10, 10.80it/s]

{'loss': 0.5434, 'grad_norm': 2.6248016357421875, 'learning_rate': 4.769959404600812e-08, 'epoch': 9.98}


100%|██████████| 59120/59120 [2:15:28<00:00,  7.27it/s]

{'train_runtime': 8128.9827, 'train_samples_per_second': 232.692, 'train_steps_per_second': 7.273, 'train_loss': 0.7104815463735867, 'epoch': 10.0}





TrainOutput(global_step=59120, training_loss=0.7104815463735867, metrics={'train_runtime': 8128.9827, 'train_samples_per_second': 232.692, 'train_steps_per_second': 7.273, 'total_flos': 4.123593985189478e+16, 'train_loss': 0.7104815463735867, 'epoch': 10.0})

In [35]:
# Score the fine-tuned model on the same validation split, for comparison with the baseline evaluation.
trainer.evaluate(max_length=max_length)

100%|██████████| 329/329 [1:40:07<00:00, 18.26s/it]  


{'eval_loss': 0.7965446710586548,
 'eval_model_preparation_time': 0.002,
 'eval_bleu': 54.95663610516514,
 'eval_runtime': 6012.2223,
 'eval_samples_per_second': 3.496,
 'eval_steps_per_second': 0.055,
 'epoch': 10.0}

In [36]:
# Save the fine-tuned model and its tokenizer into one directory so that the
# pair can be loaded back from that path.
model_output_dir = "marian-finetuned-kde4-en-to-fr"

trainer.save_model(model_output_dir)

tokenizer.save_pretrained(model_output_dir)

('marian-finetuned-kde4-en-to-fr\\tokenizer_config.json',
 'marian-finetuned-kde4-en-to-fr\\special_tokens_map.json',
 'marian-finetuned-kde4-en-to-fr\\vocab.json',
 'marian-finetuned-kde4-en-to-fr\\source.spm',
 'marian-finetuned-kde4-en-to-fr\\target.spm',
 'marian-finetuned-kde4-en-to-fr\\added_tokens.json')

In [37]:
import json
import os

# Keep the trainer's complete logging history next to the saved model,
# written as JSON with a 4-space indent.
log_history = trainer.state.log_history

log_history_output_file = os.path.join(model_output_dir, "log_history.json")
with open(log_history_output_file, "w") as f:
    f.write(json.dumps(log_history, indent=4))

print(f"Log history saved to {log_history_output_file}")

Log history saved to marian-finetuned-kde4-en-to-fr\log_history.json


In [40]:
from transformers import pipeline

# Reload the fine-tuned model from the local output directory and retry the
# first example sentence.
# Note: this rebinds `model_checkpoint` and `translator`, which previously
# referred to the pretrained checkpoint.
model_checkpoint = "marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'translation_text': 'Par défaut, développer les fils de discussion'}]