## **Importing Dependencies**

In [1]:
import torch
print(torch.cuda.is_available())  
print(torch.cuda.device_count()) 
print(torch.cuda.get_device_name(0)) 


True
1
NVIDIA GeForce RTX 2060


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from sacrebleu.metrics import BLEU

import warnings
warnings.filterwarnings('ignore')

if torch.cuda.is_available():
    print("CUDA is available!  Training on GPU ...")
else:
    print("CUDA is not available.  Training on CPU ...")

  from .autonotebook import tqdm as notebook_tqdm


CUDA is available!  Training on GPU ...


## **Loading Dataset**

In [3]:
dataset = load_dataset("opus_books","en-es")

In [5]:
print(dataset["train"][8])

{'id': '8', 'translation': {'en': "In the society of his nephew and niece, and their children, the old Gentleman's days were comfortably spent.", 'es': 'En compañía de su sobrino y sobrina, y de los hijos de ambos, la vida transcurrió confortablemente para el anciano caballero.'}}


In [6]:
## Splitting the dataset into train and test

dataset = dataset["train"].train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

## **Tokenization**

`Tokenization is the process of breaking down text into smaller units, like words or subwords, which our transformer model can understand and process.`

**Why Tokenization Matters**

`Transformer models don't work directly with raw text. They require numerical representations of the text, and tokenization is the bridge between the two. Each word or subword is assigned a unique numerical ID, allowing the model to manipulate these IDs for translation.`

In [7]:
# Model checkpoint for English-to-Spanish translation
model_checkpoint = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer
tokenizer  = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
def tokenize_function(examples):
    # Check if 'translation' is a dictionary or a list of dictionaries
    if isinstance(examples["translation"], dict):
        # Single example: Create a list with the single dictionary
        examples["translation"] = [examples["translation"]]

    return tokenizer(
        [x["en"] for x in examples["translation"]],
        text_target=[x["es"] for x in examples["translation"]],
        padding="max_length",
        truncation=True,
    )


In [9]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 74776/74776 [00:42<00:00, 1754.56 examples/s]
Map: 100%|██████████| 18694/18694 [00:10<00:00, 1857.85 examples/s]


In [10]:
print(tokenized_dataset['train'][2])
print(tokenized_dataset['test'][2])

{'id': '73854', 'translation': {'en': '"Master is well aware," Conseil replied, "that I\'m not seasoned in practical application.', 'es': '-El señor sabe muy bien que la práctica no es mi dominio.'}, 'input_ids': [52, 45941, 31, 255, 4217, 1896, 13307, 110, 1610, 17412, 2, 52, 9764, 33, 20, 92, 64, 6341, 118, 16, 4101, 1177, 3, 0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65

**Dataloader**

In [11]:
from torch.utils.data import DataLoader

# Prepare for dataloader
train_dataset = tokenized_dataset["train"].shuffle(seed=42).with_format("torch")
val_dataset = tokenized_dataset["test"].shuffle(seed=42).with_format("torch")

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(val_dataset, batch_size=8)

**Loading Pre-trained model**

* Model: MarianMT
* Checkpoint: Helsinki-NLP/opus-mt-en-es

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## **Fine-Tuning**

In [15]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Data Collator to prepare batches
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id
)


# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True # Use mixed precision if your GPU supports it
)


## Create the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


## Start Training
trainer.train()

  0%|          | 1/14022 [04:47<1118:47:21, 287.26s/it]
  2%|▏         | 500/28041 [03:56<3:42:12,  2.07it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.235, 'grad_norm': 0.6157875061035156, 'learning_rate': 1.9646945544024823e-05, 'epoch': 0.05}


  4%|▎         | 1000/28041 [07:55<3:23:17,  2.22it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1378, 'grad_norm': 0.5674015879631042, 'learning_rate': 1.9290324881423633e-05, 'epoch': 0.11}


  5%|▌         | 1500/28041 [11:49<3:15:54,  2.26it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1373, 'grad_norm': 0.35841381549835205, 'learning_rate': 1.893370421882244e-05, 'epoch': 0.16}


  7%|▋         | 2000/28041 [15:46<3:24:33,  2.12it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1343, 'grad_norm': 0.36481973528862, 'learning_rate': 1.857708355622125e-05, 'epoch': 0.21}


  9%|▉         | 2500/28041 [19:47<3:14:20,  2.19it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1309, 'grad_norm': 0.3853970766067505, 'learning_rate': 1.822046289362006e-05, 'epoch': 0.27}


 11%|█         | 3000/28041 [23:46<3:10:59,  2.19it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1305, 'grad_norm': 0.6465558409690857, 'learning_rate': 1.7863842231018866e-05, 'epoch': 0.32}


 12%|█▏        | 3500/28041 [27:44<3:06:58,  2.19it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1308, 'grad_norm': 0.2857813537120819, 'learning_rate': 1.7507221568417676e-05, 'epoch': 0.37}


 14%|█▍        | 4000/28041 [33:35<3:03:49,  2.18it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1295, 'grad_norm': 0.41148141026496887, 'learning_rate': 1.7150600905816483e-05, 'epoch': 0.43}


 16%|█▌        | 4500/28041 [37:36<3:02:40,  2.15it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1288, 'grad_norm': 0.4222314655780792, 'learning_rate': 1.6793980243215293e-05, 'epoch': 0.48}


 18%|█▊        | 5000/28041 [41:35<2:54:13,  2.20it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1299, 'grad_norm': 0.32494494318962097, 'learning_rate': 1.6437359580614103e-05, 'epoch': 0.53}


 20%|█▉        | 5500/28041 [45:32<2:50:19,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1309, 'grad_norm': 0.6993133425712585, 'learning_rate': 1.608073891801291e-05, 'epoch': 0.59}


 21%|██▏       | 6000/28041 [49:27<2:48:19,  2.18it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.127, 'grad_norm': 0.5496408343315125, 'learning_rate': 1.572411825541172e-05, 'epoch': 0.64}


 23%|██▎       | 6500/28041 [53:24<2:44:40,  2.18it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1256, 'grad_norm': 0.43100443482398987, 'learning_rate': 1.536749759281053e-05, 'epoch': 0.7}


 25%|██▍       | 7000/28041 [57:29<2:38:46,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1228, 'grad_norm': 0.49240586161613464, 'learning_rate': 1.5010876930209338e-05, 'epoch': 0.75}


 27%|██▋       | 7500/28041 [1:01:23<2:34:28,  2.22it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1257, 'grad_norm': 0.37928810715675354, 'learning_rate': 1.4654256267608146e-05, 'epoch': 0.8}


 29%|██▊       | 8000/28041 [1:05:18<2:31:04,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1276, 'grad_norm': 0.4654674828052521, 'learning_rate': 1.4298348846332158e-05, 'epoch': 0.86}


 30%|███       | 8500/28041 [1:09:12<2:27:35,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1243, 'grad_norm': 0.3538779616355896, 'learning_rate': 1.3941728183730968e-05, 'epoch': 0.91}


 32%|███▏      | 9000/28041 [1:13:06<2:24:00,  2.20it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1312, 'grad_norm': 0.28583911061286926, 'learning_rate': 1.3585107521129774e-05, 'epoch': 0.96}


 33%|███▎      | 9347/28041 [1:15:50<3:26:34,  1.51it/s] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.11859449744224548, 'eval_runtime': 461.8561, 'eval_samples_per_second': 40.476, 'eval_steps_per_second': 10.12, 'epoch': 1.0}


 34%|███▍      | 9500/28041 [1:24:44<2:19:41,  2.21it/s]   
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1242, 'grad_norm': 0.41127169132232666, 'learning_rate': 1.3228486858528584e-05, 'epoch': 1.02}


 36%|███▌      | 10000/28041 [1:28:36<2:16:08,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1157, 'grad_norm': 0.3797702193260193, 'learning_rate': 1.2871866195927394e-05, 'epoch': 1.07}


 37%|███▋      | 10500/28041 [1:32:29<2:12:24,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1169, 'grad_norm': 0.6227406859397888, 'learning_rate': 1.25152455333262e-05, 'epoch': 1.12}


 39%|███▉      | 11000/28041 [1:36:22<2:08:47,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1186, 'grad_norm': 0.3207803964614868, 'learning_rate': 1.215862487072501e-05, 'epoch': 1.18}


 41%|████      | 11500/28041 [1:40:14<2:04:40,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1124, 'grad_norm': 0.4625977873802185, 'learning_rate': 1.180200420812382e-05, 'epoch': 1.23}


 43%|████▎     | 12000/28041 [1:44:06<2:00:59,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1176, 'grad_norm': 0.3407890200614929, 'learning_rate': 1.1445383545522627e-05, 'epoch': 1.28}


 45%|████▍     | 12500/28041 [1:47:59<1:56:58,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1174, 'grad_norm': 0.42639070749282837, 'learning_rate': 1.1088762882921437e-05, 'epoch': 1.34}


 46%|████▋     | 13000/28041 [1:51:51<1:53:33,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1141, 'grad_norm': 0.41901472210884094, 'learning_rate': 1.0732142220320246e-05, 'epoch': 1.39}


 48%|████▊     | 13500/28041 [1:55:43<1:49:16,  2.22it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1173, 'grad_norm': 0.3751932680606842, 'learning_rate': 1.0375521557719054e-05, 'epoch': 1.44}


 50%|████▉     | 14000/28041 [1:59:37<1:45:35,  2.22it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1128, 'grad_norm': 0.35162806510925293, 'learning_rate': 1.0018900895117864e-05, 'epoch': 1.5}


 52%|█████▏    | 14500/28041 [2:03:32<1:42:03,  2.21it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1149, 'grad_norm': 0.35078394412994385, 'learning_rate': 9.663706715167077e-06, 'epoch': 1.55}


 53%|█████▎    | 15000/28041 [2:07:24<1:38:11,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1169, 'grad_norm': 0.4595462381839752, 'learning_rate': 9.307086052565887e-06, 'epoch': 1.6}


 55%|█████▌    | 15500/28041 [2:11:18<1:34:52,  2.20it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1113, 'grad_norm': 0.410284161567688, 'learning_rate': 8.950465389964695e-06, 'epoch': 1.66}


 57%|█████▋    | 16000/28041 [2:15:15<1:30:34,  2.22it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1149, 'grad_norm': 0.25328677892684937, 'learning_rate': 8.593844727363503e-06, 'epoch': 1.71}


 59%|█████▉    | 16500/28041 [2:19:09<1:26:57,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1165, 'grad_norm': 0.2815222144126892, 'learning_rate': 8.237224064762313e-06, 'epoch': 1.77}


 61%|██████    | 17000/28041 [2:23:04<1:23:35,  2.20it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1151, 'grad_norm': 0.5153366923332214, 'learning_rate': 7.880603402161122e-06, 'epoch': 1.82}


 62%|██████▏   | 17500/28041 [2:27:01<1:19:44,  2.20it/s] 
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.113, 'grad_norm': 0.360563725233078, 'learning_rate': 7.523982739559931e-06, 'epoch': 1.87}


 64%|██████▍   | 18000/28041 [2:30:55<1:15:43,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1129, 'grad_norm': 0.2890985608100891, 'learning_rate': 7.167362076958739e-06, 'epoch': 1.93}


 66%|██████▌   | 18500/28041 [2:34:48<1:11:50,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1135, 'grad_norm': 0.40455180406570435, 'learning_rate': 6.810741414357549e-06, 'epoch': 1.98}


 67%|██████▋   | 18694/28041 [2:36:24<1:10:29,  2.21it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.11587274819612503, 'eval_runtime': 463.0873, 'eval_samples_per_second': 40.368, 'eval_steps_per_second': 10.093, 'epoch': 2.0}


 68%|██████▊   | 19000/28041 [2:46:26<1:08:05,  2.21it/s]   
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1137, 'grad_norm': 0.3112124502658844, 'learning_rate': 6.454120751756357e-06, 'epoch': 2.03}


 70%|██████▉   | 19500/28041 [2:50:22<1:04:34,  2.20it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1096, 'grad_norm': 0.3559054434299469, 'learning_rate': 6.097500089155166e-06, 'epoch': 2.09}


 71%|███████▏  | 20000/28041 [2:54:16<1:00:51,  2.20it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1116, 'grad_norm': 0.4402216076850891, 'learning_rate': 5.740879426553975e-06, 'epoch': 2.14}


 73%|███████▎  | 20500/28041 [2:58:10<56:53,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1096, 'grad_norm': 0.5238280892372131, 'learning_rate': 5.384258763952784e-06, 'epoch': 2.19}


 75%|███████▍  | 21000/28041 [3:02:05<52:58,  2.22it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1096, 'grad_norm': 0.3320372402667999, 'learning_rate': 5.027638101351593e-06, 'epoch': 2.25}


 77%|███████▋  | 21500/28041 [3:05:58<49:15,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1085, 'grad_norm': 0.3235066831111908, 'learning_rate': 4.6710174387504015e-06, 'epoch': 2.3}


 78%|███████▊  | 22000/28041 [3:09:54<45:23,  2.22it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1071, 'grad_norm': 0.41274240612983704, 'learning_rate': 4.314396776149211e-06, 'epoch': 2.35}


 80%|████████  | 22500/28041 [3:13:49<41:44,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1064, 'grad_norm': 0.3276349604129791, 'learning_rate': 3.957776113548019e-06, 'epoch': 2.41}


 82%|████████▏ | 23000/28041 [3:17:45<37:57,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1098, 'grad_norm': 0.2843802571296692, 'learning_rate': 3.601155450946828e-06, 'epoch': 2.46}


 84%|████████▍ | 23500/28041 [3:21:37<34:15,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1078, 'grad_norm': 0.5549394488334656, 'learning_rate': 3.244534788345637e-06, 'epoch': 2.51}


 86%|████████▌ | 24000/28041 [3:25:29<30:31,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.11, 'grad_norm': 0.297656387090683, 'learning_rate': 2.887914125744446e-06, 'epoch': 2.57}


 87%|████████▋ | 24500/28041 [3:29:52<2:21:08,  2.39s/it]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1101, 'grad_norm': 0.353456050157547, 'learning_rate': 2.5312934631432544e-06, 'epoch': 2.62}


 89%|████████▉ | 25000/28041 [3:34:05<22:54,  2.21it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1074, 'grad_norm': 0.42576009035110474, 'learning_rate': 2.1746728005420635e-06, 'epoch': 2.67}


 91%|█████████ | 25500/28041 [3:37:58<19:03,  2.22it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1073, 'grad_norm': 0.3594461679458618, 'learning_rate': 1.8180521379408725e-06, 'epoch': 2.73}


 93%|█████████▎| 26000/28041 [3:41:50<15:17,  2.23it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1051, 'grad_norm': 0.47031840682029724, 'learning_rate': 1.4614314753396814e-06, 'epoch': 2.78}


 95%|█████████▍| 26500/28041 [3:45:43<11:39,  2.20it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1058, 'grad_norm': 0.34311962127685547, 'learning_rate': 1.1048108127384902e-06, 'epoch': 2.84}


 96%|█████████▋| 27000/28041 [3:49:35<07:48,  2.22it/s]  
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1076, 'grad_norm': 0.4031136631965637, 'learning_rate': 7.48190150137299e-07, 'epoch': 2.89}


 98%|█████████▊| 27500/28041 [3:53:28<04:04,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1076, 'grad_norm': 0.3567255437374115, 'learning_rate': 3.9156948753610785e-07, 'epoch': 2.94}


100%|█████████▉| 28000/28041 [3:57:18<00:18,  2.21it/s]
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


{'loss': 0.1056, 'grad_norm': 0.24180710315704346, 'learning_rate': 3.494882493491673e-08, 'epoch': 3.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.11516240984201431, 'eval_runtime': 458.8522, 'eval_samples_per_second': 40.741, 'eval_steps_per_second': 10.186, 'epoch': 3.0}
{'train_runtime': 14739.7812, 'train_samples_per_second': 15.219, 'train_steps_per_second': 1.902, 'train_loss': 0.1195510238167784, 'epoch': 3.0}





TrainOutput(global_step=28041, training_loss=0.1195510238167784, metrics={'train_runtime': 14739.7812, 'train_samples_per_second': 15.219, 'train_steps_per_second': 1.902, 'total_flos': 3.0417409630273536e+16, 'train_loss': 0.1195510238167784, 'epoch': 3.0})