In [1]:
from datasets import load_dataset
import evaluate
import torch
import os

# Use CPU only
device = torch.device("cpu")
print(f"Using device: {device}")

# Load dataset
dataset = load_dataset("parquet", data_files={
    "train": "data/iamTangsang_dataset/train-00000-of-00001.parquet",
    "validation": "data/iamTangsang_dataset/validation-00000-of-00001.parquet", 
    "test": "data/iamTangsang_dataset/test-00000-of-00001.parquet"
})

# Subset the data
dataset["train"] = dataset["train"].select(range(10000))       
dataset["validation"] = dataset["validation"].select(range(1000))
dataset["test"] = dataset["test"].select(range(200)) 

# Filter out empty strings and ensure data quality
def is_valid_pair(example):
    source = str(example["source"]).strip() if example["source"] is not None else ""
    target = str(example["target"]).strip() if example["target"] is not None else ""
    return len(source) > 0 and len(target) > 0 and len(source) < 1000 and len(target) < 1000

dataset = dataset.filter(is_valid_pair)

print("Dataset sizes after filtering:")
print(f"Train: {len(dataset['train'])}")
print(f"Validation: {len(dataset['validation'])}")
print(f"Test: {len(dataset['test'])}")
print("\nSample:")
print(dataset["train"][0])

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Dataset sizes after filtering:
Train: 10000
Validation: 1000
Test: 200

Sample:
{'source': '"कुनै पनि अन्य सरकारी एजेन्सीले यो जानकारी प्रयोग गर्न सक्दैन, केन्द्रीय सरकार अन्तर्गतका कसैले कुनै पनि हालतमा यो जानकारी पाउँदैनन् र राज्यका अधिकारीहरूमा पनि स्वास्थ्य अधिकारीहरूले मात्र यसलाई प्रयोग गर्न सक्दछन्," उनले भने।', 'target': '"No other government agency can use this information, no one in the commonwealth government at all, and in state authorities, only the health officer can use it.'}


In [2]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small", legacy=False)

def preprocess_function(examples):
    sources = [str(src).strip() for src in examples["source"]]
    targets = [str(tgt).strip() for tgt in examples["target"]]
    inputs = ["translate Nepali to English: " + src for src in sources]
    model_inputs = tokenizer(inputs, max_length=300, truncation=True, padding=True, return_tensors=None)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=300, truncation=True, padding=True, return_tensors=None)

    model_inputs["labels"] = [
        [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in label]
        for label in labels["input_ids"]
    ]
    
    return model_inputs

In [3]:
print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Pad token id:", tokenizer.pad_token_id)

lengths = []
for i in range(min(1000, len(dataset["train"]))):
    input_text = "translate Nepali to English: " + str(dataset["train"][i]["source"])
    tokens = tokenizer(input_text)["input_ids"]
    lengths.append(len(tokens))

print(f"Max length: {max(lengths)}")
print(f"Average length: {sum(lengths)/len(lengths):.1f}")

Tokenizer vocab size: 250100
Pad token id: 0
Max length: 272
Average length: 30.3


In [4]:
print("Preprocessing datasets...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing"
)

tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

sample = tokenized_dataset["train"][0]
for key, value in sample.items():
    if torch.is_tensor(value):
        print(f"{key}: shape {value.shape}, dtype {value.dtype}")
    else:
        print(f"{key}: {type(value)}")


Preprocessing datasets...
input_ids: shape torch.Size([272]), dtype torch.int64
attention_mask: shape torch.Size([272]), dtype torch.int64
labels: shape torch.Size([219]), dtype torch.int64


In [5]:
def find_invalid_tokens(dataset_split, split_name=""):
    invalid_count = 0
    print(f"\nChecking {split_name} set...")
    for i in range(min(100, len(dataset_split))):
        labels = dataset_split[i]["labels"]
        for token_id in labels:
            if token_id != -100 and (token_id >= tokenizer.vocab_size or token_id < 0):
                print(f"❌ Invalid token ID {token_id} at index {i}")
                invalid_count += 1
                break
    print(f"Invalid tokens in {split_name}: {invalid_count}")
    return invalid_count

find_invalid_tokens(tokenized_dataset["train"], "train")
find_invalid_tokens(tokenized_dataset["validation"], "validation")
find_invalid_tokens(tokenized_dataset["test"], "test")


Checking train set...
Invalid tokens in train: 0

Checking validation set...
Invalid tokens in validation: 0

Checking test set...
Invalid tokens in test: 0


0

In [6]:
print("Loading model...")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model = model.to(device)

Loading model...


In [7]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-npi-en",
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    generation_max_length=300,
    weight_decay=0.01,
    save_total_limit=6,
    num_train_epochs=6,
    predict_with_generate=True,
    fp16=False,
    use_cpu=True,
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    gradient_checkpointing=False,
    report_to=None,
)

In [8]:
from transformers import Seq2SeqTrainer

print("Testing data collator...")
test_batch = [tokenized_dataset["train"][i] for i in range(2)]
collated = data_collator(test_batch)
print("Data collator working:")
for key, value in collated.items():
    print(f"  {key}: {value.shape}")

# %%
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


Testing data collator...
Data collator working:
  input_ids: torch.Size([2, 272])
  attention_mask: torch.Size([2, 272])
  labels: torch.Size([2, 219])
  decoder_input_ids: torch.Size([2, 219])


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


In [9]:
print("Starting training on CPU...")
trainer.train()
print("Training completed!")

Starting training on CPU...


  0%|          | 100/30000 [02:02<10:35:41,  1.28s/it]

{'loss': 21.2574, 'grad_norm': 44822.11328125, 'learning_rate': 0.00019933333333333334, 'epoch': 0.02}


  1%|          | 200/30000 [04:04<9:53:42,  1.20s/it] 

{'loss': 9.4868, 'grad_norm': 242.44509887695312, 'learning_rate': 0.00019866666666666668, 'epoch': 0.04}


  1%|          | 300/30000 [06:06<10:59:06,  1.33s/it]

{'loss': 6.5127, 'grad_norm': 107.95098114013672, 'learning_rate': 0.00019800000000000002, 'epoch': 0.06}


  1%|▏         | 400/30000 [08:10<10:05:37,  1.23s/it]

{'loss': 5.511, 'grad_norm': 24.68218994140625, 'learning_rate': 0.00019733333333333335, 'epoch': 0.08}


  2%|▏         | 500/30000 [10:12<9:37:08,  1.17s/it] 

{'loss': 5.0798, 'grad_norm': 32.682350158691406, 'learning_rate': 0.00019666666666666666, 'epoch': 0.1}


  2%|▏         | 600/30000 [12:19<10:19:47,  1.26s/it]

{'loss': 4.8172, 'grad_norm': 23.70722198486328, 'learning_rate': 0.000196, 'epoch': 0.12}


  2%|▏         | 700/30000 [14:22<9:38:03,  1.18s/it] 

{'loss': 4.6993, 'grad_norm': 17.543363571166992, 'learning_rate': 0.00019533333333333336, 'epoch': 0.14}


  3%|▎         | 800/30000 [16:25<9:22:19,  1.16s/it] 

{'loss': 4.4581, 'grad_norm': 7.984825134277344, 'learning_rate': 0.0001946666666666667, 'epoch': 0.16}


  3%|▎         | 900/30000 [18:26<9:57:34,  1.23s/it] 

{'loss': 4.3801, 'grad_norm': 7.300928115844727, 'learning_rate': 0.000194, 'epoch': 0.18}


  3%|▎         | 1000/30000 [20:31<9:56:14,  1.23s/it]

{'loss': 4.103, 'grad_norm': 6.58079195022583, 'learning_rate': 0.00019333333333333333, 'epoch': 0.2}


  4%|▎         | 1100/30000 [22:43<11:13:28,  1.40s/it]

{'loss': 4.06, 'grad_norm': 8.219980239868164, 'learning_rate': 0.0001926666666666667, 'epoch': 0.22}


  4%|▍         | 1200/30000 [24:47<10:19:17,  1.29s/it]

{'loss': 3.8985, 'grad_norm': 11.106361389160156, 'learning_rate': 0.000192, 'epoch': 0.24}


  4%|▍         | 1300/30000 [26:55<9:23:00,  1.18s/it] 

{'loss': 4.0169, 'grad_norm': 10.39916706085205, 'learning_rate': 0.00019133333333333334, 'epoch': 0.26}


  5%|▍         | 1400/30000 [28:59<9:40:52,  1.22s/it] 

{'loss': 3.7996, 'grad_norm': 10.653345108032227, 'learning_rate': 0.00019066666666666668, 'epoch': 0.28}


  5%|▌         | 1500/30000 [31:06<9:43:14,  1.23s/it] 

{'loss': 3.7844, 'grad_norm': 9.195404052734375, 'learning_rate': 0.00019, 'epoch': 0.3}


  5%|▌         | 1600/30000 [33:16<9:46:52,  1.24s/it] 

{'loss': 3.8286, 'grad_norm': 11.313310623168945, 'learning_rate': 0.00018933333333333335, 'epoch': 0.32}


  6%|▌         | 1700/30000 [35:25<10:11:50,  1.30s/it]

{'loss': 3.6808, 'grad_norm': 5.989418983459473, 'learning_rate': 0.00018866666666666668, 'epoch': 0.34}


  6%|▌         | 1800/30000 [37:32<9:26:59,  1.21s/it] 

{'loss': 3.5117, 'grad_norm': 6.229604244232178, 'learning_rate': 0.000188, 'epoch': 0.36}


  6%|▋         | 1900/30000 [39:35<9:19:31,  1.19s/it] 

{'loss': 3.7343, 'grad_norm': 7.014500617980957, 'learning_rate': 0.00018733333333333335, 'epoch': 0.38}


  7%|▋         | 2000/30000 [41:46<10:39:23,  1.37s/it]

{'loss': 3.4572, 'grad_norm': 6.873966693878174, 'learning_rate': 0.0001866666666666667, 'epoch': 0.4}


  7%|▋         | 2100/30000 [44:01<10:17:40,  1.33s/it]

{'loss': 3.5863, 'grad_norm': 7.49422025680542, 'learning_rate': 0.00018600000000000002, 'epoch': 0.42}


  7%|▋         | 2200/30000 [46:09<9:24:04,  1.22s/it] 

{'loss': 3.6351, 'grad_norm': 10.705790519714355, 'learning_rate': 0.00018533333333333333, 'epoch': 0.44}


  8%|▊         | 2300/30000 [48:17<9:40:15,  1.26s/it] 

{'loss': 3.4934, 'grad_norm': 7.0245561599731445, 'learning_rate': 0.00018466666666666666, 'epoch': 0.46}


  8%|▊         | 2400/30000 [50:25<9:16:10,  1.21s/it] 

{'loss': 3.435, 'grad_norm': 5.949498653411865, 'learning_rate': 0.00018400000000000003, 'epoch': 0.48}


  8%|▊         | 2500/30000 [52:32<10:02:16,  1.31s/it]

{'loss': 3.4853, 'grad_norm': 4.197351455688477, 'learning_rate': 0.00018333333333333334, 'epoch': 0.5}


  9%|▊         | 2600/30000 [54:42<10:00:20,  1.31s/it]

{'loss': 3.5096, 'grad_norm': 6.799025535583496, 'learning_rate': 0.00018266666666666667, 'epoch': 0.52}


  9%|▉         | 2700/30000 [56:49<9:15:23,  1.22s/it] 

{'loss': 3.3634, 'grad_norm': 5.141382217407227, 'learning_rate': 0.000182, 'epoch': 0.54}


  9%|▉         | 2800/30000 [58:57<9:55:48,  1.31s/it] 

{'loss': 3.5172, 'grad_norm': 8.702921867370605, 'learning_rate': 0.00018133333333333334, 'epoch': 0.56}


 10%|▉         | 2900/30000 [1:01:03<9:08:29,  1.21s/it] 

{'loss': 3.3906, 'grad_norm': 7.998960494995117, 'learning_rate': 0.00018066666666666668, 'epoch': 0.58}


 10%|█         | 3000/30000 [1:03:09<9:37:41,  1.28s/it] 

{'loss': 3.3866, 'grad_norm': 8.711162567138672, 'learning_rate': 0.00018, 'epoch': 0.6}


 10%|█         | 3100/30000 [1:05:20<9:07:25,  1.22s/it] 

{'loss': 3.2273, 'grad_norm': 7.915528297424316, 'learning_rate': 0.00017933333333333332, 'epoch': 0.62}


 11%|█         | 3200/30000 [1:07:26<9:03:41,  1.22s/it] 

{'loss': 3.3634, 'grad_norm': 9.205799102783203, 'learning_rate': 0.00017866666666666668, 'epoch': 0.64}


 11%|█         | 3300/30000 [1:09:32<10:09:07,  1.37s/it]

{'loss': 3.2164, 'grad_norm': 11.693943977355957, 'learning_rate': 0.00017800000000000002, 'epoch': 0.66}


 11%|█▏        | 3400/30000 [1:11:40<9:40:08,  1.31s/it] 

{'loss': 3.2248, 'grad_norm': 4.657835483551025, 'learning_rate': 0.00017733333333333335, 'epoch': 0.68}


 12%|█▏        | 3500/30000 [1:13:46<9:28:02,  1.29s/it] 

{'loss': 3.3447, 'grad_norm': 5.443624973297119, 'learning_rate': 0.00017666666666666666, 'epoch': 0.7}


 12%|█▏        | 3600/30000 [1:15:55<9:32:56,  1.30s/it] 

{'loss': 3.1081, 'grad_norm': 6.073862552642822, 'learning_rate': 0.00017600000000000002, 'epoch': 0.72}


 12%|█▏        | 3700/30000 [1:18:02<9:19:37,  1.28s/it] 

{'loss': 3.2113, 'grad_norm': 10.086788177490234, 'learning_rate': 0.00017533333333333336, 'epoch': 0.74}


 13%|█▎        | 3800/30000 [1:20:07<8:53:58,  1.22s/it] 

{'loss': 3.1993, 'grad_norm': 6.584895610809326, 'learning_rate': 0.00017466666666666667, 'epoch': 0.76}


 13%|█▎        | 3900/30000 [1:22:12<9:05:12,  1.25s/it] 

{'loss': 3.2595, 'grad_norm': 5.324306011199951, 'learning_rate': 0.000174, 'epoch': 0.78}


 13%|█▎        | 4000/30000 [1:24:16<8:20:12,  1.15s/it] 

{'loss': 3.1483, 'grad_norm': 8.353704452514648, 'learning_rate': 0.00017333333333333334, 'epoch': 0.8}


 14%|█▎        | 4100/30000 [1:26:28<8:42:19,  1.21s/it] 

{'loss': 3.2897, 'grad_norm': 4.9236836433410645, 'learning_rate': 0.00017266666666666667, 'epoch': 0.82}


 14%|█▍        | 4200/30000 [1:28:28<9:08:50,  1.28s/it]

{'loss': 3.2744, 'grad_norm': 11.317562103271484, 'learning_rate': 0.000172, 'epoch': 0.84}


 14%|█▍        | 4300/30000 [1:30:28<8:27:56,  1.19s/it]

{'loss': 3.3654, 'grad_norm': 7.371620178222656, 'learning_rate': 0.00017133333333333334, 'epoch': 0.86}


 15%|█▍        | 4400/30000 [1:32:29<8:49:12,  1.24s/it]

{'loss': 3.1941, 'grad_norm': 8.258766174316406, 'learning_rate': 0.00017066666666666668, 'epoch': 0.88}


 15%|█▌        | 4500/30000 [1:34:31<8:40:07,  1.22s/it]

{'loss': 3.0794, 'grad_norm': 9.07197380065918, 'learning_rate': 0.00017, 'epoch': 0.9}


 15%|█▌        | 4600/30000 [1:36:35<7:45:02,  1.10s/it] 

{'loss': 3.2575, 'grad_norm': 4.540729999542236, 'learning_rate': 0.00016933333333333335, 'epoch': 0.92}


 16%|█▌        | 4700/30000 [1:38:34<8:30:43,  1.21s/it]

{'loss': 3.2139, 'grad_norm': 7.073605537414551, 'learning_rate': 0.00016866666666666668, 'epoch': 0.94}


 16%|█▌        | 4800/30000 [1:40:36<8:29:16,  1.21s/it]

{'loss': 3.1956, 'grad_norm': 5.583514213562012, 'learning_rate': 0.000168, 'epoch': 0.96}


 16%|█▋        | 4900/30000 [1:42:36<8:00:06,  1.15s/it]

{'loss': 3.1454, 'grad_norm': 7.71763277053833, 'learning_rate': 0.00016733333333333335, 'epoch': 0.98}


 17%|█▋        | 5000/30000 [1:44:35<8:16:37,  1.19s/it]

{'loss': 3.0094, 'grad_norm': 6.93963623046875, 'learning_rate': 0.0001666666666666667, 'epoch': 1.0}


                                                        
 17%|█▋        | 5000/30000 [1:46:25<8:16:37,  1.19s/it]

{'eval_loss': 2.520526647567749, 'eval_runtime': 107.1228, 'eval_samples_per_second': 9.335, 'eval_steps_per_second': 4.668, 'epoch': 1.0}


 17%|█▋        | 5100/30000 [1:48:32<8:28:52,  1.23s/it]  

{'loss': 2.9258, 'grad_norm': 7.448722839355469, 'learning_rate': 0.000166, 'epoch': 1.02}


 17%|█▋        | 5200/30000 [1:50:38<8:04:00,  1.17s/it]

{'loss': 2.9734, 'grad_norm': 5.0227952003479, 'learning_rate': 0.00016533333333333333, 'epoch': 1.04}


 18%|█▊        | 5300/30000 [1:52:43<7:58:32,  1.16s/it] 

{'loss': 2.9377, 'grad_norm': 11.973612785339355, 'learning_rate': 0.00016466666666666667, 'epoch': 1.06}


 18%|█▊        | 5400/30000 [1:54:42<7:54:48,  1.16s/it]

{'loss': 2.7382, 'grad_norm': 9.728065490722656, 'learning_rate': 0.000164, 'epoch': 1.08}


 18%|█▊        | 5500/30000 [1:56:44<8:25:27,  1.24s/it]

{'loss': 2.7665, 'grad_norm': 3.744199275970459, 'learning_rate': 0.00016333333333333334, 'epoch': 1.1}


 19%|█▊        | 5600/30000 [1:58:54<8:15:31,  1.22s/it] 

{'loss': 2.7888, 'grad_norm': 7.7837347984313965, 'learning_rate': 0.00016266666666666667, 'epoch': 1.12}


 19%|█▉        | 5700/30000 [2:00:57<9:16:06,  1.37s/it]

{'loss': 2.8175, 'grad_norm': 9.88839054107666, 'learning_rate': 0.000162, 'epoch': 1.14}


 19%|█▉        | 5800/30000 [2:03:00<8:30:00,  1.26s/it]

{'loss': 2.7772, 'grad_norm': 8.595609664916992, 'learning_rate': 0.00016133333333333334, 'epoch': 1.16}


 20%|█▉        | 5900/30000 [2:04:57<7:57:19,  1.19s/it]

{'loss': 2.8671, 'grad_norm': 6.566179275512695, 'learning_rate': 0.00016066666666666668, 'epoch': 1.18}


 20%|██        | 6000/30000 [2:06:54<8:02:23,  1.21s/it]

{'loss': 2.7096, 'grad_norm': 5.74712610244751, 'learning_rate': 0.00016, 'epoch': 1.2}


 20%|██        | 6100/30000 [2:08:57<7:54:41,  1.19s/it] 

{'loss': 2.7892, 'grad_norm': 5.867049694061279, 'learning_rate': 0.00015933333333333332, 'epoch': 1.22}


 21%|██        | 6200/30000 [2:10:56<7:43:50,  1.17s/it]

{'loss': 2.7104, 'grad_norm': 6.5150465965271, 'learning_rate': 0.00015866666666666668, 'epoch': 1.24}


 21%|██        | 6300/30000 [2:12:56<8:01:39,  1.22s/it]

{'loss': 2.5985, 'grad_norm': 4.888063907623291, 'learning_rate': 0.00015800000000000002, 'epoch': 1.26}


 21%|██▏       | 6400/30000 [2:14:57<8:09:41,  1.24s/it]

{'loss': 2.7505, 'grad_norm': 9.110223770141602, 'learning_rate': 0.00015733333333333333, 'epoch': 1.28}


 22%|██▏       | 6500/30000 [2:16:56<7:10:56,  1.10s/it]

{'loss': 2.6599, 'grad_norm': 6.0494866371154785, 'learning_rate': 0.00015666666666666666, 'epoch': 1.3}


 22%|██▏       | 6600/30000 [2:18:53<7:17:18,  1.12s/it] 

{'loss': 2.7263, 'grad_norm': 6.448815822601318, 'learning_rate': 0.00015600000000000002, 'epoch': 1.32}


 22%|██▏       | 6700/30000 [2:20:49<7:47:48,  1.20s/it]

{'loss': 2.8273, 'grad_norm': 6.978143215179443, 'learning_rate': 0.00015533333333333333, 'epoch': 1.34}


 23%|██▎       | 6800/30000 [2:22:48<7:29:21,  1.16s/it]

{'loss': 2.6597, 'grad_norm': 10.276030540466309, 'learning_rate': 0.00015466666666666667, 'epoch': 1.36}


 23%|██▎       | 6900/30000 [2:24:47<7:23:44,  1.15s/it]

{'loss': 2.7909, 'grad_norm': 8.029081344604492, 'learning_rate': 0.000154, 'epoch': 1.38}


 23%|██▎       | 7000/30000 [2:26:45<8:07:06,  1.27s/it]

{'loss': 2.7709, 'grad_norm': 6.686424732208252, 'learning_rate': 0.00015333333333333334, 'epoch': 1.4}


 24%|██▎       | 7100/30000 [2:28:45<7:19:55,  1.15s/it] 

{'loss': 2.6249, 'grad_norm': 5.337942123413086, 'learning_rate': 0.00015266666666666667, 'epoch': 1.42}


 24%|██▍       | 7200/30000 [2:30:43<7:31:38,  1.19s/it]

{'loss': 2.7947, 'grad_norm': 7.059854984283447, 'learning_rate': 0.000152, 'epoch': 1.44}


 24%|██▍       | 7300/30000 [2:32:42<6:58:02,  1.10s/it]

{'loss': 2.7384, 'grad_norm': 5.3936004638671875, 'learning_rate': 0.00015133333333333334, 'epoch': 1.46}


 25%|██▍       | 7400/30000 [2:34:42<7:11:23,  1.15s/it]

{'loss': 2.7767, 'grad_norm': 5.797730922698975, 'learning_rate': 0.00015066666666666668, 'epoch': 1.48}


 25%|██▌       | 7500/30000 [2:36:42<7:41:25,  1.23s/it]

{'loss': 2.6877, 'grad_norm': 6.119821548461914, 'learning_rate': 0.00015000000000000001, 'epoch': 1.5}


 25%|██▌       | 7600/30000 [2:38:44<8:15:30,  1.33s/it] 

{'loss': 2.6916, 'grad_norm': 11.591140747070312, 'learning_rate': 0.00014933333333333335, 'epoch': 1.52}


 26%|██▌       | 7700/30000 [2:40:42<7:25:42,  1.20s/it]

{'loss': 2.6806, 'grad_norm': 11.95719051361084, 'learning_rate': 0.00014866666666666666, 'epoch': 1.54}


 26%|██▌       | 7800/30000 [2:42:41<7:15:19,  1.18s/it]

{'loss': 2.7427, 'grad_norm': 6.913187503814697, 'learning_rate': 0.000148, 'epoch': 1.56}


 26%|██▋       | 7900/30000 [2:44:41<7:02:49,  1.15s/it]

{'loss': 2.7341, 'grad_norm': 6.351408004760742, 'learning_rate': 0.00014733333333333335, 'epoch': 1.58}


 27%|██▋       | 8000/30000 [2:46:40<7:43:58,  1.27s/it]

{'loss': 2.7958, 'grad_norm': 7.405006408691406, 'learning_rate': 0.00014666666666666666, 'epoch': 1.6}


 27%|██▋       | 8100/30000 [2:48:41<7:32:32,  1.24s/it] 

{'loss': 2.7337, 'grad_norm': 7.838242053985596, 'learning_rate': 0.000146, 'epoch': 1.62}


 27%|██▋       | 8200/30000 [2:50:38<7:10:37,  1.19s/it]

{'loss': 2.7561, 'grad_norm': 6.462875843048096, 'learning_rate': 0.00014533333333333333, 'epoch': 1.64}


 28%|██▊       | 8300/30000 [2:52:37<6:56:43,  1.15s/it]

{'loss': 2.703, 'grad_norm': 8.575815200805664, 'learning_rate': 0.0001446666666666667, 'epoch': 1.66}


 28%|██▊       | 8400/30000 [2:54:37<7:40:59,  1.28s/it]

{'loss': 2.4977, 'grad_norm': 6.619747638702393, 'learning_rate': 0.000144, 'epoch': 1.68}


 28%|██▊       | 8500/30000 [2:56:34<7:06:57,  1.19s/it]

{'loss': 2.749, 'grad_norm': 7.092264175415039, 'learning_rate': 0.00014333333333333334, 'epoch': 1.7}


 29%|██▊       | 8600/30000 [2:58:36<6:49:35,  1.15s/it] 

{'loss': 2.6192, 'grad_norm': 6.704310417175293, 'learning_rate': 0.00014266666666666667, 'epoch': 1.72}


 29%|██▉       | 8700/30000 [3:00:31<7:03:01,  1.19s/it]

{'loss': 2.7773, 'grad_norm': 5.711865425109863, 'learning_rate': 0.000142, 'epoch': 1.74}


 29%|██▉       | 8800/30000 [3:02:30<6:54:01,  1.17s/it]

{'loss': 2.5776, 'grad_norm': 6.728206634521484, 'learning_rate': 0.00014133333333333334, 'epoch': 1.76}


 30%|██▉       | 8900/30000 [3:04:29<7:31:43,  1.28s/it]

{'loss': 2.6603, 'grad_norm': 5.488132476806641, 'learning_rate': 0.00014066666666666668, 'epoch': 1.78}


 30%|███       | 9000/30000 [3:06:28<6:38:55,  1.14s/it]

{'loss': 2.6391, 'grad_norm': 7.748819351196289, 'learning_rate': 0.00014, 'epoch': 1.8}


 30%|███       | 9100/30000 [3:08:31<6:24:09,  1.10s/it] 

{'loss': 2.632, 'grad_norm': 6.25527811050415, 'learning_rate': 0.00013933333333333335, 'epoch': 1.82}


 31%|███       | 9200/30000 [3:10:27<6:21:48,  1.10s/it]

{'loss': 2.6224, 'grad_norm': 5.201485633850098, 'learning_rate': 0.00013866666666666669, 'epoch': 1.84}


 31%|███       | 9300/30000 [3:12:28<7:02:20,  1.22s/it]

{'loss': 2.6142, 'grad_norm': 5.41255521774292, 'learning_rate': 0.000138, 'epoch': 1.86}


 31%|███▏      | 9400/30000 [3:14:26<6:31:20,  1.14s/it]

{'loss': 2.6066, 'grad_norm': 4.269364833831787, 'learning_rate': 0.00013733333333333333, 'epoch': 1.88}


 32%|███▏      | 9500/30000 [3:16:27<6:45:52,  1.19s/it]

{'loss': 2.4623, 'grad_norm': 8.589313507080078, 'learning_rate': 0.00013666666666666666, 'epoch': 1.9}


 32%|███▏      | 9600/30000 [3:18:29<6:17:37,  1.11s/it] 

{'loss': 2.6194, 'grad_norm': 6.524998188018799, 'learning_rate': 0.00013600000000000003, 'epoch': 1.92}


 32%|███▏      | 9700/30000 [3:20:31<6:39:22,  1.18s/it]

{'loss': 2.6795, 'grad_norm': 6.619063854217529, 'learning_rate': 0.00013533333333333333, 'epoch': 1.94}


 33%|███▎      | 9800/30000 [3:22:30<6:46:38,  1.21s/it]

{'loss': 2.566, 'grad_norm': 6.168013572692871, 'learning_rate': 0.00013466666666666667, 'epoch': 1.96}


 33%|███▎      | 9900/30000 [3:24:30<6:32:33,  1.17s/it]

{'loss': 2.6133, 'grad_norm': 4.56100606918335, 'learning_rate': 0.000134, 'epoch': 1.98}


 33%|███▎      | 10000/30000 [3:26:29<6:36:37,  1.19s/it]

{'loss': 2.5928, 'grad_norm': 5.2353081703186035, 'learning_rate': 0.00013333333333333334, 'epoch': 2.0}


                                                         
 33%|███▎      | 10000/30000 [3:28:07<6:36:37,  1.19s/it]

{'eval_loss': 2.3184187412261963, 'eval_runtime': 95.3564, 'eval_samples_per_second': 10.487, 'eval_steps_per_second': 5.243, 'epoch': 2.0}


 34%|███▎      | 10100/30000 [3:30:04<6:15:30,  1.13s/it]  

{'loss': 2.3873, 'grad_norm': 6.719305992126465, 'learning_rate': 0.00013266666666666667, 'epoch': 2.02}


 34%|███▍      | 10200/30000 [3:32:03<6:36:15,  1.20s/it]

{'loss': 2.4539, 'grad_norm': 6.051456928253174, 'learning_rate': 0.000132, 'epoch': 2.04}


 34%|███▍      | 10300/30000 [3:34:01<6:18:15,  1.15s/it]

{'loss': 2.2951, 'grad_norm': 5.492381572723389, 'learning_rate': 0.00013133333333333332, 'epoch': 2.06}


 35%|███▍      | 10400/30000 [3:36:00<6:17:49,  1.16s/it]

{'loss': 2.296, 'grad_norm': 8.808542251586914, 'learning_rate': 0.00013066666666666668, 'epoch': 2.08}


 35%|███▌      | 10500/30000 [3:37:58<6:09:40,  1.14s/it]

{'loss': 2.3509, 'grad_norm': 6.150615692138672, 'learning_rate': 0.00013000000000000002, 'epoch': 2.1}


 35%|███▌      | 10600/30000 [3:40:00<6:21:31,  1.18s/it] 

{'loss': 2.3156, 'grad_norm': 6.74047327041626, 'learning_rate': 0.00012933333333333332, 'epoch': 2.12}


 36%|███▌      | 10700/30000 [3:41:58<6:48:22,  1.27s/it]

{'loss': 2.4363, 'grad_norm': 6.240455627441406, 'learning_rate': 0.00012866666666666666, 'epoch': 2.14}


 36%|███▌      | 10800/30000 [3:43:57<5:57:58,  1.12s/it]

{'loss': 2.2995, 'grad_norm': 8.483962059020996, 'learning_rate': 0.00012800000000000002, 'epoch': 2.16}


 36%|███▋      | 10900/30000 [3:45:57<6:13:20,  1.17s/it]

{'loss': 2.4264, 'grad_norm': 6.092353820800781, 'learning_rate': 0.00012733333333333336, 'epoch': 2.18}


 37%|███▋      | 11000/30000 [3:47:55<6:21:23,  1.20s/it]

{'loss': 2.4261, 'grad_norm': 7.138434410095215, 'learning_rate': 0.00012666666666666666, 'epoch': 2.2}


 37%|███▋      | 11100/30000 [3:49:57<6:10:39,  1.18s/it] 

{'loss': 2.2313, 'grad_norm': 8.24083137512207, 'learning_rate': 0.000126, 'epoch': 2.22}


 37%|███▋      | 11200/30000 [3:51:59<6:32:19,  1.25s/it]

{'loss': 2.4281, 'grad_norm': 6.545738697052002, 'learning_rate': 0.00012533333333333334, 'epoch': 2.24}


 38%|███▊      | 11300/30000 [3:53:57<6:19:11,  1.22s/it]

{'loss': 2.4466, 'grad_norm': 6.305813789367676, 'learning_rate': 0.00012466666666666667, 'epoch': 2.26}


 38%|███▊      | 11400/30000 [3:55:55<6:07:16,  1.18s/it]

{'loss': 2.2302, 'grad_norm': 3.9707605838775635, 'learning_rate': 0.000124, 'epoch': 2.28}


 38%|███▊      | 11500/30000 [3:57:58<6:21:36,  1.24s/it]

{'loss': 2.3289, 'grad_norm': 4.299967288970947, 'learning_rate': 0.00012333333333333334, 'epoch': 2.3}


 39%|███▊      | 11600/30000 [4:00:04<5:49:35,  1.14s/it] 

{'loss': 2.4319, 'grad_norm': 5.354289531707764, 'learning_rate': 0.00012266666666666668, 'epoch': 2.32}


 39%|███▉      | 11700/30000 [4:02:08<6:28:09,  1.27s/it]

{'loss': 2.4626, 'grad_norm': 8.571588516235352, 'learning_rate': 0.000122, 'epoch': 2.34}


 39%|███▉      | 11800/30000 [4:04:10<6:22:39,  1.26s/it]

{'loss': 2.3061, 'grad_norm': 8.524046897888184, 'learning_rate': 0.00012133333333333335, 'epoch': 2.36}


 40%|███▉      | 11900/30000 [4:06:13<6:03:22,  1.20s/it]

{'loss': 2.2935, 'grad_norm': 5.3102707862854, 'learning_rate': 0.00012066666666666668, 'epoch': 2.38}


 40%|████      | 12000/30000 [4:08:17<6:07:14,  1.22s/it]

{'loss': 2.3307, 'grad_norm': 6.218020439147949, 'learning_rate': 0.00012, 'epoch': 2.4}


 40%|████      | 12100/30000 [4:10:25<5:59:45,  1.21s/it] 

{'loss': 2.37, 'grad_norm': 5.5137810707092285, 'learning_rate': 0.00011933333333333334, 'epoch': 2.42}


 41%|████      | 12200/30000 [4:12:30<5:45:08,  1.16s/it]

{'loss': 2.3085, 'grad_norm': 4.7408647537231445, 'learning_rate': 0.00011866666666666669, 'epoch': 2.44}


 41%|████      | 12300/30000 [4:14:37<6:13:44,  1.27s/it]

{'loss': 2.4668, 'grad_norm': 7.812468528747559, 'learning_rate': 0.000118, 'epoch': 2.46}


 41%|████▏     | 12400/30000 [4:16:42<6:09:02,  1.26s/it]

{'loss': 2.2964, 'grad_norm': 5.949591159820557, 'learning_rate': 0.00011733333333333334, 'epoch': 2.48}


 42%|████▏     | 12500/30000 [4:18:47<5:35:54,  1.15s/it]

{'loss': 2.3932, 'grad_norm': 5.16259241104126, 'learning_rate': 0.00011666666666666668, 'epoch': 2.5}


 42%|████▏     | 12600/30000 [4:20:55<6:02:50,  1.25s/it] 

{'loss': 2.3966, 'grad_norm': 3.815485715866089, 'learning_rate': 0.000116, 'epoch': 2.52}


 42%|████▏     | 12700/30000 [4:22:59<5:54:41,  1.23s/it]

{'loss': 2.3432, 'grad_norm': 11.25667667388916, 'learning_rate': 0.00011533333333333334, 'epoch': 2.54}


 43%|████▎     | 12800/30000 [4:25:04<5:48:29,  1.22s/it]

{'loss': 2.2807, 'grad_norm': 4.294554233551025, 'learning_rate': 0.00011466666666666667, 'epoch': 2.56}


 43%|████▎     | 12900/30000 [4:27:10<5:54:29,  1.24s/it]

{'loss': 2.3492, 'grad_norm': 9.601530075073242, 'learning_rate': 0.00011399999999999999, 'epoch': 2.58}


 43%|████▎     | 13000/30000 [4:29:16<6:01:58,  1.28s/it]

{'loss': 2.448, 'grad_norm': 7.5850043296813965, 'learning_rate': 0.00011333333333333334, 'epoch': 2.6}


 44%|████▎     | 13100/30000 [4:31:22<5:52:34,  1.25s/it] 

{'loss': 2.5156, 'grad_norm': 6.223313808441162, 'learning_rate': 0.00011266666666666668, 'epoch': 2.62}


 44%|████▍     | 13200/30000 [4:33:36<6:09:56,  1.32s/it]

{'loss': 2.3215, 'grad_norm': 6.315258979797363, 'learning_rate': 0.00011200000000000001, 'epoch': 2.64}


 44%|████▍     | 13300/30000 [4:35:42<6:20:15,  1.37s/it]

{'loss': 2.2393, 'grad_norm': 11.483413696289062, 'learning_rate': 0.00011133333333333333, 'epoch': 2.66}


 45%|████▍     | 13400/30000 [4:37:46<5:46:12,  1.25s/it]

{'loss': 2.3559, 'grad_norm': 7.418323040008545, 'learning_rate': 0.00011066666666666667, 'epoch': 2.68}


 45%|████▌     | 13500/30000 [4:40:00<6:04:38,  1.33s/it]

{'loss': 2.3344, 'grad_norm': 4.331194877624512, 'learning_rate': 0.00011000000000000002, 'epoch': 2.7}


 45%|████▌     | 13600/30000 [4:42:22<6:14:37,  1.37s/it] 

{'loss': 2.3339, 'grad_norm': 6.381738662719727, 'learning_rate': 0.00010933333333333333, 'epoch': 2.72}


 46%|████▌     | 13700/30000 [4:44:50<7:03:28,  1.56s/it]

{'loss': 2.4702, 'grad_norm': 6.659092903137207, 'learning_rate': 0.00010866666666666667, 'epoch': 2.74}


 46%|████▌     | 13800/30000 [4:47:17<7:59:14,  1.77s/it]

{'loss': 2.348, 'grad_norm': 6.8726606369018555, 'learning_rate': 0.00010800000000000001, 'epoch': 2.76}


 46%|████▋     | 13900/30000 [4:49:57<6:05:57,  1.36s/it]

{'loss': 2.2875, 'grad_norm': 7.825135707855225, 'learning_rate': 0.00010733333333333333, 'epoch': 2.78}


 47%|████▋     | 14000/30000 [4:52:06<5:26:36,  1.22s/it]

{'loss': 2.3348, 'grad_norm': 4.691781997680664, 'learning_rate': 0.00010666666666666667, 'epoch': 2.8}


 47%|████▋     | 14100/30000 [4:54:43<7:16:48,  1.65s/it]

{'loss': 2.1748, 'grad_norm': 4.744790077209473, 'learning_rate': 0.00010600000000000002, 'epoch': 2.82}


 47%|████▋     | 14200/30000 [4:57:17<7:23:05,  1.68s/it]

{'loss': 2.3593, 'grad_norm': 10.318510055541992, 'learning_rate': 0.00010533333333333332, 'epoch': 2.84}


 48%|████▊     | 14300/30000 [4:59:52<7:05:17,  1.63s/it]

{'loss': 2.3871, 'grad_norm': 6.0209527015686035, 'learning_rate': 0.00010466666666666667, 'epoch': 2.86}


 48%|████▊     | 14400/30000 [5:02:23<6:29:05,  1.50s/it]

{'loss': 2.2795, 'grad_norm': 6.800227165222168, 'learning_rate': 0.00010400000000000001, 'epoch': 2.88}


 48%|████▊     | 14500/30000 [5:05:03<7:19:47,  1.70s/it]

{'loss': 2.361, 'grad_norm': 6.515925407409668, 'learning_rate': 0.00010333333333333334, 'epoch': 2.9}


 49%|████▊     | 14600/30000 [5:07:34<6:21:53,  1.49s/it] 

{'loss': 2.36, 'grad_norm': 7.102094650268555, 'learning_rate': 0.00010266666666666666, 'epoch': 2.92}


 49%|████▉     | 14700/30000 [5:10:08<5:44:47,  1.35s/it]

{'loss': 2.2269, 'grad_norm': 5.350571632385254, 'learning_rate': 0.00010200000000000001, 'epoch': 2.94}


 49%|████▉     | 14800/30000 [5:12:38<5:25:36,  1.29s/it]

{'loss': 2.2991, 'grad_norm': 3.8516016006469727, 'learning_rate': 0.00010133333333333335, 'epoch': 2.96}


 50%|████▉     | 14900/30000 [5:15:17<7:07:44,  1.70s/it]

{'loss': 2.2809, 'grad_norm': 6.211740970611572, 'learning_rate': 0.00010066666666666667, 'epoch': 2.98}


 50%|█████     | 15000/30000 [5:17:57<7:28:58,  1.80s/it]

{'loss': 2.397, 'grad_norm': 7.833597660064697, 'learning_rate': 0.0001, 'epoch': 3.0}


                                                         
 50%|█████     | 15000/30000 [5:19:50<7:28:58,  1.80s/it]

{'eval_loss': 2.2627744674682617, 'eval_runtime': 109.5832, 'eval_samples_per_second': 9.125, 'eval_steps_per_second': 4.563, 'epoch': 3.0}


 50%|█████     | 15100/30000 [5:22:18<5:27:26,  1.32s/it]  

{'loss': 2.2701, 'grad_norm': 8.184561729431152, 'learning_rate': 9.933333333333334e-05, 'epoch': 3.02}


 51%|█████     | 15200/30000 [5:24:37<5:11:45,  1.26s/it]

{'loss': 2.1984, 'grad_norm': 3.894052028656006, 'learning_rate': 9.866666666666668e-05, 'epoch': 3.04}


 51%|█████     | 15300/30000 [5:26:55<5:18:39,  1.30s/it]

{'loss': 2.1056, 'grad_norm': 7.616975784301758, 'learning_rate': 9.8e-05, 'epoch': 3.06}


 51%|█████▏    | 15400/30000 [5:29:11<4:50:33,  1.19s/it]

{'loss': 2.0497, 'grad_norm': 4.048689842224121, 'learning_rate': 9.733333333333335e-05, 'epoch': 3.08}


 52%|█████▏    | 15500/30000 [5:31:34<5:45:03,  1.43s/it]

{'loss': 2.0476, 'grad_norm': 5.238004684448242, 'learning_rate': 9.666666666666667e-05, 'epoch': 3.1}


 52%|█████▏    | 15600/30000 [5:34:03<5:29:36,  1.37s/it] 

{'loss': 2.1565, 'grad_norm': 4.650882244110107, 'learning_rate': 9.6e-05, 'epoch': 3.12}


 52%|█████▏    | 15700/30000 [5:36:36<5:55:49,  1.49s/it]

{'loss': 1.9797, 'grad_norm': 5.614658832550049, 'learning_rate': 9.533333333333334e-05, 'epoch': 3.14}


 53%|█████▎    | 15800/30000 [5:38:42<4:39:34,  1.18s/it]

{'loss': 2.2535, 'grad_norm': 5.558225631713867, 'learning_rate': 9.466666666666667e-05, 'epoch': 3.16}


 53%|█████▎    | 15900/30000 [5:40:42<4:35:57,  1.17s/it]

{'loss': 2.1244, 'grad_norm': 6.335029125213623, 'learning_rate': 9.4e-05, 'epoch': 3.18}


 53%|█████▎    | 16000/30000 [5:42:42<4:26:47,  1.14s/it]

{'loss': 2.2313, 'grad_norm': 5.532258033752441, 'learning_rate': 9.333333333333334e-05, 'epoch': 3.2}


 54%|█████▎    | 16100/30000 [5:44:45<4:32:35,  1.18s/it]

{'loss': 2.1788, 'grad_norm': 5.697210311889648, 'learning_rate': 9.266666666666666e-05, 'epoch': 3.22}


 54%|█████▍    | 16200/30000 [5:46:46<4:27:17,  1.16s/it]

{'loss': 2.0697, 'grad_norm': 6.979964256286621, 'learning_rate': 9.200000000000001e-05, 'epoch': 3.24}


 54%|█████▍    | 16300/30000 [5:48:45<4:24:37,  1.16s/it]

{'loss': 2.1789, 'grad_norm': 9.458154678344727, 'learning_rate': 9.133333333333334e-05, 'epoch': 3.26}


 55%|█████▍    | 16400/30000 [5:50:44<4:41:10,  1.24s/it]

{'loss': 1.9371, 'grad_norm': 9.095229148864746, 'learning_rate': 9.066666666666667e-05, 'epoch': 3.28}


 55%|█████▌    | 16500/30000 [5:52:46<4:34:27,  1.22s/it]

{'loss': 2.0851, 'grad_norm': 5.411893844604492, 'learning_rate': 9e-05, 'epoch': 3.3}


 55%|█████▌    | 16600/30000 [5:54:52<4:34:28,  1.23s/it]

{'loss': 2.1445, 'grad_norm': 6.035506248474121, 'learning_rate': 8.933333333333334e-05, 'epoch': 3.32}


 56%|█████▌    | 16700/30000 [5:56:52<4:40:45,  1.27s/it]

{'loss': 2.0774, 'grad_norm': 6.319214820861816, 'learning_rate': 8.866666666666668e-05, 'epoch': 3.34}


 56%|█████▌    | 16800/30000 [5:58:50<4:11:38,  1.14s/it]

{'loss': 2.1691, 'grad_norm': 6.514753341674805, 'learning_rate': 8.800000000000001e-05, 'epoch': 3.36}


 56%|█████▋    | 16900/30000 [6:00:51<4:07:34,  1.13s/it]

{'loss': 2.2196, 'grad_norm': 6.731828212738037, 'learning_rate': 8.733333333333333e-05, 'epoch': 3.38}


 57%|█████▋    | 17000/30000 [6:02:55<4:11:55,  1.16s/it]

{'loss': 2.222, 'grad_norm': 4.929152965545654, 'learning_rate': 8.666666666666667e-05, 'epoch': 3.4}


 57%|█████▋    | 17100/30000 [6:04:57<4:08:28,  1.16s/it]

{'loss': 2.1264, 'grad_norm': 5.723608016967773, 'learning_rate': 8.6e-05, 'epoch': 3.42}


 57%|█████▋    | 17200/30000 [6:06:57<4:05:25,  1.15s/it]

{'loss': 2.1734, 'grad_norm': 13.579743385314941, 'learning_rate': 8.533333333333334e-05, 'epoch': 3.44}


 58%|█████▊    | 17300/30000 [6:08:59<4:30:04,  1.28s/it]

{'loss': 2.1362, 'grad_norm': 5.729812145233154, 'learning_rate': 8.466666666666667e-05, 'epoch': 3.46}


 58%|█████▊    | 17400/30000 [6:11:00<4:13:50,  1.21s/it]

{'loss': 2.0668, 'grad_norm': 4.035669326782227, 'learning_rate': 8.4e-05, 'epoch': 3.48}


 58%|█████▊    | 17500/30000 [6:13:04<4:14:36,  1.22s/it]

{'loss': 2.0209, 'grad_norm': 3.8938097953796387, 'learning_rate': 8.333333333333334e-05, 'epoch': 3.5}


 59%|█████▊    | 17600/30000 [6:15:07<4:30:12,  1.31s/it]

{'loss': 2.1316, 'grad_norm': 4.790655136108398, 'learning_rate': 8.266666666666667e-05, 'epoch': 3.52}


 59%|█████▉    | 17700/30000 [6:17:02<3:56:58,  1.16s/it]

{'loss': 2.2486, 'grad_norm': 5.595198631286621, 'learning_rate': 8.2e-05, 'epoch': 3.54}


 59%|█████▉    | 17800/30000 [6:18:54<3:41:54,  1.09s/it]

{'loss': 2.14, 'grad_norm': 6.499783039093018, 'learning_rate': 8.133333333333334e-05, 'epoch': 3.56}


 60%|█████▉    | 17900/30000 [6:20:47<3:47:05,  1.13s/it]

{'loss': 1.9237, 'grad_norm': 6.540791034698486, 'learning_rate': 8.066666666666667e-05, 'epoch': 3.58}


 60%|██████    | 18000/30000 [6:40:42<4:04:57,  1.22s/it]   

{'loss': 2.1298, 'grad_norm': 7.776782989501953, 'learning_rate': 8e-05, 'epoch': 3.6}


 60%|██████    | 18100/30000 [6:43:11<5:37:57,  1.70s/it]

{'loss': 2.1434, 'grad_norm': 7.44364595413208, 'learning_rate': 7.933333333333334e-05, 'epoch': 3.62}


 61%|██████    | 18200/30000 [6:45:39<4:15:04,  1.30s/it]

{'loss': 2.088, 'grad_norm': 5.241666316986084, 'learning_rate': 7.866666666666666e-05, 'epoch': 3.64}


 61%|██████    | 18300/30000 [6:48:10<5:34:18,  1.71s/it]

{'loss': 2.1608, 'grad_norm': 4.427433490753174, 'learning_rate': 7.800000000000001e-05, 'epoch': 3.66}


 61%|██████▏   | 18400/30000 [6:50:27<4:09:12,  1.29s/it]

{'loss': 2.0499, 'grad_norm': 8.785063743591309, 'learning_rate': 7.733333333333333e-05, 'epoch': 3.68}


 62%|██████▏   | 18500/30000 [6:52:38<4:00:15,  1.25s/it]

{'loss': 2.0435, 'grad_norm': 4.309150218963623, 'learning_rate': 7.666666666666667e-05, 'epoch': 3.7}


 62%|██████▏   | 18600/30000 [6:54:58<4:19:25,  1.37s/it]

{'loss': 2.0956, 'grad_norm': 5.660992622375488, 'learning_rate': 7.6e-05, 'epoch': 3.72}


 62%|██████▏   | 18700/30000 [6:57:16<4:05:22,  1.30s/it]

{'loss': 2.1131, 'grad_norm': 4.643530368804932, 'learning_rate': 7.533333333333334e-05, 'epoch': 3.74}


 63%|██████▎   | 18800/30000 [6:59:29<4:38:04,  1.49s/it]

{'loss': 2.0913, 'grad_norm': 7.343771457672119, 'learning_rate': 7.466666666666667e-05, 'epoch': 3.76}


 63%|██████▎   | 18900/30000 [7:01:42<4:34:30,  1.48s/it]

{'loss': 2.2911, 'grad_norm': 5.437021732330322, 'learning_rate': 7.4e-05, 'epoch': 3.78}


 63%|██████▎   | 19000/30000 [7:04:11<4:40:33,  1.53s/it]

{'loss': 2.1366, 'grad_norm': 5.215383529663086, 'learning_rate': 7.333333333333333e-05, 'epoch': 3.8}


 64%|██████▎   | 19100/30000 [7:07:10<4:27:48,  1.47s/it]

{'loss': 2.1136, 'grad_norm': 9.557177543640137, 'learning_rate': 7.266666666666667e-05, 'epoch': 3.82}


 64%|██████▍   | 19200/30000 [7:09:43<4:22:22,  1.46s/it]

{'loss': 2.0814, 'grad_norm': 4.655855655670166, 'learning_rate': 7.2e-05, 'epoch': 3.84}


 64%|██████▍   | 19300/30000 [7:12:18<4:43:29,  1.59s/it]

{'loss': 2.1757, 'grad_norm': 6.213645935058594, 'learning_rate': 7.133333333333334e-05, 'epoch': 3.86}


 65%|██████▍   | 19400/30000 [7:15:10<4:33:20,  1.55s/it]

{'loss': 2.0476, 'grad_norm': 5.3244404792785645, 'learning_rate': 7.066666666666667e-05, 'epoch': 3.88}


 65%|██████▌   | 19500/30000 [7:17:43<4:10:39,  1.43s/it]

{'loss': 2.0917, 'grad_norm': 4.451261520385742, 'learning_rate': 7e-05, 'epoch': 3.9}


 65%|██████▌   | 19600/30000 [7:20:12<4:20:00,  1.50s/it]

{'loss': 2.2032, 'grad_norm': 6.4943766593933105, 'learning_rate': 6.933333333333334e-05, 'epoch': 3.92}


 66%|██████▌   | 19700/30000 [7:22:37<3:47:20,  1.32s/it]

{'loss': 2.0384, 'grad_norm': 5.378139495849609, 'learning_rate': 6.866666666666666e-05, 'epoch': 3.94}


 66%|██████▌   | 19800/30000 [7:25:08<4:21:05,  1.54s/it]

{'loss': 2.2609, 'grad_norm': 10.043134689331055, 'learning_rate': 6.800000000000001e-05, 'epoch': 3.96}


 66%|██████▋   | 19900/30000 [7:27:27<3:48:51,  1.36s/it]

{'loss': 2.1762, 'grad_norm': 7.9453301429748535, 'learning_rate': 6.733333333333333e-05, 'epoch': 3.98}


 67%|██████▋   | 20000/30000 [7:29:52<3:18:08,  1.19s/it]

{'loss': 2.0888, 'grad_norm': 3.585653781890869, 'learning_rate': 6.666666666666667e-05, 'epoch': 4.0}


                                                         
 67%|██████▋   | 20000/30000 [7:31:24<3:18:08,  1.19s/it]

{'eval_loss': 2.1932523250579834, 'eval_runtime': 89.1001, 'eval_samples_per_second': 11.223, 'eval_steps_per_second': 5.612, 'epoch': 4.0}


 67%|██████▋   | 20100/30000 [7:33:15<2:58:43,  1.08s/it] 

{'loss': 1.9941, 'grad_norm': 8.165247917175293, 'learning_rate': 6.6e-05, 'epoch': 4.02}


 67%|██████▋   | 20200/30000 [7:35:05<2:55:17,  1.07s/it]

{'loss': 1.9785, 'grad_norm': 5.734081745147705, 'learning_rate': 6.533333333333334e-05, 'epoch': 4.04}


 68%|██████▊   | 20300/30000 [7:36:59<3:05:56,  1.15s/it]

{'loss': 1.9376, 'grad_norm': 3.931157112121582, 'learning_rate': 6.466666666666666e-05, 'epoch': 4.06}


 68%|██████▊   | 20400/30000 [7:38:54<3:00:41,  1.13s/it]

{'loss': 2.0001, 'grad_norm': 5.398315906524658, 'learning_rate': 6.400000000000001e-05, 'epoch': 4.08}


 68%|██████▊   | 20500/30000 [7:40:47<2:55:25,  1.11s/it]

{'loss': 1.8672, 'grad_norm': 7.183920383453369, 'learning_rate': 6.333333333333333e-05, 'epoch': 4.1}


 69%|██████▊   | 20600/30000 [7:42:42<2:49:25,  1.08s/it]

{'loss': 1.9233, 'grad_norm': 6.573293209075928, 'learning_rate': 6.266666666666667e-05, 'epoch': 4.12}


 69%|██████▉   | 20700/30000 [7:44:34<2:41:59,  1.05s/it]

{'loss': 2.0113, 'grad_norm': 4.378442287445068, 'learning_rate': 6.2e-05, 'epoch': 4.14}


 69%|██████▉   | 20800/30000 [7:46:26<2:43:40,  1.07s/it]

{'loss': 1.9171, 'grad_norm': 7.8514251708984375, 'learning_rate': 6.133333333333334e-05, 'epoch': 4.16}


 70%|██████▉   | 20900/30000 [7:48:18<2:53:05,  1.14s/it]

{'loss': 1.8926, 'grad_norm': 6.380101203918457, 'learning_rate': 6.066666666666667e-05, 'epoch': 4.18}


 70%|███████   | 21000/30000 [7:50:11<2:38:50,  1.06s/it]

{'loss': 1.9016, 'grad_norm': 4.3058576583862305, 'learning_rate': 6e-05, 'epoch': 4.2}


 70%|███████   | 21100/30000 [7:52:12<2:56:16,  1.19s/it]

{'loss': 1.8273, 'grad_norm': 5.163041591644287, 'learning_rate': 5.9333333333333343e-05, 'epoch': 4.22}


 71%|███████   | 21200/30000 [7:54:04<2:40:52,  1.10s/it]

{'loss': 1.9825, 'grad_norm': 4.656503200531006, 'learning_rate': 5.866666666666667e-05, 'epoch': 4.24}


 71%|███████   | 21300/30000 [7:55:57<2:41:37,  1.11s/it]

{'loss': 1.93, 'grad_norm': 5.065099716186523, 'learning_rate': 5.8e-05, 'epoch': 4.26}


 71%|███████▏  | 21400/30000 [7:57:50<2:45:10,  1.15s/it]

{'loss': 1.9457, 'grad_norm': 10.849148750305176, 'learning_rate': 5.7333333333333336e-05, 'epoch': 4.28}


 72%|███████▏  | 21500/30000 [7:59:44<2:38:34,  1.12s/it]

{'loss': 2.0542, 'grad_norm': 8.230137825012207, 'learning_rate': 5.666666666666667e-05, 'epoch': 4.3}


 72%|███████▏  | 21600/30000 [8:01:40<2:41:17,  1.15s/it]

{'loss': 1.8554, 'grad_norm': 5.468996524810791, 'learning_rate': 5.6000000000000006e-05, 'epoch': 4.32}


 72%|███████▏  | 21700/30000 [8:03:33<2:25:52,  1.05s/it]

{'loss': 1.9837, 'grad_norm': 8.494721412658691, 'learning_rate': 5.5333333333333334e-05, 'epoch': 4.34}


 73%|███████▎  | 21800/30000 [8:05:36<2:33:31,  1.12s/it]

{'loss': 1.9736, 'grad_norm': 8.67777156829834, 'learning_rate': 5.466666666666666e-05, 'epoch': 4.36}


 73%|███████▎  | 21900/30000 [8:07:29<2:29:10,  1.10s/it]

{'loss': 1.8562, 'grad_norm': 5.377033233642578, 'learning_rate': 5.4000000000000005e-05, 'epoch': 4.38}


 73%|███████▎  | 22000/30000 [8:09:23<2:28:39,  1.11s/it]

{'loss': 1.9805, 'grad_norm': 9.138154029846191, 'learning_rate': 5.333333333333333e-05, 'epoch': 4.4}


 74%|███████▎  | 22100/30000 [8:11:19<2:38:11,  1.20s/it]

{'loss': 1.9258, 'grad_norm': 9.513936996459961, 'learning_rate': 5.266666666666666e-05, 'epoch': 4.42}


 74%|███████▍  | 22200/30000 [8:13:15<2:34:47,  1.19s/it]

{'loss': 2.0022, 'grad_norm': 5.388346195220947, 'learning_rate': 5.2000000000000004e-05, 'epoch': 4.44}


 74%|███████▍  | 22300/30000 [8:15:20<2:41:47,  1.26s/it]

{'loss': 1.9026, 'grad_norm': 9.702619552612305, 'learning_rate': 5.133333333333333e-05, 'epoch': 4.46}


 75%|███████▍  | 22400/30000 [8:17:15<2:16:54,  1.08s/it]

{'loss': 1.9605, 'grad_norm': 4.720794200897217, 'learning_rate': 5.0666666666666674e-05, 'epoch': 4.48}


 75%|███████▌  | 22500/30000 [8:19:08<2:16:38,  1.09s/it]

{'loss': 1.988, 'grad_norm': 7.658483505249023, 'learning_rate': 5e-05, 'epoch': 4.5}


 75%|███████▌  | 22600/30000 [8:21:02<2:14:01,  1.09s/it]

{'loss': 1.9295, 'grad_norm': 10.341439247131348, 'learning_rate': 4.933333333333334e-05, 'epoch': 4.52}


 76%|███████▌  | 22700/30000 [8:22:55<2:15:05,  1.11s/it]

{'loss': 1.9336, 'grad_norm': 6.656500339508057, 'learning_rate': 4.866666666666667e-05, 'epoch': 4.54}


 76%|███████▌  | 22800/30000 [8:24:52<2:14:52,  1.12s/it]

{'loss': 1.9439, 'grad_norm': 9.226197242736816, 'learning_rate': 4.8e-05, 'epoch': 4.56}


 76%|███████▋  | 22900/30000 [8:26:46<2:17:50,  1.16s/it]

{'loss': 2.0712, 'grad_norm': 7.373109340667725, 'learning_rate': 4.7333333333333336e-05, 'epoch': 4.58}


 77%|███████▋  | 23000/30000 [8:28:44<2:14:42,  1.15s/it]

{'loss': 1.9844, 'grad_norm': 4.251288414001465, 'learning_rate': 4.666666666666667e-05, 'epoch': 4.6}


 77%|███████▋  | 23100/30000 [8:30:44<2:28:03,  1.29s/it]

{'loss': 2.0908, 'grad_norm': 5.312857627868652, 'learning_rate': 4.600000000000001e-05, 'epoch': 4.62}


 77%|███████▋  | 23200/30000 [8:32:37<2:05:40,  1.11s/it]

{'loss': 2.0174, 'grad_norm': 8.047904968261719, 'learning_rate': 4.5333333333333335e-05, 'epoch': 4.64}


 78%|███████▊  | 23300/30000 [8:34:32<2:00:44,  1.08s/it]

{'loss': 2.0043, 'grad_norm': 4.633071422576904, 'learning_rate': 4.466666666666667e-05, 'epoch': 4.66}


 78%|███████▊  | 23400/30000 [8:36:28<2:17:40,  1.25s/it]

{'loss': 1.9865, 'grad_norm': 8.753836631774902, 'learning_rate': 4.4000000000000006e-05, 'epoch': 4.68}


 78%|███████▊  | 23500/30000 [8:38:22<2:07:49,  1.18s/it]

{'loss': 2.0181, 'grad_norm': 4.682305335998535, 'learning_rate': 4.3333333333333334e-05, 'epoch': 4.7}


 79%|███████▊  | 23600/30000 [8:40:23<2:11:43,  1.23s/it]

{'loss': 1.9887, 'grad_norm': 5.967099189758301, 'learning_rate': 4.266666666666667e-05, 'epoch': 4.72}


 79%|███████▉  | 23700/30000 [8:42:17<2:03:38,  1.18s/it]

{'loss': 2.0327, 'grad_norm': 8.47728157043457, 'learning_rate': 4.2e-05, 'epoch': 4.74}


 79%|███████▉  | 23800/30000 [8:44:11<2:00:36,  1.17s/it]

{'loss': 2.0755, 'grad_norm': 6.760278224945068, 'learning_rate': 4.133333333333333e-05, 'epoch': 4.76}


 80%|███████▉  | 23900/30000 [8:46:04<1:56:22,  1.14s/it]

{'loss': 1.9778, 'grad_norm': 6.092094421386719, 'learning_rate': 4.066666666666667e-05, 'epoch': 4.78}


 80%|████████  | 24000/30000 [8:47:58<1:54:52,  1.15s/it]

{'loss': 1.872, 'grad_norm': 5.847410202026367, 'learning_rate': 4e-05, 'epoch': 4.8}


 80%|████████  | 24100/30000 [8:49:55<1:49:27,  1.11s/it]

{'loss': 1.9493, 'grad_norm': 4.915213584899902, 'learning_rate': 3.933333333333333e-05, 'epoch': 4.82}


 81%|████████  | 24200/30000 [8:51:48<1:52:43,  1.17s/it]

{'loss': 1.9763, 'grad_norm': 6.440154075622559, 'learning_rate': 3.866666666666667e-05, 'epoch': 4.84}


 81%|████████  | 24300/30000 [8:53:42<1:47:07,  1.13s/it]

{'loss': 2.0096, 'grad_norm': 8.685290336608887, 'learning_rate': 3.8e-05, 'epoch': 4.86}


 81%|████████▏ | 24400/30000 [8:55:35<1:52:13,  1.20s/it]

{'loss': 2.0333, 'grad_norm': 7.4914727210998535, 'learning_rate': 3.733333333333334e-05, 'epoch': 4.88}


 82%|████████▏ | 24500/30000 [8:57:41<1:54:07,  1.24s/it]

{'loss': 1.9442, 'grad_norm': 6.895726680755615, 'learning_rate': 3.6666666666666666e-05, 'epoch': 4.9}


 82%|████████▏ | 24600/30000 [8:59:49<1:58:21,  1.32s/it]

{'loss': 1.8723, 'grad_norm': 7.386151313781738, 'learning_rate': 3.6e-05, 'epoch': 4.92}


 82%|████████▏ | 24700/30000 [9:01:52<1:40:30,  1.14s/it]

{'loss': 1.9532, 'grad_norm': 6.707838535308838, 'learning_rate': 3.5333333333333336e-05, 'epoch': 4.94}


 83%|████████▎ | 24800/30000 [9:03:58<1:49:33,  1.26s/it]

{'loss': 1.8508, 'grad_norm': 7.585244655609131, 'learning_rate': 3.466666666666667e-05, 'epoch': 4.96}


 83%|████████▎ | 24900/30000 [9:06:09<1:49:45,  1.29s/it]

{'loss': 1.9019, 'grad_norm': 8.184401512145996, 'learning_rate': 3.4000000000000007e-05, 'epoch': 4.98}


 83%|████████▎ | 25000/30000 [9:08:12<1:44:24,  1.25s/it]

{'loss': 1.9844, 'grad_norm': 5.2767558097839355, 'learning_rate': 3.3333333333333335e-05, 'epoch': 5.0}


                                                         
 83%|████████▎ | 25000/30000 [9:09:49<1:44:24,  1.25s/it]

{'eval_loss': 2.188234329223633, 'eval_runtime': 93.6611, 'eval_samples_per_second': 10.677, 'eval_steps_per_second': 5.338, 'epoch': 5.0}


 84%|████████▎ | 25100/30000 [9:11:57<1:42:30,  1.26s/it] 

{'loss': 1.8171, 'grad_norm': 4.568292140960693, 'learning_rate': 3.266666666666667e-05, 'epoch': 5.02}


 84%|████████▍ | 25200/30000 [9:14:03<1:45:32,  1.32s/it]

{'loss': 1.8319, 'grad_norm': 4.681552886962891, 'learning_rate': 3.2000000000000005e-05, 'epoch': 5.04}


 84%|████████▍ | 25300/30000 [9:16:10<1:46:33,  1.36s/it]

{'loss': 1.8527, 'grad_norm': 8.864524841308594, 'learning_rate': 3.1333333333333334e-05, 'epoch': 5.06}


 85%|████████▍ | 25400/30000 [9:18:16<1:36:09,  1.25s/it]

{'loss': 1.9104, 'grad_norm': 6.680832862854004, 'learning_rate': 3.066666666666667e-05, 'epoch': 5.08}


 85%|████████▌ | 25500/30000 [9:20:24<1:29:40,  1.20s/it]

{'loss': 1.8026, 'grad_norm': 5.419872760772705, 'learning_rate': 3e-05, 'epoch': 5.1}


 85%|████████▌ | 25600/30000 [9:22:37<1:34:50,  1.29s/it]

{'loss': 1.9349, 'grad_norm': 10.569047927856445, 'learning_rate': 2.9333333333333336e-05, 'epoch': 5.12}


 86%|████████▌ | 25700/30000 [9:24:44<1:32:15,  1.29s/it]

{'loss': 1.8154, 'grad_norm': 4.328467845916748, 'learning_rate': 2.8666666666666668e-05, 'epoch': 5.14}


 86%|████████▌ | 25800/30000 [9:26:53<1:23:02,  1.19s/it]

{'loss': 1.9939, 'grad_norm': 3.518089771270752, 'learning_rate': 2.8000000000000003e-05, 'epoch': 5.16}


 86%|████████▋ | 25900/30000 [9:29:01<1:22:27,  1.21s/it]

{'loss': 1.9216, 'grad_norm': 3.9880926609039307, 'learning_rate': 2.733333333333333e-05, 'epoch': 5.18}


 87%|████████▋ | 26000/30000 [9:31:09<1:22:29,  1.24s/it]

{'loss': 1.7707, 'grad_norm': 6.050681114196777, 'learning_rate': 2.6666666666666667e-05, 'epoch': 5.2}


 87%|████████▋ | 26100/30000 [9:33:11<1:22:13,  1.27s/it]

{'loss': 1.9053, 'grad_norm': 6.108247756958008, 'learning_rate': 2.6000000000000002e-05, 'epoch': 5.22}


 87%|████████▋ | 26200/30000 [9:35:12<1:10:40,  1.12s/it]

{'loss': 1.9315, 'grad_norm': 5.059654712677002, 'learning_rate': 2.5333333333333337e-05, 'epoch': 5.24}


 88%|████████▊ | 26300/30000 [9:37:08<1:19:08,  1.28s/it]

{'loss': 1.8916, 'grad_norm': 5.90951681137085, 'learning_rate': 2.466666666666667e-05, 'epoch': 5.26}


 88%|████████▊ | 26400/30000 [9:39:09<1:10:31,  1.18s/it]

{'loss': 1.8051, 'grad_norm': 4.86004114151001, 'learning_rate': 2.4e-05, 'epoch': 5.28}


 88%|████████▊ | 26500/30000 [9:41:06<1:10:28,  1.21s/it]

{'loss': 1.8082, 'grad_norm': 6.202188968658447, 'learning_rate': 2.3333333333333336e-05, 'epoch': 5.3}


 89%|████████▊ | 26600/30000 [9:43:07<1:07:15,  1.19s/it]

{'loss': 1.8959, 'grad_norm': 4.460750579833984, 'learning_rate': 2.2666666666666668e-05, 'epoch': 5.32}


 89%|████████▉ | 26700/30000 [9:45:06<1:02:41,  1.14s/it]

{'loss': 1.8434, 'grad_norm': 7.297046184539795, 'learning_rate': 2.2000000000000003e-05, 'epoch': 5.34}


 89%|████████▉ | 26800/30000 [9:47:07<1:07:07,  1.26s/it]

{'loss': 1.7961, 'grad_norm': 7.657840728759766, 'learning_rate': 2.1333333333333335e-05, 'epoch': 5.36}


 90%|████████▉ | 26900/30000 [9:49:03<1:01:29,  1.19s/it]

{'loss': 1.9374, 'grad_norm': 5.368454456329346, 'learning_rate': 2.0666666666666666e-05, 'epoch': 5.38}


 90%|█████████ | 27000/30000 [9:50:56<54:20,  1.09s/it]  

{'loss': 1.7897, 'grad_norm': 5.876288890838623, 'learning_rate': 2e-05, 'epoch': 5.4}


 90%|█████████ | 27100/30000 [9:52:54<57:40,  1.19s/it]  

{'loss': 1.8033, 'grad_norm': 6.115242004394531, 'learning_rate': 1.9333333333333333e-05, 'epoch': 5.42}


 91%|█████████ | 27200/30000 [9:54:49<54:47,  1.17s/it]  

{'loss': 1.9187, 'grad_norm': 6.299952507019043, 'learning_rate': 1.866666666666667e-05, 'epoch': 5.44}


 91%|█████████ | 27300/30000 [9:56:45<51:27,  1.14s/it]

{'loss': 1.9236, 'grad_norm': 4.012822151184082, 'learning_rate': 1.8e-05, 'epoch': 5.46}


 91%|█████████▏| 27400/30000 [9:58:42<49:22,  1.14s/it]

{'loss': 1.8275, 'grad_norm': 5.1001386642456055, 'learning_rate': 1.7333333333333336e-05, 'epoch': 5.48}


 92%|█████████▏| 27500/30000 [10:00:40<50:29,  1.21s/it]

{'loss': 1.6688, 'grad_norm': 4.703350067138672, 'learning_rate': 1.6666666666666667e-05, 'epoch': 5.5}


 92%|█████████▏| 27600/30000 [10:02:40<48:33,  1.21s/it]  

{'loss': 1.8422, 'grad_norm': 7.730883598327637, 'learning_rate': 1.6000000000000003e-05, 'epoch': 5.52}


 92%|█████████▏| 27700/30000 [10:04:45<41:19,  1.08s/it]

{'loss': 1.9787, 'grad_norm': 8.782158851623535, 'learning_rate': 1.5333333333333334e-05, 'epoch': 5.54}


 93%|█████████▎| 27800/30000 [10:06:42<41:37,  1.14s/it]

{'loss': 1.743, 'grad_norm': 6.852510929107666, 'learning_rate': 1.4666666666666668e-05, 'epoch': 5.56}


 93%|█████████▎| 27900/30000 [10:08:38<40:43,  1.16s/it]

{'loss': 1.888, 'grad_norm': 5.59823751449585, 'learning_rate': 1.4000000000000001e-05, 'epoch': 5.58}


 93%|█████████▎| 28000/30000 [10:10:40<41:48,  1.25s/it]

{'loss': 1.812, 'grad_norm': 4.4308247566223145, 'learning_rate': 1.3333333333333333e-05, 'epoch': 5.6}


 94%|█████████▎| 28100/30000 [10:12:42<39:31,  1.25s/it]  

{'loss': 1.7665, 'grad_norm': 3.2403876781463623, 'learning_rate': 1.2666666666666668e-05, 'epoch': 5.62}


 94%|█████████▍| 28200/30000 [10:14:41<38:24,  1.28s/it]

{'loss': 1.8624, 'grad_norm': 10.677203178405762, 'learning_rate': 1.2e-05, 'epoch': 5.64}


 94%|█████████▍| 28300/30000 [10:16:43<33:57,  1.20s/it]

{'loss': 1.8102, 'grad_norm': 7.783634185791016, 'learning_rate': 1.1333333333333334e-05, 'epoch': 5.66}


 95%|█████████▍| 28400/30000 [10:18:42<32:20,  1.21s/it]

{'loss': 1.8682, 'grad_norm': 3.2383556365966797, 'learning_rate': 1.0666666666666667e-05, 'epoch': 5.68}


 95%|█████████▌| 28500/30000 [10:20:41<28:53,  1.16s/it]

{'loss': 1.808, 'grad_norm': 7.461501598358154, 'learning_rate': 1e-05, 'epoch': 5.7}


 95%|█████████▌| 28600/30000 [10:22:49<28:04,  1.20s/it]

{'loss': 1.7873, 'grad_norm': 3.8842689990997314, 'learning_rate': 9.333333333333334e-06, 'epoch': 5.72}


 96%|█████████▌| 28700/30000 [10:24:50<28:32,  1.32s/it]

{'loss': 1.8509, 'grad_norm': 4.710551738739014, 'learning_rate': 8.666666666666668e-06, 'epoch': 5.74}


 96%|█████████▌| 28800/30000 [10:26:51<25:19,  1.27s/it]

{'loss': 1.7727, 'grad_norm': 4.740843296051025, 'learning_rate': 8.000000000000001e-06, 'epoch': 5.76}


 96%|█████████▋| 28900/30000 [10:28:54<24:37,  1.34s/it]

{'loss': 1.8396, 'grad_norm': 10.908681869506836, 'learning_rate': 7.333333333333334e-06, 'epoch': 5.78}


 97%|█████████▋| 29000/30000 [10:30:56<20:32,  1.23s/it]

{'loss': 1.9834, 'grad_norm': 8.203012466430664, 'learning_rate': 6.666666666666667e-06, 'epoch': 5.8}


 97%|█████████▋| 29100/30000 [10:33:00<20:06,  1.34s/it]

{'loss': 1.9231, 'grad_norm': 9.927766799926758, 'learning_rate': 6e-06, 'epoch': 5.82}


 97%|█████████▋| 29200/30000 [10:34:58<16:58,  1.27s/it]

{'loss': 1.9029, 'grad_norm': 9.455146789550781, 'learning_rate': 5.333333333333334e-06, 'epoch': 5.84}


 98%|█████████▊| 29300/30000 [10:37:00<13:59,  1.20s/it]

{'loss': 1.9075, 'grad_norm': 4.710106372833252, 'learning_rate': 4.666666666666667e-06, 'epoch': 5.86}


 98%|█████████▊| 29400/30000 [10:39:04<12:31,  1.25s/it]

{'loss': 1.9188, 'grad_norm': 12.21905517578125, 'learning_rate': 4.000000000000001e-06, 'epoch': 5.88}


 98%|█████████▊| 29500/30000 [10:41:07<10:17,  1.24s/it]

{'loss': 1.9181, 'grad_norm': 6.333349704742432, 'learning_rate': 3.3333333333333333e-06, 'epoch': 5.9}


 99%|█████████▊| 29600/30000 [10:43:10<07:34,  1.14s/it]

{'loss': 1.9312, 'grad_norm': 4.23691463470459, 'learning_rate': 2.666666666666667e-06, 'epoch': 5.92}


 99%|█████████▉| 29700/30000 [10:45:11<06:17,  1.26s/it]

{'loss': 1.7741, 'grad_norm': 5.274267673492432, 'learning_rate': 2.0000000000000003e-06, 'epoch': 5.94}


 99%|█████████▉| 29800/30000 [10:47:08<03:50,  1.15s/it]

{'loss': 1.9093, 'grad_norm': 4.860395431518555, 'learning_rate': 1.3333333333333334e-06, 'epoch': 5.96}


100%|█████████▉| 29900/30000 [10:49:05<02:00,  1.21s/it]

{'loss': 1.842, 'grad_norm': 6.313051700592041, 'learning_rate': 6.666666666666667e-07, 'epoch': 5.98}


100%|██████████| 30000/30000 [10:51:08<00:00,  1.13s/it]

{'loss': 1.8363, 'grad_norm': 4.338659286499023, 'learning_rate': 0.0, 'epoch': 6.0}


                                                        
100%|██████████| 30000/30000 [10:52:40<00:00,  1.31s/it]

{'eval_loss': 2.1863021850585938, 'eval_runtime': 89.9491, 'eval_samples_per_second': 11.117, 'eval_steps_per_second': 5.559, 'epoch': 6.0}
{'train_runtime': 39160.9488, 'train_samples_per_second': 1.532, 'train_steps_per_second': 0.766, 'train_loss': 2.524774772644043, 'epoch': 6.0}
Training completed!





In [10]:
trainer.save_model("./mt5-npi-en")
tokenizer.save_pretrained("./mt5-npi-en")

('./mt5-npi-en/tokenizer_config.json',
 './mt5-npi-en/special_tokens_map.json',
 './mt5-npi-en/spiece.model',
 './mt5-npi-en/added_tokens.json')

In [11]:
test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Evaluation:", test_results)


100%|██████████| 100/100 [00:15<00:00,  6.37it/s]

Test Evaluation: {'eval_loss': 2.2883307933807373, 'eval_runtime': 15.8664, 'eval_samples_per_second': 12.605, 'eval_steps_per_second': 6.303, 'epoch': 6.0}





In [13]:
from tqdm import tqdm
import torch

bleu = evaluate.load("bleu")

# For single-GPU CPU/accelerator inference
model.eval()

batch_size = 8  # You can tune this
sources = dataset["test"]["source"]
references = dataset["test"]["target"]

predictions = []

for i in tqdm(range(0, len(sources), batch_size), desc="Generating translations"):
    batch_src = sources[i:i + batch_size]
    batch_inputs = ["translate Nepali to English: " + s for s in batch_src]
    
    inputs = tokenizer(
        batch_inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=300
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=300,
            num_beams=4
        )
    
    batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_preds)

# Print a few samples to check translation quality
for i in range(30):
    print(f"\nSource    : {sources[i]}")
    print(f"Reference : {references[i]}")
    print(f"Predicted : {predictions[i]}")

bleu_score = bleu.compute(
    predictions=[p.strip() for p in predictions],
    references=[[r.strip()] for r in references]
)
print("Test BLEU:", bleu_score["bleu"])



Generating translations: 100%|██████████| 25/25 [04:10<00:00, 10.03s/it]


Source    : यसबाट युवकहरु पनि पीडित भएको देखिन्छन ।
Reference : Young people also suffer from it.
Predicted : The young people are not stunned.

Source    : बजाऊने लिस्ट
Reference : Playlist
Predicted : Playing List

Source    : त्यहाँ केही नियमहरू छन् जुन तपाईंले पछ्याउनु पर्छः
Reference : There are a few rules that you should follow:
Predicted : There are some rules that you need to follow:

Source    : सबै गीतहरु उनी आफैंले लेखेका हुन् ।
Reference : All songs were written by himself.
Predicted : He wrote all the songs he wrote.

Source    : मेरो एउटा जर्मन साथी छ ।
Reference : I had a German friend.
Predicted : I have a German friend.

Source    : तपाईँ वास्तवमै फाइल मेट्न चाहनुहुन्छ?

Reference : Do you really want to delete file ?

Predicted : Do you want to delete file?

Source    : यस कुरालाई लिएर आक्रोशित हुनुपर्ने कुनै आवश्यकता छैन ।
Reference : There is no need to get upset about this.
Predicted : There is no need to be worried about this.

Source    : पुरुष र महिला दुबैले य




: 