In [None]:
# finetune_layoutlm_perfect.py
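# Fine-tunes LayoutLM for token classification on FUNSD + SROIE (train and test splits of both).
# Inputs are padded to MAX_LENGTH tokens; documents longer than MAX_LENGTH tokens are truncated.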
# Using local weights: Dataset/SROIE2019/layoutlm-base-uncased

import os
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset as HFDataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
from tqdm.auto import tqdm
import json
import re

# ================================
# CONFIG
# ================================
# Filesystem locations: local pretrained weights, the two corpora, and the
# directory that receives checkpoints and logs.
PRETRAINED_PATH = "Dataset/SROIE2019/layoutlm-base-uncased"
FUNSD_DIR = "Dataset/FUNSD"
SROIE_TRAIN_DIR = "Dataset/SROIE2019/train"
SROIE_TEST_DIR = "Dataset/SROIE2019/test"
OUTPUT_DIR = "finetuned_layoutlm_final_perfect"
LOG_CSV = os.path.join(OUTPUT_DIR, "training_log.csv")

# Training hyper-parameters.
MAX_LENGTH = 512
BATCH_SIZE = 8
EPOCHS = 30
LR = 5e-5
WEIGHT_DECAY = 0.01
VAL_RATIO = 0.15

# The output directory must exist before anything is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Label set: "O" takes id 0, then every entity type contributes a
# consecutive (B-, I-) pair in the order listed below.
_ENTITY_TYPES = ("HEADER", "QUESTION", "ANSWER", "COMPANY", "DATE", "ADDRESS", "TOTAL")

label2id = {"O": 0}
for _entity in _ENTITY_TYPES:
    label2id[f"B-{_entity}"] = len(label2id)
    label2id[f"I-{_entity}"] = len(label2id)

id2label = dict((idx, name) for name, idx in label2id.items())
num_labels = len(label2id)

# ================================
# DATASET
# ================================
class DocumentDataset(Dataset):
    """Word-level FUNSD and SROIE examples held in memory.

    Every example is a dict with three parallel lists:

    * ``words``    - text of each annotated unit,
    * ``bboxes``   - ``[x0, y0, x1, y1]`` as stored in the annotation files,
    * ``ner_tags`` - label strings such as ``"B-HEADER"`` or ``"O"``.

    FUNSD examples are read first (``training_data`` then ``testing_data``),
    followed by SROIE (train directory, then test directory). Files inside a
    directory are visited in sorted order so the dataset order is
    reproducible across runs and platforms.

    Args:
        tokenizer: Stored as ``self.tokenizer``; it is not used while loading.
    """

    # FUNSD entity label (lower-cased) -> tag; any other label becomes "O".
    _FUNSD_TAGS = {"header": "B-HEADER", "question": "B-QUESTION", "answer": "B-ANSWER"}
    # SROIE entity key (lower-cased) -> tag suffix; any other key is ignored.
    _SROIE_KEYS = {"company": "COMPANY", "date": "DATE", "address": "ADDRESS", "total": "TOTAL"}

    def __init__(self, tokenizer):
        self.examples = []
        self.tokenizer = tokenizer
        self._load_funsd()
        self._load_sroie()

    def _add_example(self, words, bboxes, labels):
        """Append one example unless the document produced no words."""
        if words:
            self.examples.append({"words": words, "bboxes": bboxes, "ner_tags": labels})

    def _load_funsd(self):
        """Read every FUNSD annotation JSON; each ``form`` item becomes one word."""
        for split in ("training_data", "testing_data"):
            ann_dir = os.path.join(FUNSD_DIR, split, "annotations")
            if not os.path.exists(ann_dir):
                continue
            for json_file in sorted(os.listdir(ann_dir)):
                if not json_file.endswith(".json"):
                    continue
                with open(os.path.join(ann_dir, json_file), "r", encoding="utf-8", errors="ignore") as f:
                    data = json.load(f)
                words, bboxes, labels = [], [], []
                for item in data["form"]:
                    text = item["text"].strip()
                    if not text:
                        continue
                    box = item["box"]
                    words.append(text)
                    bboxes.append([box[0], box[1], box[2], box[3]])
                    labels.append(self._FUNSD_TAGS.get(item["label"].lower(), "O"))
                self._add_example(words, bboxes, labels)

    def _load_sroie(self):
        """Read SROIE box files that have an entity file of the same name."""
        for sroie_dir in (SROIE_TRAIN_DIR, SROIE_TEST_DIR):
            box_dir = os.path.join(sroie_dir, "box")
            entity_dir = os.path.join(sroie_dir, "entities")
            if not os.path.exists(box_dir):
                continue
            for txt_file in sorted(os.listdir(box_dir)):
                if not txt_file.endswith(".txt"):
                    continue
                # The entity file carries the same file name as the box file.
                entity_path = os.path.join(entity_dir, txt_file)
                if not os.path.exists(entity_path):
                    continue

                with open(os.path.join(box_dir, txt_file), "r", encoding="utf-8", errors="ignore") as f:
                    words, bboxes = self._parse_sroie_boxes(f)
                with open(entity_path, "r", encoding="utf-8", errors="ignore") as f:
                    entities = self._parse_sroie_entities(f.read())

                self._add_example(words, bboxes, self._tag_sroie_words(words, entities))

    @staticmethod
    def _parse_sroie_boxes(lines):
        """Parse ``x1,y1,x2,y2,x3,y3,x4,y4,text`` lines into (words, bboxes).

        Only the first and third corner are kept, giving ``[x1, y1, x3, y3]``.
        Lines with fewer than nine fields, empty text, or non-integer
        coordinates are skipped instead of aborting the whole load.
        """
        words, bboxes = [], []
        for line in lines:
            # maxsplit=8 keeps commas that belong to the text itself.
            parts = line.strip().split(",", 8)
            if len(parts) != 9:
                continue
            x1, y1, _, _, x3, y3, _, _, text = parts
            text = text.strip()
            if not text:
                continue
            try:
                box = [int(x1), int(y1), int(x3), int(y3)]
            except ValueError:
                continue
            words.append(text)
            bboxes.append(box)
        return words, bboxes

    @staticmethod
    def _unquote(text):
        """Strip whitespace, a comma trailing a closing quote, and one pair of quotes."""
        text = text.strip()
        if len(text) >= 2 and text[-1] == "," and text[-2] in "\"'":
            text = text[:-1]
        if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
            text = text[1:-1].strip()
        return text

    @classmethod
    def _parse_sroie_entities(cls, raw):
        """Return ``[(tag, lower-cased value), ...]`` from an entity file's text.

        The text is first tried as a JSON object. If that fails, it is read
        line by line as ``key: value`` (split on the first colon), tolerating
        quoted keys/values. Unknown keys and empty values are dropped.
        """
        try:
            parsed = json.loads(raw)
        except ValueError:
            parsed = None

        if isinstance(parsed, dict):
            items = [(str(k), str(v)) for k, v in parsed.items() if v is not None]
        else:
            items = []
            for line in raw.split("\n"):
                if ":" not in line:
                    continue
                key, value = line.split(":", 1)
                items.append((cls._unquote(key), cls._unquote(value)))

        pairs = []
        for key, value in items:
            tag = cls._SROIE_KEYS.get(key.strip().lower())
            value = value.strip().lower()
            if tag and value:
                pairs.append((tag, value))
        return pairs

    @staticmethod
    def _tag_sroie_words(words, entities):
        """Assign BIO tags to ``words`` by substring matching against entities.

        A word matches an entity when either string contains the other
        (case-insensitive). The first word of a run of consecutive matches
        gets ``B-<tag>``, the following ones ``I-<tag>``. A word already
        tagged by an earlier entity keeps its tag and ends the current run.
        """
        labels = ["O"] * len(words)
        lowered = [w.lower() for w in words]
        for tag, value in entities:
            if not value:
                continue
            inside = False
            for i, w in enumerate(lowered):
                # Empty strings are substrings of everything; never match them.
                hit = bool(w) and labels[i] == "O" and (value in w or w in value)
                if hit:
                    labels[i] = f"I-{tag}" if inside else f"B-{tag}"
                inside = hit
        return labels

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        return {
            "words": item["words"],
            "bboxes": item["bboxes"],
            "ner_tags": item["ner_tags"]
        }

# ================================
# PREPROCESS FUNCTION
# ================================
# Tokenizer loaded from the same local directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_PATH)


def _scale_coord(value, extent):
    """Scale one coordinate by ``1000 / extent`` and clamp it to 0..1000."""
    return min(1000, max(0, int(value * 1000 / extent)))


def _normalize_box(box, max_x, max_y):
    """Return ``[x0, y0, x1, y1]`` scaled to 0..1000 with x0 <= x1 and y0 <= y1."""
    x_lo, x_hi = sorted((_scale_coord(box[0], max_x), _scale_coord(box[2], max_x)))
    y_lo, y_hi = sorted((_scale_coord(box[1], max_y), _scale_coord(box[3], max_y)))
    return [x_lo, y_lo, x_hi, y_hi]


def preprocess(example):
    """Tokenize one document and align labels and boxes to its tokens.

    Args:
        example: Dict with parallel lists ``words``, ``bboxes``
            (``[x0, y0, x1, y1]`` per word) and ``ner_tags`` (label strings).

    Returns:
        Dict of tensors, each with ``MAX_LENGTH`` entries along the first
        axis: ``input_ids``, ``attention_mask``, ``bbox`` and ``labels``.
        Only the first sub-token of a word carries the word's label id;
        special/padding tokens and later sub-tokens get ``-100``. Labels
        missing from ``label2id`` map to id 0.

    Note:
        The tokenizer is called with ``truncation=True``, so tokens beyond
        ``MAX_LENGTH`` are dropped.
    """
    words = example["words"]
    bboxes = example["bboxes"]
    labels = example["ner_tags"]

    encoding = tokenizer(
        words,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        is_split_into_words=True,
        return_tensors="pt"
    )

    # No page size is available here, so the largest x1 / y1 in the document
    # serves as the extent. Computed once per document rather than per token.
    max_x = max((b[2] for b in bboxes), default=0)
    max_y = max((b[3] for b in bboxes), default=0)
    if max_x <= 0:
        max_x = 1
    if max_y <= 0:
        max_y = 1

    input_labels = []
    input_boxes = []
    prev_wid = None
    for wid in encoding.word_ids():
        if wid is None:
            # Special or padding token.
            input_labels.append(-100)
            input_boxes.append([0, 0, 0, 0])
        elif wid == prev_wid:
            # Continuation sub-token: reuse the word's box, ignore in the loss.
            input_labels.append(-100)
            input_boxes.append(input_boxes[-1])
        else:
            input_labels.append(label2id.get(labels[wid], 0))
            input_boxes.append(_normalize_box(bboxes[wid], max_x, max_y))
            prev_wid = wid

    return {
        "input_ids": encoding.input_ids.flatten(),
        "attention_mask": encoding.attention_mask.flatten(),
        "bbox": torch.tensor(input_boxes),
        "labels": torch.tensor(input_labels)
    }

# ================================
# LOAD DATA
# ================================
raw_dataset = DocumentDataset(tokenizer)
hf_dataset = HFDataset.from_list([preprocess(ex) for ex in tqdm(raw_dataset, desc="Preprocessing")])

# Hold out VAL_RATIO of the examples for validation. The generator is seeded
# so the same examples land in each split on every run.
SPLIT_SEED = 42
train_size = int((1 - VAL_RATIO) * len(hf_dataset))
val_size = len(hf_dataset) - train_size
train_ds, val_ds = torch.utils.data.random_split(
    hf_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)

# ================================
# MODEL + TRAINER
# ================================
# NOTE(review): the captured output of this cell reports embedding and encoder
# weights as "newly initialized" when loading from PRETRAINED_PATH - confirm
# the local checkpoint actually matches this architecture.
model = AutoModelForTokenClassification.from_pretrained(
    PRETRAINED_PATH,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=OUTPUT_DIR,
    logging_steps=10,
    save_total_limit=2,
    report_to=[],
    fp16=torch.cuda.is_available()
)


def compute_metrics(p):
    """Token-level accuracy and macro F1 over positions whose label is not -100."""
    keep = p.label_ids != -100
    y_true = p.label_ids[keep]
    y_pred = p.predictions.argmax(-1)[keep]
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="macro")
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Persist the per-step / per-epoch log records to the CSV declared in CONFIG.
pd.DataFrame(trainer.state.log_history).to_csv(LOG_CSV, index=False)

print(f"Training complete! Best model saved to: {OUTPUT_DIR}")

  from .autonotebook import tqdm as notebook_tqdm
Preprocessing: 100%|██████████| 1172/1172 [00:03<00:00, 307.43it/s]
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at Dataset/SROIE2019/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.h_position_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.w_position_embeddings.weight', 'embeddings.word_embeddings.weight', 'embeddings.x_position_embeddings.weight', 'embeddings.y_position_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.quer

{'loss': 1.1862, 'grad_norm': 2.0486397743225098, 'learning_rate': 4.9880000000000004e-05, 'epoch': 0.08}


  1%|          | 20/3750 [04:21<13:08:35, 12.69s/it]

{'loss': 0.8544, 'grad_norm': 15.424850463867188, 'learning_rate': 4.974666666666667e-05, 'epoch': 0.16}


  1%|          | 30/3750 [06:35<13:50:37, 13.40s/it]

{'loss': 0.4952, 'grad_norm': 2.3452234268188477, 'learning_rate': 4.9613333333333335e-05, 'epoch': 0.24}


  1%|          | 40/3750 [08:43<13:06:24, 12.72s/it]

{'loss': 0.6487, 'grad_norm': 1.8993349075317383, 'learning_rate': 4.948000000000001e-05, 'epoch': 0.32}


  1%|▏         | 50/3750 [10:50<13:02:34, 12.69s/it]

{'loss': 0.6974, 'grad_norm': 3.217223644256592, 'learning_rate': 4.9346666666666666e-05, 'epoch': 0.4}


  2%|▏         | 60/3750 [12:57<13:12:54, 12.89s/it]

{'loss': 0.677, 'grad_norm': 9.501729011535645, 'learning_rate': 4.921333333333333e-05, 'epoch': 0.48}


  2%|▏         | 70/3750 [15:04<12:48:25, 12.53s/it]

{'loss': 0.602, 'grad_norm': 3.190091848373413, 'learning_rate': 4.9080000000000004e-05, 'epoch': 0.56}


  2%|▏         | 80/3750 [17:10<12:54:39, 12.66s/it]

{'loss': 0.5536, 'grad_norm': 11.48459529876709, 'learning_rate': 4.894666666666667e-05, 'epoch': 0.64}


  2%|▏         | 90/3750 [19:16<12:44:13, 12.53s/it]

{'loss': 0.4992, 'grad_norm': 9.755359649658203, 'learning_rate': 4.8813333333333336e-05, 'epoch': 0.72}


  3%|▎         | 100/3750 [21:22<12:48:40, 12.64s/it]

{'loss': 0.549, 'grad_norm': 2.732105255126953, 'learning_rate': 4.868e-05, 'epoch': 0.8}


  3%|▎         | 110/3750 [23:27<12:50:33, 12.70s/it]

{'loss': 0.4361, 'grad_norm': 5.491386413574219, 'learning_rate': 4.854666666666667e-05, 'epoch': 0.88}


  3%|▎         | 120/3750 [25:34<12:43:57, 12.63s/it]

{'loss': 0.6795, 'grad_norm': 1.024959683418274, 'learning_rate': 4.841333333333334e-05, 'epoch': 0.96}


                                                     
  3%|▎         | 125/3750 [27:43<10:49:48, 10.76s/it]

{'eval_loss': 0.45075035095214844, 'eval_accuracy': 0.8900325596050835, 'eval_f1': 0.23545429285912753, 'eval_runtime': 73.0434, 'eval_samples_per_second': 2.41, 'eval_steps_per_second': 0.301, 'epoch': 1.0}


  3%|▎         | 130/3750 [29:02<18:56:24, 18.84s/it]

{'loss': 0.5159, 'grad_norm': 3.920257568359375, 'learning_rate': 4.8280000000000005e-05, 'epoch': 1.04}


  4%|▎         | 140/3750 [31:09<12:52:08, 12.83s/it]

{'loss': 0.3799, 'grad_norm': 5.268338680267334, 'learning_rate': 4.814666666666667e-05, 'epoch': 1.12}


  4%|▍         | 150/3750 [33:15<12:42:14, 12.70s/it]

{'loss': 0.3198, 'grad_norm': 27.168132781982422, 'learning_rate': 4.801333333333334e-05, 'epoch': 1.2}


  4%|▍         | 160/3750 [35:21<12:28:45, 12.51s/it]

{'loss': 0.4204, 'grad_norm': 3.99286150932312, 'learning_rate': 4.788e-05, 'epoch': 1.28}


  5%|▍         | 170/3750 [37:27<12:36:50, 12.68s/it]

{'loss': 0.2857, 'grad_norm': 93.42219543457031, 'learning_rate': 4.774666666666667e-05, 'epoch': 1.36}


  5%|▍         | 180/3750 [39:33<12:31:47, 12.64s/it]

{'loss': 0.1593, 'grad_norm': 0.9104962944984436, 'learning_rate': 4.761333333333334e-05, 'epoch': 1.44}


  5%|▌         | 190/3750 [41:40<12:39:45, 12.80s/it]

{'loss': 0.3894, 'grad_norm': 2.6903059482574463, 'learning_rate': 4.748e-05, 'epoch': 1.52}


  5%|▌         | 200/3750 [43:47<12:30:37, 12.69s/it]

{'loss': 0.2035, 'grad_norm': 0.002902661683037877, 'learning_rate': 4.7346666666666665e-05, 'epoch': 1.6}


  6%|▌         | 210/3750 [45:54<12:26:43, 12.66s/it]

{'loss': 0.3358, 'grad_norm': 3.5156631469726562, 'learning_rate': 4.721333333333334e-05, 'epoch': 1.68}


  6%|▌         | 220/3750 [48:01<12:34:39, 12.83s/it]

{'loss': 0.2078, 'grad_norm': 1.6971646547317505, 'learning_rate': 4.708e-05, 'epoch': 1.76}


  6%|▌         | 230/3750 [50:08<12:23:39, 12.68s/it]

{'loss': 0.1787, 'grad_norm': 19.107589721679688, 'learning_rate': 4.694666666666667e-05, 'epoch': 1.84}


  6%|▋         | 240/3750 [52:14<12:09:10, 12.46s/it]

{'loss': 0.2092, 'grad_norm': 0.37898120284080505, 'learning_rate': 4.6813333333333335e-05, 'epoch': 1.92}


  7%|▋         | 250/3750 [54:14<10:30:05, 10.80s/it]

{'loss': 0.2777, 'grad_norm': 18.611955642700195, 'learning_rate': 4.668e-05, 'epoch': 2.0}


                                                     
  7%|▋         | 250/3750 [55:26<10:30:05, 10.80s/it]

{'eval_loss': 0.27984267473220825, 'eval_accuracy': 0.9196512971326541, 'eval_f1': 0.4344593803186664, 'eval_runtime': 72.1772, 'eval_samples_per_second': 2.438, 'eval_steps_per_second': 0.305, 'epoch': 2.0}


  7%|▋         | 260/3750 [57:47<13:11:47, 13.61s/it]

{'loss': 0.1963, 'grad_norm': 1.6983662843704224, 'learning_rate': 4.6546666666666666e-05, 'epoch': 2.08}


  7%|▋         | 270/3750 [59:53<12:17:47, 12.72s/it]

{'loss': 0.1577, 'grad_norm': 0.007294870913028717, 'learning_rate': 4.641333333333334e-05, 'epoch': 2.16}


  7%|▋         | 280/3750 [1:01:58<12:00:10, 12.45s/it]

{'loss': 0.4154, 'grad_norm': 2.6959526538848877, 'learning_rate': 4.6280000000000004e-05, 'epoch': 2.24}


  8%|▊         | 290/3750 [1:04:01<11:48:38, 12.29s/it]

{'loss': 0.1299, 'grad_norm': 19.225400924682617, 'learning_rate': 4.614666666666667e-05, 'epoch': 2.32}


  8%|▊         | 300/3750 [1:06:08<12:07:01, 12.64s/it]

{'loss': 0.2414, 'grad_norm': 0.4598172903060913, 'learning_rate': 4.6013333333333336e-05, 'epoch': 2.4}


  8%|▊         | 310/3750 [1:08:13<11:52:46, 12.43s/it]

{'loss': 0.1187, 'grad_norm': 0.0156340803951025, 'learning_rate': 4.588e-05, 'epoch': 2.48}


  9%|▊         | 320/3750 [1:10:20<12:13:55, 12.84s/it]

{'loss': 0.0811, 'grad_norm': 0.0014890508027747273, 'learning_rate': 4.5746666666666674e-05, 'epoch': 2.56}


  9%|▉         | 330/3750 [1:12:23<11:32:02, 12.14s/it]

{'loss': 0.1531, 'grad_norm': 1.93109130859375, 'learning_rate': 4.561333333333333e-05, 'epoch': 2.64}


  9%|▉         | 340/3750 [1:14:26<11:37:57, 12.28s/it]

{'loss': 0.2327, 'grad_norm': 1.8418164253234863, 'learning_rate': 4.548e-05, 'epoch': 2.72}


  9%|▉         | 350/3750 [1:16:28<11:28:15, 12.15s/it]

{'loss': 0.1646, 'grad_norm': 1.4852501153945923, 'learning_rate': 4.534666666666667e-05, 'epoch': 2.8}


 10%|▉         | 360/3750 [1:18:31<11:32:13, 12.25s/it]

{'loss': 0.1662, 'grad_norm': 0.008536835201084614, 'learning_rate': 4.5213333333333336e-05, 'epoch': 2.88}


 10%|▉         | 370/3750 [1:20:33<11:25:38, 12.17s/it]

{'loss': 0.3424, 'grad_norm': 3.8558616638183594, 'learning_rate': 4.508e-05, 'epoch': 2.96}


                                                       
 10%|█         | 375/3750 [1:22:38<9:45:24, 10.41s/it]

{'eval_loss': 0.15641465783119202, 'eval_accuracy': 0.94118264888142, 'eval_f1': 0.5294583058402872, 'eval_runtime': 70.4897, 'eval_samples_per_second': 2.497, 'eval_steps_per_second': 0.312, 'epoch': 3.0}


 10%|█         | 380/3750 [1:23:53<16:50:52, 18.00s/it]

{'loss': 0.1396, 'grad_norm': 2.402503252029419, 'learning_rate': 4.494666666666667e-05, 'epoch': 3.04}


 10%|█         | 390/3750 [1:25:56<11:33:41, 12.39s/it]

{'loss': 0.1293, 'grad_norm': 0.0006856205873191357, 'learning_rate': 4.4813333333333333e-05, 'epoch': 3.12}


 11%|█         | 400/3750 [1:27:57<11:24:40, 12.26s/it]

{'loss': 0.2165, 'grad_norm': 0.6694238185882568, 'learning_rate': 4.468e-05, 'epoch': 3.2}


 11%|█         | 410/3750 [1:30:00<11:27:37, 12.35s/it]

{'loss': 0.1384, 'grad_norm': 0.001251638401299715, 'learning_rate': 4.454666666666667e-05, 'epoch': 3.28}


 11%|█         | 420/3750 [1:32:03<11:29:43, 12.43s/it]

{'loss': 0.2311, 'grad_norm': 0.0023977106902748346, 'learning_rate': 4.441333333333334e-05, 'epoch': 3.36}


 11%|█▏        | 430/3750 [1:34:06<11:16:23, 12.22s/it]

{'loss': 0.1805, 'grad_norm': 7.7653985023498535, 'learning_rate': 4.428e-05, 'epoch': 3.44}


 12%|█▏        | 440/3750 [1:36:08<11:10:24, 12.15s/it]

{'loss': 0.1231, 'grad_norm': 0.37754854559898376, 'learning_rate': 4.414666666666667e-05, 'epoch': 3.52}


 12%|█▏        | 450/3750 [1:38:11<11:21:35, 12.39s/it]

{'loss': 0.2004, 'grad_norm': 1.7410446405410767, 'learning_rate': 4.4013333333333334e-05, 'epoch': 3.6}


 12%|█▏        | 460/3750 [1:40:13<11:08:47, 12.20s/it]

{'loss': 0.2286, 'grad_norm': 0.5359947681427002, 'learning_rate': 4.388000000000001e-05, 'epoch': 3.68}


 13%|█▎        | 470/3750 [1:42:16<11:11:48, 12.29s/it]

{'loss': 0.0547, 'grad_norm': 2.7900049686431885, 'learning_rate': 4.374666666666667e-05, 'epoch': 3.76}


 13%|█▎        | 480/3750 [1:44:20<11:00:03, 12.11s/it]

{'loss': 0.1665, 'grad_norm': 1.4726676940917969, 'learning_rate': 4.361333333333333e-05, 'epoch': 3.84}


 13%|█▎        | 490/3750 [1:46:23<11:04:04, 12.22s/it]

{'loss': 0.1341, 'grad_norm': 3.4813740253448486, 'learning_rate': 4.3480000000000004e-05, 'epoch': 3.92}


 13%|█▎        | 500/3750 [1:48:18<9:13:35, 10.22s/it] 

{'loss': 0.1182, 'grad_norm': 1.9143688678741455, 'learning_rate': 4.334666666666667e-05, 'epoch': 4.0}


                                                      
 13%|█▎        | 500/3750 [1:49:29<9:13:35, 10.22s/it]

{'eval_loss': 0.23758092522621155, 'eval_accuracy': 0.9397122151034555, 'eval_f1': 0.5245353959198423, 'eval_runtime': 70.5782, 'eval_samples_per_second': 2.494, 'eval_steps_per_second': 0.312, 'epoch': 4.0}


 14%|█▎        | 510/3750 [1:51:46<11:58:25, 13.30s/it]

{'loss': 0.1326, 'grad_norm': 0.6346926689147949, 'learning_rate': 4.3213333333333335e-05, 'epoch': 4.08}


 14%|█▍        | 520/3750 [1:53:48<11:05:37, 12.36s/it]

{'loss': 0.075, 'grad_norm': 1.2332388162612915, 'learning_rate': 4.308e-05, 'epoch': 4.16}


 14%|█▍        | 530/3750 [1:55:49<10:45:35, 12.03s/it]

{'loss': 0.1202, 'grad_norm': 0.003401298774406314, 'learning_rate': 4.2946666666666667e-05, 'epoch': 4.24}


 14%|█▍        | 540/3750 [1:57:51<10:50:06, 12.15s/it]

{'loss': 0.1516, 'grad_norm': 0.0040498776361346245, 'learning_rate': 4.281333333333333e-05, 'epoch': 4.32}


 15%|█▍        | 550/3750 [1:59:53<10:49:41, 12.18s/it]

{'loss': 0.1143, 'grad_norm': 1.847141146659851, 'learning_rate': 4.2680000000000005e-05, 'epoch': 4.4}


 15%|█▍        | 560/3750 [2:01:55<10:57:09, 12.36s/it]

{'loss': 0.1291, 'grad_norm': 0.005478695500642061, 'learning_rate': 4.254666666666667e-05, 'epoch': 4.48}


 15%|█▌        | 570/3750 [2:03:58<10:46:58, 12.21s/it]

{'loss': 0.101, 'grad_norm': 0.004491962026804686, 'learning_rate': 4.241333333333333e-05, 'epoch': 4.56}


 15%|█▌        | 580/3750 [2:06:01<10:45:11, 12.21s/it]

{'loss': 0.0823, 'grad_norm': 0.6893176436424255, 'learning_rate': 4.228e-05, 'epoch': 4.64}


 16%|█▌        | 590/3750 [2:08:02<10:40:46, 12.17s/it]

{'loss': 0.0981, 'grad_norm': 0.8050693273544312, 'learning_rate': 4.214666666666667e-05, 'epoch': 4.72}


 16%|█▌        | 600/3750 [2:10:05<10:48:41, 12.36s/it]

{'loss': 0.1358, 'grad_norm': 3.1629867553710938, 'learning_rate': 4.201333333333334e-05, 'epoch': 4.8}


 16%|█▋        | 610/3750 [2:12:08<10:47:14, 12.37s/it]

{'loss': 0.1393, 'grad_norm': 0.7522015571594238, 'learning_rate': 4.1880000000000006e-05, 'epoch': 4.88}


 17%|█▋        | 620/3750 [2:14:10<10:36:16, 12.20s/it]

{'loss': 0.0757, 'grad_norm': 1.139510154724121, 'learning_rate': 4.1746666666666665e-05, 'epoch': 4.96}


                                                       
 17%|█▋        | 625/3750 [2:16:16<8:58:26, 10.34s/it]

{'eval_loss': 0.15436945855617523, 'eval_accuracy': 0.9489549417078038, 'eval_f1': 0.5813487770194549, 'eval_runtime': 71.0582, 'eval_samples_per_second': 2.477, 'eval_steps_per_second': 0.31, 'epoch': 5.0}


 17%|█▋        | 630/3750 [2:17:31<15:42:07, 18.12s/it]

{'loss': 0.098, 'grad_norm': 0.6061702370643616, 'learning_rate': 4.161333333333334e-05, 'epoch': 5.04}


 17%|█▋        | 640/3750 [2:19:37<11:02:30, 12.78s/it]

{'loss': 0.0785, 'grad_norm': 0.0025287331081926823, 'learning_rate': 4.148e-05, 'epoch': 5.12}


 17%|█▋        | 650/3750 [2:21:40<10:45:39, 12.50s/it]

{'loss': 0.0602, 'grad_norm': 1.191095232963562, 'learning_rate': 4.134666666666667e-05, 'epoch': 5.2}


 18%|█▊        | 660/3750 [2:23:44<10:37:33, 12.38s/it]

{'loss': 0.0564, 'grad_norm': 2.073140859603882, 'learning_rate': 4.1213333333333334e-05, 'epoch': 5.28}


 18%|█▊        | 670/3750 [2:25:47<10:30:11, 12.28s/it]

{'loss': 0.0562, 'grad_norm': 1.2377110719680786, 'learning_rate': 4.108e-05, 'epoch': 5.36}


 18%|█▊        | 680/3750 [2:27:48<10:14:50, 12.02s/it]

{'loss': 0.0908, 'grad_norm': 1.3717527389526367, 'learning_rate': 4.0946666666666665e-05, 'epoch': 5.44}


 18%|█▊        | 690/3750 [2:29:48<10:11:22, 11.99s/it]

{'loss': 0.0697, 'grad_norm': 1.6378487348556519, 'learning_rate': 4.081333333333334e-05, 'epoch': 5.52}


 19%|█▊        | 700/3750 [2:31:51<10:21:02, 12.22s/it]

{'loss': 0.0924, 'grad_norm': 1.1652501821517944, 'learning_rate': 4.0680000000000004e-05, 'epoch': 5.6}


 19%|█▉        | 710/3750 [2:33:53<10:26:11, 12.36s/it]

{'loss': 0.0903, 'grad_norm': 2.8681156635284424, 'learning_rate': 4.054666666666667e-05, 'epoch': 5.68}


 19%|█▉        | 720/3750 [2:35:56<10:26:38, 12.41s/it]

{'loss': 0.083, 'grad_norm': 0.0012902297312393785, 'learning_rate': 4.0413333333333335e-05, 'epoch': 5.76}


 19%|█▉        | 730/3750 [2:37:59<10:16:09, 12.24s/it]

{'loss': 0.0819, 'grad_norm': 0.5278894305229187, 'learning_rate': 4.028e-05, 'epoch': 5.84}


 20%|█▉        | 740/3750 [2:39:59<10:01:46, 12.00s/it]

{'loss': 0.0854, 'grad_norm': 1.2409642934799194, 'learning_rate': 4.014666666666667e-05, 'epoch': 5.92}


 20%|██        | 750/3750 [2:41:55<8:33:51, 10.28s/it] 

{'loss': 0.059, 'grad_norm': 0.001266762032173574, 'learning_rate': 4.001333333333334e-05, 'epoch': 6.0}


                                                      
 20%|██        | 750/3750 [2:43:06<8:33:51, 10.28s/it]

{'eval_loss': 0.2271898239850998, 'eval_accuracy': 0.947064383993278, 'eval_f1': 0.6104012048586902, 'eval_runtime': 70.7635, 'eval_samples_per_second': 2.487, 'eval_steps_per_second': 0.311, 'epoch': 6.0}


 20%|██        | 760/3750 [2:45:19<10:44:43, 12.94s/it]

{'loss': 0.0799, 'grad_norm': 0.845068633556366, 'learning_rate': 3.988e-05, 'epoch': 6.08}


 21%|██        | 770/3750 [2:47:22<10:11:01, 12.30s/it]

{'loss': 0.0273, 'grad_norm': 0.890960693359375, 'learning_rate': 3.974666666666667e-05, 'epoch': 6.16}


 21%|██        | 780/3750 [2:49:25<10:03:50, 12.20s/it]

{'loss': 0.0551, 'grad_norm': 0.8611893653869629, 'learning_rate': 3.9613333333333336e-05, 'epoch': 6.24}


 21%|██        | 790/3750 [2:51:29<10:29:17, 12.76s/it]

{'loss': 0.0534, 'grad_norm': 1.0361279249191284, 'learning_rate': 3.948e-05, 'epoch': 6.32}


 21%|██▏       | 800/3750 [2:53:34<10:02:38, 12.26s/it]

{'loss': 0.0713, 'grad_norm': 0.5586753487586975, 'learning_rate': 3.9346666666666674e-05, 'epoch': 6.4}


 22%|██▏       | 810/3750 [2:55:40<10:05:04, 12.35s/it]

{'loss': 0.0632, 'grad_norm': 0.7820737957954407, 'learning_rate': 3.921333333333333e-05, 'epoch': 6.48}


 22%|██▏       | 820/3750 [2:57:44<10:05:56, 12.41s/it]

{'loss': 0.0412, 'grad_norm': 0.7088037729263306, 'learning_rate': 3.908e-05, 'epoch': 6.56}


 22%|██▏       | 830/3750 [2:59:45<9:42:21, 11.97s/it] 

{'loss': 0.0532, 'grad_norm': 1.4328194856643677, 'learning_rate': 3.894666666666667e-05, 'epoch': 6.64}


 22%|██▏       | 840/3750 [3:01:48<9:55:50, 12.29s/it]

{'loss': 0.0319, 'grad_norm': 1.3547019958496094, 'learning_rate': 3.881333333333334e-05, 'epoch': 6.72}


 23%|██▎       | 850/3750 [3:03:51<9:54:13, 12.29s/it] 

{'loss': 0.0592, 'grad_norm': 0.7570298910140991, 'learning_rate': 3.868e-05, 'epoch': 6.8}


 23%|██▎       | 860/3750 [3:05:54<9:51:42, 12.28s/it] 

{'loss': 0.0696, 'grad_norm': 0.31138068437576294, 'learning_rate': 3.854666666666667e-05, 'epoch': 6.88}


 23%|██▎       | 870/3750 [3:07:57<9:48:41, 12.26s/it] 

{'loss': 0.0373, 'grad_norm': 1.2486112117767334, 'learning_rate': 3.8413333333333334e-05, 'epoch': 6.96}


                                                      
 23%|██▎       | 875/3750 [3:10:03<8:11:06, 10.25s/it]

{'eval_loss': 0.18643957376480103, 'eval_accuracy': 0.9516857472954522, 'eval_f1': 0.6208734577966112, 'eval_runtime': 71.2082, 'eval_samples_per_second': 2.472, 'eval_steps_per_second': 0.309, 'epoch': 7.0}


 23%|██▎       | 880/3750 [3:11:19<14:25:27, 18.09s/it]

{'loss': 0.0431, 'grad_norm': 1.5527408123016357, 'learning_rate': 3.828e-05, 'epoch': 7.04}


 24%|██▎       | 890/3750 [3:13:22<9:51:18, 12.41s/it] 

{'loss': 0.0566, 'grad_norm': 0.0025006968062371016, 'learning_rate': 3.814666666666667e-05, 'epoch': 7.12}


 24%|██▍       | 900/3750 [3:15:25<9:39:08, 12.19s/it]

{'loss': 0.019, 'grad_norm': 0.5875598192214966, 'learning_rate': 3.801333333333333e-05, 'epoch': 7.2}


 24%|██▍       | 910/3750 [3:17:28<9:41:00, 12.27s/it]

{'loss': 0.0116, 'grad_norm': 0.027770336717367172, 'learning_rate': 3.788e-05, 'epoch': 7.28}


 25%|██▍       | 920/3750 [3:19:29<9:31:28, 12.12s/it]

{'loss': 0.0368, 'grad_norm': 0.4980275332927704, 'learning_rate': 3.774666666666667e-05, 'epoch': 7.36}


 25%|██▍       | 930/3750 [3:21:32<9:37:52, 12.30s/it]

{'loss': 0.034, 'grad_norm': 0.0006913927500136197, 'learning_rate': 3.7613333333333335e-05, 'epoch': 7.44}


 25%|██▌       | 940/3750 [3:23:36<9:49:09, 12.58s/it]

{'loss': 0.021, 'grad_norm': 0.0005818596691824496, 'learning_rate': 3.748000000000001e-05, 'epoch': 7.52}


 25%|██▌       | 950/3750 [3:25:40<9:46:16, 12.56s/it]

{'loss': 0.046, 'grad_norm': 1.0087746381759644, 'learning_rate': 3.7346666666666666e-05, 'epoch': 7.6}


 26%|██▌       | 960/3750 [3:27:44<9:29:37, 12.25s/it]

{'loss': 0.045, 'grad_norm': 1.3412619829177856, 'learning_rate': 3.721333333333333e-05, 'epoch': 7.68}


 26%|██▌       | 970/3750 [3:29:45<9:16:55, 12.02s/it]

{'loss': 0.0328, 'grad_norm': 0.5622690320014954, 'learning_rate': 3.7080000000000004e-05, 'epoch': 7.76}


 26%|██▌       | 980/3750 [3:31:47<9:21:43, 12.17s/it]

{'loss': 0.0254, 'grad_norm': 0.7438905835151672, 'learning_rate': 3.694666666666667e-05, 'epoch': 7.84}


 26%|██▋       | 990/3750 [3:33:48<9:11:18, 11.98s/it]

{'loss': 0.0481, 'grad_norm': 1.2077194452285767, 'learning_rate': 3.6813333333333335e-05, 'epoch': 7.92}


 27%|██▋       | 1000/3750 [3:35:44<7:51:36, 10.29s/it]

{'loss': 0.0236, 'grad_norm': 0.0005798490601591766, 'learning_rate': 3.668e-05, 'epoch': 8.0}


                                                       
 27%|██▋       | 1000/3750 [3:36:54<7:51:36, 10.29s/it]

{'eval_loss': 0.24758793413639069, 'eval_accuracy': 0.9447537023421909, 'eval_f1': 0.6207480319705185, 'eval_runtime': 69.6624, 'eval_samples_per_second': 2.526, 'eval_steps_per_second': 0.316, 'epoch': 8.0}


 27%|██▋       | 1010/3750 [3:39:11<10:05:11, 13.25s/it]

{'loss': 0.0205, 'grad_norm': 0.002585094189271331, 'learning_rate': 3.654666666666667e-05, 'epoch': 8.08}


 27%|██▋       | 1020/3750 [3:41:11<9:03:29, 11.94s/it] 

{'loss': 0.0172, 'grad_norm': 0.22069963812828064, 'learning_rate': 3.641333333333333e-05, 'epoch': 8.16}


 27%|██▋       | 1030/3750 [3:43:12<9:03:25, 11.99s/it]

{'loss': 0.0204, 'grad_norm': 0.0014102425193414092, 'learning_rate': 3.6280000000000005e-05, 'epoch': 8.24}


 28%|██▊       | 1040/3750 [3:45:15<9:12:27, 12.23s/it]

{'loss': 0.0222, 'grad_norm': 1.5367783308029175, 'learning_rate': 3.614666666666667e-05, 'epoch': 8.32}


 28%|██▊       | 1050/3750 [3:47:18<9:19:46, 12.44s/it]

{'loss': 0.0295, 'grad_norm': 1.0151636600494385, 'learning_rate': 3.6013333333333336e-05, 'epoch': 8.4}


 28%|██▊       | 1060/3750 [3:49:21<9:06:15, 12.18s/it]

{'loss': 0.0134, 'grad_norm': 0.757465124130249, 'learning_rate': 3.588e-05, 'epoch': 8.48}


 29%|██▊       | 1070/3750 [3:51:24<9:08:24, 12.28s/it]

{'loss': 0.0234, 'grad_norm': 0.5971685647964478, 'learning_rate': 3.574666666666667e-05, 'epoch': 8.56}


 29%|██▉       | 1080/3750 [3:53:27<9:04:42, 12.24s/it]

{'loss': 0.0208, 'grad_norm': 0.0007433338323608041, 'learning_rate': 3.561333333333334e-05, 'epoch': 8.64}


 29%|██▉       | 1090/3750 [3:55:30<9:13:07, 12.48s/it]

{'loss': 0.0323, 'grad_norm': 1.2313573360443115, 'learning_rate': 3.548e-05, 'epoch': 8.72}


 29%|██▉       | 1100/3750 [3:57:33<9:00:25, 12.24s/it]

{'loss': 0.0234, 'grad_norm': 0.7585819959640503, 'learning_rate': 3.5346666666666665e-05, 'epoch': 8.8}


 30%|██▉       | 1110/3750 [3:59:37<9:00:33, 12.29s/it]

{'loss': 0.0272, 'grad_norm': 1.7395477294921875, 'learning_rate': 3.521333333333334e-05, 'epoch': 8.88}


 30%|██▉       | 1120/3750 [4:01:40<8:57:45, 12.27s/it]

{'loss': 0.0319, 'grad_norm': 0.7983978390693665, 'learning_rate': 3.508e-05, 'epoch': 8.96}


                                                       
 30%|███       | 1125/3750 [4:03:46<7:32:53, 10.35s/it]

{'eval_loss': 0.3361940383911133, 'eval_accuracy': 0.9427581136435248, 'eval_f1': 0.6095579239310097, 'eval_runtime': 71.2121, 'eval_samples_per_second': 2.471, 'eval_steps_per_second': 0.309, 'epoch': 9.0}


 30%|███       | 1130/3750 [4:05:05<13:16:34, 18.24s/it]

{'loss': 0.0208, 'grad_norm': 1.4846285581588745, 'learning_rate': 3.494666666666667e-05, 'epoch': 9.04}


 30%|███       | 1140/3750 [4:07:08<9:09:15, 12.63s/it] 

{'loss': 0.0096, 'grad_norm': 0.7397037148475647, 'learning_rate': 3.4813333333333334e-05, 'epoch': 9.12}


 31%|███       | 1150/3750 [4:09:12<8:52:04, 12.28s/it]

{'loss': 0.0178, 'grad_norm': 0.20746921002864838, 'learning_rate': 3.468e-05, 'epoch': 9.2}


 31%|███       | 1160/3750 [4:11:14<8:45:15, 12.17s/it]

{'loss': 0.0078, 'grad_norm': 1.4592251777648926, 'learning_rate': 3.4546666666666666e-05, 'epoch': 9.28}


 31%|███       | 1170/3750 [4:13:16<8:53:35, 12.41s/it]

{'loss': 0.0151, 'grad_norm': 0.1324102133512497, 'learning_rate': 3.441333333333334e-05, 'epoch': 9.36}


 31%|███▏      | 1180/3750 [4:15:20<8:46:30, 12.29s/it]

{'loss': 0.0104, 'grad_norm': 1.7766811847686768, 'learning_rate': 3.4280000000000004e-05, 'epoch': 9.44}


 32%|███▏      | 1190/3750 [4:17:23<8:40:35, 12.20s/it]

{'loss': 0.0059, 'grad_norm': 0.24918700754642487, 'learning_rate': 3.414666666666666e-05, 'epoch': 9.52}


 32%|███▏      | 1200/3750 [4:19:26<8:42:03, 12.28s/it]

{'loss': 0.0195, 'grad_norm': 1.3170143365859985, 'learning_rate': 3.4013333333333335e-05, 'epoch': 9.6}


 32%|███▏      | 1210/3750 [4:21:29<8:38:48, 12.26s/it]

{'loss': 0.011, 'grad_norm': 0.6325759887695312, 'learning_rate': 3.388e-05, 'epoch': 9.68}


 33%|███▎      | 1220/3750 [4:23:32<8:42:30, 12.39s/it]

{'loss': 0.0225, 'grad_norm': 1.7666044235229492, 'learning_rate': 3.374666666666667e-05, 'epoch': 9.76}


 33%|███▎      | 1230/3750 [4:25:32<8:23:07, 11.98s/it]

{'loss': 0.0293, 'grad_norm': 1.4196405410766602, 'learning_rate': 3.361333333333333e-05, 'epoch': 9.84}


 33%|███▎      | 1240/3750 [4:27:36<8:41:06, 12.46s/it]

{'loss': 0.0139, 'grad_norm': 0.00033106518094427884, 'learning_rate': 3.348e-05, 'epoch': 9.92}


 33%|███▎      | 1250/3750 [4:29:35<7:24:54, 10.68s/it]

{'loss': 0.0211, 'grad_norm': 0.0004631364718079567, 'learning_rate': 3.334666666666667e-05, 'epoch': 10.0}


                                                       
 33%|███▎      | 1250/3750 [4:30:46<7:24:54, 10.68s/it]

{'eval_loss': 0.3608884811401367, 'eval_accuracy': 0.9407625249448587, 'eval_f1': 0.5891491271416877, 'eval_runtime': 70.6539, 'eval_samples_per_second': 2.491, 'eval_steps_per_second': 0.311, 'epoch': 10.0}


 34%|███▎      | 1260/3750 [4:33:03<9:15:06, 13.38s/it] 

{'loss': 0.0035, 'grad_norm': 0.0019893699791282415, 'learning_rate': 3.3213333333333336e-05, 'epoch': 10.08}


 34%|███▍      | 1270/3750 [4:35:06<8:34:26, 12.45s/it]

{'loss': 0.0106, 'grad_norm': 0.39585912227630615, 'learning_rate': 3.308e-05, 'epoch': 10.16}


 34%|███▍      | 1280/3750 [4:37:10<8:26:02, 12.29s/it]

{'loss': 0.0042, 'grad_norm': 0.7112096548080444, 'learning_rate': 3.294666666666667e-05, 'epoch': 10.24}


 34%|███▍      | 1290/3750 [4:39:12<8:21:22, 12.23s/it]

{'loss': 0.0211, 'grad_norm': 2.2234609127044678, 'learning_rate': 3.281333333333333e-05, 'epoch': 10.32}


 35%|███▍      | 1300/3750 [4:41:15<8:21:48, 12.29s/it]

{'loss': 0.0244, 'grad_norm': 1.028939127922058, 'learning_rate': 3.268e-05, 'epoch': 10.4}


 35%|███▍      | 1310/3750 [4:43:17<8:17:33, 12.24s/it]

{'loss': 0.0072, 'grad_norm': 1.7153229713439941, 'learning_rate': 3.254666666666667e-05, 'epoch': 10.48}


 35%|███▌      | 1320/3750 [4:45:18<8:07:25, 12.03s/it]

{'loss': 0.01, 'grad_norm': 0.22682854533195496, 'learning_rate': 3.241333333333334e-05, 'epoch': 10.56}


 35%|███▌      | 1330/3750 [4:47:20<8:14:57, 12.27s/it]

{'loss': 0.0113, 'grad_norm': 2.3383171558380127, 'learning_rate': 3.2279999999999996e-05, 'epoch': 10.64}


 36%|███▌      | 1340/3750 [4:49:24<8:09:31, 12.19s/it]

{'loss': 0.0229, 'grad_norm': 1.3797117471694946, 'learning_rate': 3.214666666666667e-05, 'epoch': 10.72}


 36%|███▌      | 1350/3750 [4:51:26<8:04:06, 12.10s/it]

{'loss': 0.0076, 'grad_norm': 0.00030522915767505765, 'learning_rate': 3.2013333333333334e-05, 'epoch': 10.8}


 36%|███▋      | 1360/3750 [4:53:28<8:04:21, 12.16s/it]

{'loss': 0.0075, 'grad_norm': 0.00030949950451031327, 'learning_rate': 3.188e-05, 'epoch': 10.88}


 37%|███▋      | 1370/3750 [4:55:31<8:05:26, 12.24s/it]

{'loss': 0.0164, 'grad_norm': 1.5273152589797974, 'learning_rate': 3.174666666666667e-05, 'epoch': 10.96}


                                                       
 37%|███▋      | 1375/3750 [4:57:35<6:42:31, 10.17s/it]

{'eval_loss': 0.37306153774261475, 'eval_accuracy': 0.9427581136435248, 'eval_f1': 0.6109777732226713, 'eval_runtime': 70.0289, 'eval_samples_per_second': 2.513, 'eval_steps_per_second': 0.314, 'epoch': 11.0}


 37%|███▋      | 1380/3750 [4:58:52<11:53:25, 18.06s/it]

{'loss': 0.0091, 'grad_norm': 0.4762701988220215, 'learning_rate': 3.161333333333333e-05, 'epoch': 11.04}


 37%|███▋      | 1390/3750 [5:00:54<8:06:07, 12.36s/it] 

{'loss': 0.0104, 'grad_norm': 0.04701482132077217, 'learning_rate': 3.1480000000000004e-05, 'epoch': 11.12}


 37%|███▋      | 1400/3750 [5:02:56<7:55:24, 12.14s/it]

{'loss': 0.0026, 'grad_norm': 1.7682491540908813, 'learning_rate': 3.134666666666667e-05, 'epoch': 11.2}


 38%|███▊      | 1410/3750 [5:04:59<8:03:33, 12.40s/it]

{'loss': 0.0096, 'grad_norm': 1.9402799606323242, 'learning_rate': 3.1213333333333335e-05, 'epoch': 11.28}


 38%|███▊      | 1420/3750 [5:07:02<8:02:46, 12.43s/it]

{'loss': 0.0145, 'grad_norm': 0.0005915941437706351, 'learning_rate': 3.108e-05, 'epoch': 11.36}


 38%|███▊      | 1430/3750 [5:09:05<7:51:08, 12.18s/it]

{'loss': 0.0092, 'grad_norm': 2.252258777618408, 'learning_rate': 3.0946666666666666e-05, 'epoch': 11.44}


 38%|███▊      | 1440/3750 [5:11:07<7:50:09, 12.21s/it]

{'loss': 0.0049, 'grad_norm': 0.00036351100425235927, 'learning_rate': 3.081333333333333e-05, 'epoch': 11.52}


 39%|███▊      | 1450/3750 [5:13:10<7:50:05, 12.26s/it]

{'loss': 0.011, 'grad_norm': 0.23213094472885132, 'learning_rate': 3.0680000000000004e-05, 'epoch': 11.6}


 39%|███▉      | 1460/3750 [5:15:13<7:54:19, 12.43s/it]

{'loss': 0.0037, 'grad_norm': 0.33459654450416565, 'learning_rate': 3.054666666666667e-05, 'epoch': 11.68}


 39%|███▉      | 1470/3750 [5:17:15<7:48:20, 12.32s/it]

{'loss': 0.0111, 'grad_norm': 1.2233741283416748, 'learning_rate': 3.0413333333333332e-05, 'epoch': 11.76}


 39%|███▉      | 1480/3750 [5:19:18<7:42:40, 12.23s/it]

{'loss': 0.0101, 'grad_norm': 1.6473172903060913, 'learning_rate': 3.028e-05, 'epoch': 11.84}


 40%|███▉      | 1490/3750 [5:21:21<7:41:40, 12.26s/it]

{'loss': 0.0103, 'grad_norm': 1.5763309001922607, 'learning_rate': 3.0146666666666667e-05, 'epoch': 11.92}


 40%|████      | 1500/3750 [5:23:18<6:25:56, 10.29s/it]

{'loss': 0.0091, 'grad_norm': 0.00027879097615368664, 'learning_rate': 3.0013333333333333e-05, 'epoch': 12.0}


                                                       
 40%|████      | 1500/3750 [5:24:29<6:25:56, 10.29s/it]

{'eval_loss': 0.3043343424797058, 'eval_accuracy': 0.9475895389139797, 'eval_f1': 0.628227527482468, 'eval_runtime': 70.975, 'eval_samples_per_second': 2.48, 'eval_steps_per_second': 0.31, 'epoch': 12.0}


 40%|████      | 1510/3750 [5:26:45<8:09:25, 13.11s/it] 

{'loss': 0.0063, 'grad_norm': 0.08729004114866257, 'learning_rate': 2.9880000000000002e-05, 'epoch': 12.08}


 41%|████      | 1520/3750 [5:28:48<7:42:48, 12.45s/it]

{'loss': 0.0043, 'grad_norm': 0.00029655566322617233, 'learning_rate': 2.9746666666666668e-05, 'epoch': 12.16}


 41%|████      | 1530/3750 [5:30:50<7:31:43, 12.21s/it]

{'loss': 0.0022, 'grad_norm': 0.6259177327156067, 'learning_rate': 2.9613333333333337e-05, 'epoch': 12.24}


 41%|████      | 1540/3750 [5:32:52<7:35:13, 12.36s/it]

{'loss': 0.011, 'grad_norm': 1.0662827491760254, 'learning_rate': 2.9480000000000002e-05, 'epoch': 12.32}


 41%|████▏     | 1550/3750 [5:34:55<7:28:08, 12.22s/it]

{'loss': 0.0046, 'grad_norm': 0.00028056700830347836, 'learning_rate': 2.9346666666666668e-05, 'epoch': 12.4}


 42%|████▏     | 1560/3750 [5:36:58<7:25:52, 12.22s/it]

{'loss': 0.0023, 'grad_norm': 0.00028642965480685234, 'learning_rate': 2.9213333333333337e-05, 'epoch': 12.48}


 42%|████▏     | 1570/3750 [5:39:00<7:23:20, 12.20s/it]

{'loss': 0.0092, 'grad_norm': 1.5021170377731323, 'learning_rate': 2.9080000000000003e-05, 'epoch': 12.56}


 42%|████▏     | 1580/3750 [5:41:03<7:24:16, 12.28s/it]

{'loss': 0.0087, 'grad_norm': 0.06891871988773346, 'learning_rate': 2.8946666666666665e-05, 'epoch': 12.64}


 42%|████▏     | 1590/3750 [5:43:06<7:21:56, 12.28s/it]

{'loss': 0.0063, 'grad_norm': 0.89606112241745, 'learning_rate': 2.8813333333333338e-05, 'epoch': 12.72}


 43%|████▎     | 1600/3750 [5:45:09<7:16:54, 12.19s/it]

{'loss': 0.0075, 'grad_norm': 1.323642611503601, 'learning_rate': 2.868e-05, 'epoch': 12.8}


 43%|████▎     | 1610/3750 [5:47:12<7:15:00, 12.20s/it]

{'loss': 0.0043, 'grad_norm': 0.6638923287391663, 'learning_rate': 2.8546666666666666e-05, 'epoch': 12.88}


 43%|████▎     | 1620/3750 [5:49:15<7:16:18, 12.29s/it]

{'loss': 0.0092, 'grad_norm': 0.5948591828346252, 'learning_rate': 2.8413333333333335e-05, 'epoch': 12.96}


                                                       
 43%|████▎     | 1625/3750 [5:51:21<6:05:58, 10.33s/it]

{'eval_loss': 0.3081783354282379, 'eval_accuracy': 0.9489549417078038, 'eval_f1': 0.6162062869137741, 'eval_runtime': 71.253, 'eval_samples_per_second': 2.47, 'eval_steps_per_second': 0.309, 'epoch': 13.0}


 43%|████▎     | 1630/3750 [5:52:35<10:35:05, 17.97s/it]

{'loss': 0.0099, 'grad_norm': 1.0490763187408447, 'learning_rate': 2.828e-05, 'epoch': 13.04}


 44%|████▎     | 1640/3750 [5:54:38<7:12:21, 12.29s/it] 

{'loss': 0.0024, 'grad_norm': 0.123544842004776, 'learning_rate': 2.8146666666666666e-05, 'epoch': 13.12}


 44%|████▍     | 1650/3750 [5:56:38<6:57:30, 11.93s/it]

{'loss': 0.0053, 'grad_norm': 0.7485452890396118, 'learning_rate': 2.8013333333333335e-05, 'epoch': 13.2}


 44%|████▍     | 1660/3750 [5:58:40<7:10:44, 12.37s/it]

{'loss': 0.0076, 'grad_norm': 2.0314724445343018, 'learning_rate': 2.788e-05, 'epoch': 13.28}


 45%|████▍     | 1670/3750 [6:00:43<7:03:24, 12.21s/it]

{'loss': 0.0055, 'grad_norm': 0.36430323123931885, 'learning_rate': 2.7746666666666666e-05, 'epoch': 13.36}


 45%|████▍     | 1680/3750 [6:02:46<7:02:09, 12.24s/it]

{'loss': 0.0044, 'grad_norm': 0.16548146307468414, 'learning_rate': 2.7613333333333335e-05, 'epoch': 13.44}


 45%|████▌     | 1690/3750 [6:04:48<6:58:39, 12.19s/it]

{'loss': 0.0051, 'grad_norm': 0.00024312363530043513, 'learning_rate': 2.748e-05, 'epoch': 13.52}


 45%|████▌     | 1700/3750 [6:06:51<7:00:00, 12.29s/it]

{'loss': 0.0019, 'grad_norm': 0.14754967391490936, 'learning_rate': 2.734666666666667e-05, 'epoch': 13.6}


 46%|████▌     | 1710/3750 [6:08:54<6:53:30, 12.16s/it]

{'loss': 0.0016, 'grad_norm': 0.8563933372497559, 'learning_rate': 2.7213333333333336e-05, 'epoch': 13.68}


 46%|████▌     | 1720/3750 [6:11:01<7:01:47, 12.47s/it]

{'loss': 0.0018, 'grad_norm': 0.046843934804201126, 'learning_rate': 2.7079999999999998e-05, 'epoch': 13.76}


 46%|████▌     | 1730/3750 [6:13:04<6:54:00, 12.30s/it]

{'loss': 0.0062, 'grad_norm': 0.5415218472480774, 'learning_rate': 2.694666666666667e-05, 'epoch': 13.84}


 46%|████▋     | 1740/3750 [6:15:06<6:46:51, 12.14s/it]

{'loss': 0.005, 'grad_norm': 0.44014784693717957, 'learning_rate': 2.6813333333333336e-05, 'epoch': 13.92}


 47%|████▋     | 1750/3750 [6:17:01<5:37:34, 10.13s/it]

{'loss': 0.0073, 'grad_norm': 0.0004201888805255294, 'learning_rate': 2.668e-05, 'epoch': 14.0}


                                                       
 47%|████▋     | 1750/3750 [6:18:12<5:37:34, 10.13s/it]

{'eval_loss': 0.34848347306251526, 'eval_accuracy': 0.9482197248188216, 'eval_f1': 0.6334170779265769, 'eval_runtime': 70.7732, 'eval_samples_per_second': 2.487, 'eval_steps_per_second': 0.311, 'epoch': 14.0}


 47%|████▋     | 1760/3750 [6:20:28<7:16:39, 13.17s/it] 

{'loss': 0.0037, 'grad_norm': 1.6743649244308472, 'learning_rate': 2.654666666666667e-05, 'epoch': 14.08}


 47%|████▋     | 1770/3750 [6:22:31<6:45:21, 12.28s/it]

{'loss': 0.0011, 'grad_norm': 0.9106221199035645, 'learning_rate': 2.6413333333333333e-05, 'epoch': 14.16}


 47%|████▋     | 1780/3750 [6:24:32<6:36:03, 12.06s/it]

{'loss': 0.0017, 'grad_norm': 1.215401530265808, 'learning_rate': 2.628e-05, 'epoch': 14.24}


 48%|████▊     | 1790/3750 [6:26:33<6:33:19, 12.04s/it]

{'loss': 0.0028, 'grad_norm': 0.011429330334067345, 'learning_rate': 2.6146666666666668e-05, 'epoch': 14.32}


 48%|████▊     | 1800/3750 [6:28:36<6:42:46, 12.39s/it]

{'loss': 0.0017, 'grad_norm': 0.2639833092689514, 'learning_rate': 2.6013333333333334e-05, 'epoch': 14.4}


 48%|████▊     | 1810/3750 [6:30:38<6:34:27, 12.20s/it]

{'loss': 0.0067, 'grad_norm': 0.0029713299591094255, 'learning_rate': 2.588e-05, 'epoch': 14.48}


 49%|████▊     | 1820/3750 [6:32:42<6:38:35, 12.39s/it]

{'loss': 0.0029, 'grad_norm': 1.147097110748291, 'learning_rate': 2.574666666666667e-05, 'epoch': 14.56}


 49%|████▉     | 1830/3750 [6:34:44<6:35:50, 12.37s/it]

{'loss': 0.0032, 'grad_norm': 0.9362461566925049, 'learning_rate': 2.5613333333333334e-05, 'epoch': 14.64}


 49%|████▉     | 1840/3750 [6:36:45<6:22:57, 12.03s/it]

{'loss': 0.0025, 'grad_norm': 1.2047449350357056, 'learning_rate': 2.5480000000000003e-05, 'epoch': 14.72}


 49%|████▉     | 1850/3750 [6:38:48<6:28:07, 12.26s/it]

{'loss': 0.0061, 'grad_norm': 0.4940430223941803, 'learning_rate': 2.534666666666667e-05, 'epoch': 14.8}


 50%|████▉     | 1860/3750 [6:40:52<6:27:04, 12.29s/it]

{'loss': 0.0035, 'grad_norm': 0.6156449317932129, 'learning_rate': 2.5213333333333335e-05, 'epoch': 14.88}


 50%|████▉     | 1870/3750 [6:42:53<6:18:03, 12.07s/it]

{'loss': 0.0027, 'grad_norm': 0.06437388807535172, 'learning_rate': 2.5080000000000004e-05, 'epoch': 14.96}


                                                       
 50%|█████     | 1875/3750 [6:44:59<5:23:36, 10.36s/it]

{'eval_loss': 0.4068663716316223, 'eval_accuracy': 0.9453838882470329, 'eval_f1': 0.6319284002762063, 'eval_runtime': 71.3891, 'eval_samples_per_second': 2.465, 'eval_steps_per_second': 0.308, 'epoch': 15.0}


 50%|█████     | 1880/3750 [6:46:15<9:23:04, 18.07s/it] 

{'loss': 0.0033, 'grad_norm': 0.1276429146528244, 'learning_rate': 2.494666666666667e-05, 'epoch': 15.04}


 50%|█████     | 1890/3750 [6:48:20<6:42:41, 12.99s/it]

{'loss': 0.0013, 'grad_norm': 0.000267248775344342, 'learning_rate': 2.4813333333333335e-05, 'epoch': 15.12}


 51%|█████     | 1900/3750 [6:50:25<6:20:49, 12.35s/it]

{'loss': 0.0019, 'grad_norm': 0.010946513153612614, 'learning_rate': 2.468e-05, 'epoch': 15.2}


 51%|█████     | 1910/3750 [6:52:27<6:13:27, 12.18s/it]

{'loss': 0.0043, 'grad_norm': 0.004665854386985302, 'learning_rate': 2.4546666666666667e-05, 'epoch': 15.28}


 51%|█████     | 1920/3750 [6:54:30<6:12:13, 12.20s/it]

{'loss': 0.0037, 'grad_norm': 0.001359647256322205, 'learning_rate': 2.4413333333333336e-05, 'epoch': 15.36}


 51%|█████▏    | 1930/3750 [6:56:33<6:15:10, 12.37s/it]

{'loss': 0.0013, 'grad_norm': 1.7041741609573364, 'learning_rate': 2.428e-05, 'epoch': 15.44}


 52%|█████▏    | 1940/3750 [6:58:35<6:07:31, 12.18s/it]

{'loss': 0.0035, 'grad_norm': 0.00026393073494546115, 'learning_rate': 2.4146666666666667e-05, 'epoch': 15.52}


 52%|█████▏    | 1950/3750 [7:00:38<6:10:07, 12.34s/it]

{'loss': 0.0065, 'grad_norm': 0.0002881508262362331, 'learning_rate': 2.4013333333333336e-05, 'epoch': 15.6}


 52%|█████▏    | 1960/3750 [7:02:41<6:03:34, 12.19s/it]

{'loss': 0.0032, 'grad_norm': 1.562265157699585, 'learning_rate': 2.3880000000000002e-05, 'epoch': 15.68}


 53%|█████▎    | 1970/3750 [7:04:43<6:02:28, 12.22s/it]

{'loss': 0.0037, 'grad_norm': 0.6252687573432922, 'learning_rate': 2.3746666666666667e-05, 'epoch': 15.76}


 53%|█████▎    | 1980/3750 [7:06:45<5:59:29, 12.19s/it]

{'loss': 0.0013, 'grad_norm': 0.13526082038879395, 'learning_rate': 2.3613333333333333e-05, 'epoch': 15.84}


 53%|█████▎    | 1990/3750 [7:08:47<5:54:39, 12.09s/it]

{'loss': 0.0009, 'grad_norm': 0.0007575856288895011, 'learning_rate': 2.3480000000000002e-05, 'epoch': 15.92}


 53%|█████▎    | 2000/3750 [7:10:43<4:58:00, 10.22s/it]

{'loss': 0.003, 'grad_norm': 1.2251226902008057, 'learning_rate': 2.3346666666666668e-05, 'epoch': 16.0}


                                                       
 53%|█████▎    | 2000/3750 [7:11:54<4:58:00, 10.22s/it]

{'eval_loss': 0.3970596492290497, 'eval_accuracy': 0.9481146938346813, 'eval_f1': 0.6248265175106787, 'eval_runtime': 71.112, 'eval_samples_per_second': 2.475, 'eval_steps_per_second': 0.309, 'epoch': 16.0}


 54%|█████▎    | 2010/3750 [7:14:10<6:20:11, 13.11s/it] 

{'loss': 0.0032, 'grad_norm': 0.0005535286036320031, 'learning_rate': 2.3213333333333334e-05, 'epoch': 16.08}


 54%|█████▍    | 2020/3750 [7:16:13<5:52:28, 12.22s/it]

{'loss': 0.0024, 'grad_norm': 0.8799270391464233, 'learning_rate': 2.3080000000000003e-05, 'epoch': 16.16}


 54%|█████▍    | 2030/3750 [7:18:16<5:50:59, 12.24s/it]

{'loss': 0.0017, 'grad_norm': 0.0004896898753941059, 'learning_rate': 2.294666666666667e-05, 'epoch': 16.24}


 54%|█████▍    | 2040/3750 [7:20:18<5:49:12, 12.25s/it]

{'loss': 0.0011, 'grad_norm': 0.026393422856926918, 'learning_rate': 2.2813333333333334e-05, 'epoch': 16.32}


 55%|█████▍    | 2050/3750 [7:22:21<5:51:02, 12.39s/it]

{'loss': 0.0053, 'grad_norm': 0.735355794429779, 'learning_rate': 2.268e-05, 'epoch': 16.4}


 55%|█████▍    | 2060/3750 [7:24:22<5:39:09, 12.04s/it]

{'loss': 0.0033, 'grad_norm': 0.0021251041907817125, 'learning_rate': 2.254666666666667e-05, 'epoch': 16.48}


 55%|█████▌    | 2070/3750 [7:26:25<5:42:04, 12.22s/it]

{'loss': 0.0027, 'grad_norm': 0.19089771807193756, 'learning_rate': 2.2413333333333334e-05, 'epoch': 16.56}


 55%|█████▌    | 2080/3750 [7:28:26<5:35:30, 12.05s/it]

{'loss': 0.0051, 'grad_norm': 0.0024197224993258715, 'learning_rate': 2.228e-05, 'epoch': 16.64}


 56%|█████▌    | 2090/3750 [7:30:29<5:39:21, 12.27s/it]

{'loss': 0.0033, 'grad_norm': 1.2952131032943726, 'learning_rate': 2.214666666666667e-05, 'epoch': 16.72}


 56%|█████▌    | 2100/3750 [7:32:32<5:36:24, 12.23s/it]

{'loss': 0.0015, 'grad_norm': 0.5638099908828735, 'learning_rate': 2.201333333333333e-05, 'epoch': 16.8}


 56%|█████▋    | 2110/3750 [7:34:35<5:34:14, 12.23s/it]

{'loss': 0.0037, 'grad_norm': 0.3081258237361908, 'learning_rate': 2.188e-05, 'epoch': 16.88}


 57%|█████▋    | 2120/3750 [7:36:38<5:33:42, 12.28s/it]

{'loss': 0.0041, 'grad_norm': 0.18003962934017181, 'learning_rate': 2.174666666666667e-05, 'epoch': 16.96}


                                                       
 57%|█████▋    | 2125/3750 [7:38:43<4:39:28, 10.32s/it]

{'eval_loss': 0.3979097306728363, 'eval_accuracy': 0.9463291671042958, 'eval_f1': 0.631983425893814, 'eval_runtime': 70.7779, 'eval_samples_per_second': 2.487, 'eval_steps_per_second': 0.311, 'epoch': 17.0}


 57%|█████▋    | 2130/3750 [7:39:58<8:10:28, 18.17s/it] 

{'loss': 0.001, 'grad_norm': 0.03286410868167877, 'learning_rate': 2.1613333333333335e-05, 'epoch': 17.04}


 57%|█████▋    | 2140/3750 [7:42:01<5:33:26, 12.43s/it]

{'loss': 0.0036, 'grad_norm': 0.00018084331532008946, 'learning_rate': 2.148e-05, 'epoch': 17.12}


 57%|█████▋    | 2150/3750 [7:44:04<5:26:23, 12.24s/it]

{'loss': 0.0038, 'grad_norm': 1.0491513013839722, 'learning_rate': 2.1346666666666667e-05, 'epoch': 17.2}


 58%|█████▊    | 2160/3750 [7:46:06<5:18:59, 12.04s/it]

{'loss': 0.0023, 'grad_norm': 0.08114629983901978, 'learning_rate': 2.1213333333333336e-05, 'epoch': 17.28}


 58%|█████▊    | 2170/3750 [7:48:09<5:27:18, 12.43s/it]

{'loss': 0.0044, 'grad_norm': 0.00018443776934873313, 'learning_rate': 2.1079999999999998e-05, 'epoch': 17.36}


 58%|█████▊    | 2180/3750 [7:50:11<5:19:53, 12.23s/it]

{'loss': 0.0019, 'grad_norm': 2.0137715339660645, 'learning_rate': 2.0946666666666667e-05, 'epoch': 17.44}


 58%|█████▊    | 2190/3750 [7:52:11<5:12:55, 12.04s/it]

{'loss': 0.0025, 'grad_norm': 0.0010399256134405732, 'learning_rate': 2.0813333333333336e-05, 'epoch': 17.52}


 59%|█████▊    | 2200/3750 [7:54:12<5:11:36, 12.06s/it]

{'loss': 0.0032, 'grad_norm': 1.3868601322174072, 'learning_rate': 2.0680000000000002e-05, 'epoch': 17.6}


 59%|█████▉    | 2210/3750 [7:56:15<5:13:27, 12.21s/it]

{'loss': 0.0029, 'grad_norm': 0.0521208792924881, 'learning_rate': 2.0546666666666668e-05, 'epoch': 17.68}


 59%|█████▉    | 2220/3750 [7:58:17<5:10:25, 12.17s/it]

{'loss': 0.0036, 'grad_norm': 0.5591918230056763, 'learning_rate': 2.0413333333333333e-05, 'epoch': 17.76}


 59%|█████▉    | 2230/3750 [8:00:19<5:08:25, 12.17s/it]

{'loss': 0.0009, 'grad_norm': 0.01799052022397518, 'learning_rate': 2.0280000000000002e-05, 'epoch': 17.84}


 60%|█████▉    | 2240/3750 [8:02:22<5:06:37, 12.18s/it]

{'loss': 0.0006, 'grad_norm': 0.4786897897720337, 'learning_rate': 2.0146666666666668e-05, 'epoch': 17.92}


 60%|██████    | 2250/3750 [8:04:18<4:16:33, 10.26s/it]

{'loss': 0.0003, 'grad_norm': 0.00022169944713823497, 'learning_rate': 2.0013333333333334e-05, 'epoch': 18.0}


                                                       
 60%|██████    | 2250/3750 [8:05:28<4:16:33, 10.26s/it]

{'eval_loss': 0.4039253890514374, 'eval_accuracy': 0.9496901585967861, 'eval_f1': 0.6254063572078159, 'eval_runtime': 70.4482, 'eval_samples_per_second': 2.498, 'eval_steps_per_second': 0.312, 'epoch': 18.0}


 60%|██████    | 2260/3750 [8:07:45<5:28:24, 13.22s/it] 

{'loss': 0.0011, 'grad_norm': 0.0001959687942871824, 'learning_rate': 1.9880000000000003e-05, 'epoch': 18.08}


 61%|██████    | 2270/3750 [8:09:47<5:01:23, 12.22s/it]

{'loss': 0.0019, 'grad_norm': 0.013615039177238941, 'learning_rate': 1.974666666666667e-05, 'epoch': 18.16}


 61%|██████    | 2280/3750 [8:11:50<5:03:41, 12.40s/it]

{'loss': 0.0016, 'grad_norm': 0.0007491725846193731, 'learning_rate': 1.9613333333333334e-05, 'epoch': 18.24}


 61%|██████    | 2290/3750 [8:13:51<4:53:05, 12.04s/it]

{'loss': 0.0002, 'grad_norm': 0.00018968418589793146, 'learning_rate': 1.948e-05, 'epoch': 18.32}


 61%|██████▏   | 2300/3750 [8:15:53<4:54:02, 12.17s/it]

{'loss': 0.0025, 'grad_norm': 0.012567303143441677, 'learning_rate': 1.934666666666667e-05, 'epoch': 18.4}


 62%|██████▏   | 2310/3750 [8:17:56<4:52:42, 12.20s/it]

{'loss': 0.0005, 'grad_norm': 0.009878413751721382, 'learning_rate': 1.9213333333333335e-05, 'epoch': 18.48}


 62%|██████▏   | 2320/3750 [8:19:56<4:45:07, 11.96s/it]

{'loss': 0.0017, 'grad_norm': 0.00046956457663327456, 'learning_rate': 1.908e-05, 'epoch': 18.56}


 62%|██████▏   | 2330/3750 [8:21:58<4:51:02, 12.30s/it]

{'loss': 0.001, 'grad_norm': 0.08201783150434494, 'learning_rate': 1.894666666666667e-05, 'epoch': 18.64}


 62%|██████▏   | 2340/3750 [8:24:00<4:44:54, 12.12s/it]

{'loss': 0.0007, 'grad_norm': 2.779292345046997, 'learning_rate': 1.8813333333333335e-05, 'epoch': 18.72}


 63%|██████▎   | 2350/3750 [8:26:01<4:43:34, 12.15s/it]

{'loss': 0.0009, 'grad_norm': 0.20202021300792694, 'learning_rate': 1.868e-05, 'epoch': 18.8}


 63%|██████▎   | 2360/3750 [8:28:03<4:40:15, 12.10s/it]

{'loss': 0.003, 'grad_norm': 2.2640929222106934, 'learning_rate': 1.8546666666666666e-05, 'epoch': 18.88}


 63%|██████▎   | 2370/3750 [8:30:05<4:40:32, 12.20s/it]

{'loss': 0.0004, 'grad_norm': 0.27883973717689514, 'learning_rate': 1.8413333333333335e-05, 'epoch': 18.96}


                                                       
 63%|██████▎   | 2375/3750 [8:32:10<3:55:20, 10.27s/it]

{'eval_loss': 0.4386669099330902, 'eval_accuracy': 0.9480096628505409, 'eval_f1': 0.6382437080525234, 'eval_runtime': 70.8232, 'eval_samples_per_second': 2.485, 'eval_steps_per_second': 0.311, 'epoch': 19.0}


 63%|██████▎   | 2380/3750 [8:33:26<6:51:29, 18.02s/it] 

{'loss': 0.0012, 'grad_norm': 2.9788575172424316, 'learning_rate': 1.828e-05, 'epoch': 19.04}


 64%|██████▎   | 2390/3750 [8:35:28<4:39:29, 12.33s/it]

{'loss': 0.0012, 'grad_norm': 0.06465937197208405, 'learning_rate': 1.8146666666666667e-05, 'epoch': 19.12}


 64%|██████▍   | 2400/3750 [8:37:29<4:32:36, 12.12s/it]

{'loss': 0.001, 'grad_norm': 0.05214391648769379, 'learning_rate': 1.8013333333333336e-05, 'epoch': 19.2}


 64%|██████▍   | 2410/3750 [8:39:31<4:30:52, 12.13s/it]

{'loss': 0.0011, 'grad_norm': 0.012098842300474644, 'learning_rate': 1.7879999999999998e-05, 'epoch': 19.28}


 65%|██████▍   | 2420/3750 [8:41:33<4:32:54, 12.31s/it]

{'loss': 0.0022, 'grad_norm': 0.23242798447608948, 'learning_rate': 1.7746666666666667e-05, 'epoch': 19.36}


 65%|██████▍   | 2430/3750 [8:43:39<4:35:04, 12.50s/it]

{'loss': 0.0003, 'grad_norm': 0.019705932587385178, 'learning_rate': 1.7613333333333333e-05, 'epoch': 19.44}


 65%|██████▌   | 2440/3750 [8:45:42<4:27:38, 12.26s/it]

{'loss': 0.0009, 'grad_norm': 0.29313012957572937, 'learning_rate': 1.7480000000000002e-05, 'epoch': 19.52}


 65%|██████▌   | 2450/3750 [8:47:44<4:25:20, 12.25s/it]

{'loss': 0.0005, 'grad_norm': 0.00023218340356834233, 'learning_rate': 1.7346666666666668e-05, 'epoch': 19.6}


 66%|██████▌   | 2460/3750 [8:49:46<4:23:53, 12.27s/it]

{'loss': 0.0003, 'grad_norm': 0.0023777063470333815, 'learning_rate': 1.7213333333333333e-05, 'epoch': 19.68}


 66%|██████▌   | 2470/3750 [8:51:48<4:18:41, 12.13s/it]

{'loss': 0.0, 'grad_norm': 0.00016036118904594332, 'learning_rate': 1.7080000000000002e-05, 'epoch': 19.76}


 66%|██████▌   | 2480/3750 [8:53:50<4:16:46, 12.13s/it]

{'loss': 0.0026, 'grad_norm': 1.6143547296524048, 'learning_rate': 1.6946666666666665e-05, 'epoch': 19.84}


 66%|██████▋   | 2490/3750 [8:55:52<4:14:41, 12.13s/it]

{'loss': 0.0005, 'grad_norm': 0.364269882440567, 'learning_rate': 1.6813333333333334e-05, 'epoch': 19.92}


 67%|██████▋   | 2500/3750 [8:57:47<3:33:48, 10.26s/it]

{'loss': 0.001, 'grad_norm': 0.00829396490007639, 'learning_rate': 1.668e-05, 'epoch': 20.0}


                                                       
 67%|██████▋   | 2500/3750 [8:58:58<3:33:48, 10.26s/it]

{'eval_loss': 0.49309107661247253, 'eval_accuracy': 0.9425480516752442, 'eval_f1': 0.6094413012365228, 'eval_runtime': 70.3715, 'eval_samples_per_second': 2.501, 'eval_steps_per_second': 0.313, 'epoch': 20.0}


 67%|██████▋   | 2510/3750 [9:01:15<4:35:10, 13.31s/it] 

{'loss': 0.0021, 'grad_norm': 0.3259330093860626, 'learning_rate': 1.654666666666667e-05, 'epoch': 20.08}


 67%|██████▋   | 2520/3750 [9:03:17<4:10:00, 12.20s/it]

{'loss': 0.002, 'grad_norm': 0.00016125522961374372, 'learning_rate': 1.6413333333333334e-05, 'epoch': 20.16}


 67%|██████▋   | 2530/3750 [9:05:19<4:07:14, 12.16s/it]

{'loss': 0.0025, 'grad_norm': 0.006662246771156788, 'learning_rate': 1.628e-05, 'epoch': 20.24}


 68%|██████▊   | 2540/3750 [9:07:21<4:05:36, 12.18s/it]

{'loss': 0.0004, 'grad_norm': 0.03902657702565193, 'learning_rate': 1.614666666666667e-05, 'epoch': 20.32}


 68%|██████▊   | 2550/3750 [9:09:23<4:03:30, 12.18s/it]

{'loss': 0.0014, 'grad_norm': 0.0008938702521845698, 'learning_rate': 1.601333333333333e-05, 'epoch': 20.4}


 68%|██████▊   | 2560/3750 [9:11:26<4:02:10, 12.21s/it]

{'loss': 0.0014, 'grad_norm': 0.21985644102096558, 'learning_rate': 1.588e-05, 'epoch': 20.48}


 69%|██████▊   | 2570/3750 [9:13:28<4:00:13, 12.22s/it]

{'loss': 0.0003, 'grad_norm': 0.1203760951757431, 'learning_rate': 1.574666666666667e-05, 'epoch': 20.56}


 69%|██████▉   | 2580/3750 [9:15:30<3:58:23, 12.23s/it]

{'loss': 0.0011, 'grad_norm': 0.13943959772586823, 'learning_rate': 1.5613333333333335e-05, 'epoch': 20.64}


 69%|██████▉   | 2590/3750 [9:17:32<3:54:30, 12.13s/it]

{'loss': 0.0002, 'grad_norm': 0.00014699302846565843, 'learning_rate': 1.548e-05, 'epoch': 20.72}


 69%|██████▉   | 2600/3750 [9:19:33<3:53:13, 12.17s/it]

{'loss': 0.0004, 'grad_norm': 0.434110164642334, 'learning_rate': 1.5346666666666667e-05, 'epoch': 20.8}


 70%|██████▉   | 2610/3750 [9:21:35<3:54:43, 12.35s/it]

{'loss': 0.001, 'grad_norm': 0.00015699741197749972, 'learning_rate': 1.5213333333333336e-05, 'epoch': 20.88}


 70%|██████▉   | 2620/3750 [9:23:37<3:49:19, 12.18s/it]

{'loss': 0.0014, 'grad_norm': 2.8659839630126953, 'learning_rate': 1.508e-05, 'epoch': 20.96}


                                                       
 70%|███████   | 2625/3750 [9:25:42<3:11:46, 10.23s/it]

{'eval_loss': 0.487710565328598, 'eval_accuracy': 0.9464341980884361, 'eval_f1': 0.6289151062130045, 'eval_runtime': 70.8188, 'eval_samples_per_second': 2.485, 'eval_steps_per_second': 0.311, 'epoch': 21.0}


 70%|███████   | 2630/3750 [9:26:56<5:34:23, 17.91s/it] 

{'loss': 0.0016, 'grad_norm': 0.04221566766500473, 'learning_rate': 1.4946666666666667e-05, 'epoch': 21.04}


 70%|███████   | 2640/3750 [9:28:58<3:48:38, 12.36s/it]

{'loss': 0.0005, 'grad_norm': 0.006421822123229504, 'learning_rate': 1.4813333333333334e-05, 'epoch': 21.12}


 71%|███████   | 2650/3750 [9:31:00<3:45:53, 12.32s/it]

{'loss': 0.0038, 'grad_norm': 0.001106818439438939, 'learning_rate': 1.4680000000000002e-05, 'epoch': 21.2}


 71%|███████   | 2660/3750 [9:33:02<3:40:25, 12.13s/it]

{'loss': 0.0001, 'grad_norm': 0.008441164158284664, 'learning_rate': 1.4546666666666667e-05, 'epoch': 21.28}


 71%|███████   | 2670/3750 [9:35:04<3:39:04, 12.17s/it]

{'loss': 0.0015, 'grad_norm': 0.755119264125824, 'learning_rate': 1.4413333333333335e-05, 'epoch': 21.36}


 71%|███████▏  | 2680/3750 [9:37:06<3:36:51, 12.16s/it]

{'loss': 0.0008, 'grad_norm': 0.0505213625729084, 'learning_rate': 1.4280000000000002e-05, 'epoch': 21.44}


 72%|███████▏  | 2690/3750 [9:39:08<3:34:47, 12.16s/it]

{'loss': 0.0001, 'grad_norm': 0.000861183216329664, 'learning_rate': 1.4146666666666666e-05, 'epoch': 21.52}


 72%|███████▏  | 2700/3750 [9:41:10<3:33:11, 12.18s/it]

{'loss': 0.0028, 'grad_norm': 2.131190538406372, 'learning_rate': 1.4013333333333334e-05, 'epoch': 21.6}


 72%|███████▏  | 2710/3750 [9:43:12<3:30:57, 12.17s/it]

{'loss': 0.0003, 'grad_norm': 0.00043697215733118355, 'learning_rate': 1.3880000000000001e-05, 'epoch': 21.68}


 73%|███████▎  | 2720/3750 [9:45:14<3:28:48, 12.16s/it]

{'loss': 0.0032, 'grad_norm': 1.1518702507019043, 'learning_rate': 1.3746666666666667e-05, 'epoch': 21.76}


 73%|███████▎  | 2730/3750 [9:47:16<3:27:02, 12.18s/it]

{'loss': 0.0012, 'grad_norm': 2.712864398956299, 'learning_rate': 1.3613333333333334e-05, 'epoch': 21.84}


 73%|███████▎  | 2740/3750 [9:49:18<3:24:30, 12.15s/it]

{'loss': 0.001, 'grad_norm': 0.0002333720913156867, 'learning_rate': 1.3480000000000001e-05, 'epoch': 21.92}


 73%|███████▎  | 2750/3750 [9:51:13<2:51:08, 10.27s/it]

{'loss': 0.0012, 'grad_norm': 0.7775922417640686, 'learning_rate': 1.3346666666666669e-05, 'epoch': 22.0}


                                                       
 73%|███████▎  | 2750/3750 [9:52:24<2:51:08, 10.27s/it]

{'eval_loss': 0.4805426299571991, 'eval_accuracy': 0.9440184854532087, 'eval_f1': 0.6217847336723967, 'eval_runtime': 70.6031, 'eval_samples_per_second': 2.493, 'eval_steps_per_second': 0.312, 'epoch': 22.0}


 74%|███████▎  | 2760/3750 [9:54:41<3:36:20, 13.11s/it] 

{'loss': 0.0003, 'grad_norm': 0.006880495697259903, 'learning_rate': 1.3213333333333333e-05, 'epoch': 22.08}


 74%|███████▍  | 2770/3750 [9:56:43<3:19:34, 12.22s/it]

{'loss': 0.0004, 'grad_norm': 0.011698957532644272, 'learning_rate': 1.308e-05, 'epoch': 22.16}


 74%|███████▍  | 2780/3750 [9:58:45<3:16:55, 12.18s/it]

{'loss': 0.0024, 'grad_norm': 0.0001422135392203927, 'learning_rate': 1.2946666666666668e-05, 'epoch': 22.24}


 74%|███████▍  | 2790/3750 [10:00:47<3:14:49, 12.18s/it]

{'loss': 0.0011, 'grad_norm': 0.25173214077949524, 'learning_rate': 1.2813333333333333e-05, 'epoch': 22.32}


 75%|███████▍  | 2800/3750 [10:02:59<3:21:37, 12.73s/it]

{'loss': 0.0001, 'grad_norm': 0.001531969872303307, 'learning_rate': 1.268e-05, 'epoch': 22.4}


 75%|███████▍  | 2810/3750 [10:05:11<3:22:44, 12.94s/it]

{'loss': 0.0003, 'grad_norm': 0.4881620705127716, 'learning_rate': 1.2546666666666668e-05, 'epoch': 22.48}


 75%|███████▌  | 2820/3750 [10:07:16<3:11:56, 12.38s/it]

{'loss': 0.0002, 'grad_norm': 0.00014093637582845986, 'learning_rate': 1.2413333333333334e-05, 'epoch': 22.56}


 75%|███████▌  | 2830/3750 [10:09:19<3:08:32, 12.30s/it]

{'loss': 0.0008, 'grad_norm': 0.006612428463995457, 'learning_rate': 1.2280000000000001e-05, 'epoch': 22.64}


 76%|███████▌  | 2840/3750 [10:11:22<3:06:37, 12.31s/it]

{'loss': 0.0011, 'grad_norm': 0.1234024241566658, 'learning_rate': 1.2146666666666667e-05, 'epoch': 22.72}


 76%|███████▌  | 2850/3750 [10:13:25<3:03:33, 12.24s/it]

{'loss': 0.0011, 'grad_norm': 0.3242424428462982, 'learning_rate': 1.2013333333333334e-05, 'epoch': 22.8}


 76%|███████▋  | 2860/3750 [10:15:28<3:02:01, 12.27s/it]

{'loss': 0.0004, 'grad_norm': 0.0555979460477829, 'learning_rate': 1.1880000000000001e-05, 'epoch': 22.88}


 77%|███████▋  | 2870/3750 [10:17:31<3:02:49, 12.47s/it]

{'loss': 0.0002, 'grad_norm': 0.07393637299537659, 'learning_rate': 1.1746666666666667e-05, 'epoch': 22.96}


                                                        
 77%|███████▋  | 2875/3750 [10:19:37<2:31:01, 10.36s/it]

{'eval_loss': 0.5005574226379395, 'eval_accuracy': 0.9442285474214893, 'eval_f1': 0.6350505664997929, 'eval_runtime': 71.0457, 'eval_samples_per_second': 2.477, 'eval_steps_per_second': 0.31, 'epoch': 23.0}


 77%|███████▋  | 2880/3750 [10:20:55<4:25:46, 18.33s/it]

{'loss': 0.0007, 'grad_norm': 1.216825246810913, 'learning_rate': 1.1613333333333335e-05, 'epoch': 23.04}


 77%|███████▋  | 2890/3750 [10:22:58<3:00:16, 12.58s/it]

{'loss': 0.0008, 'grad_norm': 0.022350342944264412, 'learning_rate': 1.148e-05, 'epoch': 23.12}


 77%|███████▋  | 2900/3750 [10:25:01<2:53:25, 12.24s/it]

{'loss': 0.0003, 'grad_norm': 0.002847307128831744, 'learning_rate': 1.1346666666666666e-05, 'epoch': 23.2}


 78%|███████▊  | 2910/3750 [10:27:04<2:52:01, 12.29s/it]

{'loss': 0.0001, 'grad_norm': 0.013838917948305607, 'learning_rate': 1.1213333333333333e-05, 'epoch': 23.28}


 78%|███████▊  | 2920/3750 [10:29:07<2:52:11, 12.45s/it]

{'loss': 0.0002, 'grad_norm': 0.2823443114757538, 'learning_rate': 1.108e-05, 'epoch': 23.36}


 78%|███████▊  | 2930/3750 [10:31:10<2:46:51, 12.21s/it]

{'loss': 0.0006, 'grad_norm': 0.000740855117328465, 'learning_rate': 1.0946666666666668e-05, 'epoch': 23.44}


 78%|███████▊  | 2940/3750 [10:33:12<2:45:11, 12.24s/it]

{'loss': 0.0001, 'grad_norm': 0.00012175038136774674, 'learning_rate': 1.0813333333333334e-05, 'epoch': 23.52}


 79%|███████▊  | 2950/3750 [10:35:15<2:43:34, 12.27s/it]

{'loss': 0.0006, 'grad_norm': 0.003928035963326693, 'learning_rate': 1.0680000000000001e-05, 'epoch': 23.6}


 79%|███████▉  | 2960/3750 [10:37:18<2:41:48, 12.29s/it]

{'loss': 0.0006, 'grad_norm': 0.0003280166129115969, 'learning_rate': 1.0546666666666667e-05, 'epoch': 23.68}


 79%|███████▉  | 2970/3750 [10:39:21<2:41:23, 12.41s/it]

{'loss': 0.0, 'grad_norm': 0.09114056080579758, 'learning_rate': 1.0413333333333332e-05, 'epoch': 23.76}


 79%|███████▉  | 2980/3750 [10:41:24<2:37:27, 12.27s/it]

{'loss': 0.0014, 'grad_norm': 0.803958535194397, 'learning_rate': 1.0280000000000002e-05, 'epoch': 23.84}


 80%|███████▉  | 2990/3750 [10:43:27<2:36:32, 12.36s/it]

{'loss': 0.0001, 'grad_norm': 0.24614305794239044, 'learning_rate': 1.0146666666666667e-05, 'epoch': 23.92}


 80%|████████  | 3000/3750 [10:45:24<2:09:46, 10.38s/it]

{'loss': 0.0005, 'grad_norm': 2.592926263809204, 'learning_rate': 1.0013333333333335e-05, 'epoch': 24.0}


                                                        
 80%|████████  | 3000/3750 [10:46:34<2:09:46, 10.38s/it]

{'eval_loss': 0.5017656683921814, 'eval_accuracy': 0.9447537023421909, 'eval_f1': 0.6311722864824283, 'eval_runtime': 70.529, 'eval_samples_per_second': 2.495, 'eval_steps_per_second': 0.312, 'epoch': 24.0}


 80%|████████  | 3010/3750 [10:48:51<2:42:16, 13.16s/it]

{'loss': 0.0001, 'grad_norm': 0.036357033997774124, 'learning_rate': 9.88e-06, 'epoch': 24.08}


 81%|████████  | 3020/3750 [10:50:54<2:29:45, 12.31s/it]

{'loss': 0.0001, 'grad_norm': 0.0458231084048748, 'learning_rate': 9.746666666666666e-06, 'epoch': 24.16}


 81%|████████  | 3030/3750 [10:52:57<2:27:38, 12.30s/it]

{'loss': 0.0002, 'grad_norm': 0.003433068748563528, 'learning_rate': 9.613333333333333e-06, 'epoch': 24.24}


 81%|████████  | 3040/3750 [10:55:00<2:27:19, 12.45s/it]

{'loss': 0.0016, 'grad_norm': 0.014141847379505634, 'learning_rate': 9.48e-06, 'epoch': 24.32}


 81%|████████▏ | 3050/3750 [10:57:04<2:23:46, 12.32s/it]

{'loss': 0.0002, 'grad_norm': 0.016879910603165627, 'learning_rate': 9.346666666666668e-06, 'epoch': 24.4}


 82%|████████▏ | 3060/3750 [10:59:07<2:21:19, 12.29s/it]

{'loss': 0.0008, 'grad_norm': 0.03256571292877197, 'learning_rate': 9.213333333333334e-06, 'epoch': 24.48}


 82%|████████▏ | 3070/3750 [11:01:10<2:18:42, 12.24s/it]

{'loss': 0.0001, 'grad_norm': 0.00037002560566179454, 'learning_rate': 9.080000000000001e-06, 'epoch': 24.56}


 82%|████████▏ | 3080/3750 [11:03:13<2:16:54, 12.26s/it]

{'loss': 0.001, 'grad_norm': 0.00012781968689523637, 'learning_rate': 8.946666666666667e-06, 'epoch': 24.64}


 82%|████████▏ | 3090/3750 [11:05:16<2:14:45, 12.25s/it]

{'loss': 0.0003, 'grad_norm': 0.0007351168314926326, 'learning_rate': 8.813333333333333e-06, 'epoch': 24.72}


 83%|████████▎ | 3100/3750 [11:07:18<2:12:56, 12.27s/it]

{'loss': 0.0006, 'grad_norm': 0.00011799929052358493, 'learning_rate': 8.68e-06, 'epoch': 24.8}


 83%|████████▎ | 3110/3750 [11:09:21<2:11:04, 12.29s/it]

{'loss': 0.0006, 'grad_norm': 0.000204089839826338, 'learning_rate': 8.546666666666667e-06, 'epoch': 24.88}


 83%|████████▎ | 3120/3750 [11:11:25<2:08:47, 12.27s/it]

{'loss': 0.0001, 'grad_norm': 0.049326084554195404, 'learning_rate': 8.413333333333335e-06, 'epoch': 24.96}


                                                        
 83%|████████▎ | 3125/3750 [11:13:30<1:47:46, 10.35s/it]

{'eval_loss': 0.5172151327133179, 'eval_accuracy': 0.9439134544690684, 'eval_f1': 0.6220306874162915, 'eval_runtime': 71.1204, 'eval_samples_per_second': 2.475, 'eval_steps_per_second': 0.309, 'epoch': 25.0}


 83%|████████▎ | 3130/3750 [11:14:44<3:04:52, 17.89s/it]

{'loss': 0.0, 'grad_norm': 0.00011165269097546116, 'learning_rate': 8.28e-06, 'epoch': 25.04}


 84%|████████▎ | 3140/3750 [11:16:47<2:08:05, 12.60s/it]

{'loss': 0.001, 'grad_norm': 0.005531087517738342, 'learning_rate': 8.146666666666668e-06, 'epoch': 25.12}


 84%|████████▍ | 3150/3750 [11:18:50<2:02:34, 12.26s/it]

{'loss': 0.0002, 'grad_norm': 0.0029605089221149683, 'learning_rate': 8.013333333333333e-06, 'epoch': 25.2}


 84%|████████▍ | 3160/3750 [11:20:53<2:00:18, 12.23s/it]

{'loss': 0.0, 'grad_norm': 0.003081751521676779, 'learning_rate': 7.879999999999999e-06, 'epoch': 25.28}


 85%|████████▍ | 3170/3750 [11:22:56<1:58:33, 12.26s/it]

{'loss': 0.0009, 'grad_norm': 0.05712408572435379, 'learning_rate': 7.746666666666668e-06, 'epoch': 25.36}


 85%|████████▍ | 3180/3750 [11:24:59<1:56:52, 12.30s/it]

{'loss': 0.0007, 'grad_norm': 0.008371805772185326, 'learning_rate': 7.613333333333334e-06, 'epoch': 25.44}


 85%|████████▌ | 3190/3750 [11:27:02<1:56:05, 12.44s/it]

{'loss': 0.0, 'grad_norm': 0.0014815157046541572, 'learning_rate': 7.480000000000001e-06, 'epoch': 25.52}


 85%|████████▌ | 3200/3750 [11:29:05<1:52:26, 12.27s/it]

{'loss': 0.0016, 'grad_norm': 1.950814962387085, 'learning_rate': 7.346666666666667e-06, 'epoch': 25.6}


 86%|████████▌ | 3210/3750 [11:31:08<1:50:29, 12.28s/it]

{'loss': 0.0001, 'grad_norm': 0.003984416369348764, 'learning_rate': 7.2133333333333334e-06, 'epoch': 25.68}


 86%|████████▌ | 3220/3750 [11:33:11<1:48:50, 12.32s/it]

{'loss': 0.0003, 'grad_norm': 0.9527855515480042, 'learning_rate': 7.080000000000001e-06, 'epoch': 25.76}


 86%|████████▌ | 3230/3750 [11:35:14<1:46:15, 12.26s/it]

{'loss': 0.0002, 'grad_norm': 0.00011754205479519442, 'learning_rate': 6.9466666666666665e-06, 'epoch': 25.84}


 86%|████████▋ | 3240/3750 [11:37:17<1:44:39, 12.31s/it]

{'loss': 0.0004, 'grad_norm': 0.00011624216131167486, 'learning_rate': 6.813333333333334e-06, 'epoch': 25.92}


 87%|████████▋ | 3250/3750 [11:39:13<1:25:53, 10.31s/it]

{'loss': 0.0001, 'grad_norm': 0.013155084103345871, 'learning_rate': 6.68e-06, 'epoch': 26.0}


                                                        
 87%|████████▋ | 3250/3750 [11:40:24<1:25:53, 10.31s/it]

{'eval_loss': 0.5245051980018616, 'eval_accuracy': 0.9435983615166474, 'eval_f1': 0.6175384626475736, 'eval_runtime': 70.6143, 'eval_samples_per_second': 2.492, 'eval_steps_per_second': 0.312, 'epoch': 26.0}


 87%|████████▋ | 3260/3750 [11:42:42<1:48:34, 13.29s/it]

{'loss': 0.0009, 'grad_norm': 0.8609973788261414, 'learning_rate': 6.546666666666668e-06, 'epoch': 26.08}


 87%|████████▋ | 3270/3750 [11:44:45<1:38:31, 12.32s/it]

{'loss': 0.0, 'grad_norm': 0.0025682756677269936, 'learning_rate': 6.4133333333333335e-06, 'epoch': 26.16}


 87%|████████▋ | 3280/3750 [11:46:48<1:36:01, 12.26s/it]

{'loss': 0.0003, 'grad_norm': 0.00021559071319643408, 'learning_rate': 6.28e-06, 'epoch': 26.24}


 88%|████████▊ | 3290/3750 [11:48:52<1:34:15, 12.29s/it]

{'loss': 0.0007, 'grad_norm': 0.018215985968708992, 'learning_rate': 6.146666666666667e-06, 'epoch': 26.32}


 88%|████████▊ | 3300/3750 [11:50:55<1:32:20, 12.31s/it]

{'loss': 0.0001, 'grad_norm': 0.0001144154739449732, 'learning_rate': 6.013333333333333e-06, 'epoch': 26.4}


 88%|████████▊ | 3310/3750 [11:52:58<1:29:40, 12.23s/it]

{'loss': 0.0002, 'grad_norm': 0.00030612022965215147, 'learning_rate': 5.8800000000000005e-06, 'epoch': 26.48}


 89%|████████▊ | 3320/3750 [11:55:01<1:28:00, 12.28s/it]

{'loss': 0.0001, 'grad_norm': 0.0008515930967405438, 'learning_rate': 5.746666666666667e-06, 'epoch': 26.56}


 89%|████████▉ | 3330/3750 [11:57:04<1:26:06, 12.30s/it]

{'loss': 0.0001, 'grad_norm': 0.00010917831968981773, 'learning_rate': 5.6133333333333335e-06, 'epoch': 26.64}


 89%|████████▉ | 3340/3750 [11:59:07<1:23:58, 12.29s/it]

{'loss': 0.0001, 'grad_norm': 0.00011244898632867262, 'learning_rate': 5.48e-06, 'epoch': 26.72}


 89%|████████▉ | 3350/3750 [12:01:10<1:21:32, 12.23s/it]

{'loss': 0.0004, 'grad_norm': 0.00011648421786958352, 'learning_rate': 5.3466666666666674e-06, 'epoch': 26.8}


 90%|████████▉ | 3360/3750 [12:03:13<1:19:39, 12.25s/it]

{'loss': 0.0003, 'grad_norm': 0.5622162222862244, 'learning_rate': 5.213333333333333e-06, 'epoch': 26.88}


 90%|████████▉ | 3370/3750 [12:05:16<1:18:39, 12.42s/it]

{'loss': 0.0015, 'grad_norm': 0.00011606364569161087, 'learning_rate': 5.08e-06, 'epoch': 26.96}


                                                        
 90%|█████████ | 3375/3750 [12:07:23<1:06:48, 10.69s/it]

{'eval_loss': 0.5236798524856567, 'eval_accuracy': 0.9430732065959458, 'eval_f1': 0.6120996824542685, 'eval_runtime': 70.9587, 'eval_samples_per_second': 2.48, 'eval_steps_per_second': 0.31, 'epoch': 27.0}


 90%|█████████ | 3380/3750 [12:08:37<1:50:46, 17.96s/it]

{'loss': 0.0, 'grad_norm': 0.00016071999561972916, 'learning_rate': 4.946666666666667e-06, 'epoch': 27.04}


 90%|█████████ | 3390/3750 [12:10:41<1:14:32, 12.42s/it]

{'loss': 0.0003, 'grad_norm': 0.5360328555107117, 'learning_rate': 4.8133333333333336e-06, 'epoch': 27.12}


 91%|█████████ | 3400/3750 [12:12:44<1:11:37, 12.28s/it]

{'loss': 0.0001, 'grad_norm': 0.0001125108465203084, 'learning_rate': 4.68e-06, 'epoch': 27.2}


 91%|█████████ | 3410/3750 [12:14:47<1:10:36, 12.46s/it]

{'loss': 0.0, 'grad_norm': 0.018889404833316803, 'learning_rate': 4.5466666666666675e-06, 'epoch': 27.28}


 91%|█████████ | 3420/3750 [12:16:50<1:07:44, 12.32s/it]

{'loss': 0.0, 'grad_norm': 0.00031848743674345315, 'learning_rate': 4.413333333333333e-06, 'epoch': 27.36}


 91%|█████████▏| 3430/3750 [12:18:53<1:05:04, 12.20s/it]

{'loss': 0.0, 'grad_norm': 0.02415916696190834, 'learning_rate': 4.28e-06, 'epoch': 27.44}


 92%|█████████▏| 3440/3750 [12:20:56<1:03:15, 12.24s/it]

{'loss': 0.0006, 'grad_norm': 0.00011350040585966781, 'learning_rate': 4.146666666666667e-06, 'epoch': 27.52}


 92%|█████████▏| 3450/3750 [12:22:59<1:01:33, 12.31s/it]

{'loss': 0.0, 'grad_norm': 0.014453019015491009, 'learning_rate': 4.013333333333334e-06, 'epoch': 27.6}


 92%|█████████▏| 3460/3750 [12:25:02<59:28, 12.31s/it]  

{'loss': 0.0, 'grad_norm': 0.0001091777958208695, 'learning_rate': 3.88e-06, 'epoch': 27.68}


 93%|█████████▎| 3470/3750 [12:27:05<57:20, 12.29s/it]

{'loss': 0.0002, 'grad_norm': 0.0021316222846508026, 'learning_rate': 3.746666666666667e-06, 'epoch': 27.76}


 93%|█████████▎| 3480/3750 [12:29:08<55:56, 12.43s/it]

{'loss': 0.0, 'grad_norm': 0.0006864184979349375, 'learning_rate': 3.613333333333334e-06, 'epoch': 27.84}


 93%|█████████▎| 3490/3750 [12:31:11<53:15, 12.29s/it]

{'loss': 0.0, 'grad_norm': 0.0002980683057103306, 'learning_rate': 3.4799999999999997e-06, 'epoch': 27.92}


 93%|█████████▎| 3500/3750 [12:33:07<42:41, 10.24s/it]

{'loss': 0.0001, 'grad_norm': 0.0009906215127557516, 'learning_rate': 3.3466666666666667e-06, 'epoch': 28.0}


                                                      
 93%|█████████▎| 3500/3750 [12:34:18<42:41, 10.24s/it]

{'eval_loss': 0.5232617855072021, 'eval_accuracy': 0.9439134544690684, 'eval_f1': 0.6185119610686327, 'eval_runtime': 70.8255, 'eval_samples_per_second': 2.485, 'eval_steps_per_second': 0.311, 'epoch': 28.0}


 94%|█████████▎| 3510/3750 [12:36:36<53:08, 13.28s/it]  

{'loss': 0.0001, 'grad_norm': 0.00013467995449900627, 'learning_rate': 3.2133333333333336e-06, 'epoch': 28.08}


 94%|█████████▍| 3520/3750 [12:38:39<47:07, 12.29s/it]

{'loss': 0.0, 'grad_norm': 0.0029998510144650936, 'learning_rate': 3.08e-06, 'epoch': 28.16}


 94%|█████████▍| 3530/3750 [12:40:42<44:52, 12.24s/it]

{'loss': 0.0001, 'grad_norm': 0.5019673705101013, 'learning_rate': 2.9466666666666667e-06, 'epoch': 28.24}


 94%|█████████▍| 3540/3750 [12:42:45<42:50, 12.24s/it]

{'loss': 0.0005, 'grad_norm': 0.0024684991221874952, 'learning_rate': 2.8133333333333336e-06, 'epoch': 28.32}


 95%|█████████▍| 3550/3750 [12:44:48<40:45, 12.23s/it]

{'loss': 0.0027, 'grad_norm': 0.7263360619544983, 'learning_rate': 2.68e-06, 'epoch': 28.4}


 95%|█████████▍| 3560/3750 [12:46:51<38:51, 12.27s/it]

{'loss': 0.0, 'grad_norm': 0.00011090448242612183, 'learning_rate': 2.5466666666666667e-06, 'epoch': 28.48}


 95%|█████████▌| 3570/3750 [12:48:54<37:23, 12.46s/it]

{'loss': 0.0, 'grad_norm': 0.00011253684351686388, 'learning_rate': 2.4133333333333332e-06, 'epoch': 28.56}


 95%|█████████▌| 3580/3750 [12:50:58<35:17, 12.46s/it]

{'loss': 0.0001, 'grad_norm': 0.00015027917106635869, 'learning_rate': 2.28e-06, 'epoch': 28.64}


 96%|█████████▌| 3590/3750 [12:53:01<32:46, 12.29s/it]

{'loss': 0.0, 'grad_norm': 0.00011180395085830241, 'learning_rate': 2.1466666666666667e-06, 'epoch': 28.72}


 96%|█████████▌| 3600/3750 [12:55:04<30:43, 12.29s/it]

{'loss': 0.0001, 'grad_norm': 0.0010116492630913854, 'learning_rate': 2.0133333333333333e-06, 'epoch': 28.8}


 96%|█████████▋| 3610/3750 [12:57:07<28:39, 12.28s/it]

{'loss': 0.0, 'grad_norm': 0.000813676102552563, 'learning_rate': 1.8800000000000002e-06, 'epoch': 28.88}


 97%|█████████▋| 3620/3750 [12:59:10<26:36, 12.28s/it]

{'loss': 0.0004, 'grad_norm': 0.005813307128846645, 'learning_rate': 1.7466666666666665e-06, 'epoch': 28.96}


                                                      
 97%|█████████▋| 3625/3750 [13:01:15<21:27, 10.30s/it]

{'eval_loss': 0.5213159918785095, 'eval_accuracy': 0.9435983615166474, 'eval_f1': 0.6180341533432248, 'eval_runtime': 71.0588, 'eval_samples_per_second': 2.477, 'eval_steps_per_second': 0.31, 'epoch': 29.0}


 97%|█████████▋| 3630/3750 [13:02:29<35:49, 17.91s/it]  

{'loss': 0.0, 'grad_norm': 0.012076049111783504, 'learning_rate': 1.6133333333333333e-06, 'epoch': 29.04}


 97%|█████████▋| 3640/3750 [13:04:32<22:47, 12.43s/it]

{'loss': 0.0017, 'grad_norm': 0.00010771704546641558, 'learning_rate': 1.4800000000000002e-06, 'epoch': 29.12}


 97%|█████████▋| 3650/3750 [13:06:35<20:25, 12.26s/it]

{'loss': 0.0001, 'grad_norm': 0.0009319575619883835, 'learning_rate': 1.3466666666666668e-06, 'epoch': 29.2}


 98%|█████████▊| 3660/3750 [13:08:38<18:25, 12.28s/it]

{'loss': 0.0001, 'grad_norm': 0.0018560183234512806, 'learning_rate': 1.2133333333333333e-06, 'epoch': 29.28}


 98%|█████████▊| 3670/3750 [13:10:41<16:23, 12.29s/it]

{'loss': 0.0001, 'grad_norm': 0.0006614604499191046, 'learning_rate': 1.08e-06, 'epoch': 29.36}


 98%|█████████▊| 3680/3750 [13:12:44<14:16, 12.24s/it]

{'loss': 0.0, 'grad_norm': 0.00030021171551197767, 'learning_rate': 9.466666666666667e-07, 'epoch': 29.44}


 98%|█████████▊| 3690/3750 [13:14:47<12:13, 12.22s/it]

{'loss': 0.0, 'grad_norm': 0.00010660119005478919, 'learning_rate': 8.133333333333333e-07, 'epoch': 29.52}


 99%|█████████▊| 3700/3750 [13:16:50<10:11, 12.24s/it]

{'loss': 0.0001, 'grad_norm': 0.00011291418923065066, 'learning_rate': 6.8e-07, 'epoch': 29.6}


 99%|█████████▉| 3710/3750 [13:18:53<08:17, 12.45s/it]

{'loss': 0.0001, 'grad_norm': 0.000115732014819514, 'learning_rate': 5.466666666666667e-07, 'epoch': 29.68}


 99%|█████████▉| 3720/3750 [13:20:56<06:14, 12.47s/it]

{'loss': 0.0003, 'grad_norm': 0.0009100873721763492, 'learning_rate': 4.133333333333334e-07, 'epoch': 29.76}


 99%|█████████▉| 3730/3750 [13:23:00<04:09, 12.46s/it]

{'loss': 0.0004, 'grad_norm': 0.010664567351341248, 'learning_rate': 2.8e-07, 'epoch': 29.84}


100%|█████████▉| 3740/3750 [13:25:03<02:02, 12.29s/it]

{'loss': 0.0, 'grad_norm': 0.001720730448141694, 'learning_rate': 1.4666666666666668e-07, 'epoch': 29.92}


100%|██████████| 3750/3750 [13:26:59<00:00, 10.33s/it]

{'loss': 0.0005, 'grad_norm': 0.004230150021612644, 'learning_rate': 1.3333333333333335e-08, 'epoch': 30.0}


                                                      
100%|██████████| 3750/3750 [13:28:24<00:00, 10.33s/it]

{'eval_loss': 0.5209712386131287, 'eval_accuracy': 0.9435983615166474, 'eval_f1': 0.6180341533432248, 'eval_runtime': 70.666, 'eval_samples_per_second': 2.491, 'eval_steps_per_second': 0.311, 'epoch': 30.0}


100%|██████████| 3750/3750 [13:28:50<00:00, 12.94s/it]


{'train_runtime': 48530.1569, 'train_samples_per_second': 0.616, 'train_steps_per_second': 0.077, 'train_loss': 0.05545929155767274, 'epoch': 30.0}
Training complete! Best model saved to: finetuned_layoutlm_final_perfect
