In [1]:
import os 

os.chdir("../..")

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from task1.config import ProjectPaths
import pandas as pd
import torch

paths = ProjectPaths()

# === 3. Set device ===
device = "mps" if torch.backends.mps.is_available() else "cpu"

# === 4. Load and preprocess data ===
def load_datasets(path):
    df = pd.read_csv(path, sep='\t')
    df = df[df['label'].isin(['SUBJ', 'OBJ'])].copy()
    df['label'] = df['label'].map({'OBJ': 0, 'SUBJ': 1})
    df = df[['sentence', 'label']]
    return Dataset.from_pandas(df)

train_dataset = load_datasets(paths.english_data_dir / "train_en.tsv")
val_dataset   = load_datasets(paths.english_data_dir / "dev_en.tsv")
test_dataset  = load_datasets(paths.english_data_dir / "dev_test_en.tsv")

W0613 21:50:04.550000 13020 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
dataset = load_dataset("rotten_tomatoes")
train_dataset_stage1 = dataset["train"]
val_dataset_stage1 = dataset["validation"]

small_train_dataset = train_dataset_stage1.select(range(1000))  # Use only 1000 examples for training
small_val_dataset = val_dataset_stage1.select(range(200))

In [4]:
model_name = "microsoft/deberta-v3-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


STAGE 1 FINETUNE

In [3]:
def tokenize_stage1(examples):
    # The text column in this dataset is named 'text'
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

In [5]:
tokenized_train_dataset = small_train_dataset.map(tokenize_stage1, batched=True)
tokenized_val_dataset = small_val_dataset.map(tokenize_stage1, batched=True)

# The 'label' column is already correct, just need to set the format
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# The Trainer works best when the label column is named 'labels'
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_val_dataset = tokenized_val_dataset.rename_column("label", "labels")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [6]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_CLS,
    target_modules=["query_proj", "key_proj", "value_proj", "dense"] 
)

model = get_peft_model(model, lora_config).to(device)

In [7]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "precision": precision.compute(predictions=preds, references=labels, average="macro")["precision"],
        "recall": recall.compute(predictions=preds, references=labels, average="macro")["recall"],
    }

In [9]:
# === 8. TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=True,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

# === 10. Train ===
trainer.train()

trainer.save_model("./results/stage1-rotten-tomatoes/final_checkpoint")

  0%|          | 0/1250 [00:00<?, ?it/s]



{'loss': 0.5125, 'grad_norm': 3.277414321899414, 'learning_rate': 4.96e-05, 'epoch': 0.04}
{'loss': 0.438, 'grad_norm': 2.9193906784057617, 'learning_rate': 4.92e-05, 'epoch': 0.08}
{'loss': 0.3646, 'grad_norm': 2.579204797744751, 'learning_rate': 4.88e-05, 'epoch': 0.12}
{'loss': 0.276, 'grad_norm': 2.0394837856292725, 'learning_rate': 4.8400000000000004e-05, 'epoch': 0.16}
{'loss': 0.1926, 'grad_norm': 1.5165563821792603, 'learning_rate': 4.8e-05, 'epoch': 0.2}
{'loss': 0.1219, 'grad_norm': 1.0093077421188354, 'learning_rate': 4.76e-05, 'epoch': 0.24}
{'loss': 0.0604, 'grad_norm': 0.6101557612419128, 'learning_rate': 4.72e-05, 'epoch': 0.28}
{'loss': 0.0225, 'grad_norm': 0.2730678915977478, 'learning_rate': 4.6800000000000006e-05, 'epoch': 0.32}
{'loss': 0.0088, 'grad_norm': 0.15851998329162598, 'learning_rate': 4.64e-05, 'epoch': 0.36}
{'loss': 0.0042, 'grad_norm': 0.06501436978578568, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.4}
{'loss': 0.0023, 'grad_norm': 0.053331635892

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.00015026680193841457, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 14.3622, 'eval_samples_per_second': 13.925, 'eval_steps_per_second': 3.481, 'epoch': 1.0}




{'loss': 0.0003, 'grad_norm': 0.009262253530323505, 'learning_rate': 3.960000000000001e-05, 'epoch': 1.04}
{'loss': 0.0003, 'grad_norm': 0.007886522449553013, 'learning_rate': 3.9200000000000004e-05, 'epoch': 1.08}
{'loss': 0.0003, 'grad_norm': 0.007912270724773407, 'learning_rate': 3.88e-05, 'epoch': 1.12}
{'loss': 0.0002, 'grad_norm': 0.00939998310059309, 'learning_rate': 3.8400000000000005e-05, 'epoch': 1.16}
{'loss': 0.0002, 'grad_norm': 0.006601901724934578, 'learning_rate': 3.8e-05, 'epoch': 1.2}
{'loss': 0.0002, 'grad_norm': 0.006959951017051935, 'learning_rate': 3.76e-05, 'epoch': 1.24}
{'loss': 0.0002, 'grad_norm': 0.0059132324531674385, 'learning_rate': 3.72e-05, 'epoch': 1.28}
{'loss': 0.0002, 'grad_norm': 0.005666421726346016, 'learning_rate': 3.68e-05, 'epoch': 1.32}
{'loss': 0.0002, 'grad_norm': 0.005643172655254602, 'learning_rate': 3.6400000000000004e-05, 'epoch': 1.36}
{'loss': 0.0002, 'grad_norm': 0.004386341664940119, 'learning_rate': 3.6e-05, 'epoch': 1.4}
{'loss': 

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 4.456419628695585e-05, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 13.3921, 'eval_samples_per_second': 14.934, 'eval_steps_per_second': 3.734, 'epoch': 2.0}




{'loss': 0.0001, 'grad_norm': 0.0033771907910704613, 'learning_rate': 2.96e-05, 'epoch': 2.04}
{'loss': 0.0001, 'grad_norm': 0.003621310694143176, 'learning_rate': 2.9199999999999998e-05, 'epoch': 2.08}
{'loss': 0.0001, 'grad_norm': 0.0022555270697921515, 'learning_rate': 2.88e-05, 'epoch': 2.12}
{'loss': 0.0001, 'grad_norm': 0.0027023237198591232, 'learning_rate': 2.84e-05, 'epoch': 2.16}
{'loss': 0.0001, 'grad_norm': 0.0031810638029128313, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.2}
{'loss': 0.0001, 'grad_norm': 0.00251132738776505, 'learning_rate': 2.7600000000000003e-05, 'epoch': 2.24}
{'loss': 0.0001, 'grad_norm': 0.003545573679730296, 'learning_rate': 2.7200000000000004e-05, 'epoch': 2.28}
{'loss': 0.0001, 'grad_norm': 0.00284872786141932, 'learning_rate': 2.6800000000000004e-05, 'epoch': 2.32}
{'loss': 0.0001, 'grad_norm': 0.0032106763683259487, 'learning_rate': 2.64e-05, 'epoch': 2.36}
{'loss': 0.0001, 'grad_norm': 0.003920187707990408, 'learning_rate': 2.60000000000

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 2.4574692361056805e-05, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 13.9356, 'eval_samples_per_second': 14.352, 'eval_steps_per_second': 3.588, 'epoch': 3.0}




{'loss': 0.0001, 'grad_norm': 0.0023126991000026464, 'learning_rate': 1.9600000000000002e-05, 'epoch': 3.04}
{'loss': 0.0001, 'grad_norm': 0.003034348599612713, 'learning_rate': 1.9200000000000003e-05, 'epoch': 3.08}
{'loss': 0.0001, 'grad_norm': 0.0020138768013566732, 'learning_rate': 1.88e-05, 'epoch': 3.12}
{'loss': 0.0001, 'grad_norm': 0.0019444555509835482, 'learning_rate': 1.84e-05, 'epoch': 3.16}
{'loss': 0.0001, 'grad_norm': 0.0019115214236080647, 'learning_rate': 1.8e-05, 'epoch': 3.2}
{'loss': 0.0001, 'grad_norm': 0.0016724423039704561, 'learning_rate': 1.76e-05, 'epoch': 3.24}
{'loss': 0.0, 'grad_norm': 0.0015990036772564054, 'learning_rate': 1.7199999999999998e-05, 'epoch': 3.28}
{'loss': 0.0001, 'grad_norm': 0.0013744912575930357, 'learning_rate': 1.6800000000000002e-05, 'epoch': 3.32}
{'loss': 0.0001, 'grad_norm': 0.0018672782462090254, 'learning_rate': 1.6400000000000002e-05, 'epoch': 3.36}
{'loss': 0.0001, 'grad_norm': 0.0011863559484481812, 'learning_rate': 1.600000000

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.789315319911111e-05, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 13.6475, 'eval_samples_per_second': 14.655, 'eval_steps_per_second': 3.664, 'epoch': 4.0}




{'loss': 0.0, 'grad_norm': 0.0012299632653594017, 'learning_rate': 9.600000000000001e-06, 'epoch': 4.04}
{'loss': 0.0, 'grad_norm': 0.001932988641783595, 'learning_rate': 9.2e-06, 'epoch': 4.08}
{'loss': 0.0, 'grad_norm': 0.0020748027600347996, 'learning_rate': 8.8e-06, 'epoch': 4.12}
{'loss': 0.0001, 'grad_norm': 0.004370896145701408, 'learning_rate': 8.400000000000001e-06, 'epoch': 4.16}
{'loss': 0.0, 'grad_norm': 0.0010559711372479796, 'learning_rate': 8.000000000000001e-06, 'epoch': 4.2}
{'loss': 0.0, 'grad_norm': 0.0018097219290211797, 'learning_rate': 7.6e-06, 'epoch': 4.24}
{'loss': 0.0, 'grad_norm': 0.0015827303286641836, 'learning_rate': 7.2e-06, 'epoch': 4.28}
{'loss': 0.0001, 'grad_norm': 0.0012150249676778913, 'learning_rate': 6.800000000000001e-06, 'epoch': 4.32}
{'loss': 0.0, 'grad_norm': 0.0012280270457267761, 'learning_rate': 6.4000000000000006e-06, 'epoch': 4.36}
{'loss': 0.0, 'grad_norm': 0.0016680666012689471, 'learning_rate': 6e-06, 'epoch': 4.4}
{'loss': 0.0, 'grad

  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.607583908480592e-05, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 13.9267, 'eval_samples_per_second': 14.361, 'eval_steps_per_second': 3.59, 'epoch': 5.0}




{'train_runtime': 1715.1648, 'train_samples_per_second': 2.915, 'train_steps_per_second': 0.729, 'train_loss': 0.016182669390528464, 'epoch': 5.0}




STAGE 2

In [11]:
def tokenize_fn(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [12]:
from peft import PeftModel

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = PeftModel.from_pretrained(base_model, "./results/stage1-rotten-tomatoes/final_checkpoint")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_CLS,
    target_modules=["query_proj", "key_proj", "value_proj", "dense"] 
)

model = get_peft_model(model, lora_config).to(device)

In [14]:
# === 8. TrainingArguments ===
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# === 10. Train ===
trainer.train()

# === 11. Evaluate on test set ===
trainer.evaluate(eval_dataset=test_dataset)

  0%|          | 0/2080 [00:00<?, ?it/s]



{'loss': 0.7158, 'grad_norm': 2.6309142112731934, 'learning_rate': 4.9759615384615386e-05, 'epoch': 0.05}
{'loss': 0.7413, 'grad_norm': 4.365935325622559, 'learning_rate': 4.9519230769230776e-05, 'epoch': 0.1}
{'loss': 0.6976, 'grad_norm': 2.2812647819519043, 'learning_rate': 4.927884615384616e-05, 'epoch': 0.14}
{'loss': 0.6893, 'grad_norm': 1.0032331943511963, 'learning_rate': 4.9038461538461536e-05, 'epoch': 0.19}
{'loss': 0.6935, 'grad_norm': 1.085867166519165, 'learning_rate': 4.8798076923076926e-05, 'epoch': 0.24}
{'loss': 0.665, 'grad_norm': 3.629504442214966, 'learning_rate': 4.855769230769231e-05, 'epoch': 0.29}
{'loss': 0.6473, 'grad_norm': 1.706917405128479, 'learning_rate': 4.8317307692307693e-05, 'epoch': 0.34}
{'loss': 0.6658, 'grad_norm': 3.340214729309082, 'learning_rate': 4.8076923076923084e-05, 'epoch': 0.38}
{'loss': 0.6709, 'grad_norm': 1.4903264045715332, 'learning_rate': 4.783653846153847e-05, 'epoch': 0.43}
{'loss': 0.65, 'grad_norm': 1.477706789970398, 'learning

  0%|          | 0/116 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'eval_loss': 0.7485350966453552, 'eval_accuracy': 0.4805194805194805, 'eval_f1_macro': 0.32456140350877194, 'eval_precision': 0.24025974025974026, 'eval_recall': 0.5, 'eval_runtime': 31.879, 'eval_samples_per_second': 14.492, 'eval_steps_per_second': 3.639, 'epoch': 1.0}
{'loss': 0.5944, 'grad_norm': 1.0798184871673584, 'learning_rate': 4.495192307692308e-05, 'epoch': 1.01}
{'loss': 0.6815, 'grad_norm': 3.670705795288086, 'learning_rate': 4.4711538461538466e-05, 'epoch': 1.06}
{'loss': 0.6479, 'grad_norm': 3.354647159576416, 'learning_rate': 4.447115384615384e-05, 'epoch': 1.11}
{'loss': 0.6563, 'grad_norm': 1.6315975189208984, 'learning_rate': 4.423076923076923e-05, 'epoch': 1.15}
{'loss': 0.5759, 'grad_norm': 1.2726610898971558, 'learning_rate': 4.3990384615384616e-05, 'epoch': 1.2}
{'loss': 0.6289, 'grad_norm': 2.087432622909546, 'learning_rate': 4.375e-05, 'epoch': 1.25}
{'loss': 0.5575, 'grad_norm': 3.4683303833007812, 'learning_rate': 4.350961538461539e-05, 'epoch': 1.3}
{'loss'

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.5776647329330444, 'eval_accuracy': 0.7489177489177489, 'eval_f1_macro': 0.7486162448167815, 'eval_precision': 0.7485556722689075, 'eval_recall': 0.7487049549549549, 'eval_runtime': 31.9392, 'eval_samples_per_second': 14.465, 'eval_steps_per_second': 3.632, 'epoch': 2.0}




{'loss': 0.4773, 'grad_norm': 1.1511704921722412, 'learning_rate': 3.9903846153846155e-05, 'epoch': 2.02}
{'loss': 0.4391, 'grad_norm': 1.0211766958236694, 'learning_rate': 3.966346153846154e-05, 'epoch': 2.07}
{'loss': 0.496, 'grad_norm': 4.388455867767334, 'learning_rate': 3.942307692307692e-05, 'epoch': 2.12}
{'loss': 0.7016, 'grad_norm': 9.57120418548584, 'learning_rate': 3.918269230769231e-05, 'epoch': 2.16}
{'loss': 0.4218, 'grad_norm': 11.63208293914795, 'learning_rate': 3.8942307692307696e-05, 'epoch': 2.21}
{'loss': 0.4181, 'grad_norm': 0.5472500920295715, 'learning_rate': 3.870192307692308e-05, 'epoch': 2.26}
{'loss': 0.6692, 'grad_norm': 17.737932205200195, 'learning_rate': 3.846153846153846e-05, 'epoch': 2.31}
{'loss': 0.2692, 'grad_norm': 1.8524974584579468, 'learning_rate': 3.8221153846153846e-05, 'epoch': 2.36}
{'loss': 0.2486, 'grad_norm': 1.6352205276489258, 'learning_rate': 3.798076923076923e-05, 'epoch': 2.4}
{'loss': 0.6171, 'grad_norm': 1.5832785367965698, 'learnin

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.7074335217475891, 'eval_accuracy': 0.7662337662337663, 'eval_f1_macro': 0.7638930950938825, 'eval_precision': 0.7871503496503496, 'eval_recall': 0.7712837837837838, 'eval_runtime': 32.2238, 'eval_samples_per_second': 14.337, 'eval_steps_per_second': 3.6, 'epoch': 3.0}




{'loss': 0.3086, 'grad_norm': 2.400819778442383, 'learning_rate': 3.485576923076923e-05, 'epoch': 3.03}
{'loss': 0.4891, 'grad_norm': 11.556095123291016, 'learning_rate': 3.461538461538462e-05, 'epoch': 3.08}
{'loss': 0.3419, 'grad_norm': 0.7763872742652893, 'learning_rate': 3.4375e-05, 'epoch': 3.12}
{'loss': 0.8371, 'grad_norm': 4.672676086425781, 'learning_rate': 3.4134615384615386e-05, 'epoch': 3.17}
{'loss': 0.2709, 'grad_norm': 5.861742973327637, 'learning_rate': 3.3894230769230776e-05, 'epoch': 3.22}
{'loss': 0.429, 'grad_norm': 13.483816146850586, 'learning_rate': 3.365384615384616e-05, 'epoch': 3.27}
{'loss': 0.2291, 'grad_norm': 0.699564516544342, 'learning_rate': 3.3413461538461536e-05, 'epoch': 3.32}
{'loss': 0.4639, 'grad_norm': 0.4182063639163971, 'learning_rate': 3.3173076923076926e-05, 'epoch': 3.37}
{'loss': 0.5647, 'grad_norm': 3.8619370460510254, 'learning_rate': 3.293269230769231e-05, 'epoch': 3.41}
{'loss': 0.529, 'grad_norm': 17.611005783081055, 'learning_rate': 3

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.6220338344573975, 'eval_accuracy': 0.7965367965367965, 'eval_f1_macro': 0.7965329835082459, 'eval_precision': 0.7981404958677686, 'eval_recall': 0.7979166666666666, 'eval_runtime': 31.1256, 'eval_samples_per_second': 14.843, 'eval_steps_per_second': 3.727, 'epoch': 4.0}




{'loss': 0.2451, 'grad_norm': 4.175917625427246, 'learning_rate': 2.9807692307692308e-05, 'epoch': 4.04}
{'loss': 0.3067, 'grad_norm': 6.523779392242432, 'learning_rate': 2.9567307692307695e-05, 'epoch': 4.09}
{'loss': 0.6254, 'grad_norm': 7.666072368621826, 'learning_rate': 2.932692307692308e-05, 'epoch': 4.13}
{'loss': 0.3372, 'grad_norm': 0.41526758670806885, 'learning_rate': 2.9086538461538465e-05, 'epoch': 4.18}
{'loss': 0.2485, 'grad_norm': 3.8127057552337646, 'learning_rate': 2.8846153846153845e-05, 'epoch': 4.23}
{'loss': 0.24, 'grad_norm': 5.140757083892822, 'learning_rate': 2.860576923076923e-05, 'epoch': 4.28}
{'loss': 0.213, 'grad_norm': 0.44469577074050903, 'learning_rate': 2.8365384615384616e-05, 'epoch': 4.33}
{'loss': 0.4403, 'grad_norm': 0.5003892779350281, 'learning_rate': 2.8125000000000003e-05, 'epoch': 4.38}
{'loss': 0.3649, 'grad_norm': 2.986278533935547, 'learning_rate': 2.7884615384615386e-05, 'epoch': 4.42}
{'loss': 0.4183, 'grad_norm': 0.7238451838493347, 'lea

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.6049712896347046, 'eval_accuracy': 0.7770562770562771, 'eval_f1_macro': 0.7749208916890156, 'eval_precision': 0.7808412561221549, 'eval_recall': 0.7744369369369369, 'eval_runtime': 32.3873, 'eval_samples_per_second': 14.265, 'eval_steps_per_second': 3.582, 'epoch': 5.0}




{'loss': 0.2218, 'grad_norm': 8.287753105163574, 'learning_rate': 2.4759615384615388e-05, 'epoch': 5.05}
{'loss': 0.3197, 'grad_norm': 0.5161803364753723, 'learning_rate': 2.4519230769230768e-05, 'epoch': 5.1}
{'loss': 0.5229, 'grad_norm': 31.6905517578125, 'learning_rate': 2.4278846153846155e-05, 'epoch': 5.14}
{'loss': 0.3076, 'grad_norm': 14.561025619506836, 'learning_rate': 2.4038461538461542e-05, 'epoch': 5.19}
{'loss': 0.4056, 'grad_norm': 0.25801676511764526, 'learning_rate': 2.3798076923076922e-05, 'epoch': 5.24}
{'loss': 0.4772, 'grad_norm': 3.6835227012634277, 'learning_rate': 2.355769230769231e-05, 'epoch': 5.29}
{'loss': 0.3572, 'grad_norm': 20.35322380065918, 'learning_rate': 2.3317307692307692e-05, 'epoch': 5.34}
{'loss': 0.5734, 'grad_norm': 0.5391113758087158, 'learning_rate': 2.307692307692308e-05, 'epoch': 5.38}
{'loss': 0.4305, 'grad_norm': 5.674897193908691, 'learning_rate': 2.2836538461538463e-05, 'epoch': 5.43}
{'loss': 0.4565, 'grad_norm': 3.4031033515930176, 'le

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.6812739968299866, 'eval_accuracy': 0.7922077922077922, 'eval_f1_macro': 0.7916455268053442, 'eval_precision': 0.8011363636363636, 'eval_recall': 0.7954391891891892, 'eval_runtime': 31.4417, 'eval_samples_per_second': 14.694, 'eval_steps_per_second': 3.689, 'epoch': 6.0}




{'loss': 0.4277, 'grad_norm': 10.552940368652344, 'learning_rate': 1.9951923076923078e-05, 'epoch': 6.01}
{'loss': 0.2472, 'grad_norm': 0.4496120512485504, 'learning_rate': 1.971153846153846e-05, 'epoch': 6.06}
{'loss': 0.1069, 'grad_norm': 11.161867141723633, 'learning_rate': 1.9471153846153848e-05, 'epoch': 6.11}
{'loss': 0.3246, 'grad_norm': 8.6983060836792, 'learning_rate': 1.923076923076923e-05, 'epoch': 6.15}
{'loss': 0.4737, 'grad_norm': 5.5420660972595215, 'learning_rate': 1.8990384615384615e-05, 'epoch': 6.2}
{'loss': 0.5248, 'grad_norm': 5.767539024353027, 'learning_rate': 1.8750000000000002e-05, 'epoch': 6.25}
{'loss': 0.4467, 'grad_norm': 15.867173194885254, 'learning_rate': 1.8509615384615385e-05, 'epoch': 6.3}
{'loss': 0.1932, 'grad_norm': 5.527315139770508, 'learning_rate': 1.826923076923077e-05, 'epoch': 6.35}
{'loss': 0.2661, 'grad_norm': 0.7370733022689819, 'learning_rate': 1.8028846153846156e-05, 'epoch': 6.39}
{'loss': 0.2653, 'grad_norm': 8.97192096710205, 'learnin

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.7820060849189758, 'eval_accuracy': 0.7748917748917749, 'eval_f1_macro': 0.7736659632197769, 'eval_precision': 0.7885152974064296, 'eval_recall': 0.7789414414414415, 'eval_runtime': 33.5496, 'eval_samples_per_second': 13.771, 'eval_steps_per_second': 3.458, 'epoch': 7.0}




{'loss': 0.3353, 'grad_norm': 13.57868766784668, 'learning_rate': 1.4903846153846154e-05, 'epoch': 7.02}
{'loss': 0.4774, 'grad_norm': 19.689579010009766, 'learning_rate': 1.466346153846154e-05, 'epoch': 7.07}
{'loss': 0.1299, 'grad_norm': 0.46344655752182007, 'learning_rate': 1.4423076923076923e-05, 'epoch': 7.12}
{'loss': 0.4036, 'grad_norm': 0.2670450806617737, 'learning_rate': 1.4182692307692308e-05, 'epoch': 7.16}
{'loss': 0.3374, 'grad_norm': 0.2098051905632019, 'learning_rate': 1.3942307692307693e-05, 'epoch': 7.21}
{'loss': 0.6979, 'grad_norm': 5.856878280639648, 'learning_rate': 1.3701923076923078e-05, 'epoch': 7.26}
{'loss': 0.1938, 'grad_norm': 12.298230171203613, 'learning_rate': 1.3461538461538462e-05, 'epoch': 7.31}
{'loss': 0.2577, 'grad_norm': 0.21689049899578094, 'learning_rate': 1.3221153846153847e-05, 'epoch': 7.36}
{'loss': 0.365, 'grad_norm': 1.2410768270492554, 'learning_rate': 1.2980769230769232e-05, 'epoch': 7.4}
{'loss': 0.0425, 'grad_norm': 0.6255244016647339,

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.7771704196929932, 'eval_accuracy': 0.7965367965367965, 'eval_f1_macro': 0.7962274774774776, 'eval_precision': 0.802952576379389, 'eval_recall': 0.7992680180180181, 'eval_runtime': 30.9806, 'eval_samples_per_second': 14.913, 'eval_steps_per_second': 3.744, 'epoch': 8.0}




{'loss': 0.6964, 'grad_norm': 0.4213506579399109, 'learning_rate': 9.85576923076923e-06, 'epoch': 8.03}
{'loss': 0.3204, 'grad_norm': 0.28340980410575867, 'learning_rate': 9.615384615384616e-06, 'epoch': 8.08}
{'loss': 0.252, 'grad_norm': 0.22745178639888763, 'learning_rate': 9.375000000000001e-06, 'epoch': 8.12}
{'loss': 0.1699, 'grad_norm': 0.4436761438846588, 'learning_rate': 9.134615384615384e-06, 'epoch': 8.17}
{'loss': 0.296, 'grad_norm': 0.9309639930725098, 'learning_rate': 8.89423076923077e-06, 'epoch': 8.22}
{'loss': 0.4854, 'grad_norm': 22.498491287231445, 'learning_rate': 8.653846153846155e-06, 'epoch': 8.27}
{'loss': 0.5662, 'grad_norm': 10.544913291931152, 'learning_rate': 8.41346153846154e-06, 'epoch': 8.32}
{'loss': 0.1409, 'grad_norm': 0.16386733949184418, 'learning_rate': 8.173076923076923e-06, 'epoch': 8.37}
{'loss': 0.3608, 'grad_norm': 0.16042792797088623, 'learning_rate': 7.932692307692308e-06, 'epoch': 8.41}
{'loss': 0.5159, 'grad_norm': 0.2758682370185852, 'learn

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.7697113752365112, 'eval_accuracy': 0.8008658008658008, 'eval_f1_macro': 0.8008322087269456, 'eval_precision': 0.8034101174345076, 'eval_recall': 0.8025900900900901, 'eval_runtime': 31.8622, 'eval_samples_per_second': 14.5, 'eval_steps_per_second': 3.641, 'epoch': 9.0}




{'loss': 0.0958, 'grad_norm': 0.16159193217754364, 'learning_rate': 4.807692307692308e-06, 'epoch': 9.04}
{'loss': 0.428, 'grad_norm': 14.142708778381348, 'learning_rate': 4.567307692307692e-06, 'epoch': 9.09}
{'loss': 0.1882, 'grad_norm': 0.19805146753787994, 'learning_rate': 4.326923076923077e-06, 'epoch': 9.13}
{'loss': 0.2386, 'grad_norm': 0.18475200235843658, 'learning_rate': 4.086538461538462e-06, 'epoch': 9.18}
{'loss': 0.2388, 'grad_norm': 11.519464492797852, 'learning_rate': 3.846153846153847e-06, 'epoch': 9.23}
{'loss': 0.6297, 'grad_norm': 6.080330848693848, 'learning_rate': 3.6057692307692307e-06, 'epoch': 9.28}
{'loss': 0.2681, 'grad_norm': 0.1991061270236969, 'learning_rate': 3.3653846153846154e-06, 'epoch': 9.33}
{'loss': 0.2496, 'grad_norm': 3.957364797592163, 'learning_rate': 3.125e-06, 'epoch': 9.38}
{'loss': 0.4178, 'grad_norm': 19.460834503173828, 'learning_rate': 2.884615384615385e-06, 'epoch': 9.42}
{'loss': 0.2517, 'grad_norm': 0.1953696459531784, 'learning_rate'

  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.7592930197715759, 'eval_accuracy': 0.8008658008658008, 'eval_f1_macro': 0.8008620689655173, 'eval_precision': 0.8024793388429752, 'eval_recall': 0.8022522522522523, 'eval_runtime': 30.9328, 'eval_samples_per_second': 14.936, 'eval_steps_per_second': 3.75, 'epoch': 10.0}
{'train_runtime': 2995.3065, 'train_samples_per_second': 2.771, 'train_steps_per_second': 0.694, 'train_loss': 0.42064572182985455, 'epoch': 10.0}




  0%|          | 0/121 [00:00<?, ?it/s]

{'eval_loss': 0.8633573055267334,
 'eval_accuracy': 0.8057851239669421,
 'eval_f1_macro': 0.673611111111111,
 'eval_precision': 0.7854435831180018,
 'eval_recall': 0.6500769857802735,
 'eval_runtime': 33.5153,
 'eval_samples_per_second': 14.441,
 'eval_steps_per_second': 3.61,
 'epoch': 10.0}