pip install -q git+https://github.com/huggingface/trl

In [3]:
import os

# Turn off parallelism in the Hugging Face `tokenizers` library via its env var.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Hugging Face Hub identifiers --------------------------------------------
# Base checkpoint that gets fine-tuned.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Instruction dataset used for training and evaluation.
dataset_name = "StarkWizard/cairo-instruct"
# Name of the fine-tuned model (training output directory and Hub push target).
new_model = "StarkWizard/Mistral-7b-instruct-cairo-PEFT"
# Additional Hub repository name; not referenced by the cells shown here.
hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct"

# --- Training budget -----------------------------------------------------------
# Total number of training steps; tweak to get the best out of the model.
max_steps = 1000

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import os
 
# Tokenizer of the base checkpoint. A pad token is assigned explicitly by
# aliasing it to the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model with every module placed on device 0.
# - `token=True` (use the locally stored Hub credentials) replaces the
#   deprecated `use_auth_token=True`.
# - `trust_remote_code` is intentionally not passed: Mistral is implemented
#   inside transformers, so there is no reason to allow code from the model
#   repository to execute.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=True,
    device_map={"": 0},
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

# Adjust the loaded model's config and prepare it for k-bit fine-tuning.
# Disable `use_cache` on the model config for training.
model.config.use_cache=False
# NOTE(review): `pretraining_tp` looks carried over from Llama fine-tuning
# recipes — confirm it has any effect on this Mistral checkpoint.
model.config.pretraining_tp=1
# NOTE(review): the Mistral config in transformers appears to call its attention
# window `sliding_window`; setting `window` presumably just adds an unused
# attribute — confirm what was intended before relying on it.
model.config.window = 256 
model.gradient_checkpointing_enable()
# Rebind `model` to the object returned by PEFT's k-bit preparation helper.
model = prepare_model_for_kbit_training(model)


Loading Dataset

In [5]:
from datasets import load_dataset

# Load the dataset.
# The repository is fetched a single time and both splits are taken from the
# returned DatasetDict; calling `load_dataset` once per split with
# `force_redownload` would download the same files twice.
# `verification_mode="no_checks"` is the current spelling of the deprecated
# `ignore_verifications=True`.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]



Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/245 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/245 [00:00<?, ? examples/s]

In [6]:



# LoRA adapter settings for causal-language-model fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    # Adapter rank, scaling and regularisation.
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Modules (matched by name) that receive an adapter.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ],
)

# Wrap the prepared base model with the LoRA adapters described above.
model = get_peft_model(model, peft_config)
# Hyper-parameters and bookkeeping options handed to the trainer.
training_arguments = TrainingArguments(
    # Where results go: local output directory (also pushed to the Hub) and logs.
    output_dir=new_model,
    push_to_hub=True,
    logging_dir="./logs",
    # Step budget and batching: one sample per device, no accumulation.
    max_steps=max_steps,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Memory / precision settings.
    gradient_checkpointing=True,
    fp16=False,
    # Logging, evaluation and checkpoint cadence. No `eval_steps` is given;
    # the recorded run shows one evaluation per 5-step logging interval.
    logging_steps=5,
    evaluation_strategy="steps",
    save_strategy="epoch",
)

# Supervised fine-tuning trainer wired to the model, tokenizer and both splits.
trainer = SFTTrainer(
    # Model side.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    # Data side: examples are read from the "text" column, without packing,
    # with a maximum sequence length of 512.
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    # Training options.
    args=training_arguments,
    neftune_noise_alpha=5,
)


Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [7]:

# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Upload the model held by the trainer to the Hub repository named by `new_model`.
trainer.model.push_to_hub(new_model)

  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 4.9419, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.648280143737793, 'eval_runtime': 0.9615, 'eval_samples_per_second': 5.2, 'eval_steps_per_second': 1.04, 'epoch': 0.02}
{'loss': 3.9783, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.627208709716797, 'eval_runtime': 0.9668, 'eval_samples_per_second': 5.172, 'eval_steps_per_second': 1.034, 'epoch': 0.04}
{'loss': 4.435, 'learning_rate': 1e-05, 'epoch': 0.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.582626819610596, 'eval_runtime': 0.9696, 'eval_samples_per_second': 5.157, 'eval_steps_per_second': 1.031, 'epoch': 0.06}
{'loss': 4.2584, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.493605613708496, 'eval_runtime': 0.989, 'eval_samples_per_second': 5.056, 'eval_steps_per_second': 1.011, 'epoch': 0.08}
{'loss': 4.3762, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.1}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.3248162269592285, 'eval_runtime': 0.9879, 'eval_samples_per_second': 5.061, 'eval_steps_per_second': 1.012, 'epoch': 0.1}
{'loss': 3.9439, 'learning_rate': 2e-05, 'epoch': 0.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.127838134765625, 'eval_runtime': 1.0238, 'eval_samples_per_second': 4.884, 'eval_steps_per_second': 0.977, 'epoch': 0.12}
{'loss': 3.5457, 'learning_rate': 1.9998688836656322e-05, 'epoch': 0.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.8556900024414062, 'eval_runtime': 0.9837, 'eval_samples_per_second': 5.083, 'eval_steps_per_second': 1.017, 'epoch': 0.14}
{'loss': 3.9832, 'learning_rate': 1.9994755690455154e-05, 'epoch': 0.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.613964796066284, 'eval_runtime': 0.982, 'eval_samples_per_second': 5.092, 'eval_steps_per_second': 1.018, 'epoch': 0.16}
{'loss': 3.8853, 'learning_rate': 1.998820159279591e-05, 'epoch': 0.18}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.40177583694458, 'eval_runtime': 0.9899, 'eval_samples_per_second': 5.051, 'eval_steps_per_second': 1.01, 'epoch': 0.18}
{'loss': 2.9133, 'learning_rate': 1.997902826237712e-05, 'epoch': 0.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.252758502960205, 'eval_runtime': 1.0489, 'eval_samples_per_second': 4.767, 'eval_steps_per_second': 0.953, 'epoch': 0.2}
{'loss': 2.7053, 'learning_rate': 1.9967238104745695e-05, 'epoch': 0.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.1319775581359863, 'eval_runtime': 1.1031, 'eval_samples_per_second': 4.533, 'eval_steps_per_second': 0.907, 'epoch': 0.22}
{'loss': 2.2822, 'learning_rate': 1.995283421166614e-05, 'epoch': 0.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.9988157749176025, 'eval_runtime': 1.0143, 'eval_samples_per_second': 4.929, 'eval_steps_per_second': 0.986, 'epoch': 0.24}
{'loss': 3.3321, 'learning_rate': 1.993582036030978e-05, 'epoch': 0.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.865684986114502, 'eval_runtime': 0.9758, 'eval_samples_per_second': 5.124, 'eval_steps_per_second': 1.025, 'epoch': 0.27}
{'loss': 2.6849, 'learning_rate': 1.9916201012264255e-05, 'epoch': 0.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.758342742919922, 'eval_runtime': 1.0452, 'eval_samples_per_second': 4.784, 'eval_steps_per_second': 0.957, 'epoch': 0.29}
{'loss': 2.1692, 'learning_rate': 1.9893981312363563e-05, 'epoch': 0.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.7543416023254395, 'eval_runtime': 1.052, 'eval_samples_per_second': 4.753, 'eval_steps_per_second': 0.951, 'epoch': 0.31}
{'loss': 2.7414, 'learning_rate': 1.9869167087338908e-05, 'epoch': 0.33}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.7596824169158936, 'eval_runtime': 0.9868, 'eval_samples_per_second': 5.067, 'eval_steps_per_second': 1.013, 'epoch': 0.33}
{'loss': 2.2613, 'learning_rate': 1.9841764844290744e-05, 'epoch': 0.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.6586971282958984, 'eval_runtime': 1.0468, 'eval_samples_per_second': 4.777, 'eval_steps_per_second': 0.955, 'epoch': 0.35}
{'loss': 2.4811, 'learning_rate': 1.9811781768982392e-05, 'epoch': 0.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.560894727706909, 'eval_runtime': 1.0352, 'eval_samples_per_second': 4.83, 'eval_steps_per_second': 0.966, 'epoch': 0.37}
{'loss': 2.0377, 'learning_rate': 1.977922572395571e-05, 'epoch': 0.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.579294204711914, 'eval_runtime': 1.0921, 'eval_samples_per_second': 4.578, 'eval_steps_per_second': 0.916, 'epoch': 0.39}
{'loss': 1.945, 'learning_rate': 1.9744105246469264e-05, 'epoch': 0.41}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.6100664138793945, 'eval_runtime': 1.0863, 'eval_samples_per_second': 4.603, 'eval_steps_per_second': 0.921, 'epoch': 0.41}
{'loss': 2.7281, 'learning_rate': 1.9706429546259592e-05, 'epoch': 0.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.2494733333587646, 'eval_runtime': 1.0187, 'eval_samples_per_second': 4.908, 'eval_steps_per_second': 0.982, 'epoch': 0.43}
{'loss': 2.2824, 'learning_rate': 1.9666208503126115e-05, 'epoch': 0.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.193776845932007, 'eval_runtime': 0.9865, 'eval_samples_per_second': 5.068, 'eval_steps_per_second': 1.014, 'epoch': 0.45}
{'loss': 2.1301, 'learning_rate': 1.9623452664340305e-05, 'epoch': 0.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1809732913970947, 'eval_runtime': 1.0985, 'eval_samples_per_second': 4.552, 'eval_steps_per_second': 0.91, 'epoch': 0.47}
{'loss': 2.1599, 'learning_rate': 1.957817324187987e-05, 'epoch': 0.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1663665771484375, 'eval_runtime': 0.982, 'eval_samples_per_second': 5.092, 'eval_steps_per_second': 1.018, 'epoch': 0.49}
{'loss': 2.4669, 'learning_rate': 1.953038210948861e-05, 'epoch': 0.51}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.156499147415161, 'eval_runtime': 1.0565, 'eval_samples_per_second': 4.733, 'eval_steps_per_second': 0.947, 'epoch': 0.51}
{'loss': 1.9863, 'learning_rate': 1.9480091799562706e-05, 'epoch': 0.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1338536739349365, 'eval_runtime': 1.062, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 0.942, 'epoch': 0.53}
{'loss': 2.133, 'learning_rate': 1.9427315499864345e-05, 'epoch': 0.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.094836473464966, 'eval_runtime': 1.0029, 'eval_samples_per_second': 4.986, 'eval_steps_per_second': 0.997, 'epoch': 0.55}
{'loss': 1.8624, 'learning_rate': 1.937206705006344e-05, 'epoch': 0.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.049799680709839, 'eval_runtime': 1.098, 'eval_samples_per_second': 4.554, 'eval_steps_per_second': 0.911, 'epoch': 0.57}
{'loss': 2.0716, 'learning_rate': 1.9314360938108427e-05, 'epoch': 0.59}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.995033621788025, 'eval_runtime': 0.9772, 'eval_samples_per_second': 5.117, 'eval_steps_per_second': 1.023, 'epoch': 0.59}
{'loss': 1.7386, 'learning_rate': 1.9254212296427043e-05, 'epoch': 0.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9597610235214233, 'eval_runtime': 1.043, 'eval_samples_per_second': 4.794, 'eval_steps_per_second': 0.959, 'epoch': 0.61}
{'loss': 1.52, 'learning_rate': 1.9191636897958123e-05, 'epoch': 0.63}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9357579946517944, 'eval_runtime': 1.0654, 'eval_samples_per_second': 4.693, 'eval_steps_per_second': 0.939, 'epoch': 0.63}
{'loss': 1.3844, 'learning_rate': 1.9126651152015404e-05, 'epoch': 0.65}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9232494831085205, 'eval_runtime': 1.0135, 'eval_samples_per_second': 4.933, 'eval_steps_per_second': 0.987, 'epoch': 0.65}
{'loss': 2.0719, 'learning_rate': 1.905927209998447e-05, 'epoch': 0.67}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9007654190063477, 'eval_runtime': 1.0439, 'eval_samples_per_second': 4.79, 'eval_steps_per_second': 0.958, 'epoch': 0.67}
{'loss': 1.6826, 'learning_rate': 1.8989517410853956e-05, 'epoch': 0.69}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.862769365310669, 'eval_runtime': 0.9812, 'eval_samples_per_second': 5.096, 'eval_steps_per_second': 1.019, 'epoch': 0.69}
{'loss': 1.9418, 'learning_rate': 1.8917405376582144e-05, 'epoch': 0.71}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8324064016342163, 'eval_runtime': 1.0861, 'eval_samples_per_second': 4.604, 'eval_steps_per_second': 0.921, 'epoch': 0.71}
{'loss': 1.5165, 'learning_rate': 1.8842954907300236e-05, 'epoch': 0.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7975536584854126, 'eval_runtime': 0.9966, 'eval_samples_per_second': 5.017, 'eval_steps_per_second': 1.003, 'epoch': 0.73}
{'loss': 1.5764, 'learning_rate': 1.876618552635348e-05, 'epoch': 0.76}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7771412134170532, 'eval_runtime': 1.0185, 'eval_samples_per_second': 4.909, 'eval_steps_per_second': 0.982, 'epoch': 0.76}
{'loss': 1.7126, 'learning_rate': 1.8687117365181514e-05, 'epoch': 0.78}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7570476531982422, 'eval_runtime': 0.9543, 'eval_samples_per_second': 5.239, 'eval_steps_per_second': 1.048, 'epoch': 0.78}
{'loss': 1.5553, 'learning_rate': 1.8605771158039253e-05, 'epoch': 0.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7477325201034546, 'eval_runtime': 0.9762, 'eval_samples_per_second': 5.122, 'eval_steps_per_second': 1.024, 'epoch': 0.8}
{'loss': 1.4325, 'learning_rate': 1.8522168236559693e-05, 'epoch': 0.82}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.738938570022583, 'eval_runtime': 0.9727, 'eval_samples_per_second': 5.14, 'eval_steps_per_second': 1.028, 'epoch': 0.82}
{'loss': 1.7383, 'learning_rate': 1.8436330524160048e-05, 'epoch': 0.84}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7283666133880615, 'eval_runtime': 1.0623, 'eval_samples_per_second': 4.707, 'eval_steps_per_second': 0.941, 'epoch': 0.84}
{'loss': 1.4096, 'learning_rate': 1.8348280530292712e-05, 'epoch': 0.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7192062139511108, 'eval_runtime': 0.9881, 'eval_samples_per_second': 5.06, 'eval_steps_per_second': 1.012, 'epoch': 0.86}
{'loss': 1.3947, 'learning_rate': 1.8258041344542567e-05, 'epoch': 0.88}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7158523797988892, 'eval_runtime': 1.0314, 'eval_samples_per_second': 4.848, 'eval_steps_per_second': 0.97, 'epoch': 0.88}
{'loss': 1.4394, 'learning_rate': 1.816563663057211e-05, 'epoch': 0.9}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.708975076675415, 'eval_runtime': 0.9539, 'eval_samples_per_second': 5.242, 'eval_steps_per_second': 1.048, 'epoch': 0.9}
{'loss': 1.5481, 'learning_rate': 1.8071090619916095e-05, 'epoch': 0.92}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7074848413467407, 'eval_runtime': 1.0613, 'eval_samples_per_second': 4.711, 'eval_steps_per_second': 0.942, 'epoch': 0.92}
{'loss': 1.4635, 'learning_rate': 1.797442810562721e-05, 'epoch': 0.94}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7116563320159912, 'eval_runtime': 0.9676, 'eval_samples_per_second': 5.167, 'eval_steps_per_second': 1.033, 'epoch': 0.94}
{'loss': 1.3564, 'learning_rate': 1.7875674435774546e-05, 'epoch': 0.96}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7041183710098267, 'eval_runtime': 0.9829, 'eval_samples_per_second': 5.087, 'eval_steps_per_second': 1.017, 'epoch': 0.96}
{'loss': 1.5381, 'learning_rate': 1.7774855506796497e-05, 'epoch': 0.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6901878118515015, 'eval_runtime': 1.0602, 'eval_samples_per_second': 4.716, 'eval_steps_per_second': 0.943, 'epoch': 0.98}
{'loss': 1.2412, 'learning_rate': 1.767199775670986e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.683773398399353, 'eval_runtime': 1.0628, 'eval_samples_per_second': 4.704, 'eval_steps_per_second': 0.941, 'epoch': 1.0}




{'loss': 1.7424, 'learning_rate': 1.7567128158176955e-05, 'epoch': 1.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6802936792373657, 'eval_runtime': 1.0208, 'eval_samples_per_second': 4.898, 'eval_steps_per_second': 0.98, 'epoch': 1.02}
{'loss': 1.2657, 'learning_rate': 1.7460274211432463e-05, 'epoch': 1.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.686870813369751, 'eval_runtime': 0.9502, 'eval_samples_per_second': 5.262, 'eval_steps_per_second': 1.052, 'epoch': 1.04}
{'loss': 1.2026, 'learning_rate': 1.7351463937072008e-05, 'epoch': 1.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7028175592422485, 'eval_runtime': 1.031, 'eval_samples_per_second': 4.85, 'eval_steps_per_second': 0.97, 'epoch': 1.06}
{'loss': 0.9746, 'learning_rate': 1.7240725868704218e-05, 'epoch': 1.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7092052698135376, 'eval_runtime': 0.9688, 'eval_samples_per_second': 5.161, 'eval_steps_per_second': 1.032, 'epoch': 1.08}
{'loss': 1.3277, 'learning_rate': 1.7128089045468294e-05, 'epoch': 1.1}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7065465450286865, 'eval_runtime': 1.0625, 'eval_samples_per_second': 4.706, 'eval_steps_per_second': 0.941, 'epoch': 1.1}
{'loss': 1.741, 'learning_rate': 1.7013583004418994e-05, 'epoch': 1.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7007354497909546, 'eval_runtime': 1.0283, 'eval_samples_per_second': 4.862, 'eval_steps_per_second': 0.972, 'epoch': 1.12}
{'loss': 1.4553, 'learning_rate': 1.6897237772781046e-05, 'epoch': 1.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.690101981163025, 'eval_runtime': 1.0619, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 0.942, 'epoch': 1.14}
{'loss': 1.4277, 'learning_rate': 1.6779083860075032e-05, 'epoch': 1.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6786473989486694, 'eval_runtime': 0.989, 'eval_samples_per_second': 5.056, 'eval_steps_per_second': 1.011, 'epoch': 1.16}
{'loss': 1.5373, 'learning_rate': 1.665915225011681e-05, 'epoch': 1.18}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6731199026107788, 'eval_runtime': 1.0585, 'eval_samples_per_second': 4.724, 'eval_steps_per_second': 0.945, 'epoch': 1.18}
{'loss': 1.3754, 'learning_rate': 1.6537474392892527e-05, 'epoch': 1.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.669006586074829, 'eval_runtime': 1.0012, 'eval_samples_per_second': 4.994, 'eval_steps_per_second': 0.999, 'epoch': 1.2}
{'loss': 1.8448, 'learning_rate': 1.6414082196311402e-05, 'epoch': 1.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6428474187850952, 'eval_runtime': 0.9516, 'eval_samples_per_second': 5.254, 'eval_steps_per_second': 1.051, 'epoch': 1.22}
{'loss': 1.132, 'learning_rate': 1.6289008017838447e-05, 'epoch': 1.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6276661157608032, 'eval_runtime': 1.0489, 'eval_samples_per_second': 4.767, 'eval_steps_per_second': 0.953, 'epoch': 1.24}
{'loss': 1.1909, 'learning_rate': 1.6162284656009276e-05, 'epoch': 1.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6236035823822021, 'eval_runtime': 1.0054, 'eval_samples_per_second': 4.973, 'eval_steps_per_second': 0.995, 'epoch': 1.27}
{'loss': 1.2459, 'learning_rate': 1.603394534182925e-05, 'epoch': 1.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6253389120101929, 'eval_runtime': 0.9597, 'eval_samples_per_second': 5.21, 'eval_steps_per_second': 1.042, 'epoch': 1.29}
{'loss': 1.1233, 'learning_rate': 1.5904023730059227e-05, 'epoch': 1.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.631037950515747, 'eval_runtime': 1.0095, 'eval_samples_per_second': 4.953, 'eval_steps_per_second': 0.991, 'epoch': 1.31}
{'loss': 1.1812, 'learning_rate': 1.5772553890390196e-05, 'epoch': 1.33}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6326717138290405, 'eval_runtime': 1.0621, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 0.942, 'epoch': 1.33}
{'loss': 1.2173, 'learning_rate': 1.5639570298509067e-05, 'epoch': 1.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6318066120147705, 'eval_runtime': 1.0453, 'eval_samples_per_second': 4.783, 'eval_steps_per_second': 0.957, 'epoch': 1.35}
{'loss': 1.1845, 'learning_rate': 1.5505107827058038e-05, 'epoch': 1.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6330829858779907, 'eval_runtime': 1.0261, 'eval_samples_per_second': 4.873, 'eval_steps_per_second': 0.975, 'epoch': 1.37}
{'loss': 1.4047, 'learning_rate': 1.536920173648984e-05, 'epoch': 1.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.619769811630249, 'eval_runtime': 1.0478, 'eval_samples_per_second': 4.772, 'eval_steps_per_second': 0.954, 'epoch': 1.39}
{'loss': 1.3456, 'learning_rate': 1.52318876658213e-05, 'epoch': 1.41}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6102068424224854, 'eval_runtime': 0.9472, 'eval_samples_per_second': 5.279, 'eval_steps_per_second': 1.056, 'epoch': 1.41}
{'loss': 1.0766, 'learning_rate': 1.5093201623287631e-05, 'epoch': 1.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5971643924713135, 'eval_runtime': 1.067, 'eval_samples_per_second': 4.686, 'eval_steps_per_second': 0.937, 'epoch': 1.43}
{'loss': 1.434, 'learning_rate': 1.4953179976899878e-05, 'epoch': 1.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5710870027542114, 'eval_runtime': 1.019, 'eval_samples_per_second': 4.907, 'eval_steps_per_second': 0.981, 'epoch': 1.45}
{'loss': 1.4121, 'learning_rate': 1.4811859444908053e-05, 'epoch': 1.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5519123077392578, 'eval_runtime': 1.0625, 'eval_samples_per_second': 4.706, 'eval_steps_per_second': 0.941, 'epoch': 1.47}
{'loss': 0.991, 'learning_rate': 1.4669277086172406e-05, 'epoch': 1.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.530672311782837, 'eval_runtime': 1.0564, 'eval_samples_per_second': 4.733, 'eval_steps_per_second': 0.947, 'epoch': 1.49}
{'loss': 1.1855, 'learning_rate': 1.4525470290445392e-05, 'epoch': 1.51}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5250167846679688, 'eval_runtime': 1.0172, 'eval_samples_per_second': 4.916, 'eval_steps_per_second': 0.983, 'epoch': 1.51}
{'loss': 0.9791, 'learning_rate': 1.4380476768566825e-05, 'epoch': 1.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5176395177841187, 'eval_runtime': 0.9766, 'eval_samples_per_second': 5.12, 'eval_steps_per_second': 1.024, 'epoch': 1.53}
{'loss': 1.1704, 'learning_rate': 1.4234334542574906e-05, 'epoch': 1.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5166162252426147, 'eval_runtime': 1.0539, 'eval_samples_per_second': 4.744, 'eval_steps_per_second': 0.949, 'epoch': 1.55}
{'loss': 0.8702, 'learning_rate': 1.4087081935735565e-05, 'epoch': 1.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.518131136894226, 'eval_runtime': 1.0156, 'eval_samples_per_second': 4.923, 'eval_steps_per_second': 0.985, 'epoch': 1.57}
{'loss': 1.1582, 'learning_rate': 1.3938757562492873e-05, 'epoch': 1.59}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5083940029144287, 'eval_runtime': 1.0209, 'eval_samples_per_second': 4.898, 'eval_steps_per_second': 0.98, 'epoch': 1.59}
{'loss': 1.0805, 'learning_rate': 1.378940031834307e-05, 'epoch': 1.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5045589208602905, 'eval_runtime': 1.0413, 'eval_samples_per_second': 4.802, 'eval_steps_per_second': 0.96, 'epoch': 1.61}
{'loss': 1.3099, 'learning_rate': 1.3639049369634878e-05, 'epoch': 1.63}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4955168962478638, 'eval_runtime': 0.9573, 'eval_samples_per_second': 5.223, 'eval_steps_per_second': 1.045, 'epoch': 1.63}
{'loss': 1.2066, 'learning_rate': 1.3487744143298822e-05, 'epoch': 1.65}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4817583560943604, 'eval_runtime': 0.9507, 'eval_samples_per_second': 5.259, 'eval_steps_per_second': 1.052, 'epoch': 1.65}
{'loss': 1.0825, 'learning_rate': 1.3335524316508208e-05, 'epoch': 1.67}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.48458731174469, 'eval_runtime': 1.0285, 'eval_samples_per_second': 4.862, 'eval_steps_per_second': 0.972, 'epoch': 1.67}
{'loss': 1.0802, 'learning_rate': 1.3182429806274442e-05, 'epoch': 1.69}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4848607778549194, 'eval_runtime': 1.062, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 0.942, 'epoch': 1.69}
{'loss': 1.7319, 'learning_rate': 1.3028500758979507e-05, 'epoch': 1.71}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4854938983917236, 'eval_runtime': 0.9737, 'eval_samples_per_second': 5.135, 'eval_steps_per_second': 1.027, 'epoch': 1.71}
{'loss': 1.5408, 'learning_rate': 1.2873777539848284e-05, 'epoch': 1.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4908990859985352, 'eval_runtime': 1.0458, 'eval_samples_per_second': 4.781, 'eval_steps_per_second': 0.956, 'epoch': 1.73}
{'loss': 0.5243, 'learning_rate': 1.2718300722363431e-05, 'epoch': 1.76}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4992594718933105, 'eval_runtime': 1.0591, 'eval_samples_per_second': 4.721, 'eval_steps_per_second': 0.944, 'epoch': 1.76}
{'loss': 1.0521, 'learning_rate': 1.2562111077625723e-05, 'epoch': 1.78}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4942649602890015, 'eval_runtime': 1.0022, 'eval_samples_per_second': 4.989, 'eval_steps_per_second': 0.998, 'epoch': 1.78}
{'loss': 1.0145, 'learning_rate': 1.2405249563662539e-05, 'epoch': 1.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4867466688156128, 'eval_runtime': 1.0396, 'eval_samples_per_second': 4.809, 'eval_steps_per_second': 0.962, 'epoch': 1.8}
{'loss': 1.0813, 'learning_rate': 1.2247757314687296e-05, 'epoch': 1.82}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4759937524795532, 'eval_runtime': 1.0275, 'eval_samples_per_second': 4.866, 'eval_steps_per_second': 0.973, 'epoch': 1.82}
{'loss': 1.1515, 'learning_rate': 1.2089675630312755e-05, 'epoch': 1.84}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4461733102798462, 'eval_runtime': 0.9718, 'eval_samples_per_second': 5.145, 'eval_steps_per_second': 1.029, 'epoch': 1.84}
{'loss': 0.9266, 'learning_rate': 1.1931045964720882e-05, 'epoch': 1.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4357622861862183, 'eval_runtime': 0.9622, 'eval_samples_per_second': 5.196, 'eval_steps_per_second': 1.039, 'epoch': 1.86}
{'loss': 0.6752, 'learning_rate': 1.177190991579223e-05, 'epoch': 1.88}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.432822346687317, 'eval_runtime': 1.0587, 'eval_samples_per_second': 4.723, 'eval_steps_per_second': 0.945, 'epoch': 1.88}
{'loss': 1.1664, 'learning_rate': 1.1612309214197599e-05, 'epoch': 1.9}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.434196949005127, 'eval_runtime': 1.0279, 'eval_samples_per_second': 4.864, 'eval_steps_per_second': 0.973, 'epoch': 1.9}
{'loss': 1.1168, 'learning_rate': 1.1452285712454905e-05, 'epoch': 1.92}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4389960765838623, 'eval_runtime': 0.9607, 'eval_samples_per_second': 5.205, 'eval_steps_per_second': 1.041, 'epoch': 1.92}
{'loss': 1.3819, 'learning_rate': 1.1291881373954066e-05, 'epoch': 1.94}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4467544555664062, 'eval_runtime': 0.9792, 'eval_samples_per_second': 5.106, 'eval_steps_per_second': 1.021, 'epoch': 1.94}
{'loss': 0.9204, 'learning_rate': 1.1131138261952845e-05, 'epoch': 1.96}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4450793266296387, 'eval_runtime': 0.9663, 'eval_samples_per_second': 5.174, 'eval_steps_per_second': 1.035, 'epoch': 1.96}
{'loss': 0.8669, 'learning_rate': 1.0970098528546482e-05, 'epoch': 1.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4357173442840576, 'eval_runtime': 1.0074, 'eval_samples_per_second': 4.963, 'eval_steps_per_second': 0.993, 'epoch': 1.98}
{'loss': 1.0333, 'learning_rate': 1.0808804403614044e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.423604965209961, 'eval_runtime': 0.9901, 'eval_samples_per_second': 5.05, 'eval_steps_per_second': 1.01, 'epoch': 2.0}




{'loss': 1.0886, 'learning_rate': 1.0647298183744359e-05, 'epoch': 2.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4128453731536865, 'eval_runtime': 0.9619, 'eval_samples_per_second': 5.198, 'eval_steps_per_second': 1.04, 'epoch': 2.02}
{'loss': 1.1797, 'learning_rate': 1.0485622221144485e-05, 'epoch': 2.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4084722995758057, 'eval_runtime': 1.0745, 'eval_samples_per_second': 4.653, 'eval_steps_per_second': 0.931, 'epoch': 2.04}
{'loss': 1.0462, 'learning_rate': 1.0323818912533561e-05, 'epoch': 2.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.409127950668335, 'eval_runtime': 1.0777, 'eval_samples_per_second': 4.639, 'eval_steps_per_second': 0.928, 'epoch': 2.06}
{'loss': 1.009, 'learning_rate': 1.0161930688025018e-05, 'epoch': 2.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4156675338745117, 'eval_runtime': 1.056, 'eval_samples_per_second': 4.735, 'eval_steps_per_second': 0.947, 'epoch': 2.08}
{'loss': 0.7713, 'learning_rate': 1e-05, 'epoch': 2.1}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4276854991912842, 'eval_runtime': 1.067, 'eval_samples_per_second': 4.686, 'eval_steps_per_second': 0.937, 'epoch': 2.1}
{'loss': 1.1869, 'learning_rate': 9.838069311974986e-06, 'epoch': 2.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4372100830078125, 'eval_runtime': 1.0629, 'eval_samples_per_second': 4.704, 'eval_steps_per_second': 0.941, 'epoch': 2.12}
{'loss': 0.5705, 'learning_rate': 9.676181087466444e-06, 'epoch': 2.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4451981782913208, 'eval_runtime': 0.9772, 'eval_samples_per_second': 5.117, 'eval_steps_per_second': 1.023, 'epoch': 2.14}
{'loss': 0.8965, 'learning_rate': 9.514377778855521e-06, 'epoch': 2.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4561508893966675, 'eval_runtime': 1.0782, 'eval_samples_per_second': 4.637, 'eval_steps_per_second': 0.927, 'epoch': 2.16}
{'loss': 0.6888, 'learning_rate': 9.352701816255643e-06, 'epoch': 2.18}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4562872648239136, 'eval_runtime': 1.0377, 'eval_samples_per_second': 4.818, 'eval_steps_per_second': 0.964, 'epoch': 2.18}
{'loss': 0.682, 'learning_rate': 9.19119559638596e-06, 'epoch': 2.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4598742723464966, 'eval_runtime': 1.0285, 'eval_samples_per_second': 4.862, 'eval_steps_per_second': 0.972, 'epoch': 2.2}
{'loss': 0.8815, 'learning_rate': 9.02990147145352e-06, 'epoch': 2.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4600144624710083, 'eval_runtime': 1.0707, 'eval_samples_per_second': 4.67, 'eval_steps_per_second': 0.934, 'epoch': 2.22}
{'loss': 0.9211, 'learning_rate': 8.868861738047158e-06, 'epoch': 2.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.465942144393921, 'eval_runtime': 0.962, 'eval_samples_per_second': 5.197, 'eval_steps_per_second': 1.039, 'epoch': 2.24}
{'loss': 0.8063, 'learning_rate': 8.708118626045939e-06, 'epoch': 2.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.46628999710083, 'eval_runtime': 1.0378, 'eval_samples_per_second': 4.818, 'eval_steps_per_second': 0.964, 'epoch': 2.27}
{'loss': 0.6676, 'learning_rate': 8.5477142875451e-06, 'epoch': 2.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4635310173034668, 'eval_runtime': 1.0753, 'eval_samples_per_second': 4.65, 'eval_steps_per_second': 0.93, 'epoch': 2.29}
{'loss': 1.0024, 'learning_rate': 8.387690785802403e-06, 'epoch': 2.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4577163457870483, 'eval_runtime': 0.9904, 'eval_samples_per_second': 5.049, 'eval_steps_per_second': 1.01, 'epoch': 2.31}
{'loss': 0.9457, 'learning_rate': 8.228090084207773e-06, 'epoch': 2.33}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4535603523254395, 'eval_runtime': 0.9959, 'eval_samples_per_second': 5.02, 'eval_steps_per_second': 1.004, 'epoch': 2.33}
{'loss': 1.0273, 'learning_rate': 8.068954035279121e-06, 'epoch': 2.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4479535818099976, 'eval_runtime': 1.0726, 'eval_samples_per_second': 4.662, 'eval_steps_per_second': 0.932, 'epoch': 2.35}
{'loss': 0.5464, 'learning_rate': 7.91032436968725e-06, 'epoch': 2.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4496335983276367, 'eval_runtime': 0.9752, 'eval_samples_per_second': 5.127, 'eval_steps_per_second': 1.025, 'epoch': 2.37}
{'loss': 0.7404, 'learning_rate': 7.752242685312709e-06, 'epoch': 2.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4582407474517822, 'eval_runtime': 1.0953, 'eval_samples_per_second': 4.565, 'eval_steps_per_second': 0.913, 'epoch': 2.39}
{'loss': 0.7804, 'learning_rate': 7.594750436337467e-06, 'epoch': 2.41}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4658985137939453, 'eval_runtime': 1.065, 'eval_samples_per_second': 4.695, 'eval_steps_per_second': 0.939, 'epoch': 2.41}
{'loss': 0.9942, 'learning_rate': 7.4378889223742766e-06, 'epoch': 2.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4701298475265503, 'eval_runtime': 0.9789, 'eval_samples_per_second': 5.108, 'eval_steps_per_second': 1.022, 'epoch': 2.43}
{'loss': 0.9433, 'learning_rate': 7.2816992776365714e-06, 'epoch': 2.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.472982406616211, 'eval_runtime': 1.0568, 'eval_samples_per_second': 4.731, 'eval_steps_per_second': 0.946, 'epoch': 2.45}
{'loss': 0.8804, 'learning_rate': 7.126222460151719e-06, 'epoch': 2.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4687778949737549, 'eval_runtime': 1.0832, 'eval_samples_per_second': 4.616, 'eval_steps_per_second': 0.923, 'epoch': 2.47}
{'loss': 0.7836, 'learning_rate': 6.971499241020495e-06, 'epoch': 2.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4657362699508667, 'eval_runtime': 0.9963, 'eval_samples_per_second': 5.019, 'eval_steps_per_second': 1.004, 'epoch': 2.49}
{'loss': 0.7613, 'learning_rate': 6.8175701937255645e-06, 'epoch': 2.51}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4587688446044922, 'eval_runtime': 1.0823, 'eval_samples_per_second': 4.62, 'eval_steps_per_second': 0.924, 'epoch': 2.51}
{'loss': 0.8007, 'learning_rate': 6.664475683491797e-06, 'epoch': 2.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4564504623413086, 'eval_runtime': 1.0827, 'eval_samples_per_second': 4.618, 'eval_steps_per_second': 0.924, 'epoch': 2.53}
{'loss': 0.7768, 'learning_rate': 6.5122558567011775e-06, 'epoch': 2.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.450104832649231, 'eval_runtime': 1.0357, 'eval_samples_per_second': 4.828, 'eval_steps_per_second': 0.966, 'epoch': 2.55}
{'loss': 0.9832, 'learning_rate': 6.360950630365126e-06, 'epoch': 2.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4430418014526367, 'eval_runtime': 1.009, 'eval_samples_per_second': 4.956, 'eval_steps_per_second': 0.991, 'epoch': 2.57}
{'loss': 0.7297, 'learning_rate': 6.210599681656933e-06, 'epoch': 2.59}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4410481452941895, 'eval_runtime': 1.0868, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 0.92, 'epoch': 2.59}
{'loss': 0.8646, 'learning_rate': 6.061242437507131e-06, 'epoch': 2.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4439502954483032, 'eval_runtime': 1.0859, 'eval_samples_per_second': 4.605, 'eval_steps_per_second': 0.921, 'epoch': 2.61}
{'loss': 1.1847, 'learning_rate': 5.912918064264441e-06, 'epoch': 2.63}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4449137449264526, 'eval_runtime': 1.0488, 'eval_samples_per_second': 4.767, 'eval_steps_per_second': 0.953, 'epoch': 2.63}
{'loss': 0.7582, 'learning_rate': 5.765665457425102e-06, 'epoch': 2.65}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4396737813949585, 'eval_runtime': 1.0055, 'eval_samples_per_second': 4.973, 'eval_steps_per_second': 0.995, 'epoch': 2.65}
{'loss': 1.024, 'learning_rate': 5.619523231433177e-06, 'epoch': 2.67}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4311939477920532, 'eval_runtime': 1.0829, 'eval_samples_per_second': 4.617, 'eval_steps_per_second': 0.923, 'epoch': 2.67}
{'loss': 0.6909, 'learning_rate': 5.4745297095546125e-06, 'epoch': 2.69}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.429716944694519, 'eval_runtime': 0.9653, 'eval_samples_per_second': 5.18, 'eval_steps_per_second': 1.036, 'epoch': 2.69}
{'loss': 0.9462, 'learning_rate': 5.330722913827594e-06, 'epoch': 2.71}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.431138515472412, 'eval_runtime': 1.0856, 'eval_samples_per_second': 4.606, 'eval_steps_per_second': 0.921, 'epoch': 2.71}
{'loss': 0.6868, 'learning_rate': 5.18814055509195e-06, 'epoch': 2.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4344227313995361, 'eval_runtime': 1.076, 'eval_samples_per_second': 4.647, 'eval_steps_per_second': 0.929, 'epoch': 2.73}
{'loss': 0.9798, 'learning_rate': 5.046820023100129e-06, 'epoch': 2.76}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4379546642303467, 'eval_runtime': 1.0608, 'eval_samples_per_second': 4.713, 'eval_steps_per_second': 0.943, 'epoch': 2.76}
{'loss': 1.2549, 'learning_rate': 4.9067983767123736e-06, 'epoch': 2.78}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4391754865646362, 'eval_runtime': 0.979, 'eval_samples_per_second': 5.107, 'eval_steps_per_second': 1.021, 'epoch': 2.78}
{'loss': 0.5431, 'learning_rate': 4.7681123341787e-06, 'epoch': 2.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4393600225448608, 'eval_runtime': 1.0828, 'eval_samples_per_second': 4.618, 'eval_steps_per_second': 0.924, 'epoch': 2.8}
{'loss': 0.7168, 'learning_rate': 4.630798263510162e-06, 'epoch': 2.82}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4391018152236938, 'eval_runtime': 0.9792, 'eval_samples_per_second': 5.106, 'eval_steps_per_second': 1.021, 'epoch': 2.82}
{'loss': 0.8719, 'learning_rate': 4.494892172941965e-06, 'epoch': 2.84}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.43899405002594, 'eval_runtime': 1.081, 'eval_samples_per_second': 4.625, 'eval_steps_per_second': 0.925, 'epoch': 2.84}
{'loss': 0.6935, 'learning_rate': 4.360429701490935e-06, 'epoch': 2.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4359900951385498, 'eval_runtime': 1.0913, 'eval_samples_per_second': 4.582, 'eval_steps_per_second': 0.916, 'epoch': 2.86}
{'loss': 0.7472, 'learning_rate': 4.2274461096098085e-06, 'epoch': 2.88}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.422940969467163, 'eval_runtime': 1.0104, 'eval_samples_per_second': 4.949, 'eval_steps_per_second': 0.99, 'epoch': 2.88}
{'loss': 0.7485, 'learning_rate': 4.095976269940777e-06, 'epoch': 2.9}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4084949493408203, 'eval_runtime': 1.0225, 'eval_samples_per_second': 4.89, 'eval_steps_per_second': 0.978, 'epoch': 2.9}
{'loss': 0.8291, 'learning_rate': 3.966054658170754e-06, 'epoch': 2.92}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3976655006408691, 'eval_runtime': 1.0994, 'eval_samples_per_second': 4.548, 'eval_steps_per_second': 0.91, 'epoch': 2.92}
{'loss': 0.8684, 'learning_rate': 3.837715343990727e-06, 'epoch': 2.94}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3934195041656494, 'eval_runtime': 1.0014, 'eval_samples_per_second': 4.993, 'eval_steps_per_second': 0.999, 'epoch': 2.94}
{'loss': 0.7158, 'learning_rate': 3.7109919821615546e-06, 'epoch': 2.96}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3929800987243652, 'eval_runtime': 1.0906, 'eval_samples_per_second': 4.585, 'eval_steps_per_second': 0.917, 'epoch': 2.96}
{'loss': 0.9039, 'learning_rate': 3.585917803688603e-06, 'epoch': 2.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3936004638671875, 'eval_runtime': 1.0934, 'eval_samples_per_second': 4.573, 'eval_steps_per_second': 0.915, 'epoch': 2.98}
{'loss': 0.6393, 'learning_rate': 3.4625256071074776e-06, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3933602571487427, 'eval_runtime': 1.0914, 'eval_samples_per_second': 4.581, 'eval_steps_per_second': 0.916, 'epoch': 3.0}




{'loss': 0.5457, 'learning_rate': 3.3408477498831917e-06, 'epoch': 3.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3917125463485718, 'eval_runtime': 1.0068, 'eval_samples_per_second': 4.966, 'eval_steps_per_second': 0.993, 'epoch': 3.02}
{'loss': 1.0716, 'learning_rate': 3.2209161399249677e-06, 'epoch': 3.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3921622037887573, 'eval_runtime': 1.002, 'eval_samples_per_second': 4.99, 'eval_steps_per_second': 0.998, 'epoch': 3.04}
{'loss': 0.5797, 'learning_rate': 3.1027622272189572e-06, 'epoch': 3.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3907551765441895, 'eval_runtime': 1.0163, 'eval_samples_per_second': 4.92, 'eval_steps_per_second': 0.984, 'epoch': 3.06}
{'loss': 0.5073, 'learning_rate': 2.9864169955810085e-06, 'epoch': 3.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3910346031188965, 'eval_runtime': 1.054, 'eval_samples_per_second': 4.744, 'eval_steps_per_second': 0.949, 'epoch': 3.08}
{'loss': 0.5619, 'learning_rate': 2.8719109545317102e-06, 'epoch': 3.1}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3925131559371948, 'eval_runtime': 1.0235, 'eval_samples_per_second': 4.885, 'eval_steps_per_second': 0.977, 'epoch': 3.1}
{'loss': 0.7002, 'learning_rate': 2.759274131295787e-06, 'epoch': 3.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3947409391403198, 'eval_runtime': 1.1165, 'eval_samples_per_second': 4.478, 'eval_steps_per_second': 0.896, 'epoch': 3.12}
{'loss': 0.9512, 'learning_rate': 2.648536062927999e-06, 'epoch': 3.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3973573446273804, 'eval_runtime': 1.0989, 'eval_samples_per_second': 4.55, 'eval_steps_per_second': 0.91, 'epoch': 3.14}
{'loss': 0.6535, 'learning_rate': 2.5397257885675396e-06, 'epoch': 3.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3991589546203613, 'eval_runtime': 1.0102, 'eval_samples_per_second': 4.95, 'eval_steps_per_second': 0.99, 'epoch': 3.16}
{'loss': 0.3872, 'learning_rate': 2.432871841823047e-06, 'epoch': 3.18}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4013789892196655, 'eval_runtime': 1.0104, 'eval_samples_per_second': 4.949, 'eval_steps_per_second': 0.99, 'epoch': 3.18}
{'loss': 0.6217, 'learning_rate': 2.328002243290138e-06, 'epoch': 3.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4049789905548096, 'eval_runtime': 1.0717, 'eval_samples_per_second': 4.665, 'eval_steps_per_second': 0.933, 'epoch': 3.2}
{'loss': 0.6864, 'learning_rate': 2.2251444932035094e-06, 'epoch': 3.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.401507019996643, 'eval_runtime': 1.0146, 'eval_samples_per_second': 4.928, 'eval_steps_per_second': 0.986, 'epoch': 3.22}
{'loss': 0.4067, 'learning_rate': 2.124325564225458e-06, 'epoch': 3.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3965617418289185, 'eval_runtime': 0.9969, 'eval_samples_per_second': 5.015, 'eval_steps_per_second': 1.003, 'epoch': 3.24}
{'loss': 0.4893, 'learning_rate': 2.025571894372794e-06, 'epoch': 3.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3938764333724976, 'eval_runtime': 1.0924, 'eval_samples_per_second': 4.577, 'eval_steps_per_second': 0.915, 'epoch': 3.27}
{'loss': 0.5004, 'learning_rate': 1.9289093800839067e-06, 'epoch': 3.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.395090103149414, 'eval_runtime': 0.9963, 'eval_samples_per_second': 5.019, 'eval_steps_per_second': 1.004, 'epoch': 3.29}
{'loss': 0.9775, 'learning_rate': 1.8343633694278895e-06, 'epoch': 3.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3961716890335083, 'eval_runtime': 0.9815, 'eval_samples_per_second': 5.094, 'eval_steps_per_second': 1.019, 'epoch': 3.31}
{'loss': 0.9014, 'learning_rate': 1.7419586554574364e-06, 'epoch': 3.33}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3970109224319458, 'eval_runtime': 1.0293, 'eval_samples_per_second': 4.858, 'eval_steps_per_second': 0.972, 'epoch': 3.33}
{'loss': 0.8747, 'learning_rate': 1.6517194697072903e-06, 'epoch': 3.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3974961042404175, 'eval_runtime': 1.0153, 'eval_samples_per_second': 4.925, 'eval_steps_per_second': 0.985, 'epoch': 3.35}
{'loss': 0.7479, 'learning_rate': 1.5636694758399563e-06, 'epoch': 3.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3981674909591675, 'eval_runtime': 0.9887, 'eval_samples_per_second': 5.057, 'eval_steps_per_second': 1.011, 'epoch': 3.37}
{'loss': 0.5784, 'learning_rate': 1.4778317634403082e-06, 'epoch': 3.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3986719846725464, 'eval_runtime': 1.0117, 'eval_samples_per_second': 4.942, 'eval_steps_per_second': 0.988, 'epoch': 3.39}
{'loss': 0.7599, 'learning_rate': 1.3942288419607476e-06, 'epoch': 3.41}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4003371000289917, 'eval_runtime': 1.1018, 'eval_samples_per_second': 4.538, 'eval_steps_per_second': 0.908, 'epoch': 3.41}
{'loss': 0.425, 'learning_rate': 1.3128826348184886e-06, 'epoch': 3.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018553495407104, 'eval_runtime': 1.0175, 'eval_samples_per_second': 4.914, 'eval_steps_per_second': 0.983, 'epoch': 3.43}
{'loss': 0.5207, 'learning_rate': 1.233814473646524e-06, 'epoch': 3.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.403199553489685, 'eval_runtime': 1.079, 'eval_samples_per_second': 4.634, 'eval_steps_per_second': 0.927, 'epoch': 3.45}
{'loss': 0.8591, 'learning_rate': 1.1570450926997657e-06, 'epoch': 3.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4040404558181763, 'eval_runtime': 1.1012, 'eval_samples_per_second': 4.541, 'eval_steps_per_second': 0.908, 'epoch': 3.47}
{'loss': 0.5839, 'learning_rate': 1.0825946234178575e-06, 'epoch': 3.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4041695594787598, 'eval_runtime': 1.0866, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 0.92, 'epoch': 3.49}
{'loss': 0.7019, 'learning_rate': 1.010482589146048e-06, 'epoch': 3.51}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4045270681381226, 'eval_runtime': 1.0526, 'eval_samples_per_second': 4.75, 'eval_steps_per_second': 0.95, 'epoch': 3.51}
{'loss': 0.4606, 'learning_rate': 9.407279000155311e-07, 'epoch': 3.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4043595790863037, 'eval_runtime': 1.0391, 'eval_samples_per_second': 4.812, 'eval_steps_per_second': 0.962, 'epoch': 3.53}
{'loss': 0.8912, 'learning_rate': 8.733488479845997e-07, 'epoch': 3.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4043699502944946, 'eval_runtime': 0.971, 'eval_samples_per_second': 5.149, 'eval_steps_per_second': 1.03, 'epoch': 3.55}
{'loss': 0.6471, 'learning_rate': 8.083631020418792e-07, 'epoch': 3.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4047304391860962, 'eval_runtime': 1.0159, 'eval_samples_per_second': 4.922, 'eval_steps_per_second': 0.984, 'epoch': 3.57}
{'loss': 0.5152, 'learning_rate': 7.457877035729588e-07, 'epoch': 3.59}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4050421714782715, 'eval_runtime': 1.0543, 'eval_samples_per_second': 4.742, 'eval_steps_per_second': 0.948, 'epoch': 3.59}
{'loss': 0.4845, 'learning_rate': 6.856390618915775e-07, 'epoch': 3.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.403854489326477, 'eval_runtime': 1.0068, 'eval_samples_per_second': 4.966, 'eval_steps_per_second': 0.993, 'epoch': 3.61}
{'loss': 0.6449, 'learning_rate': 6.279329499365649e-07, 'epoch': 3.63}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4031351804733276, 'eval_runtime': 1.0801, 'eval_samples_per_second': 4.629, 'eval_steps_per_second': 0.926, 'epoch': 3.63}
{'loss': 0.7303, 'learning_rate': 5.726845001356573e-07, 'epoch': 3.65}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4025185108184814, 'eval_runtime': 1.0059, 'eval_samples_per_second': 4.971, 'eval_steps_per_second': 0.994, 'epoch': 3.65}
{'loss': 0.4894, 'learning_rate': 5.199082004372958e-07, 'epoch': 3.67}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4022027254104614, 'eval_runtime': 1.0634, 'eval_samples_per_second': 4.702, 'eval_steps_per_second': 0.94, 'epoch': 3.67}
{'loss': 0.6502, 'learning_rate': 4.696178905113913e-07, 'epoch': 3.69}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4021401405334473, 'eval_runtime': 1.0568, 'eval_samples_per_second': 4.731, 'eval_steps_per_second': 0.946, 'epoch': 3.69}
{'loss': 0.8449, 'learning_rate': 4.218267581201296e-07, 'epoch': 3.71}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019750356674194, 'eval_runtime': 1.0934, 'eval_samples_per_second': 4.573, 'eval_steps_per_second': 0.915, 'epoch': 3.71}
{'loss': 0.7148, 'learning_rate': 3.7654733565969826e-07, 'epoch': 3.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019005298614502, 'eval_runtime': 1.0192, 'eval_samples_per_second': 4.906, 'eval_steps_per_second': 0.981, 'epoch': 3.73}
{'loss': 0.7008, 'learning_rate': 3.3379149687388866e-07, 'epoch': 3.76}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019414186477661, 'eval_runtime': 1.0479, 'eval_samples_per_second': 4.772, 'eval_steps_per_second': 0.954, 'epoch': 3.76}
{'loss': 0.5209, 'learning_rate': 2.935704537404083e-07, 'epoch': 3.78}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018391370773315, 'eval_runtime': 1.0049, 'eval_samples_per_second': 4.976, 'eval_steps_per_second': 0.995, 'epoch': 3.78}
{'loss': 1.022, 'learning_rate': 2.5589475353073987e-07, 'epoch': 3.8}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4015519618988037, 'eval_runtime': 1.0483, 'eval_samples_per_second': 4.77, 'eval_steps_per_second': 0.954, 'epoch': 3.8}
{'loss': 0.8529, 'learning_rate': 2.2077427604429435e-07, 'epoch': 3.82}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.40128755569458, 'eval_runtime': 0.9964, 'eval_samples_per_second': 5.018, 'eval_steps_per_second': 1.004, 'epoch': 3.82}
{'loss': 0.4514, 'learning_rate': 1.8821823101760949e-07, 'epoch': 3.84}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4013633728027344, 'eval_runtime': 0.9891, 'eval_samples_per_second': 5.055, 'eval_steps_per_second': 1.011, 'epoch': 3.84}
{'loss': 0.5137, 'learning_rate': 1.5823515570925763e-07, 'epoch': 3.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4015636444091797, 'eval_runtime': 1.1096, 'eval_samples_per_second': 4.506, 'eval_steps_per_second': 0.901, 'epoch': 3.86}
{'loss': 0.9131, 'learning_rate': 1.30832912661093e-07, 'epoch': 3.88}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4016257524490356, 'eval_runtime': 1.0986, 'eval_samples_per_second': 4.551, 'eval_steps_per_second': 0.91, 'epoch': 3.88}
{'loss': 0.5213, 'learning_rate': 1.0601868763643997e-07, 'epoch': 3.9}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4016882181167603, 'eval_runtime': 0.9889, 'eval_samples_per_second': 5.056, 'eval_steps_per_second': 1.011, 'epoch': 3.9}
{'loss': 0.5542, 'learning_rate': 8.379898773574924e-08, 'epoch': 3.92}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4017866849899292, 'eval_runtime': 1.0193, 'eval_samples_per_second': 4.905, 'eval_steps_per_second': 0.981, 'epoch': 3.92}
{'loss': 0.9475, 'learning_rate': 6.417963969022389e-08, 'epoch': 3.94}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018710851669312, 'eval_runtime': 1.1011, 'eval_samples_per_second': 4.541, 'eval_steps_per_second': 0.908, 'epoch': 3.94}
{'loss': 0.6425, 'learning_rate': 4.716578833386054e-08, 'epoch': 3.96}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019339084625244, 'eval_runtime': 1.0211, 'eval_samples_per_second': 4.897, 'eval_steps_per_second': 0.979, 'epoch': 3.96}
{'loss': 0.886, 'learning_rate': 3.2761895254306285e-08, 'epoch': 3.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019321203231812, 'eval_runtime': 1.0558, 'eval_samples_per_second': 4.736, 'eval_steps_per_second': 0.947, 'epoch': 3.98}
{'loss': 0.7525, 'learning_rate': 2.0971737622883515e-08, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4019006490707397, 'eval_runtime': 1.0893, 'eval_samples_per_second': 4.59, 'eval_steps_per_second': 0.918, 'epoch': 4.0}




{'loss': 0.4966, 'learning_rate': 1.179840720409331e-08, 'epoch': 4.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018852710723877, 'eval_runtime': 0.9669, 'eval_samples_per_second': 5.171, 'eval_steps_per_second': 1.034, 'epoch': 4.02}
{'loss': 0.6851, 'learning_rate': 5.2443095448506674e-09, 'epoch': 4.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018710851669312, 'eval_runtime': 0.9897, 'eval_samples_per_second': 5.052, 'eval_steps_per_second': 1.01, 'epoch': 4.04}
{'loss': 0.7414, 'learning_rate': 1.3111633436779792e-09, 'epoch': 4.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018678665161133, 'eval_runtime': 1.0225, 'eval_samples_per_second': 4.89, 'eval_steps_per_second': 0.978, 'epoch': 4.06}
{'loss': 0.4963, 'learning_rate': 0.0, 'epoch': 4.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4018681049346924, 'eval_runtime': 1.093, 'eval_samples_per_second': 4.575, 'eval_steps_per_second': 0.915, 'epoch': 4.08}
{'train_runtime': 2141.2376, 'train_samples_per_second': 0.467, 'train_steps_per_second': 0.467, 'train_loss': 1.2612421960830689, 'epoch': 4.08}


'https://huggingface.co/pechaut/Mistral-7b-instruct-cairo-PEFT/tree/main/'