pip install -q git+https://github.com/huggingface/trl

In [1]:
import os

# Set for this process before any tokenizer is created in the later cells.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Hugging Face Hub identifiers -------------------------------------------
# Base checkpoint that will be fine-tuned.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Instruction dataset used for training and evaluation.
dataset_name = "StarkWizard/cairo-instruct"
# Name of the fine-tuned model (used as the training output directory and push target).
new_model = "StarkWizard/Mistral-7b-instruct-cairo-PEFT"
# NOTE(review): not referenced by the cells visible in this notebook — confirm it is still needed.
hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct"

# --- Training length ----------------------------------------------------------
# Number of passes over the training split.
nb_epochs = 3
# Knob to tweak to get the best out of the model.
# NOTE(review): not passed to TrainingArguments in the cells visible here.
max_steps = 1000

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import os
 
# Load the (fast) tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the checkpoint ships without a dedicated pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute runs in float16.
# The dtype is passed as a torch dtype object rather than its string name.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model onto device 0.
# `token=True` replaces the deprecated `use_auth_token=True`; both tell the
# library to use the locally stored Hugging Face credentials.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    token=True,
    device_map={"": 0},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [3]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

# Turn off the KV cache on the model config; presumably because it is not useful
# while training with gradient checkpointing — confirm.
model.config.use_cache=False
# NOTE(review): `pretraining_tp` looks like a Llama-specific config field; it is
# unclear whether the Mistral model reads it — confirm it has any effect here.
model.config.pretraining_tp=1
# NOTE(review): Mistral's config normally names this field `sliding_window`;
# assigning `window` may only add an unused attribute — confirm the intent.
model.config.window = 256 
# Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()
# Prepare the model (loaded with a 4-bit quantization config in the loading cell)
# for k-bit training; this rebinds `model` to the prepared model.
model = prepare_model_for_kbit_training(model)


Loading Dataset

In [4]:
from datasets import load_dataset

# Load the dataset once and pick both splits from the returned DatasetDict.
# Requesting each split in its own `force_redownload` call fetched the same
# files twice. `verification_mode="no_checks"` is the current spelling of the
# deprecated `ignore_verifications=True`.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]



Downloading readme:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/79.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/805 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/79.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/805 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [5]:



# LoRA adapter configuration: rank 16, alpha 16, dropout 0.05, no bias training,
# applied to the listed projection modules and `lm_head`.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    inference_mode=False,
)

# Wrap the quantized base model with the LoRA adapters.
# NOTE: this rebinds `model`; re-running this cell on the same kernel would wrap
# an already wrapped model, so restart and run from the top instead.
model = get_peft_model(model, peft_config)

# Training hyperparameters. No `eval_steps` is given; in the recorded run an
# evaluation was logged after every training log entry (every `logging_steps`).
# Checkpoints are saved once per epoch and `push_to_hub=True` is enabled.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5,
    num_train_epochs=nb_epochs,
    fp16=False,
    push_to_hub=True,
)

# Supervised fine-tuning on the dataset's "text" column, without packing,
# truncated to 512 tokens, with NEFTune noise alpha 5.
# `model` is already PEFT-wrapped above, so the adapter config is not passed a
# second time; supplying both a wrapped model and `peft_config` is redundant
# and ambiguous.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=512,
    neftune_noise_alpha=5,
)


Map:   0%|          | 0/805 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [6]:

# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Upload the trained model held by the trainer to the Hub under `new_model`.
# NOTE(review): `hub_name` is defined in the configuration cell but is not used
# in the cells shown — confirm which repository id is intended here.
trainer.model.push_to_hub(new_model)

  0%|          | 0/2415 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 4.3051, 'learning_rate': 1.3698630136986302e-06, 'epoch': 0.01}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.651289463043213, 'eval_runtime': 0.973, 'eval_samples_per_second': 5.139, 'eval_steps_per_second': 1.028, 'epoch': 0.01}
{'loss': 6.0984, 'learning_rate': 2.7397260273972604e-06, 'epoch': 0.01}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.645524024963379, 'eval_runtime': 0.9662, 'eval_samples_per_second': 5.175, 'eval_steps_per_second': 1.035, 'epoch': 0.01}
{'loss': 4.5181, 'learning_rate': 4.109589041095891e-06, 'epoch': 0.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.63462495803833, 'eval_runtime': 0.9752, 'eval_samples_per_second': 5.127, 'eval_steps_per_second': 1.025, 'epoch': 0.02}
{'loss': 5.0098, 'learning_rate': 5.479452054794521e-06, 'epoch': 0.02}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.619684219360352, 'eval_runtime': 1.0001, 'eval_samples_per_second': 4.999, 'eval_steps_per_second': 1.0, 'epoch': 0.02}
{'loss': 4.8269, 'learning_rate': 6.849315068493151e-06, 'epoch': 0.03}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.5942702293396, 'eval_runtime': 0.9782, 'eval_samples_per_second': 5.111, 'eval_steps_per_second': 1.022, 'epoch': 0.03}
{'loss': 4.3197, 'learning_rate': 8.219178082191782e-06, 'epoch': 0.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.550282001495361, 'eval_runtime': 1.0956, 'eval_samples_per_second': 4.564, 'eval_steps_per_second': 0.913, 'epoch': 0.04}
{'loss': 4.3894, 'learning_rate': 9.589041095890411e-06, 'epoch': 0.04}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.47790002822876, 'eval_runtime': 1.0404, 'eval_samples_per_second': 4.806, 'eval_steps_per_second': 0.961, 'epoch': 0.04}
{'loss': 3.8205, 'learning_rate': 1.0958904109589042e-05, 'epoch': 0.05}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.369492530822754, 'eval_runtime': 0.9832, 'eval_samples_per_second': 5.086, 'eval_steps_per_second': 1.017, 'epoch': 0.05}
{'loss': 3.8474, 'learning_rate': 1.2328767123287673e-05, 'epoch': 0.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.217759609222412, 'eval_runtime': 1.0958, 'eval_samples_per_second': 4.563, 'eval_steps_per_second': 0.913, 'epoch': 0.06}
{'loss': 4.5352, 'learning_rate': 1.3698630136986302e-05, 'epoch': 0.06}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.059573650360107, 'eval_runtime': 1.023, 'eval_samples_per_second': 4.888, 'eval_steps_per_second': 0.978, 'epoch': 0.06}
{'loss': 3.7626, 'learning_rate': 1.5068493150684933e-05, 'epoch': 0.07}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.8719234466552734, 'eval_runtime': 1.0847, 'eval_samples_per_second': 4.61, 'eval_steps_per_second': 0.922, 'epoch': 0.07}
{'loss': 3.0298, 'learning_rate': 1.6438356164383563e-05, 'epoch': 0.07}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.681899309158325, 'eval_runtime': 1.0931, 'eval_samples_per_second': 4.574, 'eval_steps_per_second': 0.915, 'epoch': 0.07}
{'loss': 3.3875, 'learning_rate': 1.7808219178082194e-05, 'epoch': 0.08}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.5443577766418457, 'eval_runtime': 1.0146, 'eval_samples_per_second': 4.928, 'eval_steps_per_second': 0.986, 'epoch': 0.08}
{'loss': 2.8085, 'learning_rate': 1.9178082191780822e-05, 'epoch': 0.09}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.470305919647217, 'eval_runtime': 0.9877, 'eval_samples_per_second': 5.062, 'eval_steps_per_second': 1.012, 'epoch': 0.09}
{'loss': 2.8103, 'learning_rate': 1.9999964012166784e-05, 'epoch': 0.09}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.4166626930236816, 'eval_runtime': 0.974, 'eval_samples_per_second': 5.133, 'eval_steps_per_second': 1.027, 'epoch': 0.09}
{'loss': 2.8745, 'learning_rate': 1.9999559152017842e-05, 'epoch': 0.1}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.339308500289917, 'eval_runtime': 0.9905, 'eval_samples_per_second': 5.048, 'eval_steps_per_second': 1.01, 'epoch': 0.1}
{'loss': 2.6028, 'learning_rate': 1.999870446520163e-05, 'epoch': 0.11}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.28226900100708, 'eval_runtime': 1.0981, 'eval_samples_per_second': 4.553, 'eval_steps_per_second': 0.911, 'epoch': 0.11}
{'loss': 2.7059, 'learning_rate': 1.9997399990165947e-05, 'epoch': 0.11}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.202972412109375, 'eval_runtime': 1.0907, 'eval_samples_per_second': 4.584, 'eval_steps_per_second': 0.917, 'epoch': 0.11}
{'loss': 2.7878, 'learning_rate': 1.9995645785592137e-05, 'epoch': 0.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.864076614379883, 'eval_runtime': 1.0899, 'eval_samples_per_second': 4.587, 'eval_steps_per_second': 0.917, 'epoch': 0.12}
{'loss': 2.7932, 'learning_rate': 1.999344193039248e-05, 'epoch': 0.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.819537401199341, 'eval_runtime': 0.9873, 'eval_samples_per_second': 5.064, 'eval_steps_per_second': 1.013, 'epoch': 0.12}
{'loss': 2.4089, 'learning_rate': 1.9990788523706636e-05, 'epoch': 0.13}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.770174264907837, 'eval_runtime': 1.0332, 'eval_samples_per_second': 4.839, 'eval_steps_per_second': 0.968, 'epoch': 0.13}
{'loss': 2.3262, 'learning_rate': 1.998768568489717e-05, 'epoch': 0.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.7181313037872314, 'eval_runtime': 0.9905, 'eval_samples_per_second': 5.048, 'eval_steps_per_second': 1.01, 'epoch': 0.14}
{'loss': 2.3868, 'learning_rate': 1.9984133553544204e-05, 'epoch': 0.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.651160717010498, 'eval_runtime': 1.0625, 'eval_samples_per_second': 4.706, 'eval_steps_per_second': 0.941, 'epoch': 0.14}
{'loss': 2.1462, 'learning_rate': 1.998013228943912e-05, 'epoch': 0.15}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.5865116119384766, 'eval_runtime': 1.0867, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 0.92, 'epoch': 0.15}
{'loss': 2.0003, 'learning_rate': 1.997568207257738e-05, 'epoch': 0.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.5381879806518555, 'eval_runtime': 1.1052, 'eval_samples_per_second': 4.524, 'eval_steps_per_second': 0.905, 'epoch': 0.16}
{'loss': 1.9509, 'learning_rate': 1.9970783103150434e-05, 'epoch': 0.16}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.469899892807007, 'eval_runtime': 1.0258, 'eval_samples_per_second': 4.874, 'eval_steps_per_second': 0.975, 'epoch': 0.16}
{'loss': 1.949, 'learning_rate': 1.996543560153671e-05, 'epoch': 0.17}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.41989803314209, 'eval_runtime': 1.0763, 'eval_samples_per_second': 4.646, 'eval_steps_per_second': 0.929, 'epoch': 0.17}
{'loss': 1.8966, 'learning_rate': 1.9959639808291694e-05, 'epoch': 0.17}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.367833375930786, 'eval_runtime': 1.0802, 'eval_samples_per_second': 4.629, 'eval_steps_per_second': 0.926, 'epoch': 0.17}
{'loss': 2.799, 'learning_rate': 1.9953395984137113e-05, 'epoch': 0.18}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.3347058296203613, 'eval_runtime': 1.0211, 'eval_samples_per_second': 4.897, 'eval_steps_per_second': 0.979, 'epoch': 0.18}
{'loss': 2.2398, 'learning_rate': 1.994670440994921e-05, 'epoch': 0.19}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.3252034187316895, 'eval_runtime': 1.0401, 'eval_samples_per_second': 4.807, 'eval_steps_per_second': 0.961, 'epoch': 0.19}
{'loss': 1.941, 'learning_rate': 1.993956538674611e-05, 'epoch': 0.19}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.3223793506622314, 'eval_runtime': 1.0136, 'eval_samples_per_second': 4.933, 'eval_steps_per_second': 0.987, 'epoch': 0.19}
{'loss': 2.013, 'learning_rate': 1.9931979235674274e-05, 'epoch': 0.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.314272880554199, 'eval_runtime': 1.0866, 'eval_samples_per_second': 4.602, 'eval_steps_per_second': 0.92, 'epoch': 0.2}
{'loss': 2.0168, 'learning_rate': 1.9923946297994044e-05, 'epoch': 0.2}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.297389268875122, 'eval_runtime': 0.9759, 'eval_samples_per_second': 5.123, 'eval_steps_per_second': 1.025, 'epoch': 0.2}
{'loss': 1.8037, 'learning_rate': 1.991546693506432e-05, 'epoch': 0.21}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.269404411315918, 'eval_runtime': 0.9768, 'eval_samples_per_second': 5.119, 'eval_steps_per_second': 1.024, 'epoch': 0.21}
{'loss': 2.4074, 'learning_rate': 1.9906541528326266e-05, 'epoch': 0.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.230644941329956, 'eval_runtime': 0.9996, 'eval_samples_per_second': 5.002, 'eval_steps_per_second': 1.0, 'epoch': 0.22}
{'loss': 1.9955, 'learning_rate': 1.9897170479286178e-05, 'epoch': 0.22}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1970455646514893, 'eval_runtime': 1.0922, 'eval_samples_per_second': 4.578, 'eval_steps_per_second': 0.916, 'epoch': 0.22}
{'loss': 1.6786, 'learning_rate': 1.988735420949742e-05, 'epoch': 0.23}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.174060344696045, 'eval_runtime': 1.0586, 'eval_samples_per_second': 4.723, 'eval_steps_per_second': 0.945, 'epoch': 0.23}
{'loss': 2.2734, 'learning_rate': 1.9877093160541452e-05, 'epoch': 0.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1545302867889404, 'eval_runtime': 0.9957, 'eval_samples_per_second': 5.022, 'eval_steps_per_second': 1.004, 'epoch': 0.24}
{'loss': 1.7815, 'learning_rate': 1.9866387794007968e-05, 'epoch': 0.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1268558502197266, 'eval_runtime': 1.0686, 'eval_samples_per_second': 4.679, 'eval_steps_per_second': 0.936, 'epoch': 0.24}
{'loss': 1.8702, 'learning_rate': 1.9855238591474132e-05, 'epoch': 0.25}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.107708215713501, 'eval_runtime': 1.0888, 'eval_samples_per_second': 4.592, 'eval_steps_per_second': 0.918, 'epoch': 0.25}
{'loss': 1.679, 'learning_rate': 1.9843646054482914e-05, 'epoch': 0.25}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.0930638313293457, 'eval_runtime': 0.9792, 'eval_samples_per_second': 5.106, 'eval_steps_per_second': 1.021, 'epoch': 0.25}
{'loss': 1.764, 'learning_rate': 1.9831610704520537e-05, 'epoch': 0.26}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.068228006362915, 'eval_runtime': 1.008, 'eval_samples_per_second': 4.961, 'eval_steps_per_second': 0.992, 'epoch': 0.26}
{'loss': 1.8928, 'learning_rate': 1.9819133082993e-05, 'epoch': 0.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.046252489089966, 'eval_runtime': 1.0836, 'eval_samples_per_second': 4.614, 'eval_steps_per_second': 0.923, 'epoch': 0.27}
{'loss': 1.6704, 'learning_rate': 1.9806213751201746e-05, 'epoch': 0.27}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.0148766040802, 'eval_runtime': 1.089, 'eval_samples_per_second': 4.592, 'eval_steps_per_second': 0.918, 'epoch': 0.27}
{'loss': 1.6566, 'learning_rate': 1.9792853290318384e-05, 'epoch': 0.28}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9985826015472412, 'eval_runtime': 0.9911, 'eval_samples_per_second': 5.045, 'eval_steps_per_second': 1.009, 'epoch': 0.28}
{'loss': 1.6265, 'learning_rate': 1.977905230135857e-05, 'epoch': 0.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9878942966461182, 'eval_runtime': 1.0812, 'eval_samples_per_second': 4.624, 'eval_steps_per_second': 0.925, 'epoch': 0.29}
{'loss': 1.619, 'learning_rate': 1.9764811405154965e-05, 'epoch': 0.29}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9748655557632446, 'eval_runtime': 1.0875, 'eval_samples_per_second': 4.598, 'eval_steps_per_second': 0.92, 'epoch': 0.29}
{'loss': 1.9639, 'learning_rate': 1.9750131242329296e-05, 'epoch': 0.3}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9534509181976318, 'eval_runtime': 1.0096, 'eval_samples_per_second': 4.953, 'eval_steps_per_second': 0.991, 'epoch': 0.3}
{'loss': 1.9363, 'learning_rate': 1.9735012473263545e-05, 'epoch': 0.3}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9344098567962646, 'eval_runtime': 0.9877, 'eval_samples_per_second': 5.062, 'eval_steps_per_second': 1.012, 'epoch': 0.3}
{'loss': 1.1618, 'learning_rate': 1.971945577807025e-05, 'epoch': 0.31}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9247429370880127, 'eval_runtime': 1.0426, 'eval_samples_per_second': 4.796, 'eval_steps_per_second': 0.959, 'epoch': 0.31}
{'loss': 1.8425, 'learning_rate': 1.970346185656189e-05, 'epoch': 0.32}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.91989004611969, 'eval_runtime': 1.0962, 'eval_samples_per_second': 4.561, 'eval_steps_per_second': 0.912, 'epoch': 0.32}
{'loss': 1.5395, 'learning_rate': 1.9687031428219432e-05, 'epoch': 0.32}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9240295886993408, 'eval_runtime': 1.0893, 'eval_samples_per_second': 4.59, 'eval_steps_per_second': 0.918, 'epoch': 0.32}
{'loss': 1.6621, 'learning_rate': 1.9670165232159938e-05, 'epoch': 0.33}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9276336431503296, 'eval_runtime': 1.0312, 'eval_samples_per_second': 4.849, 'eval_steps_per_second': 0.97, 'epoch': 0.33}
{'loss': 1.7791, 'learning_rate': 1.965286402710333e-05, 'epoch': 0.34}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.928799033164978, 'eval_runtime': 1.0423, 'eval_samples_per_second': 4.797, 'eval_steps_per_second': 0.959, 'epoch': 0.34}
{'loss': 1.8282, 'learning_rate': 1.9635128591338265e-05, 'epoch': 0.34}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.929003119468689, 'eval_runtime': 1.1012, 'eval_samples_per_second': 4.541, 'eval_steps_per_second': 0.908, 'epoch': 0.34}
{'loss': 1.4037, 'learning_rate': 1.961695972268711e-05, 'epoch': 0.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9306358098983765, 'eval_runtime': 0.9807, 'eval_samples_per_second': 5.098, 'eval_steps_per_second': 1.02, 'epoch': 0.35}
{'loss': 1.4468, 'learning_rate': 1.9598358238470058e-05, 'epoch': 0.35}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9216468334197998, 'eval_runtime': 1.0526, 'eval_samples_per_second': 4.75, 'eval_steps_per_second': 0.95, 'epoch': 0.35}
{'loss': 1.5645, 'learning_rate': 1.9579324975468363e-05, 'epoch': 0.36}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9026925563812256, 'eval_runtime': 1.064, 'eval_samples_per_second': 4.699, 'eval_steps_per_second': 0.94, 'epoch': 0.36}
{'loss': 1.3587, 'learning_rate': 1.9559860789886697e-05, 'epoch': 0.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8776752948760986, 'eval_runtime': 0.9915, 'eval_samples_per_second': 5.043, 'eval_steps_per_second': 1.009, 'epoch': 0.37}
{'loss': 1.5783, 'learning_rate': 1.9539966557314634e-05, 'epoch': 0.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8569786548614502, 'eval_runtime': 0.9872, 'eval_samples_per_second': 5.065, 'eval_steps_per_second': 1.013, 'epoch': 0.37}
{'loss': 1.3736, 'learning_rate': 1.9519643172687263e-05, 'epoch': 0.38}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8367040157318115, 'eval_runtime': 1.0892, 'eval_samples_per_second': 4.59, 'eval_steps_per_second': 0.918, 'epoch': 0.38}
{'loss': 1.3657, 'learning_rate': 1.949889155024492e-05, 'epoch': 0.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8224834203720093, 'eval_runtime': 1.0464, 'eval_samples_per_second': 4.778, 'eval_steps_per_second': 0.956, 'epoch': 0.39}
{'loss': 1.6, 'learning_rate': 1.9477712623492083e-05, 'epoch': 0.39}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8208906650543213, 'eval_runtime': 1.0642, 'eval_samples_per_second': 4.698, 'eval_steps_per_second': 0.94, 'epoch': 0.39}
{'loss': 1.3556, 'learning_rate': 1.9456107345155346e-05, 'epoch': 0.4}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8269269466400146, 'eval_runtime': 1.045, 'eval_samples_per_second': 4.785, 'eval_steps_per_second': 0.957, 'epoch': 0.4}
{'loss': 1.3053, 'learning_rate': 1.94340766871406e-05, 'epoch': 0.4}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8399032354354858, 'eval_runtime': 1.001, 'eval_samples_per_second': 4.995, 'eval_steps_per_second': 0.999, 'epoch': 0.4}
{'loss': 1.3685, 'learning_rate': 1.941162164048928e-05, 'epoch': 0.41}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.858776330947876, 'eval_runtime': 1.076, 'eval_samples_per_second': 4.647, 'eval_steps_per_second': 0.929, 'epoch': 0.41}
{'loss': 1.5458, 'learning_rate': 1.9388743215333787e-05, 'epoch': 0.42}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8713802099227905, 'eval_runtime': 1.0591, 'eval_samples_per_second': 4.721, 'eval_steps_per_second': 0.944, 'epoch': 0.42}
{'loss': 1.2776, 'learning_rate': 1.9365442440852078e-05, 'epoch': 0.42}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8865792751312256, 'eval_runtime': 1.0057, 'eval_samples_per_second': 4.972, 'eval_steps_per_second': 0.994, 'epoch': 0.42}
{'loss': 1.9151, 'learning_rate': 1.934172036522133e-05, 'epoch': 0.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8839794397354126, 'eval_runtime': 1.0538, 'eval_samples_per_second': 4.745, 'eval_steps_per_second': 0.949, 'epoch': 0.43}
{'loss': 1.3067, 'learning_rate': 1.9317578055570812e-05, 'epoch': 0.43}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8786346912384033, 'eval_runtime': 1.103, 'eval_samples_per_second': 4.533, 'eval_steps_per_second': 0.907, 'epoch': 0.43}
{'loss': 1.4648, 'learning_rate': 1.929301659793387e-05, 'epoch': 0.44}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8800042867660522, 'eval_runtime': 0.9893, 'eval_samples_per_second': 5.054, 'eval_steps_per_second': 1.011, 'epoch': 0.44}
{'loss': 1.3578, 'learning_rate': 1.9268037097199074e-05, 'epoch': 0.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8745524883270264, 'eval_runtime': 1.0911, 'eval_samples_per_second': 4.582, 'eval_steps_per_second': 0.916, 'epoch': 0.45}
{'loss': 1.8549, 'learning_rate': 1.924264067706052e-05, 'epoch': 0.45}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8681930303573608, 'eval_runtime': 1.091, 'eval_samples_per_second': 4.583, 'eval_steps_per_second': 0.917, 'epoch': 0.45}
{'loss': 1.3957, 'learning_rate': 1.9216828479967274e-05, 'epoch': 0.46}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8491700887680054, 'eval_runtime': 1.0126, 'eval_samples_per_second': 4.938, 'eval_steps_per_second': 0.988, 'epoch': 0.46}
{'loss': 1.4982, 'learning_rate': 1.9190601667071983e-05, 'epoch': 0.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8256248235702515, 'eval_runtime': 1.0199, 'eval_samples_per_second': 4.903, 'eval_steps_per_second': 0.981, 'epoch': 0.47}
{'loss': 1.384, 'learning_rate': 1.916396141817865e-05, 'epoch': 0.47}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8096163272857666, 'eval_runtime': 1.0952, 'eval_samples_per_second': 4.566, 'eval_steps_per_second': 0.913, 'epoch': 0.47}
{'loss': 1.3536, 'learning_rate': 1.9136908931689535e-05, 'epoch': 0.48}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7986438274383545, 'eval_runtime': 1.0754, 'eval_samples_per_second': 4.65, 'eval_steps_per_second': 0.93, 'epoch': 0.48}
{'loss': 1.2109, 'learning_rate': 1.9109445424551274e-05, 'epoch': 0.48}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7756812572479248, 'eval_runtime': 1.091, 'eval_samples_per_second': 4.583, 'eval_steps_per_second': 0.917, 'epoch': 0.48}
{'loss': 1.46, 'learning_rate': 1.9081572132200122e-05, 'epoch': 0.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7699886560440063, 'eval_runtime': 1.0314, 'eval_samples_per_second': 4.848, 'eval_steps_per_second': 0.97, 'epoch': 0.49}
{'loss': 1.6749, 'learning_rate': 1.905329030850637e-05, 'epoch': 0.5}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7528040409088135, 'eval_runtime': 0.9925, 'eval_samples_per_second': 5.038, 'eval_steps_per_second': 1.008, 'epoch': 0.5}
{'loss': 1.7566, 'learning_rate': 1.902460122571796e-05, 'epoch': 0.5}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7424724102020264, 'eval_runtime': 1.0867, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 0.92, 'epoch': 0.5}
{'loss': 1.6968, 'learning_rate': 1.8995506174403235e-05, 'epoch': 0.51}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7398145198822021, 'eval_runtime': 1.0709, 'eval_samples_per_second': 4.669, 'eval_steps_per_second': 0.934, 'epoch': 0.51}
{'loss': 1.7462, 'learning_rate': 1.896600646339289e-05, 'epoch': 0.52}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7415310144424438, 'eval_runtime': 1.0893, 'eval_samples_per_second': 4.59, 'eval_steps_per_second': 0.918, 'epoch': 0.52}
{'loss': 1.2093, 'learning_rate': 1.89361034197211e-05, 'epoch': 0.52}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7434028387069702, 'eval_runtime': 1.0907, 'eval_samples_per_second': 4.584, 'eval_steps_per_second': 0.917, 'epoch': 0.52}
{'loss': 1.4057, 'learning_rate': 1.890579838856581e-05, 'epoch': 0.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7503207921981812, 'eval_runtime': 0.9797, 'eval_samples_per_second': 5.103, 'eval_steps_per_second': 1.021, 'epoch': 0.53}
{'loss': 1.3066, 'learning_rate': 1.8875092733188232e-05, 'epoch': 0.53}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7555660009384155, 'eval_runtime': 1.0158, 'eval_samples_per_second': 4.922, 'eval_steps_per_second': 0.984, 'epoch': 0.53}
{'loss': 1.3021, 'learning_rate': 1.8843987834871532e-05, 'epoch': 0.54}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7512025833129883, 'eval_runtime': 0.9815, 'eval_samples_per_second': 5.094, 'eval_steps_per_second': 1.019, 'epoch': 0.54}
{'loss': 1.3985, 'learning_rate': 1.8812485092858662e-05, 'epoch': 0.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7461583614349365, 'eval_runtime': 1.0926, 'eval_samples_per_second': 4.576, 'eval_steps_per_second': 0.915, 'epoch': 0.55}
{'loss': 1.3506, 'learning_rate': 1.8780585924289443e-05, 'epoch': 0.55}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7455589771270752, 'eval_runtime': 0.9794, 'eval_samples_per_second': 5.105, 'eval_steps_per_second': 1.021, 'epoch': 0.55}
{'loss': 1.3965, 'learning_rate': 1.874829176413681e-05, 'epoch': 0.56}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7457339763641357, 'eval_runtime': 1.0131, 'eval_samples_per_second': 4.935, 'eval_steps_per_second': 0.987, 'epoch': 0.56}
{'loss': 1.1251, 'learning_rate': 1.8715604065142243e-05, 'epoch': 0.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7519242763519287, 'eval_runtime': 1.079, 'eval_samples_per_second': 4.634, 'eval_steps_per_second': 0.927, 'epoch': 0.57}
{'loss': 1.2288, 'learning_rate': 1.8682524297750436e-05, 'epoch': 0.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7622143030166626, 'eval_runtime': 0.9917, 'eval_samples_per_second': 5.042, 'eval_steps_per_second': 1.008, 'epoch': 0.57}
{'loss': 0.9003, 'learning_rate': 1.864905395004315e-05, 'epoch': 0.58}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7738120555877686, 'eval_runtime': 1.0137, 'eval_samples_per_second': 4.932, 'eval_steps_per_second': 0.986, 'epoch': 0.58}
{'loss': 1.4472, 'learning_rate': 1.8615194527672247e-05, 'epoch': 0.58}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7770884037017822, 'eval_runtime': 1.0185, 'eval_samples_per_second': 4.909, 'eval_steps_per_second': 0.982, 'epoch': 0.58}
{'loss': 1.4742, 'learning_rate': 1.8580947553791996e-05, 'epoch': 0.59}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7725193500518799, 'eval_runtime': 1.0489, 'eval_samples_per_second': 4.767, 'eval_steps_per_second': 0.953, 'epoch': 0.59}
{'loss': 1.2695, 'learning_rate': 1.8546314568990524e-05, 'epoch': 0.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7587652206420898, 'eval_runtime': 1.0787, 'eval_samples_per_second': 4.635, 'eval_steps_per_second': 0.927, 'epoch': 0.6}
{'loss': 1.3834, 'learning_rate': 1.8511297131220523e-05, 'epoch': 0.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7411916255950928, 'eval_runtime': 1.0889, 'eval_samples_per_second': 4.592, 'eval_steps_per_second': 0.918, 'epoch': 0.6}
