In [1]:
import sys
import os
sys.path.append('..')

import random

import numpy as np
import pandas as pd
import torch
import transformers
import wandb

from transformers import PatchTSTConfig, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import PatchTSTForPrediction as PatchTSTForPredictionOG
from TimeSeriesJEPA.models import PatchTSTModelJEPA, PatchTSTForPrediction
from TimeSeriesJEPA.datasets.benchmark_dataset import BenchmarkDataset
os.environ["WANDB_PROJECT"] = "TimeSeriesJEPA" 

In [2]:
csv_path = r"D:\Coursework\MTS\dataset\ETT-small\ETTh1.csv"
dataset = "ETTh1"
num_workers = 4  # Reduce this if you have low number of CPU cores
batch_size = 10  # Reduce if not enough GPU memory available
context_length = 512
forecast_horizon = 96
patch_length = 8
patch_stride = 8
num_input_channels=7
d_model=64
num_attention_heads=4
num_hidden_layers=3
ffn_dim=64
dropout=0.05
head_dropout=0.2
pooling_type=None
channel_attention=False
scaling="std"
loss="mse"
pre_norm=True
norm_type="batchnorm"
positional_encoding_type = "sincos"

learning_rate=0.0001
num_train_epochs=30
max_steps=200

setting = 'Supervised_{}_PatchTST_sl{}_dm{}_nh{}_el{}_fd{}_bs{}_lr{}_e{}'.format(
        dataset,
        context_length,
        d_model,
        num_attention_heads,
        num_hidden_layers,
        ffn_dim,
        batch_size,
        learning_rate,
        num_train_epochs
        )

In [3]:

trainwindowds = BenchmarkDataset(csv_path=csv_path, context_length=context_length, prediction_length=forecast_horizon, flag='train', returndict=True)
valwindowds = BenchmarkDataset(csv_path=csv_path, context_length=context_length, prediction_length=forecast_horizon, flag='test', returndict=True)
print("dataset loaded, total size: ", len(trainwindowds), len(valwindowds))

Total data size:  (17420, 7)
Total data size:  (17420, 7)
dataset loaded, total size:  8033 2785


In [None]:
print("Loading prediction model")

config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    patch_stride=patch_stride,
    prediction_length=forecast_horizon,
    d_model=d_model,
    num_attention_heads=num_attention_heads,
    num_hidden_layers=num_hidden_layers,
    ffn_dim=ffn_dim,
    dropout=dropout,
    head_dropout=head_dropout,
    pooling_type=pooling_type,
    channel_attention=channel_attention,
    scaling=scaling,
    loss=loss,
    pre_norm=pre_norm,
    norm_type=norm_type,
    positional_encoding_type = positional_encoding_type
)

model = PatchTSTForPredictionOG(config=config)

In [None]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model.parameters()])
print("encoder parameters: ", params)

In [None]:
wandb.init(project="TimeSeriesJEPA", name=setting)


train_args = TrainingArguments(
    output_dir=os.path.join("checkpoints", setting),
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    # num_train_epochs=num_train_epochs,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1,
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    logging_strategy="steps",
    logging_steps=1,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
    report_to="wandb",
    run_name=setting
    )

# # Create a new early stopping callback with faster convergence properties
# early_stopping_callback = EarlyStoppingCallback(
#     early_stopping_patience=15,  # Number of epochs with no improvement after which to stop
#     early_stopping_threshold=0.001,  # Minimum improvement required to consider as improvement
# )

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
    # callbacks=[early_stopping_callback],
)

In [None]:
print("\n\nDoing forecasting training")
trainer.train()

In [None]:
trainer.evaluate(valwindowds)
wandb.finish()

In [4]:
print("Loading pretrained encoder model")
setting = "PatchTST_Time300B_sl512_enc_dm64_nh4_el3_fd64_pred_dm32_nh2_el1_fd32_bs256_lr0.0001_pe10_data_nenc1_npred4_ema_05_1"
encoder_model = PatchTSTModelJEPA.from_pretrained("D:\\Coursework\\MTS\\timeseriesJEPA\\results\\"+setting+"\\checkpoint-57120")
print("Done")
encoder_model.cuda()
run_name = "Finetuned_"+dataset+"_"+setting+"_17136"

Loading pretrained encoder model
Done


In [5]:
run_name

'Finetuned_ETTh1_PatchTST_Time300B_sl512_enc_dm64_nh4_el3_fd64_pred_dm32_nh2_el1_fd32_bs256_lr0.0001_pe10_data_nenc1_npred4_ema_05_1_17136'

In [6]:
print("Loading prediction model")

config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    d_model=64,
    num_attention_heads=4,
    # num_hidden_layers=4,
    ffn_dim=128,
    dropout=0.05,
    head_dropout=head_dropout,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
    positional_encoding_type = "sincos"
)

model = PatchTSTForPrediction(config=config, encoder_model=encoder_model)

Loading prediction model


In [7]:
encoder_model

PatchTSTModelJEPA(
  (scaler): PatchTSTScaler(
    (scaler): PatchTSTStdScaler()
  )
  (patchifier): PatchTSTPatchify()
  (encoder): PatchTSTEncoder(
    (embedder): PatchTSTEmbedding(
      (input_embedding): Linear(in_features=8, out_features=64, bias=True)
    )
    (positional_encoder): PatchTSTPositionalEncoding(
      (positional_dropout): Identity()
    )
    (layers): ModuleList(
      (0-2): 3 x PatchTSTEncoderLayer(
        (self_attn): PatchTSTAttention(
          (k_proj): Linear(in_features=64, out_features=64, bias=True)
          (v_proj): Linear(in_features=64, out_features=64, bias=True)
          (q_proj): Linear(in_features=64, out_features=64, bias=True)
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
        )
        (dropout_path1): Identity()
        (norm_sublayer1): PatchTSTBatchNorm(
          (batchnorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (ff): Sequential(
          (0):

In [8]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("encoder parameters: ", params)

encoder parameters:  393312


In [9]:
params = sum([np.prod(p.size()) for p in encoder_model.parameters()])
print("encoder parameters: ", params)

encoder parameters:  80448


In [10]:
wandb.init(project="TimeSeriesJEPA", name=run_name)

train_args_jepa = TrainingArguments(
    output_dir=os.path.join("checkpoints", run_name),
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    # num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    logging_strategy="steps",
    logging_steps=1,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
    report_to="wandb",
    run_name=run_name
)


trainer_jepa = Trainer(
    model=model,
    args=train_args_jepa,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
)

[34m[1mwandb[0m: Currently logged in as: [33mvg2523[0m ([33mhpml_4[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


max_steps is given, it will override any value given in num_train_epochs


In [11]:
print("\n\nDoing forecasting training")
trainer_jepa.train()



Doing forecasting training


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 1.6188, 'grad_norm': 7.519554615020752, 'learning_rate': 9.95e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.607560396194458, 'eval_runtime': 7.9619, 'eval_samples_per_second': 349.79, 'eval_steps_per_second': 35.042, 'epoch': 0.0}
{'loss': 1.4725, 'grad_norm': 6.31898832321167, 'learning_rate': 9.900000000000001e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.4169403314590454, 'eval_runtime': 8.8913, 'eval_samples_per_second': 313.226, 'eval_steps_per_second': 31.379, 'epoch': 0.0}
{'loss': 1.2051, 'grad_norm': 4.8494062423706055, 'learning_rate': 9.850000000000001e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.2683452367782593, 'eval_runtime': 9.1844, 'eval_samples_per_second': 303.231, 'eval_steps_per_second': 30.378, 'epoch': 0.0}
{'loss': 1.1771, 'grad_norm': 4.383195877075195, 'learning_rate': 9.8e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.1509937047958374, 'eval_runtime': 8.4956, 'eval_samples_per_second': 327.816, 'eval_steps_per_second': 32.84, 'epoch': 0.0}
{'loss': 1.1216, 'grad_norm': 3.6452550888061523, 'learning_rate': 9.75e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.0552183389663696, 'eval_runtime': 8.1423, 'eval_samples_per_second': 342.04, 'eval_steps_per_second': 34.265, 'epoch': 0.01}
{'loss': 0.9157, 'grad_norm': 2.9545040130615234, 'learning_rate': 9.7e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.9874330163002014, 'eval_runtime': 8.9754, 'eval_samples_per_second': 310.294, 'eval_steps_per_second': 31.085, 'epoch': 0.01}
{'loss': 0.7137, 'grad_norm': 2.263284921646118, 'learning_rate': 9.65e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.9404687881469727, 'eval_runtime': 8.6235, 'eval_samples_per_second': 322.955, 'eval_steps_per_second': 32.353, 'epoch': 0.01}
{'loss': 1.118, 'grad_norm': 2.716399669647217, 'learning_rate': 9.6e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.9041896462440491, 'eval_runtime': 9.097, 'eval_samples_per_second': 306.144, 'eval_steps_per_second': 30.669, 'epoch': 0.01}
{'loss': 0.9818, 'grad_norm': 2.5144553184509277, 'learning_rate': 9.55e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8781242966651917, 'eval_runtime': 8.8787, 'eval_samples_per_second': 313.674, 'eval_steps_per_second': 31.424, 'epoch': 0.01}
{'loss': 0.9657, 'grad_norm': 2.489537477493286, 'learning_rate': 9.5e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8529115319252014, 'eval_runtime': 9.3023, 'eval_samples_per_second': 299.388, 'eval_steps_per_second': 29.993, 'epoch': 0.01}
{'loss': 0.9649, 'grad_norm': 2.653700828552246, 'learning_rate': 9.449999999999999e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8327271342277527, 'eval_runtime': 9.333, 'eval_samples_per_second': 298.403, 'eval_steps_per_second': 29.894, 'epoch': 0.01}
{'loss': 0.9716, 'grad_norm': 2.296635389328003, 'learning_rate': 9.4e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.814315140247345, 'eval_runtime': 9.1302, 'eval_samples_per_second': 305.03, 'eval_steps_per_second': 30.558, 'epoch': 0.01}
{'loss': 0.9134, 'grad_norm': 2.1918656826019287, 'learning_rate': 9.350000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.801632285118103, 'eval_runtime': 9.2519, 'eval_samples_per_second': 301.02, 'eval_steps_per_second': 30.156, 'epoch': 0.02}
{'loss': 0.9011, 'grad_norm': 2.294973611831665, 'learning_rate': 9.300000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.791787326335907, 'eval_runtime': 9.4532, 'eval_samples_per_second': 294.611, 'eval_steps_per_second': 29.514, 'epoch': 0.02}
{'loss': 0.8602, 'grad_norm': 2.521223306655884, 'learning_rate': 9.250000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7835450768470764, 'eval_runtime': 9.671, 'eval_samples_per_second': 287.975, 'eval_steps_per_second': 28.849, 'epoch': 0.02}
{'loss': 0.8006, 'grad_norm': 2.102867603302002, 'learning_rate': 9.200000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7750697731971741, 'eval_runtime': 9.957, 'eval_samples_per_second': 279.704, 'eval_steps_per_second': 28.021, 'epoch': 0.02}
{'loss': 0.8673, 'grad_norm': 2.2691445350646973, 'learning_rate': 9.15e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.766700804233551, 'eval_runtime': 8.6345, 'eval_samples_per_second': 322.543, 'eval_steps_per_second': 32.312, 'epoch': 0.02}
{'loss': 0.9562, 'grad_norm': 2.7855703830718994, 'learning_rate': 9.1e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7576996088027954, 'eval_runtime': 10.1112, 'eval_samples_per_second': 275.437, 'eval_steps_per_second': 27.593, 'epoch': 0.02}
{'loss': 0.8139, 'grad_norm': 2.455047130584717, 'learning_rate': 9.05e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7486174702644348, 'eval_runtime': 9.3259, 'eval_samples_per_second': 298.631, 'eval_steps_per_second': 29.917, 'epoch': 0.02}
{'loss': 0.8608, 'grad_norm': 2.158742904663086, 'learning_rate': 9e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7404174208641052, 'eval_runtime': 9.1866, 'eval_samples_per_second': 303.159, 'eval_steps_per_second': 30.37, 'epoch': 0.02}
{'loss': 0.8584, 'grad_norm': 2.4783637523651123, 'learning_rate': 8.950000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7319285273551941, 'eval_runtime': 9.9258, 'eval_samples_per_second': 280.583, 'eval_steps_per_second': 28.109, 'epoch': 0.03}
{'loss': 0.7092, 'grad_norm': 1.7974846363067627, 'learning_rate': 8.900000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7255297303199768, 'eval_runtime': 9.8475, 'eval_samples_per_second': 282.814, 'eval_steps_per_second': 28.332, 'epoch': 0.03}
{'loss': 0.8418, 'grad_norm': 2.3059096336364746, 'learning_rate': 8.850000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.719792366027832, 'eval_runtime': 9.9376, 'eval_samples_per_second': 280.249, 'eval_steps_per_second': 28.075, 'epoch': 0.03}
{'loss': 0.813, 'grad_norm': 2.2088072299957275, 'learning_rate': 8.800000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7140117287635803, 'eval_runtime': 8.5072, 'eval_samples_per_second': 327.371, 'eval_steps_per_second': 32.796, 'epoch': 0.03}
{'loss': 0.7725, 'grad_norm': 1.7636003494262695, 'learning_rate': 8.75e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7091876268386841, 'eval_runtime': 9.4185, 'eval_samples_per_second': 295.695, 'eval_steps_per_second': 29.623, 'epoch': 0.03}
{'loss': 0.7142, 'grad_norm': 1.8274455070495605, 'learning_rate': 8.7e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.705407977104187, 'eval_runtime': 10.0321, 'eval_samples_per_second': 277.61, 'eval_steps_per_second': 27.811, 'epoch': 0.03}
{'loss': 0.7604, 'grad_norm': 1.8461942672729492, 'learning_rate': 8.65e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7025778293609619, 'eval_runtime': 9.2939, 'eval_samples_per_second': 299.66, 'eval_steps_per_second': 30.02, 'epoch': 0.03}
{'loss': 0.8301, 'grad_norm': 2.5847747325897217, 'learning_rate': 8.6e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6970378160476685, 'eval_runtime': 9.5244, 'eval_samples_per_second': 292.406, 'eval_steps_per_second': 29.293, 'epoch': 0.03}
{'loss': 0.823, 'grad_norm': 1.7387770414352417, 'learning_rate': 8.55e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6913326382637024, 'eval_runtime': 10.2144, 'eval_samples_per_second': 272.655, 'eval_steps_per_second': 27.314, 'epoch': 0.04}
{'loss': 0.802, 'grad_norm': 1.931065320968628, 'learning_rate': 8.5e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.685183048248291, 'eval_runtime': 9.0292, 'eval_samples_per_second': 308.445, 'eval_steps_per_second': 30.9, 'epoch': 0.04}
{'loss': 0.8557, 'grad_norm': 2.2608861923217773, 'learning_rate': 8.450000000000001e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6788799166679382, 'eval_runtime': 9.1968, 'eval_samples_per_second': 302.824, 'eval_steps_per_second': 30.337, 'epoch': 0.04}
{'loss': 0.6757, 'grad_norm': 1.5092382431030273, 'learning_rate': 8.4e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6736059188842773, 'eval_runtime': 9.0827, 'eval_samples_per_second': 306.626, 'eval_steps_per_second': 30.718, 'epoch': 0.04}
{'loss': 0.8804, 'grad_norm': 2.3624348640441895, 'learning_rate': 8.35e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6698030233383179, 'eval_runtime': 9.8154, 'eval_samples_per_second': 283.738, 'eval_steps_per_second': 28.425, 'epoch': 0.04}
{'loss': 0.7144, 'grad_norm': 1.8828076124191284, 'learning_rate': 8.3e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6668360829353333, 'eval_runtime': 9.5478, 'eval_samples_per_second': 291.69, 'eval_steps_per_second': 29.221, 'epoch': 0.04}
{'loss': 0.8285, 'grad_norm': 1.8342254161834717, 'learning_rate': 8.25e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6634515523910522, 'eval_runtime': 9.5041, 'eval_samples_per_second': 293.031, 'eval_steps_per_second': 29.356, 'epoch': 0.04}
{'loss': 0.7884, 'grad_norm': 1.925279140472412, 'learning_rate': 8.2e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6601476669311523, 'eval_runtime': 9.4633, 'eval_samples_per_second': 294.296, 'eval_steps_per_second': 29.482, 'epoch': 0.04}
{'loss': 0.691, 'grad_norm': 1.645245909690857, 'learning_rate': 8.15e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6570172905921936, 'eval_runtime': 12.3892, 'eval_samples_per_second': 224.793, 'eval_steps_per_second': 22.52, 'epoch': 0.05}
{'loss': 0.73, 'grad_norm': 1.7645148038864136, 'learning_rate': 8.1e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6536446213722229, 'eval_runtime': 17.9303, 'eval_samples_per_second': 155.324, 'eval_steps_per_second': 15.56, 'epoch': 0.05}
{'loss': 0.9548, 'grad_norm': 1.7982690334320068, 'learning_rate': 8.05e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6517364978790283, 'eval_runtime': 13.858, 'eval_samples_per_second': 200.967, 'eval_steps_per_second': 20.133, 'epoch': 0.05}
{'loss': 0.6898, 'grad_norm': 1.7260304689407349, 'learning_rate': 8e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6496744155883789, 'eval_runtime': 21.7013, 'eval_samples_per_second': 128.333, 'eval_steps_per_second': 12.856, 'epoch': 0.05}
{'loss': 0.7949, 'grad_norm': 1.8724344968795776, 'learning_rate': 7.950000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.646202564239502, 'eval_runtime': 19.7635, 'eval_samples_per_second': 140.917, 'eval_steps_per_second': 14.117, 'epoch': 0.05}
{'loss': 0.7975, 'grad_norm': 1.873264193534851, 'learning_rate': 7.900000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6434595584869385, 'eval_runtime': 19.8076, 'eval_samples_per_second': 140.603, 'eval_steps_per_second': 14.086, 'epoch': 0.05}
{'loss': 0.6491, 'grad_norm': 1.5016828775405884, 'learning_rate': 7.850000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6401530504226685, 'eval_runtime': 18.6047, 'eval_samples_per_second': 149.694, 'eval_steps_per_second': 14.996, 'epoch': 0.05}
{'loss': 0.8061, 'grad_norm': 1.7356047630310059, 'learning_rate': 7.800000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6387872099876404, 'eval_runtime': 19.2661, 'eval_samples_per_second': 144.554, 'eval_steps_per_second': 14.481, 'epoch': 0.05}
{'loss': 0.6491, 'grad_norm': 1.6132160425186157, 'learning_rate': 7.75e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6362088322639465, 'eval_runtime': 21.6158, 'eval_samples_per_second': 128.841, 'eval_steps_per_second': 12.907, 'epoch': 0.06}
{'loss': 0.7173, 'grad_norm': 2.1133272647857666, 'learning_rate': 7.7e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.632082462310791, 'eval_runtime': 19.1954, 'eval_samples_per_second': 145.087, 'eval_steps_per_second': 14.535, 'epoch': 0.06}
{'loss': 0.6994, 'grad_norm': 1.4951704740524292, 'learning_rate': 7.65e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6274380087852478, 'eval_runtime': 21.684, 'eval_samples_per_second': 128.436, 'eval_steps_per_second': 12.867, 'epoch': 0.06}
{'loss': 0.6388, 'grad_norm': 1.6945589780807495, 'learning_rate': 7.6e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6235431432723999, 'eval_runtime': 17.2176, 'eval_samples_per_second': 161.753, 'eval_steps_per_second': 16.204, 'epoch': 0.06}
{'loss': 0.642, 'grad_norm': 1.5877244472503662, 'learning_rate': 7.55e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6224969625473022, 'eval_runtime': 19.6657, 'eval_samples_per_second': 141.617, 'eval_steps_per_second': 14.187, 'epoch': 0.06}
{'loss': 0.7317, 'grad_norm': 1.640670895576477, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6222743988037109, 'eval_runtime': 15.7985, 'eval_samples_per_second': 176.282, 'eval_steps_per_second': 17.66, 'epoch': 0.06}
{'loss': 0.6778, 'grad_norm': 1.6114736795425415, 'learning_rate': 7.450000000000001e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6238296627998352, 'eval_runtime': 20.1059, 'eval_samples_per_second': 138.516, 'eval_steps_per_second': 13.876, 'epoch': 0.06}
{'loss': 0.8419, 'grad_norm': 2.0221221446990967, 'learning_rate': 7.4e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6252517104148865, 'eval_runtime': 24.6639, 'eval_samples_per_second': 112.918, 'eval_steps_per_second': 11.312, 'epoch': 0.06}
{'loss': 0.7185, 'grad_norm': 1.7596839666366577, 'learning_rate': 7.35e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6263050436973572, 'eval_runtime': 19.9567, 'eval_samples_per_second': 139.552, 'eval_steps_per_second': 13.98, 'epoch': 0.07}
{'loss': 0.6476, 'grad_norm': 1.789554238319397, 'learning_rate': 7.3e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6258611679077148, 'eval_runtime': 17.5535, 'eval_samples_per_second': 158.658, 'eval_steps_per_second': 15.894, 'epoch': 0.07}
{'loss': 0.9297, 'grad_norm': 1.85493004322052, 'learning_rate': 7.25e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.62535160779953, 'eval_runtime': 17.3954, 'eval_samples_per_second': 160.1, 'eval_steps_per_second': 16.039, 'epoch': 0.07}
{'loss': 0.9074, 'grad_norm': 1.7028961181640625, 'learning_rate': 7.2e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6252819299697876, 'eval_runtime': 16.9513, 'eval_samples_per_second': 164.294, 'eval_steps_per_second': 16.459, 'epoch': 0.07}
{'loss': 0.8112, 'grad_norm': 1.8108000755310059, 'learning_rate': 7.15e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.625129222869873, 'eval_runtime': 22.7912, 'eval_samples_per_second': 122.196, 'eval_steps_per_second': 12.242, 'epoch': 0.07}
{'loss': 0.7948, 'grad_norm': 1.9391603469848633, 'learning_rate': 7.1e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6229616403579712, 'eval_runtime': 20.2297, 'eval_samples_per_second': 137.669, 'eval_steps_per_second': 13.792, 'epoch': 0.07}
{'loss': 0.6165, 'grad_norm': 1.6407979726791382, 'learning_rate': 7.05e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6196973919868469, 'eval_runtime': 13.9307, 'eval_samples_per_second': 199.919, 'eval_steps_per_second': 20.028, 'epoch': 0.07}
{'loss': 0.7766, 'grad_norm': 1.7169671058654785, 'learning_rate': 7e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6182217597961426, 'eval_runtime': 19.246, 'eval_samples_per_second': 144.705, 'eval_steps_per_second': 14.496, 'epoch': 0.07}
{'loss': 0.7753, 'grad_norm': 1.4511038064956665, 'learning_rate': 6.95e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6166025400161743, 'eval_runtime': 14.468, 'eval_samples_per_second': 192.493, 'eval_steps_per_second': 19.284, 'epoch': 0.08}
{'loss': 0.7663, 'grad_norm': 1.9400206804275513, 'learning_rate': 6.9e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6153066754341125, 'eval_runtime': 21.1626, 'eval_samples_per_second': 131.6, 'eval_steps_per_second': 13.184, 'epoch': 0.08}
{'loss': 0.659, 'grad_norm': 1.8985393047332764, 'learning_rate': 6.850000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6121575236320496, 'eval_runtime': 18.9764, 'eval_samples_per_second': 146.761, 'eval_steps_per_second': 14.702, 'epoch': 0.08}
{'loss': 0.5934, 'grad_norm': 1.5300694704055786, 'learning_rate': 6.800000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6089303493499756, 'eval_runtime': 21.9952, 'eval_samples_per_second': 126.618, 'eval_steps_per_second': 12.685, 'epoch': 0.08}
{'loss': 0.7668, 'grad_norm': 1.7559036016464233, 'learning_rate': 6.750000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.605469822883606, 'eval_runtime': 14.472, 'eval_samples_per_second': 192.441, 'eval_steps_per_second': 19.279, 'epoch': 0.08}
{'loss': 0.777, 'grad_norm': 1.8704259395599365, 'learning_rate': 6.7e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6003857851028442, 'eval_runtime': 13.2798, 'eval_samples_per_second': 209.717, 'eval_steps_per_second': 21.009, 'epoch': 0.08}
{'loss': 0.6558, 'grad_norm': 1.2258787155151367, 'learning_rate': 6.65e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5969748497009277, 'eval_runtime': 18.2175, 'eval_samples_per_second': 152.875, 'eval_steps_per_second': 15.315, 'epoch': 0.08}
{'loss': 0.6818, 'grad_norm': 1.7250014543533325, 'learning_rate': 6.6e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5930539965629578, 'eval_runtime': 21.9105, 'eval_samples_per_second': 127.108, 'eval_steps_per_second': 12.734, 'epoch': 0.08}
{'loss': 0.6382, 'grad_norm': 1.7089920043945312, 'learning_rate': 6.55e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5896604657173157, 'eval_runtime': 14.851, 'eval_samples_per_second': 187.53, 'eval_steps_per_second': 18.787, 'epoch': 0.09}
{'loss': 0.6755, 'grad_norm': 1.5143076181411743, 'learning_rate': 6.500000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5868321061134338, 'eval_runtime': 12.8802, 'eval_samples_per_second': 216.223, 'eval_steps_per_second': 21.661, 'epoch': 0.09}
{'loss': 0.6344, 'grad_norm': 1.355013370513916, 'learning_rate': 6.450000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5841982960700989, 'eval_runtime': 13.7403, 'eval_samples_per_second': 202.689, 'eval_steps_per_second': 20.305, 'epoch': 0.09}
{'loss': 0.9038, 'grad_norm': 2.0964670181274414, 'learning_rate': 6.400000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5824336409568787, 'eval_runtime': 13.1713, 'eval_samples_per_second': 211.445, 'eval_steps_per_second': 21.182, 'epoch': 0.09}
{'loss': 0.6623, 'grad_norm': 1.9836039543151855, 'learning_rate': 6.35e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5810835361480713, 'eval_runtime': 15.2795, 'eval_samples_per_second': 182.27, 'eval_steps_per_second': 18.26, 'epoch': 0.09}
{'loss': 0.6714, 'grad_norm': 1.8433059453964233, 'learning_rate': 6.3e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5797852277755737, 'eval_runtime': 22.1797, 'eval_samples_per_second': 125.565, 'eval_steps_per_second': 12.579, 'epoch': 0.09}
{'loss': 0.6672, 'grad_norm': 1.6841570138931274, 'learning_rate': 6.25e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5782339572906494, 'eval_runtime': 17.4373, 'eval_samples_per_second': 159.716, 'eval_steps_per_second': 16.0, 'epoch': 0.09}
{'loss': 0.7245, 'grad_norm': 1.7824122905731201, 'learning_rate': 6.2e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5762900710105896, 'eval_runtime': 13.5654, 'eval_samples_per_second': 205.301, 'eval_steps_per_second': 20.567, 'epoch': 0.09}
{'loss': 0.6798, 'grad_norm': 1.657588243484497, 'learning_rate': 6.15e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5742844343185425, 'eval_runtime': 14.3657, 'eval_samples_per_second': 193.865, 'eval_steps_per_second': 19.421, 'epoch': 0.1}
{'loss': 0.701, 'grad_norm': 1.630065679550171, 'learning_rate': 6.1e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5724282264709473, 'eval_runtime': 15.2744, 'eval_samples_per_second': 182.331, 'eval_steps_per_second': 18.266, 'epoch': 0.1}
{'loss': 0.6853, 'grad_norm': 1.8007293939590454, 'learning_rate': 6.05e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5707835555076599, 'eval_runtime': 13.4911, 'eval_samples_per_second': 206.432, 'eval_steps_per_second': 20.68, 'epoch': 0.1}
{'loss': 0.6808, 'grad_norm': 1.4295599460601807, 'learning_rate': 6e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5697216391563416, 'eval_runtime': 15.2022, 'eval_samples_per_second': 183.198, 'eval_steps_per_second': 18.353, 'epoch': 0.1}
{'loss': 0.658, 'grad_norm': 1.4560422897338867, 'learning_rate': 5.95e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.56908118724823, 'eval_runtime': 13.6999, 'eval_samples_per_second': 203.287, 'eval_steps_per_second': 20.365, 'epoch': 0.1}
{'loss': 0.9009, 'grad_norm': 1.6588668823242188, 'learning_rate': 5.9e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5682172775268555, 'eval_runtime': 12.4467, 'eval_samples_per_second': 223.754, 'eval_steps_per_second': 22.416, 'epoch': 0.1}
{'loss': 0.7173, 'grad_norm': 1.7349097728729248, 'learning_rate': 5.85e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5684437155723572, 'eval_runtime': 23.2114, 'eval_samples_per_second': 119.984, 'eval_steps_per_second': 12.02, 'epoch': 0.1}
{'loss': 0.743, 'grad_norm': 1.5615774393081665, 'learning_rate': 5.8e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5694815516471863, 'eval_runtime': 12.6629, 'eval_samples_per_second': 219.934, 'eval_steps_per_second': 22.033, 'epoch': 0.1}
{'loss': 0.7575, 'grad_norm': 2.0948400497436523, 'learning_rate': 5.7499999999999995e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5705386400222778, 'eval_runtime': 12.2388, 'eval_samples_per_second': 227.555, 'eval_steps_per_second': 22.796, 'epoch': 0.11}
{'loss': 0.6526, 'grad_norm': 1.8712512254714966, 'learning_rate': 5.6999999999999996e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5715968608856201, 'eval_runtime': 12.4608, 'eval_samples_per_second': 223.5, 'eval_steps_per_second': 22.39, 'epoch': 0.11}
{'loss': 0.6954, 'grad_norm': 1.5647177696228027, 'learning_rate': 5.65e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5737383961677551, 'eval_runtime': 12.4482, 'eval_samples_per_second': 223.727, 'eval_steps_per_second': 22.413, 'epoch': 0.11}
{'loss': 0.6982, 'grad_norm': 1.8727302551269531, 'learning_rate': 5.6000000000000006e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.573963463306427, 'eval_runtime': 12.182, 'eval_samples_per_second': 228.615, 'eval_steps_per_second': 22.903, 'epoch': 0.11}
{'loss': 0.7591, 'grad_norm': 1.8358768224716187, 'learning_rate': 5.550000000000001e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5732495784759521, 'eval_runtime': 12.2226, 'eval_samples_per_second': 227.856, 'eval_steps_per_second': 22.826, 'epoch': 0.11}
{'loss': 0.6567, 'grad_norm': 2.058584451675415, 'learning_rate': 5.500000000000001e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5722103118896484, 'eval_runtime': 12.1298, 'eval_samples_per_second': 229.6, 'eval_steps_per_second': 23.001, 'epoch': 0.11}
{'loss': 0.5445, 'grad_norm': 1.3102307319641113, 'learning_rate': 5.45e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5703549385070801, 'eval_runtime': 12.1826, 'eval_samples_per_second': 228.604, 'eval_steps_per_second': 22.901, 'epoch': 0.11}
{'loss': 0.8118, 'grad_norm': 2.001948833465576, 'learning_rate': 5.4000000000000005e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5689282417297363, 'eval_runtime': 12.2535, 'eval_samples_per_second': 227.283, 'eval_steps_per_second': 22.769, 'epoch': 0.11}
{'loss': 0.6321, 'grad_norm': 1.5404586791992188, 'learning_rate': 5.3500000000000006e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5679359436035156, 'eval_runtime': 12.1934, 'eval_samples_per_second': 228.402, 'eval_steps_per_second': 22.881, 'epoch': 0.12}
{'loss': 0.7188, 'grad_norm': 1.6202837228775024, 'learning_rate': 5.300000000000001e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.566504716873169, 'eval_runtime': 12.4568, 'eval_samples_per_second': 223.573, 'eval_steps_per_second': 22.397, 'epoch': 0.12}
{'loss': 0.6603, 'grad_norm': 1.5652037858963013, 'learning_rate': 5.25e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5659753680229187, 'eval_runtime': 23.0479, 'eval_samples_per_second': 120.835, 'eval_steps_per_second': 12.105, 'epoch': 0.12}
{'loss': 0.7121, 'grad_norm': 1.9472382068634033, 'learning_rate': 5.2000000000000004e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5665712356567383, 'eval_runtime': 20.9573, 'eval_samples_per_second': 132.889, 'eval_steps_per_second': 13.313, 'epoch': 0.12}
{'loss': 0.694, 'grad_norm': 1.4772673845291138, 'learning_rate': 5.1500000000000005e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5659028887748718, 'eval_runtime': 20.7823, 'eval_samples_per_second': 134.008, 'eval_steps_per_second': 13.425, 'epoch': 0.12}
{'loss': 0.7669, 'grad_norm': 2.1975479125976562, 'learning_rate': 5.1000000000000006e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5659472346305847, 'eval_runtime': 15.0865, 'eval_samples_per_second': 184.602, 'eval_steps_per_second': 18.493, 'epoch': 0.12}
{'loss': 0.6915, 'grad_norm': 1.6111462116241455, 'learning_rate': 5.05e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5651167035102844, 'eval_runtime': 12.9665, 'eval_samples_per_second': 214.784, 'eval_steps_per_second': 21.517, 'epoch': 0.12}
{'loss': 0.5761, 'grad_norm': 1.6174280643463135, 'learning_rate': 5e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.563275933265686, 'eval_runtime': 13.6947, 'eval_samples_per_second': 203.363, 'eval_steps_per_second': 20.373, 'epoch': 0.12}
{'loss': 0.6209, 'grad_norm': 1.4149484634399414, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5615459084510803, 'eval_runtime': 12.6272, 'eval_samples_per_second': 220.556, 'eval_steps_per_second': 22.095, 'epoch': 0.13}
{'loss': 0.9241, 'grad_norm': 2.0765280723571777, 'learning_rate': 4.9e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5598450303077698, 'eval_runtime': 13.0976, 'eval_samples_per_second': 212.634, 'eval_steps_per_second': 21.302, 'epoch': 0.13}
{'loss': 0.8799, 'grad_norm': 2.220933675765991, 'learning_rate': 4.85e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5592681169509888, 'eval_runtime': 12.3924, 'eval_samples_per_second': 224.734, 'eval_steps_per_second': 22.514, 'epoch': 0.13}
{'loss': 0.605, 'grad_norm': 1.347348928451538, 'learning_rate': 4.8e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5583553314208984, 'eval_runtime': 12.8499, 'eval_samples_per_second': 216.734, 'eval_steps_per_second': 21.712, 'epoch': 0.13}
{'loss': 0.6815, 'grad_norm': 1.6251602172851562, 'learning_rate': 4.75e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.558383584022522, 'eval_runtime': 13.059, 'eval_samples_per_second': 213.262, 'eval_steps_per_second': 21.365, 'epoch': 0.13}
{'loss': 0.7239, 'grad_norm': 1.4461580514907837, 'learning_rate': 4.7e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5579475164413452, 'eval_runtime': 13.2365, 'eval_samples_per_second': 210.402, 'eval_steps_per_second': 21.078, 'epoch': 0.13}
{'loss': 0.7714, 'grad_norm': 1.7601290941238403, 'learning_rate': 4.6500000000000005e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5576735138893127, 'eval_runtime': 13.3929, 'eval_samples_per_second': 207.945, 'eval_steps_per_second': 20.832, 'epoch': 0.13}
{'loss': 0.6604, 'grad_norm': 1.5650584697723389, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5576792359352112, 'eval_runtime': 13.7928, 'eval_samples_per_second': 201.917, 'eval_steps_per_second': 20.228, 'epoch': 0.13}
{'loss': 0.6144, 'grad_norm': 1.1597437858581543, 'learning_rate': 4.55e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.557960569858551, 'eval_runtime': 12.9668, 'eval_samples_per_second': 214.78, 'eval_steps_per_second': 21.517, 'epoch': 0.14}
{'loss': 0.7049, 'grad_norm': 1.8543527126312256, 'learning_rate': 4.5e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5569501519203186, 'eval_runtime': 14.1866, 'eval_samples_per_second': 196.312, 'eval_steps_per_second': 19.666, 'epoch': 0.14}
{'loss': 0.8067, 'grad_norm': 2.127140760421753, 'learning_rate': 4.4500000000000004e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5554860830307007, 'eval_runtime': 15.701, 'eval_samples_per_second': 177.377, 'eval_steps_per_second': 17.77, 'epoch': 0.14}
{'loss': 0.7722, 'grad_norm': 1.968812346458435, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.554149866104126, 'eval_runtime': 16.5201, 'eval_samples_per_second': 168.583, 'eval_steps_per_second': 16.889, 'epoch': 0.14}
{'loss': 0.6388, 'grad_norm': 1.6399065256118774, 'learning_rate': 4.35e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5535171031951904, 'eval_runtime': 17.3309, 'eval_samples_per_second': 160.696, 'eval_steps_per_second': 16.098, 'epoch': 0.14}
{'loss': 0.7088, 'grad_norm': 1.6848498582839966, 'learning_rate': 4.3e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5528499484062195, 'eval_runtime': 15.98, 'eval_samples_per_second': 174.281, 'eval_steps_per_second': 17.459, 'epoch': 0.14}
{'loss': 0.7331, 'grad_norm': 2.0423800945281982, 'learning_rate': 4.25e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5537774562835693, 'eval_runtime': 14.0666, 'eval_samples_per_second': 197.987, 'eval_steps_per_second': 19.834, 'epoch': 0.14}
{'loss': 0.6837, 'grad_norm': 1.5136648416519165, 'learning_rate': 4.2e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5549222230911255, 'eval_runtime': 13.5612, 'eval_samples_per_second': 205.366, 'eval_steps_per_second': 20.573, 'epoch': 0.14}
{'loss': 0.7012, 'grad_norm': 1.554426670074463, 'learning_rate': 4.15e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5553197264671326, 'eval_runtime': 12.4842, 'eval_samples_per_second': 223.081, 'eval_steps_per_second': 22.348, 'epoch': 0.15}
{'loss': 0.7502, 'grad_norm': 1.4580152034759521, 'learning_rate': 4.1e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5564005374908447, 'eval_runtime': 12.7316, 'eval_samples_per_second': 218.747, 'eval_steps_per_second': 21.914, 'epoch': 0.15}
{'loss': 0.6445, 'grad_norm': 1.4816391468048096, 'learning_rate': 4.05e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5563676953315735, 'eval_runtime': 12.2705, 'eval_samples_per_second': 226.967, 'eval_steps_per_second': 22.737, 'epoch': 0.15}
{'loss': 0.7926, 'grad_norm': 1.6861342191696167, 'learning_rate': 4e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5559736490249634, 'eval_runtime': 12.1212, 'eval_samples_per_second': 229.763, 'eval_steps_per_second': 23.018, 'epoch': 0.15}
{'loss': 0.6931, 'grad_norm': 1.5395028591156006, 'learning_rate': 3.9500000000000005e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5553233027458191, 'eval_runtime': 12.326, 'eval_samples_per_second': 225.945, 'eval_steps_per_second': 22.635, 'epoch': 0.15}
{'loss': 0.6486, 'grad_norm': 1.6911847591400146, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5544948577880859, 'eval_runtime': 12.5813, 'eval_samples_per_second': 221.361, 'eval_steps_per_second': 22.176, 'epoch': 0.15}
{'loss': 0.7155, 'grad_norm': 1.7433277368545532, 'learning_rate': 3.85e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5526735782623291, 'eval_runtime': 12.3165, 'eval_samples_per_second': 226.119, 'eval_steps_per_second': 22.652, 'epoch': 0.15}
{'loss': 0.5764, 'grad_norm': 1.7290575504302979, 'learning_rate': 3.8e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5514786839485168, 'eval_runtime': 12.0081, 'eval_samples_per_second': 231.927, 'eval_steps_per_second': 23.234, 'epoch': 0.15}
{'loss': 0.7744, 'grad_norm': 1.9286189079284668, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5498279333114624, 'eval_runtime': 12.4606, 'eval_samples_per_second': 223.505, 'eval_steps_per_second': 22.391, 'epoch': 0.16}
{'loss': 0.6247, 'grad_norm': 1.6408281326293945, 'learning_rate': 3.7e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.548819899559021, 'eval_runtime': 12.2988, 'eval_samples_per_second': 226.445, 'eval_steps_per_second': 22.685, 'epoch': 0.16}
{'loss': 0.6525, 'grad_norm': 1.513285756111145, 'learning_rate': 3.65e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5476184487342834, 'eval_runtime': 12.1688, 'eval_samples_per_second': 228.864, 'eval_steps_per_second': 22.928, 'epoch': 0.16}
{'loss': 0.7012, 'grad_norm': 1.4775071144104004, 'learning_rate': 3.6e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5471243262290955, 'eval_runtime': 12.1368, 'eval_samples_per_second': 229.468, 'eval_steps_per_second': 22.988, 'epoch': 0.16}
{'loss': 0.6028, 'grad_norm': 1.4653894901275635, 'learning_rate': 3.55e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5463509559631348, 'eval_runtime': 12.1994, 'eval_samples_per_second': 228.289, 'eval_steps_per_second': 22.87, 'epoch': 0.16}
{'loss': 0.6427, 'grad_norm': 1.5056918859481812, 'learning_rate': 3.5e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5455015897750854, 'eval_runtime': 12.3741, 'eval_samples_per_second': 225.067, 'eval_steps_per_second': 22.547, 'epoch': 0.16}
{'loss': 0.605, 'grad_norm': 1.5010862350463867, 'learning_rate': 3.45e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5446032881736755, 'eval_runtime': 13.2005, 'eval_samples_per_second': 210.977, 'eval_steps_per_second': 21.136, 'epoch': 0.16}
{'loss': 0.8417, 'grad_norm': 1.936002492904663, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5440365076065063, 'eval_runtime': 12.3204, 'eval_samples_per_second': 226.048, 'eval_steps_per_second': 22.645, 'epoch': 0.16}
{'loss': 0.607, 'grad_norm': 1.800447940826416, 'learning_rate': 3.35e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5445457696914673, 'eval_runtime': 11.9185, 'eval_samples_per_second': 233.669, 'eval_steps_per_second': 23.409, 'epoch': 0.17}
{'loss': 0.6964, 'grad_norm': 1.7804754972457886, 'learning_rate': 3.3e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5447695851325989, 'eval_runtime': 12.2052, 'eval_samples_per_second': 228.181, 'eval_steps_per_second': 22.859, 'epoch': 0.17}
{'loss': 0.536, 'grad_norm': 1.6370898485183716, 'learning_rate': 3.2500000000000004e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.545932948589325, 'eval_runtime': 12.5687, 'eval_samples_per_second': 221.582, 'eval_steps_per_second': 22.198, 'epoch': 0.17}
{'loss': 0.7262, 'grad_norm': 1.5812853574752808, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5466992855072021, 'eval_runtime': 12.1213, 'eval_samples_per_second': 229.76, 'eval_steps_per_second': 23.017, 'epoch': 0.17}
{'loss': 0.6561, 'grad_norm': 1.548669457435608, 'learning_rate': 3.15e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5466105937957764, 'eval_runtime': 12.1249, 'eval_samples_per_second': 229.692, 'eval_steps_per_second': 23.01, 'epoch': 0.17}
{'loss': 0.8171, 'grad_norm': 1.8322139978408813, 'learning_rate': 3.1e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5462417006492615, 'eval_runtime': 12.1008, 'eval_samples_per_second': 230.15, 'eval_steps_per_second': 23.056, 'epoch': 0.17}
{'loss': 0.6521, 'grad_norm': 1.5780137777328491, 'learning_rate': 3.05e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5458855628967285, 'eval_runtime': 13.2489, 'eval_samples_per_second': 210.205, 'eval_steps_per_second': 21.058, 'epoch': 0.17}
{'loss': 0.6367, 'grad_norm': 1.4451229572296143, 'learning_rate': 3e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5450788736343384, 'eval_runtime': 22.4915, 'eval_samples_per_second': 123.825, 'eval_steps_per_second': 12.405, 'epoch': 0.17}
{'loss': 0.7092, 'grad_norm': 1.8418043851852417, 'learning_rate': 2.95e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5458484888076782, 'eval_runtime': 16.2055, 'eval_samples_per_second': 171.856, 'eval_steps_per_second': 17.216, 'epoch': 0.18}
{'loss': 0.5814, 'grad_norm': 1.3955429792404175, 'learning_rate': 2.9e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5463997721672058, 'eval_runtime': 13.4318, 'eval_samples_per_second': 207.344, 'eval_steps_per_second': 20.772, 'epoch': 0.18}
{'loss': 0.6554, 'grad_norm': 1.5485846996307373, 'learning_rate': 2.8499999999999998e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5473912954330444, 'eval_runtime': 13.3651, 'eval_samples_per_second': 208.378, 'eval_steps_per_second': 20.875, 'epoch': 0.18}
{'loss': 0.64, 'grad_norm': 1.6787248849868774, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5478713512420654, 'eval_runtime': 13.2306, 'eval_samples_per_second': 210.496, 'eval_steps_per_second': 21.087, 'epoch': 0.18}
{'loss': 0.6398, 'grad_norm': 1.5225932598114014, 'learning_rate': 2.7500000000000004e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.547752320766449, 'eval_runtime': 14.3765, 'eval_samples_per_second': 193.719, 'eval_steps_per_second': 19.407, 'epoch': 0.18}
{'loss': 0.6284, 'grad_norm': 1.956007719039917, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5467544794082642, 'eval_runtime': 21.8613, 'eval_samples_per_second': 127.394, 'eval_steps_per_second': 12.762, 'epoch': 0.18}
{'loss': 0.6208, 'grad_norm': 1.2649116516113281, 'learning_rate': 2.6500000000000004e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5472265481948853, 'eval_runtime': 21.6783, 'eval_samples_per_second': 128.47, 'eval_steps_per_second': 12.87, 'epoch': 0.18}
{'loss': 0.6769, 'grad_norm': 1.804410696029663, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5463798642158508, 'eval_runtime': 20.9312, 'eval_samples_per_second': 133.055, 'eval_steps_per_second': 13.329, 'epoch': 0.18}
{'loss': 0.5115, 'grad_norm': 1.2387770414352417, 'learning_rate': 2.5500000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5446130633354187, 'eval_runtime': 20.6532, 'eval_samples_per_second': 134.846, 'eval_steps_per_second': 13.509, 'epoch': 0.19}
{'loss': 0.6268, 'grad_norm': 1.2899013757705688, 'learning_rate': 2.5e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5428313612937927, 'eval_runtime': 21.4081, 'eval_samples_per_second': 130.091, 'eval_steps_per_second': 13.032, 'epoch': 0.19}
{'loss': 0.5782, 'grad_norm': 1.2354971170425415, 'learning_rate': 2.45e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5424527525901794, 'eval_runtime': 19.829, 'eval_samples_per_second': 140.451, 'eval_steps_per_second': 14.07, 'epoch': 0.19}
{'loss': 0.7053, 'grad_norm': 1.3151615858078003, 'learning_rate': 2.4e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5416966080665588, 'eval_runtime': 20.2316, 'eval_samples_per_second': 137.656, 'eval_steps_per_second': 13.79, 'epoch': 0.19}
{'loss': 0.7891, 'grad_norm': 1.7811373472213745, 'learning_rate': 2.35e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.542095959186554, 'eval_runtime': 20.1562, 'eval_samples_per_second': 138.171, 'eval_steps_per_second': 13.842, 'epoch': 0.19}
{'loss': 0.7548, 'grad_norm': 1.6664915084838867, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5411327481269836, 'eval_runtime': 21.9189, 'eval_samples_per_second': 127.06, 'eval_steps_per_second': 12.729, 'epoch': 0.19}
{'loss': 0.7377, 'grad_norm': 1.5957770347595215, 'learning_rate': 2.25e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5407599806785583, 'eval_runtime': 24.3364, 'eval_samples_per_second': 114.438, 'eval_steps_per_second': 11.464, 'epoch': 0.19}
{'loss': 0.666, 'grad_norm': 1.7650872468948364, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5403271317481995, 'eval_runtime': 19.9312, 'eval_samples_per_second': 139.73, 'eval_steps_per_second': 13.998, 'epoch': 0.19}
{'loss': 0.7052, 'grad_norm': 1.346930742263794, 'learning_rate': 2.15e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5392822623252869, 'eval_runtime': 14.9856, 'eval_samples_per_second': 185.845, 'eval_steps_per_second': 18.618, 'epoch': 0.2}
{'loss': 0.5701, 'grad_norm': 1.3789215087890625, 'learning_rate': 2.1e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5380026698112488, 'eval_runtime': 15.5784, 'eval_samples_per_second': 178.773, 'eval_steps_per_second': 17.909, 'epoch': 0.2}
{'loss': 0.5808, 'grad_norm': 1.4749406576156616, 'learning_rate': 2.05e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5377140641212463, 'eval_runtime': 21.9164, 'eval_samples_per_second': 127.074, 'eval_steps_per_second': 12.73, 'epoch': 0.2}
{'loss': 0.6362, 'grad_norm': 1.660429835319519, 'learning_rate': 2e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.536182701587677, 'eval_runtime': 21.4582, 'eval_samples_per_second': 129.787, 'eval_steps_per_second': 13.002, 'epoch': 0.2}
{'loss': 0.5726, 'grad_norm': 1.459836721420288, 'learning_rate': 1.9500000000000003e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5356904864311218, 'eval_runtime': 21.0369, 'eval_samples_per_second': 132.387, 'eval_steps_per_second': 13.262, 'epoch': 0.2}
{'loss': 0.6762, 'grad_norm': 1.9138052463531494, 'learning_rate': 1.9e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5344865322113037, 'eval_runtime': 21.3878, 'eval_samples_per_second': 130.215, 'eval_steps_per_second': 13.045, 'epoch': 0.2}
{'loss': 0.5585, 'grad_norm': 1.5684120655059814, 'learning_rate': 1.85e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5343432426452637, 'eval_runtime': 20.5856, 'eval_samples_per_second': 135.289, 'eval_steps_per_second': 13.553, 'epoch': 0.2}
{'loss': 0.6889, 'grad_norm': 1.4486392736434937, 'learning_rate': 1.8e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5343374609947205, 'eval_runtime': 20.5721, 'eval_samples_per_second': 135.378, 'eval_steps_per_second': 13.562, 'epoch': 0.2}
{'loss': 0.5808, 'grad_norm': 1.5583736896514893, 'learning_rate': 1.75e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5348631739616394, 'eval_runtime': 22.5093, 'eval_samples_per_second': 123.727, 'eval_steps_per_second': 12.395, 'epoch': 0.21}
{'loss': 0.7483, 'grad_norm': 1.653637409210205, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5342536568641663, 'eval_runtime': 17.7694, 'eval_samples_per_second': 156.73, 'eval_steps_per_second': 15.701, 'epoch': 0.21}
{'loss': 0.5896, 'grad_norm': 1.4737333059310913, 'learning_rate': 1.65e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5340431928634644, 'eval_runtime': 18.3451, 'eval_samples_per_second': 151.811, 'eval_steps_per_second': 15.208, 'epoch': 0.21}
{'loss': 0.7703, 'grad_norm': 1.9869074821472168, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5343901515007019, 'eval_runtime': 17.1174, 'eval_samples_per_second': 162.699, 'eval_steps_per_second': 16.299, 'epoch': 0.21}
{'loss': 0.6347, 'grad_norm': 1.4384758472442627, 'learning_rate': 1.55e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5337935090065002, 'eval_runtime': 18.2376, 'eval_samples_per_second': 152.707, 'eval_steps_per_second': 15.298, 'epoch': 0.21}
{'loss': 0.6397, 'grad_norm': 1.6138712167739868, 'learning_rate': 1.5e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5327961444854736, 'eval_runtime': 17.8914, 'eval_samples_per_second': 155.662, 'eval_steps_per_second': 15.594, 'epoch': 0.21}
{'loss': 0.5851, 'grad_norm': 1.4488650560379028, 'learning_rate': 1.45e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5324659943580627, 'eval_runtime': 14.8605, 'eval_samples_per_second': 187.409, 'eval_steps_per_second': 18.775, 'epoch': 0.21}
{'loss': 0.7155, 'grad_norm': 2.1293582916259766, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5321222543716431, 'eval_runtime': 14.2284, 'eval_samples_per_second': 195.736, 'eval_steps_per_second': 19.609, 'epoch': 0.21}
{'loss': 0.7792, 'grad_norm': 1.871437430381775, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.532491147518158, 'eval_runtime': 18.7324, 'eval_samples_per_second': 148.673, 'eval_steps_per_second': 14.894, 'epoch': 0.22}
{'loss': 0.7328, 'grad_norm': 1.8186744451522827, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5329000949859619, 'eval_runtime': 27.1397, 'eval_samples_per_second': 102.617, 'eval_steps_per_second': 10.28, 'epoch': 0.22}
{'loss': 0.6089, 'grad_norm': 1.6015866994857788, 'learning_rate': 1.25e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5332124829292297, 'eval_runtime': 26.983, 'eval_samples_per_second': 103.213, 'eval_steps_per_second': 10.34, 'epoch': 0.22}
{'loss': 0.5849, 'grad_norm': 1.5623114109039307, 'learning_rate': 1.2e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.534317672252655, 'eval_runtime': 24.0463, 'eval_samples_per_second': 115.818, 'eval_steps_per_second': 11.603, 'epoch': 0.22}
{'loss': 0.657, 'grad_norm': 2.092060089111328, 'learning_rate': 1.1500000000000002e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5352119207382202, 'eval_runtime': 22.2193, 'eval_samples_per_second': 125.342, 'eval_steps_per_second': 12.557, 'epoch': 0.22}
{'loss': 0.5632, 'grad_norm': 1.1954907178878784, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.534841001033783, 'eval_runtime': 22.7196, 'eval_samples_per_second': 122.581, 'eval_steps_per_second': 12.28, 'epoch': 0.22}
{'loss': 0.6489, 'grad_norm': 1.314626693725586, 'learning_rate': 1.05e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5351337790489197, 'eval_runtime': 24.3894, 'eval_samples_per_second': 114.189, 'eval_steps_per_second': 11.439, 'epoch': 0.22}
{'loss': 0.7073, 'grad_norm': 1.7793360948562622, 'learning_rate': 1e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5348128080368042, 'eval_runtime': 21.3444, 'eval_samples_per_second': 130.479, 'eval_steps_per_second': 13.071, 'epoch': 0.22}
{'loss': 0.548, 'grad_norm': 1.2846449613571167, 'learning_rate': 9.5e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5341229438781738, 'eval_runtime': 21.1108, 'eval_samples_per_second': 131.923, 'eval_steps_per_second': 13.216, 'epoch': 0.23}
{'loss': 0.6368, 'grad_norm': 1.4586780071258545, 'learning_rate': 9e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5341733694076538, 'eval_runtime': 20.4136, 'eval_samples_per_second': 136.429, 'eval_steps_per_second': 13.667, 'epoch': 0.23}
{'loss': 0.5866, 'grad_norm': 1.4598946571350098, 'learning_rate': 8.500000000000002e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5343791246414185, 'eval_runtime': 21.5755, 'eval_samples_per_second': 129.082, 'eval_steps_per_second': 12.931, 'epoch': 0.23}
{'loss': 0.5555, 'grad_norm': 1.4934961795806885, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.53511643409729, 'eval_runtime': 17.8867, 'eval_samples_per_second': 155.702, 'eval_steps_per_second': 15.598, 'epoch': 0.23}
{'loss': 0.5299, 'grad_norm': 1.451295256614685, 'learning_rate': 7.5e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5358287692070007, 'eval_runtime': 17.2049, 'eval_samples_per_second': 161.872, 'eval_steps_per_second': 16.216, 'epoch': 0.23}
{'loss': 0.7495, 'grad_norm': 1.4716964960098267, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5356104969978333, 'eval_runtime': 18.6778, 'eval_samples_per_second': 149.108, 'eval_steps_per_second': 14.938, 'epoch': 0.23}
{'loss': 0.6369, 'grad_norm': 1.3538953065872192, 'learning_rate': 6.5000000000000004e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5353147387504578, 'eval_runtime': 17.2907, 'eval_samples_per_second': 161.069, 'eval_steps_per_second': 16.136, 'epoch': 0.23}
{'loss': 0.8275, 'grad_norm': 1.671203374862671, 'learning_rate': 6e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5353061556816101, 'eval_runtime': 18.9502, 'eval_samples_per_second': 146.964, 'eval_steps_per_second': 14.723, 'epoch': 0.23}
{'loss': 0.5861, 'grad_norm': 1.3849132061004639, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5348619222640991, 'eval_runtime': 18.2524, 'eval_samples_per_second': 152.583, 'eval_steps_per_second': 15.286, 'epoch': 0.24}
{'loss': 0.7343, 'grad_norm': 1.7360172271728516, 'learning_rate': 5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5337761044502258, 'eval_runtime': 17.6757, 'eval_samples_per_second': 157.561, 'eval_steps_per_second': 15.784, 'epoch': 0.24}
{'loss': 0.6194, 'grad_norm': 1.3745862245559692, 'learning_rate': 4.5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5337317585945129, 'eval_runtime': 15.4472, 'eval_samples_per_second': 180.291, 'eval_steps_per_second': 18.061, 'epoch': 0.24}
{'loss': 0.6296, 'grad_norm': 1.5797868967056274, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5340949892997742, 'eval_runtime': 22.5115, 'eval_samples_per_second': 123.715, 'eval_steps_per_second': 12.394, 'epoch': 0.24}
{'loss': 0.5779, 'grad_norm': 1.595638632774353, 'learning_rate': 3.5000000000000004e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5341569185256958, 'eval_runtime': 19.9081, 'eval_samples_per_second': 139.893, 'eval_steps_per_second': 14.014, 'epoch': 0.24}
{'loss': 0.7442, 'grad_norm': 1.4224494695663452, 'learning_rate': 3e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5340445637702942, 'eval_runtime': 19.6546, 'eval_samples_per_second': 141.697, 'eval_steps_per_second': 14.195, 'epoch': 0.24}
{'loss': 0.7479, 'grad_norm': 1.7259169816970825, 'learning_rate': 2.5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5338525176048279, 'eval_runtime': 18.8621, 'eval_samples_per_second': 147.651, 'eval_steps_per_second': 14.792, 'epoch': 0.24}
{'loss': 0.7113, 'grad_norm': 1.7832869291305542, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5332269072532654, 'eval_runtime': 20.4075, 'eval_samples_per_second': 136.469, 'eval_steps_per_second': 13.671, 'epoch': 0.24}
{'loss': 0.6066, 'grad_norm': 1.6119431257247925, 'learning_rate': 1.5e-06, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.533518373966217, 'eval_runtime': 15.1333, 'eval_samples_per_second': 184.031, 'eval_steps_per_second': 18.436, 'epoch': 0.25}
{'loss': 0.6123, 'grad_norm': 1.3323665857315063, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5339117050170898, 'eval_runtime': 11.883, 'eval_samples_per_second': 234.368, 'eval_steps_per_second': 23.479, 'epoch': 0.25}
{'loss': 0.739, 'grad_norm': 1.7843968868255615, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.533547043800354, 'eval_runtime': 11.408, 'eval_samples_per_second': 244.127, 'eval_steps_per_second': 24.457, 'epoch': 0.25}
{'loss': 0.6493, 'grad_norm': 1.5918866395950317, 'learning_rate': 0.0, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5332947373390198, 'eval_runtime': 11.1541, 'eval_samples_per_second': 249.685, 'eval_steps_per_second': 25.013, 'epoch': 0.25}
{'train_runtime': 3094.2755, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.065, 'train_loss': 0.7286773881316185, 'epoch': 0.25}


TrainOutput(global_step=200, training_loss=0.7286773881316185, metrics={'train_runtime': 3094.2755, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.065, 'total_flos': 20375470080000.0, 'train_loss': 0.7286773881316185, 'epoch': 0.24875621890547264})

In [13]:
trainer_jepa.evaluate(valwindowds)
wandb.finish()

  0%|          | 0/279 [00:00<?, ?it/s]

Error: You must call wandb.init() before wandb.log()