In [1]:
import sys
import os
sys.path.append('..')

import random

import numpy as np
import pandas as pd
import torch
import transformers
import wandb

from transformers import PatchTSTConfig, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import PatchTSTForPrediction as PatchTSTForPredictionOG
from TimeSeriesJEPA.models import PatchTSTModelJEPA, PatchTSTForPrediction
from TimeSeriesJEPA.datasets.benchmark_dataset import BenchmarkDataset
os.environ["WANDB_PROJECT"] = "TimeSeriesJEPA" 

In [2]:
csv_path = r"D:\Coursework\MTS\dataset\ETT-small\ETTh1.csv"
dataset = "ETTh1"
num_workers = 4  # Reduce this if you have low number of CPU cores
batch_size = 10  # Reduce if not enough GPU memory available
context_length = 512
forecast_horizon = 96
patch_length = 8
patch_stride = 8
num_input_channels=7
d_model=64
num_attention_heads=4
num_hidden_layers=3
ffn_dim=64
dropout=0.05
head_dropout=0.2
pooling_type=None
channel_attention=False
scaling="std"
loss="mse"
pre_norm=True
norm_type="batchnorm"
positional_encoding_type = "sincos"

learning_rate=0.0001
num_train_epochs=30
max_steps=200

setting = 'Supervised_{}_PatchTST_sl{}_dm{}_nh{}_el{}_fd{}_bs{}_lr{}_e{}'.format(
        dataset,
        context_length,
        d_model,
        num_attention_heads,
        num_hidden_layers,
        ffn_dim,
        batch_size,
        learning_rate,
        num_train_epochs
        )

In [3]:

trainwindowds = BenchmarkDataset(csv_path=csv_path, context_length=context_length, prediction_length=forecast_horizon, flag='train', returndict=True)
valwindowds = BenchmarkDataset(csv_path=csv_path, context_length=context_length, prediction_length=forecast_horizon, flag='test', returndict=True)
print("dataset loaded, total size: ", len(trainwindowds), len(valwindowds))

Total data size:  (17420, 7)
Total data size:  (17420, 7)
dataset loaded, total size:  8033 2785


In [None]:
print("Loading prediction model")

config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    patch_stride=patch_stride,
    prediction_length=forecast_horizon,
    d_model=d_model,
    num_attention_heads=num_attention_heads,
    num_hidden_layers=num_hidden_layers,
    ffn_dim=ffn_dim,
    dropout=dropout,
    head_dropout=head_dropout,
    pooling_type=pooling_type,
    channel_attention=channel_attention,
    scaling=scaling,
    loss=loss,
    pre_norm=pre_norm,
    norm_type=norm_type,
    positional_encoding_type = positional_encoding_type
)

model = PatchTSTForPredictionOG(config=config)

In [None]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model.parameters()])
print("encoder parameters: ", params)

In [None]:
wandb.init(project="TimeSeriesJEPA", name=setting)


train_args = TrainingArguments(
    output_dir=os.path.join("checkpoints", setting),
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    # num_train_epochs=num_train_epochs,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1,
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    logging_strategy="steps",
    logging_steps=1,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
    report_to="wandb",
    run_name=setting
    )

# # Create a new early stopping callback with faster convergence properties
# early_stopping_callback = EarlyStoppingCallback(
#     early_stopping_patience=15,  # Number of epochs with no improvement after which to stop
#     early_stopping_threshold=0.001,  # Minimum improvement required to consider as improvement
# )

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
    # callbacks=[early_stopping_callback],
)

In [None]:
print("\n\nDoing forecasting training")
trainer.train()

In [None]:
trainer.evaluate(valwindowds)
wandb.finish()

In [4]:
print("Loading pretrained encoder model")
setting = "PatchTST_Time300B_sl512_enc_dm64_nh4_el3_fd64_pred_dm32_nh2_el1_fd32_bs256_lr0.0001_pe10_data_nenc1_npred4_w_regularization"
encoder_model = PatchTSTModelJEPA.from_pretrained("D:\\Coursework\\MTS\\timeseriesJEPA\\results\\"+setting+"\\checkpoint-17136")
print("Done")
encoder_model.cuda()
run_name = "Finetuned_"+dataset+"_"+setting+"_17136"

Loading pretrained encoder model
Done


In [5]:
run_name

'Finetuned_ETTh1_PatchTST_Time300B_sl512_enc_dm64_nh4_el3_fd64_pred_dm32_nh2_el1_fd32_bs256_lr0.0001_pe10_data_nenc1_npred4_w_regularization_17136'

In [6]:
print("Loading prediction model")

config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    d_model=64,
    num_attention_heads=4,
    # num_hidden_layers=4,
    ffn_dim=128,
    dropout=0.05,
    head_dropout=head_dropout,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
    positional_encoding_type = "sincos"
)

model = PatchTSTForPrediction(config=config, encoder_model=encoder_model)

Loading prediction model


In [7]:
encoder_model

PatchTSTModelJEPA(
  (scaler): PatchTSTScaler(
    (scaler): PatchTSTStdScaler()
  )
  (patchifier): PatchTSTPatchify()
  (encoder): PatchTSTEncoder(
    (embedder): PatchTSTEmbedding(
      (input_embedding): Linear(in_features=8, out_features=64, bias=True)
    )
    (positional_encoder): PatchTSTPositionalEncoding(
      (positional_dropout): Identity()
    )
    (layers): ModuleList(
      (0-2): 3 x PatchTSTEncoderLayer(
        (self_attn): PatchTSTAttention(
          (k_proj): Linear(in_features=64, out_features=64, bias=True)
          (v_proj): Linear(in_features=64, out_features=64, bias=True)
          (q_proj): Linear(in_features=64, out_features=64, bias=True)
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
        )
        (dropout_path1): Identity()
        (norm_sublayer1): PatchTSTBatchNorm(
          (batchnorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (ff): Sequential(
          (0):

In [8]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("encoder parameters: ", params)

encoder parameters:  393312


In [9]:
params = sum([np.prod(p.size()) for p in encoder_model.parameters()])
print("encoder parameters: ", params)

encoder parameters:  80448


In [10]:
wandb.init(project="TimeSeriesJEPA", name=run_name)

train_args_jepa = TrainingArguments(
    output_dir=os.path.join("checkpoints", run_name),
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    # num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    logging_strategy="steps",
    logging_steps=1,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
    report_to="wandb",
    run_name=run_name
)


trainer_jepa = Trainer(
    model=model,
    args=train_args_jepa,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
)

[34m[1mwandb[0m: Currently logged in as: [33mvg2523[0m ([33mhpml_4[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


max_steps is given, it will override any value given in num_train_epochs


In [11]:
print("\n\nDoing forecasting training")
trainer_jepa.train()



Doing forecasting training


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 1.5955, 'grad_norm': 8.502449035644531, 'learning_rate': 9.95e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.318251371383667, 'eval_runtime': 27.4423, 'eval_samples_per_second': 101.485, 'eval_steps_per_second': 10.167, 'epoch': 0.0}
{'loss': 1.4068, 'grad_norm': 6.894443988800049, 'learning_rate': 9.900000000000001e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.1236175298690796, 'eval_runtime': 23.1536, 'eval_samples_per_second': 120.284, 'eval_steps_per_second': 12.05, 'epoch': 0.0}
{'loss': 1.0801, 'grad_norm': 4.897520065307617, 'learning_rate': 9.850000000000001e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.9966614246368408, 'eval_runtime': 28.0209, 'eval_samples_per_second': 99.39, 'eval_steps_per_second': 9.957, 'epoch': 0.0}
{'loss': 1.0564, 'grad_norm': 4.124776363372803, 'learning_rate': 9.8e-05, 'epoch': 0.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.9148288369178772, 'eval_runtime': 25.6371, 'eval_samples_per_second': 108.632, 'eval_steps_per_second': 10.883, 'epoch': 0.0}
{'loss': 1.0423, 'grad_norm': 3.8213164806365967, 'learning_rate': 9.75e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8598943948745728, 'eval_runtime': 23.5909, 'eval_samples_per_second': 118.054, 'eval_steps_per_second': 11.827, 'epoch': 0.01}
{'loss': 0.8597, 'grad_norm': 3.3299670219421387, 'learning_rate': 9.7e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8245490789413452, 'eval_runtime': 22.7569, 'eval_samples_per_second': 122.38, 'eval_steps_per_second': 12.26, 'epoch': 0.01}
{'loss': 0.6956, 'grad_norm': 2.8738627433776855, 'learning_rate': 9.65e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.8049271106719971, 'eval_runtime': 17.3281, 'eval_samples_per_second': 160.722, 'eval_steps_per_second': 16.101, 'epoch': 0.01}
{'loss': 1.0919, 'grad_norm': 3.263307809829712, 'learning_rate': 9.6e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7986776828765869, 'eval_runtime': 21.0102, 'eval_samples_per_second': 132.555, 'eval_steps_per_second': 13.279, 'epoch': 0.01}
{'loss': 0.908, 'grad_norm': 2.673847198486328, 'learning_rate': 9.55e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.796984076499939, 'eval_runtime': 21.1752, 'eval_samples_per_second': 131.522, 'eval_steps_per_second': 13.176, 'epoch': 0.01}
{'loss': 0.9262, 'grad_norm': 2.8787920475006104, 'learning_rate': 9.5e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7912077903747559, 'eval_runtime': 24.4383, 'eval_samples_per_second': 113.96, 'eval_steps_per_second': 11.417, 'epoch': 0.01}
{'loss': 0.9593, 'grad_norm': 3.1429741382598877, 'learning_rate': 9.449999999999999e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7852210402488708, 'eval_runtime': 25.0865, 'eval_samples_per_second': 111.016, 'eval_steps_per_second': 11.122, 'epoch': 0.01}
{'loss': 0.918, 'grad_norm': 2.6406240463256836, 'learning_rate': 9.4e-05, 'epoch': 0.01}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7757973074913025, 'eval_runtime': 24.8432, 'eval_samples_per_second': 112.103, 'eval_steps_per_second': 11.23, 'epoch': 0.01}
{'loss': 0.8553, 'grad_norm': 2.2143449783325195, 'learning_rate': 9.350000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7689844369888306, 'eval_runtime': 23.0166, 'eval_samples_per_second': 121.0, 'eval_steps_per_second': 12.122, 'epoch': 0.02}
{'loss': 0.8593, 'grad_norm': 2.7051868438720703, 'learning_rate': 9.300000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7626972198486328, 'eval_runtime': 19.4464, 'eval_samples_per_second': 143.214, 'eval_steps_per_second': 14.347, 'epoch': 0.02}
{'loss': 0.831, 'grad_norm': 2.7843120098114014, 'learning_rate': 9.250000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7553684711456299, 'eval_runtime': 15.8962, 'eval_samples_per_second': 175.199, 'eval_steps_per_second': 17.551, 'epoch': 0.02}
{'loss': 0.8077, 'grad_norm': 2.585803508758545, 'learning_rate': 9.200000000000001e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7470638155937195, 'eval_runtime': 18.642, 'eval_samples_per_second': 149.394, 'eval_steps_per_second': 14.966, 'epoch': 0.02}
{'loss': 0.8406, 'grad_norm': 2.1997737884521484, 'learning_rate': 9.15e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7406526803970337, 'eval_runtime': 15.0555, 'eval_samples_per_second': 184.982, 'eval_steps_per_second': 18.531, 'epoch': 0.02}
{'loss': 0.9126, 'grad_norm': 2.8699655532836914, 'learning_rate': 9.1e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.733005166053772, 'eval_runtime': 14.4427, 'eval_samples_per_second': 192.831, 'eval_steps_per_second': 19.318, 'epoch': 0.02}
{'loss': 0.8237, 'grad_norm': 2.443380117416382, 'learning_rate': 9.05e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7271396517753601, 'eval_runtime': 23.4133, 'eval_samples_per_second': 118.949, 'eval_steps_per_second': 11.916, 'epoch': 0.02}
{'loss': 0.8572, 'grad_norm': 2.617676258087158, 'learning_rate': 9e-05, 'epoch': 0.02}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7212975025177002, 'eval_runtime': 22.9468, 'eval_samples_per_second': 121.368, 'eval_steps_per_second': 12.159, 'epoch': 0.02}
{'loss': 0.8885, 'grad_norm': 2.5573296546936035, 'learning_rate': 8.950000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7169869542121887, 'eval_runtime': 15.0619, 'eval_samples_per_second': 184.904, 'eval_steps_per_second': 18.524, 'epoch': 0.03}
{'loss': 0.6811, 'grad_norm': 1.7601373195648193, 'learning_rate': 8.900000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7164736390113831, 'eval_runtime': 15.0709, 'eval_samples_per_second': 184.793, 'eval_steps_per_second': 18.512, 'epoch': 0.03}
{'loss': 0.8736, 'grad_norm': 2.7390432357788086, 'learning_rate': 8.850000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7160912156105042, 'eval_runtime': 14.4885, 'eval_samples_per_second': 192.221, 'eval_steps_per_second': 19.257, 'epoch': 0.03}
{'loss': 0.8613, 'grad_norm': 2.9131581783294678, 'learning_rate': 8.800000000000001e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.71491539478302, 'eval_runtime': 14.3887, 'eval_samples_per_second': 193.554, 'eval_steps_per_second': 19.39, 'epoch': 0.03}
{'loss': 0.7879, 'grad_norm': 1.9110196828842163, 'learning_rate': 8.75e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7127476334571838, 'eval_runtime': 13.3681, 'eval_samples_per_second': 208.332, 'eval_steps_per_second': 20.871, 'epoch': 0.03}
{'loss': 0.7577, 'grad_norm': 2.1160366535186768, 'learning_rate': 8.7e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7098274827003479, 'eval_runtime': 14.3759, 'eval_samples_per_second': 193.727, 'eval_steps_per_second': 19.407, 'epoch': 0.03}
{'loss': 0.7511, 'grad_norm': 2.0352299213409424, 'learning_rate': 8.65e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7063843011856079, 'eval_runtime': 13.9553, 'eval_samples_per_second': 199.566, 'eval_steps_per_second': 19.992, 'epoch': 0.03}
{'loss': 0.8829, 'grad_norm': 2.9684882164001465, 'learning_rate': 8.6e-05, 'epoch': 0.03}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.7006980776786804, 'eval_runtime': 13.7155, 'eval_samples_per_second': 203.055, 'eval_steps_per_second': 20.342, 'epoch': 0.03}
{'loss': 0.8202, 'grad_norm': 2.0110697746276855, 'learning_rate': 8.55e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6945437788963318, 'eval_runtime': 13.6794, 'eval_samples_per_second': 203.59, 'eval_steps_per_second': 20.396, 'epoch': 0.04}
{'loss': 0.8162, 'grad_norm': 2.031656265258789, 'learning_rate': 8.5e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6895507574081421, 'eval_runtime': 13.625, 'eval_samples_per_second': 204.404, 'eval_steps_per_second': 20.477, 'epoch': 0.04}
{'loss': 0.892, 'grad_norm': 2.1795639991760254, 'learning_rate': 8.450000000000001e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.685413658618927, 'eval_runtime': 14.8004, 'eval_samples_per_second': 188.17, 'eval_steps_per_second': 18.851, 'epoch': 0.04}
{'loss': 0.6898, 'grad_norm': 1.8014699220657349, 'learning_rate': 8.4e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6831006407737732, 'eval_runtime': 14.5695, 'eval_samples_per_second': 191.152, 'eval_steps_per_second': 19.15, 'epoch': 0.04}
{'loss': 0.9266, 'grad_norm': 2.618335008621216, 'learning_rate': 8.35e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6814950108528137, 'eval_runtime': 15.3198, 'eval_samples_per_second': 181.791, 'eval_steps_per_second': 18.212, 'epoch': 0.04}
{'loss': 0.7077, 'grad_norm': 2.143104076385498, 'learning_rate': 8.3e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6794959306716919, 'eval_runtime': 13.9657, 'eval_samples_per_second': 199.417, 'eval_steps_per_second': 19.977, 'epoch': 0.04}
{'loss': 0.8853, 'grad_norm': 2.0593807697296143, 'learning_rate': 8.25e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6760226488113403, 'eval_runtime': 13.4198, 'eval_samples_per_second': 207.529, 'eval_steps_per_second': 20.79, 'epoch': 0.04}
{'loss': 0.815, 'grad_norm': 2.3695926666259766, 'learning_rate': 8.2e-05, 'epoch': 0.04}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6728047728538513, 'eval_runtime': 14.2085, 'eval_samples_per_second': 196.01, 'eval_steps_per_second': 19.636, 'epoch': 0.04}
{'loss': 0.6785, 'grad_norm': 1.6404201984405518, 'learning_rate': 8.15e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6699132323265076, 'eval_runtime': 13.581, 'eval_samples_per_second': 205.066, 'eval_steps_per_second': 20.543, 'epoch': 0.05}
{'loss': 0.7651, 'grad_norm': 1.9920930862426758, 'learning_rate': 8.1e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6679134964942932, 'eval_runtime': 13.265, 'eval_samples_per_second': 209.951, 'eval_steps_per_second': 21.033, 'epoch': 0.05}
{'loss': 0.9835, 'grad_norm': 1.657626986503601, 'learning_rate': 8.05e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6694769859313965, 'eval_runtime': 16.5864, 'eval_samples_per_second': 167.909, 'eval_steps_per_second': 16.821, 'epoch': 0.05}
{'loss': 0.7239, 'grad_norm': 1.9415080547332764, 'learning_rate': 8e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6715027689933777, 'eval_runtime': 22.8419, 'eval_samples_per_second': 121.925, 'eval_steps_per_second': 12.214, 'epoch': 0.05}
{'loss': 0.9278, 'grad_norm': 2.3490893840789795, 'learning_rate': 7.950000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6718051433563232, 'eval_runtime': 22.2546, 'eval_samples_per_second': 125.143, 'eval_steps_per_second': 12.537, 'epoch': 0.05}
{'loss': 0.9336, 'grad_norm': 2.2827136516571045, 'learning_rate': 7.900000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6703476309776306, 'eval_runtime': 22.4849, 'eval_samples_per_second': 123.861, 'eval_steps_per_second': 12.408, 'epoch': 0.05}
{'loss': 0.6565, 'grad_norm': 1.7824699878692627, 'learning_rate': 7.850000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6672845482826233, 'eval_runtime': 22.363, 'eval_samples_per_second': 124.536, 'eval_steps_per_second': 12.476, 'epoch': 0.05}
{'loss': 0.8094, 'grad_norm': 1.8882728815078735, 'learning_rate': 7.800000000000001e-05, 'epoch': 0.05}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6659395098686218, 'eval_runtime': 21.32, 'eval_samples_per_second': 130.628, 'eval_steps_per_second': 13.086, 'epoch': 0.05}
{'loss': 0.6415, 'grad_norm': 1.8344019651412964, 'learning_rate': 7.75e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6609072685241699, 'eval_runtime': 21.9326, 'eval_samples_per_second': 126.98, 'eval_steps_per_second': 12.721, 'epoch': 0.06}
{'loss': 0.7824, 'grad_norm': 2.5490305423736572, 'learning_rate': 7.7e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6533912420272827, 'eval_runtime': 19.0487, 'eval_samples_per_second': 146.205, 'eval_steps_per_second': 14.647, 'epoch': 0.06}
{'loss': 0.7524, 'grad_norm': 1.8081141710281372, 'learning_rate': 7.65e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6469297409057617, 'eval_runtime': 13.769, 'eval_samples_per_second': 202.266, 'eval_steps_per_second': 20.263, 'epoch': 0.06}
{'loss': 0.6627, 'grad_norm': 1.836230754852295, 'learning_rate': 7.6e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6428654193878174, 'eval_runtime': 13.7839, 'eval_samples_per_second': 202.048, 'eval_steps_per_second': 20.241, 'epoch': 0.06}
{'loss': 0.6506, 'grad_norm': 1.4563040733337402, 'learning_rate': 7.55e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6417195796966553, 'eval_runtime': 14.8228, 'eval_samples_per_second': 187.886, 'eval_steps_per_second': 18.822, 'epoch': 0.06}
{'loss': 0.7311, 'grad_norm': 1.480999231338501, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6451135277748108, 'eval_runtime': 14.6772, 'eval_samples_per_second': 189.751, 'eval_steps_per_second': 19.009, 'epoch': 0.06}
{'loss': 0.6658, 'grad_norm': 1.6449741125106812, 'learning_rate': 7.450000000000001e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6519562602043152, 'eval_runtime': 14.1037, 'eval_samples_per_second': 197.466, 'eval_steps_per_second': 19.782, 'epoch': 0.06}
{'loss': 0.9314, 'grad_norm': 2.1862261295318604, 'learning_rate': 7.4e-05, 'epoch': 0.06}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.656775951385498, 'eval_runtime': 15.5954, 'eval_samples_per_second': 178.578, 'eval_steps_per_second': 17.89, 'epoch': 0.06}
{'loss': 0.765, 'grad_norm': 1.8851031064987183, 'learning_rate': 7.35e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6577572822570801, 'eval_runtime': 14.5294, 'eval_samples_per_second': 191.68, 'eval_steps_per_second': 19.202, 'epoch': 0.07}
{'loss': 0.6951, 'grad_norm': 1.8865113258361816, 'learning_rate': 7.3e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6554520130157471, 'eval_runtime': 14.7478, 'eval_samples_per_second': 188.842, 'eval_steps_per_second': 18.918, 'epoch': 0.07}
{'loss': 0.9937, 'grad_norm': 2.353119373321533, 'learning_rate': 7.25e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6525391340255737, 'eval_runtime': 14.2532, 'eval_samples_per_second': 195.395, 'eval_steps_per_second': 19.575, 'epoch': 0.07}
{'loss': 0.9163, 'grad_norm': 1.6517893075942993, 'learning_rate': 7.2e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.649549126625061, 'eval_runtime': 14.5784, 'eval_samples_per_second': 191.036, 'eval_steps_per_second': 19.138, 'epoch': 0.07}
{'loss': 0.838, 'grad_norm': 1.8713276386260986, 'learning_rate': 7.15e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6483864188194275, 'eval_runtime': 14.0048, 'eval_samples_per_second': 198.861, 'eval_steps_per_second': 19.922, 'epoch': 0.07}
{'loss': 0.8096, 'grad_norm': 1.9447518587112427, 'learning_rate': 7.1e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6446444392204285, 'eval_runtime': 14.0215, 'eval_samples_per_second': 198.623, 'eval_steps_per_second': 19.898, 'epoch': 0.07}
{'loss': 0.6393, 'grad_norm': 1.6179932355880737, 'learning_rate': 7.05e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6393353343009949, 'eval_runtime': 16.4441, 'eval_samples_per_second': 169.362, 'eval_steps_per_second': 16.967, 'epoch': 0.07}
{'loss': 0.8206, 'grad_norm': 1.9560915231704712, 'learning_rate': 7e-05, 'epoch': 0.07}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6373435854911804, 'eval_runtime': 14.0346, 'eval_samples_per_second': 198.439, 'eval_steps_per_second': 19.88, 'epoch': 0.07}
{'loss': 0.7667, 'grad_norm': 1.5253616571426392, 'learning_rate': 6.95e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6384202241897583, 'eval_runtime': 14.9895, 'eval_samples_per_second': 185.796, 'eval_steps_per_second': 18.613, 'epoch': 0.08}
{'loss': 0.8307, 'grad_norm': 1.8537498712539673, 'learning_rate': 6.9e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6395877003669739, 'eval_runtime': 13.7153, 'eval_samples_per_second': 203.059, 'eval_steps_per_second': 20.342, 'epoch': 0.08}
{'loss': 0.7517, 'grad_norm': 2.0567047595977783, 'learning_rate': 6.850000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6397823095321655, 'eval_runtime': 12.7099, 'eval_samples_per_second': 219.121, 'eval_steps_per_second': 21.951, 'epoch': 0.08}
{'loss': 0.6369, 'grad_norm': 1.5923819541931152, 'learning_rate': 6.800000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6409854888916016, 'eval_runtime': 14.2082, 'eval_samples_per_second': 196.014, 'eval_steps_per_second': 19.637, 'epoch': 0.08}
{'loss': 0.8256, 'grad_norm': 2.094935894012451, 'learning_rate': 6.750000000000001e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6403082013130188, 'eval_runtime': 12.8021, 'eval_samples_per_second': 217.543, 'eval_steps_per_second': 21.793, 'epoch': 0.08}
{'loss': 0.8368, 'grad_norm': 2.3926138877868652, 'learning_rate': 6.7e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6364654302597046, 'eval_runtime': 12.6166, 'eval_samples_per_second': 220.741, 'eval_steps_per_second': 22.114, 'epoch': 0.08}
{'loss': 0.7259, 'grad_norm': 1.242864727973938, 'learning_rate': 6.65e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6331297159194946, 'eval_runtime': 12.8941, 'eval_samples_per_second': 215.991, 'eval_steps_per_second': 21.638, 'epoch': 0.08}
{'loss': 0.7376, 'grad_norm': 1.9621460437774658, 'learning_rate': 6.6e-05, 'epoch': 0.08}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6288184523582458, 'eval_runtime': 13.4047, 'eval_samples_per_second': 207.764, 'eval_steps_per_second': 20.814, 'epoch': 0.08}
{'loss': 0.6469, 'grad_norm': 2.038653612136841, 'learning_rate': 6.55e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6233216524124146, 'eval_runtime': 12.9575, 'eval_samples_per_second': 214.934, 'eval_steps_per_second': 21.532, 'epoch': 0.09}
{'loss': 0.7943, 'grad_norm': 1.944062352180481, 'learning_rate': 6.500000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6178998351097107, 'eval_runtime': 12.5496, 'eval_samples_per_second': 221.92, 'eval_steps_per_second': 22.232, 'epoch': 0.09}
{'loss': 0.6373, 'grad_norm': 1.5241467952728271, 'learning_rate': 6.450000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.612833559513092, 'eval_runtime': 12.4586, 'eval_samples_per_second': 223.541, 'eval_steps_per_second': 22.394, 'epoch': 0.09}
{'loss': 0.9684, 'grad_norm': 2.1900384426116943, 'learning_rate': 6.400000000000001e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6099923253059387, 'eval_runtime': 13.1489, 'eval_samples_per_second': 211.805, 'eval_steps_per_second': 21.219, 'epoch': 0.09}
{'loss': 0.7686, 'grad_norm': 2.463373899459839, 'learning_rate': 6.35e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6090626120567322, 'eval_runtime': 12.8932, 'eval_samples_per_second': 216.005, 'eval_steps_per_second': 21.639, 'epoch': 0.09}
{'loss': 0.7507, 'grad_norm': 2.0765810012817383, 'learning_rate': 6.3e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6079962849617004, 'eval_runtime': 12.9657, 'eval_samples_per_second': 214.798, 'eval_steps_per_second': 21.518, 'epoch': 0.09}
{'loss': 0.7192, 'grad_norm': 1.9017691612243652, 'learning_rate': 6.25e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6081264019012451, 'eval_runtime': 12.9846, 'eval_samples_per_second': 214.484, 'eval_steps_per_second': 21.487, 'epoch': 0.09}
{'loss': 0.7737, 'grad_norm': 1.888586401939392, 'learning_rate': 6.2e-05, 'epoch': 0.09}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6074724197387695, 'eval_runtime': 12.6968, 'eval_samples_per_second': 219.347, 'eval_steps_per_second': 21.974, 'epoch': 0.09}
{'loss': 0.7467, 'grad_norm': 1.6754800081253052, 'learning_rate': 6.15e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6065251231193542, 'eval_runtime': 13.3303, 'eval_samples_per_second': 208.923, 'eval_steps_per_second': 20.93, 'epoch': 0.1}
{'loss': 0.7827, 'grad_norm': 2.226386547088623, 'learning_rate': 6.1e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6054191589355469, 'eval_runtime': 12.7713, 'eval_samples_per_second': 218.067, 'eval_steps_per_second': 21.846, 'epoch': 0.1}
{'loss': 0.7565, 'grad_norm': 1.7616043090820312, 'learning_rate': 6.05e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6030463576316833, 'eval_runtime': 8.373, 'eval_samples_per_second': 332.619, 'eval_steps_per_second': 33.322, 'epoch': 0.1}
{'loss': 0.7455, 'grad_norm': 1.7468063831329346, 'learning_rate': 6e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6002594232559204, 'eval_runtime': 7.4469, 'eval_samples_per_second': 373.979, 'eval_steps_per_second': 37.465, 'epoch': 0.1}
{'loss': 0.7258, 'grad_norm': 1.7726153135299683, 'learning_rate': 5.95e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5983285307884216, 'eval_runtime': 7.3889, 'eval_samples_per_second': 376.918, 'eval_steps_per_second': 37.759, 'epoch': 0.1}
{'loss': 0.9476, 'grad_norm': 1.8203929662704468, 'learning_rate': 5.9e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5974550843238831, 'eval_runtime': 6.7989, 'eval_samples_per_second': 409.623, 'eval_steps_per_second': 41.036, 'epoch': 0.1}
{'loss': 0.7915, 'grad_norm': 1.7746857404708862, 'learning_rate': 5.85e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5970901250839233, 'eval_runtime': 7.1695, 'eval_samples_per_second': 388.452, 'eval_steps_per_second': 38.915, 'epoch': 0.1}
{'loss': 0.8274, 'grad_norm': 1.6838266849517822, 'learning_rate': 5.8e-05, 'epoch': 0.1}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5978580713272095, 'eval_runtime': 6.8757, 'eval_samples_per_second': 405.047, 'eval_steps_per_second': 40.577, 'epoch': 0.1}
{'loss': 0.8113, 'grad_norm': 2.672278881072998, 'learning_rate': 5.7499999999999995e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5982525944709778, 'eval_runtime': 6.6564, 'eval_samples_per_second': 418.396, 'eval_steps_per_second': 41.915, 'epoch': 0.11}
{'loss': 0.6864, 'grad_norm': 2.207515239715576, 'learning_rate': 5.6999999999999996e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6000803709030151, 'eval_runtime': 6.724, 'eval_samples_per_second': 414.189, 'eval_steps_per_second': 41.493, 'epoch': 0.11}
{'loss': 0.7667, 'grad_norm': 1.9526821374893188, 'learning_rate': 5.65e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6009222269058228, 'eval_runtime': 6.9015, 'eval_samples_per_second': 403.537, 'eval_steps_per_second': 40.426, 'epoch': 0.11}
{'loss': 0.7208, 'grad_norm': 1.7680641412734985, 'learning_rate': 5.6000000000000006e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.6005706787109375, 'eval_runtime': 6.9837, 'eval_samples_per_second': 398.788, 'eval_steps_per_second': 39.95, 'epoch': 0.11}
{'loss': 0.7216, 'grad_norm': 2.197403907775879, 'learning_rate': 5.550000000000001e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5991877913475037, 'eval_runtime': 6.6766, 'eval_samples_per_second': 417.127, 'eval_steps_per_second': 41.788, 'epoch': 0.11}
{'loss': 0.6902, 'grad_norm': 2.4949750900268555, 'learning_rate': 5.500000000000001e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5965626239776611, 'eval_runtime': 6.5578, 'eval_samples_per_second': 424.687, 'eval_steps_per_second': 42.545, 'epoch': 0.11}
{'loss': 0.6235, 'grad_norm': 1.4430875778198242, 'learning_rate': 5.45e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5940285325050354, 'eval_runtime': 6.6716, 'eval_samples_per_second': 417.44, 'eval_steps_per_second': 41.819, 'epoch': 0.11}
{'loss': 0.9204, 'grad_norm': 2.6651718616485596, 'learning_rate': 5.4000000000000005e-05, 'epoch': 0.11}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5921502709388733, 'eval_runtime': 6.8653, 'eval_samples_per_second': 405.666, 'eval_steps_per_second': 40.639, 'epoch': 0.11}
{'loss': 0.721, 'grad_norm': 1.753010869026184, 'learning_rate': 5.3500000000000006e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5901501178741455, 'eval_runtime': 6.7619, 'eval_samples_per_second': 411.864, 'eval_steps_per_second': 41.26, 'epoch': 0.12}
{'loss': 0.7764, 'grad_norm': 1.792697548866272, 'learning_rate': 5.300000000000001e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5887574553489685, 'eval_runtime': 7.0911, 'eval_samples_per_second': 392.746, 'eval_steps_per_second': 39.345, 'epoch': 0.12}
{'loss': 0.6985, 'grad_norm': 1.6237537860870361, 'learning_rate': 5.25e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5881734490394592, 'eval_runtime': 6.8267, 'eval_samples_per_second': 407.96, 'eval_steps_per_second': 40.869, 'epoch': 0.12}
{'loss': 0.7323, 'grad_norm': 2.1526033878326416, 'learning_rate': 5.2000000000000004e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5898147821426392, 'eval_runtime': 7.7334, 'eval_samples_per_second': 360.126, 'eval_steps_per_second': 36.077, 'epoch': 0.12}
{'loss': 0.7519, 'grad_norm': 1.6613129377365112, 'learning_rate': 5.1500000000000005e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5903835892677307, 'eval_runtime': 7.4887, 'eval_samples_per_second': 371.894, 'eval_steps_per_second': 37.256, 'epoch': 0.12}
{'loss': 0.795, 'grad_norm': 2.3032217025756836, 'learning_rate': 5.1000000000000006e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5897014141082764, 'eval_runtime': 7.9885, 'eval_samples_per_second': 348.627, 'eval_steps_per_second': 34.925, 'epoch': 0.12}
{'loss': 0.7076, 'grad_norm': 1.7223432064056396, 'learning_rate': 5.05e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5881695747375488, 'eval_runtime': 8.4139, 'eval_samples_per_second': 330.999, 'eval_steps_per_second': 33.159, 'epoch': 0.12}
{'loss': 0.6286, 'grad_norm': 2.001746892929077, 'learning_rate': 5e-05, 'epoch': 0.12}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5851995348930359, 'eval_runtime': 8.501, 'eval_samples_per_second': 327.607, 'eval_steps_per_second': 32.819, 'epoch': 0.12}
{'loss': 0.6995, 'grad_norm': 1.7593415975570679, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5821016430854797, 'eval_runtime': 8.4448, 'eval_samples_per_second': 329.79, 'eval_steps_per_second': 33.038, 'epoch': 0.13}
{'loss': 0.9652, 'grad_norm': 2.459078550338745, 'learning_rate': 4.9e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5790550112724304, 'eval_runtime': 9.0328, 'eval_samples_per_second': 308.321, 'eval_steps_per_second': 30.887, 'epoch': 0.13}
{'loss': 0.8812, 'grad_norm': 2.5213255882263184, 'learning_rate': 4.85e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5774980783462524, 'eval_runtime': 8.0523, 'eval_samples_per_second': 345.865, 'eval_steps_per_second': 34.649, 'epoch': 0.13}
{'loss': 0.7084, 'grad_norm': 1.4264600276947021, 'learning_rate': 4.8e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5763981342315674, 'eval_runtime': 7.5961, 'eval_samples_per_second': 366.637, 'eval_steps_per_second': 36.73, 'epoch': 0.13}
{'loss': 0.6955, 'grad_norm': 1.6912829875946045, 'learning_rate': 4.75e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5760297775268555, 'eval_runtime': 7.691, 'eval_samples_per_second': 362.113, 'eval_steps_per_second': 36.276, 'epoch': 0.13}
{'loss': 0.7043, 'grad_norm': 1.5252974033355713, 'learning_rate': 4.7e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5760726928710938, 'eval_runtime': 7.4458, 'eval_samples_per_second': 374.036, 'eval_steps_per_second': 37.471, 'epoch': 0.13}
{'loss': 0.8223, 'grad_norm': 1.931477427482605, 'learning_rate': 4.6500000000000005e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5770355463027954, 'eval_runtime': 7.4986, 'eval_samples_per_second': 371.403, 'eval_steps_per_second': 37.207, 'epoch': 0.13}
{'loss': 0.6342, 'grad_norm': 1.7603024244308472, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.13}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.577174186706543, 'eval_runtime': 7.4149, 'eval_samples_per_second': 375.595, 'eval_steps_per_second': 37.627, 'epoch': 0.13}
{'loss': 0.584, 'grad_norm': 1.1545499563217163, 'learning_rate': 4.55e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5781015753746033, 'eval_runtime': 7.3132, 'eval_samples_per_second': 380.82, 'eval_steps_per_second': 38.15, 'epoch': 0.14}
{'loss': 0.7447, 'grad_norm': 2.0361833572387695, 'learning_rate': 4.5e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5786564350128174, 'eval_runtime': 7.3039, 'eval_samples_per_second': 381.301, 'eval_steps_per_second': 38.199, 'epoch': 0.14}
{'loss': 0.8593, 'grad_norm': 2.466827630996704, 'learning_rate': 4.4500000000000004e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5784499645233154, 'eval_runtime': 7.3663, 'eval_samples_per_second': 378.075, 'eval_steps_per_second': 37.875, 'epoch': 0.14}
{'loss': 0.8163, 'grad_norm': 2.2280349731445312, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5774025917053223, 'eval_runtime': 7.3129, 'eval_samples_per_second': 380.834, 'eval_steps_per_second': 38.152, 'epoch': 0.14}
{'loss': 0.6762, 'grad_norm': 1.670803427696228, 'learning_rate': 4.35e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5763758420944214, 'eval_runtime': 7.2445, 'eval_samples_per_second': 384.429, 'eval_steps_per_second': 38.512, 'epoch': 0.14}
{'loss': 0.7966, 'grad_norm': 2.03198504447937, 'learning_rate': 4.3e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5763428211212158, 'eval_runtime': 7.6102, 'eval_samples_per_second': 365.958, 'eval_steps_per_second': 36.661, 'epoch': 0.14}
{'loss': 0.7956, 'grad_norm': 2.2394323348999023, 'learning_rate': 4.25e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5769262313842773, 'eval_runtime': 7.3579, 'eval_samples_per_second': 378.506, 'eval_steps_per_second': 37.919, 'epoch': 0.14}
{'loss': 0.6946, 'grad_norm': 1.522552728652954, 'learning_rate': 4.2e-05, 'epoch': 0.14}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5782577395439148, 'eval_runtime': 7.4427, 'eval_samples_per_second': 374.195, 'eval_steps_per_second': 37.487, 'epoch': 0.14}
{'loss': 0.7641, 'grad_norm': 1.7455745935440063, 'learning_rate': 4.15e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5796240568161011, 'eval_runtime': 7.885, 'eval_samples_per_second': 353.2, 'eval_steps_per_second': 35.383, 'epoch': 0.15}
{'loss': 0.7605, 'grad_norm': 1.54032564163208, 'learning_rate': 4.1e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5803488492965698, 'eval_runtime': 7.8742, 'eval_samples_per_second': 353.685, 'eval_steps_per_second': 35.432, 'epoch': 0.15}
{'loss': 0.6638, 'grad_norm': 1.8612867593765259, 'learning_rate': 4.05e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5801676511764526, 'eval_runtime': 8.0961, 'eval_samples_per_second': 343.991, 'eval_steps_per_second': 34.461, 'epoch': 0.15}
{'loss': 0.844, 'grad_norm': 1.9747928380966187, 'learning_rate': 4e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5806931853294373, 'eval_runtime': 7.4288, 'eval_samples_per_second': 374.893, 'eval_steps_per_second': 37.557, 'epoch': 0.15}
{'loss': 0.7227, 'grad_norm': 1.5770962238311768, 'learning_rate': 3.9500000000000005e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5796533823013306, 'eval_runtime': 7.5711, 'eval_samples_per_second': 367.847, 'eval_steps_per_second': 36.851, 'epoch': 0.15}
{'loss': 0.6407, 'grad_norm': 1.9192404747009277, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5779518485069275, 'eval_runtime': 7.4674, 'eval_samples_per_second': 372.957, 'eval_steps_per_second': 37.363, 'epoch': 0.15}
{'loss': 0.7873, 'grad_norm': 2.1475157737731934, 'learning_rate': 3.85e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5751339197158813, 'eval_runtime': 7.3334, 'eval_samples_per_second': 379.767, 'eval_steps_per_second': 38.045, 'epoch': 0.15}
{'loss': 0.62, 'grad_norm': 1.791329026222229, 'learning_rate': 3.8e-05, 'epoch': 0.15}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5742160081863403, 'eval_runtime': 7.3411, 'eval_samples_per_second': 379.371, 'eval_steps_per_second': 38.005, 'epoch': 0.15}
{'loss': 0.8077, 'grad_norm': 2.13381290435791, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5719220042228699, 'eval_runtime': 7.2749, 'eval_samples_per_second': 382.825, 'eval_steps_per_second': 38.351, 'epoch': 0.16}
{'loss': 0.6711, 'grad_norm': 1.801479458808899, 'learning_rate': 3.7e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5691950917243958, 'eval_runtime': 7.2542, 'eval_samples_per_second': 383.917, 'eval_steps_per_second': 38.461, 'epoch': 0.16}
{'loss': 0.7006, 'grad_norm': 1.8110066652297974, 'learning_rate': 3.65e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.56683349609375, 'eval_runtime': 7.2236, 'eval_samples_per_second': 385.542, 'eval_steps_per_second': 38.623, 'epoch': 0.16}
{'loss': 0.7376, 'grad_norm': 1.5439586639404297, 'learning_rate': 3.6e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5657263398170471, 'eval_runtime': 7.2657, 'eval_samples_per_second': 383.307, 'eval_steps_per_second': 38.399, 'epoch': 0.16}
{'loss': 0.65, 'grad_norm': 1.661374807357788, 'learning_rate': 3.55e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5640659332275391, 'eval_runtime': 7.1919, 'eval_samples_per_second': 387.241, 'eval_steps_per_second': 38.794, 'epoch': 0.16}
{'loss': 0.7358, 'grad_norm': 2.0059332847595215, 'learning_rate': 3.5e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.563097357749939, 'eval_runtime': 7.1604, 'eval_samples_per_second': 388.945, 'eval_steps_per_second': 38.964, 'epoch': 0.16}
{'loss': 0.6828, 'grad_norm': 1.3933355808258057, 'learning_rate': 3.45e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5620504021644592, 'eval_runtime': 7.5426, 'eval_samples_per_second': 369.236, 'eval_steps_per_second': 36.99, 'epoch': 0.16}
{'loss': 0.8296, 'grad_norm': 1.9792377948760986, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.16}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5618495345115662, 'eval_runtime': 7.3455, 'eval_samples_per_second': 379.143, 'eval_steps_per_second': 37.982, 'epoch': 0.16}
{'loss': 0.6963, 'grad_norm': 2.115375280380249, 'learning_rate': 3.35e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5624001026153564, 'eval_runtime': 7.1045, 'eval_samples_per_second': 392.004, 'eval_steps_per_second': 39.271, 'epoch': 0.17}
{'loss': 0.7315, 'grad_norm': 2.1521177291870117, 'learning_rate': 3.3e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5630057454109192, 'eval_runtime': 7.0868, 'eval_samples_per_second': 392.982, 'eval_steps_per_second': 39.369, 'epoch': 0.17}
{'loss': 0.5782, 'grad_norm': 1.7326678037643433, 'learning_rate': 3.2500000000000004e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5638890266418457, 'eval_runtime': 7.154, 'eval_samples_per_second': 389.292, 'eval_steps_per_second': 38.999, 'epoch': 0.17}
{'loss': 0.8667, 'grad_norm': 1.9053581953048706, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5647157430648804, 'eval_runtime': 7.1543, 'eval_samples_per_second': 389.275, 'eval_steps_per_second': 38.997, 'epoch': 0.17}
{'loss': 0.6773, 'grad_norm': 1.5967597961425781, 'learning_rate': 3.15e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5650637745857239, 'eval_runtime': 7.0283, 'eval_samples_per_second': 396.255, 'eval_steps_per_second': 39.697, 'epoch': 0.17}
{'loss': 0.8672, 'grad_norm': 2.085824489593506, 'learning_rate': 3.1e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5642657279968262, 'eval_runtime': 7.0435, 'eval_samples_per_second': 395.4, 'eval_steps_per_second': 39.611, 'epoch': 0.17}
{'loss': 0.6507, 'grad_norm': 1.6655689477920532, 'learning_rate': 3.05e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5644684433937073, 'eval_runtime': 7.0704, 'eval_samples_per_second': 393.895, 'eval_steps_per_second': 39.46, 'epoch': 0.17}
{'loss': 0.6717, 'grad_norm': 1.4583196640014648, 'learning_rate': 3e-05, 'epoch': 0.17}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5652354955673218, 'eval_runtime': 7.1197, 'eval_samples_per_second': 391.167, 'eval_steps_per_second': 39.187, 'epoch': 0.17}
{'loss': 0.7319, 'grad_norm': 1.9978026151657104, 'learning_rate': 2.95e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5655685663223267, 'eval_runtime': 6.9719, 'eval_samples_per_second': 399.461, 'eval_steps_per_second': 40.018, 'epoch': 0.18}
{'loss': 0.6258, 'grad_norm': 1.3695390224456787, 'learning_rate': 2.9e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5664507746696472, 'eval_runtime': 6.986, 'eval_samples_per_second': 398.655, 'eval_steps_per_second': 39.937, 'epoch': 0.18}
{'loss': 0.6913, 'grad_norm': 1.6940181255340576, 'learning_rate': 2.8499999999999998e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5677338242530823, 'eval_runtime': 7.415, 'eval_samples_per_second': 375.588, 'eval_steps_per_second': 37.626, 'epoch': 0.18}
{'loss': 0.683, 'grad_norm': 1.934493899345398, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5677632689476013, 'eval_runtime': 7.2269, 'eval_samples_per_second': 385.366, 'eval_steps_per_second': 38.606, 'epoch': 0.18}
{'loss': 0.6946, 'grad_norm': 1.790331482887268, 'learning_rate': 2.7500000000000004e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5671019554138184, 'eval_runtime': 7.1588, 'eval_samples_per_second': 389.032, 'eval_steps_per_second': 38.973, 'epoch': 0.18}
{'loss': 0.7417, 'grad_norm': 2.0152511596679688, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.565534234046936, 'eval_runtime': 7.2595, 'eval_samples_per_second': 383.634, 'eval_steps_per_second': 38.432, 'epoch': 0.18}
{'loss': 0.6171, 'grad_norm': 1.3025025129318237, 'learning_rate': 2.6500000000000004e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5639768838882446, 'eval_runtime': 7.8932, 'eval_samples_per_second': 352.836, 'eval_steps_per_second': 35.347, 'epoch': 0.18}
{'loss': 0.7057, 'grad_norm': 2.0961549282073975, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.18}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5615280866622925, 'eval_runtime': 7.5041, 'eval_samples_per_second': 371.13, 'eval_steps_per_second': 37.18, 'epoch': 0.18}
{'loss': 0.5834, 'grad_norm': 1.4091992378234863, 'learning_rate': 2.5500000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5596816539764404, 'eval_runtime': 6.9918, 'eval_samples_per_second': 398.326, 'eval_steps_per_second': 39.904, 'epoch': 0.19}
{'loss': 0.6587, 'grad_norm': 1.4658902883529663, 'learning_rate': 2.5e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5583574175834656, 'eval_runtime': 6.985, 'eval_samples_per_second': 398.714, 'eval_steps_per_second': 39.943, 'epoch': 0.19}
{'loss': 0.6836, 'grad_norm': 1.417275309562683, 'learning_rate': 2.45e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5566409826278687, 'eval_runtime': 7.0877, 'eval_samples_per_second': 392.934, 'eval_steps_per_second': 39.364, 'epoch': 0.19}
{'loss': 0.7546, 'grad_norm': 1.5264508724212646, 'learning_rate': 2.4e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5555775761604309, 'eval_runtime': 7.1924, 'eval_samples_per_second': 387.214, 'eval_steps_per_second': 38.791, 'epoch': 0.19}
{'loss': 0.7871, 'grad_norm': 1.990151286125183, 'learning_rate': 2.35e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5545156002044678, 'eval_runtime': 6.9398, 'eval_samples_per_second': 401.306, 'eval_steps_per_second': 40.203, 'epoch': 0.19}
{'loss': 0.8203, 'grad_norm': 1.9443832635879517, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5536768436431885, 'eval_runtime': 6.8239, 'eval_samples_per_second': 408.127, 'eval_steps_per_second': 40.886, 'epoch': 0.19}
{'loss': 0.8085, 'grad_norm': 1.993800401687622, 'learning_rate': 2.25e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5528666377067566, 'eval_runtime': 6.9156, 'eval_samples_per_second': 402.712, 'eval_steps_per_second': 40.343, 'epoch': 0.19}
{'loss': 0.7575, 'grad_norm': 1.8442636728286743, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.19}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5518046617507935, 'eval_runtime': 6.84, 'eval_samples_per_second': 407.166, 'eval_steps_per_second': 40.79, 'epoch': 0.19}
{'loss': 0.6611, 'grad_norm': 1.5060871839523315, 'learning_rate': 2.15e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5511572957038879, 'eval_runtime': 6.8373, 'eval_samples_per_second': 407.324, 'eval_steps_per_second': 40.806, 'epoch': 0.2}
{'loss': 0.6346, 'grad_norm': 1.410455584526062, 'learning_rate': 2.1e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.550440788269043, 'eval_runtime': 6.8023, 'eval_samples_per_second': 409.419, 'eval_steps_per_second': 41.015, 'epoch': 0.2}
{'loss': 0.6309, 'grad_norm': 1.577312707901001, 'learning_rate': 2.05e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.549826443195343, 'eval_runtime': 6.8789, 'eval_samples_per_second': 404.859, 'eval_steps_per_second': 40.559, 'epoch': 0.2}
{'loss': 0.6647, 'grad_norm': 1.9582723379135132, 'learning_rate': 2e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5489954948425293, 'eval_runtime': 6.8791, 'eval_samples_per_second': 404.847, 'eval_steps_per_second': 40.557, 'epoch': 0.2}
{'loss': 0.6407, 'grad_norm': 1.8546961545944214, 'learning_rate': 1.9500000000000003e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.548554539680481, 'eval_runtime': 6.8205, 'eval_samples_per_second': 408.326, 'eval_steps_per_second': 40.906, 'epoch': 0.2}
{'loss': 0.7395, 'grad_norm': 2.165322780609131, 'learning_rate': 1.9e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5486090779304504, 'eval_runtime': 6.955, 'eval_samples_per_second': 400.43, 'eval_steps_per_second': 40.115, 'epoch': 0.2}
{'loss': 0.6488, 'grad_norm': 1.7290291786193848, 'learning_rate': 1.85e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5484023094177246, 'eval_runtime': 7.1565, 'eval_samples_per_second': 389.157, 'eval_steps_per_second': 38.986, 'epoch': 0.2}
{'loss': 0.6941, 'grad_norm': 1.5269039869308472, 'learning_rate': 1.8e-05, 'epoch': 0.2}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5480608344078064, 'eval_runtime': 7.4471, 'eval_samples_per_second': 373.971, 'eval_steps_per_second': 37.464, 'epoch': 0.2}
{'loss': 0.6308, 'grad_norm': 1.8658947944641113, 'learning_rate': 1.75e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5476962924003601, 'eval_runtime': 6.9836, 'eval_samples_per_second': 398.79, 'eval_steps_per_second': 39.951, 'epoch': 0.21}
{'loss': 0.8925, 'grad_norm': 1.7551965713500977, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.547685444355011, 'eval_runtime': 6.9244, 'eval_samples_per_second': 402.203, 'eval_steps_per_second': 40.293, 'epoch': 0.21}
{'loss': 0.587, 'grad_norm': 1.5964566469192505, 'learning_rate': 1.65e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5474446415901184, 'eval_runtime': 7.2654, 'eval_samples_per_second': 383.326, 'eval_steps_per_second': 38.401, 'epoch': 0.21}
{'loss': 0.7853, 'grad_norm': 2.3299074172973633, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5469081401824951, 'eval_runtime': 7.2217, 'eval_samples_per_second': 385.642, 'eval_steps_per_second': 38.633, 'epoch': 0.21}
{'loss': 0.7282, 'grad_norm': 1.435206651687622, 'learning_rate': 1.55e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5462746024131775, 'eval_runtime': 8.0517, 'eval_samples_per_second': 345.89, 'eval_steps_per_second': 34.651, 'epoch': 0.21}
{'loss': 0.706, 'grad_norm': 2.079233169555664, 'learning_rate': 1.5e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5455302596092224, 'eval_runtime': 8.2127, 'eval_samples_per_second': 339.107, 'eval_steps_per_second': 33.972, 'epoch': 0.21}
{'loss': 0.6426, 'grad_norm': 1.6699204444885254, 'learning_rate': 1.45e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5451833009719849, 'eval_runtime': 7.821, 'eval_samples_per_second': 356.094, 'eval_steps_per_second': 35.673, 'epoch': 0.21}
{'loss': 0.8009, 'grad_norm': 2.4243690967559814, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.21}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5451188683509827, 'eval_runtime': 7.1075, 'eval_samples_per_second': 391.839, 'eval_steps_per_second': 39.254, 'epoch': 0.21}
{'loss': 0.7858, 'grad_norm': 2.1564557552337646, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5452870726585388, 'eval_runtime': 7.3659, 'eval_samples_per_second': 378.092, 'eval_steps_per_second': 37.877, 'epoch': 0.22}
{'loss': 0.7802, 'grad_norm': 2.0039234161376953, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5456883907318115, 'eval_runtime': 7.9307, 'eval_samples_per_second': 351.166, 'eval_steps_per_second': 35.18, 'epoch': 0.22}
{'loss': 0.7044, 'grad_norm': 2.081120729446411, 'learning_rate': 1.25e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5459503531455994, 'eval_runtime': 8.1052, 'eval_samples_per_second': 343.608, 'eval_steps_per_second': 34.422, 'epoch': 0.22}
{'loss': 0.643, 'grad_norm': 1.529460072517395, 'learning_rate': 1.2e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.546285092830658, 'eval_runtime': 9.1415, 'eval_samples_per_second': 304.656, 'eval_steps_per_second': 30.52, 'epoch': 0.22}
{'loss': 0.7119, 'grad_norm': 2.0789923667907715, 'learning_rate': 1.1500000000000002e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5464292168617249, 'eval_runtime': 7.7834, 'eval_samples_per_second': 357.814, 'eval_steps_per_second': 35.846, 'epoch': 0.22}
{'loss': 0.5952, 'grad_norm': 1.3462274074554443, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5463478565216064, 'eval_runtime': 7.6667, 'eval_samples_per_second': 363.261, 'eval_steps_per_second': 36.391, 'epoch': 0.22}
{'loss': 0.7152, 'grad_norm': 1.3789838552474976, 'learning_rate': 1.05e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5461651086807251, 'eval_runtime': 8.1935, 'eval_samples_per_second': 339.904, 'eval_steps_per_second': 34.051, 'epoch': 0.22}
{'loss': 0.711, 'grad_norm': 1.9379148483276367, 'learning_rate': 1e-05, 'epoch': 0.22}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5458501577377319, 'eval_runtime': 7.2706, 'eval_samples_per_second': 383.05, 'eval_steps_per_second': 38.374, 'epoch': 0.22}
{'loss': 0.657, 'grad_norm': 1.3259408473968506, 'learning_rate': 9.5e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.54549241065979, 'eval_runtime': 7.5568, 'eval_samples_per_second': 368.544, 'eval_steps_per_second': 36.921, 'epoch': 0.23}
{'loss': 0.6478, 'grad_norm': 1.5945467948913574, 'learning_rate': 9e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5452901721000671, 'eval_runtime': 7.1192, 'eval_samples_per_second': 391.195, 'eval_steps_per_second': 39.19, 'epoch': 0.23}
{'loss': 0.5773, 'grad_norm': 1.4843013286590576, 'learning_rate': 8.500000000000002e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5453887581825256, 'eval_runtime': 7.7758, 'eval_samples_per_second': 358.16, 'eval_steps_per_second': 35.88, 'epoch': 0.23}
{'loss': 0.6136, 'grad_norm': 1.5078462362289429, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.545451819896698, 'eval_runtime': 8.0061, 'eval_samples_per_second': 347.86, 'eval_steps_per_second': 34.848, 'epoch': 0.23}
{'loss': 0.5279, 'grad_norm': 1.5333166122436523, 'learning_rate': 7.5e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5453025698661804, 'eval_runtime': 8.3706, 'eval_samples_per_second': 332.712, 'eval_steps_per_second': 33.331, 'epoch': 0.23}
{'loss': 0.7826, 'grad_norm': 1.6386840343475342, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5452343821525574, 'eval_runtime': 7.6145, 'eval_samples_per_second': 365.747, 'eval_steps_per_second': 36.64, 'epoch': 0.23}
{'loss': 0.6748, 'grad_norm': 1.3181158304214478, 'learning_rate': 6.5000000000000004e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5452266335487366, 'eval_runtime': 8.114, 'eval_samples_per_second': 343.234, 'eval_steps_per_second': 34.385, 'epoch': 0.23}
{'loss': 0.8695, 'grad_norm': 1.7330232858657837, 'learning_rate': 6e-06, 'epoch': 0.23}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5452199578285217, 'eval_runtime': 7.6921, 'eval_samples_per_second': 362.061, 'eval_steps_per_second': 36.271, 'epoch': 0.23}
{'loss': 0.6346, 'grad_norm': 1.376381516456604, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5450654029846191, 'eval_runtime': 7.9566, 'eval_samples_per_second': 350.025, 'eval_steps_per_second': 35.065, 'epoch': 0.24}
{'loss': 0.745, 'grad_norm': 1.674022912979126, 'learning_rate': 5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5448682308197021, 'eval_runtime': 8.4924, 'eval_samples_per_second': 327.94, 'eval_steps_per_second': 32.853, 'epoch': 0.24}
{'loss': 0.629, 'grad_norm': 1.5024123191833496, 'learning_rate': 4.5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5446707010269165, 'eval_runtime': 7.2443, 'eval_samples_per_second': 384.442, 'eval_steps_per_second': 38.513, 'epoch': 0.24}
{'loss': 0.6794, 'grad_norm': 1.7146995067596436, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5446447730064392, 'eval_runtime': 7.418, 'eval_samples_per_second': 375.437, 'eval_steps_per_second': 37.611, 'epoch': 0.24}
{'loss': 0.6574, 'grad_norm': 1.7371456623077393, 'learning_rate': 3.5000000000000004e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.544526219367981, 'eval_runtime': 7.2197, 'eval_samples_per_second': 385.749, 'eval_steps_per_second': 38.644, 'epoch': 0.24}
{'loss': 0.7958, 'grad_norm': 1.5077793598175049, 'learning_rate': 3e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5444605350494385, 'eval_runtime': 7.3497, 'eval_samples_per_second': 378.928, 'eval_steps_per_second': 37.961, 'epoch': 0.24}
{'loss': 0.8127, 'grad_norm': 1.725454330444336, 'learning_rate': 2.5e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5443134903907776, 'eval_runtime': 7.0288, 'eval_samples_per_second': 396.226, 'eval_steps_per_second': 39.694, 'epoch': 0.24}
{'loss': 0.7032, 'grad_norm': 1.8584057092666626, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.24}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5442902445793152, 'eval_runtime': 7.1082, 'eval_samples_per_second': 391.803, 'eval_steps_per_second': 39.251, 'epoch': 0.24}
{'loss': 0.6456, 'grad_norm': 1.6674336194992065, 'learning_rate': 1.5e-06, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.54422926902771, 'eval_runtime': 6.8983, 'eval_samples_per_second': 403.721, 'eval_steps_per_second': 40.445, 'epoch': 0.25}
{'loss': 0.6576, 'grad_norm': 1.1974949836730957, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.544135570526123, 'eval_runtime': 7.3743, 'eval_samples_per_second': 377.663, 'eval_steps_per_second': 37.834, 'epoch': 0.25}
{'loss': 0.8025, 'grad_norm': 2.0363376140594482, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5441548228263855, 'eval_runtime': 6.7366, 'eval_samples_per_second': 413.414, 'eval_steps_per_second': 41.416, 'epoch': 0.25}
{'loss': 0.6841, 'grad_norm': 1.5934981107711792, 'learning_rate': 0.0, 'epoch': 0.25}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.5440627336502075, 'eval_runtime': 6.9093, 'eval_samples_per_second': 403.078, 'eval_steps_per_second': 40.38, 'epoch': 0.25}
{'train_runtime': 2219.7004, 'train_samples_per_second': 0.901, 'train_steps_per_second': 0.09, 'train_loss': 0.7634830382466317, 'epoch': 0.25}


TrainOutput(global_step=200, training_loss=0.7634830382466317, metrics={'train_runtime': 2219.7004, 'train_samples_per_second': 0.901, 'train_steps_per_second': 0.09, 'total_flos': 20375470080000.0, 'train_loss': 0.7634830382466317, 'epoch': 0.24875621890547264})

In [12]:
trainer_jepa.evaluate(valwindowds)
wandb.finish()

  0%|          | 0/279 [00:00<?, ?it/s]

0,1
eval/loss,█▆▆▆▅▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▄▆▅▄▄▃▃▄▆▃▃▃▃▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁
eval/samples_per_second,▁▁▁▂▁▂▂▁▁▃▃▃▂▃▃▄▄▄██▇▇▇▇▆▇▇▇▇▇▇▇▇█▆▇▇▇██
eval/steps_per_second,▁▁▃▃▃▂▂▃▃▃▄▄▄▆▇█▆▇▇▇▇▇████▇████▇▇▇▆▇▇▇██
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇██
train/grad_norm,█▄▃▃▃▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▃▁▂▂▂▂▂▁▂▁▂▂▂▂▁▁▁▁▂
train/learning_rate,█▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁
train/loss,█▄▃▂▄▃▃▃▃▂▁▂▃▂▃▃▂▂▂▂▂▂▃▃▂▁▂▂▂▁▁▂▂▂▂▁▁▂▂▃

0,1
eval/loss,0.54406
eval/runtime,6.6914
eval/samples_per_second,416.204
eval/steps_per_second,41.695
total_flos,20375470080000.0
train/epoch,0.24876
train/global_step,200.0
train/grad_norm,1.5935
train/learning_rate,0.0
train/loss,0.6841
