In [11]:
import sys
sys.path.append('..')

import random

import numpy as np
import pandas as pd
import torch
import transformers

from transformers import PatchTSTConfig, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import PatchTSTForPrediction as PatchTSTForPredictionOG
from TimeSeriesJEPA.models import PatchTSTModelJEPA, PatchTSTForPrediction
from TimeSeriesJEPA.datasets.benchmark_dataset import BenchmarkDataset

In [12]:
# --- Experiment configuration ---------------------------------------------
# Location of the benchmark CSV; this is a machine-specific absolute path and
# must be adjusted when the notebook is run elsewhere.
csv_path = r"D:\Coursework\MTS\dataset\ETT-small\ETTh2.csv"
dataset = "ETTh2"
num_workers = 4  # Reduce this if you have low number of CPU cores
batch_size = 32  # Reduce if not enough GPU memory available
context_length = 512  # length of the input window passed to the datasets/model
forecast_horizon = 96  # length of the prediction window
patch_length = 8
num_input_channels = 7

# Seed every RNG the notebook imports so that model initialisation (which
# happens before any Trainer is created) is repeatable across
# "Restart Kernel -> Run All" executions.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
_ = torch.manual_seed(SEED)  # assigned to `_` so the cell shows no output

In [13]:

# Build the windowed datasets. Both share the same CSV and window geometry and
# differ only in the split selected through `flag`.
# NOTE(review): the dataset named `valwindowds` is built with flag='test', and
# the training cells pass it as `eval_dataset` for early stopping and
# best-checkpoint selection. If BenchmarkDataset offers a separate validation
# split, model selection should probably use that instead -- confirm.
shared_window_kwargs = dict(
    csv_path=csv_path,
    context_length=context_length,
    prediction_length=forecast_horizon,
    returndict=True,
)
trainwindowds = BenchmarkDataset(flag='train', **shared_window_kwargs)
valwindowds = BenchmarkDataset(flag='test', **shared_window_kwargs)
print("dataset loaded, total size: ", len(trainwindowds), len(valwindowds))

Total data size:  (17420, 7)
Total data size:  (17420, 7)
dataset loaded, total size:  8033 2785


In [14]:
print("Loading prediction model")

# Baseline: the stock Hugging Face PatchTSTForPrediction (imported as
# PatchTSTForPredictionOG), constructed directly from this config; no
# pretrained checkpoint is loaded in this cell.
config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    # Stride tied to the patch length, as in the JEPA fine-tuning config
    # further down the notebook, instead of a second hard-coded 8.
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    d_model=64,
    num_attention_heads=4,
    num_hidden_layers=3,
    ffn_dim=64,
    dropout=0.05,
    head_dropout=0.2,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
    positional_encoding_type="sincos",
)

model = PatchTSTForPredictionOG(config=config)

Loading prediction model


In [15]:
# Place the model on the GPU when CUDA is available and fall back to the CPU
# otherwise, so this cell no longer raises on machines without a GPU.
# `.to()` returns the module, so the cell still displays the model repr.
model.to("cuda" if torch.cuda.is_available() else "cpu")

PatchTSTForPrediction(
  (model): PatchTSTModel(
    (scaler): PatchTSTScaler(
      (scaler): PatchTSTStdScaler()
    )
    (patchifier): PatchTSTPatchify()
    (masking): Identity()
    (encoder): PatchTSTEncoder(
      (embedder): PatchTSTEmbedding(
        (input_embedding): Linear(in_features=8, out_features=64, bias=True)
      )
      (positional_encoder): PatchTSTPositionalEncoding(
        (positional_dropout): Identity()
      )
      (layers): ModuleList(
        (0-2): 3 x PatchTSTEncoderLayer(
          (self_attn): PatchTSTAttention(
            (k_proj): Linear(in_features=64, out_features=64, bias=True)
            (v_proj): Linear(in_features=64, out_features=64, bias=True)
            (q_proj): Linear(in_features=64, out_features=64, bias=True)
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
          (dropout_path1): Identity()
          (norm_sublayer1): PatchTSTBatchNorm(
            (batchnorm): BatchNorm1d(64, eps=1e-05, mom

In [16]:
# Count the parameters of the full prediction model that require gradients.
# The original built this filter but then summed over `model.parameters()`
# and labelled the result "encoder parameters"; the filter is now actually
# used and the label describes what is counted.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(p.numel() for p in model_parameters)
print("trainable parameters: ", params)

encoder parameters:  473632


In [17]:
# Trainer configuration for fine-tuning the baseline model.
# Paths use forward slashes, which both Windows and POSIX accept; the previous
# backslash-separated strings only resolved to nested directories on Windows.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
train_args = TrainingArguments(
    output_dir="checkpoints/finetuned_og",
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=30,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir="checkpoints/finetuned_og/logs",
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
)

# Stop training when eval_loss has not improved by at least the threshold for
# `early_stopping_patience` consecutive evaluations (one per epoch here).
# NOTE(review): patience is 15 here but 5 in the JEPA fine-tuning cell --
# confirm the asymmetry between the two runs is intentional.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=15,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.001,  # Minimum improvement required to consider as improvement
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
    callbacks=[early_stopping_callback],
)



In [18]:
# Run fine-tuning; the returned training summary is left as the final
# expression so the cell displays it.
print("\n\nDoing forecasting training")
train_result = trainer.train()
train_result



Doing forecasting training


  0%|          | 0/7560 [00:00<?, ?it/s]

{'loss': 0.4713, 'grad_norm': 0.5749430656433105, 'learning_rate': 9.666666666666667e-05, 'epoch': 1.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2816503643989563, 'eval_runtime': 5.5824, 'eval_samples_per_second': 498.889, 'eval_steps_per_second': 15.764, 'epoch': 1.0}
{'loss': 0.4468, 'grad_norm': 32.79212951660156, 'learning_rate': 9.333333333333334e-05, 'epoch': 2.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27998584508895874, 'eval_runtime': 5.6071, 'eval_samples_per_second': 496.691, 'eval_steps_per_second': 15.694, 'epoch': 2.0}
{'loss': 0.37, 'grad_norm': 0.45317402482032776, 'learning_rate': 9e-05, 'epoch': 3.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27863338589668274, 'eval_runtime': 5.6284, 'eval_samples_per_second': 494.81, 'eval_steps_per_second': 15.635, 'epoch': 3.0}
{'loss': 0.3407, 'grad_norm': 1.025422215461731, 'learning_rate': 8.666666666666667e-05, 'epoch': 4.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.284548282623291, 'eval_runtime': 5.5568, 'eval_samples_per_second': 501.185, 'eval_steps_per_second': 15.836, 'epoch': 4.0}
{'loss': 0.325, 'grad_norm': 0.7343245148658752, 'learning_rate': 8.333333333333334e-05, 'epoch': 5.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.28048738837242126, 'eval_runtime': 5.5638, 'eval_samples_per_second': 500.559, 'eval_steps_per_second': 15.817, 'epoch': 5.0}
{'loss': 0.3131, 'grad_norm': 4.370523452758789, 'learning_rate': 8e-05, 'epoch': 6.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2909912168979645, 'eval_runtime': 5.5623, 'eval_samples_per_second': 500.689, 'eval_steps_per_second': 15.821, 'epoch': 6.0}
{'loss': 0.3001, 'grad_norm': 0.7435413002967834, 'learning_rate': 7.666666666666667e-05, 'epoch': 7.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2877395749092102, 'eval_runtime': 5.5189, 'eval_samples_per_second': 504.626, 'eval_steps_per_second': 15.945, 'epoch': 7.0}
{'loss': 0.2917, 'grad_norm': 1.6160026788711548, 'learning_rate': 7.333333333333333e-05, 'epoch': 8.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.29124587774276733, 'eval_runtime': 5.5208, 'eval_samples_per_second': 504.457, 'eval_steps_per_second': 15.94, 'epoch': 8.0}
{'loss': 0.2839, 'grad_norm': 0.6064866781234741, 'learning_rate': 7e-05, 'epoch': 9.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2951210141181946, 'eval_runtime': 5.6178, 'eval_samples_per_second': 495.741, 'eval_steps_per_second': 15.664, 'epoch': 9.0}
{'loss': 0.2774, 'grad_norm': 1.4424302577972412, 'learning_rate': 6.666666666666667e-05, 'epoch': 10.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2934449315071106, 'eval_runtime': 5.5025, 'eval_samples_per_second': 506.133, 'eval_steps_per_second': 15.993, 'epoch': 10.0}
{'loss': 0.273, 'grad_norm': 0.7836481928825378, 'learning_rate': 6.333333333333333e-05, 'epoch': 11.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2899249792098999, 'eval_runtime': 5.5801, 'eval_samples_per_second': 499.099, 'eval_steps_per_second': 15.77, 'epoch': 11.0}
{'loss': 0.27, 'grad_norm': 2.9216580390930176, 'learning_rate': 6e-05, 'epoch': 12.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3024137616157532, 'eval_runtime': 5.4336, 'eval_samples_per_second': 512.556, 'eval_steps_per_second': 16.196, 'epoch': 12.0}
{'loss': 0.265, 'grad_norm': 1.4880750179290771, 'learning_rate': 5.666666666666667e-05, 'epoch': 13.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.29816505312919617, 'eval_runtime': 5.4839, 'eval_samples_per_second': 507.85, 'eval_steps_per_second': 16.047, 'epoch': 13.0}
{'loss': 0.2612, 'grad_norm': 1.1776795387268066, 'learning_rate': 5.333333333333333e-05, 'epoch': 14.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3091077208518982, 'eval_runtime': 5.4832, 'eval_samples_per_second': 507.918, 'eval_steps_per_second': 16.049, 'epoch': 14.0}
{'loss': 0.257, 'grad_norm': 0.9793025255203247, 'learning_rate': 5e-05, 'epoch': 15.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3193683326244354, 'eval_runtime': 5.4348, 'eval_samples_per_second': 512.443, 'eval_steps_per_second': 16.192, 'epoch': 15.0}
{'loss': 0.2541, 'grad_norm': 1.699536681175232, 'learning_rate': 4.666666666666667e-05, 'epoch': 16.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.30794575810432434, 'eval_runtime': 5.4493, 'eval_samples_per_second': 511.078, 'eval_steps_per_second': 16.149, 'epoch': 16.0}
{'loss': 0.2511, 'grad_norm': 3.5143752098083496, 'learning_rate': 4.3333333333333334e-05, 'epoch': 17.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3022763729095459, 'eval_runtime': 5.5061, 'eval_samples_per_second': 505.806, 'eval_steps_per_second': 15.982, 'epoch': 17.0}
{'loss': 0.2486, 'grad_norm': 4.148008823394775, 'learning_rate': 4e-05, 'epoch': 18.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3059213161468506, 'eval_runtime': 5.6237, 'eval_samples_per_second': 495.224, 'eval_steps_per_second': 15.648, 'epoch': 18.0}
{'train_runtime': 288.3963, 'train_samples_per_second': 835.621, 'train_steps_per_second': 26.214, 'train_loss': 0.30554847448170713, 'epoch': 18.0}


TrainOutput(global_step=4536, training_loss=0.30554847448170713, metrics={'train_runtime': 288.3963, 'train_samples_per_second': 835.621, 'train_steps_per_second': 26.214, 'total_flos': 1472687363653632.0, 'train_loss': 0.30554847448170713, 'epoch': 18.0})

In [19]:
# Score the model currently held by the trainer on the evaluation windows; the
# returned metrics dict is the cell output.
trainer.evaluate(eval_dataset=valwindowds)

  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27863338589668274,
 'eval_runtime': 5.3796,
 'eval_samples_per_second': 517.695,
 'eval_steps_per_second': 16.358,
 'epoch': 18.0}

In [4]:
# Checkpoint directory of the pretrained JEPA encoder. This is a
# machine-specific absolute path and must be adjusted on other machines.
pretrained_encoder_dir = r"D:\Coursework\MTS\timeseriesJEPA\results\PatchTST_etth2_sl512_enc_dm64_nh4_el3_fd64_pred_dm32_nh2_el1_fd32_bs256_lr0.0001_pe50_data_nenc1_npred4_clean_data\checkpoint-1600"

print("Loading pretrained encoder model")
encoder_model = PatchTSTModelJEPA.from_pretrained(pretrained_encoder_dir)
print("Done")
# Use the GPU when CUDA is available, otherwise stay on the CPU instead of
# raising. `.to()` returns the module, so the repr is still the cell output.
encoder_model.to("cuda" if torch.cuda.is_available() else "cpu")

Loading pretrained encoder model
Done


PatchTSTModelJEPA(
  (scaler): PatchTSTScaler(
    (scaler): PatchTSTStdScaler()
  )
  (patchifier): PatchTSTPatchify()
  (encoder): PatchTSTEncoder(
    (embedder): PatchTSTEmbedding(
      (input_embedding): Linear(in_features=8, out_features=64, bias=True)
    )
    (positional_encoder): PatchTSTPositionalEncoding(
      (positional_dropout): Identity()
    )
    (layers): ModuleList(
      (0-2): 3 x PatchTSTEncoderLayer(
        (self_attn): PatchTSTAttention(
          (k_proj): Linear(in_features=64, out_features=64, bias=True)
          (v_proj): Linear(in_features=64, out_features=64, bias=True)
          (q_proj): Linear(in_features=64, out_features=64, bias=True)
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
        )
        (dropout_path1): Identity()
        (norm_sublayer1): PatchTSTBatchNorm(
          (batchnorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (ff): Sequential(
          (0):

In [5]:
print("Loading prediction model")

# Config for the prediction model built around the pretrained encoder.
# `num_hidden_layers` is not set here, so PatchTSTConfig's default applies.
# NOTE(review): ffn_dim is 128 here, 64 in the baseline config, and the
# checkpoint directory name contains "fd64". Whether these config values
# affect an encoder passed in via `encoder_model` cannot be seen from this
# notebook -- confirm.
config = PatchTSTConfig(
    do_mask_input=False,
    context_length=context_length,
    patch_length=patch_length,
    num_input_channels=num_input_channels,
    patch_stride=patch_length,
    prediction_length=forecast_horizon,
    d_model=64,
    num_attention_heads=4,
    ffn_dim=128,
    dropout=0.05,
    head_dropout=0.2,
    pooling_type=None,
    channel_attention=False,
    scaling="std",
    loss="mse",
    pre_norm=True,
    norm_type="batchnorm",
    positional_encoding_type="sincos",
)

model = PatchTSTForPrediction(config=config, encoder_model=encoder_model)
# Use the GPU when CUDA is available, otherwise stay on the CPU instead of
# raising. `.to()` returns the module, so the repr is still the cell output.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Loading prediction model


PatchTSTForPrediction(
  (model): PatchTSTModelJEPA(
    (scaler): PatchTSTScaler(
      (scaler): PatchTSTStdScaler()
    )
    (patchifier): PatchTSTPatchify()
    (encoder): PatchTSTEncoder(
      (embedder): PatchTSTEmbedding(
        (input_embedding): Linear(in_features=8, out_features=64, bias=True)
      )
      (positional_encoder): PatchTSTPositionalEncoding(
        (positional_dropout): Identity()
      )
      (layers): ModuleList(
        (0-2): 3 x PatchTSTEncoderLayer(
          (self_attn): PatchTSTAttention(
            (k_proj): Linear(in_features=64, out_features=64, bias=True)
            (v_proj): Linear(in_features=64, out_features=64, bias=True)
            (q_proj): Linear(in_features=64, out_features=64, bias=True)
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
          (dropout_path1): Identity()
          (norm_sublayer1): PatchTSTBatchNorm(
            (batchnorm): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True

In [6]:
# Count the parameters of the prediction model that require gradients.
# The value was previously printed as "encoder parameters", but it is computed
# over `model` (the whole prediction model) restricted to requires_grad=True,
# so the label now says what is actually counted.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(p.numel() for p in model_parameters)
print("trainable parameters: ", params)

encoder parameters:  393312


In [7]:
# Total number of scalar parameters held by the pretrained encoder, regardless
# of whether they require gradients.
params = 0
for encoder_param in encoder_model.parameters():
    params += np.prod(encoder_param.size())
print("encoder parameters: ", params)

encoder parameters:  80448


In [8]:
# Trainer configuration for fine-tuning the prediction model that wraps the
# pretrained encoder.
# Paths use forward slashes, which both Windows and POSIX accept; the previous
# backslash-separated strings only resolved to nested directories on Windows.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
train_args = TrainingArguments(
    output_dir="checkpoints/finetuned",
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=30,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=1,  # num_workers,
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir="checkpoints/finetuned/logs",
    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
    label_names=["future_values"],
)

# Stop training when eval_loss has not improved by at least the threshold for
# `early_stopping_patience` consecutive evaluations (one per epoch here).
# NOTE(review): patience is 5 here but 15 in the baseline training cell --
# confirm the asymmetry between the two runs is intentional.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.001,  # Minimum improvement required to consider as improvement
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=trainwindowds,
    eval_dataset=valwindowds,
    callbacks=[early_stopping_callback],
)



In [9]:
# Run fine-tuning; the returned training summary is left as the final
# expression so the cell displays it.
print("\n\nDoing forecasting training")
train_result = trainer.train()
train_result





Doing forecasting training


[34m[1mwandb[0m: Currently logged in as: [33mvg2523[0m ([33mhpml_4[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  0%|          | 0/7560 [00:00<?, ?it/s]

{'loss': 0.7263, 'grad_norm': 2.5108461380004883, 'learning_rate': 9.666666666666667e-05, 'epoch': 1.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.3334067761898041, 'eval_runtime': 5.9996, 'eval_samples_per_second': 464.201, 'eval_steps_per_second': 14.668, 'epoch': 1.0}
{'loss': 0.6777, 'grad_norm': 33.01469039916992, 'learning_rate': 9.333333333333334e-05, 'epoch': 2.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.31504175066947937, 'eval_runtime': 6.245, 'eval_samples_per_second': 445.956, 'eval_steps_per_second': 14.091, 'epoch': 2.0}
{'loss': 0.6108, 'grad_norm': 1.0968929529190063, 'learning_rate': 9e-05, 'epoch': 3.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.300729364156723, 'eval_runtime': 6.1188, 'eval_samples_per_second': 455.153, 'eval_steps_per_second': 14.382, 'epoch': 3.0}
{'loss': 0.5898, 'grad_norm': 2.0467689037323, 'learning_rate': 8.666666666666667e-05, 'epoch': 4.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2985295355319977, 'eval_runtime': 5.8769, 'eval_samples_per_second': 473.89, 'eval_steps_per_second': 14.974, 'epoch': 4.0}
{'loss': 0.575, 'grad_norm': 1.5776655673980713, 'learning_rate': 8.333333333333334e-05, 'epoch': 5.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.29450955986976624, 'eval_runtime': 6.3028, 'eval_samples_per_second': 441.868, 'eval_steps_per_second': 13.962, 'epoch': 5.0}
{'loss': 0.5602, 'grad_norm': 3.413454294204712, 'learning_rate': 8e-05, 'epoch': 6.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.28875112533569336, 'eval_runtime': 6.284, 'eval_samples_per_second': 443.187, 'eval_steps_per_second': 14.004, 'epoch': 6.0}
{'loss': 0.5484, 'grad_norm': 1.235352873802185, 'learning_rate': 7.666666666666667e-05, 'epoch': 7.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2941756844520569, 'eval_runtime': 6.2127, 'eval_samples_per_second': 448.276, 'eval_steps_per_second': 14.165, 'epoch': 7.0}
{'loss': 0.537, 'grad_norm': 0.8781893253326416, 'learning_rate': 7.333333333333333e-05, 'epoch': 8.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.28755107522010803, 'eval_runtime': 6.3184, 'eval_samples_per_second': 440.775, 'eval_steps_per_second': 13.928, 'epoch': 8.0}
{'loss': 0.5296, 'grad_norm': 1.1507357358932495, 'learning_rate': 7e-05, 'epoch': 9.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2840966582298279, 'eval_runtime': 6.1267, 'eval_samples_per_second': 454.567, 'eval_steps_per_second': 14.363, 'epoch': 9.0}
{'loss': 0.5225, 'grad_norm': 1.1606619358062744, 'learning_rate': 6.666666666666667e-05, 'epoch': 10.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2858034372329712, 'eval_runtime': 6.3759, 'eval_samples_per_second': 436.8, 'eval_steps_per_second': 13.802, 'epoch': 10.0}
{'loss': 0.5154, 'grad_norm': 2.3830277919769287, 'learning_rate': 6.333333333333333e-05, 'epoch': 11.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2837510108947754, 'eval_runtime': 5.6794, 'eval_samples_per_second': 490.372, 'eval_steps_per_second': 15.495, 'epoch': 11.0}
{'loss': 0.5083, 'grad_norm': 1.6433653831481934, 'learning_rate': 6e-05, 'epoch': 12.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2853296101093292, 'eval_runtime': 5.8123, 'eval_samples_per_second': 479.157, 'eval_steps_per_second': 15.14, 'epoch': 12.0}
{'loss': 0.5034, 'grad_norm': 1.2151379585266113, 'learning_rate': 5.666666666666667e-05, 'epoch': 13.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2808360457420349, 'eval_runtime': 5.5673, 'eval_samples_per_second': 500.242, 'eval_steps_per_second': 15.807, 'epoch': 13.0}
{'loss': 0.4966, 'grad_norm': 1.2579824924468994, 'learning_rate': 5.333333333333333e-05, 'epoch': 14.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2806485593318939, 'eval_runtime': 5.7633, 'eval_samples_per_second': 483.234, 'eval_steps_per_second': 15.269, 'epoch': 14.0}
{'loss': 0.4922, 'grad_norm': 0.8908719420433044, 'learning_rate': 5e-05, 'epoch': 15.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.28745144605636597, 'eval_runtime': 6.1165, 'eval_samples_per_second': 455.324, 'eval_steps_per_second': 14.387, 'epoch': 15.0}
{'loss': 0.488, 'grad_norm': 3.286694288253784, 'learning_rate': 4.666666666666667e-05, 'epoch': 16.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27890968322753906, 'eval_runtime': 6.3901, 'eval_samples_per_second': 435.833, 'eval_steps_per_second': 13.771, 'epoch': 16.0}
{'loss': 0.4842, 'grad_norm': 2.111264228820801, 'learning_rate': 4.3333333333333334e-05, 'epoch': 17.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2791726887226105, 'eval_runtime': 6.4604, 'eval_samples_per_second': 431.089, 'eval_steps_per_second': 13.621, 'epoch': 17.0}
{'loss': 0.4811, 'grad_norm': 4.9403300285339355, 'learning_rate': 4e-05, 'epoch': 18.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2785538136959076, 'eval_runtime': 6.125, 'eval_samples_per_second': 454.697, 'eval_steps_per_second': 14.367, 'epoch': 18.0}
{'loss': 0.4772, 'grad_norm': 1.5267564058303833, 'learning_rate': 3.6666666666666666e-05, 'epoch': 19.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2804473638534546, 'eval_runtime': 6.1344, 'eval_samples_per_second': 453.996, 'eval_steps_per_second': 14.345, 'epoch': 19.0}
{'loss': 0.4752, 'grad_norm': 2.27698016166687, 'learning_rate': 3.3333333333333335e-05, 'epoch': 20.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.28075912594795227, 'eval_runtime': 6.0702, 'eval_samples_per_second': 458.797, 'eval_steps_per_second': 14.497, 'epoch': 20.0}
{'loss': 0.4722, 'grad_norm': 1.8434979915618896, 'learning_rate': 3e-05, 'epoch': 21.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.276742547750473, 'eval_runtime': 6.4034, 'eval_samples_per_second': 434.927, 'eval_steps_per_second': 13.743, 'epoch': 21.0}
{'loss': 0.4704, 'grad_norm': 1.8174591064453125, 'learning_rate': 2.6666666666666667e-05, 'epoch': 22.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2773776352405548, 'eval_runtime': 5.9419, 'eval_samples_per_second': 468.704, 'eval_steps_per_second': 14.81, 'epoch': 22.0}
{'loss': 0.4687, 'grad_norm': 2.878575563430786, 'learning_rate': 2.3333333333333336e-05, 'epoch': 23.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.276972234249115, 'eval_runtime': 5.6282, 'eval_samples_per_second': 494.83, 'eval_steps_per_second': 15.636, 'epoch': 23.0}
{'loss': 0.4655, 'grad_norm': 0.8014932870864868, 'learning_rate': 2e-05, 'epoch': 24.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2766018509864807, 'eval_runtime': 6.1873, 'eval_samples_per_second': 450.114, 'eval_steps_per_second': 14.223, 'epoch': 24.0}
{'loss': 0.4641, 'grad_norm': 1.737629771232605, 'learning_rate': 1.6666666666666667e-05, 'epoch': 25.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.2783200144767761, 'eval_runtime': 6.3509, 'eval_samples_per_second': 438.521, 'eval_steps_per_second': 13.856, 'epoch': 25.0}
{'loss': 0.463, 'grad_norm': 2.5362839698791504, 'learning_rate': 1.3333333333333333e-05, 'epoch': 26.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27611565589904785, 'eval_runtime': 6.138, 'eval_samples_per_second': 453.734, 'eval_steps_per_second': 14.337, 'epoch': 26.0}
{'train_runtime': 379.1393, 'train_samples_per_second': 635.624, 'train_steps_per_second': 19.94, 'train_loss': 0.5231814401927012, 'epoch': 26.0}


TrainOutput(global_step=6552, training_loss=0.5231814401927012, metrics={'train_runtime': 379.1393, 'train_samples_per_second': 635.624, 'train_steps_per_second': 19.94, 'total_flos': 2127789964984320.0, 'train_loss': 0.5231814401927012, 'epoch': 26.0})

In [10]:
# Score the model currently held by the trainer on the evaluation windows; the
# returned metrics dict is the cell output.
trainer.evaluate(eval_dataset=valwindowds)

  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 0.27611565589904785,
 'eval_runtime': 5.4027,
 'eval_samples_per_second': 515.485,
 'eval_steps_per_second': 16.288,
 'epoch': 26.0}