In [7]:
from transformers import ViTConfig, ViTModel, ViTImageProcessor, ViTForImageClassification, ViTForMaskedImageModeling

In [67]:
IMAGE_SIZE = 64

processor = ViTImageProcessor(size=IMAGE_SIZE)
processor

ViTImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 64,
    "width": 64
  }
}

In [30]:
from datasets import load_dataset

test_dataset = load_dataset("uoft-cs/cifar100", split="test")
test_dataset

Dataset({
    features: ['img', 'fine_label', 'coarse_label'],
    num_rows: 10000
})

In [40]:
train_dataset = load_dataset("uoft-cs/cifar100", split="train")
train_dataset

Dataset({
    features: ['img', 'fine_label', 'coarse_label'],
    num_rows: 50000
})

In [56]:
train_dataset_50 = train_dataset.train_test_split(train_size=0.5, shuffle=True, stratify_by_column="fine_label")["train"]
train_dataset_50

Dataset({
    features: ['img', 'fine_label', 'coarse_label'],
    num_rows: 25000
})

In [11]:
def process_example(example):
    inputs = processor(example['img'], return_tensors='pt')
    inputs['labels'] = example['fine_label']
    return inputs

process_example(dataset["train"][0])

{'pixel_values': tensor([[[[ 1.0000,  1.0000,  1.0000,  ...,  0.6078,  0.4902,  0.4275],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.5137,  0.4118,  0.3569],
          [ 1.0000,  1.0000,  0.9922,  ...,  0.3255,  0.2549,  0.2157],
          ...,
          [-0.1137, -0.1137, -0.1137,  ..., -0.1451,  0.0196,  0.0980],
          [-0.2471, -0.2471, -0.2471,  ..., -0.1216,  0.0196,  0.0902],
          [-0.3176, -0.3176, -0.3098,  ..., -0.1137,  0.0196,  0.0824]],

         [[ 1.0000,  1.0000,  1.0000,  ...,  0.7020,  0.5843,  0.5216],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.6000,  0.4980,  0.4431],
          [ 1.0000,  1.0000,  0.9922,  ...,  0.3961,  0.3255,  0.2863],
          ...,
          [ 0.1608,  0.1608,  0.1608,  ...,  0.0118,  0.2000,  0.2863],
          [ 0.0275,  0.0275,  0.0275,  ...,  0.0824,  0.2549,  0.3333],
          [-0.0431, -0.0431, -0.0431,  ...,  0.1216,  0.2784,  0.3569]],

         [[ 1.0000,  1.0000,  1.0000,  ...,  0.5294,  0.3804,  0.3098],
          [ 1

In [12]:
def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in example_batch['img']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['fine_label']
    return inputs

In [13]:
prepared_ds = dataset.with_transform(transform)

In [14]:
prepared_ds["train"][0:2]

{'pixel_values': tensor([[[[ 1.0000,  1.0000,  1.0000,  ...,  0.6078,  0.4902,  0.4275],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.5137,  0.4118,  0.3569],
          [ 1.0000,  1.0000,  0.9922,  ...,  0.3255,  0.2549,  0.2157],
          ...,
          [-0.1137, -0.1137, -0.1137,  ..., -0.1451,  0.0196,  0.0980],
          [-0.2471, -0.2471, -0.2471,  ..., -0.1216,  0.0196,  0.0902],
          [-0.3176, -0.3176, -0.3098,  ..., -0.1137,  0.0196,  0.0824]],

         [[ 1.0000,  1.0000,  1.0000,  ...,  0.7020,  0.5843,  0.5216],
          [ 1.0000,  1.0000,  1.0000,  ...,  0.6000,  0.4980,  0.4431],
          [ 1.0000,  1.0000,  0.9922,  ...,  0.3961,  0.3255,  0.2863],
          ...,
          [ 0.1608,  0.1608,  0.1608,  ...,  0.0118,  0.2000,  0.2863],
          [ 0.0275,  0.0275,  0.0275,  ...,  0.0824,  0.2549,  0.3333],
          [-0.0431, -0.0431, -0.0431,  ...,  0.1216,  0.2784,  0.3569]],

         [[ 1.0000,  1.0000,  1.0000,  ...,  0.5294,  0.3804,  0.3098],
          [ 1

In [15]:
import torch

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [16]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [17]:
prepared_ds["train"].features

{'img': Image(decode=True, id=None),
 'fine_label': ClassLabel(names=['apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle', 'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel', 'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock', 'cloud', 'cockroach', 'couch', 'cra', 'crocodile', 'cup', 'dinosaur', 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion', 'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', 'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', 'television', 'tiger', 'tractor', 'tra

In [33]:
checkpoint = "/home/jupyter/datasphere/project/vit/outputs_mim_tiny_imagenet/checkpoint-47880/"

In [62]:
labels = prepared_ds['train'].features['fine_label'].names

configuration = ViTConfig.from_pretrained(checkpoint)

model = ViTForImageClassification.from_pretrained(checkpoint)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at /home/jupyter/datasphere/project/vit/outputs_mim_tiny_imagenet/checkpoint-47880/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
configuration = ViTConfig(
    num_hidden_layers=3,
    num_attention_heads=16,
    hidden_size=512,
    patch_size=4,
    image_size=IMAGE_SIZE,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

model = ViTForImageClassification(configuration)

In [69]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

12815460

In [70]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-mim-pretrained-50",
  per_device_train_batch_size=128,
  per_device_eval_batch_size=128,
  evaluation_strategy="epoch",
  save_strategy='epoch',
  num_train_epochs=100,
  logging_steps=10,
  learning_rate=2e-5,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  weight_decay=1e-4,
  report_to='tensorboard',
  load_best_model_at_end=True,
)



In [71]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset_50.with_transform(transform),
    eval_dataset=test_dataset.with_transform(transform),
    compute_metrics=compute_metrics,
    tokenizer=processor
)


In [None]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

  0%|          | 5/19600 [2:34:10<10070:44:23, 1850.20s/it]
  0%|          | 10/19600 [00:08<4:32:07,  1.20it/s]
                                                        
  0%|          | 10/19600 [00:08<4:32:07,  1.20it/s]t/s][A

{'loss': 4.5881, 'grad_norm': 5.421902179718018, 'learning_rate': 1.998979591836735e-05, 'epoch': 0.05}


  0%|          | 20/19600 [00:16<4:33:28,  1.19it/s]
                                                        
  0%|          | 20/19600 [00:16<4:33:28,  1.19it/s]t/s][A

{'loss': 4.4451, 'grad_norm': 4.683783531188965, 'learning_rate': 1.9979591836734697e-05, 'epoch': 0.1}


  0%|          | 30/19600 [00:25<4:35:36,  1.18it/s]
                                                        
  0%|          | 30/19600 [00:25<4:35:36,  1.18it/s]t/s][A

{'loss': 4.383, 'grad_norm': 4.519712448120117, 'learning_rate': 1.9969387755102042e-05, 'epoch': 0.15}


  0%|          | 40/19600 [00:33<4:37:12,  1.18it/s]
                                                        
  0%|          | 40/19600 [00:33<4:37:12,  1.18it/s]t/s][A

{'loss': 4.2978, 'grad_norm': 4.662993907928467, 'learning_rate': 1.9959183673469388e-05, 'epoch': 0.2}


  0%|          | 50/19600 [00:42<4:39:19,  1.17it/s]
                                                        
  0%|          | 50/19600 [00:42<4:39:19,  1.17it/s]t/s][A

{'loss': 4.2943, 'grad_norm': 5.090201377868652, 'learning_rate': 1.9948979591836737e-05, 'epoch': 0.26}


  0%|          | 60/19600 [00:50<4:40:38,  1.16it/s]
                                                        
  0%|          | 60/19600 [00:50<4:40:38,  1.16it/s]t/s][A

{'loss': 4.2531, 'grad_norm': 5.277846336364746, 'learning_rate': 1.9938775510204083e-05, 'epoch': 0.31}


  0%|          | 70/19600 [00:59<4:42:06,  1.15it/s]
                                                        
  0%|          | 70/19600 [00:59<4:42:06,  1.15it/s]t/s][A

{'loss': 4.2679, 'grad_norm': 5.540380001068115, 'learning_rate': 1.992857142857143e-05, 'epoch': 0.36}


  0%|          | 80/19600 [01:08<4:43:10,  1.15it/s]
                                                        
  0%|          | 80/19600 [01:08<4:43:10,  1.15it/s]t/s][A

{'loss': 4.192, 'grad_norm': 5.211980819702148, 'learning_rate': 1.9918367346938775e-05, 'epoch': 0.41}


  0%|          | 90/19600 [01:16<4:42:33,  1.15it/s]
                                                        
  0%|          | 90/19600 [01:16<4:42:33,  1.15it/s]t/s][A

{'loss': 4.2032, 'grad_norm': 5.073441505432129, 'learning_rate': 1.9908163265306124e-05, 'epoch': 0.46}


  1%|          | 100/19600 [01:25<4:42:43,  1.15it/s]
                                                        
  1%|          | 100/19600 [01:25<4:42:43,  1.15it/s]/s][A

{'loss': 4.1899, 'grad_norm': 4.529275417327881, 'learning_rate': 1.9897959183673473e-05, 'epoch': 0.51}


  1%|          | 110/19600 [01:34<4:42:49,  1.15it/s]
                                                        
  1%|          | 110/19600 [01:34<4:42:49,  1.15it/s]/s][A

{'loss': 4.2217, 'grad_norm': 4.994623184204102, 'learning_rate': 1.988775510204082e-05, 'epoch': 0.56}


  1%|          | 120/19600 [01:43<4:42:31,  1.15it/s]
                                                        
  1%|          | 120/19600 [01:43<4:42:31,  1.15it/s]/s][A

{'loss': 4.154, 'grad_norm': 4.892523765563965, 'learning_rate': 1.9877551020408165e-05, 'epoch': 0.61}


  1%|          | 130/19600 [01:51<4:42:40,  1.15it/s]
                                                        
  1%|          | 130/19600 [01:51<4:42:40,  1.15it/s]/s][A

{'loss': 4.1048, 'grad_norm': 5.192368984222412, 'learning_rate': 1.986734693877551e-05, 'epoch': 0.66}


  1%|          | 140/19600 [02:00<4:43:16,  1.14it/s]
                                                        
  1%|          | 140/19600 [02:00<4:43:16,  1.14it/s]/s][A

{'loss': 4.1433, 'grad_norm': 5.790770530700684, 'learning_rate': 1.985714285714286e-05, 'epoch': 0.71}


  1%|          | 150/19600 [02:09<4:43:35,  1.14it/s]
                                                        
  1%|          | 150/19600 [02:09<4:43:35,  1.14it/s]/s][A

{'loss': 4.1058, 'grad_norm': 5.308792591094971, 'learning_rate': 1.9846938775510205e-05, 'epoch': 0.77}


  1%|          | 160/19600 [02:18<4:43:14,  1.14it/s]
                                                        
  1%|          | 160/19600 [02:18<4:43:14,  1.14it/s]/s][A

{'loss': 4.0854, 'grad_norm': 5.228801727294922, 'learning_rate': 1.983673469387755e-05, 'epoch': 0.82}


  1%|          | 170/19600 [02:26<4:43:22,  1.14it/s]
                                                        
  1%|          | 170/19600 [02:26<4:43:22,  1.14it/s]/s][A

{'loss': 4.0703, 'grad_norm': 5.57295560836792, 'learning_rate': 1.9826530612244897e-05, 'epoch': 0.87}


  1%|          | 180/19600 [02:35<4:43:26,  1.14it/s]
                                                        
  1%|          | 180/19600 [02:35<4:43:26,  1.14it/s]/s][A

{'loss': 4.0565, 'grad_norm': 5.902100086212158, 'learning_rate': 1.9816326530612246e-05, 'epoch': 0.92}


  1%|          | 190/19600 [02:44<4:42:56,  1.14it/s]
                                                        
  1%|          | 190/19600 [02:44<4:42:56,  1.14it/s]/s][A

{'loss': 4.0104, 'grad_norm': 5.140363693237305, 'learning_rate': 1.9806122448979595e-05, 'epoch': 0.97}


  1%|          | 196/19600 [02:48<3:46:48,  1.43it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.05it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.16it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.63it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.38it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.22it/s][A[A

  9%|▉         | 7/79 [00:02<00:22,  3.13it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.08it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.02it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.98it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00:

{'eval_loss': 4.012364864349365, 'eval_accuracy': 0.0959, 'eval_runtime': 26.4582, 'eval_samples_per_second': 377.955, 'eval_steps_per_second': 2.986, 'epoch': 1.0}


  1%|          | 196/19600 [03:15<3:46:48,  1.43it/s]

  1%|          | 200/19600 [03:20<19:45:18,  3.67s/it]
                                                        
  1%|          | 200/19600 [03:20<19:45:18,  3.67s/it]s][A

{'loss': 3.9779, 'grad_norm': 5.553143501281738, 'learning_rate': 1.979591836734694e-05, 'epoch': 1.02}


  1%|          | 210/19600 [03:28<5:08:41,  1.05it/s] 
                                                        
  1%|          | 210/19600 [03:28<5:08:41,  1.05it/s]/s][A

{'loss': 3.9902, 'grad_norm': 6.461032390594482, 'learning_rate': 1.9785714285714287e-05, 'epoch': 1.07}


  1%|          | 220/19600 [03:37<4:44:05,  1.14it/s]
                                                        
  1%|          | 220/19600 [03:37<4:44:05,  1.14it/s]/s][A

{'loss': 3.9623, 'grad_norm': 6.354334354400635, 'learning_rate': 1.9775510204081633e-05, 'epoch': 1.12}


  1%|          | 230/19600 [03:46<4:42:50,  1.14it/s]
                                                        
  1%|          | 230/19600 [03:46<4:42:50,  1.14it/s]/s][A

{'loss': 3.9326, 'grad_norm': 5.830616474151611, 'learning_rate': 1.9765306122448982e-05, 'epoch': 1.17}


  1%|          | 240/19600 [03:55<4:42:52,  1.14it/s]
                                                        
  1%|          | 240/19600 [03:55<4:42:52,  1.14it/s]/s][A

{'loss': 3.9655, 'grad_norm': 6.276158809661865, 'learning_rate': 1.9755102040816328e-05, 'epoch': 1.22}


  1%|▏         | 250/19600 [04:03<4:42:58,  1.14it/s]
                                                        
  1%|▏         | 250/19600 [04:03<4:42:58,  1.14it/s]/s][A

{'loss': 3.946, 'grad_norm': 6.800878047943115, 'learning_rate': 1.9744897959183677e-05, 'epoch': 1.28}


  1%|▏         | 260/19600 [04:12<4:42:42,  1.14it/s]
                                                        
  1%|▏         | 260/19600 [04:12<4:42:42,  1.14it/s]/s][A

{'loss': 3.9155, 'grad_norm': 6.107234001159668, 'learning_rate': 1.9734693877551023e-05, 'epoch': 1.33}


  1%|▏         | 270/19600 [04:21<4:43:07,  1.14it/s]
                                                        
  1%|▏         | 270/19600 [04:21<4:43:07,  1.14it/s]/s][A

{'loss': 3.8606, 'grad_norm': 6.654865741729736, 'learning_rate': 1.972448979591837e-05, 'epoch': 1.38}


  1%|▏         | 280/19600 [04:30<4:42:44,  1.14it/s]
                                                        
  1%|▏         | 280/19600 [04:30<4:42:44,  1.14it/s]/s][A

{'loss': 3.8702, 'grad_norm': 5.953544616699219, 'learning_rate': 1.9714285714285718e-05, 'epoch': 1.43}


  1%|▏         | 290/19600 [04:39<4:42:08,  1.14it/s]
                                                        
  1%|▏         | 290/19600 [04:39<4:42:08,  1.14it/s]/s][A

{'loss': 3.86, 'grad_norm': 7.213027477264404, 'learning_rate': 1.9704081632653063e-05, 'epoch': 1.48}


  2%|▏         | 300/19600 [04:47<4:42:58,  1.14it/s]
                                                        
  2%|▏         | 300/19600 [04:47<4:42:58,  1.14it/s]/s][A

{'loss': 3.8195, 'grad_norm': 6.852202415466309, 'learning_rate': 1.969387755102041e-05, 'epoch': 1.53}


  2%|▏         | 310/19600 [04:56<4:41:11,  1.14it/s]
                                                        
  2%|▏         | 310/19600 [04:56<4:41:11,  1.14it/s]/s][A

{'loss': 3.8381, 'grad_norm': 6.890439987182617, 'learning_rate': 1.9683673469387755e-05, 'epoch': 1.58}


  2%|▏         | 320/19600 [05:05<4:41:00,  1.14it/s]
                                                        
  2%|▏         | 320/19600 [05:05<4:41:00,  1.14it/s]/s][A

{'loss': 3.8199, 'grad_norm': 7.527639389038086, 'learning_rate': 1.9673469387755104e-05, 'epoch': 1.63}


  2%|▏         | 330/19600 [05:14<4:41:50,  1.14it/s]
                                                        
  2%|▏         | 330/19600 [05:14<4:41:50,  1.14it/s]/s][A

{'loss': 3.8186, 'grad_norm': 7.054855823516846, 'learning_rate': 1.966326530612245e-05, 'epoch': 1.68}


  2%|▏         | 340/19600 [05:22<4:41:27,  1.14it/s]
                                                        
  2%|▏         | 340/19600 [05:22<4:41:27,  1.14it/s]/s][A

{'loss': 3.7977, 'grad_norm': 8.698789596557617, 'learning_rate': 1.96530612244898e-05, 'epoch': 1.73}


  2%|▏         | 350/19600 [05:31<4:42:09,  1.14it/s]
                                                        
  2%|▏         | 350/19600 [05:31<4:42:09,  1.14it/s]/s][A

{'loss': 3.7998, 'grad_norm': 7.2718682289123535, 'learning_rate': 1.9642857142857145e-05, 'epoch': 1.79}


  2%|▏         | 360/19600 [05:40<4:41:32,  1.14it/s]
                                                        
  2%|▏         | 360/19600 [05:40<4:41:32,  1.14it/s]/s][A

{'loss': 3.7885, 'grad_norm': 6.972458839416504, 'learning_rate': 1.963265306122449e-05, 'epoch': 1.84}


  2%|▏         | 370/19600 [05:49<4:40:41,  1.14it/s]
                                                        
  2%|▏         | 370/19600 [05:49<4:40:41,  1.14it/s]/s][A

{'loss': 3.8084, 'grad_norm': 7.917996406555176, 'learning_rate': 1.962244897959184e-05, 'epoch': 1.89}


  2%|▏         | 380/19600 [05:58<4:40:54,  1.14it/s]
                                                        
  2%|▏         | 380/19600 [05:58<4:40:54,  1.14it/s]/s][A

{'loss': 3.7978, 'grad_norm': 6.8339433670043945, 'learning_rate': 1.9612244897959186e-05, 'epoch': 1.94}


  2%|▏         | 390/19600 [06:06<4:41:08,  1.14it/s]
                                                        
  2%|▏         | 390/19600 [06:06<4:41:08,  1.14it/s]/s][A

{'loss': 3.7593, 'grad_norm': 7.08496618270874, 'learning_rate': 1.960204081632653e-05, 'epoch': 1.99}


  2%|▏         | 392/19600 [06:08<3:45:51,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.07it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.18it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.63it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.37it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.99it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00:

{'eval_loss': 3.7546870708465576, 'eval_accuracy': 0.1332, 'eval_runtime': 26.474, 'eval_samples_per_second': 377.729, 'eval_steps_per_second': 2.984, 'epoch': 2.0}


  2%|▏         | 400/19600 [06:42<8:15:38,  1.55s/it] 
                                                        
  2%|▏         | 400/19600 [06:42<8:15:38,  1.55s/it]/s][A

{'loss': 3.7681, 'grad_norm': 6.994754314422607, 'learning_rate': 1.9591836734693877e-05, 'epoch': 2.04}


  2%|▏         | 410/19600 [06:51<4:46:34,  1.12it/s]
                                                        
  2%|▏         | 410/19600 [06:51<4:46:34,  1.12it/s]/s][A

{'loss': 3.6705, 'grad_norm': 7.596919536590576, 'learning_rate': 1.9581632653061227e-05, 'epoch': 2.09}


  2%|▏         | 420/19600 [07:00<4:40:46,  1.14it/s]
                                                        
  2%|▏         | 420/19600 [07:00<4:40:46,  1.14it/s]/s][A

{'loss': 3.6989, 'grad_norm': 8.087489128112793, 'learning_rate': 1.9571428571428572e-05, 'epoch': 2.14}


  2%|▏         | 430/19600 [07:08<4:40:55,  1.14it/s]
                                                        
  2%|▏         | 430/19600 [07:08<4:40:55,  1.14it/s]/s][A

{'loss': 3.734, 'grad_norm': 8.958337783813477, 'learning_rate': 1.956122448979592e-05, 'epoch': 2.19}


  2%|▏         | 440/19600 [07:17<4:40:20,  1.14it/s]
                                                        
  2%|▏         | 440/19600 [07:17<4:40:20,  1.14it/s]/s][A

{'loss': 3.6537, 'grad_norm': 7.141852855682373, 'learning_rate': 1.9551020408163267e-05, 'epoch': 2.24}


  2%|▏         | 450/19600 [07:26<4:39:50,  1.14it/s]
                                                        
  2%|▏         | 450/19600 [07:26<4:39:50,  1.14it/s]/s][A

{'loss': 3.7146, 'grad_norm': 7.824423789978027, 'learning_rate': 1.9540816326530613e-05, 'epoch': 2.3}


  2%|▏         | 460/19600 [07:35<4:39:43,  1.14it/s]
                                                        
  2%|▏         | 460/19600 [07:35<4:39:43,  1.14it/s]/s][A

{'loss': 3.6635, 'grad_norm': 8.4516019821167, 'learning_rate': 1.9530612244897962e-05, 'epoch': 2.35}


  2%|▏         | 470/19600 [07:44<4:40:17,  1.14it/s]
                                                        
  2%|▏         | 470/19600 [07:44<4:40:17,  1.14it/s]/s][A

{'loss': 3.6612, 'grad_norm': 7.956578731536865, 'learning_rate': 1.9520408163265308e-05, 'epoch': 2.4}


  2%|▏         | 480/19600 [07:52<4:39:21,  1.14it/s]
                                                        
  2%|▏         | 480/19600 [07:52<4:39:21,  1.14it/s]/s][A

{'loss': 3.625, 'grad_norm': 8.520639419555664, 'learning_rate': 1.9510204081632654e-05, 'epoch': 2.45}


  2%|▎         | 490/19600 [08:01<4:39:49,  1.14it/s]
                                                        
  2%|▎         | 490/19600 [08:01<4:39:49,  1.14it/s]/s][A

{'loss': 3.6394, 'grad_norm': 8.921263694763184, 'learning_rate': 1.95e-05, 'epoch': 2.5}


  3%|▎         | 500/19600 [08:10<4:40:31,  1.13it/s]
                                                        
  3%|▎         | 500/19600 [08:10<4:40:31,  1.13it/s]/s][A

{'loss': 3.5844, 'grad_norm': 7.902652740478516, 'learning_rate': 1.948979591836735e-05, 'epoch': 2.55}


  3%|▎         | 510/19600 [08:19<4:39:36,  1.14it/s]
                                                        
  3%|▎         | 510/19600 [08:19<4:39:36,  1.14it/s]/s][A

{'loss': 3.5678, 'grad_norm': 9.679105758666992, 'learning_rate': 1.9479591836734695e-05, 'epoch': 2.6}


  3%|▎         | 520/19600 [08:28<4:39:24,  1.14it/s]
                                                        
  3%|▎         | 520/19600 [08:28<4:39:24,  1.14it/s]/s][A

{'loss': 3.5883, 'grad_norm': 8.636832237243652, 'learning_rate': 1.9469387755102044e-05, 'epoch': 2.65}


  3%|▎         | 530/19600 [08:36<4:37:48,  1.14it/s]
                                                        
  3%|▎         | 530/19600 [08:36<4:37:48,  1.14it/s]/s][A

{'loss': 3.5422, 'grad_norm': 9.999919891357422, 'learning_rate': 1.945918367346939e-05, 'epoch': 2.7}


  3%|▎         | 540/19600 [08:45<4:38:08,  1.14it/s]
                                                        
  3%|▎         | 540/19600 [08:45<4:38:08,  1.14it/s]/s][A

{'loss': 3.5663, 'grad_norm': 8.695366859436035, 'learning_rate': 1.9448979591836735e-05, 'epoch': 2.76}


  3%|▎         | 550/19600 [08:54<4:38:17,  1.14it/s]
                                                        
  3%|▎         | 550/19600 [08:54<4:38:17,  1.14it/s]/s][A

{'loss': 3.5495, 'grad_norm': 8.412345886230469, 'learning_rate': 1.9438775510204085e-05, 'epoch': 2.81}


  3%|▎         | 560/19600 [09:03<4:38:25,  1.14it/s]
                                                        
  3%|▎         | 560/19600 [09:03<4:38:25,  1.14it/s]/s][A

{'loss': 3.5717, 'grad_norm': 9.367110252380371, 'learning_rate': 1.942857142857143e-05, 'epoch': 2.86}


  3%|▎         | 570/19600 [09:11<4:39:13,  1.14it/s]
                                                        
  3%|▎         | 570/19600 [09:11<4:39:13,  1.14it/s]/s][A

{'loss': 3.5745, 'grad_norm': 8.354774475097656, 'learning_rate': 1.941836734693878e-05, 'epoch': 2.91}


  3%|▎         | 580/19600 [09:20<4:38:59,  1.14it/s]
                                                        
  3%|▎         | 580/19600 [09:20<4:38:59,  1.14it/s]/s][A

{'loss': 3.6114, 'grad_norm': 9.253328323364258, 'learning_rate': 1.9408163265306122e-05, 'epoch': 2.96}


  3%|▎         | 588/19600 [09:27<3:42:37,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.04it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.10it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.06it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.98it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.97it/s][A[A

 23%|██▎       | 18/79 [00:

{'eval_loss': 3.5800957679748535, 'eval_accuracy': 0.1618, 'eval_runtime': 26.5685, 'eval_samples_per_second': 376.386, 'eval_steps_per_second': 2.973, 'epoch': 3.0}


  3%|▎         | 590/19600 [09:56<34:55:15,  6.61s/it]
                                                        
  3%|▎         | 590/19600 [09:56<34:55:15,  6.61s/it]s][A

{'loss': 3.5418, 'grad_norm': 10.341446876525879, 'learning_rate': 1.939795918367347e-05, 'epoch': 3.01}


  3%|▎         | 600/19600 [10:05<5:28:55,  1.04s/it] 
                                                        
  3%|▎         | 600/19600 [10:05<5:28:55,  1.04s/it]/s][A

{'loss': 3.5239, 'grad_norm': 8.159499168395996, 'learning_rate': 1.9387755102040817e-05, 'epoch': 3.06}


  3%|▎         | 610/19600 [10:14<4:39:30,  1.13it/s]
                                                        
  3%|▎         | 610/19600 [10:14<4:39:30,  1.13it/s]/s][A

{'loss': 3.4572, 'grad_norm': 9.237455368041992, 'learning_rate': 1.9377551020408166e-05, 'epoch': 3.11}


  3%|▎         | 620/19600 [10:23<4:38:14,  1.14it/s]
                                                        
  3%|▎         | 620/19600 [10:23<4:38:14,  1.14it/s]/s][A

{'loss': 3.4508, 'grad_norm': 10.119324684143066, 'learning_rate': 1.9367346938775512e-05, 'epoch': 3.16}


  3%|▎         | 630/19600 [10:31<4:37:54,  1.14it/s]
                                                        
  3%|▎         | 630/19600 [10:31<4:37:54,  1.14it/s]/s][A

{'loss': 3.4332, 'grad_norm': 10.252942085266113, 'learning_rate': 1.9357142857142858e-05, 'epoch': 3.21}


  3%|▎         | 640/19600 [10:40<4:36:54,  1.14it/s]
                                                        
  3%|▎         | 640/19600 [10:40<4:36:54,  1.14it/s]/s][A

{'loss': 3.4561, 'grad_norm': 8.743207931518555, 'learning_rate': 1.9346938775510207e-05, 'epoch': 3.27}


  3%|▎         | 650/19600 [10:49<4:37:19,  1.14it/s]
                                                        
  3%|▎         | 650/19600 [10:49<4:37:19,  1.14it/s]/s][A

{'loss': 3.4579, 'grad_norm': 9.933248519897461, 'learning_rate': 1.9336734693877553e-05, 'epoch': 3.32}


  3%|▎         | 660/19600 [10:58<4:37:13,  1.14it/s]
                                                        
  3%|▎         | 660/19600 [10:58<4:37:13,  1.14it/s]/s][A

{'loss': 3.4746, 'grad_norm': 8.788043022155762, 'learning_rate': 1.9326530612244902e-05, 'epoch': 3.37}


  3%|▎         | 670/19600 [11:06<4:37:08,  1.14it/s]
                                                        
  3%|▎         | 670/19600 [11:06<4:37:08,  1.14it/s]/s][A

{'loss': 3.466, 'grad_norm': 9.833026885986328, 'learning_rate': 1.9316326530612248e-05, 'epoch': 3.42}


  3%|▎         | 680/19600 [11:15<4:37:07,  1.14it/s]
                                                        
  3%|▎         | 680/19600 [11:15<4:37:07,  1.14it/s]/s][A

{'loss': 3.4186, 'grad_norm': 10.40446949005127, 'learning_rate': 1.9306122448979593e-05, 'epoch': 3.47}


  4%|▎         | 690/19600 [11:24<4:37:06,  1.14it/s]
                                                        
  4%|▎         | 690/19600 [11:24<4:37:06,  1.14it/s]/s][A

{'loss': 3.4547, 'grad_norm': 9.88947582244873, 'learning_rate': 1.929591836734694e-05, 'epoch': 3.52}


  4%|▎         | 700/19600 [11:33<4:37:29,  1.14it/s]
                                                        
  4%|▎         | 700/19600 [11:33<4:37:29,  1.14it/s]/s][A

{'loss': 3.4241, 'grad_norm': 11.132515907287598, 'learning_rate': 1.928571428571429e-05, 'epoch': 3.57}


  4%|▎         | 710/19600 [11:42<4:37:02,  1.14it/s]
                                                        
  4%|▎         | 710/19600 [11:42<4:37:02,  1.14it/s]/s][A

{'loss': 3.5108, 'grad_norm': 9.694982528686523, 'learning_rate': 1.9275510204081634e-05, 'epoch': 3.62}


  4%|▎         | 720/19600 [11:50<4:36:30,  1.14it/s]
                                                        
  4%|▎         | 720/19600 [11:50<4:36:30,  1.14it/s]/s][A

{'loss': 3.4256, 'grad_norm': 9.745308876037598, 'learning_rate': 1.926530612244898e-05, 'epoch': 3.67}


  4%|▎         | 730/19600 [11:59<4:36:22,  1.14it/s]
                                                        
  4%|▎         | 730/19600 [11:59<4:36:22,  1.14it/s]/s][A

{'loss': 3.4011, 'grad_norm': 9.841904640197754, 'learning_rate': 1.925510204081633e-05, 'epoch': 3.72}


  4%|▍         | 740/19600 [12:08<4:36:30,  1.14it/s]
                                                        
  4%|▍         | 740/19600 [12:08<4:36:30,  1.14it/s]/s][A

{'loss': 3.4351, 'grad_norm': 9.622084617614746, 'learning_rate': 1.9244897959183675e-05, 'epoch': 3.78}


  4%|▍         | 750/19600 [12:17<4:36:21,  1.14it/s]
                                                        
  4%|▍         | 750/19600 [12:17<4:36:21,  1.14it/s]/s][A

{'loss': 3.4222, 'grad_norm': 9.594951629638672, 'learning_rate': 1.9234693877551024e-05, 'epoch': 3.83}


  4%|▍         | 760/19600 [12:26<4:35:41,  1.14it/s]
                                                        
  4%|▍         | 760/19600 [12:26<4:35:41,  1.14it/s]/s][A

{'loss': 3.4289, 'grad_norm': 8.247614860534668, 'learning_rate': 1.922448979591837e-05, 'epoch': 3.88}


  4%|▍         | 770/19600 [12:34<4:34:50,  1.14it/s]
                                                        
  4%|▍         | 770/19600 [12:34<4:34:50,  1.14it/s]/s][A

{'loss': 3.4747, 'grad_norm': 9.476156234741211, 'learning_rate': 1.9214285714285716e-05, 'epoch': 3.93}


  4%|▍         | 780/19600 [12:43<4:35:30,  1.14it/s]
                                                        
  4%|▍         | 780/19600 [12:43<4:35:30,  1.14it/s]/s][A

{'loss': 3.3796, 'grad_norm': 11.614023208618164, 'learning_rate': 1.920408163265306e-05, 'epoch': 3.98}


  4%|▍         | 784/19600 [12:46<3:40:26,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.08it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.17it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.63it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.37it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:22,  3.13it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.08it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.02it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.97it/s][A[A

 23%|██▎       | 18/79 [00:

{'eval_loss': 3.4345850944519043, 'eval_accuracy': 0.193, 'eval_runtime': 26.4644, 'eval_samples_per_second': 377.867, 'eval_steps_per_second': 2.985, 'epoch': 4.0}


  4%|▍         | 790/19600 [13:19<11:43:08,  2.24s/it]
                                                        
  4%|▍         | 790/19600 [13:19<11:43:08,  2.24s/it]s][A

{'loss': 3.3395, 'grad_norm': 9.264826774597168, 'learning_rate': 1.919387755102041e-05, 'epoch': 4.03}


  4%|▍         | 800/19600 [13:28<4:46:37,  1.09it/s] 
                                                        
  4%|▍         | 800/19600 [13:28<4:46:37,  1.09it/s]/s][A

{'loss': 3.3721, 'grad_norm': 10.937347412109375, 'learning_rate': 1.9183673469387756e-05, 'epoch': 4.08}


  4%|▍         | 810/19600 [13:36<4:35:56,  1.13it/s]
                                                        
  4%|▍         | 810/19600 [13:36<4:35:56,  1.13it/s]/s][A

{'loss': 3.3247, 'grad_norm': 10.423150062561035, 'learning_rate': 1.9173469387755102e-05, 'epoch': 4.13}


  4%|▍         | 820/19600 [13:45<4:35:19,  1.14it/s]
                                                        
  4%|▍         | 820/19600 [13:45<4:35:19,  1.14it/s]/s][A

{'loss': 3.3191, 'grad_norm': 9.35423755645752, 'learning_rate': 1.916326530612245e-05, 'epoch': 4.18}


  4%|▍         | 830/19600 [13:54<4:35:03,  1.14it/s]
                                                        
  4%|▍         | 830/19600 [13:54<4:35:03,  1.14it/s]/s][A

{'loss': 3.3116, 'grad_norm': 11.187647819519043, 'learning_rate': 1.9153061224489797e-05, 'epoch': 4.23}


  4%|▍         | 840/19600 [14:03<4:33:45,  1.14it/s]
                                                        
  4%|▍         | 840/19600 [14:03<4:33:45,  1.14it/s]/s][A

{'loss': 3.2642, 'grad_norm': 11.379816055297852, 'learning_rate': 1.9142857142857146e-05, 'epoch': 4.29}


  4%|▍         | 850/19600 [14:12<4:33:59,  1.14it/s]
                                                        
  4%|▍         | 850/19600 [14:12<4:33:59,  1.14it/s]/s][A

{'loss': 3.2942, 'grad_norm': 10.868146896362305, 'learning_rate': 1.9132653061224492e-05, 'epoch': 4.34}


  4%|▍         | 860/19600 [14:20<4:33:58,  1.14it/s]
                                                        
  4%|▍         | 860/19600 [14:20<4:33:58,  1.14it/s]/s][A

{'loss': 3.337, 'grad_norm': 11.112472534179688, 'learning_rate': 1.9122448979591838e-05, 'epoch': 4.39}


  4%|▍         | 870/19600 [14:29<4:34:09,  1.14it/s]
                                                        
  4%|▍         | 870/19600 [14:29<4:34:09,  1.14it/s]/s][A

{'loss': 3.2508, 'grad_norm': 10.717109680175781, 'learning_rate': 1.9112244897959184e-05, 'epoch': 4.44}


  4%|▍         | 880/19600 [14:38<4:33:35,  1.14it/s]
                                                        
  4%|▍         | 880/19600 [14:38<4:33:35,  1.14it/s]/s][A

{'loss': 3.2779, 'grad_norm': 10.979413986206055, 'learning_rate': 1.9102040816326533e-05, 'epoch': 4.49}


  5%|▍         | 890/19600 [14:47<4:34:18,  1.14it/s]
                                                        
  5%|▍         | 890/19600 [14:47<4:34:18,  1.14it/s]/s][A

{'loss': 3.2691, 'grad_norm': 10.89268684387207, 'learning_rate': 1.909183673469388e-05, 'epoch': 4.54}


  5%|▍         | 900/19600 [14:56<4:34:19,  1.14it/s]
                                                        
  5%|▍         | 900/19600 [14:56<4:34:19,  1.14it/s]/s][A

{'loss': 3.2959, 'grad_norm': 10.920778274536133, 'learning_rate': 1.9081632653061225e-05, 'epoch': 4.59}


  5%|▍         | 910/19600 [15:04<4:33:56,  1.14it/s]
                                                        
  5%|▍         | 910/19600 [15:04<4:33:56,  1.14it/s]/s][A

{'loss': 3.2429, 'grad_norm': 12.410117149353027, 'learning_rate': 1.9071428571428574e-05, 'epoch': 4.64}


  5%|▍         | 920/19600 [15:13<4:34:04,  1.14it/s]
                                                        
  5%|▍         | 920/19600 [15:13<4:34:04,  1.14it/s]/s][A

{'loss': 3.2732, 'grad_norm': 11.734277725219727, 'learning_rate': 1.906122448979592e-05, 'epoch': 4.69}


  5%|▍         | 930/19600 [15:22<4:32:46,  1.14it/s]
                                                        
  5%|▍         | 930/19600 [15:22<4:32:46,  1.14it/s]/s][A

{'loss': 3.2676, 'grad_norm': 13.72843074798584, 'learning_rate': 1.905102040816327e-05, 'epoch': 4.74}


  5%|▍         | 940/19600 [15:31<4:33:01,  1.14it/s]
                                                        
  5%|▍         | 940/19600 [15:31<4:33:01,  1.14it/s]/s][A

{'loss': 3.2616, 'grad_norm': 11.211603164672852, 'learning_rate': 1.9040816326530614e-05, 'epoch': 4.8}


  5%|▍         | 950/19600 [15:40<4:33:49,  1.14it/s]
                                                        
  5%|▍         | 950/19600 [15:40<4:33:49,  1.14it/s]/s][A

{'loss': 3.3235, 'grad_norm': 13.628273010253906, 'learning_rate': 1.903061224489796e-05, 'epoch': 4.85}


  5%|▍         | 960/19600 [15:48<4:33:09,  1.14it/s]
                                                        
  5%|▍         | 960/19600 [15:48<4:33:09,  1.14it/s]/s][A

{'loss': 3.1783, 'grad_norm': 12.11004638671875, 'learning_rate': 1.9020408163265306e-05, 'epoch': 4.9}


  5%|▍         | 970/19600 [15:57<4:33:07,  1.14it/s]
                                                        
  5%|▍         | 970/19600 [15:57<4:33:07,  1.14it/s]/s][A

{'loss': 3.3026, 'grad_norm': 11.712539672851562, 'learning_rate': 1.9010204081632655e-05, 'epoch': 4.95}


  5%|▌         | 980/19600 [16:05<3:38:29,  1.42it/s]
                                                        
  5%|▌         | 980/19600 [16:05<3:38:29,  1.42it/s]/s][A

{'loss': 3.2649, 'grad_norm': 17.934280395507812, 'learning_rate': 1.9e-05, 'epoch': 5.0}




  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.05it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.17it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  3.00it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.98it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.97it/s][A[A

 23%|██▎       | 18/79 [00:05<00:20,  2.97it/s][A[A

 24%|██▍       | 19/79 [0

{'eval_loss': 3.3244752883911133, 'eval_accuracy': 0.2168, 'eval_runtime': 26.4813, 'eval_samples_per_second': 377.626, 'eval_steps_per_second': 2.983, 'epoch': 5.0}


  5%|▌         | 990/19600 [16:42<6:14:05,  1.21s/it] 
                                                        
  5%|▌         | 990/19600 [16:42<6:14:05,  1.21s/it]/s][A

{'loss': 3.1695, 'grad_norm': 10.470048904418945, 'learning_rate': 1.898979591836735e-05, 'epoch': 5.05}


  5%|▌         | 1000/19600 [16:50<4:35:32,  1.13it/s]
                                                        
  5%|▌         | 1000/19600 [16:50<4:35:32,  1.13it/s]s][A

{'loss': 3.1822, 'grad_norm': 10.560469627380371, 'learning_rate': 1.8979591836734696e-05, 'epoch': 5.1}


  5%|▌         | 1010/19600 [16:59<4:32:18,  1.14it/s]
                                                        
  5%|▌         | 1010/19600 [16:59<4:32:18,  1.14it/s]s][A

{'loss': 3.1258, 'grad_norm': 11.828632354736328, 'learning_rate': 1.8969387755102042e-05, 'epoch': 5.15}


  5%|▌         | 1020/19600 [17:08<4:32:32,  1.14it/s]
                                                        
  5%|▌         | 1020/19600 [17:08<4:32:32,  1.14it/s]s][A

{'loss': 3.1144, 'grad_norm': 12.816429138183594, 'learning_rate': 1.895918367346939e-05, 'epoch': 5.2}


  5%|▌         | 1030/19600 [17:17<4:32:06,  1.14it/s]
                                                        
  5%|▌         | 1030/19600 [17:17<4:32:06,  1.14it/s]s][A

{'loss': 3.1446, 'grad_norm': 12.233270645141602, 'learning_rate': 1.8948979591836737e-05, 'epoch': 5.26}


  5%|▌         | 1040/19600 [17:26<4:32:08,  1.14it/s]
                                                        
  5%|▌         | 1040/19600 [17:26<4:32:08,  1.14it/s]s][A

{'loss': 3.1385, 'grad_norm': 14.539220809936523, 'learning_rate': 1.8938775510204083e-05, 'epoch': 5.31}


  5%|▌         | 1050/19600 [17:34<4:31:59,  1.14it/s]
                                                        
  5%|▌         | 1050/19600 [17:35<4:31:59,  1.14it/s]s][A

{'loss': 3.1632, 'grad_norm': 13.007341384887695, 'learning_rate': 1.892857142857143e-05, 'epoch': 5.36}


  5%|▌         | 1060/19600 [17:43<4:31:31,  1.14it/s]
                                                        
  5%|▌         | 1060/19600 [17:43<4:31:31,  1.14it/s]s][A

{'loss': 3.2064, 'grad_norm': 13.65086841583252, 'learning_rate': 1.8918367346938778e-05, 'epoch': 5.41}


  5%|▌         | 1070/19600 [17:52<4:31:05,  1.14it/s]
                                                        
  5%|▌         | 1070/19600 [17:52<4:31:05,  1.14it/s]s][A

{'loss': 3.1522, 'grad_norm': 14.15380859375, 'learning_rate': 1.8908163265306123e-05, 'epoch': 5.46}


  6%|▌         | 1080/19600 [18:01<4:31:09,  1.14it/s]
                                                        
  6%|▌         | 1080/19600 [18:01<4:31:09,  1.14it/s]s][A

{'loss': 3.1339, 'grad_norm': 11.670838356018066, 'learning_rate': 1.8897959183673473e-05, 'epoch': 5.51}


  6%|▌         | 1090/19600 [18:10<4:31:05,  1.14it/s]
                                                        
  6%|▌         | 1090/19600 [18:10<4:31:05,  1.14it/s]s][A

{'loss': 3.156, 'grad_norm': 13.737561225891113, 'learning_rate': 1.888775510204082e-05, 'epoch': 5.56}


  6%|▌         | 1100/19600 [18:18<4:30:46,  1.14it/s]
                                                        
  6%|▌         | 1100/19600 [18:18<4:30:46,  1.14it/s]s][A

{'loss': 3.1155, 'grad_norm': 12.517879486083984, 'learning_rate': 1.8877551020408164e-05, 'epoch': 5.61}


  6%|▌         | 1110/19600 [18:27<4:30:58,  1.14it/s]
                                                        
  6%|▌         | 1110/19600 [18:27<4:30:58,  1.14it/s]s][A

{'loss': 3.1272, 'grad_norm': 11.729757308959961, 'learning_rate': 1.8867346938775513e-05, 'epoch': 5.66}


  6%|▌         | 1120/19600 [18:36<4:30:44,  1.14it/s]
                                                        
  6%|▌         | 1120/19600 [18:36<4:30:44,  1.14it/s]s][A

{'loss': 3.1581, 'grad_norm': 12.632522583007812, 'learning_rate': 1.885714285714286e-05, 'epoch': 5.71}


  6%|▌         | 1130/19600 [18:45<4:30:21,  1.14it/s]
                                                        
  6%|▌         | 1130/19600 [18:45<4:30:21,  1.14it/s]s][A

{'loss': 3.1102, 'grad_norm': 12.336773872375488, 'learning_rate': 1.8846938775510205e-05, 'epoch': 5.77}


  6%|▌         | 1140/19600 [18:54<4:30:27,  1.14it/s]
                                                        
  6%|▌         | 1140/19600 [18:54<4:30:27,  1.14it/s]s][A

{'loss': 3.1658, 'grad_norm': 11.849376678466797, 'learning_rate': 1.883673469387755e-05, 'epoch': 5.82}


  6%|▌         | 1150/19600 [19:02<4:29:45,  1.14it/s]
                                                        
  6%|▌         | 1150/19600 [19:02<4:29:45,  1.14it/s]s][A

{'loss': 3.1037, 'grad_norm': 13.261553764343262, 'learning_rate': 1.88265306122449e-05, 'epoch': 5.87}


  6%|▌         | 1160/19600 [19:11<4:29:45,  1.14it/s]
                                                        
  6%|▌         | 1160/19600 [19:11<4:29:45,  1.14it/s]s][A

{'loss': 3.134, 'grad_norm': 14.720545768737793, 'learning_rate': 1.8816326530612246e-05, 'epoch': 5.92}


  6%|▌         | 1170/19600 [19:20<4:29:52,  1.14it/s]
                                                        
  6%|▌         | 1170/19600 [19:20<4:29:52,  1.14it/s]s][A

{'loss': 3.0817, 'grad_norm': 12.855253219604492, 'learning_rate': 1.8806122448979595e-05, 'epoch': 5.97}


  6%|▌         | 1176/19600 [19:25<3:35:50,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.01it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.22it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.95it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 3.2246811389923096, 'eval_accuracy': 0.2311, 'eval_runtime': 26.517, 'eval_samples_per_second': 377.117, 'eval_steps_per_second': 2.979, 'epoch': 6.0}


  6%|▌         | 1180/19600 [19:56<18:52:33,  3.69s/it]
                                                        
  6%|▌         | 1180/19600 [19:56<18:52:33,  3.69s/it]][A

{'loss': 3.0248, 'grad_norm': 13.140463829040527, 'learning_rate': 1.879591836734694e-05, 'epoch': 6.02}


  6%|▌         | 1190/19600 [20:05<4:53:37,  1.04it/s] 
                                                        
  6%|▌         | 1190/19600 [20:05<4:53:37,  1.04it/s]s][A

{'loss': 3.1022, 'grad_norm': 13.311887741088867, 'learning_rate': 1.8785714285714286e-05, 'epoch': 6.07}


  6%|▌         | 1200/19600 [20:13<4:30:08,  1.14it/s]
                                                        
  6%|▌         | 1200/19600 [20:13<4:30:08,  1.14it/s]s][A

{'loss': 3.0604, 'grad_norm': 13.764361381530762, 'learning_rate': 1.8775510204081636e-05, 'epoch': 6.12}


  6%|▌         | 1210/19600 [20:22<4:29:53,  1.14it/s]
                                                        
  6%|▌         | 1210/19600 [20:22<4:29:53,  1.14it/s]s][A

{'loss': 2.9964, 'grad_norm': 12.776761054992676, 'learning_rate': 1.876530612244898e-05, 'epoch': 6.17}


  6%|▌         | 1220/19600 [20:31<4:29:24,  1.14it/s]
                                                        
  6%|▌         | 1220/19600 [20:31<4:29:24,  1.14it/s]s][A

{'loss': 2.9942, 'grad_norm': 13.179728507995605, 'learning_rate': 1.8755102040816327e-05, 'epoch': 6.22}


  6%|▋         | 1230/19600 [20:40<4:30:06,  1.13it/s]
                                                        
  6%|▋         | 1230/19600 [20:40<4:30:06,  1.13it/s]s][A

{'loss': 2.9663, 'grad_norm': 14.817127227783203, 'learning_rate': 1.8744897959183673e-05, 'epoch': 6.28}


  6%|▋         | 1240/19600 [20:49<4:28:55,  1.14it/s]
                                                        
  6%|▋         | 1240/19600 [20:49<4:28:55,  1.14it/s]s][A

{'loss': 3.0342, 'grad_norm': 16.09879493713379, 'learning_rate': 1.8734693877551022e-05, 'epoch': 6.33}


  6%|▋         | 1250/19600 [20:57<4:28:59,  1.14it/s]
                                                        
  6%|▋         | 1250/19600 [20:58<4:28:59,  1.14it/s]s][A

{'loss': 3.0367, 'grad_norm': 13.90337085723877, 'learning_rate': 1.8724489795918368e-05, 'epoch': 6.38}


  6%|▋         | 1260/19600 [21:06<4:27:51,  1.14it/s]
                                                        
  6%|▋         | 1260/19600 [21:06<4:27:51,  1.14it/s]s][A

{'loss': 2.974, 'grad_norm': 14.726383209228516, 'learning_rate': 1.8714285714285717e-05, 'epoch': 6.43}


  6%|▋         | 1270/19600 [21:15<4:28:21,  1.14it/s]
                                                        
  6%|▋         | 1270/19600 [21:15<4:28:21,  1.14it/s]s][A

{'loss': 2.9814, 'grad_norm': 16.4366455078125, 'learning_rate': 1.8704081632653063e-05, 'epoch': 6.48}


  7%|▋         | 1280/19600 [21:24<4:27:50,  1.14it/s]
                                                        
  7%|▋         | 1280/19600 [21:24<4:27:50,  1.14it/s]s][A

{'loss': 3.0264, 'grad_norm': 14.648536682128906, 'learning_rate': 1.869387755102041e-05, 'epoch': 6.53}


  7%|▋         | 1290/19600 [21:33<4:28:06,  1.14it/s]
                                                        
  7%|▋         | 1290/19600 [21:33<4:28:06,  1.14it/s]s][A

{'loss': 3.0234, 'grad_norm': 17.460844039916992, 'learning_rate': 1.8683673469387758e-05, 'epoch': 6.58}


  7%|▋         | 1300/19600 [21:41<4:27:56,  1.14it/s]
                                                        
  7%|▋         | 1300/19600 [21:41<4:27:56,  1.14it/s]s][A

{'loss': 2.9387, 'grad_norm': 15.1658353805542, 'learning_rate': 1.8673469387755104e-05, 'epoch': 6.63}


  7%|▋         | 1310/19600 [21:50<4:27:15,  1.14it/s]
                                                        
  7%|▋         | 1310/19600 [21:50<4:27:15,  1.14it/s]s][A

{'loss': 3.0066, 'grad_norm': 13.966554641723633, 'learning_rate': 1.866326530612245e-05, 'epoch': 6.68}


  7%|▋         | 1320/19600 [21:59<4:27:05,  1.14it/s]
                                                        
  7%|▋         | 1320/19600 [21:59<4:27:05,  1.14it/s]s][A

{'loss': 3.0175, 'grad_norm': 14.336566925048828, 'learning_rate': 1.8653061224489795e-05, 'epoch': 6.73}


  7%|▋         | 1330/19600 [22:08<4:27:36,  1.14it/s]
                                                        
  7%|▋         | 1330/19600 [22:08<4:27:36,  1.14it/s]s][A

{'loss': 2.9388, 'grad_norm': 15.062383651733398, 'learning_rate': 1.8642857142857144e-05, 'epoch': 6.79}


  7%|▋         | 1340/19600 [22:17<4:27:18,  1.14it/s]
                                                        
  7%|▋         | 1340/19600 [22:17<4:27:18,  1.14it/s]s][A

{'loss': 3.0078, 'grad_norm': 16.429277420043945, 'learning_rate': 1.863265306122449e-05, 'epoch': 6.84}


  7%|▋         | 1350/19600 [22:25<4:27:23,  1.14it/s]
                                                        
  7%|▋         | 1350/19600 [22:25<4:27:23,  1.14it/s]s][A

{'loss': 3.0044, 'grad_norm': 13.678689956665039, 'learning_rate': 1.862244897959184e-05, 'epoch': 6.89}


  7%|▋         | 1360/19600 [22:34<4:27:10,  1.14it/s]
                                                        
  7%|▋         | 1360/19600 [22:34<4:27:10,  1.14it/s]s][A

{'loss': 2.993, 'grad_norm': 13.558757781982422, 'learning_rate': 1.8612244897959185e-05, 'epoch': 6.94}


  7%|▋         | 1370/19600 [22:43<4:27:29,  1.14it/s]
                                                        
  7%|▋         | 1370/19600 [22:43<4:27:29,  1.14it/s]s][A

{'loss': 3.0391, 'grad_norm': 15.922893524169922, 'learning_rate': 1.860204081632653e-05, 'epoch': 6.99}


  7%|▋         | 1372/19600 [22:44<3:34:05,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.03it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.12it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.60it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.06it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.02it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.96it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.95it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.95it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.95it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:21,  2.94it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 3.157343864440918, 'eval_accuracy': 0.2516, 'eval_runtime': 26.5819, 'eval_samples_per_second': 376.195, 'eval_steps_per_second': 2.972, 'epoch': 7.0}


  7%|▋         | 1380/19600 [23:19<7:50:41,  1.55s/it] 
                                                        
  7%|▋         | 1380/19600 [23:19<7:50:41,  1.55s/it]s][A

{'loss': 2.9247, 'grad_norm': 15.904783248901367, 'learning_rate': 1.859183673469388e-05, 'epoch': 7.04}


  7%|▋         | 1390/19600 [23:28<4:32:21,  1.11it/s]
                                                        
  7%|▋         | 1390/19600 [23:28<4:32:21,  1.11it/s]s][A

{'loss': 2.9093, 'grad_norm': 16.880945205688477, 'learning_rate': 1.8581632653061226e-05, 'epoch': 7.09}


  7%|▋         | 1400/19600 [23:36<4:26:25,  1.14it/s]
                                                        
  7%|▋         | 1400/19600 [23:36<4:26:25,  1.14it/s]s][A

{'loss': 2.8901, 'grad_norm': 16.310699462890625, 'learning_rate': 1.8571428571428575e-05, 'epoch': 7.14}


  7%|▋         | 1410/19600 [23:45<4:26:28,  1.14it/s]
                                                        
  7%|▋         | 1410/19600 [23:45<4:26:28,  1.14it/s]s][A

{'loss': 2.8986, 'grad_norm': 16.366628646850586, 'learning_rate': 1.856122448979592e-05, 'epoch': 7.19}


  7%|▋         | 1420/19600 [23:54<4:26:37,  1.14it/s]
                                                        
  7%|▋         | 1420/19600 [23:54<4:26:37,  1.14it/s]s][A

{'loss': 2.8705, 'grad_norm': 15.72362232208252, 'learning_rate': 1.8551020408163267e-05, 'epoch': 7.24}


  7%|▋         | 1430/19600 [24:03<4:26:07,  1.14it/s]
                                                        
  7%|▋         | 1430/19600 [24:03<4:26:07,  1.14it/s]s][A

{'loss': 2.8631, 'grad_norm': 16.05551528930664, 'learning_rate': 1.8540816326530613e-05, 'epoch': 7.3}


  7%|▋         | 1440/19600 [24:12<4:25:56,  1.14it/s]
                                                        
  7%|▋         | 1440/19600 [24:12<4:25:56,  1.14it/s]s][A

{'loss': 2.8203, 'grad_norm': 16.660606384277344, 'learning_rate': 1.853061224489796e-05, 'epoch': 7.35}


  7%|▋         | 1450/19600 [24:20<4:25:46,  1.14it/s]
                                                        
  7%|▋         | 1450/19600 [24:20<4:25:46,  1.14it/s]s][A

{'loss': 2.8866, 'grad_norm': 20.065000534057617, 'learning_rate': 1.8520408163265307e-05, 'epoch': 7.4}


  7%|▋         | 1460/19600 [24:29<4:25:18,  1.14it/s]
                                                        
  7%|▋         | 1460/19600 [24:29<4:25:18,  1.14it/s]s][A

{'loss': 2.8999, 'grad_norm': 15.514944076538086, 'learning_rate': 1.8510204081632653e-05, 'epoch': 7.45}


  8%|▊         | 1470/19600 [24:38<4:25:38,  1.14it/s]
                                                        
  8%|▊         | 1470/19600 [24:38<4:25:38,  1.14it/s]s][A

{'loss': 2.897, 'grad_norm': 15.46363639831543, 'learning_rate': 1.8500000000000002e-05, 'epoch': 7.5}


  8%|▊         | 1480/19600 [24:47<4:25:44,  1.14it/s]
                                                        
  8%|▊         | 1480/19600 [24:47<4:25:44,  1.14it/s]s][A

{'loss': 2.9047, 'grad_norm': 17.330108642578125, 'learning_rate': 1.8489795918367348e-05, 'epoch': 7.55}


  8%|▊         | 1490/19600 [24:56<4:25:51,  1.14it/s]
                                                        
  8%|▊         | 1490/19600 [24:56<4:25:51,  1.14it/s]s][A

{'loss': 2.9238, 'grad_norm': 15.88371467590332, 'learning_rate': 1.8479591836734697e-05, 'epoch': 7.6}


  8%|▊         | 1500/19600 [25:04<4:25:35,  1.14it/s]
                                                        
  8%|▊         | 1500/19600 [25:04<4:25:35,  1.14it/s]s][A

{'loss': 2.8465, 'grad_norm': 18.158321380615234, 'learning_rate': 1.8469387755102043e-05, 'epoch': 7.65}


  8%|▊         | 1510/19600 [25:13<4:24:49,  1.14it/s]
                                                        
  8%|▊         | 1510/19600 [25:13<4:24:49,  1.14it/s]s][A

{'loss': 2.8967, 'grad_norm': 17.526945114135742, 'learning_rate': 1.845918367346939e-05, 'epoch': 7.7}


  8%|▊         | 1520/19600 [25:22<4:25:25,  1.14it/s]
                                                        
  8%|▊         | 1520/19600 [25:22<4:25:25,  1.14it/s]s][A

{'loss': 2.8578, 'grad_norm': 16.310810089111328, 'learning_rate': 1.8448979591836735e-05, 'epoch': 7.76}


  8%|▊         | 1530/19600 [25:31<4:24:24,  1.14it/s]
                                                        
  8%|▊         | 1530/19600 [25:31<4:24:24,  1.14it/s]s][A

{'loss': 2.8245, 'grad_norm': 17.10711669921875, 'learning_rate': 1.8438775510204084e-05, 'epoch': 7.81}


  8%|▊         | 1540/19600 [25:40<4:24:23,  1.14it/s]
                                                        
  8%|▊         | 1540/19600 [25:40<4:24:23,  1.14it/s]s][A

{'loss': 2.9094, 'grad_norm': 18.895193099975586, 'learning_rate': 1.842857142857143e-05, 'epoch': 7.86}


  8%|▊         | 1550/19600 [25:48<4:24:37,  1.14it/s]
                                                        
  8%|▊         | 1550/19600 [25:48<4:24:37,  1.14it/s]s][A

{'loss': 2.8668, 'grad_norm': 17.988555908203125, 'learning_rate': 1.8418367346938776e-05, 'epoch': 7.91}


  8%|▊         | 1560/19600 [25:57<4:24:15,  1.14it/s]
                                                        
  8%|▊         | 1560/19600 [25:57<4:24:15,  1.14it/s]s][A

{'loss': 2.92, 'grad_norm': 15.103913307189941, 'learning_rate': 1.8408163265306125e-05, 'epoch': 7.96}


  8%|▊         | 1568/19600 [26:04<3:31:30,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.04it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.13it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.11it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.06it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.98it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.99it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.99it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 3.050079345703125, 'eval_accuracy': 0.2687, 'eval_runtime': 26.5205, 'eval_samples_per_second': 377.066, 'eval_steps_per_second': 2.979, 'epoch': 8.0}


  8%|▊         | 1570/19600 [26:33<32:48:37,  6.55s/it]
                                                        
  8%|▊         | 1570/19600 [26:33<32:48:37,  6.55s/it]][A

{'loss': 2.7928, 'grad_norm': 17.311073303222656, 'learning_rate': 1.839795918367347e-05, 'epoch': 8.01}


  8%|▊         | 1580/19600 [26:42<5:12:02,  1.04s/it] 
                                                        
  8%|▊         | 1580/19600 [26:42<5:12:02,  1.04s/it]s][A

{'loss': 2.7654, 'grad_norm': 17.070314407348633, 'learning_rate': 1.838775510204082e-05, 'epoch': 8.06}


  8%|▊         | 1590/19600 [26:50<4:25:34,  1.13it/s]
                                                        
  8%|▊         | 1590/19600 [26:50<4:25:34,  1.13it/s]s][A

{'loss': 2.7614, 'grad_norm': 17.935121536254883, 'learning_rate': 1.8377551020408165e-05, 'epoch': 8.11}


  8%|▊         | 1600/19600 [26:59<4:24:10,  1.14it/s]
                                                        
  8%|▊         | 1600/19600 [26:59<4:24:10,  1.14it/s]s][A

{'loss': 2.7584, 'grad_norm': 18.55483627319336, 'learning_rate': 1.836734693877551e-05, 'epoch': 8.16}


  8%|▊         | 1610/19600 [27:08<4:22:47,  1.14it/s]
                                                        
  8%|▊         | 1610/19600 [27:08<4:22:47,  1.14it/s]s][A

{'loss': 2.7312, 'grad_norm': 16.546092987060547, 'learning_rate': 1.8357142857142857e-05, 'epoch': 8.21}


  8%|▊         | 1620/19600 [27:17<4:22:57,  1.14it/s]
                                                        
  8%|▊         | 1620/19600 [27:17<4:22:57,  1.14it/s]s][A

{'loss': 2.7584, 'grad_norm': 20.032062530517578, 'learning_rate': 1.8346938775510206e-05, 'epoch': 8.27}


  8%|▊         | 1630/19600 [27:26<4:23:17,  1.14it/s]
                                                        
  8%|▊         | 1630/19600 [27:26<4:23:17,  1.14it/s]s][A

{'loss': 2.8178, 'grad_norm': 19.146331787109375, 'learning_rate': 1.8336734693877552e-05, 'epoch': 8.32}


  8%|▊         | 1640/19600 [27:34<4:22:36,  1.14it/s]
                                                        
  8%|▊         | 1640/19600 [27:34<4:22:36,  1.14it/s]s][A

{'loss': 2.7733, 'grad_norm': 18.504009246826172, 'learning_rate': 1.8326530612244898e-05, 'epoch': 8.37}


  8%|▊         | 1650/19600 [27:43<4:23:36,  1.13it/s]
                                                        
  8%|▊         | 1650/19600 [27:43<4:23:36,  1.13it/s]s][A

{'loss': 2.7464, 'grad_norm': 18.534421920776367, 'learning_rate': 1.8316326530612247e-05, 'epoch': 8.42}


  8%|▊         | 1660/19600 [27:52<4:23:08,  1.14it/s]
                                                        
  8%|▊         | 1660/19600 [27:52<4:23:08,  1.14it/s]s][A

{'loss': 2.7878, 'grad_norm': 16.304697036743164, 'learning_rate': 1.8306122448979593e-05, 'epoch': 8.47}


  9%|▊         | 1670/19600 [28:01<4:22:38,  1.14it/s]
                                                        
  9%|▊         | 1670/19600 [28:01<4:22:38,  1.14it/s]s][A

{'loss': 2.7739, 'grad_norm': 18.827817916870117, 'learning_rate': 1.8295918367346942e-05, 'epoch': 8.52}


  9%|▊         | 1680/19600 [28:10<4:22:51,  1.14it/s]
                                                        
  9%|▊         | 1680/19600 [28:10<4:22:51,  1.14it/s]s][A

{'loss': 2.8091, 'grad_norm': 15.282074928283691, 'learning_rate': 1.8285714285714288e-05, 'epoch': 8.57}


  9%|▊         | 1690/19600 [28:18<4:22:06,  1.14it/s]
                                                        
  9%|▊         | 1690/19600 [28:18<4:22:06,  1.14it/s]s][A

{'loss': 2.7506, 'grad_norm': 17.331336975097656, 'learning_rate': 1.8275510204081634e-05, 'epoch': 8.62}


  9%|▊         | 1700/19600 [28:27<4:22:03,  1.14it/s]
                                                        
  9%|▊         | 1700/19600 [28:27<4:22:03,  1.14it/s]s][A

{'loss': 2.8198, 'grad_norm': 17.180233001708984, 'learning_rate': 1.826530612244898e-05, 'epoch': 8.67}


  9%|▊         | 1710/19600 [28:36<4:22:35,  1.14it/s]
                                                        
  9%|▊         | 1710/19600 [28:36<4:22:35,  1.14it/s]s][A

{'loss': 2.7572, 'grad_norm': 17.080211639404297, 'learning_rate': 1.825510204081633e-05, 'epoch': 8.72}


  9%|▉         | 1720/19600 [28:45<4:22:15,  1.14it/s]
                                                        
  9%|▉         | 1720/19600 [28:45<4:22:15,  1.14it/s]s][A

{'loss': 2.7053, 'grad_norm': 17.214357376098633, 'learning_rate': 1.8244897959183674e-05, 'epoch': 8.78}


  9%|▉         | 1730/19600 [28:53<4:22:12,  1.14it/s]
                                                        
  9%|▉         | 1730/19600 [28:53<4:22:12,  1.14it/s]s][A

{'loss': 2.7531, 'grad_norm': 17.66777229309082, 'learning_rate': 1.823469387755102e-05, 'epoch': 8.83}


  9%|▉         | 1740/19600 [29:02<4:22:07,  1.14it/s]
                                                        
  9%|▉         | 1740/19600 [29:02<4:22:07,  1.14it/s]s][A

{'loss': 2.6943, 'grad_norm': 19.456085205078125, 'learning_rate': 1.822448979591837e-05, 'epoch': 8.88}


  9%|▉         | 1750/19600 [29:11<4:22:03,  1.14it/s]
                                                        
  9%|▉         | 1750/19600 [29:11<4:22:03,  1.14it/s]s][A

{'loss': 2.8109, 'grad_norm': 18.643962860107422, 'learning_rate': 1.8214285714285715e-05, 'epoch': 8.93}


  9%|▉         | 1760/19600 [29:20<4:21:36,  1.14it/s]
                                                        
  9%|▉         | 1760/19600 [29:20<4:21:36,  1.14it/s]s][A

{'loss': 2.7318, 'grad_norm': 17.64029312133789, 'learning_rate': 1.8204081632653064e-05, 'epoch': 8.98}


  9%|▉         | 1764/19600 [29:23<3:29:22,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.03it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.13it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.19it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.10it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:23,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.9962310791015625, 'eval_accuracy': 0.2808, 'eval_runtime': 26.4925, 'eval_samples_per_second': 377.466, 'eval_steps_per_second': 2.982, 'epoch': 9.0}


  9%|▉         | 1770/19600 [29:55<11:04:16,  2.24s/it]
                                                        
  9%|▉         | 1770/19600 [29:55<11:04:16,  2.24s/it]][A

{'loss': 2.731, 'grad_norm': 22.65355682373047, 'learning_rate': 1.819387755102041e-05, 'epoch': 9.03}


  9%|▉         | 1780/19600 [30:04<4:32:22,  1.09it/s] 
                                                        
  9%|▉         | 1780/19600 [30:04<4:32:22,  1.09it/s]s][A

{'loss': 2.5868, 'grad_norm': 22.019729614257812, 'learning_rate': 1.8183673469387756e-05, 'epoch': 9.08}


  9%|▉         | 1790/19600 [30:13<4:21:45,  1.13it/s]
                                                        
  9%|▉         | 1790/19600 [30:13<4:21:45,  1.13it/s]s][A

{'loss': 2.7178, 'grad_norm': 18.166532516479492, 'learning_rate': 1.81734693877551e-05, 'epoch': 9.13}


  9%|▉         | 1800/19600 [30:22<4:20:32,  1.14it/s]
                                                        
  9%|▉         | 1800/19600 [30:22<4:20:32,  1.14it/s]s][A

{'loss': 2.6324, 'grad_norm': 18.91908836364746, 'learning_rate': 1.816326530612245e-05, 'epoch': 9.18}


  9%|▉         | 1810/19600 [30:31<4:20:40,  1.14it/s]
                                                        
  9%|▉         | 1810/19600 [30:31<4:20:40,  1.14it/s]s][A

{'loss': 2.6898, 'grad_norm': 19.395254135131836, 'learning_rate': 1.8153061224489797e-05, 'epoch': 9.23}


  9%|▉         | 1820/19600 [30:39<4:20:42,  1.14it/s]
                                                        
  9%|▉         | 1820/19600 [30:39<4:20:42,  1.14it/s]s][A

{'loss': 2.6666, 'grad_norm': 18.9864444732666, 'learning_rate': 1.8142857142857146e-05, 'epoch': 9.29}


  9%|▉         | 1830/19600 [30:48<4:20:51,  1.14it/s]
                                                        
  9%|▉         | 1830/19600 [30:48<4:20:51,  1.14it/s]s][A

{'loss': 2.6769, 'grad_norm': 16.916004180908203, 'learning_rate': 1.813265306122449e-05, 'epoch': 9.34}


  9%|▉         | 1840/19600 [30:57<4:20:21,  1.14it/s]
                                                        
  9%|▉         | 1840/19600 [30:57<4:20:21,  1.14it/s]s][A

{'loss': 2.7609, 'grad_norm': 21.08806610107422, 'learning_rate': 1.8122448979591837e-05, 'epoch': 9.39}


  9%|▉         | 1850/19600 [31:06<4:19:55,  1.14it/s]
                                                        
  9%|▉         | 1850/19600 [31:06<4:19:55,  1.14it/s]s][A

{'loss': 2.7053, 'grad_norm': 18.576950073242188, 'learning_rate': 1.8112244897959187e-05, 'epoch': 9.44}


  9%|▉         | 1860/19600 [31:15<4:19:51,  1.14it/s]
                                                        
  9%|▉         | 1860/19600 [31:15<4:19:51,  1.14it/s]s][A

{'loss': 2.5523, 'grad_norm': 17.00912857055664, 'learning_rate': 1.8102040816326532e-05, 'epoch': 9.49}


 10%|▉         | 1870/19600 [31:23<4:20:08,  1.14it/s]
                                                        
 10%|▉         | 1870/19600 [31:23<4:20:08,  1.14it/s]s][A

{'loss': 2.6147, 'grad_norm': 18.165712356567383, 'learning_rate': 1.8091836734693878e-05, 'epoch': 9.54}


 10%|▉         | 1880/19600 [31:32<4:19:55,  1.14it/s]
                                                        
 10%|▉         | 1880/19600 [31:32<4:19:55,  1.14it/s]s][A

{'loss': 2.618, 'grad_norm': 18.802642822265625, 'learning_rate': 1.8081632653061224e-05, 'epoch': 9.59}


 10%|▉         | 1890/19600 [31:41<4:18:35,  1.14it/s]
                                                        
 10%|▉         | 1890/19600 [31:41<4:18:35,  1.14it/s]s][A

{'loss': 2.6902, 'grad_norm': 17.95478630065918, 'learning_rate': 1.8071428571428573e-05, 'epoch': 9.64}


 10%|▉         | 1900/19600 [31:50<4:18:40,  1.14it/s]
                                                        
 10%|▉         | 1900/19600 [31:50<4:18:40,  1.14it/s]s][A

{'loss': 2.6584, 'grad_norm': 16.984804153442383, 'learning_rate': 1.806122448979592e-05, 'epoch': 9.69}


 10%|▉         | 1910/19600 [31:59<4:18:54,  1.14it/s]
                                                        
 10%|▉         | 1910/19600 [31:59<4:18:54,  1.14it/s]s][A

{'loss': 2.7311, 'grad_norm': 18.96785545349121, 'learning_rate': 1.8051020408163268e-05, 'epoch': 9.74}


 10%|▉         | 1920/19600 [32:07<4:18:20,  1.14it/s]
                                                        
 10%|▉         | 1920/19600 [32:07<4:18:20,  1.14it/s]s][A

{'loss': 2.675, 'grad_norm': 20.32529067993164, 'learning_rate': 1.8040816326530614e-05, 'epoch': 9.8}


 10%|▉         | 1930/19600 [32:16<4:18:32,  1.14it/s]
                                                        
 10%|▉         | 1930/19600 [32:16<4:18:32,  1.14it/s]s][A

{'loss': 2.6271, 'grad_norm': 19.416322708129883, 'learning_rate': 1.803061224489796e-05, 'epoch': 9.85}


 10%|▉         | 1940/19600 [32:25<4:19:11,  1.14it/s]
                                                        
 10%|▉         | 1940/19600 [32:25<4:19:11,  1.14it/s]s][A

{'loss': 2.6667, 'grad_norm': 22.124971389770508, 'learning_rate': 1.802040816326531e-05, 'epoch': 9.9}


 10%|▉         | 1950/19600 [32:34<4:18:06,  1.14it/s]
                                                        
 10%|▉         | 1950/19600 [32:34<4:18:06,  1.14it/s]s][A

{'loss': 2.648, 'grad_norm': 19.740203857421875, 'learning_rate': 1.8010204081632655e-05, 'epoch': 9.95}


 10%|█         | 1960/19600 [32:42<3:26:36,  1.42it/s]
                                                        
 10%|█         | 1960/19600 [32:42<3:26:36,  1.42it/s]s][A

{'loss': 2.6129, 'grad_norm': 36.024776458740234, 'learning_rate': 1.8e-05, 'epoch': 10.0}




  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.02it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.13it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.59it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.06it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.02it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.97it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00:05<00:20,  2.95it/s][A[A

 24%|██▍       | 19/79 [0

{'eval_loss': 2.9404690265655518, 'eval_accuracy': 0.2933, 'eval_runtime': 26.6044, 'eval_samples_per_second': 375.878, 'eval_steps_per_second': 2.969, 'epoch': 10.0}


 10%|█         | 1970/19600 [33:19<5:55:52,  1.21s/it] 
                                                        
 10%|█         | 1970/19600 [33:19<5:55:52,  1.21s/it]s][A

{'loss': 2.5504, 'grad_norm': 22.23739242553711, 'learning_rate': 1.7989795918367346e-05, 'epoch': 10.05}


 10%|█         | 1980/19600 [33:27<4:20:49,  1.13it/s]
                                                        
 10%|█         | 1980/19600 [33:27<4:20:49,  1.13it/s]s][A

{'loss': 2.5458, 'grad_norm': 20.72662353515625, 'learning_rate': 1.7979591836734695e-05, 'epoch': 10.1}


 10%|█         | 1990/19600 [33:36<4:18:37,  1.13it/s]
                                                        
 10%|█         | 1990/19600 [33:36<4:18:37,  1.13it/s]s][A

{'loss': 2.5792, 'grad_norm': 18.31346893310547, 'learning_rate': 1.796938775510204e-05, 'epoch': 10.15}


 10%|█         | 2000/19600 [33:45<4:17:47,  1.14it/s]
                                                        
 10%|█         | 2000/19600 [33:45<4:17:47,  1.14it/s]s][A

{'loss': 2.557, 'grad_norm': 19.90450096130371, 'learning_rate': 1.795918367346939e-05, 'epoch': 10.2}


 10%|█         | 2010/19600 [33:54<4:18:18,  1.13it/s]
                                                        
 10%|█         | 2010/19600 [33:54<4:18:18,  1.13it/s]s][A

{'loss': 2.6232, 'grad_norm': 21.145578384399414, 'learning_rate': 1.7948979591836736e-05, 'epoch': 10.26}


 10%|█         | 2020/19600 [34:03<4:17:47,  1.14it/s]
                                                        
 10%|█         | 2020/19600 [34:03<4:17:47,  1.14it/s]s][A

{'loss': 2.5118, 'grad_norm': 19.223953247070312, 'learning_rate': 1.7938775510204082e-05, 'epoch': 10.31}


 10%|█         | 2030/19600 [34:11<4:16:47,  1.14it/s]
                                                        
 10%|█         | 2030/19600 [34:11<4:16:47,  1.14it/s]s][A

{'loss': 2.556, 'grad_norm': 20.774545669555664, 'learning_rate': 1.792857142857143e-05, 'epoch': 10.36}


 10%|█         | 2040/19600 [34:20<4:16:48,  1.14it/s]
                                                        
 10%|█         | 2040/19600 [34:20<4:16:48,  1.14it/s]s][A

{'loss': 2.5092, 'grad_norm': 24.72385025024414, 'learning_rate': 1.7918367346938777e-05, 'epoch': 10.41}


 10%|█         | 2050/19600 [34:29<4:17:04,  1.14it/s]
                                                        
 10%|█         | 2050/19600 [34:29<4:17:04,  1.14it/s]s][A

{'loss': 2.5697, 'grad_norm': 17.508220672607422, 'learning_rate': 1.7908163265306123e-05, 'epoch': 10.46}


 11%|█         | 2060/19600 [34:38<4:16:45,  1.14it/s]
                                                        
 11%|█         | 2060/19600 [34:38<4:16:45,  1.14it/s]s][A

{'loss': 2.5138, 'grad_norm': 20.648313522338867, 'learning_rate': 1.789795918367347e-05, 'epoch': 10.51}


 11%|█         | 2070/19600 [34:47<4:17:17,  1.14it/s]
                                                        
 11%|█         | 2070/19600 [34:47<4:17:17,  1.14it/s]s][A

{'loss': 2.5549, 'grad_norm': 17.475595474243164, 'learning_rate': 1.7887755102040818e-05, 'epoch': 10.56}


 11%|█         | 2080/19600 [34:55<4:17:08,  1.14it/s]
                                                        
 11%|█         | 2080/19600 [34:55<4:17:08,  1.14it/s]s][A

{'loss': 2.5689, 'grad_norm': 20.179424285888672, 'learning_rate': 1.7877551020408164e-05, 'epoch': 10.61}


 11%|█         | 2090/19600 [35:04<4:16:59,  1.14it/s]
                                                        
 11%|█         | 2090/19600 [35:04<4:16:59,  1.14it/s]s][A

{'loss': 2.4776, 'grad_norm': 18.719552993774414, 'learning_rate': 1.7867346938775513e-05, 'epoch': 10.66}


 11%|█         | 2100/19600 [35:13<4:16:40,  1.14it/s]
                                                        
 11%|█         | 2100/19600 [35:13<4:16:40,  1.14it/s]s][A

{'loss': 2.5022, 'grad_norm': 19.41474151611328, 'learning_rate': 1.785714285714286e-05, 'epoch': 10.71}


 11%|█         | 2110/19600 [35:22<4:16:05,  1.14it/s]
                                                        
 11%|█         | 2110/19600 [35:22<4:16:05,  1.14it/s]s][A

{'loss': 2.5645, 'grad_norm': 24.220230102539062, 'learning_rate': 1.7846938775510204e-05, 'epoch': 10.77}


 11%|█         | 2120/19600 [35:31<4:15:32,  1.14it/s]
                                                        
 11%|█         | 2120/19600 [35:31<4:15:32,  1.14it/s]s][A

{'loss': 2.5996, 'grad_norm': 18.6529541015625, 'learning_rate': 1.7836734693877553e-05, 'epoch': 10.82}


 11%|█         | 2130/19600 [35:39<4:15:33,  1.14it/s]
                                                        
 11%|█         | 2130/19600 [35:39<4:15:33,  1.14it/s]s][A

{'loss': 2.6099, 'grad_norm': 18.9697322845459, 'learning_rate': 1.78265306122449e-05, 'epoch': 10.87}


 11%|█         | 2140/19600 [35:48<4:15:12,  1.14it/s]
                                                        
 11%|█         | 2140/19600 [35:48<4:15:12,  1.14it/s]s][A

{'loss': 2.6252, 'grad_norm': 23.66621971130371, 'learning_rate': 1.781632653061225e-05, 'epoch': 10.92}


 11%|█         | 2150/19600 [35:57<4:15:22,  1.14it/s]
                                                        
 11%|█         | 2150/19600 [35:57<4:15:22,  1.14it/s]s][A

{'loss': 2.5674, 'grad_norm': 20.703968048095703, 'learning_rate': 1.780612244897959e-05, 'epoch': 10.97}


 11%|█         | 2156/19600 [36:02<3:24:12,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.09it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.16it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.62it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.37it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.13it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.08it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.97it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.95it/s][A[A

 18%|█▊        | 14/79 [00:04<00:22,  2.94it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.94it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.94it/s][A[A

 22%|██▏       | 17/79 [00:05<00:21,  2.95it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.8986144065856934, 'eval_accuracy': 0.3026, 'eval_runtime': 26.5302, 'eval_samples_per_second': 376.93, 'eval_steps_per_second': 2.978, 'epoch': 11.0}


 11%|█         | 2160/19600 [36:33<17:46:38,  3.67s/it]
                                                        
 11%|█         | 2160/19600 [36:33<17:46:38,  3.67s/it]][A

{'loss': 2.4641, 'grad_norm': 21.444345474243164, 'learning_rate': 1.779591836734694e-05, 'epoch': 11.02}


 11%|█         | 2170/19600 [36:41<4:38:37,  1.04it/s] 
                                                        
 11%|█         | 2170/19600 [36:41<4:38:37,  1.04it/s]s][A

{'loss': 2.4467, 'grad_norm': 18.3427791595459, 'learning_rate': 1.7785714285714286e-05, 'epoch': 11.07}


 11%|█         | 2180/19600 [36:50<4:15:56,  1.13it/s]
                                                        
 11%|█         | 2180/19600 [36:50<4:15:56,  1.13it/s]s][A

{'loss': 2.4645, 'grad_norm': 22.427661895751953, 'learning_rate': 1.7775510204081635e-05, 'epoch': 11.12}


 11%|█         | 2190/19600 [36:59<4:15:02,  1.14it/s]
                                                        
 11%|█         | 2190/19600 [36:59<4:15:02,  1.14it/s]s][A

{'loss': 2.386, 'grad_norm': 20.439924240112305, 'learning_rate': 1.776530612244898e-05, 'epoch': 11.17}


 11%|█         | 2200/19600 [37:08<4:14:41,  1.14it/s]
                                                        
 11%|█         | 2200/19600 [37:08<4:14:41,  1.14it/s]s][A

{'loss': 2.4752, 'grad_norm': 24.136232376098633, 'learning_rate': 1.7755102040816327e-05, 'epoch': 11.22}


 11%|█▏        | 2210/19600 [37:17<4:15:10,  1.14it/s]
                                                        
 11%|█▏        | 2210/19600 [37:17<4:15:10,  1.14it/s]s][A

{'loss': 2.3856, 'grad_norm': 18.885465621948242, 'learning_rate': 1.7744897959183676e-05, 'epoch': 11.28}


 11%|█▏        | 2220/19600 [37:25<4:14:38,  1.14it/s]
                                                        
 11%|█▏        | 2220/19600 [37:25<4:14:38,  1.14it/s]s][A

{'loss': 2.4102, 'grad_norm': 20.137317657470703, 'learning_rate': 1.773469387755102e-05, 'epoch': 11.33}


 11%|█▏        | 2230/19600 [37:34<4:13:51,  1.14it/s]
                                                        
 11%|█▏        | 2230/19600 [37:34<4:13:51,  1.14it/s]s][A

{'loss': 2.4473, 'grad_norm': 23.142858505249023, 'learning_rate': 1.772448979591837e-05, 'epoch': 11.38}


 11%|█▏        | 2240/19600 [37:43<4:13:45,  1.14it/s]
                                                        
 11%|█▏        | 2240/19600 [37:43<4:13:45,  1.14it/s]s][A

{'loss': 2.5469, 'grad_norm': 19.65850830078125, 'learning_rate': 1.7714285714285717e-05, 'epoch': 11.43}


 11%|█▏        | 2250/19600 [37:52<4:14:22,  1.14it/s]
                                                        
 11%|█▏        | 2250/19600 [37:52<4:14:22,  1.14it/s]s][A

{'loss': 2.4727, 'grad_norm': 18.620162963867188, 'learning_rate': 1.7704081632653062e-05, 'epoch': 11.48}


 12%|█▏        | 2260/19600 [38:01<4:13:43,  1.14it/s]
                                                        
 12%|█▏        | 2260/19600 [38:01<4:13:43,  1.14it/s]s][A

{'loss': 2.4554, 'grad_norm': 19.59441375732422, 'learning_rate': 1.7693877551020408e-05, 'epoch': 11.53}


 12%|█▏        | 2270/19600 [38:09<4:13:50,  1.14it/s]
                                                        
 12%|█▏        | 2270/19600 [38:09<4:13:50,  1.14it/s]s][A

{'loss': 2.4953, 'grad_norm': 22.1669864654541, 'learning_rate': 1.7683673469387757e-05, 'epoch': 11.58}


 12%|█▏        | 2280/19600 [38:18<4:13:42,  1.14it/s]
                                                        
 12%|█▏        | 2280/19600 [38:18<4:13:42,  1.14it/s]s][A

{'loss': 2.4214, 'grad_norm': 21.66994285583496, 'learning_rate': 1.7673469387755103e-05, 'epoch': 11.63}


 12%|█▏        | 2290/19600 [38:27<4:13:42,  1.14it/s]
                                                        
 12%|█▏        | 2290/19600 [38:27<4:13:42,  1.14it/s]s][A

{'loss': 2.5076, 'grad_norm': 22.736303329467773, 'learning_rate': 1.766326530612245e-05, 'epoch': 11.68}


 12%|█▏        | 2300/19600 [38:36<4:13:36,  1.14it/s]
                                                        
 12%|█▏        | 2300/19600 [38:36<4:13:36,  1.14it/s]s][A

{'loss': 2.4161, 'grad_norm': 19.29757308959961, 'learning_rate': 1.7653061224489798e-05, 'epoch': 11.73}


 12%|█▏        | 2310/19600 [38:45<4:13:29,  1.14it/s]
                                                        
 12%|█▏        | 2310/19600 [38:45<4:13:29,  1.14it/s]s][A

{'loss': 2.4703, 'grad_norm': 24.619033813476562, 'learning_rate': 1.7642857142857144e-05, 'epoch': 11.79}


 12%|█▏        | 2320/19600 [38:53<4:13:21,  1.14it/s]
                                                        
 12%|█▏        | 2320/19600 [38:53<4:13:21,  1.14it/s]s][A

{'loss': 2.4793, 'grad_norm': 23.76540184020996, 'learning_rate': 1.7632653061224493e-05, 'epoch': 11.84}


 12%|█▏        | 2330/19600 [39:02<4:13:22,  1.14it/s]
                                                        
 12%|█▏        | 2330/19600 [39:02<4:13:22,  1.14it/s]s][A

{'loss': 2.405, 'grad_norm': 21.330873489379883, 'learning_rate': 1.762244897959184e-05, 'epoch': 11.89}


 12%|█▏        | 2340/19600 [39:11<4:13:19,  1.14it/s]
                                                        
 12%|█▏        | 2340/19600 [39:11<4:13:19,  1.14it/s]s][A

{'loss': 2.3869, 'grad_norm': 21.601211547851562, 'learning_rate': 1.7612244897959185e-05, 'epoch': 11.94}


 12%|█▏        | 2350/19600 [39:20<4:12:36,  1.14it/s]
                                                        
 12%|█▏        | 2350/19600 [39:20<4:12:36,  1.14it/s]s][A

{'loss': 2.474, 'grad_norm': 22.91109275817871, 'learning_rate': 1.760204081632653e-05, 'epoch': 11.99}


 12%|█▏        | 2352/19600 [39:21<3:22:16,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.03it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.15it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.13it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.02it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.854879379272461, 'eval_accuracy': 0.3099, 'eval_runtime': 26.4843, 'eval_samples_per_second': 377.582, 'eval_steps_per_second': 2.983, 'epoch': 12.0}


 12%|█▏        | 2360/19600 [39:56<7:24:42,  1.55s/it] 
                                                        
 12%|█▏        | 2360/19600 [39:56<7:24:42,  1.55s/it]s][A

{'loss': 2.3637, 'grad_norm': 19.645004272460938, 'learning_rate': 1.759183673469388e-05, 'epoch': 12.04}


 12%|█▏        | 2370/19600 [40:04<4:17:50,  1.11it/s]
                                                        
 12%|█▏        | 2370/19600 [40:04<4:17:50,  1.11it/s]s][A

{'loss': 2.3117, 'grad_norm': 28.42452049255371, 'learning_rate': 1.7581632653061225e-05, 'epoch': 12.09}


 12%|█▏        | 2380/19600 [40:13<4:12:40,  1.14it/s]
                                                        
 12%|█▏        | 2380/19600 [40:13<4:12:40,  1.14it/s]s][A

{'loss': 2.366, 'grad_norm': 23.13876724243164, 'learning_rate': 1.757142857142857e-05, 'epoch': 12.14}


 12%|█▏        | 2390/19600 [40:22<4:12:16,  1.14it/s]
                                                        
 12%|█▏        | 2390/19600 [40:22<4:12:16,  1.14it/s]s][A

{'loss': 2.3142, 'grad_norm': 24.42511749267578, 'learning_rate': 1.756122448979592e-05, 'epoch': 12.19}


 12%|█▏        | 2400/19600 [40:31<4:11:49,  1.14it/s]
                                                        
 12%|█▏        | 2400/19600 [40:31<4:11:49,  1.14it/s]s][A

{'loss': 2.3047, 'grad_norm': 21.66850471496582, 'learning_rate': 1.7551020408163266e-05, 'epoch': 12.24}


 12%|█▏        | 2410/19600 [40:39<4:11:39,  1.14it/s]
                                                        
 12%|█▏        | 2410/19600 [40:39<4:11:39,  1.14it/s]s][A

{'loss': 2.3921, 'grad_norm': 20.616418838500977, 'learning_rate': 1.7540816326530615e-05, 'epoch': 12.3}


 12%|█▏        | 2420/19600 [40:48<4:12:02,  1.14it/s]
                                                        
 12%|█▏        | 2420/19600 [40:48<4:12:02,  1.14it/s]s][A

{'loss': 2.3708, 'grad_norm': 19.06792640686035, 'learning_rate': 1.753061224489796e-05, 'epoch': 12.35}


 12%|█▏        | 2430/19600 [40:57<4:11:37,  1.14it/s]
                                                        
 12%|█▏        | 2430/19600 [40:57<4:11:37,  1.14it/s]s][A

{'loss': 2.3158, 'grad_norm': 21.895105361938477, 'learning_rate': 1.7520408163265307e-05, 'epoch': 12.4}


 12%|█▏        | 2440/19600 [41:06<4:11:36,  1.14it/s]
                                                        
 12%|█▏        | 2440/19600 [41:06<4:11:36,  1.14it/s]s][A

{'loss': 2.3729, 'grad_norm': 20.380041122436523, 'learning_rate': 1.7510204081632653e-05, 'epoch': 12.45}


 12%|█▎        | 2450/19600 [41:15<4:10:49,  1.14it/s]
                                                        
 12%|█▎        | 2450/19600 [41:15<4:10:49,  1.14it/s]s][A

{'loss': 2.3391, 'grad_norm': 23.48766326904297, 'learning_rate': 1.7500000000000002e-05, 'epoch': 12.5}


 13%|█▎        | 2460/19600 [41:23<4:10:57,  1.14it/s]
                                                        
 13%|█▎        | 2460/19600 [41:23<4:10:57,  1.14it/s]s][A

{'loss': 2.3494, 'grad_norm': 21.593429565429688, 'learning_rate': 1.748979591836735e-05, 'epoch': 12.55}


 13%|█▎        | 2470/19600 [41:32<4:10:36,  1.14it/s]
                                                        
 13%|█▎        | 2470/19600 [41:32<4:10:36,  1.14it/s]s][A

{'loss': 2.358, 'grad_norm': 24.278356552124023, 'learning_rate': 1.7479591836734693e-05, 'epoch': 12.6}


 13%|█▎        | 2480/19600 [41:41<4:10:28,  1.14it/s]
                                                        
 13%|█▎        | 2480/19600 [41:41<4:10:28,  1.14it/s]s][A

{'loss': 2.4383, 'grad_norm': 20.48424530029297, 'learning_rate': 1.7469387755102043e-05, 'epoch': 12.65}


 13%|█▎        | 2490/19600 [41:50<4:10:28,  1.14it/s]
                                                        
 13%|█▎        | 2490/19600 [41:50<4:10:28,  1.14it/s]s][A

{'loss': 2.3815, 'grad_norm': 22.79161262512207, 'learning_rate': 1.745918367346939e-05, 'epoch': 12.7}


 13%|█▎        | 2500/19600 [41:59<4:10:10,  1.14it/s]
                                                        
 13%|█▎        | 2500/19600 [41:59<4:10:10,  1.14it/s]s][A

{'loss': 2.3745, 'grad_norm': 23.940338134765625, 'learning_rate': 1.7448979591836738e-05, 'epoch': 12.76}


 13%|█▎        | 2510/19600 [42:07<4:10:29,  1.14it/s]
                                                        
 13%|█▎        | 2510/19600 [42:07<4:10:29,  1.14it/s]s][A

{'loss': 2.3541, 'grad_norm': 23.10235023498535, 'learning_rate': 1.7438775510204083e-05, 'epoch': 12.81}


 13%|█▎        | 2520/19600 [42:16<4:10:12,  1.14it/s]
                                                        
 13%|█▎        | 2520/19600 [42:16<4:10:12,  1.14it/s]s][A

{'loss': 2.3992, 'grad_norm': 21.523395538330078, 'learning_rate': 1.742857142857143e-05, 'epoch': 12.86}


 13%|█▎        | 2530/19600 [42:25<4:10:22,  1.14it/s]
                                                        
 13%|█▎        | 2530/19600 [42:25<4:10:22,  1.14it/s]s][A

{'loss': 2.3423, 'grad_norm': 23.73554039001465, 'learning_rate': 1.7418367346938775e-05, 'epoch': 12.91}


 13%|█▎        | 2540/19600 [42:34<4:10:10,  1.14it/s]
                                                        
 13%|█▎        | 2540/19600 [42:34<4:10:10,  1.14it/s]s][A

{'loss': 2.3369, 'grad_norm': 23.07662010192871, 'learning_rate': 1.7408163265306124e-05, 'epoch': 12.96}


 13%|█▎        | 2548/19600 [42:40<3:19:41,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.07it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.15it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.62it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.37it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:22,  3.13it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.09it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.02it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.98it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.97it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.97it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.819906711578369, 'eval_accuracy': 0.3153, 'eval_runtime': 26.4534, 'eval_samples_per_second': 378.023, 'eval_steps_per_second': 2.986, 'epoch': 13.0}


 13%|█▎        | 2550/19600 [43:10<31:19:51,  6.62s/it]
                                                        
 13%|█▎        | 2550/19600 [43:10<31:19:51,  6.62s/it]][A

{'loss': 2.2755, 'grad_norm': 25.85480499267578, 'learning_rate': 1.7397959183673473e-05, 'epoch': 13.01}


 13%|█▎        | 2560/19600 [43:19<4:55:17,  1.04s/it] 
                                                        
 13%|█▎        | 2560/19600 [43:19<4:55:17,  1.04s/it]s][A

{'loss': 2.273, 'grad_norm': 24.053401947021484, 'learning_rate': 1.738775510204082e-05, 'epoch': 13.06}


 13%|█▎        | 2570/19600 [43:27<4:10:42,  1.13it/s]
                                                        
 13%|█▎        | 2570/19600 [43:27<4:10:42,  1.13it/s]s][A

{'loss': 2.1412, 'grad_norm': 24.307680130004883, 'learning_rate': 1.7377551020408165e-05, 'epoch': 13.11}


 13%|█▎        | 2580/19600 [43:36<4:09:53,  1.14it/s]
                                                        
 13%|█▎        | 2580/19600 [43:36<4:09:53,  1.14it/s]s][A

{'loss': 2.2871, 'grad_norm': 20.14406967163086, 'learning_rate': 1.736734693877551e-05, 'epoch': 13.16}


 13%|█▎        | 2590/19600 [43:45<4:09:18,  1.14it/s]
                                                        
 13%|█▎        | 2590/19600 [43:45<4:09:18,  1.14it/s]s][A

{'loss': 2.27, 'grad_norm': 21.571945190429688, 'learning_rate': 1.735714285714286e-05, 'epoch': 13.21}


 13%|█▎        | 2600/19600 [43:54<4:08:56,  1.14it/s]
                                                        
 13%|█▎        | 2600/19600 [43:54<4:08:56,  1.14it/s]s][A

{'loss': 2.2675, 'grad_norm': 22.534744262695312, 'learning_rate': 1.7346938775510206e-05, 'epoch': 13.27}


 13%|█▎        | 2610/19600 [44:03<4:09:04,  1.14it/s]
                                                        
 13%|█▎        | 2610/19600 [44:03<4:09:04,  1.14it/s]s][A

{'loss': 2.2383, 'grad_norm': 25.0044002532959, 'learning_rate': 1.733673469387755e-05, 'epoch': 13.32}


 13%|█▎        | 2620/19600 [44:12<4:08:27,  1.14it/s]
                                                        
 13%|█▎        | 2620/19600 [44:12<4:08:27,  1.14it/s]s][A

{'loss': 2.2415, 'grad_norm': 21.410282135009766, 'learning_rate': 1.7326530612244897e-05, 'epoch': 13.37}


 13%|█▎        | 2630/19600 [44:20<4:08:48,  1.14it/s]
                                                        
 13%|█▎        | 2630/19600 [44:20<4:08:48,  1.14it/s]s][A

{'loss': 2.2329, 'grad_norm': 22.10288429260254, 'learning_rate': 1.7316326530612246e-05, 'epoch': 13.42}


 13%|█▎        | 2640/19600 [44:29<4:09:00,  1.14it/s]
                                                        
 13%|█▎        | 2640/19600 [44:29<4:09:00,  1.14it/s]s][A

{'loss': 2.3126, 'grad_norm': 22.73388671875, 'learning_rate': 1.7306122448979596e-05, 'epoch': 13.47}


 14%|█▎        | 2650/19600 [44:38<4:08:08,  1.14it/s]
                                                        
 14%|█▎        | 2650/19600 [44:38<4:08:08,  1.14it/s]s][A

{'loss': 2.2516, 'grad_norm': 24.528600692749023, 'learning_rate': 1.729591836734694e-05, 'epoch': 13.52}


 14%|█▎        | 2660/19600 [44:47<4:08:37,  1.14it/s]
                                                        
 14%|█▎        | 2660/19600 [44:47<4:08:37,  1.14it/s]s][A

{'loss': 2.2474, 'grad_norm': 22.082212448120117, 'learning_rate': 1.7285714285714287e-05, 'epoch': 13.57}


 14%|█▎        | 2670/19600 [44:56<4:08:18,  1.14it/s]
                                                        
 14%|█▎        | 2670/19600 [44:56<4:08:18,  1.14it/s]s][A

{'loss': 2.2765, 'grad_norm': 24.80488395690918, 'learning_rate': 1.7275510204081633e-05, 'epoch': 13.62}


 14%|█▎        | 2680/19600 [45:04<4:07:17,  1.14it/s]
                                                        
 14%|█▎        | 2680/19600 [45:04<4:07:17,  1.14it/s]s][A

{'loss': 2.2593, 'grad_norm': 24.24131965637207, 'learning_rate': 1.7265306122448982e-05, 'epoch': 13.67}


 14%|█▎        | 2690/19600 [45:13<4:07:50,  1.14it/s]
                                                        
 14%|█▎        | 2690/19600 [45:13<4:07:50,  1.14it/s]s][A

{'loss': 2.3182, 'grad_norm': 19.628923416137695, 'learning_rate': 1.7255102040816328e-05, 'epoch': 13.72}


 14%|█▍        | 2700/19600 [45:22<4:07:09,  1.14it/s]
                                                        
 14%|█▍        | 2700/19600 [45:22<4:07:09,  1.14it/s]s][A

{'loss': 2.226, 'grad_norm': 24.671430587768555, 'learning_rate': 1.7244897959183674e-05, 'epoch': 13.78}


 14%|█▍        | 2710/19600 [45:31<4:06:38,  1.14it/s]
                                                        
 14%|█▍        | 2710/19600 [45:31<4:06:38,  1.14it/s]s][A

{'loss': 2.2013, 'grad_norm': 23.363340377807617, 'learning_rate': 1.723469387755102e-05, 'epoch': 13.83}


 14%|█▍        | 2720/19600 [45:39<4:07:15,  1.14it/s]
                                                        
 14%|█▍        | 2720/19600 [45:39<4:07:15,  1.14it/s]s][A

{'loss': 2.2407, 'grad_norm': 24.200603485107422, 'learning_rate': 1.722448979591837e-05, 'epoch': 13.88}


 14%|█▍        | 2730/19600 [45:48<4:07:18,  1.14it/s]
                                                        
 14%|█▍        | 2730/19600 [45:48<4:07:18,  1.14it/s]s][A

{'loss': 2.2879, 'grad_norm': 25.470354080200195, 'learning_rate': 1.7214285714285718e-05, 'epoch': 13.93}


 14%|█▍        | 2740/19600 [45:57<4:07:08,  1.14it/s]
                                                        
 14%|█▍        | 2740/19600 [45:57<4:07:08,  1.14it/s]s][A

{'loss': 2.3131, 'grad_norm': 29.055313110351562, 'learning_rate': 1.7204081632653064e-05, 'epoch': 13.98}


 14%|█▍        | 2744/19600 [46:00<3:17:43,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.00it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.12it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.11it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:22,  2.95it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.95it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.95it/s][A[A

 23%|██▎       | 18/79 [00

{'loss': 2.0728, 'grad_norm': 25.294466018676758, 'learning_rate': 1.68265306122449e-05, 'epoch': 15.87}


 16%|█▌        | 3120/19600 [52:25<4:00:59,  1.14it/s]
                                                        
 16%|█▌        | 3120/19600 [52:25<4:00:59,  1.14it/s]s][A

{'loss': 2.058, 'grad_norm': 25.334436416625977, 'learning_rate': 1.6816326530612244e-05, 'epoch': 15.92}


 16%|█▌        | 3130/19600 [52:34<4:01:31,  1.14it/s]
                                                        
 16%|█▌        | 3130/19600 [52:34<4:01:31,  1.14it/s]s][A

{'loss': 2.0799, 'grad_norm': 27.50599479675293, 'learning_rate': 1.6806122448979594e-05, 'epoch': 15.97}


 16%|█▌        | 3136/19600 [52:38<3:12:45,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  5.99it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.60it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.22it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:23,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.98it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.95it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.727675676345825, 'eval_accuracy': 0.33, 'eval_runtime': 26.5554, 'eval_samples_per_second': 376.571, 'eval_steps_per_second': 2.975, 'epoch': 16.0}


 16%|█▌        | 3140/19600 [53:10<16:50:07,  3.68s/it]
                                                        
 16%|█▌        | 3140/19600 [53:10<16:50:07,  3.68s/it]][A

{'loss': 2.0372, 'grad_norm': 25.909706115722656, 'learning_rate': 1.679591836734694e-05, 'epoch': 16.02}


 16%|█▌        | 3150/19600 [53:18<4:22:26,  1.04it/s] 
                                                        
 16%|█▌        | 3150/19600 [53:18<4:22:26,  1.04it/s]s][A

{'loss': 1.9049, 'grad_norm': 27.329700469970703, 'learning_rate': 1.678571428571429e-05, 'epoch': 16.07}


 16%|█▌        | 3160/19600 [53:27<4:00:48,  1.14it/s]
                                                        
 16%|█▌        | 3160/19600 [53:27<4:00:48,  1.14it/s]s][A

{'loss': 1.9846, 'grad_norm': 28.60801887512207, 'learning_rate': 1.6775510204081634e-05, 'epoch': 16.12}


 16%|█▌        | 3170/19600 [53:36<4:00:11,  1.14it/s]
                                                        
 16%|█▌        | 3170/19600 [53:36<4:00:11,  1.14it/s]s][A

{'loss': 1.9012, 'grad_norm': 28.152400970458984, 'learning_rate': 1.676530612244898e-05, 'epoch': 16.17}


 16%|█▌        | 3180/19600 [53:45<4:00:32,  1.14it/s]
                                                        
 16%|█▌        | 3180/19600 [53:45<4:00:32,  1.14it/s]s][A

{'loss': 1.9596, 'grad_norm': 26.440200805664062, 'learning_rate': 1.675510204081633e-05, 'epoch': 16.22}


 16%|█▋        | 3190/19600 [53:54<4:00:21,  1.14it/s]
                                                        
 16%|█▋        | 3190/19600 [53:54<4:00:21,  1.14it/s]s][A

{'loss': 1.9641, 'grad_norm': 26.52750015258789, 'learning_rate': 1.6744897959183675e-05, 'epoch': 16.28}


 16%|█▋        | 3200/19600 [54:02<4:00:20,  1.14it/s]
                                                        
 16%|█▋        | 3200/19600 [54:02<4:00:20,  1.14it/s]s][A

{'loss': 1.9505, 'grad_norm': 23.75358009338379, 'learning_rate': 1.673469387755102e-05, 'epoch': 16.33}


 16%|█▋        | 3210/19600 [54:11<4:00:20,  1.14it/s]
                                                        
 16%|█▋        | 3210/19600 [54:11<4:00:20,  1.14it/s]s][A

{'loss': 1.9926, 'grad_norm': 29.708389282226562, 'learning_rate': 1.6724489795918367e-05, 'epoch': 16.38}


 16%|█▋        | 3220/19600 [54:20<4:00:13,  1.14it/s]
                                                        
 16%|█▋        | 3220/19600 [54:20<4:00:13,  1.14it/s]s][A

{'loss': 1.9737, 'grad_norm': 24.105592727661133, 'learning_rate': 1.6714285714285716e-05, 'epoch': 16.43}


 16%|█▋        | 3230/19600 [54:29<3:59:51,  1.14it/s]
                                                        
 16%|█▋        | 3230/19600 [54:29<3:59:51,  1.14it/s]s][A

{'loss': 1.9697, 'grad_norm': 29.987152099609375, 'learning_rate': 1.6704081632653062e-05, 'epoch': 16.48}


 17%|█▋        | 3240/19600 [54:38<3:59:15,  1.14it/s]
                                                        
 17%|█▋        | 3240/19600 [54:38<3:59:15,  1.14it/s]s][A

{'loss': 1.9831, 'grad_norm': 23.455902099609375, 'learning_rate': 1.669387755102041e-05, 'epoch': 16.53}


 17%|█▋        | 3250/19600 [54:46<3:59:02,  1.14it/s]
                                                        
 17%|█▋        | 3250/19600 [54:46<3:59:02,  1.14it/s]s][A

{'loss': 1.968, 'grad_norm': 26.90349578857422, 'learning_rate': 1.6683673469387757e-05, 'epoch': 16.58}


 17%|█▋        | 3260/19600 [54:55<3:59:13,  1.14it/s]
                                                        
 17%|█▋        | 3260/19600 [54:55<3:59:13,  1.14it/s]s][A

{'loss': 1.9577, 'grad_norm': 26.349111557006836, 'learning_rate': 1.6673469387755102e-05, 'epoch': 16.63}


 17%|█▋        | 3270/19600 [55:04<3:59:07,  1.14it/s]
                                                        
 17%|█▋        | 3270/19600 [55:04<3:59:07,  1.14it/s]s][A

{'loss': 1.9972, 'grad_norm': 27.005231857299805, 'learning_rate': 1.666326530612245e-05, 'epoch': 16.68}


 17%|█▋        | 3280/19600 [55:13<3:59:23,  1.14it/s]
                                                        
 17%|█▋        | 3280/19600 [55:13<3:59:23,  1.14it/s]s][A

{'loss': 2.0233, 'grad_norm': 26.366044998168945, 'learning_rate': 1.6653061224489797e-05, 'epoch': 16.73}


 17%|█▋        | 3290/19600 [55:22<3:59:05,  1.14it/s]
                                                        
 17%|█▋        | 3290/19600 [55:22<3:59:05,  1.14it/s]s][A

{'loss': 1.9266, 'grad_norm': 26.511911392211914, 'learning_rate': 1.6642857142857147e-05, 'epoch': 16.79}


 17%|█▋        | 3300/19600 [55:30<3:59:04,  1.14it/s]
                                                        
 17%|█▋        | 3300/19600 [55:30<3:59:04,  1.14it/s]s][A

{'loss': 2.0289, 'grad_norm': 27.031625747680664, 'learning_rate': 1.6632653061224492e-05, 'epoch': 16.84}


 17%|█▋        | 3310/19600 [55:39<3:58:45,  1.14it/s]
                                                        
 17%|█▋        | 3310/19600 [55:39<3:58:45,  1.14it/s]s][A

{'loss': 1.9674, 'grad_norm': 29.973237991333008, 'learning_rate': 1.6622448979591838e-05, 'epoch': 16.89}


 17%|█▋        | 3320/19600 [55:48<3:58:50,  1.14it/s]
                                                        
 17%|█▋        | 3320/19600 [55:48<3:58:50,  1.14it/s]s][A

{'loss': 1.9871, 'grad_norm': 27.337696075439453, 'learning_rate': 1.6612244897959184e-05, 'epoch': 16.94}


 17%|█▋        | 3330/19600 [55:57<3:58:54,  1.14it/s]
                                                        
 17%|█▋        | 3330/19600 [55:57<3:58:54,  1.14it/s]s][A

{'loss': 1.9942, 'grad_norm': 26.014007568359375, 'learning_rate': 1.6602040816326533e-05, 'epoch': 16.99}


 17%|█▋        | 3332/19600 [55:58<3:11:10,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.05it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.12it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.19it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.10it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.99it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.99it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.98it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.98it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.97it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.693408489227295, 'eval_accuracy': 0.339, 'eval_runtime': 26.4439, 'eval_samples_per_second': 378.159, 'eval_steps_per_second': 2.987, 'epoch': 17.0}


 17%|█▋        | 3340/19600 [56:33<6:59:23,  1.55s/it] 
                                                        
 17%|█▋        | 3340/19600 [56:33<6:59:23,  1.55s/it]s][A

{'loss': 1.8308, 'grad_norm': 26.34911346435547, 'learning_rate': 1.659183673469388e-05, 'epoch': 17.04}


 17%|█▋        | 3350/19600 [56:41<4:02:26,  1.12it/s]
                                                        
 17%|█▋        | 3350/19600 [56:41<4:02:26,  1.12it/s]s][A

{'loss': 1.8305, 'grad_norm': 28.379932403564453, 'learning_rate': 1.6581632653061225e-05, 'epoch': 17.09}


 17%|█▋        | 3360/19600 [56:50<3:57:36,  1.14it/s]
                                                        
 17%|█▋        | 3360/19600 [56:50<3:57:36,  1.14it/s]s][A

{'loss': 1.8374, 'grad_norm': 25.794031143188477, 'learning_rate': 1.6571428571428574e-05, 'epoch': 17.14}


 17%|█▋        | 3370/19600 [56:59<3:57:04,  1.14it/s]
                                                        
 17%|█▋        | 3370/19600 [56:59<3:57:04,  1.14it/s]s][A

{'loss': 1.7995, 'grad_norm': 23.609228134155273, 'learning_rate': 1.656122448979592e-05, 'epoch': 17.19}


 17%|█▋        | 3380/19600 [57:08<3:56:59,  1.14it/s]
                                                        
 17%|█▋        | 3380/19600 [57:08<3:56:59,  1.14it/s]s][A

{'loss': 1.8445, 'grad_norm': 30.613069534301758, 'learning_rate': 1.655102040816327e-05, 'epoch': 17.24}


 17%|█▋        | 3390/19600 [57:16<3:57:17,  1.14it/s]
                                                        
 17%|█▋        | 3390/19600 [57:16<3:57:17,  1.14it/s]s][A

{'loss': 1.9086, 'grad_norm': 24.345714569091797, 'learning_rate': 1.6540816326530615e-05, 'epoch': 17.3}


 17%|█▋        | 3400/19600 [57:25<3:56:53,  1.14it/s]
                                                        
 17%|█▋        | 3400/19600 [57:25<3:56:53,  1.14it/s]s][A

{'loss': 1.87, 'grad_norm': 27.750417709350586, 'learning_rate': 1.653061224489796e-05, 'epoch': 17.35}


 17%|█▋        | 3410/19600 [57:34<3:56:48,  1.14it/s]
                                                        
 17%|█▋        | 3410/19600 [57:34<3:56:48,  1.14it/s]s][A

{'loss': 1.8722, 'grad_norm': 27.79442024230957, 'learning_rate': 1.6520408163265306e-05, 'epoch': 17.4}


 17%|█▋        | 3420/19600 [57:43<3:56:33,  1.14it/s]
                                                        
 17%|█▋        | 3420/19600 [57:43<3:56:33,  1.14it/s]s][A

{'loss': 1.9487, 'grad_norm': 26.81186294555664, 'learning_rate': 1.6510204081632655e-05, 'epoch': 17.45}


 18%|█▊        | 3430/19600 [57:52<3:56:24,  1.14it/s]
                                                        
 18%|█▊        | 3430/19600 [57:52<3:56:24,  1.14it/s]s][A

{'loss': 1.8831, 'grad_norm': 25.095478057861328, 'learning_rate': 1.65e-05, 'epoch': 17.5}


 18%|█▊        | 3440/19600 [58:00<3:56:18,  1.14it/s]
                                                        
 18%|█▊        | 3440/19600 [58:00<3:56:18,  1.14it/s]s][A

{'loss': 1.8664, 'grad_norm': 30.185705184936523, 'learning_rate': 1.6489795918367347e-05, 'epoch': 17.55}


 18%|█▊        | 3450/19600 [58:09<3:57:06,  1.14it/s]
                                                        
 18%|█▊        | 3450/19600 [58:09<3:57:06,  1.14it/s]s][A

{'loss': 1.8388, 'grad_norm': 34.26905059814453, 'learning_rate': 1.6479591836734696e-05, 'epoch': 17.6}


 18%|█▊        | 3460/19600 [58:18<3:56:20,  1.14it/s]
                                                        
 18%|█▊        | 3460/19600 [58:18<3:56:20,  1.14it/s]s][A

{'loss': 1.8544, 'grad_norm': 25.259109497070312, 'learning_rate': 1.6469387755102042e-05, 'epoch': 17.65}


 18%|█▊        | 3470/19600 [58:27<3:56:45,  1.14it/s]
                                                        
 18%|█▊        | 3470/19600 [58:27<3:56:45,  1.14it/s]s][A

{'loss': 1.913, 'grad_norm': 25.95292091369629, 'learning_rate': 1.645918367346939e-05, 'epoch': 17.7}


 18%|█▊        | 3480/19600 [58:35<3:55:48,  1.14it/s]
                                                        
 18%|█▊        | 3480/19600 [58:35<3:55:48,  1.14it/s]s][A

{'loss': 1.9011, 'grad_norm': 30.624813079833984, 'learning_rate': 1.6448979591836737e-05, 'epoch': 17.76}


 18%|█▊        | 3490/19600 [58:44<3:56:04,  1.14it/s]
                                                        
 18%|█▊        | 3490/19600 [58:44<3:56:04,  1.14it/s]s][A

{'loss': 1.8711, 'grad_norm': 28.34302520751953, 'learning_rate': 1.6438775510204083e-05, 'epoch': 17.81}


 18%|█▊        | 3500/19600 [58:53<3:56:05,  1.14it/s]
                                                        
 18%|█▊        | 3500/19600 [58:53<3:56:05,  1.14it/s]s][A

{'loss': 1.9219, 'grad_norm': 27.137939453125, 'learning_rate': 1.642857142857143e-05, 'epoch': 17.86}


 18%|█▊        | 3510/19600 [59:02<3:55:56,  1.14it/s]
                                                        
 18%|█▊        | 3510/19600 [59:02<3:55:56,  1.14it/s]s][A

{'loss': 1.931, 'grad_norm': 31.460956573486328, 'learning_rate': 1.6418367346938778e-05, 'epoch': 17.91}


 18%|█▊        | 3520/19600 [59:11<3:55:39,  1.14it/s]
                                                        
 18%|█▊        | 3520/19600 [59:11<3:55:39,  1.14it/s]s][A

{'loss': 1.9015, 'grad_norm': 33.63336181640625, 'learning_rate': 1.6408163265306124e-05, 'epoch': 17.96}


 18%|█▊        | 3528/19600 [59:17<3:08:27,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.01it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.60it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.09it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.04it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.01it/s][A[A

 13%|█▎        | 10/79 [00:03<00:23,  2.99it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.98it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:21,  2.94it/s][A[A

 23%|██▎       | 18/79 [00

{'eval_loss': 2.7015368938446045, 'eval_accuracy': 0.3373, 'eval_runtime': 26.5882, 'eval_samples_per_second': 376.106, 'eval_steps_per_second': 2.971, 'epoch': 18.0}


 18%|█▊        | 3530/19600 [59:47<29:29:06,  6.61s/it]
                                                        
 18%|█▊        | 3530/19600 [59:47<29:29:06,  6.61s/it]][A

{'loss': 1.9004, 'grad_norm': 31.509809494018555, 'learning_rate': 1.639795918367347e-05, 'epoch': 18.01}


 18%|█▊        | 3540/19600 [59:55<4:38:19,  1.04s/it] 
                                                        
 18%|█▊        | 3540/19600 [59:55<4:38:19,  1.04s/it]s][A

{'loss': 1.7699, 'grad_norm': 29.888338088989258, 'learning_rate': 1.638775510204082e-05, 'epoch': 18.06}


 18%|█▊        | 3550/19600 [1:00:04<3:56:29,  1.13it/s]
                                                        
 18%|█▊        | 3550/19600 [1:00:04<3:56:29,  1.13it/s][A

{'loss': 1.7386, 'grad_norm': 29.799468994140625, 'learning_rate': 1.6377551020408164e-05, 'epoch': 18.11}


 18%|█▊        | 3560/19600 [1:00:13<3:55:09,  1.14it/s]
                                                        
 18%|█▊        | 3560/19600 [1:00:13<3:55:09,  1.14it/s][A

{'loss': 1.755, 'grad_norm': 29.208112716674805, 'learning_rate': 1.6367346938775513e-05, 'epoch': 18.16}


 18%|█▊        | 3570/19600 [1:00:22<3:55:07,  1.14it/s]
                                                        
 18%|█▊        | 3570/19600 [1:00:22<3:55:07,  1.14it/s][A

{'loss': 1.6957, 'grad_norm': 26.96186065673828, 'learning_rate': 1.635714285714286e-05, 'epoch': 18.21}


 18%|█▊        | 3580/19600 [1:00:31<3:54:51,  1.14it/s]
                                                        
 18%|█▊        | 3580/19600 [1:00:31<3:54:51,  1.14it/s][A

{'loss': 1.7104, 'grad_norm': 26.698314666748047, 'learning_rate': 1.6346938775510205e-05, 'epoch': 18.27}


 18%|█▊        | 3590/19600 [1:00:39<3:54:40,  1.14it/s]
                                                        
 18%|█▊        | 3590/19600 [1:00:39<3:54:40,  1.14it/s][A

{'loss': 1.7929, 'grad_norm': 26.955284118652344, 'learning_rate': 1.633673469387755e-05, 'epoch': 18.32}


 18%|█▊        | 3600/19600 [1:00:48<3:54:57,  1.13it/s]
                                                        
 18%|█▊        | 3600/19600 [1:00:48<3:54:57,  1.13it/s][A

{'loss': 1.769, 'grad_norm': 30.306529998779297, 'learning_rate': 1.63265306122449e-05, 'epoch': 18.37}


 18%|█▊        | 3610/19600 [1:00:57<3:54:52,  1.13it/s]
                                                        
 18%|█▊        | 3610/19600 [1:00:57<3:54:52,  1.13it/s][A

{'loss': 1.7725, 'grad_norm': 28.38247299194336, 'learning_rate': 1.6316326530612246e-05, 'epoch': 18.42}


 18%|█▊        | 3620/19600 [1:01:06<3:54:36,  1.14it/s]
                                                        
 18%|█▊        | 3620/19600 [1:01:06<3:54:36,  1.14it/s][A

{'loss': 1.8, 'grad_norm': 26.68183135986328, 'learning_rate': 1.630612244897959e-05, 'epoch': 18.47}


 19%|█▊        | 3630/19600 [1:01:15<3:54:06,  1.14it/s]
                                                        
 19%|█▊        | 3630/19600 [1:01:15<3:54:06,  1.14it/s][A

{'loss': 1.799, 'grad_norm': 28.343135833740234, 'learning_rate': 1.629591836734694e-05, 'epoch': 18.52}


 19%|█▊        | 3640/19600 [1:01:23<3:53:31,  1.14it/s]
                                                        
 19%|█▊        | 3640/19600 [1:01:23<3:53:31,  1.14it/s][A

{'loss': 1.8415, 'grad_norm': 24.955352783203125, 'learning_rate': 1.6285714285714287e-05, 'epoch': 18.57}


 19%|█▊        | 3650/19600 [1:01:32<3:52:50,  1.14it/s]
                                                        
 19%|█▊        | 3650/19600 [1:01:32<3:52:50,  1.14it/s][A

{'loss': 1.8024, 'grad_norm': 28.208539962768555, 'learning_rate': 1.6275510204081636e-05, 'epoch': 18.62}


 19%|█▊        | 3660/19600 [1:01:41<3:52:53,  1.14it/s]
                                                        
 19%|█▊        | 3660/19600 [1:01:41<3:52:53,  1.14it/s][A

{'loss': 1.7641, 'grad_norm': 34.29022216796875, 'learning_rate': 1.626530612244898e-05, 'epoch': 18.67}


 19%|█▊        | 3670/19600 [1:01:50<3:52:34,  1.14it/s]
                                                        
 19%|█▊        | 3670/19600 [1:01:50<3:52:34,  1.14it/s][A

{'loss': 1.7986, 'grad_norm': 29.497472763061523, 'learning_rate': 1.6255102040816327e-05, 'epoch': 18.72}


 19%|█▉        | 3680/19600 [1:01:58<3:52:27,  1.14it/s]
                                                        
 19%|█▉        | 3680/19600 [1:01:58<3:52:27,  1.14it/s][A

{'loss': 1.8386, 'grad_norm': 34.538291931152344, 'learning_rate': 1.6244897959183673e-05, 'epoch': 18.78}


 19%|█▉        | 3690/19600 [1:02:07<3:52:47,  1.14it/s]
                                                        
 19%|█▉        | 3690/19600 [1:02:07<3:52:47,  1.14it/s][A

{'loss': 1.7831, 'grad_norm': 28.40247344970703, 'learning_rate': 1.6234693877551022e-05, 'epoch': 18.83}


 19%|█▉        | 3700/19600 [1:02:16<3:52:38,  1.14it/s]
                                                        
 19%|█▉        | 3700/19600 [1:02:16<3:52:38,  1.14it/s][A

{'loss': 1.7674, 'grad_norm': 28.381399154663086, 'learning_rate': 1.6224489795918368e-05, 'epoch': 18.88}


 19%|█▉        | 3710/19600 [1:02:25<3:52:35,  1.14it/s]
                                                        
 19%|█▉        | 3710/19600 [1:02:25<3:52:35,  1.14it/s][A

{'loss': 1.8229, 'grad_norm': 30.275436401367188, 'learning_rate': 1.6214285714285717e-05, 'epoch': 18.93}


 19%|█▉        | 3720/19600 [1:02:34<3:52:27,  1.14it/s]
                                                        
 19%|█▉        | 3720/19600 [1:02:34<3:52:27,  1.14it/s][A

{'loss': 1.773, 'grad_norm': 32.121482849121094, 'learning_rate': 1.6204081632653063e-05, 'epoch': 18.98}


 19%|█▉        | 3724/19600 [1:02:37<3:06:06,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.02it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.11it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.05it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [

{'eval_loss': 2.6786272525787354, 'eval_accuracy': 0.3429, 'eval_runtime': 26.5391, 'eval_samples_per_second': 376.803, 'eval_steps_per_second': 2.977, 'epoch': 19.0}


 19%|█▉        | 3730/19600 [1:03:09<9:54:31,  2.25s/it] 
                                                        
 19%|█▉        | 3730/19600 [1:03:09<9:54:31,  2.25s/it][A

{'loss': 1.7389, 'grad_norm': 28.033796310424805, 'learning_rate': 1.619387755102041e-05, 'epoch': 19.03}


 19%|█▉        | 3740/19600 [1:03:18<4:02:16,  1.09it/s]
                                                        
 19%|█▉        | 3740/19600 [1:03:18<4:02:16,  1.09it/s][A

{'loss': 1.574, 'grad_norm': 27.5862979888916, 'learning_rate': 1.6183673469387758e-05, 'epoch': 19.08}


 19%|█▉        | 3750/19600 [1:03:27<3:52:07,  1.14it/s]
                                                        
 19%|█▉        | 3750/19600 [1:03:27<3:52:07,  1.14it/s][A

{'loss': 1.6583, 'grad_norm': 28.99136734008789, 'learning_rate': 1.6173469387755104e-05, 'epoch': 19.13}


 19%|█▉        | 3760/19600 [1:03:36<3:52:10,  1.14it/s]
                                                        
 19%|█▉        | 3760/19600 [1:03:36<3:52:10,  1.14it/s][A

{'loss': 1.6805, 'grad_norm': 35.39186477661133, 'learning_rate': 1.616326530612245e-05, 'epoch': 19.18}


 19%|█▉        | 3770/19600 [1:03:45<3:51:46,  1.14it/s]
                                                        
 19%|█▉        | 3770/19600 [1:03:45<3:51:46,  1.14it/s][A

{'loss': 1.6828, 'grad_norm': 28.571475982666016, 'learning_rate': 1.6153061224489795e-05, 'epoch': 19.23}


 19%|█▉        | 3780/19600 [1:03:53<3:51:57,  1.14it/s]
                                                        
 19%|█▉        | 3780/19600 [1:03:53<3:51:57,  1.14it/s][A

{'loss': 1.6551, 'grad_norm': 28.811731338500977, 'learning_rate': 1.6142857142857145e-05, 'epoch': 19.29}


 19%|█▉        | 3790/19600 [1:04:02<3:51:43,  1.14it/s]
                                                        
 19%|█▉        | 3790/19600 [1:04:02<3:51:43,  1.14it/s][A

{'loss': 1.6813, 'grad_norm': 28.668535232543945, 'learning_rate': 1.613265306122449e-05, 'epoch': 19.34}


 19%|█▉        | 3800/19600 [1:04:11<3:52:08,  1.13it/s]
                                                        
 19%|█▉        | 3800/19600 [1:04:11<3:52:08,  1.13it/s][A

{'loss': 1.6999, 'grad_norm': 29.269855499267578, 'learning_rate': 1.612244897959184e-05, 'epoch': 19.39}


 19%|█▉        | 3810/19600 [1:04:20<3:51:31,  1.14it/s]
                                                        
 19%|█▉        | 3810/19600 [1:04:20<3:51:31,  1.14it/s][A

{'loss': 1.7058, 'grad_norm': 28.800628662109375, 'learning_rate': 1.6112244897959185e-05, 'epoch': 19.44}


 19%|█▉        | 3820/19600 [1:04:29<3:51:14,  1.14it/s]
                                                        
 19%|█▉        | 3820/19600 [1:04:29<3:51:14,  1.14it/s][A

{'loss': 1.7161, 'grad_norm': 29.27474021911621, 'learning_rate': 1.610204081632653e-05, 'epoch': 19.49}


 20%|█▉        | 3830/19600 [1:04:37<3:51:05,  1.14it/s]
                                                        
 20%|█▉        | 3830/19600 [1:04:37<3:51:05,  1.14it/s][A

{'loss': 1.7385, 'grad_norm': 26.907058715820312, 'learning_rate': 1.609183673469388e-05, 'epoch': 19.54}


 20%|█▉        | 3840/19600 [1:04:46<3:51:04,  1.14it/s]
                                                        
 20%|█▉        | 3840/19600 [1:04:46<3:51:04,  1.14it/s][A

{'loss': 1.6972, 'grad_norm': 29.95958709716797, 'learning_rate': 1.6081632653061226e-05, 'epoch': 19.59}


 20%|█▉        | 3850/19600 [1:04:55<3:50:09,  1.14it/s]
                                                        
 20%|█▉        | 3850/19600 [1:04:55<3:50:09,  1.14it/s][A

{'loss': 1.7054, 'grad_norm': 32.51186752319336, 'learning_rate': 1.6071428571428572e-05, 'epoch': 19.64}


 20%|█▉        | 3860/19600 [1:05:04<3:50:32,  1.14it/s]
                                                        
 20%|█▉        | 3860/19600 [1:05:04<3:50:32,  1.14it/s][A

{'loss': 1.681, 'grad_norm': 30.14383316040039, 'learning_rate': 1.6061224489795918e-05, 'epoch': 19.69}


 20%|█▉        | 3870/19600 [1:05:13<3:50:02,  1.14it/s]
                                                        
 20%|█▉        | 3870/19600 [1:05:13<3:50:02,  1.14it/s][A

{'loss': 1.6283, 'grad_norm': 30.087297439575195, 'learning_rate': 1.6051020408163267e-05, 'epoch': 19.74}


 20%|█▉        | 3880/19600 [1:05:21<3:50:23,  1.14it/s]
                                                        
 20%|█▉        | 3880/19600 [1:05:21<3:50:23,  1.14it/s][A

{'loss': 1.6343, 'grad_norm': 27.299097061157227, 'learning_rate': 1.6040816326530613e-05, 'epoch': 19.8}


 20%|█▉        | 3890/19600 [1:05:30<3:49:57,  1.14it/s]
                                                        
 20%|█▉        | 3890/19600 [1:05:30<3:49:57,  1.14it/s][A

{'loss': 1.7065, 'grad_norm': 29.341449737548828, 'learning_rate': 1.6030612244897962e-05, 'epoch': 19.85}


 20%|█▉        | 3900/19600 [1:05:39<3:49:59,  1.14it/s]
                                                        
 20%|█▉        | 3900/19600 [1:05:39<3:49:59,  1.14it/s][A

{'loss': 1.7255, 'grad_norm': 34.65358352661133, 'learning_rate': 1.6020408163265308e-05, 'epoch': 19.9}


 20%|█▉        | 3910/19600 [1:05:48<3:49:41,  1.14it/s]
                                                        
 20%|█▉        | 3910/19600 [1:05:48<3:49:41,  1.14it/s][A

{'loss': 1.67, 'grad_norm': 30.92449951171875, 'learning_rate': 1.6010204081632653e-05, 'epoch': 19.95}


 20%|██        | 3920/19600 [1:05:56<3:03:42,  1.42it/s]
                                                        
 20%|██        | 3920/19600 [1:05:56<3:03:42,  1.42it/s][A

{'loss': 1.6855, 'grad_norm': 48.81502914428711, 'learning_rate': 1.6000000000000003e-05, 'epoch': 20.0}




  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.05it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.16it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.06it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.99it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.98it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.97it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.95it/s][A[A

 23%|██▎       | 18/79 [00:05<00:20,  2.95it/s][A[A

 24%|██▍       | 19/79 [0

{'eval_loss': 2.66896653175354, 'eval_accuracy': 0.3429, 'eval_runtime': 26.5232, 'eval_samples_per_second': 377.029, 'eval_steps_per_second': 2.979, 'epoch': 20.0}


 20%|██        | 3930/19600 [1:06:32<5:15:10,  1.21s/it] 
                                                        
 20%|██        | 3930/19600 [1:06:32<5:15:10,  1.21s/it][A

{'loss': 1.5169, 'grad_norm': 30.622636795043945, 'learning_rate': 1.598979591836735e-05, 'epoch': 20.05}


 20%|██        | 3940/19600 [1:06:41<3:51:44,  1.13it/s]
                                                        
 20%|██        | 3940/19600 [1:06:41<3:51:44,  1.13it/s][A

{'loss': 1.5271, 'grad_norm': 33.38115692138672, 'learning_rate': 1.5979591836734694e-05, 'epoch': 20.1}


 20%|██        | 3950/19600 [1:06:50<3:48:56,  1.14it/s]
                                                        
 20%|██        | 3950/19600 [1:06:50<3:48:56,  1.14it/s][A

{'loss': 1.5545, 'grad_norm': 31.924440383911133, 'learning_rate': 1.596938775510204e-05, 'epoch': 20.15}


 20%|██        | 3960/19600 [1:06:59<3:49:01,  1.14it/s]
                                                        
 20%|██        | 3960/19600 [1:06:59<3:49:01,  1.14it/s][A

{'loss': 1.5812, 'grad_norm': 29.209136962890625, 'learning_rate': 1.595918367346939e-05, 'epoch': 20.2}


 20%|██        | 3970/19600 [1:07:07<3:49:20,  1.14it/s]
                                                        
 20%|██        | 3970/19600 [1:07:07<3:49:20,  1.14it/s][A

{'loss': 1.542, 'grad_norm': 33.8466796875, 'learning_rate': 1.5948979591836735e-05, 'epoch': 20.26}


 20%|██        | 3980/19600 [1:07:16<3:49:11,  1.14it/s]
                                                        
 20%|██        | 3980/19600 [1:07:16<3:49:11,  1.14it/s][A

{'loss': 1.6093, 'grad_norm': 37.850440979003906, 'learning_rate': 1.5938775510204084e-05, 'epoch': 20.31}


 20%|██        | 3990/19600 [1:07:25<3:48:43,  1.14it/s]
                                                        
 20%|██        | 3990/19600 [1:07:25<3:48:43,  1.14it/s][A

{'loss': 1.6065, 'grad_norm': 35.949806213378906, 'learning_rate': 1.592857142857143e-05, 'epoch': 20.36}


 20%|██        | 4000/19600 [1:07:34<3:48:42,  1.14it/s]
                                                        
 20%|██        | 4000/19600 [1:07:34<3:48:42,  1.14it/s][A

{'loss': 1.6035, 'grad_norm': 32.109703063964844, 'learning_rate': 1.5918367346938776e-05, 'epoch': 20.41}


 20%|██        | 4010/19600 [1:07:43<3:48:12,  1.14it/s]
                                                        
 20%|██        | 4010/19600 [1:07:43<3:48:12,  1.14it/s][A

{'loss': 1.5834, 'grad_norm': 27.74822235107422, 'learning_rate': 1.5908163265306125e-05, 'epoch': 20.46}


 21%|██        | 4020/19600 [1:07:51<3:48:43,  1.14it/s]
                                                        
 21%|██        | 4020/19600 [1:07:51<3:48:43,  1.14it/s][A

{'loss': 1.5458, 'grad_norm': 33.073368072509766, 'learning_rate': 1.589795918367347e-05, 'epoch': 20.51}


 21%|██        | 4030/19600 [1:08:00<3:48:20,  1.14it/s]
                                                        
 21%|██        | 4030/19600 [1:08:00<3:48:20,  1.14it/s][A

{'loss': 1.634, 'grad_norm': 28.355539321899414, 'learning_rate': 1.588775510204082e-05, 'epoch': 20.56}


 21%|██        | 4040/19600 [1:08:09<3:47:52,  1.14it/s]
                                                        
 21%|██        | 4040/19600 [1:08:09<3:47:52,  1.14it/s][A

{'loss': 1.593, 'grad_norm': 33.43430709838867, 'learning_rate': 1.5877551020408162e-05, 'epoch': 20.61}


 21%|██        | 4050/19600 [1:08:18<3:47:52,  1.14it/s]
                                                        
 21%|██        | 4050/19600 [1:08:18<3:47:52,  1.14it/s][A

{'loss': 1.5695, 'grad_norm': 30.651031494140625, 'learning_rate': 1.586734693877551e-05, 'epoch': 20.66}


 21%|██        | 4060/19600 [1:08:27<3:47:51,  1.14it/s]
                                                        
 21%|██        | 4060/19600 [1:08:27<3:47:51,  1.14it/s][A

{'loss': 1.6641, 'grad_norm': 33.87029266357422, 'learning_rate': 1.5857142857142857e-05, 'epoch': 20.71}


 21%|██        | 4070/19600 [1:08:35<3:47:42,  1.14it/s]
                                                        
 21%|██        | 4070/19600 [1:08:35<3:47:42,  1.14it/s][A

{'loss': 1.5885, 'grad_norm': 31.843923568725586, 'learning_rate': 1.5846938775510206e-05, 'epoch': 20.77}


 21%|██        | 4080/19600 [1:08:44<3:47:38,  1.14it/s]
                                                        
 21%|██        | 4080/19600 [1:08:44<3:47:38,  1.14it/s][A

{'loss': 1.6083, 'grad_norm': 30.70471954345703, 'learning_rate': 1.5836734693877552e-05, 'epoch': 20.82}


 21%|██        | 4090/19600 [1:08:53<3:47:06,  1.14it/s]
                                                        
 21%|██        | 4090/19600 [1:08:53<3:47:06,  1.14it/s][A

{'loss': 1.6367, 'grad_norm': 28.785213470458984, 'learning_rate': 1.5826530612244898e-05, 'epoch': 20.87}


 21%|██        | 4100/19600 [1:09:02<3:47:01,  1.14it/s]
                                                        
 21%|██        | 4100/19600 [1:09:02<3:47:01,  1.14it/s][A

{'loss': 1.6304, 'grad_norm': 34.07670211791992, 'learning_rate': 1.5816326530612247e-05, 'epoch': 20.92}


 21%|██        | 4110/19600 [1:09:11<3:46:45,  1.14it/s]
                                                        
 21%|██        | 4110/19600 [1:09:11<3:46:45,  1.14it/s][A

{'loss': 1.5628, 'grad_norm': 29.210481643676758, 'learning_rate': 1.5806122448979593e-05, 'epoch': 20.97}


 21%|██        | 4116/19600 [1:09:15<3:01:37,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.01it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:23,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:22,  2.95it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.95it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.95it/s][A[A

 23%|██▎       | 18/79 [

{'eval_loss': 2.65401029586792, 'eval_accuracy': 0.3476, 'eval_runtime': 26.5821, 'eval_samples_per_second': 376.193, 'eval_steps_per_second': 2.972, 'epoch': 21.0}


 21%|██        | 4120/19600 [1:09:46<15:49:19,  3.68s/it]
                                                         
 21%|██        | 4120/19600 [1:09:46<15:49:19,  3.68s/it][A

{'loss': 1.537, 'grad_norm': 31.036792755126953, 'learning_rate': 1.5795918367346942e-05, 'epoch': 21.02}


 21%|██        | 4130/19600 [1:09:55<4:06:41,  1.05it/s] 
                                                        
 21%|██        | 4130/19600 [1:09:55<4:06:41,  1.05it/s][A

{'loss': 1.4995, 'grad_norm': 32.198482513427734, 'learning_rate': 1.5785714285714288e-05, 'epoch': 21.07}


 21%|██        | 4140/19600 [1:10:04<3:46:51,  1.14it/s]
                                                        
 21%|██        | 4140/19600 [1:10:04<3:46:51,  1.14it/s][A

{'loss': 1.492, 'grad_norm': 32.813907623291016, 'learning_rate': 1.5775510204081634e-05, 'epoch': 21.12}


 21%|██        | 4150/19600 [1:10:13<3:46:15,  1.14it/s]
                                                        
 21%|██        | 4150/19600 [1:10:13<3:46:15,  1.14it/s][A

{'loss': 1.4491, 'grad_norm': 30.782394409179688, 'learning_rate': 1.576530612244898e-05, 'epoch': 21.17}


 21%|██        | 4160/19600 [1:10:22<3:46:00,  1.14it/s]
                                                        
 21%|██        | 4160/19600 [1:10:22<3:46:00,  1.14it/s][A

{'loss': 1.4566, 'grad_norm': 27.608251571655273, 'learning_rate': 1.575510204081633e-05, 'epoch': 21.22}


 21%|██▏       | 4170/19600 [1:10:30<3:45:50,  1.14it/s]
                                                        
 21%|██▏       | 4170/19600 [1:10:30<3:45:50,  1.14it/s][A

{'loss': 1.5343, 'grad_norm': 32.43219757080078, 'learning_rate': 1.5744897959183675e-05, 'epoch': 21.28}


 21%|██▏       | 4180/19600 [1:10:39<3:46:08,  1.14it/s]
                                                        
 21%|██▏       | 4180/19600 [1:10:39<3:46:08,  1.14it/s][A

{'loss': 1.5105, 'grad_norm': 33.429100036621094, 'learning_rate': 1.573469387755102e-05, 'epoch': 21.33}


 21%|██▏       | 4190/19600 [1:10:48<3:45:57,  1.14it/s]
                                                        
 21%|██▏       | 4190/19600 [1:10:48<3:45:57,  1.14it/s][A

{'loss': 1.4781, 'grad_norm': 35.44773864746094, 'learning_rate': 1.572448979591837e-05, 'epoch': 21.38}


 21%|██▏       | 4200/19600 [1:10:57<3:45:39,  1.14it/s]
                                                        
 21%|██▏       | 4200/19600 [1:10:57<3:45:39,  1.14it/s][A

{'loss': 1.5264, 'grad_norm': 33.2725715637207, 'learning_rate': 1.5714285714285715e-05, 'epoch': 21.43}


 21%|██▏       | 4210/19600 [1:11:06<3:45:24,  1.14it/s]
                                                        
 21%|██▏       | 4210/19600 [1:11:06<3:45:24,  1.14it/s][A

{'loss': 1.5301, 'grad_norm': 31.106685638427734, 'learning_rate': 1.5704081632653065e-05, 'epoch': 21.48}


 22%|██▏       | 4220/19600 [1:11:14<3:45:25,  1.14it/s]
                                                        
 22%|██▏       | 4220/19600 [1:11:14<3:45:25,  1.14it/s][A

{'loss': 1.494, 'grad_norm': 33.202476501464844, 'learning_rate': 1.569387755102041e-05, 'epoch': 21.53}


 22%|██▏       | 4230/19600 [1:11:23<3:45:34,  1.14it/s]
                                                        
 22%|██▏       | 4230/19600 [1:11:23<3:45:34,  1.14it/s][A

{'loss': 1.5026, 'grad_norm': 32.53908920288086, 'learning_rate': 1.5683673469387756e-05, 'epoch': 21.58}


 22%|██▏       | 4240/19600 [1:11:32<3:45:07,  1.14it/s]
                                                        
 22%|██▏       | 4240/19600 [1:11:32<3:45:07,  1.14it/s][A

{'loss': 1.5144, 'grad_norm': 33.30768966674805, 'learning_rate': 1.5673469387755102e-05, 'epoch': 21.63}


 22%|██▏       | 4250/19600 [1:11:41<3:44:58,  1.14it/s]
                                                        
 22%|██▏       | 4250/19600 [1:11:41<3:44:58,  1.14it/s][A

{'loss': 1.5138, 'grad_norm': 28.225791931152344, 'learning_rate': 1.566326530612245e-05, 'epoch': 21.68}


 22%|██▏       | 4260/19600 [1:11:50<3:44:57,  1.14it/s]
                                                        
 22%|██▏       | 4260/19600 [1:11:50<3:44:57,  1.14it/s][A

{'loss': 1.5259, 'grad_norm': 29.040414810180664, 'learning_rate': 1.5653061224489797e-05, 'epoch': 21.73}


 22%|██▏       | 4270/19600 [1:11:58<3:44:34,  1.14it/s]
                                                        
 22%|██▏       | 4270/19600 [1:11:58<3:44:34,  1.14it/s][A

{'loss': 1.443, 'grad_norm': 33.176116943359375, 'learning_rate': 1.5642857142857143e-05, 'epoch': 21.79}


 22%|██▏       | 4280/19600 [1:12:07<3:44:21,  1.14it/s]
                                                        
 22%|██▏       | 4280/19600 [1:12:07<3:44:21,  1.14it/s][A

{'loss': 1.5461, 'grad_norm': 32.567073822021484, 'learning_rate': 1.5632653061224492e-05, 'epoch': 21.84}


 22%|██▏       | 4290/19600 [1:12:16<3:44:14,  1.14it/s]
                                                        
 22%|██▏       | 4290/19600 [1:12:16<3:44:14,  1.14it/s][A

{'loss': 1.439, 'grad_norm': 28.042098999023438, 'learning_rate': 1.5622448979591838e-05, 'epoch': 21.89}


 22%|██▏       | 4300/19600 [1:12:25<3:44:32,  1.14it/s]
                                                        
 22%|██▏       | 4300/19600 [1:12:25<3:44:32,  1.14it/s][A

{'loss': 1.4775, 'grad_norm': 34.04130172729492, 'learning_rate': 1.5612244897959187e-05, 'epoch': 21.94}


 22%|██▏       | 4310/19600 [1:12:34<3:44:14,  1.14it/s]
                                                        
 22%|██▏       | 4310/19600 [1:12:34<3:44:14,  1.14it/s][A

{'loss': 1.516, 'grad_norm': 32.41482925415039, 'learning_rate': 1.5602040816326533e-05, 'epoch': 21.99}


 22%|██▏       | 4312/19600 [1:12:35<2:59:29,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.02it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.14it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.21it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  3.00it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:22,  2.95it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [

{'eval_loss': 2.6498541831970215, 'eval_accuracy': 0.352, 'eval_runtime': 26.5547, 'eval_samples_per_second': 376.581, 'eval_steps_per_second': 2.975, 'epoch': 22.0}


 22%|██▏       | 4320/19600 [1:13:09<6:34:40,  1.55s/it] 
                                                        
 22%|██▏       | 4320/19600 [1:13:10<6:34:40,  1.55s/it][A

{'loss': 1.3971, 'grad_norm': 31.06702995300293, 'learning_rate': 1.559183673469388e-05, 'epoch': 22.04}


 22%|██▏       | 4330/19600 [1:13:18<3:48:31,  1.11it/s]
                                                        
 22%|██▏       | 4330/19600 [1:13:18<3:48:31,  1.11it/s][A

{'loss': 1.3687, 'grad_norm': 28.319847106933594, 'learning_rate': 1.5581632653061224e-05, 'epoch': 22.09}


 22%|██▏       | 4340/19600 [1:13:27<3:43:40,  1.14it/s]
                                                        
 22%|██▏       | 4340/19600 [1:13:27<3:43:40,  1.14it/s][A

{'loss': 1.3461, 'grad_norm': 28.74053192138672, 'learning_rate': 1.5571428571428573e-05, 'epoch': 22.14}


 22%|██▏       | 4350/19600 [1:13:36<3:42:45,  1.14it/s]
                                                        
 22%|██▏       | 4350/19600 [1:13:36<3:42:45,  1.14it/s][A

{'loss': 1.3924, 'grad_norm': 33.377525329589844, 'learning_rate': 1.556122448979592e-05, 'epoch': 22.19}


 22%|██▏       | 4360/19600 [1:13:45<3:43:03,  1.14it/s]
                                                        
 22%|██▏       | 4360/19600 [1:13:45<3:43:03,  1.14it/s][A

{'loss': 1.4112, 'grad_norm': 31.880512237548828, 'learning_rate': 1.5551020408163265e-05, 'epoch': 22.24}


 22%|██▏       | 4370/19600 [1:13:53<3:42:41,  1.14it/s]
                                                        
 22%|██▏       | 4370/19600 [1:13:53<3:42:41,  1.14it/s][A

{'loss': 1.3857, 'grad_norm': 29.454376220703125, 'learning_rate': 1.5540816326530614e-05, 'epoch': 22.3}


 22%|██▏       | 4380/19600 [1:14:02<3:42:20,  1.14it/s]
                                                        
 22%|██▏       | 4380/19600 [1:14:02<3:42:20,  1.14it/s][A

{'loss': 1.41, 'grad_norm': 36.354225158691406, 'learning_rate': 1.553061224489796e-05, 'epoch': 22.35}


 22%|██▏       | 4390/19600 [1:14:11<3:42:14,  1.14it/s]
                                                        
 22%|██▏       | 4390/19600 [1:14:11<3:42:14,  1.14it/s][A

{'loss': 1.4264, 'grad_norm': 31.63094139099121, 'learning_rate': 1.552040816326531e-05, 'epoch': 22.4}


 22%|██▏       | 4400/19600 [1:14:20<3:42:12,  1.14it/s]
                                                        
 22%|██▏       | 4400/19600 [1:14:20<3:42:12,  1.14it/s][A

{'loss': 1.3753, 'grad_norm': 31.919645309448242, 'learning_rate': 1.5510204081632655e-05, 'epoch': 22.45}


 22%|██▎       | 4410/19600 [1:14:29<3:42:05,  1.14it/s]
                                                        
 22%|██▎       | 4410/19600 [1:14:29<3:42:05,  1.14it/s][A

{'loss': 1.3887, 'grad_norm': 33.54991149902344, 'learning_rate': 1.55e-05, 'epoch': 22.5}


 23%|██▎       | 4420/19600 [1:14:37<3:42:09,  1.14it/s]
                                                        
 23%|██▎       | 4420/19600 [1:14:37<3:42:09,  1.14it/s][A

{'loss': 1.4025, 'grad_norm': 38.3099250793457, 'learning_rate': 1.5489795918367346e-05, 'epoch': 22.55}


 23%|██▎       | 4430/19600 [1:14:46<3:42:11,  1.14it/s]
                                                        
 23%|██▎       | 4430/19600 [1:14:46<3:42:11,  1.14it/s][A

{'loss': 1.422, 'grad_norm': 31.899658203125, 'learning_rate': 1.5479591836734696e-05, 'epoch': 22.6}


 23%|██▎       | 4440/19600 [1:14:55<3:41:45,  1.14it/s]
                                                        
 23%|██▎       | 4440/19600 [1:14:55<3:41:45,  1.14it/s][A

{'loss': 1.4242, 'grad_norm': 38.495521545410156, 'learning_rate': 1.546938775510204e-05, 'epoch': 22.65}


 23%|██▎       | 4450/19600 [1:15:04<3:41:40,  1.14it/s]
                                                        
 23%|██▎       | 4450/19600 [1:15:04<3:41:40,  1.14it/s][A

{'loss': 1.433, 'grad_norm': 36.10063552856445, 'learning_rate': 1.545918367346939e-05, 'epoch': 22.7}


 23%|██▎       | 4460/19600 [1:15:12<3:41:38,  1.14it/s]
                                                        
 23%|██▎       | 4460/19600 [1:15:12<3:41:38,  1.14it/s][A

{'loss': 1.4491, 'grad_norm': 34.15703582763672, 'learning_rate': 1.5448979591836736e-05, 'epoch': 22.76}


 23%|██▎       | 4470/19600 [1:15:21<3:41:23,  1.14it/s]
                                                        
 23%|██▎       | 4470/19600 [1:15:21<3:41:23,  1.14it/s][A

{'loss': 1.4275, 'grad_norm': 30.790904998779297, 'learning_rate': 1.5438775510204082e-05, 'epoch': 22.81}


 23%|██▎       | 4480/19600 [1:15:30<3:41:19,  1.14it/s]
                                                        
 23%|██▎       | 4480/19600 [1:15:30<3:41:19,  1.14it/s][A

{'loss': 1.4279, 'grad_norm': 33.15435791015625, 'learning_rate': 1.542857142857143e-05, 'epoch': 22.86}


 23%|██▎       | 4490/19600 [1:15:39<3:41:23,  1.14it/s]
                                                        
 23%|██▎       | 4490/19600 [1:15:39<3:41:23,  1.14it/s][A

{'loss': 1.3865, 'grad_norm': 29.321163177490234, 'learning_rate': 1.5418367346938777e-05, 'epoch': 22.91}


 23%|██▎       | 4500/19600 [1:15:48<3:41:02,  1.14it/s]
                                                        
 23%|██▎       | 4500/19600 [1:15:48<3:41:02,  1.14it/s][A

{'loss': 1.3905, 'grad_norm': 31.209774017333984, 'learning_rate': 1.5408163265306123e-05, 'epoch': 22.96}


 23%|██▎       | 4508/19600 [1:15:54<2:56:39,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.08it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.13it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.61it/s][A[A

  6%|▋         | 5/79 [00:01<00:21,  3.38it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.98it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.97it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.94it/s][A[A

 22%|██▏       | 17/79 [00:05<00:21,  2.94it/s][A[A

 23%|██▎       | 18/79 [

{'eval_loss': 2.653628349304199, 'eval_accuracy': 0.3556, 'eval_runtime': 26.5659, 'eval_samples_per_second': 376.422, 'eval_steps_per_second': 2.974, 'epoch': 23.0}


 23%|██▎       | 4510/19600 [1:16:23<27:39:56,  6.60s/it]
                                                         
 23%|██▎       | 4510/19600 [1:16:23<27:39:56,  6.60s/it][A

{'loss': 1.3976, 'grad_norm': 33.179962158203125, 'learning_rate': 1.539795918367347e-05, 'epoch': 23.01}


 23%|██▎       | 4520/19600 [1:16:32<4:21:10,  1.04s/it] 
                                                        
 23%|██▎       | 4520/19600 [1:16:32<4:21:10,  1.04s/it][A

{'loss': 1.2312, 'grad_norm': 33.4210205078125, 'learning_rate': 1.5387755102040818e-05, 'epoch': 23.06}


 23%|██▎       | 4530/19600 [1:16:41<3:42:04,  1.13it/s]
                                                        
 23%|██▎       | 4530/19600 [1:16:41<3:42:04,  1.13it/s][A

{'loss': 1.2921, 'grad_norm': 35.51988983154297, 'learning_rate': 1.5377551020408164e-05, 'epoch': 23.11}


 23%|██▎       | 4540/19600 [1:16:50<3:40:07,  1.14it/s]
                                                        
 23%|██▎       | 4540/19600 [1:16:50<3:40:07,  1.14it/s][A

{'loss': 1.2309, 'grad_norm': 32.99867248535156, 'learning_rate': 1.5367346938775513e-05, 'epoch': 23.16}


 23%|██▎       | 4550/19600 [1:16:59<3:40:45,  1.14it/s]
                                                        
 23%|██▎       | 4550/19600 [1:16:59<3:40:45,  1.14it/s][A

{'loss': 1.3294, 'grad_norm': 32.05886459350586, 'learning_rate': 1.535714285714286e-05, 'epoch': 23.21}


 23%|██▎       | 4560/19600 [1:17:07<3:40:22,  1.14it/s]
                                                        
 23%|██▎       | 4560/19600 [1:17:07<3:40:22,  1.14it/s][A

{'loss': 1.3454, 'grad_norm': 33.182857513427734, 'learning_rate': 1.5346938775510204e-05, 'epoch': 23.27}


 23%|██▎       | 4570/19600 [1:17:16<3:40:24,  1.14it/s]
                                                        
 23%|██▎       | 4570/19600 [1:17:16<3:40:24,  1.14it/s][A

{'loss': 1.3134, 'grad_norm': 31.481220245361328, 'learning_rate': 1.5336734693877554e-05, 'epoch': 23.32}


 23%|██▎       | 4580/19600 [1:17:25<3:40:19,  1.14it/s]
                                                        
 23%|██▎       | 4580/19600 [1:17:25<3:40:19,  1.14it/s][A

{'loss': 1.2978, 'grad_norm': 34.10097885131836, 'learning_rate': 1.53265306122449e-05, 'epoch': 23.37}


 23%|██▎       | 4590/19600 [1:17:34<3:40:04,  1.14it/s]
                                                        
 23%|██▎       | 4590/19600 [1:17:34<3:40:04,  1.14it/s][A

{'loss': 1.3087, 'grad_norm': 32.913856506347656, 'learning_rate': 1.5316326530612245e-05, 'epoch': 23.42}


 23%|██▎       | 4600/19600 [1:17:43<3:39:43,  1.14it/s]
                                                        
 23%|██▎       | 4600/19600 [1:17:43<3:39:43,  1.14it/s][A

{'loss': 1.3031, 'grad_norm': 29.96341896057129, 'learning_rate': 1.530612244897959e-05, 'epoch': 23.47}


 24%|██▎       | 4610/19600 [1:17:51<3:39:34,  1.14it/s]
                                                        
 24%|██▎       | 4610/19600 [1:17:51<3:39:34,  1.14it/s][A

{'loss': 1.2819, 'grad_norm': 29.668960571289062, 'learning_rate': 1.529591836734694e-05, 'epoch': 23.52}


 24%|██▎       | 4620/19600 [1:18:00<3:39:22,  1.14it/s]
                                                        
 24%|██▎       | 4620/19600 [1:18:00<3:39:22,  1.14it/s][A

{'loss': 1.3132, 'grad_norm': 30.65439796447754, 'learning_rate': 1.5285714285714286e-05, 'epoch': 23.57}


 24%|██▎       | 4630/19600 [1:18:09<3:39:31,  1.14it/s]
                                                        
 24%|██▎       | 4630/19600 [1:18:09<3:39:31,  1.14it/s][A

{'loss': 1.3117, 'grad_norm': 33.56980895996094, 'learning_rate': 1.5275510204081635e-05, 'epoch': 23.62}


 24%|██▎       | 4640/19600 [1:18:18<3:39:08,  1.14it/s]
                                                        
 24%|██▎       | 4640/19600 [1:18:18<3:39:08,  1.14it/s][A

{'loss': 1.2756, 'grad_norm': 35.539581298828125, 'learning_rate': 1.526530612244898e-05, 'epoch': 23.67}


 24%|██▎       | 4650/19600 [1:18:27<3:38:53,  1.14it/s]
                                                        
 24%|██▎       | 4650/19600 [1:18:27<3:38:53,  1.14it/s][A

{'loss': 1.3227, 'grad_norm': 37.12700653076172, 'learning_rate': 1.5255102040816327e-05, 'epoch': 23.72}


 24%|██▍       | 4660/19600 [1:18:35<3:38:46,  1.14it/s]
                                                        
 24%|██▍       | 4660/19600 [1:18:35<3:38:46,  1.14it/s][A

{'loss': 1.337, 'grad_norm': 29.578989028930664, 'learning_rate': 1.5244897959183676e-05, 'epoch': 23.78}


 24%|██▍       | 4670/19600 [1:18:44<3:38:31,  1.14it/s]
                                                        
 24%|██▍       | 4670/19600 [1:18:44<3:38:31,  1.14it/s][A

{'loss': 1.3675, 'grad_norm': 34.87812805175781, 'learning_rate': 1.5234693877551022e-05, 'epoch': 23.83}


 24%|██▍       | 4680/19600 [1:18:53<3:38:10,  1.14it/s]
                                                        
 24%|██▍       | 4680/19600 [1:18:53<3:38:10,  1.14it/s][A

{'loss': 1.3176, 'grad_norm': 32.8350715637207, 'learning_rate': 1.522448979591837e-05, 'epoch': 23.88}


 24%|██▍       | 4690/19600 [1:19:02<3:38:09,  1.14it/s]
                                                        
 24%|██▍       | 4690/19600 [1:19:02<3:38:09,  1.14it/s][A

{'loss': 1.3961, 'grad_norm': 31.37291145324707, 'learning_rate': 1.5214285714285715e-05, 'epoch': 23.93}


 24%|██▍       | 4700/19600 [1:19:11<3:37:50,  1.14it/s]
                                                        
 24%|██▍       | 4700/19600 [1:19:11<3:37:50,  1.14it/s][A

{'loss': 1.3166, 'grad_norm': 33.72947692871094, 'learning_rate': 1.5204081632653063e-05, 'epoch': 23.98}


 24%|██▍       | 4704/19600 [1:19:14<2:54:20,  1.42it/s]

  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  6.06it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.16it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.63it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.36it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.04it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.00it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.99it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.97it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.96it/s][A[A

 22%|██▏       | 17/79 [00:05<00:20,  2.96it/s][A[A

 23%|██▎       | 18/79 [

{'eval_loss': 2.655998706817627, 'eval_accuracy': 0.3582, 'eval_runtime': 26.5053, 'eval_samples_per_second': 377.283, 'eval_steps_per_second': 2.981, 'epoch': 24.0}


 24%|██▍       | 4710/19600 [1:19:46<9:18:13,  2.25s/it] 
                                                        
 24%|██▍       | 4710/19600 [1:19:46<9:18:13,  2.25s/it][A

{'loss': 1.2439, 'grad_norm': 36.493675231933594, 'learning_rate': 1.5193877551020408e-05, 'epoch': 24.03}


 24%|██▍       | 4720/19600 [1:19:55<3:47:29,  1.09it/s]
                                                        
 24%|██▍       | 4720/19600 [1:19:55<3:47:29,  1.09it/s][A

{'loss': 1.2061, 'grad_norm': 31.518754959106445, 'learning_rate': 1.5183673469387756e-05, 'epoch': 24.08}


 24%|██▍       | 4730/19600 [1:20:04<3:37:38,  1.14it/s]
                                                        
 24%|██▍       | 4730/19600 [1:20:04<3:37:38,  1.14it/s][A

{'loss': 1.1536, 'grad_norm': 36.27762985229492, 'learning_rate': 1.5173469387755105e-05, 'epoch': 24.13}


 24%|██▍       | 4740/19600 [1:20:13<3:37:20,  1.14it/s]
                                                        
 24%|██▍       | 4740/19600 [1:20:13<3:37:20,  1.14it/s][A

{'loss': 1.207, 'grad_norm': 35.06393814086914, 'learning_rate': 1.516326530612245e-05, 'epoch': 24.18}


 24%|██▍       | 4750/19600 [1:20:21<3:37:45,  1.14it/s]
                                                        
 24%|██▍       | 4750/19600 [1:20:22<3:37:45,  1.14it/s][A

{'loss': 1.1866, 'grad_norm': 33.9117317199707, 'learning_rate': 1.5153061224489798e-05, 'epoch': 24.23}


 24%|██▍       | 4760/19600 [1:20:30<3:37:46,  1.14it/s]
                                                        
 24%|██▍       | 4760/19600 [1:20:30<3:37:46,  1.14it/s][A

{'loss': 1.1905, 'grad_norm': 31.852136611938477, 'learning_rate': 1.5142857142857144e-05, 'epoch': 24.29}


 24%|██▍       | 4770/19600 [1:20:39<3:37:24,  1.14it/s]
                                                        
 24%|██▍       | 4770/19600 [1:20:39<3:37:24,  1.14it/s][A

{'loss': 1.2553, 'grad_norm': 34.14736557006836, 'learning_rate': 1.5132653061224492e-05, 'epoch': 24.34}


 24%|██▍       | 4780/19600 [1:20:48<3:37:10,  1.14it/s]
                                                        
 24%|██▍       | 4780/19600 [1:20:48<3:37:10,  1.14it/s][A

{'loss': 1.2413, 'grad_norm': 31.542613983154297, 'learning_rate': 1.5122448979591837e-05, 'epoch': 24.39}


 24%|██▍       | 4790/19600 [1:20:57<3:36:54,  1.14it/s]
                                                        
 24%|██▍       | 4790/19600 [1:20:57<3:36:54,  1.14it/s][A

{'loss': 1.1976, 'grad_norm': 29.001449584960938, 'learning_rate': 1.5112244897959185e-05, 'epoch': 24.44}


 24%|██▍       | 4800/19600 [1:21:05<3:36:46,  1.14it/s]
                                                        
 24%|██▍       | 4800/19600 [1:21:05<3:36:46,  1.14it/s][A

{'loss': 1.2039, 'grad_norm': 33.24000930786133, 'learning_rate': 1.510204081632653e-05, 'epoch': 24.49}


 25%|██▍       | 4810/19600 [1:21:14<3:36:32,  1.14it/s]
                                                        
 25%|██▍       | 4810/19600 [1:21:14<3:36:32,  1.14it/s][A

{'loss': 1.1927, 'grad_norm': 33.658721923828125, 'learning_rate': 1.5091836734693878e-05, 'epoch': 24.54}


 25%|██▍       | 4820/19600 [1:21:23<3:36:14,  1.14it/s]
                                                        
 25%|██▍       | 4820/19600 [1:21:23<3:36:14,  1.14it/s][A

{'loss': 1.2623, 'grad_norm': 33.1195068359375, 'learning_rate': 1.5081632653061227e-05, 'epoch': 24.59}


 25%|██▍       | 4830/19600 [1:21:32<3:36:01,  1.14it/s]
                                                        
 25%|██▍       | 4830/19600 [1:21:32<3:36:01,  1.14it/s][A

{'loss': 1.2546, 'grad_norm': 37.35557556152344, 'learning_rate': 1.5071428571428573e-05, 'epoch': 24.64}


 25%|██▍       | 4840/19600 [1:21:41<3:36:16,  1.14it/s]
                                                        
 25%|██▍       | 4840/19600 [1:21:41<3:36:16,  1.14it/s][A

{'loss': 1.1911, 'grad_norm': 31.49387550354004, 'learning_rate': 1.506122448979592e-05, 'epoch': 24.69}


 25%|██▍       | 4850/19600 [1:21:49<3:36:49,  1.13it/s]
                                                        
 25%|██▍       | 4850/19600 [1:21:49<3:36:49,  1.13it/s][A

{'loss': 1.2309, 'grad_norm': 32.33113479614258, 'learning_rate': 1.5051020408163266e-05, 'epoch': 24.74}


 25%|██▍       | 4860/19600 [1:21:58<3:36:18,  1.14it/s]
                                                        
 25%|██▍       | 4860/19600 [1:21:58<3:36:18,  1.14it/s][A

{'loss': 1.21, 'grad_norm': 34.8453483581543, 'learning_rate': 1.5040816326530614e-05, 'epoch': 24.8}


 25%|██▍       | 4870/19600 [1:22:07<3:35:59,  1.14it/s]
                                                        
 25%|██▍       | 4870/19600 [1:22:07<3:35:59,  1.14it/s][A

{'loss': 1.2256, 'grad_norm': 32.871986389160156, 'learning_rate': 1.503061224489796e-05, 'epoch': 24.85}


 25%|██▍       | 4880/19600 [1:22:16<3:35:53,  1.14it/s]
                                                        
 25%|██▍       | 4880/19600 [1:22:16<3:35:53,  1.14it/s][A

{'loss': 1.268, 'grad_norm': 32.753597259521484, 'learning_rate': 1.5020408163265307e-05, 'epoch': 24.9}


 25%|██▍       | 4890/19600 [1:22:25<3:35:56,  1.14it/s]
                                                        
 25%|██▍       | 4890/19600 [1:22:25<3:35:56,  1.14it/s][A

{'loss': 1.2979, 'grad_norm': 33.12105941772461, 'learning_rate': 1.5010204081632653e-05, 'epoch': 24.95}


 25%|██▌       | 4900/19600 [1:22:33<2:52:11,  1.42it/s]
                                                        
 25%|██▌       | 4900/19600 [1:22:33<2:52:11,  1.42it/s][A

{'loss': 1.2169, 'grad_norm': 49.59101104736328, 'learning_rate': 1.5000000000000002e-05, 'epoch': 25.0}




  0%|          | 0/79 [00:00<?, ?it/s][A[A

  3%|▎         | 2/79 [00:00<00:12,  5.99it/s][A[A

  4%|▍         | 3/79 [00:00<00:18,  4.12it/s][A[A

  5%|▌         | 4/79 [00:01<00:20,  3.58it/s][A[A

  6%|▋         | 5/79 [00:01<00:22,  3.35it/s][A[A

  8%|▊         | 6/79 [00:01<00:22,  3.20it/s][A[A

  9%|▉         | 7/79 [00:02<00:23,  3.12it/s][A[A

 10%|█         | 8/79 [00:02<00:23,  3.07it/s][A[A

 11%|█▏        | 9/79 [00:02<00:23,  3.03it/s][A[A

 13%|█▎        | 10/79 [00:03<00:22,  3.01it/s][A[A

 14%|█▍        | 11/79 [00:03<00:22,  2.99it/s][A[A

 15%|█▌        | 12/79 [00:03<00:22,  2.97it/s][A[A

 16%|█▋        | 13/79 [00:04<00:22,  2.96it/s][A[A

 18%|█▊        | 14/79 [00:04<00:21,  2.96it/s][A[A

 19%|█▉        | 15/79 [00:04<00:21,  2.96it/s][A[A

 20%|██        | 16/79 [00:05<00:21,  2.95it/s][A[A

 22%|██▏       | 17/79 [00:05<00:21,  2.94it/s][A[A

 23%|██▎       | 18/79 [00:05<00:20,  2.93it/s][A[A

 24%|██▍       | 19/79 [0

{'eval_loss': 2.6565518379211426, 'eval_accuracy': 0.3515, 'eval_runtime': 26.5665, 'eval_samples_per_second': 376.414, 'eval_steps_per_second': 2.974, 'epoch': 25.0}


 25%|██▌       | 4910/19600 [1:23:09<4:55:07,  1.21s/it] 
                                                        
 25%|██▌       | 4910/19600 [1:23:09<4:55:07,  1.21s/it][A

{'loss': 1.08, 'grad_norm': 32.88031768798828, 'learning_rate': 1.498979591836735e-05, 'epoch': 25.05}


 25%|██▌       | 4920/19600 [1:23:18<3:37:06,  1.13it/s]
                                                        
 25%|██▌       | 4920/19600 [1:23:18<3:37:06,  1.13it/s][A

{'loss': 1.105, 'grad_norm': 31.456127166748047, 'learning_rate': 1.4979591836734695e-05, 'epoch': 25.1}


 25%|██▌       | 4923/19600 [1:23:21<3:35:52,  1.13it/s]

KeyboardInterrupt: 