In [17]:
# we want to load in a pretrained resnet model.
# we want to use the ImageFolder format specified by PyTorch
# we freeze the resnet parameters and train on our new dataset.
# train and evaluate

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import json
import glob
import itertools
from PIL import Image
from PIL.Image import BILINEAR
from torchinfo import summary
from transformers import (
    AutoImageProcessor,
    TrainingArguments,
    Trainer,
    EfficientNetConfig,
    EfficientNetForImageClassification,
    ViTForImageClassification,
    AutoTokenizer,
    EfficientNetImageProcessor,
    ViTImageProcessor,
)
import evaluate
import accelerate


cudnn.benchmark = True
plt.ion()   # interactive mode

<contextlib.ExitStack at 0x2c1f990d0>

In [19]:
# For straightforward datasets, sometimes you can make do with built-in PyTorch dataset objects.
# We want to apply automated data augmentations, which will be different for the training
# and eval scenarios

data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
    ]),
    'val': transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
    ]),
}

In [20]:
data_dir = "../../data/dataset/"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

class_names = image_datasets['train'].classes

print(class_names)

print(image_datasets.keys())
print("label", image_datasets['train'][0][1])
# we will use this test image to do all our preliminary testing to make sure stuff works.
test_image = image_datasets['train'][0][0]
test_image = test_image.unsqueeze(0)
test_image.mean(), test_image.std(), test_image.shape

['1', '2', '3', '4', '5']
dict_keys(['train', 'val'])
label 0


(tensor(0.3929), tensor(0.2438), torch.Size([1, 3, 224, 224]))

In [21]:
def collate_fn(batch):
    return {
        "pixel_values": torch.stack([x[0] for x in batch]),
        "labels": torch.LongTensor([int(x[1]) for x in batch]),
    }

## Load ViT Model

In [22]:
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=5,
    ignore_mismatched_sizes=True,
)
vit

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

### Turning off all ViT Parameters Except the Head.

In [23]:
for p in vit.parameters():
    p.requires_grad= False

for p in vit.classifier.parameters():
    p.requires_grad = True



In [25]:
"""
resnet.classifier = torch.nn.Sequential(
    torch.nn.Dropout(p = 0.2, inplace = True),
    torch.nn.Linear(in_features = 2048,
                    out_features = 5,
                    bias=True)
)
"""
efficientnet = EfficientNetForImageClassification(config)

efficientnet

output = efficientnet(test_image)
output

ImageClassifierOutputWithNoAttention(loss=None, logits=tensor([[-1.0680,  0.2316,  0.6660,  0.5855, -0.2125]],
       grad_fn=<AddmmBackward0>), hidden_states=None)

## EfficientNet HuggingFace Trainer

In [26]:
# Setup the training arguments
output_dir = "./pretrainedViT"

# we should edit things like number of training epochs and the batch size
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=100,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    dataloader_num_workers=0,
#     gradient_accumulation_steps=8,
)

In [27]:
# Compute absolute learning rate
base_learning_rate = 1e-3
total_train_batch_size = (
    training_args.train_batch_size * training_args.gradient_accumulation_steps * training_args.world_size
)

training_args.learning_rate = base_learning_rate * total_train_batch_size / 256
print("Set learning rate to:", training_args.learning_rate)

Set learning rate to: 1.5625e-05


In [28]:
# Setup a function to compute accuracy metrics
metric = evaluate.load("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

In [29]:
# Create the trainer
trainer = Trainer(
    model=vit,
    args=training_args,
    train_dataset=image_datasets['train'],
    eval_dataset=image_datasets['val'],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
train_results = trainer.train()

  0%|          | 0/1900 [00:00<?, ?it/s]

  1%|          | 10/1900 [00:11<30:48,  1.02it/s] 

{'loss': 1.7575, 'grad_norm': 11.038022994995117, 'learning_rate': 1.5623932070253664e-05, 'epoch': 0.53}


                                                 
  1%|          | 19/1900 [00:20<23:05,  1.36it/s]

{'eval_loss': 1.5741634368896484, 'eval_accuracy': 0.3, 'eval_runtime': 1.6805, 'eval_samples_per_second': 5.95, 'eval_steps_per_second': 1.785, 'epoch': 1.0}


  1%|          | 20/1900 [00:22<53:40,  1.71s/it]

{'loss': 1.8914, 'grad_norm': 12.793962478637695, 'learning_rate': 1.5620728572975984e-05, 'epoch': 1.05}


  2%|▏         | 30/1900 [00:32<31:53,  1.02s/it]

{'loss': 1.8633, 'grad_norm': 14.619155883789062, 'learning_rate': 1.561539038397113e-05, 'epoch': 1.58}


                                                 
  2%|▏         | 38/1900 [00:39<19:52,  1.56it/s]

{'eval_loss': 1.5592066049575806, 'eval_accuracy': 0.3, 'eval_runtime': 1.2483, 'eval_samples_per_second': 8.011, 'eval_steps_per_second': 2.403, 'epoch': 2.0}


  2%|▏         | 40/1900 [00:42<39:46,  1.28s/it]

{'loss': 1.7003, 'grad_norm': 12.361721992492676, 'learning_rate': 1.5607918962646676e-05, 'epoch': 2.11}


  3%|▎         | 50/1900 [00:52<28:38,  1.08it/s]

{'loss': 1.8194, 'grad_norm': 11.863935470581055, 'learning_rate': 1.5598316351614608e-05, 'epoch': 2.63}


                                                 
  3%|▎         | 57/1900 [00:58<20:25,  1.50it/s]

{'eval_loss': 1.5458256006240845, 'eval_accuracy': 0.3, 'eval_runtime': 1.1972, 'eval_samples_per_second': 8.353, 'eval_steps_per_second': 2.506, 'epoch': 3.0}


  3%|▎         | 60/1900 [01:03<42:34,  1.39s/it]

{'loss': 1.6079, 'grad_norm': 12.882031440734863, 'learning_rate': 1.55865851761329e-05, 'epoch': 3.16}


  4%|▎         | 70/1900 [01:15<33:29,  1.10s/it]

{'loss': 1.7153, 'grad_norm': 10.410015106201172, 'learning_rate': 1.55727286433878e-05, 'epoch': 3.68}


                                                 
  4%|▍         | 76/1900 [01:22<23:15,  1.31it/s]

{'eval_loss': 1.5334198474884033, 'eval_accuracy': 0.4, 'eval_runtime': 1.3098, 'eval_samples_per_second': 7.635, 'eval_steps_per_second': 2.29, 'epoch': 4.0}


  4%|▍         | 80/1900 [01:27<38:01,  1.25s/it]

{'loss': 1.7844, 'grad_norm': 12.91823959350586, 'learning_rate': 1.5556750541616993e-05, 'epoch': 4.21}


  5%|▍         | 90/1900 [01:37<27:32,  1.10it/s]

{'loss': 1.7197, 'grad_norm': 11.423334121704102, 'learning_rate': 1.5538655239073973e-05, 'epoch': 4.74}


                                                 
  5%|▌         | 95/1900 [01:42<20:42,  1.45it/s]

{'eval_loss': 1.5214097499847412, 'eval_accuracy': 0.4, 'eval_runtime': 1.2013, 'eval_samples_per_second': 8.324, 'eval_steps_per_second': 2.497, 'epoch': 5.0}


  5%|▌         | 100/1900 [01:49<34:03,  1.14s/it]

{'loss': 1.7562, 'grad_norm': 11.415888786315918, 'learning_rate': 1.551844768283377e-05, 'epoch': 5.26}


  6%|▌         | 110/1900 [01:59<30:41,  1.03s/it]

{'loss': 1.6946, 'grad_norm': 11.739452362060547, 'learning_rate': 1.549613339744049e-05, 'epoch': 5.79}


                                                  
  6%|▌         | 114/1900 [02:04<23:51,  1.25it/s]

{'eval_loss': 1.5095951557159424, 'eval_accuracy': 0.4, 'eval_runtime': 1.5958, 'eval_samples_per_second': 6.267, 'eval_steps_per_second': 1.88, 'epoch': 6.0}


  6%|▋         | 120/1900 [02:11<31:53,  1.07s/it]

{'loss': 1.7511, 'grad_norm': 13.31485652923584, 'learning_rate': 1.5471718483396948e-05, 'epoch': 6.32}


  7%|▋         | 130/1900 [02:21<30:34,  1.04s/it]

{'loss': 1.6459, 'grad_norm': 10.76272201538086, 'learning_rate': 1.544520961549687e-05, 'epoch': 6.84}


                                                  
  7%|▋         | 133/1900 [02:25<24:09,  1.22it/s]

{'eval_loss': 1.4997527599334717, 'eval_accuracy': 0.4, 'eval_runtime': 1.1756, 'eval_samples_per_second': 8.506, 'eval_steps_per_second': 2.552, 'epoch': 7.0}


  7%|▋         | 140/1900 [02:33<33:58,  1.16s/it]

{'loss': 1.5523, 'grad_norm': 10.23902702331543, 'learning_rate': 1.5416614041000046e-05, 'epoch': 7.37}


  8%|▊         | 150/1900 [02:43<30:03,  1.03s/it]

{'loss': 1.6945, 'grad_norm': 12.410852432250977, 'learning_rate': 1.538593957765102e-05, 'epoch': 7.89}


                                                  
  8%|▊         | 152/1900 [02:46<21:59,  1.32it/s]

{'eval_loss': 1.4895769357681274, 'eval_accuracy': 0.4, 'eval_runtime': 1.3526, 'eval_samples_per_second': 7.393, 'eval_steps_per_second': 2.218, 'epoch': 8.0}


  8%|▊         | 160/1900 [02:55<31:27,  1.08s/it]

{'loss': 1.6037, 'grad_norm': 11.109977722167969, 'learning_rate': 1.5353194611541787e-05, 'epoch': 8.42}


  9%|▉         | 170/1900 [03:04<26:00,  1.11it/s]

{'loss': 1.6462, 'grad_norm': 9.599874496459961, 'learning_rate': 1.5318388094819127e-05, 'epoch': 8.95}


                                                  
  9%|▉         | 171/1900 [03:06<20:20,  1.42it/s]

{'eval_loss': 1.481583595275879, 'eval_accuracy': 0.4, 'eval_runtime': 1.3053, 'eval_samples_per_second': 7.661, 'eval_steps_per_second': 2.298, 'epoch': 9.0}


  9%|▉         | 180/1900 [03:16<27:39,  1.04it/s]

{'loss': 1.5705, 'grad_norm': 10.9679594039917, 'learning_rate': 1.528152954323717e-05, 'epoch': 9.47}


 10%|█         | 190/1900 [03:25<20:12,  1.41it/s]

{'loss': 1.7005, 'grad_norm': 14.98248291015625, 'learning_rate': 1.5242629033555888e-05, 'epoch': 10.0}


                                                  
 10%|█         | 190/1900 [03:26<20:12,  1.41it/s]

{'eval_loss': 1.4720083475112915, 'eval_accuracy': 0.4, 'eval_runtime': 1.2153, 'eval_samples_per_second': 8.228, 'eval_steps_per_second': 2.469, 'epoch': 10.0}


 11%|█         | 200/1900 [03:38<30:52,  1.09s/it]

{'loss': 1.5574, 'grad_norm': 13.475648880004883, 'learning_rate': 1.5201697200786208e-05, 'epoch': 10.53}


                                                  
 11%|█         | 209/1900 [03:47<19:30,  1.44it/s]

{'eval_loss': 1.465198278427124, 'eval_accuracy': 0.4, 'eval_runtime': 1.4616, 'eval_samples_per_second': 6.842, 'eval_steps_per_second': 2.053, 'epoch': 11.0}


 11%|█         | 210/1900 [03:49<46:43,  1.66s/it]

{'loss': 1.6264, 'grad_norm': 9.811136245727539, 'learning_rate': 1.5158745235282511e-05, 'epoch': 11.05}


 12%|█▏        | 220/1900 [04:00<32:49,  1.17s/it]

{'loss': 1.5943, 'grad_norm': 11.206347465515137, 'learning_rate': 1.5113784879683288e-05, 'epoch': 11.58}


                                                  
 12%|█▏        | 228/1900 [04:08<19:43,  1.41it/s]

{'eval_loss': 1.457028865814209, 'eval_accuracy': 0.4, 'eval_runtime': 1.3439, 'eval_samples_per_second': 7.441, 'eval_steps_per_second': 2.232, 'epoch': 12.0}


 12%|█▏        | 230/1900 [04:13<44:51,  1.61s/it]

{'loss': 1.5313, 'grad_norm': 10.90526008605957, 'learning_rate': 1.5066828425700837e-05, 'epoch': 12.11}


 13%|█▎        | 240/1900 [04:24<33:15,  1.20s/it]

{'loss': 1.644, 'grad_norm': 12.981010437011719, 'learning_rate': 1.5017888710760819e-05, 'epoch': 12.63}


                                                  
 13%|█▎        | 247/1900 [04:31<21:00,  1.31it/s]

{'eval_loss': 1.450195074081421, 'eval_accuracy': 0.3, 'eval_runtime': 1.1821, 'eval_samples_per_second': 8.459, 'eval_steps_per_second': 2.538, 'epoch': 13.0}


 13%|█▎        | 250/1900 [04:36<37:27,  1.36s/it]

{'loss': 1.5364, 'grad_norm': 12.827106475830078, 'learning_rate': 1.4966979114492635e-05, 'epoch': 13.16}


 14%|█▎        | 260/1900 [04:47<27:09,  1.01it/s]

{'loss': 1.5439, 'grad_norm': 10.590896606445312, 'learning_rate': 1.4914113555071582e-05, 'epoch': 13.68}


                                                  
 14%|█▍        | 266/1900 [04:53<20:24,  1.33it/s]

{'eval_loss': 1.4442840814590454, 'eval_accuracy': 0.3, 'eval_runtime': 1.2849, 'eval_samples_per_second': 7.782, 'eval_steps_per_second': 2.335, 'epoch': 14.0}


 14%|█▍        | 270/1900 [04:58<31:39,  1.17s/it]

{'loss': 1.5353, 'grad_norm': 13.417044639587402, 'learning_rate': 1.4859306485413743e-05, 'epoch': 14.21}


 15%|█▍        | 280/1900 [05:10<34:24,  1.27s/it]

{'loss': 1.5991, 'grad_norm': 12.524300575256348, 'learning_rate': 1.4802572889224715e-05, 'epoch': 14.74}


                                                  
 15%|█▌        | 285/1900 [05:15<22:19,  1.21it/s]

{'eval_loss': 1.437681794166565, 'eval_accuracy': 0.3, 'eval_runtime': 1.2497, 'eval_samples_per_second': 8.002, 'eval_steps_per_second': 2.401, 'epoch': 15.0}


 15%|█▌        | 290/1900 [05:22<31:59,  1.19s/it]

{'loss': 1.4321, 'grad_norm': 12.098123550415039, 'learning_rate': 1.4743928276903218e-05, 'epoch': 15.26}


 16%|█▌        | 300/1900 [05:34<31:48,  1.19s/it]

{'loss': 1.4835, 'grad_norm': 11.09827995300293, 'learning_rate': 1.4683388681300695e-05, 'epoch': 15.79}


                                                  
 16%|█▌        | 304/1900 [05:39<21:44,  1.22it/s]

{'eval_loss': 1.4318158626556396, 'eval_accuracy': 0.3, 'eval_runtime': 1.512, 'eval_samples_per_second': 6.614, 'eval_steps_per_second': 1.984, 'epoch': 16.0}


 16%|█▋        | 310/1900 [05:47<33:38,  1.27s/it]

{'loss': 1.5841, 'grad_norm': 12.02194595336914, 'learning_rate': 1.4620970653338104e-05, 'epoch': 16.32}


 17%|█▋        | 320/1900 [05:59<27:30,  1.04s/it]

{'loss': 1.4399, 'grad_norm': 9.179180145263672, 'learning_rate': 1.4556691257481048e-05, 'epoch': 16.84}


                                                  
 17%|█▋        | 323/1900 [06:02<18:34,  1.41it/s]

{'eval_loss': 1.4269609451293945, 'eval_accuracy': 0.3, 'eval_runtime': 1.298, 'eval_samples_per_second': 7.704, 'eval_steps_per_second': 2.311, 'epoch': 17.0}


 17%|█▋        | 330/1900 [06:11<31:01,  1.19s/it]

{'loss': 1.5762, 'grad_norm': 10.892380714416504, 'learning_rate': 1.4490568067074524e-05, 'epoch': 17.37}


 18%|█▊        | 340/1900 [06:24<28:21,  1.09s/it]

{'loss': 1.4075, 'grad_norm': 10.168344497680664, 'learning_rate': 1.4422619159538557e-05, 'epoch': 17.89}


                                                  
 18%|█▊        | 342/1900 [06:26<20:40,  1.26it/s]

{'eval_loss': 1.4221709966659546, 'eval_accuracy': 0.3, 'eval_runtime': 1.3054, 'eval_samples_per_second': 7.661, 'eval_steps_per_second': 2.298, 'epoch': 18.0}


 18%|█▊        | 350/1900 [06:38<30:21,  1.18s/it]

{'loss': 1.4265, 'grad_norm': 11.654729843139648, 'learning_rate': 1.4352863111426005e-05, 'epoch': 18.42}


 19%|█▉        | 360/1900 [06:49<24:50,  1.03it/s]

{'loss': 1.5273, 'grad_norm': 11.045371055603027, 'learning_rate': 1.4281318993343907e-05, 'epoch': 18.95}


                                                  
 19%|█▉        | 361/1900 [06:51<19:14,  1.33it/s]

{'eval_loss': 1.417417287826538, 'eval_accuracy': 0.3, 'eval_runtime': 1.311, 'eval_samples_per_second': 7.628, 'eval_steps_per_second': 2.288, 'epoch': 19.0}


 19%|█▉        | 370/1900 [07:04<32:25,  1.27s/it]

{'loss': 1.4824, 'grad_norm': 9.857304573059082, 'learning_rate': 1.4208006364739787e-05, 'epoch': 19.47}


 20%|██        | 380/1900 [07:14<19:07,  1.32it/s]

{'loss': 1.487, 'grad_norm': 16.10903549194336, 'learning_rate': 1.4132945268554277e-05, 'epoch': 20.0}


                                                  
 20%|██        | 380/1900 [07:15<19:07,  1.32it/s]

{'eval_loss': 1.4129526615142822, 'eval_accuracy': 0.3, 'eval_runtime': 1.4101, 'eval_samples_per_second': 7.092, 'eval_steps_per_second': 2.128, 'epoch': 20.0}


 21%|██        | 390/1900 [07:27<29:20,  1.17s/it]

{'loss': 1.37, 'grad_norm': 8.683388710021973, 'learning_rate': 1.40561562257416e-05, 'epoch': 20.53}


                                                  
 21%|██        | 399/1900 [07:38<20:27,  1.22it/s]

{'eval_loss': 1.4083919525146484, 'eval_accuracy': 0.3, 'eval_runtime': 1.3824, 'eval_samples_per_second': 7.234, 'eval_steps_per_second': 2.17, 'epoch': 21.0}


 21%|██        | 400/1900 [07:40<43:44,  1.75s/it]

{'loss': 1.5206, 'grad_norm': 11.055185317993164, 'learning_rate': 1.3977660229659327e-05, 'epoch': 21.05}


 22%|██▏       | 410/1900 [07:50<26:59,  1.09s/it]

{'loss': 1.4636, 'grad_norm': 11.749948501586914, 'learning_rate': 1.3897478740329005e-05, 'epoch': 21.58}


                                                  
 22%|██▏       | 418/1900 [07:59<18:25,  1.34it/s]

{'eval_loss': 1.4047787189483643, 'eval_accuracy': 0.3, 'eval_runtime': 1.3027, 'eval_samples_per_second': 7.676, 'eval_steps_per_second': 2.303, 'epoch': 22.0}


 22%|██▏       | 420/1900 [08:03<35:30,  1.44s/it]

{'loss': 1.4021, 'grad_norm': 12.101678848266602, 'learning_rate': 1.3815633678569213e-05, 'epoch': 22.11}


 23%|██▎       | 430/1900 [08:15<32:46,  1.34s/it]

{'loss': 1.4018, 'grad_norm': 9.490972518920898, 'learning_rate': 1.3732147420002617e-05, 'epoch': 22.63}


                                                  
 23%|██▎       | 437/1900 [08:23<19:01,  1.28it/s]

{'eval_loss': 1.4010672569274902, 'eval_accuracy': 0.3, 'eval_runtime': 1.2718, 'eval_samples_per_second': 7.863, 'eval_steps_per_second': 2.359, 'epoch': 23.0}


 23%|██▎       | 440/1900 [08:28<35:37,  1.46s/it]

{'loss': 1.4584, 'grad_norm': 10.68137264251709, 'learning_rate': 1.3647042788938703e-05, 'epoch': 23.16}


 24%|██▎       | 450/1900 [08:41<36:39,  1.52s/it]

{'loss': 1.425, 'grad_norm': 9.594575881958008, 'learning_rate': 1.3560343052133842e-05, 'epoch': 23.68}


                                                  
 24%|██▍       | 456/1900 [08:47<18:24,  1.31it/s]

{'eval_loss': 1.3967926502227783, 'eval_accuracy': 0.3, 'eval_runtime': 1.2554, 'eval_samples_per_second': 7.966, 'eval_steps_per_second': 2.39, 'epoch': 24.0}


 24%|██▍       | 460/1900 [08:53<29:25,  1.23s/it]

{'loss': 1.5046, 'grad_norm': 13.000336647033691, 'learning_rate': 1.3472071912430384e-05, 'epoch': 24.21}


 25%|██▍       | 470/1900 [09:04<27:24,  1.15s/it]

{'loss': 1.3748, 'grad_norm': 10.172225952148438, 'learning_rate': 1.338225350227654e-05, 'epoch': 24.74}


                                                  
 25%|██▌       | 475/1900 [09:10<20:12,  1.18it/s]

{'eval_loss': 1.3937911987304688, 'eval_accuracy': 0.3, 'eval_runtime': 1.3973, 'eval_samples_per_second': 7.157, 'eval_steps_per_second': 2.147, 'epoch': 25.0}


 25%|██▌       | 480/1900 [09:17<30:31,  1.29s/it]

{'loss': 1.3556, 'grad_norm': 10.40406322479248, 'learning_rate': 1.3290912377128817e-05, 'epoch': 25.26}


 26%|██▌       | 490/1900 [09:29<25:56,  1.10s/it]

{'loss': 1.4293, 'grad_norm': 10.250228881835938, 'learning_rate': 1.3198073508738796e-05, 'epoch': 25.79}


                                                  
 26%|██▌       | 494/1900 [09:34<19:55,  1.18it/s]

{'eval_loss': 1.3910754919052124, 'eval_accuracy': 0.3, 'eval_runtime': 1.4717, 'eval_samples_per_second': 6.795, 'eval_steps_per_second': 2.038, 'epoch': 26.0}


 26%|██▋       | 500/1900 [09:42<29:24,  1.26s/it]

{'loss': 1.3263, 'grad_norm': 9.197121620178223, 'learning_rate': 1.3103762278326103e-05, 'epoch': 26.32}


 27%|██▋       | 510/1900 [09:55<30:42,  1.33s/it]

{'loss': 1.4172, 'grad_norm': 9.965153694152832, 'learning_rate': 1.300800446963944e-05, 'epoch': 26.84}


                                                  
 27%|██▋       | 513/1900 [09:59<20:09,  1.15it/s]

{'eval_loss': 1.3882737159729004, 'eval_accuracy': 0.3, 'eval_runtime': 1.3575, 'eval_samples_per_second': 7.366, 'eval_steps_per_second': 2.21, 'epoch': 27.0}


 27%|██▋       | 520/1900 [10:10<35:41,  1.55s/it]

{'loss': 1.3954, 'grad_norm': 11.86302375793457, 'learning_rate': 1.2910826261907567e-05, 'epoch': 27.37}


 28%|██▊       | 530/1900 [10:22<25:05,  1.10s/it]

{'loss': 1.3514, 'grad_norm': 9.59329605102539, 'learning_rate': 1.2812254222682162e-05, 'epoch': 27.89}


                                                  
 28%|██▊       | 532/1900 [10:24<17:16,  1.32it/s]

{'eval_loss': 1.3848762512207031, 'eval_accuracy': 0.3, 'eval_runtime': 1.3887, 'eval_samples_per_second': 7.201, 'eval_steps_per_second': 2.16, 'epoch': 28.0}


 28%|██▊       | 540/1900 [10:37<30:06,  1.33s/it]

{'loss': 1.3986, 'grad_norm': 10.112848281860352, 'learning_rate': 1.2712315300574508e-05, 'epoch': 28.42}


 29%|██▉       | 550/1900 [10:47<22:17,  1.01it/s]

{'loss': 1.3445, 'grad_norm': 8.263205528259277, 'learning_rate': 1.2611036817888031e-05, 'epoch': 28.95}


                                                  
 29%|██▉       | 551/1900 [10:49<17:19,  1.30it/s]

{'eval_loss': 1.3822101354599, 'eval_accuracy': 0.3, 'eval_runtime': 1.3141, 'eval_samples_per_second': 7.61, 'eval_steps_per_second': 2.283, 'epoch': 29.0}


 29%|██▉       | 560/1900 [10:59<24:29,  1.10s/it]

{'loss': 1.2907, 'grad_norm': 14.780638694763184, 'learning_rate': 1.2508446463148633e-05, 'epoch': 29.47}


 30%|███       | 570/1900 [11:10<17:24,  1.27it/s]

{'loss': 1.4195, 'grad_norm': 11.360082626342773, 'learning_rate': 1.2404572283534946e-05, 'epoch': 30.0}


                                                  
 30%|███       | 570/1900 [11:11<17:24,  1.27it/s]

{'eval_loss': 1.3795673847198486, 'eval_accuracy': 0.4, 'eval_runtime': 1.1881, 'eval_samples_per_second': 8.417, 'eval_steps_per_second': 2.525, 'epoch': 30.0}


 31%|███       | 580/1900 [11:23<24:21,  1.11s/it]

{'loss': 1.2967, 'grad_norm': 11.1680908203125, 'learning_rate': 1.2299442677210496e-05, 'epoch': 30.53}


                                                  
 31%|███       | 589/1900 [11:33<15:27,  1.41it/s]

{'eval_loss': 1.377362608909607, 'eval_accuracy': 0.4, 'eval_runtime': 1.1842, 'eval_samples_per_second': 8.444, 'eval_steps_per_second': 2.533, 'epoch': 31.0}


 31%|███       | 590/1900 [11:35<36:22,  1.67s/it]

{'loss': 1.427, 'grad_norm': 12.817133903503418, 'learning_rate': 1.2193086385559945e-05, 'epoch': 31.05}


 32%|███▏      | 600/1900 [11:46<24:03,  1.11s/it]

{'loss': 1.2739, 'grad_norm': 11.548437118530273, 'learning_rate': 1.208553248533146e-05, 'epoch': 31.58}


                                                  
 32%|███▏      | 608/1900 [11:54<15:53,  1.35it/s]

{'eval_loss': 1.3747724294662476, 'eval_accuracy': 0.4, 'eval_runtime': 1.2417, 'eval_samples_per_second': 8.054, 'eval_steps_per_second': 2.416, 'epoch': 32.0}


 32%|███▏      | 610/1900 [11:57<30:16,  1.41s/it]

{'loss': 1.3741, 'grad_norm': 11.189805030822754, 'learning_rate': 1.1976810380687451e-05, 'epoch': 32.11}


 33%|███▎      | 620/1900 [12:08<27:20,  1.28s/it]

{'loss': 1.3042, 'grad_norm': 9.627227783203125, 'learning_rate': 1.1866949795165744e-05, 'epoch': 32.63}


                                                  
 33%|███▎      | 627/1900 [12:16<16:17,  1.30it/s]

{'eval_loss': 1.3721427917480469, 'eval_accuracy': 0.4, 'eval_runtime': 1.315, 'eval_samples_per_second': 7.605, 'eval_steps_per_second': 2.281, 'epoch': 33.0}


 33%|███▎      | 630/1900 [12:22<33:07,  1.56s/it]

{'loss': 1.3837, 'grad_norm': 10.21541690826416, 'learning_rate': 1.1755980763553469e-05, 'epoch': 33.16}


 34%|███▎      | 640/1900 [12:34<26:04,  1.24s/it]

{'loss': 1.2982, 'grad_norm': 9.560623168945312, 'learning_rate': 1.1643933623675845e-05, 'epoch': 33.68}


                                                  
 34%|███▍      | 646/1900 [12:41<16:59,  1.23it/s]

{'eval_loss': 1.3704599142074585, 'eval_accuracy': 0.4, 'eval_runtime': 1.1728, 'eval_samples_per_second': 8.526, 'eval_steps_per_second': 2.558, 'epoch': 34.0}


 34%|███▍      | 650/1900 [12:46<25:34,  1.23s/it]

{'loss': 1.3737, 'grad_norm': 9.790844917297363, 'learning_rate': 1.1530839008102138e-05, 'epoch': 34.21}


 35%|███▍      | 660/1900 [12:59<26:49,  1.30s/it]

{'loss': 1.2568, 'grad_norm': 11.676085472106934, 'learning_rate': 1.1416727835771e-05, 'epoch': 34.74}


                                                  
 35%|███▌      | 665/1900 [13:06<18:45,  1.10it/s]

{'eval_loss': 1.3691213130950928, 'eval_accuracy': 0.4, 'eval_runtime': 1.3589, 'eval_samples_per_second': 7.359, 'eval_steps_per_second': 2.208, 'epoch': 35.0}


KeyboardInterrupt: 