In [1]:
from datasets import Dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import zstandard as zstd
import chess.pgn
import io
import re
from torch.utils.data import DataLoader
import math
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import stockfish
import chess
import chess.engine

  from .autonotebook import tqdm as notebook_tqdm


# READ DATA

Note to reader: the dataset size is set by the `n` argument of `read_first_n_games_as_strings` (at most `n` games are read from the file).

In [2]:
def format_pgn(text):
    """Turn the PGN text of one game into a flat, space-separated token string.

    The text is cleaned in this order:
      1. ``[...]`` spans (header tags and any bracketed annotation) are removed.
      2. The game result (``1-0``, ``0-1`` or ``1/2-1/2``) is removed.
      3. ``{...}`` comments are removed.
      4. Move numbers, piece letters and special symbols (castling, check,
         mate, capture, promotion) are separated by single spaces.
      5. ``<EOS>`` is appended as the final token.

    Args:
        text: PGN text of a single game.

    Returns:
        The cleaned move sequence with single spaces between tokens, ending
        in ``<EOS>``.
    """
    # Remove metadata (anything within square brackets on one line)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove the result of the game (win, loss, or draw)
    text = re.sub(r'\s*(1-0|0-1|1/2-1/2)\s*', '', text)

    # Collapse newlines/extra spaces first: the comment regex below has no
    # DOTALL flag, so a comment spanning several lines would otherwise survive.
    text = ' '.join(text.split())

    # Remove `{ ... }` (comments or analysis)
    text = re.sub(r'\{.*?\}', '', text)

    # Normalize spaces
    text = ' '.join(text.split())

    # Patterns for the components that must become separate tokens
    move_number_pattern = re.compile(r'(\d+\.)')  # Move numbers (e.g., "1.")
    piece_pattern = re.compile(r'([KQRBN])')  # Chess pieces (e.g., "N", "K")
    # Alternation is tried left to right, so "O-O-O" has to come before
    # "O-O"; otherwise queenside castling is split into "O-O" and "-O".
    special_move_pattern = re.compile(r'(O-O-O|O-O|\+|#|x|=Q|=R|=B|=N)')

    # Ensure move numbers, pieces, and special moves are space-separated
    text = move_number_pattern.sub(r'\1 ', text)  # Move number spacing
    text = piece_pattern.sub(r'\1 ', text)  # Piece spacing
    text = special_move_pattern.sub(r' \1 ', text)  # Special moves spacing

    text += ' <EOS>'

    # Final whitespace normalization after all the inserted spaces
    return ' '.join(text.split())


def read_first_n_games_as_strings(file_path, n=100):
    """Read up to ``n`` games from a zstd-compressed PGN file and format them.

    The file is decompressed as a stream and parsed game by game with
    ``chess.pgn.read_game``; each parsed game is converted to text with
    ``str(game)`` and cleaned with ``format_pgn``.

    Args:
        file_path: Path to the ``.pgn.zst`` file.
        n: Maximum number of games to read (default 100).

    Returns:
        A list of formatted game strings. It is shorter than ``n`` when the
        file contains fewer games.
    """
    with open(file_path, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        decompressed = dctx.stream_reader(f)
        # read_game needs a text handle, so wrap the decompressed byte stream
        pgn_text = io.TextIOWrapper(decompressed, encoding='utf-8')

        games = []
        for _ in range(n):
            game = chess.pgn.read_game(pgn_text)
            if game is None:
                break  # Stop if no more games are available

            # str(game) is the only export needed; the extra exporter pass
            # whose result was discarded has been removed.
            games.append(format_pgn(str(game)))

    return games


# PREP DATA FOR TOKENIZER

In [3]:
# Path to the zstd-compressed PGN file -- replace with your file path
file_path = "/Users/nourfahmy/Documents/GitHub/llm-playbooks/lichess_db_standard_rated_2025-02.pgn.zst"
# Maximum number of games to load -- replace with number of games
number_of_games = 100000
games_as_strings = read_first_n_games_as_strings(file_path, n=number_of_games)

# Sanity check: show the first formatted game
print(games_as_strings[0])

# Wrap the games in a Hugging Face Dataset with a single "text" column
dataset = Dataset.from_dict({"text": games_as_strings})

# 80/20 train/validation split with a fixed seed
train_data, val_data = train_test_split(
    dataset["text"],
    test_size=0.2,
    random_state=42,
)

train_dataset = Dataset.from_dict({"text": train_data})
val_dataset = Dataset.from_dict({"text": val_data})

# From here on `dataset` holds both splits, keyed by split name
dataset = DatasetDict(train=train_dataset, validation=val_dataset)


1. d4 1. .. N f6 2. f3 2. .. d5 3. c4 3. .. c6 4. e3 4. .. g6 5. N c3 5. .. B g7 6. B d3 6. .. O-O 7. N ge2 7. .. N a6 8. a3 8. .. N c7 9. b4 9. .. R e8 10. O-O 10. .. B d7 11. c5 11. .. e5 12. h3 12. .. e x d4 13. N x d4 13. .. N e6 14. R b1 14. .. N x d4 15. e x d4 15. .. N h5 16. N e2 16. .. Q f6 17. B c2 17. .. B f5 18. B x f5 18. .. Q x f5 19. B b2 19. .. N f4 20. N x f4 20. .. Q x f4 21. Q c2 21. .. R e3 22. B c1 22. .. B x d4 23. B x e3 23. .. B x e3 + 24. K h1 24. .. h5 25. Q e2 25. .. d4 26. R be1 26. .. R e8 27. Q d3 27. .. R e6 28. R e2 28. .. Q g3 29. R x e3 29. .. d x e3 30. Q d8 + 30. .. K g7 31. Q d3 31. .. e2 32. R g1 32. .. e1 =Q 33. R x e1 33. .. R x e1 + <EOS>


# TRAIN TOKENIZER

Note to reader: if you experiment with model size, adjust `tokenizer_name` in the cell below accordingly.

In [4]:
# note to reader: if experimenting with model size, adjust tokenizer name accordingly below
tokenizer_name = "gpt2"
old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Define special tokens
special_tokens = {
    "eos_token": "<EOS>",
    "pad_token": "[PAD]"
}

# Register the new EOS and PAD tokens on the pretrained tokenizer
old_tokenizer.add_special_tokens(special_tokens)

# Train a new tokenizer with the same pipeline as old_tokenizer.
# Iterating a Dataset yields row dicts, not strings, so the "text" column is
# passed explicitly to make the trainer see the actual game strings.
# NOTE(review): a vocab size of 120 is smaller than the 256-symbol byte-level
# alphabet GPT-2 tokenizers start from, so presumably no merges are learned --
# confirm this is intended.
tokenizer = old_tokenizer.train_new_from_iterator(dataset['train']['text'], 120)






# TOKENIZE DATA

In [5]:
# Tokenize Function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Every sequence is truncated or padded to exactly 128 tokens, and the
    special-tokens mask is returned alongside the encodings.
    """
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_special_tokens_mask=True,
    )
    texts = examples['text']
    return tokenizer(texts, **encode_options)


# Tokenize both splits in batches; the raw "text" column is dropped afterwards
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split handles used by the Trainer
train_subset, eval_subset = tokenized_datasets["train"], tokenized_datasets["validation"]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 80000/80000 [00:15<00:00, 5018.38 examples/s]
Map: 100%|██████████| 20000/20000 [00:04<00:00, 4971.31 examples/s]


# SET UP AND CONFIGURE MODEL

In [None]:
# Create GPT model configuration
# note to reader: you can modify configuration to see how it affects chess performance
config = GPT2Config(
    # len(tokenizer) counts added tokens too, so every id the tokenizer can
    # emit has an embedding row.
    vocab_size=len(tokenizer),
    n_positions=128,  # matches max_length used when tokenizing
    n_embd=768,
    n_layer=12,
    n_head=12,
    # GPT2Config defaults bos/eos ids to 50256 (the original GPT-2 vocab),
    # which is out of range for this small vocabulary; take them from the
    # tokenizer so the saved config agrees with the tokenizer.
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Initialize model (randomly initialised weights, trained from scratch)
model = GPT2LMHeadModel(config)


# Collator for language modelling; mlm=False turns off masked-LM masking
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    # Where checkpoints and logs go, and how many checkpoints to keep
    output_dir="./gpt-chess",
    logging_dir="./logs",
    save_total_limit=2,
    # Evaluate and save once per epoch; log every 10 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

trainer.train()



  trainer = Trainer(
  0%|          | 10/30000 [00:03<2:44:27,  3.04it/s]

{'loss': 5.3712, 'grad_norm': 73.04951477050781, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/30000 [00:06<2:37:32,  3.17it/s]

{'loss': 3.6444, 'grad_norm': 20.930946350097656, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/30000 [00:10<2:36:37,  3.19it/s]

{'loss': 2.7407, 'grad_norm': 10.762998580932617, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 40/30000 [00:13<2:36:07,  3.20it/s]

{'loss': 2.222, 'grad_norm': 6.899388313293457, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 50/30000 [00:16<2:36:14,  3.19it/s]

{'loss': 1.8776, 'grad_norm': 8.799864768981934, 'learning_rate': 5e-06, 'epoch': 0.01}


  0%|          | 60/30000 [00:19<2:36:18,  3.19it/s]

{'loss': 1.6023, 'grad_norm': 6.537903308868408, 'learning_rate': 6e-06, 'epoch': 0.01}


  0%|          | 70/30000 [00:22<2:36:15,  3.19it/s]

{'loss': 1.4766, 'grad_norm': 8.028989791870117, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


  0%|          | 80/30000 [00:25<2:36:07,  3.19it/s]

{'loss': 1.3859, 'grad_norm': 13.415802955627441, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


  0%|          | 90/30000 [00:28<2:36:08,  3.19it/s]

{'loss': 1.3183, 'grad_norm': 10.153280258178711, 'learning_rate': 9e-06, 'epoch': 0.01}


  0%|          | 100/30000 [00:32<2:36:04,  3.19it/s]

{'loss': 1.2024, 'grad_norm': 9.133661270141602, 'learning_rate': 1e-05, 'epoch': 0.01}


  0%|          | 110/30000 [00:35<2:36:01,  3.19it/s]

{'loss': 1.1661, 'grad_norm': 12.107105255126953, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


  0%|          | 120/30000 [00:38<2:36:00,  3.19it/s]

{'loss': 1.0985, 'grad_norm': 14.411551475524902, 'learning_rate': 1.2e-05, 'epoch': 0.01}


  0%|          | 130/30000 [00:41<2:35:53,  3.19it/s]

{'loss': 1.1009, 'grad_norm': 14.274845123291016, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


  0%|          | 140/30000 [00:44<2:35:50,  3.19it/s]

{'loss': 1.0783, 'grad_norm': 5.611726760864258, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}


  0%|          | 150/30000 [00:47<2:35:46,  3.19it/s]

{'loss': 1.035, 'grad_norm': 7.783790588378906, 'learning_rate': 1.5e-05, 'epoch': 0.01}


  1%|          | 160/30000 [00:50<2:35:47,  3.19it/s]

{'loss': 1.0399, 'grad_norm': 6.530505180358887, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


  1%|          | 170/30000 [00:53<2:35:37,  3.19it/s]

{'loss': 0.9998, 'grad_norm': 8.606317520141602, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.02}


  1%|          | 180/30000 [00:57<2:35:38,  3.19it/s]

{'loss': 1.0125, 'grad_norm': 4.469051361083984, 'learning_rate': 1.8e-05, 'epoch': 0.02}


  1%|          | 190/30000 [01:00<2:35:32,  3.19it/s]

{'loss': 1.0135, 'grad_norm': 9.8687744140625, 'learning_rate': 1.9e-05, 'epoch': 0.02}


  1%|          | 200/30000 [01:03<2:35:58,  3.18it/s]

{'loss': 0.9615, 'grad_norm': 5.607781410217285, 'learning_rate': 2e-05, 'epoch': 0.02}


  1%|          | 210/30000 [01:06<2:35:27,  3.19it/s]

{'loss': 0.9862, 'grad_norm': 5.918869495391846, 'learning_rate': 2.1e-05, 'epoch': 0.02}


  1%|          | 220/30000 [01:09<2:35:23,  3.19it/s]

{'loss': 0.943, 'grad_norm': 5.834027290344238, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.02}


  1%|          | 230/30000 [01:12<2:35:40,  3.19it/s]

{'loss': 0.948, 'grad_norm': 7.522834300994873, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.02}


  1%|          | 240/30000 [01:15<2:35:28,  3.19it/s]

{'loss': 0.894, 'grad_norm': 7.429811477661133, 'learning_rate': 2.4e-05, 'epoch': 0.02}


  1%|          | 250/30000 [01:19<2:35:28,  3.19it/s]

{'loss': 0.9591, 'grad_norm': 6.961455821990967, 'learning_rate': 2.5e-05, 'epoch': 0.03}


  1%|          | 260/30000 [01:22<2:35:29,  3.19it/s]

{'loss': 0.896, 'grad_norm': 5.12556266784668, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


  1%|          | 270/30000 [01:25<2:35:15,  3.19it/s]

{'loss': 0.9044, 'grad_norm': 5.60679292678833, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.03}


  1%|          | 280/30000 [01:28<2:35:06,  3.19it/s]

{'loss': 0.8674, 'grad_norm': 4.575209140777588, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.03}


  1%|          | 290/30000 [01:31<2:35:13,  3.19it/s]

{'loss': 0.896, 'grad_norm': 4.165699481964111, 'learning_rate': 2.9e-05, 'epoch': 0.03}


  1%|          | 300/30000 [01:34<2:35:12,  3.19it/s]

{'loss': 0.8635, 'grad_norm': 5.71096658706665, 'learning_rate': 3e-05, 'epoch': 0.03}


  1%|          | 310/30000 [01:37<2:34:58,  3.19it/s]

{'loss': 0.8234, 'grad_norm': 5.088802814483643, 'learning_rate': 3.1e-05, 'epoch': 0.03}


  1%|          | 320/30000 [01:40<2:34:51,  3.19it/s]

{'loss': 0.8429, 'grad_norm': 3.8426361083984375, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.03}


  1%|          | 330/30000 [01:44<2:34:51,  3.19it/s]

{'loss': 0.8349, 'grad_norm': 5.5127034187316895, 'learning_rate': 3.3e-05, 'epoch': 0.03}


  1%|          | 340/30000 [01:47<2:34:55,  3.19it/s]

{'loss': 0.818, 'grad_norm': 4.984922409057617, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.03}


  1%|          | 350/30000 [01:50<2:34:58,  3.19it/s]

{'loss': 0.7732, 'grad_norm': 4.0469489097595215, 'learning_rate': 3.5e-05, 'epoch': 0.04}


  1%|          | 360/30000 [01:53<2:34:47,  3.19it/s]

{'loss': 0.8079, 'grad_norm': 5.402649879455566, 'learning_rate': 3.6e-05, 'epoch': 0.04}


  1%|          | 370/30000 [01:56<2:34:49,  3.19it/s]

{'loss': 0.7709, 'grad_norm': 4.341023921966553, 'learning_rate': 3.7e-05, 'epoch': 0.04}


  1%|▏         | 380/30000 [01:59<2:34:38,  3.19it/s]

{'loss': 0.7495, 'grad_norm': 7.628941059112549, 'learning_rate': 3.8e-05, 'epoch': 0.04}


  1%|▏         | 390/30000 [02:02<2:34:43,  3.19it/s]

{'loss': 0.7699, 'grad_norm': 5.031327247619629, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.04}


  1%|▏         | 400/30000 [02:06<2:34:44,  3.19it/s]

{'loss': 0.7159, 'grad_norm': 3.387741804122925, 'learning_rate': 4e-05, 'epoch': 0.04}


  1%|▏         | 410/30000 [02:09<2:34:42,  3.19it/s]

{'loss': 0.6779, 'grad_norm': 4.853594779968262, 'learning_rate': 4.1e-05, 'epoch': 0.04}


  1%|▏         | 420/30000 [02:12<2:34:57,  3.18it/s]

{'loss': 0.6657, 'grad_norm': 3.757446765899658, 'learning_rate': 4.2e-05, 'epoch': 0.04}


  1%|▏         | 430/30000 [02:15<2:34:32,  3.19it/s]

{'loss': 0.6365, 'grad_norm': 4.688810348510742, 'learning_rate': 4.3e-05, 'epoch': 0.04}


  1%|▏         | 440/30000 [02:18<2:34:28,  3.19it/s]

{'loss': 0.6419, 'grad_norm': 5.567150592803955, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.04}


  2%|▏         | 450/30000 [02:21<2:34:23,  3.19it/s]

{'loss': 0.6126, 'grad_norm': 5.677238941192627, 'learning_rate': 4.5e-05, 'epoch': 0.04}


  2%|▏         | 460/30000 [02:24<2:35:02,  3.18it/s]

{'loss': 0.5867, 'grad_norm': 4.044055938720703, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.05}


  2%|▏         | 470/30000 [02:28<2:34:36,  3.18it/s]

{'loss': 0.5925, 'grad_norm': 3.986818790435791, 'learning_rate': 4.7e-05, 'epoch': 0.05}


  2%|▏         | 480/30000 [02:31<2:34:03,  3.19it/s]

{'loss': 0.578, 'grad_norm': 3.553083896636963, 'learning_rate': 4.8e-05, 'epoch': 0.05}


  2%|▏         | 490/30000 [02:34<2:34:13,  3.19it/s]

{'loss': 0.5769, 'grad_norm': 3.2285828590393066, 'learning_rate': 4.9e-05, 'epoch': 0.05}


  2%|▏         | 500/30000 [02:37<2:34:09,  3.19it/s]

{'loss': 0.5605, 'grad_norm': 3.86979603767395, 'learning_rate': 5e-05, 'epoch': 0.05}


  2%|▏         | 510/30000 [02:40<2:34:12,  3.19it/s]

{'loss': 0.545, 'grad_norm': 2.890911817550659, 'learning_rate': 4.998305084745763e-05, 'epoch': 0.05}


  2%|▏         | 520/30000 [02:43<2:34:11,  3.19it/s]

{'loss': 0.524, 'grad_norm': 2.608962059020996, 'learning_rate': 4.9966101694915254e-05, 'epoch': 0.05}


  2%|▏         | 530/30000 [02:46<2:33:56,  3.19it/s]

{'loss': 0.5429, 'grad_norm': 2.9013326168060303, 'learning_rate': 4.9949152542372884e-05, 'epoch': 0.05}


  2%|▏         | 540/30000 [02:50<2:34:06,  3.19it/s]

{'loss': 0.5106, 'grad_norm': 2.3382768630981445, 'learning_rate': 4.993220338983051e-05, 'epoch': 0.05}


  2%|▏         | 550/30000 [02:53<2:33:52,  3.19it/s]

{'loss': 0.5351, 'grad_norm': 3.322052478790283, 'learning_rate': 4.991525423728814e-05, 'epoch': 0.06}


  2%|▏         | 560/30000 [02:56<2:33:50,  3.19it/s]

{'loss': 0.5242, 'grad_norm': 2.7272088527679443, 'learning_rate': 4.9898305084745765e-05, 'epoch': 0.06}


  2%|▏         | 570/30000 [02:59<2:33:53,  3.19it/s]

{'loss': 0.492, 'grad_norm': 2.3481574058532715, 'learning_rate': 4.9881355932203394e-05, 'epoch': 0.06}


  2%|▏         | 580/30000 [03:02<2:33:24,  3.20it/s]

{'loss': 0.5036, 'grad_norm': 2.1494462490081787, 'learning_rate': 4.9864406779661024e-05, 'epoch': 0.06}


  2%|▏         | 590/30000 [03:05<2:33:25,  3.19it/s]

{'loss': 0.4672, 'grad_norm': 1.6624137163162231, 'learning_rate': 4.9847457627118646e-05, 'epoch': 0.06}


  2%|▏         | 600/30000 [03:08<2:33:18,  3.20it/s]

{'loss': 0.5004, 'grad_norm': 2.849114418029785, 'learning_rate': 4.9830508474576276e-05, 'epoch': 0.06}


  2%|▏         | 610/30000 [03:11<2:33:13,  3.20it/s]

{'loss': 0.5154, 'grad_norm': 2.2779464721679688, 'learning_rate': 4.98135593220339e-05, 'epoch': 0.06}


  2%|▏         | 620/30000 [03:15<2:33:42,  3.19it/s]

{'loss': 0.4936, 'grad_norm': 2.365666627883911, 'learning_rate': 4.979661016949153e-05, 'epoch': 0.06}


  2%|▏         | 630/30000 [03:18<2:33:56,  3.18it/s]

{'loss': 0.4837, 'grad_norm': 1.9477224349975586, 'learning_rate': 4.977966101694915e-05, 'epoch': 0.06}


  2%|▏         | 640/30000 [03:21<2:33:41,  3.18it/s]

{'loss': 0.4714, 'grad_norm': 2.2166361808776855, 'learning_rate': 4.976271186440678e-05, 'epoch': 0.06}


  2%|▏         | 650/30000 [03:24<2:33:46,  3.18it/s]

{'loss': 0.4945, 'grad_norm': 2.028423309326172, 'learning_rate': 4.974576271186441e-05, 'epoch': 0.07}


  2%|▏         | 660/30000 [03:27<2:33:40,  3.18it/s]

{'loss': 0.5048, 'grad_norm': 2.0006279945373535, 'learning_rate': 4.972881355932204e-05, 'epoch': 0.07}


  2%|▏         | 670/30000 [03:30<2:33:26,  3.19it/s]

{'loss': 0.4828, 'grad_norm': 1.4866183996200562, 'learning_rate': 4.971186440677966e-05, 'epoch': 0.07}


  2%|▏         | 680/30000 [03:33<2:33:26,  3.18it/s]

{'loss': 0.4881, 'grad_norm': 2.641690254211426, 'learning_rate': 4.969491525423729e-05, 'epoch': 0.07}


  2%|▏         | 690/30000 [03:37<2:33:23,  3.18it/s]

{'loss': 0.4643, 'grad_norm': 1.4234434366226196, 'learning_rate': 4.967796610169492e-05, 'epoch': 0.07}


  2%|▏         | 700/30000 [03:40<2:33:20,  3.18it/s]

{'loss': 0.4529, 'grad_norm': 1.3023124933242798, 'learning_rate': 4.966101694915254e-05, 'epoch': 0.07}


  2%|▏         | 710/30000 [03:43<2:33:33,  3.18it/s]

{'loss': 0.4778, 'grad_norm': 1.327633023262024, 'learning_rate': 4.964406779661017e-05, 'epoch': 0.07}


  2%|▏         | 720/30000 [03:46<2:33:17,  3.18it/s]

{'loss': 0.458, 'grad_norm': 1.4078112840652466, 'learning_rate': 4.96271186440678e-05, 'epoch': 0.07}


  2%|▏         | 730/30000 [03:49<2:33:02,  3.19it/s]

{'loss': 0.4526, 'grad_norm': 1.80861234664917, 'learning_rate': 4.961016949152543e-05, 'epoch': 0.07}


  2%|▏         | 740/30000 [03:52<2:33:05,  3.19it/s]

{'loss': 0.4654, 'grad_norm': 2.200946092605591, 'learning_rate': 4.959322033898305e-05, 'epoch': 0.07}


  2%|▎         | 750/30000 [03:55<2:33:10,  3.18it/s]

{'loss': 0.464, 'grad_norm': 1.5864311456680298, 'learning_rate': 4.957627118644068e-05, 'epoch': 0.07}


  3%|▎         | 760/30000 [03:59<2:32:53,  3.19it/s]

{'loss': 0.4495, 'grad_norm': 1.4120900630950928, 'learning_rate': 4.955932203389831e-05, 'epoch': 0.08}


  3%|▎         | 770/30000 [04:02<2:32:52,  3.19it/s]

{'loss': 0.4324, 'grad_norm': 1.1852022409439087, 'learning_rate': 4.9542372881355934e-05, 'epoch': 0.08}


  3%|▎         | 780/30000 [04:05<2:32:44,  3.19it/s]

{'loss': 0.4568, 'grad_norm': 1.4322680234909058, 'learning_rate': 4.952542372881356e-05, 'epoch': 0.08}


  3%|▎         | 790/30000 [04:08<2:32:43,  3.19it/s]

{'loss': 0.4448, 'grad_norm': 1.537676453590393, 'learning_rate': 4.950847457627119e-05, 'epoch': 0.08}


  3%|▎         | 800/30000 [04:11<2:32:42,  3.19it/s]

{'loss': 0.4291, 'grad_norm': 1.2230217456817627, 'learning_rate': 4.9491525423728815e-05, 'epoch': 0.08}


  3%|▎         | 810/30000 [04:14<2:32:27,  3.19it/s]

{'loss': 0.448, 'grad_norm': 1.5921324491500854, 'learning_rate': 4.9474576271186444e-05, 'epoch': 0.08}


  3%|▎         | 820/30000 [04:17<2:32:38,  3.19it/s]

{'loss': 0.4376, 'grad_norm': 1.411829948425293, 'learning_rate': 4.945762711864407e-05, 'epoch': 0.08}


  3%|▎         | 830/30000 [04:21<2:32:29,  3.19it/s]

{'loss': 0.4535, 'grad_norm': 2.086671829223633, 'learning_rate': 4.9440677966101696e-05, 'epoch': 0.08}


  3%|▎         | 840/30000 [04:24<2:32:17,  3.19it/s]

{'loss': 0.4479, 'grad_norm': 1.5522617101669312, 'learning_rate': 4.9423728813559326e-05, 'epoch': 0.08}


  3%|▎         | 850/30000 [04:27<2:32:23,  3.19it/s]

{'loss': 0.4308, 'grad_norm': 1.2813936471939087, 'learning_rate': 4.940677966101695e-05, 'epoch': 0.09}


  3%|▎         | 860/30000 [04:30<2:32:27,  3.19it/s]

{'loss': 0.4428, 'grad_norm': 1.4738385677337646, 'learning_rate': 4.938983050847458e-05, 'epoch': 0.09}


  3%|▎         | 870/30000 [04:33<2:32:34,  3.18it/s]

{'loss': 0.4494, 'grad_norm': 1.6380295753479004, 'learning_rate': 4.937288135593221e-05, 'epoch': 0.09}


  3%|▎         | 880/30000 [04:36<2:32:30,  3.18it/s]

{'loss': 0.4368, 'grad_norm': 1.3539868593215942, 'learning_rate': 4.935593220338983e-05, 'epoch': 0.09}


  3%|▎         | 890/30000 [04:39<2:32:24,  3.18it/s]

{'loss': 0.4312, 'grad_norm': 1.244022250175476, 'learning_rate': 4.933898305084746e-05, 'epoch': 0.09}


  3%|▎         | 900/30000 [04:43<2:32:22,  3.18it/s]

{'loss': 0.4149, 'grad_norm': 1.1598868370056152, 'learning_rate': 4.932203389830509e-05, 'epoch': 0.09}


  3%|▎         | 910/30000 [04:46<2:32:06,  3.19it/s]

{'loss': 0.4266, 'grad_norm': 1.4250339269638062, 'learning_rate': 4.930508474576271e-05, 'epoch': 0.09}


  3%|▎         | 920/30000 [04:49<2:31:52,  3.19it/s]

{'loss': 0.4262, 'grad_norm': 1.4850281476974487, 'learning_rate': 4.928813559322034e-05, 'epoch': 0.09}


  3%|▎         | 930/30000 [04:52<2:32:12,  3.18it/s]

{'loss': 0.427, 'grad_norm': 1.075903296470642, 'learning_rate': 4.927118644067797e-05, 'epoch': 0.09}


  3%|▎         | 940/30000 [04:55<2:31:38,  3.19it/s]

{'loss': 0.4208, 'grad_norm': 1.2169100046157837, 'learning_rate': 4.92542372881356e-05, 'epoch': 0.09}


  3%|▎         | 950/30000 [04:58<2:31:41,  3.19it/s]

{'loss': 0.4161, 'grad_norm': 1.1424704790115356, 'learning_rate': 4.923728813559322e-05, 'epoch': 0.1}


  3%|▎         | 960/30000 [05:01<2:31:39,  3.19it/s]

{'loss': 0.4335, 'grad_norm': 0.9873906970024109, 'learning_rate': 4.922033898305085e-05, 'epoch': 0.1}


  3%|▎         | 970/30000 [05:05<2:31:29,  3.19it/s]

{'loss': 0.4188, 'grad_norm': 1.5415222644805908, 'learning_rate': 4.920338983050848e-05, 'epoch': 0.1}


  3%|▎         | 980/30000 [05:08<2:31:35,  3.19it/s]

{'loss': 0.4239, 'grad_norm': 1.017797589302063, 'learning_rate': 4.91864406779661e-05, 'epoch': 0.1}


  3%|▎         | 990/30000 [05:11<2:32:09,  3.18it/s]

{'loss': 0.425, 'grad_norm': 1.1166071891784668, 'learning_rate': 4.916949152542373e-05, 'epoch': 0.1}


  3%|▎         | 1000/30000 [05:14<2:31:29,  3.19it/s]

{'loss': 0.4219, 'grad_norm': 1.119576096534729, 'learning_rate': 4.915254237288136e-05, 'epoch': 0.1}


  3%|▎         | 1010/30000 [05:17<2:31:26,  3.19it/s]

{'loss': 0.4211, 'grad_norm': 1.184313178062439, 'learning_rate': 4.913559322033899e-05, 'epoch': 0.1}


  3%|▎         | 1020/30000 [05:20<2:31:19,  3.19it/s]

{'loss': 0.4068, 'grad_norm': 1.0770071744918823, 'learning_rate': 4.9118644067796607e-05, 'epoch': 0.1}


  3%|▎         | 1030/30000 [05:23<2:31:12,  3.19it/s]

{'loss': 0.4062, 'grad_norm': 1.642909288406372, 'learning_rate': 4.9101694915254236e-05, 'epoch': 0.1}


  3%|▎         | 1040/30000 [05:26<2:31:09,  3.19it/s]

{'loss': 0.4039, 'grad_norm': 1.0051499605178833, 'learning_rate': 4.9084745762711865e-05, 'epoch': 0.1}


  4%|▎         | 1050/30000 [05:30<2:31:26,  3.19it/s]

{'loss': 0.4138, 'grad_norm': 1.1768585443496704, 'learning_rate': 4.9067796610169495e-05, 'epoch': 0.1}


  4%|▎         | 1060/30000 [05:33<2:31:17,  3.19it/s]

{'loss': 0.3999, 'grad_norm': 1.1262483596801758, 'learning_rate': 4.905084745762712e-05, 'epoch': 0.11}


  4%|▎         | 1070/30000 [05:36<2:31:41,  3.18it/s]

{'loss': 0.4085, 'grad_norm': 0.9252805113792419, 'learning_rate': 4.9033898305084746e-05, 'epoch': 0.11}


  4%|▎         | 1080/30000 [05:39<2:31:00,  3.19it/s]

{'loss': 0.3899, 'grad_norm': 1.1620256900787354, 'learning_rate': 4.9016949152542376e-05, 'epoch': 0.11}


  4%|▎         | 1090/30000 [05:42<2:31:02,  3.19it/s]

{'loss': 0.3943, 'grad_norm': 0.8694259524345398, 'learning_rate': 4.9e-05, 'epoch': 0.11}


  4%|▎         | 1100/30000 [05:45<2:31:06,  3.19it/s]

{'loss': 0.4083, 'grad_norm': 1.0833278894424438, 'learning_rate': 4.898305084745763e-05, 'epoch': 0.11}


  4%|▎         | 1110/30000 [05:48<2:30:53,  3.19it/s]

{'loss': 0.3973, 'grad_norm': 1.2866069078445435, 'learning_rate': 4.896610169491526e-05, 'epoch': 0.11}


  4%|▎         | 1120/30000 [05:52<2:30:55,  3.19it/s]

{'loss': 0.3873, 'grad_norm': 0.929372251033783, 'learning_rate': 4.8949152542372886e-05, 'epoch': 0.11}


  4%|▍         | 1130/30000 [05:55<2:30:46,  3.19it/s]

{'loss': 0.3939, 'grad_norm': 0.913616418838501, 'learning_rate': 4.893220338983051e-05, 'epoch': 0.11}


  4%|▍         | 1140/30000 [05:58<2:31:12,  3.18it/s]

{'loss': 0.4395, 'grad_norm': 1.8211619853973389, 'learning_rate': 4.891525423728814e-05, 'epoch': 0.11}


  4%|▍         | 1150/30000 [06:01<2:31:04,  3.18it/s]

{'loss': 0.4176, 'grad_norm': 0.8932573199272156, 'learning_rate': 4.889830508474577e-05, 'epoch': 0.12}


  4%|▍         | 1160/30000 [06:04<2:30:30,  3.19it/s]

{'loss': 0.3932, 'grad_norm': 0.9455888271331787, 'learning_rate': 4.888135593220339e-05, 'epoch': 0.12}


  4%|▍         | 1170/30000 [06:07<2:30:27,  3.19it/s]

{'loss': 0.4075, 'grad_norm': 1.5138254165649414, 'learning_rate': 4.886440677966102e-05, 'epoch': 0.12}


  4%|▍         | 1180/30000 [06:10<2:30:40,  3.19it/s]

{'loss': 0.3823, 'grad_norm': 1.0327067375183105, 'learning_rate': 4.884745762711865e-05, 'epoch': 0.12}


  4%|▍         | 1190/30000 [06:14<2:29:58,  3.20it/s]

{'loss': 0.3981, 'grad_norm': 1.032094955444336, 'learning_rate': 4.883050847457628e-05, 'epoch': 0.12}


  4%|▍         | 1200/30000 [06:45<8:07:51,  1.02s/it] 

{'loss': 0.3766, 'grad_norm': 1.017377495765686, 'learning_rate': 4.88135593220339e-05, 'epoch': 0.12}


  4%|▍         | 1210/30000 [06:48<2:36:20,  3.07it/s]

{'loss': 0.3912, 'grad_norm': 1.527477502822876, 'learning_rate': 4.879661016949153e-05, 'epoch': 0.12}


  4%|▍         | 1220/30000 [06:53<5:36:37,  1.42it/s]

{'loss': 0.4192, 'grad_norm': 1.1216864585876465, 'learning_rate': 4.877966101694916e-05, 'epoch': 0.12}


  4%|▍         | 1230/30000 [06:57<2:31:41,  3.16it/s]

{'loss': 0.3723, 'grad_norm': 0.9763370156288147, 'learning_rate': 4.876271186440678e-05, 'epoch': 0.12}


  4%|▍         | 1240/30000 [07:00<2:26:37,  3.27it/s]

{'loss': 0.3956, 'grad_norm': 1.462831735610962, 'learning_rate': 4.8745762711864405e-05, 'epoch': 0.12}


  4%|▍         | 1250/30000 [07:03<2:26:23,  3.27it/s]

{'loss': 0.3813, 'grad_norm': 1.1866285800933838, 'learning_rate': 4.8728813559322034e-05, 'epoch': 0.12}


  4%|▍         | 1260/30000 [07:06<2:26:22,  3.27it/s]

{'loss': 0.3921, 'grad_norm': 1.1933984756469727, 'learning_rate': 4.8711864406779663e-05, 'epoch': 0.13}


  4%|▍         | 1270/30000 [07:09<2:26:21,  3.27it/s]

{'loss': 0.4022, 'grad_norm': 1.6609269380569458, 'learning_rate': 4.8694915254237286e-05, 'epoch': 0.13}


  4%|▍         | 1280/30000 [07:12<2:26:17,  3.27it/s]

{'loss': 0.3699, 'grad_norm': 1.0440477132797241, 'learning_rate': 4.8677966101694915e-05, 'epoch': 0.13}


  4%|▍         | 1290/30000 [07:15<2:26:12,  3.27it/s]

{'loss': 0.3678, 'grad_norm': 0.7969001531600952, 'learning_rate': 4.8661016949152545e-05, 'epoch': 0.13}


  4%|▍         | 1300/30000 [07:18<2:26:26,  3.27it/s]

{'loss': 0.3748, 'grad_norm': 1.1296801567077637, 'learning_rate': 4.8644067796610174e-05, 'epoch': 0.13}


  4%|▍         | 1310/30000 [07:21<2:26:07,  3.27it/s]

{'loss': 0.3697, 'grad_norm': 0.8819137215614319, 'learning_rate': 4.86271186440678e-05, 'epoch': 0.13}


  4%|▍         | 1320/30000 [07:24<2:26:02,  3.27it/s]

{'loss': 0.3848, 'grad_norm': 0.8544536232948303, 'learning_rate': 4.8610169491525426e-05, 'epoch': 0.13}


  4%|▍         | 1330/30000 [07:27<2:26:03,  3.27it/s]

{'loss': 0.3817, 'grad_norm': 0.7795281410217285, 'learning_rate': 4.8593220338983055e-05, 'epoch': 0.13}


  4%|▍         | 1340/30000 [07:30<2:26:00,  3.27it/s]

{'loss': 0.3794, 'grad_norm': 0.7683534622192383, 'learning_rate': 4.857627118644068e-05, 'epoch': 0.13}


  4%|▍         | 1350/30000 [11:41<36:29:12,  4.58s/it] 

{'loss': 0.3787, 'grad_norm': 1.284743309020996, 'learning_rate': 4.855932203389831e-05, 'epoch': 0.14}


  5%|▍         | 1360/30000 [11:44<3:23:21,  2.35it/s] 

{'loss': 0.3935, 'grad_norm': 0.904860258102417, 'learning_rate': 4.8542372881355937e-05, 'epoch': 0.14}


  5%|▍         | 1370/30000 [11:47<2:27:17,  3.24it/s]

{'loss': 0.3852, 'grad_norm': 1.125594973564148, 'learning_rate': 4.8525423728813566e-05, 'epoch': 0.14}


  5%|▍         | 1380/30000 [11:50<2:25:55,  3.27it/s]

{'loss': 0.3742, 'grad_norm': 1.0734056234359741, 'learning_rate': 4.850847457627119e-05, 'epoch': 0.14}


  5%|▍         | 1390/30000 [11:53<2:25:46,  3.27it/s]

{'loss': 0.3661, 'grad_norm': 0.8937897682189941, 'learning_rate': 4.849152542372882e-05, 'epoch': 0.14}


  5%|▍         | 1400/30000 [11:56<2:25:41,  3.27it/s]

{'loss': 0.3995, 'grad_norm': 1.0186976194381714, 'learning_rate': 4.847457627118645e-05, 'epoch': 0.14}


  5%|▍         | 1410/30000 [11:59<2:25:35,  3.27it/s]

{'loss': 0.3959, 'grad_norm': 1.420222520828247, 'learning_rate': 4.845762711864407e-05, 'epoch': 0.14}


  5%|▍         | 1420/30000 [12:02<2:25:32,  3.27it/s]

{'loss': 0.3612, 'grad_norm': 1.4895142316818237, 'learning_rate': 4.84406779661017e-05, 'epoch': 0.14}


  5%|▍         | 1430/30000 [12:05<2:25:32,  3.27it/s]

{'loss': 0.3689, 'grad_norm': 1.2722821235656738, 'learning_rate': 4.842372881355933e-05, 'epoch': 0.14}


  5%|▍         | 1440/30000 [12:08<2:25:22,  3.27it/s]

{'loss': 0.3646, 'grad_norm': 1.3554208278656006, 'learning_rate': 4.840677966101695e-05, 'epoch': 0.14}


  5%|▍         | 1450/30000 [12:11<2:25:17,  3.28it/s]

{'loss': 0.366, 'grad_norm': 0.949082612991333, 'learning_rate': 4.8389830508474574e-05, 'epoch': 0.14}


  5%|▍         | 1460/30000 [12:14<2:25:23,  3.27it/s]

{'loss': 0.3705, 'grad_norm': 1.2990548610687256, 'learning_rate': 4.83728813559322e-05, 'epoch': 0.15}


  5%|▍         | 1470/30000 [12:17<2:25:17,  3.27it/s]

{'loss': 0.375, 'grad_norm': 1.1080161333084106, 'learning_rate': 4.835593220338983e-05, 'epoch': 0.15}


  5%|▍         | 1480/30000 [12:22<5:49:25,  1.36it/s]

{'loss': 0.3692, 'grad_norm': 1.0573596954345703, 'learning_rate': 4.833898305084746e-05, 'epoch': 0.15}


  5%|▍         | 1490/30000 [27:33<255:56:19, 32.32s/it]  

{'loss': 0.3593, 'grad_norm': 1.1285009384155273, 'learning_rate': 4.8322033898305084e-05, 'epoch': 0.15}


  5%|▌         | 1500/30000 [27:36<9:34:39,  1.21s/it]  

{'loss': 0.3536, 'grad_norm': 0.8771714568138123, 'learning_rate': 4.8305084745762714e-05, 'epoch': 0.15}


  5%|▌         | 1510/30000 [27:39<2:36:38,  3.03it/s]

{'loss': 0.374, 'grad_norm': 1.6024178266525269, 'learning_rate': 4.828813559322034e-05, 'epoch': 0.15}


  5%|▌         | 1520/30000 [27:42<2:24:48,  3.28it/s]

{'loss': 0.3496, 'grad_norm': 1.2373666763305664, 'learning_rate': 4.8271186440677966e-05, 'epoch': 0.15}


  5%|▌         | 1530/30000 [27:45<2:24:24,  3.29it/s]

{'loss': 0.3713, 'grad_norm': 1.1375706195831299, 'learning_rate': 4.8254237288135595e-05, 'epoch': 0.15}


  5%|▌         | 1540/30000 [27:48<2:24:37,  3.28it/s]

{'loss': 0.3451, 'grad_norm': 0.9731981158256531, 'learning_rate': 4.8237288135593224e-05, 'epoch': 0.15}


  5%|▌         | 1550/30000 [27:51<2:24:40,  3.28it/s]

{'loss': 0.3662, 'grad_norm': 1.0004758834838867, 'learning_rate': 4.822033898305085e-05, 'epoch': 0.15}


  5%|▌         | 1560/30000 [27:55<2:25:16,  3.26it/s]

{'loss': 0.3549, 'grad_norm': 0.7973238229751587, 'learning_rate': 4.8203389830508476e-05, 'epoch': 0.16}


  5%|▌         | 1570/30000 [27:58<2:24:48,  3.27it/s]

{'loss': 0.3516, 'grad_norm': 1.2529547214508057, 'learning_rate': 4.8186440677966105e-05, 'epoch': 0.16}


  5%|▌         | 1580/30000 [28:01<2:24:37,  3.28it/s]

{'loss': 0.3309, 'grad_norm': 0.9086198210716248, 'learning_rate': 4.8169491525423735e-05, 'epoch': 0.16}


  5%|▌         | 1590/30000 [28:04<2:24:40,  3.27it/s]

{'loss': 0.3428, 'grad_norm': 0.9944342374801636, 'learning_rate': 4.815254237288136e-05, 'epoch': 0.16}


  5%|▌         | 1600/30000 [28:07<2:24:27,  3.28it/s]

{'loss': 0.3794, 'grad_norm': 1.0131906270980835, 'learning_rate': 4.813559322033899e-05, 'epoch': 0.16}


  5%|▌         | 1610/30000 [28:10<2:24:44,  3.27it/s]

{'loss': 0.3603, 'grad_norm': 0.9713866710662842, 'learning_rate': 4.8118644067796616e-05, 'epoch': 0.16}


  5%|▌         | 1620/30000 [28:13<2:24:37,  3.27it/s]

{'loss': 0.3614, 'grad_norm': 0.9816338419914246, 'learning_rate': 4.810169491525424e-05, 'epoch': 0.16}


  5%|▌         | 1630/30000 [28:16<2:24:20,  3.28it/s]

{'loss': 0.3672, 'grad_norm': 1.2262446880340576, 'learning_rate': 4.808474576271187e-05, 'epoch': 0.16}


  5%|▌         | 1640/30000 [28:19<2:24:17,  3.28it/s]

{'loss': 0.3705, 'grad_norm': 1.0026859045028687, 'learning_rate': 4.80677966101695e-05, 'epoch': 0.16}


  6%|▌         | 1650/30000 [28:22<2:24:18,  3.27it/s]

{'loss': 0.3718, 'grad_norm': 0.725653350353241, 'learning_rate': 4.805084745762712e-05, 'epoch': 0.17}


  6%|▌         | 1660/30000 [28:25<2:24:10,  3.28it/s]

{'loss': 0.3715, 'grad_norm': 0.728033185005188, 'learning_rate': 4.803389830508474e-05, 'epoch': 0.17}


  6%|▌         | 1670/30000 [28:28<2:24:07,  3.28it/s]

{'loss': 0.3475, 'grad_norm': 1.0787620544433594, 'learning_rate': 4.801694915254237e-05, 'epoch': 0.17}


  6%|▌         | 1680/30000 [28:31<2:24:09,  3.27it/s]

{'loss': 0.3565, 'grad_norm': 1.1931790113449097, 'learning_rate': 4.8e-05, 'epoch': 0.17}


  6%|▌         | 1690/30000 [28:34<2:23:52,  3.28it/s]

{'loss': 0.3683, 'grad_norm': 1.2433775663375854, 'learning_rate': 4.798305084745763e-05, 'epoch': 0.17}


  6%|▌         | 1700/30000 [28:37<2:23:55,  3.28it/s]

{'loss': 0.372, 'grad_norm': 1.0193594694137573, 'learning_rate': 4.796610169491525e-05, 'epoch': 0.17}


  6%|▌         | 1710/30000 [28:40<2:23:57,  3.28it/s]

{'loss': 0.3628, 'grad_norm': 0.9191087484359741, 'learning_rate': 4.794915254237288e-05, 'epoch': 0.17}


  6%|▌         | 1720/30000 [28:43<2:23:56,  3.27it/s]

{'loss': 0.3629, 'grad_norm': 0.859653651714325, 'learning_rate': 4.793220338983051e-05, 'epoch': 0.17}


  6%|▌         | 1730/30000 [28:46<2:23:55,  3.27it/s]

{'loss': 0.3521, 'grad_norm': 0.6321346759796143, 'learning_rate': 4.7915254237288134e-05, 'epoch': 0.17}


  6%|▌         | 1740/30000 [28:50<2:23:56,  3.27it/s]

{'loss': 0.3483, 'grad_norm': 0.8540596961975098, 'learning_rate': 4.7898305084745764e-05, 'epoch': 0.17}


  6%|▌         | 1750/30000 [28:53<2:23:39,  3.28it/s]

{'loss': 0.3338, 'grad_norm': 0.7294313311576843, 'learning_rate': 4.788135593220339e-05, 'epoch': 0.17}


  6%|▌         | 1760/30000 [28:56<2:23:48,  3.27it/s]

{'loss': 0.3454, 'grad_norm': 0.9103872179985046, 'learning_rate': 4.786440677966102e-05, 'epoch': 0.18}


  6%|▌         | 1770/30000 [28:59<2:23:54,  3.27it/s]

{'loss': 0.3746, 'grad_norm': 0.9627324342727661, 'learning_rate': 4.7847457627118645e-05, 'epoch': 0.18}


  6%|▌         | 1780/30000 [29:02<2:23:35,  3.28it/s]

{'loss': 0.3383, 'grad_norm': 0.8279797434806824, 'learning_rate': 4.7830508474576274e-05, 'epoch': 0.18}


  6%|▌         | 1790/30000 [29:05<2:24:09,  3.26it/s]

{'loss': 0.3464, 'grad_norm': 1.0267833471298218, 'learning_rate': 4.7813559322033904e-05, 'epoch': 0.18}


  6%|▌         | 1800/30000 [29:08<2:24:27,  3.25it/s]

{'loss': 0.3429, 'grad_norm': 0.9191983938217163, 'learning_rate': 4.7796610169491526e-05, 'epoch': 0.18}


  6%|▌         | 1810/30000 [29:11<2:23:59,  3.26it/s]

{'loss': 0.3615, 'grad_norm': 0.8952113389968872, 'learning_rate': 4.7779661016949156e-05, 'epoch': 0.18}


  6%|▌         | 1820/30000 [29:14<2:24:02,  3.26it/s]

{'loss': 0.3626, 'grad_norm': 0.7497138381004333, 'learning_rate': 4.7762711864406785e-05, 'epoch': 0.18}


  6%|▌         | 1830/30000 [29:17<2:23:24,  3.27it/s]

{'loss': 0.3341, 'grad_norm': 0.6733661890029907, 'learning_rate': 4.7745762711864414e-05, 'epoch': 0.18}


  6%|▌         | 1840/30000 [29:20<2:23:28,  3.27it/s]

{'loss': 0.3378, 'grad_norm': 0.8539861440658569, 'learning_rate': 4.772881355932204e-05, 'epoch': 0.18}


  6%|▌         | 1850/30000 [29:23<2:23:20,  3.27it/s]

{'loss': 0.3452, 'grad_norm': 1.1258652210235596, 'learning_rate': 4.7711864406779666e-05, 'epoch': 0.18}


  6%|▌         | 1860/30000 [29:26<2:23:20,  3.27it/s]

{'loss': 0.3607, 'grad_norm': 0.8301769495010376, 'learning_rate': 4.769491525423729e-05, 'epoch': 0.19}


  6%|▌         | 1870/30000 [29:30<2:39:15,  2.94it/s]

{'loss': 0.3581, 'grad_norm': 0.8929471373558044, 'learning_rate': 4.767796610169492e-05, 'epoch': 0.19}


  6%|▋         | 1880/30000 [29:33<2:32:34,  3.07it/s]

{'loss': 0.3355, 'grad_norm': 1.0931376218795776, 'learning_rate': 4.766101694915254e-05, 'epoch': 0.19}


  6%|▋         | 1890/30000 [29:36<2:33:39,  3.05it/s]

{'loss': 0.3518, 'grad_norm': 1.031983733177185, 'learning_rate': 4.764406779661017e-05, 'epoch': 0.19}


  6%|▋         | 1900/30000 [29:43<6:39:25,  1.17it/s]

{'loss': 0.3703, 'grad_norm': 1.2984211444854736, 'learning_rate': 4.76271186440678e-05, 'epoch': 0.19}


  6%|▋         | 1910/30000 [39:28<1354:07:17, 173.54s/it]

{'loss': 0.3312, 'grad_norm': 0.862895667552948, 'learning_rate': 4.761016949152542e-05, 'epoch': 0.19}


  6%|▋         | 1920/30000 [39:31<40:35:03,  5.20s/it]   

{'loss': 0.3335, 'grad_norm': 0.8367176651954651, 'learning_rate': 4.759322033898305e-05, 'epoch': 0.19}


  6%|▋         | 1930/30000 [39:34<3:27:07,  2.26it/s] 

{'loss': 0.3343, 'grad_norm': 1.074446201324463, 'learning_rate': 4.757627118644068e-05, 'epoch': 0.19}


  6%|▋         | 1940/30000 [39:37<2:26:13,  3.20it/s]

{'loss': 0.3372, 'grad_norm': 1.0143601894378662, 'learning_rate': 4.755932203389831e-05, 'epoch': 0.19}


  6%|▋         | 1950/30000 [39:40<2:22:39,  3.28it/s]

{'loss': 0.3423, 'grad_norm': 0.8144118785858154, 'learning_rate': 4.754237288135593e-05, 'epoch': 0.2}


  7%|▋         | 1960/30000 [39:43<2:22:18,  3.28it/s]

{'loss': 0.3295, 'grad_norm': 0.8849963545799255, 'learning_rate': 4.752542372881356e-05, 'epoch': 0.2}


  7%|▋         | 1970/30000 [39:46<2:22:21,  3.28it/s]

{'loss': 0.339, 'grad_norm': 1.1007435321807861, 'learning_rate': 4.750847457627119e-05, 'epoch': 0.2}


  7%|▋         | 1980/30000 [39:49<2:22:23,  3.28it/s]

{'loss': 0.3278, 'grad_norm': 0.7637067437171936, 'learning_rate': 4.7491525423728814e-05, 'epoch': 0.2}


  7%|▋         | 1990/30000 [39:52<2:22:18,  3.28it/s]

{'loss': 0.3313, 'grad_norm': 0.864208459854126, 'learning_rate': 4.747457627118644e-05, 'epoch': 0.2}


  7%|▋         | 2000/30000 [39:55<2:22:29,  3.27it/s]

{'loss': 0.3305, 'grad_norm': 1.0391666889190674, 'learning_rate': 4.745762711864407e-05, 'epoch': 0.2}


  7%|▋         | 2010/30000 [39:58<2:22:55,  3.26it/s]

{'loss': 0.34, 'grad_norm': 0.9002070426940918, 'learning_rate': 4.74406779661017e-05, 'epoch': 0.2}


  7%|▋         | 2020/30000 [40:01<2:22:28,  3.27it/s]

{'loss': 0.3253, 'grad_norm': 1.0309134721755981, 'learning_rate': 4.7423728813559325e-05, 'epoch': 0.2}


  7%|▋         | 2030/30000 [40:04<2:22:21,  3.27it/s]

{'loss': 0.3345, 'grad_norm': 0.8539524674415588, 'learning_rate': 4.7406779661016954e-05, 'epoch': 0.2}


  7%|▋         | 2040/30000 [40:07<2:22:23,  3.27it/s]

{'loss': 0.3462, 'grad_norm': 1.1172009706497192, 'learning_rate': 4.738983050847458e-05, 'epoch': 0.2}


  7%|▋         | 2050/30000 [40:10<2:22:19,  3.27it/s]

{'loss': 0.3313, 'grad_norm': 0.9664563536643982, 'learning_rate': 4.7372881355932206e-05, 'epoch': 0.2}


  7%|▋         | 2060/30000 [40:13<2:22:26,  3.27it/s]

{'loss': 0.3405, 'grad_norm': 0.9912631511688232, 'learning_rate': 4.735593220338983e-05, 'epoch': 0.21}


  7%|▋         | 2070/30000 [54:03<160:49:47, 20.73s/it]  

{'loss': 0.344, 'grad_norm': 0.8318184614181519, 'learning_rate': 4.733898305084746e-05, 'epoch': 0.21}


  7%|▋         | 2080/30000 [54:06<6:50:14,  1.13it/s]  

{'loss': 0.3358, 'grad_norm': 1.0056339502334595, 'learning_rate': 4.732203389830509e-05, 'epoch': 0.21}


  7%|▋         | 2090/30000 [54:09<2:29:14,  3.12it/s]

{'loss': 0.3293, 'grad_norm': 0.8200686573982239, 'learning_rate': 4.730508474576271e-05, 'epoch': 0.21}


  7%|▋         | 2100/30000 [54:12<2:21:42,  3.28it/s]

{'loss': 0.319, 'grad_norm': 0.8832597136497498, 'learning_rate': 4.728813559322034e-05, 'epoch': 0.21}


  7%|▋         | 2110/30000 [54:15<2:21:45,  3.28it/s]

{'loss': 0.3411, 'grad_norm': 1.276639699935913, 'learning_rate': 4.727118644067797e-05, 'epoch': 0.21}


  7%|▋         | 2120/30000 [54:19<2:21:41,  3.28it/s]

{'loss': 0.3137, 'grad_norm': 0.7005897760391235, 'learning_rate': 4.72542372881356e-05, 'epoch': 0.21}


  7%|▋         | 2130/30000 [54:22<2:21:45,  3.28it/s]

{'loss': 0.3122, 'grad_norm': 0.8296260237693787, 'learning_rate': 4.723728813559322e-05, 'epoch': 0.21}


  7%|▋         | 2140/30000 [54:25<2:21:40,  3.28it/s]

{'loss': 0.3397, 'grad_norm': 0.8379347324371338, 'learning_rate': 4.722033898305085e-05, 'epoch': 0.21}


  7%|▋         | 2150/30000 [54:28<2:21:43,  3.28it/s]

{'loss': 0.3195, 'grad_norm': 0.8692143559455872, 'learning_rate': 4.720338983050848e-05, 'epoch': 0.21}


  7%|▋         | 2160/30000 [54:31<2:21:39,  3.28it/s]

{'loss': 0.3307, 'grad_norm': 0.8694899678230286, 'learning_rate': 4.71864406779661e-05, 'epoch': 0.22}


  7%|▋         | 2170/30000 [54:34<2:21:34,  3.28it/s]

{'loss': 0.3653, 'grad_norm': 0.8341266512870789, 'learning_rate': 4.716949152542373e-05, 'epoch': 0.22}


  7%|▋         | 2180/30000 [54:37<2:21:18,  3.28it/s]

{'loss': 0.3419, 'grad_norm': 0.8273482918739319, 'learning_rate': 4.715254237288136e-05, 'epoch': 0.22}


  7%|▋         | 2190/30000 [54:40<2:21:18,  3.28it/s]

{'loss': 0.3378, 'grad_norm': 1.1250839233398438, 'learning_rate': 4.713559322033898e-05, 'epoch': 0.22}


  7%|▋         | 2200/30000 [54:43<2:21:08,  3.28it/s]

{'loss': 0.3162, 'grad_norm': 0.7439759373664856, 'learning_rate': 4.711864406779661e-05, 'epoch': 0.22}


  7%|▋         | 2210/30000 [54:46<2:21:13,  3.28it/s]

{'loss': 0.3341, 'grad_norm': 1.4254416227340698, 'learning_rate': 4.710169491525424e-05, 'epoch': 0.22}


  7%|▋         | 2220/30000 [54:49<2:21:02,  3.28it/s]

{'loss': 0.3274, 'grad_norm': 1.08259916305542, 'learning_rate': 4.708474576271187e-05, 'epoch': 0.22}


  7%|▋         | 2230/30000 [54:52<2:21:31,  3.27it/s]

{'loss': 0.3257, 'grad_norm': 0.8658353686332703, 'learning_rate': 4.7067796610169493e-05, 'epoch': 0.22}


  7%|▋         | 2240/30000 [1:11:00<92:25:07, 11.99s/it]   

{'loss': 0.3217, 'grad_norm': 0.9694400429725647, 'learning_rate': 4.705084745762712e-05, 'epoch': 0.22}


  8%|▊         | 2250/30000 [1:11:03<4:53:19,  1.58it/s] 

{'loss': 0.3247, 'grad_norm': 0.860080361366272, 'learning_rate': 4.703389830508475e-05, 'epoch': 0.23}


  8%|▊         | 2260/30000 [1:11:06<2:25:05,  3.19it/s]

{'loss': 0.3279, 'grad_norm': 0.9528919458389282, 'learning_rate': 4.7016949152542375e-05, 'epoch': 0.23}


  8%|▊         | 2270/30000 [1:11:09<2:20:45,  3.28it/s]

{'loss': 0.3506, 'grad_norm': 0.9484975337982178, 'learning_rate': 4.7e-05, 'epoch': 0.23}


  8%|▊         | 2280/30000 [1:11:12<2:20:37,  3.29it/s]

{'loss': 0.3693, 'grad_norm': 1.2718393802642822, 'learning_rate': 4.6983050847457627e-05, 'epoch': 0.23}


  8%|▊         | 2290/30000 [1:11:15<2:20:37,  3.28it/s]

{'loss': 0.3438, 'grad_norm': 0.8496630191802979, 'learning_rate': 4.6966101694915256e-05, 'epoch': 0.23}


  8%|▊         | 2300/30000 [1:11:18<2:20:32,  3.28it/s]

{'loss': 0.3473, 'grad_norm': 0.7024959325790405, 'learning_rate': 4.694915254237288e-05, 'epoch': 0.23}


  8%|▊         | 2310/30000 [1:11:21<2:21:08,  3.27it/s]

{'loss': 0.3153, 'grad_norm': 1.0435595512390137, 'learning_rate': 4.693220338983051e-05, 'epoch': 0.23}


  8%|▊         | 2320/30000 [1:11:24<2:20:40,  3.28it/s]

{'loss': 0.335, 'grad_norm': 0.8874940276145935, 'learning_rate': 4.691525423728814e-05, 'epoch': 0.23}


  8%|▊         | 2330/30000 [1:11:27<2:20:42,  3.28it/s]

{'loss': 0.3247, 'grad_norm': 0.6540865898132324, 'learning_rate': 4.6898305084745767e-05, 'epoch': 0.23}


  8%|▊         | 2340/30000 [1:11:31<2:20:48,  3.27it/s]

{'loss': 0.3071, 'grad_norm': 0.9318116903305054, 'learning_rate': 4.688135593220339e-05, 'epoch': 0.23}


  8%|▊         | 2350/30000 [1:11:34<2:20:46,  3.27it/s]

{'loss': 0.3305, 'grad_norm': 0.7057228684425354, 'learning_rate': 4.686440677966102e-05, 'epoch': 0.23}


  8%|▊         | 2360/30000 [1:11:37<2:20:45,  3.27it/s]

{'loss': 0.3151, 'grad_norm': 0.6385005712509155, 'learning_rate': 4.684745762711865e-05, 'epoch': 0.24}


  8%|▊         | 2370/30000 [1:11:40<2:20:43,  3.27it/s]

{'loss': 0.3418, 'grad_norm': 1.2314938306808472, 'learning_rate': 4.683050847457627e-05, 'epoch': 0.24}


  8%|▊         | 2380/30000 [1:11:43<2:20:40,  3.27it/s]

{'loss': 0.3318, 'grad_norm': 0.7554792165756226, 'learning_rate': 4.68135593220339e-05, 'epoch': 0.24}


  8%|▊         | 2390/30000 [1:26:50<247:08:06, 32.22s/it]  

{'loss': 0.3358, 'grad_norm': 0.8206941485404968, 'learning_rate': 4.679661016949153e-05, 'epoch': 0.24}


  8%|▊         | 2400/30000 [1:26:53<9:14:29,  1.21s/it]  

{'loss': 0.3375, 'grad_norm': 0.848188579082489, 'learning_rate': 4.677966101694916e-05, 'epoch': 0.24}


  8%|▊         | 2410/30000 [1:26:56<2:31:23,  3.04it/s]

{'loss': 0.3106, 'grad_norm': 0.6892242431640625, 'learning_rate': 4.676271186440678e-05, 'epoch': 0.24}


  8%|▊         | 2420/30000 [1:26:59<2:19:49,  3.29it/s]

{'loss': 0.3194, 'grad_norm': 0.6341240406036377, 'learning_rate': 4.674576271186441e-05, 'epoch': 0.24}


  8%|▊         | 2430/30000 [1:27:02<2:19:56,  3.28it/s]

{'loss': 0.315, 'grad_norm': 0.7995321750640869, 'learning_rate': 4.672881355932204e-05, 'epoch': 0.24}


  8%|▊         | 2440/30000 [1:27:05<2:19:28,  3.29it/s]

{'loss': 0.32, 'grad_norm': 0.8831521272659302, 'learning_rate': 4.671186440677966e-05, 'epoch': 0.24}


  8%|▊         | 2450/30000 [1:27:08<2:19:32,  3.29it/s]

{'loss': 0.338, 'grad_norm': 0.8115801215171814, 'learning_rate': 4.669491525423729e-05, 'epoch': 0.24}


  8%|▊         | 2460/30000 [1:27:11<2:19:43,  3.29it/s]

{'loss': 0.347, 'grad_norm': 0.8839223980903625, 'learning_rate': 4.667796610169492e-05, 'epoch': 0.25}


  8%|▊         | 2470/30000 [1:27:14<2:19:39,  3.29it/s]

{'loss': 0.3206, 'grad_norm': 0.6719303131103516, 'learning_rate': 4.666101694915255e-05, 'epoch': 0.25}


  8%|▊         | 2480/30000 [1:27:18<2:19:38,  3.28it/s]

{'loss': 0.3196, 'grad_norm': 0.6893274188041687, 'learning_rate': 4.6644067796610166e-05, 'epoch': 0.25}


  8%|▊         | 2490/30000 [1:27:21<2:19:35,  3.28it/s]

{'loss': 0.3288, 'grad_norm': 0.8352471590042114, 'learning_rate': 4.6627118644067795e-05, 'epoch': 0.25}


  8%|▊         | 2500/30000 [1:27:24<2:19:45,  3.28it/s]

{'loss': 0.3194, 'grad_norm': 0.6209055781364441, 'learning_rate': 4.6610169491525425e-05, 'epoch': 0.25}


  8%|▊         | 2510/30000 [1:27:27<2:20:01,  3.27it/s]

{'loss': 0.3163, 'grad_norm': 0.7975791096687317, 'learning_rate': 4.6593220338983054e-05, 'epoch': 0.25}


  8%|▊         | 2520/30000 [1:27:30<2:19:59,  3.27it/s]

{'loss': 0.3198, 'grad_norm': 0.7364035248756409, 'learning_rate': 4.657627118644068e-05, 'epoch': 0.25}


  8%|▊         | 2530/30000 [1:27:33<2:19:53,  3.27it/s]

{'loss': 0.3089, 'grad_norm': 0.7989888787269592, 'learning_rate': 4.6559322033898306e-05, 'epoch': 0.25}


  8%|▊         | 2540/30000 [1:31:47<199:19:01, 26.13s/it]

{'loss': 0.3071, 'grad_norm': 0.6763617396354675, 'learning_rate': 4.6542372881355935e-05, 'epoch': 0.25}


  8%|▊         | 2550/30000 [1:31:50<7:53:00,  1.03s/it]  

{'loss': 0.32, 'grad_norm': 0.84530109167099, 'learning_rate': 4.652542372881356e-05, 'epoch': 0.26}


  9%|▊         | 2560/30000 [1:31:53<2:28:37,  3.08it/s]

{'loss': 0.3043, 'grad_norm': 0.6960268020629883, 'learning_rate': 4.650847457627119e-05, 'epoch': 0.26}


  9%|▊         | 2570/30000 [1:31:56<2:19:25,  3.28it/s]

{'loss': 0.3284, 'grad_norm': 0.72306227684021, 'learning_rate': 4.649152542372882e-05, 'epoch': 0.26}


  9%|▊         | 2580/30000 [1:31:59<2:19:06,  3.29it/s]

{'loss': 0.3245, 'grad_norm': 0.8130708932876587, 'learning_rate': 4.6474576271186446e-05, 'epoch': 0.26}


  9%|▊         | 2590/30000 [1:32:02<2:19:16,  3.28it/s]

{'loss': 0.3106, 'grad_norm': 0.6189382672309875, 'learning_rate': 4.645762711864407e-05, 'epoch': 0.26}


  9%|▊         | 2600/30000 [1:32:05<2:19:21,  3.28it/s]

{'loss': 0.316, 'grad_norm': 0.7201411724090576, 'learning_rate': 4.64406779661017e-05, 'epoch': 0.26}


  9%|▊         | 2610/30000 [1:32:08<2:20:54,  3.24it/s]

{'loss': 0.309, 'grad_norm': 0.8481175899505615, 'learning_rate': 4.642372881355933e-05, 'epoch': 0.26}


  9%|▊         | 2620/30000 [1:32:11<2:19:22,  3.27it/s]

{'loss': 0.3209, 'grad_norm': 0.7917857766151428, 'learning_rate': 4.640677966101695e-05, 'epoch': 0.26}


  9%|▉         | 2630/30000 [1:32:14<2:19:06,  3.28it/s]

{'loss': 0.3364, 'grad_norm': 0.8117488026618958, 'learning_rate': 4.638983050847458e-05, 'epoch': 0.26}


  9%|▉         | 2640/30000 [1:32:17<2:19:39,  3.27it/s]

{'loss': 0.3175, 'grad_norm': 0.9148954749107361, 'learning_rate': 4.637288135593221e-05, 'epoch': 0.26}


  9%|▉         | 2650/30000 [1:32:20<2:19:03,  3.28it/s]

{'loss': 0.3161, 'grad_norm': 0.7659308314323425, 'learning_rate': 4.635593220338984e-05, 'epoch': 0.27}


  9%|▉         | 2660/30000 [1:32:23<2:18:42,  3.29it/s]

{'loss': 0.3155, 'grad_norm': 0.6978754997253418, 'learning_rate': 4.633898305084746e-05, 'epoch': 0.27}


  9%|▉         | 2670/30000 [1:32:27<2:18:59,  3.28it/s]

{'loss': 0.3172, 'grad_norm': 0.6546132564544678, 'learning_rate': 4.632203389830509e-05, 'epoch': 0.27}


  9%|▉         | 2680/30000 [1:32:30<2:18:55,  3.28it/s]

{'loss': 0.308, 'grad_norm': 0.772638738155365, 'learning_rate': 4.630508474576272e-05, 'epoch': 0.27}


  9%|▉         | 2690/30000 [1:32:33<2:18:39,  3.28it/s]

{'loss': 0.3125, 'grad_norm': 0.6664716005325317, 'learning_rate': 4.628813559322034e-05, 'epoch': 0.27}


  9%|▉         | 2700/30000 [1:32:36<2:18:50,  3.28it/s]

{'loss': 0.3301, 'grad_norm': 0.9247205257415771, 'learning_rate': 4.6271186440677964e-05, 'epoch': 0.27}


  9%|▉         | 2710/30000 [1:32:39<2:18:39,  3.28it/s]

{'loss': 0.3082, 'grad_norm': 0.8732995986938477, 'learning_rate': 4.6254237288135594e-05, 'epoch': 0.27}


  9%|▉         | 2720/30000 [1:32:42<2:18:41,  3.28it/s]

{'loss': 0.331, 'grad_norm': 0.7244877219200134, 'learning_rate': 4.623728813559322e-05, 'epoch': 0.27}


  9%|▉         | 2730/30000 [1:32:45<2:18:46,  3.28it/s]

{'loss': 0.3114, 'grad_norm': 1.2012540102005005, 'learning_rate': 4.6220338983050846e-05, 'epoch': 0.27}


  9%|▉         | 2740/30000 [1:32:48<2:18:41,  3.28it/s]

{'loss': 0.3418, 'grad_norm': 0.7326661944389343, 'learning_rate': 4.6203389830508475e-05, 'epoch': 0.27}


  9%|▉         | 2750/30000 [1:32:51<2:18:29,  3.28it/s]

{'loss': 0.3028, 'grad_norm': 0.9103760123252869, 'learning_rate': 4.6186440677966104e-05, 'epoch': 0.28}


  9%|▉         | 2760/30000 [1:32:54<2:18:24,  3.28it/s]

{'loss': 0.3194, 'grad_norm': 0.8840307593345642, 'learning_rate': 4.6169491525423734e-05, 'epoch': 0.28}


  9%|▉         | 2770/30000 [1:32:57<2:18:37,  3.27it/s]

{'loss': 0.3017, 'grad_norm': 0.602769136428833, 'learning_rate': 4.6152542372881356e-05, 'epoch': 0.28}


  9%|▉         | 2780/30000 [1:33:00<2:18:20,  3.28it/s]

{'loss': 0.2942, 'grad_norm': 0.628430962562561, 'learning_rate': 4.6135593220338986e-05, 'epoch': 0.28}


  9%|▉         | 2790/30000 [1:33:03<2:18:28,  3.28it/s]

{'loss': 0.337, 'grad_norm': 0.8597489595413208, 'learning_rate': 4.6118644067796615e-05, 'epoch': 0.28}


  9%|▉         | 2800/30000 [1:33:06<2:18:07,  3.28it/s]

{'loss': 0.3084, 'grad_norm': 0.8605775237083435, 'learning_rate': 4.610169491525424e-05, 'epoch': 0.28}


  9%|▉         | 2810/30000 [1:33:09<2:18:44,  3.27it/s]

{'loss': 0.3253, 'grad_norm': 0.68231600522995, 'learning_rate': 4.608474576271187e-05, 'epoch': 0.28}


  9%|▉         | 2820/30000 [1:33:12<2:25:32,  3.11it/s]

{'loss': 0.324, 'grad_norm': 0.7476484179496765, 'learning_rate': 4.6067796610169496e-05, 'epoch': 0.28}


  9%|▉         | 2830/30000 [1:33:16<2:27:57,  3.06it/s]

{'loss': 0.3142, 'grad_norm': 0.8418848514556885, 'learning_rate': 4.605084745762712e-05, 'epoch': 0.28}


  9%|▉         | 2840/30000 [1:33:19<2:26:01,  3.10it/s]

{'loss': 0.3172, 'grad_norm': 0.9606682658195496, 'learning_rate': 4.603389830508475e-05, 'epoch': 0.28}


 10%|▉         | 2850/30000 [1:33:22<2:29:45,  3.02it/s]

{'loss': 0.312, 'grad_norm': 1.2212049961090088, 'learning_rate': 4.601694915254238e-05, 'epoch': 0.28}


 10%|▉         | 2860/30000 [1:33:25<2:21:45,  3.19it/s]

{'loss': 0.3177, 'grad_norm': 0.85260409116745, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.29}


 10%|▉         | 2870/30000 [1:33:29<2:21:48,  3.19it/s]

{'loss': 0.2978, 'grad_norm': 0.7661054730415344, 'learning_rate': 4.598305084745763e-05, 'epoch': 0.29}


 10%|▉         | 2880/30000 [1:33:32<2:21:42,  3.19it/s]

{'loss': 0.3026, 'grad_norm': 0.681261420249939, 'learning_rate': 4.596610169491526e-05, 'epoch': 0.29}


 10%|▉         | 2890/30000 [1:33:35<2:29:06,  3.03it/s]

{'loss': 0.3045, 'grad_norm': 0.6449880003929138, 'learning_rate': 4.594915254237288e-05, 'epoch': 0.29}


 10%|▉         | 2900/30000 [1:33:38<2:27:35,  3.06it/s]

{'loss': 0.3294, 'grad_norm': 0.7692212462425232, 'learning_rate': 4.593220338983051e-05, 'epoch': 0.29}


 10%|▉         | 2910/30000 [1:33:41<2:21:33,  3.19it/s]

{'loss': 0.3083, 'grad_norm': 0.8821718096733093, 'learning_rate': 4.591525423728813e-05, 'epoch': 0.29}


 10%|▉         | 2920/30000 [1:33:45<2:25:35,  3.10it/s]

{'loss': 0.3263, 'grad_norm': 0.7191026210784912, 'learning_rate': 4.589830508474576e-05, 'epoch': 0.29}


 10%|▉         | 2930/30000 [1:33:48<2:22:10,  3.17it/s]

{'loss': 0.3044, 'grad_norm': 0.7710386514663696, 'learning_rate': 4.588135593220339e-05, 'epoch': 0.29}


 10%|▉         | 2940/30000 [1:33:51<2:21:01,  3.20it/s]

{'loss': 0.3185, 'grad_norm': 0.6253623962402344, 'learning_rate': 4.5864406779661014e-05, 'epoch': 0.29}


 10%|▉         | 2950/30000 [1:33:54<2:21:52,  3.18it/s]

{'loss': 0.3115, 'grad_norm': 0.7403371334075928, 'learning_rate': 4.5847457627118644e-05, 'epoch': 0.29}


 10%|▉         | 2960/30000 [1:33:57<2:26:45,  3.07it/s]

{'loss': 0.3141, 'grad_norm': 0.9560918211936951, 'learning_rate': 4.583050847457627e-05, 'epoch': 0.3}


 10%|▉         | 2970/30000 [1:34:00<2:21:33,  3.18it/s]

{'loss': 0.3168, 'grad_norm': 0.736293613910675, 'learning_rate': 4.58135593220339e-05, 'epoch': 0.3}


 10%|▉         | 2980/30000 [1:34:04<2:20:42,  3.20it/s]

{'loss': 0.3081, 'grad_norm': 1.0167522430419922, 'learning_rate': 4.5796610169491525e-05, 'epoch': 0.3}


 10%|▉         | 2990/30000 [1:34:07<2:22:55,  3.15it/s]

{'loss': 0.3214, 'grad_norm': 0.6843724846839905, 'learning_rate': 4.5779661016949154e-05, 'epoch': 0.3}


 10%|█         | 3000/30000 [1:34:10<2:21:01,  3.19it/s]

{'loss': 0.3106, 'grad_norm': 0.8084708452224731, 'learning_rate': 4.5762711864406784e-05, 'epoch': 0.3}


 10%|█         | 3010/30000 [1:34:13<2:23:01,  3.15it/s]

{'loss': 0.31, 'grad_norm': 0.8443323373794556, 'learning_rate': 4.5745762711864406e-05, 'epoch': 0.3}


 10%|█         | 3020/30000 [1:34:16<2:22:19,  3.16it/s]

{'loss': 0.3023, 'grad_norm': 0.6966710686683655, 'learning_rate': 4.5728813559322036e-05, 'epoch': 0.3}


 10%|█         | 3030/30000 [1:34:19<2:21:38,  3.17it/s]

{'loss': 0.3122, 'grad_norm': 0.694304883480072, 'learning_rate': 4.5711864406779665e-05, 'epoch': 0.3}


 10%|█         | 3040/30000 [1:34:23<2:22:17,  3.16it/s]

{'loss': 0.3328, 'grad_norm': 0.9893161654472351, 'learning_rate': 4.5694915254237294e-05, 'epoch': 0.3}


 10%|█         | 3050/30000 [1:34:26<2:21:46,  3.17it/s]

{'loss': 0.3146, 'grad_norm': 0.688357949256897, 'learning_rate': 4.567796610169492e-05, 'epoch': 0.3}


 10%|█         | 3060/30000 [1:34:29<2:20:11,  3.20it/s]

{'loss': 0.3233, 'grad_norm': 0.698090136051178, 'learning_rate': 4.5661016949152546e-05, 'epoch': 0.31}


 10%|█         | 3070/30000 [1:34:32<2:23:49,  3.12it/s]

{'loss': 0.3173, 'grad_norm': 0.8120661973953247, 'learning_rate': 4.5644067796610176e-05, 'epoch': 0.31}


 10%|█         | 3080/30000 [1:34:35<2:25:01,  3.09it/s]

{'loss': 0.3236, 'grad_norm': 0.6879006028175354, 'learning_rate': 4.56271186440678e-05, 'epoch': 0.31}


 10%|█         | 3090/30000 [1:34:39<2:27:43,  3.04it/s]

{'loss': 0.3067, 'grad_norm': 0.6901822090148926, 'learning_rate': 4.561016949152543e-05, 'epoch': 0.31}


 10%|█         | 3100/30000 [1:34:42<2:25:41,  3.08it/s]

{'loss': 0.3054, 'grad_norm': 0.9958844184875488, 'learning_rate': 4.559322033898305e-05, 'epoch': 0.31}


 10%|█         | 3110/30000 [1:34:45<2:20:44,  3.18it/s]

{'loss': 0.3176, 'grad_norm': 0.8527092933654785, 'learning_rate': 4.557627118644068e-05, 'epoch': 0.31}


 10%|█         | 3120/30000 [1:34:48<2:23:44,  3.12it/s]

{'loss': 0.3078, 'grad_norm': 0.6410993337631226, 'learning_rate': 4.55593220338983e-05, 'epoch': 0.31}


 10%|█         | 3130/30000 [1:34:52<2:22:28,  3.14it/s]

{'loss': 0.3003, 'grad_norm': 0.6534008979797363, 'learning_rate': 4.554237288135593e-05, 'epoch': 0.31}


 10%|█         | 3140/30000 [1:34:55<2:26:26,  3.06it/s]

{'loss': 0.3011, 'grad_norm': 0.6609850525856018, 'learning_rate': 4.552542372881356e-05, 'epoch': 0.31}


 10%|█         | 3150/30000 [1:34:58<2:22:12,  3.15it/s]

{'loss': 0.3157, 'grad_norm': 0.85470050573349, 'learning_rate': 4.550847457627119e-05, 'epoch': 0.32}


 11%|█         | 3160/30000 [1:35:01<2:21:15,  3.17it/s]

{'loss': 0.2957, 'grad_norm': 0.7855815291404724, 'learning_rate': 4.549152542372881e-05, 'epoch': 0.32}


 11%|█         | 3170/30000 [1:35:04<2:21:29,  3.16it/s]

{'loss': 0.3237, 'grad_norm': 0.7657629251480103, 'learning_rate': 4.547457627118644e-05, 'epoch': 0.32}


 11%|█         | 3180/30000 [1:35:07<2:20:56,  3.17it/s]

{'loss': 0.2969, 'grad_norm': 0.6148530840873718, 'learning_rate': 4.545762711864407e-05, 'epoch': 0.32}


 11%|█         | 3190/30000 [1:35:11<2:24:25,  3.09it/s]

{'loss': 0.3184, 'grad_norm': 0.6861439943313599, 'learning_rate': 4.5440677966101694e-05, 'epoch': 0.32}


 11%|█         | 3200/30000 [1:35:14<2:20:43,  3.17it/s]

{'loss': 0.3107, 'grad_norm': 0.6708412170410156, 'learning_rate': 4.542372881355932e-05, 'epoch': 0.32}


 11%|█         | 3210/30000 [1:35:17<2:27:17,  3.03it/s]

{'loss': 0.3058, 'grad_norm': 1.0065813064575195, 'learning_rate': 4.540677966101695e-05, 'epoch': 0.32}


 11%|█         | 3220/30000 [1:35:20<2:21:45,  3.15it/s]

{'loss': 0.3112, 'grad_norm': 0.8518561720848083, 'learning_rate': 4.538983050847458e-05, 'epoch': 0.32}


 11%|█         | 3230/30000 [1:35:23<2:22:28,  3.13it/s]

{'loss': 0.3184, 'grad_norm': 0.8105792999267578, 'learning_rate': 4.5372881355932205e-05, 'epoch': 0.32}


 11%|█         | 3240/30000 [1:35:27<2:21:09,  3.16it/s]

{'loss': 0.3077, 'grad_norm': 1.141700267791748, 'learning_rate': 4.5355932203389834e-05, 'epoch': 0.32}


 11%|█         | 3250/30000 [1:35:30<2:20:25,  3.18it/s]

{'loss': 0.309, 'grad_norm': 0.7714330554008484, 'learning_rate': 4.533898305084746e-05, 'epoch': 0.33}


 11%|█         | 3260/30000 [1:35:33<2:20:38,  3.17it/s]

{'loss': 0.3022, 'grad_norm': 0.6974601745605469, 'learning_rate': 4.5322033898305086e-05, 'epoch': 0.33}


 11%|█         | 3270/30000 [1:35:36<2:20:34,  3.17it/s]

{'loss': 0.3243, 'grad_norm': 0.7135137319564819, 'learning_rate': 4.5305084745762715e-05, 'epoch': 0.33}


 11%|█         | 3280/30000 [1:35:39<2:21:34,  3.15it/s]

{'loss': 0.3033, 'grad_norm': 0.8076284527778625, 'learning_rate': 4.5288135593220345e-05, 'epoch': 0.33}


 11%|█         | 3290/30000 [1:35:42<2:20:12,  3.17it/s]

{'loss': 0.3066, 'grad_norm': 0.7779918909072876, 'learning_rate': 4.5271186440677974e-05, 'epoch': 0.33}


 11%|█         | 3300/30000 [1:35:45<2:20:43,  3.16it/s]

{'loss': 0.3008, 'grad_norm': 0.7328923940658569, 'learning_rate': 4.5254237288135596e-05, 'epoch': 0.33}


 11%|█         | 3310/30000 [1:35:49<2:20:06,  3.17it/s]

{'loss': 0.3054, 'grad_norm': 0.8717153668403625, 'learning_rate': 4.523728813559322e-05, 'epoch': 0.33}


 11%|█         | 3320/30000 [1:35:52<2:21:16,  3.15it/s]

{'loss': 0.2923, 'grad_norm': 0.6817702651023865, 'learning_rate': 4.522033898305085e-05, 'epoch': 0.33}


 11%|█         | 3330/30000 [1:35:55<2:20:43,  3.16it/s]

{'loss': 0.298, 'grad_norm': 0.7132118940353394, 'learning_rate': 4.520338983050848e-05, 'epoch': 0.33}


 11%|█         | 3340/30000 [1:35:58<2:20:05,  3.17it/s]

{'loss': 0.3056, 'grad_norm': 0.8314599394798279, 'learning_rate': 4.51864406779661e-05, 'epoch': 0.33}


 11%|█         | 3350/30000 [1:36:01<2:19:49,  3.18it/s]

{'loss': 0.287, 'grad_norm': 0.803253710269928, 'learning_rate': 4.516949152542373e-05, 'epoch': 0.34}


 11%|█         | 3360/30000 [1:36:04<2:19:14,  3.19it/s]

{'loss': 0.2858, 'grad_norm': 0.7501228451728821, 'learning_rate': 4.515254237288136e-05, 'epoch': 0.34}


 11%|█         | 3370/30000 [1:36:08<2:19:41,  3.18it/s]

{'loss': 0.2936, 'grad_norm': 0.759333074092865, 'learning_rate': 4.513559322033898e-05, 'epoch': 0.34}


 11%|█▏        | 3380/30000 [1:36:11<2:18:55,  3.19it/s]

{'loss': 0.3036, 'grad_norm': 0.9124815464019775, 'learning_rate': 4.511864406779661e-05, 'epoch': 0.34}


 11%|█▏        | 3390/30000 [1:36:14<2:19:03,  3.19it/s]

{'loss': 0.2901, 'grad_norm': 0.6310060620307922, 'learning_rate': 4.510169491525424e-05, 'epoch': 0.34}


 11%|█▏        | 3400/30000 [1:36:17<2:19:55,  3.17it/s]

{'loss': 0.295, 'grad_norm': 0.7059845924377441, 'learning_rate': 4.508474576271187e-05, 'epoch': 0.34}


 11%|█▏        | 3410/30000 [1:36:20<2:19:29,  3.18it/s]

{'loss': 0.3071, 'grad_norm': 0.7798197865486145, 'learning_rate': 4.506779661016949e-05, 'epoch': 0.34}


 11%|█▏        | 3420/30000 [1:36:23<2:19:50,  3.17it/s]

{'loss': 0.3067, 'grad_norm': 0.7282974720001221, 'learning_rate': 4.505084745762712e-05, 'epoch': 0.34}


 11%|█▏        | 3430/30000 [1:36:27<2:19:59,  3.16it/s]

{'loss': 0.3046, 'grad_norm': 0.6267776489257812, 'learning_rate': 4.503389830508475e-05, 'epoch': 0.34}


 11%|█▏        | 3440/30000 [1:36:30<2:19:25,  3.18it/s]

{'loss': 0.2931, 'grad_norm': 0.6012943387031555, 'learning_rate': 4.5016949152542373e-05, 'epoch': 0.34}


 12%|█▏        | 3450/30000 [1:36:33<2:21:59,  3.12it/s]

{'loss': 0.2981, 'grad_norm': 0.5630289912223816, 'learning_rate': 4.5e-05, 'epoch': 0.34}


 12%|█▏        | 3460/30000 [1:36:36<2:24:58,  3.05it/s]

{'loss': 0.2953, 'grad_norm': 0.6730179786682129, 'learning_rate': 4.498305084745763e-05, 'epoch': 0.35}


 12%|█▏        | 3470/30000 [1:36:39<2:25:01,  3.05it/s]

{'loss': 0.3161, 'grad_norm': 0.8401612043380737, 'learning_rate': 4.4966101694915255e-05, 'epoch': 0.35}


 12%|█▏        | 3480/30000 [1:36:43<2:21:05,  3.13it/s]

{'loss': 0.3005, 'grad_norm': 0.7342632412910461, 'learning_rate': 4.4949152542372884e-05, 'epoch': 0.35}


 12%|█▏        | 3490/30000 [1:36:46<2:20:04,  3.15it/s]

{'loss': 0.3091, 'grad_norm': 0.6293397545814514, 'learning_rate': 4.4932203389830513e-05, 'epoch': 0.35}


 12%|█▏        | 3500/30000 [1:36:49<2:29:45,  2.95it/s]

{'loss': 0.2834, 'grad_norm': 0.7827072739601135, 'learning_rate': 4.491525423728814e-05, 'epoch': 0.35}


 12%|█▏        | 3510/30000 [1:36:53<2:24:03,  3.06it/s]

{'loss': 0.3042, 'grad_norm': 0.7142381072044373, 'learning_rate': 4.4898305084745765e-05, 'epoch': 0.35}


 12%|█▏        | 3520/30000 [1:36:56<2:21:29,  3.12it/s]

{'loss': 0.3034, 'grad_norm': 0.8257798552513123, 'learning_rate': 4.488135593220339e-05, 'epoch': 0.35}


 12%|█▏        | 3530/30000 [1:36:59<2:20:14,  3.15it/s]

{'loss': 0.3087, 'grad_norm': 0.8046056032180786, 'learning_rate': 4.486440677966102e-05, 'epoch': 0.35}


 12%|█▏        | 3540/30000 [1:37:02<2:20:33,  3.14it/s]

{'loss': 0.2949, 'grad_norm': 0.7415270805358887, 'learning_rate': 4.484745762711865e-05, 'epoch': 0.35}


 12%|█▏        | 3550/30000 [1:37:06<2:19:30,  3.16it/s]

{'loss': 0.3131, 'grad_norm': 0.6271710991859436, 'learning_rate': 4.483050847457627e-05, 'epoch': 0.35}


 12%|█▏        | 3560/30000 [1:37:09<2:22:11,  3.10it/s]

{'loss': 0.3127, 'grad_norm': 0.8978949189186096, 'learning_rate': 4.48135593220339e-05, 'epoch': 0.36}


 12%|█▏        | 3570/30000 [1:37:12<2:20:22,  3.14it/s]

{'loss': 0.2865, 'grad_norm': 0.72867751121521, 'learning_rate': 4.479661016949153e-05, 'epoch': 0.36}


 12%|█▏        | 3580/30000 [1:37:15<2:19:51,  3.15it/s]

{'loss': 0.3061, 'grad_norm': 0.6408154368400574, 'learning_rate': 4.477966101694915e-05, 'epoch': 0.36}


 12%|█▏        | 3590/30000 [1:37:18<2:19:29,  3.16it/s]

{'loss': 0.2767, 'grad_norm': 0.6344459652900696, 'learning_rate': 4.476271186440678e-05, 'epoch': 0.36}


 12%|█▏        | 3600/30000 [1:37:21<2:19:05,  3.16it/s]

{'loss': 0.2892, 'grad_norm': 0.6811996102333069, 'learning_rate': 4.474576271186441e-05, 'epoch': 0.36}


 12%|█▏        | 3610/30000 [1:37:25<2:19:01,  3.16it/s]

{'loss': 0.287, 'grad_norm': 0.7207954525947571, 'learning_rate': 4.472881355932204e-05, 'epoch': 0.36}


 12%|█▏        | 3620/30000 [1:37:28<2:19:02,  3.16it/s]

{'loss': 0.296, 'grad_norm': 0.7185862064361572, 'learning_rate': 4.471186440677966e-05, 'epoch': 0.36}


 12%|█▏        | 3630/30000 [1:37:31<2:18:57,  3.16it/s]

{'loss': 0.3018, 'grad_norm': 0.7950380444526672, 'learning_rate': 4.469491525423729e-05, 'epoch': 0.36}


 12%|█▏        | 3640/30000 [1:37:34<2:19:14,  3.16it/s]

{'loss': 0.296, 'grad_norm': 1.0953879356384277, 'learning_rate': 4.467796610169492e-05, 'epoch': 0.36}


 12%|█▏        | 3650/30000 [1:37:37<2:19:03,  3.16it/s]

{'loss': 0.3079, 'grad_norm': 1.1495002508163452, 'learning_rate': 4.466101694915254e-05, 'epoch': 0.36}


 12%|█▏        | 3660/30000 [1:37:40<2:18:47,  3.16it/s]

{'loss': 0.2909, 'grad_norm': 0.7752143144607544, 'learning_rate': 4.464406779661017e-05, 'epoch': 0.37}


 12%|█▏        | 3670/30000 [1:37:44<2:18:54,  3.16it/s]

{'loss': 0.2967, 'grad_norm': 0.777330756187439, 'learning_rate': 4.46271186440678e-05, 'epoch': 0.37}


 12%|█▏        | 3680/30000 [1:37:47<2:18:27,  3.17it/s]

{'loss': 0.2799, 'grad_norm': 0.5447575449943542, 'learning_rate': 4.461016949152543e-05, 'epoch': 0.37}


 12%|█▏        | 3690/30000 [1:37:50<2:18:25,  3.17it/s]

{'loss': 0.2948, 'grad_norm': 0.7463875412940979, 'learning_rate': 4.459322033898305e-05, 'epoch': 0.37}


 12%|█▏        | 3700/30000 [1:37:53<2:18:52,  3.16it/s]

{'loss': 0.2922, 'grad_norm': 0.587532639503479, 'learning_rate': 4.457627118644068e-05, 'epoch': 0.37}


 12%|█▏        | 3710/30000 [1:37:56<2:18:56,  3.15it/s]

{'loss': 0.2864, 'grad_norm': 0.6429153084754944, 'learning_rate': 4.455932203389831e-05, 'epoch': 0.37}


 12%|█▏        | 3720/30000 [1:37:59<2:18:15,  3.17it/s]

{'loss': 0.284, 'grad_norm': 0.6080722212791443, 'learning_rate': 4.4542372881355934e-05, 'epoch': 0.37}


 12%|█▏        | 3730/30000 [1:38:03<2:19:11,  3.15it/s]

{'loss': 0.2843, 'grad_norm': 0.6601408123970032, 'learning_rate': 4.452542372881356e-05, 'epoch': 0.37}


 12%|█▏        | 3740/30000 [1:38:06<2:17:58,  3.17it/s]

{'loss': 0.2922, 'grad_norm': 0.682041347026825, 'learning_rate': 4.4508474576271186e-05, 'epoch': 0.37}


 12%|█▎        | 3750/30000 [1:38:09<2:19:02,  3.15it/s]

{'loss': 0.3027, 'grad_norm': 0.759139895439148, 'learning_rate': 4.4491525423728816e-05, 'epoch': 0.38}


 13%|█▎        | 3760/30000 [1:38:12<2:19:55,  3.13it/s]

{'loss': 0.2819, 'grad_norm': 0.6112969517707825, 'learning_rate': 4.447457627118644e-05, 'epoch': 0.38}


 13%|█▎        | 3770/30000 [1:38:15<2:19:14,  3.14it/s]

{'loss': 0.2947, 'grad_norm': 0.7362275123596191, 'learning_rate': 4.445762711864407e-05, 'epoch': 0.38}


 13%|█▎        | 3780/30000 [1:38:19<2:18:07,  3.16it/s]

{'loss': 0.295, 'grad_norm': 0.8038157820701599, 'learning_rate': 4.44406779661017e-05, 'epoch': 0.38}


 13%|█▎        | 3790/30000 [1:38:22<2:18:12,  3.16it/s]

{'loss': 0.2982, 'grad_norm': 0.7158764600753784, 'learning_rate': 4.4423728813559326e-05, 'epoch': 0.38}


 13%|█▎        | 3800/30000 [1:38:25<2:18:13,  3.16it/s]

{'loss': 0.295, 'grad_norm': 0.6731598377227783, 'learning_rate': 4.440677966101695e-05, 'epoch': 0.38}


 13%|█▎        | 3810/30000 [1:38:28<2:20:55,  3.10it/s]

{'loss': 0.3081, 'grad_norm': 0.5984122157096863, 'learning_rate': 4.438983050847458e-05, 'epoch': 0.38}


 13%|█▎        | 3820/30000 [1:38:31<2:17:46,  3.17it/s]

{'loss': 0.299, 'grad_norm': 0.8038256764411926, 'learning_rate': 4.437288135593221e-05, 'epoch': 0.38}


 13%|█▎        | 3830/30000 [1:38:34<2:22:24,  3.06it/s]

{'loss': 0.2935, 'grad_norm': 0.7748740911483765, 'learning_rate': 4.435593220338983e-05, 'epoch': 0.38}


 13%|█▎        | 3840/30000 [1:38:38<2:18:03,  3.16it/s]

{'loss': 0.2988, 'grad_norm': 0.8511162996292114, 'learning_rate': 4.433898305084746e-05, 'epoch': 0.38}


 13%|█▎        | 3850/30000 [1:38:41<2:17:31,  3.17it/s]

{'loss': 0.2874, 'grad_norm': 0.6450610160827637, 'learning_rate': 4.432203389830509e-05, 'epoch': 0.39}


 13%|█▎        | 3860/30000 [1:38:44<2:16:33,  3.19it/s]

{'loss': 0.309, 'grad_norm': 0.7412271499633789, 'learning_rate': 4.430508474576272e-05, 'epoch': 0.39}


 13%|█▎        | 3870/30000 [1:38:47<2:15:57,  3.20it/s]

{'loss': 0.2994, 'grad_norm': 0.9870760440826416, 'learning_rate': 4.428813559322034e-05, 'epoch': 0.39}


 13%|█▎        | 3880/30000 [1:38:50<2:16:05,  3.20it/s]

{'loss': 0.3023, 'grad_norm': 0.6739212870597839, 'learning_rate': 4.427118644067797e-05, 'epoch': 0.39}


 13%|█▎        | 3890/30000 [1:38:53<2:15:58,  3.20it/s]

{'loss': 0.292, 'grad_norm': 0.7232638001441956, 'learning_rate': 4.42542372881356e-05, 'epoch': 0.39}


 13%|█▎        | 3900/30000 [1:38:57<2:15:58,  3.20it/s]

{'loss': 0.2955, 'grad_norm': 0.634164571762085, 'learning_rate': 4.423728813559322e-05, 'epoch': 0.39}


 13%|█▎        | 3910/30000 [1:39:00<2:15:53,  3.20it/s]

{'loss': 0.299, 'grad_norm': 0.7180224061012268, 'learning_rate': 4.422033898305085e-05, 'epoch': 0.39}


 13%|█▎        | 3920/30000 [1:39:03<2:15:53,  3.20it/s]

{'loss': 0.2932, 'grad_norm': 0.5457201600074768, 'learning_rate': 4.420338983050848e-05, 'epoch': 0.39}


 13%|█▎        | 3930/30000 [1:39:06<2:15:53,  3.20it/s]

{'loss': 0.2782, 'grad_norm': 0.7488620281219482, 'learning_rate': 4.41864406779661e-05, 'epoch': 0.39}


 13%|█▎        | 3940/30000 [1:39:09<2:15:51,  3.20it/s]

{'loss': 0.2917, 'grad_norm': 0.9910287261009216, 'learning_rate': 4.4169491525423726e-05, 'epoch': 0.39}


 13%|█▎        | 3950/30000 [1:39:12<2:15:44,  3.20it/s]

{'loss': 0.2805, 'grad_norm': 0.7616119980812073, 'learning_rate': 4.4152542372881355e-05, 'epoch': 0.4}


 13%|█▎        | 3960/30000 [1:39:15<2:15:31,  3.20it/s]

{'loss': 0.2821, 'grad_norm': 0.7116876840591431, 'learning_rate': 4.4135593220338984e-05, 'epoch': 0.4}


 13%|█▎        | 3970/30000 [1:39:18<2:15:36,  3.20it/s]

{'loss': 0.2842, 'grad_norm': 0.7194894552230835, 'learning_rate': 4.4118644067796614e-05, 'epoch': 0.4}


 13%|█▎        | 3980/30000 [1:39:22<2:15:36,  3.20it/s]

{'loss': 0.2877, 'grad_norm': 0.8190382122993469, 'learning_rate': 4.4101694915254236e-05, 'epoch': 0.4}


 13%|█▎        | 3990/30000 [1:39:25<2:15:20,  3.20it/s]

{'loss': 0.2675, 'grad_norm': 0.8096761107444763, 'learning_rate': 4.4084745762711866e-05, 'epoch': 0.4}


 13%|█▎        | 4000/30000 [1:39:28<2:15:21,  3.20it/s]

{'loss': 0.3151, 'grad_norm': 0.5768618583679199, 'learning_rate': 4.4067796610169495e-05, 'epoch': 0.4}


 13%|█▎        | 4010/30000 [1:39:31<2:15:13,  3.20it/s]

{'loss': 0.29, 'grad_norm': 0.6308997869491577, 'learning_rate': 4.405084745762712e-05, 'epoch': 0.4}


 13%|█▎        | 4020/30000 [1:39:34<2:15:14,  3.20it/s]

{'loss': 0.3092, 'grad_norm': 0.7176519632339478, 'learning_rate': 4.403389830508475e-05, 'epoch': 0.4}


 13%|█▎        | 4030/30000 [1:39:37<2:15:09,  3.20it/s]

{'loss': 0.2821, 'grad_norm': 0.9088622331619263, 'learning_rate': 4.4016949152542376e-05, 'epoch': 0.4}


 13%|█▎        | 4040/30000 [1:39:40<2:15:09,  3.20it/s]

{'loss': 0.2945, 'grad_norm': 0.7490049600601196, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.4}


 14%|█▎        | 4050/30000 [1:39:43<2:15:02,  3.20it/s]

{'loss': 0.2949, 'grad_norm': 0.661195695400238, 'learning_rate': 4.398305084745763e-05, 'epoch': 0.41}


 14%|█▎        | 4060/30000 [1:39:47<2:14:56,  3.20it/s]

{'loss': 0.2819, 'grad_norm': 0.6416125893592834, 'learning_rate': 4.396610169491526e-05, 'epoch': 0.41}


 14%|█▎        | 4070/30000 [1:39:50<2:14:51,  3.20it/s]

{'loss': 0.293, 'grad_norm': 0.7409297823905945, 'learning_rate': 4.394915254237289e-05, 'epoch': 0.41}


 14%|█▎        | 4080/30000 [1:39:53<2:14:51,  3.20it/s]

{'loss': 0.2992, 'grad_norm': 0.6464699506759644, 'learning_rate': 4.393220338983051e-05, 'epoch': 0.41}


 14%|█▎        | 4090/30000 [1:39:56<2:14:54,  3.20it/s]

{'loss': 0.2981, 'grad_norm': 0.6443000435829163, 'learning_rate': 4.391525423728814e-05, 'epoch': 0.41}


 14%|█▎        | 4100/30000 [1:39:59<2:14:46,  3.20it/s]

{'loss': 0.317, 'grad_norm': 0.9178438186645508, 'learning_rate': 4.389830508474577e-05, 'epoch': 0.41}


 14%|█▎        | 4110/30000 [1:40:02<2:15:17,  3.19it/s]

{'loss': 0.3051, 'grad_norm': 0.6673476696014404, 'learning_rate': 4.388135593220339e-05, 'epoch': 0.41}


 14%|█▎        | 4120/30000 [1:40:05<2:14:45,  3.20it/s]

{'loss': 0.2815, 'grad_norm': 0.5579895377159119, 'learning_rate': 4.386440677966102e-05, 'epoch': 0.41}


 14%|█▍        | 4130/30000 [1:40:08<2:14:39,  3.20it/s]

{'loss': 0.2998, 'grad_norm': 0.681257426738739, 'learning_rate': 4.384745762711865e-05, 'epoch': 0.41}


 14%|█▍        | 4140/30000 [1:40:12<2:14:28,  3.20it/s]

{'loss': 0.313, 'grad_norm': 0.6754911541938782, 'learning_rate': 4.383050847457627e-05, 'epoch': 0.41}


 14%|█▍        | 4150/30000 [1:40:15<2:14:24,  3.21it/s]

{'loss': 0.2803, 'grad_norm': 0.6955018639564514, 'learning_rate': 4.38135593220339e-05, 'epoch': 0.41}


 14%|█▍        | 4160/30000 [1:40:18<2:14:26,  3.20it/s]

{'loss': 0.3026, 'grad_norm': 0.6838437914848328, 'learning_rate': 4.3796610169491524e-05, 'epoch': 0.42}


 14%|█▍        | 4170/30000 [1:40:21<2:14:21,  3.20it/s]

{'loss': 0.2707, 'grad_norm': 0.9460506439208984, 'learning_rate': 4.377966101694915e-05, 'epoch': 0.42}


 14%|█▍        | 4180/30000 [1:40:24<2:14:06,  3.21it/s]

{'loss': 0.2958, 'grad_norm': 0.5922210812568665, 'learning_rate': 4.376271186440678e-05, 'epoch': 0.42}


 14%|█▍        | 4190/30000 [1:40:27<2:14:18,  3.20it/s]

{'loss': 0.3149, 'grad_norm': 0.6678944826126099, 'learning_rate': 4.3745762711864405e-05, 'epoch': 0.42}


 14%|█▍        | 4200/30000 [1:40:30<2:14:12,  3.20it/s]

{'loss': 0.2918, 'grad_norm': 0.6385821104049683, 'learning_rate': 4.3728813559322035e-05, 'epoch': 0.42}


 14%|█▍        | 4210/30000 [1:40:33<2:14:17,  3.20it/s]

{'loss': 0.2916, 'grad_norm': 0.6755231022834778, 'learning_rate': 4.3711864406779664e-05, 'epoch': 0.42}


 14%|█▍        | 4220/30000 [1:40:37<2:14:01,  3.21it/s]

{'loss': 0.2814, 'grad_norm': 0.7159159779548645, 'learning_rate': 4.3694915254237286e-05, 'epoch': 0.42}


 14%|█▍        | 4230/30000 [1:40:40<2:13:57,  3.21it/s]

{'loss': 0.276, 'grad_norm': 0.6560961008071899, 'learning_rate': 4.3677966101694916e-05, 'epoch': 0.42}


 14%|█▍        | 4240/30000 [1:40:43<2:13:50,  3.21it/s]

{'loss': 0.289, 'grad_norm': 0.6071156859397888, 'learning_rate': 4.3661016949152545e-05, 'epoch': 0.42}


 14%|█▍        | 4250/30000 [1:40:46<2:13:53,  3.21it/s]

{'loss': 0.2812, 'grad_norm': 0.641610324382782, 'learning_rate': 4.3644067796610175e-05, 'epoch': 0.42}


 14%|█▍        | 4260/30000 [1:40:49<2:13:53,  3.20it/s]

{'loss': 0.2837, 'grad_norm': 0.6931858062744141, 'learning_rate': 4.36271186440678e-05, 'epoch': 0.43}


 14%|█▍        | 4270/30000 [1:40:52<2:13:42,  3.21it/s]

{'loss': 0.2729, 'grad_norm': 0.6781604886054993, 'learning_rate': 4.3610169491525426e-05, 'epoch': 0.43}


 14%|█▍        | 4280/30000 [1:40:55<2:13:34,  3.21it/s]

{'loss': 0.2826, 'grad_norm': 0.6961559057235718, 'learning_rate': 4.3593220338983056e-05, 'epoch': 0.43}


 14%|█▍        | 4290/30000 [1:40:58<2:13:36,  3.21it/s]

{'loss': 0.295, 'grad_norm': 0.8776082396507263, 'learning_rate': 4.357627118644068e-05, 'epoch': 0.43}


 14%|█▍        | 4300/30000 [1:41:02<2:13:43,  3.20it/s]

{'loss': 0.2903, 'grad_norm': 0.5766346454620361, 'learning_rate': 4.355932203389831e-05, 'epoch': 0.43}


 14%|█▍        | 4310/30000 [1:41:05<2:13:33,  3.21it/s]

{'loss': 0.2966, 'grad_norm': 0.7156338691711426, 'learning_rate': 4.354237288135594e-05, 'epoch': 0.43}


 14%|█▍        | 4320/30000 [1:41:08<2:13:27,  3.21it/s]

{'loss': 0.3028, 'grad_norm': 0.6804577112197876, 'learning_rate': 4.3525423728813566e-05, 'epoch': 0.43}


 14%|█▍        | 4330/30000 [1:41:11<2:14:09,  3.19it/s]

{'loss': 0.2971, 'grad_norm': 0.6359792351722717, 'learning_rate': 4.350847457627119e-05, 'epoch': 0.43}


 14%|█▍        | 4340/30000 [1:41:14<2:13:26,  3.20it/s]

{'loss': 0.2764, 'grad_norm': 0.7212865948677063, 'learning_rate': 4.349152542372882e-05, 'epoch': 0.43}


 14%|█▍        | 4350/30000 [1:41:17<2:13:20,  3.21it/s]

{'loss': 0.2947, 'grad_norm': 0.722240686416626, 'learning_rate': 4.347457627118644e-05, 'epoch': 0.43}


 15%|█▍        | 4360/30000 [1:41:20<2:13:12,  3.21it/s]

{'loss': 0.2815, 'grad_norm': 0.9357856512069702, 'learning_rate': 4.345762711864407e-05, 'epoch': 0.44}


 15%|█▍        | 4370/30000 [1:41:23<2:13:11,  3.21it/s]

{'loss': 0.3063, 'grad_norm': 0.8875013589859009, 'learning_rate': 4.344067796610169e-05, 'epoch': 0.44}


 15%|█▍        | 4380/30000 [1:41:26<2:13:07,  3.21it/s]

{'loss': 0.3009, 'grad_norm': 0.6959116458892822, 'learning_rate': 4.342372881355932e-05, 'epoch': 0.44}


 15%|█▍        | 4390/30000 [1:41:30<2:13:23,  3.20it/s]

{'loss': 0.3117, 'grad_norm': 0.5802127718925476, 'learning_rate': 4.340677966101695e-05, 'epoch': 0.44}


 15%|█▍        | 4400/30000 [1:41:33<2:13:09,  3.20it/s]

{'loss': 0.3051, 'grad_norm': 0.603352427482605, 'learning_rate': 4.3389830508474574e-05, 'epoch': 0.44}


 15%|█▍        | 4410/30000 [1:41:36<2:12:56,  3.21it/s]

{'loss': 0.2959, 'grad_norm': 0.612513542175293, 'learning_rate': 4.3372881355932203e-05, 'epoch': 0.44}


 15%|█▍        | 4420/30000 [1:41:39<2:13:01,  3.20it/s]

{'loss': 0.2943, 'grad_norm': 0.6462284922599792, 'learning_rate': 4.335593220338983e-05, 'epoch': 0.44}


 15%|█▍        | 4430/30000 [1:41:42<2:12:55,  3.21it/s]

{'loss': 0.3017, 'grad_norm': 0.9099350571632385, 'learning_rate': 4.333898305084746e-05, 'epoch': 0.44}


 15%|█▍        | 4440/30000 [1:41:45<2:12:54,  3.21it/s]

{'loss': 0.2915, 'grad_norm': 0.5531995892524719, 'learning_rate': 4.3322033898305085e-05, 'epoch': 0.44}


 15%|█▍        | 4450/30000 [1:41:48<2:12:48,  3.21it/s]

{'loss': 0.2899, 'grad_norm': 0.547208309173584, 'learning_rate': 4.3305084745762714e-05, 'epoch': 0.45}


 15%|█▍        | 4460/30000 [1:41:51<2:12:47,  3.21it/s]

{'loss': 0.3014, 'grad_norm': 0.694875180721283, 'learning_rate': 4.3288135593220343e-05, 'epoch': 0.45}


 15%|█▍        | 4470/30000 [1:41:55<2:12:34,  3.21it/s]

{'loss': 0.2841, 'grad_norm': 0.6519914269447327, 'learning_rate': 4.3271186440677966e-05, 'epoch': 0.45}


 15%|█▍        | 4480/30000 [1:41:58<2:12:39,  3.21it/s]

{'loss': 0.2856, 'grad_norm': 0.6049374341964722, 'learning_rate': 4.3254237288135595e-05, 'epoch': 0.45}


 15%|█▍        | 4490/30000 [1:42:01<2:12:32,  3.21it/s]

{'loss': 0.3075, 'grad_norm': 0.7155234217643738, 'learning_rate': 4.3237288135593225e-05, 'epoch': 0.45}


 15%|█▌        | 4500/30000 [1:42:04<2:12:39,  3.20it/s]

{'loss': 0.2744, 'grad_norm': 0.652427077293396, 'learning_rate': 4.3220338983050854e-05, 'epoch': 0.45}


 15%|█▌        | 4510/30000 [1:42:07<2:12:30,  3.21it/s]

{'loss': 0.279, 'grad_norm': 0.6613799333572388, 'learning_rate': 4.3203389830508477e-05, 'epoch': 0.45}


 15%|█▌        | 4520/30000 [1:42:10<2:12:25,  3.21it/s]

{'loss': 0.3018, 'grad_norm': 0.5982864499092102, 'learning_rate': 4.3186440677966106e-05, 'epoch': 0.45}


 15%|█▌        | 4530/30000 [1:42:13<2:12:24,  3.21it/s]

{'loss': 0.2922, 'grad_norm': 0.7038689255714417, 'learning_rate': 4.3169491525423735e-05, 'epoch': 0.45}


 15%|█▌        | 4540/30000 [1:42:16<2:12:18,  3.21it/s]

{'loss': 0.279, 'grad_norm': 0.7758260369300842, 'learning_rate': 4.315254237288136e-05, 'epoch': 0.45}


 15%|█▌        | 4550/30000 [1:42:20<2:12:22,  3.20it/s]

{'loss': 0.2736, 'grad_norm': 0.5846870541572571, 'learning_rate': 4.313559322033899e-05, 'epoch': 0.46}


 15%|█▌        | 4560/30000 [1:42:23<2:12:17,  3.20it/s]

{'loss': 0.2834, 'grad_norm': 0.6674789190292358, 'learning_rate': 4.311864406779661e-05, 'epoch': 0.46}


 15%|█▌        | 4570/30000 [1:42:26<2:12:04,  3.21it/s]

{'loss': 0.2718, 'grad_norm': 0.7452945113182068, 'learning_rate': 4.310169491525424e-05, 'epoch': 0.46}


 15%|█▌        | 4580/30000 [1:42:29<2:12:01,  3.21it/s]

{'loss': 0.2995, 'grad_norm': 0.6834344267845154, 'learning_rate': 4.308474576271186e-05, 'epoch': 0.46}


 15%|█▌        | 4590/30000 [1:42:32<2:12:01,  3.21it/s]

{'loss': 0.2727, 'grad_norm': 0.5574472546577454, 'learning_rate': 4.306779661016949e-05, 'epoch': 0.46}


 15%|█▌        | 4600/30000 [1:42:35<2:12:08,  3.20it/s]

{'loss': 0.2829, 'grad_norm': 0.6871572136878967, 'learning_rate': 4.305084745762712e-05, 'epoch': 0.46}


 15%|█▌        | 4610/30000 [1:42:38<2:12:01,  3.21it/s]

{'loss': 0.2772, 'grad_norm': 0.4914412796497345, 'learning_rate': 4.303389830508475e-05, 'epoch': 0.46}


 15%|█▌        | 4620/30000 [1:42:41<2:12:03,  3.20it/s]

{'loss': 0.2976, 'grad_norm': 0.7423998713493347, 'learning_rate': 4.301694915254237e-05, 'epoch': 0.46}


 15%|█▌        | 4630/30000 [1:42:45<2:11:50,  3.21it/s]

{'loss': 0.2903, 'grad_norm': 0.6706809401512146, 'learning_rate': 4.3e-05, 'epoch': 0.46}


 15%|█▌        | 4640/30000 [1:42:48<2:11:45,  3.21it/s]

{'loss': 0.2842, 'grad_norm': 0.5318421721458435, 'learning_rate': 4.298305084745763e-05, 'epoch': 0.46}


 16%|█▌        | 4650/30000 [1:42:51<2:11:49,  3.20it/s]

{'loss': 0.2898, 'grad_norm': 0.638216495513916, 'learning_rate': 4.2966101694915254e-05, 'epoch': 0.47}


 16%|█▌        | 4660/30000 [1:42:54<2:11:45,  3.21it/s]

{'loss': 0.2749, 'grad_norm': 0.9176488518714905, 'learning_rate': 4.294915254237288e-05, 'epoch': 0.47}


 16%|█▌        | 4670/30000 [1:42:57<2:11:39,  3.21it/s]

{'loss': 0.3138, 'grad_norm': 0.5972967147827148, 'learning_rate': 4.293220338983051e-05, 'epoch': 0.47}


 16%|█▌        | 4680/30000 [1:43:00<2:11:35,  3.21it/s]

{'loss': 0.2722, 'grad_norm': 0.6253763437271118, 'learning_rate': 4.291525423728814e-05, 'epoch': 0.47}


 16%|█▌        | 4690/30000 [1:43:03<2:11:35,  3.21it/s]

{'loss': 0.2908, 'grad_norm': 0.752830982208252, 'learning_rate': 4.2898305084745764e-05, 'epoch': 0.47}


 16%|█▌        | 4700/30000 [1:43:06<2:11:28,  3.21it/s]

{'loss': 0.2826, 'grad_norm': 0.7489004135131836, 'learning_rate': 4.2881355932203394e-05, 'epoch': 0.47}


 16%|█▌        | 4710/30000 [1:43:10<2:11:58,  3.19it/s]

{'loss': 0.2866, 'grad_norm': 0.8288213014602661, 'learning_rate': 4.286440677966102e-05, 'epoch': 0.47}


 16%|█▌        | 4720/30000 [1:43:13<2:11:51,  3.20it/s]

{'loss': 0.2883, 'grad_norm': 1.0432730913162231, 'learning_rate': 4.2847457627118645e-05, 'epoch': 0.47}


 16%|█▌        | 4730/30000 [1:43:16<2:11:22,  3.21it/s]

{'loss': 0.2752, 'grad_norm': 0.8808561563491821, 'learning_rate': 4.2830508474576275e-05, 'epoch': 0.47}


 16%|█▌        | 4740/30000 [1:43:19<2:11:24,  3.20it/s]

{'loss': 0.2678, 'grad_norm': 0.7220816016197205, 'learning_rate': 4.2813559322033904e-05, 'epoch': 0.47}


 16%|█▌        | 4750/30000 [1:43:22<2:11:15,  3.21it/s]

{'loss': 0.2802, 'grad_norm': 0.6146757006645203, 'learning_rate': 4.279661016949153e-05, 'epoch': 0.47}


 16%|█▌        | 4760/30000 [1:43:25<2:11:09,  3.21it/s]

{'loss': 0.2914, 'grad_norm': 0.7112389802932739, 'learning_rate': 4.277966101694915e-05, 'epoch': 0.48}


 16%|█▌        | 4770/30000 [1:43:28<2:11:49,  3.19it/s]

{'loss': 0.3067, 'grad_norm': 0.6461759209632874, 'learning_rate': 4.276271186440678e-05, 'epoch': 0.48}


 16%|█▌        | 4780/30000 [1:43:31<2:11:35,  3.19it/s]

{'loss': 0.2823, 'grad_norm': 0.6710173487663269, 'learning_rate': 4.274576271186441e-05, 'epoch': 0.48}


 16%|█▌        | 4790/30000 [1:43:35<2:11:12,  3.20it/s]

{'loss': 0.2872, 'grad_norm': 0.6261210441589355, 'learning_rate': 4.272881355932204e-05, 'epoch': 0.48}


 16%|█▌        | 4800/30000 [1:43:38<2:11:04,  3.20it/s]

{'loss': 0.2659, 'grad_norm': 0.6817012429237366, 'learning_rate': 4.271186440677966e-05, 'epoch': 0.48}


 16%|█▌        | 4810/30000 [1:43:41<2:11:00,  3.20it/s]

{'loss': 0.2876, 'grad_norm': 1.0594570636749268, 'learning_rate': 4.269491525423729e-05, 'epoch': 0.48}


 16%|█▌        | 4820/30000 [1:43:44<2:10:57,  3.20it/s]

{'loss': 0.2886, 'grad_norm': 0.6318069100379944, 'learning_rate': 4.267796610169492e-05, 'epoch': 0.48}


 16%|█▌        | 4830/30000 [1:43:47<2:10:35,  3.21it/s]

{'loss': 0.2897, 'grad_norm': 0.6661199331283569, 'learning_rate': 4.266101694915254e-05, 'epoch': 0.48}


 16%|█▌        | 4840/30000 [1:43:50<2:10:25,  3.22it/s]

{'loss': 0.276, 'grad_norm': 0.6402759552001953, 'learning_rate': 4.264406779661017e-05, 'epoch': 0.48}


 16%|█▌        | 4850/30000 [1:43:53<2:10:26,  3.21it/s]

{'loss': 0.2925, 'grad_norm': 0.7013927698135376, 'learning_rate': 4.26271186440678e-05, 'epoch': 0.48}


 16%|█▌        | 4860/30000 [1:43:56<2:10:41,  3.21it/s]

{'loss': 0.2884, 'grad_norm': 0.9156714081764221, 'learning_rate': 4.261016949152542e-05, 'epoch': 0.49}


 16%|█▌        | 4870/30000 [1:44:00<2:10:37,  3.21it/s]

{'loss': 0.2841, 'grad_norm': 0.6620802879333496, 'learning_rate': 4.259322033898305e-05, 'epoch': 0.49}


 16%|█▋        | 4880/30000 [1:44:03<2:10:31,  3.21it/s]

{'loss': 0.2989, 'grad_norm': 0.6059100031852722, 'learning_rate': 4.257627118644068e-05, 'epoch': 0.49}


 16%|█▋        | 4890/30000 [1:44:06<2:10:27,  3.21it/s]

{'loss': 0.2996, 'grad_norm': 0.627979576587677, 'learning_rate': 4.255932203389831e-05, 'epoch': 0.49}


 16%|█▋        | 4900/30000 [1:44:09<2:10:18,  3.21it/s]

{'loss': 0.291, 'grad_norm': 0.5713611245155334, 'learning_rate': 4.254237288135593e-05, 'epoch': 0.49}


 16%|█▋        | 4910/30000 [1:44:12<2:10:21,  3.21it/s]

{'loss': 0.2809, 'grad_norm': 0.7863122820854187, 'learning_rate': 4.252542372881356e-05, 'epoch': 0.49}


 16%|█▋        | 4920/30000 [1:44:15<2:10:21,  3.21it/s]

{'loss': 0.2957, 'grad_norm': 0.7173184156417847, 'learning_rate': 4.250847457627119e-05, 'epoch': 0.49}


 16%|█▋        | 4930/30000 [1:44:18<2:10:23,  3.20it/s]

{'loss': 0.292, 'grad_norm': 0.6882187724113464, 'learning_rate': 4.2491525423728814e-05, 'epoch': 0.49}


 16%|█▋        | 4940/30000 [1:44:21<2:10:07,  3.21it/s]

{'loss': 0.2688, 'grad_norm': 0.5730521082878113, 'learning_rate': 4.2474576271186444e-05, 'epoch': 0.49}


 16%|█▋        | 4950/30000 [1:44:25<2:10:13,  3.21it/s]

{'loss': 0.2899, 'grad_norm': 0.8524004817008972, 'learning_rate': 4.245762711864407e-05, 'epoch': 0.49}


 17%|█▋        | 4960/30000 [1:44:28<2:10:04,  3.21it/s]

{'loss': 0.2931, 'grad_norm': 0.7970860004425049, 'learning_rate': 4.24406779661017e-05, 'epoch': 0.5}


 17%|█▋        | 4970/30000 [1:44:31<2:10:00,  3.21it/s]

{'loss': 0.3009, 'grad_norm': 0.6563714742660522, 'learning_rate': 4.242372881355932e-05, 'epoch': 0.5}


 17%|█▋        | 4980/30000 [1:44:34<2:09:52,  3.21it/s]

{'loss': 0.2885, 'grad_norm': 0.686176598072052, 'learning_rate': 4.240677966101695e-05, 'epoch': 0.5}


 17%|█▋        | 4990/30000 [1:44:37<2:10:00,  3.21it/s]

{'loss': 0.2764, 'grad_norm': 0.6247406601905823, 'learning_rate': 4.238983050847458e-05, 'epoch': 0.5}


 17%|█▋        | 5000/30000 [1:44:40<2:09:45,  3.21it/s]

{'loss': 0.263, 'grad_norm': 0.7860041856765747, 'learning_rate': 4.2372881355932206e-05, 'epoch': 0.5}


 17%|█▋        | 5010/30000 [1:44:43<2:09:50,  3.21it/s]

{'loss': 0.2743, 'grad_norm': 0.5972041487693787, 'learning_rate': 4.235593220338983e-05, 'epoch': 0.5}


 17%|█▋        | 5020/30000 [1:44:46<2:09:40,  3.21it/s]

{'loss': 0.2828, 'grad_norm': 0.5497397184371948, 'learning_rate': 4.233898305084746e-05, 'epoch': 0.5}


 17%|█▋        | 5030/30000 [1:44:49<2:09:38,  3.21it/s]

{'loss': 0.2732, 'grad_norm': 0.5465169548988342, 'learning_rate': 4.232203389830509e-05, 'epoch': 0.5}


 17%|█▋        | 5040/30000 [1:44:53<2:09:45,  3.21it/s]

{'loss': 0.2973, 'grad_norm': 0.6113858819007874, 'learning_rate': 4.230508474576271e-05, 'epoch': 0.5}


 17%|█▋        | 5050/30000 [1:44:56<2:09:43,  3.21it/s]

{'loss': 0.2911, 'grad_norm': 0.5601432919502258, 'learning_rate': 4.228813559322034e-05, 'epoch': 0.51}


 17%|█▋        | 5060/30000 [1:44:59<2:09:39,  3.21it/s]

{'loss': 0.2992, 'grad_norm': 0.6641830801963806, 'learning_rate': 4.227118644067797e-05, 'epoch': 0.51}


 17%|█▋        | 5070/30000 [1:45:02<2:09:34,  3.21it/s]

{'loss': 0.2855, 'grad_norm': 0.5951101779937744, 'learning_rate': 4.22542372881356e-05, 'epoch': 0.51}


 17%|█▋        | 5080/30000 [1:45:05<2:09:39,  3.20it/s]

{'loss': 0.2861, 'grad_norm': 0.7895799279212952, 'learning_rate': 4.223728813559322e-05, 'epoch': 0.51}


 17%|█▋        | 5090/30000 [1:45:08<2:09:23,  3.21it/s]

{'loss': 0.2929, 'grad_norm': 0.7873754501342773, 'learning_rate': 4.222033898305085e-05, 'epoch': 0.51}


 17%|█▋        | 5100/30000 [1:45:11<2:09:29,  3.20it/s]

{'loss': 0.2778, 'grad_norm': 0.5839818716049194, 'learning_rate': 4.220338983050848e-05, 'epoch': 0.51}


 17%|█▋        | 5110/30000 [1:45:14<2:09:21,  3.21it/s]

{'loss': 0.2959, 'grad_norm': 0.5607213377952576, 'learning_rate': 4.21864406779661e-05, 'epoch': 0.51}


 17%|█▋        | 5120/30000 [1:45:18<2:09:20,  3.21it/s]

{'loss': 0.2946, 'grad_norm': 0.5849746465682983, 'learning_rate': 4.216949152542373e-05, 'epoch': 0.51}


 17%|█▋        | 5130/30000 [1:45:21<2:09:15,  3.21it/s]

{'loss': 0.2841, 'grad_norm': 0.6462767720222473, 'learning_rate': 4.215254237288136e-05, 'epoch': 0.51}


 17%|█▋        | 5140/30000 [1:45:24<2:09:02,  3.21it/s]

{'loss': 0.2792, 'grad_norm': 1.026129961013794, 'learning_rate': 4.213559322033899e-05, 'epoch': 0.51}


 17%|█▋        | 5150/30000 [1:45:27<2:09:13,  3.20it/s]

{'loss': 0.2865, 'grad_norm': 0.6862741112709045, 'learning_rate': 4.211864406779661e-05, 'epoch': 0.52}


 17%|█▋        | 5160/30000 [1:45:30<2:08:59,  3.21it/s]

{'loss': 0.2725, 'grad_norm': 0.5383834838867188, 'learning_rate': 4.210169491525424e-05, 'epoch': 0.52}


 17%|█▋        | 5170/30000 [1:45:33<2:08:49,  3.21it/s]

{'loss': 0.2728, 'grad_norm': 0.543906033039093, 'learning_rate': 4.208474576271187e-05, 'epoch': 0.52}


 17%|█▋        | 5180/30000 [1:45:36<2:09:05,  3.20it/s]

{'loss': 0.2834, 'grad_norm': 0.5373082756996155, 'learning_rate': 4.2067796610169494e-05, 'epoch': 0.52}


 17%|█▋        | 5190/30000 [1:45:39<2:08:56,  3.21it/s]

{'loss': 0.2928, 'grad_norm': 0.6329408884048462, 'learning_rate': 4.2050847457627116e-05, 'epoch': 0.52}


 17%|█▋        | 5200/30000 [1:45:43<2:08:54,  3.21it/s]

{'loss': 0.2966, 'grad_norm': 0.5718880295753479, 'learning_rate': 4.2033898305084746e-05, 'epoch': 0.52}


 17%|█▋        | 5210/30000 [1:45:46<2:08:45,  3.21it/s]

{'loss': 0.2813, 'grad_norm': 0.6787216663360596, 'learning_rate': 4.2016949152542375e-05, 'epoch': 0.52}


 17%|█▋        | 5220/30000 [1:45:49<2:09:00,  3.20it/s]

{'loss': 0.281, 'grad_norm': 0.656615674495697, 'learning_rate': 4.2e-05, 'epoch': 0.52}


 17%|█▋        | 5230/30000 [1:45:52<2:08:40,  3.21it/s]

{'loss': 0.2816, 'grad_norm': 0.5574270486831665, 'learning_rate': 4.198305084745763e-05, 'epoch': 0.52}


 17%|█▋        | 5240/30000 [1:45:55<2:08:39,  3.21it/s]

{'loss': 0.3011, 'grad_norm': 0.5432788729667664, 'learning_rate': 4.1966101694915256e-05, 'epoch': 0.52}


 18%|█▊        | 5250/30000 [1:45:58<2:08:43,  3.20it/s]

{'loss': 0.2938, 'grad_norm': 0.6063789129257202, 'learning_rate': 4.1949152542372886e-05, 'epoch': 0.53}


 18%|█▊        | 5260/30000 [1:46:01<2:08:35,  3.21it/s]

{'loss': 0.2806, 'grad_norm': 0.7483961582183838, 'learning_rate': 4.193220338983051e-05, 'epoch': 0.53}


 18%|█▊        | 5270/30000 [1:46:04<2:08:29,  3.21it/s]

{'loss': 0.275, 'grad_norm': 0.6928058862686157, 'learning_rate': 4.191525423728814e-05, 'epoch': 0.53}


 18%|█▊        | 5280/30000 [1:46:07<2:08:15,  3.21it/s]

{'loss': 0.2778, 'grad_norm': 0.7306016087532043, 'learning_rate': 4.189830508474577e-05, 'epoch': 0.53}


 18%|█▊        | 5290/30000 [1:46:11<2:08:18,  3.21it/s]

{'loss': 0.298, 'grad_norm': 0.9359308481216431, 'learning_rate': 4.188135593220339e-05, 'epoch': 0.53}


 18%|█▊        | 5300/30000 [1:46:14<2:08:19,  3.21it/s]

{'loss': 0.2837, 'grad_norm': 0.63205885887146, 'learning_rate': 4.186440677966102e-05, 'epoch': 0.53}


 18%|█▊        | 5310/30000 [1:46:17<2:08:05,  3.21it/s]

{'loss': 0.2784, 'grad_norm': 0.6701255440711975, 'learning_rate': 4.184745762711865e-05, 'epoch': 0.53}


 18%|█▊        | 5320/30000 [1:46:20<2:08:18,  3.21it/s]

{'loss': 0.2827, 'grad_norm': 0.6533389091491699, 'learning_rate': 4.183050847457628e-05, 'epoch': 0.53}


 18%|█▊        | 5330/30000 [1:46:23<2:08:16,  3.21it/s]

{'loss': 0.2811, 'grad_norm': 0.6398295164108276, 'learning_rate': 4.18135593220339e-05, 'epoch': 0.53}


 18%|█▊        | 5340/30000 [1:46:26<2:07:59,  3.21it/s]

{'loss': 0.2865, 'grad_norm': 0.6693788766860962, 'learning_rate': 4.179661016949153e-05, 'epoch': 0.53}


 18%|█▊        | 5350/30000 [1:46:29<2:07:59,  3.21it/s]

{'loss': 0.2706, 'grad_norm': 0.6622362732887268, 'learning_rate': 4.177966101694916e-05, 'epoch': 0.54}


 18%|█▊        | 5360/30000 [1:46:32<2:08:07,  3.21it/s]

{'loss': 0.2869, 'grad_norm': 0.6586081385612488, 'learning_rate': 4.176271186440678e-05, 'epoch': 0.54}


 18%|█▊        | 5370/30000 [1:46:36<2:07:58,  3.21it/s]

{'loss': 0.2758, 'grad_norm': 0.6510668396949768, 'learning_rate': 4.174576271186441e-05, 'epoch': 0.54}


 18%|█▊        | 5380/30000 [1:46:39<2:07:57,  3.21it/s]

{'loss': 0.2774, 'grad_norm': 0.5718453526496887, 'learning_rate': 4.172881355932204e-05, 'epoch': 0.54}


 18%|█▊        | 5390/30000 [1:46:42<2:08:00,  3.20it/s]

{'loss': 0.2764, 'grad_norm': 0.5310908555984497, 'learning_rate': 4.171186440677966e-05, 'epoch': 0.54}


 18%|█▊        | 5400/30000 [1:46:45<2:07:55,  3.21it/s]

{'loss': 0.2617, 'grad_norm': 0.6803273558616638, 'learning_rate': 4.1694915254237285e-05, 'epoch': 0.54}


 18%|█▊        | 5410/30000 [1:46:48<2:07:50,  3.21it/s]

{'loss': 0.2677, 'grad_norm': 0.9825714230537415, 'learning_rate': 4.1677966101694915e-05, 'epoch': 0.54}


 18%|█▊        | 5420/30000 [1:46:51<2:07:39,  3.21it/s]

{'loss': 0.2915, 'grad_norm': 0.6068102121353149, 'learning_rate': 4.1661016949152544e-05, 'epoch': 0.54}


 18%|█▊        | 5430/30000 [1:46:54<2:07:50,  3.20it/s]

{'loss': 0.2786, 'grad_norm': 0.5758827328681946, 'learning_rate': 4.164406779661017e-05, 'epoch': 0.54}


 18%|█▊        | 5440/30000 [1:46:57<2:07:32,  3.21it/s]

{'loss': 0.2666, 'grad_norm': 0.5468906760215759, 'learning_rate': 4.1627118644067796e-05, 'epoch': 0.54}


 18%|█▊        | 5450/30000 [1:47:01<2:07:35,  3.21it/s]

{'loss': 0.2804, 'grad_norm': 0.5812966227531433, 'learning_rate': 4.1610169491525425e-05, 'epoch': 0.55}


 18%|█▊        | 5460/30000 [1:47:04<2:07:30,  3.21it/s]

{'loss': 0.3006, 'grad_norm': 0.6478279829025269, 'learning_rate': 4.1593220338983055e-05, 'epoch': 0.55}


 18%|█▊        | 5470/30000 [1:47:07<2:07:31,  3.21it/s]

{'loss': 0.2924, 'grad_norm': 0.590232789516449, 'learning_rate': 4.157627118644068e-05, 'epoch': 0.55}


 18%|█▊        | 5480/30000 [1:47:10<2:07:30,  3.21it/s]

{'loss': 0.288, 'grad_norm': 0.9402172565460205, 'learning_rate': 4.1559322033898307e-05, 'epoch': 0.55}


 18%|█▊        | 5490/30000 [1:47:13<2:07:18,  3.21it/s]

{'loss': 0.2828, 'grad_norm': 0.6055483818054199, 'learning_rate': 4.1542372881355936e-05, 'epoch': 0.55}


 18%|█▊        | 5500/30000 [1:47:16<2:07:03,  3.21it/s]

{'loss': 0.278, 'grad_norm': 0.6174171566963196, 'learning_rate': 4.152542372881356e-05, 'epoch': 0.55}


 18%|█▊        | 5510/30000 [1:47:19<2:05:18,  3.26it/s]

{'loss': 0.2693, 'grad_norm': 1.0318808555603027, 'learning_rate': 4.150847457627119e-05, 'epoch': 0.55}


 18%|█▊        | 5520/30000 [1:47:22<2:05:43,  3.25it/s]

{'loss': 0.2769, 'grad_norm': 0.549939751625061, 'learning_rate': 4.149152542372882e-05, 'epoch': 0.55}


 18%|█▊        | 5530/30000 [1:47:25<2:05:18,  3.25it/s]

{'loss': 0.2742, 'grad_norm': 0.7676630020141602, 'learning_rate': 4.1474576271186446e-05, 'epoch': 0.55}


 18%|█▊        | 5540/30000 [1:47:28<2:04:54,  3.26it/s]

{'loss': 0.2901, 'grad_norm': 0.8085494041442871, 'learning_rate': 4.145762711864407e-05, 'epoch': 0.55}


 18%|█▊        | 5550/30000 [1:47:32<2:04:52,  3.26it/s]

{'loss': 0.2806, 'grad_norm': 0.5759931802749634, 'learning_rate': 4.14406779661017e-05, 'epoch': 0.56}


 19%|█▊        | 5560/30000 [1:47:35<2:04:46,  3.26it/s]

{'loss': 0.2662, 'grad_norm': 0.6267228126525879, 'learning_rate': 4.142372881355933e-05, 'epoch': 0.56}


 19%|█▊        | 5570/30000 [1:47:38<2:04:57,  3.26it/s]

{'loss': 0.2789, 'grad_norm': 0.522432804107666, 'learning_rate': 4.140677966101695e-05, 'epoch': 0.56}


 19%|█▊        | 5580/30000 [1:47:41<2:04:46,  3.26it/s]

{'loss': 0.2877, 'grad_norm': 0.6660064458847046, 'learning_rate': 4.138983050847458e-05, 'epoch': 0.56}


 19%|█▊        | 5590/30000 [1:47:44<2:04:41,  3.26it/s]

{'loss': 0.2631, 'grad_norm': 0.6452409029006958, 'learning_rate': 4.13728813559322e-05, 'epoch': 0.56}


 19%|█▊        | 5600/30000 [1:47:47<2:04:41,  3.26it/s]

{'loss': 0.2792, 'grad_norm': 0.5974439978599548, 'learning_rate': 4.135593220338983e-05, 'epoch': 0.56}


 19%|█▊        | 5610/30000 [1:47:50<2:04:47,  3.26it/s]

{'loss': 0.2763, 'grad_norm': 0.5443986058235168, 'learning_rate': 4.1338983050847454e-05, 'epoch': 0.56}


 19%|█▊        | 5620/30000 [1:47:53<2:04:33,  3.26it/s]

{'loss': 0.2849, 'grad_norm': 0.776688814163208, 'learning_rate': 4.1322033898305084e-05, 'epoch': 0.56}


 19%|█▉        | 5630/30000 [1:47:56<2:04:32,  3.26it/s]

{'loss': 0.2755, 'grad_norm': 0.6445805430412292, 'learning_rate': 4.130508474576271e-05, 'epoch': 0.56}


 19%|█▉        | 5640/30000 [1:47:59<2:04:25,  3.26it/s]

{'loss': 0.2705, 'grad_norm': 0.6272467970848083, 'learning_rate': 4.128813559322034e-05, 'epoch': 0.56}


 19%|█▉        | 5650/30000 [1:48:02<2:04:23,  3.26it/s]

{'loss': 0.2804, 'grad_norm': 0.5241942405700684, 'learning_rate': 4.1271186440677965e-05, 'epoch': 0.56}


 19%|█▉        | 5660/30000 [1:48:05<2:04:18,  3.26it/s]

{'loss': 0.2685, 'grad_norm': 0.5640804767608643, 'learning_rate': 4.1254237288135594e-05, 'epoch': 0.57}


 19%|█▉        | 5670/30000 [1:48:08<2:04:34,  3.25it/s]

{'loss': 0.2786, 'grad_norm': 0.6777458786964417, 'learning_rate': 4.1237288135593223e-05, 'epoch': 0.57}


 19%|█▉        | 5680/30000 [1:48:11<2:04:30,  3.26it/s]

{'loss': 0.2719, 'grad_norm': 0.5719543695449829, 'learning_rate': 4.1220338983050846e-05, 'epoch': 0.57}


 19%|█▉        | 5690/30000 [1:48:14<2:04:08,  3.26it/s]

{'loss': 0.277, 'grad_norm': 0.5060262084007263, 'learning_rate': 4.1203389830508475e-05, 'epoch': 0.57}


 19%|█▉        | 5700/30000 [1:48:18<2:04:08,  3.26it/s]

{'loss': 0.2816, 'grad_norm': 0.5547688007354736, 'learning_rate': 4.1186440677966105e-05, 'epoch': 0.57}


 19%|█▉        | 5710/30000 [1:48:21<2:04:14,  3.26it/s]

{'loss': 0.2922, 'grad_norm': 0.5895429253578186, 'learning_rate': 4.1169491525423734e-05, 'epoch': 0.57}


 19%|█▉        | 5720/30000 [1:48:24<2:04:06,  3.26it/s]

{'loss': 0.2838, 'grad_norm': 0.5174298286437988, 'learning_rate': 4.115254237288136e-05, 'epoch': 0.57}


 19%|█▉        | 5730/30000 [1:48:27<2:04:01,  3.26it/s]

{'loss': 0.2682, 'grad_norm': 0.6357223987579346, 'learning_rate': 4.1135593220338986e-05, 'epoch': 0.57}


 19%|█▉        | 5740/30000 [1:48:30<2:03:49,  3.27it/s]

{'loss': 0.2652, 'grad_norm': 0.5756484270095825, 'learning_rate': 4.1118644067796615e-05, 'epoch': 0.57}


 19%|█▉        | 5750/30000 [1:48:33<2:03:52,  3.26it/s]

{'loss': 0.2657, 'grad_norm': 0.6492198705673218, 'learning_rate': 4.110169491525424e-05, 'epoch': 0.57}


 19%|█▉        | 5760/30000 [1:48:36<2:03:58,  3.26it/s]

{'loss': 0.2865, 'grad_norm': 0.6777011156082153, 'learning_rate': 4.108474576271187e-05, 'epoch': 0.58}


 19%|█▉        | 5770/30000 [1:48:39<2:03:51,  3.26it/s]

{'loss': 0.289, 'grad_norm': 0.5815180540084839, 'learning_rate': 4.10677966101695e-05, 'epoch': 0.58}


 19%|█▉        | 5780/30000 [1:48:42<2:04:11,  3.25it/s]

{'loss': 0.279, 'grad_norm': 0.602249264717102, 'learning_rate': 4.1050847457627126e-05, 'epoch': 0.58}


 19%|█▉        | 5790/30000 [1:48:45<2:03:56,  3.26it/s]

{'loss': 0.2788, 'grad_norm': 2.677976369857788, 'learning_rate': 4.103389830508475e-05, 'epoch': 0.58}


 19%|█▉        | 5800/30000 [1:48:48<2:03:48,  3.26it/s]

{'loss': 0.2632, 'grad_norm': 0.661588728427887, 'learning_rate': 4.101694915254237e-05, 'epoch': 0.58}


 19%|█▉        | 5810/30000 [1:48:51<2:03:35,  3.26it/s]

{'loss': 0.2772, 'grad_norm': 0.6164467334747314, 'learning_rate': 4.1e-05, 'epoch': 0.58}


 19%|█▉        | 5820/30000 [1:48:54<2:05:42,  3.21it/s]

{'loss': 0.2761, 'grad_norm': 0.5477871298789978, 'learning_rate': 4.098305084745763e-05, 'epoch': 0.58}


 19%|█▉        | 5830/30000 [1:48:58<2:05:31,  3.21it/s]

{'loss': 0.2708, 'grad_norm': 0.5774955153465271, 'learning_rate': 4.096610169491525e-05, 'epoch': 0.58}


 19%|█▉        | 5840/30000 [1:49:29<15:58:54,  2.38s/it]

{'loss': 0.2649, 'grad_norm': 0.8352431654930115, 'learning_rate': 4.094915254237288e-05, 'epoch': 0.58}


 20%|█▉        | 5850/30000 [1:49:33<2:26:51,  2.74it/s] 

{'loss': 0.2673, 'grad_norm': 0.5508883595466614, 'learning_rate': 4.093220338983051e-05, 'epoch': 0.58}


 20%|█▉        | 5860/30000 [1:49:36<2:37:41,  2.55it/s]

{'loss': 0.2756, 'grad_norm': 0.838259756565094, 'learning_rate': 4.0915254237288134e-05, 'epoch': 0.59}


 20%|█▉        | 5870/30000 [1:49:41<2:16:03,  2.96it/s]

{'loss': 0.2679, 'grad_norm': 0.4873601794242859, 'learning_rate': 4.089830508474576e-05, 'epoch': 0.59}


 20%|█▉        | 5880/30000 [1:49:44<2:03:02,  3.27it/s]

{'loss': 0.2651, 'grad_norm': 0.6088406443595886, 'learning_rate': 4.088135593220339e-05, 'epoch': 0.59}


 20%|█▉        | 5890/30000 [1:49:47<2:02:46,  3.27it/s]

{'loss': 0.2801, 'grad_norm': 0.6650732755661011, 'learning_rate': 4.086440677966102e-05, 'epoch': 0.59}


 20%|█▉        | 5900/30000 [1:49:50<2:02:43,  3.27it/s]

{'loss': 0.2774, 'grad_norm': 0.5685262680053711, 'learning_rate': 4.0847457627118644e-05, 'epoch': 0.59}


 20%|█▉        | 5910/30000 [1:49:53<2:02:38,  3.27it/s]

{'loss': 0.2875, 'grad_norm': 0.5480454564094543, 'learning_rate': 4.0830508474576274e-05, 'epoch': 0.59}


 20%|█▉        | 5920/30000 [1:49:56<2:02:32,  3.28it/s]

{'loss': 0.2856, 'grad_norm': 0.6079798936843872, 'learning_rate': 4.08135593220339e-05, 'epoch': 0.59}


 20%|█▉        | 5930/30000 [1:49:59<2:02:21,  3.28it/s]

{'loss': 0.2669, 'grad_norm': 0.489536315202713, 'learning_rate': 4.0796610169491526e-05, 'epoch': 0.59}


 20%|█▉        | 5940/30000 [1:50:02<2:02:21,  3.28it/s]

{'loss': 0.2948, 'grad_norm': 0.6557697057723999, 'learning_rate': 4.0779661016949155e-05, 'epoch': 0.59}


 20%|█▉        | 5950/30000 [1:50:05<2:02:23,  3.28it/s]

{'loss': 0.283, 'grad_norm': 0.5537935495376587, 'learning_rate': 4.0762711864406784e-05, 'epoch': 0.59}


 20%|█▉        | 5960/30000 [1:50:08<2:02:29,  3.27it/s]

{'loss': 0.2667, 'grad_norm': 0.6743266582489014, 'learning_rate': 4.0745762711864414e-05, 'epoch': 0.6}


 20%|█▉        | 5970/30000 [1:50:11<2:02:20,  3.27it/s]

{'loss': 0.2923, 'grad_norm': 0.6542189121246338, 'learning_rate': 4.0728813559322036e-05, 'epoch': 0.6}


 20%|█▉        | 5980/30000 [1:50:14<2:02:26,  3.27it/s]

{'loss': 0.2807, 'grad_norm': 0.5432649850845337, 'learning_rate': 4.0711864406779666e-05, 'epoch': 0.6}


 20%|█▉        | 5990/30000 [2:07:28<120:56:31, 18.13s/it]  

{'loss': 0.2703, 'grad_norm': 0.6074256896972656, 'learning_rate': 4.0694915254237295e-05, 'epoch': 0.6}


 20%|██        | 6000/30000 [2:07:31<5:23:29,  1.24it/s]  

{'loss': 0.2697, 'grad_norm': 0.6452338099479675, 'learning_rate': 4.067796610169492e-05, 'epoch': 0.6}


 20%|██        | 6010/30000 [2:07:34<2:08:25,  3.11it/s]

{'loss': 0.2742, 'grad_norm': 0.5793877840042114, 'learning_rate': 4.066101694915254e-05, 'epoch': 0.6}


 20%|██        | 6020/30000 [2:07:37<2:02:36,  3.26it/s]

{'loss': 0.28, 'grad_norm': 0.5082154273986816, 'learning_rate': 4.064406779661017e-05, 'epoch': 0.6}


 20%|██        | 6030/30000 [2:07:40<2:01:48,  3.28it/s]

{'loss': 0.2709, 'grad_norm': 0.695497989654541, 'learning_rate': 4.06271186440678e-05, 'epoch': 0.6}


 20%|██        | 6040/30000 [2:07:43<2:02:29,  3.26it/s]

{'loss': 0.2672, 'grad_norm': 0.6071168184280396, 'learning_rate': 4.061016949152542e-05, 'epoch': 0.6}


 20%|██        | 6050/30000 [2:07:46<2:02:05,  3.27it/s]

{'loss': 0.2679, 'grad_norm': 0.6964779496192932, 'learning_rate': 4.059322033898305e-05, 'epoch': 0.6}


 20%|██        | 6060/30000 [2:07:49<2:01:58,  3.27it/s]

{'loss': 0.266, 'grad_norm': 0.4739990830421448, 'learning_rate': 4.057627118644068e-05, 'epoch': 0.61}


 20%|██        | 6070/30000 [2:07:53<2:01:43,  3.28it/s]

{'loss': 0.2663, 'grad_norm': 0.511032223701477, 'learning_rate': 4.055932203389831e-05, 'epoch': 0.61}


 20%|██        | 6080/30000 [2:07:56<2:01:35,  3.28it/s]

{'loss': 0.2858, 'grad_norm': 0.7788791656494141, 'learning_rate': 4.054237288135593e-05, 'epoch': 0.61}


 20%|██        | 6090/30000 [2:07:59<2:01:32,  3.28it/s]

{'loss': 0.2715, 'grad_norm': 0.6360536813735962, 'learning_rate': 4.052542372881356e-05, 'epoch': 0.61}


 20%|██        | 6100/30000 [2:08:02<2:01:37,  3.28it/s]

{'loss': 0.2695, 'grad_norm': 0.6284618973731995, 'learning_rate': 4.050847457627119e-05, 'epoch': 0.61}


 20%|██        | 6110/30000 [2:08:05<2:01:27,  3.28it/s]

{'loss': 0.2906, 'grad_norm': 0.6025656461715698, 'learning_rate': 4.049152542372881e-05, 'epoch': 0.61}


 20%|██        | 6120/30000 [2:08:08<2:01:19,  3.28it/s]

{'loss': 0.277, 'grad_norm': 0.6190683841705322, 'learning_rate': 4.047457627118644e-05, 'epoch': 0.61}


 20%|██        | 6130/30000 [2:08:11<2:01:15,  3.28it/s]

{'loss': 0.2674, 'grad_norm': 0.6095278859138489, 'learning_rate': 4.045762711864407e-05, 'epoch': 0.61}


 20%|██        | 6140/30000 [2:08:14<2:01:07,  3.28it/s]

{'loss': 0.2677, 'grad_norm': 0.6219593286514282, 'learning_rate': 4.0440677966101694e-05, 'epoch': 0.61}


 20%|██        | 6150/30000 [2:08:17<2:01:18,  3.28it/s]

{'loss': 0.2708, 'grad_norm': 0.5933437943458557, 'learning_rate': 4.0423728813559324e-05, 'epoch': 0.61}


 21%|██        | 6160/30000 [2:08:20<2:01:27,  3.27it/s]

{'loss': 0.2748, 'grad_norm': 0.6577286124229431, 'learning_rate': 4.040677966101695e-05, 'epoch': 0.62}


 21%|██        | 6170/30000 [2:08:23<2:01:15,  3.28it/s]

{'loss': 0.2617, 'grad_norm': 1.893086552619934, 'learning_rate': 4.038983050847458e-05, 'epoch': 0.62}


 21%|██        | 6180/30000 [2:08:26<2:01:11,  3.28it/s]

{'loss': 0.2792, 'grad_norm': 0.9719403982162476, 'learning_rate': 4.0372881355932205e-05, 'epoch': 0.62}


 21%|██        | 6190/30000 [2:08:29<2:01:04,  3.28it/s]

{'loss': 0.2767, 'grad_norm': 0.643384575843811, 'learning_rate': 4.0355932203389834e-05, 'epoch': 0.62}


 21%|██        | 6200/30000 [2:08:32<2:01:22,  3.27it/s]

{'loss': 0.2815, 'grad_norm': 0.6569064259529114, 'learning_rate': 4.0338983050847464e-05, 'epoch': 0.62}


 21%|██        | 6210/30000 [2:08:35<2:01:19,  3.27it/s]

{'loss': 0.2705, 'grad_norm': 0.5760931968688965, 'learning_rate': 4.0322033898305086e-05, 'epoch': 0.62}


 21%|██        | 6220/30000 [2:25:14<333:34:10, 50.50s/it]  

{'loss': 0.2745, 'grad_norm': 0.6373935341835022, 'learning_rate': 4.030508474576271e-05, 'epoch': 0.62}


 21%|██        | 6230/30000 [2:25:17<11:22:11,  1.72s/it] 

{'loss': 0.2717, 'grad_norm': 0.5473300814628601, 'learning_rate': 4.028813559322034e-05, 'epoch': 0.62}


 21%|██        | 6240/30000 [2:25:20<2:16:23,  2.90it/s] 

{'loss': 0.27, 'grad_norm': 0.5850008726119995, 'learning_rate': 4.027118644067797e-05, 'epoch': 0.62}


 21%|██        | 6250/30000 [2:25:23<2:00:49,  3.28it/s]

{'loss': 0.2753, 'grad_norm': 0.6189756393432617, 'learning_rate': 4.025423728813559e-05, 'epoch': 0.62}


 21%|██        | 6260/30000 [2:25:26<2:00:22,  3.29it/s]

{'loss': 0.2669, 'grad_norm': 0.6692574620246887, 'learning_rate': 4.023728813559322e-05, 'epoch': 0.63}


 21%|██        | 6270/30000 [2:25:29<2:00:27,  3.28it/s]

{'loss': 0.286, 'grad_norm': 0.7367512583732605, 'learning_rate': 4.022033898305085e-05, 'epoch': 0.63}


 21%|██        | 6280/30000 [2:25:32<2:00:26,  3.28it/s]

{'loss': 0.2713, 'grad_norm': 0.7400110363960266, 'learning_rate': 4.020338983050848e-05, 'epoch': 0.63}


 21%|██        | 6290/30000 [2:25:35<2:00:23,  3.28it/s]

{'loss': 0.2975, 'grad_norm': 0.7515279054641724, 'learning_rate': 4.01864406779661e-05, 'epoch': 0.63}


 21%|██        | 6300/30000 [2:25:38<2:00:21,  3.28it/s]

{'loss': 0.2705, 'grad_norm': 0.6819774508476257, 'learning_rate': 4.016949152542373e-05, 'epoch': 0.63}


 21%|██        | 6310/30000 [2:25:41<2:00:29,  3.28it/s]

{'loss': 0.2724, 'grad_norm': 0.6554727554321289, 'learning_rate': 4.015254237288136e-05, 'epoch': 0.63}


 21%|██        | 6320/30000 [2:25:44<2:00:36,  3.27it/s]

{'loss': 0.2774, 'grad_norm': 0.5387411117553711, 'learning_rate': 4.013559322033898e-05, 'epoch': 0.63}


 21%|██        | 6330/30000 [2:25:47<2:00:36,  3.27it/s]

{'loss': 0.2675, 'grad_norm': 0.9414543509483337, 'learning_rate': 4.011864406779661e-05, 'epoch': 0.63}


 21%|██        | 6340/30000 [2:25:50<2:00:28,  3.27it/s]

{'loss': 0.2747, 'grad_norm': 0.572397768497467, 'learning_rate': 4.010169491525424e-05, 'epoch': 0.63}


 21%|██        | 6350/30000 [2:25:53<2:00:26,  3.27it/s]

{'loss': 0.2589, 'grad_norm': 0.9532087445259094, 'learning_rate': 4.008474576271187e-05, 'epoch': 0.64}


 21%|██        | 6360/30000 [2:25:57<2:00:25,  3.27it/s]

{'loss': 0.276, 'grad_norm': 0.7822365164756775, 'learning_rate': 4.006779661016949e-05, 'epoch': 0.64}


 21%|██        | 6370/30000 [2:26:00<2:00:16,  3.27it/s]

{'loss': 0.2842, 'grad_norm': 0.6061400175094604, 'learning_rate': 4.005084745762712e-05, 'epoch': 0.64}


 21%|██▏       | 6380/30000 [2:26:03<2:00:37,  3.26it/s]

{'loss': 0.2716, 'grad_norm': 0.5958137512207031, 'learning_rate': 4.003389830508475e-05, 'epoch': 0.64}


 21%|██▏       | 6390/30000 [2:26:06<2:00:15,  3.27it/s]

{'loss': 0.2552, 'grad_norm': 0.5979343056678772, 'learning_rate': 4.0016949152542374e-05, 'epoch': 0.64}


 21%|██▏       | 6400/30000 [2:26:09<2:00:22,  3.27it/s]

{'loss': 0.2978, 'grad_norm': 0.5017026662826538, 'learning_rate': 4e-05, 'epoch': 0.64}


 21%|██▏       | 6410/30000 [2:26:12<2:00:03,  3.27it/s]

{'loss': 0.2811, 'grad_norm': 0.6187149882316589, 'learning_rate': 3.998305084745763e-05, 'epoch': 0.64}


 21%|██▏       | 6420/30000 [2:26:15<2:00:03,  3.27it/s]

{'loss': 0.2768, 'grad_norm': 0.5916363596916199, 'learning_rate': 3.996610169491526e-05, 'epoch': 0.64}


 21%|██▏       | 6430/30000 [2:26:18<2:00:02,  3.27it/s]

{'loss': 0.2679, 'grad_norm': 0.5928835868835449, 'learning_rate': 3.994915254237288e-05, 'epoch': 0.64}


 21%|██▏       | 6440/30000 [2:26:21<1:59:59,  3.27it/s]

{'loss': 0.2719, 'grad_norm': 0.47531741857528687, 'learning_rate': 3.993220338983051e-05, 'epoch': 0.64}


 22%|██▏       | 6450/30000 [2:26:24<1:59:58,  3.27it/s]

{'loss': 0.2706, 'grad_norm': 0.6900959610939026, 'learning_rate': 3.9915254237288136e-05, 'epoch': 0.65}


 22%|██▏       | 6460/30000 [2:26:27<2:00:02,  3.27it/s]

{'loss': 0.274, 'grad_norm': 0.6420883536338806, 'learning_rate': 3.9898305084745766e-05, 'epoch': 0.65}


 22%|██▏       | 6470/30000 [2:26:30<1:59:55,  3.27it/s]

{'loss': 0.2709, 'grad_norm': 0.644903838634491, 'learning_rate': 3.988135593220339e-05, 'epoch': 0.65}


 22%|██▏       | 6480/30000 [2:26:33<1:59:41,  3.28it/s]

{'loss': 0.2724, 'grad_norm': 0.5661100745201111, 'learning_rate': 3.986440677966102e-05, 'epoch': 0.65}


 22%|██▏       | 6490/30000 [2:26:36<1:59:41,  3.27it/s]

{'loss': 0.2747, 'grad_norm': 0.5786612033843994, 'learning_rate': 3.984745762711865e-05, 'epoch': 0.65}


 22%|██▏       | 6500/30000 [2:26:39<1:59:36,  3.27it/s]

{'loss': 0.3052, 'grad_norm': 0.713962972164154, 'learning_rate': 3.983050847457627e-05, 'epoch': 0.65}


 22%|██▏       | 6510/30000 [2:26:42<1:59:39,  3.27it/s]

{'loss': 0.269, 'grad_norm': 0.5817779898643494, 'learning_rate': 3.98135593220339e-05, 'epoch': 0.65}


 22%|██▏       | 6520/30000 [2:26:45<1:59:28,  3.28it/s]

{'loss': 0.2671, 'grad_norm': 0.7207322120666504, 'learning_rate': 3.979661016949153e-05, 'epoch': 0.65}


 22%|██▏       | 6530/30000 [2:26:49<1:59:39,  3.27it/s]

{'loss': 0.2618, 'grad_norm': 0.5659072995185852, 'learning_rate': 3.977966101694916e-05, 'epoch': 0.65}


 22%|██▏       | 6540/30000 [2:26:52<1:59:27,  3.27it/s]

{'loss': 0.278, 'grad_norm': 0.5757046341896057, 'learning_rate': 3.976271186440678e-05, 'epoch': 0.65}


 22%|██▏       | 6550/30000 [2:26:55<1:59:32,  3.27it/s]

{'loss': 0.2815, 'grad_norm': 0.5782840847969055, 'learning_rate': 3.974576271186441e-05, 'epoch': 0.66}


 22%|██▏       | 6560/30000 [2:26:58<1:59:29,  3.27it/s]

{'loss': 0.2783, 'grad_norm': 0.6923466324806213, 'learning_rate': 3.972881355932204e-05, 'epoch': 0.66}


 22%|██▏       | 6570/30000 [2:27:01<1:59:18,  3.27it/s]

{'loss': 0.2758, 'grad_norm': 0.5419848561286926, 'learning_rate': 3.971186440677966e-05, 'epoch': 0.66}


 22%|██▏       | 6580/30000 [2:27:04<1:59:19,  3.27it/s]

{'loss': 0.2723, 'grad_norm': 0.5717641115188599, 'learning_rate': 3.969491525423729e-05, 'epoch': 0.66}


 22%|██▏       | 6590/30000 [2:27:07<1:59:14,  3.27it/s]

{'loss': 0.2581, 'grad_norm': 0.6543943285942078, 'learning_rate': 3.967796610169492e-05, 'epoch': 0.66}


 22%|██▏       | 6600/30000 [2:27:10<1:59:15,  3.27it/s]

{'loss': 0.2924, 'grad_norm': 0.7014393210411072, 'learning_rate': 3.966101694915255e-05, 'epoch': 0.66}


 22%|██▏       | 6610/30000 [2:27:13<1:59:13,  3.27it/s]

{'loss': 0.2524, 'grad_norm': 0.6216135025024414, 'learning_rate': 3.964406779661017e-05, 'epoch': 0.66}


 22%|██▏       | 6620/30000 [2:27:16<1:58:58,  3.28it/s]

{'loss': 0.2763, 'grad_norm': 0.5815132260322571, 'learning_rate': 3.96271186440678e-05, 'epoch': 0.66}


 22%|██▏       | 6630/30000 [2:27:19<1:59:32,  3.26it/s]

{'loss': 0.2692, 'grad_norm': 0.6567731499671936, 'learning_rate': 3.9610169491525424e-05, 'epoch': 0.66}


 22%|██▏       | 6640/30000 [2:27:22<2:01:58,  3.19it/s]

{'loss': 0.2607, 'grad_norm': 0.6005278825759888, 'learning_rate': 3.9593220338983053e-05, 'epoch': 0.66}


 22%|██▏       | 6650/30000 [2:27:25<2:04:19,  3.13it/s]

{'loss': 0.2652, 'grad_norm': 0.710435152053833, 'learning_rate': 3.9576271186440676e-05, 'epoch': 0.67}


 22%|██▏       | 6660/30000 [2:27:29<2:06:25,  3.08it/s]

{'loss': 0.2701, 'grad_norm': 0.7505931258201599, 'learning_rate': 3.9559322033898305e-05, 'epoch': 0.67}


 22%|██▏       | 6670/30000 [2:27:32<2:08:19,  3.03it/s]

{'loss': 0.275, 'grad_norm': 0.4885140657424927, 'learning_rate': 3.9542372881355935e-05, 'epoch': 0.67}


 22%|██▏       | 6680/30000 [2:27:35<2:10:10,  2.99it/s]

{'loss': 0.2789, 'grad_norm': 0.6899858713150024, 'learning_rate': 3.952542372881356e-05, 'epoch': 0.67}


 22%|██▏       | 6690/30000 [2:31:29<109:18:32, 16.88s/it]

{'loss': 0.2649, 'grad_norm': 0.5729796886444092, 'learning_rate': 3.9508474576271187e-05, 'epoch': 0.67}


 22%|██▏       | 6700/30000 [2:31:32<5:01:09,  1.29it/s]  

{'loss': 0.271, 'grad_norm': 0.7223866581916809, 'learning_rate': 3.9491525423728816e-05, 'epoch': 0.67}


 22%|██▏       | 6710/30000 [2:31:35<2:03:28,  3.14it/s]

{'loss': 0.2958, 'grad_norm': 0.7618289589881897, 'learning_rate': 3.9474576271186445e-05, 'epoch': 0.67}


 22%|██▏       | 6720/30000 [2:31:38<1:58:24,  3.28it/s]

{'loss': 0.2836, 'grad_norm': 0.5834408402442932, 'learning_rate': 3.945762711864407e-05, 'epoch': 0.67}


 22%|██▏       | 6730/30000 [2:31:41<1:58:21,  3.28it/s]

{'loss': 0.2626, 'grad_norm': 0.6893288493156433, 'learning_rate': 3.94406779661017e-05, 'epoch': 0.67}


 22%|██▏       | 6740/30000 [2:31:44<1:58:23,  3.27it/s]

{'loss': 0.276, 'grad_norm': 0.8353472352027893, 'learning_rate': 3.9423728813559327e-05, 'epoch': 0.67}


 22%|██▎       | 6750/30000 [2:31:47<1:58:44,  3.26it/s]

{'loss': 0.2576, 'grad_norm': 0.6012494564056396, 'learning_rate': 3.940677966101695e-05, 'epoch': 0.68}


 23%|██▎       | 6760/30000 [2:31:50<1:58:18,  3.27it/s]

{'loss': 0.2693, 'grad_norm': 0.5923000574111938, 'learning_rate': 3.938983050847458e-05, 'epoch': 0.68}


 23%|██▎       | 6770/30000 [2:31:53<1:58:19,  3.27it/s]

{'loss': 0.272, 'grad_norm': 0.874743640422821, 'learning_rate': 3.937288135593221e-05, 'epoch': 0.68}


 23%|██▎       | 6780/30000 [2:31:56<1:58:09,  3.28it/s]

{'loss': 0.2704, 'grad_norm': 1.236801266670227, 'learning_rate': 3.935593220338983e-05, 'epoch': 0.68}


 23%|██▎       | 6790/30000 [2:31:59<1:58:16,  3.27it/s]

{'loss': 0.2508, 'grad_norm': 0.6598697900772095, 'learning_rate': 3.933898305084746e-05, 'epoch': 0.68}


 23%|██▎       | 6800/30000 [2:32:03<1:58:08,  3.27it/s]

{'loss': 0.2715, 'grad_norm': 0.47968852519989014, 'learning_rate': 3.932203389830509e-05, 'epoch': 0.68}


 23%|██▎       | 6810/30000 [2:32:06<1:58:08,  3.27it/s]

{'loss': 0.2768, 'grad_norm': 0.6133090853691101, 'learning_rate': 3.930508474576272e-05, 'epoch': 0.68}


 23%|██▎       | 6820/30000 [2:32:09<1:58:09,  3.27it/s]

{'loss': 0.2622, 'grad_norm': 0.5619397759437561, 'learning_rate': 3.928813559322034e-05, 'epoch': 0.68}


 23%|██▎       | 6830/30000 [2:32:12<1:58:00,  3.27it/s]

{'loss': 0.2713, 'grad_norm': 0.5323675870895386, 'learning_rate': 3.927118644067797e-05, 'epoch': 0.68}


 23%|██▎       | 6840/30000 [2:32:15<1:58:03,  3.27it/s]

{'loss': 0.2799, 'grad_norm': 0.5657944083213806, 'learning_rate': 3.925423728813559e-05, 'epoch': 0.68}


 23%|██▎       | 6850/30000 [2:32:18<1:57:53,  3.27it/s]

{'loss': 0.2707, 'grad_norm': 0.5188151597976685, 'learning_rate': 3.923728813559322e-05, 'epoch': 0.69}


 23%|██▎       | 6860/30000 [2:32:21<1:57:50,  3.27it/s]

{'loss': 0.2923, 'grad_norm': 1.2612487077713013, 'learning_rate': 3.9220338983050845e-05, 'epoch': 0.69}


 23%|██▎       | 6870/30000 [2:32:24<1:57:50,  3.27it/s]

{'loss': 0.2594, 'grad_norm': 0.6025970578193665, 'learning_rate': 3.9203389830508474e-05, 'epoch': 0.69}


 23%|██▎       | 6880/30000 [2:32:27<1:57:44,  3.27it/s]

{'loss': 0.2602, 'grad_norm': 0.7520135045051575, 'learning_rate': 3.9186440677966104e-05, 'epoch': 0.69}


 23%|██▎       | 6890/30000 [2:32:30<1:57:42,  3.27it/s]

{'loss': 0.2762, 'grad_norm': 0.5480858683586121, 'learning_rate': 3.9169491525423726e-05, 'epoch': 0.69}


 23%|██▎       | 6900/30000 [2:32:33<1:57:38,  3.27it/s]

{'loss': 0.271, 'grad_norm': 0.5891125202178955, 'learning_rate': 3.9152542372881355e-05, 'epoch': 0.69}


 23%|██▎       | 6910/30000 [2:32:36<1:57:40,  3.27it/s]

{'loss': 0.2646, 'grad_norm': 0.5759999752044678, 'learning_rate': 3.9135593220338985e-05, 'epoch': 0.69}


 23%|██▎       | 6920/30000 [2:32:39<1:57:34,  3.27it/s]

{'loss': 0.271, 'grad_norm': 0.5810021162033081, 'learning_rate': 3.9118644067796614e-05, 'epoch': 0.69}


 23%|██▎       | 6930/30000 [2:32:42<1:57:32,  3.27it/s]

{'loss': 0.265, 'grad_norm': 0.6289494633674622, 'learning_rate': 3.910169491525424e-05, 'epoch': 0.69}


 23%|██▎       | 6940/30000 [2:32:45<1:57:31,  3.27it/s]

{'loss': 0.2549, 'grad_norm': 0.5402081608772278, 'learning_rate': 3.9084745762711866e-05, 'epoch': 0.69}


 23%|██▎       | 6950/30000 [2:32:48<1:57:27,  3.27it/s]

{'loss': 0.2771, 'grad_norm': 0.5928326845169067, 'learning_rate': 3.9067796610169495e-05, 'epoch': 0.69}


 23%|██▎       | 6960/30000 [2:32:51<1:57:16,  3.27it/s]

{'loss': 0.29, 'grad_norm': 0.5195896029472351, 'learning_rate': 3.905084745762712e-05, 'epoch': 0.7}


 23%|██▎       | 6970/30000 [2:32:55<1:57:17,  3.27it/s]

{'loss': 0.2802, 'grad_norm': 0.6282797455787659, 'learning_rate': 3.903389830508475e-05, 'epoch': 0.7}


 23%|██▎       | 6980/30000 [2:32:58<1:57:14,  3.27it/s]

{'loss': 0.2749, 'grad_norm': 0.6055401563644409, 'learning_rate': 3.901694915254238e-05, 'epoch': 0.7}


 23%|██▎       | 6990/30000 [2:33:01<1:57:16,  3.27it/s]

{'loss': 0.2615, 'grad_norm': 0.5830493569374084, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.7}


 23%|██▎       | 7000/30000 [2:33:04<1:57:09,  3.27it/s]

{'loss': 0.2688, 'grad_norm': 0.6866586208343506, 'learning_rate': 3.898305084745763e-05, 'epoch': 0.7}


 23%|██▎       | 7010/30000 [2:33:07<1:57:08,  3.27it/s]

{'loss': 0.2738, 'grad_norm': 0.5634047389030457, 'learning_rate': 3.896610169491526e-05, 'epoch': 0.7}


 23%|██▎       | 7020/30000 [2:33:11<3:19:37,  1.92it/s]

{'loss': 0.278, 'grad_norm': 0.5595762729644775, 'learning_rate': 3.894915254237289e-05, 'epoch': 0.7}


 23%|██▎       | 7030/30000 [2:33:20<5:49:58,  1.09it/s]

{'loss': 0.2557, 'grad_norm': 0.5284333825111389, 'learning_rate': 3.893220338983051e-05, 'epoch': 0.7}


 23%|██▎       | 7040/30000 [2:35:25<40:59:30,  6.43s/it] 

{'loss': 0.2579, 'grad_norm': 0.5377721786499023, 'learning_rate': 3.891525423728814e-05, 'epoch': 0.7}


 24%|██▎       | 7050/30000 [2:35:28<3:02:58,  2.09it/s] 

{'loss': 0.2587, 'grad_norm': 0.5346081852912903, 'learning_rate': 3.889830508474576e-05, 'epoch': 0.7}


 24%|██▎       | 7060/30000 [2:35:31<1:58:38,  3.22it/s]

{'loss': 0.2667, 'grad_norm': 0.5830910801887512, 'learning_rate': 3.888135593220339e-05, 'epoch': 0.71}


 24%|██▎       | 7070/30000 [2:35:34<1:56:43,  3.27it/s]

{'loss': 0.2512, 'grad_norm': 0.8088367581367493, 'learning_rate': 3.8864406779661014e-05, 'epoch': 0.71}


 24%|██▎       | 7080/30000 [2:35:38<1:56:45,  3.27it/s]

{'loss': 0.277, 'grad_norm': 0.6791463494300842, 'learning_rate': 3.884745762711864e-05, 'epoch': 0.71}


 24%|██▎       | 7090/30000 [2:35:41<1:56:45,  3.27it/s]

{'loss': 0.2761, 'grad_norm': 0.6654943227767944, 'learning_rate': 3.883050847457627e-05, 'epoch': 0.71}


 24%|██▎       | 7100/30000 [2:35:44<1:56:41,  3.27it/s]

{'loss': 0.2723, 'grad_norm': 0.5488938689231873, 'learning_rate': 3.88135593220339e-05, 'epoch': 0.71}


 24%|██▎       | 7110/30000 [2:35:47<1:56:30,  3.27it/s]

{'loss': 0.2661, 'grad_norm': 0.5751431584358215, 'learning_rate': 3.8796610169491524e-05, 'epoch': 0.71}


 24%|██▎       | 7120/30000 [2:35:50<1:56:29,  3.27it/s]

{'loss': 0.2826, 'grad_norm': 0.5315729379653931, 'learning_rate': 3.8779661016949154e-05, 'epoch': 0.71}


 24%|██▍       | 7130/30000 [2:35:53<1:56:25,  3.27it/s]

{'loss': 0.2596, 'grad_norm': 0.5974525213241577, 'learning_rate': 3.876271186440678e-05, 'epoch': 0.71}


 24%|██▍       | 7140/30000 [2:35:56<1:56:26,  3.27it/s]

{'loss': 0.2776, 'grad_norm': 0.6010546684265137, 'learning_rate': 3.8745762711864406e-05, 'epoch': 0.71}


 24%|██▍       | 7150/30000 [2:35:59<1:56:17,  3.27it/s]

{'loss': 0.2668, 'grad_norm': 0.8218600153923035, 'learning_rate': 3.8728813559322035e-05, 'epoch': 0.71}


 24%|██▍       | 7160/30000 [2:36:02<1:56:14,  3.27it/s]

{'loss': 0.2813, 'grad_norm': 0.5576153993606567, 'learning_rate': 3.8711864406779664e-05, 'epoch': 0.72}


 24%|██▍       | 7170/30000 [2:36:05<1:56:18,  3.27it/s]

{'loss': 0.2745, 'grad_norm': 0.6365702152252197, 'learning_rate': 3.8694915254237294e-05, 'epoch': 0.72}


 24%|██▍       | 7180/30000 [2:36:08<1:56:09,  3.27it/s]

{'loss': 0.2776, 'grad_norm': 0.9928062558174133, 'learning_rate': 3.8677966101694916e-05, 'epoch': 0.72}


 24%|██▍       | 7190/30000 [2:52:18<902:20:51, 142.41s/it] 

{'loss': 0.2673, 'grad_norm': 0.5323950052261353, 'learning_rate': 3.8661016949152546e-05, 'epoch': 0.72}


 24%|██▍       | 7200/30000 [2:52:21<27:20:45,  4.32s/it]  

{'loss': 0.2602, 'grad_norm': 0.6297430396080017, 'learning_rate': 3.8644067796610175e-05, 'epoch': 0.72}


 24%|██▍       | 7210/30000 [2:52:24<2:38:17,  2.40it/s] 

{'loss': 0.2549, 'grad_norm': 0.8348257541656494, 'learning_rate': 3.86271186440678e-05, 'epoch': 0.72}


 24%|██▍       | 7220/30000 [2:52:27<1:56:35,  3.26it/s]

{'loss': 0.2696, 'grad_norm': 0.6760842800140381, 'learning_rate': 3.861016949152543e-05, 'epoch': 0.72}


 24%|██▍       | 7230/30000 [2:52:30<1:55:31,  3.29it/s]

{'loss': 0.2695, 'grad_norm': 0.7825217843055725, 'learning_rate': 3.8593220338983056e-05, 'epoch': 0.72}


 24%|██▍       | 7240/30000 [2:52:33<1:55:32,  3.28it/s]

{'loss': 0.2836, 'grad_norm': 0.632393479347229, 'learning_rate': 3.8576271186440686e-05, 'epoch': 0.72}


 24%|██▍       | 7250/30000 [2:52:36<1:55:27,  3.28it/s]

{'loss': 0.2609, 'grad_norm': 0.5216580629348755, 'learning_rate': 3.855932203389831e-05, 'epoch': 0.72}


 24%|██▍       | 7260/30000 [2:52:39<1:55:29,  3.28it/s]

{'loss': 0.2919, 'grad_norm': 0.7562510967254639, 'learning_rate': 3.854237288135593e-05, 'epoch': 0.73}


 24%|██▍       | 7270/30000 [2:52:42<1:55:26,  3.28it/s]

{'loss': 0.271, 'grad_norm': 0.5353607535362244, 'learning_rate': 3.852542372881356e-05, 'epoch': 0.73}


 24%|██▍       | 7280/30000 [2:52:45<1:55:27,  3.28it/s]

{'loss': 0.2701, 'grad_norm': 0.7374398708343506, 'learning_rate': 3.850847457627119e-05, 'epoch': 0.73}


 24%|██▍       | 7290/30000 [2:52:48<1:55:28,  3.28it/s]

{'loss': 0.2367, 'grad_norm': 0.5261383056640625, 'learning_rate': 3.849152542372881e-05, 'epoch': 0.73}


 24%|██▍       | 7300/30000 [2:52:51<1:55:24,  3.28it/s]

{'loss': 0.272, 'grad_norm': 0.5518999695777893, 'learning_rate': 3.847457627118644e-05, 'epoch': 0.73}


 24%|██▍       | 7310/30000 [2:52:54<1:55:40,  3.27it/s]

{'loss': 0.2693, 'grad_norm': 0.525232195854187, 'learning_rate': 3.845762711864407e-05, 'epoch': 0.73}


 24%|██▍       | 7320/30000 [2:52:58<1:55:34,  3.27it/s]

{'loss': 0.2813, 'grad_norm': 0.6264735460281372, 'learning_rate': 3.844067796610169e-05, 'epoch': 0.73}


 24%|██▍       | 7330/30000 [2:53:01<1:55:24,  3.27it/s]

{'loss': 0.2735, 'grad_norm': 0.5397584438323975, 'learning_rate': 3.842372881355932e-05, 'epoch': 0.73}


 24%|██▍       | 7340/30000 [2:53:04<1:55:24,  3.27it/s]

{'loss': 0.2657, 'grad_norm': 0.5811464190483093, 'learning_rate': 3.840677966101695e-05, 'epoch': 0.73}


 24%|██▍       | 7350/30000 [3:08:31<72:18:37, 11.49s/it]   

{'loss': 0.2717, 'grad_norm': 0.6427739262580872, 'learning_rate': 3.838983050847458e-05, 'epoch': 0.73}


 25%|██▍       | 7360/30000 [3:08:34<3:54:10,  1.61it/s] 

{'loss': 0.254, 'grad_norm': 0.49875885248184204, 'learning_rate': 3.8372881355932204e-05, 'epoch': 0.74}


 25%|██▍       | 7370/30000 [3:08:37<1:58:10,  3.19it/s]

{'loss': 0.2737, 'grad_norm': 0.6053526997566223, 'learning_rate': 3.835593220338983e-05, 'epoch': 0.74}


 25%|██▍       | 7380/30000 [3:08:40<1:55:04,  3.28it/s]

{'loss': 0.2514, 'grad_norm': 0.43403515219688416, 'learning_rate': 3.833898305084746e-05, 'epoch': 0.74}


 25%|██▍       | 7390/30000 [3:08:43<1:55:04,  3.27it/s]

{'loss': 0.2663, 'grad_norm': 0.5211274027824402, 'learning_rate': 3.8322033898305085e-05, 'epoch': 0.74}


 25%|██▍       | 7400/30000 [3:08:46<1:54:57,  3.28it/s]

{'loss': 0.2636, 'grad_norm': 0.5819969177246094, 'learning_rate': 3.8305084745762714e-05, 'epoch': 0.74}


 25%|██▍       | 7410/30000 [3:08:49<1:54:53,  3.28it/s]

{'loss': 0.2668, 'grad_norm': 0.5806111097335815, 'learning_rate': 3.8288135593220344e-05, 'epoch': 0.74}


 25%|██▍       | 7420/30000 [3:08:52<1:54:42,  3.28it/s]

{'loss': 0.2644, 'grad_norm': 0.506111204624176, 'learning_rate': 3.8271186440677966e-05, 'epoch': 0.74}


 25%|██▍       | 7430/30000 [3:08:55<1:55:10,  3.27it/s]

{'loss': 0.2733, 'grad_norm': 0.7657807469367981, 'learning_rate': 3.8254237288135596e-05, 'epoch': 0.74}


 25%|██▍       | 7440/30000 [3:08:58<1:55:07,  3.27it/s]

{'loss': 0.2634, 'grad_norm': 0.5132492184638977, 'learning_rate': 3.8237288135593225e-05, 'epoch': 0.74}


 25%|██▍       | 7450/30000 [3:09:01<1:54:34,  3.28it/s]

{'loss': 0.2684, 'grad_norm': 0.6063508987426758, 'learning_rate': 3.8220338983050854e-05, 'epoch': 0.74}


 25%|██▍       | 7460/30000 [3:09:04<1:54:51,  3.27it/s]

{'loss': 0.2674, 'grad_norm': 0.5735150575637817, 'learning_rate': 3.820338983050848e-05, 'epoch': 0.75}


 25%|██▍       | 7470/30000 [3:09:08<1:54:45,  3.27it/s]

{'loss': 0.2496, 'grad_norm': 0.5868464112281799, 'learning_rate': 3.81864406779661e-05, 'epoch': 0.75}


 25%|██▍       | 7480/30000 [3:09:11<1:54:42,  3.27it/s]

{'loss': 0.2473, 'grad_norm': 0.6062225103378296, 'learning_rate': 3.816949152542373e-05, 'epoch': 0.75}


 25%|██▍       | 7490/30000 [3:09:14<1:54:37,  3.27it/s]

{'loss': 0.2571, 'grad_norm': 0.6976430416107178, 'learning_rate': 3.815254237288136e-05, 'epoch': 0.75}


 25%|██▌       | 7500/30000 [3:09:17<1:54:33,  3.27it/s]

{'loss': 0.2741, 'grad_norm': 0.5385390520095825, 'learning_rate': 3.813559322033898e-05, 'epoch': 0.75}


 25%|██▌       | 7510/30000 [3:09:20<1:54:34,  3.27it/s]

{'loss': 0.2618, 'grad_norm': 0.5473780035972595, 'learning_rate': 3.811864406779661e-05, 'epoch': 0.75}


 25%|██▌       | 7520/30000 [3:09:23<1:54:33,  3.27it/s]

{'loss': 0.2599, 'grad_norm': 0.5391940474510193, 'learning_rate': 3.810169491525424e-05, 'epoch': 0.75}


 25%|██▌       | 7530/30000 [3:09:26<1:54:27,  3.27it/s]

{'loss': 0.2618, 'grad_norm': 0.6159722805023193, 'learning_rate': 3.808474576271186e-05, 'epoch': 0.75}


 25%|██▌       | 7540/30000 [3:09:29<1:54:30,  3.27it/s]

{'loss': 0.266, 'grad_norm': 0.5775614380836487, 'learning_rate': 3.806779661016949e-05, 'epoch': 0.75}


 25%|██▌       | 7550/30000 [3:09:32<1:54:25,  3.27it/s]

{'loss': 0.2555, 'grad_norm': 0.5832473635673523, 'learning_rate': 3.805084745762712e-05, 'epoch': 0.76}


 25%|██▌       | 7560/30000 [3:09:35<1:54:12,  3.27it/s]

{'loss': 0.2641, 'grad_norm': 0.5168424844741821, 'learning_rate': 3.803389830508475e-05, 'epoch': 0.76}


 25%|██▌       | 7570/30000 [3:09:38<1:54:11,  3.27it/s]

{'loss': 0.2561, 'grad_norm': 0.6136870384216309, 'learning_rate': 3.801694915254237e-05, 'epoch': 0.76}


 25%|██▌       | 7580/30000 [3:09:41<1:54:13,  3.27it/s]

{'loss': 0.2571, 'grad_norm': 0.4919454753398895, 'learning_rate': 3.8e-05, 'epoch': 0.76}


 25%|██▌       | 7590/30000 [3:09:44<1:54:13,  3.27it/s]

{'loss': 0.2623, 'grad_norm': 1.3237091302871704, 'learning_rate': 3.798305084745763e-05, 'epoch': 0.76}


 25%|██▌       | 7600/30000 [3:09:47<1:54:08,  3.27it/s]

{'loss': 0.2394, 'grad_norm': 0.524813711643219, 'learning_rate': 3.7966101694915254e-05, 'epoch': 0.76}


 25%|██▌       | 7610/30000 [3:09:50<1:53:53,  3.28it/s]

{'loss': 0.2541, 'grad_norm': 0.6009710431098938, 'learning_rate': 3.794915254237288e-05, 'epoch': 0.76}


 25%|██▌       | 7620/30000 [3:09:53<1:53:59,  3.27it/s]

{'loss': 0.282, 'grad_norm': 0.5133130550384521, 'learning_rate': 3.793220338983051e-05, 'epoch': 0.76}


 25%|██▌       | 7630/30000 [3:09:56<1:53:58,  3.27it/s]

{'loss': 0.2721, 'grad_norm': 0.7002784609794617, 'learning_rate': 3.791525423728814e-05, 'epoch': 0.76}


 25%|██▌       | 7640/30000 [3:10:00<1:53:51,  3.27it/s]

{'loss': 0.2697, 'grad_norm': 0.6804300546646118, 'learning_rate': 3.7898305084745765e-05, 'epoch': 0.76}


 26%|██▌       | 7650/30000 [3:10:03<1:53:55,  3.27it/s]

{'loss': 0.2684, 'grad_norm': 0.7103761434555054, 'learning_rate': 3.7881355932203394e-05, 'epoch': 0.77}


 26%|██▌       | 7660/30000 [3:10:06<1:53:48,  3.27it/s]

{'loss': 0.2626, 'grad_norm': 0.862943708896637, 'learning_rate': 3.786440677966102e-05, 'epoch': 0.77}


 26%|██▌       | 7670/30000 [3:10:09<1:53:45,  3.27it/s]

{'loss': 0.2747, 'grad_norm': 0.559529721736908, 'learning_rate': 3.7847457627118646e-05, 'epoch': 0.77}


 26%|██▌       | 7680/30000 [3:10:12<1:53:42,  3.27it/s]

{'loss': 0.2648, 'grad_norm': 0.4339146316051483, 'learning_rate': 3.783050847457627e-05, 'epoch': 0.77}


 26%|██▌       | 7690/30000 [3:10:15<1:53:37,  3.27it/s]

{'loss': 0.263, 'grad_norm': 0.5492746233940125, 'learning_rate': 3.78135593220339e-05, 'epoch': 0.77}


 26%|██▌       | 7700/30000 [3:10:18<1:53:34,  3.27it/s]

{'loss': 0.2589, 'grad_norm': 0.6281899213790894, 'learning_rate': 3.779661016949153e-05, 'epoch': 0.77}


 26%|██▌       | 7710/30000 [3:10:21<1:53:31,  3.27it/s]

{'loss': 0.2579, 'grad_norm': 0.6111029386520386, 'learning_rate': 3.777966101694915e-05, 'epoch': 0.77}


 26%|██▌       | 7720/30000 [3:10:24<1:53:28,  3.27it/s]

{'loss': 0.2648, 'grad_norm': 0.5250844955444336, 'learning_rate': 3.776271186440678e-05, 'epoch': 0.77}


 26%|██▌       | 7730/30000 [3:10:27<1:53:22,  3.27it/s]

{'loss': 0.2764, 'grad_norm': 0.6175439357757568, 'learning_rate': 3.774576271186441e-05, 'epoch': 0.77}


 26%|██▌       | 7740/30000 [3:10:30<1:53:21,  3.27it/s]

{'loss': 0.2793, 'grad_norm': 0.5677777528762817, 'learning_rate': 3.772881355932204e-05, 'epoch': 0.77}


 26%|██▌       | 7750/30000 [3:10:33<1:53:17,  3.27it/s]

{'loss': 0.2683, 'grad_norm': 0.64151930809021, 'learning_rate': 3.771186440677966e-05, 'epoch': 0.78}


 26%|██▌       | 7760/30000 [3:10:36<1:53:14,  3.27it/s]

{'loss': 0.284, 'grad_norm': 0.7806605696678162, 'learning_rate': 3.769491525423729e-05, 'epoch': 0.78}


 26%|██▌       | 7770/30000 [3:10:39<1:53:13,  3.27it/s]

{'loss': 0.2738, 'grad_norm': 0.6141505837440491, 'learning_rate': 3.767796610169492e-05, 'epoch': 0.78}


 26%|██▌       | 7780/30000 [3:10:42<1:55:31,  3.21it/s]

{'loss': 0.2616, 'grad_norm': 0.6015521287918091, 'learning_rate': 3.766101694915254e-05, 'epoch': 0.78}


 26%|██▌       | 7790/30000 [3:10:46<1:57:54,  3.14it/s]

{'loss': 0.2514, 'grad_norm': 0.5930050015449524, 'learning_rate': 3.764406779661017e-05, 'epoch': 0.78}


 26%|██▌       | 7800/30000 [3:10:49<1:59:47,  3.09it/s]

{'loss': 0.2654, 'grad_norm': 0.6419509649276733, 'learning_rate': 3.76271186440678e-05, 'epoch': 0.78}


 26%|██▌       | 7810/30000 [3:10:52<2:01:40,  3.04it/s]

{'loss': 0.2683, 'grad_norm': 0.5445104241371155, 'learning_rate': 3.761016949152543e-05, 'epoch': 0.78}


 26%|██▌       | 7820/30000 [3:10:55<2:02:58,  3.01it/s]

{'loss': 0.2689, 'grad_norm': 0.5577893853187561, 'learning_rate': 3.759322033898305e-05, 'epoch': 0.78}


 26%|██▌       | 7830/30000 [3:10:59<2:03:54,  2.98it/s]

{'loss': 0.2715, 'grad_norm': 0.6103917956352234, 'learning_rate': 3.757627118644068e-05, 'epoch': 0.78}


 26%|██▌       | 7840/30000 [3:11:02<2:04:57,  2.96it/s]

{'loss': 0.2702, 'grad_norm': 0.5215007662773132, 'learning_rate': 3.755932203389831e-05, 'epoch': 0.78}


 26%|██▌       | 7850/30000 [3:26:17<140:30:18, 22.84s/it]  

{'loss': 0.2638, 'grad_norm': 0.551977276802063, 'learning_rate': 3.7542372881355934e-05, 'epoch': 0.79}


 26%|██▌       | 7860/30000 [3:26:20<5:47:28,  1.06it/s]  

{'loss': 0.2605, 'grad_norm': 0.48130425810813904, 'learning_rate': 3.752542372881356e-05, 'epoch': 0.79}


 26%|██▌       | 7870/30000 [3:26:23<1:59:13,  3.09it/s]

{'loss': 0.2625, 'grad_norm': 0.4999271631240845, 'learning_rate': 3.750847457627119e-05, 'epoch': 0.79}


 26%|██▋       | 7880/30000 [3:26:26<1:52:23,  3.28it/s]

{'loss': 0.2612, 'grad_norm': 0.5634621381759644, 'learning_rate': 3.7491525423728815e-05, 'epoch': 0.79}


 26%|██▋       | 7890/30000 [3:26:29<1:52:13,  3.28it/s]

{'loss': 0.2582, 'grad_norm': 0.5415963530540466, 'learning_rate': 3.747457627118644e-05, 'epoch': 0.79}


 26%|██▋       | 7900/30000 [3:26:32<1:52:13,  3.28it/s]

{'loss': 0.279, 'grad_norm': 0.6288506388664246, 'learning_rate': 3.745762711864407e-05, 'epoch': 0.79}


 26%|██▋       | 7910/30000 [3:26:35<1:52:16,  3.28it/s]

{'loss': 0.2619, 'grad_norm': 0.5465676784515381, 'learning_rate': 3.7440677966101696e-05, 'epoch': 0.79}


 26%|██▋       | 7920/30000 [3:26:38<1:52:08,  3.28it/s]

{'loss': 0.27, 'grad_norm': 0.6356766223907471, 'learning_rate': 3.7423728813559325e-05, 'epoch': 0.79}


 26%|██▋       | 7930/30000 [3:26:41<1:52:15,  3.28it/s]

{'loss': 0.2634, 'grad_norm': 0.6237371563911438, 'learning_rate': 3.740677966101695e-05, 'epoch': 0.79}


 26%|██▋       | 7940/30000 [3:26:44<1:52:19,  3.27it/s]

{'loss': 0.2429, 'grad_norm': 0.5200973153114319, 'learning_rate': 3.738983050847458e-05, 'epoch': 0.79}


 26%|██▋       | 7950/30000 [3:26:47<1:52:18,  3.27it/s]

{'loss': 0.2632, 'grad_norm': 0.587527334690094, 'learning_rate': 3.737288135593221e-05, 'epoch': 0.8}


 27%|██▋       | 7960/30000 [3:26:50<1:52:11,  3.27it/s]

{'loss': 0.2574, 'grad_norm': 0.5094069242477417, 'learning_rate': 3.735593220338983e-05, 'epoch': 0.8}


 27%|██▋       | 7970/30000 [3:26:53<1:52:15,  3.27it/s]

{'loss': 0.2716, 'grad_norm': 0.7183315753936768, 'learning_rate': 3.733898305084746e-05, 'epoch': 0.8}


 27%|██▋       | 7980/30000 [3:26:56<1:52:15,  3.27it/s]

{'loss': 0.266, 'grad_norm': 0.49031075835227966, 'learning_rate': 3.732203389830509e-05, 'epoch': 0.8}


 27%|██▋       | 7990/30000 [3:26:59<1:52:15,  3.27it/s]

{'loss': 0.2706, 'grad_norm': 0.7027722001075745, 'learning_rate': 3.730508474576272e-05, 'epoch': 0.8}


 27%|██▋       | 8000/30000 [3:42:59<296:30:04, 48.52s/it]  

{'loss': 0.2668, 'grad_norm': 1.1480588912963867, 'learning_rate': 3.728813559322034e-05, 'epoch': 0.8}


 27%|██▋       | 8010/30000 [3:43:02<10:10:50,  1.67s/it] 

{'loss': 0.2542, 'grad_norm': 0.49620234966278076, 'learning_rate': 3.727118644067797e-05, 'epoch': 0.8}


 27%|██▋       | 8020/30000 [3:43:05<2:06:16,  2.90it/s] 

{'loss': 0.2563, 'grad_norm': 0.6164290308952332, 'learning_rate': 3.72542372881356e-05, 'epoch': 0.8}


 27%|██▋       | 8030/30000 [3:43:08<1:52:16,  3.26it/s]

{'loss': 0.2603, 'grad_norm': 0.4957297444343567, 'learning_rate': 3.723728813559322e-05, 'epoch': 0.8}


 27%|██▋       | 8040/30000 [3:43:11<1:51:44,  3.28it/s]

{'loss': 0.2561, 'grad_norm': 0.578121542930603, 'learning_rate': 3.722033898305085e-05, 'epoch': 0.8}


 27%|██▋       | 8050/30000 [3:43:14<1:51:39,  3.28it/s]

{'loss': 0.2648, 'grad_norm': 0.8309479355812073, 'learning_rate': 3.720338983050848e-05, 'epoch': 0.81}


 27%|██▋       | 8060/30000 [3:43:17<1:51:25,  3.28it/s]

{'loss': 0.2513, 'grad_norm': 0.4923246204853058, 'learning_rate': 3.71864406779661e-05, 'epoch': 0.81}


 27%|██▋       | 8070/30000 [3:43:20<1:53:01,  3.23it/s]

{'loss': 0.2512, 'grad_norm': 0.5755264163017273, 'learning_rate': 3.716949152542373e-05, 'epoch': 0.81}


 27%|██▋       | 8080/30000 [3:43:23<1:52:12,  3.26it/s]

{'loss': 0.2647, 'grad_norm': 0.7621423602104187, 'learning_rate': 3.715254237288136e-05, 'epoch': 0.81}


 27%|██▋       | 8090/30000 [3:43:26<1:51:27,  3.28it/s]

{'loss': 0.2591, 'grad_norm': 0.6400922536849976, 'learning_rate': 3.7135593220338984e-05, 'epoch': 0.81}


 27%|██▋       | 8100/30000 [3:43:29<1:51:21,  3.28it/s]

{'loss': 0.2476, 'grad_norm': 0.612435519695282, 'learning_rate': 3.711864406779661e-05, 'epoch': 0.81}


 27%|██▋       | 8110/30000 [3:43:32<1:51:29,  3.27it/s]

{'loss': 0.2887, 'grad_norm': 0.7390195727348328, 'learning_rate': 3.7101694915254236e-05, 'epoch': 0.81}


 27%|██▋       | 8120/30000 [3:43:35<1:51:30,  3.27it/s]

{'loss': 0.2584, 'grad_norm': 0.49569645524024963, 'learning_rate': 3.7084745762711865e-05, 'epoch': 0.81}


 27%|██▋       | 8130/30000 [3:43:38<1:51:27,  3.27it/s]

{'loss': 0.2688, 'grad_norm': 0.683088481426239, 'learning_rate': 3.7067796610169494e-05, 'epoch': 0.81}


 27%|██▋       | 8140/30000 [3:43:42<1:51:25,  3.27it/s]

{'loss': 0.2744, 'grad_norm': 0.5835986137390137, 'learning_rate': 3.705084745762712e-05, 'epoch': 0.81}


 27%|██▋       | 8150/30000 [3:43:45<1:51:27,  3.27it/s]

{'loss': 0.2544, 'grad_norm': 0.5075735449790955, 'learning_rate': 3.7033898305084746e-05, 'epoch': 0.81}


 27%|██▋       | 8160/30000 [3:43:48<1:51:08,  3.27it/s]

{'loss': 0.2563, 'grad_norm': 0.5652232766151428, 'learning_rate': 3.7016949152542376e-05, 'epoch': 0.82}


 27%|██▋       | 8170/30000 [3:43:51<1:51:14,  3.27it/s]

{'loss': 0.2609, 'grad_norm': 0.5321837067604065, 'learning_rate': 3.7e-05, 'epoch': 0.82}


 27%|██▋       | 8180/30000 [3:43:54<1:51:10,  3.27it/s]

{'loss': 0.2584, 'grad_norm': 0.4986732006072998, 'learning_rate': 3.698305084745763e-05, 'epoch': 0.82}


 27%|██▋       | 8190/30000 [3:43:57<1:51:08,  3.27it/s]

{'loss': 0.2584, 'grad_norm': 0.7354019284248352, 'learning_rate': 3.696610169491526e-05, 'epoch': 0.82}


 27%|██▋       | 8200/30000 [3:44:00<1:51:01,  3.27it/s]

{'loss': 0.2442, 'grad_norm': 0.6019887924194336, 'learning_rate': 3.6949152542372886e-05, 'epoch': 0.82}


 27%|██▋       | 8210/30000 [3:44:03<1:50:56,  3.27it/s]

{'loss': 0.2519, 'grad_norm': 0.7593237161636353, 'learning_rate': 3.693220338983051e-05, 'epoch': 0.82}


 27%|██▋       | 8220/30000 [3:44:06<1:51:04,  3.27it/s]

{'loss': 0.2593, 'grad_norm': 0.6004468202590942, 'learning_rate': 3.691525423728814e-05, 'epoch': 0.82}


 27%|██▋       | 8230/30000 [3:44:09<1:50:54,  3.27it/s]

{'loss': 0.261, 'grad_norm': 0.5357620120048523, 'learning_rate': 3.689830508474577e-05, 'epoch': 0.82}


 27%|██▋       | 8240/30000 [3:44:12<1:50:57,  3.27it/s]

{'loss': 0.2619, 'grad_norm': 0.49839121103286743, 'learning_rate': 3.688135593220339e-05, 'epoch': 0.82}


 28%|██▊       | 8250/30000 [3:44:15<1:50:46,  3.27it/s]

{'loss': 0.2731, 'grad_norm': 0.6411162614822388, 'learning_rate': 3.686440677966102e-05, 'epoch': 0.82}


 28%|██▊       | 8260/30000 [3:44:18<1:50:47,  3.27it/s]

{'loss': 0.2489, 'grad_norm': 0.5618029832839966, 'learning_rate': 3.684745762711865e-05, 'epoch': 0.83}


 28%|██▊       | 8270/30000 [3:44:21<1:50:40,  3.27it/s]

{'loss': 0.2585, 'grad_norm': 0.4245680868625641, 'learning_rate': 3.683050847457628e-05, 'epoch': 0.83}


 28%|██▊       | 8280/30000 [3:44:24<1:50:48,  3.27it/s]

{'loss': 0.2453, 'grad_norm': 0.5244588255882263, 'learning_rate': 3.68135593220339e-05, 'epoch': 0.83}


 28%|██▊       | 8290/30000 [3:44:27<1:50:37,  3.27it/s]

{'loss': 0.2496, 'grad_norm': 0.7471327781677246, 'learning_rate': 3.679661016949153e-05, 'epoch': 0.83}


 28%|██▊       | 8300/30000 [3:44:30<1:50:38,  3.27it/s]

{'loss': 0.2527, 'grad_norm': 0.7680377960205078, 'learning_rate': 3.677966101694915e-05, 'epoch': 0.83}


 28%|██▊       | 8310/30000 [3:44:34<1:50:32,  3.27it/s]

{'loss': 0.2538, 'grad_norm': 0.6451481580734253, 'learning_rate': 3.676271186440678e-05, 'epoch': 0.83}


 28%|██▊       | 8320/30000 [3:44:37<1:50:17,  3.28it/s]

{'loss': 0.25, 'grad_norm': 0.5875876545906067, 'learning_rate': 3.6745762711864404e-05, 'epoch': 0.83}


 28%|██▊       | 8330/30000 [3:44:40<1:50:25,  3.27it/s]

{'loss': 0.2621, 'grad_norm': 0.5276845693588257, 'learning_rate': 3.6728813559322034e-05, 'epoch': 0.83}


 28%|██▊       | 8340/30000 [3:44:43<1:50:14,  3.27it/s]

{'loss': 0.2822, 'grad_norm': 0.6069691181182861, 'learning_rate': 3.671186440677966e-05, 'epoch': 0.83}


 28%|██▊       | 8350/30000 [3:44:46<1:50:22,  3.27it/s]

{'loss': 0.2472, 'grad_norm': 0.4931374192237854, 'learning_rate': 3.6694915254237286e-05, 'epoch': 0.83}


 28%|██▊       | 8360/30000 [3:44:49<1:50:16,  3.27it/s]

{'loss': 0.2744, 'grad_norm': 0.6213462948799133, 'learning_rate': 3.6677966101694915e-05, 'epoch': 0.84}


 28%|██▊       | 8370/30000 [3:44:52<1:50:13,  3.27it/s]

{'loss': 0.2591, 'grad_norm': 0.43675607442855835, 'learning_rate': 3.6661016949152544e-05, 'epoch': 0.84}


 28%|██▊       | 8380/30000 [3:44:55<1:50:06,  3.27it/s]

{'loss': 0.2587, 'grad_norm': 0.5927374958992004, 'learning_rate': 3.6644067796610174e-05, 'epoch': 0.84}


 28%|██▊       | 8390/30000 [3:44:58<1:50:09,  3.27it/s]

{'loss': 0.2637, 'grad_norm': 0.5306989550590515, 'learning_rate': 3.6627118644067796e-05, 'epoch': 0.84}


 28%|██▊       | 8400/30000 [3:45:01<1:50:06,  3.27it/s]

{'loss': 0.2478, 'grad_norm': 0.5495116710662842, 'learning_rate': 3.6610169491525426e-05, 'epoch': 0.84}


 28%|██▊       | 8410/30000 [3:45:04<1:49:58,  3.27it/s]

{'loss': 0.2677, 'grad_norm': 0.6273800134658813, 'learning_rate': 3.6593220338983055e-05, 'epoch': 0.84}


 28%|██▊       | 8420/30000 [3:45:07<1:49:49,  3.27it/s]

{'loss': 0.2644, 'grad_norm': 0.5096489191055298, 'learning_rate': 3.657627118644068e-05, 'epoch': 0.84}


 28%|██▊       | 8430/30000 [3:45:10<1:49:53,  3.27it/s]

{'loss': 0.2508, 'grad_norm': 0.6499760150909424, 'learning_rate': 3.655932203389831e-05, 'epoch': 0.84}


 28%|██▊       | 8440/30000 [3:45:13<1:51:44,  3.22it/s]

{'loss': 0.263, 'grad_norm': 0.5206574201583862, 'learning_rate': 3.6542372881355936e-05, 'epoch': 0.84}


 28%|██▊       | 8450/30000 [3:45:16<1:53:52,  3.15it/s]

{'loss': 0.2698, 'grad_norm': 0.6341913342475891, 'learning_rate': 3.6525423728813566e-05, 'epoch': 0.84}


 28%|██▊       | 8460/30000 [3:45:20<1:55:58,  3.10it/s]

{'loss': 0.2492, 'grad_norm': 0.5631489753723145, 'learning_rate': 3.650847457627119e-05, 'epoch': 0.85}


 28%|██▊       | 8470/30000 [3:45:23<1:57:54,  3.04it/s]

{'loss': 0.247, 'grad_norm': 0.5307002663612366, 'learning_rate': 3.649152542372882e-05, 'epoch': 0.85}


 28%|██▊       | 8480/30000 [3:45:26<1:59:37,  3.00it/s]

{'loss': 0.2619, 'grad_norm': 0.6506287455558777, 'learning_rate': 3.647457627118645e-05, 'epoch': 0.85}


 28%|██▊       | 8490/30000 [4:02:38<76:21:49, 12.78s/it]   

{'loss': 0.2693, 'grad_norm': 0.6374133825302124, 'learning_rate': 3.645762711864407e-05, 'epoch': 0.85}


 28%|██▊       | 8500/30000 [4:02:41<3:55:33,  1.52it/s] 

{'loss': 0.2791, 'grad_norm': 0.6027133464813232, 'learning_rate': 3.644067796610169e-05, 'epoch': 0.85}


 28%|██▊       | 8510/30000 [4:02:44<1:52:40,  3.18it/s]

{'loss': 0.2681, 'grad_norm': 0.485393762588501, 'learning_rate': 3.642372881355932e-05, 'epoch': 0.85}


 28%|██▊       | 8520/30000 [4:02:47<1:49:18,  3.27it/s]

{'loss': 0.265, 'grad_norm': 0.5299261808395386, 'learning_rate': 3.640677966101695e-05, 'epoch': 0.85}


 28%|██▊       | 8530/30000 [4:02:50<1:49:05,  3.28it/s]

{'loss': 0.2547, 'grad_norm': 0.6261526942253113, 'learning_rate': 3.638983050847457e-05, 'epoch': 0.85}


 28%|██▊       | 8540/30000 [4:02:53<1:49:12,  3.27it/s]

{'loss': 0.268, 'grad_norm': 0.5182968974113464, 'learning_rate': 3.63728813559322e-05, 'epoch': 0.85}


 28%|██▊       | 8550/30000 [4:02:56<1:49:06,  3.28it/s]

{'loss': 0.2602, 'grad_norm': 0.601585865020752, 'learning_rate': 3.635593220338983e-05, 'epoch': 0.85}


 29%|██▊       | 8560/30000 [4:02:59<1:48:55,  3.28it/s]

{'loss': 0.2684, 'grad_norm': 0.505571722984314, 'learning_rate': 3.633898305084746e-05, 'epoch': 0.86}


 29%|██▊       | 8570/30000 [4:03:03<1:49:01,  3.28it/s]

{'loss': 0.2613, 'grad_norm': 0.5465781092643738, 'learning_rate': 3.6322033898305084e-05, 'epoch': 0.86}


 29%|██▊       | 8580/30000 [4:03:06<1:48:58,  3.28it/s]

{'loss': 0.2621, 'grad_norm': 0.6253358721733093, 'learning_rate': 3.630508474576271e-05, 'epoch': 0.86}


 29%|██▊       | 8590/30000 [4:03:09<1:48:54,  3.28it/s]

{'loss': 0.273, 'grad_norm': 0.39974743127822876, 'learning_rate': 3.628813559322034e-05, 'epoch': 0.86}


 29%|██▊       | 8600/30000 [4:03:12<1:49:00,  3.27it/s]

{'loss': 0.2781, 'grad_norm': 0.7228972315788269, 'learning_rate': 3.6271186440677965e-05, 'epoch': 0.86}


 29%|██▊       | 8610/30000 [4:03:15<1:48:55,  3.27it/s]

{'loss': 0.2641, 'grad_norm': 0.593126118183136, 'learning_rate': 3.6254237288135595e-05, 'epoch': 0.86}


 29%|██▊       | 8620/30000 [4:03:18<1:48:55,  3.27it/s]

{'loss': 0.2627, 'grad_norm': 0.674546480178833, 'learning_rate': 3.6237288135593224e-05, 'epoch': 0.86}


 29%|██▉       | 8630/30000 [4:03:21<1:48:48,  3.27it/s]

{'loss': 0.2522, 'grad_norm': 0.5259471535682678, 'learning_rate': 3.622033898305085e-05, 'epoch': 0.86}


 29%|██▉       | 8640/30000 [4:03:24<1:48:48,  3.27it/s]

{'loss': 0.2662, 'grad_norm': 0.6637784242630005, 'learning_rate': 3.6203389830508476e-05, 'epoch': 0.86}


 29%|██▉       | 8650/30000 [4:03:27<1:48:48,  3.27it/s]

{'loss': 0.2669, 'grad_norm': 0.5841864943504333, 'learning_rate': 3.6186440677966105e-05, 'epoch': 0.86}


 29%|██▉       | 8660/30000 [4:03:30<1:48:45,  3.27it/s]

{'loss': 0.2515, 'grad_norm': 0.5376945734024048, 'learning_rate': 3.6169491525423735e-05, 'epoch': 0.87}


 29%|██▉       | 8670/30000 [4:03:33<1:48:35,  3.27it/s]

{'loss': 0.2587, 'grad_norm': 0.44646865129470825, 'learning_rate': 3.615254237288136e-05, 'epoch': 0.87}


 29%|██▉       | 8680/30000 [4:03:36<1:48:33,  3.27it/s]

{'loss': 0.2438, 'grad_norm': 0.6827381253242493, 'learning_rate': 3.6135593220338986e-05, 'epoch': 0.87}


 29%|██▉       | 8690/30000 [4:03:39<1:48:30,  3.27it/s]

{'loss': 0.2594, 'grad_norm': 0.581770122051239, 'learning_rate': 3.6118644067796616e-05, 'epoch': 0.87}


 29%|██▉       | 8700/30000 [4:03:42<1:48:19,  3.28it/s]

{'loss': 0.2519, 'grad_norm': 0.5717304348945618, 'learning_rate': 3.610169491525424e-05, 'epoch': 0.87}


 29%|██▉       | 8710/30000 [4:03:45<1:48:25,  3.27it/s]

{'loss': 0.259, 'grad_norm': 0.6998792290687561, 'learning_rate': 3.608474576271186e-05, 'epoch': 0.87}


 29%|██▉       | 8720/30000 [4:03:48<1:48:19,  3.27it/s]

{'loss': 0.2772, 'grad_norm': 0.5182678699493408, 'learning_rate': 3.606779661016949e-05, 'epoch': 0.87}


 29%|██▉       | 8730/30000 [4:03:51<1:48:21,  3.27it/s]

{'loss': 0.2616, 'grad_norm': 0.6060200929641724, 'learning_rate': 3.605084745762712e-05, 'epoch': 0.87}


 29%|██▉       | 8740/30000 [4:03:55<1:48:17,  3.27it/s]

{'loss': 0.2444, 'grad_norm': 0.49798405170440674, 'learning_rate': 3.603389830508475e-05, 'epoch': 0.87}


 29%|██▉       | 8750/30000 [4:03:58<1:48:09,  3.27it/s]

{'loss': 0.2414, 'grad_norm': 0.5579347014427185, 'learning_rate': 3.601694915254237e-05, 'epoch': 0.88}


 29%|██▉       | 8760/30000 [4:04:01<1:48:10,  3.27it/s]

{'loss': 0.2592, 'grad_norm': 0.7189533710479736, 'learning_rate': 3.6e-05, 'epoch': 0.88}


 29%|██▉       | 8770/30000 [4:04:04<1:47:59,  3.28it/s]

{'loss': 0.2674, 'grad_norm': 0.5462751388549805, 'learning_rate': 3.598305084745763e-05, 'epoch': 0.88}


 29%|██▉       | 8780/30000 [4:04:07<1:48:00,  3.27it/s]

{'loss': 0.2592, 'grad_norm': 0.5717427134513855, 'learning_rate': 3.596610169491525e-05, 'epoch': 0.88}


 29%|██▉       | 8790/30000 [4:04:10<1:48:07,  3.27it/s]

{'loss': 0.2511, 'grad_norm': 0.5577507019042969, 'learning_rate': 3.594915254237288e-05, 'epoch': 0.88}


 29%|██▉       | 8800/30000 [4:04:13<1:47:58,  3.27it/s]

{'loss': 0.2439, 'grad_norm': 0.5563211441040039, 'learning_rate': 3.593220338983051e-05, 'epoch': 0.88}


 29%|██▉       | 8810/30000 [4:04:16<1:48:00,  3.27it/s]

{'loss': 0.2471, 'grad_norm': 0.45832180976867676, 'learning_rate': 3.5915254237288134e-05, 'epoch': 0.88}


 29%|██▉       | 8820/30000 [4:04:19<1:47:53,  3.27it/s]

{'loss': 0.2709, 'grad_norm': 0.763852596282959, 'learning_rate': 3.5898305084745763e-05, 'epoch': 0.88}


 29%|██▉       | 8830/30000 [4:04:22<1:47:52,  3.27it/s]

{'loss': 0.2611, 'grad_norm': 0.6070753335952759, 'learning_rate': 3.588135593220339e-05, 'epoch': 0.88}


 29%|██▉       | 8840/30000 [4:04:25<1:47:45,  3.27it/s]

{'loss': 0.2495, 'grad_norm': 0.9745280742645264, 'learning_rate': 3.586440677966102e-05, 'epoch': 0.88}


 30%|██▉       | 8850/30000 [4:04:28<1:47:44,  3.27it/s]

{'loss': 0.2601, 'grad_norm': 0.7350980043411255, 'learning_rate': 3.5847457627118645e-05, 'epoch': 0.89}


 30%|██▉       | 8860/30000 [4:04:31<1:47:34,  3.28it/s]

{'loss': 0.2538, 'grad_norm': 0.5589996576309204, 'learning_rate': 3.5830508474576274e-05, 'epoch': 0.89}


 30%|██▉       | 8870/30000 [4:04:34<1:47:33,  3.27it/s]

{'loss': 0.2477, 'grad_norm': 0.42504119873046875, 'learning_rate': 3.5813559322033903e-05, 'epoch': 0.89}


 30%|██▉       | 8880/30000 [4:04:37<1:47:35,  3.27it/s]

{'loss': 0.251, 'grad_norm': 0.7050066590309143, 'learning_rate': 3.5796610169491526e-05, 'epoch': 0.89}


 30%|██▉       | 8890/30000 [4:04:40<1:47:32,  3.27it/s]

{'loss': 0.2826, 'grad_norm': 0.6696656942367554, 'learning_rate': 3.5779661016949155e-05, 'epoch': 0.89}


 30%|██▉       | 8900/30000 [4:04:43<1:47:30,  3.27it/s]

{'loss': 0.2597, 'grad_norm': 0.6372590065002441, 'learning_rate': 3.5762711864406785e-05, 'epoch': 0.89}


 30%|██▉       | 8910/30000 [4:04:47<1:47:21,  3.27it/s]

{'loss': 0.2588, 'grad_norm': 0.555194079875946, 'learning_rate': 3.5745762711864414e-05, 'epoch': 0.89}


 30%|██▉       | 8920/30000 [4:04:50<1:48:21,  3.24it/s]

{'loss': 0.2587, 'grad_norm': 0.7149935364723206, 'learning_rate': 3.572881355932203e-05, 'epoch': 0.89}


 30%|██▉       | 8930/30000 [4:04:53<1:53:12,  3.10it/s]

{'loss': 0.2571, 'grad_norm': 0.4992477595806122, 'learning_rate': 3.571186440677966e-05, 'epoch': 0.89}


 30%|██▉       | 8940/30000 [4:04:56<1:55:04,  3.05it/s]

{'loss': 0.2604, 'grad_norm': 0.5973383188247681, 'learning_rate': 3.569491525423729e-05, 'epoch': 0.89}


 30%|██▉       | 8950/30000 [4:19:52<1566:55:33, 267.98s/it]

{'loss': 0.2618, 'grad_norm': 0.48611384630203247, 'learning_rate': 3.567796610169492e-05, 'epoch': 0.9}


 30%|██▉       | 8960/30000 [4:19:55<46:02:25,  7.88s/it]   

{'loss': 0.2554, 'grad_norm': 0.606724202632904, 'learning_rate': 3.566101694915254e-05, 'epoch': 0.9}


 30%|██▉       | 8970/30000 [4:19:58<3:01:43,  1.93it/s] 

{'loss': 0.2529, 'grad_norm': 0.5789754986763, 'learning_rate': 3.564406779661017e-05, 'epoch': 0.9}


 30%|██▉       | 8980/30000 [4:20:01<1:48:46,  3.22it/s]

{'loss': 0.2448, 'grad_norm': 0.6528916954994202, 'learning_rate': 3.56271186440678e-05, 'epoch': 0.9}


 30%|██▉       | 8990/30000 [4:20:04<1:46:46,  3.28it/s]

{'loss': 0.2409, 'grad_norm': 0.5414039492607117, 'learning_rate': 3.561016949152542e-05, 'epoch': 0.9}


 30%|███       | 9000/30000 [4:20:07<1:46:42,  3.28it/s]

{'loss': 0.2615, 'grad_norm': 0.7227181196212769, 'learning_rate': 3.559322033898305e-05, 'epoch': 0.9}


 30%|███       | 9010/30000 [4:20:10<1:46:41,  3.28it/s]

{'loss': 0.2724, 'grad_norm': 0.5971134901046753, 'learning_rate': 3.557627118644068e-05, 'epoch': 0.9}


 30%|███       | 9020/30000 [4:20:13<1:46:37,  3.28it/s]

{'loss': 0.2458, 'grad_norm': 0.5792104005813599, 'learning_rate': 3.555932203389831e-05, 'epoch': 0.9}


 30%|███       | 9030/30000 [4:20:16<1:46:45,  3.27it/s]

{'loss': 0.2576, 'grad_norm': 0.7054551839828491, 'learning_rate': 3.554237288135593e-05, 'epoch': 0.9}


 30%|███       | 9040/30000 [4:20:19<1:46:35,  3.28it/s]

{'loss': 0.2729, 'grad_norm': 0.5321029424667358, 'learning_rate': 3.552542372881356e-05, 'epoch': 0.9}


 30%|███       | 9050/30000 [4:20:22<1:46:34,  3.28it/s]

{'loss': 0.2796, 'grad_norm': 0.7519622445106506, 'learning_rate': 3.550847457627119e-05, 'epoch': 0.91}


 30%|███       | 9060/30000 [4:20:25<1:46:43,  3.27it/s]

{'loss': 0.2655, 'grad_norm': 0.6059890985488892, 'learning_rate': 3.5491525423728814e-05, 'epoch': 0.91}


 30%|███       | 9070/30000 [4:20:28<1:46:35,  3.27it/s]

{'loss': 0.2573, 'grad_norm': 0.6108173131942749, 'learning_rate': 3.547457627118644e-05, 'epoch': 0.91}


 30%|███       | 9080/30000 [4:20:31<1:46:26,  3.28it/s]

{'loss': 0.255, 'grad_norm': 0.5227458477020264, 'learning_rate': 3.545762711864407e-05, 'epoch': 0.91}


 30%|███       | 9090/30000 [4:20:34<1:46:25,  3.27it/s]

{'loss': 0.2714, 'grad_norm': 0.5541989207267761, 'learning_rate': 3.54406779661017e-05, 'epoch': 0.91}


 30%|███       | 9100/30000 [4:20:37<1:46:28,  3.27it/s]

{'loss': 0.2536, 'grad_norm': 0.5636459589004517, 'learning_rate': 3.5423728813559324e-05, 'epoch': 0.91}


 30%|███       | 9110/30000 [4:20:41<1:46:29,  3.27it/s]

{'loss': 0.2592, 'grad_norm': 0.49268436431884766, 'learning_rate': 3.5406779661016954e-05, 'epoch': 0.91}


 30%|███       | 9120/30000 [4:20:44<1:46:25,  3.27it/s]

{'loss': 0.254, 'grad_norm': 0.5475967526435852, 'learning_rate': 3.538983050847458e-05, 'epoch': 0.91}


 30%|███       | 9130/30000 [4:20:47<1:46:21,  3.27it/s]

{'loss': 0.2736, 'grad_norm': 0.5711052417755127, 'learning_rate': 3.5372881355932205e-05, 'epoch': 0.91}


 30%|███       | 9140/30000 [4:20:50<1:46:21,  3.27it/s]

{'loss': 0.258, 'grad_norm': 0.7219032049179077, 'learning_rate': 3.535593220338983e-05, 'epoch': 0.91}


 30%|███       | 9150/30000 [4:20:53<1:46:14,  3.27it/s]

{'loss': 0.2541, 'grad_norm': 0.7092418670654297, 'learning_rate': 3.533898305084746e-05, 'epoch': 0.92}


 31%|███       | 9160/30000 [4:20:56<1:46:08,  3.27it/s]

{'loss': 0.2681, 'grad_norm': 0.7965629696846008, 'learning_rate': 3.532203389830509e-05, 'epoch': 0.92}


 31%|███       | 9170/30000 [4:20:59<1:46:07,  3.27it/s]

{'loss': 0.2617, 'grad_norm': 0.6886920928955078, 'learning_rate': 3.530508474576271e-05, 'epoch': 0.92}


 31%|███       | 9180/30000 [4:21:02<1:45:59,  3.27it/s]

{'loss': 0.2584, 'grad_norm': 0.9540634751319885, 'learning_rate': 3.528813559322034e-05, 'epoch': 0.92}


 31%|███       | 9190/30000 [4:21:05<1:46:01,  3.27it/s]

{'loss': 0.2737, 'grad_norm': 0.6919590830802917, 'learning_rate': 3.527118644067797e-05, 'epoch': 0.92}


 31%|███       | 9200/30000 [4:21:08<1:46:00,  3.27it/s]

{'loss': 0.255, 'grad_norm': 0.5271408557891846, 'learning_rate': 3.52542372881356e-05, 'epoch': 0.92}


 31%|███       | 9210/30000 [4:21:11<1:45:50,  3.27it/s]

{'loss': 0.2562, 'grad_norm': 0.5684337019920349, 'learning_rate': 3.523728813559322e-05, 'epoch': 0.92}


 31%|███       | 9220/30000 [4:21:14<1:45:46,  3.27it/s]

{'loss': 0.264, 'grad_norm': 0.5711985230445862, 'learning_rate': 3.522033898305085e-05, 'epoch': 0.92}


 31%|███       | 9230/30000 [4:21:17<1:45:49,  3.27it/s]

{'loss': 0.2607, 'grad_norm': 0.6377294659614563, 'learning_rate': 3.520338983050848e-05, 'epoch': 0.92}


 31%|███       | 9240/30000 [4:21:20<1:45:42,  3.27it/s]

{'loss': 0.262, 'grad_norm': 0.5187332034111023, 'learning_rate': 3.51864406779661e-05, 'epoch': 0.92}


 31%|███       | 9250/30000 [4:21:23<1:45:40,  3.27it/s]

{'loss': 0.2599, 'grad_norm': 0.5612466931343079, 'learning_rate': 3.516949152542373e-05, 'epoch': 0.93}


 31%|███       | 9260/30000 [4:21:26<1:45:42,  3.27it/s]

{'loss': 0.2637, 'grad_norm': 0.5461302995681763, 'learning_rate': 3.515254237288136e-05, 'epoch': 0.93}


 31%|███       | 9270/30000 [4:21:29<1:45:31,  3.27it/s]

{'loss': 0.264, 'grad_norm': 0.7031240463256836, 'learning_rate': 3.513559322033899e-05, 'epoch': 0.93}


 31%|███       | 9280/30000 [4:21:33<1:45:31,  3.27it/s]

{'loss': 0.2616, 'grad_norm': 0.5196543335914612, 'learning_rate': 3.511864406779661e-05, 'epoch': 0.93}


 31%|███       | 9290/30000 [4:21:36<1:45:30,  3.27it/s]

{'loss': 0.2471, 'grad_norm': 0.7933576703071594, 'learning_rate': 3.510169491525424e-05, 'epoch': 0.93}


 31%|███       | 9300/30000 [4:21:39<1:45:24,  3.27it/s]

{'loss': 0.2647, 'grad_norm': 0.7147622108459473, 'learning_rate': 3.508474576271187e-05, 'epoch': 0.93}


 31%|███       | 9310/30000 [4:21:42<1:45:25,  3.27it/s]

{'loss': 0.2637, 'grad_norm': 0.6752926707267761, 'learning_rate': 3.506779661016949e-05, 'epoch': 0.93}


 31%|███       | 9320/30000 [4:21:45<1:45:20,  3.27it/s]

{'loss': 0.2635, 'grad_norm': 0.4469788372516632, 'learning_rate': 3.505084745762712e-05, 'epoch': 0.93}


 31%|███       | 9330/30000 [4:21:48<1:45:11,  3.28it/s]

{'loss': 0.2386, 'grad_norm': 0.7906374335289001, 'learning_rate': 3.5033898305084745e-05, 'epoch': 0.93}


 31%|███       | 9340/30000 [4:21:51<1:45:14,  3.27it/s]

{'loss': 0.2625, 'grad_norm': 0.5574406981468201, 'learning_rate': 3.5016949152542374e-05, 'epoch': 0.93}


 31%|███       | 9350/30000 [4:21:54<1:45:22,  3.27it/s]

{'loss': 0.245, 'grad_norm': 0.5249709486961365, 'learning_rate': 3.5e-05, 'epoch': 0.94}


 31%|███       | 9360/30000 [4:21:57<1:45:09,  3.27it/s]

{'loss': 0.2502, 'grad_norm': 0.4937158226966858, 'learning_rate': 3.4983050847457626e-05, 'epoch': 0.94}


 31%|███       | 9370/30000 [4:22:00<1:45:06,  3.27it/s]

{'loss': 0.2573, 'grad_norm': 0.9864760637283325, 'learning_rate': 3.4966101694915256e-05, 'epoch': 0.94}


 31%|███▏      | 9380/30000 [4:22:03<1:45:00,  3.27it/s]

{'loss': 0.2541, 'grad_norm': 0.5402787923812866, 'learning_rate': 3.4949152542372885e-05, 'epoch': 0.94}


 31%|███▏      | 9390/30000 [4:22:06<1:50:29,  3.11it/s]

{'loss': 0.261, 'grad_norm': 0.4875524342060089, 'learning_rate': 3.493220338983051e-05, 'epoch': 0.94}


 31%|███▏      | 9400/30000 [4:22:10<1:52:30,  3.05it/s]

{'loss': 0.2513, 'grad_norm': 0.5282691121101379, 'learning_rate': 3.491525423728814e-05, 'epoch': 0.94}


 31%|███▏      | 9410/30000 [4:22:13<1:54:05,  3.01it/s]

{'loss': 0.2628, 'grad_norm': 0.5585328936576843, 'learning_rate': 3.4898305084745766e-05, 'epoch': 0.94}


 31%|███▏      | 9420/30000 [4:22:16<1:55:16,  2.98it/s]

{'loss': 0.2572, 'grad_norm': 0.558936357498169, 'learning_rate': 3.488135593220339e-05, 'epoch': 0.94}


 31%|███▏      | 9430/30000 [4:22:23<4:56:31,  1.16it/s]

{'loss': 0.2553, 'grad_norm': 0.4896314740180969, 'learning_rate': 3.486440677966102e-05, 'epoch': 0.94}


 31%|███▏      | 9440/30000 [4:22:33<5:16:33,  1.08it/s]

{'loss': 0.2439, 'grad_norm': 0.5444395542144775, 'learning_rate': 3.484745762711865e-05, 'epoch': 0.94}


 32%|███▏      | 9450/30000 [4:38:24<95:27:34, 16.72s/it]   

{'loss': 0.2607, 'grad_norm': 0.5728549957275391, 'learning_rate': 3.483050847457627e-05, 'epoch': 0.94}


 32%|███▏      | 9460/30000 [4:38:27<4:22:55,  1.30it/s] 

{'loss': 0.2503, 'grad_norm': 0.5478812456130981, 'learning_rate': 3.48135593220339e-05, 'epoch': 0.95}


 32%|███▏      | 9470/30000 [4:38:30<1:48:36,  3.15it/s]

{'loss': 0.2564, 'grad_norm': 0.6719084978103638, 'learning_rate': 3.479661016949153e-05, 'epoch': 0.95}


 32%|███▏      | 9480/30000 [4:38:33<1:44:11,  3.28it/s]

{'loss': 0.2451, 'grad_norm': 0.6583508849143982, 'learning_rate': 3.477966101694916e-05, 'epoch': 0.95}


 32%|███▏      | 9490/30000 [4:38:36<1:44:00,  3.29it/s]

{'loss': 0.2759, 'grad_norm': 0.5175721645355225, 'learning_rate': 3.476271186440678e-05, 'epoch': 0.95}


 32%|███▏      | 9500/30000 [4:38:39<1:44:06,  3.28it/s]

{'loss': 0.2682, 'grad_norm': 0.5450952649116516, 'learning_rate': 3.474576271186441e-05, 'epoch': 0.95}


 32%|███▏      | 9510/30000 [4:38:42<1:44:01,  3.28it/s]

{'loss': 0.2575, 'grad_norm': 0.6913870573043823, 'learning_rate': 3.472881355932204e-05, 'epoch': 0.95}


 32%|███▏      | 9520/30000 [4:38:45<1:44:02,  3.28it/s]

{'loss': 0.264, 'grad_norm': 0.5501074194908142, 'learning_rate': 3.471186440677966e-05, 'epoch': 0.95}


 32%|███▏      | 9530/30000 [4:38:48<1:44:11,  3.27it/s]

{'loss': 0.2509, 'grad_norm': 0.6638250946998596, 'learning_rate': 3.469491525423729e-05, 'epoch': 0.95}


 32%|███▏      | 9540/30000 [4:38:51<1:43:54,  3.28it/s]

{'loss': 0.256, 'grad_norm': 0.7375466227531433, 'learning_rate': 3.4677966101694914e-05, 'epoch': 0.95}


 32%|███▏      | 9550/30000 [4:38:54<1:43:49,  3.28it/s]

{'loss': 0.2749, 'grad_norm': 0.5513647794723511, 'learning_rate': 3.466101694915254e-05, 'epoch': 0.95}


 32%|███▏      | 9560/30000 [4:38:57<1:44:03,  3.27it/s]

{'loss': 0.2576, 'grad_norm': 0.4682042896747589, 'learning_rate': 3.4644067796610166e-05, 'epoch': 0.96}


 32%|███▏      | 9570/30000 [4:39:00<1:44:04,  3.27it/s]

{'loss': 0.2777, 'grad_norm': 0.6754206418991089, 'learning_rate': 3.4627118644067795e-05, 'epoch': 0.96}


 32%|███▏      | 9580/30000 [4:39:03<1:43:56,  3.27it/s]

{'loss': 0.2633, 'grad_norm': 0.6831997036933899, 'learning_rate': 3.4610169491525425e-05, 'epoch': 0.96}


 32%|███▏      | 9590/30000 [4:39:07<1:44:02,  3.27it/s]

{'loss': 0.2628, 'grad_norm': 0.7172915935516357, 'learning_rate': 3.4593220338983054e-05, 'epoch': 0.96}


 32%|███▏      | 9600/30000 [4:39:10<1:44:03,  3.27it/s]

{'loss': 0.2476, 'grad_norm': 0.6867793798446655, 'learning_rate': 3.4576271186440676e-05, 'epoch': 0.96}


 32%|███▏      | 9610/30000 [4:39:13<1:43:53,  3.27it/s]

{'loss': 0.2752, 'grad_norm': 0.7182340621948242, 'learning_rate': 3.4559322033898306e-05, 'epoch': 0.96}


 32%|███▏      | 9620/30000 [4:39:16<1:43:57,  3.27it/s]

{'loss': 0.2669, 'grad_norm': 0.4876459538936615, 'learning_rate': 3.4542372881355935e-05, 'epoch': 0.96}


 32%|███▏      | 9630/30000 [4:39:19<1:43:49,  3.27it/s]

{'loss': 0.251, 'grad_norm': 0.7503118515014648, 'learning_rate': 3.452542372881356e-05, 'epoch': 0.96}


 32%|███▏      | 9640/30000 [4:39:22<1:43:43,  3.27it/s]

{'loss': 0.255, 'grad_norm': 0.7052143812179565, 'learning_rate': 3.450847457627119e-05, 'epoch': 0.96}


 32%|███▏      | 9650/30000 [4:39:25<1:43:39,  3.27it/s]

{'loss': 0.2635, 'grad_norm': 0.5007414221763611, 'learning_rate': 3.4491525423728816e-05, 'epoch': 0.96}


 32%|███▏      | 9660/30000 [4:39:28<1:43:37,  3.27it/s]

{'loss': 0.2571, 'grad_norm': 0.6712766885757446, 'learning_rate': 3.4474576271186446e-05, 'epoch': 0.97}


 32%|███▏      | 9670/30000 [4:39:31<1:43:32,  3.27it/s]

{'loss': 0.2644, 'grad_norm': 0.6767179369926453, 'learning_rate': 3.445762711864407e-05, 'epoch': 0.97}


 32%|███▏      | 9680/30000 [4:39:34<1:43:32,  3.27it/s]

{'loss': 0.2356, 'grad_norm': 0.5238375067710876, 'learning_rate': 3.44406779661017e-05, 'epoch': 0.97}


 32%|███▏      | 9690/30000 [4:39:37<1:43:30,  3.27it/s]

{'loss': 0.2607, 'grad_norm': 0.6581398248672485, 'learning_rate': 3.442372881355933e-05, 'epoch': 0.97}


 32%|███▏      | 9700/30000 [4:39:40<1:43:27,  3.27it/s]

{'loss': 0.2518, 'grad_norm': 0.5331130027770996, 'learning_rate': 3.440677966101695e-05, 'epoch': 0.97}


 32%|███▏      | 9710/30000 [4:39:43<1:43:21,  3.27it/s]

{'loss': 0.2404, 'grad_norm': 0.5416918396949768, 'learning_rate': 3.438983050847458e-05, 'epoch': 0.97}


 32%|███▏      | 9720/30000 [4:39:46<1:43:18,  3.27it/s]

{'loss': 0.25, 'grad_norm': 0.5340670347213745, 'learning_rate': 3.437288135593221e-05, 'epoch': 0.97}


 32%|███▏      | 9730/30000 [4:39:49<1:43:13,  3.27it/s]

{'loss': 0.2493, 'grad_norm': 0.5560003519058228, 'learning_rate': 3.435593220338984e-05, 'epoch': 0.97}


 32%|███▏      | 9740/30000 [4:39:52<1:43:13,  3.27it/s]

{'loss': 0.2552, 'grad_norm': 0.6528425812721252, 'learning_rate': 3.433898305084746e-05, 'epoch': 0.97}


 32%|███▎      | 9750/30000 [4:39:55<1:43:05,  3.27it/s]

{'loss': 0.2651, 'grad_norm': 0.5280506014823914, 'learning_rate': 3.432203389830508e-05, 'epoch': 0.97}


 33%|███▎      | 9760/30000 [4:39:59<1:43:03,  3.27it/s]

{'loss': 0.259, 'grad_norm': 0.5290386080741882, 'learning_rate': 3.430508474576271e-05, 'epoch': 0.98}


 33%|███▎      | 9770/30000 [4:40:02<1:43:02,  3.27it/s]

{'loss': 0.2563, 'grad_norm': 0.545465886592865, 'learning_rate': 3.428813559322034e-05, 'epoch': 0.98}


 33%|███▎      | 9780/30000 [4:40:05<1:43:00,  3.27it/s]

{'loss': 0.254, 'grad_norm': 0.553727388381958, 'learning_rate': 3.4271186440677964e-05, 'epoch': 0.98}


 33%|███▎      | 9790/30000 [4:40:08<1:42:59,  3.27it/s]

{'loss': 0.2601, 'grad_norm': 0.5834516882896423, 'learning_rate': 3.4254237288135593e-05, 'epoch': 0.98}


 33%|███▎      | 9800/30000 [4:40:11<1:42:56,  3.27it/s]

{'loss': 0.2606, 'grad_norm': 0.6160003542900085, 'learning_rate': 3.423728813559322e-05, 'epoch': 0.98}


 33%|███▎      | 9810/30000 [4:40:14<1:42:48,  3.27it/s]

{'loss': 0.2525, 'grad_norm': 0.5127760767936707, 'learning_rate': 3.4220338983050845e-05, 'epoch': 0.98}


 33%|███▎      | 9820/30000 [4:40:17<1:42:45,  3.27it/s]

{'loss': 0.2545, 'grad_norm': 0.502547562122345, 'learning_rate': 3.4203389830508475e-05, 'epoch': 0.98}


 33%|███▎      | 9830/30000 [4:40:20<1:42:48,  3.27it/s]

{'loss': 0.2608, 'grad_norm': 0.442190557718277, 'learning_rate': 3.4186440677966104e-05, 'epoch': 0.98}


 33%|███▎      | 9840/30000 [4:40:23<1:42:40,  3.27it/s]

{'loss': 0.2646, 'grad_norm': 0.4991950988769531, 'learning_rate': 3.416949152542373e-05, 'epoch': 0.98}


 33%|███▎      | 9850/30000 [4:40:26<1:42:40,  3.27it/s]

{'loss': 0.2262, 'grad_norm': 0.45101669430732727, 'learning_rate': 3.4152542372881356e-05, 'epoch': 0.98}


 33%|███▎      | 9860/30000 [4:40:29<1:42:45,  3.27it/s]

{'loss': 0.2504, 'grad_norm': 0.5784295797348022, 'learning_rate': 3.4135593220338985e-05, 'epoch': 0.99}


 33%|███▎      | 9870/30000 [4:40:32<1:44:37,  3.21it/s]

{'loss': 0.2613, 'grad_norm': 0.6523652672767639, 'learning_rate': 3.4118644067796615e-05, 'epoch': 0.99}


 33%|███▎      | 9880/30000 [4:40:35<1:46:57,  3.14it/s]

{'loss': 0.2638, 'grad_norm': 0.5769441723823547, 'learning_rate': 3.410169491525424e-05, 'epoch': 0.99}


 33%|███▎      | 9890/30000 [4:40:39<1:48:48,  3.08it/s]

{'loss': 0.2532, 'grad_norm': 0.46279335021972656, 'learning_rate': 3.4084745762711867e-05, 'epoch': 0.99}


 33%|███▎      | 9900/30000 [4:40:42<1:51:03,  3.02it/s]

{'loss': 0.2477, 'grad_norm': 0.547592282295227, 'learning_rate': 3.4067796610169496e-05, 'epoch': 0.99}


 33%|███▎      | 9910/30000 [4:40:45<1:51:46,  3.00it/s]

{'loss': 0.2442, 'grad_norm': 0.5427880883216858, 'learning_rate': 3.4050847457627125e-05, 'epoch': 0.99}


 33%|███▎      | 9920/30000 [4:40:51<4:13:08,  1.32it/s]

{'loss': 0.2459, 'grad_norm': 0.5736339092254639, 'learning_rate': 3.403389830508475e-05, 'epoch': 0.99}


 33%|███▎      | 9930/30000 [4:44:43<187:11:41, 33.58s/it]

{'loss': 0.2686, 'grad_norm': 0.6054428219795227, 'learning_rate': 3.401694915254238e-05, 'epoch': 0.99}


 33%|███▎      | 9940/30000 [4:44:46<6:56:33,  1.25s/it]  

{'loss': 0.2538, 'grad_norm': 0.6529703736305237, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.99}


 33%|███▎      | 9950/30000 [4:44:49<1:50:52,  3.01it/s]

{'loss': 0.2651, 'grad_norm': 0.5535969138145447, 'learning_rate': 3.398305084745763e-05, 'epoch': 0.99}


 33%|███▎      | 9960/30000 [4:44:52<1:42:16,  3.27it/s]

{'loss': 0.2456, 'grad_norm': 0.4780447781085968, 'learning_rate': 3.396610169491525e-05, 'epoch': 1.0}


 33%|███▎      | 9970/30000 [4:44:55<1:42:01,  3.27it/s]

{'loss': 0.2505, 'grad_norm': 0.4932645559310913, 'learning_rate': 3.394915254237288e-05, 'epoch': 1.0}


 33%|███▎      | 9980/30000 [4:44:58<1:42:00,  3.27it/s]

{'loss': 0.2496, 'grad_norm': 0.7137415409088135, 'learning_rate': 3.393220338983051e-05, 'epoch': 1.0}


 33%|███▎      | 9990/30000 [4:45:01<1:41:55,  3.27it/s]

{'loss': 0.233, 'grad_norm': 0.5489307641983032, 'learning_rate': 3.391525423728813e-05, 'epoch': 1.0}


 33%|███▎      | 10000/30000 [4:45:04<1:41:56,  3.27it/s]

{'loss': 0.2422, 'grad_norm': 0.5196059346199036, 'learning_rate': 3.389830508474576e-05, 'epoch': 1.0}


                                                         
 33%|███▎      | 10000/30000 [5:04:21<1:41:56,  3.27it/s]

{'eval_loss': 0.24730314314365387, 'eval_runtime': 1156.5794, 'eval_samples_per_second': 17.292, 'eval_steps_per_second': 2.162, 'epoch': 1.0}


 33%|███▎      | 10010/30000 [5:05:06<82:18:01, 14.82s/it]   

{'loss': 0.2459, 'grad_norm': 0.5203260183334351, 'learning_rate': 3.388135593220339e-05, 'epoch': 1.0}


 33%|███▎      | 10020/30000 [5:05:09<3:58:20,  1.40it/s] 

{'loss': 0.2396, 'grad_norm': 0.5749255418777466, 'learning_rate': 3.386440677966102e-05, 'epoch': 1.0}


 33%|███▎      | 10030/30000 [5:05:14<1:59:24,  2.79it/s]

{'loss': 0.247, 'grad_norm': 0.5529994368553162, 'learning_rate': 3.3847457627118644e-05, 'epoch': 1.0}


 33%|███▎      | 10040/30000 [5:05:17<1:42:09,  3.26it/s]

{'loss': 0.26, 'grad_norm': 0.6380698680877686, 'learning_rate': 3.383050847457627e-05, 'epoch': 1.0}


 34%|███▎      | 10050/30000 [5:05:21<1:41:38,  3.27it/s]

{'loss': 0.2708, 'grad_norm': 0.5699272751808167, 'learning_rate': 3.38135593220339e-05, 'epoch': 1.0}


 34%|███▎      | 10060/30000 [5:05:24<1:41:26,  3.28it/s]

{'loss': 0.2491, 'grad_norm': 0.5641351342201233, 'learning_rate': 3.3796610169491525e-05, 'epoch': 1.01}


 34%|███▎      | 10070/30000 [5:05:27<1:41:29,  3.27it/s]

{'loss': 0.2681, 'grad_norm': 0.6223743557929993, 'learning_rate': 3.3779661016949154e-05, 'epoch': 1.01}


 34%|███▎      | 10080/30000 [5:05:30<1:41:30,  3.27it/s]

{'loss': 0.2611, 'grad_norm': 0.5541309714317322, 'learning_rate': 3.3762711864406784e-05, 'epoch': 1.01}


 34%|███▎      | 10090/30000 [5:05:33<1:41:37,  3.27it/s]

{'loss': 0.2496, 'grad_norm': 0.6108355522155762, 'learning_rate': 3.3745762711864406e-05, 'epoch': 1.01}


 34%|███▎      | 10100/30000 [5:05:36<1:41:16,  3.28it/s]

{'loss': 0.2635, 'grad_norm': 0.565021276473999, 'learning_rate': 3.3728813559322035e-05, 'epoch': 1.01}


 34%|███▎      | 10110/30000 [5:05:39<1:41:17,  3.27it/s]

{'loss': 0.2518, 'grad_norm': 0.5756531357765198, 'learning_rate': 3.3711864406779665e-05, 'epoch': 1.01}


 34%|███▎      | 10120/30000 [5:05:42<1:41:20,  3.27it/s]

{'loss': 0.2553, 'grad_norm': 0.9763126969337463, 'learning_rate': 3.3694915254237294e-05, 'epoch': 1.01}


 34%|███▍      | 10130/30000 [5:05:45<1:41:14,  3.27it/s]

{'loss': 0.2586, 'grad_norm': 0.5772692561149597, 'learning_rate': 3.367796610169492e-05, 'epoch': 1.01}


 34%|███▍      | 10140/30000 [5:05:48<1:41:13,  3.27it/s]

{'loss': 0.2516, 'grad_norm': 0.6047634482383728, 'learning_rate': 3.3661016949152546e-05, 'epoch': 1.01}


 34%|███▍      | 10150/30000 [5:22:32<196:31:15, 35.64s/it]  

{'loss': 0.2531, 'grad_norm': 0.5119475722312927, 'learning_rate': 3.3644067796610175e-05, 'epoch': 1.01}


 34%|███▍      | 10160/30000 [5:22:35<7:10:32,  1.30s/it]  

{'loss': 0.2644, 'grad_norm': 0.4852638244628906, 'learning_rate': 3.36271186440678e-05, 'epoch': 1.02}


 34%|███▍      | 10170/30000 [5:22:38<1:49:46,  3.01it/s]

{'loss': 0.2481, 'grad_norm': 0.6277738809585571, 'learning_rate': 3.361016949152542e-05, 'epoch': 1.02}


 34%|███▍      | 10180/30000 [5:22:41<1:41:02,  3.27it/s]

{'loss': 0.2488, 'grad_norm': 0.5649338960647583, 'learning_rate': 3.359322033898305e-05, 'epoch': 1.02}


 34%|███▍      | 10190/30000 [5:22:44<1:40:40,  3.28it/s]

{'loss': 0.2434, 'grad_norm': 0.4568987488746643, 'learning_rate': 3.357627118644068e-05, 'epoch': 1.02}


 34%|███▍      | 10200/30000 [5:22:47<1:40:43,  3.28it/s]

{'loss': 0.2449, 'grad_norm': 0.5956727862358093, 'learning_rate': 3.35593220338983e-05, 'epoch': 1.02}


 34%|███▍      | 10210/30000 [5:22:51<1:40:37,  3.28it/s]

{'loss': 0.2545, 'grad_norm': 0.5460880398750305, 'learning_rate': 3.354237288135593e-05, 'epoch': 1.02}


 34%|███▍      | 10220/30000 [5:22:54<1:40:33,  3.28it/s]

{'loss': 0.2551, 'grad_norm': 0.6364100575447083, 'learning_rate': 3.352542372881356e-05, 'epoch': 1.02}


 34%|███▍      | 10230/30000 [5:22:57<1:40:31,  3.28it/s]

{'loss': 0.2489, 'grad_norm': 0.4984033405780792, 'learning_rate': 3.350847457627119e-05, 'epoch': 1.02}


 34%|███▍      | 10240/30000 [5:23:00<1:40:34,  3.27it/s]

{'loss': 0.2475, 'grad_norm': 0.5847160816192627, 'learning_rate': 3.349152542372881e-05, 'epoch': 1.02}


 34%|███▍      | 10250/30000 [5:23:03<1:40:30,  3.27it/s]

{'loss': 0.2533, 'grad_norm': 0.5527133345603943, 'learning_rate': 3.347457627118644e-05, 'epoch': 1.02}


 34%|███▍      | 10260/30000 [5:23:06<1:40:35,  3.27it/s]

{'loss': 0.2412, 'grad_norm': 0.6311824917793274, 'learning_rate': 3.345762711864407e-05, 'epoch': 1.03}


 34%|███▍      | 10270/30000 [5:23:09<1:40:29,  3.27it/s]

{'loss': 0.2534, 'grad_norm': 0.5088281631469727, 'learning_rate': 3.3440677966101694e-05, 'epoch': 1.03}


 34%|███▍      | 10280/30000 [5:23:12<1:40:27,  3.27it/s]

{'loss': 0.2472, 'grad_norm': 0.520300567150116, 'learning_rate': 3.342372881355932e-05, 'epoch': 1.03}


 34%|███▍      | 10290/30000 [5:23:15<1:40:26,  3.27it/s]

{'loss': 0.2416, 'grad_norm': 0.5875226855278015, 'learning_rate': 3.340677966101695e-05, 'epoch': 1.03}


 34%|███▍      | 10300/30000 [5:23:18<1:40:21,  3.27it/s]

{'loss': 0.2626, 'grad_norm': 0.5294546484947205, 'learning_rate': 3.338983050847458e-05, 'epoch': 1.03}


 34%|███▍      | 10310/30000 [5:23:21<1:40:18,  3.27it/s]

{'loss': 0.2486, 'grad_norm': 0.5643078684806824, 'learning_rate': 3.3372881355932204e-05, 'epoch': 1.03}


 34%|███▍      | 10320/30000 [5:23:24<1:40:16,  3.27it/s]

{'loss': 0.2506, 'grad_norm': 0.5196597576141357, 'learning_rate': 3.3355932203389834e-05, 'epoch': 1.03}


 34%|███▍      | 10330/30000 [5:23:27<1:40:16,  3.27it/s]

{'loss': 0.2639, 'grad_norm': 0.9709494709968567, 'learning_rate': 3.333898305084746e-05, 'epoch': 1.03}


 34%|███▍      | 10340/30000 [5:23:30<1:40:06,  3.27it/s]

{'loss': 0.2653, 'grad_norm': 0.5381131172180176, 'learning_rate': 3.3322033898305086e-05, 'epoch': 1.03}


 34%|███▍      | 10350/30000 [5:23:33<1:40:00,  3.27it/s]

{'loss': 0.2697, 'grad_norm': 0.5785926580429077, 'learning_rate': 3.3305084745762715e-05, 'epoch': 1.03}


 35%|███▍      | 10360/30000 [5:23:36<1:39:59,  3.27it/s]

{'loss': 0.2522, 'grad_norm': 0.5846459865570068, 'learning_rate': 3.3288135593220344e-05, 'epoch': 1.04}


 35%|███▍      | 10370/30000 [5:23:39<1:39:53,  3.27it/s]

{'loss': 0.2607, 'grad_norm': 0.6861108541488647, 'learning_rate': 3.327118644067797e-05, 'epoch': 1.04}


 35%|███▍      | 10380/30000 [5:23:43<1:39:54,  3.27it/s]

{'loss': 0.248, 'grad_norm': 0.4258383810520172, 'learning_rate': 3.325423728813559e-05, 'epoch': 1.04}


 35%|███▍      | 10390/30000 [5:23:46<1:39:52,  3.27it/s]

{'loss': 0.242, 'grad_norm': 0.4548999071121216, 'learning_rate': 3.323728813559322e-05, 'epoch': 1.04}


 35%|███▍      | 10400/30000 [5:23:49<1:39:47,  3.27it/s]

{'loss': 0.2654, 'grad_norm': 0.6988349556922913, 'learning_rate': 3.322033898305085e-05, 'epoch': 1.04}


 35%|███▍      | 10410/30000 [5:23:52<1:39:48,  3.27it/s]

{'loss': 0.2484, 'grad_norm': 0.5326705574989319, 'learning_rate': 3.320338983050848e-05, 'epoch': 1.04}


 35%|███▍      | 10420/30000 [5:23:55<1:39:50,  3.27it/s]

{'loss': 0.2514, 'grad_norm': 0.5084748864173889, 'learning_rate': 3.31864406779661e-05, 'epoch': 1.04}


 35%|███▍      | 10430/30000 [5:23:58<1:39:39,  3.27it/s]

{'loss': 0.2474, 'grad_norm': 0.5119325518608093, 'learning_rate': 3.316949152542373e-05, 'epoch': 1.04}


 35%|███▍      | 10440/30000 [5:24:01<1:39:40,  3.27it/s]

{'loss': 0.2392, 'grad_norm': 0.6027953624725342, 'learning_rate': 3.315254237288136e-05, 'epoch': 1.04}


 35%|███▍      | 10450/30000 [5:24:04<1:39:32,  3.27it/s]

{'loss': 0.2685, 'grad_norm': 0.5392552018165588, 'learning_rate': 3.313559322033898e-05, 'epoch': 1.04}


 35%|███▍      | 10460/30000 [5:24:07<1:39:28,  3.27it/s]

{'loss': 0.2448, 'grad_norm': 0.6099080443382263, 'learning_rate': 3.311864406779661e-05, 'epoch': 1.05}


 35%|███▍      | 10470/30000 [5:24:10<1:39:28,  3.27it/s]

{'loss': 0.2503, 'grad_norm': 0.5366322994232178, 'learning_rate': 3.310169491525424e-05, 'epoch': 1.05}


 35%|███▍      | 10480/30000 [5:24:13<1:39:27,  3.27it/s]

{'loss': 0.2412, 'grad_norm': 0.5309224128723145, 'learning_rate': 3.308474576271187e-05, 'epoch': 1.05}


 35%|███▍      | 10490/30000 [5:24:16<1:39:27,  3.27it/s]

{'loss': 0.2378, 'grad_norm': 0.5466753840446472, 'learning_rate': 3.306779661016949e-05, 'epoch': 1.05}


 35%|███▌      | 10500/30000 [5:24:19<1:39:20,  3.27it/s]

{'loss': 0.237, 'grad_norm': 0.5999539494514465, 'learning_rate': 3.305084745762712e-05, 'epoch': 1.05}


 35%|███▌      | 10510/30000 [5:24:22<1:39:23,  3.27it/s]

{'loss': 0.2212, 'grad_norm': 0.7215646505355835, 'learning_rate': 3.303389830508475e-05, 'epoch': 1.05}


 35%|███▌      | 10520/30000 [5:24:25<1:39:14,  3.27it/s]

{'loss': 0.2459, 'grad_norm': 0.49471431970596313, 'learning_rate': 3.301694915254237e-05, 'epoch': 1.05}


 35%|███▌      | 10530/30000 [5:24:28<1:39:11,  3.27it/s]

{'loss': 0.245, 'grad_norm': 0.49413859844207764, 'learning_rate': 3.3e-05, 'epoch': 1.05}


 35%|███▌      | 10540/30000 [5:24:31<1:39:04,  3.27it/s]

{'loss': 0.2382, 'grad_norm': 0.638896644115448, 'learning_rate': 3.298305084745763e-05, 'epoch': 1.05}


 35%|███▌      | 10550/30000 [5:24:35<1:39:05,  3.27it/s]

{'loss': 0.2444, 'grad_norm': 0.4584261178970337, 'learning_rate': 3.296610169491526e-05, 'epoch': 1.05}


 35%|███▌      | 10560/30000 [5:24:38<1:39:01,  3.27it/s]

{'loss': 0.2578, 'grad_norm': 0.5788519382476807, 'learning_rate': 3.2949152542372884e-05, 'epoch': 1.06}


 35%|███▌      | 10570/30000 [5:24:41<1:41:19,  3.20it/s]

{'loss': 0.2617, 'grad_norm': 0.9783979058265686, 'learning_rate': 3.293220338983051e-05, 'epoch': 1.06}


 35%|███▌      | 10580/30000 [5:24:44<1:43:38,  3.12it/s]

{'loss': 0.2489, 'grad_norm': 0.6128187775611877, 'learning_rate': 3.2915254237288136e-05, 'epoch': 1.06}


 35%|███▌      | 10590/30000 [5:24:47<1:45:28,  3.07it/s]

{'loss': 0.249, 'grad_norm': 0.4537731111049652, 'learning_rate': 3.2898305084745765e-05, 'epoch': 1.06}


 35%|███▌      | 10600/30000 [5:24:50<1:46:55,  3.02it/s]

{'loss': 0.249, 'grad_norm': 0.4993811845779419, 'learning_rate': 3.288135593220339e-05, 'epoch': 1.06}


 35%|███▌      | 10610/30000 [5:24:54<1:48:23,  2.98it/s]

{'loss': 0.249, 'grad_norm': 0.5905165672302246, 'learning_rate': 3.286440677966102e-05, 'epoch': 1.06}


 35%|███▌      | 10620/30000 [5:25:02<4:47:57,  1.12it/s]

{'loss': 0.2524, 'grad_norm': 0.42461147904396057, 'learning_rate': 3.2847457627118646e-05, 'epoch': 1.06}


 35%|███▌      | 10630/30000 [5:40:31<1043:35:27, 193.96s/it]

{'loss': 0.2662, 'grad_norm': 0.5945293307304382, 'learning_rate': 3.283050847457627e-05, 'epoch': 1.06}


 35%|███▌      | 10640/30000 [5:40:34<31:05:05,  5.78s/it]   

{'loss': 0.2635, 'grad_norm': 0.584071159362793, 'learning_rate': 3.28135593220339e-05, 'epoch': 1.06}


 36%|███▌      | 10650/30000 [5:40:37<2:28:15,  2.18it/s] 

{'loss': 0.2696, 'grad_norm': 0.5217599868774414, 'learning_rate': 3.279661016949153e-05, 'epoch': 1.06}


 36%|███▌      | 10660/30000 [5:40:40<1:39:39,  3.23it/s]

{'loss': 0.2426, 'grad_norm': 0.645649790763855, 'learning_rate': 3.277966101694916e-05, 'epoch': 1.07}


 36%|███▌      | 10670/30000 [5:40:43<1:38:20,  3.28it/s]

{'loss': 0.2482, 'grad_norm': 0.49342939257621765, 'learning_rate': 3.276271186440678e-05, 'epoch': 1.07}


 36%|███▌      | 10680/30000 [5:40:46<1:38:11,  3.28it/s]

{'loss': 0.2366, 'grad_norm': 0.6351715326309204, 'learning_rate': 3.274576271186441e-05, 'epoch': 1.07}


 36%|███▌      | 10690/30000 [5:40:49<1:38:06,  3.28it/s]

{'loss': 0.2518, 'grad_norm': 0.5243173837661743, 'learning_rate': 3.272881355932204e-05, 'epoch': 1.07}


 36%|███▌      | 10700/30000 [5:40:52<1:38:04,  3.28it/s]

{'loss': 0.2493, 'grad_norm': 0.5635057687759399, 'learning_rate': 3.271186440677966e-05, 'epoch': 1.07}


 36%|███▌      | 10710/30000 [5:40:55<1:37:57,  3.28it/s]

{'loss': 0.2577, 'grad_norm': 0.5781003832817078, 'learning_rate': 3.269491525423729e-05, 'epoch': 1.07}


 36%|███▌      | 10720/30000 [5:40:58<1:38:04,  3.28it/s]

{'loss': 0.2288, 'grad_norm': 0.6852072477340698, 'learning_rate': 3.267796610169492e-05, 'epoch': 1.07}


 36%|███▌      | 10730/30000 [5:41:01<1:37:48,  3.28it/s]

{'loss': 0.2522, 'grad_norm': 0.6702936291694641, 'learning_rate': 3.266101694915254e-05, 'epoch': 1.07}


 36%|███▌      | 10740/30000 [5:41:04<1:37:54,  3.28it/s]

{'loss': 0.2551, 'grad_norm': 0.5952029824256897, 'learning_rate': 3.264406779661017e-05, 'epoch': 1.07}


 36%|███▌      | 10750/30000 [5:41:07<1:38:05,  3.27it/s]

{'loss': 0.2578, 'grad_norm': 0.5955227613449097, 'learning_rate': 3.26271186440678e-05, 'epoch': 1.07}


 36%|███▌      | 10760/30000 [5:41:10<1:37:55,  3.27it/s]

{'loss': 0.2264, 'grad_norm': 0.5030474662780762, 'learning_rate': 3.261016949152543e-05, 'epoch': 1.08}


 36%|███▌      | 10770/30000 [5:41:13<1:37:58,  3.27it/s]

{'loss': 0.2593, 'grad_norm': 0.8964557647705078, 'learning_rate': 3.259322033898305e-05, 'epoch': 1.08}


 36%|███▌      | 10780/30000 [5:41:17<1:38:04,  3.27it/s]

{'loss': 0.2507, 'grad_norm': 0.41629600524902344, 'learning_rate': 3.257627118644068e-05, 'epoch': 1.08}


 36%|███▌      | 10790/30000 [5:41:20<1:37:48,  3.27it/s]

{'loss': 0.2528, 'grad_norm': 0.6700555682182312, 'learning_rate': 3.2559322033898305e-05, 'epoch': 1.08}


 36%|███▌      | 10800/30000 [5:41:23<1:37:49,  3.27it/s]

{'loss': 0.2572, 'grad_norm': 0.5666579604148865, 'learning_rate': 3.2542372881355934e-05, 'epoch': 1.08}


 36%|███▌      | 10810/30000 [5:41:26<1:37:47,  3.27it/s]

{'loss': 0.2519, 'grad_norm': 0.4881112277507782, 'learning_rate': 3.2525423728813557e-05, 'epoch': 1.08}


 36%|███▌      | 10820/30000 [5:41:29<1:37:41,  3.27it/s]

{'loss': 0.2758, 'grad_norm': 0.6480688452720642, 'learning_rate': 3.2508474576271186e-05, 'epoch': 1.08}


 36%|███▌      | 10830/30000 [5:41:32<1:37:37,  3.27it/s]

{'loss': 0.2648, 'grad_norm': 0.583768904209137, 'learning_rate': 3.2491525423728815e-05, 'epoch': 1.08}


 36%|███▌      | 10840/30000 [5:41:35<1:37:39,  3.27it/s]

{'loss': 0.2436, 'grad_norm': 0.44956445693969727, 'learning_rate': 3.247457627118644e-05, 'epoch': 1.08}


 36%|███▌      | 10850/30000 [5:41:38<1:37:31,  3.27it/s]

{'loss': 0.2525, 'grad_norm': 0.5215653777122498, 'learning_rate': 3.245762711864407e-05, 'epoch': 1.08}


 36%|███▌      | 10860/30000 [5:41:41<1:37:27,  3.27it/s]

{'loss': 0.2554, 'grad_norm': 0.5265592932701111, 'learning_rate': 3.2440677966101696e-05, 'epoch': 1.09}


 36%|███▌      | 10870/30000 [5:41:44<1:37:28,  3.27it/s]

{'loss': 0.2533, 'grad_norm': 0.5747042894363403, 'learning_rate': 3.2423728813559326e-05, 'epoch': 1.09}


 36%|███▋      | 10880/30000 [5:41:47<1:37:21,  3.27it/s]

{'loss': 0.2421, 'grad_norm': 0.48292094469070435, 'learning_rate': 3.240677966101695e-05, 'epoch': 1.09}


 36%|███▋      | 10890/30000 [5:41:50<1:37:16,  3.27it/s]

{'loss': 0.2633, 'grad_norm': 0.522053062915802, 'learning_rate': 3.238983050847458e-05, 'epoch': 1.09}


 36%|███▋      | 10900/30000 [5:41:53<1:37:12,  3.27it/s]

{'loss': 0.2296, 'grad_norm': 0.506827712059021, 'learning_rate': 3.237288135593221e-05, 'epoch': 1.09}


 36%|███▋      | 10910/30000 [5:41:56<1:37:12,  3.27it/s]

{'loss': 0.2381, 'grad_norm': 0.43546801805496216, 'learning_rate': 3.235593220338983e-05, 'epoch': 1.09}


 36%|███▋      | 10920/30000 [5:41:59<1:37:08,  3.27it/s]

{'loss': 0.2461, 'grad_norm': 0.6227218508720398, 'learning_rate': 3.233898305084746e-05, 'epoch': 1.09}


 36%|███▋      | 10930/30000 [5:42:02<1:37:09,  3.27it/s]

{'loss': 0.2664, 'grad_norm': 0.5193279981613159, 'learning_rate': 3.232203389830509e-05, 'epoch': 1.09}


 36%|███▋      | 10940/30000 [5:42:05<1:37:10,  3.27it/s]

{'loss': 0.2343, 'grad_norm': 0.4145248234272003, 'learning_rate': 3.230508474576272e-05, 'epoch': 1.09}


 36%|███▋      | 10950/30000 [5:42:09<1:37:01,  3.27it/s]

{'loss': 0.2482, 'grad_norm': 0.5444319844245911, 'learning_rate': 3.228813559322034e-05, 'epoch': 1.09}


 37%|███▋      | 10960/30000 [5:42:12<1:36:54,  3.27it/s]

{'loss': 0.2532, 'grad_norm': 0.5163333415985107, 'learning_rate': 3.227118644067797e-05, 'epoch': 1.1}


 37%|███▋      | 10970/30000 [5:42:15<1:36:54,  3.27it/s]

{'loss': 0.2532, 'grad_norm': 0.5033342838287354, 'learning_rate': 3.22542372881356e-05, 'epoch': 1.1}


 37%|███▋      | 10980/30000 [5:42:18<1:37:02,  3.27it/s]

{'loss': 0.2438, 'grad_norm': 0.495383083820343, 'learning_rate': 3.223728813559322e-05, 'epoch': 1.1}


 37%|███▋      | 10990/30000 [5:42:21<1:36:49,  3.27it/s]

{'loss': 0.2465, 'grad_norm': 0.4519290328025818, 'learning_rate': 3.222033898305085e-05, 'epoch': 1.1}


 37%|███▋      | 11000/30000 [5:42:24<1:36:51,  3.27it/s]

{'loss': 0.2528, 'grad_norm': 0.6124548316001892, 'learning_rate': 3.2203389830508473e-05, 'epoch': 1.1}


 37%|███▋      | 11010/30000 [5:42:27<1:36:46,  3.27it/s]

{'loss': 0.2509, 'grad_norm': 0.5157888531684875, 'learning_rate': 3.21864406779661e-05, 'epoch': 1.1}


 37%|███▋      | 11020/30000 [5:42:30<1:36:41,  3.27it/s]

{'loss': 0.2624, 'grad_norm': 0.5204874277114868, 'learning_rate': 3.2169491525423725e-05, 'epoch': 1.1}


 37%|███▋      | 11030/30000 [5:42:33<1:36:34,  3.27it/s]

{'loss': 0.2405, 'grad_norm': 0.5381526947021484, 'learning_rate': 3.2152542372881355e-05, 'epoch': 1.1}


 37%|███▋      | 11040/30000 [5:42:36<1:36:39,  3.27it/s]

{'loss': 0.2624, 'grad_norm': 0.4919154942035675, 'learning_rate': 3.2135593220338984e-05, 'epoch': 1.1}


 37%|███▋      | 11050/30000 [5:42:39<1:37:15,  3.25it/s]

{'loss': 0.2355, 'grad_norm': 0.5435601472854614, 'learning_rate': 3.2118644067796613e-05, 'epoch': 1.1}


 37%|███▋      | 11060/30000 [5:42:42<1:39:29,  3.17it/s]

{'loss': 0.2504, 'grad_norm': 0.6135016083717346, 'learning_rate': 3.2101694915254236e-05, 'epoch': 1.11}


 37%|███▋      | 11070/30000 [5:42:45<1:41:24,  3.11it/s]

{'loss': 0.247, 'grad_norm': 0.5071213841438293, 'learning_rate': 3.2084745762711865e-05, 'epoch': 1.11}


 37%|███▋      | 11080/30000 [5:42:49<1:43:10,  3.06it/s]

{'loss': 0.2481, 'grad_norm': 0.5667657852172852, 'learning_rate': 3.2067796610169495e-05, 'epoch': 1.11}


 37%|███▋      | 11090/30000 [5:42:52<1:44:44,  3.01it/s]

{'loss': 0.2411, 'grad_norm': 0.4759562313556671, 'learning_rate': 3.205084745762712e-05, 'epoch': 1.11}


 37%|███▋      | 11100/30000 [5:42:55<1:45:46,  2.98it/s]

{'loss': 0.2427, 'grad_norm': 0.4674232006072998, 'learning_rate': 3.203389830508475e-05, 'epoch': 1.11}


 37%|███▋      | 11110/30000 [5:43:01<4:06:10,  1.28it/s]

{'loss': 0.2645, 'grad_norm': 0.41883087158203125, 'learning_rate': 3.2016949152542376e-05, 'epoch': 1.11}


 37%|███▋      | 11120/30000 [5:45:44<86:50:26, 16.56s/it] 

{'loss': 0.2581, 'grad_norm': 0.48908957839012146, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.11}


 37%|███▋      | 11130/30000 [5:45:47<4:00:23,  1.31it/s] 

{'loss': 0.2532, 'grad_norm': 0.5386705994606018, 'learning_rate': 3.198305084745763e-05, 'epoch': 1.11}


 37%|███▋      | 11140/30000 [5:45:51<1:39:57,  3.14it/s]

{'loss': 0.2557, 'grad_norm': 0.6024916768074036, 'learning_rate': 3.196610169491526e-05, 'epoch': 1.11}


 37%|███▋      | 11150/30000 [5:45:54<1:36:01,  3.27it/s]

{'loss': 0.2562, 'grad_norm': 0.6363252997398376, 'learning_rate': 3.1949152542372887e-05, 'epoch': 1.11}


 37%|███▋      | 11160/30000 [5:45:57<1:35:58,  3.27it/s]

{'loss': 0.2394, 'grad_norm': 0.4276042878627777, 'learning_rate': 3.193220338983051e-05, 'epoch': 1.12}


 37%|███▋      | 11170/30000 [5:46:00<1:35:54,  3.27it/s]

{'loss': 0.2372, 'grad_norm': 0.5232705473899841, 'learning_rate': 3.191525423728814e-05, 'epoch': 1.12}


 37%|███▋      | 11180/30000 [5:46:03<1:35:48,  3.27it/s]

{'loss': 0.262, 'grad_norm': 0.5366113781929016, 'learning_rate': 3.189830508474577e-05, 'epoch': 1.12}


 37%|███▋      | 11190/30000 [5:46:06<1:35:44,  3.27it/s]

{'loss': 0.2583, 'grad_norm': 0.987607479095459, 'learning_rate': 3.18813559322034e-05, 'epoch': 1.12}


 37%|███▋      | 11200/30000 [5:46:09<1:35:41,  3.27it/s]

{'loss': 0.2353, 'grad_norm': 0.4708566963672638, 'learning_rate': 3.186440677966101e-05, 'epoch': 1.12}


 37%|███▋      | 11210/30000 [5:46:12<1:35:39,  3.27it/s]

{'loss': 0.2471, 'grad_norm': 0.691942572593689, 'learning_rate': 3.184745762711864e-05, 'epoch': 1.12}


 37%|███▋      | 11220/30000 [5:46:15<1:35:43,  3.27it/s]

{'loss': 0.2487, 'grad_norm': 0.6242690086364746, 'learning_rate': 3.183050847457627e-05, 'epoch': 1.12}


 37%|███▋      | 11230/30000 [5:46:18<1:35:40,  3.27it/s]

{'loss': 0.2518, 'grad_norm': 0.5892648100852966, 'learning_rate': 3.18135593220339e-05, 'epoch': 1.12}


 37%|███▋      | 11240/30000 [5:46:21<1:35:33,  3.27it/s]

{'loss': 0.264, 'grad_norm': 0.6175214052200317, 'learning_rate': 3.1796610169491524e-05, 'epoch': 1.12}


 38%|███▊      | 11250/30000 [5:46:24<1:35:33,  3.27it/s]

{'loss': 0.2558, 'grad_norm': 0.594355046749115, 'learning_rate': 3.177966101694915e-05, 'epoch': 1.12}


 38%|███▊      | 11260/30000 [5:46:27<1:35:33,  3.27it/s]

{'loss': 0.2622, 'grad_norm': 0.4776339530944824, 'learning_rate': 3.176271186440678e-05, 'epoch': 1.13}


 38%|███▊      | 11270/30000 [5:46:30<1:35:23,  3.27it/s]

{'loss': 0.2478, 'grad_norm': 0.5106923580169678, 'learning_rate': 3.1745762711864405e-05, 'epoch': 1.13}


 38%|███▊      | 11280/30000 [5:46:33<1:35:24,  3.27it/s]

{'loss': 0.2483, 'grad_norm': 0.6637285947799683, 'learning_rate': 3.1728813559322034e-05, 'epoch': 1.13}


 38%|███▊      | 11290/30000 [5:46:36<1:35:15,  3.27it/s]

{'loss': 0.2395, 'grad_norm': 0.5400782227516174, 'learning_rate': 3.1711864406779664e-05, 'epoch': 1.13}


 38%|███▊      | 11300/30000 [5:46:39<1:35:14,  3.27it/s]

{'loss': 0.2466, 'grad_norm': 0.5107462406158447, 'learning_rate': 3.169491525423729e-05, 'epoch': 1.13}


 38%|███▊      | 11310/30000 [5:46:43<1:35:09,  3.27it/s]

{'loss': 0.2518, 'grad_norm': 0.6630126237869263, 'learning_rate': 3.1677966101694916e-05, 'epoch': 1.13}


 38%|███▊      | 11320/30000 [5:46:46<1:35:11,  3.27it/s]

{'loss': 0.2592, 'grad_norm': 0.6089791655540466, 'learning_rate': 3.1661016949152545e-05, 'epoch': 1.13}


 38%|███▊      | 11330/30000 [5:46:49<1:35:03,  3.27it/s]

{'loss': 0.2513, 'grad_norm': 0.4242890179157257, 'learning_rate': 3.1644067796610174e-05, 'epoch': 1.13}


 38%|███▊      | 11340/30000 [5:46:52<1:35:01,  3.27it/s]

{'loss': 0.2505, 'grad_norm': 0.48633357882499695, 'learning_rate': 3.16271186440678e-05, 'epoch': 1.13}


 38%|███▊      | 11350/30000 [5:46:55<1:35:05,  3.27it/s]

{'loss': 0.253, 'grad_norm': 0.46159470081329346, 'learning_rate': 3.1610169491525426e-05, 'epoch': 1.14}


 38%|███▊      | 11360/30000 [5:46:58<1:34:52,  3.27it/s]

{'loss': 0.2511, 'grad_norm': 0.4989165663719177, 'learning_rate': 3.1593220338983055e-05, 'epoch': 1.14}


 38%|███▊      | 11370/30000 [5:47:01<1:34:51,  3.27it/s]

{'loss': 0.2526, 'grad_norm': 0.5251794457435608, 'learning_rate': 3.157627118644068e-05, 'epoch': 1.14}


 38%|███▊      | 11380/30000 [5:47:04<1:34:52,  3.27it/s]

{'loss': 0.2355, 'grad_norm': 0.4845725893974304, 'learning_rate': 3.155932203389831e-05, 'epoch': 1.14}


 38%|███▊      | 11390/30000 [5:47:07<1:34:45,  3.27it/s]

{'loss': 0.2348, 'grad_norm': 0.45555201172828674, 'learning_rate': 3.154237288135594e-05, 'epoch': 1.14}


 38%|███▊      | 11400/30000 [5:47:10<1:34:44,  3.27it/s]

{'loss': 0.2393, 'grad_norm': 0.5433248281478882, 'learning_rate': 3.1525423728813566e-05, 'epoch': 1.14}


 38%|███▊      | 11410/30000 [5:47:13<1:34:39,  3.27it/s]

{'loss': 0.2542, 'grad_norm': 0.5121618509292603, 'learning_rate': 3.150847457627118e-05, 'epoch': 1.14}


 38%|███▊      | 11420/30000 [5:47:22<4:38:07,  1.11it/s]

{'loss': 0.2533, 'grad_norm': 0.604317843914032, 'learning_rate': 3.149152542372881e-05, 'epoch': 1.14}


 38%|███▊      | 11430/30000 [6:03:12<1460:20:33, 283.10s/it]

{'loss': 0.2623, 'grad_norm': 0.5168729424476624, 'learning_rate': 3.147457627118644e-05, 'epoch': 1.14}


 38%|███▊      | 11440/30000 [6:03:15<42:45:11,  8.29s/it]   

{'loss': 0.2507, 'grad_norm': 0.5138193964958191, 'learning_rate': 3.145762711864407e-05, 'epoch': 1.14}


 38%|███▊      | 11450/30000 [6:03:18<2:43:53,  1.89it/s] 

{'loss': 0.2487, 'grad_norm': 0.5718364715576172, 'learning_rate': 3.144067796610169e-05, 'epoch': 1.15}


 38%|███▊      | 11460/30000 [6:03:21<1:36:01,  3.22it/s]

{'loss': 0.2566, 'grad_norm': 0.49493274092674255, 'learning_rate': 3.142372881355932e-05, 'epoch': 1.15}


 38%|███▊      | 11470/30000 [6:03:24<1:33:59,  3.29it/s]

{'loss': 0.2345, 'grad_norm': 0.44133123755455017, 'learning_rate': 3.140677966101695e-05, 'epoch': 1.15}


 38%|███▊      | 11480/30000 [6:03:27<1:34:03,  3.28it/s]

{'loss': 0.2344, 'grad_norm': 0.6033504009246826, 'learning_rate': 3.1389830508474574e-05, 'epoch': 1.15}


 38%|███▊      | 11490/30000 [6:03:30<1:34:01,  3.28it/s]

{'loss': 0.2418, 'grad_norm': 0.5431609749794006, 'learning_rate': 3.13728813559322e-05, 'epoch': 1.15}


 38%|███▊      | 11500/30000 [6:03:33<1:34:03,  3.28it/s]

{'loss': 0.2333, 'grad_norm': 0.5878642201423645, 'learning_rate': 3.135593220338983e-05, 'epoch': 1.15}


 38%|███▊      | 11510/30000 [6:03:36<1:33:57,  3.28it/s]

{'loss': 0.2403, 'grad_norm': 0.527534544467926, 'learning_rate': 3.133898305084746e-05, 'epoch': 1.15}


 38%|███▊      | 11520/30000 [6:03:39<1:33:58,  3.28it/s]

{'loss': 0.2536, 'grad_norm': 0.5273672938346863, 'learning_rate': 3.1322033898305084e-05, 'epoch': 1.15}


 38%|███▊      | 11530/30000 [6:03:42<1:33:57,  3.28it/s]

{'loss': 0.2478, 'grad_norm': 0.4303780198097229, 'learning_rate': 3.1305084745762714e-05, 'epoch': 1.15}


 38%|███▊      | 11540/30000 [6:03:45<1:33:58,  3.27it/s]

{'loss': 0.2431, 'grad_norm': 0.5198204517364502, 'learning_rate': 3.128813559322034e-05, 'epoch': 1.15}


 38%|███▊      | 11550/30000 [6:03:48<1:33:46,  3.28it/s]

{'loss': 0.2446, 'grad_norm': 0.6244497895240784, 'learning_rate': 3.1271186440677966e-05, 'epoch': 1.16}


 39%|███▊      | 11560/30000 [6:03:51<1:33:43,  3.28it/s]

{'loss': 0.2354, 'grad_norm': 0.5665987133979797, 'learning_rate': 3.1254237288135595e-05, 'epoch': 1.16}


 39%|███▊      | 11570/30000 [6:03:54<1:33:53,  3.27it/s]

{'loss': 0.2428, 'grad_norm': 0.6744362711906433, 'learning_rate': 3.1237288135593224e-05, 'epoch': 1.16}


 39%|███▊      | 11580/30000 [6:03:57<1:33:44,  3.28it/s]

{'loss': 0.2469, 'grad_norm': 0.5979886651039124, 'learning_rate': 3.1220338983050854e-05, 'epoch': 1.16}


 39%|███▊      | 11590/30000 [6:04:00<1:33:50,  3.27it/s]

{'loss': 0.2386, 'grad_norm': 0.4954853057861328, 'learning_rate': 3.1203389830508476e-05, 'epoch': 1.16}


 39%|███▊      | 11600/30000 [6:04:03<1:33:45,  3.27it/s]

{'loss': 0.249, 'grad_norm': 0.49311861395835876, 'learning_rate': 3.1186440677966106e-05, 'epoch': 1.16}


 39%|███▊      | 11610/30000 [6:04:07<1:33:41,  3.27it/s]

{'loss': 0.2496, 'grad_norm': 0.5248826146125793, 'learning_rate': 3.1169491525423735e-05, 'epoch': 1.16}


 39%|███▊      | 11620/30000 [6:04:10<1:33:32,  3.27it/s]

{'loss': 0.2468, 'grad_norm': 0.5112966895103455, 'learning_rate': 3.115254237288136e-05, 'epoch': 1.16}


 39%|███▉      | 11630/30000 [6:04:13<1:33:41,  3.27it/s]

{'loss': 0.2444, 'grad_norm': 0.5577347278594971, 'learning_rate': 3.113559322033898e-05, 'epoch': 1.16}


 39%|███▉      | 11640/30000 [6:04:16<1:33:27,  3.27it/s]

{'loss': 0.2393, 'grad_norm': 0.6325980424880981, 'learning_rate': 3.111864406779661e-05, 'epoch': 1.16}


 39%|███▉      | 11650/30000 [6:04:19<1:33:25,  3.27it/s]

{'loss': 0.2433, 'grad_norm': 0.5114786624908447, 'learning_rate': 3.110169491525424e-05, 'epoch': 1.17}


 39%|███▉      | 11660/30000 [6:04:22<1:33:21,  3.27it/s]

{'loss': 0.2425, 'grad_norm': 0.6012260317802429, 'learning_rate': 3.108474576271186e-05, 'epoch': 1.17}


 39%|███▉      | 11670/30000 [6:04:25<1:33:17,  3.27it/s]

{'loss': 0.2411, 'grad_norm': 0.43461933732032776, 'learning_rate': 3.106779661016949e-05, 'epoch': 1.17}


 39%|███▉      | 11680/30000 [6:04:28<1:33:18,  3.27it/s]

{'loss': 0.2483, 'grad_norm': 0.5182127356529236, 'learning_rate': 3.105084745762712e-05, 'epoch': 1.17}


 39%|███▉      | 11690/30000 [6:04:31<1:33:18,  3.27it/s]

{'loss': 0.2511, 'grad_norm': 0.517021119594574, 'learning_rate': 3.103389830508475e-05, 'epoch': 1.17}


 39%|███▉      | 11700/30000 [6:04:34<1:33:10,  3.27it/s]

{'loss': 0.2487, 'grad_norm': 0.6053915619850159, 'learning_rate': 3.101694915254237e-05, 'epoch': 1.17}


 39%|███▉      | 11710/30000 [6:04:37<1:33:07,  3.27it/s]

{'loss': 0.2511, 'grad_norm': 0.6944077014923096, 'learning_rate': 3.1e-05, 'epoch': 1.17}


 39%|███▉      | 11720/30000 [6:04:40<1:33:09,  3.27it/s]

{'loss': 0.263, 'grad_norm': 0.5402700901031494, 'learning_rate': 3.098305084745763e-05, 'epoch': 1.17}


 39%|███▉      | 11730/30000 [6:04:43<1:33:02,  3.27it/s]

{'loss': 0.2409, 'grad_norm': 0.5186158418655396, 'learning_rate': 3.096610169491525e-05, 'epoch': 1.17}


 39%|███▉      | 11740/30000 [6:04:46<1:32:59,  3.27it/s]

{'loss': 0.2525, 'grad_norm': 0.5828379392623901, 'learning_rate': 3.094915254237288e-05, 'epoch': 1.17}


 39%|███▉      | 11750/30000 [6:04:49<1:33:04,  3.27it/s]

{'loss': 0.2365, 'grad_norm': 0.6922386884689331, 'learning_rate': 3.093220338983051e-05, 'epoch': 1.18}


 39%|███▉      | 11760/30000 [6:04:52<1:32:54,  3.27it/s]

{'loss': 0.2575, 'grad_norm': 0.5039845705032349, 'learning_rate': 3.091525423728814e-05, 'epoch': 1.18}


 39%|███▉      | 11770/30000 [6:04:55<1:32:51,  3.27it/s]

{'loss': 0.2464, 'grad_norm': 0.518074095249176, 'learning_rate': 3.0898305084745764e-05, 'epoch': 1.18}


 39%|███▉      | 11780/30000 [6:04:59<1:32:47,  3.27it/s]

{'loss': 0.2535, 'grad_norm': 0.5557215809822083, 'learning_rate': 3.088135593220339e-05, 'epoch': 1.18}


 39%|███▉      | 11790/30000 [6:05:02<1:32:49,  3.27it/s]

{'loss': 0.2486, 'grad_norm': 0.7114435434341431, 'learning_rate': 3.086440677966102e-05, 'epoch': 1.18}


 39%|███▉      | 11800/30000 [6:05:05<1:32:43,  3.27it/s]

{'loss': 0.2562, 'grad_norm': 0.5485873818397522, 'learning_rate': 3.0847457627118645e-05, 'epoch': 1.18}


 39%|███▉      | 11810/30000 [6:05:08<1:32:38,  3.27it/s]

{'loss': 0.2412, 'grad_norm': 0.577904462814331, 'learning_rate': 3.0830508474576275e-05, 'epoch': 1.18}


 39%|███▉      | 11820/30000 [6:05:11<1:32:36,  3.27it/s]

{'loss': 0.2452, 'grad_norm': 0.48408618569374084, 'learning_rate': 3.0813559322033904e-05, 'epoch': 1.18}


 39%|███▉      | 11830/30000 [6:05:14<1:32:33,  3.27it/s]

{'loss': 0.2388, 'grad_norm': 0.4413832426071167, 'learning_rate': 3.0796610169491526e-05, 'epoch': 1.18}


 39%|███▉      | 11840/30000 [6:05:17<1:32:45,  3.26it/s]

{'loss': 0.2372, 'grad_norm': 0.4763267934322357, 'learning_rate': 3.077966101694915e-05, 'epoch': 1.18}


 40%|███▉      | 11850/30000 [6:05:20<1:34:35,  3.20it/s]

{'loss': 0.2646, 'grad_norm': 0.4439239203929901, 'learning_rate': 3.076271186440678e-05, 'epoch': 1.19}


 40%|███▉      | 11860/30000 [6:05:23<1:36:35,  3.13it/s]

{'loss': 0.2708, 'grad_norm': 0.5373955368995667, 'learning_rate': 3.074576271186441e-05, 'epoch': 1.19}


 40%|███▉      | 11870/30000 [6:05:26<1:38:07,  3.08it/s]

{'loss': 0.2635, 'grad_norm': 0.4911247491836548, 'learning_rate': 3.072881355932204e-05, 'epoch': 1.19}


 40%|███▉      | 11880/30000 [6:05:30<1:39:29,  3.04it/s]

{'loss': 0.2519, 'grad_norm': 0.5117018818855286, 'learning_rate': 3.071186440677966e-05, 'epoch': 1.19}


 40%|███▉      | 11890/30000 [6:05:33<1:40:57,  2.99it/s]

{'loss': 0.2559, 'grad_norm': 0.4196119010448456, 'learning_rate': 3.069491525423729e-05, 'epoch': 1.19}


 40%|███▉      | 11900/30000 [6:05:40<4:17:38,  1.17it/s]

{'loss': 0.2564, 'grad_norm': 0.48751765489578247, 'learning_rate': 3.067796610169492e-05, 'epoch': 1.19}


 40%|███▉      | 11910/30000 [6:05:49<4:38:37,  1.08it/s]

{'loss': 0.2314, 'grad_norm': 0.49969297647476196, 'learning_rate': 3.066101694915254e-05, 'epoch': 1.19}


 40%|███▉      | 11920/30000 [6:21:24<117:11:18, 23.33s/it]  

{'loss': 0.2469, 'grad_norm': 0.653050422668457, 'learning_rate': 3.064406779661017e-05, 'epoch': 1.19}


 40%|███▉      | 11930/30000 [6:21:27<4:47:39,  1.05it/s]  

{'loss': 0.2484, 'grad_norm': 1.1072380542755127, 'learning_rate': 3.06271186440678e-05, 'epoch': 1.19}


 40%|███▉      | 11940/30000 [6:21:30<1:37:13,  3.10it/s]

{'loss': 0.2385, 'grad_norm': 0.5085720419883728, 'learning_rate': 3.061016949152543e-05, 'epoch': 1.19}


 40%|███▉      | 11950/30000 [6:21:33<1:31:57,  3.27it/s]

{'loss': 0.2447, 'grad_norm': 0.5098572969436646, 'learning_rate': 3.059322033898305e-05, 'epoch': 1.2}


 40%|███▉      | 11960/30000 [6:21:36<1:31:38,  3.28it/s]

{'loss': 0.2505, 'grad_norm': 0.5499767661094666, 'learning_rate': 3.057627118644068e-05, 'epoch': 1.2}


 40%|███▉      | 11970/30000 [6:21:39<1:31:34,  3.28it/s]

{'loss': 0.2385, 'grad_norm': 0.534451961517334, 'learning_rate': 3.055932203389831e-05, 'epoch': 1.2}


 40%|███▉      | 11980/30000 [6:21:42<1:31:35,  3.28it/s]

{'loss': 0.2471, 'grad_norm': 0.6781660318374634, 'learning_rate': 3.054237288135593e-05, 'epoch': 1.2}


 40%|███▉      | 11990/30000 [6:21:45<1:31:42,  3.27it/s]

{'loss': 0.244, 'grad_norm': 0.6016896367073059, 'learning_rate': 3.052542372881356e-05, 'epoch': 1.2}


 40%|████      | 12000/30000 [6:21:48<1:31:32,  3.28it/s]

{'loss': 0.2503, 'grad_norm': 0.5010820031166077, 'learning_rate': 3.050847457627119e-05, 'epoch': 1.2}


 40%|████      | 12010/30000 [6:21:51<1:31:37,  3.27it/s]

{'loss': 0.2419, 'grad_norm': 0.5106900334358215, 'learning_rate': 3.0491525423728817e-05, 'epoch': 1.2}


 40%|████      | 12020/30000 [6:21:54<1:31:27,  3.28it/s]

{'loss': 0.234, 'grad_norm': 0.4931972622871399, 'learning_rate': 3.0474576271186443e-05, 'epoch': 1.2}


 40%|████      | 12030/30000 [6:21:57<1:31:31,  3.27it/s]

{'loss': 0.2384, 'grad_norm': 0.5120458006858826, 'learning_rate': 3.0457627118644066e-05, 'epoch': 1.2}


 40%|████      | 12040/30000 [6:22:00<1:31:26,  3.27it/s]

{'loss': 0.2399, 'grad_norm': 0.5514320135116577, 'learning_rate': 3.0440677966101695e-05, 'epoch': 1.2}


 40%|████      | 12050/30000 [6:22:04<1:31:32,  3.27it/s]

{'loss': 0.2411, 'grad_norm': 0.5284492373466492, 'learning_rate': 3.042372881355932e-05, 'epoch': 1.21}


 40%|████      | 12060/30000 [6:22:07<1:31:18,  3.27it/s]

{'loss': 0.2443, 'grad_norm': 0.5715873837471008, 'learning_rate': 3.0406779661016947e-05, 'epoch': 1.21}


 40%|████      | 12070/30000 [6:22:10<1:31:19,  3.27it/s]

{'loss': 0.2394, 'grad_norm': 0.510716438293457, 'learning_rate': 3.0389830508474577e-05, 'epoch': 1.21}


 40%|████      | 12080/30000 [6:22:13<1:31:15,  3.27it/s]

{'loss': 0.2432, 'grad_norm': 0.6111155152320862, 'learning_rate': 3.0372881355932203e-05, 'epoch': 1.21}


 40%|████      | 12090/30000 [6:22:16<1:31:13,  3.27it/s]

{'loss': 0.2398, 'grad_norm': 0.594577968120575, 'learning_rate': 3.0355932203389832e-05, 'epoch': 1.21}


 40%|████      | 12100/30000 [6:22:19<1:31:20,  3.27it/s]

{'loss': 0.2452, 'grad_norm': 0.5464320778846741, 'learning_rate': 3.0338983050847458e-05, 'epoch': 1.21}


 40%|████      | 12110/30000 [6:22:22<1:31:06,  3.27it/s]

{'loss': 0.258, 'grad_norm': 0.5377795100212097, 'learning_rate': 3.0322033898305087e-05, 'epoch': 1.21}


 40%|████      | 12120/30000 [6:22:25<1:31:02,  3.27it/s]

{'loss': 0.2393, 'grad_norm': 0.7167530059814453, 'learning_rate': 3.0305084745762713e-05, 'epoch': 1.21}


 40%|████      | 12130/30000 [6:22:28<1:31:02,  3.27it/s]

{'loss': 0.231, 'grad_norm': 0.6223605871200562, 'learning_rate': 3.028813559322034e-05, 'epoch': 1.21}


 40%|████      | 12140/30000 [6:22:31<1:30:56,  3.27it/s]

{'loss': 0.2405, 'grad_norm': 0.5431342124938965, 'learning_rate': 3.027118644067797e-05, 'epoch': 1.21}


 40%|████      | 12150/30000 [6:22:34<1:30:57,  3.27it/s]

{'loss': 0.2528, 'grad_norm': 0.5182140469551086, 'learning_rate': 3.0254237288135594e-05, 'epoch': 1.22}


 41%|████      | 12160/30000 [6:22:37<1:30:56,  3.27it/s]

{'loss': 0.2448, 'grad_norm': 0.5577381253242493, 'learning_rate': 3.0237288135593224e-05, 'epoch': 1.22}


 41%|████      | 12170/30000 [6:22:40<1:30:51,  3.27it/s]

{'loss': 0.2376, 'grad_norm': 0.5651726126670837, 'learning_rate': 3.022033898305085e-05, 'epoch': 1.22}


 41%|████      | 12180/30000 [6:22:43<1:30:47,  3.27it/s]

{'loss': 0.242, 'grad_norm': 0.500372588634491, 'learning_rate': 3.0203389830508476e-05, 'epoch': 1.22}


 41%|████      | 12190/30000 [6:22:46<1:30:38,  3.28it/s]

{'loss': 0.2472, 'grad_norm': 0.6601539850234985, 'learning_rate': 3.0186440677966105e-05, 'epoch': 1.22}


 41%|████      | 12200/30000 [6:22:49<1:30:40,  3.27it/s]

{'loss': 0.261, 'grad_norm': 0.530798614025116, 'learning_rate': 3.016949152542373e-05, 'epoch': 1.22}


 41%|████      | 12210/30000 [6:22:52<1:30:45,  3.27it/s]

{'loss': 0.2524, 'grad_norm': 0.5146695375442505, 'learning_rate': 3.015254237288136e-05, 'epoch': 1.22}


 41%|████      | 12220/30000 [6:22:56<1:30:38,  3.27it/s]

{'loss': 0.2368, 'grad_norm': 0.48334723711013794, 'learning_rate': 3.0135593220338986e-05, 'epoch': 1.22}


 41%|████      | 12230/30000 [6:22:59<1:30:30,  3.27it/s]

{'loss': 0.2353, 'grad_norm': 0.5230082869529724, 'learning_rate': 3.0118644067796616e-05, 'epoch': 1.22}


 41%|████      | 12240/30000 [6:23:02<1:30:34,  3.27it/s]

{'loss': 0.234, 'grad_norm': 0.6198675632476807, 'learning_rate': 3.0101694915254235e-05, 'epoch': 1.22}


 41%|████      | 12250/30000 [6:23:05<1:30:27,  3.27it/s]

{'loss': 0.2437, 'grad_norm': 0.5590417981147766, 'learning_rate': 3.0084745762711864e-05, 'epoch': 1.23}


 41%|████      | 12260/30000 [6:23:08<1:30:19,  3.27it/s]

{'loss': 0.2471, 'grad_norm': 0.5453950762748718, 'learning_rate': 3.006779661016949e-05, 'epoch': 1.23}


 41%|████      | 12270/30000 [6:23:11<1:30:23,  3.27it/s]

{'loss': 0.2476, 'grad_norm': 0.5518012642860413, 'learning_rate': 3.005084745762712e-05, 'epoch': 1.23}


 41%|████      | 12280/30000 [6:23:14<1:30:21,  3.27it/s]

{'loss': 0.2478, 'grad_norm': 0.38448062539100647, 'learning_rate': 3.0033898305084745e-05, 'epoch': 1.23}


 41%|████      | 12290/30000 [6:23:17<1:30:16,  3.27it/s]

{'loss': 0.2489, 'grad_norm': 0.5265499949455261, 'learning_rate': 3.001694915254237e-05, 'epoch': 1.23}


 41%|████      | 12300/30000 [6:23:20<1:30:09,  3.27it/s]

{'loss': 0.2511, 'grad_norm': 0.49723735451698303, 'learning_rate': 3e-05, 'epoch': 1.23}


 41%|████      | 12310/30000 [6:23:23<1:30:06,  3.27it/s]

{'loss': 0.2531, 'grad_norm': 0.626204252243042, 'learning_rate': 2.9983050847457627e-05, 'epoch': 1.23}


 41%|████      | 12320/30000 [6:23:26<1:30:06,  3.27it/s]

{'loss': 0.2431, 'grad_norm': 0.5368070602416992, 'learning_rate': 2.9966101694915256e-05, 'epoch': 1.23}


 41%|████      | 12330/30000 [6:39:51<709:56:48, 144.64s/it] 

{'loss': 0.2431, 'grad_norm': 0.6764097809791565, 'learning_rate': 2.9949152542372882e-05, 'epoch': 1.23}


 41%|████      | 12340/30000 [6:39:54<21:29:48,  4.38s/it]  

{'loss': 0.2529, 'grad_norm': 0.4686892032623291, 'learning_rate': 2.993220338983051e-05, 'epoch': 1.23}


 41%|████      | 12350/30000 [6:39:57<2:03:33,  2.38it/s] 

{'loss': 0.2435, 'grad_norm': 0.6501654982566833, 'learning_rate': 2.9915254237288137e-05, 'epoch': 1.23}


 41%|████      | 12360/30000 [6:40:00<1:30:46,  3.24it/s]

{'loss': 0.2403, 'grad_norm': 0.6941766738891602, 'learning_rate': 2.9898305084745763e-05, 'epoch': 1.24}


 41%|████      | 12370/30000 [6:40:03<1:29:36,  3.28it/s]

{'loss': 0.2355, 'grad_norm': 0.6362723708152771, 'learning_rate': 2.9881355932203393e-05, 'epoch': 1.24}


 41%|████▏     | 12380/30000 [6:40:06<1:29:30,  3.28it/s]

{'loss': 0.2468, 'grad_norm': 0.5172549486160278, 'learning_rate': 2.986440677966102e-05, 'epoch': 1.24}


 41%|████▏     | 12390/30000 [6:40:09<1:29:28,  3.28it/s]

{'loss': 0.2399, 'grad_norm': 0.50147545337677, 'learning_rate': 2.9847457627118648e-05, 'epoch': 1.24}


 41%|████▏     | 12400/30000 [6:40:12<1:29:28,  3.28it/s]

{'loss': 0.2586, 'grad_norm': 0.45603153109550476, 'learning_rate': 2.9830508474576274e-05, 'epoch': 1.24}


 41%|████▏     | 12410/30000 [6:40:15<1:29:25,  3.28it/s]

{'loss': 0.2447, 'grad_norm': 0.6166886687278748, 'learning_rate': 2.98135593220339e-05, 'epoch': 1.24}


 41%|████▏     | 12420/30000 [6:40:19<1:29:27,  3.28it/s]

{'loss': 0.2325, 'grad_norm': 0.598370373249054, 'learning_rate': 2.979661016949153e-05, 'epoch': 1.24}


 41%|████▏     | 12430/30000 [6:40:22<1:29:22,  3.28it/s]

{'loss': 0.2443, 'grad_norm': 0.6652074456214905, 'learning_rate': 2.9779661016949155e-05, 'epoch': 1.24}


 41%|████▏     | 12440/30000 [6:40:25<1:29:20,  3.28it/s]

{'loss': 0.2705, 'grad_norm': 0.5569531917572021, 'learning_rate': 2.9762711864406785e-05, 'epoch': 1.24}


 42%|████▏     | 12450/30000 [6:40:28<1:29:23,  3.27it/s]

{'loss': 0.2396, 'grad_norm': 0.4419252574443817, 'learning_rate': 2.9745762711864407e-05, 'epoch': 1.25}


 42%|████▏     | 12460/30000 [6:40:31<1:29:19,  3.27it/s]

{'loss': 0.2518, 'grad_norm': 0.48481422662734985, 'learning_rate': 2.9728813559322033e-05, 'epoch': 1.25}


 42%|████▏     | 12470/30000 [6:40:34<1:29:16,  3.27it/s]

{'loss': 0.2423, 'grad_norm': 0.6045740842819214, 'learning_rate': 2.971186440677966e-05, 'epoch': 1.25}


 42%|████▏     | 12480/30000 [6:40:37<1:29:15,  3.27it/s]

{'loss': 0.2443, 'grad_norm': 0.5421703457832336, 'learning_rate': 2.969491525423729e-05, 'epoch': 1.25}


 42%|████▏     | 12490/30000 [6:40:40<1:29:12,  3.27it/s]

{'loss': 0.2352, 'grad_norm': 0.5144826769828796, 'learning_rate': 2.9677966101694914e-05, 'epoch': 1.25}


 42%|████▏     | 12500/30000 [6:40:43<1:29:11,  3.27it/s]

{'loss': 0.2507, 'grad_norm': 0.7131608128547668, 'learning_rate': 2.9661016949152544e-05, 'epoch': 1.25}


 42%|████▏     | 12510/30000 [6:40:46<1:28:51,  3.28it/s]

{'loss': 0.2409, 'grad_norm': 0.6990581750869751, 'learning_rate': 2.964406779661017e-05, 'epoch': 1.25}


 42%|████▏     | 12520/30000 [6:40:49<1:28:57,  3.27it/s]

{'loss': 0.2406, 'grad_norm': 0.5620880722999573, 'learning_rate': 2.9627118644067796e-05, 'epoch': 1.25}


 42%|████▏     | 12530/30000 [6:40:52<1:29:05,  3.27it/s]

{'loss': 0.2497, 'grad_norm': 0.6317095160484314, 'learning_rate': 2.9610169491525425e-05, 'epoch': 1.25}


 42%|████▏     | 12540/30000 [6:40:55<1:28:58,  3.27it/s]

{'loss': 0.255, 'grad_norm': 0.5409470796585083, 'learning_rate': 2.959322033898305e-05, 'epoch': 1.25}


 42%|████▏     | 12550/30000 [6:40:58<1:28:50,  3.27it/s]

{'loss': 0.2436, 'grad_norm': 0.5610730051994324, 'learning_rate': 2.957627118644068e-05, 'epoch': 1.25}


 42%|████▏     | 12560/30000 [6:41:01<1:28:52,  3.27it/s]

{'loss': 0.2503, 'grad_norm': 0.5140414834022522, 'learning_rate': 2.9559322033898306e-05, 'epoch': 1.26}


 42%|████▏     | 12570/30000 [6:41:04<1:28:47,  3.27it/s]

{'loss': 0.2445, 'grad_norm': 0.45378148555755615, 'learning_rate': 2.9542372881355936e-05, 'epoch': 1.26}


 42%|████▏     | 12580/30000 [6:41:07<1:28:44,  3.27it/s]

{'loss': 0.2426, 'grad_norm': 0.5328658819198608, 'learning_rate': 2.952542372881356e-05, 'epoch': 1.26}


 42%|████▏     | 12590/30000 [6:41:11<1:28:41,  3.27it/s]

{'loss': 0.2538, 'grad_norm': 0.583824634552002, 'learning_rate': 2.9508474576271187e-05, 'epoch': 1.26}


 42%|████▏     | 12600/30000 [6:41:14<1:28:35,  3.27it/s]

{'loss': 0.2405, 'grad_norm': 0.48820093274116516, 'learning_rate': 2.9491525423728817e-05, 'epoch': 1.26}


 42%|████▏     | 12610/30000 [6:41:17<1:28:38,  3.27it/s]

{'loss': 0.2256, 'grad_norm': 0.9309399127960205, 'learning_rate': 2.9474576271186443e-05, 'epoch': 1.26}


 42%|████▏     | 12620/30000 [6:41:20<1:28:37,  3.27it/s]

{'loss': 0.2361, 'grad_norm': 0.5663521885871887, 'learning_rate': 2.9457627118644072e-05, 'epoch': 1.26}


 42%|████▏     | 12630/30000 [6:41:23<1:28:31,  3.27it/s]

{'loss': 0.2544, 'grad_norm': 0.4749203026294708, 'learning_rate': 2.9440677966101698e-05, 'epoch': 1.26}


 42%|████▏     | 12640/30000 [6:41:26<1:28:24,  3.27it/s]

{'loss': 0.2346, 'grad_norm': 0.6436954736709595, 'learning_rate': 2.9423728813559327e-05, 'epoch': 1.26}


 42%|████▏     | 12650/30000 [6:41:29<1:28:25,  3.27it/s]

{'loss': 0.2328, 'grad_norm': 0.5148730874061584, 'learning_rate': 2.9406779661016953e-05, 'epoch': 1.27}


 42%|████▏     | 12660/30000 [6:41:32<1:28:25,  3.27it/s]

{'loss': 0.2596, 'grad_norm': 0.4912870526313782, 'learning_rate': 2.9389830508474576e-05, 'epoch': 1.27}


 42%|████▏     | 12670/30000 [6:41:35<1:28:25,  3.27it/s]

{'loss': 0.2506, 'grad_norm': 0.5557481050491333, 'learning_rate': 2.9372881355932202e-05, 'epoch': 1.27}


 42%|████▏     | 12680/30000 [6:41:38<1:28:13,  3.27it/s]

{'loss': 0.2407, 'grad_norm': 0.5709033608436584, 'learning_rate': 2.935593220338983e-05, 'epoch': 1.27}


 42%|████▏     | 12690/30000 [6:41:41<1:28:16,  3.27it/s]

{'loss': 0.2543, 'grad_norm': 0.5561559796333313, 'learning_rate': 2.9338983050847457e-05, 'epoch': 1.27}


 42%|████▏     | 12700/30000 [6:41:44<1:28:08,  3.27it/s]

{'loss': 0.2464, 'grad_norm': 0.5459597110748291, 'learning_rate': 2.9322033898305083e-05, 'epoch': 1.27}


 42%|████▏     | 12710/30000 [6:41:47<1:28:03,  3.27it/s]

{'loss': 0.2379, 'grad_norm': 0.5758564472198486, 'learning_rate': 2.9305084745762713e-05, 'epoch': 1.27}


 42%|████▏     | 12720/30000 [6:41:50<1:27:59,  3.27it/s]

{'loss': 0.2419, 'grad_norm': 0.45604732632637024, 'learning_rate': 2.928813559322034e-05, 'epoch': 1.27}


 42%|████▏     | 12730/30000 [6:41:53<1:28:04,  3.27it/s]

{'loss': 0.2567, 'grad_norm': 0.6499395966529846, 'learning_rate': 2.9271186440677968e-05, 'epoch': 1.27}


 42%|████▏     | 12740/30000 [6:41:56<1:28:04,  3.27it/s]

{'loss': 0.2349, 'grad_norm': 0.5740160346031189, 'learning_rate': 2.9254237288135594e-05, 'epoch': 1.27}


 42%|████▎     | 12750/30000 [6:41:59<1:27:51,  3.27it/s]

{'loss': 0.2448, 'grad_norm': 0.5931098461151123, 'learning_rate': 2.9237288135593223e-05, 'epoch': 1.27}


 43%|████▎     | 12760/30000 [6:42:03<1:28:58,  3.23it/s]

{'loss': 0.2437, 'grad_norm': 0.5599607825279236, 'learning_rate': 2.922033898305085e-05, 'epoch': 1.28}


 43%|████▎     | 12770/30000 [6:42:06<1:29:39,  3.20it/s]

{'loss': 0.2367, 'grad_norm': 0.596270740032196, 'learning_rate': 2.9203389830508475e-05, 'epoch': 1.28}


 43%|████▎     | 12780/30000 [6:42:09<1:31:20,  3.14it/s]

{'loss': 0.2477, 'grad_norm': 0.47179147601127625, 'learning_rate': 2.9186440677966104e-05, 'epoch': 1.28}


 43%|████▎     | 12790/30000 [6:42:12<1:33:06,  3.08it/s]

{'loss': 0.2294, 'grad_norm': 0.5993204712867737, 'learning_rate': 2.916949152542373e-05, 'epoch': 1.28}


 43%|████▎     | 12800/30000 [6:42:15<1:34:08,  3.04it/s]

{'loss': 0.2428, 'grad_norm': 0.5745707750320435, 'learning_rate': 2.915254237288136e-05, 'epoch': 1.28}


 43%|████▎     | 12810/30000 [6:42:19<1:35:18,  3.01it/s]

{'loss': 0.2476, 'grad_norm': 0.48417699337005615, 'learning_rate': 2.9135593220338986e-05, 'epoch': 1.28}


 43%|████▎     | 12820/30000 [6:42:22<1:36:45,  2.96it/s]

{'loss': 0.2424, 'grad_norm': 0.5749951601028442, 'learning_rate': 2.911864406779661e-05, 'epoch': 1.28}


 43%|████▎     | 12830/30000 [6:42:25<1:36:53,  2.95it/s]

{'loss': 0.2489, 'grad_norm': 0.6172541379928589, 'learning_rate': 2.910169491525424e-05, 'epoch': 1.28}


 43%|████▎     | 12840/30000 [6:42:29<1:37:26,  2.94it/s]

{'loss': 0.2628, 'grad_norm': 0.51291424036026, 'learning_rate': 2.9084745762711867e-05, 'epoch': 1.28}


 43%|████▎     | 12850/30000 [6:42:37<4:15:40,  1.12it/s]

{'loss': 0.2394, 'grad_norm': 0.6009033918380737, 'learning_rate': 2.9067796610169496e-05, 'epoch': 1.28}


 43%|████▎     | 12860/30000 [6:42:46<4:24:33,  1.08it/s]

{'loss': 0.2546, 'grad_norm': 0.49742648005485535, 'learning_rate': 2.9050847457627122e-05, 'epoch': 1.29}


 43%|████▎     | 12870/30000 [6:46:47<29:28:02,  6.19s/it] 

{'loss': 0.2416, 'grad_norm': 0.4266953468322754, 'learning_rate': 2.9033898305084745e-05, 'epoch': 1.29}


 43%|████▎     | 12880/30000 [6:46:50<2:14:42,  2.12it/s] 

{'loss': 0.2244, 'grad_norm': 0.5677703619003296, 'learning_rate': 2.901694915254237e-05, 'epoch': 1.29}


 43%|████▎     | 12890/30000 [6:46:53<1:28:17,  3.23it/s]

{'loss': 0.2516, 'grad_norm': 0.5410939455032349, 'learning_rate': 2.9e-05, 'epoch': 1.29}


 43%|████▎     | 12900/30000 [6:46:56<1:27:01,  3.27it/s]

{'loss': 0.2523, 'grad_norm': 0.5155830979347229, 'learning_rate': 2.8983050847457626e-05, 'epoch': 1.29}


 43%|████▎     | 12910/30000 [6:46:59<1:26:56,  3.28it/s]

{'loss': 0.2532, 'grad_norm': 0.947717547416687, 'learning_rate': 2.8966101694915255e-05, 'epoch': 1.29}


 43%|████▎     | 12920/30000 [6:47:02<1:26:57,  3.27it/s]

{'loss': 0.2419, 'grad_norm': 0.45490702986717224, 'learning_rate': 2.894915254237288e-05, 'epoch': 1.29}


 43%|████▎     | 12930/30000 [6:47:05<1:26:54,  3.27it/s]

{'loss': 0.2502, 'grad_norm': 0.5700325965881348, 'learning_rate': 2.8932203389830507e-05, 'epoch': 1.29}


 43%|████▎     | 12940/30000 [6:47:08<1:26:54,  3.27it/s]

{'loss': 0.2575, 'grad_norm': 0.5868514776229858, 'learning_rate': 2.8915254237288137e-05, 'epoch': 1.29}


 43%|████▎     | 12950/30000 [6:47:11<1:26:51,  3.27it/s]

{'loss': 0.2504, 'grad_norm': 0.6446898579597473, 'learning_rate': 2.8898305084745763e-05, 'epoch': 1.29}


 43%|████▎     | 12960/30000 [6:47:14<1:26:45,  3.27it/s]

{'loss': 0.2653, 'grad_norm': 0.5794395804405212, 'learning_rate': 2.8881355932203392e-05, 'epoch': 1.3}


 43%|████▎     | 12970/30000 [6:47:17<1:26:43,  3.27it/s]

{'loss': 0.2364, 'grad_norm': 0.6671468019485474, 'learning_rate': 2.8864406779661018e-05, 'epoch': 1.3}


 43%|████▎     | 12980/30000 [6:47:21<1:26:43,  3.27it/s]

{'loss': 0.2492, 'grad_norm': 0.5766966938972473, 'learning_rate': 2.8847457627118647e-05, 'epoch': 1.3}


 43%|████▎     | 12990/30000 [6:47:24<1:26:42,  3.27it/s]

{'loss': 0.2514, 'grad_norm': 0.6224942803382874, 'learning_rate': 2.8830508474576273e-05, 'epoch': 1.3}


 43%|████▎     | 13000/30000 [6:47:27<1:26:36,  3.27it/s]

{'loss': 0.2813, 'grad_norm': 0.6378897428512573, 'learning_rate': 2.88135593220339e-05, 'epoch': 1.3}


 43%|████▎     | 13010/30000 [6:47:30<1:26:36,  3.27it/s]

{'loss': 0.2479, 'grad_norm': 0.46262168884277344, 'learning_rate': 2.879661016949153e-05, 'epoch': 1.3}


 43%|████▎     | 13020/30000 [6:57:02<96:11:50, 20.40s/it]  

{'loss': 0.2424, 'grad_norm': 0.5372979640960693, 'learning_rate': 2.8779661016949155e-05, 'epoch': 1.3}


 43%|████▎     | 13030/30000 [6:57:05<4:06:41,  1.15it/s] 

{'loss': 0.2367, 'grad_norm': 0.5707707405090332, 'learning_rate': 2.8762711864406784e-05, 'epoch': 1.3}


 43%|████▎     | 13040/30000 [6:57:08<1:30:35,  3.12it/s]

{'loss': 0.2416, 'grad_norm': 0.4727014899253845, 'learning_rate': 2.874576271186441e-05, 'epoch': 1.3}


 44%|████▎     | 13050/30000 [6:57:11<1:26:05,  3.28it/s]

{'loss': 0.2512, 'grad_norm': 0.5525685548782349, 'learning_rate': 2.8728813559322036e-05, 'epoch': 1.3}


 44%|████▎     | 13060/30000 [6:57:14<1:26:00,  3.28it/s]

{'loss': 0.2434, 'grad_norm': 0.6069933772087097, 'learning_rate': 2.8711864406779665e-05, 'epoch': 1.31}


 44%|████▎     | 13070/30000 [6:57:17<1:26:05,  3.28it/s]

{'loss': 0.2491, 'grad_norm': 0.6362388730049133, 'learning_rate': 2.8694915254237288e-05, 'epoch': 1.31}


 44%|████▎     | 13080/30000 [6:57:20<1:26:01,  3.28it/s]

{'loss': 0.2427, 'grad_norm': 0.5658567547798157, 'learning_rate': 2.8677966101694914e-05, 'epoch': 1.31}


 44%|████▎     | 13090/30000 [6:57:23<1:25:57,  3.28it/s]

{'loss': 0.2414, 'grad_norm': 0.4881482720375061, 'learning_rate': 2.8661016949152543e-05, 'epoch': 1.31}


 44%|████▎     | 13100/30000 [6:57:26<1:25:54,  3.28it/s]

{'loss': 0.2355, 'grad_norm': 0.5638443827629089, 'learning_rate': 2.864406779661017e-05, 'epoch': 1.31}


 44%|████▎     | 13110/30000 [6:57:29<1:25:55,  3.28it/s]

{'loss': 0.2223, 'grad_norm': 0.5377737283706665, 'learning_rate': 2.8627118644067795e-05, 'epoch': 1.31}


 44%|████▎     | 13120/30000 [6:57:33<1:25:53,  3.28it/s]

{'loss': 0.2448, 'grad_norm': 0.44961443543434143, 'learning_rate': 2.8610169491525424e-05, 'epoch': 1.31}


 44%|████▍     | 13130/30000 [6:57:36<1:26:00,  3.27it/s]

{'loss': 0.2391, 'grad_norm': 0.5665096044540405, 'learning_rate': 2.859322033898305e-05, 'epoch': 1.31}


 44%|████▍     | 13140/30000 [6:57:39<1:25:52,  3.27it/s]

{'loss': 0.2508, 'grad_norm': 0.5092164874076843, 'learning_rate': 2.857627118644068e-05, 'epoch': 1.31}


 44%|████▍     | 13150/30000 [6:57:42<1:25:54,  3.27it/s]

{'loss': 0.2435, 'grad_norm': 0.5282154083251953, 'learning_rate': 2.8559322033898306e-05, 'epoch': 1.31}


 44%|████▍     | 13160/30000 [6:57:45<1:25:48,  3.27it/s]

{'loss': 0.2366, 'grad_norm': 0.40181082487106323, 'learning_rate': 2.854237288135593e-05, 'epoch': 1.32}


 44%|████▍     | 13170/30000 [7:14:35<486:01:27, 103.96s/it] 

{'loss': 0.2416, 'grad_norm': 0.47478267550468445, 'learning_rate': 2.852542372881356e-05, 'epoch': 1.32}


 44%|████▍     | 13180/30000 [7:14:38<15:06:06,  3.23s/it]  

{'loss': 0.2392, 'grad_norm': 0.44813698530197144, 'learning_rate': 2.8508474576271187e-05, 'epoch': 1.32}


 44%|████▍     | 13190/30000 [7:14:41<1:48:25,  2.58it/s] 

{'loss': 0.2328, 'grad_norm': 0.49931100010871887, 'learning_rate': 2.8491525423728816e-05, 'epoch': 1.32}


 44%|████▍     | 13200/30000 [7:14:44<1:25:49,  3.26it/s]

{'loss': 0.2369, 'grad_norm': 0.518193244934082, 'learning_rate': 2.8474576271186442e-05, 'epoch': 1.32}


 44%|████▍     | 13210/30000 [7:14:47<1:25:18,  3.28it/s]

{'loss': 0.2321, 'grad_norm': 0.6562368273735046, 'learning_rate': 2.845762711864407e-05, 'epoch': 1.32}


 44%|████▍     | 13220/30000 [7:14:50<1:25:10,  3.28it/s]

{'loss': 0.2554, 'grad_norm': 0.5326790809631348, 'learning_rate': 2.8440677966101698e-05, 'epoch': 1.32}


 44%|████▍     | 13230/30000 [7:14:53<1:25:07,  3.28it/s]

{'loss': 0.2299, 'grad_norm': 0.5426509976387024, 'learning_rate': 2.8423728813559323e-05, 'epoch': 1.32}


 44%|████▍     | 13240/30000 [7:14:56<1:25:02,  3.28it/s]

{'loss': 0.2371, 'grad_norm': 0.5964985489845276, 'learning_rate': 2.8406779661016953e-05, 'epoch': 1.32}


 44%|████▍     | 13250/30000 [7:15:00<1:25:05,  3.28it/s]

{'loss': 0.2462, 'grad_norm': 0.5280324816703796, 'learning_rate': 2.838983050847458e-05, 'epoch': 1.32}


 44%|████▍     | 13260/30000 [7:15:03<1:25:07,  3.28it/s]

{'loss': 0.2368, 'grad_norm': 0.5305783748626709, 'learning_rate': 2.8372881355932208e-05, 'epoch': 1.33}


 44%|████▍     | 13270/30000 [7:15:06<1:25:01,  3.28it/s]

{'loss': 0.2426, 'grad_norm': 0.7094561457633972, 'learning_rate': 2.8355932203389834e-05, 'epoch': 1.33}


 44%|████▍     | 13280/30000 [7:15:09<1:25:04,  3.28it/s]

{'loss': 0.2482, 'grad_norm': 0.573050856590271, 'learning_rate': 2.8338983050847457e-05, 'epoch': 1.33}


 44%|████▍     | 13290/30000 [7:15:12<1:25:06,  3.27it/s]

{'loss': 0.2523, 'grad_norm': 0.6236212849617004, 'learning_rate': 2.8322033898305083e-05, 'epoch': 1.33}


 44%|████▍     | 13300/30000 [7:15:15<1:25:04,  3.27it/s]

{'loss': 0.2297, 'grad_norm': 0.48286107182502747, 'learning_rate': 2.8305084745762712e-05, 'epoch': 1.33}


 44%|████▍     | 13310/30000 [7:15:18<1:25:02,  3.27it/s]

{'loss': 0.2466, 'grad_norm': 0.5776840448379517, 'learning_rate': 2.8288135593220338e-05, 'epoch': 1.33}


 44%|████▍     | 13320/30000 [7:15:21<1:24:59,  3.27it/s]

{'loss': 0.2295, 'grad_norm': 0.4454347491264343, 'learning_rate': 2.8271186440677967e-05, 'epoch': 1.33}


 44%|████▍     | 13330/30000 [7:15:24<1:24:51,  3.27it/s]

{'loss': 0.2252, 'grad_norm': 0.4298066198825836, 'learning_rate': 2.8254237288135593e-05, 'epoch': 1.33}


 44%|████▍     | 13340/30000 [7:15:27<1:24:52,  3.27it/s]

{'loss': 0.2489, 'grad_norm': 0.5219320058822632, 'learning_rate': 2.823728813559322e-05, 'epoch': 1.33}


 44%|████▍     | 13350/30000 [7:15:30<1:24:51,  3.27it/s]

{'loss': 0.2321, 'grad_norm': 0.49796202778816223, 'learning_rate': 2.822033898305085e-05, 'epoch': 1.33}


 45%|████▍     | 13360/30000 [7:15:33<1:24:47,  3.27it/s]

{'loss': 0.2258, 'grad_norm': 0.40642493963241577, 'learning_rate': 2.8203389830508475e-05, 'epoch': 1.34}


 45%|████▍     | 13370/30000 [7:15:36<1:24:48,  3.27it/s]

{'loss': 0.2282, 'grad_norm': 0.5446206331253052, 'learning_rate': 2.8186440677966104e-05, 'epoch': 1.34}


 45%|████▍     | 13380/30000 [7:15:39<1:24:40,  3.27it/s]

{'loss': 0.2376, 'grad_norm': 0.7141706943511963, 'learning_rate': 2.816949152542373e-05, 'epoch': 1.34}


 45%|████▍     | 13390/30000 [7:15:42<1:24:33,  3.27it/s]

{'loss': 0.2442, 'grad_norm': 0.6130696535110474, 'learning_rate': 2.815254237288136e-05, 'epoch': 1.34}


 45%|████▍     | 13400/30000 [7:15:45<1:24:32,  3.27it/s]

{'loss': 0.2424, 'grad_norm': 0.5373878479003906, 'learning_rate': 2.8135593220338985e-05, 'epoch': 1.34}


 45%|████▍     | 13410/30000 [7:15:48<1:24:34,  3.27it/s]

{'loss': 0.2493, 'grad_norm': 0.617455005645752, 'learning_rate': 2.811864406779661e-05, 'epoch': 1.34}


 45%|████▍     | 13420/30000 [7:15:51<1:24:33,  3.27it/s]

{'loss': 0.2393, 'grad_norm': 0.4402185082435608, 'learning_rate': 2.810169491525424e-05, 'epoch': 1.34}


 45%|████▍     | 13430/30000 [7:15:55<1:24:23,  3.27it/s]

{'loss': 0.239, 'grad_norm': 0.5505290031433105, 'learning_rate': 2.8084745762711866e-05, 'epoch': 1.34}


 45%|████▍     | 13440/30000 [7:15:58<1:24:28,  3.27it/s]

{'loss': 0.2394, 'grad_norm': 0.49852654337882996, 'learning_rate': 2.8067796610169496e-05, 'epoch': 1.34}


 45%|████▍     | 13450/30000 [7:16:01<1:24:14,  3.27it/s]

{'loss': 0.2415, 'grad_norm': 0.4421837329864502, 'learning_rate': 2.8050847457627122e-05, 'epoch': 1.34}


 45%|████▍     | 13460/30000 [7:16:04<1:24:16,  3.27it/s]

{'loss': 0.2483, 'grad_norm': 0.44591063261032104, 'learning_rate': 2.8033898305084748e-05, 'epoch': 1.35}


 45%|████▍     | 13470/30000 [7:16:07<1:24:11,  3.27it/s]

{'loss': 0.2481, 'grad_norm': 0.6694694757461548, 'learning_rate': 2.8016949152542377e-05, 'epoch': 1.35}


 45%|████▍     | 13480/30000 [7:16:10<1:24:11,  3.27it/s]

{'loss': 0.2551, 'grad_norm': 0.48849940299987793, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.35}


 45%|████▍     | 13490/30000 [7:16:13<1:24:02,  3.27it/s]

{'loss': 0.2402, 'grad_norm': 0.5348761677742004, 'learning_rate': 2.7983050847457626e-05, 'epoch': 1.35}


 45%|████▌     | 13500/30000 [7:16:16<1:24:01,  3.27it/s]

{'loss': 0.2406, 'grad_norm': 0.5179685950279236, 'learning_rate': 2.7966101694915255e-05, 'epoch': 1.35}


 45%|████▌     | 13510/30000 [7:16:19<1:24:00,  3.27it/s]

{'loss': 0.2418, 'grad_norm': 0.5926191806793213, 'learning_rate': 2.794915254237288e-05, 'epoch': 1.35}


 45%|████▌     | 13520/30000 [7:16:22<1:23:58,  3.27it/s]

{'loss': 0.2332, 'grad_norm': 0.5018917322158813, 'learning_rate': 2.7932203389830507e-05, 'epoch': 1.35}


 45%|████▌     | 13530/30000 [7:16:25<1:23:54,  3.27it/s]

{'loss': 0.2523, 'grad_norm': 0.5123785734176636, 'learning_rate': 2.7915254237288136e-05, 'epoch': 1.35}


 45%|████▌     | 13540/30000 [7:16:28<1:23:50,  3.27it/s]

{'loss': 0.2507, 'grad_norm': 0.6496906280517578, 'learning_rate': 2.7898305084745762e-05, 'epoch': 1.35}


 45%|████▌     | 13550/30000 [7:16:31<1:23:46,  3.27it/s]

{'loss': 0.2232, 'grad_norm': 0.6060948967933655, 'learning_rate': 2.788135593220339e-05, 'epoch': 1.35}


 45%|████▌     | 13560/30000 [7:16:34<1:23:45,  3.27it/s]

{'loss': 0.2436, 'grad_norm': 0.5784517526626587, 'learning_rate': 2.7864406779661017e-05, 'epoch': 1.36}


 45%|████▌     | 13570/30000 [7:16:37<1:23:51,  3.27it/s]

{'loss': 0.2435, 'grad_norm': 0.5669368505477905, 'learning_rate': 2.7847457627118643e-05, 'epoch': 1.36}


 45%|████▌     | 13580/30000 [7:16:40<1:23:37,  3.27it/s]

{'loss': 0.2361, 'grad_norm': 0.668563961982727, 'learning_rate': 2.7830508474576273e-05, 'epoch': 1.36}


 45%|████▌     | 13590/30000 [7:16:44<1:23:33,  3.27it/s]

{'loss': 0.2344, 'grad_norm': 0.664941668510437, 'learning_rate': 2.78135593220339e-05, 'epoch': 1.36}


 45%|████▌     | 13600/30000 [7:16:47<1:23:38,  3.27it/s]

{'loss': 0.2472, 'grad_norm': 0.48250964283943176, 'learning_rate': 2.7796610169491528e-05, 'epoch': 1.36}


 45%|████▌     | 13610/30000 [7:16:50<1:23:30,  3.27it/s]

{'loss': 0.227, 'grad_norm': 0.5495811104774475, 'learning_rate': 2.7779661016949154e-05, 'epoch': 1.36}


 45%|████▌     | 13620/30000 [7:16:53<1:23:23,  3.27it/s]

{'loss': 0.2537, 'grad_norm': 0.6295754313468933, 'learning_rate': 2.7762711864406783e-05, 'epoch': 1.36}


 45%|████▌     | 13630/30000 [7:16:56<1:25:27,  3.19it/s]

{'loss': 0.2418, 'grad_norm': 0.4630430340766907, 'learning_rate': 2.774576271186441e-05, 'epoch': 1.36}


 45%|████▌     | 13640/30000 [7:16:59<1:27:49,  3.10it/s]

{'loss': 0.2424, 'grad_norm': 0.538496732711792, 'learning_rate': 2.7728813559322035e-05, 'epoch': 1.36}


 46%|████▌     | 13650/30000 [7:30:59<275:09:00, 60.58s/it]  

{'loss': 0.2541, 'grad_norm': 0.38157472014427185, 'learning_rate': 2.7711864406779665e-05, 'epoch': 1.36}


 46%|████▌     | 13660/30000 [7:31:02<9:06:58,  2.01s/it]  

{'loss': 0.2454, 'grad_norm': 0.4587932527065277, 'learning_rate': 2.769491525423729e-05, 'epoch': 1.37}


 46%|████▌     | 13670/30000 [7:31:05<1:35:59,  2.84it/s]

{'loss': 0.243, 'grad_norm': 0.5500476360321045, 'learning_rate': 2.767796610169492e-05, 'epoch': 1.37}


 46%|████▌     | 13680/30000 [7:31:08<1:23:18,  3.27it/s]

{'loss': 0.2417, 'grad_norm': 0.6116567850112915, 'learning_rate': 2.7661016949152546e-05, 'epoch': 1.37}


 46%|████▌     | 13690/30000 [7:31:11<1:22:56,  3.28it/s]

{'loss': 0.2518, 'grad_norm': 0.7101960778236389, 'learning_rate': 2.7644067796610172e-05, 'epoch': 1.37}


 46%|████▌     | 13700/30000 [7:31:14<1:22:44,  3.28it/s]

{'loss': 0.2412, 'grad_norm': 0.4503869414329529, 'learning_rate': 2.7627118644067794e-05, 'epoch': 1.37}


 46%|████▌     | 13710/30000 [7:31:17<1:22:44,  3.28it/s]

{'loss': 0.2233, 'grad_norm': 0.4826962649822235, 'learning_rate': 2.7610169491525424e-05, 'epoch': 1.37}


 46%|████▌     | 13720/30000 [7:31:20<1:22:39,  3.28it/s]

{'loss': 0.2397, 'grad_norm': 0.6239086985588074, 'learning_rate': 2.759322033898305e-05, 'epoch': 1.37}


 46%|████▌     | 13730/30000 [7:31:23<1:22:39,  3.28it/s]

{'loss': 0.2476, 'grad_norm': 0.5220036506652832, 'learning_rate': 2.757627118644068e-05, 'epoch': 1.37}


 46%|████▌     | 13740/30000 [7:31:26<1:22:34,  3.28it/s]

{'loss': 0.2545, 'grad_norm': 0.530498206615448, 'learning_rate': 2.7559322033898305e-05, 'epoch': 1.37}


 46%|████▌     | 13750/30000 [7:31:30<1:22:41,  3.28it/s]

{'loss': 0.2432, 'grad_norm': 0.6057897210121155, 'learning_rate': 2.754237288135593e-05, 'epoch': 1.38}


 46%|████▌     | 13760/30000 [7:31:33<1:22:37,  3.28it/s]

{'loss': 0.2473, 'grad_norm': 0.5148394703865051, 'learning_rate': 2.752542372881356e-05, 'epoch': 1.38}


 46%|████▌     | 13770/30000 [7:31:36<1:22:39,  3.27it/s]

{'loss': 0.2419, 'grad_norm': 0.4694422483444214, 'learning_rate': 2.7508474576271186e-05, 'epoch': 1.38}


 46%|████▌     | 13780/30000 [7:31:39<1:22:41,  3.27it/s]

{'loss': 0.2321, 'grad_norm': 0.4967331290245056, 'learning_rate': 2.7491525423728816e-05, 'epoch': 1.38}


 46%|████▌     | 13790/30000 [7:31:42<1:22:36,  3.27it/s]

{'loss': 0.2318, 'grad_norm': 0.5024102330207825, 'learning_rate': 2.747457627118644e-05, 'epoch': 1.38}


 46%|████▌     | 13800/30000 [7:47:45<908:32:54, 201.90s/it] 

{'loss': 0.2372, 'grad_norm': 0.4092722237110138, 'learning_rate': 2.7457627118644068e-05, 'epoch': 1.38}


 46%|████▌     | 13810/30000 [7:47:48<26:58:39,  6.00s/it]  

{'loss': 0.2392, 'grad_norm': 0.4515382647514343, 'learning_rate': 2.7440677966101697e-05, 'epoch': 1.38}


 46%|████▌     | 13820/30000 [7:47:51<2:05:26,  2.15it/s] 

{'loss': 0.2505, 'grad_norm': 0.5572725534439087, 'learning_rate': 2.7423728813559323e-05, 'epoch': 1.38}


 46%|████▌     | 13830/30000 [7:47:54<1:23:20,  3.23it/s]

{'loss': 0.2416, 'grad_norm': 0.3793348968029022, 'learning_rate': 2.7406779661016952e-05, 'epoch': 1.38}


 46%|████▌     | 13840/30000 [7:47:57<1:22:03,  3.28it/s]

{'loss': 0.2287, 'grad_norm': 0.49312520027160645, 'learning_rate': 2.7389830508474578e-05, 'epoch': 1.38}


 46%|████▌     | 13850/30000 [7:48:00<1:22:04,  3.28it/s]

{'loss': 0.266, 'grad_norm': 0.6320632100105286, 'learning_rate': 2.7372881355932208e-05, 'epoch': 1.39}


 46%|████▌     | 13860/30000 [7:48:03<1:21:48,  3.29it/s]

{'loss': 0.236, 'grad_norm': 0.8119969367980957, 'learning_rate': 2.7355932203389833e-05, 'epoch': 1.39}


 46%|████▌     | 13870/30000 [7:48:06<1:21:49,  3.29it/s]

{'loss': 0.2494, 'grad_norm': 0.6103100776672363, 'learning_rate': 2.733898305084746e-05, 'epoch': 1.39}


 46%|████▋     | 13880/30000 [7:48:09<1:21:53,  3.28it/s]

{'loss': 0.2394, 'grad_norm': 0.3773106038570404, 'learning_rate': 2.732203389830509e-05, 'epoch': 1.39}


 46%|████▋     | 13890/30000 [7:48:12<1:22:02,  3.27it/s]

{'loss': 0.2362, 'grad_norm': 0.4541527032852173, 'learning_rate': 2.7305084745762715e-05, 'epoch': 1.39}


 46%|████▋     | 13900/30000 [7:48:15<1:21:48,  3.28it/s]

{'loss': 0.2394, 'grad_norm': 0.5307490825653076, 'learning_rate': 2.7288135593220337e-05, 'epoch': 1.39}


 46%|████▋     | 13910/30000 [7:48:18<1:21:45,  3.28it/s]

{'loss': 0.2434, 'grad_norm': 0.555381178855896, 'learning_rate': 2.7271186440677963e-05, 'epoch': 1.39}


 46%|████▋     | 13920/30000 [7:48:21<1:21:49,  3.28it/s]

{'loss': 0.2378, 'grad_norm': 0.5123980641365051, 'learning_rate': 2.7254237288135593e-05, 'epoch': 1.39}


 46%|████▋     | 13930/30000 [7:48:24<1:21:42,  3.28it/s]

{'loss': 0.2427, 'grad_norm': 0.5147427320480347, 'learning_rate': 2.723728813559322e-05, 'epoch': 1.39}


 46%|████▋     | 13940/30000 [7:48:27<1:21:41,  3.28it/s]

{'loss': 0.2366, 'grad_norm': 0.45478594303131104, 'learning_rate': 2.7220338983050848e-05, 'epoch': 1.39}


 46%|████▋     | 13950/30000 [7:48:31<1:21:38,  3.28it/s]

{'loss': 0.2342, 'grad_norm': 0.5553879737854004, 'learning_rate': 2.7203389830508474e-05, 'epoch': 1.4}


 47%|████▋     | 13960/30000 [7:48:34<1:21:41,  3.27it/s]

{'loss': 0.2387, 'grad_norm': 0.4983001947402954, 'learning_rate': 2.7186440677966103e-05, 'epoch': 1.4}


 47%|████▋     | 13970/30000 [7:48:37<1:21:30,  3.28it/s]

{'loss': 0.2416, 'grad_norm': 0.5299504399299622, 'learning_rate': 2.716949152542373e-05, 'epoch': 1.4}


 47%|████▋     | 13980/30000 [7:48:40<1:21:32,  3.27it/s]

{'loss': 0.2397, 'grad_norm': 0.541739821434021, 'learning_rate': 2.7152542372881355e-05, 'epoch': 1.4}


 47%|████▋     | 13990/30000 [7:48:43<1:21:27,  3.28it/s]

{'loss': 0.2475, 'grad_norm': 0.5298669934272766, 'learning_rate': 2.7135593220338985e-05, 'epoch': 1.4}


 47%|████▋     | 14000/30000 [7:48:46<1:21:26,  3.27it/s]

{'loss': 0.2385, 'grad_norm': 0.45820263028144836, 'learning_rate': 2.711864406779661e-05, 'epoch': 1.4}


 47%|████▋     | 14010/30000 [7:48:49<1:21:23,  3.27it/s]

{'loss': 0.2341, 'grad_norm': 0.4709526300430298, 'learning_rate': 2.710169491525424e-05, 'epoch': 1.4}


 47%|████▋     | 14020/30000 [7:48:52<1:21:28,  3.27it/s]

{'loss': 0.2581, 'grad_norm': 0.579433023929596, 'learning_rate': 2.7084745762711866e-05, 'epoch': 1.4}


 47%|████▋     | 14030/30000 [7:48:55<1:21:15,  3.28it/s]

{'loss': 0.2384, 'grad_norm': 0.5827697515487671, 'learning_rate': 2.7067796610169495e-05, 'epoch': 1.4}


 47%|████▋     | 14040/30000 [7:48:58<1:21:08,  3.28it/s]

{'loss': 0.2355, 'grad_norm': 0.5756813287734985, 'learning_rate': 2.705084745762712e-05, 'epoch': 1.4}


 47%|████▋     | 14050/30000 [7:49:01<1:21:07,  3.28it/s]

{'loss': 0.2382, 'grad_norm': 0.6088148951530457, 'learning_rate': 2.7033898305084747e-05, 'epoch': 1.41}


 47%|████▋     | 14060/30000 [7:49:04<1:21:02,  3.28it/s]

{'loss': 0.2436, 'grad_norm': 0.5362594127655029, 'learning_rate': 2.7016949152542376e-05, 'epoch': 1.41}


 47%|████▋     | 14070/30000 [7:49:07<1:20:58,  3.28it/s]

{'loss': 0.2268, 'grad_norm': 0.5070714950561523, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.41}


 47%|████▋     | 14080/30000 [7:49:10<1:20:59,  3.28it/s]

{'loss': 0.2482, 'grad_norm': 0.6094871163368225, 'learning_rate': 2.6983050847457632e-05, 'epoch': 1.41}


 47%|████▋     | 14090/30000 [7:49:13<1:20:51,  3.28it/s]

{'loss': 0.2302, 'grad_norm': 0.43689411878585815, 'learning_rate': 2.6966101694915258e-05, 'epoch': 1.41}


 47%|████▋     | 14100/30000 [7:49:16<1:20:52,  3.28it/s]

{'loss': 0.2479, 'grad_norm': 0.46925994753837585, 'learning_rate': 2.6949152542372884e-05, 'epoch': 1.41}


 47%|████▋     | 14110/30000 [7:49:19<1:20:47,  3.28it/s]

{'loss': 0.2346, 'grad_norm': 0.5888428092002869, 'learning_rate': 2.6932203389830506e-05, 'epoch': 1.41}


 47%|████▋     | 14120/30000 [7:49:22<1:21:02,  3.27it/s]

{'loss': 0.2189, 'grad_norm': 0.5418212413787842, 'learning_rate': 2.6915254237288136e-05, 'epoch': 1.41}


 47%|████▋     | 14130/30000 [7:49:26<1:20:44,  3.28it/s]

{'loss': 0.2522, 'grad_norm': 0.5893318057060242, 'learning_rate': 2.689830508474576e-05, 'epoch': 1.41}


 47%|████▋     | 14140/30000 [7:49:29<1:20:41,  3.28it/s]

{'loss': 0.2415, 'grad_norm': 0.5618340373039246, 'learning_rate': 2.688135593220339e-05, 'epoch': 1.41}


 47%|████▋     | 14150/30000 [7:49:32<1:20:33,  3.28it/s]

{'loss': 0.2253, 'grad_norm': 0.5213756561279297, 'learning_rate': 2.6864406779661017e-05, 'epoch': 1.42}


 47%|████▋     | 14160/30000 [7:49:35<1:20:30,  3.28it/s]

{'loss': 0.2569, 'grad_norm': 0.5642077922821045, 'learning_rate': 2.6847457627118643e-05, 'epoch': 1.42}


 47%|████▋     | 14170/30000 [7:49:38<1:20:29,  3.28it/s]

{'loss': 0.2605, 'grad_norm': 0.5933680534362793, 'learning_rate': 2.6830508474576272e-05, 'epoch': 1.42}


 47%|████▋     | 14180/30000 [7:49:41<1:20:27,  3.28it/s]

{'loss': 0.2441, 'grad_norm': 0.5026837587356567, 'learning_rate': 2.6813559322033898e-05, 'epoch': 1.42}


 47%|████▋     | 14190/30000 [7:49:44<1:20:24,  3.28it/s]

{'loss': 0.2444, 'grad_norm': 0.5093046426773071, 'learning_rate': 2.6796610169491527e-05, 'epoch': 1.42}


 47%|████▋     | 14200/30000 [7:49:47<1:20:18,  3.28it/s]

{'loss': 0.2327, 'grad_norm': 0.5571191906929016, 'learning_rate': 2.6779661016949153e-05, 'epoch': 1.42}


 47%|████▋     | 14210/30000 [7:49:50<1:20:22,  3.27it/s]

{'loss': 0.2392, 'grad_norm': 0.41925764083862305, 'learning_rate': 2.676271186440678e-05, 'epoch': 1.42}


 47%|████▋     | 14220/30000 [7:49:53<1:20:13,  3.28it/s]

{'loss': 0.2468, 'grad_norm': 0.5290084481239319, 'learning_rate': 2.674576271186441e-05, 'epoch': 1.42}


 47%|████▋     | 14230/30000 [7:49:56<1:20:13,  3.28it/s]

{'loss': 0.2364, 'grad_norm': 0.5083810687065125, 'learning_rate': 2.6728813559322035e-05, 'epoch': 1.42}


 47%|████▋     | 14240/30000 [7:49:59<1:20:03,  3.28it/s]

{'loss': 0.2525, 'grad_norm': 0.5450607538223267, 'learning_rate': 2.6711864406779664e-05, 'epoch': 1.42}


 48%|████▊     | 14250/30000 [7:50:02<1:20:06,  3.28it/s]

{'loss': 0.2447, 'grad_norm': 0.49029162526130676, 'learning_rate': 2.669491525423729e-05, 'epoch': 1.43}


 48%|████▊     | 14260/30000 [7:50:05<1:20:25,  3.26it/s]

{'loss': 0.2374, 'grad_norm': 0.5231288075447083, 'learning_rate': 2.667796610169492e-05, 'epoch': 1.43}


 48%|████▊     | 14270/30000 [7:50:08<1:21:54,  3.20it/s]

{'loss': 0.2407, 'grad_norm': 0.6711390614509583, 'learning_rate': 2.6661016949152545e-05, 'epoch': 1.43}


 48%|████▊     | 14280/30000 [7:50:27<3:51:42,  1.13it/s] 

{'loss': 0.2462, 'grad_norm': 0.5178260803222656, 'learning_rate': 2.664406779661017e-05, 'epoch': 1.43}


 48%|████▊     | 14290/30000 [7:50:30<1:30:22,  2.90it/s]

{'loss': 0.2456, 'grad_norm': 0.5277623534202576, 'learning_rate': 2.66271186440678e-05, 'epoch': 1.43}


 48%|████▊     | 14300/30000 [7:50:34<1:26:31,  3.02it/s]

{'loss': 0.2409, 'grad_norm': 0.5087192058563232, 'learning_rate': 2.6610169491525427e-05, 'epoch': 1.43}


 48%|████▊     | 14310/30000 [7:50:37<1:27:47,  2.98it/s]

{'loss': 0.2407, 'grad_norm': 0.9264577627182007, 'learning_rate': 2.6593220338983056e-05, 'epoch': 1.43}


 48%|████▊     | 14320/30000 [7:50:40<1:22:59,  3.15it/s]

{'loss': 0.2364, 'grad_norm': 0.5264766812324524, 'learning_rate': 2.6576271186440675e-05, 'epoch': 1.43}


 48%|████▊     | 14330/30000 [7:50:44<1:22:09,  3.18it/s]

{'loss': 0.2514, 'grad_norm': 0.5145848989486694, 'learning_rate': 2.6559322033898304e-05, 'epoch': 1.43}


 48%|████▊     | 14340/30000 [7:50:47<1:22:19,  3.17it/s]

{'loss': 0.233, 'grad_norm': 0.4402417838573456, 'learning_rate': 2.654237288135593e-05, 'epoch': 1.43}


 48%|████▊     | 14350/30000 [7:50:50<1:22:21,  3.17it/s]

{'loss': 0.2287, 'grad_norm': 0.5233749151229858, 'learning_rate': 2.652542372881356e-05, 'epoch': 1.44}


 48%|████▊     | 14355/30000 [7:50:51<1:23:56,  3.11it/s]

# EVALUATE MODEL

In [None]:
# torch is only needed here to build the device handle used for evaluation.
import torch
# Evaluation runs on the CPU. Presumably this keeps the model on the same
# device as the CPU tensors the tokenizer returns during move generation
# (NOTE(review): confirm; nothing in this cell states the reason).
device = torch.device("cpu")
# Bare last expression: the notebook displays the value returned by `.to()`.
model.to(device)

In [None]:
class ChessSimulator:
    """Play one game between the language model (White) and a UCI engine (Black).

    The running game is kept twice: as a ``chess.Board`` (source of truth for
    legality and game end) and as ``self.prompt``, a text transcript in the
    spaced format produced by ``format_pgn`` (e.g. ``1. e4 1. .. e5 2. N f3``)
    that is fed back to the model to obtain its next move.

    Args:
        model: causal LM exposing ``generate(**inputs, ...)``.
        tokenizer: callable tokenizer exposing ``decode(...)``.
        stockfish_filepath: path to the UCI engine binary to launch.
    """

    # Spacing rules applied to engine moves so that they look like the text
    # produced by format_pgn. The alternation order of the special-move
    # pattern is kept exactly as in format_pgn: "O-O" is tried before "O-O-O",
    # so queenside castling is rendered as "O-O -O". Do not "fix" it here
    # without changing format_pgn as well.
    _PIECE_PATTERN = re.compile(r'([KQRBN])')  # Chess pieces (e.g., "N", "K")
    _SPECIAL_MOVE_PATTERN = re.compile(r'(O-O|O-O-O|\+|#|x|=Q|=R|=B|=N)')  # Castling, check, capture, promotions
    # A token that starts a new move number (e.g. "12.").
    _MOVE_NUMBER_TOKEN = re.compile(r'\d+\.')

    def __init__(self, model, tokenizer, stockfish_filepath):
        self.model = model
        self.tokenizer = tokenizer
        self.board = chess.Board()
        self.engine = chess.engine.SimpleEngine.popen_uci(stockfish_filepath)
        self.prompt = ''
        self.move_number = 1
        self.modelTurn = True  # the model moves first, i.e. plays White
        self.currentMove = None  # compact SAN of the model's latest move
        self.lastError = None  # exception that aborted the game, if any

    def simulateGame(self):
        """Alternate model and engine moves until the game is over.

        Returns:
            0 if the model (White) is checkmated or the game is aborted by an
            error (unparseable/illegal model move, engine failure, ...);
            1 if Black is checkmated or the game is drawn (draws count as a win).

        The engine process is always shut down before returning. The exception
        that aborted a game, if any, is kept in ``self.lastError``.
        """
        try:
            # is_game_over() also covers stalemate and the automatic draws, so
            # a drawn position ends the loop instead of running into an error.
            while not self.board.is_game_over():
                if self.modelTurn:
                    self.generateModelMove()
                    print(self.prompt)
                else:
                    self.generateChessMove()
                    self.move_number += 1
                self.modelTurn = not self.modelTurn
        except Exception as exc:
            # Any failure counts as a loss for the model. KeyboardInterrupt and
            # SystemExit are deliberately not caught so the run can be stopped.
            self.lastError = exc
            return 0
        finally:
            self._quit_engine()

        # The side to move in a checkmate position is the side that was mated.
        if self.board.is_checkmate() and self.board.turn == chess.WHITE:
            return 0
        return 1  # Black was mated, or the game was drawn

    def _quit_engine(self):
        """Best-effort shutdown of the engine subprocess."""
        try:
            self.engine.quit()
        except Exception:
            # The engine may already have exited or crashed; nothing to clean up.
            pass

    def extract_white_move(self, text):
        """Extract White's move for the current move number from model output.

        Args:
            text: decoded model output (prompt plus continuation).

        Returns:
            The move in spaced notation (e.g. ``"N f3"``), or ``None`` when the
            current move number is absent or is not followed by a move.
        """
        # Normalize spaces
        cleaned_text = " ".join(text.split())

        # Define start and end markers
        start_marker = f"{self.move_number}."
        end_marker = f"{self.move_number}. .."

        # Find the position of the move number
        start_index = cleaned_text.find(start_marker)
        if start_index == -1:
            return None  # Move not found
        move_start = start_index + len(start_marker)

        # Find the position of the matching black move
        end_index = cleaned_text.find(end_marker, start_index)
        if end_index != -1:
            move_text = cleaned_text[move_start:end_index].strip()
        else:
            # No black move was generated. A move may span several tokens
            # ("Q x f7 #"), so keep tokens up to the next move number or an
            # angle-bracket marker such as "<EOS>" rather than only the first.
            move_tokens = []
            for token in cleaned_text[move_start:].split():
                if self._MOVE_NUMBER_TOKEN.match(token) or token.startswith('<'):
                    break
                move_tokens.append(token)
            move_text = ' '.join(move_tokens)

        return move_text or None

    def parseModelMove(self, move):
        """Apply the model's move to the board and append it to the prompt.

        Args:
            move: decoded model output containing the move.

        Returns:
            The move in compact SAN (spaces removed), as pushed to the board.

        Raises:
            ValueError: if no move can be extracted, or (from ``push_san``) if
                the move is not a legal, unambiguous SAN move.
        """
        move_text = self.extract_white_move(move)
        if move_text is None:
            raise ValueError(
                f'model output contains no move for move number {self.move_number}'
            )
        # The model writes moves with spaces between components; SAN has none.
        san = move_text.replace(' ', '')
        self.board.push_san(san)
        self.prompt += move_text
        return san

    def parseChessMove(self, move):
        """Append the engine's move to the prompt in the model's text format.

        Must be called before ``move`` is pushed to the board, because SAN is
        computed relative to the current position.

        Args:
            move: the engine's move as returned in ``PlayResult.move``.
        """
        san_move = self.board.san(move)
        # Ensure pieces and special moves are space-separated
        san_move = self._PIECE_PATTERN.sub(r'\1 ', san_move)  # Piece spacing
        san_move = self._SPECIAL_MOVE_PATTERN.sub(r' \1 ', san_move)  # Special moves spacing
        # Collapse the doubled spaces left by the substitutions, as format_pgn does.
        san_move = ' '.join(san_move.split())
        self.prompt += f' {self.move_number}. .. {san_move} '

    def generateModelMove(self):
        """Ask the model for White's next move and play it on the board."""
        self.prompt += f'{self.move_number}. '
        # Use the instance's own model/tokenizer rather than notebook globals.
        inputs = self.tokenizer(self.prompt, return_tensors="pt")
        outputs = self.model.generate(**inputs, max_new_tokens=15, num_return_sequences=1)
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        self.currentMove = self.parseModelMove(generated)

    def generateChessMove(self):
        """Let the engine choose Black's move, record it and play it."""
        result = self.engine.play(self.board, chess.engine.Limit(time=2.0))  # Time limit for the move
        # move needs to be rendered as SAN before being pushed to the board
        self.parseChessMove(result.move)
        self.board.push(result.move)
        


In [None]:
# Build a simulator around the trained model; the engine path is a placeholder
# that has to be replaced with the location of a real Stockfish binary.
game1 = ChessSimulator(
    model=model,
    tokenizer=tokenizer,
    stockfish_filepath="PATH_TO_STOCKFISH_BINARY",
)
# Play one full game; the returned score is shown as the cell output.
game1.simulateGame()