### Importing require libraries

In [24]:
# Import necessary libraries
import torch
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [25]:
# Folder containing Excel files
folder_path = r'data\labeldata.parquet'

data = pd.read_parquet(folder_path)
data.head()

Unnamed: 0,id,subject,emailtext,label
0,738a2c78-e9b7-4886-8a1b-7af4c08f3906,RE: Yates Residence - Mechanical Contractor Me...,"Zac Stevenson, PEPRINCIPAL | BUILDING MEPC . ....",0
1,23e4febf-364f-464a-9a3a-80f9d89d0e6b,Re: 6771 - VeLa - Level 4 Slab Penetration Plans,You tooRespectfullyMichael BentleyGeneral Fore...,0
2,f0789ea4-0a5f-46c3-8adb-083766b63740,RE: FW: 13027A - 20240603 - Prl - Columbia + A...,You can update the two 12 ducts and adjust on ...,0
3,09474b0b-bc7f-4a1f-ab95-0ed2c497468a,RE: West Zephyrhills Elementary - Pasco County...,"You can start with Area A. Proceeded by B,C,D ...",0
4,b7438359-ce6c-42f7-bcfd-5690e875dcda,RE: MLW_0001_CRS_24410_ Connacht Stadium - Exi...,You can hold off on doing this for now.,0


In [26]:
df = data[['emailtext', 'label']].copy()
df = df.rename({'emailtext': 'text'}, axis=1)
df.head()

Unnamed: 0,text,label
0,"Zac Stevenson, PEPRINCIPAL | BUILDING MEPC . ....",0
1,You tooRespectfullyMichael BentleyGeneral Fore...,0
2,You can update the two 12 ducts and adjust on ...,0
3,"You can start with Area A. Proceeded by B,C,D ...",0
4,You can hold off on doing this for now.,0


In [28]:
# df.to_parquet(r'data\training_data.parquet')

In [4]:
df.label.value_counts()

label
0    5798
1      55
Name: count, dtype: int64

In [5]:
# 2. Split data into training and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],  # Ensures the split is stratified
    random_state=42
)

In [6]:
# Reset indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [7]:
# 3. Calculate class weights
# Assign class weights as variables in the code
labels = train_df['label'].values  # Assuming labels are 0 and 1

In [8]:
# Compute class weights using sklearn's compute_class_weight function
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(labels),
    y=labels
)

# # Convert class weights to a tensor
# class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = class_weights.tolist()

# Print class weights for verification
print(f"Class Weights: {class_weights}")

Class Weights: [0.5046809559004681, 53.9078947368421]


In [9]:
# 4. Define the custom dataset
class EmailDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),  # Flatten to remove extra dimension
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# 5. Initialize tokenizer and model configuration
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128
num_labels = len(np.unique(labels))  # Should be 2 for labels 0 and 1

In [11]:
# Load the configuration and set class weights as a variable in the config
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
config.class_weights = class_weights  # Assign class_weights to the config

In [12]:
from transformers.modeling_outputs import SequenceClassifierOutput

class WeightedBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.class_weights = torch.tensor(config.class_weights, dtype=torch.float)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # Get the outputs from the base model without computing the default loss
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,  # Avoid default loss computation
            **kwargs
        )
        logits = outputs.logits

        # Compute custom loss with class weights
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Return outputs as a SequenceClassifierOutput
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [13]:
# 7. Instantiate the model using the custom class and config
model = WeightedBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    config=config
)

Some weights of WeightedBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 8. Create training and validation datasets
train_dataset = EmailDataset(
    texts=train_df['text'],
    labels=train_df['label'],
    tokenizer=tokenizer,
    max_length=max_length
)

val_dataset = EmailDataset(
    texts=val_df['text'],
    labels=val_df['label'],
    tokenizer=tokenizer,
    max_length=max_length
)

In [15]:

# 9. Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',           # Output directory
    num_train_epochs=1,               # Adjust number of epochs as needed
    per_device_train_batch_size=2,    # Batch size for training
    per_device_eval_batch_size=2,     # Batch size for evaluation
    evaluation_strategy='epoch',      # Evaluate every epoch
    save_strategy='epoch',            # Save model every epoch
    logging_dir='./logs',             # Directory for logs
    logging_steps=5,
    load_best_model_at_end=True,      # Load the best model at the end of training
    metric_for_best_model='f1',       # Use F1 score to select best model
    greater_is_better=True,           # Higher F1 score is better
)



In [16]:
# 10. Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=1
    )
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [17]:

# 11. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [18]:
# 12. Train the model
trainer.train()

  0%|          | 5/2049 [00:06<35:41,  1.05s/it]  

{'loss': 0.1036, 'grad_norm': 1.5480989217758179, 'learning_rate': 4.987798926305515e-05, 'epoch': 0.0}


  0%|          | 10/2049 [00:10<29:07,  1.17it/s]

{'loss': 0.0297, 'grad_norm': 0.6570143103599548, 'learning_rate': 4.9755978526110306e-05, 'epoch': 0.0}


  1%|          | 15/2049 [00:14<27:54,  1.21it/s]

{'loss': 0.01, 'grad_norm': 0.19772619009017944, 'learning_rate': 4.963396778916545e-05, 'epoch': 0.01}


  1%|          | 20/2049 [00:20<34:54,  1.03s/it]

{'loss': 0.0052, 'grad_norm': 0.08832549303770065, 'learning_rate': 4.9511957052220596e-05, 'epoch': 0.01}


  1%|          | 25/2049 [00:25<35:55,  1.06s/it]

{'loss': 0.0025, 'grad_norm': 0.05020628124475479, 'learning_rate': 4.9389946315275745e-05, 'epoch': 0.01}


  1%|▏         | 30/2049 [00:30<35:15,  1.05s/it]

{'loss': 0.0015, 'grad_norm': 0.0473594106733799, 'learning_rate': 4.92679355783309e-05, 'epoch': 0.01}


  2%|▏         | 35/2049 [00:36<37:08,  1.11s/it]

{'loss': 0.0011, 'grad_norm': 0.026264211162924767, 'learning_rate': 4.914592484138604e-05, 'epoch': 0.02}


  2%|▏         | 40/2049 [00:41<36:34,  1.09s/it]

{'loss': 0.0008, 'grad_norm': 0.022564170882105827, 'learning_rate': 4.902391410444119e-05, 'epoch': 0.02}


  2%|▏         | 45/2049 [00:47<34:57,  1.05s/it]

{'loss': 0.0006, 'grad_norm': 0.01484739501029253, 'learning_rate': 4.8901903367496345e-05, 'epoch': 0.02}


  2%|▏         | 50/2049 [00:52<36:26,  1.09s/it]

{'loss': 0.0006, 'grad_norm': 0.014515909366309643, 'learning_rate': 4.8779892630551494e-05, 'epoch': 0.02}


  3%|▎         | 55/2049 [00:57<35:44,  1.08s/it]

{'loss': 0.0004, 'grad_norm': 0.011566024273633957, 'learning_rate': 4.8657881893606636e-05, 'epoch': 0.03}


  3%|▎         | 60/2049 [01:03<36:28,  1.10s/it]

{'loss': 0.0004, 'grad_norm': 0.007897144183516502, 'learning_rate': 4.853587115666179e-05, 'epoch': 0.03}


  3%|▎         | 65/2049 [01:08<36:05,  1.09s/it]

{'loss': 0.0003, 'grad_norm': 0.009123879484832287, 'learning_rate': 4.841386041971694e-05, 'epoch': 0.03}


  3%|▎         | 70/2049 [01:14<36:01,  1.09s/it]

{'loss': 0.0003, 'grad_norm': 0.007148827891796827, 'learning_rate': 4.829184968277209e-05, 'epoch': 0.03}


  4%|▎         | 75/2049 [01:19<36:26,  1.11s/it]

{'loss': 0.0003, 'grad_norm': 0.009037812240421772, 'learning_rate': 4.8169838945827236e-05, 'epoch': 0.04}


  4%|▍         | 80/2049 [01:25<35:44,  1.09s/it]

{'loss': 0.0002, 'grad_norm': 0.007006917614489794, 'learning_rate': 4.8047828208882385e-05, 'epoch': 0.04}


  4%|▍         | 85/2049 [01:30<35:24,  1.08s/it]

{'loss': 0.0002, 'grad_norm': 0.005125027149915695, 'learning_rate': 4.792581747193753e-05, 'epoch': 0.04}


  4%|▍         | 90/2049 [01:35<34:28,  1.06s/it]

{'loss': 0.0002, 'grad_norm': 0.005742912646383047, 'learning_rate': 4.780380673499268e-05, 'epoch': 0.04}


  5%|▍         | 95/2049 [01:41<34:45,  1.07s/it]

{'loss': 0.0002, 'grad_norm': 0.005165533162653446, 'learning_rate': 4.768179599804783e-05, 'epoch': 0.05}


  5%|▍         | 100/2049 [01:46<35:02,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.004449865780770779, 'learning_rate': 4.755978526110298e-05, 'epoch': 0.05}


  5%|▌         | 105/2049 [01:52<34:40,  1.07s/it]

{'loss': 0.0002, 'grad_norm': 0.0048528555780649185, 'learning_rate': 4.743777452415813e-05, 'epoch': 0.05}


  5%|▌         | 110/2049 [01:57<35:54,  1.11s/it]

{'loss': 0.0001, 'grad_norm': 0.0036624677013605833, 'learning_rate': 4.731576378721328e-05, 'epoch': 0.05}


  6%|▌         | 115/2049 [02:02<35:10,  1.09s/it]

{'loss': 0.0001, 'grad_norm': 0.004290217999368906, 'learning_rate': 4.7193753050268424e-05, 'epoch': 0.06}


  6%|▌         | 120/2049 [02:08<34:52,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0034389873035252094, 'learning_rate': 4.707174231332357e-05, 'epoch': 0.06}


  6%|▌         | 125/2049 [02:13<34:55,  1.09s/it]

{'loss': 0.0001, 'grad_norm': 0.002614661818370223, 'learning_rate': 4.694973157637873e-05, 'epoch': 0.06}


  6%|▋         | 130/2049 [02:18<34:33,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0032828962430357933, 'learning_rate': 4.6827720839433876e-05, 'epoch': 0.06}


  7%|▋         | 135/2049 [02:23<31:36,  1.01it/s]

{'loss': 0.0001, 'grad_norm': 0.0028688989114016294, 'learning_rate': 4.670571010248902e-05, 'epoch': 0.07}


  7%|▋         | 140/2049 [02:29<34:27,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0025413481052964926, 'learning_rate': 4.658369936554417e-05, 'epoch': 0.07}


  7%|▋         | 145/2049 [02:34<32:27,  1.02s/it]

{'loss': 0.0001, 'grad_norm': 0.002940974198281765, 'learning_rate': 4.646168862859932e-05, 'epoch': 0.07}


  7%|▋         | 150/2049 [02:39<33:43,  1.07s/it]

{'loss': 0.0001, 'grad_norm': 0.0024515551049262285, 'learning_rate': 4.633967789165447e-05, 'epoch': 0.07}


  8%|▊         | 155/2049 [02:44<33:14,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.0029402575455605984, 'learning_rate': 4.621766715470962e-05, 'epoch': 0.08}


  8%|▊         | 160/2049 [02:50<32:35,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.0019477730384096503, 'learning_rate': 4.609565641776477e-05, 'epoch': 0.08}


  8%|▊         | 165/2049 [02:55<32:58,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.00224144384264946, 'learning_rate': 4.5973645680819915e-05, 'epoch': 0.08}


  8%|▊         | 170/2049 [03:00<33:50,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0030892998911440372, 'learning_rate': 4.5851634943875064e-05, 'epoch': 0.08}


  9%|▊         | 175/2049 [03:06<33:35,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0022156420163810253, 'learning_rate': 4.572962420693021e-05, 'epoch': 0.09}


  9%|▉         | 180/2049 [03:11<32:36,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.001979690743610263, 'learning_rate': 4.560761346998536e-05, 'epoch': 0.09}


  9%|▉         | 185/2049 [03:16<32:45,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.00795717816799879, 'learning_rate': 4.548560273304051e-05, 'epoch': 0.09}


  9%|▉         | 190/2049 [03:21<32:43,  1.06s/it]

{'loss': 0.0001, 'grad_norm': 0.00186453468631953, 'learning_rate': 4.5363591996095665e-05, 'epoch': 0.09}


 10%|▉         | 195/2049 [03:27<33:10,  1.07s/it]

{'loss': 0.0001, 'grad_norm': 0.001289786770939827, 'learning_rate': 4.5241581259150806e-05, 'epoch': 0.1}


 10%|▉         | 200/2049 [03:32<32:08,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.002622041152790189, 'learning_rate': 4.5119570522205955e-05, 'epoch': 0.1}


 10%|█         | 205/2049 [03:37<32:23,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.004407462198287249, 'learning_rate': 4.49975597852611e-05, 'epoch': 0.1}


 10%|█         | 210/2049 [03:42<31:50,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.0015084014739841223, 'learning_rate': 4.487554904831626e-05, 'epoch': 0.1}


 10%|█         | 215/2049 [03:48<32:36,  1.07s/it]

{'loss': 1.9468, 'grad_norm': 29.228561401367188, 'learning_rate': 4.47535383113714e-05, 'epoch': 0.1}


 11%|█         | 220/2049 [03:53<32:47,  1.08s/it]

{'loss': 1.3122, 'grad_norm': 83.31674194335938, 'learning_rate': 4.463152757442655e-05, 'epoch': 0.11}


 11%|█         | 225/2049 [03:58<32:11,  1.06s/it]

{'loss': 0.118, 'grad_norm': 0.04346587136387825, 'learning_rate': 4.4509516837481704e-05, 'epoch': 0.11}


 11%|█         | 230/2049 [04:04<32:06,  1.06s/it]

{'loss': 0.0007, 'grad_norm': 0.015533067286014557, 'learning_rate': 4.438750610053685e-05, 'epoch': 0.11}


 11%|█▏        | 235/2049 [04:09<32:58,  1.09s/it]

{'loss': 0.0006, 'grad_norm': 0.014567957259714603, 'learning_rate': 4.4265495363591994e-05, 'epoch': 0.11}


 12%|█▏        | 240/2049 [04:14<32:32,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.015811536461114883, 'learning_rate': 4.414348462664715e-05, 'epoch': 0.12}


 12%|█▏        | 245/2049 [04:20<32:29,  1.08s/it]

{'loss': 0.0004, 'grad_norm': 0.010173707269132137, 'learning_rate': 4.40214738897023e-05, 'epoch': 0.12}


 12%|█▏        | 250/2049 [04:25<31:36,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.007675522938370705, 'learning_rate': 4.3899463152757446e-05, 'epoch': 0.12}


 12%|█▏        | 255/2049 [04:30<32:26,  1.08s/it]

{'loss': 0.0003, 'grad_norm': 0.0055798133835196495, 'learning_rate': 4.3777452415812595e-05, 'epoch': 0.12}


 13%|█▎        | 260/2049 [04:36<31:39,  1.06s/it]

{'loss': 0.0002, 'grad_norm': 0.007163759786635637, 'learning_rate': 4.365544167886774e-05, 'epoch': 0.13}


 13%|█▎        | 265/2049 [04:41<31:56,  1.07s/it]

{'loss': 0.0002, 'grad_norm': 0.005039062816649675, 'learning_rate': 4.353343094192289e-05, 'epoch': 0.13}


 13%|█▎        | 270/2049 [04:47<32:00,  1.08s/it]

{'loss': 0.0002, 'grad_norm': 0.004794491454958916, 'learning_rate': 4.341142020497804e-05, 'epoch': 0.13}


 13%|█▎        | 275/2049 [04:52<31:07,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.004587618168443441, 'learning_rate': 4.328940946803319e-05, 'epoch': 0.13}


 14%|█▎        | 280/2049 [04:57<32:21,  1.10s/it]

{'loss': 0.0002, 'grad_norm': 0.006077893078327179, 'learning_rate': 4.316739873108834e-05, 'epoch': 0.14}


 14%|█▍        | 285/2049 [05:02<30:46,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.005371145438402891, 'learning_rate': 4.3045387994143486e-05, 'epoch': 0.14}


 14%|█▍        | 290/2049 [05:07<29:47,  1.02s/it]

{'loss': 0.0001, 'grad_norm': 0.004225699231028557, 'learning_rate': 4.292337725719864e-05, 'epoch': 0.14}


 14%|█▍        | 295/2049 [05:13<30:55,  1.06s/it]

{'loss': 0.0001, 'grad_norm': 0.003077967558056116, 'learning_rate': 4.280136652025378e-05, 'epoch': 0.14}


 15%|█▍        | 300/2049 [05:18<30:38,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.0034538120962679386, 'learning_rate': 4.267935578330893e-05, 'epoch': 0.15}


 15%|█▍        | 305/2049 [05:23<30:34,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.0038469545543193817, 'learning_rate': 4.2557345046364086e-05, 'epoch': 0.15}


 15%|█▌        | 310/2049 [05:29<30:50,  1.06s/it]

{'loss': 0.0001, 'grad_norm': 0.0032238690182566643, 'learning_rate': 4.2435334309419235e-05, 'epoch': 0.15}


 15%|█▌        | 315/2049 [05:34<29:45,  1.03s/it]

{'loss': 0.0001, 'grad_norm': 0.002834742423146963, 'learning_rate': 4.2313323572474376e-05, 'epoch': 0.15}


 16%|█▌        | 320/2049 [05:39<29:56,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.002293366240337491, 'learning_rate': 4.219131283552953e-05, 'epoch': 0.16}


 16%|█▌        | 325/2049 [05:44<29:43,  1.03s/it]

{'loss': 0.0001, 'grad_norm': 0.0024502482265233994, 'learning_rate': 4.206930209858468e-05, 'epoch': 0.16}


 16%|█▌        | 330/2049 [05:49<30:00,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.002228273544460535, 'learning_rate': 4.194729136163983e-05, 'epoch': 0.16}


 16%|█▋        | 335/2049 [05:55<29:44,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.0033460368867963552, 'learning_rate': 4.182528062469497e-05, 'epoch': 0.16}


 17%|█▋        | 340/2049 [06:00<30:02,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.0031557756010442972, 'learning_rate': 4.1703269887750125e-05, 'epoch': 0.17}


 17%|█▋        | 345/2049 [06:05<29:27,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.002819201210513711, 'learning_rate': 4.1581259150805274e-05, 'epoch': 0.17}


 17%|█▋        | 350/2049 [06:10<28:25,  1.00s/it]

{'loss': 0.0001, 'grad_norm': 0.002541211899369955, 'learning_rate': 4.145924841386042e-05, 'epoch': 0.17}


 17%|█▋        | 355/2049 [06:15<27:40,  1.02it/s]

{'loss': 0.0001, 'grad_norm': 0.001836103736422956, 'learning_rate': 4.133723767691557e-05, 'epoch': 0.17}


 18%|█▊        | 360/2049 [06:20<27:36,  1.02it/s]

{'loss': 0.0001, 'grad_norm': 0.002173848683014512, 'learning_rate': 4.121522693997072e-05, 'epoch': 0.18}


 18%|█▊        | 365/2049 [06:25<27:25,  1.02it/s]

{'loss': 0.0001, 'grad_norm': 0.002099215518683195, 'learning_rate': 4.109321620302587e-05, 'epoch': 0.18}


 18%|█▊        | 370/2049 [06:30<29:55,  1.07s/it]

{'loss': 0.0001, 'grad_norm': 0.0024634639266878366, 'learning_rate': 4.0971205466081016e-05, 'epoch': 0.18}


 18%|█▊        | 375/2049 [06:35<30:03,  1.08s/it]

{'loss': 0.0001, 'grad_norm': 0.0018330513266846538, 'learning_rate': 4.0849194729136165e-05, 'epoch': 0.18}


 19%|█▊        | 380/2049 [06:41<29:49,  1.07s/it]

{'loss': 0.0001, 'grad_norm': 0.002288702642545104, 'learning_rate': 4.072718399219131e-05, 'epoch': 0.19}


 19%|█▉        | 385/2049 [06:46<29:00,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.001807446707971394, 'learning_rate': 4.060517325524646e-05, 'epoch': 0.19}


 19%|█▉        | 390/2049 [06:51<28:02,  1.01s/it]

{'loss': 1.9061, 'grad_norm': 0.0030335511546581984, 'learning_rate': 4.048316251830162e-05, 'epoch': 0.19}


 19%|█▉        | 395/2049 [06:56<28:27,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.010769911110401154, 'learning_rate': 4.036115178135676e-05, 'epoch': 0.19}


 20%|█▉        | 400/2049 [07:01<28:25,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.017346667125821114, 'learning_rate': 4.023914104441191e-05, 'epoch': 0.2}


 20%|█▉        | 405/2049 [07:07<28:26,  1.04s/it]

{'loss': 0.0006, 'grad_norm': 0.021265028044581413, 'learning_rate': 4.011713030746706e-05, 'epoch': 0.2}


 20%|██        | 410/2049 [07:12<28:43,  1.05s/it]

{'loss': 0.0006, 'grad_norm': 0.013524464331567287, 'learning_rate': 3.999511957052221e-05, 'epoch': 0.2}


 20%|██        | 415/2049 [07:17<28:50,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.011769928969442844, 'learning_rate': 3.987310883357735e-05, 'epoch': 0.2}


 20%|██        | 420/2049 [07:22<28:08,  1.04s/it]

{'loss': 0.0004, 'grad_norm': 0.009136782959103584, 'learning_rate': 3.975109809663251e-05, 'epoch': 0.2}


 21%|██        | 425/2049 [07:27<28:03,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.00673885690048337, 'learning_rate': 3.9629087359687656e-05, 'epoch': 0.21}


 21%|██        | 430/2049 [07:33<28:07,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.0070765139535069466, 'learning_rate': 3.9507076622742805e-05, 'epoch': 0.21}


 21%|██        | 435/2049 [07:38<27:35,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.007987765595316887, 'learning_rate': 3.938506588579795e-05, 'epoch': 0.21}


 21%|██▏       | 440/2049 [07:43<27:58,  1.04s/it]

{'loss': 0.0002, 'grad_norm': 0.005659799557179213, 'learning_rate': 3.92630551488531e-05, 'epoch': 0.21}


 22%|██▏       | 445/2049 [07:48<27:25,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.0050647989846765995, 'learning_rate': 3.914104441190825e-05, 'epoch': 0.22}


 22%|██▏       | 450/2049 [07:53<27:22,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.004091714508831501, 'learning_rate': 3.90190336749634e-05, 'epoch': 0.22}


 22%|██▏       | 455/2049 [07:59<27:19,  1.03s/it]

{'loss': 0.0001, 'grad_norm': 0.003868639236316085, 'learning_rate': 3.889702293801855e-05, 'epoch': 0.22}


 22%|██▏       | 460/2049 [08:04<27:39,  1.04s/it]

{'loss': 1.7614, 'grad_norm': 0.012227066792547703, 'learning_rate': 3.8775012201073696e-05, 'epoch': 0.22}


 23%|██▎       | 465/2049 [08:09<27:36,  1.05s/it]

{'loss': 0.0009, 'grad_norm': 0.03676991909742355, 'learning_rate': 3.8653001464128844e-05, 'epoch': 0.23}


 23%|██▎       | 470/2049 [08:14<27:22,  1.04s/it]

{'loss': 0.0008, 'grad_norm': 0.024736473336815834, 'learning_rate': 3.8530990727184e-05, 'epoch': 0.23}


 23%|██▎       | 475/2049 [08:19<27:34,  1.05s/it]

{'loss': 0.0006, 'grad_norm': 0.018209602683782578, 'learning_rate': 3.840897999023914e-05, 'epoch': 0.23}


 23%|██▎       | 480/2049 [08:25<27:10,  1.04s/it]

{'loss': 0.0006, 'grad_norm': 0.012982186861336231, 'learning_rate': 3.828696925329429e-05, 'epoch': 0.23}


 24%|██▎       | 485/2049 [08:30<26:41,  1.02s/it]

{'loss': 0.0005, 'grad_norm': 0.014620003290474415, 'learning_rate': 3.8164958516349445e-05, 'epoch': 0.24}


 24%|██▍       | 490/2049 [08:35<26:50,  1.03s/it]

{'loss': 0.0005, 'grad_norm': 0.011074901558458805, 'learning_rate': 3.804294777940459e-05, 'epoch': 0.24}


 24%|██▍       | 495/2049 [08:40<26:46,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.01040351577103138, 'learning_rate': 3.7920937042459735e-05, 'epoch': 0.24}


 24%|██▍       | 500/2049 [08:45<27:05,  1.05s/it]

{'loss': 1.5549, 'grad_norm': 0.009741144254803658, 'learning_rate': 3.779892630551489e-05, 'epoch': 0.24}


 25%|██▍       | 505/2049 [08:51<27:21,  1.06s/it]

{'loss': 0.0009, 'grad_norm': 0.04990682378411293, 'learning_rate': 3.767691556857004e-05, 'epoch': 0.25}


 25%|██▍       | 510/2049 [08:56<27:24,  1.07s/it]

{'loss': 0.0017, 'grad_norm': 0.03610797971487045, 'learning_rate': 3.755490483162519e-05, 'epoch': 0.25}


 25%|██▌       | 515/2049 [09:01<26:22,  1.03s/it]

{'loss': 0.001, 'grad_norm': 0.023461485281586647, 'learning_rate': 3.743289409468033e-05, 'epoch': 0.25}


 25%|██▌       | 520/2049 [09:06<26:32,  1.04s/it]

{'loss': 0.0008, 'grad_norm': 0.01576247066259384, 'learning_rate': 3.7310883357735484e-05, 'epoch': 0.25}


 26%|██▌       | 525/2049 [09:12<26:29,  1.04s/it]

{'loss': 0.0005, 'grad_norm': 0.018101464956998825, 'learning_rate': 3.718887262079063e-05, 'epoch': 0.26}


 26%|██▌       | 530/2049 [09:17<26:22,  1.04s/it]

{'loss': 0.0005, 'grad_norm': 0.01446295715868473, 'learning_rate': 3.706686188384578e-05, 'epoch': 0.26}


 26%|██▌       | 535/2049 [09:22<25:59,  1.03s/it]

{'loss': 0.0005, 'grad_norm': 0.019171828404068947, 'learning_rate': 3.694485114690093e-05, 'epoch': 0.26}


 26%|██▋       | 540/2049 [09:27<26:22,  1.05s/it]

{'loss': 0.0004, 'grad_norm': 0.012591023929417133, 'learning_rate': 3.682284040995608e-05, 'epoch': 0.26}


 27%|██▋       | 545/2049 [09:32<26:03,  1.04s/it]

{'loss': 1.6051, 'grad_norm': 0.016715124249458313, 'learning_rate': 3.6700829673011226e-05, 'epoch': 0.27}


 27%|██▋       | 550/2049 [09:37<25:51,  1.04s/it]

{'loss': 0.0007, 'grad_norm': 0.03474292531609535, 'learning_rate': 3.6578818936066375e-05, 'epoch': 0.27}


 27%|██▋       | 555/2049 [09:43<26:13,  1.05s/it]

{'loss': 0.0008, 'grad_norm': 0.02938363328576088, 'learning_rate': 3.645680819912152e-05, 'epoch': 0.27}


 27%|██▋       | 560/2049 [09:48<25:30,  1.03s/it]

{'loss': 0.0009, 'grad_norm': 0.0272664912045002, 'learning_rate': 3.633479746217667e-05, 'epoch': 0.27}


 28%|██▊       | 565/2049 [09:53<26:52,  1.09s/it]

{'loss': 0.0006, 'grad_norm': 0.019266003742814064, 'learning_rate': 3.621278672523182e-05, 'epoch': 0.28}


 28%|██▊       | 570/2049 [09:58<25:45,  1.05s/it]

{'loss': 0.0007, 'grad_norm': 0.016124164685606956, 'learning_rate': 3.6090775988286975e-05, 'epoch': 0.28}


 28%|██▊       | 575/2049 [10:04<25:42,  1.05s/it]

{'loss': 1.4577, 'grad_norm': 0.02974941022694111, 'learning_rate': 3.596876525134212e-05, 'epoch': 0.28}


 28%|██▊       | 580/2049 [10:09<25:30,  1.04s/it]

{'loss': 0.001, 'grad_norm': 0.03564245626330376, 'learning_rate': 3.5846754514397266e-05, 'epoch': 0.28}


 29%|██▊       | 585/2049 [10:14<25:05,  1.03s/it]

{'loss': 0.0013, 'grad_norm': 0.03729170560836792, 'learning_rate': 3.572474377745242e-05, 'epoch': 0.29}


 29%|██▉       | 590/2049 [10:19<25:01,  1.03s/it]

{'loss': 0.0014, 'grad_norm': 0.04532971605658531, 'learning_rate': 3.560273304050757e-05, 'epoch': 0.29}


 29%|██▉       | 595/2049 [10:24<24:59,  1.03s/it]

{'loss': 0.001, 'grad_norm': 0.02925862930715084, 'learning_rate': 3.548072230356271e-05, 'epoch': 0.29}


 29%|██▉       | 600/2049 [10:29<24:35,  1.02s/it]

{'loss': 0.0008, 'grad_norm': 0.02390986680984497, 'learning_rate': 3.5358711566617866e-05, 'epoch': 0.29}


 30%|██▉       | 605/2049 [10:35<24:59,  1.04s/it]

{'loss': 0.0007, 'grad_norm': 0.02119731344282627, 'learning_rate': 3.5236700829673015e-05, 'epoch': 0.3}


 30%|██▉       | 610/2049 [10:40<25:28,  1.06s/it]

{'loss': 0.0007, 'grad_norm': 0.018175723031163216, 'learning_rate': 3.511469009272816e-05, 'epoch': 0.3}


 30%|███       | 615/2049 [10:45<24:53,  1.04s/it]

{'loss': 0.0005, 'grad_norm': 0.012804943136870861, 'learning_rate': 3.499267935578331e-05, 'epoch': 0.3}


 30%|███       | 620/2049 [10:50<24:52,  1.04s/it]

{'loss': 0.0004, 'grad_norm': 0.014999059960246086, 'learning_rate': 3.487066861883846e-05, 'epoch': 0.3}


 31%|███       | 625/2049 [10:55<24:16,  1.02s/it]

{'loss': 0.0004, 'grad_norm': 0.010607845149934292, 'learning_rate': 3.474865788189361e-05, 'epoch': 0.31}


 31%|███       | 630/2049 [11:01<24:19,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.008481869474053383, 'learning_rate': 3.462664714494876e-05, 'epoch': 0.31}


 31%|███       | 635/2049 [11:06<24:31,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.009784910827875137, 'learning_rate': 3.4504636408003906e-05, 'epoch': 0.31}


 31%|███       | 640/2049 [11:11<24:16,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.008271763101220131, 'learning_rate': 3.4382625671059054e-05, 'epoch': 0.31}


 31%|███▏      | 645/2049 [11:16<24:14,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.006460020784288645, 'learning_rate': 3.42606149341142e-05, 'epoch': 0.31}


 32%|███▏      | 650/2049 [11:21<24:06,  1.03s/it]

{'loss': 1.6931, 'grad_norm': 0.00785636343061924, 'learning_rate': 3.413860419716936e-05, 'epoch': 0.32}


 32%|███▏      | 655/2049 [11:26<23:35,  1.02s/it]

{'loss': 0.0005, 'grad_norm': 0.015899106860160828, 'learning_rate': 3.40165934602245e-05, 'epoch': 0.32}


 32%|███▏      | 660/2049 [11:32<24:06,  1.04s/it]

{'loss': 0.0007, 'grad_norm': 0.021323416382074356, 'learning_rate': 3.389458272327965e-05, 'epoch': 0.32}


 32%|███▏      | 665/2049 [11:37<23:46,  1.03s/it]

{'loss': 0.0007, 'grad_norm': 0.021829525008797646, 'learning_rate': 3.37725719863348e-05, 'epoch': 0.32}


 33%|███▎      | 670/2049 [11:42<23:33,  1.03s/it]

{'loss': 0.0007, 'grad_norm': 0.019978342577815056, 'learning_rate': 3.365056124938995e-05, 'epoch': 0.33}


 33%|███▎      | 675/2049 [11:47<23:34,  1.03s/it]

{'loss': 0.0006, 'grad_norm': 0.015538695268332958, 'learning_rate': 3.352855051244509e-05, 'epoch': 0.33}


 33%|███▎      | 680/2049 [11:52<23:30,  1.03s/it]

{'loss': 0.0005, 'grad_norm': 0.012988925911486149, 'learning_rate': 3.340653977550024e-05, 'epoch': 0.33}


 33%|███▎      | 685/2049 [11:57<23:20,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.011943318881094456, 'learning_rate': 3.32845290385554e-05, 'epoch': 0.33}


 34%|███▎      | 690/2049 [12:03<23:29,  1.04s/it]

{'loss': 0.0004, 'grad_norm': 0.011236246675252914, 'learning_rate': 3.3162518301610545e-05, 'epoch': 0.34}


 34%|███▍      | 695/2049 [12:08<22:53,  1.01s/it]

{'loss': 0.0003, 'grad_norm': 0.010178432799875736, 'learning_rate': 3.304050756466569e-05, 'epoch': 0.34}


 34%|███▍      | 700/2049 [12:13<23:06,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.00855993665754795, 'learning_rate': 3.291849682772084e-05, 'epoch': 0.34}


 34%|███▍      | 705/2049 [12:18<22:42,  1.01s/it]

{'loss': 0.0003, 'grad_norm': 0.008163219317793846, 'learning_rate': 3.279648609077599e-05, 'epoch': 0.34}


 35%|███▍      | 710/2049 [12:23<23:15,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.007705547381192446, 'learning_rate': 3.267447535383114e-05, 'epoch': 0.35}


 35%|███▍      | 715/2049 [12:28<22:35,  1.02s/it]

{'loss': 0.0003, 'grad_norm': 0.009319129399955273, 'learning_rate': 3.255246461688629e-05, 'epoch': 0.35}


 35%|███▌      | 720/2049 [12:33<22:42,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.007234126329421997, 'learning_rate': 3.2430453879941436e-05, 'epoch': 0.35}


 35%|███▌      | 725/2049 [12:38<23:12,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.005543849430978298, 'learning_rate': 3.2308443142996585e-05, 'epoch': 0.35}


 36%|███▌      | 730/2049 [12:44<22:42,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.0073270536959171295, 'learning_rate': 3.218643240605173e-05, 'epoch': 0.36}


 36%|███▌      | 735/2049 [12:49<22:21,  1.02s/it]

{'loss': 0.0002, 'grad_norm': 0.006634835619479418, 'learning_rate': 3.206442166910688e-05, 'epoch': 0.36}


 36%|███▌      | 740/2049 [12:54<22:53,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.007526329252868891, 'learning_rate': 3.194241093216203e-05, 'epoch': 0.36}


 36%|███▋      | 745/2049 [12:59<22:18,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.005971785634756088, 'learning_rate': 3.182040019521718e-05, 'epoch': 0.36}


 37%|███▋      | 750/2049 [13:04<22:16,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.005042524542659521, 'learning_rate': 3.1698389458272334e-05, 'epoch': 0.37}


 37%|███▋      | 755/2049 [13:09<22:20,  1.04s/it]

{'loss': 1.6292, 'grad_norm': 0.007074897177517414, 'learning_rate': 3.1576378721327476e-05, 'epoch': 0.37}


 37%|███▋      | 760/2049 [13:15<22:34,  1.05s/it]

{'loss': 0.0004, 'grad_norm': 0.01710221730172634, 'learning_rate': 3.1454367984382624e-05, 'epoch': 0.37}


 37%|███▋      | 765/2049 [13:20<21:46,  1.02s/it]

{'loss': 0.0006, 'grad_norm': 0.025049367919564247, 'learning_rate': 3.133235724743778e-05, 'epoch': 0.37}


 38%|███▊      | 770/2049 [13:25<21:47,  1.02s/it]

{'loss': 0.0007, 'grad_norm': 0.018697170540690422, 'learning_rate': 3.121034651049293e-05, 'epoch': 0.38}


 38%|███▊      | 775/2049 [13:30<21:42,  1.02s/it]

{'loss': 0.0005, 'grad_norm': 0.016972731798887253, 'learning_rate': 3.108833577354807e-05, 'epoch': 0.38}


 38%|███▊      | 780/2049 [13:35<21:46,  1.03s/it]

{'loss': 0.0005, 'grad_norm': 0.013474651612341404, 'learning_rate': 3.0966325036603225e-05, 'epoch': 0.38}


 38%|███▊      | 785/2049 [13:40<21:32,  1.02s/it]

{'loss': 0.0004, 'grad_norm': 0.011853284202516079, 'learning_rate': 3.084431429965837e-05, 'epoch': 0.38}


 39%|███▊      | 790/2049 [13:45<21:32,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.013481429778039455, 'learning_rate': 3.072230356271352e-05, 'epoch': 0.39}


 39%|███▉      | 795/2049 [13:51<21:40,  1.04s/it]

{'loss': 0.0004, 'grad_norm': 0.00770797161385417, 'learning_rate': 3.060029282576867e-05, 'epoch': 0.39}


 39%|███▉      | 800/2049 [13:56<21:24,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.009809480980038643, 'learning_rate': 3.047828208882382e-05, 'epoch': 0.39}


 39%|███▉      | 805/2049 [14:01<21:34,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.007903505116701126, 'learning_rate': 3.0356271351878967e-05, 'epoch': 0.39}


 40%|███▉      | 810/2049 [14:06<21:37,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.007087347563356161, 'learning_rate': 3.023426061493412e-05, 'epoch': 0.4}


 40%|███▉      | 815/2049 [14:11<21:47,  1.06s/it]

{'loss': 0.0002, 'grad_norm': 0.007475017569959164, 'learning_rate': 3.0112249877989264e-05, 'epoch': 0.4}


 40%|████      | 820/2049 [14:17<21:06,  1.03s/it]

{'loss': 0.0003, 'grad_norm': 0.006770094856619835, 'learning_rate': 2.9990239141044413e-05, 'epoch': 0.4}


 40%|████      | 825/2049 [14:22<21:17,  1.04s/it]

{'loss': 1.6313, 'grad_norm': 0.009479639120399952, 'learning_rate': 2.9868228404099564e-05, 'epoch': 0.4}


 41%|████      | 830/2049 [14:27<20:58,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.013034951873123646, 'learning_rate': 2.9746217667154713e-05, 'epoch': 0.41}


 41%|████      | 835/2049 [14:32<21:06,  1.04s/it]

{'loss': 0.0005, 'grad_norm': 0.022541020065546036, 'learning_rate': 2.9624206930209858e-05, 'epoch': 0.41}


 41%|████      | 840/2049 [14:37<20:03,  1.00it/s]

{'loss': 0.0006, 'grad_norm': 0.016332514584064484, 'learning_rate': 2.950219619326501e-05, 'epoch': 0.41}


 41%|████      | 845/2049 [14:42<19:51,  1.01it/s]

{'loss': 0.0006, 'grad_norm': 0.014957715757191181, 'learning_rate': 2.9380185456320158e-05, 'epoch': 0.41}


 41%|████▏     | 850/2049 [14:47<19:29,  1.02it/s]

{'loss': 1.5024, 'grad_norm': 0.024146810173988342, 'learning_rate': 2.925817471937531e-05, 'epoch': 0.41}


 42%|████▏     | 855/2049 [14:52<19:27,  1.02it/s]

{'loss': 0.0009, 'grad_norm': 0.03831836208701134, 'learning_rate': 2.9136163982430452e-05, 'epoch': 0.42}


 42%|████▏     | 860/2049 [14:58<25:22,  1.28s/it]

{'loss': 0.0012, 'grad_norm': 0.03282895311713219, 'learning_rate': 2.9014153245485604e-05, 'epoch': 0.42}


 42%|████▏     | 865/2049 [15:04<22:11,  1.12s/it]

{'loss': 0.001, 'grad_norm': 0.03077303245663643, 'learning_rate': 2.8892142508540752e-05, 'epoch': 0.42}


 42%|████▏     | 870/2049 [15:09<20:51,  1.06s/it]

{'loss': 0.0009, 'grad_norm': 0.03192334622144699, 'learning_rate': 2.8770131771595904e-05, 'epoch': 0.42}


 43%|████▎     | 875/2049 [15:15<20:47,  1.06s/it]

{'loss': 0.0008, 'grad_norm': 0.018505265936255455, 'learning_rate': 2.864812103465105e-05, 'epoch': 0.43}


 43%|████▎     | 880/2049 [15:20<20:48,  1.07s/it]

{'loss': 0.0008, 'grad_norm': 0.023959001526236534, 'learning_rate': 2.8526110297706198e-05, 'epoch': 0.43}


 43%|████▎     | 885/2049 [15:26<21:52,  1.13s/it]

{'loss': 0.0006, 'grad_norm': 0.015220584347844124, 'learning_rate': 2.840409956076135e-05, 'epoch': 0.43}


 43%|████▎     | 890/2049 [15:31<20:33,  1.06s/it]

{'loss': 0.0006, 'grad_norm': 0.0141470180824399, 'learning_rate': 2.8282088823816498e-05, 'epoch': 0.43}


 44%|████▎     | 895/2049 [15:37<21:20,  1.11s/it]

{'loss': 0.0005, 'grad_norm': 0.017469245940446854, 'learning_rate': 2.8160078086871643e-05, 'epoch': 0.44}


 44%|████▍     | 900/2049 [15:42<20:46,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.011517420411109924, 'learning_rate': 2.8038067349926795e-05, 'epoch': 0.44}


 44%|████▍     | 905/2049 [15:47<19:34,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.008978749625384808, 'learning_rate': 2.7916056612981943e-05, 'epoch': 0.44}


 44%|████▍     | 910/2049 [15:53<20:48,  1.10s/it]

{'loss': 0.0004, 'grad_norm': 0.008804168552160263, 'learning_rate': 2.7794045876037095e-05, 'epoch': 0.44}


 45%|████▍     | 915/2049 [15:58<19:53,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.009010872803628445, 'learning_rate': 2.767203513909224e-05, 'epoch': 0.45}


 45%|████▍     | 920/2049 [16:03<19:35,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.009298644959926605, 'learning_rate': 2.755002440214739e-05, 'epoch': 0.45}


 45%|████▌     | 925/2049 [16:08<19:57,  1.07s/it]

{'loss': 0.0003, 'grad_norm': 0.009667815640568733, 'learning_rate': 2.742801366520254e-05, 'epoch': 0.45}


 45%|████▌     | 930/2049 [16:14<19:44,  1.06s/it]

{'loss': 0.0003, 'grad_norm': 0.008970587514340878, 'learning_rate': 2.730600292825769e-05, 'epoch': 0.45}


 46%|████▌     | 935/2049 [16:19<19:32,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.007698168512433767, 'learning_rate': 2.7183992191312834e-05, 'epoch': 0.46}


 46%|████▌     | 940/2049 [16:24<19:17,  1.04s/it]

{'loss': 0.0003, 'grad_norm': 0.009370662271976471, 'learning_rate': 2.7061981454367986e-05, 'epoch': 0.46}


 46%|████▌     | 945/2049 [16:29<19:01,  1.03s/it]

{'loss': 0.0002, 'grad_norm': 0.005673396401107311, 'learning_rate': 2.6939970717423134e-05, 'epoch': 0.46}


 46%|████▋     | 950/2049 [16:35<19:16,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.006991889793425798, 'learning_rate': 2.6817959980478286e-05, 'epoch': 0.46}


 47%|████▋     | 955/2049 [16:40<19:12,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.006513091269880533, 'learning_rate': 2.669594924353343e-05, 'epoch': 0.47}


 47%|████▋     | 960/2049 [16:45<19:25,  1.07s/it]

{'loss': 0.0002, 'grad_norm': 0.005166684277355671, 'learning_rate': 2.657393850658858e-05, 'epoch': 0.47}


 47%|████▋     | 965/2049 [16:51<19:05,  1.06s/it]

{'loss': 0.0002, 'grad_norm': 0.0061825974844396114, 'learning_rate': 2.645192776964373e-05, 'epoch': 0.47}


 47%|████▋     | 970/2049 [16:56<18:37,  1.04s/it]

{'loss': 0.0002, 'grad_norm': 0.004308903124183416, 'learning_rate': 2.632991703269888e-05, 'epoch': 0.47}


 48%|████▊     | 975/2049 [17:01<18:50,  1.05s/it]

{'loss': 0.0002, 'grad_norm': 0.005825825035572052, 'learning_rate': 2.6207906295754025e-05, 'epoch': 0.48}


 48%|████▊     | 980/2049 [17:06<19:05,  1.07s/it]

{'loss': 0.0002, 'grad_norm': 0.005779357627034187, 'learning_rate': 2.6085895558809177e-05, 'epoch': 0.48}


 48%|████▊     | 985/2049 [17:12<20:13,  1.14s/it]

{'loss': 0.0002, 'grad_norm': 0.004698681179434061, 'learning_rate': 2.5963884821864326e-05, 'epoch': 0.48}


 48%|████▊     | 990/2049 [17:18<19:35,  1.11s/it]

{'loss': 0.0002, 'grad_norm': 0.005392467603087425, 'learning_rate': 2.5841874084919477e-05, 'epoch': 0.48}


 49%|████▊     | 995/2049 [17:23<19:21,  1.10s/it]

{'loss': 0.0002, 'grad_norm': 0.006074706558138132, 'learning_rate': 2.5719863347974623e-05, 'epoch': 0.49}


 49%|████▉     | 1000/2049 [17:28<18:50,  1.08s/it]

{'loss': 0.0002, 'grad_norm': 0.006338128354400396, 'learning_rate': 2.559785261102977e-05, 'epoch': 0.49}


 49%|████▉     | 1005/2049 [17:34<18:09,  1.04s/it]

{'loss': 0.0002, 'grad_norm': 0.0048799291253089905, 'learning_rate': 2.5475841874084923e-05, 'epoch': 0.49}


 49%|████▉     | 1010/2049 [17:39<19:12,  1.11s/it]

{'loss': 0.0002, 'grad_norm': 0.0042411102913320065, 'learning_rate': 2.535383113714007e-05, 'epoch': 0.49}


 50%|████▉     | 1015/2049 [17:45<18:56,  1.10s/it]

{'loss': 0.0002, 'grad_norm': 0.005872428882867098, 'learning_rate': 2.5231820400195216e-05, 'epoch': 0.5}


 50%|████▉     | 1020/2049 [17:50<19:40,  1.15s/it]

{'loss': 0.0002, 'grad_norm': 0.003214580239728093, 'learning_rate': 2.5109809663250365e-05, 'epoch': 0.5}


 50%|█████     | 1025/2049 [17:56<18:41,  1.09s/it]

{'loss': 0.0001, 'grad_norm': 0.004225651267915964, 'learning_rate': 2.4987798926305517e-05, 'epoch': 0.5}


 50%|█████     | 1030/2049 [18:01<18:09,  1.07s/it]

{'loss': 0.0001, 'grad_norm': 0.004113923292607069, 'learning_rate': 2.4865788189360665e-05, 'epoch': 0.5}


 51%|█████     | 1035/2049 [18:07<18:35,  1.10s/it]

{'loss': 0.0001, 'grad_norm': 0.0039797392673790455, 'learning_rate': 2.4743777452415814e-05, 'epoch': 0.51}


 51%|█████     | 1040/2049 [18:12<18:18,  1.09s/it]

{'loss': 0.0002, 'grad_norm': 0.0043137273751199245, 'learning_rate': 2.4621766715470962e-05, 'epoch': 0.51}


 51%|█████     | 1045/2049 [18:18<18:25,  1.10s/it]

{'loss': 0.0001, 'grad_norm': 0.005468417424708605, 'learning_rate': 2.449975597852611e-05, 'epoch': 0.51}


 51%|█████     | 1050/2049 [18:23<18:24,  1.11s/it]

{'loss': 0.0001, 'grad_norm': 0.004080756101757288, 'learning_rate': 2.437774524158126e-05, 'epoch': 0.51}


 51%|█████▏    | 1055/2049 [18:28<17:35,  1.06s/it]

{'loss': 0.0001, 'grad_norm': 0.0054817539639770985, 'learning_rate': 2.425573450463641e-05, 'epoch': 0.51}


 52%|█████▏    | 1060/2049 [18:34<18:32,  1.12s/it]

{'loss': 0.0001, 'grad_norm': 0.003341288072988391, 'learning_rate': 2.4133723767691556e-05, 'epoch': 0.52}


 52%|█████▏    | 1065/2049 [18:39<18:01,  1.10s/it]

{'loss': 0.0001, 'grad_norm': 0.003945095930248499, 'learning_rate': 2.4011713030746708e-05, 'epoch': 0.52}


 52%|█████▏    | 1070/2049 [18:45<17:10,  1.05s/it]

{'loss': 0.0001, 'grad_norm': 0.0037189198192209005, 'learning_rate': 2.3889702293801856e-05, 'epoch': 0.52}


 52%|█████▏    | 1075/2049 [18:50<16:51,  1.04s/it]

{'loss': 0.0001, 'grad_norm': 0.004451721906661987, 'learning_rate': 2.3767691556857005e-05, 'epoch': 0.52}


 53%|█████▎    | 1080/2049 [18:55<17:36,  1.09s/it]

{'loss': 0.0001, 'grad_norm': 0.005930550862103701, 'learning_rate': 2.3645680819912153e-05, 'epoch': 0.53}


 53%|█████▎    | 1085/2049 [19:00<15:14,  1.05it/s]

{'loss': 1.818, 'grad_norm': 0.00403014337643981, 'learning_rate': 2.3523670082967302e-05, 'epoch': 0.53}


 53%|█████▎    | 1090/2049 [19:05<14:16,  1.12it/s]

{'loss': 0.0002, 'grad_norm': 0.006098912563174963, 'learning_rate': 2.340165934602245e-05, 'epoch': 0.53}


 53%|█████▎    | 1095/2049 [19:09<13:25,  1.18it/s]

{'loss': 0.0002, 'grad_norm': 0.006951890420168638, 'learning_rate': 2.3279648609077602e-05, 'epoch': 0.53}


 54%|█████▎    | 1100/2049 [19:13<13:21,  1.18it/s]

{'loss': 1.6515, 'grad_norm': 0.010286142118275166, 'learning_rate': 2.3157637872132747e-05, 'epoch': 0.54}


 54%|█████▍    | 1105/2049 [19:17<13:18,  1.18it/s]

{'loss': 1.4834, 'grad_norm': 0.015895908698439598, 'learning_rate': 2.30356271351879e-05, 'epoch': 0.54}


 54%|█████▍    | 1110/2049 [19:21<13:18,  1.18it/s]

{'loss': 0.0008, 'grad_norm': 0.03225291892886162, 'learning_rate': 2.2913616398243044e-05, 'epoch': 0.54}


 54%|█████▍    | 1115/2049 [19:26<13:10,  1.18it/s]

{'loss': 0.0012, 'grad_norm': 0.054562605917453766, 'learning_rate': 2.2791605661298196e-05, 'epoch': 0.54}


 55%|█████▍    | 1120/2049 [19:30<13:15,  1.17it/s]

{'loss': 0.0011, 'grad_norm': 0.03227059543132782, 'learning_rate': 2.2669594924353344e-05, 'epoch': 0.55}


 55%|█████▍    | 1125/2049 [19:34<12:58,  1.19it/s]

{'loss': 0.0013, 'grad_norm': 0.05840907245874405, 'learning_rate': 2.2547584187408493e-05, 'epoch': 0.55}


 55%|█████▌    | 1130/2049 [19:39<12:53,  1.19it/s]

{'loss': 0.0009, 'grad_norm': 0.026083696633577347, 'learning_rate': 2.242557345046364e-05, 'epoch': 0.55}


 55%|█████▌    | 1135/2049 [19:43<12:49,  1.19it/s]

{'loss': 1.4123, 'grad_norm': 0.024329151958227158, 'learning_rate': 2.230356271351879e-05, 'epoch': 0.55}


 56%|█████▌    | 1140/2049 [19:47<12:31,  1.21it/s]

{'loss': 0.0011, 'grad_norm': 0.030300412327051163, 'learning_rate': 2.218155197657394e-05, 'epoch': 0.56}


 56%|█████▌    | 1145/2049 [19:51<12:32,  1.20it/s]

{'loss': 0.0012, 'grad_norm': 0.03703514114022255, 'learning_rate': 2.205954123962909e-05, 'epoch': 0.56}


 56%|█████▌    | 1150/2049 [19:55<12:33,  1.19it/s]

{'loss': 0.0012, 'grad_norm': 0.03162536397576332, 'learning_rate': 2.1937530502684235e-05, 'epoch': 0.56}


 56%|█████▋    | 1155/2049 [20:00<12:32,  1.19it/s]

{'loss': 0.001, 'grad_norm': 0.028175530955195427, 'learning_rate': 2.1815519765739387e-05, 'epoch': 0.56}


 57%|█████▋    | 1160/2049 [20:04<12:24,  1.19it/s]

{'loss': 0.001, 'grad_norm': 0.03386906161904335, 'learning_rate': 2.1693509028794536e-05, 'epoch': 0.57}


 57%|█████▋    | 1165/2049 [20:08<12:14,  1.20it/s]

{'loss': 0.0008, 'grad_norm': 0.026933249086141586, 'learning_rate': 2.1571498291849684e-05, 'epoch': 0.57}


 57%|█████▋    | 1170/2049 [20:12<12:16,  1.19it/s]

{'loss': 0.0007, 'grad_norm': 0.020588546991348267, 'learning_rate': 2.1449487554904833e-05, 'epoch': 0.57}


 57%|█████▋    | 1175/2049 [20:16<12:08,  1.20it/s]

{'loss': 0.0007, 'grad_norm': 0.020250104367733, 'learning_rate': 2.132747681795998e-05, 'epoch': 0.57}


 58%|█████▊    | 1180/2049 [20:21<12:10,  1.19it/s]

{'loss': 0.0006, 'grad_norm': 0.02475075237452984, 'learning_rate': 2.120546608101513e-05, 'epoch': 0.58}


 58%|█████▊    | 1185/2049 [20:25<12:09,  1.18it/s]

{'loss': 0.0006, 'grad_norm': 0.01784803345799446, 'learning_rate': 2.108345534407028e-05, 'epoch': 0.58}


 58%|█████▊    | 1190/2049 [20:29<12:01,  1.19it/s]

{'loss': 1.4544, 'grad_norm': 0.024611052125692368, 'learning_rate': 2.0961444607125426e-05, 'epoch': 0.58}


 58%|█████▊    | 1195/2049 [20:33<11:56,  1.19it/s]

{'loss': 0.0007, 'grad_norm': 0.02236674167215824, 'learning_rate': 2.0839433870180578e-05, 'epoch': 0.58}


 59%|█████▊    | 1200/2049 [20:37<11:48,  1.20it/s]

{'loss': 0.0009, 'grad_norm': 0.032788779586553574, 'learning_rate': 2.0717423133235723e-05, 'epoch': 0.59}


 59%|█████▉    | 1205/2049 [20:42<11:54,  1.18it/s]

{'loss': 0.0008, 'grad_norm': 0.02910909429192543, 'learning_rate': 2.0595412396290875e-05, 'epoch': 0.59}


 59%|█████▉    | 1210/2049 [20:46<11:51,  1.18it/s]

{'loss': 0.0008, 'grad_norm': 0.0240279883146286, 'learning_rate': 2.0473401659346024e-05, 'epoch': 0.59}


 59%|█████▉    | 1215/2049 [20:50<11:32,  1.20it/s]

{'loss': 0.0007, 'grad_norm': 0.021282292902469635, 'learning_rate': 2.0351390922401172e-05, 'epoch': 0.59}


 60%|█████▉    | 1220/2049 [20:54<11:35,  1.19it/s]

{'loss': 0.0007, 'grad_norm': 0.01987724006175995, 'learning_rate': 2.022938018545632e-05, 'epoch': 0.6}


 60%|█████▉    | 1225/2049 [20:58<11:34,  1.19it/s]

{'loss': 1.4563, 'grad_norm': 0.015297355130314827, 'learning_rate': 2.010736944851147e-05, 'epoch': 0.6}


 60%|██████    | 1230/2049 [21:03<11:32,  1.18it/s]

{'loss': 0.0008, 'grad_norm': 0.030468840152025223, 'learning_rate': 1.9985358711566618e-05, 'epoch': 0.6}


 60%|██████    | 1235/2049 [21:07<11:35,  1.17it/s]

{'loss': 0.0007, 'grad_norm': 0.02326815575361252, 'learning_rate': 1.986334797462177e-05, 'epoch': 0.6}


 61%|██████    | 1240/2049 [21:11<11:22,  1.19it/s]

{'loss': 0.001, 'grad_norm': 0.029845470562577248, 'learning_rate': 1.9741337237676914e-05, 'epoch': 0.61}


 61%|██████    | 1245/2049 [21:16<12:43,  1.05it/s]

{'loss': 0.0008, 'grad_norm': 0.03360766917467117, 'learning_rate': 1.9619326500732066e-05, 'epoch': 0.61}


 61%|██████    | 1250/2049 [21:21<13:53,  1.04s/it]

{'loss': 1.389, 'grad_norm': 0.03525266423821449, 'learning_rate': 1.9497315763787215e-05, 'epoch': 0.61}


 61%|██████    | 1255/2049 [21:27<14:24,  1.09s/it]

{'loss': 0.001, 'grad_norm': 0.028288431465625763, 'learning_rate': 1.9375305026842363e-05, 'epoch': 0.61}


 61%|██████▏   | 1260/2049 [21:32<13:52,  1.05s/it]

{'loss': 0.0011, 'grad_norm': 0.032373037189245224, 'learning_rate': 1.9253294289897512e-05, 'epoch': 0.61}


 62%|██████▏   | 1265/2049 [21:37<14:00,  1.07s/it]

{'loss': 0.0011, 'grad_norm': 0.028891313821077347, 'learning_rate': 1.913128355295266e-05, 'epoch': 0.62}


 62%|██████▏   | 1270/2049 [21:43<13:52,  1.07s/it]

{'loss': 0.001, 'grad_norm': 0.026358285918831825, 'learning_rate': 1.900927281600781e-05, 'epoch': 0.62}


 62%|██████▏   | 1275/2049 [21:48<14:05,  1.09s/it]

{'loss': 1.4105, 'grad_norm': 0.029817072674632072, 'learning_rate': 1.888726207906296e-05, 'epoch': 0.62}


 62%|██████▏   | 1280/2049 [21:54<14:14,  1.11s/it]

{'loss': 0.0011, 'grad_norm': 0.023583753034472466, 'learning_rate': 1.8765251342118106e-05, 'epoch': 0.62}


 63%|██████▎   | 1285/2049 [21:59<13:46,  1.08s/it]

{'loss': 0.0011, 'grad_norm': 0.03196949139237404, 'learning_rate': 1.8643240605173258e-05, 'epoch': 0.63}


 63%|██████▎   | 1290/2049 [22:05<13:43,  1.09s/it]

{'loss': 0.0012, 'grad_norm': 0.034053005278110504, 'learning_rate': 1.8521229868228403e-05, 'epoch': 0.63}


 63%|██████▎   | 1295/2049 [22:10<13:56,  1.11s/it]

{'loss': 1.3763, 'grad_norm': 0.04099012166261673, 'learning_rate': 1.8399219131283554e-05, 'epoch': 0.63}


 63%|██████▎   | 1300/2049 [22:16<13:15,  1.06s/it]

{'loss': 0.0016, 'grad_norm': 0.05319812521338463, 'learning_rate': 1.8277208394338703e-05, 'epoch': 0.63}


 64%|██████▎   | 1305/2049 [22:21<13:37,  1.10s/it]

{'loss': 0.0016, 'grad_norm': 0.03047342039644718, 'learning_rate': 1.815519765739385e-05, 'epoch': 0.64}


 64%|██████▍   | 1310/2049 [22:27<13:34,  1.10s/it]

{'loss': 0.0014, 'grad_norm': 0.03009476140141487, 'learning_rate': 1.8033186920449e-05, 'epoch': 0.64}


 64%|██████▍   | 1315/2049 [22:32<13:12,  1.08s/it]

{'loss': 0.0013, 'grad_norm': 0.035692840814590454, 'learning_rate': 1.791117618350415e-05, 'epoch': 0.64}


 64%|██████▍   | 1320/2049 [22:37<13:11,  1.09s/it]

{'loss': 1.3207, 'grad_norm': 0.04451172798871994, 'learning_rate': 1.7789165446559297e-05, 'epoch': 0.64}


 65%|██████▍   | 1325/2049 [22:43<13:09,  1.09s/it]

{'loss': 0.0014, 'grad_norm': 0.04079539701342583, 'learning_rate': 1.766715470961445e-05, 'epoch': 0.65}


 65%|██████▍   | 1330/2049 [22:48<12:55,  1.08s/it]

{'loss': 0.0014, 'grad_norm': 0.04376484453678131, 'learning_rate': 1.7545143972669594e-05, 'epoch': 0.65}


 65%|██████▌   | 1335/2049 [22:54<13:03,  1.10s/it]

{'loss': 0.0012, 'grad_norm': 0.02741820737719536, 'learning_rate': 1.7423133235724746e-05, 'epoch': 0.65}


 65%|██████▌   | 1340/2049 [22:59<12:41,  1.07s/it]

{'loss': 0.0014, 'grad_norm': 0.04080605506896973, 'learning_rate': 1.7301122498779894e-05, 'epoch': 0.65}


 66%|██████▌   | 1345/2049 [23:04<12:47,  1.09s/it]

{'loss': 0.0012, 'grad_norm': 0.026939671486616135, 'learning_rate': 1.7179111761835043e-05, 'epoch': 0.66}


 66%|██████▌   | 1350/2049 [23:10<12:27,  1.07s/it]

{'loss': 0.001, 'grad_norm': 0.027850260958075523, 'learning_rate': 1.705710102489019e-05, 'epoch': 0.66}


 66%|██████▌   | 1355/2049 [23:15<12:27,  1.08s/it]

{'loss': 0.0009, 'grad_norm': 0.024538923054933548, 'learning_rate': 1.693509028794534e-05, 'epoch': 0.66}


 66%|██████▋   | 1360/2049 [23:21<12:13,  1.06s/it]

{'loss': 0.0008, 'grad_norm': 0.02525397390127182, 'learning_rate': 1.6813079551000488e-05, 'epoch': 0.66}


 67%|██████▋   | 1365/2049 [23:26<12:08,  1.06s/it]

{'loss': 1.4277, 'grad_norm': 30.239551544189453, 'learning_rate': 1.669106881405564e-05, 'epoch': 0.67}


 67%|██████▋   | 1370/2049 [23:31<11:32,  1.02s/it]

{'loss': 1.356, 'grad_norm': 0.02742438018321991, 'learning_rate': 1.6569058077110785e-05, 'epoch': 0.67}


 67%|██████▋   | 1375/2049 [23:36<11:30,  1.02s/it]

{'loss': 0.0013, 'grad_norm': 0.032708391547203064, 'learning_rate': 1.6447047340165937e-05, 'epoch': 0.67}


 67%|██████▋   | 1380/2049 [23:41<11:32,  1.04s/it]

{'loss': 0.0017, 'grad_norm': 0.032948561012744904, 'learning_rate': 1.6325036603221082e-05, 'epoch': 0.67}


 68%|██████▊   | 1385/2049 [23:47<11:32,  1.04s/it]

{'loss': 1.3207, 'grad_norm': 0.062180034816265106, 'learning_rate': 1.6203025866276234e-05, 'epoch': 0.68}


 68%|██████▊   | 1390/2049 [23:52<11:48,  1.08s/it]

{'loss': 1.2238, 'grad_norm': 0.09934663027524948, 'learning_rate': 1.6081015129331382e-05, 'epoch': 0.68}


 68%|██████▊   | 1395/2049 [23:57<11:28,  1.05s/it]

{'loss': 0.0033, 'grad_norm': 0.10288065671920776, 'learning_rate': 1.595900439238653e-05, 'epoch': 0.68}


 68%|██████▊   | 1400/2049 [24:03<11:39,  1.08s/it]

{'loss': 0.0027, 'grad_norm': 0.08634163439273834, 'learning_rate': 1.583699365544168e-05, 'epoch': 0.68}


 69%|██████▊   | 1405/2049 [24:08<11:49,  1.10s/it]

{'loss': 0.0029, 'grad_norm': 0.07672125101089478, 'learning_rate': 1.5714982918496828e-05, 'epoch': 0.69}


 69%|██████▉   | 1410/2049 [24:14<11:29,  1.08s/it]

{'loss': 0.0022, 'grad_norm': 0.06407904624938965, 'learning_rate': 1.5592972181551976e-05, 'epoch': 0.69}


 69%|██████▉   | 1415/2049 [24:19<11:02,  1.05s/it]

{'loss': 0.0019, 'grad_norm': 0.057735662907361984, 'learning_rate': 1.5470961444607128e-05, 'epoch': 0.69}


 69%|██████▉   | 1420/2049 [24:24<11:33,  1.10s/it]

{'loss': 0.0018, 'grad_norm': 0.05728689208626747, 'learning_rate': 1.5348950707662273e-05, 'epoch': 0.69}


 70%|██████▉   | 1425/2049 [24:30<11:01,  1.06s/it]

{'loss': 0.0014, 'grad_norm': 0.035020727664232254, 'learning_rate': 1.5226939970717425e-05, 'epoch': 0.7}


 70%|██████▉   | 1430/2049 [24:35<11:07,  1.08s/it]

{'loss': 0.0011, 'grad_norm': 0.03275752440094948, 'learning_rate': 1.5104929233772572e-05, 'epoch': 0.7}


 70%|███████   | 1435/2049 [24:40<10:55,  1.07s/it]

{'loss': 1.3609, 'grad_norm': 0.027154266834259033, 'learning_rate': 1.4982918496827722e-05, 'epoch': 0.7}


 70%|███████   | 1440/2049 [24:46<10:56,  1.08s/it]

{'loss': 0.001, 'grad_norm': 0.03687148541212082, 'learning_rate': 1.486090775988287e-05, 'epoch': 0.7}


 71%|███████   | 1445/2049 [24:51<10:46,  1.07s/it]

{'loss': 0.0013, 'grad_norm': 0.04183116927742958, 'learning_rate': 1.473889702293802e-05, 'epoch': 0.71}


 71%|███████   | 1450/2049 [24:57<10:48,  1.08s/it]

{'loss': 0.0012, 'grad_norm': 0.02913907915353775, 'learning_rate': 1.4616886285993167e-05, 'epoch': 0.71}


 71%|███████   | 1455/2049 [25:02<10:40,  1.08s/it]

{'loss': 1.2353, 'grad_norm': 31.786230087280273, 'learning_rate': 1.4494875549048317e-05, 'epoch': 0.71}


 71%|███████▏  | 1460/2049 [25:07<10:28,  1.07s/it]

{'loss': 0.0012, 'grad_norm': 0.03559740260243416, 'learning_rate': 1.4372864812103464e-05, 'epoch': 0.71}


 71%|███████▏  | 1465/2049 [25:13<10:28,  1.08s/it]

{'loss': 0.0014, 'grad_norm': 0.04101219400763512, 'learning_rate': 1.4250854075158614e-05, 'epoch': 0.71}


 72%|███████▏  | 1470/2049 [25:18<10:31,  1.09s/it]

{'loss': 0.0014, 'grad_norm': 0.04394756257534027, 'learning_rate': 1.4128843338213763e-05, 'epoch': 0.72}


 72%|███████▏  | 1475/2049 [25:24<10:30,  1.10s/it]

{'loss': 1.3639, 'grad_norm': 0.03446921706199646, 'learning_rate': 1.4006832601268913e-05, 'epoch': 0.72}


 72%|███████▏  | 1480/2049 [25:29<09:55,  1.05s/it]

{'loss': 0.0018, 'grad_norm': 0.06166542321443558, 'learning_rate': 1.388482186432406e-05, 'epoch': 0.72}


 72%|███████▏  | 1485/2049 [25:34<10:05,  1.07s/it]

{'loss': 0.0017, 'grad_norm': 0.07797189056873322, 'learning_rate': 1.376281112737921e-05, 'epoch': 0.72}


 73%|███████▎  | 1490/2049 [25:40<09:58,  1.07s/it]

{'loss': 0.0013, 'grad_norm': 0.028673246502876282, 'learning_rate': 1.3640800390434358e-05, 'epoch': 0.73}


 73%|███████▎  | 1495/2049 [25:45<09:56,  1.08s/it]

{'loss': 0.0013, 'grad_norm': 0.035136543214321136, 'learning_rate': 1.3518789653489509e-05, 'epoch': 0.73}


 73%|███████▎  | 1500/2049 [25:50<09:43,  1.06s/it]

{'loss': 0.0011, 'grad_norm': 0.03562557324767113, 'learning_rate': 1.3396778916544655e-05, 'epoch': 0.73}


 73%|███████▎  | 1505/2049 [25:56<10:10,  1.12s/it]

{'loss': 0.0009, 'grad_norm': 0.0274956077337265, 'learning_rate': 1.3274768179599805e-05, 'epoch': 0.73}


 74%|███████▎  | 1510/2049 [26:01<09:12,  1.03s/it]

{'loss': 0.0008, 'grad_norm': 0.026098504662513733, 'learning_rate': 1.3152757442654954e-05, 'epoch': 0.74}


 74%|███████▍  | 1515/2049 [26:06<09:05,  1.02s/it]

{'loss': 0.0006, 'grad_norm': 0.013987718150019646, 'learning_rate': 1.3030746705710104e-05, 'epoch': 0.74}


 74%|███████▍  | 1520/2049 [26:12<09:24,  1.07s/it]

{'loss': 0.0007, 'grad_norm': 0.017582425847649574, 'learning_rate': 1.2908735968765251e-05, 'epoch': 0.74}


 74%|███████▍  | 1525/2049 [26:17<09:12,  1.05s/it]

{'loss': 0.0006, 'grad_norm': 0.020903360098600388, 'learning_rate': 1.2786725231820401e-05, 'epoch': 0.74}


 75%|███████▍  | 1530/2049 [26:22<09:17,  1.07s/it]

{'loss': 0.0007, 'grad_norm': 0.014896544627845287, 'learning_rate': 1.2664714494875548e-05, 'epoch': 0.75}


 75%|███████▍  | 1535/2049 [26:28<09:08,  1.07s/it]

{'loss': 0.0006, 'grad_norm': 0.014832232147455215, 'learning_rate': 1.25427037579307e-05, 'epoch': 0.75}


 75%|███████▌  | 1540/2049 [26:33<08:57,  1.06s/it]

{'loss': 0.0005, 'grad_norm': 0.011984818615019321, 'learning_rate': 1.2420693020985848e-05, 'epoch': 0.75}


 75%|███████▌  | 1545/2049 [26:38<08:56,  1.06s/it]

{'loss': 0.0005, 'grad_norm': 0.00981852225959301, 'learning_rate': 1.2298682284040997e-05, 'epoch': 0.75}


 76%|███████▌  | 1550/2049 [26:44<09:04,  1.09s/it]

{'loss': 0.0004, 'grad_norm': 0.013686951249837875, 'learning_rate': 1.2176671547096145e-05, 'epoch': 0.76}


 76%|███████▌  | 1555/2049 [26:49<09:02,  1.10s/it]

{'loss': 0.0004, 'grad_norm': 0.016894420608878136, 'learning_rate': 1.2054660810151294e-05, 'epoch': 0.76}


 76%|███████▌  | 1560/2049 [26:54<08:39,  1.06s/it]

{'loss': 0.0005, 'grad_norm': 0.017434820532798767, 'learning_rate': 1.1932650073206444e-05, 'epoch': 0.76}


 76%|███████▋  | 1565/2049 [27:00<08:26,  1.05s/it]

{'loss': 0.0004, 'grad_norm': 0.010841756127774715, 'learning_rate': 1.1810639336261592e-05, 'epoch': 0.76}


 77%|███████▋  | 1570/2049 [27:05<08:14,  1.03s/it]

{'loss': 0.0004, 'grad_norm': 0.012900855392217636, 'learning_rate': 1.168862859931674e-05, 'epoch': 0.77}


 77%|███████▋  | 1575/2049 [27:10<08:39,  1.10s/it]

{'loss': 0.0004, 'grad_norm': 0.013710529543459415, 'learning_rate': 1.1566617862371889e-05, 'epoch': 0.77}


 77%|███████▋  | 1580/2049 [27:16<08:23,  1.07s/it]

{'loss': 0.0003, 'grad_norm': 0.010142569430172443, 'learning_rate': 1.144460712542704e-05, 'epoch': 0.77}


 77%|███████▋  | 1585/2049 [27:21<08:15,  1.07s/it]

{'loss': 0.0004, 'grad_norm': 0.010405960492789745, 'learning_rate': 1.1322596388482188e-05, 'epoch': 0.77}


 78%|███████▊  | 1590/2049 [27:26<08:05,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.009890365414321423, 'learning_rate': 1.1200585651537336e-05, 'epoch': 0.78}


 78%|███████▊  | 1595/2049 [27:32<07:57,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.010356414131820202, 'learning_rate': 1.1078574914592485e-05, 'epoch': 0.78}


 78%|███████▊  | 1600/2049 [27:37<07:59,  1.07s/it]

{'loss': 1.5771, 'grad_norm': 0.01211925782263279, 'learning_rate': 1.0956564177647633e-05, 'epoch': 0.78}


 78%|███████▊  | 1605/2049 [27:42<08:02,  1.09s/it]

{'loss': 0.0005, 'grad_norm': 0.02053501456975937, 'learning_rate': 1.0834553440702783e-05, 'epoch': 0.78}


 79%|███████▊  | 1610/2049 [27:48<07:57,  1.09s/it]

{'loss': 0.0005, 'grad_norm': 0.014211724512279034, 'learning_rate': 1.0712542703757932e-05, 'epoch': 0.79}


 79%|███████▉  | 1615/2049 [27:53<07:36,  1.05s/it]

{'loss': 0.0006, 'grad_norm': 0.016548896208405495, 'learning_rate': 1.059053196681308e-05, 'epoch': 0.79}


 79%|███████▉  | 1620/2049 [27:58<07:31,  1.05s/it]

{'loss': 0.0005, 'grad_norm': 0.013124611228704453, 'learning_rate': 1.0468521229868229e-05, 'epoch': 0.79}


 79%|███████▉  | 1625/2049 [28:04<07:39,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.01245195884257555, 'learning_rate': 1.0346510492923377e-05, 'epoch': 0.79}


 80%|███████▉  | 1630/2049 [28:09<07:25,  1.06s/it]

{'loss': 0.0005, 'grad_norm': 0.01411798968911171, 'learning_rate': 1.0224499755978527e-05, 'epoch': 0.8}


 80%|███████▉  | 1635/2049 [28:15<07:35,  1.10s/it]

{'loss': 0.0005, 'grad_norm': 0.013179667294025421, 'learning_rate': 1.0102489019033676e-05, 'epoch': 0.8}


 80%|████████  | 1640/2049 [28:20<07:16,  1.07s/it]

{'loss': 0.0004, 'grad_norm': 0.014830172061920166, 'learning_rate': 9.980478282088824e-06, 'epoch': 0.8}


 80%|████████  | 1645/2049 [28:25<07:19,  1.09s/it]

{'loss': 0.0005, 'grad_norm': 0.015179676935076714, 'learning_rate': 9.858467545143973e-06, 'epoch': 0.8}


 81%|████████  | 1650/2049 [28:31<07:07,  1.07s/it]

{'loss': 0.0004, 'grad_norm': 0.013925987295806408, 'learning_rate': 9.736456808199123e-06, 'epoch': 0.81}


 81%|████████  | 1655/2049 [28:36<07:15,  1.11s/it]

{'loss': 0.0004, 'grad_norm': 0.015378270298242569, 'learning_rate': 9.614446071254271e-06, 'epoch': 0.81}


 81%|████████  | 1660/2049 [28:43<07:10,  1.11s/it]

{'loss': 0.0004, 'grad_norm': 0.010780628770589828, 'learning_rate': 9.49243533430942e-06, 'epoch': 0.81}


 81%|████████▏ | 1665/2049 [28:47<06:18,  1.01it/s]

{'loss': 0.0004, 'grad_norm': 0.012739050202071667, 'learning_rate': 9.370424597364568e-06, 'epoch': 0.81}


 82%|████████▏ | 1670/2049 [28:52<06:15,  1.01it/s]

{'loss': 0.0004, 'grad_norm': 0.01273959968239069, 'learning_rate': 9.248413860419717e-06, 'epoch': 0.82}


 82%|████████▏ | 1675/2049 [28:58<06:33,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.008991283364593983, 'learning_rate': 9.126403123474867e-06, 'epoch': 0.82}


 82%|████████▏ | 1680/2049 [29:03<06:45,  1.10s/it]

{'loss': 0.0003, 'grad_norm': 0.012619981542229652, 'learning_rate': 9.004392386530015e-06, 'epoch': 0.82}


 82%|████████▏ | 1685/2049 [29:08<06:33,  1.08s/it]

{'loss': 0.0003, 'grad_norm': 0.010126342065632343, 'learning_rate': 8.882381649585164e-06, 'epoch': 0.82}


 82%|████████▏ | 1690/2049 [29:14<06:18,  1.05s/it]

{'loss': 0.0004, 'grad_norm': 0.008449310436844826, 'learning_rate': 8.760370912640312e-06, 'epoch': 0.82}


 83%|████████▎ | 1695/2049 [29:19<06:15,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.011996488086879253, 'learning_rate': 8.638360175695463e-06, 'epoch': 0.83}


 83%|████████▎ | 1700/2049 [29:24<06:23,  1.10s/it]

{'loss': 0.0004, 'grad_norm': 0.010991094633936882, 'learning_rate': 8.516349438750611e-06, 'epoch': 0.83}


 83%|████████▎ | 1705/2049 [29:30<05:56,  1.04s/it]

{'loss': 0.0004, 'grad_norm': 0.016715683043003082, 'learning_rate': 8.39433870180576e-06, 'epoch': 0.83}


 83%|████████▎ | 1710/2049 [29:35<06:09,  1.09s/it]

{'loss': 0.0004, 'grad_norm': 0.011204938404262066, 'learning_rate': 8.272327964860908e-06, 'epoch': 0.83}


 84%|████████▎ | 1715/2049 [29:40<05:57,  1.07s/it]

{'loss': 0.0003, 'grad_norm': 0.00827395636588335, 'learning_rate': 8.150317227916056e-06, 'epoch': 0.84}


 84%|████████▍ | 1720/2049 [29:46<05:55,  1.08s/it]

{'loss': 0.0003, 'grad_norm': 0.01138731837272644, 'learning_rate': 8.028306490971207e-06, 'epoch': 0.84}


 84%|████████▍ | 1725/2049 [29:51<05:49,  1.08s/it]

{'loss': 0.0003, 'grad_norm': 0.006178366951644421, 'learning_rate': 7.906295754026355e-06, 'epoch': 0.84}


 84%|████████▍ | 1730/2049 [29:56<05:34,  1.05s/it]

{'loss': 0.0003, 'grad_norm': 0.007537882775068283, 'learning_rate': 7.784285017081504e-06, 'epoch': 0.84}


 85%|████████▍ | 1735/2049 [30:02<05:32,  1.06s/it]

{'loss': 0.0003, 'grad_norm': 0.008221166208386421, 'learning_rate': 7.662274280136652e-06, 'epoch': 0.85}


 85%|████████▍ | 1740/2049 [30:07<05:42,  1.11s/it]

{'loss': 1.6023, 'grad_norm': 0.007653723005205393, 'learning_rate': 7.540263543191801e-06, 'epoch': 0.85}


 85%|████████▌ | 1745/2049 [30:13<05:26,  1.07s/it]

{'loss': 0.0003, 'grad_norm': 0.007916788570582867, 'learning_rate': 7.418252806246951e-06, 'epoch': 0.85}


 85%|████████▌ | 1750/2049 [30:18<05:13,  1.05s/it]

{'loss': 1.5423, 'grad_norm': 0.012608028948307037, 'learning_rate': 7.296242069302099e-06, 'epoch': 0.85}


 86%|████████▌ | 1755/2049 [30:23<05:12,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.013621365651488304, 'learning_rate': 7.174231332357248e-06, 'epoch': 0.86}


 86%|████████▌ | 1760/2049 [30:29<05:11,  1.08s/it]

{'loss': 0.0004, 'grad_norm': 0.011652352288365364, 'learning_rate': 7.052220595412397e-06, 'epoch': 0.86}


 86%|████████▌ | 1765/2049 [30:34<05:06,  1.08s/it]

{'loss': 0.0004, 'grad_norm': 0.011308204382658005, 'learning_rate': 6.930209858467545e-06, 'epoch': 0.86}


 86%|████████▋ | 1770/2049 [30:39<04:59,  1.07s/it]

{'loss': 0.0005, 'grad_norm': 0.015417666174471378, 'learning_rate': 6.808199121522695e-06, 'epoch': 0.86}


 87%|████████▋ | 1775/2049 [30:45<04:54,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.01365558709949255, 'learning_rate': 6.686188384577843e-06, 'epoch': 0.87}


 87%|████████▋ | 1780/2049 [30:50<04:50,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.012871116399765015, 'learning_rate': 6.5641776476329925e-06, 'epoch': 0.87}


 87%|████████▋ | 1785/2049 [30:55<04:39,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.013377316296100616, 'learning_rate': 6.442166910688141e-06, 'epoch': 0.87}


 87%|████████▋ | 1790/2049 [31:01<04:39,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.013452146202325821, 'learning_rate': 6.32015617374329e-06, 'epoch': 0.87}


 88%|████████▊ | 1795/2049 [31:06<04:21,  1.03s/it]

{'loss': 0.0005, 'grad_norm': 0.016177529469132423, 'learning_rate': 6.198145436798439e-06, 'epoch': 0.88}


 88%|████████▊ | 1800/2049 [31:11<04:24,  1.06s/it]

{'loss': 0.0004, 'grad_norm': 0.016952214762568474, 'learning_rate': 6.076134699853587e-06, 'epoch': 0.88}


 88%|████████▊ | 1805/2049 [31:17<04:13,  1.04s/it]

{'loss': 0.0005, 'grad_norm': 0.011365472339093685, 'learning_rate': 5.9541239629087365e-06, 'epoch': 0.88}


 88%|████████▊ | 1810/2049 [31:22<04:11,  1.05s/it]

{'loss': 0.0004, 'grad_norm': 0.011687441729009151, 'learning_rate': 5.832113225963885e-06, 'epoch': 0.88}


 89%|████████▊ | 1815/2049 [31:27<04:14,  1.09s/it]

{'loss': 0.0004, 'grad_norm': 0.012124829925596714, 'learning_rate': 5.710102489019034e-06, 'epoch': 0.89}


 89%|████████▉ | 1820/2049 [31:33<04:13,  1.11s/it]

{'loss': 1.5452, 'grad_norm': 0.01651458814740181, 'learning_rate': 5.588091752074183e-06, 'epoch': 0.89}


 89%|████████▉ | 1825/2049 [31:38<04:02,  1.08s/it]

{'loss': 0.0005, 'grad_norm': 0.016984112560749054, 'learning_rate': 5.466081015129332e-06, 'epoch': 0.89}


 89%|████████▉ | 1830/2049 [31:43<03:52,  1.06s/it]

{'loss': 0.0005, 'grad_norm': 0.013289409689605236, 'learning_rate': 5.3440702781844806e-06, 'epoch': 0.89}


 90%|████████▉ | 1835/2049 [31:49<03:49,  1.07s/it]

{'loss': 1.448, 'grad_norm': 0.017148932442069054, 'learning_rate': 5.22205954123963e-06, 'epoch': 0.9}


 90%|████████▉ | 1840/2049 [31:54<03:52,  1.11s/it]

{'loss': 0.0005, 'grad_norm': 0.016023501753807068, 'learning_rate': 5.100048804294778e-06, 'epoch': 0.9}


 90%|█████████ | 1845/2049 [32:00<03:35,  1.06s/it]

{'loss': 1.52, 'grad_norm': 0.01829642616212368, 'learning_rate': 4.978038067349927e-06, 'epoch': 0.9}


 90%|█████████ | 1850/2049 [32:05<03:35,  1.08s/it]

{'loss': 0.0007, 'grad_norm': 0.021233217790722847, 'learning_rate': 4.856027330405076e-06, 'epoch': 0.9}


 91%|█████████ | 1855/2049 [32:10<03:34,  1.11s/it]

{'loss': 1.4051, 'grad_norm': 0.030889492481946945, 'learning_rate': 4.734016593460225e-06, 'epoch': 0.91}


 91%|█████████ | 1860/2049 [32:16<03:19,  1.06s/it]

{'loss': 0.0008, 'grad_norm': 0.02279103733599186, 'learning_rate': 4.612005856515374e-06, 'epoch': 0.91}


 91%|█████████ | 1865/2049 [32:21<03:16,  1.07s/it]

{'loss': 0.0008, 'grad_norm': 0.021033676341176033, 'learning_rate': 4.489995119570522e-06, 'epoch': 0.91}


 91%|█████████▏| 1870/2049 [32:26<03:09,  1.06s/it]

{'loss': 0.0009, 'grad_norm': 0.029173102229833603, 'learning_rate': 4.367984382625672e-06, 'epoch': 0.91}


 92%|█████████▏| 1875/2049 [32:32<03:09,  1.09s/it]

{'loss': 0.001, 'grad_norm': 0.023998139426112175, 'learning_rate': 4.24597364568082e-06, 'epoch': 0.92}


 92%|█████████▏| 1880/2049 [32:37<02:59,  1.06s/it]

{'loss': 0.0007, 'grad_norm': 0.017057081684470177, 'learning_rate': 4.123962908735969e-06, 'epoch': 0.92}


 92%|█████████▏| 1885/2049 [32:42<02:52,  1.05s/it]

{'loss': 0.0008, 'grad_norm': 0.018653659150004387, 'learning_rate': 4.001952171791118e-06, 'epoch': 0.92}


 92%|█████████▏| 1890/2049 [32:48<02:47,  1.05s/it]

{'loss': 0.001, 'grad_norm': 0.021476658061146736, 'learning_rate': 3.8799414348462664e-06, 'epoch': 0.92}


 92%|█████████▏| 1895/2049 [32:53<02:51,  1.11s/it]

{'loss': 0.0007, 'grad_norm': 0.020197084173560143, 'learning_rate': 3.7579306979014158e-06, 'epoch': 0.92}


 93%|█████████▎| 1900/2049 [32:58<02:41,  1.09s/it]

{'loss': 0.0008, 'grad_norm': 0.02624216303229332, 'learning_rate': 3.6359199609565647e-06, 'epoch': 0.93}


 93%|█████████▎| 1905/2049 [33:04<02:35,  1.08s/it]

{'loss': 1.3256, 'grad_norm': 0.030293134972453117, 'learning_rate': 3.513909224011713e-06, 'epoch': 0.93}


 93%|█████████▎| 1910/2049 [33:09<02:30,  1.08s/it]

{'loss': 0.0008, 'grad_norm': 0.030324550345540047, 'learning_rate': 3.391898487066862e-06, 'epoch': 0.93}


 93%|█████████▎| 1915/2049 [33:15<02:28,  1.11s/it]

{'loss': 0.0008, 'grad_norm': 0.024737082421779633, 'learning_rate': 3.269887750122011e-06, 'epoch': 0.93}


 94%|█████████▎| 1920/2049 [33:20<02:24,  1.12s/it]

{'loss': 0.0009, 'grad_norm': 0.028718486428260803, 'learning_rate': 3.14787701317716e-06, 'epoch': 0.94}


 94%|█████████▍| 1925/2049 [33:26<02:12,  1.07s/it]

{'loss': 1.3532, 'grad_norm': 31.952844619750977, 'learning_rate': 3.0258662762323087e-06, 'epoch': 0.94}


 94%|█████████▍| 1930/2049 [33:31<02:02,  1.03s/it]

{'loss': 0.0012, 'grad_norm': 0.034468501806259155, 'learning_rate': 2.9038555392874576e-06, 'epoch': 0.94}


 94%|█████████▍| 1935/2049 [33:36<01:58,  1.04s/it]

{'loss': 0.0009, 'grad_norm': 0.03941502049565315, 'learning_rate': 2.7818448023426065e-06, 'epoch': 0.94}


 95%|█████████▍| 1940/2049 [33:41<01:53,  1.05s/it]

{'loss': 0.001, 'grad_norm': 0.022957419976592064, 'learning_rate': 2.6598340653977554e-06, 'epoch': 0.95}


 95%|█████████▍| 1945/2049 [33:47<01:50,  1.06s/it]

{'loss': 0.0008, 'grad_norm': 0.01753736473619938, 'learning_rate': 2.537823328452904e-06, 'epoch': 0.95}


 95%|█████████▌| 1950/2049 [33:52<01:48,  1.10s/it]

{'loss': 0.0008, 'grad_norm': 0.020727764815092087, 'learning_rate': 2.4158125915080528e-06, 'epoch': 0.95}


 95%|█████████▌| 1955/2049 [33:58<01:40,  1.07s/it]

{'loss': 1.3005, 'grad_norm': 0.027000878006219864, 'learning_rate': 2.2938018545632016e-06, 'epoch': 0.95}


 96%|█████████▌| 1960/2049 [34:03<01:37,  1.09s/it]

{'loss': 0.0012, 'grad_norm': 0.04693127050995827, 'learning_rate': 2.1717911176183505e-06, 'epoch': 0.96}


 96%|█████████▌| 1965/2049 [34:08<01:29,  1.06s/it]

{'loss': 0.001, 'grad_norm': 0.04117657616734505, 'learning_rate': 2.0497803806734994e-06, 'epoch': 0.96}


 96%|█████████▌| 1970/2049 [34:13<01:22,  1.05s/it]

{'loss': 0.001, 'grad_norm': 0.041588034480810165, 'learning_rate': 1.9277696437286483e-06, 'epoch': 0.96}


 96%|█████████▋| 1975/2049 [34:19<01:15,  1.02s/it]

{'loss': 0.0008, 'grad_norm': 0.028975751250982285, 'learning_rate': 1.805758906783797e-06, 'epoch': 0.96}


 97%|█████████▋| 1980/2049 [34:24<01:10,  1.02s/it]

{'loss': 0.0009, 'grad_norm': 0.02891462668776512, 'learning_rate': 1.683748169838946e-06, 'epoch': 0.97}


 97%|█████████▋| 1985/2049 [34:29<01:05,  1.03s/it]

{'loss': 0.0008, 'grad_norm': 0.02145669050514698, 'learning_rate': 1.5617374328940948e-06, 'epoch': 0.97}


 97%|█████████▋| 1990/2049 [34:34<01:00,  1.03s/it]

{'loss': 0.001, 'grad_norm': 0.03743280470371246, 'learning_rate': 1.4397266959492437e-06, 'epoch': 0.97}


 97%|█████████▋| 1995/2049 [34:39<00:53,  1.00it/s]

{'loss': 0.0008, 'grad_norm': 0.02759382128715515, 'learning_rate': 1.3177159590043926e-06, 'epoch': 0.97}


 98%|█████████▊| 2000/2049 [34:44<00:51,  1.06s/it]

{'loss': 0.0009, 'grad_norm': 0.04275890439748764, 'learning_rate': 1.1957052220595413e-06, 'epoch': 0.98}


 98%|█████████▊| 2005/2049 [34:49<00:45,  1.04s/it]

{'loss': 0.0008, 'grad_norm': 0.02641913667321205, 'learning_rate': 1.0736944851146902e-06, 'epoch': 0.98}


 98%|█████████▊| 2010/2049 [34:54<00:39,  1.01s/it]

{'loss': 0.0008, 'grad_norm': 0.022304954007267952, 'learning_rate': 9.51683748169839e-07, 'epoch': 0.98}


 98%|█████████▊| 2015/2049 [34:59<00:34,  1.01s/it]

{'loss': 0.0009, 'grad_norm': 0.019684089347720146, 'learning_rate': 8.296730112249878e-07, 'epoch': 0.98}


 99%|█████████▊| 2020/2049 [35:05<00:30,  1.04s/it]

{'loss': 0.0009, 'grad_norm': 0.025105079635977745, 'learning_rate': 7.076622742801367e-07, 'epoch': 0.99}


 99%|█████████▉| 2025/2049 [35:09<00:21,  1.12it/s]

{'loss': 0.001, 'grad_norm': 0.038726598024368286, 'learning_rate': 5.856515373352855e-07, 'epoch': 0.99}


 99%|█████████▉| 2030/2049 [35:14<00:18,  1.05it/s]

{'loss': 0.0007, 'grad_norm': 0.025849999859929085, 'learning_rate': 4.636408003904344e-07, 'epoch': 0.99}


 99%|█████████▉| 2035/2049 [35:19<00:13,  1.00it/s]

{'loss': 0.0008, 'grad_norm': 0.015947755426168442, 'learning_rate': 3.4163006344558326e-07, 'epoch': 0.99}


100%|█████████▉| 2040/2049 [35:24<00:09,  1.07s/it]

{'loss': 0.001, 'grad_norm': 0.051440611481666565, 'learning_rate': 2.1961932650073207e-07, 'epoch': 1.0}


100%|█████████▉| 2045/2049 [35:29<00:04,  1.04s/it]

{'loss': 0.0009, 'grad_norm': 0.037525661289691925, 'learning_rate': 9.760858955588092e-08, 'epoch': 1.0}


                                                   
100%|██████████| 2049/2049 [38:55<00:00,  1.00it/s]

{'eval_loss': 0.12999621033668518, 'eval_accuracy': 0.9903189066059226, 'eval_f1': 0.0, 'eval_precision': 1.0, 'eval_recall': 0.0, 'eval_runtime': 200.4217, 'eval_samples_per_second': 8.762, 'eval_steps_per_second': 4.381, 'epoch': 1.0}


100%|██████████| 2049/2049 [38:57<00:00,  1.14s/it]

{'train_runtime': 2337.6982, 'train_samples_per_second': 1.753, 'train_steps_per_second': 0.877, 'train_loss': 0.1394886291009212, 'epoch': 1.0}





TrainOutput(global_step=2049, training_loss=0.1394886291009212, metrics={'train_runtime': 2337.6982, 'train_samples_per_second': 1.753, 'train_steps_per_second': 0.877, 'total_flos': 269491498452480.0, 'train_loss': 0.1394886291009212, 'epoch': 1.0})

In [19]:
# 13. Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

100%|██████████| 878/878 [02:46<00:00,  5.28it/s]

Evaluation Results: {'eval_loss': 0.12999621033668518, 'eval_accuracy': 0.9903189066059226, 'eval_f1': 0.0, 'eval_precision': 1.0, 'eval_recall': 0.0, 'eval_runtime': 166.4525, 'eval_samples_per_second': 10.55, 'eval_steps_per_second': 5.275, 'epoch': 1.0}





In [22]:
# # 14. Save the trained model and tokenizer
# model.save_pretrained('model/trained-model')
# tokenizer.save_pretrained('model/trained-model')

('model/trained-model\\tokenizer_config.json',
 'model/trained-model\\special_tokens_map.json',
 'model/trained-model\\vocab.txt',
 'model/trained-model\\added_tokens.json')