# Preparing raw dataset

In [1]:
import pandas as pd

In [39]:
df = pd.read_parquet("hf://datasets/Amirkid/MedQuad-dataset/data/train-00000-of-00001-744a44ed05ce0a48.parquet")
df

In [41]:
def process_medical_data(raw_data):
    questions = []
    answers = []
    for i in range(0, len(raw_data), 2):  # Step by 2 to get question and answer pairs
        question = raw_data.iloc[i]['text'].strip()  # Get the question
        answer = df.iloc[i + 1]['text'].strip() if (i + 1) < len(raw_data) else ""  # Get the answer
        
        questions.append(question)
        answers.append(answer)
    
    # Create a new DataFrame
    new_df = pd.DataFrame({
        'question': questions,
        'answer': answers
    })

    return new_df

new_df = process_medical_data(df)

In [42]:
new_df
# new_df.to_csv("data/raw/train.csv", index=False)

Unnamed: 0,question,answer
0,What is (are) Neck Injuries and Disorders ?,"Any part of your neck muscles, bones, joints,..."
1,What is (are) Heel Injuries and Disorders ?,Heel problems are common and can be painful. O...
2,Do you have information about CT Scans,Summary : Computed tomography (CT) is a type o...
3,What is (are) Self-harm ?,Selfharm refers to a person's harming their ow...
4,Do you have information about Seniors' Health,Summary : People in the U.S. are living longer...
...,...,...
16395,What is (are) Parasites - Fascioliasis (Fascio...,Fascioliasis is an infectious disease caused b...
16396,Who is at risk for Parasites - Fascioliasis (F...,Fascioliasis occurs in many areas of the world...
16397,How to diagnose Parasites - Fascioliasis (Fasc...,The standard way to be sure a person is infect...
16398,What are the treatments for Parasites - Fascio...,The first step is to make sure the diagnosis i...


In [48]:
df_qa = pd.read_csv('data/raw/train.csv')
df_symptoms = pd.read_csv('data/raw/Symptom2Disease.csv')
combined_df = pd.DataFrame({
    'label': pd.concat([df_qa['question'], df_symptoms['label']]).reset_index(drop=True),
    'text': pd.concat([df_qa['answer'], df_symptoms['text']]).reset_index(drop=True)
})

combined_df.to_csv('data/raw/combined_dataset.csv', index=False)

In [49]:
combined_df

Unnamed: 0,label,text
0,What is (are) Neck Injuries and Disorders ?,"Any part of your neck muscles, bones, joints,..."
1,What is (are) Heel Injuries and Disorders ?,Heel problems are common and can be painful. O...
2,Do you have information about CT Scans,Summary : Computed tomography (CT) is a type o...
3,What is (are) Self-harm ?,Selfharm refers to a person's harming their ow...
4,Do you have information about Seniors' Health,Summary : People in the U.S. are living longer...
...,...,...
17595,diabetes,I'm shaking and trembling all over. I've lost ...
17596,diabetes,"Particularly in the crevices of my skin, I hav..."
17597,diabetes,I regularly experience these intense urges and...
17598,diabetes,"I have trouble breathing, especially outside. ..."


# Preprocessing dataset

In [1]:
from src.data_preprocessing import preprocess_and_save
preprocess_and_save()

Loading raw data...
Cleaning training data...
Saving processed data to CSV files...
Data preprocessing completed successfully.


In [3]:
import pandas as pd
preprocessed_df = pd.read_csv('data/processed/train.csv')
preprocessed_df

Unnamed: 0,label,text
0,neck injury disorder,part neck muscle bone joint tendon ligament ne...
1,heel injury disorder,heel problem common painful often result much ...
2,information ct scan,summary computed tomography ct type imaging us...
3,selfharm,selfharm refers person harming body purpose 1 ...
4,information senior health,summary people u living longer ever many senio...
...,...,...
17595,diabetes,im shaking trembling ive lost sense taste smel...
17596,diabetes,particularly crevice skin skin rash irritation...
17597,diabetes,regularly experience intense urge want urinate...
17598,diabetes,trouble breathing especially outside start fee...


In [4]:
from datasets import Dataset
train = Dataset.from_pandas(preprocessed_df)

# Load pre-trained model Bert base and tokenizer 

In [5]:
train

Dataset({
    features: ['label', 'text'],
    num_rows: 17600
})

In [6]:
train[0]

{'label': 'neck injury disorder',
 'text': 'part neck muscle bone joint tendon ligament nerve cause neck problem neck pain common pain may also come shoulder jaw head upper arm muscle strain tension often cause neck pain problem usually overuse sitting computer long sometimes strain neck muscle sleeping awkward position overdoing exercise fall accident including car accident another common cause neck pain whiplash soft tissue injury neck also called neck sprain strain treatment depends cause may include applying ice taking pain reliever getting physical therapy wearing cervical collar rarely need surgery'}

In [2]:
# check torch using gpu or not
import torch
torch.cuda.is_available()
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 4050 Laptop GPU


In [20]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, TrainerCallback
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
df = pd.read_csv('data/raw/train.csv')

# Preprocessing
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Define custom dataset
class InjuryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=len(df['label'].unique()))

BATCH_SIZE = 8
MAX_LEN = 128
GRADIENT_ACCUMULATION_STEPS = 8
EPOCHS = 10
LEARNING_RATE = 2e-5

# Split the dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.1, random_state=42)

# Create DataLoader
train_dataset = InjuryDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, MAX_LEN)
val_dataset = InjuryDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    max_grad_norm=0.5,  # Gradient clipping
    warmup_steps=1000,  # Increased warmup steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=100,
    evaluation_strategy="steps",
    save_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    seed=42,
    lr_scheduler_type="linear",
    optim="adamw_torch",
)

# Define a callback to print training logs
class PrinterCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero and logs is not None:
            if 'loss' in logs:
                print(f"Step {state.global_step}: Training Loss: {logs['loss']:.5f}")
            else:
                print(f"Step {state.global_step}: Logs: {logs}")

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[PrinterCallback()]
)

# Clear CUDA cache before training
torch.cuda.empty_cache()

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model('models/bert-medical-qa')
tokenizer.save_pretrained('models/bert-medical-qa')

# Evaluate the model and show metrics
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  8%|▊         | 200/2470 [21:16<4:01:23,  6.38s/it]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  0%|          | 10/2300 [05:01<21:05:39, 33.16s/it]

Step 10: Training Loss: 9.76430
{'loss': 9.7643, 'grad_norm': inf, 'learning_rate': 4.5e-07, 'epoch': 0.04}


  1%|          | 20/2300 [10:38<21:24:26, 33.80s/it]

Step 20: Training Loss: 9.73920
{'loss': 9.7392, 'grad_norm': 8.453700065612793, 'learning_rate': 9.5e-07, 'epoch': 0.09}


  1%|▏         | 30/2300 [16:42<23:07:26, 36.67s/it]

Step 30: Training Loss: 9.75300
{'loss': 9.753, 'grad_norm': 9.744769096374512, 'learning_rate': 1.45e-06, 'epoch': 0.13}


  2%|▏         | 40/2300 [22:37<21:23:41, 34.08s/it]

Step 40: Training Loss: 9.77990
{'loss': 9.7799, 'grad_norm': 11.375333786010742, 'learning_rate': 1.95e-06, 'epoch': 0.17}


  2%|▏         | 50/2300 [28:09<20:48:42, 33.30s/it]

Step 50: Training Loss: 9.75540
{'loss': 9.7554, 'grad_norm': 8.270465850830078, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.22}


  3%|▎         | 60/2300 [33:41<20:39:27, 33.20s/it]

Step 60: Training Loss: 9.75220
{'loss': 9.7522, 'grad_norm': 10.500877380371094, 'learning_rate': 2.9e-06, 'epoch': 0.26}


  3%|▎         | 70/2300 [39:09<20:17:57, 32.77s/it]

Step 70: Training Loss: 9.73320
{'loss': 9.7332, 'grad_norm': 8.821913719177246, 'learning_rate': 3.4000000000000005e-06, 'epoch': 0.3}


  3%|▎         | 80/2300 [44:38<20:14:58, 32.84s/it]

Step 80: Training Loss: 9.75530
{'loss': 9.7553, 'grad_norm': 8.254136085510254, 'learning_rate': 3.9e-06, 'epoch': 0.35}


  4%|▍         | 90/2300 [50:11<20:29:09, 33.37s/it]

Step 90: Training Loss: 9.75940
{'loss': 9.7594, 'grad_norm': 8.818243026733398, 'learning_rate': 4.4e-06, 'epoch': 0.39}


  4%|▍         | 100/2300 [55:39<20:00:58, 32.75s/it]

Step 100: Training Loss: 9.79040
{'loss': 9.7904, 'grad_norm': 10.002685546875, 'learning_rate': 4.9000000000000005e-06, 'epoch': 0.43}


                                                     
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 100: Logs: {'eval_loss': 9.76392936706543, 'eval_accuracy': 0.0, 'eval_runtime': 231.6022, 'eval_samples_per_second': 7.081, 'eval_steps_per_second': 1.77, 'epoch': 0.43360433604336046}
{'eval_loss': 9.76392936706543, 'eval_accuracy': 0.0, 'eval_runtime': 231.6022, 'eval_samples_per_second': 7.081, 'eval_steps_per_second': 1.77, 'epoch': 0.43}


  5%|▍         | 110/2300 [1:04:57<21:36:13, 35.51s/it] 

Step 110: Training Loss: 9.77550
{'loss': 9.7755, 'grad_norm': 8.325244903564453, 'learning_rate': 5.4e-06, 'epoch': 0.48}


  5%|▌         | 120/2300 [1:10:22<19:43:38, 32.58s/it]

Step 120: Training Loss: 9.74120
{'loss': 9.7412, 'grad_norm': 7.5735602378845215, 'learning_rate': 5.9e-06, 'epoch': 0.52}


  6%|▌         | 130/2300 [1:15:54<20:42:14, 34.35s/it]

Step 130: Training Loss: 9.78790
{'loss': 9.7879, 'grad_norm': 8.107720375061035, 'learning_rate': 6.4000000000000006e-06, 'epoch': 0.56}


  6%|▌         | 140/2300 [1:21:20<19:33:34, 32.60s/it]

Step 140: Training Loss: 9.76690
{'loss': 9.7669, 'grad_norm': 6.291647911071777, 'learning_rate': 6.900000000000001e-06, 'epoch': 0.61}


  7%|▋         | 150/2300 [1:26:44<19:20:56, 32.40s/it]

Step 150: Training Loss: 9.70160
{'loss': 9.7016, 'grad_norm': 6.489624977111816, 'learning_rate': 7.4e-06, 'epoch': 0.65}


  7%|▋         | 160/2300 [1:32:07<19:17:30, 32.45s/it]

Step 160: Training Loss: 9.65750
{'loss': 9.6575, 'grad_norm': inf, 'learning_rate': 7.8e-06, 'epoch': 0.69}


  7%|▋         | 170/2300 [1:37:32<19:12:15, 32.46s/it]

Step 170: Training Loss: 9.70200
{'loss': 9.702, 'grad_norm': 5.641952991485596, 'learning_rate': 8.3e-06, 'epoch': 0.74}


  8%|▊         | 180/2300 [1:42:51<18:45:17, 31.85s/it]

Step 180: Training Loss: 9.68610
{'loss': 9.6861, 'grad_norm': 7.824916362762451, 'learning_rate': 8.8e-06, 'epoch': 0.78}


  8%|▊         | 190/2300 [1:48:14<18:54:54, 32.27s/it]

Step 190: Training Loss: 9.70250
{'loss': 9.7025, 'grad_norm': 10.29664421081543, 'learning_rate': 9.3e-06, 'epoch': 0.82}


  9%|▊         | 200/2300 [1:53:36<18:46:08, 32.18s/it]

Step 200: Training Loss: 9.69330
{'loss': 9.6933, 'grad_norm': 6.419590473175049, 'learning_rate': 9.800000000000001e-06, 'epoch': 0.87}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 200: Logs: {'eval_loss': 9.705349922180176, 'eval_accuracy': 0.0, 'eval_runtime': 231.2537, 'eval_samples_per_second': 7.092, 'eval_steps_per_second': 1.773, 'epoch': 0.8672086720867209}
{'eval_loss': 9.705349922180176, 'eval_accuracy': 0.0, 'eval_runtime': 231.2537, 'eval_samples_per_second': 7.092, 'eval_steps_per_second': 1.773, 'epoch': 0.87}


  9%|▉         | 210/2300 [2:02:50<20:22:15, 35.09s/it] 

Step 210: Training Loss: 9.68940
{'loss': 9.6894, 'grad_norm': 10.284307479858398, 'learning_rate': 1.03e-05, 'epoch': 0.91}


 10%|▉         | 220/2300 [2:08:12<18:36:50, 32.22s/it]

Step 220: Training Loss: 9.73380
{'loss': 9.7338, 'grad_norm': 5.2216997146606445, 'learning_rate': 1.075e-05, 'epoch': 0.95}


 10%|█         | 230/2300 [2:13:35<18:36:22, 32.36s/it]

Step 230: Training Loss: 9.68630
{'loss': 9.6863, 'grad_norm': 6.723376750946045, 'learning_rate': 1.125e-05, 'epoch': 1.0}


 10%|█         | 240/2300 [2:18:54<18:13:58, 31.86s/it]

Step 240: Training Loss: 9.65830
{'loss': 9.6583, 'grad_norm': 6.410965442657471, 'learning_rate': 1.175e-05, 'epoch': 1.04}


 11%|█         | 250/2300 [2:24:18<18:25:48, 32.36s/it]

Step 250: Training Loss: 9.65680
{'loss': 9.6568, 'grad_norm': 5.758727550506592, 'learning_rate': 1.225e-05, 'epoch': 1.08}


 11%|█▏        | 260/2300 [2:29:40<18:13:50, 32.17s/it]

Step 260: Training Loss: 9.68010
{'loss': 9.6801, 'grad_norm': 5.456994533538818, 'learning_rate': 1.2750000000000002e-05, 'epoch': 1.13}


 12%|█▏        | 270/2300 [2:35:04<18:16:31, 32.41s/it]

Step 270: Training Loss: 9.65370
{'loss': 9.6537, 'grad_norm': 5.843208312988281, 'learning_rate': 1.3250000000000002e-05, 'epoch': 1.17}


 12%|█▏        | 280/2300 [2:40:26<18:03:32, 32.18s/it]

Step 280: Training Loss: 9.64030
{'loss': 9.6403, 'grad_norm': 4.174919605255127, 'learning_rate': 1.3750000000000002e-05, 'epoch': 1.21}


 13%|█▎        | 290/2300 [2:45:48<17:58:29, 32.19s/it]

Step 290: Training Loss: 9.63070
{'loss': 9.6307, 'grad_norm': 4.065707683563232, 'learning_rate': 1.4249999999999999e-05, 'epoch': 1.26}


 13%|█▎        | 300/2300 [2:51:11<17:58:03, 32.34s/it]

Step 300: Training Loss: 9.63960
{'loss': 9.6396, 'grad_norm': 3.6016104221343994, 'learning_rate': 1.475e-05, 'epoch': 1.3}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 300: Logs: {'eval_loss': 9.859413146972656, 'eval_accuracy': 0.0, 'eval_runtime': 227.0025, 'eval_samples_per_second': 7.225, 'eval_steps_per_second': 1.806, 'epoch': 1.3008130081300813}
{'eval_loss': 9.859413146972656, 'eval_accuracy': 0.0, 'eval_runtime': 227.0025, 'eval_samples_per_second': 7.225, 'eval_steps_per_second': 1.806, 'epoch': 1.3}


 13%|█▎        | 310/2300 [3:00:18<19:11:05, 34.71s/it] 

Step 310: Training Loss: 9.67040
{'loss': 9.6704, 'grad_norm': 4.95908784866333, 'learning_rate': 1.525e-05, 'epoch': 1.34}


 14%|█▍        | 320/2300 [3:05:42<17:51:30, 32.47s/it]

Step 320: Training Loss: 9.66480
{'loss': 9.6648, 'grad_norm': 3.0803775787353516, 'learning_rate': 1.575e-05, 'epoch': 1.39}


 14%|█▍        | 330/2300 [3:11:06<17:44:02, 32.41s/it]

Step 330: Training Loss: 9.63180
{'loss': 9.6318, 'grad_norm': 3.479515552520752, 'learning_rate': 1.6250000000000002e-05, 'epoch': 1.43}


 15%|█▍        | 340/2300 [3:16:28<17:31:00, 32.17s/it]

Step 340: Training Loss: 9.64510
{'loss': 9.6451, 'grad_norm': 3.2046163082122803, 'learning_rate': 1.675e-05, 'epoch': 1.47}


 15%|█▌        | 350/2300 [3:21:51<17:32:13, 32.38s/it]

Step 350: Training Loss: 9.64610
{'loss': 9.6461, 'grad_norm': 3.3723978996276855, 'learning_rate': 1.725e-05, 'epoch': 1.52}


 16%|█▌        | 360/2300 [3:27:15<17:27:50, 32.41s/it]

Step 360: Training Loss: 9.66160
{'loss': 9.6616, 'grad_norm': 3.603956937789917, 'learning_rate': 1.775e-05, 'epoch': 1.56}


 16%|█▌        | 370/2300 [3:32:40<17:29:47, 32.64s/it]

Step 370: Training Loss: 9.63740
{'loss': 9.6374, 'grad_norm': 2.8779473304748535, 'learning_rate': 1.825e-05, 'epoch': 1.6}


 17%|█▋        | 380/2300 [3:38:05<17:16:37, 32.39s/it]

Step 380: Training Loss: 9.66200
{'loss': 9.662, 'grad_norm': 3.266446352005005, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.65}


 17%|█▋        | 390/2300 [3:43:29<17:10:42, 32.38s/it]

Step 390: Training Loss: 9.62090
{'loss': 9.6209, 'grad_norm': 4.097597122192383, 'learning_rate': 1.925e-05, 'epoch': 1.69}


 17%|█▋        | 400/2300 [3:48:48<16:57:41, 32.14s/it]

Step 400: Training Loss: 9.67440
{'loss': 9.6744, 'grad_norm': 2.9465842247009277, 'learning_rate': 1.9750000000000002e-05, 'epoch': 1.73}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 400: Logs: {'eval_loss': 9.97491455078125, 'eval_accuracy': 0.0012195121951219512, 'eval_runtime': 231.4095, 'eval_samples_per_second': 7.087, 'eval_steps_per_second': 1.772, 'epoch': 1.7344173441734418}
{'eval_loss': 9.97491455078125, 'eval_accuracy': 0.0012195121951219512, 'eval_runtime': 231.4095, 'eval_samples_per_second': 7.087, 'eval_steps_per_second': 1.772, 'epoch': 1.73}


 18%|█▊        | 410/2300 [3:58:02<18:22:57, 35.01s/it] 

Step 410: Training Loss: 9.65820
{'loss': 9.6582, 'grad_norm': 3.3826746940612793, 'learning_rate': 2.025e-05, 'epoch': 1.78}


 18%|█▊        | 420/2300 [4:03:25<16:53:47, 32.36s/it]

Step 420: Training Loss: 9.64870
{'loss': 9.6487, 'grad_norm': 3.0755302906036377, 'learning_rate': 2.075e-05, 'epoch': 1.82}


 19%|█▊        | 430/2300 [4:08:49<16:49:56, 32.40s/it]

Step 430: Training Loss: 9.65220
{'loss': 9.6522, 'grad_norm': 3.590938091278076, 'learning_rate': 2.125e-05, 'epoch': 1.86}


 19%|█▉        | 440/2300 [4:14:09<16:34:29, 32.08s/it]

Step 440: Training Loss: 9.64350
{'loss': 9.6435, 'grad_norm': 4.745429039001465, 'learning_rate': 2.175e-05, 'epoch': 1.91}


 20%|█▉        | 450/2300 [4:19:31<16:32:22, 32.19s/it]

Step 450: Training Loss: 9.64300
{'loss': 9.643, 'grad_norm': 2.9966938495635986, 'learning_rate': 2.2250000000000002e-05, 'epoch': 1.95}


 20%|██        | 460/2300 [4:24:54<16:31:33, 32.33s/it]

Step 460: Training Loss: 9.65480
{'loss': 9.6548, 'grad_norm': 3.122136116027832, 'learning_rate': 2.275e-05, 'epoch': 1.99}


 20%|██        | 470/2300 [4:30:16<16:21:55, 32.19s/it]

Step 470: Training Loss: 9.61740
{'loss': 9.6174, 'grad_norm': 2.455094575881958, 'learning_rate': 2.3250000000000003e-05, 'epoch': 2.04}


 21%|██        | 480/2300 [4:35:38<16:15:46, 32.17s/it]

Step 480: Training Loss: 9.57440
{'loss': 9.5744, 'grad_norm': 2.8214869499206543, 'learning_rate': 2.375e-05, 'epoch': 2.08}


 21%|██▏       | 490/2300 [4:41:03<16:22:29, 32.57s/it]

Step 490: Training Loss: 9.58810
{'loss': 9.5881, 'grad_norm': 2.2107226848602295, 'learning_rate': 2.425e-05, 'epoch': 2.12}


 22%|██▏       | 500/2300 [4:46:25<16:05:27, 32.18s/it]

Step 500: Training Loss: 9.59360
{'loss': 9.5936, 'grad_norm': 2.4148716926574707, 'learning_rate': 2.4750000000000002e-05, 'epoch': 2.17}


                                                       
 22%|██▏       | 500/2300 [4:50:15<16:05:27, 32.18s/it]

Step 500: Logs: {'eval_loss': 10.093950271606445, 'eval_accuracy': 0.001829268292682927, 'eval_runtime': 230.0216, 'eval_samples_per_second': 7.13, 'eval_steps_per_second': 1.782, 'epoch': 2.168021680216802}
{'eval_loss': 10.093950271606445, 'eval_accuracy': 0.001829268292682927, 'eval_runtime': 230.0216, 'eval_samples_per_second': 7.13, 'eval_steps_per_second': 1.782, 'epoch': 2.17}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 22%|██▏       | 510/2300 [4:55:36<17:11:19, 34.57s/it] 

Step 510: Training Loss: 9.56480
{'loss': 9.5648, 'grad_norm': 2.4732463359832764, 'learning_rate': 2.525e-05, 'epoch': 2.21}


 23%|██▎       | 520/2300 [5:00:58<15:59:24, 32.34s/it]

Step 520: Training Loss: 9.58550
{'loss': 9.5855, 'grad_norm': 2.527837038040161, 'learning_rate': 2.5750000000000002e-05, 'epoch': 2.25}


 23%|██▎       | 530/2300 [5:06:21<15:55:31, 32.39s/it]

Step 530: Training Loss: 9.54740
{'loss': 9.5474, 'grad_norm': 3.0360920429229736, 'learning_rate': 2.625e-05, 'epoch': 2.3}


 23%|██▎       | 540/2300 [5:11:44<15:45:51, 32.25s/it]

Step 540: Training Loss: 9.58780
{'loss': 9.5878, 'grad_norm': 2.7373034954071045, 'learning_rate': 2.6750000000000003e-05, 'epoch': 2.34}


 24%|██▍       | 550/2300 [5:17:07<15:42:04, 32.30s/it]

Step 550: Training Loss: 9.60560
{'loss': 9.6056, 'grad_norm': 2.660691022872925, 'learning_rate': 2.725e-05, 'epoch': 2.38}


 24%|██▍       | 560/2300 [5:22:30<15:37:01, 32.31s/it]

Step 560: Training Loss: 9.59580
{'loss': 9.5958, 'grad_norm': 2.4933998584747314, 'learning_rate': 2.7750000000000004e-05, 'epoch': 2.43}


 25%|██▍       | 570/2300 [5:27:52<15:28:38, 32.21s/it]

Step 570: Training Loss: 9.58090
{'loss': 9.5809, 'grad_norm': 2.5547733306884766, 'learning_rate': 2.825e-05, 'epoch': 2.47}


 25%|██▌       | 580/2300 [5:33:15<15:25:46, 32.29s/it]

Step 580: Training Loss: 9.59630
{'loss': 9.5963, 'grad_norm': 2.8680977821350098, 'learning_rate': 2.8749999999999997e-05, 'epoch': 2.51}


 26%|██▌       | 590/2300 [5:38:38<15:19:13, 32.25s/it]

Step 590: Training Loss: 9.61520
{'loss': 9.6152, 'grad_norm': 2.6906239986419678, 'learning_rate': 2.925e-05, 'epoch': 2.56}


 26%|██▌       | 600/2300 [5:44:00<15:13:11, 32.23s/it]

Step 600: Training Loss: 9.64270
{'loss': 9.6427, 'grad_norm': 2.9956510066986084, 'learning_rate': 2.975e-05, 'epoch': 2.6}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 600: Logs: {'eval_loss': 10.175233840942383, 'eval_accuracy': 0.0012195121951219512, 'eval_runtime': 230.2791, 'eval_samples_per_second': 7.122, 'eval_steps_per_second': 1.78, 'epoch': 2.6016260162601625}
{'eval_loss': 10.175233840942383, 'eval_accuracy': 0.0012195121951219512, 'eval_runtime': 230.2791, 'eval_samples_per_second': 7.122, 'eval_steps_per_second': 1.78, 'epoch': 2.6}


 27%|██▋       | 610/2300 [5:53:15<16:31:07, 35.19s/it] 

Step 610: Training Loss: 9.60050
{'loss': 9.6005, 'grad_norm': 2.3856747150421143, 'learning_rate': 3.025e-05, 'epoch': 2.64}


 27%|██▋       | 620/2300 [5:58:32<14:49:43, 31.78s/it]

Step 620: Training Loss: 9.60750
{'loss': 9.6075, 'grad_norm': 2.5680501461029053, 'learning_rate': 3.075e-05, 'epoch': 2.69}


 27%|██▋       | 630/2300 [6:03:54<14:53:50, 32.11s/it]

Step 630: Training Loss: 9.60480
{'loss': 9.6048, 'grad_norm': 2.422698974609375, 'learning_rate': 3.125e-05, 'epoch': 2.73}


 28%|██▊       | 640/2300 [6:09:14<14:45:44, 32.01s/it]

Step 640: Training Loss: 9.62080
{'loss': 9.6208, 'grad_norm': 2.499603509902954, 'learning_rate': 3.175e-05, 'epoch': 2.78}


 28%|██▊       | 650/2300 [6:14:38<14:50:30, 32.38s/it]

Step 650: Training Loss: 9.59470
{'loss': 9.5947, 'grad_norm': 2.502742290496826, 'learning_rate': 3.2250000000000005e-05, 'epoch': 2.82}


 29%|██▊       | 660/2300 [6:20:02<14:44:36, 32.36s/it]

Step 660: Training Loss: 9.61970
{'loss': 9.6197, 'grad_norm': 2.6502513885498047, 'learning_rate': 3.275e-05, 'epoch': 2.86}


 29%|██▉       | 670/2300 [6:25:22<14:28:22, 31.96s/it]

Step 670: Training Loss: 9.58690
{'loss': 9.5869, 'grad_norm': 2.9868552684783936, 'learning_rate': 3.325e-05, 'epoch': 2.91}


 30%|██▉       | 680/2300 [6:30:43<14:28:19, 32.16s/it]

Step 680: Training Loss: 9.58920
{'loss': 9.5892, 'grad_norm': 2.483806848526001, 'learning_rate': 3.375000000000001e-05, 'epoch': 2.95}


 30%|███       | 690/2300 [6:36:06<14:27:23, 32.33s/it]

Step 690: Training Loss: 9.58730
{'loss': 9.5873, 'grad_norm': 2.5160913467407227, 'learning_rate': 3.4250000000000006e-05, 'epoch': 2.99}


 30%|███       | 700/2300 [6:41:30<14:23:05, 32.37s/it]

Step 700: Training Loss: 9.46220
{'loss': 9.4622, 'grad_norm': 3.497659683227539, 'learning_rate': 3.475e-05, 'epoch': 3.04}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 700: Logs: {'eval_loss': 10.436132431030273, 'eval_accuracy': 0.0024390243902439024, 'eval_runtime': 231.4329, 'eval_samples_per_second': 7.086, 'eval_steps_per_second': 1.772, 'epoch': 3.035230352303523}
{'eval_loss': 10.436132431030273, 'eval_accuracy': 0.0024390243902439024, 'eval_runtime': 231.4329, 'eval_samples_per_second': 7.086, 'eval_steps_per_second': 1.772, 'epoch': 3.04}


 31%|███       | 710/2300 [6:50:45<15:32:08, 35.18s/it] 

Step 710: Training Loss: 9.44510
{'loss': 9.4451, 'grad_norm': 2.8888819217681885, 'learning_rate': 3.525e-05, 'epoch': 3.08}


 31%|███▏      | 720/2300 [6:56:07<14:08:03, 32.20s/it]

Step 720: Training Loss: 9.40820
{'loss': 9.4082, 'grad_norm': 3.4610726833343506, 'learning_rate': 3.575e-05, 'epoch': 3.12}


 32%|███▏      | 730/2300 [7:01:29<14:02:32, 32.20s/it]

Step 730: Training Loss: 9.43490
{'loss': 9.4349, 'grad_norm': 3.210242509841919, 'learning_rate': 3.625e-05, 'epoch': 3.17}


 32%|███▏      | 740/2300 [7:06:53<14:02:00, 32.39s/it]

Step 740: Training Loss: 9.38940
{'loss': 9.3894, 'grad_norm': 3.515918731689453, 'learning_rate': 3.675e-05, 'epoch': 3.21}


 33%|███▎      | 750/2300 [7:12:16<13:54:04, 32.29s/it]

Step 750: Training Loss: 9.42790
{'loss': 9.4279, 'grad_norm': 3.6226019859313965, 'learning_rate': 3.7250000000000004e-05, 'epoch': 3.25}


 33%|███▎      | 760/2300 [7:17:38<13:46:16, 32.19s/it]

Step 760: Training Loss: 9.42020
{'loss': 9.4202, 'grad_norm': 3.5050997734069824, 'learning_rate': 3.775e-05, 'epoch': 3.3}


 33%|███▎      | 770/2300 [7:22:58<13:34:03, 31.92s/it]

Step 770: Training Loss: 9.43690
{'loss': 9.4369, 'grad_norm': 3.2615957260131836, 'learning_rate': 3.825e-05, 'epoch': 3.34}


 34%|███▍      | 780/2300 [7:28:15<13:22:06, 31.66s/it]

Step 780: Training Loss: 9.41820
{'loss': 9.4182, 'grad_norm': 3.351378917694092, 'learning_rate': 3.875e-05, 'epoch': 3.38}


 34%|███▍      | 790/2300 [7:33:38<13:33:48, 32.34s/it]

Step 790: Training Loss: 9.45410
{'loss': 9.4541, 'grad_norm': 3.1986234188079834, 'learning_rate': 3.9250000000000005e-05, 'epoch': 3.43}


 35%|███▍      | 800/2300 [7:39:02<13:29:20, 32.37s/it]

Step 800: Training Loss: 9.44960
{'loss': 9.4496, 'grad_norm': 3.1914992332458496, 'learning_rate': 3.9750000000000004e-05, 'epoch': 3.47}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 800: Logs: {'eval_loss': 10.582066535949707, 'eval_accuracy': 0.00426829268292683, 'eval_runtime': 229.4548, 'eval_samples_per_second': 7.147, 'eval_steps_per_second': 1.787, 'epoch': 3.4688346883468837}
{'eval_loss': 10.582066535949707, 'eval_accuracy': 0.00426829268292683, 'eval_runtime': 229.4548, 'eval_samples_per_second': 7.147, 'eval_steps_per_second': 1.787, 'epoch': 3.47}


 35%|███▌      | 810/2300 [7:48:11<14:21:24, 34.69s/it] 

Step 810: Training Loss: 9.40370
{'loss': 9.4037, 'grad_norm': 3.5021612644195557, 'learning_rate': 4.025e-05, 'epoch': 3.51}


 36%|███▌      | 820/2300 [7:53:35<13:21:16, 32.48s/it]

Step 820: Training Loss: 9.41190
{'loss': 9.4119, 'grad_norm': 3.39107608795166, 'learning_rate': 4.075e-05, 'epoch': 3.56}


 36%|███▌      | 830/2300 [7:58:58<13:12:47, 32.36s/it]

Step 830: Training Loss: 9.40580
{'loss': 9.4058, 'grad_norm': 3.365405321121216, 'learning_rate': 4.125e-05, 'epoch': 3.6}


 37%|███▋      | 840/2300 [8:04:21<13:05:00, 32.26s/it]

Step 840: Training Loss: 9.46830
{'loss': 9.4683, 'grad_norm': 3.3252689838409424, 'learning_rate': 4.175e-05, 'epoch': 3.64}


 37%|███▋      | 850/2300 [8:09:44<13:00:25, 32.29s/it]

Step 850: Training Loss: 9.39680
{'loss': 9.3968, 'grad_norm': 3.603492021560669, 'learning_rate': 4.2250000000000004e-05, 'epoch': 3.69}


 37%|███▋      | 860/2300 [8:15:06<12:53:26, 32.23s/it]

Step 860: Training Loss: 9.40520
{'loss': 9.4052, 'grad_norm': 3.4287331104278564, 'learning_rate': 4.275e-05, 'epoch': 3.73}


 38%|███▊      | 870/2300 [8:20:28<12:46:58, 32.18s/it]

Step 870: Training Loss: 9.40590
{'loss': 9.4059, 'grad_norm': 3.5004770755767822, 'learning_rate': 4.325e-05, 'epoch': 3.77}


 38%|███▊      | 880/2300 [8:25:51<12:43:54, 32.28s/it]

Step 880: Training Loss: 9.42380
{'loss': 9.4238, 'grad_norm': 3.510524272918701, 'learning_rate': 4.375e-05, 'epoch': 3.82}


 39%|███▊      | 890/2300 [8:31:13<12:36:01, 32.17s/it]

Step 890: Training Loss: 9.46030
{'loss': 9.4603, 'grad_norm': 3.486476421356201, 'learning_rate': 4.4250000000000005e-05, 'epoch': 3.86}


 39%|███▉      | 900/2300 [8:36:36<12:36:05, 32.40s/it]

Step 900: Training Loss: 9.43730
{'loss': 9.4373, 'grad_norm': 3.4445159435272217, 'learning_rate': 4.4750000000000004e-05, 'epoch': 3.9}


                                                       
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 900: Logs: {'eval_loss': 10.729153633117676, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.0308, 'eval_samples_per_second': 7.099, 'eval_steps_per_second': 1.775, 'epoch': 3.902439024390244}
{'eval_loss': 10.729153633117676, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.0308, 'eval_samples_per_second': 7.099, 'eval_steps_per_second': 1.775, 'epoch': 3.9}


 40%|███▉      | 910/2300 [8:45:51<13:34:33, 35.16s/it] 

Step 910: Training Loss: 9.41150
{'loss': 9.4115, 'grad_norm': 3.3804571628570557, 'learning_rate': 4.525e-05, 'epoch': 3.95}


 40%|████      | 920/2300 [8:51:13<12:22:55, 32.30s/it]

Step 920: Training Loss: 9.42240
{'loss': 9.4224, 'grad_norm': 3.329535961151123, 'learning_rate': 4.575e-05, 'epoch': 3.99}


 40%|████      | 930/2300 [8:56:34<12:14:39, 32.18s/it]

Step 930: Training Loss: 9.18980
{'loss': 9.1898, 'grad_norm': 3.8495264053344727, 'learning_rate': 4.6250000000000006e-05, 'epoch': 4.03}


 41%|████      | 940/2300 [9:01:52<11:58:42, 31.71s/it]

Step 940: Training Loss: 9.03100
{'loss': 9.031, 'grad_norm': 4.345518112182617, 'learning_rate': 4.6750000000000005e-05, 'epoch': 4.08}


 41%|████▏     | 950/2300 [9:07:12<12:00:42, 32.03s/it]

Step 950: Training Loss: 9.00580
{'loss': 9.0058, 'grad_norm': 4.004621505737305, 'learning_rate': 4.7249999999999997e-05, 'epoch': 4.12}


 42%|████▏     | 960/2300 [9:12:36<12:03:39, 32.40s/it]

Step 960: Training Loss: 9.07890
{'loss': 9.0789, 'grad_norm': 3.9499216079711914, 'learning_rate': 4.775e-05, 'epoch': 4.16}


 42%|████▏     | 970/2300 [9:18:00<11:58:08, 32.40s/it]

Step 970: Training Loss: 8.99060
{'loss': 8.9906, 'grad_norm': 4.125730991363525, 'learning_rate': 4.825e-05, 'epoch': 4.21}


 43%|████▎     | 980/2300 [9:23:23<11:48:20, 32.20s/it]

Step 980: Training Loss: 9.04200
{'loss': 9.042, 'grad_norm': 4.5427937507629395, 'learning_rate': 4.875e-05, 'epoch': 4.25}


 43%|████▎     | 990/2300 [9:28:47<11:47:30, 32.40s/it]

Step 990: Training Loss: 9.01620
{'loss': 9.0162, 'grad_norm': 3.936816453933716, 'learning_rate': 4.9250000000000004e-05, 'epoch': 4.29}


 43%|████▎     | 1000/2300 [9:34:07<11:32:02, 31.94s/it]

Step 1000: Training Loss: 9.06900
{'loss': 9.069, 'grad_norm': 3.9106459617614746, 'learning_rate': 4.975e-05, 'epoch': 4.34}


                                                        
 43%|████▎     | 1000/2300 [9:37:58<11:32:02, 31.94s/it]

Step 1000: Logs: {'eval_loss': 10.494622230529785, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.5831, 'eval_samples_per_second': 7.082, 'eval_steps_per_second': 1.77, 'epoch': 4.336043360433604}
{'eval_loss': 10.494622230529785, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.5831, 'eval_samples_per_second': 7.082, 'eval_steps_per_second': 1.77, 'epoch': 4.34}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 44%|████▍     | 1010/2300 [9:43:24<12:33:11, 35.03s/it] 

Step 1010: Training Loss: 9.05650
{'loss': 9.0565, 'grad_norm': 3.9066388607025146, 'learning_rate': 4.980769230769231e-05, 'epoch': 4.38}


 44%|████▍     | 1020/2300 [9:48:48<11:32:56, 32.48s/it]

Step 1020: Training Loss: 9.08170
{'loss': 9.0817, 'grad_norm': 3.970468282699585, 'learning_rate': 4.942307692307693e-05, 'epoch': 4.42}


 45%|████▍     | 1030/2300 [9:54:10<11:22:10, 32.23s/it]

Step 1030: Training Loss: 9.03220
{'loss': 9.0322, 'grad_norm': 4.031592845916748, 'learning_rate': 4.9038461538461536e-05, 'epoch': 4.47}


 45%|████▌     | 1040/2300 [9:59:29<11:07:01, 31.76s/it]

Step 1040: Training Loss: 9.09120
{'loss': 9.0912, 'grad_norm': 4.402336120605469, 'learning_rate': 4.865384615384616e-05, 'epoch': 4.51}


 46%|████▌     | 1050/2300 [10:04:52<11:12:50, 32.30s/it]

Step 1050: Training Loss: 9.10330
{'loss': 9.1033, 'grad_norm': 4.065919876098633, 'learning_rate': 4.826923076923077e-05, 'epoch': 4.55}


 46%|████▌     | 1060/2300 [10:10:11<10:57:10, 31.80s/it]

Step 1060: Training Loss: 9.06470
{'loss': 9.0647, 'grad_norm': 4.289345741271973, 'learning_rate': 4.788461538461539e-05, 'epoch': 4.6}


 47%|████▋     | 1070/2300 [10:15:34<11:03:48, 32.38s/it]

Step 1070: Training Loss: 9.08800
{'loss': 9.088, 'grad_norm': 4.19645357131958, 'learning_rate': 4.75e-05, 'epoch': 4.64}


 47%|████▋     | 1080/2300 [10:20:58<10:59:19, 32.43s/it]

Step 1080: Training Loss: 9.11310
{'loss': 9.1131, 'grad_norm': 4.085386753082275, 'learning_rate': 4.711538461538462e-05, 'epoch': 4.68}


 47%|████▋     | 1090/2300 [10:26:22<10:51:05, 32.29s/it]

Step 1090: Training Loss: 9.10970
{'loss': 9.1097, 'grad_norm': 4.051997184753418, 'learning_rate': 4.673076923076923e-05, 'epoch': 4.73}


 48%|████▊     | 1100/2300 [10:31:44<10:44:38, 32.23s/it]

Step 1100: Training Loss: 9.15270
{'loss': 9.1527, 'grad_norm': 4.127105712890625, 'learning_rate': 4.634615384615385e-05, 'epoch': 4.77}


                                                         
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1100: Logs: {'eval_loss': 10.698600769042969, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.5712, 'eval_samples_per_second': 7.082, 'eval_steps_per_second': 1.771, 'epoch': 4.769647696476965}
{'eval_loss': 10.698600769042969, 'eval_accuracy': 0.003048780487804878, 'eval_runtime': 231.5712, 'eval_samples_per_second': 7.082, 'eval_steps_per_second': 1.771, 'epoch': 4.77}


 48%|████▊     | 1110/2300 [10:41:01<11:37:33, 35.17s/it] 

Step 1110: Training Loss: 9.09840
{'loss': 9.0984, 'grad_norm': 4.369615077972412, 'learning_rate': 4.596153846153846e-05, 'epoch': 4.81}


 49%|████▊     | 1120/2300 [10:46:25<10:37:15, 32.40s/it]

Step 1120: Training Loss: 9.13960
{'loss': 9.1396, 'grad_norm': 3.961667060852051, 'learning_rate': 4.557692307692308e-05, 'epoch': 4.86}


 49%|████▉     | 1130/2300 [10:51:45<10:24:16, 32.01s/it]

Step 1130: Training Loss: 9.13430
{'loss': 9.1343, 'grad_norm': 4.21079158782959, 'learning_rate': 4.519230769230769e-05, 'epoch': 4.9}


 50%|████▉     | 1140/2300 [10:57:09<10:27:06, 32.44s/it]

Step 1140: Training Loss: 9.08780
{'loss': 9.0878, 'grad_norm': 4.139468669891357, 'learning_rate': 4.4807692307692314e-05, 'epoch': 4.94}


 50%|█████     | 1150/2300 [11:02:34<10:20:59, 32.40s/it]

Step 1150: Training Loss: 9.12630
{'loss': 9.1263, 'grad_norm': 3.9276645183563232, 'learning_rate': 4.442307692307692e-05, 'epoch': 4.99}


 50%|█████     | 1160/2300 [11:07:57<10:15:42, 32.41s/it]

Step 1160: Training Loss: 8.71490
{'loss': 8.7149, 'grad_norm': 4.203327178955078, 'learning_rate': 4.403846153846154e-05, 'epoch': 5.03}


 51%|█████     | 1170/2300 [11:13:22<10:11:14, 32.46s/it]

Step 1170: Training Loss: 8.57940
{'loss': 8.5794, 'grad_norm': 4.301102638244629, 'learning_rate': 4.365384615384616e-05, 'epoch': 5.07}


 51%|█████▏    | 1180/2300 [11:18:46<10:04:54, 32.41s/it]

Step 1180: Training Loss: 8.58110
{'loss': 8.5811, 'grad_norm': 4.358078479766846, 'learning_rate': 4.326923076923077e-05, 'epoch': 5.12}


 52%|█████▏    | 1190/2300 [11:24:09<9:58:04, 32.33s/it] 

Step 1190: Training Loss: 8.57250
{'loss': 8.5725, 'grad_norm': 4.31022834777832, 'learning_rate': 4.288461538461538e-05, 'epoch': 5.16}


 52%|█████▏    | 1200/2300 [11:29:31<9:50:15, 32.20s/it]

Step 1200: Training Loss: 8.61890
{'loss': 8.6189, 'grad_norm': 4.638575553894043, 'learning_rate': 4.25e-05, 'epoch': 5.2}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1200: Logs: {'eval_loss': 10.579862594604492, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 232.3565, 'eval_samples_per_second': 7.058, 'eval_steps_per_second': 1.765, 'epoch': 5.203252032520325}
{'eval_loss': 10.579862594604492, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 232.3565, 'eval_samples_per_second': 7.058, 'eval_steps_per_second': 1.765, 'epoch': 5.2}


 53%|█████▎    | 1210/2300 [11:41:18<16:13:26, 53.58s/it] 

Step 1210: Training Loss: 8.58710
{'loss': 8.5871, 'grad_norm': 4.3317742347717285, 'learning_rate': 4.211538461538462e-05, 'epoch': 5.25}


 53%|█████▎    | 1220/2300 [11:49:03<14:37:36, 48.76s/it]

Step 1220: Training Loss: 8.58110
{'loss': 8.5811, 'grad_norm': 4.127177715301514, 'learning_rate': 4.173076923076923e-05, 'epoch': 5.29}


 53%|█████▎    | 1230/2300 [11:56:51<13:45:48, 46.31s/it]

Step 1230: Training Loss: 8.57210
{'loss': 8.5721, 'grad_norm': 4.335701942443848, 'learning_rate': 4.134615384615385e-05, 'epoch': 5.33}


 54%|█████▍    | 1240/2300 [12:04:27<13:55:44, 47.31s/it]

Step 1240: Training Loss: 8.60400
{'loss': 8.604, 'grad_norm': 4.211338043212891, 'learning_rate': 4.096153846153846e-05, 'epoch': 5.38}


 54%|█████▍    | 1250/2300 [12:12:41<13:50:31, 47.46s/it]

Step 1250: Training Loss: 8.60250
{'loss': 8.6025, 'grad_norm': 4.171606540679932, 'learning_rate': 4.057692307692308e-05, 'epoch': 5.42}


 55%|█████▍    | 1260/2300 [12:20:21<13:19:14, 46.11s/it]

Step 1260: Training Loss: 8.54370
{'loss': 8.5437, 'grad_norm': 4.740675926208496, 'learning_rate': 4.019230769230769e-05, 'epoch': 5.46}


 55%|█████▌    | 1270/2300 [12:27:39<12:44:38, 44.54s/it]

Step 1270: Training Loss: 8.53530
{'loss': 8.5353, 'grad_norm': 4.32464075088501, 'learning_rate': 3.980769230769231e-05, 'epoch': 5.51}


 56%|█████▌    | 1280/2300 [12:34:39<12:44:05, 44.95s/it]

Step 1280: Training Loss: 8.62120
{'loss': 8.6212, 'grad_norm': 4.446717739105225, 'learning_rate': 3.942307692307692e-05, 'epoch': 5.55}


 56%|█████▌    | 1290/2300 [12:41:50<11:08:16, 39.70s/it]

Step 1290: Training Loss: 8.66370
{'loss': 8.6637, 'grad_norm': 4.435184478759766, 'learning_rate': 3.903846153846154e-05, 'epoch': 5.59}


 57%|█████▋    | 1300/2300 [12:47:57<10:12:07, 36.73s/it]

Step 1300: Training Loss: 8.67050
{'loss': 8.6705, 'grad_norm': 4.428670406341553, 'learning_rate': 3.865384615384616e-05, 'epoch': 5.64}


                                                         
 57%|█████▋    | 1300/2300 [12:51:53<10:12:07, 36.73s/it]

Step 1300: Logs: {'eval_loss': 10.675354957580566, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 236.3771, 'eval_samples_per_second': 6.938, 'eval_steps_per_second': 1.735, 'epoch': 5.636856368563686}
{'eval_loss': 10.675354957580566, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 236.3771, 'eval_samples_per_second': 6.938, 'eval_steps_per_second': 1.735, 'epoch': 5.64}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 57%|█████▋    | 1310/2300 [12:59:45<13:35:34, 49.43s/it] 

Step 1310: Training Loss: 8.59050
{'loss': 8.5905, 'grad_norm': 4.450871467590332, 'learning_rate': 3.826923076923077e-05, 'epoch': 5.68}


 57%|█████▋    | 1320/2300 [13:06:35<10:32:19, 38.71s/it]

Step 1320: Training Loss: 8.65170
{'loss': 8.6517, 'grad_norm': 4.248036861419678, 'learning_rate': 3.788461538461538e-05, 'epoch': 5.72}


 58%|█████▊    | 1330/2300 [13:12:59<10:26:50, 38.77s/it]

Step 1330: Training Loss: 8.65600
{'loss': 8.656, 'grad_norm': 4.490577697753906, 'learning_rate': 3.7500000000000003e-05, 'epoch': 5.77}


 58%|█████▊    | 1340/2300 [13:19:41<10:31:31, 39.47s/it]

Step 1340: Training Loss: 8.58640
{'loss': 8.5864, 'grad_norm': 4.687926769256592, 'learning_rate': 3.711538461538462e-05, 'epoch': 5.81}


 59%|█████▊    | 1350/2300 [13:26:22<10:22:56, 39.34s/it]

Step 1350: Training Loss: 8.64010
{'loss': 8.6401, 'grad_norm': 4.44835090637207, 'learning_rate': 3.673076923076923e-05, 'epoch': 5.85}


 59%|█████▉    | 1360/2300 [13:34:05<11:35:40, 44.41s/it]

Step 1360: Training Loss: 8.72850
{'loss': 8.7285, 'grad_norm': 4.448334217071533, 'learning_rate': 3.634615384615385e-05, 'epoch': 5.9}


 60%|█████▉    | 1370/2300 [13:42:33<13:40:23, 52.93s/it]

Step 1370: Training Loss: 8.67370
{'loss': 8.6737, 'grad_norm': 4.254135608673096, 'learning_rate': 3.596153846153846e-05, 'epoch': 5.94}


 60%|██████    | 1380/2300 [13:50:37<11:38:26, 45.55s/it]

Step 1380: Training Loss: 8.66160
{'loss': 8.6616, 'grad_norm': 4.398617744445801, 'learning_rate': 3.557692307692308e-05, 'epoch': 5.98}


 60%|██████    | 1390/2300 [13:58:41<12:41:37, 50.22s/it]

Step 1390: Training Loss: 8.36740
{'loss': 8.3674, 'grad_norm': 4.758970260620117, 'learning_rate': 3.51923076923077e-05, 'epoch': 6.03}


 61%|██████    | 1400/2300 [14:07:34<13:17:13, 53.15s/it]

Step 1400: Training Loss: 8.18240
{'loss': 8.1824, 'grad_norm': 4.569128513336182, 'learning_rate': 3.480769230769231e-05, 'epoch': 6.07}


                                                         
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1400: Logs: {'eval_loss': 10.655547142028809, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 358.0756, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 1.145, 'epoch': 6.070460704607046}
{'eval_loss': 10.655547142028809, 'eval_accuracy': 0.004878048780487805, 'eval_runtime': 358.0756, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 1.145, 'epoch': 6.07}


 61%|██████▏   | 1410/2300 [14:22:31<13:15:32, 53.63s/it] 

Step 1410: Training Loss: 8.16710
{'loss': 8.1671, 'grad_norm': 4.369695663452148, 'learning_rate': 3.442307692307692e-05, 'epoch': 6.11}


 62%|██████▏   | 1420/2300 [14:30:51<11:26:04, 46.78s/it]

Step 1420: Training Loss: 8.18550
{'loss': 8.1855, 'grad_norm': 4.397416114807129, 'learning_rate': 3.4038461538461544e-05, 'epoch': 6.16}


 62%|██████▏   | 1430/2300 [14:39:38<13:52:10, 57.39s/it]

Step 1430: Training Loss: 8.16960
{'loss': 8.1696, 'grad_norm': 4.275537490844727, 'learning_rate': 3.365384615384616e-05, 'epoch': 6.2}


 63%|██████▎   | 1440/2300 [14:49:18<12:37:53, 52.88s/it]

Step 1440: Training Loss: 8.15030
{'loss': 8.1503, 'grad_norm': 4.299221992492676, 'learning_rate': 3.326923076923077e-05, 'epoch': 6.24}


 63%|██████▎   | 1450/2300 [14:57:31<11:33:19, 48.94s/it]

Step 1450: Training Loss: 8.18340
{'loss': 8.1834, 'grad_norm': 4.587093353271484, 'learning_rate': 3.288461538461539e-05, 'epoch': 6.29}


 63%|██████▎   | 1460/2300 [15:05:56<11:45:47, 50.41s/it]

Step 1460: Training Loss: 8.19390
{'loss': 8.1939, 'grad_norm': 4.405888080596924, 'learning_rate': 3.2500000000000004e-05, 'epoch': 6.33}


 64%|██████▍   | 1470/2300 [15:12:32<8:06:00, 35.13s/it] 

Step 1470: Training Loss: 8.18510
{'loss': 8.1851, 'grad_norm': 4.470627784729004, 'learning_rate': 3.211538461538462e-05, 'epoch': 6.37}


 64%|██████▍   | 1480/2300 [15:18:37<8:55:27, 39.18s/it]

Step 1480: Training Loss: 8.20840
{'loss': 8.2084, 'grad_norm': 4.485754013061523, 'learning_rate': 3.1730769230769234e-05, 'epoch': 6.42}


 65%|██████▍   | 1490/2300 [15:25:00<7:56:07, 35.27s/it]

Step 1490: Training Loss: 8.18790
{'loss': 8.1879, 'grad_norm': 4.409832954406738, 'learning_rate': 3.134615384615385e-05, 'epoch': 6.46}


 65%|██████▌   | 1500/2300 [15:30:36<7:25:41, 33.43s/it]

Step 1500: Training Loss: 8.17030
{'loss': 8.1703, 'grad_norm': 4.434277057647705, 'learning_rate': 3.0961538461538464e-05, 'epoch': 6.5}


                                                        
 65%|██████▌   | 1500/2300 [15:34:10<7:25:41, 33.43s/it]

Step 1500: Logs: {'eval_loss': 10.670845985412598, 'eval_accuracy': 0.006707317073170732, 'eval_runtime': 213.1222, 'eval_samples_per_second': 7.695, 'eval_steps_per_second': 1.924, 'epoch': 6.504065040650406}
{'eval_loss': 10.670845985412598, 'eval_accuracy': 0.006707317073170732, 'eval_runtime': 213.1222, 'eval_samples_per_second': 7.695, 'eval_steps_per_second': 1.924, 'epoch': 6.5}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 66%|██████▌   | 1510/2300 [15:39:48<7:53:29, 35.96s/it] 

Step 1510: Training Loss: 8.18430
{'loss': 8.1843, 'grad_norm': 4.368095397949219, 'learning_rate': 3.057692307692308e-05, 'epoch': 6.55}


 66%|██████▌   | 1520/2300 [15:45:21<7:13:05, 33.31s/it]

Step 1520: Training Loss: 8.21620
{'loss': 8.2162, 'grad_norm': 4.4139323234558105, 'learning_rate': 3.0192307692307693e-05, 'epoch': 6.59}


 67%|██████▋   | 1530/2300 [15:50:57<7:14:48, 33.88s/it]

Step 1530: Training Loss: 8.21950
{'loss': 8.2195, 'grad_norm': 4.507367134094238, 'learning_rate': 2.9807692307692308e-05, 'epoch': 6.63}


 67%|██████▋   | 1540/2300 [15:56:36<7:07:00, 33.71s/it]

Step 1540: Training Loss: 8.24510
{'loss': 8.2451, 'grad_norm': 5.0480170249938965, 'learning_rate': 2.9423076923076926e-05, 'epoch': 6.68}


 67%|██████▋   | 1550/2300 [16:02:19<7:01:26, 33.72s/it]

Step 1550: Training Loss: 8.17610
{'loss': 8.1761, 'grad_norm': 4.495598793029785, 'learning_rate': 2.903846153846154e-05, 'epoch': 6.72}


 68%|██████▊   | 1560/2300 [16:07:55<6:52:25, 33.44s/it]

Step 1560: Training Loss: 8.23510
{'loss': 8.2351, 'grad_norm': 5.350032329559326, 'learning_rate': 2.8653846153846153e-05, 'epoch': 6.76}


 68%|██████▊   | 1570/2300 [16:13:40<6:58:00, 34.36s/it]

Step 1570: Training Loss: 8.18960
{'loss': 8.1896, 'grad_norm': 4.461862087249756, 'learning_rate': 2.826923076923077e-05, 'epoch': 6.81}


 69%|██████▊   | 1580/2300 [16:19:19<6:42:16, 33.52s/it]

Step 1580: Training Loss: 8.20610
{'loss': 8.2061, 'grad_norm': 4.7515997886657715, 'learning_rate': 2.7884615384615386e-05, 'epoch': 6.85}


 69%|██████▉   | 1590/2300 [16:24:53<6:35:08, 33.39s/it]

Step 1590: Training Loss: 8.21700
{'loss': 8.217, 'grad_norm': 4.478387832641602, 'learning_rate': 2.7500000000000004e-05, 'epoch': 6.89}


 70%|██████▉   | 1600/2300 [16:30:35<6:40:58, 34.37s/it]

Step 1600: Training Loss: 8.29170
{'loss': 8.2917, 'grad_norm': 4.826689720153809, 'learning_rate': 2.7115384615384616e-05, 'epoch': 6.94}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1600: Logs: {'eval_loss': 10.659586906433105, 'eval_accuracy': 0.006097560975609756, 'eval_runtime': 220.7533, 'eval_samples_per_second': 7.429, 'eval_steps_per_second': 1.857, 'epoch': 6.937669376693767}
{'eval_loss': 10.659586906433105, 'eval_accuracy': 0.006097560975609756, 'eval_runtime': 220.7533, 'eval_samples_per_second': 7.429, 'eval_steps_per_second': 1.857, 'epoch': 6.94}


 70%|███████   | 1610/2300 [16:40:09<7:15:24, 37.86s/it]  

Step 1610: Training Loss: 8.21700
{'loss': 8.217, 'grad_norm': 4.524073600769043, 'learning_rate': 2.673076923076923e-05, 'epoch': 6.98}


 70%|███████   | 1620/2300 [16:45:53<6:35:59, 34.94s/it]

Step 1620: Training Loss: 7.99920
{'loss': 7.9992, 'grad_norm': 4.549350261688232, 'learning_rate': 2.634615384615385e-05, 'epoch': 7.02}


 71%|███████   | 1630/2300 [16:51:42<6:39:50, 35.81s/it]

Step 1630: Training Loss: 7.74420
{'loss': 7.7442, 'grad_norm': 4.6656413078308105, 'learning_rate': 2.5961538461538464e-05, 'epoch': 7.07}


 71%|███████▏  | 1640/2300 [16:58:37<7:18:11, 39.83s/it]

Step 1640: Training Loss: 7.80450
{'loss': 7.8045, 'grad_norm': 4.529623031616211, 'learning_rate': 2.5576923076923075e-05, 'epoch': 7.11}


 72%|███████▏  | 1650/2300 [17:05:34<7:18:49, 40.51s/it]

Step 1650: Training Loss: 7.90060
{'loss': 7.9006, 'grad_norm': 4.547241687774658, 'learning_rate': 2.5192307692307694e-05, 'epoch': 7.15}


 72%|███████▏  | 1660/2300 [17:12:06<6:55:05, 38.91s/it]

Step 1660: Training Loss: 7.78310
{'loss': 7.7831, 'grad_norm': 4.42181396484375, 'learning_rate': 2.480769230769231e-05, 'epoch': 7.2}


 73%|███████▎  | 1670/2300 [17:18:37<6:47:54, 38.85s/it]

Step 1670: Training Loss: 7.82910
{'loss': 7.8291, 'grad_norm': 4.651551246643066, 'learning_rate': 2.4423076923076924e-05, 'epoch': 7.24}


 73%|███████▎  | 1680/2300 [17:25:06<6:40:15, 38.74s/it]

Step 1680: Training Loss: 7.82760
{'loss': 7.8276, 'grad_norm': 4.529719352722168, 'learning_rate': 2.4038461538461542e-05, 'epoch': 7.28}


 73%|███████▎  | 1690/2300 [17:32:08<7:35:49, 44.83s/it]

Step 1690: Training Loss: 7.88170
{'loss': 7.8817, 'grad_norm': 4.561180114746094, 'learning_rate': 2.3653846153846153e-05, 'epoch': 7.33}


 74%|███████▍  | 1700/2300 [17:39:10<7:04:07, 42.41s/it]

Step 1700: Training Loss: 7.82850
{'loss': 7.8285, 'grad_norm': 4.680431365966797, 'learning_rate': 2.326923076923077e-05, 'epoch': 7.37}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1700: Logs: {'eval_loss': 10.634012222290039, 'eval_accuracy': 0.007317073170731708, 'eval_runtime': 273.0728, 'eval_samples_per_second': 6.006, 'eval_steps_per_second': 1.501, 'epoch': 7.3712737127371275}
{'eval_loss': 10.634012222290039, 'eval_accuracy': 0.007317073170731708, 'eval_runtime': 273.0728, 'eval_samples_per_second': 6.006, 'eval_steps_per_second': 1.501, 'epoch': 7.37}


 74%|███████▍  | 1710/2300 [17:51:33<9:12:00, 56.14s/it]  

Step 1710: Training Loss: 7.89960
{'loss': 7.8996, 'grad_norm': 4.593605995178223, 'learning_rate': 2.2884615384615387e-05, 'epoch': 7.41}


 75%|███████▍  | 1720/2300 [18:02:05<9:42:46, 60.29s/it] 

Step 1720: Training Loss: 7.78200
{'loss': 7.782, 'grad_norm': 4.617899417877197, 'learning_rate': 2.25e-05, 'epoch': 7.46}


 75%|███████▌  | 1730/2300 [18:11:59<8:37:45, 54.50s/it] 

Step 1730: Training Loss: 7.82730
{'loss': 7.8273, 'grad_norm': 4.774308204650879, 'learning_rate': 2.2115384615384616e-05, 'epoch': 7.5}


 76%|███████▌  | 1740/2300 [18:19:35<7:03:43, 45.40s/it]

Step 1740: Training Loss: 7.84320
{'loss': 7.8432, 'grad_norm': 4.508965969085693, 'learning_rate': 2.173076923076923e-05, 'epoch': 7.54}


 76%|███████▌  | 1750/2300 [18:29:09<9:01:48, 59.11s/it]

Step 1750: Training Loss: 7.84860
{'loss': 7.8486, 'grad_norm': 4.625045299530029, 'learning_rate': 2.1346153846153846e-05, 'epoch': 7.59}


 77%|███████▋  | 1760/2300 [18:38:23<8:11:10, 54.57s/it]

Step 1760: Training Loss: 7.87630
{'loss': 7.8763, 'grad_norm': 4.554872035980225, 'learning_rate': 2.0961538461538464e-05, 'epoch': 7.63}


 77%|███████▋  | 1770/2300 [18:47:38<8:13:06, 55.82s/it]

Step 1770: Training Loss: 7.92830
{'loss': 7.9283, 'grad_norm': 4.489283561706543, 'learning_rate': 2.0576923076923076e-05, 'epoch': 7.67}


 77%|███████▋  | 1780/2300 [18:55:12<7:12:56, 49.95s/it]

Step 1780: Training Loss: 7.91260
{'loss': 7.9126, 'grad_norm': 4.540469169616699, 'learning_rate': 2.0192307692307694e-05, 'epoch': 7.72}


 78%|███████▊  | 1790/2300 [19:04:02<8:56:53, 63.16s/it]

Step 1790: Training Loss: 7.81110
{'loss': 7.8111, 'grad_norm': 4.658694744110107, 'learning_rate': 1.980769230769231e-05, 'epoch': 7.76}


 78%|███████▊  | 1800/2300 [19:16:12<8:12:08, 59.06s/it] 

Step 1800: Training Loss: 7.88150
{'loss': 7.8815, 'grad_norm': 4.49746561050415, 'learning_rate': 1.9423076923076924e-05, 'epoch': 7.8}


                                                        
 78%|███████▊  | 1800/2300 [19:23:46<8:12:08, 59.06s/it]

Step 1800: Logs: {'eval_loss': 10.638997077941895, 'eval_accuracy': 0.009146341463414634, 'eval_runtime': 453.8697, 'eval_samples_per_second': 3.613, 'eval_steps_per_second': 0.903, 'epoch': 7.804878048780488}
{'eval_loss': 10.638997077941895, 'eval_accuracy': 0.009146341463414634, 'eval_runtime': 453.8697, 'eval_samples_per_second': 3.613, 'eval_steps_per_second': 0.903, 'epoch': 7.8}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 79%|███████▊  | 1810/2300 [19:37:45<11:21:28, 83.45s/it] 

Step 1810: Training Loss: 7.92020
{'loss': 7.9202, 'grad_norm': 4.768564224243164, 'learning_rate': 1.903846153846154e-05, 'epoch': 7.85}


 79%|███████▉  | 1820/2300 [19:51:48<10:38:45, 79.84s/it]

Step 1820: Training Loss: 7.89420
{'loss': 7.8942, 'grad_norm': 4.529917240142822, 'learning_rate': 1.8653846153846154e-05, 'epoch': 7.89}


 80%|███████▉  | 1830/2300 [20:01:43<6:40:14, 51.09s/it] 

Step 1830: Training Loss: 7.91440
{'loss': 7.9144, 'grad_norm': 4.581404685974121, 'learning_rate': 1.826923076923077e-05, 'epoch': 7.93}


 80%|████████  | 1840/2300 [20:10:24<6:45:16, 52.86s/it]

Step 1840: Training Loss: 7.77930
{'loss': 7.7793, 'grad_norm': 4.594456195831299, 'learning_rate': 1.7884615384615387e-05, 'epoch': 7.98}


 80%|████████  | 1850/2300 [20:18:03<5:38:47, 45.17s/it]

Step 1850: Training Loss: 7.74600
{'loss': 7.746, 'grad_norm': 4.58023738861084, 'learning_rate': 1.75e-05, 'epoch': 8.02}


 81%|████████  | 1860/2300 [20:25:31<5:27:23, 44.64s/it]

Step 1860: Training Loss: 7.57990
{'loss': 7.5799, 'grad_norm': 4.441186428070068, 'learning_rate': 1.7115384615384617e-05, 'epoch': 8.07}


 81%|████████▏ | 1870/2300 [20:32:59<5:20:30, 44.72s/it]

Step 1870: Training Loss: 7.53520
{'loss': 7.5352, 'grad_norm': 4.542048454284668, 'learning_rate': 1.673076923076923e-05, 'epoch': 8.11}


 82%|████████▏ | 1880/2300 [20:40:15<5:04:24, 43.49s/it]

Step 1880: Training Loss: 7.51440
{'loss': 7.5144, 'grad_norm': 4.5497965812683105, 'learning_rate': 1.6346153846153847e-05, 'epoch': 8.15}


 82%|████████▏ | 1890/2300 [20:47:45<5:07:26, 44.99s/it]

Step 1890: Training Loss: 7.57120
{'loss': 7.5712, 'grad_norm': 4.662736415863037, 'learning_rate': 1.596153846153846e-05, 'epoch': 8.2}


 83%|████████▎ | 1900/2300 [20:54:16<4:15:24, 38.31s/it]

Step 1900: Training Loss: 7.58930
{'loss': 7.5893, 'grad_norm': 4.622790813446045, 'learning_rate': 1.5576923076923076e-05, 'epoch': 8.24}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 1900: Logs: {'eval_loss': 10.647150993347168, 'eval_accuracy': 0.012195121951219513, 'eval_runtime': 236.5096, 'eval_samples_per_second': 6.934, 'eval_steps_per_second': 1.734, 'epoch': 8.238482384823849}
{'eval_loss': 10.647150993347168, 'eval_accuracy': 0.012195121951219513, 'eval_runtime': 236.5096, 'eval_samples_per_second': 6.934, 'eval_steps_per_second': 1.734, 'epoch': 8.24}


 83%|████████▎ | 1910/2300 [21:04:37<4:28:05, 41.24s/it]  

Step 1910: Training Loss: 7.58970
{'loss': 7.5897, 'grad_norm': 4.629359722137451, 'learning_rate': 1.5192307692307691e-05, 'epoch': 8.28}


 83%|████████▎ | 1920/2300 [21:11:03<4:04:20, 38.58s/it]

Step 1920: Training Loss: 7.59750
{'loss': 7.5975, 'grad_norm': 4.5311665534973145, 'learning_rate': 1.4807692307692308e-05, 'epoch': 8.33}


 84%|████████▍ | 1930/2300 [21:17:27<3:57:14, 38.47s/it]

Step 1930: Training Loss: 7.54660
{'loss': 7.5466, 'grad_norm': 4.655707836151123, 'learning_rate': 1.4423076923076923e-05, 'epoch': 8.37}


 84%|████████▍ | 1940/2300 [21:23:51<3:50:01, 38.34s/it]

Step 1940: Training Loss: 7.56720
{'loss': 7.5672, 'grad_norm': 4.685181140899658, 'learning_rate': 1.403846153846154e-05, 'epoch': 8.41}


 85%|████████▍ | 1950/2300 [21:30:14<3:42:52, 38.21s/it]

Step 1950: Training Loss: 7.60060
{'loss': 7.6006, 'grad_norm': 4.731838703155518, 'learning_rate': 1.3653846153846153e-05, 'epoch': 8.46}


 85%|████████▌ | 1960/2300 [21:36:38<3:38:00, 38.47s/it]

Step 1960: Training Loss: 7.54860
{'loss': 7.5486, 'grad_norm': 4.529146194458008, 'learning_rate': 1.3269230769230769e-05, 'epoch': 8.5}


 86%|████████▌ | 1970/2300 [21:43:01<3:30:52, 38.34s/it]

Step 1970: Training Loss: 7.58290
{'loss': 7.5829, 'grad_norm': 4.564640522003174, 'learning_rate': 1.2884615384615384e-05, 'epoch': 8.54}


 86%|████████▌ | 1980/2300 [21:49:27<3:25:34, 38.55s/it]

Step 1980: Training Loss: 7.58200
{'loss': 7.582, 'grad_norm': 4.809405326843262, 'learning_rate': 1.25e-05, 'epoch': 8.59}


 87%|████████▋ | 1990/2300 [21:55:52<3:19:12, 38.56s/it]

Step 1990: Training Loss: 7.62270
{'loss': 7.6227, 'grad_norm': 4.6991496086120605, 'learning_rate': 1.2115384615384615e-05, 'epoch': 8.63}


 87%|████████▋ | 2000/2300 [22:02:09<3:08:21, 37.67s/it]

Step 2000: Training Loss: 7.62600
{'loss': 7.626, 'grad_norm': 4.5552778244018555, 'learning_rate': 1.1730769230769232e-05, 'epoch': 8.67}


                                                        
 87%|████████▋ | 2000/2300 [22:06:05<3:08:21, 37.67s/it]

Step 2000: Logs: {'eval_loss': 10.700302124023438, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 236.2801, 'eval_samples_per_second': 6.941, 'eval_steps_per_second': 1.735, 'epoch': 8.672086720867208}
{'eval_loss': 10.700302124023438, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 236.2801, 'eval_samples_per_second': 6.941, 'eval_steps_per_second': 1.735, 'epoch': 8.67}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 87%|████████▋ | 2010/2300 [22:12:38<3:20:20, 41.45s/it] 

Step 2010: Training Loss: 7.54070
{'loss': 7.5407, 'grad_norm': 4.536120891571045, 'learning_rate': 1.1346153846153847e-05, 'epoch': 8.72}


 88%|████████▊ | 2020/2300 [22:19:01<2:59:29, 38.46s/it]

Step 2020: Training Loss: 7.61450
{'loss': 7.6145, 'grad_norm': 4.55925989151001, 'learning_rate': 1.0961538461538462e-05, 'epoch': 8.76}


 88%|████████▊ | 2030/2300 [22:25:26<2:53:19, 38.52s/it]

Step 2030: Training Loss: 7.61990
{'loss': 7.6199, 'grad_norm': 4.681885719299316, 'learning_rate': 1.0576923076923077e-05, 'epoch': 8.8}


 89%|████████▊ | 2040/2300 [22:31:50<2:46:28, 38.42s/it]

Step 2040: Training Loss: 7.62020
{'loss': 7.6202, 'grad_norm': 4.6192121505737305, 'learning_rate': 1.0192307692307693e-05, 'epoch': 8.85}


 89%|████████▉ | 2050/2300 [22:38:16<2:40:30, 38.52s/it]

Step 2050: Training Loss: 7.64190
{'loss': 7.6419, 'grad_norm': 4.536502361297607, 'learning_rate': 9.807692307692308e-06, 'epoch': 8.89}


 90%|████████▉ | 2060/2300 [22:44:44<2:36:31, 39.13s/it]

Step 2060: Training Loss: 7.52430
{'loss': 7.5243, 'grad_norm': 4.546051502227783, 'learning_rate': 9.423076923076923e-06, 'epoch': 8.93}


 90%|█████████ | 2070/2300 [22:51:09<2:27:45, 38.54s/it]

Step 2070: Training Loss: 7.56400
{'loss': 7.564, 'grad_norm': 4.6829705238342285, 'learning_rate': 9.038461538461538e-06, 'epoch': 8.98}


 90%|█████████ | 2080/2300 [22:57:33<2:20:35, 38.34s/it]

Step 2080: Training Loss: 7.49430
{'loss': 7.4943, 'grad_norm': 4.703489303588867, 'learning_rate': 8.653846153846155e-06, 'epoch': 9.02}


 91%|█████████ | 2090/2300 [23:03:57<2:14:42, 38.49s/it]

Step 2090: Training Loss: 7.35420
{'loss': 7.3542, 'grad_norm': 4.549832344055176, 'learning_rate': 8.26923076923077e-06, 'epoch': 9.06}


 91%|█████████▏| 2100/2300 [23:10:21<2:07:35, 38.28s/it]

Step 2100: Training Loss: 7.43280
{'loss': 7.4328, 'grad_norm': 4.518011093139648, 'learning_rate': 7.884615384615384e-06, 'epoch': 9.11}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 2100: Logs: {'eval_loss': 10.630249977111816, 'eval_accuracy': 0.01097560975609756, 'eval_runtime': 235.5523, 'eval_samples_per_second': 6.962, 'eval_steps_per_second': 1.741, 'epoch': 9.105691056910569}
{'eval_loss': 10.630249977111816, 'eval_accuracy': 0.01097560975609756, 'eval_runtime': 235.5523, 'eval_samples_per_second': 6.962, 'eval_steps_per_second': 1.741, 'epoch': 9.11}


 92%|█████████▏| 2110/2300 [23:21:05<2:24:26, 45.61s/it] 

Step 2110: Training Loss: 7.39120
{'loss': 7.3912, 'grad_norm': 4.649808406829834, 'learning_rate': 7.5e-06, 'epoch': 9.15}


 92%|█████████▏| 2120/2300 [23:28:22<2:11:02, 43.68s/it]

Step 2120: Training Loss: 7.45340
{'loss': 7.4534, 'grad_norm': 4.663898468017578, 'learning_rate': 7.115384615384615e-06, 'epoch': 9.19}


 93%|█████████▎| 2130/2300 [23:35:48<2:06:15, 44.56s/it]

Step 2130: Training Loss: 7.41980
{'loss': 7.4198, 'grad_norm': 4.508609294891357, 'learning_rate': 6.730769230769231e-06, 'epoch': 9.24}


 93%|█████████▎| 2140/2300 [23:43:21<2:00:00, 45.00s/it]

Step 2140: Training Loss: 7.42550
{'loss': 7.4255, 'grad_norm': 4.746102333068848, 'learning_rate': 6.346153846153846e-06, 'epoch': 9.28}


 93%|█████████▎| 2150/2300 [23:50:47<1:51:22, 44.55s/it]

Step 2150: Training Loss: 7.38840
{'loss': 7.3884, 'grad_norm': 4.36021614074707, 'learning_rate': 5.961538461538462e-06, 'epoch': 9.32}


 94%|█████████▍| 2160/2300 [23:58:10<1:41:05, 43.32s/it]

Step 2160: Training Loss: 7.38450
{'loss': 7.3845, 'grad_norm': 4.6023736000061035, 'learning_rate': 5.576923076923077e-06, 'epoch': 9.37}


 94%|█████████▍| 2170/2300 [24:04:33<1:23:16, 38.44s/it]

Step 2170: Training Loss: 7.41200
{'loss': 7.412, 'grad_norm': 4.980718612670898, 'learning_rate': 5.192307692307693e-06, 'epoch': 9.41}


 95%|█████████▍| 2180/2300 [24:11:08<1:17:43, 38.86s/it]

Step 2180: Training Loss: 7.38750
{'loss': 7.3875, 'grad_norm': 4.468564510345459, 'learning_rate': 4.807692307692308e-06, 'epoch': 9.45}


 95%|█████████▌| 2190/2300 [24:17:33<1:10:35, 38.51s/it]

Step 2190: Training Loss: 7.40870
{'loss': 7.4087, 'grad_norm': 4.840895652770996, 'learning_rate': 4.423076923076924e-06, 'epoch': 9.5}


 96%|█████████▌| 2200/2300 [24:23:55<1:03:40, 38.21s/it]

Step 2200: Training Loss: 7.43680
{'loss': 7.4368, 'grad_norm': 4.496683597564697, 'learning_rate': 4.0384615384615385e-06, 'epoch': 9.54}


                                                        
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 2200: Logs: {'eval_loss': 10.646992683410645, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 236.4934, 'eval_samples_per_second': 6.935, 'eval_steps_per_second': 1.734, 'epoch': 9.53929539295393}
{'eval_loss': 10.646992683410645, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 236.4934, 'eval_samples_per_second': 6.935, 'eval_steps_per_second': 1.734, 'epoch': 9.54}


 96%|█████████▌| 2210/2300 [24:34:15<1:01:50, 41.22s/it] 

Step 2210: Training Loss: 7.40370
{'loss': 7.4037, 'grad_norm': 4.660200595855713, 'learning_rate': 3.6538461538461542e-06, 'epoch': 9.58}


 97%|█████████▋| 2220/2300 [24:40:36<50:47, 38.10s/it]  

Step 2220: Training Loss: 7.33980
{'loss': 7.3398, 'grad_norm': 4.561900615692139, 'learning_rate': 3.2692307692307696e-06, 'epoch': 9.63}


 97%|█████████▋| 2230/2300 [24:46:59<44:36, 38.24s/it]

Step 2230: Training Loss: 7.31470
{'loss': 7.3147, 'grad_norm': 4.517223358154297, 'learning_rate': 2.884615384615385e-06, 'epoch': 9.67}


 97%|█████████▋| 2240/2300 [24:53:22<38:17, 38.29s/it]

Step 2240: Training Loss: 7.35380
{'loss': 7.3538, 'grad_norm': 4.640416622161865, 'learning_rate': 2.5e-06, 'epoch': 9.71}


 98%|█████████▊| 2250/2300 [24:59:47<32:07, 38.55s/it]

Step 2250: Training Loss: 7.42810
{'loss': 7.4281, 'grad_norm': 4.549184322357178, 'learning_rate': 2.1153846153846155e-06, 'epoch': 9.76}


 98%|█████████▊| 2260/2300 [25:06:10<25:34, 38.36s/it]

Step 2260: Training Loss: 7.37710
{'loss': 7.3771, 'grad_norm': 4.526759624481201, 'learning_rate': 1.7307692307692308e-06, 'epoch': 9.8}


 99%|█████████▊| 2270/2300 [25:12:36<19:16, 38.56s/it]

Step 2270: Training Loss: 7.43470
{'loss': 7.4347, 'grad_norm': 4.504891395568848, 'learning_rate': 1.3461538461538464e-06, 'epoch': 9.84}


 99%|█████████▉| 2280/2300 [25:19:00<12:48, 38.40s/it]

Step 2280: Training Loss: 7.43490
{'loss': 7.4349, 'grad_norm': 4.447793483734131, 'learning_rate': 9.615384615384617e-07, 'epoch': 9.89}


100%|█████████▉| 2290/2300 [25:25:25<06:25, 38.51s/it]

Step 2290: Training Loss: 7.40080
{'loss': 7.4008, 'grad_norm': 4.637731075286865, 'learning_rate': 5.76923076923077e-07, 'epoch': 9.93}


100%|██████████| 2300/2300 [25:31:49<00:00, 38.46s/it]

Step 2300: Training Loss: 7.47870
{'loss': 7.4787, 'grad_norm': 4.707476615905762, 'learning_rate': 1.9230769230769234e-07, 'epoch': 9.97}


                                                      
100%|██████████| 2300/2300 [25:35:45<00:00, 38.46s/it]

Step 2300: Logs: {'eval_loss': 10.64493179321289, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 235.8329, 'eval_samples_per_second': 6.954, 'eval_steps_per_second': 1.739, 'epoch': 9.97289972899729}
{'eval_loss': 10.64493179321289, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 235.8329, 'eval_samples_per_second': 6.954, 'eval_steps_per_second': 1.739, 'epoch': 9.97}


100%|██████████| 2300/2300 [25:35:51<00:00, 40.07s/it]

Step 2300: Logs: {'train_runtime': 92151.63, 'train_samples_per_second': 1.602, 'train_steps_per_second': 0.025, 'total_flos': 3.60298513711104e+16, 'train_loss': 8.716842704441236, 'epoch': 9.97289972899729}
{'train_runtime': 92151.63, 'train_samples_per_second': 1.602, 'train_steps_per_second': 0.025, 'train_loss': 8.716842704441236, 'epoch': 9.97}



100%|██████████| 410/410 [03:55<00:00,  1.74it/s]

Step 2300: Logs: {'eval_loss': 10.700302124023438, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 235.9645, 'eval_samples_per_second': 6.95, 'eval_steps_per_second': 1.738, 'epoch': 9.97289972899729}
Evaluation results: {'eval_loss': 10.700302124023438, 'eval_accuracy': 0.011585365853658536, 'eval_runtime': 235.9645, 'eval_samples_per_second': 6.95, 'eval_steps_per_second': 1.738, 'epoch': 9.97289972899729}



