In [1]:
# !pip install datasets
!pip install transformers[torch]
# !pip install torchaudio
# !pip install librosa
# !pip install jiwer
# !pip install evaluate
!pip install accelerate



In [2]:
import os
from datasets import load_dataset, Audio

data_files = {
    "train": "/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/metadata_train.csv",
    "validation": "/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/metadata_validation.csv",
    "test": "/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/metadata_test.csv"
}

audio_base_path = "/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/"

my_local_dataset_train = load_dataset(
    "csv", 
    data_files={"train": data_files["train"], "validation": data_files["validation"]}, 
    split="train+validation"
)

my_local_dataset_test = load_dataset(
    "csv", 
    data_files={"test": data_files["test"]}, 
    split="test"
)

def add_absolute_path(example):
    # Create a new 'audio' column with the absolute path
    example["audio"] = os.path.join(audio_base_path, example["path"])
    return example

my_local_dataset_train = my_local_dataset_train.map(add_absolute_path)
my_local_dataset_test = my_local_dataset_test.map(add_absolute_path)

common_voice_train = my_local_dataset_train.cast_column("audio", Audio(sampling_rate=16000))
common_voice_test = my_local_dataset_test.cast_column("audio", Audio(sampling_rate=16000))



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
my_local_dataset_train[0]

{'path': 'train/common_voice_bn_35453419.mp3',
 'sentence': 'প্রমার সভাপতি রাশেদ হাসানের সভাপতিত্বে অনুষ্ঠানে বিশেষ অতিথি ছিলেন কবি ওমর কায়সার।',
 'audio': '/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/train/common_voice_bn_35453419.mp3'}

In [5]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [6]:
show_random_elements(common_voice_train.remove_columns(["path","audio"]), num_examples=10)

Unnamed: 0,sentence
0,এ সময়ে তিনি নিজের কবিতা প্রকাশ করতে পারতেন না।
1,এটা তখনকার সময়কার খুবই আধুনিক একটি দালান ছিল।
2,এই প্রত্যেকটি ভূমিরূপ কে আবার অনেক ভাগে ভাগ করা হয়েছে।
3,ভারত সরকারের পুরাতাত্ত্বিক বিভাগ এর দেখাশুনা করে।
4,পিতা চিকিৎসক সুরেন্দ্রনাথ দাশ ও মাতা বিশিষ্ট শিক্ষিকা সবিতা দাশ।
5,এটি একটি আশ্চর্যজনক প্রশিক্ষণ কর্মক্ষমতা।
6,"পরবর্তীতে, তিনি আরও দুটি কিডস চয়েস অ্যাওয়ার্ডস জিতে নেন এবং একটি পিপলস চয়েস অ্যাওয়ার্ড জিতেন।"
7,এটি পরিচালনা করেছেন মার্টিন স্কোরসেজি এবং চিত্রনাট্য লিখেছেন জন লোগান।
8,এটি উত্তর দিনাজপুর জেলার সদর দপ্তর।
9,তিনি তার শৈশব কাটান নতুন দিল্লিতে।


In [7]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [8]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

In [9]:
show_random_elements(common_voice_train.remove_columns(["path","audio"]))

Unnamed: 0,sentence
0,তিনি একবার পশ্চিমবঙ্গ বিধানসভার সদস্য হিসেবে নির্বাচিত হয়েছেন।
1,কোয়ান্টাম মেকানিক্স।
2,বৌদ্ধধর্মে আদিবুদ্ধ শীর্ষক এ আস্তিক্যবাদী ধারণার উৎপত্তিস্থল সম্পর্কে মতভেদ আছে।
3,পরবর্তীতে তিনি লখনউ এ দরিদ্র ছাত্রছাত্রীদের জন্য একটি স্কুল প্রতিষ্ঠা করেন।
4,তিনি ছিলেন জমিদার ও বিত্তবান পরিবারের একমাত্র সন্তান।
5,এরপর তিনি দ্যা আফ্রিকান রিভিউ পত্রিকার ফিচার সম্পাদক ও গার্ডিয়ান টাইমসে ফ্রিল্যান্স লেখক হিসেবে কাজ করেন।
6,তার প্রাতিষ্ঠানিক শিক্ষা এইচএসসি পর্যন্ত।
7,ইসরায়েলে যেখানে পরিবারের অনেক সদস্য থাকেন তিনি এবং তার স্বামী একটি প্রযুক্তিগত কলেজে ফেলোশিপ দিয়েছিলেন।
8,ডানহাতি মিডিয়ামফাস্ট বোলার হিসেবে ভারত ক্রিকেট দলে খেলছেন।
9,কেন্দ্রটি ন্যাশনাল বোটানিক্যাল গার্ডেন নামেও পরিচিত।


In [10]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [11]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map: 100%|██████████| 9000/9000 [00:00<00:00, 186024.01 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 199804.88 examples/s]


In [12]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [13]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 0,
 'হ': 1,
 'জ': 2,
 'ং': 3,
 'a': 4,
 'উ': 5,
 'অ': 6,
 'ঝ': 7,
 'ঔ': 8,
 'ট': 9,
 'ঊ': 10,
 '্': 11,
 'র': 12,
 'ৃ': 13,
 'য়': 14,
 'চ': 15,
 'শ': 16,
 'ত': 17,
 'ঋ': 18,
 'ন': 19,
 'ঢ': 20,
 '।': 21,
 'ফ': 22,
 '’': 23,
 'ৌ': 24,
 "'": 25,
 'ঈ': 26,
 'প': 27,
 'ঐ': 28,
 'ৎ': 29,
 'ঞ': 30,
 '—': 31,
 'l': 32,
 'ও': 33,
 'ঃ': 34,
 'ু': 35,
 'স': 36,
 'ঠ': 37,
 'ষ': 38,
 'o': 39,
 'ূ': 40,
 'ভ': 41,
 '়': 42,
 'e': 43,
 'খ': 44,
 'y': 45,
 'g': 46,
 '॥': 47,
 'ঘ': 48,
 '–': 49,
 'ম': 50,
 'p': 51,
 'ড়': 52,
 'ই': 53,
 'ণ': 54,
 'ি': 55,
 'ী': 56,
 'গ': 57,
 '/': 58,
 'া': 59,
 'ো': 60,
 'থ': 61,
 'য': 62,
 'ব': 63,
 'ঙ': 64,
 'এ': 65,
 'আ': 66,
 'ঢ়': 67,
 'ল': 68,
 'দ': 69,
 'ৈ': 70,
 'ছ': 71,
 'ক': 72,
 'ধ': 73,
 'ড': 74,
 'ে': 75,
 'ঁ': 76}

In [14]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [15]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

79

In [16]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [17]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [18]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [19]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [20]:
common_voice_train[0]["path"]

'train/common_voice_bn_35453419.mp3'

In [21]:
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

In [22]:
common_voice_train[0]["audio"]

{'path': '/home/mehedi/Projects/Learning/wav2vec/local_bn_dataset_2/train/common_voice_bn_35453419.mp3',
 'array': array([-2.32830644e-10, -4.65661287e-10, -4.65661287e-10, ...,
         1.02171907e-06,  5.25280484e-07,  3.05510912e-06]),
 'sampling_rate': 16000}

In [23]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train)-1)

ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

In [24]:
rand_int = random.randint(0, len(common_voice_train)-1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])

Target text: এতে তিনি ডাবল সেঞ্চুরি বা দ্বিশতক করেন। 
Input array shape: (92736,)
Sampling rate: 16000


In [25]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [26]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, num_proc=4)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, num_proc=4)

Map (num_proc=4): 100%|██████████| 9000/9000 [00:18<00:00, 498.08 examples/s]
Map (num_proc=4): 100%|██████████| 1000/1000 [00:02<00:00, 451.12 examples/s]


In [27]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [28]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [29]:
import evaluate

# Load the Word Error Rate (WER) metric
wer_metric = evaluate.load("wer")

In [30]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [31]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",  
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
model.freeze_feature_extractor()



In [33]:
model.gradient_checkpointing_enable()

In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./wav2vec2-large-xlsr-bn-demo",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=20,
  fp16=True,  # Disable mixed precision training on CPU
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)



In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [36]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  0%|          | 10/5620 [00:39<5:30:26,  3.53s/it]

{'loss': 17.8706, 'grad_norm': 5.545180320739746, 'learning_rate': 5.399999999999999e-06, 'epoch': 0.04}


  0%|          | 20/5620 [01:09<4:22:51,  2.82s/it]

{'loss': 18.3859, 'grad_norm': 10.400908470153809, 'learning_rate': 1.14e-05, 'epoch': 0.07}


  1%|          | 30/5620 [01:33<3:30:06,  2.26s/it]

{'loss': 18.8783, 'grad_norm': 10.058520317077637, 'learning_rate': 1.74e-05, 'epoch': 0.11}


  1%|          | 40/5620 [01:53<3:06:31,  2.01s/it]

{'loss': 18.9639, 'grad_norm': 9.740849494934082, 'learning_rate': 2.28e-05, 'epoch': 0.14}


  1%|          | 50/5620 [02:09<2:22:55,  1.54s/it]

{'loss': 19.42, 'grad_norm': 19.218948364257812, 'learning_rate': 2.88e-05, 'epoch': 0.18}


  1%|          | 60/5620 [02:48<5:24:03,  3.50s/it]

{'loss': 14.9929, 'grad_norm': 17.075782775878906, 'learning_rate': 3.48e-05, 'epoch': 0.21}


  1%|          | 70/5620 [03:18<4:21:40,  2.83s/it]

{'loss': 12.1051, 'grad_norm': 17.634010314941406, 'learning_rate': 4.08e-05, 'epoch': 0.25}


  1%|▏         | 80/5620 [03:42<3:36:45,  2.35s/it]

{'loss': 10.2936, 'grad_norm': 20.859359741210938, 'learning_rate': 4.68e-05, 'epoch': 0.28}


  2%|▏         | 90/5620 [04:03<3:00:52,  1.96s/it]

{'loss': 7.3923, 'grad_norm': 15.823247909545898, 'learning_rate': 5.279999999999999e-05, 'epoch': 0.32}


  2%|▏         | 100/5620 [04:19<2:17:18,  1.49s/it]

{'loss': 6.8982, 'grad_norm': 15.479625701904297, 'learning_rate': 5.88e-05, 'epoch': 0.36}


                                                    
  2%|▏         | 100/5620 [05:12<2:17:18,  1.49s/it]

{'eval_loss': 5.429949760437012, 'eval_wer': 1.0, 'eval_runtime': 53.1206, 'eval_samples_per_second': 18.825, 'eval_steps_per_second': 2.353, 'epoch': 0.36}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  2%|▏         | 110/5620 [05:55<6:05:10,  3.98s/it] 

{'loss': 5.2105, 'grad_norm': 11.416728019714355, 'learning_rate': 6.479999999999999e-05, 'epoch': 0.39}


  2%|▏         | 120/5620 [06:22<4:00:20,  2.62s/it]

{'loss': 4.9089, 'grad_norm': 5.9066267013549805, 'learning_rate': 7.079999999999999e-05, 'epoch': 0.43}


  2%|▏         | 130/5620 [06:44<3:19:07,  2.18s/it]

{'loss': 4.2588, 'grad_norm': 3.9258084297180176, 'learning_rate': 7.68e-05, 'epoch': 0.46}


  2%|▏         | 140/5620 [07:03<2:39:47,  1.75s/it]

{'loss': 3.9732, 'grad_norm': 3.0997462272644043, 'learning_rate': 8.28e-05, 'epoch': 0.5}


  3%|▎         | 150/5620 [07:17<2:04:11,  1.36s/it]

{'loss': 3.8228, 'grad_norm': 4.046639919281006, 'learning_rate': 8.879999999999999e-05, 'epoch': 0.53}


  3%|▎         | 160/5620 [07:52<4:48:13,  3.17s/it]

{'loss': 3.5798, 'grad_norm': 1.599672555923462, 'learning_rate': 9.479999999999999e-05, 'epoch': 0.57}


  3%|▎         | 170/5620 [08:18<3:46:42,  2.50s/it]

{'loss': 3.4992, 'grad_norm': 0.42286616563796997, 'learning_rate': 0.0001008, 'epoch': 0.6}


  3%|▎         | 180/5620 [08:40<3:11:32,  2.11s/it]

{'loss': 3.4713, 'grad_norm': 0.2267410010099411, 'learning_rate': 0.00010679999999999998, 'epoch': 0.64}


  3%|▎         | 190/5620 [08:59<2:40:59,  1.78s/it]

{'loss': 3.4535, 'grad_norm': 0.3514644503593445, 'learning_rate': 0.00011279999999999999, 'epoch': 0.67}


  4%|▎         | 200/5620 [09:13<2:06:16,  1.40s/it]

{'loss': 3.4946, 'grad_norm': 0.7973133325576782, 'learning_rate': 0.0001188, 'epoch': 0.71}


                                                    
  4%|▎         | 200/5620 [10:04<2:06:16,  1.40s/it]

{'eval_loss': 3.454160690307617, 'eval_wer': 1.0, 'eval_runtime': 50.3431, 'eval_samples_per_second': 19.864, 'eval_steps_per_second': 2.483, 'epoch': 0.71}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  4%|▎         | 210/5620 [10:44<5:50:03,  3.88s/it] 

{'loss': 3.4366, 'grad_norm': 0.6426085233688354, 'learning_rate': 0.00012479999999999997, 'epoch': 0.75}


  4%|▍         | 220/5620 [11:11<3:53:47,  2.60s/it]

{'loss': 3.4251, 'grad_norm': 0.26439738273620605, 'learning_rate': 0.00013079999999999998, 'epoch': 0.78}


  4%|▍         | 230/5620 [11:33<3:08:26,  2.10s/it]

{'loss': 3.4069, 'grad_norm': 1.0818825960159302, 'learning_rate': 0.0001368, 'epoch': 0.82}


  4%|▍         | 240/5620 [11:51<2:39:42,  1.78s/it]

{'loss': 3.442, 'grad_norm': 1.5829654932022095, 'learning_rate': 0.00014279999999999997, 'epoch': 0.85}


  4%|▍         | 250/5620 [12:06<2:06:43,  1.42s/it]

{'loss': 3.4593, 'grad_norm': 1.5711820125579834, 'learning_rate': 0.00014879999999999998, 'epoch': 0.89}


  5%|▍         | 260/5620 [12:39<4:17:14,  2.88s/it]

{'loss': 3.4133, 'grad_norm': 1.4259703159332275, 'learning_rate': 0.0001548, 'epoch': 0.92}


  5%|▍         | 270/5620 [13:02<3:10:41,  2.14s/it]

{'loss': 3.4211, 'grad_norm': 0.37221765518188477, 'learning_rate': 0.0001608, 'epoch': 0.96}


  5%|▍         | 280/5620 [13:17<2:06:49,  1.42s/it]

{'loss': 3.4412, 'grad_norm': 0.9676143527030945, 'learning_rate': 0.0001668, 'epoch': 0.99}


  5%|▌         | 290/5620 [13:50<4:49:29,  3.26s/it]

{'loss': 3.4197, 'grad_norm': 1.0752214193344116, 'learning_rate': 0.00017279999999999997, 'epoch': 1.03}


  5%|▌         | 300/5620 [14:18<3:58:47,  2.69s/it]

{'loss': 3.3909, 'grad_norm': 0.2502693831920624, 'learning_rate': 0.00017879999999999998, 'epoch': 1.07}


                                                    
  5%|▌         | 300/5620 [15:08<3:58:47,  2.69s/it]

{'eval_loss': 3.3997597694396973, 'eval_wer': 1.0, 'eval_runtime': 49.5876, 'eval_samples_per_second': 20.166, 'eval_steps_per_second': 2.521, 'epoch': 1.07}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  6%|▌         | 310/5620 [15:35<4:08:41,  2.81s/it] 

{'loss': 3.3901, 'grad_norm': 1.6807172298431396, 'learning_rate': 0.0001848, 'epoch': 1.1}


  6%|▌         | 320/5620 [15:54<2:42:34,  1.84s/it]

{'loss': 3.4033, 'grad_norm': 3.9373981952667236, 'learning_rate': 0.00019079999999999998, 'epoch': 1.14}


  6%|▌         | 330/5620 [16:09<2:05:04,  1.42s/it]

{'loss': 3.3858, 'grad_norm': 0.40738052129745483, 'learning_rate': 0.00019679999999999999, 'epoch': 1.17}


  6%|▌         | 340/5620 [16:41<4:43:22,  3.22s/it]

{'loss': 3.3918, 'grad_norm': 0.7556898593902588, 'learning_rate': 0.0002028, 'epoch': 1.21}


  6%|▌         | 350/5620 [17:08<3:42:44,  2.54s/it]

{'loss': 3.3644, 'grad_norm': 1.8355040550231934, 'learning_rate': 0.00020879999999999998, 'epoch': 1.24}


  6%|▋         | 360/5620 [17:30<3:09:11,  2.16s/it]

{'loss': 3.3781, 'grad_norm': 0.344926655292511, 'learning_rate': 0.00021479999999999996, 'epoch': 1.28}


  7%|▋         | 370/5620 [17:49<2:42:02,  1.85s/it]

{'loss': 3.3649, 'grad_norm': 0.7131491303443909, 'learning_rate': 0.00022079999999999997, 'epoch': 1.31}


  7%|▋         | 380/5620 [18:04<1:59:45,  1.37s/it]

{'loss': 3.3553, 'grad_norm': 1.1218371391296387, 'learning_rate': 0.00022679999999999998, 'epoch': 1.35}


  7%|▋         | 390/5620 [18:37<4:49:08,  3.32s/it]

{'loss': 3.3517, 'grad_norm': 0.23049870133399963, 'learning_rate': 0.0002328, 'epoch': 1.39}


  7%|▋         | 400/5620 [19:05<3:54:08,  2.69s/it]

{'loss': 3.3402, 'grad_norm': 0.5590311884880066, 'learning_rate': 0.0002388, 'epoch': 1.42}


                                                    
  7%|▋         | 400/5620 [19:54<3:54:08,  2.69s/it]

{'eval_loss': 3.340000867843628, 'eval_wer': 1.0, 'eval_runtime': 49.2128, 'eval_samples_per_second': 20.32, 'eval_steps_per_second': 2.54, 'epoch': 1.42}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  7%|▋         | 410/5620 [20:21<4:08:26,  2.86s/it] 

{'loss': 3.3221, 'grad_norm': 0.8896554708480835, 'learning_rate': 0.0002448, 'epoch': 1.46}


  7%|▋         | 420/5620 [20:40<2:47:47,  1.94s/it]

{'loss': 3.316, 'grad_norm': 1.7012865543365479, 'learning_rate': 0.00025079999999999997, 'epoch': 1.49}


  8%|▊         | 430/5620 [20:55<2:01:16,  1.40s/it]

{'loss': 3.2968, 'grad_norm': 0.9095801115036011, 'learning_rate': 0.00025679999999999995, 'epoch': 1.53}


  8%|▊         | 440/5620 [21:28<4:45:01,  3.30s/it]

{'loss': 3.3067, 'grad_norm': 1.1336607933044434, 'learning_rate': 0.0002628, 'epoch': 1.56}


  8%|▊         | 450/5620 [21:56<3:40:56,  2.56s/it]

{'loss': 3.2835, 'grad_norm': 0.3440011441707611, 'learning_rate': 0.0002688, 'epoch': 1.6}


  8%|▊         | 460/5620 [22:18<3:02:51,  2.13s/it]

{'loss': 3.2694, 'grad_norm': 2.9719388484954834, 'learning_rate': 0.0002748, 'epoch': 1.63}


  8%|▊         | 470/5620 [22:36<2:27:09,  1.71s/it]

{'loss': 3.2922, 'grad_norm': 2.697000026702881, 'learning_rate': 0.0002808, 'epoch': 1.67}


  9%|▊         | 480/5620 [22:50<1:53:15,  1.32s/it]

{'loss': 3.3276, 'grad_norm': 6.63627290725708, 'learning_rate': 0.0002868, 'epoch': 1.71}


  9%|▊         | 490/5620 [23:22<4:37:30,  3.25s/it]

{'loss': 3.2895, 'grad_norm': 0.3857899010181427, 'learning_rate': 0.00029279999999999996, 'epoch': 1.74}


  9%|▉         | 500/5620 [23:50<3:51:21,  2.71s/it]

{'loss': 3.2511, 'grad_norm': 0.32612693309783936, 'learning_rate': 0.0002988, 'epoch': 1.78}


                                                    
  9%|▉         | 500/5620 [24:39<3:51:21,  2.71s/it]

{'eval_loss': 3.2203476428985596, 'eval_wer': 1.0, 'eval_runtime': 48.3219, 'eval_samples_per_second': 20.695, 'eval_steps_per_second': 2.587, 'epoch': 1.78}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  9%|▉         | 510/5620 [25:05<3:57:46,  2.79s/it] 

{'loss': 3.2202, 'grad_norm': 0.7028724551200867, 'learning_rate': 0.00029953125, 'epoch': 1.81}


  9%|▉         | 520/5620 [25:24<2:36:52,  1.85s/it]

{'loss': 3.1968, 'grad_norm': 2.700896978378296, 'learning_rate': 0.00029894531249999995, 'epoch': 1.85}


  9%|▉         | 530/5620 [25:40<1:59:20,  1.41s/it]

{'loss': 3.1448, 'grad_norm': 1.0439801216125488, 'learning_rate': 0.00029835937499999996, 'epoch': 1.88}


 10%|▉         | 540/5620 [26:10<4:15:31,  3.02s/it]

{'loss': 3.2102, 'grad_norm': 2.1537840366363525, 'learning_rate': 0.00029777343749999997, 'epoch': 1.92}


 10%|▉         | 550/5620 [26:34<3:10:53,  2.26s/it]

{'loss': 3.076, 'grad_norm': 1.4161787033081055, 'learning_rate': 0.0002971875, 'epoch': 1.95}


 10%|▉         | 560/5620 [26:51<2:12:50,  1.58s/it]

{'loss': 2.923, 'grad_norm': 2.6490657329559326, 'learning_rate': 0.0002966015625, 'epoch': 1.99}


 10%|█         | 570/5620 [27:20<4:41:24,  3.34s/it]

{'loss': 2.8529, 'grad_norm': 1.3592194318771362, 'learning_rate': 0.00029601562499999995, 'epoch': 2.02}


 10%|█         | 580/5620 [27:49<3:45:06,  2.68s/it]

{'loss': 2.6891, 'grad_norm': 0.7104178667068481, 'learning_rate': 0.00029542968749999996, 'epoch': 2.06}


 10%|█         | 590/5620 [28:11<3:03:09,  2.18s/it]

{'loss': 2.4024, 'grad_norm': 1.0422918796539307, 'learning_rate': 0.00029484374999999997, 'epoch': 2.1}


 11%|█         | 600/5620 [28:31<2:36:18,  1.87s/it]

{'loss': 2.1115, 'grad_norm': 1.0231127738952637, 'learning_rate': 0.0002942578125, 'epoch': 2.13}


                                                    
 11%|█         | 600/5620 [29:19<2:36:18,  1.87s/it]

{'eval_loss': 1.8237167596817017, 'eval_wer': 1.0277442702050663, 'eval_runtime': 48.3587, 'eval_samples_per_second': 20.679, 'eval_steps_per_second': 2.585, 'epoch': 2.13}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 11%|█         | 610/5620 [29:38<2:53:22,  2.08s/it] 

{'loss': 1.9279, 'grad_norm': 1.1852765083312988, 'learning_rate': 0.000293671875, 'epoch': 2.17}


 11%|█         | 620/5620 [30:06<4:28:31,  3.22s/it]

{'loss': 1.9986, 'grad_norm': 1.5235787630081177, 'learning_rate': 0.00029308593749999994, 'epoch': 2.2}


 11%|█         | 630/5620 [30:34<3:41:58,  2.67s/it]

{'loss': 1.7498, 'grad_norm': 1.4372447729110718, 'learning_rate': 0.00029249999999999995, 'epoch': 2.24}


 11%|█▏        | 640/5620 [30:57<3:01:21,  2.18s/it]

{'loss': 1.5354, 'grad_norm': 0.7158452868461609, 'learning_rate': 0.00029191406249999996, 'epoch': 2.27}


 12%|█▏        | 650/5620 [31:16<2:31:08,  1.82s/it]

{'loss': 1.4604, 'grad_norm': 0.9508132934570312, 'learning_rate': 0.000291328125, 'epoch': 2.31}


 12%|█▏        | 660/5620 [31:32<1:58:05,  1.43s/it]

{'loss': 1.4283, 'grad_norm': 0.8853356242179871, 'learning_rate': 0.0002907421875, 'epoch': 2.34}


 12%|█▏        | 670/5620 [32:01<4:32:01,  3.30s/it]

{'loss': 1.4166, 'grad_norm': 0.5074292421340942, 'learning_rate': 0.00029015624999999994, 'epoch': 2.38}


 12%|█▏        | 680/5620 [32:29<3:35:39,  2.62s/it]

{'loss': 1.2939, 'grad_norm': 0.6744822859764099, 'learning_rate': 0.00028957031249999995, 'epoch': 2.42}


 12%|█▏        | 690/5620 [32:52<2:59:32,  2.19s/it]

{'loss': 1.2026, 'grad_norm': 0.7073540687561035, 'learning_rate': 0.00028898437499999996, 'epoch': 2.45}


 12%|█▏        | 700/5620 [33:11<2:29:28,  1.82s/it]

{'loss': 1.1843, 'grad_norm': 0.7712356448173523, 'learning_rate': 0.00028839843749999997, 'epoch': 2.49}


                                                    
 12%|█▏        | 700/5620 [33:59<2:29:28,  1.82s/it]

{'eval_loss': 0.9567633867263794, 'eval_wer': 0.8453777826516066, 'eval_runtime': 48.4219, 'eval_samples_per_second': 20.652, 'eval_steps_per_second': 2.581, 'epoch': 2.49}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 13%|█▎        | 710/5620 [34:18<2:48:52,  2.06s/it] 

{'loss': 1.1489, 'grad_norm': 1.0268362760543823, 'learning_rate': 0.0002878125, 'epoch': 2.52}


 13%|█▎        | 720/5620 [34:47<4:24:29,  3.24s/it]

{'loss': 1.1863, 'grad_norm': 0.6975992321968079, 'learning_rate': 0.0002872265625, 'epoch': 2.56}


 13%|█▎        | 730/5620 [35:15<3:41:40,  2.72s/it]

{'loss': 1.0645, 'grad_norm': 0.7835203409194946, 'learning_rate': 0.00028664062499999995, 'epoch': 2.59}


 13%|█▎        | 740/5620 [35:38<2:57:07,  2.18s/it]

{'loss': 1.0792, 'grad_norm': 0.8206751942634583, 'learning_rate': 0.00028605468749999996, 'epoch': 2.63}


 13%|█▎        | 750/5620 [35:57<2:26:07,  1.80s/it]

{'loss': 1.0625, 'grad_norm': 0.7864701151847839, 'learning_rate': 0.00028546874999999997, 'epoch': 2.66}


 14%|█▎        | 760/5620 [36:12<1:54:28,  1.41s/it]

{'loss': 1.033, 'grad_norm': 0.737440824508667, 'learning_rate': 0.0002848828125, 'epoch': 2.7}


 14%|█▎        | 770/5620 [36:41<4:25:38,  3.29s/it]

{'loss': 1.1211, 'grad_norm': 0.5121504068374634, 'learning_rate': 0.000284296875, 'epoch': 2.74}


 14%|█▍        | 780/5620 [37:10<3:39:50,  2.73s/it]

{'loss': 0.9539, 'grad_norm': 0.5672191977500916, 'learning_rate': 0.00028371093749999995, 'epoch': 2.77}


 14%|█▍        | 790/5620 [37:33<3:03:27,  2.28s/it]

{'loss': 0.9205, 'grad_norm': 0.7210434079170227, 'learning_rate': 0.00028312499999999996, 'epoch': 2.81}


 14%|█▍        | 800/5620 [37:53<2:29:01,  1.86s/it]

{'loss': 0.9177, 'grad_norm': 0.9004600048065186, 'learning_rate': 0.00028253906249999997, 'epoch': 2.84}


                                                    
 14%|█▍        | 800/5620 [38:41<2:29:01,  1.86s/it]

{'eval_loss': 0.7189396619796753, 'eval_wer': 0.7426252878605111, 'eval_runtime': 48.5789, 'eval_samples_per_second': 20.585, 'eval_steps_per_second': 2.573, 'epoch': 2.84}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 14%|█▍        | 810/5620 [39:01<2:47:48,  2.09s/it] 

{'loss': 0.9598, 'grad_norm': 0.9316354393959045, 'learning_rate': 0.000281953125, 'epoch': 2.88}


 15%|█▍        | 820/5620 [39:28<3:56:46,  2.96s/it]

{'loss': 1.0123, 'grad_norm': 0.5963002443313599, 'learning_rate': 0.0002813671875, 'epoch': 2.91}


 15%|█▍        | 830/5620 [39:51<2:59:49,  2.25s/it]

{'loss': 0.9559, 'grad_norm': 0.6262953281402588, 'learning_rate': 0.00028078124999999994, 'epoch': 2.95}


 15%|█▍        | 840/5620 [40:09<2:17:18,  1.72s/it]

{'loss': 0.8464, 'grad_norm': 0.8053315281867981, 'learning_rate': 0.00028019531249999995, 'epoch': 2.98}


 15%|█▌        | 850/5620 [40:36<4:20:13,  3.27s/it]

{'loss': 0.9468, 'grad_norm': 0.7129610180854797, 'learning_rate': 0.00027960937499999996, 'epoch': 3.02}


 15%|█▌        | 860/5620 [41:05<3:40:22,  2.78s/it]

{'loss': 0.8001, 'grad_norm': 0.5409874320030212, 'learning_rate': 0.0002790234375, 'epoch': 3.06}


 15%|█▌        | 870/5620 [41:29<2:58:44,  2.26s/it]

{'loss': 0.7518, 'grad_norm': 0.5214200615882874, 'learning_rate': 0.0002784375, 'epoch': 3.09}


 16%|█▌        | 880/5620 [41:48<2:28:41,  1.88s/it]

{'loss': 0.726, 'grad_norm': 0.6173517107963562, 'learning_rate': 0.0002778515625, 'epoch': 3.13}


 16%|█▌        | 890/5620 [42:05<2:01:40,  1.54s/it]

{'loss': 0.7661, 'grad_norm': 0.7578060626983643, 'learning_rate': 0.00027726562499999995, 'epoch': 3.16}


 16%|█▌        | 900/5620 [42:31<4:10:08,  3.18s/it]

{'loss': 0.9, 'grad_norm': 0.593809962272644, 'learning_rate': 0.00027667968749999996, 'epoch': 3.2}


                                                    
 16%|█▌        | 900/5620 [43:19<4:10:08,  3.18s/it]

{'eval_loss': 0.6458398699760437, 'eval_wer': 0.649851957451475, 'eval_runtime': 48.4683, 'eval_samples_per_second': 20.632, 'eval_steps_per_second': 2.579, 'epoch': 3.2}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 16%|█▌        | 910/5620 [43:52<4:26:10,  3.39s/it] 

{'loss': 0.8378, 'grad_norm': 0.5070281028747559, 'learning_rate': 0.00027609374999999997, 'epoch': 3.23}


 16%|█▋        | 920/5620 [44:15<2:56:09,  2.25s/it]

{'loss': 0.7168, 'grad_norm': 0.7138456702232361, 'learning_rate': 0.0002755078125, 'epoch': 3.27}


 17%|█▋        | 930/5620 [44:35<2:27:48,  1.89s/it]

{'loss': 0.755, 'grad_norm': 0.6190743446350098, 'learning_rate': 0.000274921875, 'epoch': 3.3}


 17%|█▋        | 940/5620 [44:51<1:57:45,  1.51s/it]

{'loss': 0.7641, 'grad_norm': 0.693598747253418, 'learning_rate': 0.00027433593749999995, 'epoch': 3.34}


 17%|█▋        | 950/5620 [45:17<4:10:41,  3.22s/it]

{'loss': 0.9108, 'grad_norm': 0.6922484636306763, 'learning_rate': 0.00027374999999999996, 'epoch': 3.37}


 17%|█▋        | 960/5620 [45:46<3:28:45,  2.69s/it]

{'loss': 0.738, 'grad_norm': 1.0861599445343018, 'learning_rate': 0.00027316406249999997, 'epoch': 3.41}


 17%|█▋        | 970/5620 [46:10<2:56:53,  2.28s/it]

{'loss': 0.7049, 'grad_norm': 0.581056535243988, 'learning_rate': 0.000272578125, 'epoch': 3.45}


 17%|█▋        | 980/5620 [46:29<2:24:59,  1.87s/it]

{'loss': 0.7392, 'grad_norm': 0.7307511568069458, 'learning_rate': 0.0002719921875, 'epoch': 3.48}


 18%|█▊        | 990/5620 [46:45<1:57:21,  1.52s/it]

{'loss': 0.7326, 'grad_norm': 0.8817386031150818, 'learning_rate': 0.00027140624999999995, 'epoch': 3.52}


 18%|█▊        | 1000/5620 [47:12<4:17:44,  3.35s/it]

{'loss': 0.7586, 'grad_norm': 0.5172353982925415, 'learning_rate': 0.00027082031249999996, 'epoch': 3.55}


                                                     
 18%|█▊        | 1000/5620 [48:00<4:17:44,  3.35s/it]

{'eval_loss': 0.5796821117401123, 'eval_wer': 0.5960083342471763, 'eval_runtime': 48.5895, 'eval_samples_per_second': 20.581, 'eval_steps_per_second': 2.573, 'epoch': 3.55}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 18%|█▊        | 1010/5620 [48:33<4:23:12,  3.43s/it] 

{'loss': 0.7582, 'grad_norm': 0.5027904510498047, 'learning_rate': 0.00027023437499999997, 'epoch': 3.59}


 18%|█▊        | 1020/5620 [48:57<2:52:57,  2.26s/it]

{'loss': 0.672, 'grad_norm': 0.6282051801681519, 'learning_rate': 0.0002696484375, 'epoch': 3.62}


 18%|█▊        | 1030/5620 [49:16<2:23:26,  1.87s/it]

{'loss': 0.6711, 'grad_norm': 0.6969748139381409, 'learning_rate': 0.0002690625, 'epoch': 3.66}


 19%|█▊        | 1040/5620 [49:32<1:54:51,  1.50s/it]

{'loss': 0.702, 'grad_norm': 0.8283044695854187, 'learning_rate': 0.00026847656249999994, 'epoch': 3.69}


 19%|█▊        | 1050/5620 [49:58<4:08:22,  3.26s/it]

{'loss': 0.7699, 'grad_norm': 0.5306990146636963, 'learning_rate': 0.00026789062499999995, 'epoch': 3.73}


 19%|█▉        | 1060/5620 [50:27<3:27:23,  2.73s/it]

{'loss': 0.7, 'grad_norm': 0.5306687355041504, 'learning_rate': 0.00026730468749999996, 'epoch': 3.77}


 19%|█▉        | 1070/5620 [50:51<2:50:50,  2.25s/it]

{'loss': 0.6625, 'grad_norm': 0.7023568749427795, 'learning_rate': 0.00026671875, 'epoch': 3.8}


 19%|█▉        | 1080/5620 [51:10<2:23:04,  1.89s/it]

{'loss': 0.6162, 'grad_norm': 0.603435754776001, 'learning_rate': 0.0002661328125, 'epoch': 3.84}


 19%|█▉        | 1090/5620 [51:27<1:55:02,  1.52s/it]

{'loss': 0.7098, 'grad_norm': 0.7785253524780273, 'learning_rate': 0.000265546875, 'epoch': 3.87}


 20%|█▉        | 1100/5620 [51:52<3:56:53,  3.14s/it]

{'loss': 0.7687, 'grad_norm': 0.4984847903251648, 'learning_rate': 0.00026496093749999995, 'epoch': 3.91}


                                                     
 20%|█▉        | 1100/5620 [52:41<3:56:53,  3.14s/it]

{'eval_loss': 0.5288936495780945, 'eval_wer': 0.5559820155718829, 'eval_runtime': 48.7264, 'eval_samples_per_second': 20.523, 'eval_steps_per_second': 2.565, 'epoch': 3.91}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 20%|█▉        | 1110/5620 [53:11<3:50:51,  3.07s/it] 

{'loss': 0.6553, 'grad_norm': 0.6203317046165466, 'learning_rate': 0.00026437499999999996, 'epoch': 3.94}


 20%|█▉        | 1120/5620 [53:30<2:14:49,  1.80s/it]

{'loss': 0.7304, 'grad_norm': 0.802864134311676, 'learning_rate': 0.00026378906249999997, 'epoch': 3.98}


 20%|██        | 1130/5620 [53:53<3:56:08,  3.16s/it]

{'loss': 0.6919, 'grad_norm': 0.44283032417297363, 'learning_rate': 0.000263203125, 'epoch': 4.01}


 20%|██        | 1140/5620 [54:24<3:35:47,  2.89s/it]

{'loss': 0.592, 'grad_norm': 0.5175955891609192, 'learning_rate': 0.0002626171875, 'epoch': 4.05}


 20%|██        | 1150/5620 [54:48<2:52:05,  2.31s/it]

{'loss': 0.5693, 'grad_norm': 0.6607251167297363, 'learning_rate': 0.00026203124999999995, 'epoch': 4.09}


 21%|██        | 1160/5620 [55:08<2:19:58,  1.88s/it]

{'loss': 0.5778, 'grad_norm': 0.9512099027633667, 'learning_rate': 0.00026144531249999996, 'epoch': 4.12}


 21%|██        | 1170/5620 [55:25<1:58:35,  1.60s/it]

{'loss': 0.5474, 'grad_norm': 0.7696147561073303, 'learning_rate': 0.00026085937499999997, 'epoch': 4.16}


 21%|██        | 1180/5620 [55:48<3:53:54,  3.16s/it]

{'loss': 0.6355, 'grad_norm': 0.5272123217582703, 'learning_rate': 0.0002602734375, 'epoch': 4.19}


 21%|██        | 1190/5620 [56:17<3:24:38,  2.77s/it]

{'loss': 0.6451, 'grad_norm': 0.6654644012451172, 'learning_rate': 0.0002596875, 'epoch': 4.23}


 21%|██▏       | 1200/5620 [56:41<2:48:12,  2.28s/it]

{'loss': 0.5748, 'grad_norm': 0.5465813279151917, 'learning_rate': 0.00025910156249999995, 'epoch': 4.26}


                                                     
 21%|██▏       | 1200/5620 [57:29<2:48:12,  2.28s/it]

{'eval_loss': 0.4925563335418701, 'eval_wer': 0.5513762473955478, 'eval_runtime': 48.3745, 'eval_samples_per_second': 20.672, 'eval_steps_per_second': 2.584, 'epoch': 4.26}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 22%|██▏       | 1210/5620 [57:53<3:10:11,  2.59s/it] 

{'loss': 0.5278, 'grad_norm': 0.7016194462776184, 'learning_rate': 0.00025851562499999996, 'epoch': 4.3}


 22%|██▏       | 1220/5620 [58:10<2:00:04,  1.64s/it]

{'loss': 0.5636, 'grad_norm': 0.9119442701339722, 'learning_rate': 0.00025792968749999997, 'epoch': 4.33}


 22%|██▏       | 1230/5620 [58:33<3:49:20,  3.13s/it]

{'loss': 0.6753, 'grad_norm': 0.55336993932724, 'learning_rate': 0.00025734375, 'epoch': 4.37}


 22%|██▏       | 1240/5620 [59:04<3:35:50,  2.96s/it]

{'loss': 0.6169, 'grad_norm': 0.5233244299888611, 'learning_rate': 0.0002567578125, 'epoch': 4.4}


 22%|██▏       | 1250/5620 [59:28<2:49:53,  2.33s/it]

{'loss': 0.5418, 'grad_norm': 0.6977956891059875, 'learning_rate': 0.000256171875, 'epoch': 4.44}


 22%|██▏       | 1260/5620 [59:48<2:18:11,  1.90s/it]

{'loss': 0.5694, 'grad_norm': 0.657643735408783, 'learning_rate': 0.00025558593749999995, 'epoch': 4.48}


 23%|██▎       | 1270/5620 [1:00:05<1:56:37,  1.61s/it]

{'loss': 0.5738, 'grad_norm': 0.7241307497024536, 'learning_rate': 0.00025499999999999996, 'epoch': 4.51}


 23%|██▎       | 1280/5620 [1:00:28<3:46:57,  3.14s/it]

{'loss': 0.6413, 'grad_norm': 0.6881661415100098, 'learning_rate': 0.0002544140625, 'epoch': 4.55}


 23%|██▎       | 1290/5620 [1:00:58<3:26:18,  2.86s/it]

{'loss': 0.5362, 'grad_norm': 0.5868576169013977, 'learning_rate': 0.000253828125, 'epoch': 4.58}


 23%|██▎       | 1300/5620 [1:01:22<2:45:35,  2.30s/it]

{'loss': 0.5432, 'grad_norm': 0.6995622515678406, 'learning_rate': 0.0002532421875, 'epoch': 4.62}


                                                       
 23%|██▎       | 1300/5620 [1:02:11<2:45:35,  2.30s/it]

{'eval_loss': 0.45127278566360474, 'eval_wer': 0.5019190700734729, 'eval_runtime': 48.472, 'eval_samples_per_second': 20.63, 'eval_steps_per_second': 2.579, 'epoch': 4.62}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 23%|██▎       | 1310/5620 [1:02:34<3:05:52,  2.59s/it] 

{'loss': 0.5352, 'grad_norm': 0.620255708694458, 'learning_rate': 0.00025265624999999995, 'epoch': 4.65}


 23%|██▎       | 1320/5620 [1:02:51<1:55:44,  1.61s/it]

{'loss': 0.5779, 'grad_norm': 0.9346354007720947, 'learning_rate': 0.00025207031249999996, 'epoch': 4.69}


 24%|██▎       | 1330/5620 [1:03:14<3:38:59,  3.06s/it]

{'loss': 0.6732, 'grad_norm': 0.47553274035453796, 'learning_rate': 0.00025148437499999997, 'epoch': 4.72}


 24%|██▍       | 1340/5620 [1:03:44<3:18:27,  2.78s/it]

{'loss': 0.5441, 'grad_norm': 0.5119175314903259, 'learning_rate': 0.0002508984375, 'epoch': 4.76}


 24%|██▍       | 1350/5620 [1:04:08<2:40:24,  2.25s/it]

{'loss': 0.5792, 'grad_norm': 0.9018890261650085, 'learning_rate': 0.0002503125, 'epoch': 4.8}


 24%|██▍       | 1360/5620 [1:04:28<2:16:48,  1.93s/it]

{'loss': 0.5335, 'grad_norm': 0.5371050834655762, 'learning_rate': 0.00024972656249999995, 'epoch': 4.83}


 24%|██▍       | 1370/5620 [1:04:45<1:54:22,  1.61s/it]

{'loss': 0.5715, 'grad_norm': 0.756286084651947, 'learning_rate': 0.00024914062499999996, 'epoch': 4.87}


 25%|██▍       | 1380/5620 [1:05:08<3:36:47,  3.07s/it]

{'loss': 0.6136, 'grad_norm': 0.56305992603302, 'learning_rate': 0.00024855468749999997, 'epoch': 4.9}


 25%|██▍       | 1390/5620 [1:05:36<2:59:48,  2.55s/it]

{'loss': 0.5349, 'grad_norm': 0.545341968536377, 'learning_rate': 0.00024796875, 'epoch': 4.94}


 25%|██▍       | 1400/5620 [1:05:55<2:10:00,  1.85s/it]

{'loss': 0.5372, 'grad_norm': 0.810308575630188, 'learning_rate': 0.0002473828125, 'epoch': 4.97}


                                                       
 25%|██▍       | 1400/5620 [1:06:44<2:10:00,  1.85s/it]

{'eval_loss': 0.43144628405570984, 'eval_wer': 0.4799868406623533, 'eval_runtime': 48.3182, 'eval_samples_per_second': 20.696, 'eval_steps_per_second': 2.587, 'epoch': 4.97}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 25%|██▌       | 1410/5620 [1:07:07<3:59:46,  3.42s/it] 

{'loss': 0.6266, 'grad_norm': 0.6179250478744507, 'learning_rate': 0.00024679687499999995, 'epoch': 5.01}


 25%|██▌       | 1420/5620 [1:07:39<3:25:11,  2.93s/it]

{'loss': 0.4995, 'grad_norm': 0.48217302560806274, 'learning_rate': 0.00024621093749999996, 'epoch': 5.04}


 25%|██▌       | 1430/5620 [1:08:03<2:44:58,  2.36s/it]

{'loss': 0.4801, 'grad_norm': 0.4900878965854645, 'learning_rate': 0.00024562499999999997, 'epoch': 5.08}


 26%|██▌       | 1440/5620 [1:08:24<2:17:37,  1.98s/it]

{'loss': 0.4419, 'grad_norm': 0.542972207069397, 'learning_rate': 0.0002450390625, 'epoch': 5.12}


 26%|██▌       | 1450/5620 [1:08:40<1:53:57,  1.64s/it]

{'loss': 0.5019, 'grad_norm': 0.9164965748786926, 'learning_rate': 0.000244453125, 'epoch': 5.15}


 26%|██▌       | 1460/5620 [1:09:01<3:16:26,  2.83s/it]

{'loss': 0.4754, 'grad_norm': 0.4844277501106262, 'learning_rate': 0.00024386718749999997, 'epoch': 5.19}


 26%|██▌       | 1470/5620 [1:09:32<3:25:25,  2.97s/it]

{'loss': 0.4907, 'grad_norm': 0.794083297252655, 'learning_rate': 0.00024328124999999998, 'epoch': 5.22}


 26%|██▋       | 1480/5620 [1:09:57<2:41:42,  2.34s/it]

{'loss': 0.5045, 'grad_norm': 0.5635599493980408, 'learning_rate': 0.00024269531249999996, 'epoch': 5.26}


 27%|██▋       | 1490/5620 [1:10:18<2:23:46,  2.09s/it]

{'loss': 0.4976, 'grad_norm': 0.5977628231048584, 'learning_rate': 0.00024210937499999997, 'epoch': 5.29}


 27%|██▋       | 1500/5620 [1:10:36<1:54:56,  1.67s/it]

{'loss': 0.4713, 'grad_norm': 0.8593668937683105, 'learning_rate': 0.00024152343749999998, 'epoch': 5.33}


                                                       
 27%|██▋       | 1500/5620 [1:11:24<1:54:56,  1.67s/it]

{'eval_loss': 0.42893609404563904, 'eval_wer': 0.4652922469569032, 'eval_runtime': 48.375, 'eval_samples_per_second': 20.672, 'eval_steps_per_second': 2.584, 'epoch': 5.33}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 27%|██▋       | 1510/5620 [1:11:47<3:48:53,  3.34s/it] 

{'loss': 0.5516, 'grad_norm': 0.6059084534645081, 'learning_rate': 0.00024093749999999997, 'epoch': 5.36}


 27%|██▋       | 1520/5620 [1:12:19<3:21:20,  2.95s/it]

{'loss': 0.4947, 'grad_norm': 0.6018845438957214, 'learning_rate': 0.00024035156249999998, 'epoch': 5.4}


 27%|██▋       | 1530/5620 [1:12:43<2:41:22,  2.37s/it]

{'loss': 0.44, 'grad_norm': 0.7216529846191406, 'learning_rate': 0.00023976562499999996, 'epoch': 5.44}


 27%|██▋       | 1540/5620 [1:13:04<2:15:40,  2.00s/it]

{'loss': 0.4258, 'grad_norm': 0.513842761516571, 'learning_rate': 0.00023917968749999997, 'epoch': 5.47}


 28%|██▊       | 1550/5620 [1:13:21<1:50:39,  1.63s/it]

{'loss': 0.4684, 'grad_norm': 0.7250943779945374, 'learning_rate': 0.00023859374999999998, 'epoch': 5.51}


 28%|██▊       | 1560/5620 [1:13:41<3:05:25,  2.74s/it]

{'loss': 0.5465, 'grad_norm': 0.6741055846214294, 'learning_rate': 0.00023800781249999997, 'epoch': 5.54}


 28%|██▊       | 1570/5620 [1:14:14<3:22:53,  3.01s/it]

{'loss': 0.4777, 'grad_norm': 0.5388421416282654, 'learning_rate': 0.00023742187499999998, 'epoch': 5.58}


 28%|██▊       | 1580/5620 [1:14:39<2:44:24,  2.44s/it]

{'loss': 0.444, 'grad_norm': 0.569648265838623, 'learning_rate': 0.00023683593749999999, 'epoch': 5.61}


 28%|██▊       | 1590/5620 [1:15:00<2:15:39,  2.02s/it]

{'loss': 0.415, 'grad_norm': 1.0279533863067627, 'learning_rate': 0.00023624999999999997, 'epoch': 5.65}


 28%|██▊       | 1600/5620 [1:15:17<1:50:17,  1.65s/it]

{'loss': 0.4574, 'grad_norm': 0.6851861476898193, 'learning_rate': 0.00023566406249999998, 'epoch': 5.68}


                                                       
 28%|██▊       | 1600/5620 [1:16:06<1:50:17,  1.65s/it]

{'eval_loss': 0.41755905747413635, 'eval_wer': 0.4629893628687356, 'eval_runtime': 48.4187, 'eval_samples_per_second': 20.653, 'eval_steps_per_second': 2.582, 'epoch': 5.68}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 29%|██▊       | 1610/5620 [1:16:29<3:54:51,  3.51s/it] 

{'loss': 0.5439, 'grad_norm': 0.6937316656112671, 'learning_rate': 0.00023507812499999996, 'epoch': 5.72}


 29%|██▉       | 1620/5620 [1:17:01<3:21:04,  3.02s/it]

{'loss': 0.4774, 'grad_norm': 0.6983432769775391, 'learning_rate': 0.00023449218749999997, 'epoch': 5.75}


 29%|██▉       | 1630/5620 [1:17:25<2:34:59,  2.33s/it]

{'loss': 0.4413, 'grad_norm': 0.620140552520752, 'learning_rate': 0.00023390624999999998, 'epoch': 5.79}


 29%|██▉       | 1640/5620 [1:17:46<2:13:12,  2.01s/it]

{'loss': 0.4296, 'grad_norm': 0.7060505747795105, 'learning_rate': 0.00023332031249999997, 'epoch': 5.83}


 29%|██▉       | 1650/5620 [1:18:02<1:43:09,  1.56s/it]

{'loss': 0.4768, 'grad_norm': 0.8447163105010986, 'learning_rate': 0.00023273437499999998, 'epoch': 5.86}


 30%|██▉       | 1660/5620 [1:18:22<3:03:52,  2.79s/it]

{'loss': 0.5169, 'grad_norm': 0.6244008541107178, 'learning_rate': 0.00023214843749999996, 'epoch': 5.9}


 30%|██▉       | 1670/5620 [1:18:51<2:51:00,  2.60s/it]

{'loss': 0.4647, 'grad_norm': 0.6142054200172424, 'learning_rate': 0.00023156249999999997, 'epoch': 5.93}


 30%|██▉       | 1680/5620 [1:19:11<2:07:23,  1.94s/it]

{'loss': 0.4195, 'grad_norm': 0.676738977432251, 'learning_rate': 0.00023097656249999998, 'epoch': 5.97}


 30%|███       | 1690/5620 [1:19:29<2:23:45,  2.19s/it]

{'loss': 0.5235, 'grad_norm': 0.6344739198684692, 'learning_rate': 0.00023039062499999996, 'epoch': 6.0}


 30%|███       | 1700/5620 [1:20:02<3:22:17,  3.10s/it]

{'loss': 0.4031, 'grad_norm': 0.4554711878299713, 'learning_rate': 0.00022980468749999997, 'epoch': 6.04}


                                                       
 30%|███       | 1700/5620 [1:20:51<3:22:17,  3.10s/it]

{'eval_loss': 0.4024040400981903, 'eval_wer': 0.4272398289286106, 'eval_runtime': 48.4251, 'eval_samples_per_second': 20.65, 'eval_steps_per_second': 2.581, 'epoch': 6.04}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 30%|███       | 1710/5620 [1:21:20<3:15:59,  3.01s/it] 

{'loss': 0.4327, 'grad_norm': 0.5687083601951599, 'learning_rate': 0.00022921874999999998, 'epoch': 6.07}


 31%|███       | 1720/5620 [1:21:41<2:12:35,  2.04s/it]

{'loss': 0.3741, 'grad_norm': 0.5998802185058594, 'learning_rate': 0.00022863281249999997, 'epoch': 6.11}


 31%|███       | 1730/5620 [1:21:58<1:48:55,  1.68s/it]

{'loss': 0.4287, 'grad_norm': 0.9237876534461975, 'learning_rate': 0.00022804687499999998, 'epoch': 6.15}


 31%|███       | 1740/5620 [1:22:15<2:17:49,  2.13s/it]

{'loss': 0.4494, 'grad_norm': 0.4351884722709656, 'learning_rate': 0.00022746093749999996, 'epoch': 6.18}


 31%|███       | 1750/5620 [1:22:48<3:17:52,  3.07s/it]

{'loss': 0.4394, 'grad_norm': 0.700130045413971, 'learning_rate': 0.00022687499999999997, 'epoch': 6.22}


 31%|███▏      | 1760/5620 [1:23:13<2:33:19,  2.38s/it]

{'loss': 0.415, 'grad_norm': 0.7486356496810913, 'learning_rate': 0.00022628906249999998, 'epoch': 6.25}


 31%|███▏      | 1770/5620 [1:23:34<2:12:44,  2.07s/it]

{'loss': 0.3651, 'grad_norm': 0.6370599269866943, 'learning_rate': 0.00022570312499999997, 'epoch': 6.29}


 32%|███▏      | 1780/5620 [1:23:52<1:51:54,  1.75s/it]

{'loss': 0.3723, 'grad_norm': 0.9279974102973938, 'learning_rate': 0.00022511718749999998, 'epoch': 6.32}


 32%|███▏      | 1790/5620 [1:24:09<2:16:47,  2.14s/it]

{'loss': 0.4527, 'grad_norm': 0.6851380467414856, 'learning_rate': 0.00022453124999999999, 'epoch': 6.36}


 32%|███▏      | 1800/5620 [1:24:42<3:15:51,  3.08s/it]

{'loss': 0.4277, 'grad_norm': 0.5446849465370178, 'learning_rate': 0.00022394531249999997, 'epoch': 6.39}


                                                       
 32%|███▏      | 1800/5620 [1:25:31<3:15:51,  3.08s/it]

{'eval_loss': 0.401576429605484, 'eval_wer': 0.41835727601710715, 'eval_runtime': 48.4011, 'eval_samples_per_second': 20.661, 'eval_steps_per_second': 2.583, 'epoch': 6.39}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 32%|███▏      | 1810/5620 [1:26:00<3:18:04,  3.12s/it] 

{'loss': 0.3929, 'grad_norm': 0.5770736932754517, 'learning_rate': 0.00022335937499999998, 'epoch': 6.43}


 32%|███▏      | 1820/5620 [1:26:22<2:12:15,  2.09s/it]

{'loss': 0.3765, 'grad_norm': 0.6641653776168823, 'learning_rate': 0.00022277343749999996, 'epoch': 6.47}


 33%|███▎      | 1830/5620 [1:26:39<1:47:45,  1.71s/it]

{'loss': 0.4118, 'grad_norm': 0.6415061950683594, 'learning_rate': 0.00022218749999999997, 'epoch': 6.5}


 33%|███▎      | 1840/5620 [1:26:56<2:15:00,  2.14s/it]

{'loss': 0.4587, 'grad_norm': 0.629558265209198, 'learning_rate': 0.00022160156249999998, 'epoch': 6.54}


 33%|███▎      | 1850/5620 [1:27:29<3:09:19,  3.01s/it]

{'loss': 0.4121, 'grad_norm': 0.49752920866012573, 'learning_rate': 0.00022101562499999997, 'epoch': 6.57}


 33%|███▎      | 1860/5620 [1:27:54<2:32:34,  2.43s/it]

{'loss': 0.4346, 'grad_norm': 0.5526525378227234, 'learning_rate': 0.00022042968749999998, 'epoch': 6.61}


 33%|███▎      | 1870/5620 [1:28:15<2:06:16,  2.02s/it]

{'loss': 0.3957, 'grad_norm': 0.6626412272453308, 'learning_rate': 0.00021984375, 'epoch': 6.64}


 33%|███▎      | 1880/5620 [1:28:33<1:47:29,  1.72s/it]

{'loss': 0.3696, 'grad_norm': 0.765130877494812, 'learning_rate': 0.00021925781249999997, 'epoch': 6.68}


 34%|███▎      | 1890/5620 [1:28:49<2:12:35,  2.13s/it]

{'loss': 0.4723, 'grad_norm': 0.4844018816947937, 'learning_rate': 0.00021867187499999998, 'epoch': 6.71}


 34%|███▍      | 1900/5620 [1:29:22<3:10:58,  3.08s/it]

{'loss': 0.4469, 'grad_norm': 0.5962949991226196, 'learning_rate': 0.00021808593749999996, 'epoch': 6.75}


                                                       
 34%|███▍      | 1900/5620 [1:30:11<3:10:58,  3.08s/it]

{'eval_loss': 0.3728272616863251, 'eval_wer': 0.4101326899879373, 'eval_runtime': 48.4069, 'eval_samples_per_second': 20.658, 'eval_steps_per_second': 2.582, 'epoch': 6.75}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 34%|███▍      | 1910/5620 [1:30:40<3:16:40,  3.18s/it] 

{'loss': 0.408, 'grad_norm': 0.6323874592781067, 'learning_rate': 0.00021749999999999997, 'epoch': 6.79}


 34%|███▍      | 1920/5620 [1:31:02<2:07:59,  2.08s/it]

{'loss': 0.326, 'grad_norm': 0.3984197676181793, 'learning_rate': 0.00021691406249999998, 'epoch': 6.82}


 34%|███▍      | 1930/5620 [1:31:20<1:44:26,  1.70s/it]

{'loss': 0.4171, 'grad_norm': 0.9933977723121643, 'learning_rate': 0.00021632812499999997, 'epoch': 6.86}


 35%|███▍      | 1940/5620 [1:31:36<2:09:37,  2.11s/it]

{'loss': 0.5279, 'grad_norm': 0.7125703692436218, 'learning_rate': 0.00021574218749999998, 'epoch': 6.89}


 35%|███▍      | 1950/5620 [1:32:07<2:45:15,  2.70s/it]

{'loss': 0.4223, 'grad_norm': 0.49379971623420715, 'learning_rate': 0.00021515624999999996, 'epoch': 6.93}


 35%|███▍      | 1960/5620 [1:32:28<2:02:16,  2.00s/it]

{'loss': 0.3485, 'grad_norm': 0.6452614068984985, 'learning_rate': 0.00021457031249999997, 'epoch': 6.96}


 35%|███▌      | 1970/5620 [1:32:43<1:22:23,  1.35s/it]

{'loss': 0.3915, 'grad_norm': 1.4129575490951538, 'learning_rate': 0.00021398437499999998, 'epoch': 7.0}


 35%|███▌      | 1980/5620 [1:33:17<3:12:36,  3.17s/it]

{'loss': 0.3701, 'grad_norm': 0.6237117052078247, 'learning_rate': 0.00021339843749999997, 'epoch': 7.03}


 35%|███▌      | 1990/5620 [1:33:44<2:33:29,  2.54s/it]

{'loss': 0.3538, 'grad_norm': 0.49408167600631714, 'learning_rate': 0.00021281249999999998, 'epoch': 7.07}


 36%|███▌      | 2000/5620 [1:34:05<2:03:53,  2.05s/it]

{'loss': 0.3166, 'grad_norm': 0.7586616277694702, 'learning_rate': 0.00021222656249999999, 'epoch': 7.1}


                                                       
 36%|███▌      | 2000/5620 [1:34:54<2:03:53,  2.05s/it]

{'eval_loss': 0.38104063272476196, 'eval_wer': 0.38699418795920604, 'eval_runtime': 48.3961, 'eval_samples_per_second': 20.663, 'eval_steps_per_second': 2.583, 'epoch': 7.1}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 36%|███▌      | 2010/5620 [1:35:15<2:21:44,  2.36s/it] 

{'loss': 0.38, 'grad_norm': 0.8995524048805237, 'learning_rate': 0.00021164062499999997, 'epoch': 7.14}


 36%|███▌      | 2020/5620 [1:35:29<1:15:30,  1.26s/it]

{'loss': 0.3762, 'grad_norm': 0.974122166633606, 'learning_rate': 0.00021105468749999998, 'epoch': 7.18}


 36%|███▌      | 2030/5620 [1:36:03<3:09:10,  3.16s/it]

{'loss': 0.387, 'grad_norm': 0.7177022099494934, 'learning_rate': 0.00021046874999999996, 'epoch': 7.21}


 36%|███▋      | 2040/5620 [1:36:30<2:34:08,  2.58s/it]

{'loss': 0.334, 'grad_norm': 0.7906502485275269, 'learning_rate': 0.00020988281249999997, 'epoch': 7.25}


 36%|███▋      | 2050/5620 [1:36:52<2:02:55,  2.07s/it]

{'loss': 0.3717, 'grad_norm': 0.6829425692558289, 'learning_rate': 0.00020929687499999998, 'epoch': 7.28}


 37%|███▋      | 2060/5620 [1:37:10<1:45:29,  1.78s/it]

{'loss': 0.3345, 'grad_norm': 0.9672988653182983, 'learning_rate': 0.00020871093749999997, 'epoch': 7.32}


 37%|███▋      | 2070/5620 [1:37:24<1:15:00,  1.27s/it]

{'loss': 0.3804, 'grad_norm': 0.8914594650268555, 'learning_rate': 0.00020812499999999998, 'epoch': 7.35}


 37%|███▋      | 2080/5620 [1:37:59<3:08:44,  3.20s/it]

{'loss': 0.3924, 'grad_norm': 0.7353417277336121, 'learning_rate': 0.0002075390625, 'epoch': 7.39}


 37%|███▋      | 2090/5620 [1:38:25<2:33:27,  2.61s/it]

{'loss': 0.326, 'grad_norm': 0.5418347120285034, 'learning_rate': 0.00020695312499999997, 'epoch': 7.42}


 37%|███▋      | 2100/5620 [1:38:47<2:00:04,  2.05s/it]

{'loss': 0.3616, 'grad_norm': 0.6532233953475952, 'learning_rate': 0.00020636718749999998, 'epoch': 7.46}


                                                       
 37%|███▋      | 2100/5620 [1:39:35<2:00:04,  2.05s/it]

{'eval_loss': 0.3854627311229706, 'eval_wer': 0.41068099572321526, 'eval_runtime': 48.3721, 'eval_samples_per_second': 20.673, 'eval_steps_per_second': 2.584, 'epoch': 7.46}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 38%|███▊      | 2110/5620 [1:39:57<2:19:19,  2.38s/it] 

{'loss': 0.3391, 'grad_norm': 0.7702137231826782, 'learning_rate': 0.00020578124999999996, 'epoch': 7.5}


 38%|███▊      | 2120/5620 [1:40:11<1:16:43,  1.32s/it]

{'loss': 0.3711, 'grad_norm': 1.2241042852401733, 'learning_rate': 0.00020519531249999997, 'epoch': 7.53}


 38%|███▊      | 2130/5620 [1:40:44<3:01:35,  3.12s/it]

{'loss': 0.4079, 'grad_norm': 0.4207267463207245, 'learning_rate': 0.00020460937499999998, 'epoch': 7.57}


 38%|███▊      | 2140/5620 [1:41:11<2:26:36,  2.53s/it]

{'loss': 0.3508, 'grad_norm': 0.4947180449962616, 'learning_rate': 0.00020402343749999997, 'epoch': 7.6}


 38%|███▊      | 2150/5620 [1:41:32<1:59:30,  2.07s/it]

{'loss': 0.3374, 'grad_norm': 0.6356584429740906, 'learning_rate': 0.00020343749999999998, 'epoch': 7.64}


 38%|███▊      | 2160/5620 [1:41:50<1:40:56,  1.75s/it]

{'loss': 0.3328, 'grad_norm': 0.6851476430892944, 'learning_rate': 0.0002028515625, 'epoch': 7.67}


 39%|███▊      | 2170/5620 [1:42:04<1:11:11,  1.24s/it]

{'loss': 0.3951, 'grad_norm': 1.230563759803772, 'learning_rate': 0.00020226562499999997, 'epoch': 7.71}


 39%|███▉      | 2180/5620 [1:42:38<2:59:28,  3.13s/it]

{'loss': 0.4125, 'grad_norm': 0.5845654606819153, 'learning_rate': 0.00020167968749999998, 'epoch': 7.74}


 39%|███▉      | 2190/5620 [1:43:04<2:21:23,  2.47s/it]

{'loss': 0.3659, 'grad_norm': 0.819823145866394, 'learning_rate': 0.00020109374999999997, 'epoch': 7.78}


 39%|███▉      | 2200/5620 [1:43:25<1:55:55,  2.03s/it]

{'loss': 0.3677, 'grad_norm': 0.7976038455963135, 'learning_rate': 0.00020050781249999998, 'epoch': 7.82}


                                                       
 39%|███▉      | 2200/5620 [1:44:13<1:55:55,  2.03s/it]

{'eval_loss': 0.35470330715179443, 'eval_wer': 0.38107248601820376, 'eval_runtime': 48.3521, 'eval_samples_per_second': 20.682, 'eval_steps_per_second': 2.585, 'epoch': 7.82}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 39%|███▉      | 2210/5620 [1:44:35<2:13:50,  2.35s/it] 

{'loss': 0.3855, 'grad_norm': 0.7398266196250916, 'learning_rate': 0.00019992187499999999, 'epoch': 7.85}


 40%|███▉      | 2220/5620 [1:44:49<1:13:40,  1.30s/it]

{'loss': 0.3984, 'grad_norm': 0.9832607507705688, 'learning_rate': 0.00019933593749999997, 'epoch': 7.89}


 40%|███▉      | 2230/5620 [1:45:21<2:43:14,  2.89s/it]

{'loss': 0.3833, 'grad_norm': 0.4879007637500763, 'learning_rate': 0.00019874999999999998, 'epoch': 7.92}


 40%|███▉      | 2240/5620 [1:45:44<2:03:03,  2.18s/it]

{'loss': 0.3299, 'grad_norm': 0.5522030591964722, 'learning_rate': 0.0001981640625, 'epoch': 7.96}


 40%|████      | 2250/5620 [1:46:00<1:23:27,  1.49s/it]

{'loss': 0.3903, 'grad_norm': 0.8139228224754333, 'learning_rate': 0.00019757812499999997, 'epoch': 7.99}


 40%|████      | 2260/5620 [1:46:31<3:00:36,  3.23s/it]

{'loss': 0.3568, 'grad_norm': 0.42930346727371216, 'learning_rate': 0.00019699218749999998, 'epoch': 8.03}


 40%|████      | 2270/5620 [1:46:58<2:19:51,  2.50s/it]

{'loss': 0.3024, 'grad_norm': 0.8148699998855591, 'learning_rate': 0.00019640624999999997, 'epoch': 8.06}


 41%|████      | 2280/5620 [1:47:20<1:58:09,  2.12s/it]

{'loss': 0.3057, 'grad_norm': 0.45959240198135376, 'learning_rate': 0.00019582031249999998, 'epoch': 8.1}


 41%|████      | 2290/5620 [1:47:38<1:37:01,  1.75s/it]

{'loss': 0.3334, 'grad_norm': 1.0153565406799316, 'learning_rate': 0.000195234375, 'epoch': 8.13}


 41%|████      | 2300/5620 [1:47:53<1:14:47,  1.35s/it]

{'loss': 0.3465, 'grad_norm': nan, 'learning_rate': 0.00019470703125, 'epoch': 8.17}


                                                       
 41%|████      | 2300/5620 [1:48:41<1:14:47,  1.35s/it]

{'eval_loss': 0.3699369430541992, 'eval_wer': 0.3831560478122601, 'eval_runtime': 48.5056, 'eval_samples_per_second': 20.616, 'eval_steps_per_second': 2.577, 'epoch': 8.17}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 41%|████      | 2310/5620 [1:49:15<3:32:22,  3.85s/it] 

{'loss': 0.3515, 'grad_norm': 0.3393856883049011, 'learning_rate': 0.00019412109374999998, 'epoch': 8.21}


 41%|████▏     | 2320/5620 [1:49:43<2:25:04,  2.64s/it]

{'loss': 0.2979, 'grad_norm': 0.5179024934768677, 'learning_rate': 0.00019353515625, 'epoch': 8.24}


 41%|████▏     | 2330/5620 [1:50:06<1:58:38,  2.16s/it]

{'loss': 0.3058, 'grad_norm': 0.4952671229839325, 'learning_rate': 0.00019294921874999998, 'epoch': 8.28}


 42%|████▏     | 2340/5620 [1:50:24<1:40:17,  1.83s/it]

{'loss': 0.2966, 'grad_norm': 0.6268970966339111, 'learning_rate': 0.00019236328125, 'epoch': 8.31}


 42%|████▏     | 2350/5620 [1:50:40<1:16:45,  1.41s/it]

{'loss': 0.3032, 'grad_norm': 0.7518585920333862, 'learning_rate': 0.00019177734375, 'epoch': 8.35}


 42%|████▏     | 2360/5620 [1:51:11<2:59:02,  3.30s/it]

{'loss': 0.3781, 'grad_norm': 0.562218189239502, 'learning_rate': 0.00019119140624999998, 'epoch': 8.38}


 42%|████▏     | 2370/5620 [1:51:39<2:22:20,  2.63s/it]

{'loss': 0.3064, 'grad_norm': 0.6619564890861511, 'learning_rate': 0.00019060546875, 'epoch': 8.42}


 42%|████▏     | 2380/5620 [1:52:01<1:54:57,  2.13s/it]

{'loss': 0.2762, 'grad_norm': 1.0013513565063477, 'learning_rate': 0.00019001953125, 'epoch': 8.45}


 43%|████▎     | 2390/5620 [1:52:19<1:36:02,  1.78s/it]

{'loss': 0.3218, 'grad_norm': 0.7865644693374634, 'learning_rate': 0.00018943359374999999, 'epoch': 8.49}


 43%|████▎     | 2400/5620 [1:52:34<1:15:01,  1.40s/it]

{'loss': 0.3042, 'grad_norm': 0.6602602601051331, 'learning_rate': 0.00018884765625, 'epoch': 8.53}


                                                       
 43%|████▎     | 2400/5620 [1:53:23<1:15:01,  1.40s/it]

{'eval_loss': 0.36700358986854553, 'eval_wer': 0.3856782541945389, 'eval_runtime': 48.5218, 'eval_samples_per_second': 20.609, 'eval_steps_per_second': 2.576, 'epoch': 8.53}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 43%|████▎     | 2410/5620 [1:53:57<3:27:38,  3.88s/it] 

{'loss': 0.3561, 'grad_norm': 0.5708282589912415, 'learning_rate': 0.00018826171874999998, 'epoch': 8.56}


 43%|████▎     | 2420/5620 [1:54:25<2:21:27,  2.65s/it]

{'loss': 0.3191, 'grad_norm': 0.42484134435653687, 'learning_rate': 0.00018767578125, 'epoch': 8.6}


 43%|████▎     | 2430/5620 [1:54:47<1:55:12,  2.17s/it]

{'loss': 0.3018, 'grad_norm': 0.5517683625221252, 'learning_rate': 0.00018708984375, 'epoch': 8.63}


 43%|████▎     | 2440/5620 [1:55:06<1:37:18,  1.84s/it]

{'loss': 0.278, 'grad_norm': 1.0955841541290283, 'learning_rate': 0.00018650390624999998, 'epoch': 8.67}


 44%|████▎     | 2450/5620 [1:55:21<1:10:59,  1.34s/it]

{'loss': 0.3473, 'grad_norm': 1.6554443836212158, 'learning_rate': 0.00018591796875, 'epoch': 8.7}


 44%|████▍     | 2460/5620 [1:55:52<2:48:19,  3.20s/it]

{'loss': 0.403, 'grad_norm': 0.5071585774421692, 'learning_rate': 0.00018533203124999998, 'epoch': 8.74}


 44%|████▍     | 2470/5620 [1:56:20<2:20:31,  2.68s/it]

{'loss': 0.3107, 'grad_norm': 0.51945561170578, 'learning_rate': 0.00018474609375, 'epoch': 8.77}


 44%|████▍     | 2480/5620 [1:56:43<1:51:37,  2.13s/it]

{'loss': 0.3137, 'grad_norm': 0.6409015655517578, 'learning_rate': 0.00018416015625, 'epoch': 8.81}


 44%|████▍     | 2490/5620 [1:57:02<1:35:28,  1.83s/it]

{'loss': 0.3107, 'grad_norm': 0.6941359639167786, 'learning_rate': 0.00018357421874999998, 'epoch': 8.85}


 44%|████▍     | 2500/5620 [1:57:17<1:12:57,  1.40s/it]

{'loss': 0.3372, 'grad_norm': 0.9873717427253723, 'learning_rate': 0.00018298828125, 'epoch': 8.88}


                                                       
 44%|████▍     | 2500/5620 [1:58:06<1:12:57,  1.40s/it]

{'eval_loss': 0.3580026924610138, 'eval_wer': 0.36791314837153194, 'eval_runtime': 48.6589, 'eval_samples_per_second': 20.551, 'eval_steps_per_second': 2.569, 'epoch': 8.88}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 45%|████▍     | 2510/5620 [1:58:38<3:03:11,  3.53s/it] 

{'loss': 0.3266, 'grad_norm': 0.554855227470398, 'learning_rate': 0.00018240234375, 'epoch': 8.92}


 45%|████▍     | 2520/5620 [1:59:02<1:55:20,  2.23s/it]

{'loss': 0.2854, 'grad_norm': 0.5996837615966797, 'learning_rate': 0.00018181640624999998, 'epoch': 8.95}


 45%|████▌     | 2530/5620 [1:59:19<1:25:53,  1.67s/it]

{'loss': 0.2846, 'grad_norm': 0.8464022278785706, 'learning_rate': 0.00018123046875, 'epoch': 8.99}


 45%|████▌     | 2540/5620 [1:59:47<2:48:53,  3.29s/it]

{'loss': 0.3563, 'grad_norm': 0.48094987869262695, 'learning_rate': 0.00018064453124999998, 'epoch': 9.02}


 45%|████▌     | 2550/5620 [2:00:16<2:19:51,  2.73s/it]

{'loss': 0.2733, 'grad_norm': 0.5377097129821777, 'learning_rate': 0.00018005859375, 'epoch': 9.06}


 46%|████▌     | 2560/5620 [2:00:39<1:52:40,  2.21s/it]

{'loss': 0.2839, 'grad_norm': 0.9905152916908264, 'learning_rate': 0.00017947265625, 'epoch': 9.09}


 46%|████▌     | 2570/5620 [2:00:58<1:33:41,  1.84s/it]

{'loss': 0.3081, 'grad_norm': 0.5736888647079468, 'learning_rate': 0.00017888671874999998, 'epoch': 9.13}


 46%|████▌     | 2580/5620 [2:01:14<1:15:14,  1.48s/it]

{'loss': 0.2526, 'grad_norm': 1.0601037740707397, 'learning_rate': 0.00017830078125, 'epoch': 9.17}


 46%|████▌     | 2590/5620 [2:01:42<2:46:31,  3.30s/it]

{'loss': 0.3278, 'grad_norm': 0.5225079655647278, 'learning_rate': 0.00017771484375, 'epoch': 9.2}


 46%|████▋     | 2600/5620 [2:02:11<2:19:13,  2.77s/it]

{'loss': 0.2695, 'grad_norm': 0.4707286059856415, 'learning_rate': 0.00017712890624999999, 'epoch': 9.24}


                                                       
 46%|████▋     | 2600/5620 [2:03:00<2:19:13,  2.77s/it]

{'eval_loss': 0.3638386130332947, 'eval_wer': 0.36758416493036516, 'eval_runtime': 48.5792, 'eval_samples_per_second': 20.585, 'eval_steps_per_second': 2.573, 'epoch': 9.24}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 46%|████▋     | 2610/5620 [2:03:27<2:23:56,  2.87s/it] 

{'loss': 0.2708, 'grad_norm': 0.5521630048751831, 'learning_rate': 0.00017654296875, 'epoch': 9.27}


 47%|████▋     | 2620/5620 [2:03:46<1:35:35,  1.91s/it]

{'loss': 0.276, 'grad_norm': 0.9296287298202515, 'learning_rate': 0.00017595703124999998, 'epoch': 9.31}


 47%|████▋     | 2630/5620 [2:04:02<1:14:03,  1.49s/it]

{'loss': 0.331, 'grad_norm': 0.6985962390899658, 'learning_rate': 0.00017537109375, 'epoch': 9.34}


 47%|████▋     | 2640/5620 [2:04:30<2:39:46,  3.22s/it]

{'loss': 0.3295, 'grad_norm': 0.5208333134651184, 'learning_rate': 0.00017478515625, 'epoch': 9.38}


 47%|████▋     | 2650/5620 [2:04:59<2:15:30,  2.74s/it]

{'loss': 0.2613, 'grad_norm': 0.8268219828605652, 'learning_rate': 0.00017419921874999998, 'epoch': 9.41}


 47%|████▋     | 2660/5620 [2:05:22<1:48:57,  2.21s/it]

{'loss': 0.2828, 'grad_norm': 0.7639422416687012, 'learning_rate': 0.00017361328125, 'epoch': 9.45}


 48%|████▊     | 2670/5620 [2:05:41<1:30:44,  1.85s/it]

{'loss': 0.2745, 'grad_norm': 0.5972168445587158, 'learning_rate': 0.00017302734375, 'epoch': 9.48}


 48%|████▊     | 2680/5620 [2:05:57<1:14:18,  1.52s/it]

{'loss': 0.2699, 'grad_norm': 0.8415752649307251, 'learning_rate': 0.00017244140625, 'epoch': 9.52}


 48%|████▊     | 2690/5620 [2:06:26<2:42:31,  3.33s/it]

{'loss': 0.3383, 'grad_norm': 0.41522398591041565, 'learning_rate': 0.00017185546875, 'epoch': 9.56}


 48%|████▊     | 2700/5620 [2:06:54<2:13:05,  2.73s/it]

{'loss': 0.3059, 'grad_norm': 0.4899861812591553, 'learning_rate': 0.00017126953124999998, 'epoch': 9.59}


                                                       
 48%|████▊     | 2700/5620 [2:07:43<2:13:05,  2.73s/it]

{'eval_loss': 0.3548935353755951, 'eval_wer': 0.3587016120188617, 'eval_runtime': 48.5618, 'eval_samples_per_second': 20.592, 'eval_steps_per_second': 2.574, 'epoch': 9.59}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 48%|████▊     | 2710/5620 [2:08:09<2:15:29,  2.79s/it] 

{'loss': 0.2874, 'grad_norm': 0.6139984726905823, 'learning_rate': 0.00017068359375, 'epoch': 9.63}


 48%|████▊     | 2720/5620 [2:08:28<1:28:36,  1.83s/it]

{'loss': 0.2885, 'grad_norm': 0.8228352665901184, 'learning_rate': 0.00017009765625, 'epoch': 9.66}


 49%|████▊     | 2730/5620 [2:08:43<1:10:29,  1.46s/it]

{'loss': 0.2962, 'grad_norm': 1.0338939428329468, 'learning_rate': 0.00016951171874999998, 'epoch': 9.7}


 49%|████▉     | 2740/5620 [2:09:11<2:37:10,  3.27s/it]

{'loss': 0.388, 'grad_norm': 0.4987642467021942, 'learning_rate': 0.00016892578125, 'epoch': 9.73}


 49%|████▉     | 2750/5620 [2:09:40<2:12:01,  2.76s/it]

{'loss': 0.2946, 'grad_norm': 0.6900378465652466, 'learning_rate': 0.00016833984375, 'epoch': 9.77}


 49%|████▉     | 2760/5620 [2:10:04<1:45:26,  2.21s/it]

{'loss': 0.2634, 'grad_norm': 0.5923492312431335, 'learning_rate': 0.00016775390625, 'epoch': 9.8}


 49%|████▉     | 2770/5620 [2:10:23<1:27:36,  1.84s/it]

{'loss': 0.2904, 'grad_norm': 0.5621572732925415, 'learning_rate': 0.00016716796875, 'epoch': 9.84}


 49%|████▉     | 2780/5620 [2:10:39<1:09:46,  1.47s/it]

{'loss': 0.3273, 'grad_norm': 0.8396136164665222, 'learning_rate': 0.00016658203124999998, 'epoch': 9.88}


 50%|████▉     | 2790/5620 [2:11:06<2:24:57,  3.07s/it]

{'loss': 0.3269, 'grad_norm': 0.6579383611679077, 'learning_rate': 0.00016599609375, 'epoch': 9.91}


 50%|████▉     | 2800/5620 [2:11:30<1:46:00,  2.26s/it]

{'loss': 0.3104, 'grad_norm': 0.6017203330993652, 'learning_rate': 0.00016541015625, 'epoch': 9.95}


                                                       
 50%|████▉     | 2800/5620 [2:12:18<1:46:00,  2.26s/it]

{'eval_loss': 0.365486741065979, 'eval_wer': 0.35530211646013815, 'eval_runtime': 48.4286, 'eval_samples_per_second': 20.649, 'eval_steps_per_second': 2.581, 'epoch': 9.95}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 50%|█████     | 2810/5620 [2:12:40<1:45:41,  2.26s/it] 

{'loss': 0.2999, 'grad_norm': 0.8562806248664856, 'learning_rate': 0.00016482421874999999, 'epoch': 9.98}


 50%|█████     | 2820/5620 [2:13:04<2:27:26,  3.16s/it]

{'loss': 0.3668, 'grad_norm': 0.49282729625701904, 'learning_rate': 0.00016423828125, 'epoch': 10.02}


 50%|█████     | 2830/5620 [2:13:34<2:08:30,  2.76s/it]

{'loss': 0.295, 'grad_norm': 0.46261072158813477, 'learning_rate': 0.00016365234374999998, 'epoch': 10.05}


 51%|█████     | 2840/5620 [2:13:57<1:46:52,  2.31s/it]

{'loss': 0.2511, 'grad_norm': 0.6359744668006897, 'learning_rate': 0.00016306640625, 'epoch': 10.09}


 51%|█████     | 2850/5620 [2:14:18<1:29:33,  1.94s/it]

{'loss': 0.2252, 'grad_norm': 0.48924145102500916, 'learning_rate': 0.00016248046875, 'epoch': 10.12}


 51%|█████     | 2860/5620 [2:14:34<1:11:37,  1.56s/it]

{'loss': 0.2495, 'grad_norm': 1.9386632442474365, 'learning_rate': 0.00016189453124999998, 'epoch': 10.16}


 51%|█████     | 2870/5620 [2:14:59<2:27:31,  3.22s/it]

{'loss': 0.3365, 'grad_norm': 1.0356849431991577, 'learning_rate': 0.00016130859375, 'epoch': 10.2}


 51%|█████     | 2880/5620 [2:15:30<2:10:34,  2.86s/it]

{'loss': 0.2478, 'grad_norm': 0.6851006150245667, 'learning_rate': 0.00016072265625, 'epoch': 10.23}


 51%|█████▏    | 2890/5620 [2:15:54<1:45:47,  2.33s/it]

{'loss': 0.2475, 'grad_norm': 0.499630331993103, 'learning_rate': 0.00016013671875, 'epoch': 10.27}


 52%|█████▏    | 2900/5620 [2:16:13<1:25:05,  1.88s/it]

{'loss': 0.2522, 'grad_norm': 0.6219022274017334, 'learning_rate': 0.00015955078125, 'epoch': 10.3}


                                                       
 52%|█████▏    | 2900/5620 [2:17:02<1:25:05,  1.88s/it]

{'eval_loss': 0.3797913193702698, 'eval_wer': 0.3544248272836934, 'eval_runtime': 48.5167, 'eval_samples_per_second': 20.611, 'eval_steps_per_second': 2.576, 'epoch': 10.3}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 52%|█████▏    | 2910/5620 [2:17:21<1:37:56,  2.17s/it] 

{'loss': 0.2622, 'grad_norm': 0.9725363254547119, 'learning_rate': 0.00015896484374999998, 'epoch': 10.34}


 52%|█████▏    | 2920/5620 [2:17:46<2:22:16,  3.16s/it]

{'loss': 0.3254, 'grad_norm': 0.6460405588150024, 'learning_rate': 0.00015837890625, 'epoch': 10.37}


 52%|█████▏    | 2930/5620 [2:18:16<2:03:56,  2.76s/it]

{'loss': 0.2721, 'grad_norm': 0.49017995595932007, 'learning_rate': 0.00015779296875, 'epoch': 10.41}


 52%|█████▏    | 2940/5620 [2:18:40<1:43:43,  2.32s/it]

{'loss': 0.2399, 'grad_norm': 0.5012296438217163, 'learning_rate': 0.00015720703124999998, 'epoch': 10.44}


 52%|█████▏    | 2950/5620 [2:19:00<1:25:32,  1.92s/it]

{'loss': 0.256, 'grad_norm': 0.5151152610778809, 'learning_rate': 0.00015662109375, 'epoch': 10.48}


 53%|█████▎    | 2960/5620 [2:19:16<1:07:29,  1.52s/it]

{'loss': 0.2694, 'grad_norm': 0.6395782232284546, 'learning_rate': 0.00015603515625, 'epoch': 10.52}


 53%|█████▎    | 2970/5620 [2:19:41<2:26:21,  3.31s/it]

{'loss': 0.3005, 'grad_norm': 0.44903770089149475, 'learning_rate': 0.00015544921875, 'epoch': 10.55}


 53%|█████▎    | 2980/5620 [2:20:11<2:01:14,  2.76s/it]

{'loss': 0.2687, 'grad_norm': 0.5544123649597168, 'learning_rate': 0.00015486328125, 'epoch': 10.59}


 53%|█████▎    | 2990/5620 [2:20:35<1:40:35,  2.30s/it]

{'loss': 0.2321, 'grad_norm': 0.639642596244812, 'learning_rate': 0.00015427734374999998, 'epoch': 10.62}


 53%|█████▎    | 3000/5620 [2:20:54<1:24:25,  1.93s/it]

{'loss': 0.2353, 'grad_norm': 0.5758784413337708, 'learning_rate': 0.00015369140625, 'epoch': 10.66}


                                                       
 53%|█████▎    | 3000/5620 [2:21:43<1:24:25,  1.93s/it]

{'eval_loss': 0.3617567718029022, 'eval_wer': 0.3425814234016888, 'eval_runtime': 48.4989, 'eval_samples_per_second': 20.619, 'eval_steps_per_second': 2.577, 'epoch': 10.66}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 54%|█████▎    | 3010/5620 [2:22:03<1:35:39,  2.20s/it] 

{'loss': 0.2467, 'grad_norm': 0.7535631656646729, 'learning_rate': 0.00015310546875, 'epoch': 10.69}


 54%|█████▎    | 3020/5620 [2:22:28<2:16:28,  3.15s/it]

{'loss': 0.3601, 'grad_norm': 0.5261889100074768, 'learning_rate': 0.00015251953124999999, 'epoch': 10.73}


 54%|█████▍    | 3030/5620 [2:22:57<2:00:12,  2.78s/it]

{'loss': 0.2615, 'grad_norm': 0.5005595684051514, 'learning_rate': 0.00015193359375, 'epoch': 10.76}


 54%|█████▍    | 3040/5620 [2:23:21<1:36:20,  2.24s/it]

{'loss': 0.2552, 'grad_norm': 0.5418822765350342, 'learning_rate': 0.00015134765625, 'epoch': 10.8}


 54%|█████▍    | 3050/5620 [2:23:40<1:18:58,  1.84s/it]

{'loss': 0.2512, 'grad_norm': 0.613331139087677, 'learning_rate': 0.00015076171875, 'epoch': 10.83}


 54%|█████▍    | 3060/5620 [2:23:56<1:04:11,  1.50s/it]

{'loss': 0.2798, 'grad_norm': 1.4555741548538208, 'learning_rate': 0.00015017578125, 'epoch': 10.87}


 55%|█████▍    | 3070/5620 [2:24:21<2:10:28,  3.07s/it]

{'loss': 0.3078, 'grad_norm': 0.6426699757575989, 'learning_rate': 0.00014958984374999998, 'epoch': 10.91}


 55%|█████▍    | 3080/5620 [2:24:47<1:40:15,  2.37s/it]

{'loss': 0.2452, 'grad_norm': 0.545131504535675, 'learning_rate': 0.00014900390625, 'epoch': 10.94}


 55%|█████▍    | 3090/5620 [2:25:06<1:14:58,  1.78s/it]

{'loss': 0.2803, 'grad_norm': 0.5365835428237915, 'learning_rate': 0.00014841796875, 'epoch': 10.98}


 55%|█████▌    | 3100/5620 [2:25:28<2:09:28,  3.08s/it]

{'loss': 0.2813, 'grad_norm': 0.47779110074043274, 'learning_rate': 0.00014783203125, 'epoch': 11.01}


                                                       
 55%|█████▌    | 3100/5620 [2:26:17<2:09:28,  3.08s/it]

{'eval_loss': 0.3562708795070648, 'eval_wer': 0.34685820813685714, 'eval_runtime': 48.7424, 'eval_samples_per_second': 20.516, 'eval_steps_per_second': 2.565, 'epoch': 11.01}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 55%|█████▌    | 3110/5620 [2:26:52<2:28:27,  3.55s/it] 

{'loss': 0.2398, 'grad_norm': 0.5966642498970032, 'learning_rate': 0.00014724609375, 'epoch': 11.05}


 56%|█████▌    | 3120/5620 [2:27:16<1:38:01,  2.35s/it]

{'loss': 0.2608, 'grad_norm': 0.5430663824081421, 'learning_rate': 0.00014666015624999998, 'epoch': 11.08}


 56%|█████▌    | 3130/5620 [2:27:37<1:23:21,  2.01s/it]

{'loss': 0.221, 'grad_norm': 0.5505279302597046, 'learning_rate': 0.00014607421875, 'epoch': 11.12}


 56%|█████▌    | 3140/5620 [2:27:54<1:06:05,  1.60s/it]

{'loss': 0.2389, 'grad_norm': 0.6085853576660156, 'learning_rate': 0.00014548828125, 'epoch': 11.15}


 56%|█████▌    | 3150/5620 [2:28:16<2:05:24,  3.05s/it]

{'loss': 0.2974, 'grad_norm': 0.5281742215156555, 'learning_rate': 0.00014490234374999998, 'epoch': 11.19}


 56%|█████▌    | 3160/5620 [2:28:47<1:57:13,  2.86s/it]

{'loss': 0.2622, 'grad_norm': 0.5273491144180298, 'learning_rate': 0.00014431640625, 'epoch': 11.23}


 56%|█████▋    | 3170/5620 [2:29:11<1:32:01,  2.25s/it]

{'loss': 0.2387, 'grad_norm': 0.567135214805603, 'learning_rate': 0.00014373046875, 'epoch': 11.26}


 57%|█████▋    | 3180/5620 [2:29:30<1:17:28,  1.91s/it]

{'loss': 0.2317, 'grad_norm': 0.7895539402961731, 'learning_rate': 0.00014314453125, 'epoch': 11.3}


 57%|█████▋    | 3190/5620 [2:29:47<1:04:37,  1.60s/it]

{'loss': 0.265, 'grad_norm': 0.6372637152671814, 'learning_rate': 0.00014255859375, 'epoch': 11.33}


 57%|█████▋    | 3200/5620 [2:30:09<2:04:16,  3.08s/it]

{'loss': 0.2984, 'grad_norm': 0.407438188791275, 'learning_rate': 0.00014197265624999998, 'epoch': 11.37}


                                                       
 57%|█████▋    | 3200/5620 [2:30:58<2:04:16,  3.08s/it]

{'eval_loss': 0.36532559990882874, 'eval_wer': 0.342252439960522, 'eval_runtime': 48.7085, 'eval_samples_per_second': 20.53, 'eval_steps_per_second': 2.566, 'epoch': 11.37}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 57%|█████▋    | 3210/5620 [2:31:33<2:22:27,  3.55s/it] 

{'loss': 0.2317, 'grad_norm': 0.6334208250045776, 'learning_rate': 0.00014138671875, 'epoch': 11.4}


 57%|█████▋    | 3220/5620 [2:31:57<1:32:46,  2.32s/it]

{'loss': 0.2758, 'grad_norm': 0.461556077003479, 'learning_rate': 0.00014080078125, 'epoch': 11.44}


 57%|█████▋    | 3230/5620 [2:32:17<1:16:21,  1.92s/it]

{'loss': 0.2143, 'grad_norm': 0.6099635362625122, 'learning_rate': 0.00014021484374999999, 'epoch': 11.47}


 58%|█████▊    | 3240/5620 [2:32:34<1:02:00,  1.56s/it]

{'loss': 0.2818, 'grad_norm': 0.7881305813789368, 'learning_rate': 0.00013962890625, 'epoch': 11.51}


 58%|█████▊    | 3250/5620 [2:32:56<2:02:07,  3.09s/it]

{'loss': 0.2666, 'grad_norm': 0.34333494305610657, 'learning_rate': 0.00013904296875, 'epoch': 11.55}


 58%|█████▊    | 3260/5620 [2:33:27<1:53:38,  2.89s/it]

{'loss': 0.2819, 'grad_norm': 0.47845742106437683, 'learning_rate': 0.00013845703125, 'epoch': 11.58}


 58%|█████▊    | 3270/5620 [2:33:51<1:32:26,  2.36s/it]

{'loss': 0.2186, 'grad_norm': 0.5690762996673584, 'learning_rate': 0.00013787109375, 'epoch': 11.62}


 58%|█████▊    | 3280/5620 [2:34:12<1:16:13,  1.95s/it]

{'loss': 0.2267, 'grad_norm': 0.5348265767097473, 'learning_rate': 0.00013728515624999998, 'epoch': 11.65}


 59%|█████▊    | 3290/5620 [2:34:29<1:02:21,  1.61s/it]

{'loss': 0.2687, 'grad_norm': 0.6397176384925842, 'learning_rate': 0.00013669921875, 'epoch': 11.69}


 59%|█████▊    | 3300/5620 [2:34:50<1:53:56,  2.95s/it]

{'loss': 0.3056, 'grad_norm': 0.5640934705734253, 'learning_rate': 0.00013611328125, 'epoch': 11.72}


                                                       
 59%|█████▊    | 3300/5620 [2:35:39<1:53:56,  2.95s/it]

{'eval_loss': 0.36773669719696045, 'eval_wer': 0.3344664985195745, 'eval_runtime': 48.7249, 'eval_samples_per_second': 20.523, 'eval_steps_per_second': 2.565, 'epoch': 11.72}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 59%|█████▉    | 3310/5620 [2:36:13<2:13:52,  3.48s/it] 

{'loss': 0.2623, 'grad_norm': 0.4185989201068878, 'learning_rate': 0.00013552734375, 'epoch': 11.76}


 59%|█████▉    | 3320/5620 [2:36:37<1:28:20,  2.30s/it]

{'loss': 0.2696, 'grad_norm': 0.6562384366989136, 'learning_rate': 0.00013494140625, 'epoch': 11.79}


 59%|█████▉    | 3330/5620 [2:36:58<1:15:42,  1.98s/it]

{'loss': 0.2256, 'grad_norm': 0.9390986561775208, 'learning_rate': 0.00013435546875, 'epoch': 11.83}


 59%|█████▉    | 3340/5620 [2:37:15<1:01:54,  1.63s/it]

{'loss': 0.2327, 'grad_norm': 0.8919268846511841, 'learning_rate': 0.00013376953125, 'epoch': 11.87}


 60%|█████▉    | 3350/5620 [2:37:36<1:52:04,  2.96s/it]

{'loss': 0.3055, 'grad_norm': 0.4219912886619568, 'learning_rate': 0.00013318359375, 'epoch': 11.9}


 60%|█████▉    | 3360/5620 [2:38:04<1:35:48,  2.54s/it]

{'loss': 0.2607, 'grad_norm': 0.6359744071960449, 'learning_rate': 0.00013259765624999998, 'epoch': 11.94}


 60%|█████▉    | 3370/5620 [2:38:24<1:11:26,  1.91s/it]

{'loss': 0.2541, 'grad_norm': 0.5799254179000854, 'learning_rate': 0.00013201171875, 'epoch': 11.97}


 60%|██████    | 3380/5620 [2:38:44<1:41:30,  2.72s/it]

{'loss': 0.2655, 'grad_norm': 0.47350209951400757, 'learning_rate': 0.00013142578125, 'epoch': 12.01}


 60%|██████    | 3390/5620 [2:39:16<1:53:27,  3.05s/it]

{'loss': 0.2424, 'grad_norm': 0.3408435881137848, 'learning_rate': 0.00013083984375, 'epoch': 12.04}


 60%|██████    | 3400/5620 [2:39:41<1:30:18,  2.44s/it]

{'loss': 0.2412, 'grad_norm': 0.6442155241966248, 'learning_rate': 0.00013025390625, 'epoch': 12.08}


                                                       
 60%|██████    | 3400/5620 [2:40:30<1:30:18,  2.44s/it]

{'eval_loss': 0.3416590690612793, 'eval_wer': 0.3341375150784077, 'eval_runtime': 48.68, 'eval_samples_per_second': 20.542, 'eval_steps_per_second': 2.568, 'epoch': 12.08}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 61%|██████    | 3410/5620 [2:40:54<1:38:33,  2.68s/it] 

{'loss': 0.1965, 'grad_norm': 0.4885920584201813, 'learning_rate': 0.00012966796874999998, 'epoch': 12.11}


 61%|██████    | 3420/5620 [2:41:11<1:02:01,  1.69s/it]

{'loss': 0.2032, 'grad_norm': 0.6099933981895447, 'learning_rate': 0.00012908203125, 'epoch': 12.15}


 61%|██████    | 3430/5620 [2:41:31<1:38:45,  2.71s/it]

{'loss': 0.2796, 'grad_norm': 0.4877433776855469, 'learning_rate': 0.00012849609375, 'epoch': 12.18}


 61%|██████    | 3440/5620 [2:42:04<1:52:01,  3.08s/it]

{'loss': 0.2214, 'grad_norm': 0.6011521816253662, 'learning_rate': 0.00012791015624999999, 'epoch': 12.22}


 61%|██████▏   | 3450/5620 [2:42:29<1:29:05,  2.46s/it]

{'loss': 0.2175, 'grad_norm': 0.49437761306762695, 'learning_rate': 0.00012732421875, 'epoch': 12.26}


 62%|██████▏   | 3460/5620 [2:42:50<1:13:36,  2.04s/it]

{'loss': 0.2039, 'grad_norm': 0.6712756156921387, 'learning_rate': 0.00012673828125, 'epoch': 12.29}


 62%|██████▏   | 3470/5620 [2:43:08<1:01:00,  1.70s/it]

{'loss': 0.2165, 'grad_norm': 0.7281160950660706, 'learning_rate': 0.00012615234375, 'epoch': 12.33}


 62%|██████▏   | 3480/5620 [2:43:27<1:35:56,  2.69s/it]

{'loss': 0.2396, 'grad_norm': 0.5724637508392334, 'learning_rate': 0.00012556640625, 'epoch': 12.36}


 62%|██████▏   | 3490/5620 [2:43:59<1:45:12,  2.96s/it]

{'loss': 0.223, 'grad_norm': 0.639596700668335, 'learning_rate': 0.00012498046874999998, 'epoch': 12.4}


 62%|██████▏   | 3500/5620 [2:44:24<1:25:37,  2.42s/it]

{'loss': 0.1913, 'grad_norm': 0.45837119221687317, 'learning_rate': 0.00012439453125, 'epoch': 12.43}


                                                       
 62%|██████▏   | 3500/5620 [2:45:13<1:25:37,  2.42s/it]

{'eval_loss': 0.3457069993019104, 'eval_wer': 0.3286544577256278, 'eval_runtime': 48.6557, 'eval_samples_per_second': 20.553, 'eval_steps_per_second': 2.569, 'epoch': 12.43}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 62%|██████▏   | 3510/5620 [2:45:37<1:32:59,  2.64s/it] 

{'loss': 0.1972, 'grad_norm': 0.586525559425354, 'learning_rate': 0.00012380859375, 'epoch': 12.47}


 63%|██████▎   | 3520/5620 [2:45:55<59:35,  1.70s/it]  

{'loss': 0.184, 'grad_norm': 0.8719655871391296, 'learning_rate': 0.00012322265625, 'epoch': 12.5}


 63%|██████▎   | 3530/5620 [2:46:14<1:33:01,  2.67s/it]

{'loss': 0.2375, 'grad_norm': 0.4707072973251343, 'learning_rate': 0.00012263671875, 'epoch': 12.54}


 63%|██████▎   | 3540/5620 [2:46:46<1:42:25,  2.95s/it]

{'loss': 0.2091, 'grad_norm': 0.9349853992462158, 'learning_rate': 0.00012205078125, 'epoch': 12.58}


 63%|██████▎   | 3550/5620 [2:47:11<1:21:30,  2.36s/it]

{'loss': 0.2232, 'grad_norm': 1.0192369222640991, 'learning_rate': 0.00012146484374999999, 'epoch': 12.61}


 63%|██████▎   | 3560/5620 [2:47:32<1:09:49,  2.03s/it]

{'loss': 0.2032, 'grad_norm': 0.9733346104621887, 'learning_rate': 0.00012087890625, 'epoch': 12.65}


 64%|██████▎   | 3570/5620 [2:47:50<57:45,  1.69s/it]  

{'loss': 0.2171, 'grad_norm': 0.5925300717353821, 'learning_rate': 0.00012029296875, 'epoch': 12.68}


 64%|██████▎   | 3580/5620 [2:48:08<1:28:28,  2.60s/it]

{'loss': 0.2567, 'grad_norm': 0.430265873670578, 'learning_rate': 0.00011970703125, 'epoch': 12.72}


 64%|██████▍   | 3590/5620 [2:48:41<1:41:32,  3.00s/it]

{'loss': 0.2641, 'grad_norm': 0.417451411485672, 'learning_rate': 0.00011912109374999999, 'epoch': 12.75}


 64%|██████▍   | 3600/5620 [2:49:06<1:20:51,  2.40s/it]

{'loss': 0.2165, 'grad_norm': 0.5585731267929077, 'learning_rate': 0.00011853515625, 'epoch': 12.79}


                                                       
 64%|██████▍   | 3600/5620 [2:49:55<1:20:51,  2.40s/it]

{'eval_loss': 0.34545156359672546, 'eval_wer': 0.33172496984318456, 'eval_runtime': 48.7184, 'eval_samples_per_second': 20.526, 'eval_steps_per_second': 2.566, 'epoch': 12.79}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 64%|██████▍   | 3610/5620 [2:50:19<1:27:57,  2.63s/it] 

{'loss': 0.1881, 'grad_norm': 0.6757665872573853, 'learning_rate': 0.00011794921875, 'epoch': 12.82}


 64%|██████▍   | 3620/5620 [2:50:36<54:23,  1.63s/it]  

{'loss': 0.2321, 'grad_norm': 0.7297771573066711, 'learning_rate': 0.00011736328125, 'epoch': 12.86}


 65%|██████▍   | 3630/5620 [2:50:54<1:24:43,  2.55s/it]

{'loss': 0.2452, 'grad_norm': 0.4104827344417572, 'learning_rate': 0.00011677734374999999, 'epoch': 12.9}


 65%|██████▍   | 3640/5620 [2:51:23<1:24:37,  2.56s/it]

{'loss': 0.2801, 'grad_norm': 0.500051736831665, 'learning_rate': 0.00011619140625, 'epoch': 12.93}


 65%|██████▍   | 3650/5620 [2:51:44<1:05:33,  2.00s/it]

{'loss': 0.1882, 'grad_norm': 0.6011412739753723, 'learning_rate': 0.00011560546875, 'epoch': 12.97}


 65%|██████▌   | 3660/5620 [2:52:01<1:00:42,  1.86s/it]

{'loss': 0.2198, 'grad_norm': 1.1865307092666626, 'learning_rate': 0.00011501953125, 'epoch': 13.0}


 65%|██████▌   | 3670/5620 [2:52:34<1:41:23,  3.12s/it]

{'loss': 0.22, 'grad_norm': 0.33384010195732117, 'learning_rate': 0.00011443359374999999, 'epoch': 13.04}


 65%|██████▌   | 3680/5620 [2:53:00<1:19:31,  2.46s/it]

{'loss': 0.2004, 'grad_norm': 0.7572473287582397, 'learning_rate': 0.00011384765625, 'epoch': 13.07}


 66%|██████▌   | 3690/5620 [2:53:21<1:05:28,  2.04s/it]

{'loss': 0.1927, 'grad_norm': 0.44739463925361633, 'learning_rate': 0.00011326171875, 'epoch': 13.11}


 66%|██████▌   | 3700/5620 [2:53:39<54:42,  1.71s/it]  

{'loss': 0.2137, 'grad_norm': 0.8821872472763062, 'learning_rate': 0.00011267578125, 'epoch': 13.14}


                                                     
 66%|██████▌   | 3700/5620 [2:54:28<54:42,  1.71s/it]

{'eval_loss': 0.3734857738018036, 'eval_wer': 0.32799649084329424, 'eval_runtime': 48.5861, 'eval_samples_per_second': 20.582, 'eval_steps_per_second': 2.573, 'epoch': 13.14}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 66%|██████▌   | 3710/5620 [2:54:47<1:18:05,  2.45s/it]

{'loss': 0.2183, 'grad_norm': 0.6790705323219299, 'learning_rate': 0.00011208984375, 'epoch': 13.18}


 66%|██████▌   | 3720/5620 [2:55:21<1:39:28,  3.14s/it]

{'loss': 0.2183, 'grad_norm': 0.4617016017436981, 'learning_rate': 0.00011150390625, 'epoch': 13.21}


 66%|██████▋   | 3730/5620 [2:55:47<1:17:15,  2.45s/it]

{'loss': 0.2328, 'grad_norm': 0.61958909034729, 'learning_rate': 0.00011091796875, 'epoch': 13.25}


 67%|██████▋   | 3740/5620 [2:56:09<1:05:40,  2.10s/it]

{'loss': 0.187, 'grad_norm': 0.4314570426940918, 'learning_rate': 0.00011033203125, 'epoch': 13.29}


 67%|██████▋   | 3750/5620 [2:56:27<55:06,  1.77s/it]  

{'loss': 0.1803, 'grad_norm': 0.6475825309753418, 'learning_rate': 0.00010974609375, 'epoch': 13.32}


 67%|██████▋   | 3760/5620 [2:56:43<55:26,  1.79s/it]

{'loss': 0.2451, 'grad_norm': 1.3728350400924683, 'learning_rate': 0.00010916015624999999, 'epoch': 13.36}


 67%|██████▋   | 3770/5620 [2:57:17<1:35:58,  3.11s/it]

{'loss': 0.2504, 'grad_norm': 0.4019307494163513, 'learning_rate': 0.00010857421875, 'epoch': 13.39}


 67%|██████▋   | 3780/5620 [2:57:43<1:17:37,  2.53s/it]

{'loss': 0.2064, 'grad_norm': 0.5386418104171753, 'learning_rate': 0.00010798828125, 'epoch': 13.43}


 67%|██████▋   | 3790/5620 [2:58:04<1:04:07,  2.10s/it]

{'loss': 0.1863, 'grad_norm': 0.4236670434474945, 'learning_rate': 0.00010740234375, 'epoch': 13.46}


 68%|██████▊   | 3800/5620 [2:58:22<51:28,  1.70s/it]  

{'loss': 0.2474, 'grad_norm': 0.5883898735046387, 'learning_rate': 0.00010681640624999999, 'epoch': 13.5}


                                                     
 68%|██████▊   | 3800/5620 [2:59:11<51:28,  1.70s/it]

{'eval_loss': 0.35863715410232544, 'eval_wer': 0.3260225901962934, 'eval_runtime': 48.717, 'eval_samples_per_second': 20.527, 'eval_steps_per_second': 2.566, 'epoch': 13.5}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 68%|██████▊   | 3810/5620 [2:59:30<1:12:51,  2.42s/it]

{'loss': 0.272, 'grad_norm': 1.3699324131011963, 'learning_rate': 0.00010623046875, 'epoch': 13.53}


 68%|██████▊   | 3820/5620 [3:00:03<1:32:14,  3.07s/it]

{'loss': 0.2009, 'grad_norm': 0.8709492683410645, 'learning_rate': 0.00010564453125, 'epoch': 13.57}


 68%|██████▊   | 3830/5620 [3:00:29<1:13:27,  2.46s/it]

{'loss': 0.1917, 'grad_norm': 0.8527363538742065, 'learning_rate': 0.00010505859375, 'epoch': 13.61}


 68%|██████▊   | 3840/5620 [3:00:51<1:03:10,  2.13s/it]

{'loss': 0.1972, 'grad_norm': 0.45319634675979614, 'learning_rate': 0.00010447265624999999, 'epoch': 13.64}


 69%|██████▊   | 3850/5620 [3:01:09<50:32,  1.71s/it]  

{'loss': 0.197, 'grad_norm': 0.9356494545936584, 'learning_rate': 0.00010388671875, 'epoch': 13.68}


 69%|██████▊   | 3860/5620 [3:01:24<50:54,  1.74s/it]

{'loss': 0.2199, 'grad_norm': 0.6238040924072266, 'learning_rate': 0.00010330078125, 'epoch': 13.71}


 69%|██████▉   | 3870/5620 [3:01:57<1:28:36,  3.04s/it]

{'loss': 0.2351, 'grad_norm': 0.5097337365150452, 'learning_rate': 0.00010271484375, 'epoch': 13.75}


 69%|██████▉   | 3880/5620 [3:02:23<1:10:53,  2.44s/it]

{'loss': 0.2063, 'grad_norm': 0.5795598030090332, 'learning_rate': 0.00010212890624999999, 'epoch': 13.78}


 69%|██████▉   | 3890/5620 [3:02:44<58:53,  2.04s/it]  

{'loss': 0.2077, 'grad_norm': 0.7077138423919678, 'learning_rate': 0.00010154296875, 'epoch': 13.82}


 69%|██████▉   | 3900/5620 [3:03:02<48:40,  1.70s/it]

{'loss': 0.2089, 'grad_norm': 0.5766010284423828, 'learning_rate': 0.00010095703125, 'epoch': 13.85}


                                                     
 69%|██████▉   | 3900/5620 [3:03:51<48:40,  1.70s/it]

{'eval_loss': 0.36199402809143066, 'eval_wer': 0.3170303761377344, 'eval_runtime': 48.8367, 'eval_samples_per_second': 20.476, 'eval_steps_per_second': 2.56, 'epoch': 13.85}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 70%|██████▉   | 3910/5620 [3:04:10<1:08:44,  2.41s/it]

{'loss': 0.2414, 'grad_norm': 1.0814398527145386, 'learning_rate': 0.00010037109375, 'epoch': 13.89}


 70%|██████▉   | 3920/5620 [3:04:41<1:19:32,  2.81s/it]

{'loss': 0.2269, 'grad_norm': 0.4198371171951294, 'learning_rate': 9.978515624999999e-05, 'epoch': 13.93}


 70%|██████▉   | 3930/5620 [3:05:04<59:38,  2.12s/it]  

{'loss': 0.1994, 'grad_norm': 0.5194023847579956, 'learning_rate': 9.919921875e-05, 'epoch': 13.96}


 70%|███████   | 3940/5620 [3:05:20<39:09,  1.40s/it]

{'loss': 0.2005, 'grad_norm': 0.8143740296363831, 'learning_rate': 9.861328125e-05, 'epoch': 14.0}


 70%|███████   | 3950/5620 [3:05:52<1:27:33,  3.15s/it]

{'loss': 0.2228, 'grad_norm': 0.6176759004592896, 'learning_rate': 9.802734375e-05, 'epoch': 14.03}


 70%|███████   | 3960/5620 [3:06:19<1:09:00,  2.49s/it]

{'loss': 0.201, 'grad_norm': 0.7255061864852905, 'learning_rate': 9.744140625e-05, 'epoch': 14.07}


 71%|███████   | 3970/5620 [3:06:40<57:33,  2.09s/it]  

{'loss': 0.1993, 'grad_norm': 0.5059537887573242, 'learning_rate': 9.685546875e-05, 'epoch': 14.1}


 71%|███████   | 3980/5620 [3:06:59<49:40,  1.82s/it]

{'loss': 0.2071, 'grad_norm': 0.9145037531852722, 'learning_rate': 9.626953125e-05, 'epoch': 14.14}


 71%|███████   | 3990/5620 [3:07:14<37:04,  1.36s/it]

{'loss': 0.1839, 'grad_norm': 1.0007697343826294, 'learning_rate': 9.568359375e-05, 'epoch': 14.17}


 71%|███████   | 4000/5620 [3:07:47<1:27:48,  3.25s/it]

{'loss': 0.2185, 'grad_norm': 0.4312702715396881, 'learning_rate': 9.509765625e-05, 'epoch': 14.21}


                                                       
 71%|███████   | 4000/5620 [3:08:36<1:27:48,  3.25s/it]

{'eval_loss': 0.3493541181087494, 'eval_wer': 0.3243776729904595, 'eval_runtime': 48.7401, 'eval_samples_per_second': 20.517, 'eval_steps_per_second': 2.565, 'epoch': 14.21}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 71%|███████▏  | 4010/5620 [3:09:07<1:26:17,  3.22s/it]

{'loss': 0.1714, 'grad_norm': 0.4500772953033447, 'learning_rate': 9.451171875e-05, 'epoch': 14.25}


 72%|███████▏  | 4020/5620 [3:09:28<56:09,  2.11s/it]  

{'loss': 0.2131, 'grad_norm': 0.6382493376731873, 'learning_rate': 9.392578125e-05, 'epoch': 14.28}


 72%|███████▏  | 4030/5620 [3:09:47<46:35,  1.76s/it]

{'loss': 0.2155, 'grad_norm': 0.7774702310562134, 'learning_rate': 9.333984375e-05, 'epoch': 14.32}


 72%|███████▏  | 4040/5620 [3:10:01<35:06,  1.33s/it]

{'loss': 0.2358, 'grad_norm': 1.465587854385376, 'learning_rate': 9.275390625e-05, 'epoch': 14.35}


 72%|███████▏  | 4050/5620 [3:10:34<1:22:07,  3.14s/it]

{'loss': 0.221, 'grad_norm': 0.4854643940925598, 'learning_rate': 9.216796874999999e-05, 'epoch': 14.39}


 72%|███████▏  | 4060/5620 [3:11:00<1:07:03,  2.58s/it]

{'loss': 0.2067, 'grad_norm': 0.798229992389679, 'learning_rate': 9.158203125e-05, 'epoch': 14.42}


 72%|███████▏  | 4070/5620 [3:11:22<54:53,  2.12s/it]  

{'loss': 0.191, 'grad_norm': 0.3587051033973694, 'learning_rate': 9.099609375e-05, 'epoch': 14.46}


 73%|███████▎  | 4080/5620 [3:11:41<45:39,  1.78s/it]

{'loss': 0.1775, 'grad_norm': 0.5752257108688354, 'learning_rate': 9.041015625e-05, 'epoch': 14.49}


 73%|███████▎  | 4090/5620 [3:11:55<33:41,  1.32s/it]

{'loss': 0.1989, 'grad_norm': 0.7767240405082703, 'learning_rate': 8.982421874999999e-05, 'epoch': 14.53}


 73%|███████▎  | 4100/5620 [3:12:28<1:19:21,  3.13s/it]

{'loss': 0.2238, 'grad_norm': 0.40253645181655884, 'learning_rate': 8.923828125e-05, 'epoch': 14.56}


                                                       
 73%|███████▎  | 4100/5620 [3:13:17<1:19:21,  3.13s/it]

{'eval_loss': 0.3492647409439087, 'eval_wer': 0.3204298716964579, 'eval_runtime': 48.7603, 'eval_samples_per_second': 20.508, 'eval_steps_per_second': 2.564, 'epoch': 14.56}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 73%|███████▎  | 4110/5620 [3:13:47<1:20:07,  3.18s/it]

{'loss': 0.1888, 'grad_norm': 0.5411642789840698, 'learning_rate': 8.865234375e-05, 'epoch': 14.6}


 73%|███████▎  | 4120/5620 [3:14:09<53:27,  2.14s/it]  

{'loss': 0.1598, 'grad_norm': 0.5567626953125, 'learning_rate': 8.806640625e-05, 'epoch': 14.64}


 73%|███████▎  | 4130/5620 [3:14:27<43:21,  1.75s/it]

{'loss': 0.1729, 'grad_norm': 0.5649412870407104, 'learning_rate': 8.748046874999999e-05, 'epoch': 14.67}


 74%|███████▎  | 4140/5620 [3:14:42<31:45,  1.29s/it]

{'loss': 0.195, 'grad_norm': 0.8454685807228088, 'learning_rate': 8.689453125e-05, 'epoch': 14.71}


 74%|███████▍  | 4150/5620 [3:15:15<1:19:26,  3.24s/it]

{'loss': 0.2034, 'grad_norm': 0.3561183214187622, 'learning_rate': 8.630859375e-05, 'epoch': 14.74}


 74%|███████▍  | 4160/5620 [3:15:42<1:02:47,  2.58s/it]

{'loss': 0.2007, 'grad_norm': 0.4551238715648651, 'learning_rate': 8.572265625e-05, 'epoch': 14.78}


 74%|███████▍  | 4170/5620 [3:16:04<51:18,  2.12s/it]  

{'loss': 0.1603, 'grad_norm': 0.5795608162879944, 'learning_rate': 8.513671875e-05, 'epoch': 14.81}


 74%|███████▍  | 4180/5620 [3:16:22<42:11,  1.76s/it]

{'loss': 0.1953, 'grad_norm': 0.7504843473434448, 'learning_rate': 8.455078125e-05, 'epoch': 14.85}


 75%|███████▍  | 4190/5620 [3:16:37<32:02,  1.34s/it]

{'loss': 0.1939, 'grad_norm': 0.846226155757904, 'learning_rate': 8.396484375e-05, 'epoch': 14.88}


 75%|███████▍  | 4200/5620 [3:17:08<1:06:35,  2.81s/it]

{'loss': 0.2321, 'grad_norm': 0.6629518866539001, 'learning_rate': 8.337890625e-05, 'epoch': 14.92}


                                                       
 75%|███████▍  | 4200/5620 [3:17:56<1:06:35,  2.81s/it]

{'eval_loss': 0.35131657123565674, 'eval_wer': 0.3177980041671236, 'eval_runtime': 48.8665, 'eval_samples_per_second': 20.464, 'eval_steps_per_second': 2.558, 'epoch': 14.92}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 75%|███████▍  | 4210/5620 [3:18:23<1:05:50,  2.80s/it]

{'loss': 0.1521, 'grad_norm': 1.4143621921539307, 'learning_rate': 8.279296875e-05, 'epoch': 14.96}


 75%|███████▌  | 4220/5620 [3:18:40<36:39,  1.57s/it]  

{'loss': 0.1672, 'grad_norm': 1.1633225679397583, 'learning_rate': 8.220703125e-05, 'epoch': 14.99}


 75%|███████▌  | 4230/5620 [3:19:10<1:15:45,  3.27s/it]

{'loss': 0.2609, 'grad_norm': 0.4073890745639801, 'learning_rate': 8.162109375e-05, 'epoch': 15.03}


 75%|███████▌  | 4240/5620 [3:19:38<59:54,  2.61s/it]  

{'loss': 0.1806, 'grad_norm': 0.7694066762924194, 'learning_rate': 8.103515625e-05, 'epoch': 15.06}


 76%|███████▌  | 4250/5620 [3:20:00<49:45,  2.18s/it]

{'loss': 0.1746, 'grad_norm': 0.4487888813018799, 'learning_rate': 8.044921875e-05, 'epoch': 15.1}


 76%|███████▌  | 4260/5620 [3:20:19<41:21,  1.82s/it]

{'loss': 0.1797, 'grad_norm': 0.9677624702453613, 'learning_rate': 7.986328125e-05, 'epoch': 15.13}


 76%|███████▌  | 4270/5620 [3:20:34<32:21,  1.44s/it]

{'loss': 0.1856, 'grad_norm': 0.895671010017395, 'learning_rate': 7.927734375e-05, 'epoch': 15.17}


 76%|███████▌  | 4280/5620 [3:21:04<1:12:47,  3.26s/it]

{'loss': 0.2068, 'grad_norm': 0.38051772117614746, 'learning_rate': 7.869140625e-05, 'epoch': 15.2}


 76%|███████▋  | 4290/5620 [3:21:32<58:49,  2.65s/it]  

{'loss': 0.2006, 'grad_norm': 0.533709704875946, 'learning_rate': 7.810546875e-05, 'epoch': 15.24}


 77%|███████▋  | 4300/5620 [3:21:54<48:21,  2.20s/it]

{'loss': 0.1802, 'grad_norm': 0.4595904052257538, 'learning_rate': 7.751953125e-05, 'epoch': 15.28}


                                                     
 77%|███████▋  | 4300/5620 [3:22:43<48:21,  2.20s/it]

{'eval_loss': 0.36213696002960205, 'eval_wer': 0.3169207149906788, 'eval_runtime': 48.8194, 'eval_samples_per_second': 20.484, 'eval_steps_per_second': 2.56, 'epoch': 15.28}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 77%|███████▋  | 4310/5620 [3:23:06<54:01,  2.47s/it]  

{'loss': 0.1677, 'grad_norm': 0.7845776677131653, 'learning_rate': 7.693359375e-05, 'epoch': 15.31}


 77%|███████▋  | 4320/5620 [3:23:21<31:21,  1.45s/it]

{'loss': 0.2092, 'grad_norm': 0.7857505679130554, 'learning_rate': 7.634765625e-05, 'epoch': 15.35}


 77%|███████▋  | 4330/5620 [3:23:52<1:11:58,  3.35s/it]

{'loss': 0.2062, 'grad_norm': 0.4955337345600128, 'learning_rate': 7.576171875e-05, 'epoch': 15.38}


 77%|███████▋  | 4340/5620 [3:24:20<55:41,  2.61s/it]  

{'loss': 0.202, 'grad_norm': 0.7213159203529358, 'learning_rate': 7.517578125e-05, 'epoch': 15.42}


 77%|███████▋  | 4350/5620 [3:24:42<45:55,  2.17s/it]

{'loss': 0.1713, 'grad_norm': 1.4341977834701538, 'learning_rate': 7.458984374999999e-05, 'epoch': 15.45}


 78%|███████▊  | 4360/5620 [3:25:01<38:21,  1.83s/it]

{'loss': 0.1739, 'grad_norm': 0.6799337863922119, 'learning_rate': 7.400390624999999e-05, 'epoch': 15.49}


 78%|███████▊  | 4370/5620 [3:25:17<30:00,  1.44s/it]

{'loss': 0.1765, 'grad_norm': 0.46740925312042236, 'learning_rate': 7.341796875e-05, 'epoch': 15.52}


 78%|███████▊  | 4380/5620 [3:25:47<1:07:40,  3.27s/it]

{'loss': 0.1988, 'grad_norm': 0.49797573685646057, 'learning_rate': 7.283203125e-05, 'epoch': 15.56}


 78%|███████▊  | 4390/5620 [3:26:15<54:12,  2.64s/it]  

{'loss': 0.1956, 'grad_norm': 0.38256776332855225, 'learning_rate': 7.224609374999999e-05, 'epoch': 15.6}


 78%|███████▊  | 4400/5620 [3:26:37<43:56,  2.16s/it]

{'loss': 0.1539, 'grad_norm': 0.5887414216995239, 'learning_rate': 7.166015624999999e-05, 'epoch': 15.63}


                                                     
 78%|███████▊  | 4400/5620 [3:27:26<43:56,  2.16s/it]

{'eval_loss': 0.35361284017562866, 'eval_wer': 0.3125342691084549, 'eval_runtime': 48.8906, 'eval_samples_per_second': 20.454, 'eval_steps_per_second': 2.557, 'epoch': 15.63}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 78%|███████▊  | 4410/5620 [3:27:48<48:47,  2.42s/it]  

{'loss': 0.1535, 'grad_norm': 0.5427671074867249, 'learning_rate': 7.107421875e-05, 'epoch': 15.67}


 79%|███████▊  | 4420/5620 [3:28:03<27:36,  1.38s/it]

{'loss': 0.1936, 'grad_norm': 1.527860403060913, 'learning_rate': 7.048828125e-05, 'epoch': 15.7}


 79%|███████▉  | 4430/5620 [3:28:33<1:03:38,  3.21s/it]

{'loss': 0.2099, 'grad_norm': 0.37093839049339294, 'learning_rate': 6.990234374999999e-05, 'epoch': 15.74}


 79%|███████▉  | 4440/5620 [3:29:02<53:36,  2.73s/it]  

{'loss': 0.1558, 'grad_norm': 0.4720085561275482, 'learning_rate': 6.931640624999999e-05, 'epoch': 15.77}


 79%|███████▉  | 4450/5620 [3:29:24<42:31,  2.18s/it]

{'loss': 0.1835, 'grad_norm': 0.46516281366348267, 'learning_rate': 6.873046875e-05, 'epoch': 15.81}


 79%|███████▉  | 4460/5620 [3:29:44<36:12,  1.87s/it]

{'loss': 0.1461, 'grad_norm': 0.8345612287521362, 'learning_rate': 6.814453125e-05, 'epoch': 15.84}


 80%|███████▉  | 4470/5620 [3:29:59<26:34,  1.39s/it]

{'loss': 0.1809, 'grad_norm': 0.9416826963424683, 'learning_rate': 6.755859374999999e-05, 'epoch': 15.88}


 80%|███████▉  | 4480/5620 [3:30:28<57:42,  3.04s/it]

{'loss': 0.1922, 'grad_norm': 0.29762741923332214, 'learning_rate': 6.697265624999999e-05, 'epoch': 15.91}


 80%|███████▉  | 4490/5620 [3:30:51<42:00,  2.23s/it]

{'loss': 0.1709, 'grad_norm': 0.5267624258995056, 'learning_rate': 6.638671875e-05, 'epoch': 15.95}


 80%|████████  | 4500/5620 [3:31:09<31:37,  1.69s/it]

{'loss': 0.1624, 'grad_norm': 0.7269221544265747, 'learning_rate': 6.580078125e-05, 'epoch': 15.99}


                                                     
 80%|████████  | 4500/5620 [3:31:58<31:37,  1.69s/it]

{'eval_loss': 0.36460980772972107, 'eval_wer': 0.3069415506086194, 'eval_runtime': 48.6794, 'eval_samples_per_second': 20.543, 'eval_steps_per_second': 2.568, 'epoch': 15.99}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 80%|████████  | 4510/5620 [3:32:29<1:12:10,  3.90s/it]

{'loss': 0.2223, 'grad_norm': 0.38482150435447693, 'learning_rate': 6.521484374999999e-05, 'epoch': 16.02}


 80%|████████  | 4520/5620 [3:32:57<49:24,  2.70s/it]  

{'loss': 0.1868, 'grad_norm': 0.6643146872520447, 'learning_rate': 6.462890624999999e-05, 'epoch': 16.06}


 81%|████████  | 4530/5620 [3:33:20<39:46,  2.19s/it]

{'loss': 0.1446, 'grad_norm': 0.6473674774169922, 'learning_rate': 6.404296875e-05, 'epoch': 16.09}


 81%|████████  | 4540/5620 [3:33:39<33:33,  1.86s/it]

{'loss': 0.1753, 'grad_norm': 0.7714102864265442, 'learning_rate': 6.345703125e-05, 'epoch': 16.13}


 81%|████████  | 4550/5620 [3:33:55<26:18,  1.48s/it]

{'loss': 0.1874, 'grad_norm': 2.978031873703003, 'learning_rate': 6.287109374999999e-05, 'epoch': 16.16}


 81%|████████  | 4560/5620 [3:34:22<59:32,  3.37s/it]

{'loss': 0.212, 'grad_norm': 0.3622574210166931, 'learning_rate': 6.228515624999999e-05, 'epoch': 16.2}


 81%|████████▏ | 4570/5620 [3:34:52<48:32,  2.77s/it]

{'loss': 0.1686, 'grad_norm': 0.7499713897705078, 'learning_rate': 6.169921874999999e-05, 'epoch': 16.23}


 81%|████████▏ | 4580/5620 [3:35:15<39:20,  2.27s/it]

{'loss': 0.1674, 'grad_norm': 0.5464836955070496, 'learning_rate': 6.111328125e-05, 'epoch': 16.27}


 82%|████████▏ | 4590/5620 [3:35:35<31:48,  1.85s/it]

{'loss': 0.1649, 'grad_norm': 0.6051888465881348, 'learning_rate': 6.0527343749999994e-05, 'epoch': 16.31}


 82%|████████▏ | 4600/5620 [3:35:51<25:48,  1.52s/it]

{'loss': 0.1671, 'grad_norm': 0.6264798641204834, 'learning_rate': 5.994140624999999e-05, 'epoch': 16.34}


                                                     
 82%|████████▏ | 4600/5620 [3:36:40<25:48,  1.52s/it]

{'eval_loss': 0.3769599497318268, 'eval_wer': 0.3060642614321746, 'eval_runtime': 48.7932, 'eval_samples_per_second': 20.495, 'eval_steps_per_second': 2.562, 'epoch': 16.34}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 82%|████████▏ | 4610/5620 [3:37:10<1:05:24,  3.89s/it]

{'loss': 0.2063, 'grad_norm': 0.4776245057582855, 'learning_rate': 5.9355468749999994e-05, 'epoch': 16.38}


 82%|████████▏ | 4620/5620 [3:37:40<46:19,  2.78s/it]  

{'loss': 0.1396, 'grad_norm': 0.4590265154838562, 'learning_rate': 5.876953124999999e-05, 'epoch': 16.41}


 82%|████████▏ | 4630/5620 [3:38:03<37:23,  2.27s/it]

{'loss': 0.1546, 'grad_norm': 0.5901877880096436, 'learning_rate': 5.8183593749999994e-05, 'epoch': 16.45}


 83%|████████▎ | 4640/5620 [3:38:23<31:13,  1.91s/it]

{'loss': 0.14, 'grad_norm': 0.4851406514644623, 'learning_rate': 5.759765624999999e-05, 'epoch': 16.48}


 83%|████████▎ | 4650/5620 [3:38:40<25:07,  1.55s/it]

{'loss': 0.1521, 'grad_norm': 0.5293036699295044, 'learning_rate': 5.7011718749999995e-05, 'epoch': 16.52}


 83%|████████▎ | 4660/5620 [3:39:07<52:00,  3.25s/it]

{'loss': 0.2084, 'grad_norm': 0.5118513107299805, 'learning_rate': 5.642578124999999e-05, 'epoch': 16.55}


 83%|████████▎ | 4670/5620 [3:39:36<43:45,  2.76s/it]

{'loss': 0.153, 'grad_norm': 0.32544341683387756, 'learning_rate': 5.5839843749999995e-05, 'epoch': 16.59}


 83%|████████▎ | 4680/5620 [3:39:59<35:04,  2.24s/it]

{'loss': 0.1554, 'grad_norm': 0.5809392929077148, 'learning_rate': 5.525390624999999e-05, 'epoch': 16.63}


 83%|████████▎ | 4690/5620 [3:40:19<28:59,  1.87s/it]

{'loss': 0.1522, 'grad_norm': 0.796394407749176, 'learning_rate': 5.4667968749999995e-05, 'epoch': 16.66}


 84%|████████▎ | 4700/5620 [3:40:35<23:26,  1.53s/it]

{'loss': 0.1813, 'grad_norm': 0.8162314295768738, 'learning_rate': 5.408203124999999e-05, 'epoch': 16.7}


                                                     
 84%|████████▎ | 4700/5620 [3:41:24<23:26,  1.53s/it]

{'eval_loss': 0.3682938814163208, 'eval_wer': 0.30639324487334135, 'eval_runtime': 48.8048, 'eval_samples_per_second': 20.49, 'eval_steps_per_second': 2.561, 'epoch': 16.7}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 84%|████████▍ | 4710/5620 [3:41:54<58:08,  3.83s/it]  

{'loss': 0.2182, 'grad_norm': 0.4152032732963562, 'learning_rate': 5.3496093749999995e-05, 'epoch': 16.73}


 84%|████████▍ | 4720/5620 [3:42:23<40:27,  2.70s/it]

{'loss': 0.188, 'grad_norm': 1.4418485164642334, 'learning_rate': 5.291015624999999e-05, 'epoch': 16.77}


 84%|████████▍ | 4730/5620 [3:42:46<33:02,  2.23s/it]

{'loss': 0.1628, 'grad_norm': 1.1482795476913452, 'learning_rate': 5.2324218749999996e-05, 'epoch': 16.8}


 84%|████████▍ | 4740/5620 [3:43:06<27:01,  1.84s/it]

{'loss': 0.1711, 'grad_norm': 0.5839106440544128, 'learning_rate': 5.173828124999999e-05, 'epoch': 16.84}


 85%|████████▍ | 4750/5620 [3:43:22<22:12,  1.53s/it]

{'loss': 0.1653, 'grad_norm': 0.6426301002502441, 'learning_rate': 5.1152343749999996e-05, 'epoch': 16.87}


 85%|████████▍ | 4760/5620 [3:43:48<43:37,  3.04s/it]

{'loss': 0.1918, 'grad_norm': 0.4514966607093811, 'learning_rate': 5.056640624999999e-05, 'epoch': 16.91}


 85%|████████▍ | 4770/5620 [3:44:12<32:16,  2.28s/it]

{'loss': 0.1665, 'grad_norm': 0.5400128364562988, 'learning_rate': 4.9980468749999996e-05, 'epoch': 16.94}


 85%|████████▌ | 4780/5620 [3:44:31<24:54,  1.78s/it]

{'loss': 0.1505, 'grad_norm': 1.03361976146698, 'learning_rate': 4.939453124999999e-05, 'epoch': 16.98}


 85%|████████▌ | 4790/5620 [3:44:56<45:01,  3.26s/it]

{'loss': 0.1961, 'grad_norm': 0.49317067861557007, 'learning_rate': 4.880859375e-05, 'epoch': 17.02}


 85%|████████▌ | 4800/5620 [3:45:26<39:42,  2.90s/it]

{'loss': 0.1511, 'grad_norm': 0.3150726854801178, 'learning_rate': 4.8222656249999993e-05, 'epoch': 17.05}


                                                     
 85%|████████▌ | 4800/5620 [3:46:15<39:42,  2.90s/it]

{'eval_loss': 0.36098113656044006, 'eval_wer': 0.2974010308147823, 'eval_runtime': 48.8323, 'eval_samples_per_second': 20.478, 'eval_steps_per_second': 2.56, 'epoch': 17.05}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 86%|████████▌ | 4810/5620 [3:46:43<40:04,  2.97s/it]  

{'loss': 0.1962, 'grad_norm': 0.3795250952243805, 'learning_rate': 4.763671875e-05, 'epoch': 17.09}


 86%|████████▌ | 4820/5620 [3:47:03<26:11,  1.96s/it]

{'loss': 0.1718, 'grad_norm': 0.4372929036617279, 'learning_rate': 4.7050781249999994e-05, 'epoch': 17.12}


 86%|████████▌ | 4830/5620 [3:47:20<21:31,  1.63s/it]

{'loss': 0.1752, 'grad_norm': 0.4801253080368042, 'learning_rate': 4.646484375e-05, 'epoch': 17.16}


 86%|████████▌ | 4840/5620 [3:47:44<41:49,  3.22s/it]

{'loss': 0.2127, 'grad_norm': 0.4030115008354187, 'learning_rate': 4.5878906249999994e-05, 'epoch': 17.19}


 86%|████████▋ | 4850/5620 [3:48:14<36:02,  2.81s/it]

{'loss': 0.2005, 'grad_norm': 0.2878894507884979, 'learning_rate': 4.529296875e-05, 'epoch': 17.23}


 86%|████████▋ | 4860/5620 [3:48:38<29:00,  2.29s/it]

{'loss': 0.1554, 'grad_norm': 0.5758928060531616, 'learning_rate': 4.4707031249999994e-05, 'epoch': 17.26}


 87%|████████▋ | 4870/5620 [3:48:57<23:43,  1.90s/it]

{'loss': 0.1512, 'grad_norm': 0.4895685017108917, 'learning_rate': 4.412109374999999e-05, 'epoch': 17.3}


 87%|████████▋ | 4880/5620 [3:49:14<19:20,  1.57s/it]

{'loss': 0.1935, 'grad_norm': 0.7117341160774231, 'learning_rate': 4.3535156249999995e-05, 'epoch': 17.34}


 87%|████████▋ | 4890/5620 [3:49:38<38:45,  3.19s/it]

{'loss': 0.1847, 'grad_norm': 0.49021822214126587, 'learning_rate': 4.294921874999999e-05, 'epoch': 17.37}


 87%|████████▋ | 4900/5620 [3:50:08<33:19,  2.78s/it]

{'loss': 0.1762, 'grad_norm': 0.5106465220451355, 'learning_rate': 4.2363281249999995e-05, 'epoch': 17.41}


                                                     
 87%|████████▋ | 4900/5620 [3:50:57<33:19,  2.78s/it]

{'eval_loss': 0.3623703718185425, 'eval_wer': 0.30156815440289503, 'eval_runtime': 48.818, 'eval_samples_per_second': 20.484, 'eval_steps_per_second': 2.561, 'epoch': 17.41}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 87%|████████▋ | 4910/5620 [3:51:25<34:57,  2.95s/it]  

{'loss': 0.1628, 'grad_norm': 0.46562302112579346, 'learning_rate': 4.177734374999999e-05, 'epoch': 17.44}


 88%|████████▊ | 4920/5620 [3:51:44<22:09,  1.90s/it]

{'loss': 0.1604, 'grad_norm': 0.6501423120498657, 'learning_rate': 4.1191406249999995e-05, 'epoch': 17.48}


 88%|████████▊ | 4930/5620 [3:52:00<17:58,  1.56s/it]

{'loss': 0.1976, 'grad_norm': 0.7450883388519287, 'learning_rate': 4.060546874999999e-05, 'epoch': 17.51}


 88%|████████▊ | 4940/5620 [3:52:24<35:40,  3.15s/it]

{'loss': 0.1997, 'grad_norm': 0.4296319782733917, 'learning_rate': 4.0019531249999996e-05, 'epoch': 17.55}


 88%|████████▊ | 4950/5620 [3:52:53<30:40,  2.75s/it]

{'loss': 0.1448, 'grad_norm': 0.47646600008010864, 'learning_rate': 3.943359374999999e-05, 'epoch': 17.58}


 88%|████████▊ | 4960/5620 [3:53:17<24:56,  2.27s/it]

{'loss': 0.1435, 'grad_norm': 0.4175063669681549, 'learning_rate': 3.8847656249999996e-05, 'epoch': 17.62}


 88%|████████▊ | 4970/5620 [3:53:37<20:56,  1.93s/it]

{'loss': 0.1452, 'grad_norm': 0.43021801114082336, 'learning_rate': 3.826171874999999e-05, 'epoch': 17.66}


 89%|████████▊ | 4980/5620 [3:53:53<16:48,  1.58s/it]

{'loss': 0.1631, 'grad_norm': 0.8445967435836792, 'learning_rate': 3.7675781249999996e-05, 'epoch': 17.69}


 89%|████████▉ | 4990/5620 [3:54:17<34:09,  3.25s/it]

{'loss': 0.1898, 'grad_norm': 0.2533028721809387, 'learning_rate': 3.708984374999999e-05, 'epoch': 17.73}


 89%|████████▉ | 5000/5620 [3:54:47<28:57,  2.80s/it]

{'loss': 0.1735, 'grad_norm': 0.4099242389202118, 'learning_rate': 3.650390625e-05, 'epoch': 17.76}


                                                     
 89%|████████▉ | 5000/5620 [3:55:36<28:57,  2.80s/it]

{'eval_loss': 0.36205601692199707, 'eval_wer': 0.3001425594911723, 'eval_runtime': 48.796, 'eval_samples_per_second': 20.493, 'eval_steps_per_second': 2.562, 'epoch': 17.76}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 89%|████████▉ | 5010/5620 [3:56:03<29:49,  2.93s/it]  

{'loss': 0.1631, 'grad_norm': 0.4571288824081421, 'learning_rate': 3.5917968749999993e-05, 'epoch': 17.8}


 89%|████████▉ | 5020/5620 [3:56:23<19:23,  1.94s/it]

{'loss': 0.1547, 'grad_norm': 0.43032509088516235, 'learning_rate': 3.533203125e-05, 'epoch': 17.83}


 90%|████████▉ | 5030/5620 [3:56:40<15:23,  1.57s/it]

{'loss': 0.1753, 'grad_norm': 0.6968296766281128, 'learning_rate': 3.4746093749999994e-05, 'epoch': 17.87}


 90%|████████▉ | 5040/5620 [3:57:03<29:30,  3.05s/it]

{'loss': 0.2067, 'grad_norm': 0.45484575629234314, 'learning_rate': 3.416015625e-05, 'epoch': 17.9}


 90%|████████▉ | 5050/5620 [3:57:30<23:22,  2.46s/it]

{'loss': 0.1545, 'grad_norm': 0.48106032609939575, 'learning_rate': 3.3574218749999994e-05, 'epoch': 17.94}


 90%|█████████ | 5060/5620 [3:57:49<16:55,  1.81s/it]

{'loss': 0.18, 'grad_norm': 0.764659583568573, 'learning_rate': 3.298828125e-05, 'epoch': 17.98}


 90%|█████████ | 5070/5620 [3:58:10<27:05,  2.95s/it]

{'loss': 0.1934, 'grad_norm': 0.33120498061180115, 'learning_rate': 3.2402343749999994e-05, 'epoch': 18.01}


 90%|█████████ | 5080/5620 [3:58:43<26:43,  2.97s/it]

{'loss': 0.133, 'grad_norm': 0.3713156282901764, 'learning_rate': 3.181640625e-05, 'epoch': 18.05}


 91%|█████████ | 5090/5620 [3:59:08<21:17,  2.41s/it]

{'loss': 0.1609, 'grad_norm': 0.5368723273277283, 'learning_rate': 3.1230468749999995e-05, 'epoch': 18.08}


 91%|█████████ | 5100/5620 [3:59:29<17:39,  2.04s/it]

{'loss': 0.1562, 'grad_norm': 0.4230067729949951, 'learning_rate': 3.064453125e-05, 'epoch': 18.12}


                                                     
 91%|█████████ | 5100/5620 [4:00:17<17:39,  2.04s/it]

{'eval_loss': 0.361026793718338, 'eval_wer': 0.30123917096172825, 'eval_runtime': 48.7042, 'eval_samples_per_second': 20.532, 'eval_steps_per_second': 2.567, 'epoch': 18.12}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 91%|█████████ | 5110/5620 [4:00:38<19:23,  2.28s/it]  

{'loss': 0.1551, 'grad_norm': 0.6223829388618469, 'learning_rate': 3.0058593749999995e-05, 'epoch': 18.15}


 91%|█████████ | 5120/5620 [4:01:00<25:32,  3.06s/it]

{'loss': 0.1943, 'grad_norm': 0.38995596766471863, 'learning_rate': 2.9472656249999995e-05, 'epoch': 18.19}


 91%|█████████▏| 5130/5620 [4:01:32<23:57,  2.93s/it]

{'loss': 0.1507, 'grad_norm': 0.6230383515357971, 'learning_rate': 2.8886718749999995e-05, 'epoch': 18.22}


 91%|█████████▏| 5140/5620 [4:01:57<19:08,  2.39s/it]

{'loss': 0.1543, 'grad_norm': 0.3461016118526459, 'learning_rate': 2.8300781249999995e-05, 'epoch': 18.26}


 92%|█████████▏| 5150/5620 [4:02:17<15:09,  1.94s/it]

{'loss': 0.1515, 'grad_norm': 0.43382149934768677, 'learning_rate': 2.7714843749999996e-05, 'epoch': 18.29}


 92%|█████████▏| 5160/5620 [4:02:34<12:23,  1.62s/it]

{'loss': 0.1799, 'grad_norm': 0.46505922079086304, 'learning_rate': 2.7128906249999996e-05, 'epoch': 18.33}


 92%|█████████▏| 5170/5620 [4:02:55<22:25,  2.99s/it]

{'loss': 0.1736, 'grad_norm': 0.46958088874816895, 'learning_rate': 2.6542968749999996e-05, 'epoch': 18.37}


 92%|█████████▏| 5180/5620 [4:03:27<21:47,  2.97s/it]

{'loss': 0.1439, 'grad_norm': 0.4202677011489868, 'learning_rate': 2.5957031249999996e-05, 'epoch': 18.4}


 92%|█████████▏| 5190/5620 [4:03:51<16:35,  2.31s/it]

{'loss': 0.1624, 'grad_norm': 0.5942394137382507, 'learning_rate': 2.5371093749999996e-05, 'epoch': 18.44}


 93%|█████████▎| 5200/5620 [4:04:12<13:45,  1.97s/it]

{'loss': 0.1474, 'grad_norm': 0.6587716937065125, 'learning_rate': 2.4785156249999996e-05, 'epoch': 18.47}


                                                     
 93%|█████████▎| 5200/5620 [4:05:00<13:45,  1.97s/it]

{'eval_loss': 0.3610471487045288, 'eval_wer': 0.2978396754030047, 'eval_runtime': 48.6273, 'eval_samples_per_second': 20.565, 'eval_steps_per_second': 2.571, 'epoch': 18.47}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 93%|█████████▎| 5210/5620 [4:05:21<15:29,  2.27s/it]  

{'loss': 0.1627, 'grad_norm': 0.6769352555274963, 'learning_rate': 2.4199218749999997e-05, 'epoch': 18.51}


 93%|█████████▎| 5220/5620 [4:05:42<20:23,  3.06s/it]

{'loss': 0.1843, 'grad_norm': 0.33735695481300354, 'learning_rate': 2.3613281249999997e-05, 'epoch': 18.54}


 93%|█████████▎| 5230/5620 [4:06:13<18:32,  2.85s/it]

{'loss': 0.1685, 'grad_norm': 0.6343465447425842, 'learning_rate': 2.3027343749999997e-05, 'epoch': 18.58}


 93%|█████████▎| 5240/5620 [4:06:37<14:06,  2.23s/it]

{'loss': 0.1633, 'grad_norm': 0.6121971607208252, 'learning_rate': 2.2441406249999997e-05, 'epoch': 18.61}


 93%|█████████▎| 5250/5620 [4:06:58<12:03,  1.95s/it]

{'loss': 0.1453, 'grad_norm': 0.628470242023468, 'learning_rate': 2.1855468749999997e-05, 'epoch': 18.65}


 94%|█████████▎| 5260/5620 [4:07:15<09:55,  1.65s/it]

{'loss': 0.1578, 'grad_norm': 0.688391923904419, 'learning_rate': 2.1269531249999997e-05, 'epoch': 18.69}


 94%|█████████▍| 5270/5620 [4:07:36<17:20,  2.97s/it]

{'loss': 0.2087, 'grad_norm': 0.493335485458374, 'learning_rate': 2.0683593749999997e-05, 'epoch': 18.72}


 94%|█████████▍| 5280/5620 [4:08:07<16:24,  2.89s/it]

{'loss': 0.1479, 'grad_norm': 0.39573994278907776, 'learning_rate': 2.0097656249999998e-05, 'epoch': 18.76}


 94%|█████████▍| 5290/5620 [4:08:31<12:43,  2.31s/it]

{'loss': 0.1628, 'grad_norm': 0.7761005163192749, 'learning_rate': 1.9511718749999998e-05, 'epoch': 18.79}


 94%|█████████▍| 5300/5620 [4:08:52<10:31,  1.97s/it]

{'loss': 0.141, 'grad_norm': 1.3984882831573486, 'learning_rate': 1.8925781249999998e-05, 'epoch': 18.83}


                                                     
 94%|█████████▍| 5300/5620 [4:09:40<10:31,  1.97s/it]

{'eval_loss': 0.359312504529953, 'eval_wer': 0.2955367913148372, 'eval_runtime': 48.7244, 'eval_samples_per_second': 20.524, 'eval_steps_per_second': 2.565, 'epoch': 18.83}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 94%|█████████▍| 5310/5620 [4:10:01<11:53,  2.30s/it]  

{'loss': 0.1634, 'grad_norm': 0.4272235035896301, 'learning_rate': 1.8339843749999998e-05, 'epoch': 18.86}


 95%|█████████▍| 5320/5620 [4:10:22<14:47,  2.96s/it]

{'loss': 0.1771, 'grad_norm': 0.37591949105262756, 'learning_rate': 1.7753906249999998e-05, 'epoch': 18.9}


 95%|█████████▍| 5330/5620 [4:10:50<12:17,  2.54s/it]

{'loss': 0.1793, 'grad_norm': 0.4018871486186981, 'learning_rate': 1.716796875e-05, 'epoch': 18.93}


 95%|█████████▌| 5340/5620 [4:11:10<08:59,  1.93s/it]

{'loss': 0.1421, 'grad_norm': 0.7663458585739136, 'learning_rate': 1.658203125e-05, 'epoch': 18.97}


 95%|█████████▌| 5350/5620 [4:11:28<10:54,  2.43s/it]

{'loss': 0.1655, 'grad_norm': 0.4676313102245331, 'learning_rate': 1.599609375e-05, 'epoch': 19.01}


 95%|█████████▌| 5360/5620 [4:12:01<13:26,  3.10s/it]

{'loss': 0.143, 'grad_norm': 0.5549266934394836, 'learning_rate': 1.541015625e-05, 'epoch': 19.04}


 96%|█████████▌| 5370/5620 [4:12:27<10:07,  2.43s/it]

{'loss': 0.1666, 'grad_norm': 0.5257809162139893, 'learning_rate': 1.4824218749999999e-05, 'epoch': 19.08}


 96%|█████████▌| 5380/5620 [4:12:48<08:00,  2.00s/it]

{'loss': 0.1432, 'grad_norm': 0.4670005440711975, 'learning_rate': 1.423828125e-05, 'epoch': 19.11}


 96%|█████████▌| 5390/5620 [4:13:05<06:39,  1.74s/it]

{'loss': 0.1395, 'grad_norm': 0.7975648045539856, 'learning_rate': 1.365234375e-05, 'epoch': 19.15}


 96%|█████████▌| 5400/5620 [4:13:23<09:07,  2.49s/it]

{'loss': 0.1453, 'grad_norm': 0.35299256443977356, 'learning_rate': 1.306640625e-05, 'epoch': 19.18}


                                                     
 96%|█████████▌| 5400/5620 [4:14:12<09:07,  2.49s/it]

{'eval_loss': 0.36702167987823486, 'eval_wer': 0.29465950213839237, 'eval_runtime': 48.8323, 'eval_samples_per_second': 20.478, 'eval_steps_per_second': 2.56, 'epoch': 19.18}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 96%|█████████▋| 5410/5620 [4:14:48<12:20,  3.53s/it]  

{'loss': 0.1596, 'grad_norm': 0.7836375832557678, 'learning_rate': 1.248046875e-05, 'epoch': 19.22}


 96%|█████████▋| 5420/5620 [4:15:13<08:01,  2.41s/it]

{'loss': 0.14, 'grad_norm': 0.4106810688972473, 'learning_rate': 1.189453125e-05, 'epoch': 19.25}


 97%|█████████▋| 5430/5620 [4:15:33<06:20,  2.00s/it]

{'loss': 0.1391, 'grad_norm': 0.6304717063903809, 'learning_rate': 1.130859375e-05, 'epoch': 19.29}


 97%|█████████▋| 5440/5620 [4:15:50<04:55,  1.64s/it]

{'loss': 0.1923, 'grad_norm': 0.3642953634262085, 'learning_rate': 1.072265625e-05, 'epoch': 19.33}


 97%|█████████▋| 5450/5620 [4:16:07<06:39,  2.35s/it]

{'loss': 0.2281, 'grad_norm': 1.0488030910491943, 'learning_rate': 1.013671875e-05, 'epoch': 19.36}


 97%|█████████▋| 5460/5620 [4:16:40<08:04,  3.03s/it]

{'loss': 0.1763, 'grad_norm': 0.43379586935043335, 'learning_rate': 9.550781249999999e-06, 'epoch': 19.4}


 97%|█████████▋| 5470/5620 [4:17:06<06:03,  2.43s/it]

{'loss': 0.1636, 'grad_norm': 0.40705132484436035, 'learning_rate': 8.964843749999999e-06, 'epoch': 19.43}


 98%|█████████▊| 5480/5620 [4:17:27<04:48,  2.06s/it]

{'loss': 0.1787, 'grad_norm': 0.4950937032699585, 'learning_rate': 8.378906249999999e-06, 'epoch': 19.47}


 98%|█████████▊| 5490/5620 [4:17:45<03:39,  1.69s/it]

{'loss': 0.1764, 'grad_norm': 0.813156247138977, 'learning_rate': 7.79296875e-06, 'epoch': 19.5}


 98%|█████████▊| 5500/5620 [4:18:03<04:51,  2.43s/it]

{'loss': 0.1844, 'grad_norm': 0.49328869581222534, 'learning_rate': 7.207031249999999e-06, 'epoch': 19.54}


                                                     
 98%|█████████▊| 5500/5620 [4:18:51<04:51,  2.43s/it]

{'eval_loss': 0.35999059677124023, 'eval_wer': 0.29444017984428117, 'eval_runtime': 48.8361, 'eval_samples_per_second': 20.477, 'eval_steps_per_second': 2.56, 'epoch': 19.54}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 98%|█████████▊| 5510/5620 [4:19:28<06:41,  3.65s/it]

{'loss': 0.162, 'grad_norm': 0.43283531069755554, 'learning_rate': 6.62109375e-06, 'epoch': 19.57}


 98%|█████████▊| 5520/5620 [4:19:53<04:08,  2.48s/it]

{'loss': 0.134, 'grad_norm': 0.45856213569641113, 'learning_rate': 6.03515625e-06, 'epoch': 19.61}


 98%|█████████▊| 5530/5620 [4:20:14<03:04,  2.04s/it]

{'loss': 0.1306, 'grad_norm': 0.426717609167099, 'learning_rate': 5.44921875e-06, 'epoch': 19.64}


 99%|█████████▊| 5540/5620 [4:20:33<02:19,  1.74s/it]

{'loss': 0.1403, 'grad_norm': 0.37484967708587646, 'learning_rate': 4.86328125e-06, 'epoch': 19.68}


 99%|█████████▉| 5550/5620 [4:20:51<02:50,  2.43s/it]

{'loss': 0.1784, 'grad_norm': 0.555993914604187, 'learning_rate': 4.277343749999999e-06, 'epoch': 19.72}


 99%|█████████▉| 5560/5620 [4:21:24<03:04,  3.08s/it]

{'loss': 0.1373, 'grad_norm': 0.5130627751350403, 'learning_rate': 3.6914062499999995e-06, 'epoch': 19.75}


 99%|█████████▉| 5570/5620 [4:21:50<02:03,  2.47s/it]

{'loss': 0.1368, 'grad_norm': 0.6084708571434021, 'learning_rate': 3.1054687499999997e-06, 'epoch': 19.79}


 99%|█████████▉| 5580/5620 [4:22:11<01:20,  2.00s/it]

{'loss': 0.1557, 'grad_norm': 0.5148667097091675, 'learning_rate': 2.51953125e-06, 'epoch': 19.82}


 99%|█████████▉| 5590/5620 [4:22:28<00:50,  1.68s/it]

{'loss': 0.132, 'grad_norm': 0.4632069170475006, 'learning_rate': 1.9335937499999996e-06, 'epoch': 19.86}


100%|█████████▉| 5600/5620 [4:22:46<00:47,  2.39s/it]

{'loss': 0.2035, 'grad_norm': 0.5641316771507263, 'learning_rate': 1.3476562499999997e-06, 'epoch': 19.89}


                                                     
100%|█████████▉| 5600/5620 [4:23:34<00:47,  2.39s/it]

{'eval_loss': 0.3643849492073059, 'eval_wer': 0.292575940344336, 'eval_runtime': 48.8038, 'eval_samples_per_second': 20.49, 'eval_steps_per_second': 2.561, 'epoch': 19.89}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
100%|█████████▉| 5610/5620 [4:24:09<00:33,  3.39s/it]

{'loss': 0.134, 'grad_norm': 0.45100101828575134, 'learning_rate': 7.6171875e-07, 'epoch': 19.93}


100%|██████████| 5620/5620 [4:24:30<00:00,  2.04s/it]

{'loss': 0.1424, 'grad_norm': 0.811058759689331, 'learning_rate': 1.7578124999999998e-07, 'epoch': 19.96}


100%|██████████| 5620/5620 [4:24:34<00:00,  2.82s/it]

{'train_runtime': 15874.0298, 'train_samples_per_second': 11.339, 'train_steps_per_second': 0.354, 'train_loss': 0.8742152554496752, 'epoch': 19.96}





TrainOutput(global_step=5620, training_loss=0.8742152554496752, metrics={'train_runtime': 15874.0298, 'train_samples_per_second': 11.339, 'train_steps_per_second': 0.354, 'total_flos': 3.271521370154527e+19, 'train_loss': 0.8742152554496752, 'epoch': 19.964476021314386})