## Load Pretrained Model and Tokenizer

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


## Load IMDb Dataset

In [1]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Tokenize the Dataset

In [3]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)


## Group Texts into Chunks

In [4]:
chunk_size = tokenizer.mask_token_id

# Drop the last chunk if it’s smaller than chunk_size.
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


# Pad the last chunk until its length equals chunk_size.
def group_texts_with_padding(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute total length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    
    # If the last chunk is smaller than chunk_size, pad it
    for key in result.keys():
        if len(result[key][-1]) < chunk_size:
            padding_length = chunk_size - len(result[key][-1])
            if key == "input_ids":
                # Pad input_ids with the tokenizer's pad token ID
                result[key][-1] += [tokenizer.pad_token_id] * padding_length
            else:
                # Pad other keys (e.g., attention_mask) with 0
                result[key][-1] += [0] * padding_length

    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


In [5]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 76170
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 74448
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 152809
    })
})

In [6]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity'

In [7]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity'

## Combine train and unsupervised Datasets for Training

In [8]:
from datasets import concatenate_datasets

training_dataset = concatenate_datasets([lm_datasets["train"], lm_datasets["unsupervised"]])
evaluation_dataset = lm_datasets["test"]


## Set Up Data Collator

### Individual tokens masking

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


In [10]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am curious - yellow from my [MASK] store because [MASK] all the controversy that surrounded it when ivo was first released in [MASK]. [MASK] also heard that at first it [MASK] seized by u. s. customs if [MASK] ever tried departmental enter this country, therefore being a fan of films considered " controversial " i really had [MASK] see [MASK] for [MASK]. < br [MASK] > < br / > the plot is centered around a young swedish drama student named [MASK] who wants to learn everything she can about [MASK] [MASK] in particular'

'>>> [MASK] wants [MASK] focus her [MASK] [MASK] to making some sort of documentary on what the average swede [MASK] about certain political issues such as the vietnam war [MASK] race [MASK] [MASK] the united states. [MASK] [MASK] [MASK] politicians and ordinary denizens [MASK] stockholm about their opinions on politics,uli has sex with her drama magical, classmates, and married men. < br / > < br / > what kills [MASK] about i am curious - yellow i

In [11]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")


'>>> ['[CLS]', '[MASK]', 'rented', 'i', 'am', 'curious', '-', 'yellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized', 'by', 'u', '.', 's', '.', '##nka', 'if', 'it', 'ever', '[MASK]', 'to', 'enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', '"', 'controversial', '"', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '[MASK]', '<', 'georgetown', '/', '>', '<', '[MASK]', '/', '[MASK]', 'the', 'plot', '[MASK]', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', '[MASK]', 'she', 'can', 'about', 'life', '.', '[MASK]', 'particular']'

'>>> ['she', 'wants', 'properties', 'focus', 'her', 'attention', '##s', '[MASK]', 'making', 'some', 'sort', 'of', 'す', 'on', 'what', 'abbott', 'average', 'sw

### Whole word masking

In [12]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [13]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i [MASK] curious - yellow from my [MASK] store because of all the controversy that surrounded it when it [MASK] first released in 1967 [MASK] [MASK] also heard that [MASK] [MASK] it was seized [MASK] u. s. customs [MASK] it ever [MASK] to enter this country, therefore being a fan of [MASK] considered " controversial [MASK] i really [MASK] to see this for [MASK] [MASK] < br / [MASK] < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. [MASK] [MASK]'

'>>> she wants to [MASK] [MASK] [MASK] [MASK] to making some sort of documentary on what the average swede thought [MASK] certain political issues such as the vietnam war and race [MASK] in [MASK] united [MASK]. in between asking politicians [MASK] ordinary denizens of stockholm about their [MASK] on politics, she has sex with her drama teacher [MASK] [MASK], and married [MASK]. < br / > < br / > what kills me about [MASK] am [MASK] - [MASK] is t

In [14]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")


'>>> ['[CLS]', 'i', 'rented', '[MASK]', 'am', 'curious', '-', 'yellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', '[MASK]', '[MASK]', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', '[MASK]', 'it', 'was', '[MASK]', 'by', '[MASK]', '.', '[MASK]', '[MASK]', '[MASK]', 'if', 'it', 'ever', 'tried', 'to', 'enter', '[MASK]', '[MASK]', '[MASK]', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', '[MASK]', '[MASK]', '"', 'i', 'really', 'had', '[MASK]', 'see', 'this', 'for', 'myself', '[MASK]', '<', 'br', '[MASK]', '[MASK]', '<', 'br', '/', '[MASK]', 'the', 'plot', 'is', 'centered', '[MASK]', '[MASK]', 'young', 'swedish', '[MASK]', 'student', 'named', 'lena', 'who', 'wants', '[MASK]', 'learn', 'everything', '[MASK]', 'can', 'about', 'life', '[MASK]', 'in', 'particular']'

'>>> ['she', 'wants', 'to', 'focus', 'her', 'attention', '##s', 'to', '[MASK]', 'some', 'sort', 'of', 'documentary'

## Define Training Arguments and Initialize Trainer

In [16]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./distilbert-finetuned-imdb-mlm-checkpoint",
    num_train_epochs=10,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=1000,
    fp16=True,
    logging_steps=100,
    gradient_accumulation_steps=2,
)

# Note: if you're using a whole word masking data collator, you’ll also need to set remove_unused_columns=False to ensure we don’t lose the word_ids column during training. By default, the Trainer will remove any columns that are not part of the model’s forward() method.


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=evaluation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)




## Train the Model

In [17]:
trainer.train()

  0%|          | 101/35780 [00:13<1:14:13,  8.01it/s]

{'loss': 3.1047, 'grad_norm': 4.89377498626709, 'learning_rate': 1.9600000000000003e-06, 'epoch': 0.03}


  1%|          | 201/35780 [00:25<1:14:02,  8.01it/s]

{'loss': 2.8192, 'grad_norm': 4.3941850662231445, 'learning_rate': 3.96e-06, 'epoch': 0.06}


  1%|          | 301/35780 [00:37<1:13:32,  8.04it/s]

{'loss': 2.7448, 'grad_norm': 4.453423976898193, 'learning_rate': 5.9600000000000005e-06, 'epoch': 0.08}


  1%|          | 401/35780 [00:50<1:13:14,  8.05it/s]

{'loss': 2.7036, 'grad_norm': 4.592410087585449, 'learning_rate': 7.960000000000002e-06, 'epoch': 0.11}


  1%|▏         | 500/35780 [01:02<1:12:24,  8.12it/s]

{'loss': 2.6995, 'grad_norm': 4.377350330352783, 'learning_rate': 9.960000000000001e-06, 'epoch': 0.14}


                                                     
  1%|▏         | 500/35780 [01:55<1:12:24,  8.12it/s]

{'eval_loss': 2.5390326976776123, 'eval_runtime': 52.7041, 'eval_samples_per_second': 1412.566, 'eval_steps_per_second': 44.152, 'epoch': 0.14}


  2%|▏         | 601/35780 [02:08<1:13:18,  8.00it/s]  

{'loss': 2.6428, 'grad_norm': 4.321450710296631, 'learning_rate': 1.196e-05, 'epoch': 0.17}


  2%|▏         | 701/35780 [02:20<1:12:39,  8.05it/s]

{'loss': 2.6282, 'grad_norm': 4.6372761726379395, 'learning_rate': 1.396e-05, 'epoch': 0.2}


  2%|▏         | 801/35780 [02:33<1:12:25,  8.05it/s]

{'loss': 2.6057, 'grad_norm': 4.035613536834717, 'learning_rate': 1.5960000000000003e-05, 'epoch': 0.22}


  3%|▎         | 901/35780 [02:45<1:12:09,  8.06it/s]

{'loss': 2.5833, 'grad_norm': 4.126962661743164, 'learning_rate': 1.796e-05, 'epoch': 0.25}


  3%|▎         | 1000/35780 [02:57<1:11:23,  8.12it/s]

{'loss': 2.5528, 'grad_norm': 3.811983346939087, 'learning_rate': 1.9960000000000002e-05, 'epoch': 0.28}



  3%|▎         | 1000/35780 [03:50<1:11:23,  8.12it/s]

{'eval_loss': 2.441873788833618, 'eval_runtime': 52.7714, 'eval_samples_per_second': 1410.765, 'eval_steps_per_second': 44.096, 'epoch': 0.28}


  3%|▎         | 1101/35780 [04:03<1:12:09,  8.01it/s]  

{'loss': 2.5353, 'grad_norm': 3.8589742183685303, 'learning_rate': 1.994364577343301e-05, 'epoch': 0.31}


  3%|▎         | 1201/35780 [04:16<1:11:47,  8.03it/s]

{'loss': 2.5412, 'grad_norm': 3.977374792098999, 'learning_rate': 1.988614146060955e-05, 'epoch': 0.34}


  4%|▎         | 1301/35780 [04:28<1:11:28,  8.04it/s]

{'loss': 2.5234, 'grad_norm': 4.082961559295654, 'learning_rate': 1.9828637147786083e-05, 'epoch': 0.36}


  4%|▍         | 1401/35780 [04:40<1:11:21,  8.03it/s]

{'loss': 2.5178, 'grad_norm': 3.7309653759002686, 'learning_rate': 1.977113283496262e-05, 'epoch': 0.39}


  4%|▍         | 1500/35780 [04:53<1:10:33,  8.10it/s]

{'loss': 2.5053, 'grad_norm': 4.000097274780273, 'learning_rate': 1.971362852213916e-05, 'epoch': 0.42}


                                                      
  4%|▍         | 1500/35780 [05:45<1:10:33,  8.10it/s]

{'eval_loss': 2.3897337913513184, 'eval_runtime': 52.5138, 'eval_samples_per_second': 1417.684, 'eval_steps_per_second': 44.312, 'epoch': 0.42}


  4%|▍         | 1601/35780 [05:58<1:11:02,  8.02it/s]  

{'loss': 2.4885, 'grad_norm': 3.882849931716919, 'learning_rate': 1.96561242093157e-05, 'epoch': 0.45}


  5%|▍         | 1701/35780 [06:11<1:10:36,  8.04it/s]

{'loss': 2.4923, 'grad_norm': 3.9837706089019775, 'learning_rate': 1.959861989649224e-05, 'epoch': 0.48}


  5%|▌         | 1801/35780 [06:23<1:10:37,  8.02it/s]

{'loss': 2.4849, 'grad_norm': 3.8959197998046875, 'learning_rate': 1.9541115583668777e-05, 'epoch': 0.5}


  5%|▌         | 1901/35780 [06:36<1:10:31,  8.01it/s]

{'loss': 2.4648, 'grad_norm': 3.974475383758545, 'learning_rate': 1.9483611270845315e-05, 'epoch': 0.53}


  6%|▌         | 2000/35780 [06:48<1:09:26,  8.11it/s]

{'loss': 2.4633, 'grad_norm': 3.645838975906372, 'learning_rate': 1.9426106958021853e-05, 'epoch': 0.56}


                                                      
  6%|▌         | 2000/35780 [07:41<1:09:26,  8.11it/s]

{'eval_loss': 2.353388786315918, 'eval_runtime': 52.6135, 'eval_samples_per_second': 1414.999, 'eval_steps_per_second': 44.228, 'epoch': 0.56}


  6%|▌         | 2101/35780 [07:54<1:10:19,  7.98it/s]  

{'loss': 2.4576, 'grad_norm': 3.731006145477295, 'learning_rate': 1.936860264519839e-05, 'epoch': 0.59}


  6%|▌         | 2201/35780 [08:06<1:09:45,  8.02it/s]

{'loss': 2.459, 'grad_norm': 3.9809024333953857, 'learning_rate': 1.931109833237493e-05, 'epoch': 0.61}


  6%|▋         | 2301/35780 [08:18<1:09:45,  8.00it/s]

{'loss': 2.4334, 'grad_norm': 3.6916182041168213, 'learning_rate': 1.9253594019551467e-05, 'epoch': 0.64}


  7%|▋         | 2401/35780 [08:31<1:09:01,  8.06it/s]

{'loss': 2.4535, 'grad_norm': 3.899843692779541, 'learning_rate': 1.919608970672801e-05, 'epoch': 0.67}


  7%|▋         | 2500/35780 [08:43<1:08:15,  8.13it/s]

{'loss': 2.4247, 'grad_norm': 4.0207014083862305, 'learning_rate': 1.9138585393904546e-05, 'epoch': 0.7}


                                                      
  7%|▋         | 2500/35780 [09:36<1:08:15,  8.13it/s]

{'eval_loss': 2.3301198482513428, 'eval_runtime': 52.5034, 'eval_samples_per_second': 1417.966, 'eval_steps_per_second': 44.321, 'epoch': 0.7}


  7%|▋         | 2601/35780 [09:49<1:08:54,  8.03it/s]  

{'loss': 2.4408, 'grad_norm': 3.9347500801086426, 'learning_rate': 1.9081081081081084e-05, 'epoch': 0.73}


  8%|▊         | 2701/35780 [10:01<1:08:36,  8.04it/s]

{'loss': 2.4372, 'grad_norm': 3.785374164581299, 'learning_rate': 1.9023576768257622e-05, 'epoch': 0.75}


  8%|▊         | 2801/35780 [10:14<1:08:39,  8.01it/s]

{'loss': 2.4224, 'grad_norm': 3.9950473308563232, 'learning_rate': 1.896607245543416e-05, 'epoch': 0.78}


  8%|▊         | 2901/35780 [10:26<1:08:19,  8.02it/s]

{'loss': 2.423, 'grad_norm': 3.8601973056793213, 'learning_rate': 1.89085681426107e-05, 'epoch': 0.81}


  8%|▊         | 3000/35780 [10:38<1:07:41,  8.07it/s]

{'loss': 2.4158, 'grad_norm': 4.024655818939209, 'learning_rate': 1.8851063829787236e-05, 'epoch': 0.84}


                                                      
  8%|▊         | 3000/35780 [11:31<1:07:41,  8.07it/s]

{'eval_loss': 2.3086278438568115, 'eval_runtime': 52.6099, 'eval_samples_per_second': 1415.094, 'eval_steps_per_second': 44.231, 'epoch': 0.84}


  9%|▊         | 3101/35780 [11:44<1:07:55,  8.02it/s]  

{'loss': 2.412, 'grad_norm': 3.703744411468506, 'learning_rate': 1.8793559516963774e-05, 'epoch': 0.87}


  9%|▉         | 3201/35780 [11:56<1:07:17,  8.07it/s]

{'loss': 2.4024, 'grad_norm': 4.003916263580322, 'learning_rate': 1.8736055204140312e-05, 'epoch': 0.89}


  9%|▉         | 3301/35780 [12:09<1:06:56,  8.09it/s]

{'loss': 2.4073, 'grad_norm': 3.822333574295044, 'learning_rate': 1.867855089131685e-05, 'epoch': 0.92}


 10%|▉         | 3401/35780 [12:21<1:06:52,  8.07it/s]

{'loss': 2.4006, 'grad_norm': 3.93556547164917, 'learning_rate': 1.862104657849339e-05, 'epoch': 0.95}


 10%|▉         | 3500/35780 [12:33<1:06:08,  8.13it/s]

{'loss': 2.3959, 'grad_norm': 3.8699958324432373, 'learning_rate': 1.8563542265669927e-05, 'epoch': 0.98}


                                                      
 10%|▉         | 3500/35780 [13:25<1:06:08,  8.13it/s]

{'eval_loss': 2.291896104812622, 'eval_runtime': 52.1619, 'eval_samples_per_second': 1427.248, 'eval_steps_per_second': 44.611, 'epoch': 0.98}


 10%|█         | 3601/35780 [13:39<1:06:53,  8.02it/s]  

{'loss': 2.4, 'grad_norm': 3.7592082023620605, 'learning_rate': 1.8506037952846465e-05, 'epoch': 1.01}


 10%|█         | 3701/35780 [13:51<1:06:45,  8.01it/s]

{'loss': 2.3902, 'grad_norm': 3.7731549739837646, 'learning_rate': 1.8448533640023003e-05, 'epoch': 1.03}


 11%|█         | 3801/35780 [14:03<1:06:06,  8.06it/s]

{'loss': 2.3752, 'grad_norm': 3.9741852283477783, 'learning_rate': 1.839102932719954e-05, 'epoch': 1.06}


 11%|█         | 3901/35780 [14:16<1:06:11,  8.03it/s]

{'loss': 2.3824, 'grad_norm': 3.913311243057251, 'learning_rate': 1.833352501437608e-05, 'epoch': 1.09}


 11%|█         | 4000/35780 [14:28<1:05:51,  8.04it/s]

{'loss': 2.362, 'grad_norm': 3.866694211959839, 'learning_rate': 1.8276020701552617e-05, 'epoch': 1.12}


                                                      
 11%|█         | 4000/35780 [15:21<1:05:51,  8.04it/s]

{'eval_loss': 2.276815414428711, 'eval_runtime': 52.5809, 'eval_samples_per_second': 1415.876, 'eval_steps_per_second': 44.256, 'epoch': 1.12}


 11%|█▏        | 4101/35780 [15:34<1:05:41,  8.04it/s]  

{'loss': 2.3732, 'grad_norm': 3.940786123275757, 'learning_rate': 1.8218516388729158e-05, 'epoch': 1.15}


 12%|█▏        | 4201/35780 [15:46<1:05:45,  8.00it/s]

{'loss': 2.359, 'grad_norm': 3.6565425395965576, 'learning_rate': 1.8161012075905696e-05, 'epoch': 1.17}


 12%|█▏        | 4301/35780 [15:59<1:05:47,  7.97it/s]

{'loss': 2.3496, 'grad_norm': 3.6673896312713623, 'learning_rate': 1.8103507763082234e-05, 'epoch': 1.2}


 12%|█▏        | 4401/35780 [16:11<1:05:17,  8.01it/s]

{'loss': 2.3673, 'grad_norm': 3.906834602355957, 'learning_rate': 1.8046003450258772e-05, 'epoch': 1.23}


 13%|█▎        | 4500/35780 [16:23<1:04:33,  8.08it/s]

{'loss': 2.3645, 'grad_norm': 3.678053379058838, 'learning_rate': 1.798849913743531e-05, 'epoch': 1.26}


                                                      
 13%|█▎        | 4500/35780 [17:16<1:04:33,  8.08it/s]

{'eval_loss': 2.267490863800049, 'eval_runtime': 52.4165, 'eval_samples_per_second': 1420.317, 'eval_steps_per_second': 44.394, 'epoch': 1.26}


 13%|█▎        | 4601/35780 [17:29<1:04:47,  8.02it/s]  

{'loss': 2.348, 'grad_norm': 3.7969439029693604, 'learning_rate': 1.7930994824611848e-05, 'epoch': 1.29}


 13%|█▎        | 4701/35780 [17:41<1:04:32,  8.03it/s]

{'loss': 2.3396, 'grad_norm': 3.764622449874878, 'learning_rate': 1.7873490511788386e-05, 'epoch': 1.31}


 13%|█▎        | 4801/35780 [17:54<1:04:27,  8.01it/s]

{'loss': 2.3534, 'grad_norm': 3.7886691093444824, 'learning_rate': 1.7815986198964924e-05, 'epoch': 1.34}


 14%|█▎        | 4901/35780 [18:06<1:04:11,  8.02it/s]

{'loss': 2.3519, 'grad_norm': 3.940391778945923, 'learning_rate': 1.7758481886141462e-05, 'epoch': 1.37}


 14%|█▍        | 5000/35780 [18:18<1:03:37,  8.06it/s]

{'loss': 2.3485, 'grad_norm': 3.920891523361206, 'learning_rate': 1.7700977573318e-05, 'epoch': 1.4}


                                                      
 14%|█▍        | 5000/35780 [19:11<1:03:37,  8.06it/s]

{'eval_loss': 2.25654935836792, 'eval_runtime': 52.3717, 'eval_samples_per_second': 1421.53, 'eval_steps_per_second': 44.432, 'epoch': 1.4}


 14%|█▍        | 5101/35780 [19:24<1:04:09,  7.97it/s]  

{'loss': 2.3585, 'grad_norm': 3.7215590476989746, 'learning_rate': 1.7643473260494538e-05, 'epoch': 1.43}


 15%|█▍        | 5201/35780 [19:36<1:03:56,  7.97it/s]

{'loss': 2.326, 'grad_norm': 3.7235183715820312, 'learning_rate': 1.7585968947671076e-05, 'epoch': 1.45}


 15%|█▍        | 5301/35780 [19:49<1:03:20,  8.02it/s]

{'loss': 2.3449, 'grad_norm': 3.831592559814453, 'learning_rate': 1.7528464634847614e-05, 'epoch': 1.48}


 15%|█▌        | 5401/35780 [20:01<1:02:58,  8.04it/s]

{'loss': 2.3378, 'grad_norm': 3.7258005142211914, 'learning_rate': 1.7470960322024152e-05, 'epoch': 1.51}


 15%|█▌        | 5500/35780 [20:14<1:02:25,  8.08it/s]

{'loss': 2.3424, 'grad_norm': 3.7619292736053467, 'learning_rate': 1.741345600920069e-05, 'epoch': 1.54}


                                                      
 15%|█▌        | 5500/35780 [21:06<1:02:25,  8.08it/s]

{'eval_loss': 2.248929977416992, 'eval_runtime': 52.3545, 'eval_samples_per_second': 1421.998, 'eval_steps_per_second': 44.447, 'epoch': 1.54}


 16%|█▌        | 5601/35780 [21:19<1:02:26,  8.06it/s]  

{'loss': 2.323, 'grad_norm': 3.771364212036133, 'learning_rate': 1.7355951696377228e-05, 'epoch': 1.57}


 16%|█▌        | 5701/35780 [21:31<1:02:01,  8.08it/s]

{'loss': 2.3178, 'grad_norm': 3.536165237426758, 'learning_rate': 1.7298447383553766e-05, 'epoch': 1.59}


 16%|█▌        | 5801/35780 [21:44<1:02:19,  8.02it/s]

{'loss': 2.3407, 'grad_norm': 3.8707046508789062, 'learning_rate': 1.7240943070730308e-05, 'epoch': 1.62}


 16%|█▋        | 5901/35780 [21:56<1:02:15,  8.00it/s]

{'loss': 2.3297, 'grad_norm': 3.936491012573242, 'learning_rate': 1.7183438757906846e-05, 'epoch': 1.65}


 17%|█▋        | 6000/35780 [22:08<1:01:32,  8.06it/s]

{'loss': 2.3454, 'grad_norm': 3.846097230911255, 'learning_rate': 1.7125934445083384e-05, 'epoch': 1.68}


                                                      
 17%|█▋        | 6000/35780 [23:01<1:01:32,  8.06it/s]

{'eval_loss': 2.2266361713409424, 'eval_runtime': 52.4725, 'eval_samples_per_second': 1418.8, 'eval_steps_per_second': 44.347, 'epoch': 1.68}


 17%|█▋        | 6101/35780 [23:14<1:01:37,  8.03it/s]  

{'loss': 2.347, 'grad_norm': 3.876675844192505, 'learning_rate': 1.7068430132259922e-05, 'epoch': 1.7}


 17%|█▋        | 6201/35780 [23:26<1:01:36,  8.00it/s]

{'loss': 2.334, 'grad_norm': 3.8105781078338623, 'learning_rate': 1.701092581943646e-05, 'epoch': 1.73}


 18%|█▊        | 6301/35780 [23:39<1:01:09,  8.03it/s]

{'loss': 2.3271, 'grad_norm': 3.7261440753936768, 'learning_rate': 1.6953421506612998e-05, 'epoch': 1.76}


 18%|█▊        | 6401/35780 [23:51<1:00:58,  8.03it/s]

{'loss': 2.2933, 'grad_norm': 3.6390011310577393, 'learning_rate': 1.6895917193789536e-05, 'epoch': 1.79}


 18%|█▊        | 6500/35780 [24:04<1:00:28,  8.07it/s]

{'loss': 2.3364, 'grad_norm': 3.82381272315979, 'learning_rate': 1.6838412880966074e-05, 'epoch': 1.82}


                                                      
 18%|█▊        | 6500/35780 [24:56<1:00:28,  8.07it/s]

{'eval_loss': 2.2285444736480713, 'eval_runtime': 52.4294, 'eval_samples_per_second': 1419.968, 'eval_steps_per_second': 44.384, 'epoch': 1.82}


 18%|█▊        | 6601/35780 [25:09<1:00:32,  8.03it/s]  

{'loss': 2.3161, 'grad_norm': 3.7404496669769287, 'learning_rate': 1.6780908568142612e-05, 'epoch': 1.84}


 19%|█▊        | 6701/35780 [25:22<1:00:33,  8.00it/s]

{'loss': 2.3181, 'grad_norm': 3.818695306777954, 'learning_rate': 1.672340425531915e-05, 'epoch': 1.87}


 19%|█▉        | 6801/35780 [25:34<1:00:10,  8.03it/s]

{'loss': 2.316, 'grad_norm': 3.919891595840454, 'learning_rate': 1.6665899942495688e-05, 'epoch': 1.9}


 19%|█▉        | 6901/35780 [25:46<1:00:12,  7.99it/s]

{'loss': 2.293, 'grad_norm': 3.769660472869873, 'learning_rate': 1.6608395629672226e-05, 'epoch': 1.93}


 20%|█▉        | 7000/35780 [25:59<59:20,  8.08it/s]  

{'loss': 2.3177, 'grad_norm': 3.6235926151275635, 'learning_rate': 1.6550891316848764e-05, 'epoch': 1.96}


                                                    
 20%|█▉        | 7000/35780 [26:51<59:20,  8.08it/s]

{'eval_loss': 2.21305775642395, 'eval_runtime': 52.4648, 'eval_samples_per_second': 1419.009, 'eval_steps_per_second': 44.354, 'epoch': 1.96}


 20%|█▉        | 7101/35780 [27:04<59:21,  8.05it/s]    

{'loss': 2.2925, 'grad_norm': 3.7652528285980225, 'learning_rate': 1.6493387004025302e-05, 'epoch': 1.98}


 20%|██        | 7201/35780 [27:17<59:28,  8.01it/s]  

{'loss': 2.3195, 'grad_norm': 3.6505236625671387, 'learning_rate': 1.643588269120184e-05, 'epoch': 2.01}


 20%|██        | 7301/35780 [27:29<59:00,  8.04it/s]

{'loss': 2.2804, 'grad_norm': 3.809847831726074, 'learning_rate': 1.6378378378378378e-05, 'epoch': 2.04}


 21%|██        | 7401/35780 [27:41<58:43,  8.06it/s]

{'loss': 2.2886, 'grad_norm': 3.642303943634033, 'learning_rate': 1.6320874065554916e-05, 'epoch': 2.07}


 21%|██        | 7500/35780 [27:54<58:13,  8.10it/s]

{'loss': 2.2863, 'grad_norm': 3.722576141357422, 'learning_rate': 1.6263369752731457e-05, 'epoch': 2.1}


                                                    
 21%|██        | 7500/35780 [28:46<58:13,  8.10it/s]

{'eval_loss': 2.21160626411438, 'eval_runtime': 52.5919, 'eval_samples_per_second': 1415.58, 'eval_steps_per_second': 44.246, 'epoch': 2.1}


 21%|██        | 7601/35780 [28:59<58:18,  8.05it/s]    

{'loss': 2.2948, 'grad_norm': 3.6114726066589355, 'learning_rate': 1.6205865439907995e-05, 'epoch': 2.12}


 22%|██▏       | 7701/35780 [29:12<58:40,  7.98it/s]

{'loss': 2.2927, 'grad_norm': 3.5630359649658203, 'learning_rate': 1.6148361127084533e-05, 'epoch': 2.15}


 22%|██▏       | 7801/35780 [29:24<57:48,  8.07it/s]

{'loss': 2.2918, 'grad_norm': 3.8145341873168945, 'learning_rate': 1.609085681426107e-05, 'epoch': 2.18}


 22%|██▏       | 7901/35780 [29:36<57:46,  8.04it/s]

{'loss': 2.2887, 'grad_norm': 3.6148297786712646, 'learning_rate': 1.603335250143761e-05, 'epoch': 2.21}


 22%|██▏       | 8000/35780 [29:49<57:06,  8.11it/s]

{'loss': 2.2948, 'grad_norm': 3.7454118728637695, 'learning_rate': 1.5975848188614147e-05, 'epoch': 2.24}


                                                    
 22%|██▏       | 8000/35780 [30:41<57:06,  8.11it/s]

{'eval_loss': 2.2047204971313477, 'eval_runtime': 52.521, 'eval_samples_per_second': 1417.49, 'eval_steps_per_second': 44.306, 'epoch': 2.24}


 23%|██▎       | 8101/35780 [30:54<57:15,  8.06it/s]    

{'loss': 2.2993, 'grad_norm': 3.553147554397583, 'learning_rate': 1.591891891891892e-05, 'epoch': 2.26}


 23%|██▎       | 8201/35780 [31:07<56:59,  8.07it/s]

{'loss': 2.2956, 'grad_norm': 3.6165547370910645, 'learning_rate': 1.5861414606095457e-05, 'epoch': 2.29}


 23%|██▎       | 8301/35780 [31:19<57:19,  7.99it/s]

{'loss': 2.2829, 'grad_norm': 3.717777729034424, 'learning_rate': 1.5803910293271995e-05, 'epoch': 2.32}


 23%|██▎       | 8401/35780 [31:32<56:59,  8.01it/s]

{'loss': 2.2802, 'grad_norm': 3.904698610305786, 'learning_rate': 1.574698102357677e-05, 'epoch': 2.35}


 24%|██▍       | 8500/35780 [31:44<56:16,  8.08it/s]

{'loss': 2.2623, 'grad_norm': 3.624176263809204, 'learning_rate': 1.568947671075331e-05, 'epoch': 2.38}


                                                    
 24%|██▍       | 8500/35780 [32:36<56:16,  8.08it/s]

{'eval_loss': 2.1989293098449707, 'eval_runtime': 52.5119, 'eval_samples_per_second': 1417.735, 'eval_steps_per_second': 44.314, 'epoch': 2.38}


 24%|██▍       | 8601/35780 [32:50<56:15,  8.05it/s]    

{'loss': 2.2712, 'grad_norm': 3.886472702026367, 'learning_rate': 1.5631972397929847e-05, 'epoch': 2.4}


 24%|██▍       | 8701/35780 [33:02<56:00,  8.06it/s]

{'loss': 2.2858, 'grad_norm': 3.601848602294922, 'learning_rate': 1.5574468085106385e-05, 'epoch': 2.43}


 25%|██▍       | 8801/35780 [33:14<56:20,  7.98it/s]

{'loss': 2.2613, 'grad_norm': 3.795288562774658, 'learning_rate': 1.5516963772282923e-05, 'epoch': 2.46}


 25%|██▍       | 8901/35780 [33:27<56:10,  7.97it/s]

{'loss': 2.2842, 'grad_norm': 3.611344575881958, 'learning_rate': 1.545945945945946e-05, 'epoch': 2.49}


 25%|██▌       | 9000/35780 [33:39<55:22,  8.06it/s]

{'loss': 2.3049, 'grad_norm': 3.771401882171631, 'learning_rate': 1.5401955146636e-05, 'epoch': 2.52}


                                                    
 25%|██▌       | 9000/35780 [34:32<55:22,  8.06it/s]

{'eval_loss': 2.193098306655884, 'eval_runtime': 52.5903, 'eval_samples_per_second': 1415.622, 'eval_steps_per_second': 44.248, 'epoch': 2.52}


 25%|██▌       | 9101/35780 [34:45<55:15,  8.05it/s]    

{'loss': 2.2763, 'grad_norm': 3.623861312866211, 'learning_rate': 1.5344450833812537e-05, 'epoch': 2.54}


 26%|██▌       | 9201/35780 [34:57<54:59,  8.06it/s]

{'loss': 2.2771, 'grad_norm': 3.709577798843384, 'learning_rate': 1.5286946520989075e-05, 'epoch': 2.57}


 26%|██▌       | 9301/35780 [35:10<55:03,  8.01it/s]

{'loss': 2.2752, 'grad_norm': 3.6916894912719727, 'learning_rate': 1.5229442208165614e-05, 'epoch': 2.6}


 26%|██▋       | 9401/35780 [35:22<54:48,  8.02it/s]

{'loss': 2.2508, 'grad_norm': 3.7596371173858643, 'learning_rate': 1.5171937895342152e-05, 'epoch': 2.63}


 27%|██▋       | 9500/35780 [35:34<54:18,  8.06it/s]

{'loss': 2.2894, 'grad_norm': 3.72701358795166, 'learning_rate': 1.511443358251869e-05, 'epoch': 2.66}


                                                    
 27%|██▋       | 9500/35780 [36:27<54:18,  8.06it/s]

{'eval_loss': 2.1868321895599365, 'eval_runtime': 52.5627, 'eval_samples_per_second': 1416.365, 'eval_steps_per_second': 44.271, 'epoch': 2.66}


 27%|██▋       | 9601/35780 [36:40<54:10,  8.05it/s]    

{'loss': 2.2707, 'grad_norm': 3.6392552852630615, 'learning_rate': 1.5056929269695229e-05, 'epoch': 2.68}


 27%|██▋       | 9701/35780 [36:53<54:06,  8.03it/s]

{'loss': 2.2735, 'grad_norm': 3.893707513809204, 'learning_rate': 1.4999424956871767e-05, 'epoch': 2.71}


 27%|██▋       | 9801/35780 [37:05<53:52,  8.04it/s]

{'loss': 2.271, 'grad_norm': 3.811750650405884, 'learning_rate': 1.4941920644048305e-05, 'epoch': 2.74}


 28%|██▊       | 9901/35780 [37:17<53:57,  7.99it/s]

{'loss': 2.2571, 'grad_norm': 3.94960355758667, 'learning_rate': 1.4884416331224843e-05, 'epoch': 2.77}


 28%|██▊       | 10000/35780 [37:30<53:18,  8.06it/s]

{'loss': 2.2672, 'grad_norm': 3.5204451084136963, 'learning_rate': 1.482691201840138e-05, 'epoch': 2.79}


                                                     
 28%|██▊       | 10000/35780 [38:22<53:18,  8.06it/s]

{'eval_loss': 2.1833484172821045, 'eval_runtime': 52.6336, 'eval_samples_per_second': 1414.458, 'eval_steps_per_second': 44.211, 'epoch': 2.79}


 28%|██▊       | 10101/35780 [38:35<53:30,  8.00it/s]    

{'loss': 2.2485, 'grad_norm': 3.6678199768066406, 'learning_rate': 1.476940770557792e-05, 'epoch': 2.82}


 29%|██▊       | 10201/35780 [38:48<52:58,  8.05it/s]

{'loss': 2.2421, 'grad_norm': 3.6623871326446533, 'learning_rate': 1.4711903392754458e-05, 'epoch': 2.85}


 29%|██▉       | 10301/35780 [39:00<52:48,  8.04it/s]

{'loss': 2.26, 'grad_norm': 3.870002031326294, 'learning_rate': 1.4654399079930996e-05, 'epoch': 2.88}


 29%|██▉       | 10401/35780 [39:13<53:02,  7.98it/s]

{'loss': 2.2453, 'grad_norm': 3.785987615585327, 'learning_rate': 1.4596894767107534e-05, 'epoch': 2.91}


 29%|██▉       | 10500/35780 [39:25<52:08,  8.08it/s]

{'loss': 2.2528, 'grad_norm': 3.887012481689453, 'learning_rate': 1.4539390454284072e-05, 'epoch': 2.93}


                                                     
 29%|██▉       | 10500/35780 [40:18<52:08,  8.08it/s]

{'eval_loss': 2.176848888397217, 'eval_runtime': 52.6357, 'eval_samples_per_second': 1414.403, 'eval_steps_per_second': 44.21, 'epoch': 2.93}


 30%|██▉       | 10601/35780 [40:31<52:28,  8.00it/s]    

{'loss': 2.2565, 'grad_norm': 3.8787901401519775, 'learning_rate': 1.448188614146061e-05, 'epoch': 2.96}


 30%|██▉       | 10701/35780 [40:43<52:03,  8.03it/s]

{'loss': 2.2487, 'grad_norm': 3.6856021881103516, 'learning_rate': 1.4424381828637148e-05, 'epoch': 2.99}


 30%|███       | 10801/35780 [40:56<51:43,  8.05it/s]

{'loss': 2.2403, 'grad_norm': 3.6995084285736084, 'learning_rate': 1.4366877515813686e-05, 'epoch': 3.02}


 30%|███       | 10901/35780 [41:08<51:41,  8.02it/s]

{'loss': 2.2489, 'grad_norm': 3.792896032333374, 'learning_rate': 1.4309373202990224e-05, 'epoch': 3.05}


 31%|███       | 11000/35780 [41:20<51:04,  8.09it/s]

{'loss': 2.2428, 'grad_norm': 3.7143845558166504, 'learning_rate': 1.4251868890166764e-05, 'epoch': 3.07}


                                                     
 31%|███       | 11000/35780 [42:13<51:04,  8.09it/s]

{'eval_loss': 2.1769468784332275, 'eval_runtime': 52.5124, 'eval_samples_per_second': 1417.722, 'eval_steps_per_second': 44.313, 'epoch': 3.07}


 31%|███       | 11101/35780 [42:26<51:20,  8.01it/s]    

{'loss': 2.2546, 'grad_norm': 3.6846072673797607, 'learning_rate': 1.4194364577343302e-05, 'epoch': 3.1}


 31%|███▏      | 11201/35780 [42:38<50:44,  8.07it/s]

{'loss': 2.2479, 'grad_norm': 3.7158901691436768, 'learning_rate': 1.413686026451984e-05, 'epoch': 3.13}


 32%|███▏      | 11301/35780 [42:51<50:44,  8.04it/s]

{'loss': 2.2416, 'grad_norm': 3.8515408039093018, 'learning_rate': 1.4079355951696378e-05, 'epoch': 3.16}


 32%|███▏      | 11401/35780 [43:03<50:19,  8.07it/s]

{'loss': 2.2544, 'grad_norm': 3.6149802207946777, 'learning_rate': 1.4021851638872916e-05, 'epoch': 3.19}


 32%|███▏      | 11500/35780 [43:15<49:55,  8.10it/s]

{'loss': 2.2369, 'grad_norm': 3.561479330062866, 'learning_rate': 1.3964347326049454e-05, 'epoch': 3.21}


                                                     
 32%|███▏      | 11500/35780 [44:08<49:55,  8.10it/s]

{'eval_loss': 2.1687326431274414, 'eval_runtime': 52.6208, 'eval_samples_per_second': 1414.801, 'eval_steps_per_second': 44.222, 'epoch': 3.21}


 32%|███▏      | 11601/35780 [44:21<50:09,  8.03it/s]    

{'loss': 2.2303, 'grad_norm': 3.8693594932556152, 'learning_rate': 1.3906843013225992e-05, 'epoch': 3.24}


 33%|███▎      | 11701/35780 [44:34<49:58,  8.03it/s]

{'loss': 2.2389, 'grad_norm': 3.808295249938965, 'learning_rate': 1.384933870040253e-05, 'epoch': 3.27}


 33%|███▎      | 11801/35780 [44:46<49:41,  8.04it/s]

{'loss': 2.2459, 'grad_norm': 3.899364471435547, 'learning_rate': 1.379183438757907e-05, 'epoch': 3.3}


 33%|███▎      | 11901/35780 [44:58<49:18,  8.07it/s]

{'loss': 2.2468, 'grad_norm': 3.8885858058929443, 'learning_rate': 1.3734330074755608e-05, 'epoch': 3.33}


 34%|███▎      | 12000/35780 [45:11<49:14,  8.05it/s]

{'loss': 2.2415, 'grad_norm': 3.7248382568359375, 'learning_rate': 1.3676825761932146e-05, 'epoch': 3.35}


                                                     
 34%|███▎      | 12000/35780 [46:03<49:14,  8.05it/s]

{'eval_loss': 2.165362596511841, 'eval_runtime': 52.5718, 'eval_samples_per_second': 1416.121, 'eval_steps_per_second': 44.263, 'epoch': 3.35}


 34%|███▍      | 12101/35780 [46:16<49:01,  8.05it/s]    

{'loss': 2.2385, 'grad_norm': 3.5870091915130615, 'learning_rate': 1.3619321449108684e-05, 'epoch': 3.38}


 34%|███▍      | 12201/35780 [46:29<49:04,  8.01it/s]

{'loss': 2.2348, 'grad_norm': 3.847461700439453, 'learning_rate': 1.3561817136285222e-05, 'epoch': 3.41}


 34%|███▍      | 12301/35780 [46:41<48:37,  8.05it/s]

{'loss': 2.231, 'grad_norm': 3.9145312309265137, 'learning_rate': 1.350431282346176e-05, 'epoch': 3.44}


 35%|███▍      | 12401/35780 [46:53<48:27,  8.04it/s]

{'loss': 2.2516, 'grad_norm': 3.646599769592285, 'learning_rate': 1.3447383553766534e-05, 'epoch': 3.47}


 35%|███▍      | 12500/35780 [47:06<47:51,  8.11it/s]

{'loss': 2.2253, 'grad_norm': 3.6800575256347656, 'learning_rate': 1.3389879240943072e-05, 'epoch': 3.49}


                                                     
 35%|███▍      | 12500/35780 [47:58<47:51,  8.11it/s]

{'eval_loss': 2.159052610397339, 'eval_runtime': 52.4924, 'eval_samples_per_second': 1418.262, 'eval_steps_per_second': 44.33, 'epoch': 3.49}


 35%|███▌      | 12601/35780 [48:11<47:57,  8.05it/s]    

{'loss': 2.2319, 'grad_norm': 3.760059118270874, 'learning_rate': 1.333237492811961e-05, 'epoch': 3.52}


 35%|███▌      | 12701/35780 [48:24<47:57,  8.02it/s]

{'loss': 2.2206, 'grad_norm': 3.814999580383301, 'learning_rate': 1.327487061529615e-05, 'epoch': 3.55}


 36%|███▌      | 12801/35780 [48:36<47:47,  8.01it/s]

{'loss': 2.2424, 'grad_norm': 3.706214666366577, 'learning_rate': 1.3217366302472687e-05, 'epoch': 3.58}


 36%|███▌      | 12901/35780 [48:49<47:23,  8.05it/s]

{'loss': 2.2202, 'grad_norm': 3.687554359436035, 'learning_rate': 1.3159861989649225e-05, 'epoch': 3.61}


 36%|███▋      | 13000/35780 [49:01<46:50,  8.11it/s]

{'loss': 2.2116, 'grad_norm': 3.5483527183532715, 'learning_rate': 1.3102357676825763e-05, 'epoch': 3.63}


                                                     
 36%|███▋      | 13000/35780 [49:53<46:50,  8.11it/s]

{'eval_loss': 2.1558547019958496, 'eval_runtime': 52.5836, 'eval_samples_per_second': 1415.804, 'eval_steps_per_second': 44.253, 'epoch': 3.63}


 37%|███▋      | 13101/35780 [50:07<47:08,  8.02it/s]    

{'loss': 2.2503, 'grad_norm': 3.588886022567749, 'learning_rate': 1.3044853364002301e-05, 'epoch': 3.66}


 37%|███▋      | 13201/35780 [50:19<46:59,  8.01it/s]

{'loss': 2.2301, 'grad_norm': 3.56290602684021, 'learning_rate': 1.298734905117884e-05, 'epoch': 3.69}


 37%|███▋      | 13301/35780 [50:31<46:53,  7.99it/s]

{'loss': 2.215, 'grad_norm': 3.637357711791992, 'learning_rate': 1.2929844738355377e-05, 'epoch': 3.72}


 37%|███▋      | 13401/35780 [50:44<46:19,  8.05it/s]

{'loss': 2.204, 'grad_norm': 3.6798911094665527, 'learning_rate': 1.2872340425531915e-05, 'epoch': 3.75}


 38%|███▊      | 13500/35780 [50:56<45:58,  8.08it/s]

{'loss': 2.2094, 'grad_norm': 3.62524151802063, 'learning_rate': 1.2814836112708455e-05, 'epoch': 3.77}


                                                     
 38%|███▊      | 13500/35780 [51:49<45:58,  8.08it/s]

{'eval_loss': 2.1496083736419678, 'eval_runtime': 52.5386, 'eval_samples_per_second': 1417.014, 'eval_steps_per_second': 44.291, 'epoch': 3.77}


 38%|███▊      | 13601/35780 [52:02<46:05,  8.02it/s]   

{'loss': 2.2378, 'grad_norm': 3.570605993270874, 'learning_rate': 1.2757331799884993e-05, 'epoch': 3.8}


 38%|███▊      | 13701/35780 [52:14<45:54,  8.02it/s]

{'loss': 2.2256, 'grad_norm': 3.713261127471924, 'learning_rate': 1.2699827487061531e-05, 'epoch': 3.83}


 39%|███▊      | 13801/35780 [52:27<45:38,  8.03it/s]

{'loss': 2.2272, 'grad_norm': 3.6366031169891357, 'learning_rate': 1.264232317423807e-05, 'epoch': 3.86}


 39%|███▉      | 13901/35780 [52:39<45:20,  8.04it/s]

{'loss': 2.207, 'grad_norm': 3.8409183025360107, 'learning_rate': 1.2584818861414607e-05, 'epoch': 3.88}


 39%|███▉      | 14000/35780 [52:51<44:45,  8.11it/s]

{'loss': 2.2464, 'grad_norm': 3.8982441425323486, 'learning_rate': 1.2527314548591145e-05, 'epoch': 3.91}


                                                     
 39%|███▉      | 14000/35780 [53:44<44:45,  8.11it/s]

{'eval_loss': 2.1492888927459717, 'eval_runtime': 52.4432, 'eval_samples_per_second': 1419.592, 'eval_steps_per_second': 44.372, 'epoch': 3.91}


 39%|███▉      | 14101/35780 [53:57<44:53,  8.05it/s]   

{'loss': 2.2176, 'grad_norm': 3.8001906871795654, 'learning_rate': 1.2469810235767683e-05, 'epoch': 3.94}


 40%|███▉      | 14201/35780 [54:09<44:52,  8.01it/s]

{'loss': 2.2287, 'grad_norm': 3.706984281539917, 'learning_rate': 1.2412305922944221e-05, 'epoch': 3.97}


 40%|███▉      | 14301/35780 [54:22<44:46,  8.00it/s]

{'loss': 2.2118, 'grad_norm': 3.773850917816162, 'learning_rate': 1.235480161012076e-05, 'epoch': 4.0}


 40%|████      | 14401/35780 [54:34<44:16,  8.05it/s]

{'loss': 2.2227, 'grad_norm': 3.5696425437927246, 'learning_rate': 1.2297872340425535e-05, 'epoch': 4.02}


 41%|████      | 14500/35780 [54:46<43:43,  8.11it/s]

{'loss': 2.2189, 'grad_norm': 3.9605777263641357, 'learning_rate': 1.2240368027602073e-05, 'epoch': 4.05}


                                                     
 41%|████      | 14500/35780 [55:39<43:43,  8.11it/s]

{'eval_loss': 2.148578405380249, 'eval_runtime': 52.5397, 'eval_samples_per_second': 1416.987, 'eval_steps_per_second': 44.29, 'epoch': 4.05}


 41%|████      | 14601/35780 [55:52<43:59,  8.02it/s]   

{'loss': 2.2217, 'grad_norm': 3.656407117843628, 'learning_rate': 1.218286371477861e-05, 'epoch': 4.08}


 41%|████      | 14701/35780 [56:04<43:54,  8.00it/s]

{'loss': 2.2075, 'grad_norm': 3.555678606033325, 'learning_rate': 1.2125359401955149e-05, 'epoch': 4.11}


 41%|████▏     | 14801/35780 [56:17<43:32,  8.03it/s]

{'loss': 2.2091, 'grad_norm': 3.6327028274536133, 'learning_rate': 1.2067855089131687e-05, 'epoch': 4.14}


 42%|████▏     | 14901/35780 [56:29<43:24,  8.02it/s]

{'loss': 2.2, 'grad_norm': 3.9101433753967285, 'learning_rate': 1.2010350776308225e-05, 'epoch': 4.16}


 42%|████▏     | 15000/35780 [56:42<42:41,  8.11it/s]

{'loss': 2.1856, 'grad_norm': 3.3938472270965576, 'learning_rate': 1.1952846463484763e-05, 'epoch': 4.19}


                                                     
 42%|████▏     | 15000/35780 [57:34<42:41,  8.11it/s]

{'eval_loss': 2.1461806297302246, 'eval_runtime': 52.4805, 'eval_samples_per_second': 1418.583, 'eval_steps_per_second': 44.34, 'epoch': 4.19}


 42%|████▏     | 15101/35780 [57:47<43:02,  8.01it/s]   

{'loss': 2.2019, 'grad_norm': 3.76733660697937, 'learning_rate': 1.1895342150661299e-05, 'epoch': 4.22}


 42%|████▏     | 15201/35780 [58:00<42:48,  8.01it/s]

{'loss': 2.2029, 'grad_norm': 3.912949562072754, 'learning_rate': 1.1837837837837837e-05, 'epoch': 4.25}


 43%|████▎     | 15301/35780 [58:12<42:31,  8.03it/s]

{'loss': 2.2038, 'grad_norm': 3.696024179458618, 'learning_rate': 1.1780333525014378e-05, 'epoch': 4.28}


 43%|████▎     | 15401/35780 [58:24<42:30,  7.99it/s]

{'loss': 2.1935, 'grad_norm': 3.6557836532592773, 'learning_rate': 1.1722829212190916e-05, 'epoch': 4.3}


 43%|████▎     | 15500/35780 [58:37<41:56,  8.06it/s]

{'loss': 2.1862, 'grad_norm': 3.625537157058716, 'learning_rate': 1.1665324899367454e-05, 'epoch': 4.33}


                                                     
 43%|████▎     | 15500/35780 [59:29<41:56,  8.06it/s]

{'eval_loss': 2.1389307975769043, 'eval_runtime': 52.4568, 'eval_samples_per_second': 1419.225, 'eval_steps_per_second': 44.36, 'epoch': 4.33}


 44%|████▎     | 15601/35780 [59:42<41:54,  8.02it/s]   

{'loss': 2.1936, 'grad_norm': 3.551037549972534, 'learning_rate': 1.1607820586543992e-05, 'epoch': 4.36}


 44%|████▍     | 15701/35780 [59:55<41:41,  8.03it/s]

{'loss': 2.1996, 'grad_norm': 3.939487934112549, 'learning_rate': 1.155031627372053e-05, 'epoch': 4.39}


 44%|████▍     | 15801/35780 [1:00:07<41:29,  8.03it/s]

{'loss': 2.2196, 'grad_norm': 3.753139019012451, 'learning_rate': 1.1492811960897069e-05, 'epoch': 4.42}


 44%|████▍     | 15901/35780 [1:00:20<41:17,  8.02it/s]

{'loss': 2.1846, 'grad_norm': 3.747812032699585, 'learning_rate': 1.1435307648073607e-05, 'epoch': 4.44}


 45%|████▍     | 16000/35780 [1:00:32<40:47,  8.08it/s]

{'loss': 2.2028, 'grad_norm': 3.703045606613159, 'learning_rate': 1.1377803335250145e-05, 'epoch': 4.47}


                                                       
 45%|████▍     | 16000/35780 [1:01:24<40:47,  8.08it/s]

{'eval_loss': 2.1330885887145996, 'eval_runtime': 52.4556, 'eval_samples_per_second': 1419.257, 'eval_steps_per_second': 44.361, 'epoch': 4.47}


 45%|████▌     | 16101/35780 [1:01:38<40:59,  8.00it/s]   

{'loss': 2.2094, 'grad_norm': 3.799128770828247, 'learning_rate': 1.1320299022426684e-05, 'epoch': 4.5}


 45%|████▌     | 16201/35780 [1:01:50<40:36,  8.04it/s]

{'loss': 2.1987, 'grad_norm': 3.6739342212677, 'learning_rate': 1.1262794709603222e-05, 'epoch': 4.53}


 46%|████▌     | 16301/35780 [1:02:02<40:26,  8.03it/s]

{'loss': 2.1853, 'grad_norm': 3.564405679702759, 'learning_rate': 1.120529039677976e-05, 'epoch': 4.56}


 46%|████▌     | 16401/35780 [1:02:15<39:53,  8.10it/s]

{'loss': 2.2066, 'grad_norm': 3.897737503051758, 'learning_rate': 1.1148361127084532e-05, 'epoch': 4.58}


 46%|████▌     | 16500/35780 [1:02:27<39:51,  8.06it/s]

{'loss': 2.2071, 'grad_norm': 4.004103183746338, 'learning_rate': 1.109085681426107e-05, 'epoch': 4.61}


                                                       
 46%|████▌     | 16500/35780 [1:03:19<39:51,  8.06it/s]

{'eval_loss': 2.136902093887329, 'eval_runtime': 52.4017, 'eval_samples_per_second': 1420.717, 'eval_steps_per_second': 44.407, 'epoch': 4.61}


 46%|████▋     | 16601/35780 [1:03:33<39:57,  8.00it/s]   

{'loss': 2.1967, 'grad_norm': 3.6435747146606445, 'learning_rate': 1.1033352501437608e-05, 'epoch': 4.64}


 47%|████▋     | 16701/35780 [1:03:45<39:35,  8.03it/s]

{'loss': 2.2041, 'grad_norm': 3.887831926345825, 'learning_rate': 1.0975848188614146e-05, 'epoch': 4.67}


 47%|████▋     | 16801/35780 [1:03:57<39:22,  8.03it/s]

{'loss': 2.1922, 'grad_norm': 3.811499834060669, 'learning_rate': 1.0918343875790684e-05, 'epoch': 4.7}


 47%|████▋     | 16901/35780 [1:04:10<39:16,  8.01it/s]

{'loss': 2.1955, 'grad_norm': 3.626366376876831, 'learning_rate': 1.0860839562967222e-05, 'epoch': 4.72}


 48%|████▊     | 17000/35780 [1:04:22<38:43,  8.08it/s]

{'loss': 2.1905, 'grad_norm': 3.6714162826538086, 'learning_rate': 1.0803335250143762e-05, 'epoch': 4.75}


                                                       
 48%|████▊     | 17000/35780 [1:05:15<38:43,  8.08it/s]

{'eval_loss': 2.1298909187316895, 'eval_runtime': 52.4891, 'eval_samples_per_second': 1418.35, 'eval_steps_per_second': 44.333, 'epoch': 4.75}


 48%|████▊     | 17101/35780 [1:05:28<38:52,  8.01it/s]   

{'loss': 2.1786, 'grad_norm': 3.5966291427612305, 'learning_rate': 1.07458309373203e-05, 'epoch': 4.78}


 48%|████▊     | 17201/35780 [1:05:40<38:32,  8.03it/s]

{'loss': 2.1872, 'grad_norm': 3.8779733180999756, 'learning_rate': 1.0688326624496838e-05, 'epoch': 4.81}


 48%|████▊     | 17301/35780 [1:05:53<38:29,  8.00it/s]

{'loss': 2.1917, 'grad_norm': 3.6838462352752686, 'learning_rate': 1.0630822311673376e-05, 'epoch': 4.84}


 49%|████▊     | 17401/35780 [1:06:05<38:10,  8.02it/s]

{'loss': 2.1742, 'grad_norm': 3.791729211807251, 'learning_rate': 1.0573317998849914e-05, 'epoch': 4.86}


 49%|████▉     | 17500/35780 [1:06:17<37:40,  8.09it/s]

{'loss': 2.2008, 'grad_norm': 3.5529613494873047, 'learning_rate': 1.0515813686026452e-05, 'epoch': 4.89}


                                                       
 49%|████▉     | 17500/35780 [1:07:10<37:40,  8.09it/s]

{'eval_loss': 2.122312068939209, 'eval_runtime': 52.4857, 'eval_samples_per_second': 1418.444, 'eval_steps_per_second': 44.336, 'epoch': 4.89}


 49%|████▉     | 17601/35780 [1:07:23<37:54,  7.99it/s]   

{'loss': 2.2023, 'grad_norm': 3.6309902667999268, 'learning_rate': 1.045830937320299e-05, 'epoch': 4.92}


 49%|████▉     | 17701/35780 [1:07:35<37:30,  8.03it/s]

{'loss': 2.1897, 'grad_norm': 3.727041244506836, 'learning_rate': 1.0400805060379528e-05, 'epoch': 4.95}


 50%|████▉     | 17801/35780 [1:07:48<37:21,  8.02it/s]

{'loss': 2.1875, 'grad_norm': 3.7729105949401855, 'learning_rate': 1.0343300747556066e-05, 'epoch': 4.97}


 50%|█████     | 17901/35780 [1:08:00<37:01,  8.05it/s]

{'loss': 2.1813, 'grad_norm': 3.657167434692383, 'learning_rate': 1.0285796434732608e-05, 'epoch': 5.0}


 50%|█████     | 18000/35780 [1:08:13<36:42,  8.07it/s]

{'loss': 2.1894, 'grad_norm': 3.780454158782959, 'learning_rate': 1.0228292121909144e-05, 'epoch': 5.03}


                                                       
 50%|█████     | 18000/35780 [1:09:05<36:42,  8.07it/s]

{'eval_loss': 2.130140781402588, 'eval_runtime': 52.4172, 'eval_samples_per_second': 1420.297, 'eval_steps_per_second': 44.394, 'epoch': 5.03}


 51%|█████     | 18101/35780 [1:09:18<36:50,  8.00it/s]   

{'loss': 2.1718, 'grad_norm': 3.7347042560577393, 'learning_rate': 1.0170787809085682e-05, 'epoch': 5.06}


 51%|█████     | 18201/35780 [1:09:31<36:36,  8.00it/s]

{'loss': 2.1848, 'grad_norm': 3.7820048332214355, 'learning_rate': 1.011328349626222e-05, 'epoch': 5.09}


 51%|█████     | 18301/35780 [1:09:43<36:25,  8.00it/s]

{'loss': 2.1939, 'grad_norm': 3.7700273990631104, 'learning_rate': 1.0055779183438758e-05, 'epoch': 5.11}


 51%|█████▏    | 18401/35780 [1:09:55<36:03,  8.03it/s]

{'loss': 2.1733, 'grad_norm': 3.6012725830078125, 'learning_rate': 9.998274870615296e-06, 'epoch': 5.14}


 52%|█████▏    | 18500/35780 [1:10:08<35:53,  8.02it/s]

{'loss': 2.168, 'grad_norm': 3.5818052291870117, 'learning_rate': 9.94134560092007e-06, 'epoch': 5.17}


                                                       
 52%|█████▏    | 18500/35780 [1:11:00<35:53,  8.02it/s]

{'eval_loss': 2.123793363571167, 'eval_runtime': 52.5035, 'eval_samples_per_second': 1417.962, 'eval_steps_per_second': 44.321, 'epoch': 5.17}


 52%|█████▏    | 18601/35780 [1:11:13<35:38,  8.03it/s]   

{'loss': 2.1716, 'grad_norm': 3.702817440032959, 'learning_rate': 9.88384128809661e-06, 'epoch': 5.2}


 52%|█████▏    | 18701/35780 [1:11:26<35:19,  8.06it/s]

{'loss': 2.1962, 'grad_norm': 3.728191375732422, 'learning_rate': 9.826336975273145e-06, 'epoch': 5.23}


 53%|█████▎    | 18801/35780 [1:11:38<35:10,  8.04it/s]

{'loss': 2.1905, 'grad_norm': 3.841203212738037, 'learning_rate': 9.768832662449684e-06, 'epoch': 5.25}


 53%|█████▎    | 18901/35780 [1:11:51<34:53,  8.06it/s]

{'loss': 2.1879, 'grad_norm': 4.043895244598389, 'learning_rate': 9.711328349626222e-06, 'epoch': 5.28}


 53%|█████▎    | 19000/35780 [1:12:03<34:41,  8.06it/s]

{'loss': 2.1894, 'grad_norm': 3.58307147026062, 'learning_rate': 9.653824036802761e-06, 'epoch': 5.31}


                                                       
 53%|█████▎    | 19000/35780 [1:12:56<34:41,  8.06it/s]

{'eval_loss': 2.120260000228882, 'eval_runtime': 52.5538, 'eval_samples_per_second': 1416.606, 'eval_steps_per_second': 44.278, 'epoch': 5.31}


 53%|█████▎    | 19101/35780 [1:13:09<34:47,  7.99it/s]   

{'loss': 2.1644, 'grad_norm': 3.639327049255371, 'learning_rate': 9.5963197239793e-06, 'epoch': 5.34}


 54%|█████▎    | 19201/35780 [1:13:21<34:39,  7.97it/s]

{'loss': 2.1609, 'grad_norm': 3.8910558223724365, 'learning_rate': 9.538815411155837e-06, 'epoch': 5.37}


 54%|█████▍    | 19301/35780 [1:13:33<34:08,  8.04it/s]

{'loss': 2.1804, 'grad_norm': 3.6812901496887207, 'learning_rate': 9.481311098332375e-06, 'epoch': 5.39}


 54%|█████▍    | 19401/35780 [1:13:46<33:58,  8.03it/s]

{'loss': 2.1748, 'grad_norm': 3.562530279159546, 'learning_rate': 9.423806785508915e-06, 'epoch': 5.42}


 54%|█████▍    | 19500/35780 [1:13:58<33:35,  8.08it/s]

{'loss': 2.1885, 'grad_norm': 3.818065881729126, 'learning_rate': 9.366302472685453e-06, 'epoch': 5.45}


                                                       
 54%|█████▍    | 19500/35780 [1:14:51<33:35,  8.08it/s]

{'eval_loss': 2.1142170429229736, 'eval_runtime': 52.4781, 'eval_samples_per_second': 1418.65, 'eval_steps_per_second': 44.342, 'epoch': 5.45}


 55%|█████▍    | 19601/35780 [1:15:04<33:18,  8.10it/s]   

{'loss': 2.2, 'grad_norm': 3.665357828140259, 'learning_rate': 9.308798159861991e-06, 'epoch': 5.48}


 55%|█████▌    | 19701/35780 [1:15:16<33:21,  8.03it/s]

{'loss': 2.1879, 'grad_norm': 3.9723501205444336, 'learning_rate': 9.251293847038529e-06, 'epoch': 5.51}


 55%|█████▌    | 19801/35780 [1:15:29<33:14,  8.01it/s]

{'loss': 2.1769, 'grad_norm': 3.8034074306488037, 'learning_rate': 9.193789534215067e-06, 'epoch': 5.53}


 56%|█████▌    | 19901/35780 [1:15:41<32:58,  8.03it/s]

{'loss': 2.1739, 'grad_norm': 3.6746654510498047, 'learning_rate': 9.136285221391605e-06, 'epoch': 5.56}


 56%|█████▌    | 20000/35780 [1:15:53<32:33,  8.08it/s]

{'loss': 2.1835, 'grad_norm': 3.5555126667022705, 'learning_rate': 9.078780908568143e-06, 'epoch': 5.59}


                                                       
 56%|█████▌    | 20000/35780 [1:16:46<32:33,  8.08it/s]

{'eval_loss': 2.114222526550293, 'eval_runtime': 52.5734, 'eval_samples_per_second': 1416.078, 'eval_steps_per_second': 44.262, 'epoch': 5.59}


 56%|█████▌    | 20101/35780 [1:16:59<32:34,  8.02it/s]   

{'loss': 2.1759, 'grad_norm': 3.689662218093872, 'learning_rate': 9.021276595744681e-06, 'epoch': 5.62}


 56%|█████▋    | 20201/35780 [1:17:11<32:17,  8.04it/s]

{'loss': 2.1737, 'grad_norm': 3.7420449256896973, 'learning_rate': 8.96377228292122e-06, 'epoch': 5.65}


 57%|█████▋    | 20301/35780 [1:17:24<32:08,  8.02it/s]

{'loss': 2.1626, 'grad_norm': 3.9478886127471924, 'learning_rate': 8.906267970097759e-06, 'epoch': 5.67}


 57%|█████▋    | 20401/35780 [1:17:36<31:54,  8.03it/s]

{'loss': 2.2018, 'grad_norm': 3.8834993839263916, 'learning_rate': 8.848763657274297e-06, 'epoch': 5.7}


 57%|█████▋    | 20500/35780 [1:17:49<31:34,  8.07it/s]

{'loss': 2.1746, 'grad_norm': 3.7774899005889893, 'learning_rate': 8.791834387579069e-06, 'epoch': 5.73}


                                                       
 57%|█████▋    | 20500/35780 [1:18:41<31:34,  8.07it/s]

{'eval_loss': 2.1182730197906494, 'eval_runtime': 52.5779, 'eval_samples_per_second': 1415.957, 'eval_steps_per_second': 44.258, 'epoch': 5.73}


 58%|█████▊    | 20601/35780 [1:18:54<31:25,  8.05it/s]   

{'loss': 2.1603, 'grad_norm': 3.8018441200256348, 'learning_rate': 8.734330074755607e-06, 'epoch': 5.76}


 58%|█████▊    | 20701/35780 [1:19:07<31:10,  8.06it/s]

{'loss': 2.1721, 'grad_norm': 3.860416889190674, 'learning_rate': 8.676825761932146e-06, 'epoch': 5.79}


 58%|█████▊    | 20801/35780 [1:19:19<31:20,  7.96it/s]

{'loss': 2.1736, 'grad_norm': 3.822904586791992, 'learning_rate': 8.619321449108684e-06, 'epoch': 5.81}


 58%|█████▊    | 20901/35780 [1:19:31<30:55,  8.02it/s]

{'loss': 2.1664, 'grad_norm': 3.672532558441162, 'learning_rate': 8.561817136285222e-06, 'epoch': 5.84}


 59%|█████▊    | 21000/35780 [1:19:44<30:24,  8.10it/s]

{'loss': 2.206, 'grad_norm': 3.703555107116699, 'learning_rate': 8.50431282346176e-06, 'epoch': 5.87}


                                                       
 59%|█████▊    | 21000/35780 [1:20:36<30:24,  8.10it/s]

{'eval_loss': 2.114932060241699, 'eval_runtime': 52.4896, 'eval_samples_per_second': 1418.337, 'eval_steps_per_second': 44.333, 'epoch': 5.87}


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].
 59%|█████▊    | 21000/35780 [1:20:37<56:44,  4.34it/s]

{'train_runtime': 4837.5299, 'train_samples_per_second': 473.339, 'train_steps_per_second': 7.396, 'train_loss': 2.2944495900472006, 'epoch': 5.87}





TrainOutput(global_step=21000, training_loss=2.2944495900472006, metrics={'train_runtime': 4837.5299, 'train_samples_per_second': 473.339, 'train_steps_per_second': 7.396, 'total_flos': 3.583948918699494e+16, 'train_loss': 2.2944495900472006, 'epoch': 5.869200670765791})

## Evaluate the Model

In [18]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


  0%|          | 0/2327 [00:00<?, ?it/s]

100%|██████████| 2327/2327 [00:52<00:00, 43.91it/s]

Perplexity: 8.34





In [19]:
# Saving the Model and Tokenizer

import json
model_output_dir = "distilbert-finetuned-imdb-mlm"

trainer.save_model(model_output_dir)

tokenizer.save_pretrained(model_output_dir)

('distilbert-finetuned-imdb-mlm\\tokenizer_config.json',
 'distilbert-finetuned-imdb-mlm\\special_tokens_map.json',
 'distilbert-finetuned-imdb-mlm\\vocab.txt',
 'distilbert-finetuned-imdb-mlm\\added_tokens.json',
 'distilbert-finetuned-imdb-mlm\\tokenizer.json')

In [22]:
metrics_output_file = model_output_dir + "/log_history.json"
with open(metrics_output_file, "w") as f:
    json.dump(trainer.state.log_history, f)

In [20]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="distilbert-finetuned-imdb-mlm"
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [21]:
text = "This is a great [MASK]."
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great movie.
>>> this is a great film.
>>> this is a great show.
>>> this is a great documentary.
>>> this is a great story.
