In [1]:
import tqdm as notebook_tqdm
from datasets import load_dataset

In [2]:
squad = load_dataset("squad")

#### Check dataset

In [3]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
squad['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [5]:
squad['train'][:1]

{'id': ['5733be284776f41900661182'],
 'title': ['University_of_Notre_Dame'],
 'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'],
 'question': ['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'],
 'answers': [{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}]}

In [6]:
squad['validation'][:1]

{'id': ['56be4db0acb8001400a502ec'],
 'title': ['Super_Bowl_50'],
 'context': ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'],
 'question': ['Which NFL team represented the AFC at Super Bowl 50?'],
 'answers': [{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Br

### Tokenizer

- For encoding the data which need to provides to my bert model (transformer encoder)
- Transformer(encoded data) --> positional encoding --> attention layer --> neural network --> output
- High level overview of transformer working

In [7]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

Transformer --> model --> pretrain --> BERT --> question answering, text classification, sentiment analysis, summarization, translation, etc.

Transformer --> BERT --> Specific classes (question answering, text classification, sentiment analysis, summarization, translation, etc.)

Transformer --> pytorch (AutoModelForQuestionAnswering, AutoModelForCasualLM, etc.)

Transformer --> Tensorflow (TFAutoModel) --> Low level authority --> You can add your own layers

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [12]:
squad['validation'][:1]

{'id': ['56be4db0acb8001400a502ec'],
 'title': ['Super_Bowl_50'],
 'context': ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'],
 'question': ['Which NFL team represented the AFC at Super Bowl 50?'],
 'answers': [{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Br

In [13]:
text = 'Which NFL team represented the AFC at Super Bowl 50?'

In [14]:
context = 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'

In [15]:
inputs = tokenizer(text, context, return_tensors='pt')

In [16]:
inputs

{'input_ids': tensor([[  101,  2029,  5088,  2136,  3421,  1996, 10511,  2012,  3565,  4605,
          2753,  1029,   102,  3565,  4605,  2753,  2001,  2019,  2137,  2374,
          2208,  2000,  5646,  1996,  3410,  1997,  1996,  2120,  2374,  2223,
          1006,  5088,  1007,  2005,  1996,  2325,  2161,  1012,  1996,  2137,
          2374,  3034,  1006, 10511,  1007,  3410,  7573, 14169,  3249,  1996,
          2120,  2374,  3034,  1006, 22309,  1007,  3410,  3792, 12915,  2484,
          1516,  2184,  2000,  7796,  2037,  2353,  3565,  4605,  2516,  1012,
          1996,  2208,  2001,  2209,  2006,  2337,  1021,  1010,  2355,  1010,
          2012, 11902,  1005,  1055,  3346,  1999,  1996,  2624,  3799,  3016,
          2181,  2012,  4203, 10254,  1010,  2662,  1012,  2004,  2023,  2001,
          1996, 12951,  3565,  4605,  1010,  1996,  2223, 13155,  1996,  1000,
          3585,  5315,  1000,  2007,  2536,  2751,  1011, 11773, 11107,  1010,
          2004,  2092,  2004,  8184, 2

- Attention mask is which work is important. 1 means important word, 0 means not important.

In [24]:
# Move input tensors to the same device as the model
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

In [25]:
input_ids

tensor([[  101,  2029,  5088,  2136,  3421,  1996, 10511,  2012,  3565,  4605,
          2753,  1029,   102,  3565,  4605,  2753,  2001,  2019,  2137,  2374,
          2208,  2000,  5646,  1996,  3410,  1997,  1996,  2120,  2374,  2223,
          1006,  5088,  1007,  2005,  1996,  2325,  2161,  1012,  1996,  2137,
          2374,  3034,  1006, 10511,  1007,  3410,  7573, 14169,  3249,  1996,
          2120,  2374,  3034,  1006, 22309,  1007,  3410,  3792, 12915,  2484,
          1516,  2184,  2000,  7796,  2037,  2353,  3565,  4605,  2516,  1012,
          1996,  2208,  2001,  2209,  2006,  2337,  1021,  1010,  2355,  1010,
          2012, 11902,  1005,  1055,  3346,  1999,  1996,  2624,  3799,  3016,
          2181,  2012,  4203, 10254,  1010,  2662,  1012,  2004,  2023,  2001,
          1996, 12951,  3565,  4605,  1010,  1996,  2223, 13155,  1996,  1000,
          3585,  5315,  1000,  2007,  2536,  2751,  1011, 11773, 11107,  1010,
          2004,  2092,  2004,  8184, 28324,  2075,  

In [26]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]], device='cuda:0')

In [27]:
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

In [28]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-9.9525e-02, -1.6205e-02, -5.8271e-02, -4.0584e-01, -1.1191e-02,
         -1.0651e-01, -8.7459e-02,  3.5844e-01,  3.8691e-01,  3.5075e-01,
          1.9071e-01,  1.0301e-01,  1.7453e-01,  4.4841e-01,  4.2113e-01,
          2.6179e-01,  1.2617e-01,  2.2724e-01, -1.8773e-01,  8.4850e-02,
          1.0451e-01,  8.3353e-02,  5.2257e-01, -2.9884e-01, -3.6994e-02,
         -8.8060e-02, -2.9656e-01,  1.7547e-01,  3.6042e-01,  1.1349e-01,
         -1.4115e-01,  1.2845e-01,  1.1504e-01,  1.5430e-02, -2.5788e-01,
          1.5131e-01,  1.0811e-01, -4.5800e-01, -5.2916e-01, -1.8176e-02,
          2.6946e-01,  1.6311e-01, -1.2378e-02,  2.8995e-02,  2.1362e-01,
         -2.4795e-01, -2.4248e-01, -9.6852e-02, -5.7364e-02, -4.3557e-01,
          8.3975e-02,  3.3610e-01,  2.6692e-01,  9.0950e-02,  1.8509e-02,
         -4.8028e-01, -2.4935e-01, -1.1934e-01, -2.4996e-01, -2.2500e-01,
         -2.3658e-01, -3.5485e-02, -9.3080e-02,  1.0425e-01

In [29]:
outputs.start_logits

tensor([[-9.9525e-02, -1.6205e-02, -5.8271e-02, -4.0584e-01, -1.1191e-02,
         -1.0651e-01, -8.7459e-02,  3.5844e-01,  3.8691e-01,  3.5075e-01,
          1.9071e-01,  1.0301e-01,  1.7453e-01,  4.4841e-01,  4.2113e-01,
          2.6179e-01,  1.2617e-01,  2.2724e-01, -1.8773e-01,  8.4850e-02,
          1.0451e-01,  8.3353e-02,  5.2257e-01, -2.9884e-01, -3.6994e-02,
         -8.8060e-02, -2.9656e-01,  1.7547e-01,  3.6042e-01,  1.1349e-01,
         -1.4115e-01,  1.2845e-01,  1.1504e-01,  1.5430e-02, -2.5788e-01,
          1.5131e-01,  1.0811e-01, -4.5800e-01, -5.2916e-01, -1.8176e-02,
          2.6946e-01,  1.6311e-01, -1.2378e-02,  2.8995e-02,  2.1362e-01,
         -2.4795e-01, -2.4248e-01, -9.6852e-02, -5.7364e-02, -4.3557e-01,
          8.3975e-02,  3.3610e-01,  2.6692e-01,  9.0950e-02,  1.8509e-02,
         -4.8028e-01, -2.4935e-01, -1.1934e-01, -2.4996e-01, -2.2500e-01,
         -2.3658e-01, -3.5485e-02, -9.3080e-02,  1.0425e-01,  1.0147e-01,
          6.1986e-02,  3.4700e-01, -2.

In [30]:
outputs.end_logits

tensor([[ 0.9637,  0.3824,  0.1824,  0.6251,  0.3256,  0.2387, -0.0265,  0.2797,
         -0.1483, -0.1341,  0.3057, -0.1693,  0.1827, -0.0031,  0.0113,  0.3109,
          0.3064,  0.5739, -0.3737, -0.2018,  0.5672,  0.3253,  0.2871,  0.0871,
          0.3093, -0.0133, -0.0108, -0.3007, -0.4083,  0.1139, -0.1556, -0.2368,
          0.0904, -0.3656,  0.0190, -0.2405,  0.4642, -0.3385,  0.1299, -0.7632,
         -0.4328,  0.2373, -0.1090, -0.2327,  0.3647,  0.0599,  0.0975, -0.2161,
          0.4201,  0.0172, -0.2779, -0.4061, -0.0503,  0.0436, -0.3024, -0.3493,
          0.2447, -0.0832, -0.1213,  0.2020,  0.2552,  0.2779, -0.0106,  0.3784,
          0.2274,  0.0621, -0.0186, -0.0278,  0.0353, -0.3324,  0.4175,  0.4267,
          0.4440,  0.5848,  0.4576,  0.2086,  0.3373, -0.2441, -0.3948, -0.2961,
          0.4459,  0.1598,  0.3797,  0.1795, -0.0999,  0.3271,  0.4559,  0.1409,
          0.0800,  0.6206,  0.3769,  0.2767,  0.2124,  0.4997,  0.2332,  0.5271,
         -0.3232,  0.6747,  

In [31]:
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [32]:
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores) + 1

In [33]:
start_index, end_index

(tensor(73, device='cuda:0'), tensor(1, device='cuda:0'))

In [34]:
input_ids

tensor([[  101,  2029,  5088,  2136,  3421,  1996, 10511,  2012,  3565,  4605,
          2753,  1029,   102,  3565,  4605,  2753,  2001,  2019,  2137,  2374,
          2208,  2000,  5646,  1996,  3410,  1997,  1996,  2120,  2374,  2223,
          1006,  5088,  1007,  2005,  1996,  2325,  2161,  1012,  1996,  2137,
          2374,  3034,  1006, 10511,  1007,  3410,  7573, 14169,  3249,  1996,
          2120,  2374,  3034,  1006, 22309,  1007,  3410,  3792, 12915,  2484,
          1516,  2184,  2000,  7796,  2037,  2353,  3565,  4605,  2516,  1012,
          1996,  2208,  2001,  2209,  2006,  2337,  1021,  1010,  2355,  1010,
          2012, 11902,  1005,  1055,  3346,  1999,  1996,  2624,  3799,  3016,
          2181,  2012,  4203, 10254,  1010,  2662,  1012,  2004,  2023,  2001,
          1996, 12951,  3565,  4605,  1010,  1996,  2223, 13155,  1996,  1000,
          3585,  5315,  1000,  2007,  2536,  2751,  1011, 11773, 11107,  1010,
          2004,  2092,  2004,  8184, 28324,  2075,  

In [35]:
len(input_ids[0])

171

In [36]:
answer_tokens = input_ids[0][start_index:end_index]

In [37]:
tokenizer.decode(answer_tokens)

''

- LLM model --> Inferencing --> Question Answering
- Inferencing --> load the pretraining model and test your model over the test data.
- Inferencing is the prediction on top of the data which is not seen by the model.

#### Process the data and then finetune the model

In [48]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride.
    # This results in one example possible giving several features when a context is long, 
    # each of those features having a context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation='only_second', # truncate context, not the question
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )
    
    # Since one example might give us several features if it has a long context, we need a map from a feature to its corresponding example.
    # This key will help us map the token's position to its position in the list of sentence.
    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    
    # The offset mappings will give us a map from token to character position in the original context. 
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop('offset_mapping')
    
    # Let's label those examples!
    tokenized_examples['start_positions'] = []
    tokenized_examples['end_positions'] = []
    
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # Grab the sequence corresponding to that example (to know what is the context and what is the question)
        sequence_ids = tokenized_examples.sequence_ids(i)
        
        # One example can give several spans, this is the index of the example containing this span of texts
        sample_index = sample_mapping[i]
        answers = examples['answers'][sample_index]
        # If no answers are given, set the cls_index as answer.
        
        if len(answers['answer_start']) == 0:
            tokenized_examples['start_positions'].append(cls_index)
            tokenized_examples['end_positions'].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answers['text'][0])
            
            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
                
            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
                
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS)
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples['start_positions'].append(cls_index)
                tokenized_examples['end_positions'].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples['start_positions'].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples['end_positions'].append(token_end_index + 1)
                
    return tokenized_examples
    

In [49]:
example = squad['validation'][:1]

In [50]:
prepare_train_features(example)

{'input_ids': [[101, 2029, 5088, 2136, 3421, 1996, 10511, 2012, 3565, 4605, 2753, 1029, 102, 3565, 4605, 2753, 2001, 2019, 2137, 2374, 2208, 2000, 5646, 1996, 3410, 1997, 1996, 2120, 2374, 2223, 1006, 5088, 1007, 2005, 1996, 2325, 2161, 1012, 1996, 2137, 2374, 3034, 1006, 10511, 1007, 3410, 7573, 14169, 3249, 1996, 2120, 2374, 3034, 1006, 22309, 1007, 3410, 3792, 12915, 2484, 1516, 2184, 2000, 7796, 2037, 2353, 3565, 4605, 2516, 1012, 1996, 2208, 2001, 2209, 2006, 2337, 1021, 1010, 2355, 1010, 2012, 11902, 1005, 1055, 3346, 1999, 1996, 2624, 3799, 3016, 2181, 2012, 4203, 10254, 1010, 2662, 1012, 2004, 2023, 2001, 1996, 12951, 3565, 4605, 1010, 1996, 2223, 13155, 1996, 1000, 3585, 5315, 1000, 2007, 2536, 2751, 1011, 11773, 11107, 1010, 2004, 2092, 2004, 8184, 28324, 2075, 1996, 4535, 1997, 10324, 2169, 3565, 4605, 2208, 2007, 3142, 16371, 28990, 2015, 1006, 2104, 2029, 1996, 2208, 2052, 2031, 2042, 2124, 2004, 1000, 3565, 4605, 1048, 1000, 1007, 1010, 2061, 2008, 1996, 8154, 2071, 14500

In [53]:
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad['train'].column_names)

In [54]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88524
})

In [55]:
tokenized_datasets['validation']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 10784
})

### Fine tuning the BERT model

In [59]:
from transformers import TrainingArguments, Trainer

In [60]:
args = TrainingArguments(
    output_dir='finetune-BERT-squad',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01
)

In [61]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

- Data Collator --> It take care the optimization of the training and provide the data in form of batches.
- data_collator is used to create batches.

In [64]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=tokenized_datasets['train'].select(range(2000)),
    eval_dataset=tokenized_datasets['validation'].select(range(2000)),
    data_collator=data_collator
)

In [65]:
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.34498929977417, 'eval_runtime': 25.0612, 'eval_samples_per_second': 79.805, 'eval_steps_per_second': 9.976, 'epoch': 1.0}
{'loss': 2.5716, 'grad_norm': 37.80916976928711, 'learning_rate': 1.2e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.9134577512741089, 'eval_runtime': 25.0092, 'eval_samples_per_second': 79.971, 'eval_steps_per_second': 9.996, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.9042402505874634, 'eval_runtime': 25.4438, 'eval_samples_per_second': 78.605, 'eval_steps_per_second': 9.826, 'epoch': 3.0}
{'loss': 0.8364, 'grad_norm': 16.585693359375, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.9276578426361084, 'eval_runtime': 25.3952, 'eval_samples_per_second': 78.755, 'eval_steps_per_second': 9.844, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.993401050567627, 'eval_runtime': 25.2327, 'eval_samples_per_second': 79.262, 'eval_steps_per_second': 9.908, 'epoch': 5.0}
{'train_runtime': 570.7317, 'train_samples_per_second': 17.521, 'train_steps_per_second': 2.19, 'train_loss': 1.4610540405273438, 'epoch': 5.0}


TrainOutput(global_step=1250, training_loss=1.4610540405273438, metrics={'train_runtime': 570.7317, 'train_samples_per_second': 17.521, 'train_steps_per_second': 2.19, 'total_flos': 1959725675520000.0, 'train_loss': 1.4610540405273438, 'epoch': 5.0})

{'input_ids': [[101, 2029, 5088, 2136, 3421, 1996, 10511, 2012, 3565, 4605, 2753, 1029, 102, 3565, 4605, 2753, 2001, 2019, 2137, 2374, 2208, 2000, 5646, 1996, 3410, 1997, 1996, 2120, 2374, 2223, 1006, 5088, 1007, 2005, 1996, 2325, 2161, 1012, 1996, 2137, 2374, 3034, 1006, 10511, 1007, 3410, 7573, 14169, 3249, 1996, 2120, 2374, 3034, 1006, 22309, 1007, 3410, 3792, 12915, 2484, 1516, 2184, 2000, 7796, 2037, 2353, 3565, 4605, 2516, 1012, 1996, 2208, 2001, 2209, 2006, 2337, 1021, 1010, 2355, 1010, 2012, 11902, 1005, 1055, 3346, 1999, 1996, 2624, 3799, 3016, 2181, 2012, 4203, 10254, 1010, 2662, 1012, 2004, 2023, 2001, 1996, 12951, 3565, 4605, 1010, 1996, 2223, 13155, 1996, 1000, 3585, 5315, 1000, 2007, 2536, 2751, 1011, 11773, 11107, 1010, 2004, 2092, 2004, 8184, 28324, 2075, 1996, 4535, 1997, 10324, 2169, 3565, 4605, 2208, 2007, 3142, 16371, 28990, 2015, 1006, 2104, 2029, 1996, 2208, 2052, 2031, 2042, 2124, 2004, 1000, 3565, 4605, 1048, 1000, 1007, 1010, 2061, 2008, 1996, 8154, 2071, 14500