# BERT for sequence classification

In [3]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, \
     DataCollatorWithPadding, pipeline
from datasets import Dataset
import numpy as np
import evaluate

Downloading a training data with a sequence of classified entities

In [4]:
snips_file = open('../data/snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [5]:
# This code segment parses the snips dataset into a more manageable format

utterances = [] # Stores the complete sentences
tokenized_utterances = [] # Stores the tokenized sentences
labels_for_tokens = [] # Stores the labels of each word in the sentence
sequence_labels = [] # Text

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [6]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [7]:
utterances[1], sequence_labels[1]

('add step to me to the 50 clásicos playlist', 'AddToPlaylist')

In [8]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

In [9]:
# Set of unique sequence labels
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['RateBook',
 'SearchCreativeWork',
 'GetWeather',
 'PlayMusic',
 'BookRestaurant',
 'AddToPlaylist',
 'SearchScreeningEvent']

In [10]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [11]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [12]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[21, 21, 27, 21, 5, 21, 6, 60]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
3
PlayMusic


In [13]:
Dataset

datasets.arrow_dataset.Dataset

In [14]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)

snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [15]:
snips_dataset

DatasetDict({
    train: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels'],
        num_rows: 2617
    })
})

In [16]:
unique_sequence_labels[6]

'SearchScreeningEvent'

In [17]:
snips_dataset['train'][0]

{'utterance': 'find a photograph called call on me',
 'label': 1,
 'tokens': ['find', 'a', 'photograph', 'called', 'call', 'on', 'me'],
 'token_labels': [21, 21, 68, 21, 30, 9, 9]}

In [18]:
tokenizer('hi')

{'input_ids': [101, 20844, 102], 'attention_mask': [1, 1, 1]}

In [19]:
tokenizer.decode([101, 2603, 1142, 18977, 126, 2940, 102])


'[CLS] rate this textbook 5 stars [SEP]'

In [20]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

In [21]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:  38%|███▊      | 4000/10467 [00:00<00:00, 11312.46 examples/s]

                                                                    

In [22]:
# only input_ids, attention_mask, and label are used. The rest are for show
seq_clf_tokenized_snips['train'][0]

{'utterance': 'find a photograph called call on me',
 'label': 1,
 'tokens': ['find', 'a', 'photograph', 'called', 'call', 'on', 'me'],
 'token_labels': [21, 21, 68, 21, 30, 9, 9],
 'input_ids': [101, 1525, 170, 10110, 1270, 1840, 1113, 1143, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [29]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
{i: l for i, l in enumerate(unique_sequence_labels)}

{0: 'BookRestaurant',
 1: 'SearchCreativeWork',
 2: 'PlayMusic',
 3: 'GetWeather',
 4: 'AddToPlaylist',
 5: 'SearchScreeningEvent',
 6: 'RateBook'}

In [23]:
sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

# set an index -> label dictionary
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier

In [24]:
sequence_clf_model.config.id2label[0]

'BookRestaurant'

In [27]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):  # custom method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [26]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 10,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=10,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,  # optional
    data_collator=data_collator
)

In [27]:
# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, utterance, token_labels. If tokens, utterance, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 82/82 [01:14<00:00,  1.11it/s]


{'eval_loss': 1.9480133056640625,
 'eval_accuracy': 0.14214749713412303,
 'eval_runtime': 75.2707,
 'eval_samples_per_second': 34.768,
 'eval_steps_per_second': 1.089}

In [28]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, utterance, token_labels. If tokens, utterance, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 65,786,887
  2%|▏         | 10/656 [00:33<37:17,  3.46s/it]

{'loss': 1.9525, 'learning_rate': 4.780114722753346e-07, 'epoch': 0.03}


  3%|▎         | 20/656 [01:03<31:01,  2.93s/it]

{'loss': 1.9461, 'learning_rate': 9.560229445506693e-07, 'epoch': 0.06}


  5%|▍         | 30/656 [01:33<31:29,  3.02s/it]

{'loss': 1.9463, 'learning_rate': 1.4340344168260037e-06, 'epoch': 0.09}


  6%|▌         | 40/656 [02:03<30:07,  2.93s/it]

{'loss': 1.9478, 'learning_rate': 1.9120458891013386e-06, 'epoch': 0.12}


  8%|▊         | 50/656 [02:31<28:12,  2.79s/it]

{'loss': 1.9402, 'learning_rate': 2.390057361376673e-06, 'epoch': 0.15}


  9%|▉         | 60/656 [03:07<32:23,  3.26s/it]

{'loss': 1.926, 'learning_rate': 2.8680688336520075e-06, 'epoch': 0.18}


 11%|█         | 70/656 [03:36<28:59,  2.97s/it]

{'loss': 1.9148, 'learning_rate': 3.3460803059273427e-06, 'epoch': 0.21}


 12%|█▏        | 80/656 [04:05<27:16,  2.84s/it]

{'loss': 1.9029, 'learning_rate': 3.824091778202677e-06, 'epoch': 0.24}


 14%|█▎        | 90/656 [04:33<26:21,  2.79s/it]

{'loss': 1.8799, 'learning_rate': 4.302103250478012e-06, 'epoch': 0.27}


 15%|█▌        | 100/656 [05:04<27:44,  2.99s/it]

{'loss': 1.8421, 'learning_rate': 4.780114722753346e-06, 'epoch': 0.3}


 17%|█▋        | 110/656 [05:35<28:52,  3.17s/it]

{'loss': 1.7726, 'learning_rate': 5.2581261950286805e-06, 'epoch': 0.34}


 18%|█▊        | 120/656 [06:06<25:17,  2.83s/it]

{'loss': 1.6509, 'learning_rate': 5.736137667304015e-06, 'epoch': 0.37}


 20%|█▉        | 130/656 [06:33<23:59,  2.74s/it]

{'loss': 1.5156, 'learning_rate': 6.21414913957935e-06, 'epoch': 0.4}


 21%|██▏       | 140/656 [07:00<24:46,  2.88s/it]

{'loss': 1.3262, 'learning_rate': 6.6921606118546855e-06, 'epoch': 0.43}


 23%|██▎       | 150/656 [07:28<25:04,  2.97s/it]

{'loss': 1.1237, 'learning_rate': 7.17017208413002e-06, 'epoch': 0.46}


 24%|██▍       | 160/656 [07:58<24:47,  3.00s/it]

{'loss': 0.9232, 'learning_rate': 7.648183556405354e-06, 'epoch': 0.49}


 26%|██▌       | 170/656 [08:30<25:54,  3.20s/it]

{'loss': 0.7302, 'learning_rate': 8.126195028680688e-06, 'epoch': 0.52}


 27%|██▋       | 180/656 [09:01<25:06,  3.16s/it]

{'loss': 0.6459, 'learning_rate': 8.604206500956023e-06, 'epoch': 0.55}


 29%|██▉       | 190/656 [09:30<21:56,  2.83s/it]

{'loss': 0.4357, 'learning_rate': 9.082217973231358e-06, 'epoch': 0.58}


 30%|███       | 200/656 [09:57<19:52,  2.62s/it]

{'loss': 0.3653, 'learning_rate': 9.560229445506692e-06, 'epoch': 0.61}


 32%|███▏      | 210/656 [10:24<19:02,  2.56s/it]

{'loss': 0.2776, 'learning_rate': 1.0038240917782027e-05, 'epoch': 0.64}


 34%|███▎      | 220/656 [10:52<20:52,  2.87s/it]

{'loss': 0.2552, 'learning_rate': 1.0516252390057361e-05, 'epoch': 0.67}


 35%|███▌      | 230/656 [11:21<20:51,  2.94s/it]

{'loss': 0.1723, 'learning_rate': 1.0994263862332696e-05, 'epoch': 0.7}


 37%|███▋      | 240/656 [11:50<20:34,  2.97s/it]

{'loss': 0.1432, 'learning_rate': 1.147227533460803e-05, 'epoch': 0.73}


 38%|███▊      | 250/656 [12:20<20:24,  3.02s/it]

{'loss': 0.1638, 'learning_rate': 1.1950286806883365e-05, 'epoch': 0.76}


 40%|███▉      | 260/656 [12:50<18:50,  2.86s/it]

{'loss': 0.2015, 'learning_rate': 1.24282982791587e-05, 'epoch': 0.79}


 41%|████      | 270/656 [13:18<16:28,  2.56s/it]

{'loss': 0.171, 'learning_rate': 1.2906309751434034e-05, 'epoch': 0.82}


 43%|████▎     | 280/656 [13:48<18:52,  3.01s/it]

{'loss': 0.1366, 'learning_rate': 1.3384321223709371e-05, 'epoch': 0.85}


 44%|████▍     | 290/656 [14:19<18:42,  3.07s/it]

{'loss': 0.0984, 'learning_rate': 1.3862332695984703e-05, 'epoch': 0.88}


 46%|████▌     | 300/656 [14:48<16:55,  2.85s/it]

{'loss': 0.1149, 'learning_rate': 1.434034416826004e-05, 'epoch': 0.91}


 47%|████▋     | 310/656 [15:20<17:10,  2.98s/it]

{'loss': 0.0881, 'learning_rate': 1.4818355640535372e-05, 'epoch': 0.95}


 49%|████▉     | 320/656 [15:49<16:02,  2.86s/it]

{'loss': 0.0915, 'learning_rate': 1.529636711281071e-05, 'epoch': 0.98}


 50%|█████     | 328/656 [16:11<13:34,  2.48s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, utterance, token_labels. If tokens, utterance, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
                                                 
 50%|█████     | 328/656 [17:26<13:34,  2.48s/it]Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json


{'eval_loss': 0.08913559466600418, 'eval_accuracy': 0.9786014520443256, 'eval_runtime': 74.5302, 'eval_samples_per_second': 35.113, 'eval_steps_per_second': 1.1, 'epoch': 1.0}


Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
 50%|█████     | 330/656 [17:36<1:45:08, 19.35s/it]

{'loss': 0.0947, 'learning_rate': 1.5774378585086042e-05, 'epoch': 1.01}


 52%|█████▏    | 340/656 [18:06<17:27,  3.31s/it]  

{'loss': 0.0906, 'learning_rate': 1.6252390057361376e-05, 'epoch': 1.04}


 53%|█████▎    | 350/656 [18:39<17:37,  3.46s/it]

{'loss': 0.0814, 'learning_rate': 1.6730401529636713e-05, 'epoch': 1.07}


 55%|█████▍    | 360/656 [19:09<14:14,  2.89s/it]

{'loss': 0.0898, 'learning_rate': 1.7208413001912046e-05, 'epoch': 1.1}


 56%|█████▋    | 370/656 [19:39<14:09,  2.97s/it]

{'loss': 0.0513, 'learning_rate': 1.7686424474187383e-05, 'epoch': 1.13}


 58%|█████▊    | 380/656 [20:08<14:46,  3.21s/it]

{'loss': 0.0474, 'learning_rate': 1.8164435946462717e-05, 'epoch': 1.16}


 59%|█████▉    | 390/656 [20:40<13:27,  3.03s/it]

{'loss': 0.0872, 'learning_rate': 1.864244741873805e-05, 'epoch': 1.19}


 61%|██████    | 400/656 [21:09<12:44,  2.99s/it]

{'loss': 0.0792, 'learning_rate': 1.9120458891013384e-05, 'epoch': 1.22}


 62%|██████▎   | 410/656 [21:37<11:18,  2.76s/it]

{'loss': 0.0449, 'learning_rate': 1.959847036328872e-05, 'epoch': 1.25}


 64%|██████▍   | 420/656 [22:09<12:11,  3.10s/it]

{'loss': 0.0385, 'learning_rate': 2.0076481835564055e-05, 'epoch': 1.28}


 66%|██████▌   | 430/656 [22:43<11:26,  3.04s/it]

{'loss': 0.0663, 'learning_rate': 2.055449330783939e-05, 'epoch': 1.31}


 67%|██████▋   | 440/656 [23:12<10:55,  3.04s/it]

{'loss': 0.0802, 'learning_rate': 2.1032504780114722e-05, 'epoch': 1.34}


 69%|██████▊   | 450/656 [23:41<10:15,  2.99s/it]

{'loss': 0.0608, 'learning_rate': 2.151051625239006e-05, 'epoch': 1.37}


 70%|███████   | 460/656 [24:09<09:18,  2.85s/it]

{'loss': 0.0855, 'learning_rate': 2.1988527724665392e-05, 'epoch': 1.4}


 72%|███████▏  | 470/656 [24:41<09:50,  3.17s/it]

{'loss': 0.0455, 'learning_rate': 2.246653919694073e-05, 'epoch': 1.43}


 73%|███████▎  | 480/656 [25:10<08:42,  2.97s/it]

{'loss': 0.0489, 'learning_rate': 2.294455066921606e-05, 'epoch': 1.46}


 75%|███████▍  | 490/656 [25:37<07:21,  2.66s/it]

{'loss': 0.0513, 'learning_rate': 2.3422562141491397e-05, 'epoch': 1.49}


 76%|███████▌  | 500/656 [26:05<07:07,  2.74s/it]

{'loss': 0.1018, 'learning_rate': 2.390057361376673e-05, 'epoch': 1.52}


 78%|███████▊  | 510/656 [26:34<07:26,  3.06s/it]

{'loss': 0.0265, 'learning_rate': 2.4378585086042067e-05, 'epoch': 1.55}


 79%|███████▉  | 520/656 [27:05<06:55,  3.05s/it]

{'loss': 0.0434, 'learning_rate': 2.48565965583174e-05, 'epoch': 1.59}


 81%|████████  | 530/656 [27:36<05:51,  2.79s/it]

{'loss': 0.0824, 'learning_rate': 2.5334608030592738e-05, 'epoch': 1.62}


 82%|████████▏ | 540/656 [28:10<06:39,  3.44s/it]

{'loss': 0.0479, 'learning_rate': 2.5812619502868068e-05, 'epoch': 1.65}


 84%|████████▍ | 550/656 [28:43<05:36,  3.17s/it]

{'loss': 0.0714, 'learning_rate': 2.6290630975143405e-05, 'epoch': 1.68}


 85%|████████▌ | 560/656 [29:12<04:42,  2.95s/it]

{'loss': 0.0559, 'learning_rate': 2.6768642447418742e-05, 'epoch': 1.71}


 87%|████████▋ | 570/656 [29:38<04:00,  2.80s/it]

{'loss': 0.066, 'learning_rate': 2.7246653919694075e-05, 'epoch': 1.74}


 88%|████████▊ | 580/656 [30:08<03:52,  3.05s/it]

{'loss': 0.0517, 'learning_rate': 2.7724665391969406e-05, 'epoch': 1.77}


 90%|████████▉ | 590/656 [30:41<03:35,  3.27s/it]

{'loss': 0.0387, 'learning_rate': 2.8202676864244743e-05, 'epoch': 1.8}


 91%|█████████▏| 600/656 [31:12<02:44,  2.94s/it]

{'loss': 0.0409, 'learning_rate': 2.868068833652008e-05, 'epoch': 1.83}


 93%|█████████▎| 610/656 [31:42<02:29,  3.25s/it]

{'loss': 0.0195, 'learning_rate': 2.9158699808795413e-05, 'epoch': 1.86}


 95%|█████████▍| 620/656 [32:14<01:54,  3.17s/it]

{'loss': 0.0573, 'learning_rate': 2.9636711281070743e-05, 'epoch': 1.89}


 96%|█████████▌| 630/656 [32:42<01:11,  2.74s/it]

{'loss': 0.0563, 'learning_rate': 3.011472275334608e-05, 'epoch': 1.92}


 98%|█████████▊| 640/656 [33:12<00:47,  2.96s/it]

{'loss': 0.0515, 'learning_rate': 3.059273422562142e-05, 'epoch': 1.95}


 99%|█████████▉| 650/656 [33:42<00:19,  3.18s/it]

{'loss': 0.0297, 'learning_rate': 3.1070745697896754e-05, 'epoch': 1.98}


100%|██████████| 656/656 [33:58<00:00,  2.25s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, utterance, token_labels. If tokens, utterance, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
                                                 
100%|██████████| 656/656 [35:13<00:00,  2.25s/it]Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json


{'eval_loss': 0.06147920712828636, 'eval_accuracy': 0.9858616736721437, 'eval_runtime': 74.586, 'eval_samples_per_second': 35.087, 'eval_steps_per_second': 1.099, 'epoch': 2.0}


Model weights saved in ./snips_clf/results/checkpoint-656/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results/checkpoint-656 (score: 0.06147920712828636).
100%|██████████| 656/656 [35:16<00:00,  3.23s/it]

{'train_runtime': 2116.4004, 'train_samples_per_second': 9.891, 'train_steps_per_second': 0.31, 'train_loss': 0.512464398563635, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=0.512464398563635, metrics={'train_runtime': 2116.4004, 'train_samples_per_second': 9.891, 'train_steps_per_second': 0.31, 'train_loss': 0.512464398563635, 'epoch': 2.0})

In [29]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, utterance, token_labels. If tokens, utterance, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [01:12<00:00,  1.14it/s]


{'eval_loss': 0.06147920712828636,
 'eval_accuracy': 0.9858616736721437,
 'eval_runtime': 72.7994,
 'eval_samples_per_second': 35.948,
 'eval_steps_per_second': 1.126,
 'epoch': 2.0}

In [30]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/pytorch_model.bin


In [31]:
# We can now load our fine-tuned from our directory
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)

pipe('Please add Here We Go by Dispatch to my road trip playlist')

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "BookRestaurant",
    "1": "SearchCreativeWork",
    "2": "PlayMusic",
    "3": "GetWeather",
    "4": "AddToPlaylist",
    "5": "SearchScreeningEvent",
    "6": "RateBook"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype":

[{'label': 'AddToPlaylist', 'score': 0.998089611530304}]

In [24]:
frozen_sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.wei

In [25]:
# freezes EVERY parameter in our bert model
# does not freeze our classification layer
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [30]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 10,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    report_to='wandb',
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [31]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mtrendprix[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/656 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/656 [00:00<09:45,  1.12it/s]

{'loss': 1.9641, 'learning_rate': 4.780114722753346e-08, 'epoch': 0.0}


  0%|          | 2/656 [00:01<10:26,  1.04it/s]

{'loss': 1.9199, 'learning_rate': 9.560229445506692e-08, 'epoch': 0.01}


  0%|          | 3/656 [00:03<13:48,  1.27s/it]

{'loss': 1.9423, 'learning_rate': 1.434034416826004e-07, 'epoch': 0.01}


  1%|          | 4/656 [00:04<13:10,  1.21s/it]

{'loss': 1.9747, 'learning_rate': 1.9120458891013384e-07, 'epoch': 0.01}


  1%|          | 5/656 [00:05<11:11,  1.03s/it]

{'loss': 1.9568, 'learning_rate': 2.390057361376673e-07, 'epoch': 0.02}


  1%|          | 6/656 [00:06<10:46,  1.00it/s]

{'loss': 1.9689, 'learning_rate': 2.868068833652008e-07, 'epoch': 0.02}


  1%|          | 7/656 [00:07<12:16,  1.13s/it]

{'loss': 1.9477, 'learning_rate': 3.346080305927343e-07, 'epoch': 0.02}


  1%|          | 8/656 [00:08<11:58,  1.11s/it]

{'loss': 1.9522, 'learning_rate': 3.824091778202677e-07, 'epoch': 0.02}


  1%|▏         | 9/656 [00:09<11:04,  1.03s/it]

{'loss': 1.935, 'learning_rate': 4.3021032504780114e-07, 'epoch': 0.03}


  2%|▏         | 10/656 [00:10<10:32,  1.02it/s]

{'loss': 1.9298, 'learning_rate': 4.780114722753346e-07, 'epoch': 0.03}


  2%|▏         | 11/656 [00:11<10:55,  1.02s/it]

{'loss': 1.9828, 'learning_rate': 5.258126195028682e-07, 'epoch': 0.03}


  2%|▏         | 12/656 [00:12<10:16,  1.04it/s]

{'loss': 1.9722, 'learning_rate': 5.736137667304016e-07, 'epoch': 0.04}


  2%|▏         | 13/656 [00:13<09:48,  1.09it/s]

{'loss': 1.9354, 'learning_rate': 6.214149139579351e-07, 'epoch': 0.04}


  2%|▏         | 14/656 [00:14<09:26,  1.13it/s]

{'loss': 1.9613, 'learning_rate': 6.692160611854686e-07, 'epoch': 0.04}


  2%|▏         | 15/656 [00:14<09:12,  1.16it/s]

{'loss': 1.9359, 'learning_rate': 7.170172084130019e-07, 'epoch': 0.05}


  2%|▏         | 16/656 [00:15<08:47,  1.21it/s]

{'loss': 1.973, 'learning_rate': 7.648183556405354e-07, 'epoch': 0.05}


  3%|▎         | 17/656 [00:16<08:56,  1.19it/s]

{'loss': 1.9467, 'learning_rate': 8.12619502868069e-07, 'epoch': 0.05}


  3%|▎         | 18/656 [00:17<08:24,  1.27it/s]

{'loss': 1.9419, 'learning_rate': 8.604206500956023e-07, 'epoch': 0.05}


  3%|▎         | 19/656 [00:18<09:30,  1.12it/s]

{'loss': 1.9618, 'learning_rate': 9.082217973231359e-07, 'epoch': 0.06}


  3%|▎         | 20/656 [00:19<10:50,  1.02s/it]

{'loss': 1.9299, 'learning_rate': 9.560229445506693e-07, 'epoch': 0.06}


  3%|▎         | 21/656 [00:20<09:56,  1.06it/s]

{'loss': 1.9689, 'learning_rate': 1.0038240917782027e-06, 'epoch': 0.06}


  3%|▎         | 22/656 [00:20<08:59,  1.18it/s]

{'loss': 1.9496, 'learning_rate': 1.0516252390057363e-06, 'epoch': 0.07}


  4%|▎         | 23/656 [00:21<08:47,  1.20it/s]

{'loss': 1.9601, 'learning_rate': 1.0994263862332695e-06, 'epoch': 0.07}


  4%|▎         | 24/656 [00:23<10:49,  1.03s/it]

{'loss': 1.9464, 'learning_rate': 1.1472275334608031e-06, 'epoch': 0.07}


  4%|▍         | 25/656 [00:24<10:40,  1.01s/it]

{'loss': 1.9881, 'learning_rate': 1.1950286806883365e-06, 'epoch': 0.08}


  4%|▍         | 26/656 [00:25<10:02,  1.05it/s]

{'loss': 1.9626, 'learning_rate': 1.2428298279158701e-06, 'epoch': 0.08}


  4%|▍         | 27/656 [00:25<09:27,  1.11it/s]

{'loss': 1.9779, 'learning_rate': 1.2906309751434035e-06, 'epoch': 0.08}


  4%|▍         | 28/656 [00:26<09:11,  1.14it/s]

{'loss': 1.9351, 'learning_rate': 1.3384321223709371e-06, 'epoch': 0.09}


  4%|▍         | 29/656 [00:27<09:14,  1.13it/s]

{'loss': 1.9638, 'learning_rate': 1.3862332695984705e-06, 'epoch': 0.09}


  5%|▍         | 30/656 [00:28<09:29,  1.10it/s]

{'loss': 1.9927, 'learning_rate': 1.4340344168260037e-06, 'epoch': 0.09}


  5%|▍         | 31/656 [00:29<09:07,  1.14it/s]

{'loss': 1.9984, 'learning_rate': 1.4818355640535373e-06, 'epoch': 0.09}


  5%|▍         | 32/656 [00:30<08:48,  1.18it/s]

{'loss': 1.9581, 'learning_rate': 1.5296367112810707e-06, 'epoch': 0.1}


  5%|▌         | 33/656 [00:30<08:23,  1.24it/s]

{'loss': 1.9364, 'learning_rate': 1.5774378585086041e-06, 'epoch': 0.1}


  5%|▌         | 34/656 [00:31<09:06,  1.14it/s]

{'loss': 1.9192, 'learning_rate': 1.625239005736138e-06, 'epoch': 0.1}


  5%|▌         | 35/656 [00:32<09:04,  1.14it/s]

{'loss': 1.9597, 'learning_rate': 1.6730401529636714e-06, 'epoch': 0.11}


  5%|▌         | 36/656 [00:33<08:34,  1.21it/s]

{'loss': 1.94, 'learning_rate': 1.7208413001912046e-06, 'epoch': 0.11}


  6%|▌         | 37/656 [00:34<08:31,  1.21it/s]

{'loss': 1.977, 'learning_rate': 1.768642447418738e-06, 'epoch': 0.11}


  6%|▌         | 38/656 [00:35<08:44,  1.18it/s]

{'loss': 1.9836, 'learning_rate': 1.8164435946462718e-06, 'epoch': 0.12}


  6%|▌         | 39/656 [00:36<08:43,  1.18it/s]

{'loss': 1.9366, 'learning_rate': 1.8642447418738052e-06, 'epoch': 0.12}


  6%|▌         | 40/656 [00:36<08:57,  1.15it/s]

{'loss': 1.9227, 'learning_rate': 1.9120458891013386e-06, 'epoch': 0.12}


  6%|▋         | 41/656 [00:37<08:26,  1.21it/s]

{'loss': 1.9586, 'learning_rate': 1.9598470363288718e-06, 'epoch': 0.12}


  6%|▋         | 42/656 [00:38<08:17,  1.23it/s]

{'loss': 1.9698, 'learning_rate': 2.0076481835564054e-06, 'epoch': 0.13}


  7%|▋         | 43/656 [00:39<09:31,  1.07it/s]

{'loss': 1.9452, 'learning_rate': 2.055449330783939e-06, 'epoch': 0.13}


  7%|▋         | 44/656 [00:40<09:22,  1.09it/s]

{'loss': 1.9611, 'learning_rate': 2.1032504780114726e-06, 'epoch': 0.13}


  7%|▋         | 45/656 [00:41<09:49,  1.04it/s]

{'loss': 1.953, 'learning_rate': 2.151051625239006e-06, 'epoch': 0.14}


  7%|▋         | 46/656 [00:42<09:29,  1.07it/s]

{'loss': 1.9474, 'learning_rate': 2.198852772466539e-06, 'epoch': 0.14}


  7%|▋         | 47/656 [00:43<09:31,  1.06it/s]

{'loss': 1.9638, 'learning_rate': 2.246653919694073e-06, 'epoch': 0.14}


  7%|▋         | 48/656 [00:44<09:20,  1.08it/s]

{'loss': 1.9775, 'learning_rate': 2.2944550669216062e-06, 'epoch': 0.15}


  7%|▋         | 49/656 [00:45<09:31,  1.06it/s]

{'loss': 1.9576, 'learning_rate': 2.34225621414914e-06, 'epoch': 0.15}


  8%|▊         | 50/656 [00:46<10:08,  1.00s/it]

{'loss': 1.97, 'learning_rate': 2.390057361376673e-06, 'epoch': 0.15}


  8%|▊         | 51/656 [00:47<09:41,  1.04it/s]

{'loss': 1.9523, 'learning_rate': 2.4378585086042066e-06, 'epoch': 0.16}


  8%|▊         | 52/656 [00:48<09:14,  1.09it/s]

{'loss': 1.9474, 'learning_rate': 2.4856596558317402e-06, 'epoch': 0.16}


  8%|▊         | 53/656 [00:49<09:41,  1.04it/s]

{'loss': 1.9255, 'learning_rate': 2.5334608030592734e-06, 'epoch': 0.16}


  8%|▊         | 54/656 [00:50<09:12,  1.09it/s]

{'loss': 1.931, 'learning_rate': 2.581261950286807e-06, 'epoch': 0.16}


  8%|▊         | 55/656 [00:51<09:23,  1.07it/s]

{'loss': 1.9374, 'learning_rate': 2.6290630975143402e-06, 'epoch': 0.17}


  9%|▊         | 56/656 [00:52<09:44,  1.03it/s]

{'loss': 1.9733, 'learning_rate': 2.6768642447418743e-06, 'epoch': 0.17}


  9%|▊         | 57/656 [00:52<08:50,  1.13it/s]

{'loss': 1.9385, 'learning_rate': 2.7246653919694075e-06, 'epoch': 0.17}


  9%|▉         | 58/656 [00:53<08:12,  1.21it/s]

{'loss': 1.9582, 'learning_rate': 2.772466539196941e-06, 'epoch': 0.18}


  9%|▉         | 59/656 [00:54<08:11,  1.21it/s]

{'loss': 1.9505, 'learning_rate': 2.8202676864244743e-06, 'epoch': 0.18}


  9%|▉         | 60/656 [00:55<07:59,  1.24it/s]

{'loss': 1.9603, 'learning_rate': 2.8680688336520075e-06, 'epoch': 0.18}


  9%|▉         | 61/656 [00:55<08:18,  1.19it/s]

{'loss': 1.9771, 'learning_rate': 2.9158699808795415e-06, 'epoch': 0.19}


  9%|▉         | 62/656 [00:56<07:56,  1.25it/s]

{'loss': 1.9377, 'learning_rate': 2.9636711281070747e-06, 'epoch': 0.19}


 10%|▉         | 63/656 [00:57<08:04,  1.22it/s]

{'loss': 1.9567, 'learning_rate': 3.0114722753346083e-06, 'epoch': 0.19}


 10%|▉         | 64/656 [00:58<08:22,  1.18it/s]

{'loss': 1.9684, 'learning_rate': 3.0592734225621415e-06, 'epoch': 0.2}


 10%|▉         | 65/656 [00:59<08:12,  1.20it/s]

{'loss': 1.9705, 'learning_rate': 3.107074569789675e-06, 'epoch': 0.2}


 10%|█         | 66/656 [01:00<09:16,  1.06it/s]

{'loss': 1.9539, 'learning_rate': 3.1548757170172083e-06, 'epoch': 0.2}


 10%|█         | 67/656 [01:01<08:50,  1.11it/s]

{'loss': 1.9288, 'learning_rate': 3.202676864244742e-06, 'epoch': 0.2}


 10%|█         | 68/656 [01:02<08:53,  1.10it/s]

{'loss': 1.969, 'learning_rate': 3.250478011472276e-06, 'epoch': 0.21}


 11%|█         | 69/656 [01:03<09:00,  1.09it/s]

{'loss': 1.9494, 'learning_rate': 3.2982791586998087e-06, 'epoch': 0.21}


 11%|█         | 70/656 [01:03<08:31,  1.15it/s]

{'loss': 1.9487, 'learning_rate': 3.3460803059273427e-06, 'epoch': 0.21}


 11%|█         | 71/656 [01:04<08:27,  1.15it/s]

{'loss': 1.9147, 'learning_rate': 3.3938814531548755e-06, 'epoch': 0.22}


 11%|█         | 72/656 [01:05<08:17,  1.17it/s]

{'loss': 1.9477, 'learning_rate': 3.441682600382409e-06, 'epoch': 0.22}


 11%|█         | 73/656 [01:06<08:04,  1.20it/s]

{'loss': 1.9579, 'learning_rate': 3.489483747609943e-06, 'epoch': 0.22}


 11%|█▏        | 74/656 [01:07<08:48,  1.10it/s]

{'loss': 1.9737, 'learning_rate': 3.537284894837476e-06, 'epoch': 0.23}


 11%|█▏        | 75/656 [01:08<08:17,  1.17it/s]

{'loss': 1.9447, 'learning_rate': 3.58508604206501e-06, 'epoch': 0.23}


 12%|█▏        | 76/656 [01:08<08:11,  1.18it/s]

{'loss': 1.938, 'learning_rate': 3.6328871892925436e-06, 'epoch': 0.23}


 12%|█▏        | 77/656 [01:09<07:55,  1.22it/s]

{'loss': 1.8999, 'learning_rate': 3.6806883365200768e-06, 'epoch': 0.23}


 12%|█▏        | 78/656 [01:10<08:13,  1.17it/s]

{'loss': 1.9579, 'learning_rate': 3.7284894837476104e-06, 'epoch': 0.24}


 12%|█▏        | 79/656 [01:11<07:55,  1.21it/s]

{'loss': 1.9519, 'learning_rate': 3.7762906309751436e-06, 'epoch': 0.24}


 12%|█▏        | 80/656 [01:12<08:04,  1.19it/s]

{'loss': 1.931, 'learning_rate': 3.824091778202677e-06, 'epoch': 0.24}


 12%|█▏        | 81/656 [01:13<08:50,  1.08it/s]

{'loss': 1.9308, 'learning_rate': 3.871892925430211e-06, 'epoch': 0.25}


 12%|█▎        | 82/656 [01:14<07:59,  1.20it/s]

{'loss': 1.9371, 'learning_rate': 3.9196940726577436e-06, 'epoch': 0.25}


 13%|█▎        | 83/656 [01:14<07:51,  1.22it/s]

{'loss': 1.9505, 'learning_rate': 3.967495219885278e-06, 'epoch': 0.25}


 13%|█▎        | 84/656 [01:15<08:13,  1.16it/s]

{'loss': 1.9456, 'learning_rate': 4.015296367112811e-06, 'epoch': 0.26}


 13%|█▎        | 85/656 [01:16<07:54,  1.20it/s]

{'loss': 1.9464, 'learning_rate': 4.063097514340344e-06, 'epoch': 0.26}


 13%|█▎        | 86/656 [01:17<07:33,  1.26it/s]

{'loss': 1.9411, 'learning_rate': 4.110898661567878e-06, 'epoch': 0.26}


 13%|█▎        | 87/656 [01:17<07:25,  1.28it/s]

{'loss': 1.959, 'learning_rate': 4.158699808795411e-06, 'epoch': 0.27}


 13%|█▎        | 88/656 [01:18<07:28,  1.27it/s]

{'loss': 1.9531, 'learning_rate': 4.206500956022945e-06, 'epoch': 0.27}


 14%|█▎        | 89/656 [01:19<07:31,  1.26it/s]

{'loss': 1.9604, 'learning_rate': 4.2543021032504776e-06, 'epoch': 0.27}


 14%|█▎        | 90/656 [01:20<07:28,  1.26it/s]

{'loss': 1.9485, 'learning_rate': 4.302103250478012e-06, 'epoch': 0.27}


 14%|█▍        | 91/656 [01:21<07:11,  1.31it/s]

{'loss': 1.9392, 'learning_rate': 4.349904397705546e-06, 'epoch': 0.28}


 14%|█▍        | 92/656 [01:21<07:06,  1.32it/s]

{'loss': 1.9288, 'learning_rate': 4.397705544933078e-06, 'epoch': 0.28}


 14%|█▍        | 93/656 [01:22<06:55,  1.36it/s]

{'loss': 1.9365, 'learning_rate': 4.445506692160612e-06, 'epoch': 0.28}


 14%|█▍        | 94/656 [01:23<06:56,  1.35it/s]

{'loss': 1.9238, 'learning_rate': 4.493307839388146e-06, 'epoch': 0.29}


 14%|█▍        | 95/656 [01:24<06:57,  1.34it/s]

{'loss': 1.9278, 'learning_rate': 4.541108986615679e-06, 'epoch': 0.29}


 15%|█▍        | 96/656 [01:24<07:26,  1.25it/s]

{'loss': 1.9395, 'learning_rate': 4.5889101338432124e-06, 'epoch': 0.29}


 15%|█▍        | 97/656 [01:26<09:07,  1.02it/s]

{'loss': 1.9563, 'learning_rate': 4.636711281070746e-06, 'epoch': 0.3}


 15%|█▍        | 98/656 [01:27<08:29,  1.09it/s]

{'loss': 1.9509, 'learning_rate': 4.68451242829828e-06, 'epoch': 0.3}


 15%|█▌        | 99/656 [01:27<08:14,  1.13it/s]

{'loss': 1.9026, 'learning_rate': 4.732313575525813e-06, 'epoch': 0.3}


 15%|█▌        | 100/656 [01:28<08:19,  1.11it/s]

{'loss': 1.9203, 'learning_rate': 4.780114722753346e-06, 'epoch': 0.3}


 15%|█▌        | 101/656 [01:29<07:45,  1.19it/s]

{'loss': 1.9507, 'learning_rate': 4.82791586998088e-06, 'epoch': 0.31}


 16%|█▌        | 102/656 [01:30<07:19,  1.26it/s]

{'loss': 1.9742, 'learning_rate': 4.875717017208413e-06, 'epoch': 0.31}


 16%|█▌        | 103/656 [01:31<07:28,  1.23it/s]

{'loss': 1.9557, 'learning_rate': 4.9235181644359465e-06, 'epoch': 0.31}


 16%|█▌        | 104/656 [01:31<07:41,  1.20it/s]

{'loss': 1.9239, 'learning_rate': 4.9713193116634805e-06, 'epoch': 0.32}


 16%|█▌        | 105/656 [01:32<07:06,  1.29it/s]

{'loss': 1.9307, 'learning_rate': 5.019120458891014e-06, 'epoch': 0.32}


 16%|█▌        | 106/656 [01:33<07:01,  1.31it/s]

{'loss': 1.9323, 'learning_rate': 5.066921606118547e-06, 'epoch': 0.32}


 16%|█▋        | 107/656 [01:34<06:59,  1.31it/s]

{'loss': 1.9551, 'learning_rate': 5.11472275334608e-06, 'epoch': 0.33}


 16%|█▋        | 108/656 [01:35<07:31,  1.21it/s]

{'loss': 1.9299, 'learning_rate': 5.162523900573614e-06, 'epoch': 0.33}


 17%|█▋        | 109/656 [01:35<07:08,  1.28it/s]

{'loss': 1.9519, 'learning_rate': 5.210325047801147e-06, 'epoch': 0.33}


 17%|█▋        | 110/656 [01:36<07:12,  1.26it/s]

{'loss': 1.9653, 'learning_rate': 5.2581261950286805e-06, 'epoch': 0.34}


 17%|█▋        | 111/656 [01:37<07:45,  1.17it/s]

{'loss': 1.931, 'learning_rate': 5.3059273422562145e-06, 'epoch': 0.34}


 17%|█▋        | 112/656 [01:38<08:30,  1.07it/s]

{'loss': 1.9613, 'learning_rate': 5.3537284894837486e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [01:39<08:19,  1.09it/s]

{'loss': 1.9267, 'learning_rate': 5.401529636711281e-06, 'epoch': 0.34}


 17%|█▋        | 114/656 [01:40<08:53,  1.02it/s]

{'loss': 1.9225, 'learning_rate': 5.449330783938815e-06, 'epoch': 0.35}


 18%|█▊        | 115/656 [01:41<08:50,  1.02it/s]

{'loss': 1.9107, 'learning_rate': 5.497131931166348e-06, 'epoch': 0.35}


 18%|█▊        | 116/656 [01:42<09:15,  1.03s/it]

{'loss': 1.9318, 'learning_rate': 5.544933078393882e-06, 'epoch': 0.35}


 18%|█▊        | 117/656 [01:43<09:25,  1.05s/it]

{'loss': 1.9088, 'learning_rate': 5.592734225621415e-06, 'epoch': 0.36}


 18%|█▊        | 118/656 [01:44<09:10,  1.02s/it]

{'loss': 1.9248, 'learning_rate': 5.6405353728489485e-06, 'epoch': 0.36}


 18%|█▊        | 119/656 [01:45<08:32,  1.05it/s]

{'loss': 1.9201, 'learning_rate': 5.6883365200764826e-06, 'epoch': 0.36}


 18%|█▊        | 120/656 [01:46<08:15,  1.08it/s]

{'loss': 1.939, 'learning_rate': 5.736137667304015e-06, 'epoch': 0.37}


 18%|█▊        | 121/656 [01:47<08:30,  1.05it/s]

{'loss': 1.9357, 'learning_rate': 5.783938814531549e-06, 'epoch': 0.37}


 19%|█▊        | 122/656 [01:48<08:07,  1.09it/s]

{'loss': 1.9367, 'learning_rate': 5.831739961759083e-06, 'epoch': 0.37}


 19%|█▉        | 123/656 [01:49<07:45,  1.14it/s]

{'loss': 1.9503, 'learning_rate': 5.879541108986616e-06, 'epoch': 0.38}


 19%|█▉        | 124/656 [01:50<07:43,  1.15it/s]

{'loss': 1.9848, 'learning_rate': 5.927342256214149e-06, 'epoch': 0.38}


 19%|█▉        | 125/656 [01:51<07:58,  1.11it/s]

{'loss': 1.9521, 'learning_rate': 5.9751434034416826e-06, 'epoch': 0.38}


 19%|█▉        | 126/656 [01:51<07:59,  1.10it/s]

{'loss': 1.9169, 'learning_rate': 6.022944550669217e-06, 'epoch': 0.38}


 19%|█▉        | 127/656 [01:52<07:30,  1.18it/s]

{'loss': 1.9575, 'learning_rate': 6.07074569789675e-06, 'epoch': 0.39}


 20%|█▉        | 128/656 [01:53<07:35,  1.16it/s]

{'loss': 1.9177, 'learning_rate': 6.118546845124283e-06, 'epoch': 0.39}


 20%|█▉        | 129/656 [01:54<07:06,  1.23it/s]

{'loss': 1.9174, 'learning_rate': 6.166347992351817e-06, 'epoch': 0.39}


 20%|█▉        | 130/656 [01:55<07:16,  1.20it/s]

{'loss': 1.9467, 'learning_rate': 6.21414913957935e-06, 'epoch': 0.4}


 20%|█▉        | 131/656 [01:55<07:15,  1.21it/s]

{'loss': 1.9407, 'learning_rate': 6.261950286806883e-06, 'epoch': 0.4}


 20%|██        | 132/656 [01:56<07:38,  1.14it/s]

{'loss': 1.9253, 'learning_rate': 6.3097514340344166e-06, 'epoch': 0.4}


 20%|██        | 133/656 [01:57<07:34,  1.15it/s]

{'loss': 1.9209, 'learning_rate': 6.357552581261951e-06, 'epoch': 0.41}


 20%|██        | 134/656 [01:58<08:11,  1.06it/s]

{'loss': 1.9512, 'learning_rate': 6.405353728489484e-06, 'epoch': 0.41}


 21%|██        | 135/656 [01:59<07:58,  1.09it/s]

{'loss': 1.9212, 'learning_rate': 6.453154875717017e-06, 'epoch': 0.41}


 21%|██        | 136/656 [02:00<07:48,  1.11it/s]

{'loss': 1.9505, 'learning_rate': 6.500956022944552e-06, 'epoch': 0.41}


 21%|██        | 137/656 [02:01<07:54,  1.09it/s]

{'loss': 1.9071, 'learning_rate': 6.548757170172084e-06, 'epoch': 0.42}


 21%|██        | 138/656 [02:02<07:44,  1.12it/s]

{'loss': 1.9414, 'learning_rate': 6.596558317399617e-06, 'epoch': 0.42}


 21%|██        | 139/656 [02:03<07:29,  1.15it/s]

{'loss': 1.9485, 'learning_rate': 6.644359464627152e-06, 'epoch': 0.42}


 21%|██▏       | 140/656 [02:04<07:31,  1.14it/s]

{'loss': 1.9348, 'learning_rate': 6.6921606118546855e-06, 'epoch': 0.43}


 21%|██▏       | 141/656 [02:04<07:00,  1.23it/s]

{'loss': 1.9451, 'learning_rate': 6.739961759082218e-06, 'epoch': 0.43}


 22%|██▏       | 142/656 [02:05<07:57,  1.08it/s]

{'loss': 1.9177, 'learning_rate': 6.787762906309751e-06, 'epoch': 0.43}


 22%|██▏       | 143/656 [02:06<07:53,  1.08it/s]

{'loss': 1.924, 'learning_rate': 6.835564053537286e-06, 'epoch': 0.44}


 22%|██▏       | 144/656 [02:08<08:59,  1.05s/it]

{'loss': 1.9323, 'learning_rate': 6.883365200764818e-06, 'epoch': 0.44}


 22%|██▏       | 145/656 [02:09<08:52,  1.04s/it]

{'loss': 1.961, 'learning_rate': 6.9311663479923514e-06, 'epoch': 0.44}


 22%|██▏       | 146/656 [02:10<08:08,  1.05it/s]

{'loss': 1.9152, 'learning_rate': 6.978967495219886e-06, 'epoch': 0.45}


 22%|██▏       | 147/656 [02:11<08:14,  1.03it/s]

{'loss': 1.9243, 'learning_rate': 7.0267686424474195e-06, 'epoch': 0.45}


 23%|██▎       | 148/656 [02:11<07:44,  1.09it/s]

{'loss': 1.9282, 'learning_rate': 7.074569789674952e-06, 'epoch': 0.45}


 23%|██▎       | 149/656 [02:12<08:17,  1.02it/s]

{'loss': 1.9371, 'learning_rate': 7.122370936902487e-06, 'epoch': 0.45}


 23%|██▎       | 150/656 [02:13<07:45,  1.09it/s]

{'loss': 1.9045, 'learning_rate': 7.17017208413002e-06, 'epoch': 0.46}


 23%|██▎       | 151/656 [02:14<07:38,  1.10it/s]

{'loss': 1.9254, 'learning_rate': 7.217973231357552e-06, 'epoch': 0.46}


 23%|██▎       | 152/656 [02:15<07:34,  1.11it/s]

{'loss': 1.933, 'learning_rate': 7.265774378585087e-06, 'epoch': 0.46}


 23%|██▎       | 153/656 [02:16<07:29,  1.12it/s]

{'loss': 1.9269, 'learning_rate': 7.31357552581262e-06, 'epoch': 0.47}


 23%|██▎       | 154/656 [02:17<07:04,  1.18it/s]

{'loss': 1.8967, 'learning_rate': 7.3613766730401535e-06, 'epoch': 0.47}


 24%|██▎       | 155/656 [02:17<06:57,  1.20it/s]

{'loss': 1.9288, 'learning_rate': 7.409177820267686e-06, 'epoch': 0.47}


 24%|██▍       | 156/656 [02:18<06:54,  1.21it/s]

{'loss': 1.9454, 'learning_rate': 7.456978967495221e-06, 'epoch': 0.48}


 24%|██▍       | 157/656 [02:19<07:48,  1.07it/s]

{'loss': 1.9488, 'learning_rate': 7.504780114722754e-06, 'epoch': 0.48}


 24%|██▍       | 158/656 [02:20<07:24,  1.12it/s]

{'loss': 1.9473, 'learning_rate': 7.552581261950287e-06, 'epoch': 0.48}


 24%|██▍       | 159/656 [02:21<07:24,  1.12it/s]

{'loss': 1.9193, 'learning_rate': 7.600382409177821e-06, 'epoch': 0.48}


 24%|██▍       | 160/656 [02:22<06:59,  1.18it/s]

{'loss': 1.9145, 'learning_rate': 7.648183556405354e-06, 'epoch': 0.49}


 25%|██▍       | 161/656 [02:23<06:47,  1.21it/s]

{'loss': 1.913, 'learning_rate': 7.695984703632887e-06, 'epoch': 0.49}


 25%|██▍       | 162/656 [02:23<06:41,  1.23it/s]

{'loss': 1.9146, 'learning_rate': 7.743785850860422e-06, 'epoch': 0.49}


 25%|██▍       | 163/656 [02:24<07:04,  1.16it/s]

{'loss': 1.9317, 'learning_rate': 7.791586998087955e-06, 'epoch': 0.5}


 25%|██▌       | 164/656 [02:25<07:17,  1.13it/s]

{'loss': 1.9203, 'learning_rate': 7.839388145315487e-06, 'epoch': 0.5}


 25%|██▌       | 165/656 [02:26<07:46,  1.05it/s]

{'loss': 1.9342, 'learning_rate': 7.887189292543021e-06, 'epoch': 0.5}


 25%|██▌       | 166/656 [02:27<07:40,  1.06it/s]

{'loss': 1.8887, 'learning_rate': 7.934990439770555e-06, 'epoch': 0.51}


 25%|██▌       | 167/656 [02:28<08:03,  1.01it/s]

{'loss': 1.9265, 'learning_rate': 7.982791586998088e-06, 'epoch': 0.51}


 26%|██▌       | 168/656 [02:29<08:00,  1.02it/s]

{'loss': 1.9164, 'learning_rate': 8.030592734225622e-06, 'epoch': 0.51}


 26%|██▌       | 169/656 [02:30<07:56,  1.02it/s]

{'loss': 1.9056, 'learning_rate': 8.078393881453156e-06, 'epoch': 0.52}


 26%|██▌       | 170/656 [02:31<07:40,  1.06it/s]

{'loss': 1.9453, 'learning_rate': 8.126195028680688e-06, 'epoch': 0.52}


 26%|██▌       | 171/656 [02:32<07:25,  1.09it/s]

{'loss': 1.9216, 'learning_rate': 8.173996175908222e-06, 'epoch': 0.52}


 26%|██▌       | 172/656 [02:33<06:53,  1.17it/s]

{'loss': 1.9326, 'learning_rate': 8.221797323135756e-06, 'epoch': 0.52}


 26%|██▋       | 173/656 [02:34<06:37,  1.21it/s]

{'loss': 1.9152, 'learning_rate': 8.26959847036329e-06, 'epoch': 0.53}


 27%|██▋       | 174/656 [02:35<07:01,  1.14it/s]

{'loss': 1.9039, 'learning_rate': 8.317399617590822e-06, 'epoch': 0.53}


 27%|██▋       | 175/656 [02:35<07:01,  1.14it/s]

{'loss': 1.8982, 'learning_rate': 8.365200764818356e-06, 'epoch': 0.53}


 27%|██▋       | 176/656 [02:36<06:43,  1.19it/s]

{'loss': 1.9364, 'learning_rate': 8.41300191204589e-06, 'epoch': 0.54}


 27%|██▋       | 177/656 [02:37<06:55,  1.15it/s]

{'loss': 1.9036, 'learning_rate': 8.460803059273423e-06, 'epoch': 0.54}


 27%|██▋       | 178/656 [02:38<06:55,  1.15it/s]

{'loss': 1.9055, 'learning_rate': 8.508604206500955e-06, 'epoch': 0.54}


 27%|██▋       | 179/656 [02:39<07:20,  1.08it/s]

{'loss': 1.9148, 'learning_rate': 8.556405353728491e-06, 'epoch': 0.55}


 27%|██▋       | 180/656 [02:40<06:46,  1.17it/s]

{'loss': 1.9413, 'learning_rate': 8.604206500956023e-06, 'epoch': 0.55}


 28%|██▊       | 181/656 [02:41<07:27,  1.06it/s]

{'loss': 1.9262, 'learning_rate': 8.652007648183556e-06, 'epoch': 0.55}


 28%|██▊       | 182/656 [02:42<07:12,  1.10it/s]

{'loss': 1.9127, 'learning_rate': 8.699808795411091e-06, 'epoch': 0.55}


 28%|██▊       | 183/656 [02:43<07:02,  1.12it/s]

{'loss': 1.9395, 'learning_rate': 8.747609942638624e-06, 'epoch': 0.56}


 28%|██▊       | 184/656 [02:43<06:45,  1.16it/s]

{'loss': 1.9159, 'learning_rate': 8.795411089866156e-06, 'epoch': 0.56}


 28%|██▊       | 185/656 [02:44<07:12,  1.09it/s]

{'loss': 1.9361, 'learning_rate': 8.843212237093692e-06, 'epoch': 0.56}


 28%|██▊       | 186/656 [02:46<07:55,  1.01s/it]

{'loss': 1.9394, 'learning_rate': 8.891013384321224e-06, 'epoch': 0.57}


 29%|██▊       | 187/656 [02:47<07:37,  1.03it/s]

{'loss': 1.9176, 'learning_rate': 8.938814531548758e-06, 'epoch': 0.57}


 29%|██▊       | 188/656 [02:47<07:03,  1.10it/s]

{'loss': 1.9253, 'learning_rate': 8.986615678776292e-06, 'epoch': 0.57}


 29%|██▉       | 189/656 [02:48<07:28,  1.04it/s]

{'loss': 1.8999, 'learning_rate': 9.034416826003824e-06, 'epoch': 0.58}


 29%|██▉       | 190/656 [02:49<07:27,  1.04it/s]

{'loss': 1.9178, 'learning_rate': 9.082217973231358e-06, 'epoch': 0.58}


 29%|██▉       | 191/656 [02:50<07:06,  1.09it/s]

{'loss': 1.9337, 'learning_rate': 9.13001912045889e-06, 'epoch': 0.58}


 29%|██▉       | 192/656 [02:51<07:05,  1.09it/s]

{'loss': 1.909, 'learning_rate': 9.177820267686425e-06, 'epoch': 0.59}


 29%|██▉       | 193/656 [02:52<07:01,  1.10it/s]

{'loss': 1.9166, 'learning_rate': 9.225621414913959e-06, 'epoch': 0.59}


 30%|██▉       | 194/656 [02:53<06:44,  1.14it/s]

{'loss': 1.9111, 'learning_rate': 9.273422562141491e-06, 'epoch': 0.59}


 30%|██▉       | 195/656 [02:54<07:04,  1.08it/s]

{'loss': 1.9157, 'learning_rate': 9.321223709369025e-06, 'epoch': 0.59}


 30%|██▉       | 196/656 [02:55<07:01,  1.09it/s]

{'loss': 1.9125, 'learning_rate': 9.36902485659656e-06, 'epoch': 0.6}


 30%|███       | 197/656 [02:56<07:34,  1.01it/s]

{'loss': 1.9266, 'learning_rate': 9.416826003824092e-06, 'epoch': 0.6}


 30%|███       | 198/656 [02:57<07:19,  1.04it/s]

{'loss': 1.9164, 'learning_rate': 9.464627151051626e-06, 'epoch': 0.6}


 30%|███       | 199/656 [02:58<07:06,  1.07it/s]

{'loss': 1.918, 'learning_rate': 9.51242829827916e-06, 'epoch': 0.61}


 30%|███       | 200/656 [02:59<07:22,  1.03it/s]

{'loss': 1.9208, 'learning_rate': 9.560229445506692e-06, 'epoch': 0.61}


 31%|███       | 201/656 [03:00<07:11,  1.05it/s]

{'loss': 1.8967, 'learning_rate': 9.608030592734226e-06, 'epoch': 0.61}


 31%|███       | 202/656 [03:00<07:03,  1.07it/s]

{'loss': 1.8884, 'learning_rate': 9.65583173996176e-06, 'epoch': 0.62}


 31%|███       | 203/656 [03:02<07:18,  1.03it/s]

{'loss': 1.9214, 'learning_rate': 9.703632887189293e-06, 'epoch': 0.62}


 31%|███       | 204/656 [03:02<06:56,  1.09it/s]

{'loss': 1.9012, 'learning_rate': 9.751434034416827e-06, 'epoch': 0.62}


 31%|███▏      | 205/656 [03:03<06:36,  1.14it/s]

{'loss': 1.9039, 'learning_rate': 9.79923518164436e-06, 'epoch': 0.62}


 31%|███▏      | 206/656 [03:04<06:21,  1.18it/s]

{'loss': 1.8999, 'learning_rate': 9.847036328871893e-06, 'epoch': 0.63}


 32%|███▏      | 207/656 [03:05<06:44,  1.11it/s]

{'loss': 1.899, 'learning_rate': 9.894837476099427e-06, 'epoch': 0.63}


 32%|███▏      | 208/656 [03:06<06:57,  1.07it/s]

{'loss': 1.8948, 'learning_rate': 9.942638623326961e-06, 'epoch': 0.63}


 32%|███▏      | 209/656 [03:07<07:04,  1.05it/s]

{'loss': 1.8999, 'learning_rate': 9.990439770554493e-06, 'epoch': 0.64}


 32%|███▏      | 210/656 [03:08<07:43,  1.04s/it]

{'loss': 1.889, 'learning_rate': 1.0038240917782027e-05, 'epoch': 0.64}


 32%|███▏      | 211/656 [03:09<07:17,  1.02it/s]

{'loss': 1.9034, 'learning_rate': 1.0086042065009561e-05, 'epoch': 0.64}


 32%|███▏      | 212/656 [03:10<06:44,  1.10it/s]

{'loss': 1.8979, 'learning_rate': 1.0133843212237094e-05, 'epoch': 0.65}


 32%|███▏      | 213/656 [03:11<07:11,  1.03it/s]

{'loss': 1.9246, 'learning_rate': 1.0181644359464628e-05, 'epoch': 0.65}


 33%|███▎      | 214/656 [03:12<06:43,  1.10it/s]

{'loss': 1.8765, 'learning_rate': 1.022944550669216e-05, 'epoch': 0.65}


 33%|███▎      | 215/656 [03:13<06:47,  1.08it/s]

{'loss': 1.9302, 'learning_rate': 1.0277246653919694e-05, 'epoch': 0.66}


 33%|███▎      | 216/656 [03:13<06:36,  1.11it/s]

{'loss': 1.905, 'learning_rate': 1.0325047801147228e-05, 'epoch': 0.66}


 33%|███▎      | 217/656 [03:14<06:22,  1.15it/s]

{'loss': 1.904, 'learning_rate': 1.037284894837476e-05, 'epoch': 0.66}


 33%|███▎      | 218/656 [03:15<06:32,  1.12it/s]

{'loss': 1.9253, 'learning_rate': 1.0420650095602295e-05, 'epoch': 0.66}


 33%|███▎      | 219/656 [03:16<06:28,  1.12it/s]

{'loss': 1.8975, 'learning_rate': 1.0468451242829829e-05, 'epoch': 0.67}


 34%|███▎      | 220/656 [03:17<06:16,  1.16it/s]

{'loss': 1.9119, 'learning_rate': 1.0516252390057361e-05, 'epoch': 0.67}


 34%|███▎      | 221/656 [03:18<06:30,  1.11it/s]

{'loss': 1.9035, 'learning_rate': 1.0564053537284897e-05, 'epoch': 0.67}


 34%|███▍      | 222/656 [03:19<06:10,  1.17it/s]

{'loss': 1.9073, 'learning_rate': 1.0611854684512429e-05, 'epoch': 0.68}


 34%|███▍      | 223/656 [03:20<06:43,  1.07it/s]

{'loss': 1.8905, 'learning_rate': 1.0659655831739961e-05, 'epoch': 0.68}


 34%|███▍      | 224/656 [03:20<06:26,  1.12it/s]

{'loss': 1.9042, 'learning_rate': 1.0707456978967497e-05, 'epoch': 0.68}


 34%|███▍      | 225/656 [03:21<06:01,  1.19it/s]

{'loss': 1.9235, 'learning_rate': 1.075525812619503e-05, 'epoch': 0.69}


 34%|███▍      | 226/656 [03:22<05:56,  1.21it/s]

{'loss': 1.8971, 'learning_rate': 1.0803059273422562e-05, 'epoch': 0.69}


 35%|███▍      | 227/656 [03:23<05:53,  1.21it/s]

{'loss': 1.912, 'learning_rate': 1.0850860420650096e-05, 'epoch': 0.69}


 35%|███▍      | 228/656 [03:24<06:00,  1.19it/s]

{'loss': 1.8984, 'learning_rate': 1.089866156787763e-05, 'epoch': 0.7}


 35%|███▍      | 229/656 [03:24<05:45,  1.24it/s]

{'loss': 1.8953, 'learning_rate': 1.0946462715105162e-05, 'epoch': 0.7}


 35%|███▌      | 230/656 [03:26<06:22,  1.11it/s]

{'loss': 1.893, 'learning_rate': 1.0994263862332696e-05, 'epoch': 0.7}


 35%|███▌      | 231/656 [03:26<06:20,  1.12it/s]

{'loss': 1.899, 'learning_rate': 1.104206500956023e-05, 'epoch': 0.7}


 35%|███▌      | 232/656 [03:27<06:17,  1.12it/s]

{'loss': 1.9111, 'learning_rate': 1.1089866156787764e-05, 'epoch': 0.71}


 36%|███▌      | 233/656 [03:28<06:49,  1.03it/s]

{'loss': 1.8746, 'learning_rate': 1.1137667304015297e-05, 'epoch': 0.71}


 36%|███▌      | 234/656 [03:30<07:04,  1.01s/it]

{'loss': 1.878, 'learning_rate': 1.118546845124283e-05, 'epoch': 0.71}


 36%|███▌      | 235/656 [03:30<06:53,  1.02it/s]

{'loss': 1.9006, 'learning_rate': 1.1233269598470365e-05, 'epoch': 0.72}


 36%|███▌      | 236/656 [03:32<07:12,  1.03s/it]

{'loss': 1.9085, 'learning_rate': 1.1281070745697897e-05, 'epoch': 0.72}


 36%|███▌      | 237/656 [03:32<06:51,  1.02it/s]

{'loss': 1.8901, 'learning_rate': 1.1328871892925431e-05, 'epoch': 0.72}


 36%|███▋      | 238/656 [03:34<07:19,  1.05s/it]

{'loss': 1.8914, 'learning_rate': 1.1376673040152965e-05, 'epoch': 0.73}


 36%|███▋      | 239/656 [03:35<07:21,  1.06s/it]

{'loss': 1.9272, 'learning_rate': 1.1424474187380497e-05, 'epoch': 0.73}


 37%|███▋      | 240/656 [03:36<07:34,  1.09s/it]

{'loss': 1.8759, 'learning_rate': 1.147227533460803e-05, 'epoch': 0.73}


 37%|███▋      | 241/656 [03:37<07:58,  1.15s/it]

{'loss': 1.9077, 'learning_rate': 1.1520076481835566e-05, 'epoch': 0.73}


 37%|███▋      | 242/656 [03:38<07:48,  1.13s/it]

{'loss': 1.9126, 'learning_rate': 1.1567877629063098e-05, 'epoch': 0.74}


 37%|███▋      | 243/656 [03:39<07:19,  1.06s/it]

{'loss': 1.892, 'learning_rate': 1.161567877629063e-05, 'epoch': 0.74}


 37%|███▋      | 244/656 [03:40<07:00,  1.02s/it]

{'loss': 1.8784, 'learning_rate': 1.1663479923518166e-05, 'epoch': 0.74}


 37%|███▋      | 245/656 [03:41<06:44,  1.02it/s]

{'loss': 1.8918, 'learning_rate': 1.1711281070745698e-05, 'epoch': 0.75}


 38%|███▊      | 246/656 [03:42<06:13,  1.10it/s]

{'loss': 1.8856, 'learning_rate': 1.1759082217973232e-05, 'epoch': 0.75}


 38%|███▊      | 247/656 [03:43<05:59,  1.14it/s]

{'loss': 1.8945, 'learning_rate': 1.1806883365200766e-05, 'epoch': 0.75}


 38%|███▊      | 248/656 [03:43<05:58,  1.14it/s]

{'loss': 1.8985, 'learning_rate': 1.1854684512428299e-05, 'epoch': 0.76}


 38%|███▊      | 249/656 [03:44<06:13,  1.09it/s]

{'loss': 1.8867, 'learning_rate': 1.1902485659655833e-05, 'epoch': 0.76}


 38%|███▊      | 250/656 [03:45<06:17,  1.08it/s]

{'loss': 1.8768, 'learning_rate': 1.1950286806883365e-05, 'epoch': 0.76}


 38%|███▊      | 251/656 [03:46<06:03,  1.11it/s]

{'loss': 1.9022, 'learning_rate': 1.1998087954110899e-05, 'epoch': 0.77}


 38%|███▊      | 252/656 [03:47<05:40,  1.19it/s]

{'loss': 1.8915, 'learning_rate': 1.2045889101338433e-05, 'epoch': 0.77}


 39%|███▊      | 253/656 [03:48<05:38,  1.19it/s]

{'loss': 1.9169, 'learning_rate': 1.2093690248565966e-05, 'epoch': 0.77}


 39%|███▊      | 254/656 [03:49<06:02,  1.11it/s]

{'loss': 1.8815, 'learning_rate': 1.21414913957935e-05, 'epoch': 0.77}


 39%|███▉      | 255/656 [03:50<06:34,  1.02it/s]

{'loss': 1.8939, 'learning_rate': 1.2189292543021034e-05, 'epoch': 0.78}


 39%|███▉      | 256/656 [03:51<06:12,  1.07it/s]

{'loss': 1.885, 'learning_rate': 1.2237093690248566e-05, 'epoch': 0.78}


 39%|███▉      | 257/656 [03:52<06:32,  1.02it/s]

{'loss': 1.8768, 'learning_rate': 1.22848948374761e-05, 'epoch': 0.78}


 39%|███▉      | 258/656 [03:53<06:34,  1.01it/s]

{'loss': 1.8733, 'learning_rate': 1.2332695984703634e-05, 'epoch': 0.79}


 39%|███▉      | 259/656 [03:54<06:23,  1.03it/s]

{'loss': 1.8891, 'learning_rate': 1.2380497131931166e-05, 'epoch': 0.79}


 40%|███▉      | 260/656 [03:55<06:10,  1.07it/s]

{'loss': 1.8671, 'learning_rate': 1.24282982791587e-05, 'epoch': 0.79}


 40%|███▉      | 261/656 [03:56<06:41,  1.02s/it]

{'loss': 1.8799, 'learning_rate': 1.2476099426386234e-05, 'epoch': 0.8}


 40%|███▉      | 262/656 [03:57<06:49,  1.04s/it]

{'loss': 1.9089, 'learning_rate': 1.2523900573613767e-05, 'epoch': 0.8}


 40%|████      | 263/656 [03:58<06:31,  1.00it/s]

{'loss': 1.8943, 'learning_rate': 1.25717017208413e-05, 'epoch': 0.8}


 40%|████      | 264/656 [03:59<05:55,  1.10it/s]

{'loss': 1.8862, 'learning_rate': 1.2619502868068833e-05, 'epoch': 0.8}


 40%|████      | 265/656 [03:59<05:25,  1.20it/s]

{'loss': 1.8729, 'learning_rate': 1.2667304015296369e-05, 'epoch': 0.81}


 41%|████      | 266/656 [04:00<05:35,  1.16it/s]

{'loss': 1.8795, 'learning_rate': 1.2715105162523901e-05, 'epoch': 0.81}


 41%|████      | 267/656 [04:01<05:21,  1.21it/s]

{'loss': 1.8875, 'learning_rate': 1.2762906309751435e-05, 'epoch': 0.81}


 41%|████      | 268/656 [04:02<05:22,  1.20it/s]

{'loss': 1.8523, 'learning_rate': 1.2810707456978968e-05, 'epoch': 0.82}


 41%|████      | 269/656 [04:03<05:26,  1.18it/s]

{'loss': 1.8827, 'learning_rate': 1.2858508604206502e-05, 'epoch': 0.82}


 41%|████      | 270/656 [04:04<05:44,  1.12it/s]

{'loss': 1.8684, 'learning_rate': 1.2906309751434034e-05, 'epoch': 0.82}


 41%|████▏     | 271/656 [04:05<05:52,  1.09it/s]

{'loss': 1.8726, 'learning_rate': 1.2954110898661568e-05, 'epoch': 0.83}


 41%|████▏     | 272/656 [04:05<05:32,  1.15it/s]

{'loss': 1.8804, 'learning_rate': 1.3001912045889104e-05, 'epoch': 0.83}


 42%|████▏     | 273/656 [04:07<06:19,  1.01it/s]

{'loss': 1.8632, 'learning_rate': 1.3049713193116636e-05, 'epoch': 0.83}


 42%|████▏     | 274/656 [04:08<06:20,  1.00it/s]

{'loss': 1.863, 'learning_rate': 1.3097514340344168e-05, 'epoch': 0.84}


 42%|████▏     | 275/656 [04:08<05:57,  1.07it/s]

{'loss': 1.8898, 'learning_rate': 1.3145315487571702e-05, 'epoch': 0.84}


 42%|████▏     | 276/656 [04:09<05:53,  1.08it/s]

{'loss': 1.8782, 'learning_rate': 1.3193116634799235e-05, 'epoch': 0.84}


 42%|████▏     | 277/656 [04:10<05:25,  1.16it/s]

{'loss': 1.8948, 'learning_rate': 1.3240917782026769e-05, 'epoch': 0.84}


 42%|████▏     | 278/656 [04:11<06:09,  1.02it/s]

{'loss': 1.859, 'learning_rate': 1.3288718929254305e-05, 'epoch': 0.85}


 43%|████▎     | 279/656 [04:12<05:49,  1.08it/s]

{'loss': 1.8677, 'learning_rate': 1.3336520076481837e-05, 'epoch': 0.85}


 43%|████▎     | 280/656 [04:13<05:40,  1.10it/s]

{'loss': 1.8471, 'learning_rate': 1.3384321223709371e-05, 'epoch': 0.85}


 43%|████▎     | 281/656 [04:14<05:29,  1.14it/s]

{'loss': 1.8724, 'learning_rate': 1.3432122370936903e-05, 'epoch': 0.86}


 43%|████▎     | 282/656 [04:15<05:46,  1.08it/s]

{'loss': 1.8712, 'learning_rate': 1.3479923518164436e-05, 'epoch': 0.86}


 43%|████▎     | 283/656 [04:16<05:41,  1.09it/s]

{'loss': 1.872, 'learning_rate': 1.352772466539197e-05, 'epoch': 0.86}


 43%|████▎     | 284/656 [04:17<05:34,  1.11it/s]

{'loss': 1.8465, 'learning_rate': 1.3575525812619502e-05, 'epoch': 0.87}


 43%|████▎     | 285/656 [04:17<05:29,  1.13it/s]

{'loss': 1.902, 'learning_rate': 1.3623326959847038e-05, 'epoch': 0.87}


 44%|████▎     | 286/656 [04:19<06:06,  1.01it/s]

{'loss': 1.8771, 'learning_rate': 1.3671128107074572e-05, 'epoch': 0.87}


 44%|████▍     | 287/656 [04:20<05:58,  1.03it/s]

{'loss': 1.8648, 'learning_rate': 1.3718929254302104e-05, 'epoch': 0.88}


 44%|████▍     | 288/656 [04:20<05:36,  1.09it/s]

{'loss': 1.8518, 'learning_rate': 1.3766730401529636e-05, 'epoch': 0.88}


 44%|████▍     | 289/656 [04:21<05:26,  1.12it/s]

{'loss': 1.88, 'learning_rate': 1.381453154875717e-05, 'epoch': 0.88}


 44%|████▍     | 290/656 [04:22<05:31,  1.10it/s]

{'loss': 1.8475, 'learning_rate': 1.3862332695984703e-05, 'epoch': 0.88}


 44%|████▍     | 291/656 [04:23<05:45,  1.06it/s]

{'loss': 1.8591, 'learning_rate': 1.3910133843212239e-05, 'epoch': 0.89}


 45%|████▍     | 292/656 [04:24<05:49,  1.04it/s]

{'loss': 1.8603, 'learning_rate': 1.3957934990439773e-05, 'epoch': 0.89}


 45%|████▍     | 293/656 [04:25<05:51,  1.03it/s]

{'loss': 1.8565, 'learning_rate': 1.4005736137667305e-05, 'epoch': 0.89}


 45%|████▍     | 294/656 [04:26<05:44,  1.05it/s]

{'loss': 1.8656, 'learning_rate': 1.4053537284894839e-05, 'epoch': 0.9}


 45%|████▍     | 295/656 [04:27<05:33,  1.08it/s]

{'loss': 1.8295, 'learning_rate': 1.4101338432122371e-05, 'epoch': 0.9}


 45%|████▌     | 296/656 [04:28<05:41,  1.06it/s]

{'loss': 1.8647, 'learning_rate': 1.4149139579349904e-05, 'epoch': 0.9}


 45%|████▌     | 297/656 [04:29<05:35,  1.07it/s]

{'loss': 1.8649, 'learning_rate': 1.4196940726577438e-05, 'epoch': 0.91}


 45%|████▌     | 298/656 [04:30<05:52,  1.02it/s]

{'loss': 1.8464, 'learning_rate': 1.4244741873804973e-05, 'epoch': 0.91}


 46%|████▌     | 299/656 [04:31<06:01,  1.01s/it]

{'loss': 1.856, 'learning_rate': 1.4292543021032506e-05, 'epoch': 0.91}


 46%|████▌     | 300/656 [04:32<05:48,  1.02it/s]

{'loss': 1.8692, 'learning_rate': 1.434034416826004e-05, 'epoch': 0.91}


 46%|████▌     | 301/656 [04:33<06:02,  1.02s/it]

{'loss': 1.8461, 'learning_rate': 1.4388145315487572e-05, 'epoch': 0.92}


 46%|████▌     | 302/656 [04:34<05:36,  1.05it/s]

{'loss': 1.8616, 'learning_rate': 1.4435946462715105e-05, 'epoch': 0.92}


 46%|████▌     | 303/656 [04:35<05:59,  1.02s/it]

{'loss': 1.8462, 'learning_rate': 1.4483747609942639e-05, 'epoch': 0.92}


 46%|████▋     | 304/656 [04:36<05:58,  1.02s/it]

{'loss': 1.8641, 'learning_rate': 1.4531548757170174e-05, 'epoch': 0.93}


 46%|████▋     | 305/656 [04:37<05:30,  1.06it/s]

{'loss': 1.8551, 'learning_rate': 1.4579349904397707e-05, 'epoch': 0.93}


 47%|████▋     | 306/656 [04:38<05:32,  1.05it/s]

{'loss': 1.8779, 'learning_rate': 1.462715105162524e-05, 'epoch': 0.93}


 47%|████▋     | 307/656 [04:39<05:45,  1.01it/s]

{'loss': 1.8405, 'learning_rate': 1.4674952198852773e-05, 'epoch': 0.94}


 47%|████▋     | 308/656 [04:40<05:45,  1.01it/s]

{'loss': 1.8282, 'learning_rate': 1.4722753346080307e-05, 'epoch': 0.94}


 47%|████▋     | 309/656 [04:41<06:18,  1.09s/it]

{'loss': 1.8386, 'learning_rate': 1.477055449330784e-05, 'epoch': 0.94}


 47%|████▋     | 310/656 [04:42<05:54,  1.02s/it]

{'loss': 1.8278, 'learning_rate': 1.4818355640535372e-05, 'epoch': 0.95}


 47%|████▋     | 311/656 [04:43<05:47,  1.01s/it]

{'loss': 1.8405, 'learning_rate': 1.4866156787762907e-05, 'epoch': 0.95}


 48%|████▊     | 312/656 [04:44<05:32,  1.03it/s]

{'loss': 1.861, 'learning_rate': 1.4913957934990441e-05, 'epoch': 0.95}


 48%|████▊     | 313/656 [04:45<05:36,  1.02it/s]

{'loss': 1.8613, 'learning_rate': 1.4961759082217974e-05, 'epoch': 0.95}


 48%|████▊     | 314/656 [04:46<05:19,  1.07it/s]

{'loss': 1.8373, 'learning_rate': 1.5009560229445508e-05, 'epoch': 0.96}


 48%|████▊     | 315/656 [04:47<05:15,  1.08it/s]

{'loss': 1.8666, 'learning_rate': 1.505736137667304e-05, 'epoch': 0.96}


 48%|████▊     | 316/656 [04:47<04:38,  1.22it/s]

{'loss': 1.8293, 'learning_rate': 1.5105162523900574e-05, 'epoch': 0.96}


 48%|████▊     | 317/656 [04:48<04:42,  1.20it/s]

{'loss': 1.8602, 'learning_rate': 1.5152963671128107e-05, 'epoch': 0.97}


 48%|████▊     | 318/656 [04:49<04:28,  1.26it/s]

{'loss': 1.841, 'learning_rate': 1.5200764818355642e-05, 'epoch': 0.97}


 49%|████▊     | 319/656 [04:50<04:27,  1.26it/s]

{'loss': 1.8477, 'learning_rate': 1.5248565965583175e-05, 'epoch': 0.97}


 49%|████▉     | 320/656 [04:51<04:42,  1.19it/s]

{'loss': 1.8495, 'learning_rate': 1.529636711281071e-05, 'epoch': 0.98}


 49%|████▉     | 321/656 [04:51<04:39,  1.20it/s]

{'loss': 1.8625, 'learning_rate': 1.534416826003824e-05, 'epoch': 0.98}


 49%|████▉     | 322/656 [04:53<05:19,  1.04it/s]

{'loss': 1.8649, 'learning_rate': 1.5391969407265773e-05, 'epoch': 0.98}


 49%|████▉     | 323/656 [04:53<04:57,  1.12it/s]

{'loss': 1.8436, 'learning_rate': 1.543977055449331e-05, 'epoch': 0.98}


 49%|████▉     | 324/656 [04:54<04:54,  1.13it/s]

{'loss': 1.8527, 'learning_rate': 1.5487571701720845e-05, 'epoch': 0.99}


 50%|████▉     | 325/656 [04:55<04:40,  1.18it/s]

{'loss': 1.8493, 'learning_rate': 1.5535372848948377e-05, 'epoch': 0.99}


 50%|████▉     | 326/656 [04:56<04:33,  1.21it/s]

{'loss': 1.8692, 'learning_rate': 1.558317399617591e-05, 'epoch': 0.99}


 50%|█████     | 328/656 [04:57<03:50,  1.42it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.8436, 'learning_rate': 1.5630975143403442e-05, 'epoch': 1.0}
{'loss': 1.8043, 'learning_rate': 1.5678776290630974e-05, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [06:09<03:50,  1.42it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json


{'eval_loss': 1.8220031261444092, 'eval_accuracy': 0.5827283148643485, 'eval_runtime': 72.2314, 'eval_samples_per_second': 36.231, 'eval_steps_per_second': 1.135, 'epoch': 1.0}


Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
 50%|█████     | 329/656 [06:11<2:03:54, 22.74s/it]

{'loss': 1.8418, 'learning_rate': 1.572657743785851e-05, 'epoch': 1.0}


 50%|█████     | 330/656 [06:12<1:28:01, 16.20s/it]

{'loss': 1.8396, 'learning_rate': 1.5774378585086042e-05, 'epoch': 1.01}


 50%|█████     | 331/656 [06:13<1:02:50, 11.60s/it]

{'loss': 1.8276, 'learning_rate': 1.5822179732313578e-05, 'epoch': 1.01}


 51%|█████     | 332/656 [06:14<45:10,  8.37s/it]  

{'loss': 1.8208, 'learning_rate': 1.586998087954111e-05, 'epoch': 1.01}


 51%|█████     | 333/656 [06:15<33:02,  6.14s/it]

{'loss': 1.825, 'learning_rate': 1.5917782026768643e-05, 'epoch': 1.02}


 51%|█████     | 334/656 [06:16<24:48,  4.62s/it]

{'loss': 1.8541, 'learning_rate': 1.5965583173996175e-05, 'epoch': 1.02}


 51%|█████     | 335/656 [06:17<18:29,  3.46s/it]

{'loss': 1.8195, 'learning_rate': 1.601338432122371e-05, 'epoch': 1.02}


 51%|█████     | 336/656 [06:18<14:15,  2.67s/it]

{'loss': 1.8452, 'learning_rate': 1.6061185468451243e-05, 'epoch': 1.02}


 51%|█████▏    | 337/656 [06:19<12:00,  2.26s/it]

{'loss': 1.849, 'learning_rate': 1.610898661567878e-05, 'epoch': 1.03}


 52%|█████▏    | 338/656 [06:21<11:07,  2.10s/it]

{'loss': 1.8135, 'learning_rate': 1.615678776290631e-05, 'epoch': 1.03}


 52%|█████▏    | 339/656 [06:21<08:49,  1.67s/it]

{'loss': 1.8039, 'learning_rate': 1.6204588910133844e-05, 'epoch': 1.03}


 52%|█████▏    | 340/656 [06:22<07:59,  1.52s/it]

{'loss': 1.8144, 'learning_rate': 1.6252390057361376e-05, 'epoch': 1.04}


 52%|█████▏    | 341/656 [06:23<07:07,  1.36s/it]

{'loss': 1.8542, 'learning_rate': 1.630019120458891e-05, 'epoch': 1.04}


 52%|█████▏    | 342/656 [06:24<06:41,  1.28s/it]

{'loss': 1.8297, 'learning_rate': 1.6347992351816444e-05, 'epoch': 1.04}


 52%|█████▏    | 343/656 [06:25<05:59,  1.15s/it]

{'loss': 1.8138, 'learning_rate': 1.6395793499043976e-05, 'epoch': 1.05}


 52%|█████▏    | 344/656 [06:26<05:25,  1.04s/it]

{'loss': 1.844, 'learning_rate': 1.6443594646271512e-05, 'epoch': 1.05}


 53%|█████▎    | 345/656 [06:27<05:26,  1.05s/it]

{'loss': 1.8425, 'learning_rate': 1.6491395793499044e-05, 'epoch': 1.05}


 53%|█████▎    | 346/656 [06:28<05:03,  1.02it/s]

{'loss': 1.8173, 'learning_rate': 1.653919694072658e-05, 'epoch': 1.05}


 53%|█████▎    | 347/656 [06:29<04:49,  1.07it/s]

{'loss': 1.8221, 'learning_rate': 1.6586998087954112e-05, 'epoch': 1.06}


 53%|█████▎    | 348/656 [06:30<04:30,  1.14it/s]

{'loss': 1.8224, 'learning_rate': 1.6634799235181645e-05, 'epoch': 1.06}


 53%|█████▎    | 349/656 [06:30<04:31,  1.13it/s]

{'loss': 1.8379, 'learning_rate': 1.6682600382409177e-05, 'epoch': 1.06}


 53%|█████▎    | 350/656 [06:31<04:23,  1.16it/s]

{'loss': 1.8335, 'learning_rate': 1.6730401529636713e-05, 'epoch': 1.07}


 54%|█████▎    | 351/656 [06:32<04:30,  1.13it/s]

{'loss': 1.8031, 'learning_rate': 1.6778202676864245e-05, 'epoch': 1.07}


 54%|█████▎    | 352/656 [06:33<04:23,  1.15it/s]

{'loss': 1.8126, 'learning_rate': 1.682600382409178e-05, 'epoch': 1.07}


 54%|█████▍    | 353/656 [06:34<04:32,  1.11it/s]

{'loss': 1.8098, 'learning_rate': 1.6873804971319313e-05, 'epoch': 1.08}


 54%|█████▍    | 354/656 [06:35<04:21,  1.16it/s]

{'loss': 1.8216, 'learning_rate': 1.6921606118546846e-05, 'epoch': 1.08}


 54%|█████▍    | 355/656 [06:36<04:13,  1.19it/s]

{'loss': 1.8038, 'learning_rate': 1.6969407265774378e-05, 'epoch': 1.08}


 54%|█████▍    | 356/656 [06:37<04:42,  1.06it/s]

{'loss': 1.8077, 'learning_rate': 1.701720841300191e-05, 'epoch': 1.09}


 54%|█████▍    | 357/656 [06:38<04:40,  1.07it/s]

{'loss': 1.8387, 'learning_rate': 1.7065009560229446e-05, 'epoch': 1.09}


 55%|█████▍    | 358/656 [06:39<04:54,  1.01it/s]

{'loss': 1.8594, 'learning_rate': 1.7112810707456982e-05, 'epoch': 1.09}


 55%|█████▍    | 359/656 [06:40<05:15,  1.06s/it]

{'loss': 1.8337, 'learning_rate': 1.7160611854684514e-05, 'epoch': 1.09}


 55%|█████▍    | 360/656 [06:41<05:04,  1.03s/it]

{'loss': 1.8014, 'learning_rate': 1.7208413001912046e-05, 'epoch': 1.1}


 55%|█████▌    | 361/656 [06:42<04:52,  1.01it/s]

{'loss': 1.7977, 'learning_rate': 1.725621414913958e-05, 'epoch': 1.1}


 55%|█████▌    | 362/656 [06:43<04:22,  1.12it/s]

{'loss': 1.788, 'learning_rate': 1.730401529636711e-05, 'epoch': 1.1}


 55%|█████▌    | 363/656 [06:43<04:08,  1.18it/s]

{'loss': 1.7531, 'learning_rate': 1.7351816443594647e-05, 'epoch': 1.11}


 55%|█████▌    | 364/656 [06:44<04:13,  1.15it/s]

{'loss': 1.7846, 'learning_rate': 1.7399617590822183e-05, 'epoch': 1.11}


 56%|█████▌    | 365/656 [06:45<04:01,  1.21it/s]

{'loss': 1.8162, 'learning_rate': 1.7447418738049715e-05, 'epoch': 1.11}


 56%|█████▌    | 366/656 [06:46<04:02,  1.20it/s]

{'loss': 1.7996, 'learning_rate': 1.7495219885277247e-05, 'epoch': 1.12}


 56%|█████▌    | 367/656 [06:47<04:02,  1.19it/s]

{'loss': 1.8084, 'learning_rate': 1.754302103250478e-05, 'epoch': 1.12}


 56%|█████▌    | 368/656 [06:47<04:02,  1.19it/s]

{'loss': 1.789, 'learning_rate': 1.7590822179732312e-05, 'epoch': 1.12}


 56%|█████▋    | 369/656 [06:49<04:22,  1.09it/s]

{'loss': 1.7907, 'learning_rate': 1.7638623326959848e-05, 'epoch': 1.12}


 56%|█████▋    | 370/656 [06:49<04:20,  1.10it/s]

{'loss': 1.7872, 'learning_rate': 1.7686424474187383e-05, 'epoch': 1.13}


 57%|█████▋    | 371/656 [06:50<04:06,  1.15it/s]

{'loss': 1.7779, 'learning_rate': 1.7734225621414916e-05, 'epoch': 1.13}


 57%|█████▋    | 372/656 [06:51<04:14,  1.12it/s]

{'loss': 1.8327, 'learning_rate': 1.7782026768642448e-05, 'epoch': 1.13}


 57%|█████▋    | 373/656 [06:52<03:56,  1.20it/s]

{'loss': 1.8022, 'learning_rate': 1.782982791586998e-05, 'epoch': 1.14}


 57%|█████▋    | 374/656 [06:53<04:07,  1.14it/s]

{'loss': 1.7831, 'learning_rate': 1.7877629063097516e-05, 'epoch': 1.14}


 57%|█████▋    | 375/656 [06:54<04:05,  1.15it/s]

{'loss': 1.81, 'learning_rate': 1.792543021032505e-05, 'epoch': 1.14}


 57%|█████▋    | 376/656 [06:55<04:12,  1.11it/s]

{'loss': 1.8002, 'learning_rate': 1.7973231357552584e-05, 'epoch': 1.15}


 57%|█████▋    | 377/656 [06:55<03:56,  1.18it/s]

{'loss': 1.7959, 'learning_rate': 1.8021032504780117e-05, 'epoch': 1.15}


 58%|█████▊    | 378/656 [06:56<03:56,  1.18it/s]

{'loss': 1.7949, 'learning_rate': 1.806883365200765e-05, 'epoch': 1.15}


 58%|█████▊    | 379/656 [06:57<03:46,  1.22it/s]

{'loss': 1.8043, 'learning_rate': 1.811663479923518e-05, 'epoch': 1.16}


 58%|█████▊    | 380/656 [06:58<03:46,  1.22it/s]

{'loss': 1.7923, 'learning_rate': 1.8164435946462717e-05, 'epoch': 1.16}


 58%|█████▊    | 381/656 [06:59<03:54,  1.17it/s]

{'loss': 1.7797, 'learning_rate': 1.821223709369025e-05, 'epoch': 1.16}


 58%|█████▊    | 382/656 [07:00<03:43,  1.23it/s]

{'loss': 1.7963, 'learning_rate': 1.826003824091778e-05, 'epoch': 1.16}


 58%|█████▊    | 383/656 [07:00<03:40,  1.24it/s]

{'loss': 1.7801, 'learning_rate': 1.8307839388145317e-05, 'epoch': 1.17}


 59%|█████▊    | 384/656 [07:01<03:36,  1.26it/s]

{'loss': 1.7895, 'learning_rate': 1.835564053537285e-05, 'epoch': 1.17}


 59%|█████▊    | 385/656 [07:02<03:44,  1.21it/s]

{'loss': 1.7637, 'learning_rate': 1.8403441682600382e-05, 'epoch': 1.17}


 59%|█████▉    | 386/656 [07:03<03:54,  1.15it/s]

{'loss': 1.7857, 'learning_rate': 1.8451242829827918e-05, 'epoch': 1.18}


 59%|█████▉    | 387/656 [07:04<03:55,  1.14it/s]

{'loss': 1.7868, 'learning_rate': 1.849904397705545e-05, 'epoch': 1.18}


 59%|█████▉    | 388/656 [07:05<03:45,  1.19it/s]

{'loss': 1.7925, 'learning_rate': 1.8546845124282983e-05, 'epoch': 1.18}


 59%|█████▉    | 389/656 [07:05<03:41,  1.20it/s]

{'loss': 1.7821, 'learning_rate': 1.8594646271510518e-05, 'epoch': 1.19}


 59%|█████▉    | 390/656 [07:06<03:30,  1.26it/s]

{'loss': 1.7453, 'learning_rate': 1.864244741873805e-05, 'epoch': 1.19}


 60%|█████▉    | 391/656 [07:07<03:30,  1.26it/s]

{'loss': 1.8208, 'learning_rate': 1.8690248565965583e-05, 'epoch': 1.19}


 60%|█████▉    | 392/656 [07:08<03:40,  1.20it/s]

{'loss': 1.763, 'learning_rate': 1.873804971319312e-05, 'epoch': 1.2}


 60%|█████▉    | 393/656 [07:09<03:47,  1.16it/s]

{'loss': 1.7695, 'learning_rate': 1.878585086042065e-05, 'epoch': 1.2}


 60%|██████    | 394/656 [07:10<04:14,  1.03it/s]

{'loss': 1.7814, 'learning_rate': 1.8833652007648183e-05, 'epoch': 1.2}


 60%|██████    | 395/656 [07:11<03:59,  1.09it/s]

{'loss': 1.7755, 'learning_rate': 1.8881453154875716e-05, 'epoch': 1.2}


 60%|██████    | 396/656 [07:12<03:59,  1.09it/s]

{'loss': 1.7607, 'learning_rate': 1.892925430210325e-05, 'epoch': 1.21}


 61%|██████    | 397/656 [07:12<03:43,  1.16it/s]

{'loss': 1.7638, 'learning_rate': 1.8977055449330787e-05, 'epoch': 1.21}


 61%|██████    | 398/656 [07:13<03:29,  1.23it/s]

{'loss': 1.7621, 'learning_rate': 1.902485659655832e-05, 'epoch': 1.21}


 61%|██████    | 399/656 [07:14<03:27,  1.24it/s]

{'loss': 1.7652, 'learning_rate': 1.9072657743785852e-05, 'epoch': 1.22}


 61%|██████    | 400/656 [07:15<03:43,  1.15it/s]

{'loss': 1.7707, 'learning_rate': 1.9120458891013384e-05, 'epoch': 1.22}


 61%|██████    | 401/656 [07:16<03:40,  1.16it/s]

{'loss': 1.7572, 'learning_rate': 1.9168260038240917e-05, 'epoch': 1.22}


 61%|██████▏   | 402/656 [07:17<03:30,  1.21it/s]

{'loss': 1.7685, 'learning_rate': 1.9216061185468452e-05, 'epoch': 1.23}


 61%|██████▏   | 403/656 [07:17<03:36,  1.17it/s]

{'loss': 1.7815, 'learning_rate': 1.9263862332695988e-05, 'epoch': 1.23}


 62%|██████▏   | 404/656 [07:18<03:42,  1.13it/s]

{'loss': 1.7297, 'learning_rate': 1.931166347992352e-05, 'epoch': 1.23}


 62%|██████▏   | 405/656 [07:19<03:51,  1.09it/s]

{'loss': 1.7437, 'learning_rate': 1.9359464627151053e-05, 'epoch': 1.23}


 62%|██████▏   | 406/656 [07:20<03:43,  1.12it/s]

{'loss': 1.7883, 'learning_rate': 1.9407265774378585e-05, 'epoch': 1.24}


 62%|██████▏   | 407/656 [07:22<04:22,  1.05s/it]

{'loss': 1.7846, 'learning_rate': 1.9455066921606117e-05, 'epoch': 1.24}


 62%|██████▏   | 408/656 [07:22<04:03,  1.02it/s]

{'loss': 1.7519, 'learning_rate': 1.9502868068833653e-05, 'epoch': 1.24}


 62%|██████▏   | 409/656 [07:23<04:04,  1.01it/s]

{'loss': 1.7786, 'learning_rate': 1.955066921606119e-05, 'epoch': 1.25}


 62%|██████▎   | 410/656 [07:24<03:51,  1.06it/s]

{'loss': 1.7838, 'learning_rate': 1.959847036328872e-05, 'epoch': 1.25}


 63%|██████▎   | 411/656 [07:25<03:34,  1.14it/s]

{'loss': 1.7492, 'learning_rate': 1.9646271510516253e-05, 'epoch': 1.25}


 63%|██████▎   | 412/656 [07:26<03:26,  1.18it/s]

{'loss': 1.7414, 'learning_rate': 1.9694072657743786e-05, 'epoch': 1.26}


 63%|██████▎   | 413/656 [07:27<03:23,  1.20it/s]

{'loss': 1.7898, 'learning_rate': 1.9741873804971318e-05, 'epoch': 1.26}


 63%|██████▎   | 414/656 [07:27<03:23,  1.19it/s]

{'loss': 1.7667, 'learning_rate': 1.9789674952198854e-05, 'epoch': 1.26}


 63%|██████▎   | 415/656 [07:28<03:24,  1.18it/s]

{'loss': 1.7917, 'learning_rate': 1.9837476099426386e-05, 'epoch': 1.27}


 63%|██████▎   | 416/656 [07:29<03:25,  1.17it/s]

{'loss': 1.7738, 'learning_rate': 1.9885277246653922e-05, 'epoch': 1.27}


 64%|██████▎   | 417/656 [07:30<03:16,  1.22it/s]

{'loss': 1.7279, 'learning_rate': 1.9933078393881454e-05, 'epoch': 1.27}


 64%|██████▎   | 418/656 [07:31<03:18,  1.20it/s]

{'loss': 1.771, 'learning_rate': 1.9980879541108987e-05, 'epoch': 1.27}


 64%|██████▍   | 419/656 [07:32<03:12,  1.23it/s]

{'loss': 1.8139, 'learning_rate': 2.0028680688336522e-05, 'epoch': 1.28}


 64%|██████▍   | 420/656 [07:33<03:21,  1.17it/s]

{'loss': 1.7572, 'learning_rate': 2.0076481835564055e-05, 'epoch': 1.28}


 64%|██████▍   | 421/656 [07:33<03:25,  1.14it/s]

{'loss': 1.7173, 'learning_rate': 2.0124282982791587e-05, 'epoch': 1.28}


 64%|██████▍   | 422/656 [07:34<03:14,  1.21it/s]

{'loss': 1.7113, 'learning_rate': 2.0172084130019123e-05, 'epoch': 1.29}


 64%|██████▍   | 423/656 [07:35<03:00,  1.29it/s]

{'loss': 1.7483, 'learning_rate': 2.0219885277246655e-05, 'epoch': 1.29}


 65%|██████▍   | 424/656 [07:35<02:49,  1.37it/s]

{'loss': 1.7197, 'learning_rate': 2.0267686424474187e-05, 'epoch': 1.29}


 65%|██████▍   | 425/656 [07:36<03:00,  1.28it/s]

{'loss': 1.7574, 'learning_rate': 2.0315487571701723e-05, 'epoch': 1.3}


 65%|██████▍   | 426/656 [07:37<03:12,  1.19it/s]

{'loss': 1.7683, 'learning_rate': 2.0363288718929256e-05, 'epoch': 1.3}


 65%|██████▌   | 427/656 [07:38<03:12,  1.19it/s]

{'loss': 1.7826, 'learning_rate': 2.0411089866156788e-05, 'epoch': 1.3}


 65%|██████▌   | 428/656 [07:39<03:31,  1.08it/s]

{'loss': 1.7412, 'learning_rate': 2.045889101338432e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [07:40<03:25,  1.11it/s]

{'loss': 1.7433, 'learning_rate': 2.0506692160611856e-05, 'epoch': 1.31}


 66%|██████▌   | 430/656 [07:41<03:15,  1.16it/s]

{'loss': 1.7059, 'learning_rate': 2.055449330783939e-05, 'epoch': 1.31}


 66%|██████▌   | 431/656 [07:42<03:17,  1.14it/s]

{'loss': 1.7837, 'learning_rate': 2.0602294455066924e-05, 'epoch': 1.31}


 66%|██████▌   | 432/656 [07:43<03:09,  1.18it/s]

{'loss': 1.7231, 'learning_rate': 2.0650095602294456e-05, 'epoch': 1.32}


 66%|██████▌   | 433/656 [07:44<03:19,  1.12it/s]

{'loss': 1.7454, 'learning_rate': 2.069789674952199e-05, 'epoch': 1.32}


 66%|██████▌   | 434/656 [07:45<03:18,  1.12it/s]

{'loss': 1.6777, 'learning_rate': 2.074569789674952e-05, 'epoch': 1.32}


 66%|██████▋   | 435/656 [07:45<03:11,  1.16it/s]

{'loss': 1.7474, 'learning_rate': 2.0793499043977057e-05, 'epoch': 1.33}


 66%|██████▋   | 436/656 [07:46<03:08,  1.16it/s]

{'loss': 1.7724, 'learning_rate': 2.084130019120459e-05, 'epoch': 1.33}


 67%|██████▋   | 437/656 [07:47<03:02,  1.20it/s]

{'loss': 1.6977, 'learning_rate': 2.0889101338432125e-05, 'epoch': 1.33}


 67%|██████▋   | 438/656 [07:48<03:11,  1.14it/s]

{'loss': 1.7257, 'learning_rate': 2.0936902485659657e-05, 'epoch': 1.34}


 67%|██████▋   | 439/656 [07:49<03:11,  1.13it/s]

{'loss': 1.7472, 'learning_rate': 2.098470363288719e-05, 'epoch': 1.34}


 67%|██████▋   | 440/656 [07:50<03:03,  1.18it/s]

{'loss': 1.6949, 'learning_rate': 2.1032504780114722e-05, 'epoch': 1.34}


 67%|██████▋   | 441/656 [07:50<03:03,  1.17it/s]

{'loss': 1.7237, 'learning_rate': 2.1080305927342258e-05, 'epoch': 1.34}


 67%|██████▋   | 442/656 [07:51<02:58,  1.20it/s]

{'loss': 1.7472, 'learning_rate': 2.1128107074569793e-05, 'epoch': 1.35}


 68%|██████▊   | 443/656 [07:52<02:55,  1.21it/s]

{'loss': 1.7601, 'learning_rate': 2.1175908221797326e-05, 'epoch': 1.35}


 68%|██████▊   | 444/656 [07:53<02:57,  1.19it/s]

{'loss': 1.7086, 'learning_rate': 2.1223709369024858e-05, 'epoch': 1.35}


 68%|██████▊   | 445/656 [07:54<02:50,  1.24it/s]

{'loss': 1.7452, 'learning_rate': 2.127151051625239e-05, 'epoch': 1.36}


 68%|██████▊   | 446/656 [07:55<02:58,  1.18it/s]

{'loss': 1.7663, 'learning_rate': 2.1319311663479923e-05, 'epoch': 1.36}


 68%|██████▊   | 447/656 [07:55<02:52,  1.21it/s]

{'loss': 1.7555, 'learning_rate': 2.136711281070746e-05, 'epoch': 1.36}


 68%|██████▊   | 448/656 [07:56<02:52,  1.21it/s]

{'loss': 1.7149, 'learning_rate': 2.1414913957934994e-05, 'epoch': 1.37}


 68%|██████▊   | 449/656 [07:57<02:46,  1.24it/s]

{'loss': 1.7344, 'learning_rate': 2.1462715105162527e-05, 'epoch': 1.37}


 69%|██████▊   | 450/656 [07:58<02:42,  1.27it/s]

{'loss': 1.7462, 'learning_rate': 2.151051625239006e-05, 'epoch': 1.37}


 69%|██████▉   | 451/656 [07:58<02:33,  1.33it/s]

{'loss': 1.7033, 'learning_rate': 2.155831739961759e-05, 'epoch': 1.38}


 69%|██████▉   | 452/656 [07:59<02:49,  1.20it/s]

{'loss': 1.7305, 'learning_rate': 2.1606118546845124e-05, 'epoch': 1.38}


 69%|██████▉   | 453/656 [08:00<02:49,  1.20it/s]

{'loss': 1.7215, 'learning_rate': 2.165391969407266e-05, 'epoch': 1.38}


 69%|██████▉   | 454/656 [08:01<03:00,  1.12it/s]

{'loss': 1.7051, 'learning_rate': 2.170172084130019e-05, 'epoch': 1.38}


 69%|██████▉   | 455/656 [08:02<02:55,  1.14it/s]

{'loss': 1.6984, 'learning_rate': 2.1749521988527727e-05, 'epoch': 1.39}


 70%|██████▉   | 456/656 [08:03<02:58,  1.12it/s]

{'loss': 1.7122, 'learning_rate': 2.179732313575526e-05, 'epoch': 1.39}


 70%|██████▉   | 457/656 [08:04<02:55,  1.14it/s]

{'loss': 1.7417, 'learning_rate': 2.1845124282982792e-05, 'epoch': 1.39}


 70%|██████▉   | 458/656 [08:05<03:01,  1.09it/s]

{'loss': 1.7339, 'learning_rate': 2.1892925430210324e-05, 'epoch': 1.4}


 70%|██████▉   | 459/656 [08:06<03:00,  1.09it/s]

{'loss': 1.7008, 'learning_rate': 2.194072657743786e-05, 'epoch': 1.4}


 70%|███████   | 460/656 [08:07<03:17,  1.01s/it]

{'loss': 1.6766, 'learning_rate': 2.1988527724665392e-05, 'epoch': 1.4}


 70%|███████   | 461/656 [08:09<03:54,  1.20s/it]

{'loss': 1.6759, 'learning_rate': 2.2036328871892928e-05, 'epoch': 1.41}


 70%|███████   | 462/656 [08:09<03:32,  1.10s/it]

{'loss': 1.7109, 'learning_rate': 2.208413001912046e-05, 'epoch': 1.41}


 71%|███████   | 463/656 [08:10<03:12,  1.00it/s]

{'loss': 1.7272, 'learning_rate': 2.2131931166347993e-05, 'epoch': 1.41}


 71%|███████   | 464/656 [08:11<03:00,  1.07it/s]

{'loss': 1.6949, 'learning_rate': 2.217973231357553e-05, 'epoch': 1.41}


 71%|███████   | 465/656 [08:12<02:48,  1.14it/s]

{'loss': 1.6902, 'learning_rate': 2.222753346080306e-05, 'epoch': 1.42}


 71%|███████   | 466/656 [08:13<02:43,  1.16it/s]

{'loss': 1.7075, 'learning_rate': 2.2275334608030593e-05, 'epoch': 1.42}


 71%|███████   | 467/656 [08:13<02:35,  1.21it/s]

{'loss': 1.6687, 'learning_rate': 2.2323135755258126e-05, 'epoch': 1.42}


 71%|███████▏  | 468/656 [08:14<02:47,  1.12it/s]

{'loss': 1.7291, 'learning_rate': 2.237093690248566e-05, 'epoch': 1.43}


 71%|███████▏  | 469/656 [08:15<02:55,  1.06it/s]

{'loss': 1.7103, 'learning_rate': 2.2418738049713194e-05, 'epoch': 1.43}


 72%|███████▏  | 470/656 [08:17<03:01,  1.02it/s]

{'loss': 1.7071, 'learning_rate': 2.246653919694073e-05, 'epoch': 1.43}


 72%|███████▏  | 471/656 [08:17<02:58,  1.03it/s]

{'loss': 1.7141, 'learning_rate': 2.2514340344168262e-05, 'epoch': 1.44}


 72%|███████▏  | 472/656 [08:19<03:02,  1.01it/s]

{'loss': 1.6892, 'learning_rate': 2.2562141491395794e-05, 'epoch': 1.44}


 72%|███████▏  | 473/656 [08:20<03:17,  1.08s/it]

{'loss': 1.6649, 'learning_rate': 2.2609942638623326e-05, 'epoch': 1.44}


 72%|███████▏  | 474/656 [08:21<03:17,  1.08s/it]

{'loss': 1.7561, 'learning_rate': 2.2657743785850862e-05, 'epoch': 1.45}


 72%|███████▏  | 475/656 [08:22<03:09,  1.05s/it]

{'loss': 1.744, 'learning_rate': 2.2705544933078395e-05, 'epoch': 1.45}


 73%|███████▎  | 476/656 [08:23<02:50,  1.05it/s]

{'loss': 1.6682, 'learning_rate': 2.275334608030593e-05, 'epoch': 1.45}


 73%|███████▎  | 477/656 [08:23<02:44,  1.09it/s]

{'loss': 1.7038, 'learning_rate': 2.2801147227533463e-05, 'epoch': 1.45}


 73%|███████▎  | 478/656 [08:24<02:51,  1.04it/s]

{'loss': 1.6197, 'learning_rate': 2.2848948374760995e-05, 'epoch': 1.46}


 73%|███████▎  | 479/656 [08:25<02:47,  1.06it/s]

{'loss': 1.6784, 'learning_rate': 2.2896749521988527e-05, 'epoch': 1.46}


 73%|███████▎  | 480/656 [08:26<02:45,  1.06it/s]

{'loss': 1.6532, 'learning_rate': 2.294455066921606e-05, 'epoch': 1.46}


 73%|███████▎  | 481/656 [08:27<02:42,  1.08it/s]

{'loss': 1.6268, 'learning_rate': 2.2992351816443595e-05, 'epoch': 1.47}


 73%|███████▎  | 482/656 [08:28<02:36,  1.11it/s]

{'loss': 1.695, 'learning_rate': 2.304015296367113e-05, 'epoch': 1.47}


 74%|███████▎  | 483/656 [08:29<02:50,  1.01it/s]

{'loss': 1.646, 'learning_rate': 2.3087954110898663e-05, 'epoch': 1.47}


 74%|███████▍  | 484/656 [08:30<02:53,  1.01s/it]

{'loss': 1.6915, 'learning_rate': 2.3135755258126196e-05, 'epoch': 1.48}


 74%|███████▍  | 485/656 [08:32<03:03,  1.07s/it]

{'loss': 1.6545, 'learning_rate': 2.3183556405353728e-05, 'epoch': 1.48}


 74%|███████▍  | 486/656 [08:32<02:56,  1.04s/it]

{'loss': 1.6703, 'learning_rate': 2.323135755258126e-05, 'epoch': 1.48}


 74%|███████▍  | 487/656 [08:33<02:48,  1.01it/s]

{'loss': 1.6633, 'learning_rate': 2.3279158699808796e-05, 'epoch': 1.48}


 74%|███████▍  | 488/656 [08:34<02:34,  1.09it/s]

{'loss': 1.7266, 'learning_rate': 2.3326959847036332e-05, 'epoch': 1.49}


 75%|███████▍  | 489/656 [08:35<02:30,  1.11it/s]

{'loss': 1.6495, 'learning_rate': 2.3374760994263864e-05, 'epoch': 1.49}


 75%|███████▍  | 490/656 [08:36<02:41,  1.03it/s]

{'loss': 1.6881, 'learning_rate': 2.3422562141491397e-05, 'epoch': 1.49}


 75%|███████▍  | 491/656 [08:37<02:52,  1.04s/it]

{'loss': 1.658, 'learning_rate': 2.347036328871893e-05, 'epoch': 1.5}


 75%|███████▌  | 492/656 [08:39<02:57,  1.08s/it]

{'loss': 1.6789, 'learning_rate': 2.3518164435946465e-05, 'epoch': 1.5}


 75%|███████▌  | 493/656 [08:40<03:32,  1.30s/it]

{'loss': 1.6897, 'learning_rate': 2.3565965583173997e-05, 'epoch': 1.5}


 75%|███████▌  | 494/656 [08:41<03:15,  1.20s/it]

{'loss': 1.6851, 'learning_rate': 2.3613766730401533e-05, 'epoch': 1.51}


 75%|███████▌  | 495/656 [08:42<03:10,  1.19s/it]

{'loss': 1.5908, 'learning_rate': 2.3661567877629065e-05, 'epoch': 1.51}


 76%|███████▌  | 496/656 [08:44<03:08,  1.18s/it]

{'loss': 1.6602, 'learning_rate': 2.3709369024856597e-05, 'epoch': 1.51}


 76%|███████▌  | 497/656 [08:45<02:53,  1.09s/it]

{'loss': 1.6603, 'learning_rate': 2.375717017208413e-05, 'epoch': 1.52}


 76%|███████▌  | 498/656 [08:45<02:45,  1.05s/it]

{'loss': 1.6423, 'learning_rate': 2.3804971319311666e-05, 'epoch': 1.52}


 76%|███████▌  | 499/656 [08:46<02:30,  1.04it/s]

{'loss': 1.6475, 'learning_rate': 2.3852772466539198e-05, 'epoch': 1.52}


 76%|███████▌  | 500/656 [08:47<02:24,  1.08it/s]

{'loss': 1.6619, 'learning_rate': 2.390057361376673e-05, 'epoch': 1.52}


 76%|███████▋  | 501/656 [08:48<02:29,  1.04it/s]

{'loss': 1.6222, 'learning_rate': 2.3948374760994266e-05, 'epoch': 1.53}


 77%|███████▋  | 502/656 [08:49<02:24,  1.06it/s]

{'loss': 1.6984, 'learning_rate': 2.3996175908221798e-05, 'epoch': 1.53}


 77%|███████▋  | 503/656 [08:50<02:25,  1.05it/s]

{'loss': 1.6404, 'learning_rate': 2.404397705544933e-05, 'epoch': 1.53}


 77%|███████▋  | 504/656 [08:51<02:29,  1.02it/s]

{'loss': 1.684, 'learning_rate': 2.4091778202676866e-05, 'epoch': 1.54}


 77%|███████▋  | 505/656 [08:52<02:24,  1.05it/s]

{'loss': 1.5996, 'learning_rate': 2.41395793499044e-05, 'epoch': 1.54}


 77%|███████▋  | 506/656 [08:54<02:56,  1.18s/it]

{'loss': 1.6473, 'learning_rate': 2.418738049713193e-05, 'epoch': 1.54}


 77%|███████▋  | 507/656 [08:55<03:01,  1.22s/it]

{'loss': 1.6729, 'learning_rate': 2.4235181644359467e-05, 'epoch': 1.55}


 77%|███████▋  | 508/656 [08:56<02:49,  1.15s/it]

{'loss': 1.6181, 'learning_rate': 2.4282982791587e-05, 'epoch': 1.55}


 78%|███████▊  | 509/656 [08:57<02:36,  1.07s/it]

{'loss': 1.6749, 'learning_rate': 2.433078393881453e-05, 'epoch': 1.55}


 78%|███████▊  | 510/656 [08:58<02:35,  1.06s/it]

{'loss': 1.5828, 'learning_rate': 2.4378585086042067e-05, 'epoch': 1.55}


 78%|███████▊  | 511/656 [08:59<02:33,  1.06s/it]

{'loss': 1.653, 'learning_rate': 2.44263862332696e-05, 'epoch': 1.56}


 78%|███████▊  | 512/656 [09:00<02:22,  1.01it/s]

{'loss': 1.6493, 'learning_rate': 2.4474187380497132e-05, 'epoch': 1.56}


 78%|███████▊  | 513/656 [09:00<02:12,  1.08it/s]

{'loss': 1.6301, 'learning_rate': 2.4521988527724664e-05, 'epoch': 1.56}


 78%|███████▊  | 514/656 [09:01<02:06,  1.12it/s]

{'loss': 1.6161, 'learning_rate': 2.45697896749522e-05, 'epoch': 1.57}


 79%|███████▊  | 515/656 [09:02<01:58,  1.19it/s]

{'loss': 1.6597, 'learning_rate': 2.4617590822179736e-05, 'epoch': 1.57}


 79%|███████▊  | 516/656 [09:03<02:06,  1.11it/s]

{'loss': 1.6664, 'learning_rate': 2.4665391969407268e-05, 'epoch': 1.57}


 79%|███████▉  | 517/656 [09:04<02:02,  1.14it/s]

{'loss': 1.612, 'learning_rate': 2.47131931166348e-05, 'epoch': 1.58}


 79%|███████▉  | 518/656 [09:05<02:14,  1.03it/s]

{'loss': 1.6275, 'learning_rate': 2.4760994263862333e-05, 'epoch': 1.58}


 79%|███████▉  | 519/656 [09:06<02:02,  1.12it/s]

{'loss': 1.5733, 'learning_rate': 2.4808795411089865e-05, 'epoch': 1.58}


 79%|███████▉  | 520/656 [09:07<02:04,  1.09it/s]

{'loss': 1.618, 'learning_rate': 2.48565965583174e-05, 'epoch': 1.59}


 79%|███████▉  | 521/656 [09:08<02:05,  1.08it/s]

{'loss': 1.5247, 'learning_rate': 2.4904397705544937e-05, 'epoch': 1.59}


 80%|███████▉  | 522/656 [09:08<01:56,  1.15it/s]

{'loss': 1.6089, 'learning_rate': 2.495219885277247e-05, 'epoch': 1.59}


 80%|███████▉  | 523/656 [09:10<02:04,  1.07it/s]

{'loss': 1.6671, 'learning_rate': 2.5e-05, 'epoch': 1.59}


 80%|███████▉  | 524/656 [09:11<02:11,  1.01it/s]

{'loss': 1.5615, 'learning_rate': 2.5047801147227534e-05, 'epoch': 1.6}


 80%|████████  | 525/656 [09:12<02:05,  1.05it/s]

{'loss': 1.6315, 'learning_rate': 2.5095602294455066e-05, 'epoch': 1.6}


 80%|████████  | 526/656 [09:12<01:59,  1.08it/s]

{'loss': 1.6047, 'learning_rate': 2.51434034416826e-05, 'epoch': 1.6}


 80%|████████  | 527/656 [09:13<01:58,  1.09it/s]

{'loss': 1.6685, 'learning_rate': 2.5191204588910134e-05, 'epoch': 1.61}


 80%|████████  | 528/656 [09:14<02:01,  1.05it/s]

{'loss': 1.5563, 'learning_rate': 2.5239005736137666e-05, 'epoch': 1.61}


 81%|████████  | 529/656 [09:15<02:00,  1.05it/s]

{'loss': 1.6589, 'learning_rate': 2.52868068833652e-05, 'epoch': 1.61}


 81%|████████  | 530/656 [09:16<02:08,  1.02s/it]

{'loss': 1.5897, 'learning_rate': 2.5334608030592738e-05, 'epoch': 1.62}


 81%|████████  | 531/656 [09:17<01:59,  1.05it/s]

{'loss': 1.6264, 'learning_rate': 2.538240917782027e-05, 'epoch': 1.62}


 81%|████████  | 532/656 [09:18<02:05,  1.01s/it]

{'loss': 1.571, 'learning_rate': 2.5430210325047802e-05, 'epoch': 1.62}


 81%|████████▏ | 533/656 [09:19<01:59,  1.03it/s]

{'loss': 1.5424, 'learning_rate': 2.5478011472275338e-05, 'epoch': 1.62}


 81%|████████▏ | 534/656 [09:20<01:58,  1.03it/s]

{'loss': 1.658, 'learning_rate': 2.552581261950287e-05, 'epoch': 1.63}


 82%|████████▏ | 535/656 [09:21<01:54,  1.05it/s]

{'loss': 1.6085, 'learning_rate': 2.5573613766730403e-05, 'epoch': 1.63}


 82%|████████▏ | 536/656 [09:22<01:48,  1.10it/s]

{'loss': 1.6003, 'learning_rate': 2.5621414913957935e-05, 'epoch': 1.63}


 82%|████████▏ | 537/656 [09:23<01:43,  1.15it/s]

{'loss': 1.5699, 'learning_rate': 2.566921606118547e-05, 'epoch': 1.64}


 82%|████████▏ | 538/656 [09:24<01:40,  1.17it/s]

{'loss': 1.6177, 'learning_rate': 2.5717017208413003e-05, 'epoch': 1.64}


 82%|████████▏ | 539/656 [09:25<01:44,  1.12it/s]

{'loss': 1.5959, 'learning_rate': 2.5764818355640536e-05, 'epoch': 1.64}


 82%|████████▏ | 540/656 [09:26<01:49,  1.06it/s]

{'loss': 1.6022, 'learning_rate': 2.5812619502868068e-05, 'epoch': 1.65}


 82%|████████▏ | 541/656 [09:26<01:44,  1.10it/s]

{'loss': 1.5883, 'learning_rate': 2.58604206500956e-05, 'epoch': 1.65}


 83%|████████▎ | 542/656 [09:27<01:38,  1.15it/s]

{'loss': 1.6147, 'learning_rate': 2.5908221797323136e-05, 'epoch': 1.65}


 83%|████████▎ | 543/656 [09:28<01:45,  1.07it/s]

{'loss': 1.4751, 'learning_rate': 2.5956022944550672e-05, 'epoch': 1.66}


 83%|████████▎ | 544/656 [09:29<01:36,  1.15it/s]

{'loss': 1.6633, 'learning_rate': 2.6003824091778207e-05, 'epoch': 1.66}


 83%|████████▎ | 545/656 [09:30<01:34,  1.18it/s]

{'loss': 1.5798, 'learning_rate': 2.605162523900574e-05, 'epoch': 1.66}


 83%|████████▎ | 546/656 [09:31<01:37,  1.13it/s]

{'loss': 1.5462, 'learning_rate': 2.6099426386233272e-05, 'epoch': 1.66}


 83%|████████▎ | 547/656 [09:32<01:34,  1.15it/s]

{'loss': 1.5765, 'learning_rate': 2.6147227533460805e-05, 'epoch': 1.67}


 84%|████████▎ | 548/656 [09:32<01:32,  1.16it/s]

{'loss': 1.5497, 'learning_rate': 2.6195028680688337e-05, 'epoch': 1.67}


 84%|████████▎ | 549/656 [09:33<01:29,  1.20it/s]

{'loss': 1.5229, 'learning_rate': 2.6242829827915873e-05, 'epoch': 1.67}


 84%|████████▍ | 550/656 [09:34<01:25,  1.24it/s]

{'loss': 1.5343, 'learning_rate': 2.6290630975143405e-05, 'epoch': 1.68}


 84%|████████▍ | 551/656 [09:35<01:27,  1.21it/s]

{'loss': 1.5294, 'learning_rate': 2.6338432122370937e-05, 'epoch': 1.68}


 84%|████████▍ | 552/656 [09:36<01:26,  1.21it/s]

{'loss': 1.5299, 'learning_rate': 2.638623326959847e-05, 'epoch': 1.68}


 84%|████████▍ | 553/656 [09:37<01:34,  1.09it/s]

{'loss': 1.6462, 'learning_rate': 2.6434034416826002e-05, 'epoch': 1.69}


 84%|████████▍ | 554/656 [09:38<01:34,  1.08it/s]

{'loss': 1.6213, 'learning_rate': 2.6481835564053538e-05, 'epoch': 1.69}


 85%|████████▍ | 555/656 [09:39<01:36,  1.05it/s]

{'loss': 1.5735, 'learning_rate': 2.652963671128107e-05, 'epoch': 1.69}


 85%|████████▍ | 556/656 [09:40<01:30,  1.11it/s]

{'loss': 1.4863, 'learning_rate': 2.657743785850861e-05, 'epoch': 1.7}


 85%|████████▍ | 557/656 [09:40<01:26,  1.15it/s]

{'loss': 1.5963, 'learning_rate': 2.662523900573614e-05, 'epoch': 1.7}


 85%|████████▌ | 558/656 [09:41<01:22,  1.19it/s]

{'loss': 1.5681, 'learning_rate': 2.6673040152963674e-05, 'epoch': 1.7}


 85%|████████▌ | 559/656 [09:42<01:21,  1.18it/s]

{'loss': 1.6326, 'learning_rate': 2.6720841300191206e-05, 'epoch': 1.7}


 85%|████████▌ | 560/656 [09:43<01:23,  1.16it/s]

{'loss': 1.5932, 'learning_rate': 2.6768642447418742e-05, 'epoch': 1.71}


 86%|████████▌ | 561/656 [09:44<01:23,  1.14it/s]

{'loss': 1.549, 'learning_rate': 2.6816443594646274e-05, 'epoch': 1.71}


 86%|████████▌ | 562/656 [09:45<01:30,  1.04it/s]

{'loss': 1.5301, 'learning_rate': 2.6864244741873807e-05, 'epoch': 1.71}


 86%|████████▌ | 563/656 [09:46<01:24,  1.10it/s]

{'loss': 1.5567, 'learning_rate': 2.691204588910134e-05, 'epoch': 1.72}


 86%|████████▌ | 564/656 [09:47<01:29,  1.02it/s]

{'loss': 1.6135, 'learning_rate': 2.695984703632887e-05, 'epoch': 1.72}


 86%|████████▌ | 565/656 [09:48<01:23,  1.09it/s]

{'loss': 1.5694, 'learning_rate': 2.7007648183556407e-05, 'epoch': 1.72}


 86%|████████▋ | 566/656 [09:49<01:24,  1.07it/s]

{'loss': 1.5524, 'learning_rate': 2.705544933078394e-05, 'epoch': 1.73}


 86%|████████▋ | 567/656 [09:49<01:19,  1.13it/s]

{'loss': 1.5604, 'learning_rate': 2.7103250478011472e-05, 'epoch': 1.73}


 87%|████████▋ | 568/656 [09:51<01:24,  1.04it/s]

{'loss': 1.4879, 'learning_rate': 2.7151051625239004e-05, 'epoch': 1.73}


 87%|████████▋ | 569/656 [09:51<01:22,  1.05it/s]

{'loss': 1.5641, 'learning_rate': 2.7198852772466543e-05, 'epoch': 1.73}


 87%|████████▋ | 570/656 [09:52<01:20,  1.07it/s]

{'loss': 1.5478, 'learning_rate': 2.7246653919694075e-05, 'epoch': 1.74}


 87%|████████▋ | 571/656 [09:53<01:14,  1.15it/s]

{'loss': 1.4807, 'learning_rate': 2.7294455066921608e-05, 'epoch': 1.74}


 87%|████████▋ | 572/656 [09:54<01:12,  1.15it/s]

{'loss': 1.5567, 'learning_rate': 2.7342256214149144e-05, 'epoch': 1.74}


 87%|████████▋ | 573/656 [09:55<01:09,  1.19it/s]

{'loss': 1.5484, 'learning_rate': 2.7390057361376676e-05, 'epoch': 1.75}


 88%|████████▊ | 574/656 [09:56<01:11,  1.15it/s]

{'loss': 1.5735, 'learning_rate': 2.7437858508604208e-05, 'epoch': 1.75}


 88%|████████▊ | 575/656 [09:56<01:09,  1.16it/s]

{'loss': 1.533, 'learning_rate': 2.748565965583174e-05, 'epoch': 1.75}


 88%|████████▊ | 576/656 [09:57<01:11,  1.12it/s]

{'loss': 1.5806, 'learning_rate': 2.7533460803059273e-05, 'epoch': 1.76}


 88%|████████▊ | 577/656 [09:58<01:11,  1.10it/s]

{'loss': 1.5435, 'learning_rate': 2.758126195028681e-05, 'epoch': 1.76}


 88%|████████▊ | 578/656 [10:00<01:16,  1.02it/s]

{'loss': 1.5207, 'learning_rate': 2.762906309751434e-05, 'epoch': 1.76}


 88%|████████▊ | 579/656 [10:00<01:10,  1.10it/s]

{'loss': 1.5978, 'learning_rate': 2.7676864244741873e-05, 'epoch': 1.77}


 88%|████████▊ | 580/656 [10:01<01:05,  1.16it/s]

{'loss': 1.5364, 'learning_rate': 2.7724665391969406e-05, 'epoch': 1.77}


 89%|████████▊ | 581/656 [10:02<01:02,  1.19it/s]

{'loss': 1.6024, 'learning_rate': 2.7772466539196938e-05, 'epoch': 1.77}


 89%|████████▊ | 582/656 [10:03<01:03,  1.16it/s]

{'loss': 1.4989, 'learning_rate': 2.7820267686424477e-05, 'epoch': 1.77}


 89%|████████▉ | 583/656 [10:03<00:59,  1.23it/s]

{'loss': 1.5528, 'learning_rate': 2.7868068833652013e-05, 'epoch': 1.78}


 89%|████████▉ | 584/656 [10:04<00:55,  1.29it/s]

{'loss': 1.4991, 'learning_rate': 2.7915869980879545e-05, 'epoch': 1.78}


 89%|████████▉ | 585/656 [10:05<00:55,  1.28it/s]

{'loss': 1.559, 'learning_rate': 2.7963671128107078e-05, 'epoch': 1.78}


 89%|████████▉ | 586/656 [10:06<00:52,  1.33it/s]

{'loss': 1.4521, 'learning_rate': 2.801147227533461e-05, 'epoch': 1.79}


 89%|████████▉ | 587/656 [10:06<00:52,  1.32it/s]

{'loss': 1.5439, 'learning_rate': 2.8059273422562142e-05, 'epoch': 1.79}


 90%|████████▉ | 588/656 [10:07<00:54,  1.26it/s]

{'loss': 1.5037, 'learning_rate': 2.8107074569789678e-05, 'epoch': 1.79}


 90%|████████▉ | 589/656 [10:08<00:59,  1.13it/s]

{'loss': 1.4884, 'learning_rate': 2.815487571701721e-05, 'epoch': 1.8}


 90%|████████▉ | 590/656 [10:09<00:56,  1.16it/s]

{'loss': 1.5437, 'learning_rate': 2.8202676864244743e-05, 'epoch': 1.8}


 90%|█████████ | 591/656 [10:10<00:55,  1.18it/s]

{'loss': 1.4608, 'learning_rate': 2.8250478011472275e-05, 'epoch': 1.8}


 90%|█████████ | 592/656 [10:11<01:01,  1.05it/s]

{'loss': 1.4993, 'learning_rate': 2.8298279158699807e-05, 'epoch': 1.8}


 90%|█████████ | 593/656 [10:12<00:57,  1.09it/s]

{'loss': 1.5632, 'learning_rate': 2.8346080305927343e-05, 'epoch': 1.81}


 91%|█████████ | 594/656 [10:13<00:55,  1.13it/s]

{'loss': 1.4626, 'learning_rate': 2.8393881453154875e-05, 'epoch': 1.81}


 91%|█████████ | 595/656 [10:14<00:52,  1.16it/s]

{'loss': 1.4824, 'learning_rate': 2.8441682600382415e-05, 'epoch': 1.81}


 91%|█████████ | 596/656 [10:14<00:50,  1.18it/s]

{'loss': 1.4729, 'learning_rate': 2.8489483747609947e-05, 'epoch': 1.82}


 91%|█████████ | 597/656 [10:15<00:49,  1.20it/s]

{'loss': 1.4618, 'learning_rate': 2.853728489483748e-05, 'epoch': 1.82}


 91%|█████████ | 598/656 [10:16<00:47,  1.23it/s]

{'loss': 1.5354, 'learning_rate': 2.858508604206501e-05, 'epoch': 1.82}


 91%|█████████▏| 599/656 [10:17<00:48,  1.18it/s]

{'loss': 1.5564, 'learning_rate': 2.8632887189292544e-05, 'epoch': 1.83}


 91%|█████████▏| 600/656 [10:18<00:48,  1.15it/s]

{'loss': 1.532, 'learning_rate': 2.868068833652008e-05, 'epoch': 1.83}


 92%|█████████▏| 601/656 [10:19<00:45,  1.22it/s]

{'loss': 1.4785, 'learning_rate': 2.8728489483747612e-05, 'epoch': 1.83}


 92%|█████████▏| 602/656 [10:19<00:45,  1.18it/s]

{'loss': 1.4569, 'learning_rate': 2.8776290630975144e-05, 'epoch': 1.84}


 92%|█████████▏| 603/656 [10:20<00:47,  1.12it/s]

{'loss': 1.5134, 'learning_rate': 2.8824091778202677e-05, 'epoch': 1.84}


 92%|█████████▏| 604/656 [10:21<00:46,  1.12it/s]

{'loss': 1.4924, 'learning_rate': 2.887189292543021e-05, 'epoch': 1.84}


 92%|█████████▏| 605/656 [10:22<00:45,  1.13it/s]

{'loss': 1.4511, 'learning_rate': 2.8919694072657745e-05, 'epoch': 1.84}


 92%|█████████▏| 606/656 [10:23<00:44,  1.12it/s]

{'loss': 1.5507, 'learning_rate': 2.8967495219885277e-05, 'epoch': 1.85}


 93%|█████████▎| 607/656 [10:24<00:41,  1.17it/s]

{'loss': 1.5691, 'learning_rate': 2.901529636711281e-05, 'epoch': 1.85}


 93%|█████████▎| 608/656 [10:25<00:41,  1.16it/s]

{'loss': 1.442, 'learning_rate': 2.906309751434035e-05, 'epoch': 1.85}


 93%|█████████▎| 609/656 [10:26<00:41,  1.13it/s]

{'loss': 1.4301, 'learning_rate': 2.911089866156788e-05, 'epoch': 1.86}


 93%|█████████▎| 610/656 [10:26<00:38,  1.19it/s]

{'loss': 1.4896, 'learning_rate': 2.9158699808795413e-05, 'epoch': 1.86}


 93%|█████████▎| 611/656 [10:28<00:41,  1.10it/s]

{'loss': 1.4156, 'learning_rate': 2.920650095602295e-05, 'epoch': 1.86}


 93%|█████████▎| 612/656 [10:28<00:36,  1.21it/s]

{'loss': 1.5118, 'learning_rate': 2.925430210325048e-05, 'epoch': 1.87}


 93%|█████████▎| 613/656 [10:29<00:37,  1.14it/s]

{'loss': 1.469, 'learning_rate': 2.9302103250478014e-05, 'epoch': 1.87}


 94%|█████████▎| 614/656 [10:30<00:40,  1.05it/s]

{'loss': 1.4826, 'learning_rate': 2.9349904397705546e-05, 'epoch': 1.87}


 94%|█████████▍| 615/656 [10:31<00:40,  1.02it/s]

{'loss': 1.5117, 'learning_rate': 2.939770554493308e-05, 'epoch': 1.88}


 94%|█████████▍| 616/656 [10:32<00:38,  1.04it/s]

{'loss': 1.4995, 'learning_rate': 2.9445506692160614e-05, 'epoch': 1.88}


 94%|█████████▍| 617/656 [10:33<00:39,  1.02s/it]

{'loss': 1.4876, 'learning_rate': 2.9493307839388146e-05, 'epoch': 1.88}


 94%|█████████▍| 618/656 [10:34<00:35,  1.08it/s]

{'loss': 1.4606, 'learning_rate': 2.954110898661568e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [10:35<00:32,  1.15it/s]

{'loss': 1.4517, 'learning_rate': 2.958891013384321e-05, 'epoch': 1.89}


 95%|█████████▍| 620/656 [10:36<00:31,  1.14it/s]

{'loss': 1.447, 'learning_rate': 2.9636711281070743e-05, 'epoch': 1.89}


 95%|█████████▍| 621/656 [10:36<00:29,  1.19it/s]

{'loss': 1.46, 'learning_rate': 2.968451242829828e-05, 'epoch': 1.89}


 95%|█████████▍| 622/656 [10:37<00:28,  1.18it/s]

{'loss': 1.4888, 'learning_rate': 2.9732313575525815e-05, 'epoch': 1.9}


 95%|█████████▍| 623/656 [10:38<00:28,  1.14it/s]

{'loss': 1.4773, 'learning_rate': 2.978011472275335e-05, 'epoch': 1.9}


 95%|█████████▌| 624/656 [10:39<00:26,  1.20it/s]

{'loss': 1.3963, 'learning_rate': 2.9827915869980883e-05, 'epoch': 1.9}


 95%|█████████▌| 625/656 [10:40<00:24,  1.27it/s]

{'loss': 1.4795, 'learning_rate': 2.9875717017208415e-05, 'epoch': 1.91}


 95%|█████████▌| 626/656 [10:40<00:23,  1.28it/s]

{'loss': 1.5211, 'learning_rate': 2.9923518164435948e-05, 'epoch': 1.91}


 96%|█████████▌| 627/656 [10:41<00:22,  1.27it/s]

{'loss': 1.4763, 'learning_rate': 2.997131931166348e-05, 'epoch': 1.91}


 96%|█████████▌| 628/656 [10:42<00:24,  1.14it/s]

{'loss': 1.4097, 'learning_rate': 3.0019120458891016e-05, 'epoch': 1.91}


 96%|█████████▌| 629/656 [10:43<00:22,  1.18it/s]

{'loss': 1.3483, 'learning_rate': 3.0066921606118548e-05, 'epoch': 1.92}


 96%|█████████▌| 630/656 [10:44<00:21,  1.19it/s]

{'loss': 1.5067, 'learning_rate': 3.011472275334608e-05, 'epoch': 1.92}


 96%|█████████▌| 631/656 [10:45<00:20,  1.20it/s]

{'loss': 1.4636, 'learning_rate': 3.0162523900573613e-05, 'epoch': 1.92}


 96%|█████████▋| 632/656 [10:46<00:22,  1.06it/s]

{'loss': 1.4565, 'learning_rate': 3.021032504780115e-05, 'epoch': 1.93}


 96%|█████████▋| 633/656 [10:47<00:22,  1.00it/s]

{'loss': 1.4473, 'learning_rate': 3.025812619502868e-05, 'epoch': 1.93}


 97%|█████████▋| 634/656 [10:48<00:21,  1.02it/s]

{'loss': 1.4346, 'learning_rate': 3.0305927342256213e-05, 'epoch': 1.93}


 97%|█████████▋| 635/656 [10:49<00:21,  1.04s/it]

{'loss': 1.476, 'learning_rate': 3.0353728489483752e-05, 'epoch': 1.94}


 97%|█████████▋| 636/656 [10:50<00:19,  1.02it/s]

{'loss': 1.3541, 'learning_rate': 3.0401529636711285e-05, 'epoch': 1.94}


 97%|█████████▋| 637/656 [10:51<00:17,  1.07it/s]

{'loss': 1.4143, 'learning_rate': 3.0449330783938817e-05, 'epoch': 1.94}


 97%|█████████▋| 638/656 [10:52<00:15,  1.13it/s]

{'loss': 1.477, 'learning_rate': 3.049713193116635e-05, 'epoch': 1.95}


 97%|█████████▋| 639/656 [10:52<00:13,  1.23it/s]

{'loss': 1.4321, 'learning_rate': 3.0544933078393885e-05, 'epoch': 1.95}


 98%|█████████▊| 640/656 [10:53<00:13,  1.23it/s]

{'loss': 1.4602, 'learning_rate': 3.059273422562142e-05, 'epoch': 1.95}


 98%|█████████▊| 641/656 [10:54<00:12,  1.24it/s]

{'loss': 1.3917, 'learning_rate': 3.064053537284895e-05, 'epoch': 1.95}


 98%|█████████▊| 642/656 [10:55<00:11,  1.18it/s]

{'loss': 1.4263, 'learning_rate': 3.068833652007648e-05, 'epoch': 1.96}


 98%|█████████▊| 643/656 [10:56<00:10,  1.22it/s]

{'loss': 1.447, 'learning_rate': 3.0736137667304014e-05, 'epoch': 1.96}


 98%|█████████▊| 644/656 [10:56<00:09,  1.26it/s]

{'loss': 1.4471, 'learning_rate': 3.078393881453155e-05, 'epoch': 1.96}


 98%|█████████▊| 645/656 [10:57<00:08,  1.27it/s]

{'loss': 1.4151, 'learning_rate': 3.083173996175908e-05, 'epoch': 1.97}


 98%|█████████▊| 646/656 [10:58<00:07,  1.26it/s]

{'loss': 1.4261, 'learning_rate': 3.087954110898662e-05, 'epoch': 1.97}


 99%|█████████▊| 647/656 [10:59<00:07,  1.28it/s]

{'loss': 1.3672, 'learning_rate': 3.092734225621415e-05, 'epoch': 1.97}


 99%|█████████▉| 648/656 [11:00<00:06,  1.23it/s]

{'loss': 1.3916, 'learning_rate': 3.097514340344169e-05, 'epoch': 1.98}


 99%|█████████▉| 649/656 [11:01<00:06,  1.16it/s]

{'loss': 1.392, 'learning_rate': 3.102294455066922e-05, 'epoch': 1.98}


 99%|█████████▉| 650/656 [11:02<00:05,  1.11it/s]

{'loss': 1.3996, 'learning_rate': 3.1070745697896754e-05, 'epoch': 1.98}


 99%|█████████▉| 651/656 [11:02<00:04,  1.14it/s]

{'loss': 1.4293, 'learning_rate': 3.111854684512429e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [11:03<00:03,  1.10it/s]

{'loss': 1.4959, 'learning_rate': 3.116634799235182e-05, 'epoch': 1.99}


100%|█████████▉| 653/656 [11:04<00:02,  1.06it/s]

{'loss': 1.3782, 'learning_rate': 3.121414913957935e-05, 'epoch': 1.99}


100%|█████████▉| 654/656 [11:05<00:01,  1.10it/s]

{'loss': 1.4137, 'learning_rate': 3.1261950286806884e-05, 'epoch': 1.99}


100%|██████████| 656/656 [11:06<00:00,  1.10it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 1.4418, 'learning_rate': 3.1309751434034416e-05, 'epoch': 2.0}
{'loss': 1.3467, 'learning_rate': 3.135755258126195e-05, 'epoch': 2.0}


                                                 
100%|██████████| 656/656 [12:18<00:00,  1.10it/s]Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json


{'eval_loss': 1.3543637990951538, 'eval_accuracy': 0.8914787925105082, 'eval_runtime': 71.9796, 'eval_samples_per_second': 36.358, 'eval_steps_per_second': 1.139, 'epoch': 2.0}


Model weights saved in ./snips_clf/results/checkpoint-656/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf/results/checkpoint-656 (score: 1.3543637990951538).
100%|██████████| 656/656 [12:20<00:00,  1.13s/it]

{'train_runtime': 752.036, 'train_samples_per_second': 27.836, 'train_steps_per_second': 0.872, 'train_loss': 1.7816114474723979, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=1.7816114474723979, metrics={'train_runtime': 752.036, 'train_samples_per_second': 27.836, 'train_steps_per_second': 0.872, 'train_loss': 1.7816114474723979, 'epoch': 2.0})

In [32]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [01:07<00:00,  1.21it/s]


{'eval_loss': 1.3543637990951538,
 'eval_accuracy': 0.8914787925105082,
 'eval_runtime': 68.4554,
 'eval_samples_per_second': 38.229,
 'eval_steps_per_second': 1.198,
 'epoch': 2.0}

In [34]:
# We can now load our fine-tuned from our directory
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "BookRestaurant",
    "1": "SearchCreativeWork",
    "2": "PlayMusic",
    "3": "GetWeather",
    "4": "AddToPlaylist",
    "5": "SearchScreeningEvent",
    "6": "RateBook"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype":

In [38]:
pipe('I want to make a reservation at Clementina for tonight')

[{'label': 'BookRestaurant', 'score': 0.9962743520736694}]

wandb: Network error (ConnectionError), entering retry loop.
