In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

from tqdm import tqdm

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


In [5]:
torch.device('mps')

device(type='mps')

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [7]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [9]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    # TODO: mistake_1:
    # def forward(self, x, mask):
    #     ff_output = self.feed_forward(x)
    #     attn_output = self.self_attn(x, x, x, mask)
    #     x = self.norm1(x + self.dropout(attn_output))
    #     x = self.norm2(x + self.dropout(ff_output))
    #     return x
    
    # correct!
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [10]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    # TODO: mistake_2:     
    # def forward(self, x, enc_output, src_mask, tgt_mask):
    #     attn_output = self.self_attn(x, x, x, tgt_mask)
    #     x = self.norm1(x + self.dropout(attn_output))
    #     ff_output = self.feed_forward(x)
    #     x = self.norm3(x + self.dropout(ff_output))
    #     return x
    
    # correct!
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [11]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [26]:
src_vocab_size = 1000
tgt_vocab_size = 2000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 50
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [27]:
from random import randint

In [28]:
def generate_sample():
    start = randint(1, 944)
    
    increasing = torch.arange(start, start+max_seq_length, dtype=torch.int64)
    trg = increasing + torch.arange(1, max_seq_length + 1, dtype=torch.int64)
  
    src = torch.zeros_like(trg, dtype=torch.int64)
    
    src[:10] = increasing[:10]

    return src, trg

In [29]:
generate_sample()

(tensor([203, 204, 205, 206, 207, 208, 209, 210, 211, 212,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]),
 tensor([204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230,
         232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258,
         260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286,
         288, 290, 292, 294, 296, 298, 300, 302]))

In [30]:
def generate_batch(batch_size: int = 128):
    src_batch = torch.tensor([], dtype=torch.int64)
    trg_batch = torch.tensor([], dtype=torch.int64)

    while src_batch.shape[0] < batch_size:
        src_sample, trg_sample = generate_sample()
        src_batch = torch.cat((src_batch, src_sample.unsqueeze(0)))
        trg_batch = torch.cat((trg_batch, trg_sample.unsqueeze(0)))
    
    return src_batch, trg_batch


In [31]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for step in tqdm(range(1000)):
    src_batch, trg_batch = generate_batch(64)
    optimizer.zero_grad()
    output = transformer(src_batch, src_batch)
    loss = criterion(output.view(-1, output.size(-1)), trg_batch.view(-1))
    loss.backward()
    optimizer.step()
    print(f"Step: {step+1}, Loss: {loss.item()}")

  0%|          | 1/1000 [00:02<35:37,  2.14s/it]

Step: 1, Loss: 7.757936000823975


  0%|          | 2/1000 [00:03<29:08,  1.75s/it]

Step: 2, Loss: 7.649057388305664


  0%|          | 3/1000 [00:05<27:38,  1.66s/it]

Step: 3, Loss: 7.517981052398682


  0%|          | 4/1000 [00:06<25:20,  1.53s/it]

Step: 4, Loss: 7.500527381896973


  0%|          | 5/1000 [00:07<23:56,  1.44s/it]

Step: 5, Loss: 7.498258590698242


  1%|          | 6/1000 [00:09<22:59,  1.39s/it]

Step: 6, Loss: 7.437707901000977


  1%|          | 7/1000 [00:10<22:13,  1.34s/it]

Step: 7, Loss: 7.421270370483398


  1%|          | 8/1000 [00:11<21:52,  1.32s/it]

Step: 8, Loss: 7.394241809844971


  1%|          | 9/1000 [00:12<21:21,  1.29s/it]

Step: 9, Loss: 7.3625383377075195


  1%|          | 10/1000 [00:14<21:05,  1.28s/it]

Step: 10, Loss: 7.340339183807373


  1%|          | 11/1000 [00:15<21:12,  1.29s/it]

Step: 11, Loss: 7.322544574737549


  1%|          | 12/1000 [00:17<25:55,  1.57s/it]

Step: 12, Loss: 7.331706523895264


  1%|▏         | 13/1000 [00:19<26:37,  1.62s/it]

Step: 13, Loss: 7.291641712188721


  1%|▏         | 14/1000 [00:20<26:15,  1.60s/it]

Step: 14, Loss: 7.272925853729248


  2%|▏         | 15/1000 [00:22<24:28,  1.49s/it]

Step: 15, Loss: 7.225790977478027


  2%|▏         | 16/1000 [00:23<23:06,  1.41s/it]

Step: 16, Loss: 7.233592987060547


  2%|▏         | 17/1000 [00:24<22:17,  1.36s/it]

Step: 17, Loss: 7.2215352058410645


  2%|▏         | 18/1000 [00:25<22:07,  1.35s/it]

Step: 18, Loss: 7.2003631591796875


  2%|▏         | 19/1000 [00:27<21:50,  1.34s/it]

Step: 19, Loss: 7.165151596069336


  2%|▏         | 20/1000 [00:28<21:23,  1.31s/it]

Step: 20, Loss: 7.1739068031311035


  2%|▏         | 21/1000 [00:29<20:57,  1.28s/it]

Step: 21, Loss: 7.157436370849609


  2%|▏         | 22/1000 [00:30<20:38,  1.27s/it]

Step: 22, Loss: 7.1514892578125


  2%|▏         | 23/1000 [00:32<20:50,  1.28s/it]

Step: 23, Loss: 7.111783504486084


  2%|▏         | 24/1000 [00:33<20:58,  1.29s/it]

Step: 24, Loss: 7.102738857269287


  2%|▎         | 25/1000 [00:34<20:51,  1.28s/it]

Step: 25, Loss: 7.094117641448975


  3%|▎         | 26/1000 [00:36<20:45,  1.28s/it]

Step: 26, Loss: 7.112886428833008


  3%|▎         | 27/1000 [00:37<20:38,  1.27s/it]

Step: 27, Loss: 7.0828118324279785


  3%|▎         | 28/1000 [00:38<20:25,  1.26s/it]

Step: 28, Loss: 7.065099716186523


  3%|▎         | 29/1000 [00:39<20:15,  1.25s/it]

Step: 29, Loss: 7.033782958984375


  3%|▎         | 30/1000 [00:41<20:07,  1.25s/it]

Step: 30, Loss: 6.9915618896484375


  3%|▎         | 31/1000 [00:42<20:11,  1.25s/it]

Step: 31, Loss: 6.9708685874938965


  3%|▎         | 32/1000 [00:43<20:00,  1.24s/it]

Step: 32, Loss: 6.959544658660889


  3%|▎         | 33/1000 [00:44<20:14,  1.26s/it]

Step: 33, Loss: 6.928696155548096


  3%|▎         | 34/1000 [00:46<20:01,  1.24s/it]

Step: 34, Loss: 6.8937296867370605


  4%|▎         | 35/1000 [00:47<19:57,  1.24s/it]

Step: 35, Loss: 6.85301399230957


  4%|▎         | 36/1000 [00:48<19:48,  1.23s/it]

Step: 36, Loss: 6.848909378051758


  4%|▎         | 37/1000 [00:49<19:51,  1.24s/it]

Step: 37, Loss: 6.846566200256348


  4%|▍         | 38/1000 [00:50<19:44,  1.23s/it]

Step: 38, Loss: 6.758878707885742


  4%|▍         | 39/1000 [00:52<19:49,  1.24s/it]

Step: 39, Loss: 6.752110004425049


  4%|▍         | 40/1000 [00:53<19:54,  1.24s/it]

Step: 40, Loss: 6.67905330657959


  4%|▍         | 41/1000 [00:54<19:44,  1.24s/it]

Step: 41, Loss: 6.688540458679199


  4%|▍         | 42/1000 [00:56<20:18,  1.27s/it]

Step: 42, Loss: 6.619688034057617


  4%|▍         | 43/1000 [00:57<20:36,  1.29s/it]

Step: 43, Loss: 6.617766857147217


  4%|▍         | 44/1000 [00:58<20:40,  1.30s/it]

Step: 44, Loss: 6.5788679122924805


  4%|▍         | 45/1000 [01:00<21:08,  1.33s/it]

Step: 45, Loss: 6.516787052154541


  5%|▍         | 46/1000 [01:01<20:44,  1.30s/it]

Step: 46, Loss: 6.4798688888549805


  5%|▍         | 47/1000 [01:02<20:27,  1.29s/it]

Step: 47, Loss: 6.462921619415283


  5%|▍         | 48/1000 [01:03<20:04,  1.27s/it]

Step: 48, Loss: 6.4069952964782715


  5%|▍         | 49/1000 [01:05<20:07,  1.27s/it]

Step: 49, Loss: 6.382225513458252


  5%|▌         | 50/1000 [01:06<20:02,  1.27s/it]

Step: 50, Loss: 6.350244045257568


  5%|▌         | 51/1000 [01:07<20:20,  1.29s/it]

Step: 51, Loss: 6.310060024261475


  5%|▌         | 52/1000 [01:08<20:08,  1.27s/it]

Step: 52, Loss: 6.263212203979492


  5%|▌         | 53/1000 [01:10<20:14,  1.28s/it]

Step: 53, Loss: 6.2603864669799805


  5%|▌         | 54/1000 [01:11<20:02,  1.27s/it]

Step: 54, Loss: 6.223956108093262


  6%|▌         | 55/1000 [01:12<19:49,  1.26s/it]

Step: 55, Loss: 6.18536376953125


  6%|▌         | 56/1000 [01:13<19:49,  1.26s/it]

Step: 56, Loss: 6.173675537109375


  6%|▌         | 57/1000 [01:15<19:37,  1.25s/it]

Step: 57, Loss: 6.0995001792907715


  6%|▌         | 58/1000 [01:16<19:36,  1.25s/it]

Step: 58, Loss: 6.106044292449951


  6%|▌         | 59/1000 [01:18<21:23,  1.36s/it]

Step: 59, Loss: 6.087432861328125


  6%|▌         | 60/1000 [01:20<26:30,  1.69s/it]

Step: 60, Loss: 6.0728254318237305


  6%|▌         | 61/1000 [01:22<26:17,  1.68s/it]

Step: 61, Loss: 6.0462646484375


  6%|▌         | 62/1000 [01:23<24:32,  1.57s/it]

Step: 62, Loss: 6.029684543609619


  6%|▋         | 63/1000 [01:24<23:10,  1.48s/it]

Step: 63, Loss: 5.987194061279297


  6%|▋         | 64/1000 [01:26<23:26,  1.50s/it]

Step: 64, Loss: 5.930561065673828


  6%|▋         | 65/1000 [01:27<22:34,  1.45s/it]

Step: 65, Loss: 5.9360551834106445


  7%|▋         | 66/1000 [01:28<22:01,  1.41s/it]

Step: 66, Loss: 5.9240641593933105


  7%|▋         | 67/1000 [01:30<22:30,  1.45s/it]

Step: 67, Loss: 5.871568202972412


  7%|▋         | 68/1000 [01:32<23:58,  1.54s/it]

Step: 68, Loss: 5.889871120452881


  7%|▋         | 69/1000 [01:34<28:59,  1.87s/it]

Step: 69, Loss: 5.778552055358887


  7%|▋         | 70/1000 [01:36<26:24,  1.70s/it]

Step: 70, Loss: 5.77528190612793


  7%|▋         | 71/1000 [01:37<24:22,  1.57s/it]

Step: 71, Loss: 5.76908016204834


  7%|▋         | 72/1000 [01:38<22:41,  1.47s/it]

Step: 72, Loss: 5.775423049926758


  7%|▋         | 73/1000 [01:39<21:34,  1.40s/it]

Step: 73, Loss: 5.713735580444336


  7%|▋         | 74/1000 [01:41<20:40,  1.34s/it]

Step: 74, Loss: 5.763482093811035


  8%|▊         | 75/1000 [01:42<20:12,  1.31s/it]

Step: 75, Loss: 5.697798252105713


  8%|▊         | 76/1000 [01:43<19:53,  1.29s/it]

Step: 76, Loss: 5.687540054321289


  8%|▊         | 77/1000 [01:44<19:55,  1.29s/it]

Step: 77, Loss: 5.649975776672363


  8%|▊         | 78/1000 [01:46<19:33,  1.27s/it]

Step: 78, Loss: 5.646329402923584


  8%|▊         | 79/1000 [01:47<20:39,  1.35s/it]

Step: 79, Loss: 5.610534191131592


  8%|▊         | 80/1000 [01:49<21:56,  1.43s/it]

Step: 80, Loss: 5.588037014007568


  8%|▊         | 81/1000 [01:50<21:47,  1.42s/it]

Step: 81, Loss: 5.573785305023193


  8%|▊         | 82/1000 [01:51<21:11,  1.38s/it]

Step: 82, Loss: 5.560961723327637


  8%|▊         | 83/1000 [01:53<21:10,  1.39s/it]

Step: 83, Loss: 5.491409778594971


  8%|▊         | 84/1000 [01:54<20:37,  1.35s/it]

Step: 84, Loss: 5.520762920379639


  8%|▊         | 85/1000 [01:55<20:21,  1.33s/it]

Step: 85, Loss: 5.509378433227539


  9%|▊         | 86/1000 [01:57<20:03,  1.32s/it]

Step: 86, Loss: 5.45472526550293


  9%|▊         | 87/1000 [01:58<20:04,  1.32s/it]

Step: 87, Loss: 5.4167327880859375


  9%|▉         | 88/1000 [01:59<19:55,  1.31s/it]

Step: 88, Loss: 5.434882164001465


  9%|▉         | 89/1000 [02:01<19:52,  1.31s/it]

Step: 89, Loss: 5.427652359008789


  9%|▉         | 90/1000 [02:02<20:15,  1.34s/it]

Step: 90, Loss: 5.40367317199707


  9%|▉         | 91/1000 [02:03<20:15,  1.34s/it]

Step: 91, Loss: 5.3485541343688965


  9%|▉         | 92/1000 [02:05<20:05,  1.33s/it]

Step: 92, Loss: 5.340639591217041


  9%|▉         | 93/1000 [02:06<20:06,  1.33s/it]

Step: 93, Loss: 5.3349409103393555


  9%|▉         | 94/1000 [02:07<20:34,  1.36s/it]

Step: 94, Loss: 5.280211925506592


 10%|▉         | 95/1000 [02:09<20:38,  1.37s/it]

Step: 95, Loss: 5.305182933807373


 10%|▉         | 96/1000 [02:10<20:19,  1.35s/it]

Step: 96, Loss: 5.27972412109375


 10%|▉         | 97/1000 [02:11<19:49,  1.32s/it]

Step: 97, Loss: 5.218614101409912


 10%|▉         | 98/1000 [02:13<19:29,  1.30s/it]

Step: 98, Loss: 5.2193803787231445


 10%|▉         | 99/1000 [02:14<19:12,  1.28s/it]

Step: 99, Loss: 5.1857147216796875


 10%|█         | 100/1000 [02:15<19:02,  1.27s/it]

Step: 100, Loss: 5.204029560089111


 10%|█         | 101/1000 [02:16<18:54,  1.26s/it]

Step: 101, Loss: 5.181591033935547


 10%|█         | 102/1000 [02:18<18:52,  1.26s/it]

Step: 102, Loss: 5.181889057159424


 10%|█         | 103/1000 [02:19<18:53,  1.26s/it]

Step: 103, Loss: 5.154682636260986


 10%|█         | 104/1000 [02:20<18:52,  1.26s/it]

Step: 104, Loss: 5.165631294250488


 10%|█         | 105/1000 [02:21<18:45,  1.26s/it]

Step: 105, Loss: 5.120797157287598


 11%|█         | 106/1000 [02:23<18:49,  1.26s/it]

Step: 106, Loss: 5.081640720367432


 11%|█         | 107/1000 [02:24<18:36,  1.25s/it]

Step: 107, Loss: 5.082715034484863


 11%|█         | 108/1000 [02:25<19:32,  1.31s/it]

Step: 108, Loss: 5.052606582641602


 11%|█         | 109/1000 [02:27<19:38,  1.32s/it]

Step: 109, Loss: 5.0412678718566895


 11%|█         | 110/1000 [02:28<19:23,  1.31s/it]

Step: 110, Loss: 5.036228656768799


 11%|█         | 111/1000 [02:29<19:41,  1.33s/it]

Step: 111, Loss: 4.969249725341797


 11%|█         | 112/1000 [02:31<19:33,  1.32s/it]

Step: 112, Loss: 4.953546524047852


 11%|█▏        | 113/1000 [02:32<19:16,  1.30s/it]

Step: 113, Loss: 4.990079402923584


 11%|█▏        | 114/1000 [02:33<18:57,  1.28s/it]

Step: 114, Loss: 4.932954788208008


 12%|█▏        | 115/1000 [02:34<18:49,  1.28s/it]

Step: 115, Loss: 4.904578685760498


 12%|█▏        | 116/1000 [02:36<19:21,  1.31s/it]

Step: 116, Loss: 4.892603397369385


 12%|█▏        | 117/1000 [02:37<19:52,  1.35s/it]

Step: 117, Loss: 4.863278388977051


 12%|█▏        | 118/1000 [02:39<20:10,  1.37s/it]

Step: 118, Loss: 4.849616527557373


 12%|█▏        | 119/1000 [02:40<19:40,  1.34s/it]

Step: 119, Loss: 4.815040111541748


 12%|█▏        | 120/1000 [02:41<19:19,  1.32s/it]

Step: 120, Loss: 4.835151195526123


 12%|█▏        | 121/1000 [02:42<18:53,  1.29s/it]

Step: 121, Loss: 4.831853866577148


 12%|█▏        | 122/1000 [02:44<18:50,  1.29s/it]

Step: 122, Loss: 4.79379415512085


 12%|█▏        | 123/1000 [02:45<18:37,  1.27s/it]

Step: 123, Loss: 4.741842746734619


 12%|█▏        | 124/1000 [02:46<18:23,  1.26s/it]

Step: 124, Loss: 4.738584518432617


 12%|█▎        | 125/1000 [02:47<18:25,  1.26s/it]

Step: 125, Loss: 4.745553970336914


 13%|█▎        | 126/1000 [02:49<18:19,  1.26s/it]

Step: 126, Loss: 4.7135910987854


 13%|█▎        | 127/1000 [02:50<18:19,  1.26s/it]

Step: 127, Loss: 4.737319469451904


 13%|█▎        | 128/1000 [02:51<18:32,  1.28s/it]

Step: 128, Loss: 4.714951992034912


 13%|█▎        | 129/1000 [02:53<18:23,  1.27s/it]

Step: 129, Loss: 4.685452461242676


 13%|█▎        | 130/1000 [02:54<18:20,  1.26s/it]

Step: 130, Loss: 4.659347057342529


 13%|█▎        | 131/1000 [02:55<18:07,  1.25s/it]

Step: 131, Loss: 4.6660637855529785


 13%|█▎        | 132/1000 [02:56<18:06,  1.25s/it]

Step: 132, Loss: 4.657390594482422


 13%|█▎        | 133/1000 [02:57<17:55,  1.24s/it]

Step: 133, Loss: 4.610848903656006


 13%|█▎        | 134/1000 [02:59<17:57,  1.24s/it]

Step: 134, Loss: 4.610386848449707


 14%|█▎        | 135/1000 [03:00<17:49,  1.24s/it]

Step: 135, Loss: 4.57281494140625


 14%|█▎        | 136/1000 [03:01<17:49,  1.24s/it]

Step: 136, Loss: 4.572502613067627


 14%|█▎        | 137/1000 [03:02<17:50,  1.24s/it]

Step: 137, Loss: 4.5769500732421875


 14%|█▍        | 138/1000 [03:04<17:54,  1.25s/it]

Step: 138, Loss: 4.543405532836914


 14%|█▍        | 139/1000 [03:05<18:02,  1.26s/it]

Step: 139, Loss: 4.5340752601623535


 14%|█▍        | 140/1000 [03:06<18:13,  1.27s/it]

Step: 140, Loss: 4.550960063934326


 14%|█▍        | 141/1000 [03:08<18:12,  1.27s/it]

Step: 141, Loss: 4.480863571166992


 14%|█▍        | 142/1000 [03:09<18:06,  1.27s/it]

Step: 142, Loss: 4.527590274810791


 14%|█▍        | 143/1000 [03:10<17:57,  1.26s/it]

Step: 143, Loss: 4.481218338012695


 14%|█▍        | 144/1000 [03:11<17:53,  1.25s/it]

Step: 144, Loss: 4.460519313812256


 14%|█▍        | 145/1000 [03:13<17:52,  1.25s/it]

Step: 145, Loss: 4.499773979187012


 15%|█▍        | 146/1000 [03:14<17:48,  1.25s/it]

Step: 146, Loss: 4.472424507141113


 15%|█▍        | 147/1000 [03:15<17:46,  1.25s/it]

Step: 147, Loss: 4.420210838317871


 15%|█▍        | 148/1000 [03:16<17:57,  1.27s/it]

Step: 148, Loss: 4.433295726776123


 15%|█▍        | 149/1000 [03:18<17:53,  1.26s/it]

Step: 149, Loss: 4.459924221038818


 15%|█▌        | 150/1000 [03:19<18:14,  1.29s/it]

Step: 150, Loss: 4.422067165374756


 15%|█▌        | 151/1000 [03:20<18:08,  1.28s/it]

Step: 151, Loss: 4.405008792877197


 15%|█▌        | 152/1000 [03:21<18:03,  1.28s/it]

Step: 152, Loss: 4.403439998626709


 15%|█▌        | 153/1000 [03:23<17:47,  1.26s/it]

Step: 153, Loss: 4.353463649749756


 15%|█▌        | 154/1000 [03:24<18:43,  1.33s/it]

Step: 154, Loss: 4.358173847198486


 16%|█▌        | 155/1000 [03:25<18:27,  1.31s/it]

Step: 155, Loss: 4.372148036956787


 16%|█▌        | 156/1000 [03:27<18:13,  1.30s/it]

Step: 156, Loss: 4.3513383865356445


 16%|█▌        | 157/1000 [03:28<17:59,  1.28s/it]

Step: 157, Loss: 4.349155902862549


 16%|█▌        | 158/1000 [03:29<18:14,  1.30s/it]

Step: 158, Loss: 4.304969310760498


 16%|█▌        | 159/1000 [03:31<18:00,  1.28s/it]

Step: 159, Loss: 4.30502986907959


 16%|█▌        | 160/1000 [03:32<18:06,  1.29s/it]

Step: 160, Loss: 4.270330429077148


 16%|█▌        | 161/1000 [03:33<18:29,  1.32s/it]

Step: 161, Loss: 4.285006046295166


 16%|█▌        | 162/1000 [03:35<18:19,  1.31s/it]

Step: 162, Loss: 4.2896928787231445


 16%|█▋        | 163/1000 [03:36<18:00,  1.29s/it]

Step: 163, Loss: 4.270920753479004


 16%|█▋        | 164/1000 [03:37<17:49,  1.28s/it]

Step: 164, Loss: 4.307260513305664


 16%|█▋        | 165/1000 [03:39<20:13,  1.45s/it]

Step: 165, Loss: 4.27582311630249


 17%|█▋        | 166/1000 [03:41<21:22,  1.54s/it]

Step: 166, Loss: 4.259708404541016


 17%|█▋        | 167/1000 [03:42<22:00,  1.59s/it]

Step: 167, Loss: 4.206920623779297


 17%|█▋        | 168/1000 [03:44<20:58,  1.51s/it]

Step: 168, Loss: 4.212064743041992


 17%|█▋        | 169/1000 [03:45<19:50,  1.43s/it]

Step: 169, Loss: 4.1943559646606445


 17%|█▋        | 170/1000 [03:46<19:21,  1.40s/it]

Step: 170, Loss: 4.207618236541748


 17%|█▋        | 171/1000 [03:48<18:57,  1.37s/it]

Step: 171, Loss: 4.202296257019043


 17%|█▋        | 172/1000 [03:49<18:49,  1.36s/it]

Step: 172, Loss: 4.217301845550537


 17%|█▋        | 173/1000 [03:51<20:33,  1.49s/it]

Step: 173, Loss: 4.220984935760498


 17%|█▋        | 174/1000 [03:52<20:19,  1.48s/it]

Step: 174, Loss: 4.205558776855469


 18%|█▊        | 175/1000 [03:53<19:43,  1.43s/it]

Step: 175, Loss: 4.178686618804932


 18%|█▊        | 176/1000 [03:55<21:07,  1.54s/it]

Step: 176, Loss: 4.176748752593994


 18%|█▊        | 177/1000 [03:57<21:29,  1.57s/it]

Step: 177, Loss: 4.188938617706299


 18%|█▊        | 178/1000 [03:58<21:35,  1.58s/it]

Step: 178, Loss: 4.16098165512085


 18%|█▊        | 179/1000 [04:00<20:27,  1.49s/it]

Step: 179, Loss: 4.197214126586914


 18%|█▊        | 180/1000 [04:01<19:21,  1.42s/it]

Step: 180, Loss: 4.153518199920654


 18%|█▊        | 181/1000 [04:02<18:38,  1.37s/it]

Step: 181, Loss: 4.124727249145508


 18%|█▊        | 182/1000 [04:03<18:06,  1.33s/it]

Step: 182, Loss: 4.162786483764648


 18%|█▊        | 183/1000 [04:05<17:42,  1.30s/it]

Step: 183, Loss: 4.1530327796936035


 18%|█▊        | 184/1000 [04:06<17:42,  1.30s/it]

Step: 184, Loss: 4.1389899253845215


 18%|█▊        | 185/1000 [04:07<17:35,  1.30s/it]

Step: 185, Loss: 4.11282205581665


 19%|█▊        | 186/1000 [04:09<17:37,  1.30s/it]

Step: 186, Loss: 4.087357997894287


 19%|█▊        | 187/1000 [04:10<17:26,  1.29s/it]

Step: 187, Loss: 4.100869655609131


 19%|█▉        | 188/1000 [04:11<17:22,  1.28s/it]

Step: 188, Loss: 4.053329944610596


 19%|█▉        | 189/1000 [04:12<17:15,  1.28s/it]

Step: 189, Loss: 4.062793254852295


 19%|█▉        | 190/1000 [04:14<17:01,  1.26s/it]

Step: 190, Loss: 4.057154655456543


 19%|█▉        | 191/1000 [04:15<16:59,  1.26s/it]

Step: 191, Loss: 4.071334362030029


 19%|█▉        | 192/1000 [04:16<16:54,  1.26s/it]

Step: 192, Loss: 4.032976150512695


 19%|█▉        | 193/1000 [04:17<17:01,  1.27s/it]

Step: 193, Loss: 4.0953874588012695


 19%|█▉        | 194/1000 [04:19<16:56,  1.26s/it]

Step: 194, Loss: 4.051149368286133


 20%|█▉        | 195/1000 [04:20<16:53,  1.26s/it]

Step: 195, Loss: 4.0301618576049805


 20%|█▉        | 196/1000 [04:21<16:59,  1.27s/it]

Step: 196, Loss: 4.076319694519043


 20%|█▉        | 197/1000 [04:22<16:54,  1.26s/it]

Step: 197, Loss: 4.050021171569824


 20%|█▉        | 198/1000 [04:24<17:48,  1.33s/it]

Step: 198, Loss: 4.001705646514893


 20%|█▉        | 199/1000 [04:25<18:01,  1.35s/it]

Step: 199, Loss: 4.019362449645996


 20%|██        | 200/1000 [04:27<17:45,  1.33s/it]

Step: 200, Loss: 3.9943268299102783


 20%|██        | 201/1000 [04:28<17:41,  1.33s/it]

Step: 201, Loss: 3.969860315322876


 20%|██        | 202/1000 [04:29<17:25,  1.31s/it]

Step: 202, Loss: 4.010040283203125


 20%|██        | 203/1000 [04:31<17:16,  1.30s/it]

Step: 203, Loss: 3.9960668087005615


 20%|██        | 204/1000 [04:32<17:16,  1.30s/it]

Step: 204, Loss: 3.9568397998809814


 20%|██        | 205/1000 [04:33<17:43,  1.34s/it]

Step: 205, Loss: 3.959383249282837


 21%|██        | 206/1000 [04:35<19:06,  1.44s/it]

Step: 206, Loss: 3.9579503536224365


 21%|██        | 207/1000 [04:36<18:48,  1.42s/it]

Step: 207, Loss: 3.982785940170288


 21%|██        | 208/1000 [04:38<18:21,  1.39s/it]

Step: 208, Loss: 3.932177782058716


 21%|██        | 209/1000 [04:39<17:45,  1.35s/it]

Step: 209, Loss: 3.913715124130249


 21%|██        | 210/1000 [04:40<17:15,  1.31s/it]

Step: 210, Loss: 3.9216561317443848


 21%|██        | 211/1000 [04:41<17:00,  1.29s/it]

Step: 211, Loss: 3.925459861755371


 21%|██        | 212/1000 [04:43<16:46,  1.28s/it]

Step: 212, Loss: 3.923279047012329


 21%|██▏       | 213/1000 [04:44<16:30,  1.26s/it]

Step: 213, Loss: 3.905012845993042


 21%|██▏       | 214/1000 [04:45<16:26,  1.26s/it]

Step: 214, Loss: 3.9076528549194336


 22%|██▏       | 215/1000 [04:46<16:19,  1.25s/it]

Step: 215, Loss: 3.8755078315734863


 22%|██▏       | 216/1000 [04:48<16:18,  1.25s/it]

Step: 216, Loss: 3.8642218112945557


 22%|██▏       | 217/1000 [04:49<16:18,  1.25s/it]

Step: 217, Loss: 3.8720591068267822


 22%|██▏       | 218/1000 [04:50<16:25,  1.26s/it]

Step: 218, Loss: 3.9169797897338867


 22%|██▏       | 219/1000 [04:51<16:25,  1.26s/it]

Step: 219, Loss: 3.8471450805664062


 22%|██▏       | 220/1000 [04:53<16:35,  1.28s/it]

Step: 220, Loss: 3.872720956802368


 22%|██▏       | 221/1000 [04:54<16:27,  1.27s/it]

Step: 221, Loss: 3.909470319747925


 22%|██▏       | 222/1000 [04:55<16:26,  1.27s/it]

Step: 222, Loss: 3.8362624645233154


 22%|██▏       | 223/1000 [04:56<16:11,  1.25s/it]

Step: 223, Loss: 3.8481197357177734


 22%|██▏       | 224/1000 [04:58<16:10,  1.25s/it]

Step: 224, Loss: 3.8453118801116943


 22%|██▎       | 225/1000 [04:59<16:06,  1.25s/it]

Step: 225, Loss: 3.836557388305664


 23%|██▎       | 226/1000 [05:00<15:57,  1.24s/it]

Step: 226, Loss: 3.856616497039795


 23%|██▎       | 227/1000 [05:01<15:56,  1.24s/it]

Step: 227, Loss: 3.8662893772125244


 23%|██▎       | 228/1000 [05:03<15:54,  1.24s/it]

Step: 228, Loss: 3.8431692123413086


 23%|██▎       | 229/1000 [05:04<15:52,  1.23s/it]

Step: 229, Loss: 3.7978100776672363


 23%|██▎       | 230/1000 [05:05<15:53,  1.24s/it]

Step: 230, Loss: 3.7902987003326416


 23%|██▎       | 231/1000 [05:06<15:59,  1.25s/it]

Step: 231, Loss: 3.8004631996154785


 23%|██▎       | 232/1000 [05:08<16:05,  1.26s/it]

Step: 232, Loss: 3.78486704826355


 23%|██▎       | 233/1000 [05:09<16:02,  1.26s/it]

Step: 233, Loss: 3.7929537296295166


 23%|██▎       | 234/1000 [05:10<16:03,  1.26s/it]

Step: 234, Loss: 3.7710230350494385


 24%|██▎       | 235/1000 [05:11<16:01,  1.26s/it]

Step: 235, Loss: 3.74287486076355


 24%|██▎       | 236/1000 [05:13<15:53,  1.25s/it]

Step: 236, Loss: 3.7535247802734375


 24%|██▎       | 237/1000 [05:14<15:49,  1.25s/it]

Step: 237, Loss: 3.82692813873291


 24%|██▍       | 238/1000 [05:15<15:46,  1.24s/it]

Step: 238, Loss: 3.749591112136841


 24%|██▍       | 239/1000 [05:16<15:48,  1.25s/it]

Step: 239, Loss: 3.758964776992798


 24%|██▍       | 240/1000 [05:18<15:57,  1.26s/it]

Step: 240, Loss: 3.751643180847168


 24%|██▍       | 241/1000 [05:19<15:53,  1.26s/it]

Step: 241, Loss: 3.7799408435821533


 24%|██▍       | 242/1000 [05:20<15:57,  1.26s/it]

Step: 242, Loss: 3.732137441635132


 24%|██▍       | 243/1000 [05:22<17:41,  1.40s/it]

Step: 243, Loss: 3.7284984588623047


 24%|██▍       | 244/1000 [05:23<17:18,  1.37s/it]

Step: 244, Loss: 3.7200543880462646


 24%|██▍       | 245/1000 [05:24<16:54,  1.34s/it]

Step: 245, Loss: 3.680807113647461


 25%|██▍       | 246/1000 [05:26<16:31,  1.31s/it]

Step: 246, Loss: 3.7074708938598633


 25%|██▍       | 247/1000 [05:27<16:28,  1.31s/it]

Step: 247, Loss: 3.695432186126709


 25%|██▍       | 248/1000 [05:28<16:08,  1.29s/it]

Step: 248, Loss: 3.7193219661712646


 25%|██▍       | 249/1000 [05:29<16:04,  1.28s/it]

Step: 249, Loss: 3.6781277656555176


 25%|██▌       | 250/1000 [05:31<16:06,  1.29s/it]

Step: 250, Loss: 3.6468114852905273


 25%|██▌       | 251/1000 [05:32<15:59,  1.28s/it]

Step: 251, Loss: 3.6567749977111816


 25%|██▌       | 252/1000 [05:33<15:47,  1.27s/it]

Step: 252, Loss: 3.694239854812622


 25%|██▌       | 253/1000 [05:35<15:44,  1.26s/it]

Step: 253, Loss: 3.666494131088257


 25%|██▌       | 254/1000 [05:36<15:47,  1.27s/it]

Step: 254, Loss: 3.671313524246216


 26%|██▌       | 255/1000 [05:37<15:34,  1.25s/it]

Step: 255, Loss: 3.6636922359466553


 26%|██▌       | 256/1000 [05:38<15:34,  1.26s/it]

Step: 256, Loss: 3.658046245574951


 26%|██▌       | 257/1000 [05:40<15:28,  1.25s/it]

Step: 257, Loss: 3.654308795928955


 26%|██▌       | 258/1000 [05:41<15:29,  1.25s/it]

Step: 258, Loss: 3.6276373863220215


 26%|██▌       | 259/1000 [05:42<15:33,  1.26s/it]

Step: 259, Loss: 3.641451358795166


 26%|██▌       | 260/1000 [05:43<15:26,  1.25s/it]

Step: 260, Loss: 3.6371564865112305


 26%|██▌       | 261/1000 [05:45<15:42,  1.28s/it]

Step: 261, Loss: 3.665844678878784


 26%|██▌       | 262/1000 [05:46<15:31,  1.26s/it]

Step: 262, Loss: 3.664027214050293


 26%|██▋       | 263/1000 [05:47<15:23,  1.25s/it]

Step: 263, Loss: 3.66843843460083


 26%|██▋       | 264/1000 [05:48<15:18,  1.25s/it]

Step: 264, Loss: 3.6146280765533447


 26%|██▋       | 265/1000 [05:50<15:13,  1.24s/it]

Step: 265, Loss: 3.6885910034179688


 27%|██▋       | 266/1000 [05:51<15:15,  1.25s/it]

Step: 266, Loss: 3.6669275760650635


 27%|██▋       | 267/1000 [05:52<15:18,  1.25s/it]

Step: 267, Loss: 3.645447015762329


 27%|██▋       | 268/1000 [05:53<15:28,  1.27s/it]

Step: 268, Loss: 3.6247658729553223


 27%|██▋       | 269/1000 [05:55<15:33,  1.28s/it]

Step: 269, Loss: 3.6574277877807617


 27%|██▋       | 270/1000 [05:56<15:59,  1.31s/it]

Step: 270, Loss: 3.6138007640838623


 27%|██▋       | 271/1000 [05:57<15:47,  1.30s/it]

Step: 271, Loss: 3.5988333225250244


 27%|██▋       | 272/1000 [05:59<15:38,  1.29s/it]

Step: 272, Loss: 3.5782318115234375


 27%|██▋       | 273/1000 [06:00<15:31,  1.28s/it]

Step: 273, Loss: 3.591651678085327


 27%|██▋       | 274/1000 [06:01<15:29,  1.28s/it]

Step: 274, Loss: 3.5902514457702637


 28%|██▊       | 275/1000 [06:02<15:24,  1.28s/it]

Step: 275, Loss: 3.615333318710327


 28%|██▊       | 276/1000 [06:04<15:28,  1.28s/it]

Step: 276, Loss: 3.5681467056274414


 28%|██▊       | 277/1000 [06:05<15:23,  1.28s/it]

Step: 277, Loss: 3.562713861465454


 28%|██▊       | 278/1000 [06:06<15:25,  1.28s/it]

Step: 278, Loss: 3.5847504138946533


 28%|██▊       | 279/1000 [06:08<15:12,  1.27s/it]

Step: 279, Loss: 3.5561633110046387


 28%|██▊       | 280/1000 [06:09<15:12,  1.27s/it]

Step: 280, Loss: 3.580781936645508


 28%|██▊       | 281/1000 [06:10<15:04,  1.26s/it]

Step: 281, Loss: 3.5746865272521973


 28%|██▊       | 282/1000 [06:11<14:55,  1.25s/it]

Step: 282, Loss: 3.586038589477539


 28%|██▊       | 283/1000 [06:12<14:47,  1.24s/it]

Step: 283, Loss: 3.569401502609253


 28%|██▊       | 284/1000 [06:14<14:46,  1.24s/it]

Step: 284, Loss: 3.5674185752868652


 28%|██▊       | 285/1000 [06:15<14:40,  1.23s/it]

Step: 285, Loss: 3.5346596240997314


 29%|██▊       | 286/1000 [06:16<14:44,  1.24s/it]

Step: 286, Loss: 3.549074649810791


 29%|██▊       | 287/1000 [06:17<14:49,  1.25s/it]

Step: 287, Loss: 3.5683350563049316


 29%|██▉       | 288/1000 [06:19<14:54,  1.26s/it]

Step: 288, Loss: 3.5553719997406006


 29%|██▉       | 289/1000 [06:20<14:51,  1.25s/it]

Step: 289, Loss: 3.5246775150299072


 29%|██▉       | 290/1000 [06:21<14:57,  1.26s/it]

Step: 290, Loss: 3.5182387828826904


 29%|██▉       | 291/1000 [06:22<14:52,  1.26s/it]

Step: 291, Loss: 3.5400147438049316


 29%|██▉       | 292/1000 [06:24<15:01,  1.27s/it]

Step: 292, Loss: 3.5014405250549316


 29%|██▉       | 293/1000 [06:25<14:58,  1.27s/it]

Step: 293, Loss: 3.492323875427246


 29%|██▉       | 294/1000 [06:26<14:56,  1.27s/it]

Step: 294, Loss: 3.472823143005371


 30%|██▉       | 295/1000 [06:28<14:54,  1.27s/it]

Step: 295, Loss: 3.5406668186187744


 30%|██▉       | 296/1000 [06:29<15:22,  1.31s/it]

Step: 296, Loss: 3.4918460845947266


 30%|██▉       | 297/1000 [06:31<16:02,  1.37s/it]

Step: 297, Loss: 3.5212554931640625


 30%|██▉       | 298/1000 [06:32<15:49,  1.35s/it]

Step: 298, Loss: 3.48414945602417


 30%|██▉       | 299/1000 [06:33<15:33,  1.33s/it]

Step: 299, Loss: 3.507199287414551


 30%|███       | 300/1000 [06:34<15:16,  1.31s/it]

Step: 300, Loss: 3.4811859130859375


 30%|███       | 301/1000 [06:36<15:02,  1.29s/it]

Step: 301, Loss: 3.503877878189087


 30%|███       | 302/1000 [06:37<14:47,  1.27s/it]

Step: 302, Loss: 3.4691154956817627


 30%|███       | 303/1000 [06:38<14:43,  1.27s/it]

Step: 303, Loss: 3.4486992359161377


 30%|███       | 304/1000 [06:39<14:42,  1.27s/it]

Step: 304, Loss: 3.469745397567749


 30%|███       | 305/1000 [06:41<14:46,  1.28s/it]

Step: 305, Loss: 3.5313384532928467


 31%|███       | 306/1000 [06:42<14:45,  1.28s/it]

Step: 306, Loss: 3.4560067653656006


 31%|███       | 307/1000 [06:43<14:42,  1.27s/it]

Step: 307, Loss: 3.4531161785125732


 31%|███       | 308/1000 [06:44<14:34,  1.26s/it]

Step: 308, Loss: 3.462815284729004


 31%|███       | 309/1000 [06:46<14:32,  1.26s/it]

Step: 309, Loss: 3.4855263233184814


 31%|███       | 310/1000 [06:47<14:26,  1.26s/it]

Step: 310, Loss: 3.4063587188720703


 31%|███       | 311/1000 [06:48<14:18,  1.25s/it]

Step: 311, Loss: 3.4757583141326904


 31%|███       | 312/1000 [06:49<14:13,  1.24s/it]

Step: 312, Loss: 3.4876868724823


 31%|███▏      | 313/1000 [06:51<14:19,  1.25s/it]

Step: 313, Loss: 3.467649221420288


 31%|███▏      | 314/1000 [06:52<14:21,  1.26s/it]

Step: 314, Loss: 3.407733678817749


 32%|███▏      | 315/1000 [06:53<14:19,  1.26s/it]

Step: 315, Loss: 3.5096988677978516


 32%|███▏      | 316/1000 [06:54<14:26,  1.27s/it]

Step: 316, Loss: 3.472198486328125


 32%|███▏      | 317/1000 [06:56<14:46,  1.30s/it]

Step: 317, Loss: 3.459902286529541


 32%|███▏      | 318/1000 [06:57<14:40,  1.29s/it]

Step: 318, Loss: 3.438505172729492


 32%|███▏      | 319/1000 [06:58<14:35,  1.29s/it]

Step: 319, Loss: 3.413630485534668


 32%|███▏      | 320/1000 [07:00<14:25,  1.27s/it]

Step: 320, Loss: 3.4753425121307373


 32%|███▏      | 321/1000 [07:01<14:17,  1.26s/it]

Step: 321, Loss: 3.4221858978271484


 32%|███▏      | 322/1000 [07:02<14:13,  1.26s/it]

Step: 322, Loss: 3.446553945541382


 32%|███▏      | 323/1000 [07:03<14:10,  1.26s/it]

Step: 323, Loss: 3.3927221298217773


 32%|███▏      | 324/1000 [07:05<14:11,  1.26s/it]

Step: 324, Loss: 3.4272501468658447


 32%|███▎      | 325/1000 [07:06<14:10,  1.26s/it]

Step: 325, Loss: 3.3979332447052


 33%|███▎      | 326/1000 [07:07<14:10,  1.26s/it]

Step: 326, Loss: 3.3851675987243652


 33%|███▎      | 327/1000 [07:08<14:10,  1.26s/it]

Step: 327, Loss: 3.397204875946045


 33%|███▎      | 328/1000 [07:10<14:16,  1.27s/it]

Step: 328, Loss: 3.4431874752044678


 33%|███▎      | 329/1000 [07:11<14:35,  1.31s/it]

Step: 329, Loss: 3.4281978607177734


 33%|███▎      | 330/1000 [07:12<14:31,  1.30s/it]

Step: 330, Loss: 3.439816474914551


 33%|███▎      | 331/1000 [07:14<14:22,  1.29s/it]

Step: 331, Loss: 3.3819332122802734


 33%|███▎      | 332/1000 [07:15<14:14,  1.28s/it]

Step: 332, Loss: 3.450742483139038


 33%|███▎      | 333/1000 [07:16<14:08,  1.27s/it]

Step: 333, Loss: 3.366136074066162


 33%|███▎      | 334/1000 [07:17<14:08,  1.27s/it]

Step: 334, Loss: 3.3751797676086426


 34%|███▎      | 335/1000 [07:19<14:08,  1.28s/it]

Step: 335, Loss: 3.362480878829956


 34%|███▎      | 336/1000 [07:20<14:08,  1.28s/it]

Step: 336, Loss: 3.344026803970337


 34%|███▎      | 337/1000 [07:21<14:10,  1.28s/it]

Step: 337, Loss: 3.3405041694641113


 34%|███▍      | 338/1000 [07:23<15:32,  1.41s/it]

Step: 338, Loss: 3.3461713790893555


 34%|███▍      | 339/1000 [07:24<15:16,  1.39s/it]

Step: 339, Loss: 3.327960729598999


 34%|███▍      | 340/1000 [07:26<14:58,  1.36s/it]

Step: 340, Loss: 3.3461594581604004


 34%|███▍      | 341/1000 [07:27<14:46,  1.35s/it]

Step: 341, Loss: 3.3721747398376465


 34%|███▍      | 342/1000 [07:28<14:37,  1.33s/it]

Step: 342, Loss: 3.332125663757324


 34%|███▍      | 343/1000 [07:30<15:39,  1.43s/it]

Step: 343, Loss: 3.3080453872680664


 34%|███▍      | 344/1000 [07:31<15:11,  1.39s/it]

Step: 344, Loss: 3.36171293258667


 34%|███▍      | 345/1000 [07:33<15:13,  1.39s/it]

Step: 345, Loss: 3.3261029720306396


 35%|███▍      | 346/1000 [07:34<14:54,  1.37s/it]

Step: 346, Loss: 3.329233169555664


 35%|███▍      | 347/1000 [07:35<14:40,  1.35s/it]

Step: 347, Loss: 3.320713758468628


 35%|███▍      | 348/1000 [07:37<14:25,  1.33s/it]

Step: 348, Loss: 3.419452428817749


 35%|███▍      | 349/1000 [07:38<14:10,  1.31s/it]

Step: 349, Loss: 3.3562302589416504


 35%|███▌      | 350/1000 [07:39<13:54,  1.28s/it]

Step: 350, Loss: 3.299264907836914


 35%|███▌      | 351/1000 [07:40<13:44,  1.27s/it]

Step: 351, Loss: 3.361452579498291


 35%|███▌      | 352/1000 [07:41<13:38,  1.26s/it]

Step: 352, Loss: 3.333521842956543


 35%|███▌      | 353/1000 [07:43<13:33,  1.26s/it]

Step: 353, Loss: 3.285909652709961


 35%|███▌      | 354/1000 [07:44<13:31,  1.26s/it]

Step: 354, Loss: 3.342846632003784


 36%|███▌      | 355/1000 [07:45<13:27,  1.25s/it]

Step: 355, Loss: 3.3287832736968994


 36%|███▌      | 356/1000 [07:46<13:24,  1.25s/it]

Step: 356, Loss: 3.3062963485717773


 36%|███▌      | 357/1000 [07:48<13:28,  1.26s/it]

Step: 357, Loss: 3.3039891719818115


 36%|███▌      | 358/1000 [07:49<13:29,  1.26s/it]

Step: 358, Loss: 3.2743959426879883


 36%|███▌      | 359/1000 [07:50<13:38,  1.28s/it]

Step: 359, Loss: 3.289552688598633


 36%|███▌      | 360/1000 [07:52<14:16,  1.34s/it]

Step: 360, Loss: 3.280433416366577


 36%|███▌      | 361/1000 [07:53<14:04,  1.32s/it]

Step: 361, Loss: 3.306694984436035


 36%|███▌      | 362/1000 [07:54<13:59,  1.32s/it]

Step: 362, Loss: 3.267345666885376


 36%|███▋      | 363/1000 [07:56<13:50,  1.30s/it]

Step: 363, Loss: 3.2637436389923096


 36%|███▋      | 364/1000 [07:57<13:46,  1.30s/it]

Step: 364, Loss: 3.3151791095733643


 36%|███▋      | 365/1000 [07:58<13:42,  1.30s/it]

Step: 365, Loss: 3.2542388439178467


 37%|███▋      | 366/1000 [08:00<13:41,  1.30s/it]

Step: 366, Loss: 3.2982516288757324


 37%|███▋      | 367/1000 [08:01<13:34,  1.29s/it]

Step: 367, Loss: 3.296983242034912


 37%|███▋      | 368/1000 [08:02<13:22,  1.27s/it]

Step: 368, Loss: 3.2589592933654785


 37%|███▋      | 369/1000 [08:03<13:24,  1.27s/it]

Step: 369, Loss: 3.2315280437469482


 37%|███▋      | 370/1000 [08:05<13:12,  1.26s/it]

Step: 370, Loss: 3.2417714595794678


 37%|███▋      | 371/1000 [08:06<13:05,  1.25s/it]

Step: 371, Loss: 3.243640422821045


 37%|███▋      | 372/1000 [08:07<13:02,  1.25s/it]

Step: 372, Loss: 3.2473196983337402


 37%|███▋      | 373/1000 [08:08<13:02,  1.25s/it]

Step: 373, Loss: 3.2507429122924805


 37%|███▋      | 374/1000 [08:10<13:06,  1.26s/it]

Step: 374, Loss: 3.251138687133789


 38%|███▊      | 375/1000 [08:11<13:12,  1.27s/it]

Step: 375, Loss: 3.2032370567321777


 38%|███▊      | 376/1000 [08:12<13:38,  1.31s/it]

Step: 376, Loss: 3.213533878326416


 38%|███▊      | 377/1000 [08:14<13:35,  1.31s/it]

Step: 377, Loss: 3.2388999462127686


 38%|███▊      | 378/1000 [08:15<13:27,  1.30s/it]

Step: 378, Loss: 3.248278856277466


 38%|███▊      | 379/1000 [08:16<13:18,  1.29s/it]

Step: 379, Loss: 3.2636101245880127


 38%|███▊      | 380/1000 [08:17<13:09,  1.27s/it]

Step: 380, Loss: 3.2333478927612305


 38%|███▊      | 381/1000 [08:19<13:03,  1.27s/it]

Step: 381, Loss: 3.240752935409546


 38%|███▊      | 382/1000 [08:20<13:06,  1.27s/it]

Step: 382, Loss: 3.219594955444336


 38%|███▊      | 383/1000 [08:21<14:05,  1.37s/it]

Step: 383, Loss: 3.22562837600708


 38%|███▊      | 384/1000 [08:23<14:45,  1.44s/it]

Step: 384, Loss: 3.203460454940796


 38%|███▊      | 385/1000 [08:24<14:14,  1.39s/it]

Step: 385, Loss: 3.231931447982788


 39%|███▊      | 386/1000 [08:26<13:55,  1.36s/it]

Step: 386, Loss: 3.2124147415161133


 39%|███▊      | 387/1000 [08:27<13:45,  1.35s/it]

Step: 387, Loss: 3.1950018405914307


 39%|███▉      | 388/1000 [08:28<13:35,  1.33s/it]

Step: 388, Loss: 3.221343994140625


 39%|███▉      | 389/1000 [08:30<13:22,  1.31s/it]

Step: 389, Loss: 3.1653833389282227


 39%|███▉      | 390/1000 [08:31<13:08,  1.29s/it]

Step: 390, Loss: 3.230720281600952


 39%|███▉      | 391/1000 [08:32<13:01,  1.28s/it]

Step: 391, Loss: 3.1879947185516357


 39%|███▉      | 392/1000 [08:33<12:53,  1.27s/it]

Step: 392, Loss: 3.2012929916381836


 39%|███▉      | 393/1000 [08:34<12:43,  1.26s/it]

Step: 393, Loss: 3.2179272174835205


 39%|███▉      | 394/1000 [08:36<12:40,  1.26s/it]

Step: 394, Loss: 3.2081549167633057


 40%|███▉      | 395/1000 [08:37<12:41,  1.26s/it]

Step: 395, Loss: 3.201523542404175


 40%|███▉      | 396/1000 [08:38<12:36,  1.25s/it]

Step: 396, Loss: 3.1787567138671875


 40%|███▉      | 397/1000 [08:39<12:34,  1.25s/it]

Step: 397, Loss: 3.172872304916382


 40%|███▉      | 398/1000 [08:41<12:42,  1.27s/it]

Step: 398, Loss: 3.190333366394043


 40%|███▉      | 399/1000 [08:42<12:45,  1.27s/it]

Step: 399, Loss: 3.1867239475250244


 40%|████      | 400/1000 [08:43<12:52,  1.29s/it]

Step: 400, Loss: 3.1632955074310303


 40%|████      | 401/1000 [08:45<12:45,  1.28s/it]

Step: 401, Loss: 3.1798343658447266


 40%|████      | 402/1000 [08:46<12:31,  1.26s/it]

Step: 402, Loss: 3.1416432857513428


 40%|████      | 403/1000 [08:47<12:31,  1.26s/it]

Step: 403, Loss: 3.1981003284454346


 40%|████      | 404/1000 [08:48<12:21,  1.24s/it]

Step: 404, Loss: 3.1984033584594727


 40%|████      | 405/1000 [08:50<12:25,  1.25s/it]

Step: 405, Loss: 3.1661806106567383


 41%|████      | 406/1000 [08:51<12:37,  1.27s/it]

Step: 406, Loss: 3.1429615020751953


 41%|████      | 407/1000 [08:52<12:49,  1.30s/it]

Step: 407, Loss: 3.190772294998169


 41%|████      | 408/1000 [08:54<12:46,  1.29s/it]

Step: 408, Loss: 3.159090518951416


 41%|████      | 409/1000 [08:55<13:53,  1.41s/it]

Step: 409, Loss: 3.156099319458008


 41%|████      | 410/1000 [08:57<14:35,  1.48s/it]

Step: 410, Loss: 3.1448473930358887


 41%|████      | 411/1000 [08:59<14:56,  1.52s/it]

Step: 411, Loss: 3.1392717361450195


 41%|████      | 412/1000 [09:00<14:15,  1.45s/it]

Step: 412, Loss: 3.15317964553833


 41%|████▏     | 413/1000 [09:02<14:56,  1.53s/it]

Step: 413, Loss: 3.1503067016601562


 41%|████▏     | 414/1000 [09:03<14:30,  1.49s/it]

Step: 414, Loss: 3.142336130142212


 42%|████▏     | 415/1000 [09:05<15:21,  1.58s/it]

Step: 415, Loss: 3.140195369720459


 42%|████▏     | 416/1000 [09:06<14:50,  1.53s/it]

Step: 416, Loss: 3.162412643432617


 42%|████▏     | 417/1000 [09:07<14:08,  1.46s/it]

Step: 417, Loss: 3.123976945877075


 42%|████▏     | 418/1000 [09:09<13:50,  1.43s/it]

Step: 418, Loss: 3.133235216140747


 42%|████▏     | 419/1000 [09:10<13:34,  1.40s/it]

Step: 419, Loss: 3.1624960899353027


 42%|████▏     | 420/1000 [09:11<13:10,  1.36s/it]

Step: 420, Loss: 3.160615921020508


 42%|████▏     | 421/1000 [09:13<13:04,  1.35s/it]

Step: 421, Loss: 3.1847782135009766


 42%|████▏     | 422/1000 [09:14<12:48,  1.33s/it]

Step: 422, Loss: 3.1310818195343018


 42%|████▏     | 423/1000 [09:15<12:44,  1.33s/it]

Step: 423, Loss: 3.132660150527954


 42%|████▏     | 424/1000 [09:17<12:38,  1.32s/it]

Step: 424, Loss: 3.166907548904419


 42%|████▎     | 425/1000 [09:18<12:33,  1.31s/it]

Step: 425, Loss: 3.19795560836792


 43%|████▎     | 426/1000 [09:19<13:16,  1.39s/it]

Step: 426, Loss: 3.158907413482666


 43%|████▎     | 427/1000 [09:21<12:57,  1.36s/it]

Step: 427, Loss: 3.1842963695526123


 43%|████▎     | 428/1000 [09:22<12:50,  1.35s/it]

Step: 428, Loss: 3.1445696353912354


 43%|████▎     | 429/1000 [09:23<12:31,  1.32s/it]

Step: 429, Loss: 3.1988494396209717


 43%|████▎     | 430/1000 [09:25<12:23,  1.30s/it]

Step: 430, Loss: 3.1075637340545654


 43%|████▎     | 431/1000 [09:26<12:21,  1.30s/it]

Step: 431, Loss: 3.10007381439209


 43%|████▎     | 432/1000 [09:27<12:23,  1.31s/it]

Step: 432, Loss: 3.1275899410247803


 43%|████▎     | 433/1000 [09:28<12:19,  1.30s/it]

Step: 433, Loss: 3.1355948448181152


 43%|████▎     | 434/1000 [09:30<12:11,  1.29s/it]

Step: 434, Loss: 3.141667127609253


 44%|████▎     | 435/1000 [09:31<12:00,  1.28s/it]

Step: 435, Loss: 3.105198860168457


 44%|████▎     | 436/1000 [09:32<11:58,  1.27s/it]

Step: 436, Loss: 3.082204818725586


 44%|████▎     | 437/1000 [09:34<11:58,  1.28s/it]

Step: 437, Loss: 3.129842758178711


 44%|████▍     | 438/1000 [09:35<11:56,  1.27s/it]

Step: 438, Loss: 3.1128902435302734


 44%|████▍     | 439/1000 [09:36<11:58,  1.28s/it]

Step: 439, Loss: 3.0815558433532715


 44%|████▍     | 440/1000 [09:37<12:10,  1.31s/it]

Step: 440, Loss: 3.105628252029419


 44%|████▍     | 441/1000 [09:39<12:04,  1.30s/it]

Step: 441, Loss: 3.0676538944244385


 44%|████▍     | 442/1000 [09:40<12:09,  1.31s/it]

Step: 442, Loss: 3.122978925704956


 44%|████▍     | 443/1000 [09:41<11:55,  1.29s/it]

Step: 443, Loss: 3.1003966331481934


 44%|████▍     | 444/1000 [09:43<11:49,  1.28s/it]

Step: 444, Loss: 3.0869405269622803


 44%|████▍     | 445/1000 [09:44<11:57,  1.29s/it]

Step: 445, Loss: 3.063342332839966


 45%|████▍     | 446/1000 [09:45<11:58,  1.30s/it]

Step: 446, Loss: 3.130082130432129


 45%|████▍     | 447/1000 [09:47<12:02,  1.31s/it]

Step: 447, Loss: 3.0916836261749268


 45%|████▍     | 448/1000 [09:48<11:52,  1.29s/it]

Step: 448, Loss: 3.141630172729492


 45%|████▍     | 449/1000 [09:49<11:52,  1.29s/it]

Step: 449, Loss: 3.106572151184082


 45%|████▌     | 450/1000 [09:50<11:50,  1.29s/it]

Step: 450, Loss: 3.1095242500305176


 45%|████▌     | 451/1000 [09:52<11:58,  1.31s/it]

Step: 451, Loss: 3.1232213973999023


 45%|████▌     | 452/1000 [09:53<11:55,  1.31s/it]

Step: 452, Loss: 3.1005055904388428


 45%|████▌     | 453/1000 [09:54<11:50,  1.30s/it]

Step: 453, Loss: 3.061790704727173


 45%|████▌     | 454/1000 [09:56<11:49,  1.30s/it]

Step: 454, Loss: 3.1216437816619873


 46%|████▌     | 455/1000 [09:57<11:43,  1.29s/it]

Step: 455, Loss: 3.1413345336914062


 46%|████▌     | 456/1000 [09:58<11:40,  1.29s/it]

Step: 456, Loss: 3.090900182723999


 46%|████▌     | 457/1000 [09:59<11:38,  1.29s/it]

Step: 457, Loss: 3.0845141410827637


 46%|████▌     | 458/1000 [10:01<11:30,  1.27s/it]

Step: 458, Loss: 3.0551466941833496


 46%|████▌     | 459/1000 [10:02<11:21,  1.26s/it]

Step: 459, Loss: 3.1110761165618896


 46%|████▌     | 460/1000 [10:03<11:16,  1.25s/it]

Step: 460, Loss: 3.0768237113952637


 46%|████▌     | 461/1000 [10:04<11:11,  1.25s/it]

Step: 461, Loss: 3.039759635925293


 46%|████▌     | 462/1000 [10:06<11:08,  1.24s/it]

Step: 462, Loss: 3.0821990966796875


 46%|████▋     | 463/1000 [10:07<11:11,  1.25s/it]

Step: 463, Loss: 3.068343162536621


 46%|████▋     | 464/1000 [10:08<12:02,  1.35s/it]

Step: 464, Loss: 3.077148675918579


 46%|████▋     | 465/1000 [10:10<12:02,  1.35s/it]

Step: 465, Loss: 3.055300712585449


 47%|████▋     | 466/1000 [10:11<11:53,  1.34s/it]

Step: 466, Loss: 3.0511279106140137


 47%|████▋     | 467/1000 [10:12<11:47,  1.33s/it]

Step: 467, Loss: 3.030830144882202


 47%|████▋     | 468/1000 [10:14<11:40,  1.32s/it]

Step: 468, Loss: 3.0252718925476074


 47%|████▋     | 469/1000 [10:15<11:28,  1.30s/it]

Step: 469, Loss: 3.0724050998687744


 47%|████▋     | 470/1000 [10:16<11:25,  1.29s/it]

Step: 470, Loss: 3.003514051437378


 47%|████▋     | 471/1000 [10:18<11:19,  1.28s/it]

Step: 471, Loss: 3.0668325424194336


 47%|████▋     | 472/1000 [10:19<11:18,  1.28s/it]

Step: 472, Loss: 3.026848793029785


 47%|████▋     | 473/1000 [10:20<11:14,  1.28s/it]

Step: 473, Loss: 2.963979482650757


 47%|████▋     | 474/1000 [10:21<11:12,  1.28s/it]

Step: 474, Loss: 2.9832139015197754


 48%|████▊     | 475/1000 [10:23<11:10,  1.28s/it]

Step: 475, Loss: 3.0122058391571045


 48%|████▊     | 476/1000 [10:24<11:11,  1.28s/it]

Step: 476, Loss: 2.9988837242126465


 48%|████▊     | 477/1000 [10:25<11:08,  1.28s/it]

Step: 477, Loss: 3.047146797180176


 48%|████▊     | 478/1000 [10:26<11:08,  1.28s/it]

Step: 478, Loss: 3.0078344345092773


 48%|████▊     | 479/1000 [10:28<11:04,  1.28s/it]

Step: 479, Loss: 3.0138869285583496


 48%|████▊     | 480/1000 [10:29<11:05,  1.28s/it]

Step: 480, Loss: 3.016190528869629


 48%|████▊     | 481/1000 [10:30<11:15,  1.30s/it]

Step: 481, Loss: 2.988494634628296


 48%|████▊     | 482/1000 [10:32<11:26,  1.33s/it]

Step: 482, Loss: 3.0121893882751465


 48%|████▊     | 483/1000 [10:33<11:27,  1.33s/it]

Step: 483, Loss: 3.0362377166748047


 48%|████▊     | 484/1000 [10:34<11:17,  1.31s/it]

Step: 484, Loss: 3.008971929550171


 48%|████▊     | 485/1000 [10:36<11:07,  1.30s/it]

Step: 485, Loss: 3.0512847900390625


 49%|████▊     | 486/1000 [10:37<12:14,  1.43s/it]

Step: 486, Loss: 3.0065882205963135


 49%|████▊     | 487/1000 [10:39<11:58,  1.40s/it]

Step: 487, Loss: 3.0084950923919678


 49%|████▉     | 488/1000 [10:40<11:43,  1.37s/it]

Step: 488, Loss: 2.986138343811035


 49%|████▉     | 489/1000 [10:41<11:21,  1.33s/it]

Step: 489, Loss: 2.9996490478515625


 49%|████▉     | 490/1000 [10:43<11:54,  1.40s/it]

Step: 490, Loss: 2.9731671810150146


 49%|████▉     | 491/1000 [10:44<12:16,  1.45s/it]

Step: 491, Loss: 3.0165834426879883


 49%|████▉     | 492/1000 [10:46<12:00,  1.42s/it]

Step: 492, Loss: 2.9868087768554688


 49%|████▉     | 493/1000 [10:47<11:39,  1.38s/it]

Step: 493, Loss: 2.954946279525757


 49%|████▉     | 494/1000 [10:48<11:18,  1.34s/it]

Step: 494, Loss: 2.9993648529052734


 50%|████▉     | 495/1000 [10:50<11:45,  1.40s/it]

Step: 495, Loss: 2.9467170238494873


 50%|████▉     | 496/1000 [10:52<12:32,  1.49s/it]

Step: 496, Loss: 2.9556641578674316


 50%|████▉     | 497/1000 [10:53<12:57,  1.55s/it]

Step: 497, Loss: 2.9427947998046875


 50%|████▉     | 498/1000 [10:55<14:09,  1.69s/it]

Step: 498, Loss: 2.930687665939331


 50%|████▉     | 499/1000 [10:57<13:39,  1.63s/it]

Step: 499, Loss: 2.9506773948669434


 50%|█████     | 500/1000 [10:58<13:00,  1.56s/it]

Step: 500, Loss: 2.955373764038086


 50%|█████     | 501/1000 [10:59<12:23,  1.49s/it]

Step: 501, Loss: 2.9350759983062744


 50%|█████     | 502/1000 [11:01<12:38,  1.52s/it]

Step: 502, Loss: 2.9257264137268066


 50%|█████     | 503/1000 [11:03<12:42,  1.53s/it]

Step: 503, Loss: 2.945793867111206


 50%|█████     | 504/1000 [11:04<13:12,  1.60s/it]

Step: 504, Loss: 2.9338016510009766


 50%|█████     | 505/1000 [11:06<12:43,  1.54s/it]

Step: 505, Loss: 2.958876132965088


 51%|█████     | 506/1000 [11:07<12:12,  1.48s/it]

Step: 506, Loss: 2.927785634994507


 51%|█████     | 507/1000 [11:08<11:48,  1.44s/it]

Step: 507, Loss: 2.960953950881958


 51%|█████     | 508/1000 [11:10<11:59,  1.46s/it]

Step: 508, Loss: 2.9106225967407227


 51%|█████     | 509/1000 [11:11<11:41,  1.43s/it]

Step: 509, Loss: 2.973040819168091


 51%|█████     | 510/1000 [11:13<11:17,  1.38s/it]

Step: 510, Loss: 2.9371511936187744


 51%|█████     | 511/1000 [11:14<11:12,  1.37s/it]

Step: 511, Loss: 2.919260025024414


 51%|█████     | 512/1000 [11:15<11:00,  1.35s/it]

Step: 512, Loss: 2.9373979568481445


 51%|█████▏    | 513/1000 [11:17<12:14,  1.51s/it]

Step: 513, Loss: 2.9505672454833984


 51%|█████▏    | 514/1000 [11:19<12:06,  1.50s/it]

Step: 514, Loss: 2.965195655822754


 52%|█████▏    | 515/1000 [11:20<11:57,  1.48s/it]

Step: 515, Loss: 2.9054718017578125


 52%|█████▏    | 516/1000 [11:21<11:37,  1.44s/it]

Step: 516, Loss: 2.9134249687194824


 52%|█████▏    | 517/1000 [11:23<11:24,  1.42s/it]

Step: 517, Loss: 2.9173965454101562


 52%|█████▏    | 518/1000 [11:24<11:02,  1.37s/it]

Step: 518, Loss: 2.9173409938812256


 52%|█████▏    | 519/1000 [11:25<10:41,  1.33s/it]

Step: 519, Loss: 2.888758897781372


 52%|█████▏    | 520/1000 [11:27<10:36,  1.33s/it]

Step: 520, Loss: 2.9082841873168945


 52%|█████▏    | 521/1000 [11:28<11:18,  1.42s/it]

Step: 521, Loss: 2.9381728172302246


 52%|█████▏    | 522/1000 [11:30<11:05,  1.39s/it]

Step: 522, Loss: 2.9119672775268555


 52%|█████▏    | 523/1000 [11:31<10:45,  1.35s/it]

Step: 523, Loss: 2.95938777923584


 52%|█████▏    | 524/1000 [11:32<10:33,  1.33s/it]

Step: 524, Loss: 2.865002155303955


 52%|█████▎    | 525/1000 [11:33<10:25,  1.32s/it]

Step: 525, Loss: 2.910414218902588


 53%|█████▎    | 526/1000 [11:35<10:24,  1.32s/it]

Step: 526, Loss: 2.9194252490997314


 53%|█████▎    | 527/1000 [11:36<10:17,  1.31s/it]

Step: 527, Loss: 2.908874273300171


 53%|█████▎    | 528/1000 [11:37<10:14,  1.30s/it]

Step: 528, Loss: 2.930684804916382


 53%|█████▎    | 529/1000 [11:39<10:10,  1.30s/it]

Step: 529, Loss: 2.9246106147766113


 53%|█████▎    | 530/1000 [11:40<10:09,  1.30s/it]

Step: 530, Loss: 2.938030481338501


 53%|█████▎    | 531/1000 [11:41<10:04,  1.29s/it]

Step: 531, Loss: 2.902477979660034


 53%|█████▎    | 532/1000 [11:42<10:02,  1.29s/it]

Step: 532, Loss: 2.8581652641296387


 53%|█████▎    | 533/1000 [11:44<09:59,  1.28s/it]

Step: 533, Loss: 2.9152138233184814


 53%|█████▎    | 534/1000 [11:45<09:52,  1.27s/it]

Step: 534, Loss: 2.8752028942108154


 54%|█████▎    | 535/1000 [11:46<09:50,  1.27s/it]

Step: 535, Loss: 2.873983860015869


 54%|█████▎    | 536/1000 [11:47<09:56,  1.29s/it]

Step: 536, Loss: 2.914287805557251


 54%|█████▎    | 537/1000 [11:49<10:01,  1.30s/it]

Step: 537, Loss: 2.8983423709869385


 54%|█████▍    | 538/1000 [11:50<09:55,  1.29s/it]

Step: 538, Loss: 2.9098563194274902


 54%|█████▍    | 539/1000 [11:51<09:53,  1.29s/it]

Step: 539, Loss: 2.865295648574829


 54%|█████▍    | 540/1000 [11:53<09:47,  1.28s/it]

Step: 540, Loss: 2.891890525817871


 54%|█████▍    | 541/1000 [11:54<09:47,  1.28s/it]

Step: 541, Loss: 2.848966121673584


 54%|█████▍    | 542/1000 [11:55<09:45,  1.28s/it]

Step: 542, Loss: 2.8803460597991943


 54%|█████▍    | 543/1000 [11:56<09:39,  1.27s/it]

Step: 543, Loss: 2.897548198699951


 54%|█████▍    | 544/1000 [11:58<09:38,  1.27s/it]

Step: 544, Loss: 2.8902032375335693


 55%|█████▍    | 545/1000 [11:59<09:36,  1.27s/it]

Step: 545, Loss: 2.8644320964813232


 55%|█████▍    | 546/1000 [12:00<09:35,  1.27s/it]

Step: 546, Loss: 2.880802869796753


 55%|█████▍    | 547/1000 [12:01<09:34,  1.27s/it]

Step: 547, Loss: 2.863189935684204


 55%|█████▍    | 548/1000 [12:03<09:34,  1.27s/it]

Step: 548, Loss: 2.864630937576294


 55%|█████▍    | 549/1000 [12:04<09:40,  1.29s/it]

Step: 549, Loss: 2.8549094200134277


 55%|█████▌    | 550/1000 [12:05<09:54,  1.32s/it]

Step: 550, Loss: 2.8868680000305176


 55%|█████▌    | 551/1000 [12:07<09:44,  1.30s/it]

Step: 551, Loss: 2.8848135471343994


 55%|█████▌    | 552/1000 [12:08<09:42,  1.30s/it]

Step: 552, Loss: 2.8649208545684814


 55%|█████▌    | 553/1000 [12:10<10:14,  1.37s/it]

Step: 553, Loss: 2.88852596282959


 55%|█████▌    | 554/1000 [12:11<10:05,  1.36s/it]

Step: 554, Loss: 2.8575284481048584


 56%|█████▌    | 555/1000 [12:12<09:52,  1.33s/it]

Step: 555, Loss: 2.870943069458008


 56%|█████▌    | 556/1000 [12:13<09:50,  1.33s/it]

Step: 556, Loss: 2.8790972232818604


 56%|█████▌    | 557/1000 [12:15<09:44,  1.32s/it]

Step: 557, Loss: 2.910446882247925


 56%|█████▌    | 558/1000 [12:16<09:37,  1.31s/it]

Step: 558, Loss: 2.8606905937194824


 56%|█████▌    | 559/1000 [12:17<09:31,  1.30s/it]

Step: 559, Loss: 2.8493106365203857


 56%|█████▌    | 560/1000 [12:19<09:23,  1.28s/it]

Step: 560, Loss: 2.849531888961792


 56%|█████▌    | 561/1000 [12:20<09:19,  1.27s/it]

Step: 561, Loss: 2.8285281658172607


 56%|█████▌    | 562/1000 [12:21<09:17,  1.27s/it]

Step: 562, Loss: 2.855661630630493


 56%|█████▋    | 563/1000 [12:22<09:18,  1.28s/it]

Step: 563, Loss: 2.873236894607544


 56%|█████▋    | 564/1000 [12:24<09:21,  1.29s/it]

Step: 564, Loss: 2.8587632179260254


 56%|█████▋    | 565/1000 [12:25<09:18,  1.28s/it]

Step: 565, Loss: 2.8155219554901123


 57%|█████▋    | 566/1000 [12:26<09:20,  1.29s/it]

Step: 566, Loss: 2.830228567123413


 57%|█████▋    | 567/1000 [12:28<09:34,  1.33s/it]

Step: 567, Loss: 2.8691439628601074


 57%|█████▋    | 568/1000 [12:29<09:33,  1.33s/it]

Step: 568, Loss: 2.8586885929107666


 57%|█████▋    | 569/1000 [12:30<09:29,  1.32s/it]

Step: 569, Loss: 2.816509485244751


 57%|█████▋    | 570/1000 [12:32<09:25,  1.32s/it]

Step: 570, Loss: 2.8234035968780518


 57%|█████▋    | 571/1000 [12:33<09:18,  1.30s/it]

Step: 571, Loss: 2.810258150100708


 57%|█████▋    | 572/1000 [12:34<09:15,  1.30s/it]

Step: 572, Loss: 2.826979875564575


 57%|█████▋    | 573/1000 [12:35<09:13,  1.30s/it]

Step: 573, Loss: 2.8521645069122314


 57%|█████▋    | 574/1000 [12:37<09:08,  1.29s/it]

Step: 574, Loss: 2.828153133392334


 57%|█████▊    | 575/1000 [12:38<09:03,  1.28s/it]

Step: 575, Loss: 2.7927310466766357


 58%|█████▊    | 576/1000 [12:39<08:59,  1.27s/it]

Step: 576, Loss: 2.8257665634155273


 58%|█████▊    | 577/1000 [12:41<08:55,  1.27s/it]

Step: 577, Loss: 2.81318736076355


 58%|█████▊    | 578/1000 [12:42<08:56,  1.27s/it]

Step: 578, Loss: 2.8680028915405273


 58%|█████▊    | 579/1000 [12:43<09:06,  1.30s/it]

Step: 579, Loss: 2.8792548179626465


 58%|█████▊    | 580/1000 [12:44<09:03,  1.29s/it]

Step: 580, Loss: 2.8260724544525146


 58%|█████▊    | 581/1000 [12:46<09:02,  1.29s/it]

Step: 581, Loss: 2.8168892860412598


 58%|█████▊    | 582/1000 [12:47<09:06,  1.31s/it]

Step: 582, Loss: 2.887582778930664


 58%|█████▊    | 583/1000 [12:49<09:33,  1.38s/it]

Step: 583, Loss: 2.8407769203186035


 58%|█████▊    | 584/1000 [12:50<09:25,  1.36s/it]

Step: 584, Loss: 2.8246395587921143


 58%|█████▊    | 585/1000 [12:51<09:14,  1.34s/it]

Step: 585, Loss: 2.840303421020508


 59%|█████▊    | 586/1000 [12:53<09:11,  1.33s/it]

Step: 586, Loss: 2.7904696464538574


 59%|█████▊    | 587/1000 [12:54<09:08,  1.33s/it]

Step: 587, Loss: 2.760948419570923


 59%|█████▉    | 588/1000 [12:55<09:09,  1.33s/it]

Step: 588, Loss: 2.8375823497772217


 59%|█████▉    | 589/1000 [12:56<08:57,  1.31s/it]

Step: 589, Loss: 2.8246304988861084


 59%|█████▉    | 590/1000 [12:58<08:53,  1.30s/it]

Step: 590, Loss: 2.8118112087249756


 59%|█████▉    | 591/1000 [12:59<08:51,  1.30s/it]

Step: 591, Loss: 2.8200507164001465


 59%|█████▉    | 592/1000 [13:00<08:52,  1.31s/it]

Step: 592, Loss: 2.848452568054199


 59%|█████▉    | 593/1000 [13:02<08:47,  1.30s/it]

Step: 593, Loss: 2.795426368713379


 59%|█████▉    | 594/1000 [13:03<08:47,  1.30s/it]

Step: 594, Loss: 2.7723755836486816


 60%|█████▉    | 595/1000 [13:04<08:45,  1.30s/it]

Step: 595, Loss: 2.8069686889648438


 60%|█████▉    | 596/1000 [13:06<09:00,  1.34s/it]

Step: 596, Loss: 2.810032367706299


 60%|█████▉    | 597/1000 [13:07<08:52,  1.32s/it]

Step: 597, Loss: 2.8180453777313232


 60%|█████▉    | 598/1000 [13:08<08:49,  1.32s/it]

Step: 598, Loss: 2.7919368743896484


 60%|█████▉    | 599/1000 [13:10<08:44,  1.31s/it]

Step: 599, Loss: 2.7958297729492188


 60%|██████    | 600/1000 [13:11<08:45,  1.31s/it]

Step: 600, Loss: 2.7626090049743652


 60%|██████    | 601/1000 [13:12<08:41,  1.31s/it]

Step: 601, Loss: 2.8081271648406982


 60%|██████    | 602/1000 [13:13<08:38,  1.30s/it]

Step: 602, Loss: 2.7576088905334473


 60%|██████    | 603/1000 [13:15<08:32,  1.29s/it]

Step: 603, Loss: 2.779085159301758


 60%|██████    | 604/1000 [13:16<08:26,  1.28s/it]

Step: 604, Loss: 2.7796192169189453


 60%|██████    | 605/1000 [13:17<08:24,  1.28s/it]

Step: 605, Loss: 2.766782760620117


 61%|██████    | 606/1000 [13:18<08:20,  1.27s/it]

Step: 606, Loss: 2.7967617511749268


 61%|██████    | 607/1000 [13:20<08:56,  1.36s/it]

Step: 607, Loss: 2.7432165145874023


 61%|██████    | 608/1000 [13:21<08:57,  1.37s/it]

Step: 608, Loss: 2.777081251144409


 61%|██████    | 609/1000 [13:23<08:43,  1.34s/it]

Step: 609, Loss: 2.759899377822876


 61%|██████    | 610/1000 [13:24<08:34,  1.32s/it]

Step: 610, Loss: 2.767057180404663


 61%|██████    | 611/1000 [13:26<09:30,  1.47s/it]

Step: 611, Loss: 2.760288953781128


 61%|██████    | 612/1000 [13:27<09:26,  1.46s/it]

Step: 612, Loss: 2.786651849746704


 61%|██████▏   | 613/1000 [13:29<09:04,  1.41s/it]

Step: 613, Loss: 2.7923660278320312


 61%|██████▏   | 614/1000 [13:30<08:46,  1.36s/it]

Step: 614, Loss: 2.759030818939209


 62%|██████▏   | 615/1000 [13:31<08:33,  1.33s/it]

Step: 615, Loss: 2.763002395629883


 62%|██████▏   | 616/1000 [13:32<08:26,  1.32s/it]

Step: 616, Loss: 2.7436912059783936


 62%|██████▏   | 617/1000 [13:34<08:21,  1.31s/it]

Step: 617, Loss: 2.7514560222625732


 62%|██████▏   | 618/1000 [13:35<08:17,  1.30s/it]

Step: 618, Loss: 2.7444450855255127


 62%|██████▏   | 619/1000 [13:36<08:12,  1.29s/it]

Step: 619, Loss: 2.758186101913452


 62%|██████▏   | 620/1000 [13:37<08:10,  1.29s/it]

Step: 620, Loss: 2.755096197128296


 62%|██████▏   | 621/1000 [13:39<08:13,  1.30s/it]

Step: 621, Loss: 2.7612862586975098


 62%|██████▏   | 622/1000 [13:40<08:09,  1.30s/it]

Step: 622, Loss: 2.727285861968994


 62%|██████▏   | 623/1000 [13:41<08:05,  1.29s/it]

Step: 623, Loss: 2.7338125705718994


 62%|██████▏   | 624/1000 [13:43<08:00,  1.28s/it]

Step: 624, Loss: 2.7489326000213623


 62%|██████▎   | 625/1000 [13:44<08:08,  1.30s/it]

Step: 625, Loss: 2.7034878730773926


 63%|██████▎   | 626/1000 [13:45<08:06,  1.30s/it]

Step: 626, Loss: 2.7037744522094727


 63%|██████▎   | 627/1000 [13:47<08:04,  1.30s/it]

Step: 627, Loss: 2.732963800430298


 63%|██████▎   | 628/1000 [13:48<08:01,  1.29s/it]

Step: 628, Loss: 2.681492328643799


 63%|██████▎   | 629/1000 [13:49<07:56,  1.28s/it]

Step: 629, Loss: 2.73283052444458


 63%|██████▎   | 630/1000 [13:50<07:56,  1.29s/it]

Step: 630, Loss: 2.708944797515869


 63%|██████▎   | 631/1000 [13:52<07:58,  1.30s/it]

Step: 631, Loss: 2.729140520095825


 63%|██████▎   | 632/1000 [13:53<07:54,  1.29s/it]

Step: 632, Loss: 2.698202610015869


 63%|██████▎   | 633/1000 [13:54<07:50,  1.28s/it]

Step: 633, Loss: 2.7777459621429443


 63%|██████▎   | 634/1000 [13:56<08:04,  1.32s/it]

Step: 634, Loss: 2.736398935317993


 64%|██████▎   | 635/1000 [13:57<08:13,  1.35s/it]

Step: 635, Loss: 2.731501817703247


 64%|██████▎   | 636/1000 [13:58<08:06,  1.34s/it]

Step: 636, Loss: 2.726151943206787


 64%|██████▎   | 637/1000 [14:00<08:08,  1.35s/it]

Step: 637, Loss: 2.774202346801758


 64%|██████▍   | 638/1000 [14:01<07:59,  1.32s/it]

Step: 638, Loss: 2.770733118057251


 64%|██████▍   | 639/1000 [14:02<07:51,  1.31s/it]

Step: 639, Loss: 2.7385659217834473


 64%|██████▍   | 640/1000 [14:04<08:13,  1.37s/it]

Step: 640, Loss: 2.748553991317749


 64%|██████▍   | 641/1000 [14:05<08:02,  1.34s/it]

Step: 641, Loss: 2.8784127235412598


 64%|██████▍   | 642/1000 [14:06<07:58,  1.34s/it]

Step: 642, Loss: 2.753462791442871


 64%|██████▍   | 643/1000 [14:08<08:01,  1.35s/it]

Step: 643, Loss: 2.77962589263916


 64%|██████▍   | 644/1000 [14:09<07:54,  1.33s/it]

Step: 644, Loss: 2.7382421493530273


 64%|██████▍   | 645/1000 [14:10<07:46,  1.31s/it]

Step: 645, Loss: 2.730194091796875


 65%|██████▍   | 646/1000 [14:12<07:41,  1.30s/it]

Step: 646, Loss: 2.7214879989624023


 65%|██████▍   | 647/1000 [14:13<07:35,  1.29s/it]

Step: 647, Loss: 2.7013039588928223


 65%|██████▍   | 648/1000 [14:14<07:33,  1.29s/it]

Step: 648, Loss: 2.7929816246032715


 65%|██████▍   | 649/1000 [14:15<07:31,  1.29s/it]

Step: 649, Loss: 2.7411673069000244


 65%|██████▌   | 650/1000 [14:17<07:29,  1.28s/it]

Step: 650, Loss: 2.7500784397125244


 65%|██████▌   | 651/1000 [14:18<07:26,  1.28s/it]

Step: 651, Loss: 2.767408847808838


 65%|██████▌   | 652/1000 [14:19<07:35,  1.31s/it]

Step: 652, Loss: 2.7676644325256348


 65%|██████▌   | 653/1000 [14:21<07:30,  1.30s/it]

Step: 653, Loss: 2.788484811782837


 65%|██████▌   | 654/1000 [14:22<07:39,  1.33s/it]

Step: 654, Loss: 2.745962619781494


 66%|██████▌   | 655/1000 [14:23<07:30,  1.31s/it]

Step: 655, Loss: 2.675313711166382


 66%|██████▌   | 656/1000 [14:25<07:23,  1.29s/it]

Step: 656, Loss: 2.6666758060455322


 66%|██████▌   | 657/1000 [14:26<07:22,  1.29s/it]

Step: 657, Loss: 2.8472325801849365


 66%|██████▌   | 658/1000 [14:27<07:27,  1.31s/it]

Step: 658, Loss: 2.707821846008301


 66%|██████▌   | 659/1000 [14:29<07:25,  1.31s/it]

Step: 659, Loss: 2.7664453983306885


 66%|██████▌   | 660/1000 [14:30<07:20,  1.29s/it]

Step: 660, Loss: 2.717841386795044


 66%|██████▌   | 661/1000 [14:31<07:18,  1.29s/it]

Step: 661, Loss: 2.7261862754821777


 66%|██████▌   | 662/1000 [14:32<07:26,  1.32s/it]

Step: 662, Loss: 2.655831813812256


 66%|██████▋   | 663/1000 [14:34<07:21,  1.31s/it]

Step: 663, Loss: 2.7111711502075195


 66%|██████▋   | 664/1000 [14:35<07:30,  1.34s/it]

Step: 664, Loss: 2.680922031402588


 66%|██████▋   | 665/1000 [14:36<07:26,  1.33s/it]

Step: 665, Loss: 2.7208218574523926


 67%|██████▋   | 666/1000 [14:38<07:48,  1.40s/it]

Step: 666, Loss: 2.698563575744629


 67%|██████▋   | 667/1000 [14:40<08:02,  1.45s/it]

Step: 667, Loss: 2.662709951400757


 67%|██████▋   | 668/1000 [14:41<07:51,  1.42s/it]

Step: 668, Loss: 2.639968156814575


 67%|██████▋   | 669/1000 [14:42<07:37,  1.38s/it]

Step: 669, Loss: 2.699842929840088


 67%|██████▋   | 670/1000 [14:44<07:26,  1.35s/it]

Step: 670, Loss: 2.669142246246338


 67%|██████▋   | 671/1000 [14:45<07:15,  1.32s/it]

Step: 671, Loss: 2.680887460708618


 67%|██████▋   | 672/1000 [14:46<07:05,  1.30s/it]

Step: 672, Loss: 2.644773483276367


 67%|██████▋   | 673/1000 [14:47<07:02,  1.29s/it]

Step: 673, Loss: 2.6902289390563965


 67%|██████▋   | 674/1000 [14:49<06:56,  1.28s/it]

Step: 674, Loss: 2.6390345096588135


 68%|██████▊   | 675/1000 [14:50<06:57,  1.28s/it]

Step: 675, Loss: 2.6960086822509766


 68%|██████▊   | 676/1000 [14:51<06:54,  1.28s/it]

Step: 676, Loss: 2.6824839115142822


 68%|██████▊   | 677/1000 [14:52<06:56,  1.29s/it]

Step: 677, Loss: 2.6761224269866943


 68%|██████▊   | 678/1000 [14:54<07:01,  1.31s/it]

Step: 678, Loss: 2.650442123413086


 68%|██████▊   | 679/1000 [14:55<07:02,  1.31s/it]

Step: 679, Loss: 2.6659369468688965


 68%|██████▊   | 680/1000 [14:56<06:57,  1.30s/it]

Step: 680, Loss: 2.650374412536621


 68%|██████▊   | 681/1000 [14:58<06:50,  1.29s/it]

Step: 681, Loss: 2.6588799953460693


 68%|██████▊   | 682/1000 [14:59<06:54,  1.30s/it]

Step: 682, Loss: 2.657557725906372


 68%|██████▊   | 683/1000 [15:00<07:05,  1.34s/it]

Step: 683, Loss: 2.611680507659912


 68%|██████▊   | 684/1000 [15:03<08:37,  1.64s/it]

Step: 684, Loss: 2.6526503562927246


 68%|██████▊   | 685/1000 [15:04<08:39,  1.65s/it]

Step: 685, Loss: 2.6383025646209717


 69%|██████▊   | 686/1000 [15:06<08:10,  1.56s/it]

Step: 686, Loss: 2.638918876647949


 69%|██████▊   | 687/1000 [15:07<07:47,  1.49s/it]

Step: 687, Loss: 2.6548590660095215


 69%|██████▉   | 688/1000 [15:08<07:27,  1.44s/it]

Step: 688, Loss: 2.6502370834350586


 69%|██████▉   | 689/1000 [15:10<07:18,  1.41s/it]

Step: 689, Loss: 2.6544952392578125


 69%|██████▉   | 690/1000 [15:11<07:07,  1.38s/it]

Step: 690, Loss: 2.6567506790161133


 69%|██████▉   | 691/1000 [15:12<06:59,  1.36s/it]

Step: 691, Loss: 2.6914145946502686


 69%|██████▉   | 692/1000 [15:14<06:51,  1.34s/it]

Step: 692, Loss: 2.6649322509765625


 69%|██████▉   | 693/1000 [15:15<06:43,  1.31s/it]

Step: 693, Loss: 2.6199283599853516


 69%|██████▉   | 694/1000 [15:16<06:35,  1.29s/it]

Step: 694, Loss: 2.6318705081939697


 70%|██████▉   | 695/1000 [15:17<06:33,  1.29s/it]

Step: 695, Loss: 2.6350202560424805


 70%|██████▉   | 696/1000 [15:19<06:41,  1.32s/it]

Step: 696, Loss: 2.6211447715759277


 70%|██████▉   | 697/1000 [15:20<07:06,  1.41s/it]

Step: 697, Loss: 2.6739377975463867


 70%|██████▉   | 698/1000 [15:22<06:53,  1.37s/it]

Step: 698, Loss: 2.613213300704956


 70%|██████▉   | 699/1000 [15:23<06:43,  1.34s/it]

Step: 699, Loss: 2.6360666751861572


 70%|███████   | 700/1000 [15:24<06:36,  1.32s/it]

Step: 700, Loss: 2.652172803878784


 70%|███████   | 701/1000 [15:26<06:29,  1.30s/it]

Step: 701, Loss: 2.6216351985931396


 70%|███████   | 702/1000 [15:27<06:27,  1.30s/it]

Step: 702, Loss: 2.625469923019409


 70%|███████   | 703/1000 [15:28<06:22,  1.29s/it]

Step: 703, Loss: 2.6501588821411133


 70%|███████   | 704/1000 [15:29<06:20,  1.28s/it]

Step: 704, Loss: 2.633084297180176


 70%|███████   | 705/1000 [15:31<06:18,  1.28s/it]

Step: 705, Loss: 2.5926291942596436


 71%|███████   | 706/1000 [15:32<06:15,  1.28s/it]

Step: 706, Loss: 2.6136903762817383


 71%|███████   | 707/1000 [15:33<06:14,  1.28s/it]

Step: 707, Loss: 2.616607904434204


 71%|███████   | 708/1000 [15:34<06:13,  1.28s/it]

Step: 708, Loss: 2.583836317062378


 71%|███████   | 709/1000 [15:36<06:29,  1.34s/it]

Step: 709, Loss: 2.657379150390625


 71%|███████   | 710/1000 [15:37<06:31,  1.35s/it]

Step: 710, Loss: 2.6138978004455566


 71%|███████   | 711/1000 [15:39<06:26,  1.34s/it]

Step: 711, Loss: 2.662707805633545


 71%|███████   | 712/1000 [15:40<06:24,  1.33s/it]

Step: 712, Loss: 2.6410694122314453


 71%|███████▏  | 713/1000 [15:41<06:21,  1.33s/it]

Step: 713, Loss: 2.6230361461639404


 71%|███████▏  | 714/1000 [15:43<06:16,  1.32s/it]

Step: 714, Loss: 2.650057792663574


 72%|███████▏  | 715/1000 [15:44<06:12,  1.31s/it]

Step: 715, Loss: 2.6149163246154785


 72%|███████▏  | 716/1000 [15:45<06:07,  1.29s/it]

Step: 716, Loss: 2.6215426921844482


 72%|███████▏  | 717/1000 [15:46<06:06,  1.29s/it]

Step: 717, Loss: 2.6165575981140137


 72%|███████▏  | 718/1000 [15:48<06:09,  1.31s/it]

Step: 718, Loss: 2.588052272796631


 72%|███████▏  | 719/1000 [15:49<06:07,  1.31s/it]

Step: 719, Loss: 2.626779079437256


 72%|███████▏  | 720/1000 [15:50<06:04,  1.30s/it]

Step: 720, Loss: 2.670123338699341


 72%|███████▏  | 721/1000 [15:52<06:05,  1.31s/it]

Step: 721, Loss: 2.6407642364501953


 72%|███████▏  | 722/1000 [15:53<05:58,  1.29s/it]

Step: 722, Loss: 2.6309709548950195


 72%|███████▏  | 723/1000 [15:54<05:53,  1.28s/it]

Step: 723, Loss: 2.619661808013916


 72%|███████▏  | 724/1000 [15:56<05:56,  1.29s/it]

Step: 724, Loss: 2.6284873485565186


 72%|███████▎  | 725/1000 [15:57<05:52,  1.28s/it]

Step: 725, Loss: 2.5969839096069336


 73%|███████▎  | 726/1000 [15:58<05:47,  1.27s/it]

Step: 726, Loss: 2.6259946823120117


 73%|███████▎  | 727/1000 [15:59<05:52,  1.29s/it]

Step: 727, Loss: 2.6817264556884766


 73%|███████▎  | 728/1000 [16:01<05:51,  1.29s/it]

Step: 728, Loss: 2.56697154045105


 73%|███████▎  | 729/1000 [16:02<06:19,  1.40s/it]

Step: 729, Loss: 2.654233932495117


 73%|███████▎  | 730/1000 [16:04<06:27,  1.43s/it]

Step: 730, Loss: 2.602489948272705


 73%|███████▎  | 731/1000 [16:05<06:18,  1.41s/it]

Step: 731, Loss: 2.6005284786224365


 73%|███████▎  | 732/1000 [16:06<06:07,  1.37s/it]

Step: 732, Loss: 2.6275341510772705


 73%|███████▎  | 733/1000 [16:08<06:02,  1.36s/it]

Step: 733, Loss: 2.605818510055542


 73%|███████▎  | 734/1000 [16:09<05:48,  1.31s/it]

Step: 734, Loss: 2.6094307899475098


 74%|███████▎  | 735/1000 [16:10<05:44,  1.30s/it]

Step: 735, Loss: 2.6294541358947754


 74%|███████▎  | 736/1000 [16:12<05:40,  1.29s/it]

Step: 736, Loss: 2.664497137069702


 74%|███████▎  | 737/1000 [16:13<05:40,  1.29s/it]

Step: 737, Loss: 2.621901035308838


 74%|███████▍  | 738/1000 [16:14<05:36,  1.29s/it]

Step: 738, Loss: 2.65208101272583


 74%|███████▍  | 739/1000 [16:15<05:34,  1.28s/it]

Step: 739, Loss: 2.6375186443328857


 74%|███████▍  | 740/1000 [16:17<05:53,  1.36s/it]

Step: 740, Loss: 2.6216015815734863


 74%|███████▍  | 741/1000 [16:18<05:47,  1.34s/it]

Step: 741, Loss: 2.6427390575408936


 74%|███████▍  | 742/1000 [16:19<05:40,  1.32s/it]

Step: 742, Loss: 2.651426076889038


 74%|███████▍  | 743/1000 [16:21<05:36,  1.31s/it]

Step: 743, Loss: 2.648555040359497


 74%|███████▍  | 744/1000 [16:22<05:33,  1.30s/it]

Step: 744, Loss: 2.6961331367492676


 74%|███████▍  | 745/1000 [16:23<05:38,  1.33s/it]

Step: 745, Loss: 2.6124050617218018


 75%|███████▍  | 746/1000 [16:25<05:33,  1.31s/it]

Step: 746, Loss: 2.6039865016937256


 75%|███████▍  | 747/1000 [16:26<05:34,  1.32s/it]

Step: 747, Loss: 2.655381202697754


 75%|███████▍  | 748/1000 [16:27<05:30,  1.31s/it]

Step: 748, Loss: 2.627048969268799


 75%|███████▍  | 749/1000 [16:29<05:27,  1.30s/it]

Step: 749, Loss: 2.592655897140503


 75%|███████▌  | 750/1000 [16:30<05:22,  1.29s/it]

Step: 750, Loss: 2.6059463024139404


 75%|███████▌  | 751/1000 [16:32<05:49,  1.40s/it]

Step: 751, Loss: 2.5706264972686768


 75%|███████▌  | 752/1000 [16:33<05:56,  1.44s/it]

Step: 752, Loss: 2.594402551651001


 75%|███████▌  | 753/1000 [16:34<05:45,  1.40s/it]

Step: 753, Loss: 2.5788660049438477


 75%|███████▌  | 754/1000 [16:36<05:35,  1.36s/it]

Step: 754, Loss: 2.5662567615509033


 76%|███████▌  | 755/1000 [16:37<05:29,  1.34s/it]

Step: 755, Loss: 2.5892257690429688


 76%|███████▌  | 756/1000 [16:38<05:23,  1.33s/it]

Step: 756, Loss: 2.555629014968872


 76%|███████▌  | 757/1000 [16:40<05:19,  1.31s/it]

Step: 757, Loss: 2.554543972015381


 76%|███████▌  | 758/1000 [16:41<05:18,  1.32s/it]

Step: 758, Loss: 2.5585579872131348


 76%|███████▌  | 759/1000 [16:42<05:17,  1.32s/it]

Step: 759, Loss: 2.5823731422424316


 76%|███████▌  | 760/1000 [16:43<05:16,  1.32s/it]

Step: 760, Loss: 2.529254674911499


 76%|███████▌  | 761/1000 [16:45<05:18,  1.33s/it]

Step: 761, Loss: 2.575014591217041


 76%|███████▌  | 762/1000 [16:46<05:12,  1.31s/it]

Step: 762, Loss: 2.5724260807037354


 76%|███████▋  | 763/1000 [16:47<05:08,  1.30s/it]

Step: 763, Loss: 2.545769214630127


 76%|███████▋  | 764/1000 [16:49<05:03,  1.29s/it]

Step: 764, Loss: 2.5628366470336914


 76%|███████▋  | 765/1000 [16:50<05:01,  1.28s/it]

Step: 765, Loss: 2.5355145931243896


 77%|███████▋  | 766/1000 [16:51<05:00,  1.28s/it]

Step: 766, Loss: 2.528201103210449


 77%|███████▋  | 767/1000 [16:53<05:01,  1.30s/it]

Step: 767, Loss: 2.5290541648864746


 77%|███████▋  | 768/1000 [16:54<05:14,  1.35s/it]

Step: 768, Loss: 2.5090293884277344


 77%|███████▋  | 769/1000 [16:56<05:29,  1.43s/it]

Step: 769, Loss: 2.5620968341827393


 77%|███████▋  | 770/1000 [16:57<05:23,  1.41s/it]

Step: 770, Loss: 2.544905424118042


 77%|███████▋  | 771/1000 [16:58<05:15,  1.38s/it]

Step: 771, Loss: 2.5704598426818848


 77%|███████▋  | 772/1000 [17:00<05:09,  1.36s/it]

Step: 772, Loss: 2.5482616424560547


 77%|███████▋  | 773/1000 [17:01<05:02,  1.33s/it]

Step: 773, Loss: 2.5176610946655273


 77%|███████▋  | 774/1000 [17:02<04:54,  1.30s/it]

Step: 774, Loss: 2.4985811710357666


 78%|███████▊  | 775/1000 [17:03<04:50,  1.29s/it]

Step: 775, Loss: 2.5351052284240723


 78%|███████▊  | 776/1000 [17:05<04:48,  1.29s/it]

Step: 776, Loss: 2.537216901779175


 78%|███████▊  | 777/1000 [17:06<04:54,  1.32s/it]

Step: 777, Loss: 2.5119428634643555


 78%|███████▊  | 778/1000 [17:07<04:53,  1.32s/it]

Step: 778, Loss: 2.4928104877471924


 78%|███████▊  | 779/1000 [17:09<04:56,  1.34s/it]

Step: 779, Loss: 2.4976589679718018


 78%|███████▊  | 780/1000 [17:10<04:55,  1.34s/it]

Step: 780, Loss: 2.484091281890869


 78%|███████▊  | 781/1000 [17:11<04:48,  1.32s/it]

Step: 781, Loss: 2.485403299331665


 78%|███████▊  | 782/1000 [17:13<04:44,  1.30s/it]

Step: 782, Loss: 2.4638214111328125


 78%|███████▊  | 783/1000 [17:14<04:40,  1.29s/it]

Step: 783, Loss: 2.5397391319274902


 78%|███████▊  | 784/1000 [17:15<04:35,  1.28s/it]

Step: 784, Loss: 2.5147242546081543


 78%|███████▊  | 785/1000 [17:17<04:46,  1.33s/it]

Step: 785, Loss: 2.4791345596313477


 79%|███████▊  | 786/1000 [17:18<04:44,  1.33s/it]

Step: 786, Loss: 2.5261526107788086


 79%|███████▊  | 787/1000 [17:19<04:42,  1.32s/it]

Step: 787, Loss: 2.514113664627075


 79%|███████▉  | 788/1000 [17:21<04:38,  1.31s/it]

Step: 788, Loss: 2.5135672092437744


 79%|███████▉  | 789/1000 [17:22<04:33,  1.30s/it]

Step: 789, Loss: 2.5247743129730225


 79%|███████▉  | 790/1000 [17:23<04:32,  1.30s/it]

Step: 790, Loss: 2.5145370960235596


 79%|███████▉  | 791/1000 [17:25<05:26,  1.56s/it]

Step: 791, Loss: 2.5347771644592285


 79%|███████▉  | 792/1000 [17:27<05:17,  1.52s/it]

Step: 792, Loss: 2.5064914226531982


 79%|███████▉  | 793/1000 [17:28<05:11,  1.50s/it]

Step: 793, Loss: 2.5035083293914795


 79%|███████▉  | 794/1000 [17:29<05:00,  1.46s/it]

Step: 794, Loss: 2.5091981887817383


 80%|███████▉  | 795/1000 [17:31<04:52,  1.43s/it]

Step: 795, Loss: 2.5132036209106445


 80%|███████▉  | 796/1000 [17:32<04:42,  1.38s/it]

Step: 796, Loss: 2.504798412322998


 80%|███████▉  | 797/1000 [17:34<04:40,  1.38s/it]

Step: 797, Loss: 2.5114963054656982


 80%|███████▉  | 798/1000 [17:35<04:34,  1.36s/it]

Step: 798, Loss: 2.550356864929199


 80%|███████▉  | 799/1000 [17:36<04:26,  1.32s/it]

Step: 799, Loss: 2.5351197719573975


 80%|████████  | 800/1000 [17:37<04:22,  1.31s/it]

Step: 800, Loss: 2.5173349380493164


 80%|████████  | 801/1000 [17:39<04:25,  1.33s/it]

Step: 801, Loss: 2.5364174842834473


 80%|████████  | 802/1000 [17:40<04:23,  1.33s/it]

Step: 802, Loss: 2.554826259613037


 80%|████████  | 803/1000 [17:41<04:19,  1.32s/it]

Step: 803, Loss: 2.5151727199554443


 80%|████████  | 804/1000 [17:43<04:19,  1.33s/it]

Step: 804, Loss: 2.5820648670196533


 80%|████████  | 805/1000 [17:44<04:15,  1.31s/it]

Step: 805, Loss: 2.5086896419525146


 81%|████████  | 806/1000 [17:45<04:13,  1.31s/it]

Step: 806, Loss: 2.559772253036499


 81%|████████  | 807/1000 [17:47<04:11,  1.31s/it]

Step: 807, Loss: 2.5533297061920166


 81%|████████  | 808/1000 [17:48<04:13,  1.32s/it]

Step: 808, Loss: 2.570753812789917


 81%|████████  | 809/1000 [17:49<04:12,  1.32s/it]

Step: 809, Loss: 2.513828992843628


 81%|████████  | 810/1000 [17:51<04:08,  1.31s/it]

Step: 810, Loss: 2.5510964393615723


 81%|████████  | 811/1000 [17:52<04:10,  1.33s/it]

Step: 811, Loss: 2.587027072906494


 81%|████████  | 812/1000 [17:53<04:12,  1.34s/it]

Step: 812, Loss: 2.508380889892578


 81%|████████▏ | 813/1000 [17:55<04:15,  1.37s/it]

Step: 813, Loss: 2.5364434719085693


 81%|████████▏ | 814/1000 [17:56<04:13,  1.36s/it]

Step: 814, Loss: 2.5163960456848145


 82%|████████▏ | 815/1000 [17:57<04:09,  1.35s/it]

Step: 815, Loss: 2.502626895904541


 82%|████████▏ | 816/1000 [17:59<04:12,  1.37s/it]

Step: 816, Loss: 2.5960707664489746


 82%|████████▏ | 817/1000 [18:01<04:34,  1.50s/it]

Step: 817, Loss: 2.5653579235076904


 82%|████████▏ | 818/1000 [18:02<04:33,  1.50s/it]

Step: 818, Loss: 2.522993326187134


 82%|████████▏ | 819/1000 [18:03<04:23,  1.46s/it]

Step: 819, Loss: 2.594120502471924


 82%|████████▏ | 820/1000 [18:05<04:18,  1.44s/it]

Step: 820, Loss: 2.502708911895752


 82%|████████▏ | 821/1000 [18:06<04:07,  1.39s/it]

Step: 821, Loss: 2.5371921062469482


 82%|████████▏ | 822/1000 [18:08<04:09,  1.40s/it]

Step: 822, Loss: 2.5279853343963623


 82%|████████▏ | 823/1000 [18:09<04:19,  1.47s/it]

Step: 823, Loss: 2.5070717334747314


 82%|████████▏ | 824/1000 [18:11<04:12,  1.44s/it]

Step: 824, Loss: 2.5346901416778564


 82%|████████▎ | 825/1000 [18:12<04:04,  1.39s/it]

Step: 825, Loss: 2.515066146850586


 83%|████████▎ | 826/1000 [18:13<03:57,  1.37s/it]

Step: 826, Loss: 2.4943392276763916


 83%|████████▎ | 827/1000 [18:14<03:52,  1.34s/it]

Step: 827, Loss: 2.5180509090423584


 83%|████████▎ | 828/1000 [18:16<03:48,  1.33s/it]

Step: 828, Loss: 2.502741813659668


 83%|████████▎ | 829/1000 [18:17<03:43,  1.30s/it]

Step: 829, Loss: 2.474475860595703


 83%|████████▎ | 830/1000 [18:18<03:40,  1.30s/it]

Step: 830, Loss: 2.4868760108947754


 83%|████████▎ | 831/1000 [18:19<03:37,  1.29s/it]

Step: 831, Loss: 2.4694411754608154


 83%|████████▎ | 832/1000 [18:21<03:54,  1.40s/it]

Step: 832, Loss: 2.5445852279663086


 83%|████████▎ | 833/1000 [18:22<03:48,  1.37s/it]

Step: 833, Loss: 2.5173041820526123


 83%|████████▎ | 834/1000 [18:24<03:46,  1.36s/it]

Step: 834, Loss: 2.5207746028900146


 84%|████████▎ | 835/1000 [18:25<03:42,  1.35s/it]

Step: 835, Loss: 2.5451161861419678


 84%|████████▎ | 836/1000 [18:26<03:37,  1.33s/it]

Step: 836, Loss: 2.5317533016204834


 84%|████████▎ | 837/1000 [18:28<03:38,  1.34s/it]

Step: 837, Loss: 2.5183374881744385


 84%|████████▍ | 838/1000 [18:29<03:36,  1.34s/it]

Step: 838, Loss: 2.551311492919922


 84%|████████▍ | 839/1000 [18:30<03:32,  1.32s/it]

Step: 839, Loss: 2.5448660850524902


 84%|████████▍ | 840/1000 [18:32<03:30,  1.31s/it]

Step: 840, Loss: 2.5226821899414062


 84%|████████▍ | 841/1000 [18:33<03:30,  1.32s/it]

Step: 841, Loss: 2.4784319400787354


 84%|████████▍ | 842/1000 [18:34<03:28,  1.32s/it]

Step: 842, Loss: 2.516322135925293


 84%|████████▍ | 843/1000 [18:36<03:26,  1.31s/it]

Step: 843, Loss: 2.506707191467285


 84%|████████▍ | 844/1000 [18:37<03:21,  1.29s/it]

Step: 844, Loss: 2.4837646484375


 84%|████████▍ | 845/1000 [18:38<03:19,  1.29s/it]

Step: 845, Loss: 2.4871084690093994


 85%|████████▍ | 846/1000 [18:39<03:18,  1.29s/it]

Step: 846, Loss: 2.4831132888793945


 85%|████████▍ | 847/1000 [18:41<03:15,  1.28s/it]

Step: 847, Loss: 2.513641595840454


 85%|████████▍ | 848/1000 [18:42<03:17,  1.30s/it]

Step: 848, Loss: 2.4761300086975098


 85%|████████▍ | 849/1000 [18:43<03:15,  1.30s/it]

Step: 849, Loss: 2.487046003341675


 85%|████████▌ | 850/1000 [18:45<03:15,  1.30s/it]

Step: 850, Loss: 2.465947151184082


 85%|████████▌ | 851/1000 [18:46<03:38,  1.47s/it]

Step: 851, Loss: 2.4625635147094727


 85%|████████▌ | 852/1000 [18:48<03:29,  1.42s/it]

Step: 852, Loss: 2.4889063835144043


 85%|████████▌ | 853/1000 [18:49<03:24,  1.39s/it]

Step: 853, Loss: 2.4693713188171387


 85%|████████▌ | 854/1000 [18:50<03:18,  1.36s/it]

Step: 854, Loss: 2.47518253326416


 86%|████████▌ | 855/1000 [18:52<03:15,  1.35s/it]

Step: 855, Loss: 2.4115095138549805


 86%|████████▌ | 856/1000 [18:53<03:10,  1.32s/it]

Step: 856, Loss: 2.504218816757202


 86%|████████▌ | 857/1000 [18:54<03:07,  1.31s/it]

Step: 857, Loss: 2.4468727111816406


 86%|████████▌ | 858/1000 [18:56<03:05,  1.31s/it]

Step: 858, Loss: 2.4270224571228027


 86%|████████▌ | 859/1000 [18:57<03:08,  1.33s/it]

Step: 859, Loss: 2.503971576690674


 86%|████████▌ | 860/1000 [18:58<03:12,  1.37s/it]

Step: 860, Loss: 2.40842866897583


 86%|████████▌ | 861/1000 [19:00<03:07,  1.35s/it]

Step: 861, Loss: 2.4139437675476074


 86%|████████▌ | 862/1000 [19:01<03:02,  1.32s/it]

Step: 862, Loss: 2.4111526012420654


 86%|████████▋ | 863/1000 [19:02<02:59,  1.31s/it]

Step: 863, Loss: 2.4723575115203857


 86%|████████▋ | 864/1000 [19:04<02:57,  1.31s/it]

Step: 864, Loss: 2.4636595249176025


 86%|████████▋ | 865/1000 [19:05<02:56,  1.30s/it]

Step: 865, Loss: 2.429527521133423


 87%|████████▋ | 866/1000 [19:06<02:54,  1.30s/it]

Step: 866, Loss: 2.4695353507995605


 87%|████████▋ | 867/1000 [19:07<02:53,  1.31s/it]

Step: 867, Loss: 2.525404930114746


 87%|████████▋ | 868/1000 [19:09<03:01,  1.38s/it]

Step: 868, Loss: 2.4462616443634033


 87%|████████▋ | 869/1000 [19:10<02:58,  1.36s/it]

Step: 869, Loss: 2.466290235519409


 87%|████████▋ | 870/1000 [19:12<02:54,  1.34s/it]

Step: 870, Loss: 2.447100877761841


 87%|████████▋ | 871/1000 [19:13<02:50,  1.32s/it]

Step: 871, Loss: 2.452711343765259


 87%|████████▋ | 872/1000 [19:14<02:49,  1.33s/it]

Step: 872, Loss: 2.4589128494262695


 87%|████████▋ | 873/1000 [19:16<02:47,  1.32s/it]

Step: 873, Loss: 2.4396140575408936


 87%|████████▋ | 874/1000 [19:17<02:46,  1.32s/it]

Step: 874, Loss: 2.453754186630249


 88%|████████▊ | 875/1000 [19:18<02:44,  1.31s/it]

Step: 875, Loss: 2.48020076751709


 88%|████████▊ | 876/1000 [19:20<02:45,  1.33s/it]

Step: 876, Loss: 2.4142603874206543


 88%|████████▊ | 877/1000 [19:21<02:44,  1.34s/it]

Step: 877, Loss: 2.45417857170105


 88%|████████▊ | 878/1000 [19:22<02:42,  1.33s/it]

Step: 878, Loss: 2.426954507827759


 88%|████████▊ | 879/1000 [19:24<02:40,  1.33s/it]

Step: 879, Loss: 2.4146599769592285


 88%|████████▊ | 880/1000 [19:25<02:39,  1.33s/it]

Step: 880, Loss: 2.401827573776245


 88%|████████▊ | 881/1000 [19:26<02:38,  1.33s/it]

Step: 881, Loss: 2.4069273471832275


 88%|████████▊ | 882/1000 [19:28<02:39,  1.35s/it]

Step: 882, Loss: 2.403125762939453


 88%|████████▊ | 883/1000 [19:29<02:36,  1.34s/it]

Step: 883, Loss: 2.404345750808716


 88%|████████▊ | 884/1000 [19:30<02:32,  1.31s/it]

Step: 884, Loss: 2.4351749420166016


 88%|████████▊ | 885/1000 [19:31<02:28,  1.29s/it]

Step: 885, Loss: 2.417739152908325


 89%|████████▊ | 886/1000 [19:33<02:28,  1.30s/it]

Step: 886, Loss: 2.3956611156463623


 89%|████████▊ | 887/1000 [19:34<02:33,  1.36s/it]

Step: 887, Loss: 2.416653871536255


 89%|████████▉ | 888/1000 [19:36<02:36,  1.40s/it]

Step: 888, Loss: 2.3890175819396973


 89%|████████▉ | 889/1000 [19:37<02:32,  1.37s/it]

Step: 889, Loss: 2.391019105911255


 89%|████████▉ | 890/1000 [19:38<02:27,  1.34s/it]

Step: 890, Loss: 2.3894307613372803


 89%|████████▉ | 891/1000 [19:40<02:25,  1.33s/it]

Step: 891, Loss: 2.382932186126709


 89%|████████▉ | 892/1000 [19:41<02:22,  1.32s/it]

Step: 892, Loss: 2.3613080978393555


 89%|████████▉ | 893/1000 [19:42<02:22,  1.33s/it]

Step: 893, Loss: 2.4083714485168457


 89%|████████▉ | 894/1000 [19:44<02:20,  1.33s/it]

Step: 894, Loss: 2.3910648822784424


 90%|████████▉ | 895/1000 [19:45<02:23,  1.37s/it]

Step: 895, Loss: 2.392165422439575


 90%|████████▉ | 896/1000 [19:46<02:22,  1.37s/it]

Step: 896, Loss: 2.39298415184021


 90%|████████▉ | 897/1000 [19:48<02:19,  1.36s/it]

Step: 897, Loss: 2.4197189807891846


 90%|████████▉ | 898/1000 [19:49<02:17,  1.35s/it]

Step: 898, Loss: 2.3487043380737305


 90%|████████▉ | 899/1000 [19:50<02:15,  1.34s/it]

Step: 899, Loss: 2.373990535736084


 90%|█████████ | 900/1000 [19:52<02:12,  1.32s/it]

Step: 900, Loss: 2.38558030128479


 90%|█████████ | 901/1000 [19:53<02:08,  1.29s/it]

Step: 901, Loss: 2.347763776779175


 90%|█████████ | 902/1000 [19:54<02:06,  1.29s/it]

Step: 902, Loss: 2.390324831008911


 90%|█████████ | 903/1000 [19:56<02:08,  1.33s/it]

Step: 903, Loss: 2.3904237747192383


 90%|█████████ | 904/1000 [19:57<02:06,  1.31s/it]

Step: 904, Loss: 2.3722586631774902


 90%|█████████ | 905/1000 [19:58<02:06,  1.33s/it]

Step: 905, Loss: 2.3418402671813965


 91%|█████████ | 906/1000 [20:00<02:08,  1.37s/it]

Step: 906, Loss: 2.4001903533935547


 91%|█████████ | 907/1000 [20:01<02:08,  1.39s/it]

Step: 907, Loss: 2.3858132362365723


 91%|█████████ | 908/1000 [20:02<02:05,  1.37s/it]

Step: 908, Loss: 2.3550899028778076


 91%|█████████ | 909/1000 [20:04<02:02,  1.35s/it]

Step: 909, Loss: 2.4191243648529053


 91%|█████████ | 910/1000 [20:05<02:00,  1.34s/it]

Step: 910, Loss: 2.381471633911133


 91%|█████████ | 911/1000 [20:06<01:58,  1.33s/it]

Step: 911, Loss: 2.3387258052825928


 91%|█████████ | 912/1000 [20:08<01:56,  1.32s/it]

Step: 912, Loss: 2.373749017715454


 91%|█████████▏| 913/1000 [20:09<01:54,  1.31s/it]

Step: 913, Loss: 2.3731329441070557


 91%|█████████▏| 914/1000 [20:10<01:50,  1.29s/it]

Step: 914, Loss: 2.326061487197876


 92%|█████████▏| 915/1000 [20:12<01:56,  1.37s/it]

Step: 915, Loss: 2.3390884399414062


 92%|█████████▏| 916/1000 [20:13<01:54,  1.36s/it]

Step: 916, Loss: 2.375617265701294


 92%|█████████▏| 917/1000 [20:14<01:51,  1.34s/it]

Step: 917, Loss: 2.3414554595947266


 92%|█████████▏| 918/1000 [20:16<01:49,  1.33s/it]

Step: 918, Loss: 2.3901255130767822


 92%|█████████▏| 919/1000 [20:17<01:47,  1.33s/it]

Step: 919, Loss: 2.365588903427124


 92%|█████████▏| 920/1000 [20:18<01:44,  1.31s/it]

Step: 920, Loss: 2.372169256210327


 92%|█████████▏| 921/1000 [20:20<01:42,  1.30s/it]

Step: 921, Loss: 2.330904483795166


 92%|█████████▏| 922/1000 [20:21<01:48,  1.40s/it]

Step: 922, Loss: 2.325352907180786


 92%|█████████▏| 923/1000 [20:23<01:52,  1.46s/it]

Step: 923, Loss: 2.3543694019317627


 92%|█████████▏| 924/1000 [20:24<01:46,  1.40s/it]

Step: 924, Loss: 2.3551135063171387


 92%|█████████▎| 925/1000 [20:25<01:43,  1.37s/it]

Step: 925, Loss: 2.3728387355804443


 93%|█████████▎| 926/1000 [20:27<01:43,  1.39s/it]

Step: 926, Loss: 2.339836359024048


 93%|█████████▎| 927/1000 [20:28<01:40,  1.37s/it]

Step: 927, Loss: 2.3680548667907715


 93%|█████████▎| 928/1000 [20:29<01:37,  1.36s/it]

Step: 928, Loss: 2.3937511444091797


 93%|█████████▎| 929/1000 [20:31<01:35,  1.34s/it]

Step: 929, Loss: 2.3593692779541016


 93%|█████████▎| 930/1000 [20:32<01:32,  1.32s/it]

Step: 930, Loss: 2.376373529434204


 93%|█████████▎| 931/1000 [20:33<01:33,  1.35s/it]

Step: 931, Loss: 2.3243982791900635


 93%|█████████▎| 932/1000 [20:35<01:31,  1.35s/it]

Step: 932, Loss: 2.359117031097412


 93%|█████████▎| 933/1000 [20:36<01:29,  1.33s/it]

Step: 933, Loss: 2.3456482887268066


 93%|█████████▎| 934/1000 [20:37<01:26,  1.32s/it]

Step: 934, Loss: 2.354458808898926


 94%|█████████▎| 935/1000 [20:39<01:25,  1.31s/it]

Step: 935, Loss: 2.3472280502319336


 94%|█████████▎| 936/1000 [20:40<01:23,  1.31s/it]

Step: 936, Loss: 2.350687026977539


 94%|█████████▎| 937/1000 [20:41<01:22,  1.30s/it]

Step: 937, Loss: 2.318631172180176


 94%|█████████▍| 938/1000 [20:43<01:20,  1.30s/it]

Step: 938, Loss: 2.3694798946380615


 94%|█████████▍| 939/1000 [20:44<01:18,  1.28s/it]

Step: 939, Loss: 2.3193304538726807


 94%|█████████▍| 940/1000 [20:45<01:17,  1.29s/it]

Step: 940, Loss: 2.350362539291382


 94%|█████████▍| 941/1000 [20:47<01:23,  1.41s/it]

Step: 941, Loss: 2.309763193130493


 94%|█████████▍| 942/1000 [20:48<01:20,  1.39s/it]

Step: 942, Loss: 2.341099739074707


 94%|█████████▍| 943/1000 [20:49<01:18,  1.38s/it]

Step: 943, Loss: 2.3216707706451416


 94%|█████████▍| 944/1000 [20:51<01:15,  1.35s/it]

Step: 944, Loss: 2.321239471435547


 94%|█████████▍| 945/1000 [20:53<01:21,  1.48s/it]

Step: 945, Loss: 2.3535425662994385


 95%|█████████▍| 946/1000 [20:54<01:17,  1.44s/it]

Step: 946, Loss: 2.3593015670776367


 95%|█████████▍| 947/1000 [20:55<01:14,  1.40s/it]

Step: 947, Loss: 2.3406128883361816


 95%|█████████▍| 948/1000 [20:57<01:11,  1.38s/it]

Step: 948, Loss: 2.336743116378784


 95%|█████████▍| 949/1000 [20:58<01:13,  1.44s/it]

Step: 949, Loss: 2.313932418823242


 95%|█████████▌| 950/1000 [20:59<01:09,  1.39s/it]

Step: 950, Loss: 2.3430707454681396


 95%|█████████▌| 951/1000 [21:01<01:08,  1.40s/it]

Step: 951, Loss: 2.3061418533325195


 95%|█████████▌| 952/1000 [21:02<01:05,  1.37s/it]

Step: 952, Loss: 2.3761777877807617


 95%|█████████▌| 953/1000 [21:03<01:02,  1.34s/it]

Step: 953, Loss: 2.3255774974823


 95%|█████████▌| 954/1000 [21:05<01:00,  1.32s/it]

Step: 954, Loss: 2.316397190093994


 96%|█████████▌| 955/1000 [21:06<00:59,  1.32s/it]

Step: 955, Loss: 2.324737787246704


 96%|█████████▌| 956/1000 [21:07<00:57,  1.31s/it]

Step: 956, Loss: 2.3195040225982666


 96%|█████████▌| 957/1000 [21:09<00:57,  1.33s/it]

Step: 957, Loss: 2.3461806774139404


 96%|█████████▌| 958/1000 [21:10<01:00,  1.43s/it]

Step: 958, Loss: 2.309663772583008


 96%|█████████▌| 959/1000 [21:12<00:56,  1.39s/it]

Step: 959, Loss: 2.3287160396575928


 96%|█████████▌| 960/1000 [21:13<00:54,  1.37s/it]

Step: 960, Loss: 2.3273134231567383


 96%|█████████▌| 961/1000 [21:14<00:52,  1.36s/it]

Step: 961, Loss: 2.3938851356506348


 96%|█████████▌| 962/1000 [21:16<00:51,  1.35s/it]

Step: 962, Loss: 2.3259644508361816


 96%|█████████▋| 963/1000 [21:17<00:49,  1.34s/it]

Step: 963, Loss: 2.3339195251464844


 96%|█████████▋| 964/1000 [21:18<00:47,  1.33s/it]

Step: 964, Loss: 2.339803695678711


 96%|█████████▋| 965/1000 [21:19<00:46,  1.32s/it]

Step: 965, Loss: 2.336552619934082


 97%|█████████▋| 966/1000 [21:21<00:46,  1.35s/it]

Step: 966, Loss: 2.302536725997925


 97%|█████████▋| 967/1000 [21:22<00:44,  1.35s/it]

Step: 967, Loss: 2.3383471965789795


 97%|█████████▋| 968/1000 [21:24<00:42,  1.32s/it]

Step: 968, Loss: 2.2845191955566406


 97%|█████████▋| 969/1000 [21:25<00:40,  1.31s/it]

Step: 969, Loss: 2.3075509071350098


 97%|█████████▋| 970/1000 [21:26<00:39,  1.31s/it]

Step: 970, Loss: 2.3445448875427246


 97%|█████████▋| 971/1000 [21:27<00:38,  1.31s/it]

Step: 971, Loss: 2.3188326358795166


 97%|█████████▋| 972/1000 [21:29<00:36,  1.31s/it]

Step: 972, Loss: 2.3383617401123047


 97%|█████████▋| 973/1000 [21:30<00:35,  1.33s/it]

Step: 973, Loss: 2.3108880519866943


 97%|█████████▋| 974/1000 [21:32<00:37,  1.45s/it]

Step: 974, Loss: 2.3385889530181885


 98%|█████████▊| 975/1000 [21:33<00:35,  1.41s/it]

Step: 975, Loss: 2.2674970626831055


 98%|█████████▊| 976/1000 [21:35<00:33,  1.41s/it]

Step: 976, Loss: 2.3163514137268066


 98%|█████████▊| 977/1000 [21:36<00:31,  1.37s/it]

Step: 977, Loss: 2.3273000717163086


 98%|█████████▊| 978/1000 [21:37<00:29,  1.34s/it]

Step: 978, Loss: 2.3334274291992188


 98%|█████████▊| 979/1000 [21:38<00:27,  1.32s/it]

Step: 979, Loss: 2.2694547176361084


 98%|█████████▊| 980/1000 [21:40<00:26,  1.32s/it]

Step: 980, Loss: 2.3242456912994385


 98%|█████████▊| 981/1000 [21:41<00:25,  1.37s/it]

Step: 981, Loss: 2.3096923828125


 98%|█████████▊| 982/1000 [21:43<00:25,  1.42s/it]

Step: 982, Loss: 2.349062442779541


 98%|█████████▊| 983/1000 [21:44<00:23,  1.40s/it]

Step: 983, Loss: 2.312892436981201


 98%|█████████▊| 984/1000 [21:45<00:21,  1.37s/it]

Step: 984, Loss: 2.3389077186584473


 98%|█████████▊| 985/1000 [21:47<00:20,  1.36s/it]

Step: 985, Loss: 2.3361496925354004


 99%|█████████▊| 986/1000 [21:48<00:19,  1.36s/it]

Step: 986, Loss: 2.3552005290985107


 99%|█████████▊| 987/1000 [21:49<00:17,  1.35s/it]

Step: 987, Loss: 2.3222134113311768


 99%|█████████▉| 988/1000 [21:51<00:16,  1.35s/it]

Step: 988, Loss: 2.30584716796875


 99%|█████████▉| 989/1000 [21:52<00:14,  1.35s/it]

Step: 989, Loss: 2.305166721343994


 99%|█████████▉| 990/1000 [21:54<00:14,  1.41s/it]

Step: 990, Loss: 2.3104238510131836


 99%|█████████▉| 991/1000 [21:55<00:12,  1.38s/it]

Step: 991, Loss: 2.355902671813965


 99%|█████████▉| 992/1000 [21:56<00:10,  1.37s/it]

Step: 992, Loss: 2.336315870285034


 99%|█████████▉| 993/1000 [21:58<00:09,  1.37s/it]

Step: 993, Loss: 2.299668312072754


 99%|█████████▉| 994/1000 [21:59<00:08,  1.37s/it]

Step: 994, Loss: 2.3515262603759766


100%|█████████▉| 995/1000 [22:01<00:07,  1.51s/it]

Step: 995, Loss: 2.366455316543579


100%|█████████▉| 996/1000 [22:02<00:05,  1.49s/it]

Step: 996, Loss: 2.280163049697876


100%|█████████▉| 997/1000 [22:04<00:04,  1.43s/it]

Step: 997, Loss: 2.314680576324463


100%|█████████▉| 998/1000 [22:05<00:02,  1.37s/it]

Step: 998, Loss: 2.347386360168457


100%|█████████▉| 999/1000 [22:06<00:01,  1.35s/it]

Step: 999, Loss: 2.2892813682556152


100%|██████████| 1000/1000 [22:07<00:00,  1.33s/it]

Step: 1000, Loss: 2.306718111038208





In [36]:
transformer.eval()
src_sample = torch.zeros(10, dtype=torch.int64)
src_sample[:10] = torch.arange(833, 843, dtype=torch.int64)

In [37]:
src_sample.unsqueeze(0)

tensor([[833, 834, 835, 836, 837, 838, 839, 840, 841, 842]])

In [38]:
res = transformer(src_sample.unsqueeze(0), src_sample.unsqueeze(0))

In [39]:
res.squeeze().argmax(dim=1)

tensor([834, 836, 838, 840, 842, 844, 846, 849, 850, 852])