In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

from tqdm import tqdm

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


In [2]:
# Construct a handle for the 'mps' device; as the last expression in the cell it is
# simply displayed as the cell output.
# NOTE(review): nothing visible in this notebook moves the model or the batches onto
# this device (no `.to(...)` call) -- confirm whether training on MPS was intended.
torch.device('mps')

device(type='mps')

In [3]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Inputs of width ``d_model`` are linearly projected to queries, keys and
    values, split into ``num_heads`` heads of width ``d_model // num_heads``,
    attended over per head, merged back to ``d_model`` and passed through a
    final output projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of one head

        # Projections are created in q, k, v, o order so that seeded
        # initialisation and parameter ordering stay stable.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return ``softmax(Q K^T / sqrt(d_k)) V``.

        Positions where ``mask == 0`` have their score replaced by ``-1e9``
        before the softmax, which drives their attention weight to ~0.
        """
        keys_t = K.transpose(-2, -1)
        scale = math.sqrt(self.d_k)
        scores = torch.matmul(Q, keys_t) / scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1)
        return torch.matmul(weights, V)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_k)``."""
        batch, length, _ = x.size()
        per_head = x.view(batch, length, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: ``(batch, heads, seq, d_k)`` to ``(batch, seq, d_model)``."""
        batch, _, length, _ = x.size()
        # transpose() yields a non-contiguous tensor, so make it contiguous before view().
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch, length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head and project the merged result."""
        queries = self.split_heads(self.W_q(Q))
        keys = self.split_heads(self.W_k(K))
        values = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(queries, keys, values, mask)
        return self.W_o(self.combine_heads(context))

In [4]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Expands ``d_model`` features to ``d_ff``, applies ReLU, then projects
    back down to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back to model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [5]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``(1, max_seq_length, d_model)`` is precomputed once and
    stored as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * freq)`` and odd columns hold ``cos(position * freq)``.

    Args:
        d_model: width of the embeddings. Both even and odd values are
            supported (an odd width simply has one more sine column than
            cosine columns).
        max_seq_length: number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_length, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns; for an odd d_model that is
        # one fewer than the number of frequencies, so trim before assigning.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module across .to()/state_dict but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x.size(1)`` must not exceed ``max_seq_length``; otherwise the
        addition fails because the shapes cannot be broadcast.
        """
        return x + self.pe[:, :x.size(1)]

In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    # TODO: mistake_1 -- deliberately wrong variant kept for reference. It runs the
    # feed-forward net on the raw input, before the attention residual is applied:
    # def forward(self, x, mask):
    #     ff_output = self.feed_forward(x)
    #     attn_output = self.self_attn(x, x, x, mask)
    #     x = self.norm1(x + self.dropout(attn_output))
    #     x = self.norm2(x + self.dropout(ff_output))
    #     return x

    # Correct variant: the feed-forward net consumes the attention sub-layer's output.
    def forward(self, x, mask):
        """Apply self-attention and then the feed-forward net, each with residual + norm."""
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

In [7]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Every sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalised (``norm1``, ``norm2``, ``norm3``
    respectively).
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    # TODO: mistake_2 -- deliberately wrong variant kept for reference. It skips the
    # cross-attention sub-layer (and norm2), so the encoder output is never used:
    # def forward(self, x, enc_output, src_mask, tgt_mask):
    #     attn_output = self.self_attn(x, x, x, tgt_mask)
    #     x = self.norm1(x + self.dropout(attn_output))
    #     ff_output = self.feed_forward(x)
    #     x = self.norm3(x + self.dropout(ff_output))
    #     return x

    # Correct variant: includes cross-attention over the encoder output.
    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the three sub-layers in order, each with residual + norm.

        ``tgt_mask`` is passed to the self-attention over ``x``; ``src_mask``
        is passed to the cross-attention, whose keys and values are
        ``enc_output``.
        """
        self_attended = self.dropout(self.self_attn(x, x, x, tgt_mask))
        x = self.norm1(x + self_attended)

        cross_attended = self.dropout(self.cross_attn(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_attended)

        transformed = self.dropout(self.feed_forward(x))
        return self.norm3(x + transformed)

In [8]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer operating on integer token ids.

    Token id ``0`` is treated as padding when the attention masks are built.

    Args:
        src_vocab_size: number of rows in the encoder embedding table.
        tgt_vocab_size: number of rows in the decoder embedding table and the
            width of the output logits.
        d_model: embedding / hidden width.
        num_heads: attention heads per attention module.
        num_layers: number of encoder layers and number of decoder layers.
        d_ff: inner width of the position-wise feed-forward networks.
        max_seq_length: number of positions in the positional-encoding table.
        dropout: dropout probability used on the embeddings and in every layer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # One positional table is shared by the encoder and decoder inputs.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from ``(batch, length)`` token-id tensors.

        Returns:
            src_mask: ``(batch, 1, 1, src_len)``; True where ``src`` is not padding.
            tgt_mask: ``(batch, 1, tgt_len, tgt_len)``; entry ``[i, j]`` is True
                when query position ``i`` is not padding and ``j <= i``.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Create the causal mask on the same device as the inputs; building it on
        # the default device breaks the `&` below once the batch lives on GPU/MPS.
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``.

        Args:
            src: encoder input token ids, ``(batch, src_len)``.
            tgt: decoder input token ids, ``(batch, tgt_len)``.
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Every decoder layer attends to the final encoder output.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [9]:
# Hyperparameters for the toy sequence task and the model built from them.
# Vocabulary: generate_sample emits token ids between 1 and 993, and
# Transformer.generate_mask treats id 0 as padding, so 1000 ids cover both.
src_vocab_size = 1000
tgt_vocab_size = 1000
d_model = 512
# Must divide d_model evenly (MultiHeadAttention asserts this); 512 // 8 = 64 per head.
num_heads = 8
# Used for both the encoder stack and the decoder stack.
num_layers = 6
d_ff = 2048
# Number of positions in the positional-encoding table; generate_sample builds
# sequences of exactly this length.
max_seq_length = 50
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [10]:
from random import randint

In [11]:
def generate_sample(seq_length: int = 50, prompt_length: int = 10, max_start: int = 944):
    """Create one (source, target) pair of consecutive-integer token sequences.

    The target is ``start, start + 1, ..., start + seq_length - 1`` for a random
    ``start`` drawn uniformly from ``[1, max_start]``. The source has the same
    length: its first ``prompt_length`` entries are copied from the target and
    the remainder is ``0``.

    Args:
        seq_length: length of both returned sequences.
        prompt_length: how many leading target tokens are revealed in the source.
        max_start: largest allowed first token id. The largest id that can be
            produced is ``max_start + seq_length - 1``, which has to stay below
            the vocabulary size of the model consuming the sample.

    Returns:
        ``(src, trg)``, both 1-D ``torch.int64`` tensors of length ``seq_length``.
    """
    start = randint(1, max_start)
    # Earlier task variant: src = arange(start, start + 50), trg = src + 1.
    trg = torch.arange(start, start + seq_length, dtype=torch.int64)
    src = torch.zeros_like(trg)
    src[:prompt_length] = trg[:prompt_length]

    return src, trg

In [12]:
def generate_batch(batch_size: int = 128):
    """Draw ``batch_size`` samples from ``generate_sample`` and stack them.

    Args:
        batch_size: number of (source, target) pairs to generate.

    Returns:
        ``(src_batch, trg_batch)``: two tensors whose first dimension indexes
        the samples. For ``batch_size <= 0`` two empty 1-D ``torch.int64``
        tensors are returned.
    """
    if batch_size <= 0:
        return torch.tensor([], dtype=torch.int64), torch.tensor([], dtype=torch.int64)

    # Collect first and stack once: concatenating inside the loop would re-copy
    # the whole batch for every added sample.
    samples = [generate_sample() for _ in range(batch_size)]
    src_batch = torch.stack([src_sample for src_sample, _ in samples])
    trg_batch = torch.stack([trg_sample for _, trg_sample in samples])

    return src_batch, trg_batch


In [13]:
# Training setup: cross-entropy over the target vocabulary, optimised with Adam.
# No ignore_index is passed, so every one of the positions in trg_batch
# contributes to the loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Enable training mode so the dropout layers are active.
transformer.train()

# A fresh random batch is generated at every step; there is no fixed dataset.
for step in tqdm(range(1000)):
    src_batch, trg_batch = generate_batch(64)
    optimizer.zero_grad()
    # NOTE(review): the decoder input is src_batch, not trg_batch. With generate_sample,
    # src_batch is 0 (the id generate_mask treats as padding) after the first 10
    # positions, so the decoder only sees the prompt. Confirm this is the intended
    # setup rather than feeding a shifted target.
    output = transformer(src_batch, src_batch)
    # Flatten (batch, seq, vocab) logits and (batch, seq) targets for CrossEntropyLoss.
    loss = criterion(output.view(-1, output.size(-1)), trg_batch.view(-1))
    loss.backward()
    optimizer.step()
    print(f"Step: {step+1}, Loss: {loss.item()}")

  0%|          | 1/1000 [00:02<34:28,  2.07s/it]

Step: 1, Loss: 7.0629496574401855


  0%|          | 2/1000 [00:03<31:56,  1.92s/it]

Step: 2, Loss: 6.9759745597839355


  0%|          | 3/1000 [00:05<27:38,  1.66s/it]

Step: 3, Loss: 7.020008563995361


  0%|          | 4/1000 [00:06<25:19,  1.53s/it]

Step: 4, Loss: 6.949252128601074


  0%|          | 5/1000 [00:07<23:59,  1.45s/it]

Step: 5, Loss: 6.947412014007568


  1%|          | 6/1000 [00:09<23:17,  1.41s/it]

Step: 6, Loss: 6.938932418823242


  1%|          | 7/1000 [00:10<22:40,  1.37s/it]

Step: 7, Loss: 6.896792411804199


  1%|          | 8/1000 [00:11<22:12,  1.34s/it]

Step: 8, Loss: 6.884660720825195


  1%|          | 9/1000 [00:13<21:57,  1.33s/it]

Step: 9, Loss: 6.846088409423828


  1%|          | 10/1000 [00:14<21:58,  1.33s/it]

Step: 10, Loss: 6.823604583740234


  1%|          | 11/1000 [00:15<22:10,  1.34s/it]

Step: 11, Loss: 6.795740127563477


  1%|          | 12/1000 [00:17<21:57,  1.33s/it]

Step: 12, Loss: 6.739255428314209


  1%|▏         | 13/1000 [00:18<22:01,  1.34s/it]

Step: 13, Loss: 6.742644786834717


  1%|▏         | 14/1000 [00:19<22:00,  1.34s/it]

Step: 14, Loss: 6.721521377563477


  2%|▏         | 15/1000 [00:21<23:00,  1.40s/it]

Step: 15, Loss: 6.619436264038086


  2%|▏         | 16/1000 [00:22<24:06,  1.47s/it]

Step: 16, Loss: 6.650399208068848


  2%|▏         | 17/1000 [00:24<23:34,  1.44s/it]

Step: 17, Loss: 6.5885210037231445


  2%|▏         | 18/1000 [00:25<22:56,  1.40s/it]

Step: 18, Loss: 6.483358383178711


  2%|▏         | 19/1000 [00:26<22:12,  1.36s/it]

Step: 19, Loss: 6.411899566650391


  2%|▏         | 20/1000 [00:28<21:38,  1.32s/it]

Step: 20, Loss: 6.4383745193481445


  2%|▏         | 21/1000 [00:29<21:48,  1.34s/it]

Step: 21, Loss: 6.348447799682617


  2%|▏         | 22/1000 [00:30<21:34,  1.32s/it]

Step: 22, Loss: 6.3240814208984375


  2%|▏         | 23/1000 [00:32<21:44,  1.34s/it]

Step: 23, Loss: 6.2001118659973145


  2%|▏         | 24/1000 [00:33<21:59,  1.35s/it]

Step: 24, Loss: 6.11851692199707


  2%|▎         | 25/1000 [00:34<22:01,  1.36s/it]

Step: 25, Loss: 6.060064315795898


  3%|▎         | 26/1000 [00:36<21:47,  1.34s/it]

Step: 26, Loss: 5.982077121734619


  3%|▎         | 27/1000 [00:37<21:38,  1.33s/it]

Step: 27, Loss: 5.965450286865234


  3%|▎         | 28/1000 [00:38<21:20,  1.32s/it]

Step: 28, Loss: 5.8979573249816895


  3%|▎         | 29/1000 [00:40<21:06,  1.30s/it]

Step: 29, Loss: 5.815658092498779


  3%|▎         | 30/1000 [00:41<20:46,  1.28s/it]

Step: 30, Loss: 5.780308246612549


  3%|▎         | 31/1000 [00:42<20:31,  1.27s/it]

Step: 31, Loss: 5.779208183288574


  3%|▎         | 32/1000 [00:43<20:19,  1.26s/it]

Step: 32, Loss: 5.737443447113037


  3%|▎         | 33/1000 [00:45<20:17,  1.26s/it]

Step: 33, Loss: 5.587722301483154


  3%|▎         | 34/1000 [00:46<20:25,  1.27s/it]

Step: 34, Loss: 5.597532749176025


  4%|▎         | 35/1000 [00:47<20:35,  1.28s/it]

Step: 35, Loss: 5.573431968688965


  4%|▎         | 36/1000 [00:49<23:08,  1.44s/it]

Step: 36, Loss: 5.548783779144287


  4%|▎         | 37/1000 [00:50<22:52,  1.43s/it]

Step: 37, Loss: 5.497768402099609


  4%|▍         | 38/1000 [00:52<21:57,  1.37s/it]

Step: 38, Loss: 5.443684101104736


  4%|▍         | 39/1000 [00:53<22:15,  1.39s/it]

Step: 39, Loss: 5.423701763153076


  4%|▍         | 40/1000 [00:54<21:32,  1.35s/it]

Step: 40, Loss: 5.354679107666016


  4%|▍         | 41/1000 [00:56<21:25,  1.34s/it]

Step: 41, Loss: 5.328814506530762


  4%|▍         | 42/1000 [00:57<21:12,  1.33s/it]

Step: 42, Loss: 5.311673164367676


  4%|▍         | 43/1000 [00:58<22:21,  1.40s/it]

Step: 43, Loss: 5.271435737609863


  4%|▍         | 44/1000 [01:00<21:52,  1.37s/it]

Step: 44, Loss: 5.2421674728393555


  4%|▍         | 45/1000 [01:01<21:11,  1.33s/it]

Step: 45, Loss: 5.186380386352539


  5%|▍         | 46/1000 [01:02<21:08,  1.33s/it]

Step: 46, Loss: 5.169363498687744


  5%|▍         | 47/1000 [01:04<21:52,  1.38s/it]

Step: 47, Loss: 5.104887962341309


  5%|▍         | 48/1000 [01:05<21:20,  1.34s/it]

Step: 48, Loss: 5.0776567459106445


  5%|▍         | 49/1000 [01:06<21:03,  1.33s/it]

Step: 49, Loss: 5.078325271606445


  5%|▌         | 50/1000 [01:08<20:56,  1.32s/it]

Step: 50, Loss: 5.0149149894714355


  5%|▌         | 51/1000 [01:09<20:52,  1.32s/it]

Step: 51, Loss: 4.989578723907471


  5%|▌         | 52/1000 [01:11<23:13,  1.47s/it]

Step: 52, Loss: 4.955894947052002


  5%|▌         | 53/1000 [01:12<22:55,  1.45s/it]

Step: 53, Loss: 4.937359809875488


  5%|▌         | 54/1000 [01:14<22:09,  1.41s/it]

Step: 54, Loss: 4.90561056137085


  6%|▌         | 55/1000 [01:15<21:31,  1.37s/it]

Step: 55, Loss: 4.870834827423096


  6%|▌         | 56/1000 [01:16<20:56,  1.33s/it]

Step: 56, Loss: 4.818994045257568


  6%|▌         | 57/1000 [01:18<21:37,  1.38s/it]

Step: 57, Loss: 4.794983386993408


  6%|▌         | 58/1000 [01:19<21:07,  1.35s/it]

Step: 58, Loss: 4.784017562866211


  6%|▌         | 59/1000 [01:20<20:56,  1.34s/it]

Step: 59, Loss: 4.75956392288208


  6%|▌         | 60/1000 [01:21<20:42,  1.32s/it]

Step: 60, Loss: 4.739668369293213


  6%|▌         | 61/1000 [01:23<20:31,  1.31s/it]

Step: 61, Loss: 4.71062707901001


  6%|▌         | 62/1000 [01:24<20:28,  1.31s/it]

Step: 62, Loss: 4.6872053146362305


  6%|▋         | 63/1000 [01:25<20:36,  1.32s/it]

Step: 63, Loss: 4.656591892242432


  6%|▋         | 64/1000 [01:27<20:45,  1.33s/it]

Step: 64, Loss: 4.6205925941467285


  6%|▋         | 65/1000 [01:28<20:24,  1.31s/it]

Step: 65, Loss: 4.590844631195068


  7%|▋         | 66/1000 [01:29<20:03,  1.29s/it]

Step: 66, Loss: 4.577810764312744


  7%|▋         | 67/1000 [01:30<19:55,  1.28s/it]

Step: 67, Loss: 4.5688395500183105


  7%|▋         | 68/1000 [01:32<19:42,  1.27s/it]

Step: 68, Loss: 4.519039630889893


  7%|▋         | 69/1000 [01:33<19:40,  1.27s/it]

Step: 69, Loss: 4.498906135559082


  7%|▋         | 70/1000 [01:34<19:27,  1.25s/it]

Step: 70, Loss: 4.468374729156494


  7%|▋         | 71/1000 [01:35<19:19,  1.25s/it]

Step: 71, Loss: 4.423927307128906


  7%|▋         | 72/1000 [01:37<19:46,  1.28s/it]

Step: 72, Loss: 4.392436504364014


  7%|▋         | 73/1000 [01:38<19:37,  1.27s/it]

Step: 73, Loss: 4.38249397277832


  7%|▋         | 74/1000 [01:39<19:51,  1.29s/it]

Step: 74, Loss: 4.334473133087158


  8%|▊         | 75/1000 [01:41<19:48,  1.28s/it]

Step: 75, Loss: 4.321470737457275


  8%|▊         | 76/1000 [01:42<19:42,  1.28s/it]

Step: 76, Loss: 4.309699535369873


  8%|▊         | 77/1000 [01:43<19:52,  1.29s/it]

Step: 77, Loss: 4.271479606628418


  8%|▊         | 78/1000 [01:45<19:47,  1.29s/it]

Step: 78, Loss: 4.232562065124512


  8%|▊         | 79/1000 [01:46<20:22,  1.33s/it]

Step: 79, Loss: 4.21103048324585


  8%|▊         | 80/1000 [01:47<20:05,  1.31s/it]

Step: 80, Loss: 4.213424205780029


  8%|▊         | 81/1000 [01:48<19:49,  1.29s/it]

Step: 81, Loss: 4.185417652130127


  8%|▊         | 82/1000 [01:50<19:37,  1.28s/it]

Step: 82, Loss: 4.154955863952637


  8%|▊         | 83/1000 [01:51<19:25,  1.27s/it]

Step: 83, Loss: 4.144239902496338


  8%|▊         | 84/1000 [01:52<19:30,  1.28s/it]

Step: 84, Loss: 4.127198219299316


  8%|▊         | 85/1000 [01:54<19:41,  1.29s/it]

Step: 85, Loss: 4.083845138549805


  9%|▊         | 86/1000 [01:55<19:32,  1.28s/it]

Step: 86, Loss: 4.093003273010254


  9%|▊         | 87/1000 [01:56<19:23,  1.27s/it]

Step: 87, Loss: 4.056476593017578


  9%|▉         | 88/1000 [01:57<19:39,  1.29s/it]

Step: 88, Loss: 4.03891134262085


  9%|▉         | 89/1000 [01:59<19:33,  1.29s/it]

Step: 89, Loss: 4.017592906951904


  9%|▉         | 90/1000 [02:00<19:33,  1.29s/it]

Step: 90, Loss: 4.009635925292969


  9%|▉         | 91/1000 [02:01<19:43,  1.30s/it]

Step: 91, Loss: 3.9682276248931885


  9%|▉         | 92/1000 [02:04<24:02,  1.59s/it]

Step: 92, Loss: 3.9669675827026367


  9%|▉         | 93/1000 [02:05<23:10,  1.53s/it]

Step: 93, Loss: 3.933591604232788


  9%|▉         | 94/1000 [02:06<22:06,  1.46s/it]

Step: 94, Loss: 3.8913631439208984


 10%|▉         | 95/1000 [02:08<21:15,  1.41s/it]

Step: 95, Loss: 3.8787152767181396


 10%|▉         | 96/1000 [02:09<20:45,  1.38s/it]

Step: 96, Loss: 3.8692562580108643


 10%|▉         | 97/1000 [02:10<20:07,  1.34s/it]

Step: 97, Loss: 3.828369379043579


 10%|▉         | 98/1000 [02:11<19:41,  1.31s/it]

Step: 98, Loss: 3.840346336364746


 10%|▉         | 99/1000 [02:13<19:25,  1.29s/it]

Step: 99, Loss: 3.805614709854126


 10%|█         | 100/1000 [02:14<19:08,  1.28s/it]

Step: 100, Loss: 3.7813525199890137


 10%|█         | 101/1000 [02:15<19:05,  1.27s/it]

Step: 101, Loss: 3.7737953662872314


 10%|█         | 102/1000 [02:16<19:25,  1.30s/it]

Step: 102, Loss: 3.7413671016693115


 10%|█         | 103/1000 [02:18<19:21,  1.30s/it]

Step: 103, Loss: 3.722752571105957


 10%|█         | 104/1000 [02:19<19:07,  1.28s/it]

Step: 104, Loss: 3.7359631061553955


 10%|█         | 105/1000 [02:20<18:55,  1.27s/it]

Step: 105, Loss: 3.7177681922912598


 11%|█         | 106/1000 [02:22<18:52,  1.27s/it]

Step: 106, Loss: 3.654362440109253


 11%|█         | 107/1000 [02:23<18:49,  1.26s/it]

Step: 107, Loss: 3.6921441555023193


 11%|█         | 108/1000 [02:24<18:52,  1.27s/it]

Step: 108, Loss: 3.6403310298919678


 11%|█         | 109/1000 [02:25<19:18,  1.30s/it]

Step: 109, Loss: 3.624713659286499


 11%|█         | 110/1000 [02:27<19:08,  1.29s/it]

Step: 110, Loss: 3.639085054397583


 11%|█         | 111/1000 [02:28<19:04,  1.29s/it]

Step: 111, Loss: 3.6088461875915527


 11%|█         | 112/1000 [02:29<19:08,  1.29s/it]

Step: 112, Loss: 3.5605711936950684


 11%|█▏        | 113/1000 [02:31<18:57,  1.28s/it]

Step: 113, Loss: 3.550170660018921


 11%|█▏        | 114/1000 [02:32<19:05,  1.29s/it]

Step: 114, Loss: 3.5452959537506104


 12%|█▏        | 115/1000 [02:33<19:02,  1.29s/it]

Step: 115, Loss: 3.5258803367614746


 12%|█▏        | 116/1000 [02:34<19:09,  1.30s/it]

Step: 116, Loss: 3.5154049396514893


 12%|█▏        | 117/1000 [02:36<19:15,  1.31s/it]

Step: 117, Loss: 3.529263973236084


 12%|█▏        | 118/1000 [02:37<19:03,  1.30s/it]

Step: 118, Loss: 3.505782127380371


 12%|█▏        | 119/1000 [02:38<18:50,  1.28s/it]

Step: 119, Loss: 3.4600448608398438


 12%|█▏        | 120/1000 [02:40<18:39,  1.27s/it]

Step: 120, Loss: 3.4474563598632812


 12%|█▏        | 121/1000 [02:41<18:33,  1.27s/it]

Step: 121, Loss: 3.4322900772094727


 12%|█▏        | 122/1000 [02:42<18:27,  1.26s/it]

Step: 122, Loss: 3.4406237602233887


 12%|█▏        | 123/1000 [02:43<18:31,  1.27s/it]

Step: 123, Loss: 3.43967342376709


 12%|█▏        | 124/1000 [02:45<18:36,  1.27s/it]

Step: 124, Loss: 3.373964786529541


 12%|█▎        | 125/1000 [02:46<18:27,  1.27s/it]

Step: 125, Loss: 3.3865811824798584


 13%|█▎        | 126/1000 [02:47<18:25,  1.26s/it]

Step: 126, Loss: 3.358036518096924


 13%|█▎        | 127/1000 [02:48<18:43,  1.29s/it]

Step: 127, Loss: 3.333601713180542


 13%|█▎        | 128/1000 [02:50<18:47,  1.29s/it]

Step: 128, Loss: 3.3353030681610107


 13%|█▎        | 129/1000 [02:51<18:33,  1.28s/it]

Step: 129, Loss: 3.274735450744629


 13%|█▎        | 130/1000 [02:52<18:37,  1.28s/it]

Step: 130, Loss: 3.292576789855957


 13%|█▎        | 131/1000 [02:54<18:47,  1.30s/it]

Step: 131, Loss: 3.2736616134643555


 13%|█▎        | 132/1000 [02:56<21:25,  1.48s/it]

Step: 132, Loss: 3.2420406341552734


 13%|█▎        | 133/1000 [02:57<20:55,  1.45s/it]

Step: 133, Loss: 3.2270758152008057


 13%|█▎        | 134/1000 [02:58<20:10,  1.40s/it]

Step: 134, Loss: 3.2056353092193604


 14%|█▎        | 135/1000 [03:00<19:58,  1.39s/it]

Step: 135, Loss: 3.2140607833862305


 14%|█▎        | 136/1000 [03:01<19:44,  1.37s/it]

Step: 136, Loss: 3.1815061569213867


 14%|█▎        | 137/1000 [03:02<19:32,  1.36s/it]

Step: 137, Loss: 3.1685397624969482


 14%|█▍        | 138/1000 [03:04<19:14,  1.34s/it]

Step: 138, Loss: 3.1571884155273438


 14%|█▍        | 139/1000 [03:05<18:52,  1.32s/it]

Step: 139, Loss: 3.14353609085083


 14%|█▍        | 140/1000 [03:06<18:41,  1.30s/it]

Step: 140, Loss: 3.11411190032959


 14%|█▍        | 141/1000 [03:07<18:27,  1.29s/it]

Step: 141, Loss: 3.0909414291381836


 14%|█▍        | 142/1000 [03:09<18:16,  1.28s/it]

Step: 142, Loss: 3.0792746543884277


 14%|█▍        | 143/1000 [03:10<18:01,  1.26s/it]

Step: 143, Loss: 3.07871150970459


 14%|█▍        | 144/1000 [03:11<17:54,  1.26s/it]

Step: 144, Loss: 3.056098699569702


 14%|█▍        | 145/1000 [03:12<17:54,  1.26s/it]

Step: 145, Loss: 3.0452280044555664


 15%|█▍        | 146/1000 [03:14<17:52,  1.26s/it]

Step: 146, Loss: 3.0610170364379883


 15%|█▍        | 147/1000 [03:15<18:06,  1.27s/it]

Step: 147, Loss: 3.046295166015625


 15%|█▍        | 148/1000 [03:16<18:06,  1.28s/it]

Step: 148, Loss: 3.0196127891540527


 15%|█▍        | 149/1000 [03:17<18:11,  1.28s/it]

Step: 149, Loss: 3.0076074600219727


 15%|█▌        | 150/1000 [03:19<18:15,  1.29s/it]

Step: 150, Loss: 2.994311809539795


 15%|█▌        | 151/1000 [03:20<18:13,  1.29s/it]

Step: 151, Loss: 2.9907572269439697


 15%|█▌        | 152/1000 [03:21<17:58,  1.27s/it]

Step: 152, Loss: 2.9646270275115967


 15%|█▌        | 153/1000 [03:23<17:50,  1.26s/it]

Step: 153, Loss: 2.972268581390381


 15%|█▌        | 154/1000 [03:24<17:47,  1.26s/it]

Step: 154, Loss: 2.9361259937286377


 16%|█▌        | 155/1000 [03:25<19:12,  1.36s/it]

Step: 155, Loss: 2.9342987537384033


 16%|█▌        | 156/1000 [03:27<19:02,  1.35s/it]

Step: 156, Loss: 2.9258828163146973


 16%|█▌        | 157/1000 [03:28<19:05,  1.36s/it]

Step: 157, Loss: 2.911466360092163


 16%|█▌        | 158/1000 [03:29<18:40,  1.33s/it]

Step: 158, Loss: 2.9021167755126953


 16%|█▌        | 159/1000 [03:31<18:19,  1.31s/it]

Step: 159, Loss: 2.9043281078338623


 16%|█▌        | 160/1000 [03:32<18:08,  1.30s/it]

Step: 160, Loss: 2.8862645626068115


 16%|█▌        | 161/1000 [03:33<18:01,  1.29s/it]

Step: 161, Loss: 2.8832502365112305


 16%|█▌        | 162/1000 [03:34<17:47,  1.27s/it]

Step: 162, Loss: 2.8717660903930664


 16%|█▋        | 163/1000 [03:36<17:40,  1.27s/it]

Step: 163, Loss: 2.8332836627960205


 16%|█▋        | 164/1000 [03:37<17:35,  1.26s/it]

Step: 164, Loss: 2.8467135429382324


 16%|█▋        | 165/1000 [03:38<17:31,  1.26s/it]

Step: 165, Loss: 2.822617292404175


 17%|█▋        | 166/1000 [03:39<17:28,  1.26s/it]

Step: 166, Loss: 2.819533348083496


 17%|█▋        | 167/1000 [03:41<17:24,  1.25s/it]

Step: 167, Loss: 2.8172783851623535


 17%|█▋        | 168/1000 [03:42<17:19,  1.25s/it]

Step: 168, Loss: 2.7955875396728516


 17%|█▋        | 169/1000 [03:43<17:20,  1.25s/it]

Step: 169, Loss: 2.789799451828003


 17%|█▋        | 170/1000 [03:44<17:16,  1.25s/it]

Step: 170, Loss: 2.7767250537872314


 17%|█▋        | 171/1000 [03:46<17:19,  1.25s/it]

Step: 171, Loss: 2.7607686519622803


 17%|█▋        | 172/1000 [03:47<17:19,  1.26s/it]

Step: 172, Loss: 2.769595146179199


 17%|█▋        | 173/1000 [03:48<17:16,  1.25s/it]

Step: 173, Loss: 2.7577333450317383


 17%|█▋        | 174/1000 [03:49<17:24,  1.27s/it]

Step: 174, Loss: 2.753408908843994


 18%|█▊        | 175/1000 [03:51<17:30,  1.27s/it]

Step: 175, Loss: 2.740718126296997


 18%|█▊        | 176/1000 [03:52<17:21,  1.26s/it]

Step: 176, Loss: 2.7323801517486572


 18%|█▊        | 177/1000 [03:53<17:17,  1.26s/it]

Step: 177, Loss: 2.71323823928833


 18%|█▊        | 178/1000 [03:54<17:11,  1.26s/it]

Step: 178, Loss: 2.7059850692749023


 18%|█▊        | 179/1000 [03:56<17:07,  1.25s/it]

Step: 179, Loss: 2.712813138961792


 18%|█▊        | 180/1000 [03:57<17:06,  1.25s/it]

Step: 180, Loss: 2.700518846511841


 18%|█▊        | 181/1000 [03:58<17:02,  1.25s/it]

Step: 181, Loss: 2.7002551555633545


 18%|█▊        | 182/1000 [03:59<16:56,  1.24s/it]

Step: 182, Loss: 2.6761791706085205


 18%|█▊        | 183/1000 [04:01<16:56,  1.24s/it]

Step: 183, Loss: 2.6619439125061035


 18%|█▊        | 184/1000 [04:02<16:58,  1.25s/it]

Step: 184, Loss: 2.66548490524292


 18%|█▊        | 185/1000 [04:03<16:57,  1.25s/it]

Step: 185, Loss: 2.665266513824463


 19%|█▊        | 186/1000 [04:04<16:54,  1.25s/it]

Step: 186, Loss: 2.651190996170044


 19%|█▊        | 187/1000 [04:06<16:55,  1.25s/it]

Step: 187, Loss: 2.635920763015747


 19%|█▉        | 188/1000 [04:07<16:52,  1.25s/it]

Step: 188, Loss: 2.6284844875335693


 19%|█▉        | 189/1000 [04:08<16:53,  1.25s/it]

Step: 189, Loss: 2.6337640285491943


 19%|█▉        | 190/1000 [04:09<16:49,  1.25s/it]

Step: 190, Loss: 2.6187210083007812


 19%|█▉        | 191/1000 [04:11<16:44,  1.24s/it]

Step: 191, Loss: 2.638427495956421


 19%|█▉        | 192/1000 [04:12<16:47,  1.25s/it]

Step: 192, Loss: 2.6094956398010254


 19%|█▉        | 193/1000 [04:13<16:39,  1.24s/it]

Step: 193, Loss: 2.590393304824829


 19%|█▉        | 194/1000 [04:14<16:38,  1.24s/it]

Step: 194, Loss: 2.5765106678009033


 20%|█▉        | 195/1000 [04:16<16:39,  1.24s/it]

Step: 195, Loss: 2.593153715133667


 20%|█▉        | 196/1000 [04:17<16:46,  1.25s/it]

Step: 196, Loss: 2.55998158454895


 20%|█▉        | 197/1000 [04:18<16:43,  1.25s/it]

Step: 197, Loss: 2.5506935119628906


 20%|█▉        | 198/1000 [04:19<16:37,  1.24s/it]

Step: 198, Loss: 2.5574989318847656


 20%|█▉        | 199/1000 [04:21<16:35,  1.24s/it]

Step: 199, Loss: 2.552389144897461


 20%|██        | 200/1000 [04:22<16:33,  1.24s/it]

Step: 200, Loss: 2.5486807823181152


 20%|██        | 201/1000 [04:23<16:32,  1.24s/it]

Step: 201, Loss: 2.5407919883728027


 20%|██        | 202/1000 [04:24<16:27,  1.24s/it]

Step: 202, Loss: 2.527081251144409


 20%|██        | 203/1000 [04:26<16:25,  1.24s/it]

Step: 203, Loss: 2.527327060699463


 20%|██        | 204/1000 [04:27<16:29,  1.24s/it]

Step: 204, Loss: 2.5147957801818848


 20%|██        | 205/1000 [04:28<16:40,  1.26s/it]

Step: 205, Loss: 2.5071871280670166


 21%|██        | 206/1000 [04:29<16:40,  1.26s/it]

Step: 206, Loss: 2.4999215602874756


 21%|██        | 207/1000 [04:31<16:39,  1.26s/it]

Step: 207, Loss: 2.499516487121582


 21%|██        | 208/1000 [04:32<16:38,  1.26s/it]

Step: 208, Loss: 2.4837758541107178


 21%|██        | 209/1000 [04:33<16:39,  1.26s/it]

Step: 209, Loss: 2.483898162841797


 21%|██        | 210/1000 [04:34<16:34,  1.26s/it]

Step: 210, Loss: 2.476641893386841


 21%|██        | 211/1000 [04:36<16:28,  1.25s/it]

Step: 211, Loss: 2.469536304473877


 21%|██        | 212/1000 [04:37<16:24,  1.25s/it]

Step: 212, Loss: 2.4797840118408203


 21%|██▏       | 213/1000 [04:38<16:21,  1.25s/it]

Step: 213, Loss: 2.4540274143218994


 21%|██▏       | 214/1000 [04:39<16:17,  1.24s/it]

Step: 214, Loss: 2.456834316253662


 22%|██▏       | 215/1000 [04:41<16:18,  1.25s/it]

Step: 215, Loss: 2.4469122886657715


 22%|██▏       | 216/1000 [04:42<16:19,  1.25s/it]

Step: 216, Loss: 2.4382615089416504


 22%|██▏       | 217/1000 [04:43<16:12,  1.24s/it]

Step: 217, Loss: 2.4369301795959473


 22%|██▏       | 218/1000 [04:44<16:07,  1.24s/it]

Step: 218, Loss: 2.421553611755371


 22%|██▏       | 219/1000 [04:46<16:07,  1.24s/it]

Step: 219, Loss: 2.4122257232666016


 22%|██▏       | 220/1000 [04:47<16:05,  1.24s/it]

Step: 220, Loss: 2.39726185798645


 22%|██▏       | 221/1000 [04:48<16:10,  1.25s/it]

Step: 221, Loss: 2.4058175086975098


 22%|██▏       | 222/1000 [04:49<16:04,  1.24s/it]

Step: 222, Loss: 2.41076397895813


 22%|██▏       | 223/1000 [04:51<16:00,  1.24s/it]

Step: 223, Loss: 2.391404867172241


 22%|██▏       | 224/1000 [04:52<15:59,  1.24s/it]

Step: 224, Loss: 2.3944830894470215


 22%|██▎       | 225/1000 [04:53<15:57,  1.24s/it]

Step: 225, Loss: 2.389347791671753


 23%|██▎       | 226/1000 [04:54<15:55,  1.23s/it]

Step: 226, Loss: 2.3810806274414062


 23%|██▎       | 227/1000 [04:55<15:57,  1.24s/it]

Step: 227, Loss: 2.363208055496216


 23%|██▎       | 228/1000 [04:57<16:04,  1.25s/it]

Step: 228, Loss: 2.3676576614379883


 23%|██▎       | 229/1000 [04:58<16:12,  1.26s/it]

Step: 229, Loss: 2.361898422241211


 23%|██▎       | 230/1000 [04:59<16:20,  1.27s/it]

Step: 230, Loss: 2.3650498390197754


 23%|██▎       | 231/1000 [05:01<16:35,  1.29s/it]

Step: 231, Loss: 2.3554558753967285


 23%|██▎       | 232/1000 [05:02<16:41,  1.30s/it]

Step: 232, Loss: 2.34127140045166


 23%|██▎       | 233/1000 [05:03<16:39,  1.30s/it]

Step: 233, Loss: 2.351487636566162


 23%|██▎       | 234/1000 [05:05<16:33,  1.30s/it]

Step: 234, Loss: 2.323024034500122


 24%|██▎       | 235/1000 [05:06<16:28,  1.29s/it]

Step: 235, Loss: 2.3258635997772217


 24%|██▎       | 236/1000 [05:07<16:17,  1.28s/it]

Step: 236, Loss: 2.3272762298583984


 24%|██▎       | 237/1000 [05:09<17:32,  1.38s/it]

Step: 237, Loss: 2.31526517868042


 24%|██▍       | 238/1000 [05:10<17:23,  1.37s/it]

Step: 238, Loss: 2.310800313949585


 24%|██▍       | 239/1000 [05:11<17:03,  1.34s/it]

Step: 239, Loss: 2.300168514251709


 24%|██▍       | 240/1000 [05:13<16:56,  1.34s/it]

Step: 240, Loss: 2.297590732574463


 24%|██▍       | 241/1000 [05:14<16:49,  1.33s/it]

Step: 241, Loss: 2.29990553855896


 24%|██▍       | 242/1000 [05:15<16:39,  1.32s/it]

Step: 242, Loss: 2.312640905380249


 24%|██▍       | 243/1000 [05:17<16:39,  1.32s/it]

Step: 243, Loss: 2.2786154747009277


 24%|██▍       | 244/1000 [05:18<16:39,  1.32s/it]

Step: 244, Loss: 2.2692651748657227


 24%|██▍       | 245/1000 [05:19<16:28,  1.31s/it]

Step: 245, Loss: 2.284116268157959


 25%|██▍       | 246/1000 [05:21<16:15,  1.29s/it]

Step: 246, Loss: 2.270493268966675


 25%|██▍       | 247/1000 [05:22<16:03,  1.28s/it]

Step: 247, Loss: 2.258357286453247


 25%|██▍       | 248/1000 [05:23<15:54,  1.27s/it]

Step: 248, Loss: 2.2581989765167236


 25%|██▍       | 249/1000 [05:24<15:49,  1.26s/it]

Step: 249, Loss: 2.2582590579986572


 25%|██▌       | 250/1000 [05:25<15:44,  1.26s/it]

Step: 250, Loss: 2.2584471702575684


 25%|██▌       | 251/1000 [05:27<15:41,  1.26s/it]

Step: 251, Loss: 2.2448601722717285


 25%|██▌       | 252/1000 [05:28<15:35,  1.25s/it]

Step: 252, Loss: 2.235582113265991


 25%|██▌       | 253/1000 [05:29<15:33,  1.25s/it]

Step: 253, Loss: 2.2163822650909424


 25%|██▌       | 254/1000 [05:30<15:30,  1.25s/it]

Step: 254, Loss: 2.2165212631225586


 26%|██▌       | 255/1000 [05:32<15:30,  1.25s/it]

Step: 255, Loss: 2.227012872695923


 26%|██▌       | 256/1000 [05:33<15:24,  1.24s/it]

Step: 256, Loss: 2.215651273727417


 26%|██▌       | 257/1000 [05:34<15:24,  1.24s/it]

Step: 257, Loss: 2.200838804244995


 26%|██▌       | 258/1000 [05:35<15:22,  1.24s/it]

Step: 258, Loss: 2.2071759700775146


 26%|██▌       | 259/1000 [05:37<15:19,  1.24s/it]

Step: 259, Loss: 2.213855266571045


 26%|██▌       | 260/1000 [05:38<15:16,  1.24s/it]

Step: 260, Loss: 2.1967334747314453


 26%|██▌       | 261/1000 [05:39<15:14,  1.24s/it]

Step: 261, Loss: 2.1848485469818115


 26%|██▌       | 262/1000 [05:40<15:15,  1.24s/it]

Step: 262, Loss: 2.189985990524292


 26%|██▋       | 263/1000 [05:42<15:15,  1.24s/it]

Step: 263, Loss: 2.175014019012451


 26%|██▋       | 264/1000 [05:43<15:12,  1.24s/it]

Step: 264, Loss: 2.176304340362549


 26%|██▋       | 265/1000 [05:44<15:27,  1.26s/it]

Step: 265, Loss: 2.1599578857421875


 27%|██▋       | 266/1000 [05:45<15:31,  1.27s/it]

Step: 266, Loss: 2.161771297454834


 27%|██▋       | 267/1000 [05:47<15:34,  1.28s/it]

Step: 267, Loss: 2.1521177291870117


 27%|██▋       | 268/1000 [05:48<15:38,  1.28s/it]

Step: 268, Loss: 2.1638524532318115


 27%|██▋       | 269/1000 [05:49<16:01,  1.32s/it]

Step: 269, Loss: 2.1556742191314697


 27%|██▋       | 270/1000 [05:51<16:14,  1.34s/it]

Step: 270, Loss: 2.15576171875


 27%|██▋       | 271/1000 [05:52<16:05,  1.32s/it]

Step: 271, Loss: 2.1437716484069824


 27%|██▋       | 272/1000 [05:53<15:47,  1.30s/it]

Step: 272, Loss: 2.13720440864563


 27%|██▋       | 273/1000 [05:55<15:51,  1.31s/it]

Step: 273, Loss: 2.1448142528533936


 27%|██▋       | 274/1000 [05:56<15:54,  1.31s/it]

Step: 274, Loss: 2.1204216480255127


 28%|██▊       | 275/1000 [05:57<15:53,  1.31s/it]

Step: 275, Loss: 2.126751661300659


 28%|██▊       | 276/1000 [05:59<15:38,  1.30s/it]

Step: 276, Loss: 2.119762659072876


 28%|██▊       | 277/1000 [06:00<15:31,  1.29s/it]

Step: 277, Loss: 2.107271671295166


 28%|██▊       | 278/1000 [06:01<15:26,  1.28s/it]

Step: 278, Loss: 2.1112284660339355


 28%|██▊       | 279/1000 [06:03<15:42,  1.31s/it]

Step: 279, Loss: 2.1022229194641113


 28%|██▊       | 280/1000 [06:04<15:37,  1.30s/it]

Step: 280, Loss: 2.1047966480255127


 28%|██▊       | 281/1000 [06:05<15:39,  1.31s/it]

Step: 281, Loss: 2.097135543823242


 28%|██▊       | 282/1000 [06:06<15:29,  1.30s/it]

Step: 282, Loss: 2.0889432430267334


 28%|██▊       | 283/1000 [06:08<15:40,  1.31s/it]

Step: 283, Loss: 2.091567039489746


 28%|██▊       | 284/1000 [06:09<15:45,  1.32s/it]

Step: 284, Loss: 2.0692992210388184


 28%|██▊       | 285/1000 [06:10<15:45,  1.32s/it]

Step: 285, Loss: 2.078667640686035


 29%|██▊       | 286/1000 [06:12<15:29,  1.30s/it]

Step: 286, Loss: 2.0790016651153564


 29%|██▊       | 287/1000 [06:13<15:17,  1.29s/it]

Step: 287, Loss: 2.0674118995666504


 29%|██▉       | 288/1000 [06:14<15:12,  1.28s/it]

Step: 288, Loss: 2.05850887298584


 29%|██▉       | 289/1000 [06:15<15:08,  1.28s/it]

Step: 289, Loss: 2.056497573852539


 29%|██▉       | 290/1000 [06:17<15:15,  1.29s/it]

Step: 290, Loss: 2.075913429260254


 29%|██▉       | 291/1000 [06:18<15:24,  1.30s/it]

Step: 291, Loss: 2.05022931098938


 29%|██▉       | 292/1000 [06:19<15:24,  1.31s/it]

Step: 292, Loss: 2.0472214221954346


 29%|██▉       | 293/1000 [06:21<15:45,  1.34s/it]

Step: 293, Loss: 2.0370187759399414


 29%|██▉       | 294/1000 [06:22<15:48,  1.34s/it]

Step: 294, Loss: 2.042618989944458


 30%|██▉       | 295/1000 [06:23<15:42,  1.34s/it]

Step: 295, Loss: 2.024348020553589


 30%|██▉       | 296/1000 [06:25<15:38,  1.33s/it]

Step: 296, Loss: 2.021761894226074


 30%|██▉       | 297/1000 [06:26<15:30,  1.32s/it]

Step: 297, Loss: 2.0169334411621094


 30%|██▉       | 298/1000 [06:27<15:22,  1.31s/it]

Step: 298, Loss: 2.023592948913574


 30%|██▉       | 299/1000 [06:29<15:15,  1.31s/it]

Step: 299, Loss: 2.020275592803955


 30%|███       | 300/1000 [06:30<15:14,  1.31s/it]

Step: 300, Loss: 2.0131309032440186


 30%|███       | 301/1000 [06:31<15:10,  1.30s/it]

Step: 301, Loss: 1.9999220371246338


 30%|███       | 302/1000 [06:33<15:06,  1.30s/it]

Step: 302, Loss: 2.0044760704040527


 30%|███       | 303/1000 [06:34<14:58,  1.29s/it]

Step: 303, Loss: 1.9987471103668213


 30%|███       | 304/1000 [06:35<14:59,  1.29s/it]

Step: 304, Loss: 2.004619598388672


 30%|███       | 305/1000 [06:36<14:57,  1.29s/it]

Step: 305, Loss: 1.9981834888458252


 31%|███       | 306/1000 [06:38<14:52,  1.29s/it]

Step: 306, Loss: 1.987799882888794


 31%|███       | 307/1000 [06:39<14:52,  1.29s/it]

Step: 307, Loss: 1.9850342273712158


 31%|███       | 308/1000 [06:40<14:54,  1.29s/it]

Step: 308, Loss: 1.9757187366485596


 31%|███       | 309/1000 [06:42<14:52,  1.29s/it]

Step: 309, Loss: 1.9718835353851318


 31%|███       | 310/1000 [06:43<14:43,  1.28s/it]

Step: 310, Loss: 1.9721953868865967


 31%|███       | 311/1000 [06:44<14:45,  1.28s/it]

Step: 311, Loss: 1.9777334928512573


 31%|███       | 312/1000 [06:45<14:48,  1.29s/it]

Step: 312, Loss: 1.9561539888381958


 31%|███▏      | 313/1000 [06:47<14:49,  1.30s/it]

Step: 313, Loss: 1.971370816230774


 31%|███▏      | 314/1000 [06:48<14:46,  1.29s/it]

Step: 314, Loss: 1.9488351345062256


 32%|███▏      | 315/1000 [06:49<14:48,  1.30s/it]

Step: 315, Loss: 1.9471431970596313


 32%|███▏      | 316/1000 [06:51<14:42,  1.29s/it]

Step: 316, Loss: 1.9537888765335083


 32%|███▏      | 317/1000 [06:52<14:40,  1.29s/it]

Step: 317, Loss: 1.9325408935546875


 32%|███▏      | 318/1000 [06:53<14:36,  1.29s/it]

Step: 318, Loss: 1.9380508661270142


 32%|███▏      | 319/1000 [06:54<14:33,  1.28s/it]

Step: 319, Loss: 1.9235093593597412


 32%|███▏      | 320/1000 [06:56<14:35,  1.29s/it]

Step: 320, Loss: 1.919907808303833


 32%|███▏      | 321/1000 [06:57<14:28,  1.28s/it]

Step: 321, Loss: 1.9156993627548218


 32%|███▏      | 322/1000 [06:58<14:36,  1.29s/it]

Step: 322, Loss: 1.9222040176391602


 32%|███▏      | 323/1000 [07:00<14:40,  1.30s/it]

Step: 323, Loss: 1.9258331060409546


 32%|███▏      | 324/1000 [07:01<14:37,  1.30s/it]

Step: 324, Loss: 1.9132370948791504


 32%|███▎      | 325/1000 [07:02<14:36,  1.30s/it]

Step: 325, Loss: 1.8958605527877808


 33%|███▎      | 326/1000 [07:04<14:34,  1.30s/it]

Step: 326, Loss: 1.9060051441192627


 33%|███▎      | 327/1000 [07:05<14:51,  1.32s/it]

Step: 327, Loss: 1.8986605405807495


 33%|███▎      | 328/1000 [07:06<14:43,  1.32s/it]

Step: 328, Loss: 1.8946740627288818


 33%|███▎      | 329/1000 [07:08<14:53,  1.33s/it]

Step: 329, Loss: 1.884945034980774


 33%|███▎      | 330/1000 [07:09<14:42,  1.32s/it]

Step: 330, Loss: 1.8893098831176758


 33%|███▎      | 331/1000 [07:10<14:27,  1.30s/it]

Step: 331, Loss: 1.8902931213378906


 33%|███▎      | 332/1000 [07:11<14:17,  1.28s/it]

Step: 332, Loss: 1.8779780864715576


 33%|███▎      | 333/1000 [07:13<14:12,  1.28s/it]

Step: 333, Loss: 1.8839378356933594


 33%|███▎      | 334/1000 [07:14<14:04,  1.27s/it]

Step: 334, Loss: 1.8750616312026978


 34%|███▎      | 335/1000 [07:15<14:01,  1.27s/it]

Step: 335, Loss: 1.872565507888794


 34%|███▎      | 336/1000 [07:16<13:54,  1.26s/it]

Step: 336, Loss: 1.8711254596710205


 34%|███▎      | 337/1000 [07:18<13:52,  1.26s/it]

Step: 337, Loss: 1.8555479049682617


 34%|███▍      | 338/1000 [07:19<13:51,  1.26s/it]

Step: 338, Loss: 1.8620057106018066


 34%|███▍      | 339/1000 [07:20<13:53,  1.26s/it]

Step: 339, Loss: 1.8448370695114136


 34%|███▍      | 340/1000 [07:21<13:52,  1.26s/it]

Step: 340, Loss: 1.8490819931030273


 34%|███▍      | 341/1000 [07:23<14:30,  1.32s/it]

Step: 341, Loss: 1.8376710414886475


 34%|███▍      | 342/1000 [07:24<14:57,  1.36s/it]

Step: 342, Loss: 1.8405983448028564


 34%|███▍      | 343/1000 [07:26<14:41,  1.34s/it]

Step: 343, Loss: 1.8477063179016113


 34%|███▍      | 344/1000 [07:27<14:31,  1.33s/it]

Step: 344, Loss: 1.8327556848526


 34%|███▍      | 345/1000 [07:28<14:12,  1.30s/it]

Step: 345, Loss: 1.8326172828674316


 35%|███▍      | 346/1000 [07:30<14:26,  1.32s/it]

Step: 346, Loss: 1.8349581956863403


 35%|███▍      | 347/1000 [07:31<14:23,  1.32s/it]

Step: 347, Loss: 1.825859785079956


 35%|███▍      | 348/1000 [07:32<14:19,  1.32s/it]

Step: 348, Loss: 1.8169573545455933


 35%|███▍      | 349/1000 [07:33<14:09,  1.30s/it]

Step: 349, Loss: 1.817712664604187


 35%|███▌      | 350/1000 [07:35<14:12,  1.31s/it]

Step: 350, Loss: 1.8158354759216309


 35%|███▌      | 351/1000 [07:36<14:06,  1.30s/it]

Step: 351, Loss: 1.809472680091858


 35%|███▌      | 352/1000 [07:37<14:01,  1.30s/it]

Step: 352, Loss: 1.8082700967788696


 35%|███▌      | 353/1000 [07:39<13:58,  1.30s/it]

Step: 353, Loss: 1.7991713285446167


 35%|███▌      | 354/1000 [07:40<13:49,  1.28s/it]

Step: 354, Loss: 1.806868314743042


 36%|███▌      | 355/1000 [07:41<13:43,  1.28s/it]

Step: 355, Loss: 1.8034955263137817


 36%|███▌      | 356/1000 [07:42<13:41,  1.28s/it]

Step: 356, Loss: 1.7885525226593018


 36%|███▌      | 357/1000 [07:44<13:36,  1.27s/it]

Step: 357, Loss: 1.8022531270980835


 36%|███▌      | 358/1000 [07:45<13:35,  1.27s/it]

Step: 358, Loss: 1.7918684482574463


 36%|███▌      | 359/1000 [07:46<13:30,  1.26s/it]

Step: 359, Loss: 1.8080830574035645


 36%|███▌      | 360/1000 [07:47<13:25,  1.26s/it]

Step: 360, Loss: 1.7833638191223145


 36%|███▌      | 361/1000 [07:49<13:48,  1.30s/it]

Step: 361, Loss: 1.7831649780273438


 36%|███▌      | 362/1000 [07:50<13:38,  1.28s/it]

Step: 362, Loss: 1.7722039222717285


 36%|███▋      | 363/1000 [07:51<13:33,  1.28s/it]

Step: 363, Loss: 1.7841622829437256


 36%|███▋      | 364/1000 [07:53<13:23,  1.26s/it]

Step: 364, Loss: 1.787180781364441


 36%|███▋      | 365/1000 [07:54<13:18,  1.26s/it]

Step: 365, Loss: 1.77695631980896


 37%|███▋      | 366/1000 [07:55<13:13,  1.25s/it]

Step: 366, Loss: 1.7799904346466064


 37%|███▋      | 367/1000 [07:56<13:10,  1.25s/it]

Step: 367, Loss: 1.7618610858917236


 37%|███▋      | 368/1000 [07:58<13:07,  1.25s/it]

Step: 368, Loss: 1.7644802331924438


 37%|███▋      | 369/1000 [07:59<13:07,  1.25s/it]

Step: 369, Loss: 1.770908236503601


 37%|███▋      | 370/1000 [08:00<13:05,  1.25s/it]

Step: 370, Loss: 1.7499526739120483


 37%|███▋      | 371/1000 [08:01<13:04,  1.25s/it]

Step: 371, Loss: 1.753391146659851


 37%|███▋      | 372/1000 [08:03<13:03,  1.25s/it]

Step: 372, Loss: 1.7430421113967896


 37%|███▋      | 373/1000 [08:04<13:02,  1.25s/it]

Step: 373, Loss: 1.743455171585083


 37%|███▋      | 374/1000 [08:05<12:59,  1.25s/it]

Step: 374, Loss: 1.7508363723754883


 38%|███▊      | 375/1000 [08:06<13:07,  1.26s/it]

Step: 375, Loss: 1.7528423070907593


 38%|███▊      | 376/1000 [08:08<13:20,  1.28s/it]

Step: 376, Loss: 1.7510374784469604


 38%|███▊      | 377/1000 [08:09<13:22,  1.29s/it]

Step: 377, Loss: 1.7442044019699097


 38%|███▊      | 378/1000 [08:10<13:26,  1.30s/it]

Step: 378, Loss: 1.7544358968734741


 38%|███▊      | 379/1000 [08:12<13:22,  1.29s/it]

Step: 379, Loss: 1.7283929586410522


 38%|███▊      | 380/1000 [08:13<13:18,  1.29s/it]

Step: 380, Loss: 1.727107286453247


 38%|███▊      | 381/1000 [08:14<13:17,  1.29s/it]

Step: 381, Loss: 1.7245204448699951


 38%|███▊      | 382/1000 [08:15<13:18,  1.29s/it]

Step: 382, Loss: 1.716093897819519


 38%|███▊      | 383/1000 [08:17<13:21,  1.30s/it]

Step: 383, Loss: 1.7284072637557983


 38%|███▊      | 384/1000 [08:18<13:17,  1.29s/it]

Step: 384, Loss: 1.7126175165176392


 38%|███▊      | 385/1000 [08:19<13:19,  1.30s/it]

Step: 385, Loss: 1.7147936820983887


 39%|███▊      | 386/1000 [08:21<13:22,  1.31s/it]

Step: 386, Loss: 1.7088621854782104


 39%|███▊      | 387/1000 [08:22<13:13,  1.29s/it]

Step: 387, Loss: 1.7097609043121338


 39%|███▉      | 388/1000 [08:23<13:12,  1.29s/it]

Step: 388, Loss: 1.718886375427246


 39%|███▉      | 389/1000 [08:25<13:16,  1.30s/it]

Step: 389, Loss: 1.6907227039337158


 39%|███▉      | 390/1000 [08:26<13:02,  1.28s/it]

Step: 390, Loss: 1.7021805047988892


 39%|███▉      | 391/1000 [08:27<12:56,  1.27s/it]

Step: 391, Loss: 1.685662865638733


 39%|███▉      | 392/1000 [08:28<12:54,  1.27s/it]

Step: 392, Loss: 1.695806860923767


 39%|███▉      | 393/1000 [08:30<12:50,  1.27s/it]

Step: 393, Loss: 1.6924175024032593


 39%|███▉      | 394/1000 [08:31<12:43,  1.26s/it]

Step: 394, Loss: 1.6850634813308716


 40%|███▉      | 395/1000 [08:32<12:36,  1.25s/it]

Step: 395, Loss: 1.6942654848098755


 40%|███▉      | 396/1000 [08:33<12:37,  1.25s/it]

Step: 396, Loss: 1.6778408288955688


 40%|███▉      | 397/1000 [08:35<12:41,  1.26s/it]

Step: 397, Loss: 1.6762484312057495


 40%|███▉      | 398/1000 [08:36<12:35,  1.26s/it]

Step: 398, Loss: 1.666825532913208


 40%|███▉      | 399/1000 [08:37<12:32,  1.25s/it]

Step: 399, Loss: 1.6683272123336792


 40%|████      | 400/1000 [08:38<12:28,  1.25s/it]

Step: 400, Loss: 1.6692368984222412


 40%|████      | 401/1000 [08:40<12:28,  1.25s/it]

Step: 401, Loss: 1.6525112390518188


 40%|████      | 402/1000 [08:41<12:24,  1.24s/it]

Step: 402, Loss: 1.6508442163467407


 40%|████      | 403/1000 [08:42<12:24,  1.25s/it]

Step: 403, Loss: 1.662459373474121


 40%|████      | 404/1000 [08:43<12:23,  1.25s/it]

Step: 404, Loss: 1.645723819732666


 40%|████      | 405/1000 [08:45<12:23,  1.25s/it]

Step: 405, Loss: 1.6492047309875488


 41%|████      | 406/1000 [08:46<12:23,  1.25s/it]

Step: 406, Loss: 1.639643907546997


 41%|████      | 407/1000 [08:47<12:25,  1.26s/it]

Step: 407, Loss: 1.6413480043411255


 41%|████      | 408/1000 [08:48<12:20,  1.25s/it]

Step: 408, Loss: 1.6426194906234741


 41%|████      | 409/1000 [08:50<12:25,  1.26s/it]

Step: 409, Loss: 1.6377977132797241


 41%|████      | 410/1000 [08:51<12:24,  1.26s/it]

Step: 410, Loss: 1.6263537406921387


 41%|████      | 411/1000 [08:52<12:23,  1.26s/it]

Step: 411, Loss: 1.624428153038025


 41%|████      | 412/1000 [08:53<12:18,  1.26s/it]

Step: 412, Loss: 1.6193684339523315


 41%|████▏     | 413/1000 [08:55<12:16,  1.25s/it]

Step: 413, Loss: 1.6217727661132812


 41%|████▏     | 414/1000 [08:56<12:19,  1.26s/it]

Step: 414, Loss: 1.6145272254943848


 42%|████▏     | 415/1000 [08:57<12:15,  1.26s/it]

Step: 415, Loss: 1.6210802793502808


 42%|████▏     | 416/1000 [08:58<12:21,  1.27s/it]

Step: 416, Loss: 1.6042355298995972


 42%|████▏     | 417/1000 [09:00<13:30,  1.39s/it]

Step: 417, Loss: 1.6055047512054443


 42%|████▏     | 418/1000 [09:02<14:16,  1.47s/it]

Step: 418, Loss: 1.601243257522583


 42%|████▏     | 419/1000 [09:03<13:59,  1.44s/it]

Step: 419, Loss: 1.5983798503875732


 42%|████▏     | 420/1000 [09:04<13:36,  1.41s/it]

Step: 420, Loss: 1.6062986850738525


 42%|████▏     | 421/1000 [09:06<13:19,  1.38s/it]

Step: 421, Loss: 1.6035668849945068


 42%|████▏     | 422/1000 [09:07<12:59,  1.35s/it]

Step: 422, Loss: 1.5872888565063477


 42%|████▏     | 423/1000 [09:08<12:51,  1.34s/it]

Step: 423, Loss: 1.5894678831100464


 42%|████▏     | 424/1000 [09:10<12:42,  1.32s/it]

Step: 424, Loss: 1.591080904006958


 42%|████▎     | 425/1000 [09:11<12:28,  1.30s/it]

Step: 425, Loss: 1.5854125022888184


 43%|████▎     | 426/1000 [09:12<12:17,  1.28s/it]

Step: 426, Loss: 1.5968520641326904


 43%|████▎     | 427/1000 [09:13<12:07,  1.27s/it]

Step: 427, Loss: 1.581413745880127


 43%|████▎     | 428/1000 [09:15<12:00,  1.26s/it]

Step: 428, Loss: 1.5859391689300537


 43%|████▎     | 429/1000 [09:16<11:58,  1.26s/it]

Step: 429, Loss: 1.5803991556167603


 43%|████▎     | 430/1000 [09:17<11:52,  1.25s/it]

Step: 430, Loss: 1.5767436027526855


 43%|████▎     | 431/1000 [09:18<11:50,  1.25s/it]

Step: 431, Loss: 1.5735453367233276


 43%|████▎     | 432/1000 [09:20<11:47,  1.25s/it]

Step: 432, Loss: 1.5763299465179443


 43%|████▎     | 433/1000 [09:21<11:48,  1.25s/it]

Step: 433, Loss: 1.5805529356002808


 43%|████▎     | 434/1000 [09:22<11:44,  1.25s/it]

Step: 434, Loss: 1.5663713216781616


 44%|████▎     | 435/1000 [09:23<11:43,  1.25s/it]

Step: 435, Loss: 1.5612573623657227


 44%|████▎     | 436/1000 [09:25<11:42,  1.25s/it]

Step: 436, Loss: 1.5696091651916504


 44%|████▎     | 437/1000 [09:26<11:41,  1.25s/it]

Step: 437, Loss: 1.5765949487686157


 44%|████▍     | 438/1000 [09:27<11:37,  1.24s/it]

Step: 438, Loss: 1.5573409795761108


 44%|████▍     | 439/1000 [09:28<11:33,  1.24s/it]

Step: 439, Loss: 1.5530329942703247


 44%|████▍     | 440/1000 [09:30<11:39,  1.25s/it]

Step: 440, Loss: 1.5485285520553589


 44%|████▍     | 441/1000 [09:31<11:38,  1.25s/it]

Step: 441, Loss: 1.5477162599563599


 44%|████▍     | 442/1000 [09:32<11:34,  1.24s/it]

Step: 442, Loss: 1.5502595901489258


 44%|████▍     | 443/1000 [09:33<11:33,  1.24s/it]

Step: 443, Loss: 1.5381454229354858


 44%|████▍     | 444/1000 [09:35<11:33,  1.25s/it]

Step: 444, Loss: 1.5557056665420532


 44%|████▍     | 445/1000 [09:36<11:33,  1.25s/it]

Step: 445, Loss: 1.540674090385437


 45%|████▍     | 446/1000 [09:37<11:31,  1.25s/it]

Step: 446, Loss: 1.5493320226669312


 45%|████▍     | 447/1000 [09:38<11:29,  1.25s/it]

Step: 447, Loss: 1.5414646863937378


 45%|████▍     | 448/1000 [09:40<11:29,  1.25s/it]

Step: 448, Loss: 1.5510210990905762


 45%|████▍     | 449/1000 [09:41<11:30,  1.25s/it]

Step: 449, Loss: 1.5366010665893555


 45%|████▌     | 450/1000 [09:42<11:27,  1.25s/it]

Step: 450, Loss: 1.5322500467300415


 45%|████▌     | 451/1000 [09:43<11:24,  1.25s/it]

Step: 451, Loss: 1.5434465408325195


 45%|████▌     | 452/1000 [09:45<11:20,  1.24s/it]

Step: 452, Loss: 1.5222958326339722


 45%|████▌     | 453/1000 [09:46<11:19,  1.24s/it]

Step: 453, Loss: 1.5338833332061768


 45%|████▌     | 454/1000 [09:47<11:23,  1.25s/it]

Step: 454, Loss: 1.5321366786956787


 46%|████▌     | 455/1000 [09:48<11:19,  1.25s/it]

Step: 455, Loss: 1.5228289365768433


 46%|████▌     | 456/1000 [09:50<11:18,  1.25s/it]

Step: 456, Loss: 1.5418877601623535


 46%|████▌     | 457/1000 [09:51<11:17,  1.25s/it]

Step: 457, Loss: 1.512768268585205


 46%|████▌     | 458/1000 [09:52<11:17,  1.25s/it]

Step: 458, Loss: 1.5157911777496338


 46%|████▌     | 459/1000 [09:53<11:10,  1.24s/it]

Step: 459, Loss: 1.5079318284988403


 46%|████▌     | 460/1000 [09:55<11:07,  1.24s/it]

Step: 460, Loss: 1.5031300783157349


 46%|████▌     | 461/1000 [09:56<11:06,  1.24s/it]

Step: 461, Loss: 1.5159411430358887


 46%|████▌     | 462/1000 [09:57<11:04,  1.23s/it]

Step: 462, Loss: 1.5060337781906128


 46%|████▋     | 463/1000 [09:58<11:03,  1.24s/it]

Step: 463, Loss: 1.5018352270126343


 46%|████▋     | 464/1000 [09:59<11:05,  1.24s/it]

Step: 464, Loss: 1.5003260374069214


 46%|████▋     | 465/1000 [10:01<11:06,  1.25s/it]

Step: 465, Loss: 1.511448860168457


 47%|████▋     | 466/1000 [10:02<11:04,  1.25s/it]

Step: 466, Loss: 1.4900840520858765


 47%|████▋     | 467/1000 [10:03<11:03,  1.25s/it]

Step: 467, Loss: 1.4946725368499756


 47%|████▋     | 468/1000 [10:04<11:00,  1.24s/it]

Step: 468, Loss: 1.500951886177063


 47%|████▋     | 469/1000 [10:06<11:02,  1.25s/it]

Step: 469, Loss: 1.4806500673294067


 47%|████▋     | 470/1000 [10:07<11:00,  1.25s/it]

Step: 470, Loss: 1.4891688823699951


 47%|████▋     | 471/1000 [10:08<11:11,  1.27s/it]

Step: 471, Loss: 1.4795846939086914


 47%|████▋     | 472/1000 [10:10<11:06,  1.26s/it]

Step: 472, Loss: 1.4790945053100586


 47%|████▋     | 473/1000 [10:11<11:02,  1.26s/it]

Step: 473, Loss: 1.4731303453445435


 47%|████▋     | 474/1000 [10:12<10:58,  1.25s/it]

Step: 474, Loss: 1.4747096300125122


 48%|████▊     | 475/1000 [10:13<10:55,  1.25s/it]

Step: 475, Loss: 1.4708893299102783


 48%|████▊     | 476/1000 [10:14<10:53,  1.25s/it]

Step: 476, Loss: 1.463290810585022


 48%|████▊     | 477/1000 [10:16<10:50,  1.24s/it]

Step: 477, Loss: 1.4687583446502686


 48%|████▊     | 478/1000 [10:17<10:49,  1.24s/it]

Step: 478, Loss: 1.4526830911636353


 48%|████▊     | 479/1000 [10:18<10:48,  1.25s/it]

Step: 479, Loss: 1.458796739578247


 48%|████▊     | 480/1000 [10:19<10:46,  1.24s/it]

Step: 480, Loss: 1.453660488128662


 48%|████▊     | 481/1000 [10:21<10:43,  1.24s/it]

Step: 481, Loss: 1.4658836126327515


 48%|████▊     | 482/1000 [10:22<10:44,  1.25s/it]

Step: 482, Loss: 1.4486169815063477


 48%|████▊     | 483/1000 [10:23<10:42,  1.24s/it]

Step: 483, Loss: 1.4521191120147705


 48%|████▊     | 484/1000 [10:24<10:42,  1.24s/it]

Step: 484, Loss: 1.4471426010131836


 48%|████▊     | 485/1000 [10:26<10:39,  1.24s/it]

Step: 485, Loss: 1.4423273801803589


 49%|████▊     | 486/1000 [10:27<10:38,  1.24s/it]

Step: 486, Loss: 1.4389184713363647


 49%|████▊     | 487/1000 [10:28<10:38,  1.24s/it]

Step: 487, Loss: 1.4315245151519775


 49%|████▉     | 488/1000 [10:29<10:35,  1.24s/it]

Step: 488, Loss: 1.4381788969039917


 49%|████▉     | 489/1000 [10:31<10:35,  1.24s/it]

Step: 489, Loss: 1.4389561414718628


 49%|████▉     | 490/1000 [10:32<10:37,  1.25s/it]

Step: 490, Loss: 1.4340161085128784


 49%|████▉     | 491/1000 [10:33<10:41,  1.26s/it]

Step: 491, Loss: 1.4249769449234009


 49%|████▉     | 492/1000 [10:34<10:39,  1.26s/it]

Step: 492, Loss: 1.4292149543762207


 49%|████▉     | 493/1000 [10:36<10:35,  1.25s/it]

Step: 493, Loss: 1.4241316318511963


 49%|████▉     | 494/1000 [10:37<10:34,  1.25s/it]

Step: 494, Loss: 1.4245754480361938


 50%|████▉     | 495/1000 [10:38<10:31,  1.25s/it]

Step: 495, Loss: 1.4195398092269897


 50%|████▉     | 496/1000 [10:39<10:30,  1.25s/it]

Step: 496, Loss: 1.4179879426956177


 50%|████▉     | 497/1000 [10:41<10:29,  1.25s/it]

Step: 497, Loss: 1.424623727798462


 50%|████▉     | 498/1000 [10:42<10:33,  1.26s/it]

Step: 498, Loss: 1.4172232151031494


 50%|████▉     | 499/1000 [10:43<10:30,  1.26s/it]

Step: 499, Loss: 1.4078459739685059


 50%|█████     | 500/1000 [10:45<10:34,  1.27s/it]

Step: 500, Loss: 1.4160100221633911


 50%|█████     | 501/1000 [10:46<10:28,  1.26s/it]

Step: 501, Loss: 1.4124810695648193


 50%|█████     | 502/1000 [10:47<10:26,  1.26s/it]

Step: 502, Loss: 1.4099736213684082


 50%|█████     | 503/1000 [10:48<10:25,  1.26s/it]

Step: 503, Loss: 1.4095826148986816


 50%|█████     | 504/1000 [10:50<10:24,  1.26s/it]

Step: 504, Loss: 1.4202635288238525


 50%|█████     | 505/1000 [10:51<10:19,  1.25s/it]

Step: 505, Loss: 1.4023019075393677


 51%|█████     | 506/1000 [10:52<10:16,  1.25s/it]

Step: 506, Loss: 1.4093306064605713


 51%|█████     | 507/1000 [10:53<10:14,  1.25s/it]

Step: 507, Loss: 1.412317156791687


 51%|█████     | 508/1000 [10:54<10:09,  1.24s/it]

Step: 508, Loss: 1.3948678970336914


 51%|█████     | 509/1000 [10:56<10:06,  1.24s/it]

Step: 509, Loss: 1.4025837182998657


 51%|█████     | 510/1000 [10:57<10:09,  1.24s/it]

Step: 510, Loss: 1.3917750120162964


 51%|█████     | 511/1000 [10:58<10:05,  1.24s/it]

Step: 511, Loss: 1.3747875690460205


 51%|█████     | 512/1000 [10:59<10:05,  1.24s/it]

Step: 512, Loss: 1.3888616561889648


 51%|█████▏    | 513/1000 [11:01<10:05,  1.24s/it]

Step: 513, Loss: 1.3773761987686157


 51%|█████▏    | 514/1000 [11:02<10:05,  1.25s/it]

Step: 514, Loss: 1.3853741884231567


 52%|█████▏    | 515/1000 [11:03<10:08,  1.25s/it]

Step: 515, Loss: 1.3852213621139526


 52%|█████▏    | 516/1000 [11:04<10:06,  1.25s/it]

Step: 516, Loss: 1.3773001432418823


 52%|█████▏    | 517/1000 [11:06<10:05,  1.25s/it]

Step: 517, Loss: 1.3706297874450684


 52%|█████▏    | 518/1000 [11:07<10:04,  1.25s/it]

Step: 518, Loss: 1.3763797283172607


 52%|█████▏    | 519/1000 [11:08<10:02,  1.25s/it]

Step: 519, Loss: 1.3766974210739136


 52%|█████▏    | 520/1000 [11:10<10:06,  1.26s/it]

Step: 520, Loss: 1.3665598630905151


 52%|█████▏    | 521/1000 [11:11<10:17,  1.29s/it]

Step: 521, Loss: 1.3757827281951904


 52%|█████▏    | 522/1000 [11:12<10:29,  1.32s/it]

Step: 522, Loss: 1.373795747756958


 52%|█████▏    | 523/1000 [11:14<10:31,  1.32s/it]

Step: 523, Loss: 1.3681128025054932


 52%|█████▏    | 524/1000 [11:15<10:28,  1.32s/it]

Step: 524, Loss: 1.3568060398101807


 52%|█████▎    | 525/1000 [11:16<10:22,  1.31s/it]

Step: 525, Loss: 1.3620487451553345


 53%|█████▎    | 526/1000 [11:17<10:19,  1.31s/it]

Step: 526, Loss: 1.3548678159713745


 53%|█████▎    | 527/1000 [11:19<10:25,  1.32s/it]

Step: 527, Loss: 1.3565983772277832


 53%|█████▎    | 528/1000 [11:20<10:27,  1.33s/it]

Step: 528, Loss: 1.3514012098312378


 53%|█████▎    | 529/1000 [11:22<10:25,  1.33s/it]

Step: 529, Loss: 1.344193935394287


 53%|█████▎    | 530/1000 [11:23<10:26,  1.33s/it]

Step: 530, Loss: 1.3555450439453125


 53%|█████▎    | 531/1000 [11:24<10:17,  1.32s/it]

Step: 531, Loss: 1.3473236560821533


 53%|█████▎    | 532/1000 [11:25<10:17,  1.32s/it]

Step: 532, Loss: 1.3425586223602295


 53%|█████▎    | 533/1000 [11:27<10:15,  1.32s/it]

Step: 533, Loss: 1.3387776613235474


 53%|█████▎    | 534/1000 [11:28<10:15,  1.32s/it]

Step: 534, Loss: 1.3369474411010742


 54%|█████▎    | 535/1000 [11:29<10:03,  1.30s/it]

Step: 535, Loss: 1.3400241136550903


 54%|█████▎    | 536/1000 [11:31<09:52,  1.28s/it]

Step: 536, Loss: 1.3442045450210571


 54%|█████▎    | 537/1000 [11:32<09:55,  1.29s/it]

Step: 537, Loss: 1.3494051694869995


 54%|█████▍    | 538/1000 [11:33<09:51,  1.28s/it]

Step: 538, Loss: 1.346146821975708


 54%|█████▍    | 539/1000 [11:34<09:43,  1.26s/it]

Step: 539, Loss: 1.3280147314071655


 54%|█████▍    | 540/1000 [11:36<09:50,  1.28s/it]

Step: 540, Loss: 1.3329627513885498


 54%|█████▍    | 541/1000 [11:37<09:44,  1.27s/it]

Step: 541, Loss: 1.330374002456665


 54%|█████▍    | 542/1000 [11:38<09:38,  1.26s/it]

Step: 542, Loss: 1.3354954719543457


 54%|█████▍    | 543/1000 [11:39<09:32,  1.25s/it]

Step: 543, Loss: 1.3126857280731201


 54%|█████▍    | 544/1000 [11:41<09:33,  1.26s/it]

Step: 544, Loss: 1.3270630836486816


 55%|█████▍    | 545/1000 [11:42<09:28,  1.25s/it]

Step: 545, Loss: 1.3329262733459473


 55%|█████▍    | 546/1000 [11:43<09:29,  1.25s/it]

Step: 546, Loss: 1.3129692077636719


 55%|█████▍    | 547/1000 [11:44<09:27,  1.25s/it]

Step: 547, Loss: 1.3261221647262573


 55%|█████▍    | 548/1000 [11:46<09:27,  1.25s/it]

Step: 548, Loss: 1.3226499557495117


 55%|█████▍    | 549/1000 [11:47<09:26,  1.26s/it]

Step: 549, Loss: 1.3244576454162598


 55%|█████▌    | 550/1000 [11:48<09:20,  1.25s/it]

Step: 550, Loss: 1.3051289319992065


 55%|█████▌    | 551/1000 [11:49<09:22,  1.25s/it]

Step: 551, Loss: 1.3094651699066162


 55%|█████▌    | 552/1000 [11:51<09:17,  1.24s/it]

Step: 552, Loss: 1.3215216398239136


 55%|█████▌    | 553/1000 [11:52<09:17,  1.25s/it]

Step: 553, Loss: 1.3117135763168335


 55%|█████▌    | 554/1000 [11:53<09:16,  1.25s/it]

Step: 554, Loss: 1.303982138633728


 56%|█████▌    | 555/1000 [11:54<09:15,  1.25s/it]

Step: 555, Loss: 1.3152449131011963


 56%|█████▌    | 556/1000 [11:56<09:12,  1.24s/it]

Step: 556, Loss: 1.3160854578018188


 56%|█████▌    | 557/1000 [11:57<09:11,  1.25s/it]

Step: 557, Loss: 1.3139584064483643


 56%|█████▌    | 558/1000 [11:58<09:13,  1.25s/it]

Step: 558, Loss: 1.3128300905227661


 56%|█████▌    | 559/1000 [11:59<09:10,  1.25s/it]

Step: 559, Loss: 1.3196452856063843


 56%|█████▌    | 560/1000 [12:01<09:09,  1.25s/it]

Step: 560, Loss: 1.3129725456237793


 56%|█████▌    | 561/1000 [12:02<09:09,  1.25s/it]

Step: 561, Loss: 1.29874587059021


 56%|█████▌    | 562/1000 [12:03<09:07,  1.25s/it]

Step: 562, Loss: 1.2959074974060059


 56%|█████▋    | 563/1000 [12:04<09:06,  1.25s/it]

Step: 563, Loss: 1.291940450668335


 56%|█████▋    | 564/1000 [12:06<09:02,  1.24s/it]

Step: 564, Loss: 1.2854199409484863


 56%|█████▋    | 565/1000 [12:07<09:02,  1.25s/it]

Step: 565, Loss: 1.2858527898788452


 57%|█████▋    | 566/1000 [12:08<09:00,  1.25s/it]

Step: 566, Loss: 1.2829351425170898


 57%|█████▋    | 567/1000 [12:09<09:00,  1.25s/it]

Step: 567, Loss: 1.2847509384155273


 57%|█████▋    | 568/1000 [12:11<08:59,  1.25s/it]

Step: 568, Loss: 1.284637451171875


 57%|█████▋    | 569/1000 [12:12<09:01,  1.26s/it]

Step: 569, Loss: 1.272570013999939


 57%|█████▋    | 570/1000 [12:13<08:57,  1.25s/it]

Step: 570, Loss: 1.2780107259750366


 57%|█████▋    | 571/1000 [12:14<08:54,  1.24s/it]

Step: 571, Loss: 1.2754238843917847


 57%|█████▋    | 572/1000 [12:16<08:52,  1.24s/it]

Step: 572, Loss: 1.2798728942871094


 57%|█████▋    | 573/1000 [12:17<08:50,  1.24s/it]

Step: 573, Loss: 1.2714289426803589


 57%|█████▋    | 574/1000 [12:18<08:50,  1.25s/it]

Step: 574, Loss: 1.2627466917037964


 57%|█████▊    | 575/1000 [12:19<08:51,  1.25s/it]

Step: 575, Loss: 1.2720288038253784


 58%|█████▊    | 576/1000 [12:21<08:49,  1.25s/it]

Step: 576, Loss: 1.2788673639297485


 58%|█████▊    | 577/1000 [12:22<08:48,  1.25s/it]

Step: 577, Loss: 1.276729702949524


 58%|█████▊    | 578/1000 [12:24<09:34,  1.36s/it]

Step: 578, Loss: 1.26808500289917


 58%|█████▊    | 579/1000 [12:25<09:23,  1.34s/it]

Step: 579, Loss: 1.2621408700942993


 58%|█████▊    | 580/1000 [12:26<09:14,  1.32s/it]

Step: 580, Loss: 1.270838975906372


 58%|█████▊    | 581/1000 [12:27<09:06,  1.30s/it]

Step: 581, Loss: 1.2559521198272705


 58%|█████▊    | 582/1000 [12:29<08:58,  1.29s/it]

Step: 582, Loss: 1.260082721710205


 58%|█████▊    | 583/1000 [12:30<08:54,  1.28s/it]

Step: 583, Loss: 1.2474998235702515


 58%|█████▊    | 584/1000 [12:31<08:49,  1.27s/it]

Step: 584, Loss: 1.2587249279022217


 58%|█████▊    | 585/1000 [12:32<08:43,  1.26s/it]

Step: 585, Loss: 1.2583956718444824


 59%|█████▊    | 586/1000 [12:34<08:39,  1.26s/it]

Step: 586, Loss: 1.247931957244873


 59%|█████▊    | 587/1000 [12:35<08:40,  1.26s/it]

Step: 587, Loss: 1.2573014497756958


 59%|█████▉    | 588/1000 [12:36<08:38,  1.26s/it]

Step: 588, Loss: 1.2411288022994995


 59%|█████▉    | 589/1000 [12:37<08:44,  1.28s/it]

Step: 589, Loss: 1.2468883991241455


 59%|█████▉    | 590/1000 [12:39<08:44,  1.28s/it]

Step: 590, Loss: 1.2483619451522827


 59%|█████▉    | 591/1000 [12:40<08:42,  1.28s/it]

Step: 591, Loss: 1.2558903694152832


 59%|█████▉    | 592/1000 [12:41<08:37,  1.27s/it]

Step: 592, Loss: 1.2552154064178467


 59%|█████▉    | 593/1000 [12:42<08:33,  1.26s/it]

Step: 593, Loss: 1.2367584705352783


 59%|█████▉    | 594/1000 [12:44<08:30,  1.26s/it]

Step: 594, Loss: 1.228001594543457


 60%|█████▉    | 595/1000 [12:45<08:27,  1.25s/it]

Step: 595, Loss: 1.2609014511108398


 60%|█████▉    | 596/1000 [12:46<08:23,  1.25s/it]

Step: 596, Loss: 1.2324457168579102


 60%|█████▉    | 597/1000 [12:47<08:27,  1.26s/it]

Step: 597, Loss: 1.2433048486709595


 60%|█████▉    | 598/1000 [12:49<08:36,  1.28s/it]

Step: 598, Loss: 1.2351293563842773


 60%|█████▉    | 599/1000 [12:50<08:35,  1.29s/it]

Step: 599, Loss: 1.2318639755249023


 60%|██████    | 600/1000 [12:51<08:40,  1.30s/it]

Step: 600, Loss: 1.2337864637374878


 60%|██████    | 601/1000 [12:53<08:39,  1.30s/it]

Step: 601, Loss: 1.2366827726364136


 60%|██████    | 602/1000 [12:54<08:34,  1.29s/it]

Step: 602, Loss: 1.231673002243042


 60%|██████    | 603/1000 [12:55<08:29,  1.28s/it]

Step: 603, Loss: 1.2414789199829102


 60%|██████    | 604/1000 [12:57<08:25,  1.28s/it]

Step: 604, Loss: 1.21909499168396


 60%|██████    | 605/1000 [12:58<08:20,  1.27s/it]

Step: 605, Loss: 1.2273688316345215


 61%|██████    | 606/1000 [12:59<08:18,  1.26s/it]

Step: 606, Loss: 1.2071820497512817


 61%|██████    | 607/1000 [13:00<08:18,  1.27s/it]

Step: 607, Loss: 1.2125251293182373


 61%|██████    | 608/1000 [13:02<08:19,  1.27s/it]

Step: 608, Loss: 1.213510513305664


 61%|██████    | 609/1000 [13:03<08:16,  1.27s/it]

Step: 609, Loss: 1.2125407457351685


 61%|██████    | 610/1000 [13:04<08:20,  1.28s/it]

Step: 610, Loss: 1.2158783674240112


 61%|██████    | 611/1000 [13:05<08:16,  1.28s/it]

Step: 611, Loss: 1.217639684677124


 61%|██████    | 612/1000 [13:07<08:17,  1.28s/it]

Step: 612, Loss: 1.2076321840286255


 61%|██████▏   | 613/1000 [13:08<08:13,  1.28s/it]

Step: 613, Loss: 1.205244779586792


 61%|██████▏   | 614/1000 [13:09<08:11,  1.27s/it]

Step: 614, Loss: 1.2046579122543335


 62%|██████▏   | 615/1000 [13:11<08:09,  1.27s/it]

Step: 615, Loss: 1.2019555568695068


 62%|██████▏   | 616/1000 [13:12<08:08,  1.27s/it]

Step: 616, Loss: 1.2007807493209839


 62%|██████▏   | 617/1000 [13:13<08:06,  1.27s/it]

Step: 617, Loss: 1.1952015161514282


 62%|██████▏   | 618/1000 [13:14<08:05,  1.27s/it]

Step: 618, Loss: 1.2002822160720825


 62%|██████▏   | 619/1000 [13:16<08:03,  1.27s/it]

Step: 619, Loss: 1.1987736225128174


 62%|██████▏   | 620/1000 [13:17<08:02,  1.27s/it]

Step: 620, Loss: 1.1901894807815552


 62%|██████▏   | 621/1000 [13:18<08:02,  1.27s/it]

Step: 621, Loss: 1.1922990083694458


 62%|██████▏   | 622/1000 [13:19<08:01,  1.27s/it]

Step: 622, Loss: 1.1884208917617798


 62%|██████▏   | 623/1000 [13:21<08:00,  1.27s/it]

Step: 623, Loss: 1.1974477767944336


 62%|██████▏   | 624/1000 [13:22<07:58,  1.27s/it]

Step: 624, Loss: 1.1806753873825073


 62%|██████▎   | 625/1000 [13:23<07:57,  1.27s/it]

Step: 625, Loss: 1.177071452140808


 63%|██████▎   | 626/1000 [13:25<07:54,  1.27s/it]

Step: 626, Loss: 1.1817870140075684


 63%|██████▎   | 627/1000 [13:26<07:54,  1.27s/it]

Step: 627, Loss: 1.185563564300537


 63%|██████▎   | 628/1000 [13:27<07:51,  1.27s/it]

Step: 628, Loss: 1.1780012845993042


 63%|██████▎   | 629/1000 [13:28<07:51,  1.27s/it]

Step: 629, Loss: 1.1801390647888184


 63%|██████▎   | 630/1000 [13:30<07:49,  1.27s/it]

Step: 630, Loss: 1.1818599700927734


 63%|██████▎   | 631/1000 [13:31<07:49,  1.27s/it]

Step: 631, Loss: 1.1673455238342285


 63%|██████▎   | 632/1000 [13:32<07:53,  1.29s/it]

Step: 632, Loss: 1.1796146631240845


 63%|██████▎   | 633/1000 [13:33<07:51,  1.28s/it]

Step: 633, Loss: 1.1724956035614014


 63%|██████▎   | 634/1000 [13:35<07:46,  1.27s/it]

Step: 634, Loss: 1.165291666984558


 64%|██████▎   | 635/1000 [13:36<07:44,  1.27s/it]

Step: 635, Loss: 1.1667876243591309


 64%|██████▎   | 636/1000 [13:37<07:42,  1.27s/it]

Step: 636, Loss: 1.169024109840393


 64%|██████▎   | 637/1000 [13:39<07:40,  1.27s/it]

Step: 637, Loss: 1.1691139936447144


 64%|██████▍   | 638/1000 [13:40<07:37,  1.26s/it]

Step: 638, Loss: 1.1648868322372437


 64%|██████▍   | 639/1000 [13:41<07:38,  1.27s/it]

Step: 639, Loss: 1.1692748069763184


 64%|██████▍   | 640/1000 [13:42<07:41,  1.28s/it]

Step: 640, Loss: 1.15382719039917


 64%|██████▍   | 641/1000 [13:44<07:37,  1.28s/it]

Step: 641, Loss: 1.1633241176605225


 64%|██████▍   | 642/1000 [13:45<07:35,  1.27s/it]

Step: 642, Loss: 1.1584750413894653


 64%|██████▍   | 643/1000 [13:46<07:34,  1.27s/it]

Step: 643, Loss: 1.160444974899292


 64%|██████▍   | 644/1000 [13:47<07:32,  1.27s/it]

Step: 644, Loss: 1.1597678661346436


 64%|██████▍   | 645/1000 [13:49<07:30,  1.27s/it]

Step: 645, Loss: 1.1569674015045166


 65%|██████▍   | 646/1000 [13:50<07:27,  1.26s/it]

Step: 646, Loss: 1.1525866985321045


 65%|██████▍   | 647/1000 [13:51<07:26,  1.27s/it]

Step: 647, Loss: 1.1584736108779907


 65%|██████▍   | 648/1000 [13:52<07:24,  1.26s/it]

Step: 648, Loss: 1.1466742753982544


 65%|██████▍   | 649/1000 [13:54<07:22,  1.26s/it]

Step: 649, Loss: 1.1523653268814087


 65%|██████▌   | 650/1000 [13:55<07:21,  1.26s/it]

Step: 650, Loss: 1.1557596921920776


 65%|██████▌   | 651/1000 [13:56<07:19,  1.26s/it]

Step: 651, Loss: 1.1339472532272339


 65%|██████▌   | 652/1000 [13:58<07:19,  1.26s/it]

Step: 652, Loss: 1.1542046070098877


 65%|██████▌   | 653/1000 [13:59<07:27,  1.29s/it]

Step: 653, Loss: 1.144277811050415


 65%|██████▌   | 654/1000 [14:00<07:23,  1.28s/it]

Step: 654, Loss: 1.1484986543655396


 66%|██████▌   | 655/1000 [14:01<07:19,  1.27s/it]

Step: 655, Loss: 1.1425509452819824


 66%|██████▌   | 656/1000 [14:03<07:16,  1.27s/it]

Step: 656, Loss: 1.1471089124679565


 66%|██████▌   | 657/1000 [14:04<07:14,  1.27s/it]

Step: 657, Loss: 1.144045352935791


 66%|██████▌   | 658/1000 [14:05<07:14,  1.27s/it]

Step: 658, Loss: 1.1403065919876099


 66%|██████▌   | 659/1000 [14:06<07:11,  1.26s/it]

Step: 659, Loss: 1.1393022537231445


 66%|██████▌   | 660/1000 [14:08<07:10,  1.27s/it]

Step: 660, Loss: 1.1630359888076782


 66%|██████▌   | 661/1000 [14:09<07:17,  1.29s/it]

Step: 661, Loss: 1.1294937133789062


 66%|██████▌   | 662/1000 [14:10<07:11,  1.28s/it]

Step: 662, Loss: 1.140108585357666


 66%|██████▋   | 663/1000 [14:12<07:10,  1.28s/it]

Step: 663, Loss: 1.130202293395996


 66%|██████▋   | 664/1000 [14:13<07:06,  1.27s/it]

Step: 664, Loss: 1.131539225578308


 66%|██████▋   | 665/1000 [14:14<07:04,  1.27s/it]

Step: 665, Loss: 1.1354271173477173


 67%|██████▋   | 666/1000 [14:15<07:00,  1.26s/it]

Step: 666, Loss: 1.1244354248046875


 67%|██████▋   | 667/1000 [14:17<07:03,  1.27s/it]

Step: 667, Loss: 1.1310827732086182


 67%|██████▋   | 668/1000 [14:18<07:02,  1.27s/it]

Step: 668, Loss: 1.1270115375518799


 67%|██████▋   | 669/1000 [14:19<07:01,  1.27s/it]

Step: 669, Loss: 1.1307653188705444


 67%|██████▋   | 670/1000 [14:20<06:58,  1.27s/it]

Step: 670, Loss: 1.1164543628692627


 67%|██████▋   | 671/1000 [14:22<06:58,  1.27s/it]

Step: 671, Loss: 1.1360483169555664


 67%|██████▋   | 672/1000 [14:23<06:58,  1.28s/it]

Step: 672, Loss: 1.1172658205032349


 67%|██████▋   | 673/1000 [14:24<06:58,  1.28s/it]

Step: 673, Loss: 1.1186326742172241


 67%|██████▋   | 674/1000 [14:26<06:56,  1.28s/it]

Step: 674, Loss: 1.1272718906402588


 68%|██████▊   | 675/1000 [14:27<06:52,  1.27s/it]

Step: 675, Loss: 1.1144251823425293


 68%|██████▊   | 676/1000 [14:28<06:51,  1.27s/it]

Step: 676, Loss: 1.1149364709854126


 68%|██████▊   | 677/1000 [14:29<06:48,  1.26s/it]

Step: 677, Loss: 1.1111507415771484


 68%|██████▊   | 678/1000 [14:31<06:45,  1.26s/it]

Step: 678, Loss: 1.0993468761444092


 68%|██████▊   | 679/1000 [14:32<06:45,  1.26s/it]

Step: 679, Loss: 1.0972225666046143


 68%|██████▊   | 680/1000 [14:33<06:44,  1.26s/it]

Step: 680, Loss: 1.110120415687561


 68%|██████▊   | 681/1000 [14:34<06:42,  1.26s/it]

Step: 681, Loss: 1.0998201370239258


 68%|██████▊   | 682/1000 [14:36<06:41,  1.26s/it]

Step: 682, Loss: 1.0922077894210815


 68%|██████▊   | 683/1000 [14:37<06:41,  1.27s/it]

Step: 683, Loss: 1.1086220741271973


 68%|██████▊   | 684/1000 [14:38<06:41,  1.27s/it]

Step: 684, Loss: 1.100689172744751


 68%|██████▊   | 685/1000 [14:39<06:38,  1.26s/it]

Step: 685, Loss: 1.0944277048110962


 69%|██████▊   | 686/1000 [14:41<06:35,  1.26s/it]

Step: 686, Loss: 1.0925346612930298


 69%|██████▊   | 687/1000 [14:42<06:33,  1.26s/it]

Step: 687, Loss: 1.0960948467254639


 69%|██████▉   | 688/1000 [14:43<06:35,  1.27s/it]

Step: 688, Loss: 1.0946942567825317


 69%|██████▉   | 689/1000 [14:45<06:34,  1.27s/it]

Step: 689, Loss: 1.0880922079086304


 69%|██████▉   | 690/1000 [14:46<06:32,  1.27s/it]

Step: 690, Loss: 1.0862716436386108


 69%|██████▉   | 691/1000 [14:47<06:32,  1.27s/it]

Step: 691, Loss: 1.0931016206741333


 69%|██████▉   | 692/1000 [14:48<06:33,  1.28s/it]

Step: 692, Loss: 1.1085290908813477


 69%|██████▉   | 693/1000 [14:50<06:33,  1.28s/it]

Step: 693, Loss: 1.0911422967910767


 69%|██████▉   | 694/1000 [14:51<06:30,  1.28s/it]

Step: 694, Loss: 1.076490044593811


 70%|██████▉   | 695/1000 [14:52<06:27,  1.27s/it]

Step: 695, Loss: 1.074535608291626


 70%|██████▉   | 696/1000 [14:53<06:25,  1.27s/it]

Step: 696, Loss: 1.0802617073059082


 70%|██████▉   | 697/1000 [14:55<06:24,  1.27s/it]

Step: 697, Loss: 1.0867706537246704


 70%|██████▉   | 698/1000 [14:56<06:23,  1.27s/it]

Step: 698, Loss: 1.0841882228851318


 70%|██████▉   | 699/1000 [14:57<06:21,  1.27s/it]

Step: 699, Loss: 1.0872869491577148


 70%|███████   | 700/1000 [14:58<06:18,  1.26s/it]

Step: 700, Loss: 1.0876126289367676


 70%|███████   | 701/1000 [15:00<06:19,  1.27s/it]

Step: 701, Loss: 1.0725725889205933


 70%|███████   | 702/1000 [15:01<06:18,  1.27s/it]

Step: 702, Loss: 1.0846110582351685


 70%|███████   | 703/1000 [15:02<06:17,  1.27s/it]

Step: 703, Loss: 1.0678443908691406


 70%|███████   | 704/1000 [15:04<06:13,  1.26s/it]

Step: 704, Loss: 1.0743131637573242


 70%|███████   | 705/1000 [15:05<06:12,  1.26s/it]

Step: 705, Loss: 1.0624662637710571


 71%|███████   | 706/1000 [15:06<06:11,  1.27s/it]

Step: 706, Loss: 1.073410987854004


 71%|███████   | 707/1000 [15:07<06:12,  1.27s/it]

Step: 707, Loss: 1.0789915323257446


 71%|███████   | 708/1000 [15:09<06:12,  1.27s/it]

Step: 708, Loss: 1.0606846809387207


 71%|███████   | 709/1000 [15:10<06:09,  1.27s/it]

Step: 709, Loss: 1.058767557144165


 71%|███████   | 710/1000 [15:11<06:11,  1.28s/it]

Step: 710, Loss: 1.0681567192077637


 71%|███████   | 711/1000 [15:13<06:10,  1.28s/it]

Step: 711, Loss: 1.0569496154785156


 71%|███████   | 712/1000 [15:14<06:10,  1.29s/it]

Step: 712, Loss: 1.0662413835525513


 71%|███████▏  | 713/1000 [15:15<06:07,  1.28s/it]

Step: 713, Loss: 1.0610313415527344


 71%|███████▏  | 714/1000 [15:16<06:05,  1.28s/it]

Step: 714, Loss: 1.0568783283233643


 72%|███████▏  | 715/1000 [15:18<06:02,  1.27s/it]

Step: 715, Loss: 1.062436819076538


 72%|███████▏  | 716/1000 [15:19<06:01,  1.27s/it]

Step: 716, Loss: 1.0754647254943848


 72%|███████▏  | 717/1000 [15:20<06:00,  1.27s/it]

Step: 717, Loss: 1.0530121326446533


 72%|███████▏  | 718/1000 [15:21<05:58,  1.27s/it]

Step: 718, Loss: 1.050561785697937


 72%|███████▏  | 719/1000 [15:23<05:56,  1.27s/it]

Step: 719, Loss: 1.060714840888977


 72%|███████▏  | 720/1000 [15:24<05:56,  1.27s/it]

Step: 720, Loss: 1.0392988920211792


 72%|███████▏  | 721/1000 [15:25<05:54,  1.27s/it]

Step: 721, Loss: 1.0573517084121704


 72%|███████▏  | 722/1000 [15:27<05:53,  1.27s/it]

Step: 722, Loss: 1.0469342470169067


 72%|███████▏  | 723/1000 [15:28<05:52,  1.27s/it]

Step: 723, Loss: 1.0513958930969238


 72%|███████▏  | 724/1000 [15:29<05:49,  1.27s/it]

Step: 724, Loss: 1.053062915802002


 72%|███████▎  | 725/1000 [15:30<05:50,  1.28s/it]

Step: 725, Loss: 1.0346426963806152


 73%|███████▎  | 726/1000 [15:33<07:07,  1.56s/it]

Step: 726, Loss: 1.0559214353561401


 73%|███████▎  | 727/1000 [15:34<06:50,  1.50s/it]

Step: 727, Loss: 1.0433509349822998


 73%|███████▎  | 728/1000 [15:35<06:28,  1.43s/it]

Step: 728, Loss: 1.032562494277954


 73%|███████▎  | 729/1000 [15:36<06:16,  1.39s/it]

Step: 729, Loss: 1.0499482154846191


 73%|███████▎  | 730/1000 [15:38<06:06,  1.36s/it]

Step: 730, Loss: 1.039339303970337


 73%|███████▎  | 731/1000 [15:39<05:58,  1.33s/it]

Step: 731, Loss: 1.032070279121399


 73%|███████▎  | 732/1000 [15:40<05:50,  1.31s/it]

Step: 732, Loss: 1.033740520477295


 73%|███████▎  | 733/1000 [15:42<05:47,  1.30s/it]

Step: 733, Loss: 1.0295790433883667


 73%|███████▎  | 734/1000 [15:43<05:45,  1.30s/it]

Step: 734, Loss: 1.0306346416473389


 74%|███████▎  | 735/1000 [15:44<05:42,  1.29s/it]

Step: 735, Loss: 1.0262303352355957


 74%|███████▎  | 736/1000 [15:45<05:39,  1.29s/it]

Step: 736, Loss: 1.022078275680542


 74%|███████▎  | 737/1000 [15:47<05:38,  1.29s/it]

Step: 737, Loss: 1.0195488929748535


 74%|███████▍  | 738/1000 [15:48<05:34,  1.28s/it]

Step: 738, Loss: 1.0286598205566406


 74%|███████▍  | 739/1000 [15:49<05:34,  1.28s/it]

Step: 739, Loss: 1.0203163623809814


 74%|███████▍  | 740/1000 [15:51<05:31,  1.27s/it]

Step: 740, Loss: 1.0214262008666992


 74%|███████▍  | 741/1000 [15:52<05:29,  1.27s/it]

Step: 741, Loss: 1.0159014463424683


 74%|███████▍  | 742/1000 [15:53<05:28,  1.27s/it]

Step: 742, Loss: 1.0170423984527588


 74%|███████▍  | 743/1000 [15:54<05:25,  1.27s/it]

Step: 743, Loss: 1.0220136642456055


 74%|███████▍  | 744/1000 [15:56<05:27,  1.28s/it]

Step: 744, Loss: 1.010663628578186


 74%|███████▍  | 745/1000 [15:57<05:29,  1.29s/it]

Step: 745, Loss: 1.0062205791473389


 75%|███████▍  | 746/1000 [15:58<05:26,  1.29s/it]

Step: 746, Loss: 1.0059844255447388


 75%|███████▍  | 747/1000 [16:00<05:26,  1.29s/it]

Step: 747, Loss: 1.0181459188461304


 75%|███████▍  | 748/1000 [16:01<05:24,  1.29s/it]

Step: 748, Loss: 1.0077420473098755


 75%|███████▍  | 749/1000 [16:02<05:29,  1.31s/it]

Step: 749, Loss: 1.0105979442596436


 75%|███████▌  | 750/1000 [16:03<05:21,  1.29s/it]

Step: 750, Loss: 1.019381046295166


 75%|███████▌  | 751/1000 [16:05<05:17,  1.28s/it]

Step: 751, Loss: 1.015012264251709


 75%|███████▌  | 752/1000 [16:06<05:13,  1.27s/it]

Step: 752, Loss: 1.000191330909729


 75%|███████▌  | 753/1000 [16:07<05:10,  1.26s/it]

Step: 753, Loss: 0.9977867603302002


 75%|███████▌  | 754/1000 [16:08<05:07,  1.25s/it]

Step: 754, Loss: 0.9939308166503906


 76%|███████▌  | 755/1000 [16:10<05:05,  1.25s/it]

Step: 755, Loss: 1.013904094696045


 76%|███████▌  | 756/1000 [16:11<05:05,  1.25s/it]

Step: 756, Loss: 1.0083091259002686


 76%|███████▌  | 757/1000 [16:12<05:07,  1.26s/it]

Step: 757, Loss: 1.0168564319610596


 76%|███████▌  | 758/1000 [16:13<05:05,  1.26s/it]

Step: 758, Loss: 1.0068656206130981


 76%|███████▌  | 759/1000 [16:15<05:05,  1.27s/it]

Step: 759, Loss: 0.9931035041809082


 76%|███████▌  | 760/1000 [16:16<05:02,  1.26s/it]

Step: 760, Loss: 1.0136052370071411


 76%|███████▌  | 761/1000 [16:17<05:01,  1.26s/it]

Step: 761, Loss: 1.0019692182540894


 76%|███████▌  | 762/1000 [16:18<04:57,  1.25s/it]

Step: 762, Loss: 0.9934729933738708


 76%|███████▋  | 763/1000 [16:20<04:56,  1.25s/it]

Step: 763, Loss: 0.9909394979476929


 76%|███████▋  | 764/1000 [16:21<04:54,  1.25s/it]

Step: 764, Loss: 0.9901171326637268


 76%|███████▋  | 765/1000 [16:22<04:53,  1.25s/it]

Step: 765, Loss: 0.9819591641426086


 77%|███████▋  | 766/1000 [16:23<04:53,  1.25s/it]

Step: 766, Loss: 0.9985961318016052


 77%|███████▋  | 767/1000 [16:25<04:51,  1.25s/it]

Step: 767, Loss: 0.989357054233551


 77%|███████▋  | 768/1000 [16:26<04:49,  1.25s/it]

Step: 768, Loss: 0.9954861998558044


 77%|███████▋  | 769/1000 [16:27<04:59,  1.30s/it]

Step: 769, Loss: 0.9837485551834106


 77%|███████▋  | 770/1000 [16:29<04:54,  1.28s/it]

Step: 770, Loss: 0.9737405180931091


 77%|███████▋  | 771/1000 [16:30<04:49,  1.27s/it]

Step: 771, Loss: 0.9804878234863281


 77%|███████▋  | 772/1000 [16:31<04:46,  1.26s/it]

Step: 772, Loss: 0.9833267331123352


 77%|███████▋  | 773/1000 [16:32<04:44,  1.25s/it]

Step: 773, Loss: 0.9845495820045471


 77%|███████▋  | 774/1000 [16:34<04:40,  1.24s/it]

Step: 774, Loss: 0.9790956974029541


 78%|███████▊  | 775/1000 [16:35<04:39,  1.24s/it]

Step: 775, Loss: 0.9772266149520874


 78%|███████▊  | 776/1000 [16:36<04:38,  1.24s/it]

Step: 776, Loss: 0.9648149013519287


 78%|███████▊  | 777/1000 [16:37<04:37,  1.24s/it]

Step: 777, Loss: 0.9673688411712646


 78%|███████▊  | 778/1000 [16:38<04:35,  1.24s/it]

Step: 778, Loss: 0.9768301248550415


 78%|███████▊  | 779/1000 [16:40<04:35,  1.25s/it]

Step: 779, Loss: 0.9756450653076172


 78%|███████▊  | 780/1000 [16:41<04:33,  1.24s/it]

Step: 780, Loss: 0.9627689123153687


 78%|███████▊  | 781/1000 [16:42<04:32,  1.24s/it]

Step: 781, Loss: 0.9727285504341125


 78%|███████▊  | 782/1000 [16:43<04:32,  1.25s/it]

Step: 782, Loss: 0.9775745272636414


 78%|███████▊  | 783/1000 [16:45<04:30,  1.25s/it]

Step: 783, Loss: 0.9647656083106995


 78%|███████▊  | 784/1000 [16:46<04:29,  1.25s/it]

Step: 784, Loss: 0.9714233875274658


 78%|███████▊  | 785/1000 [16:47<04:27,  1.24s/it]

Step: 785, Loss: 0.9660680294036865


 79%|███████▊  | 786/1000 [16:48<04:28,  1.25s/it]

Step: 786, Loss: 0.9562353491783142


 79%|███████▊  | 787/1000 [16:50<04:26,  1.25s/it]

Step: 787, Loss: 0.9658865928649902


 79%|███████▉  | 788/1000 [16:51<04:24,  1.25s/it]

Step: 788, Loss: 0.9533339142799377


 79%|███████▉  | 789/1000 [16:52<04:23,  1.25s/it]

Step: 789, Loss: 0.968990683555603


 79%|███████▉  | 790/1000 [16:53<04:22,  1.25s/it]

Step: 790, Loss: 0.9630345702171326


 79%|███████▉  | 791/1000 [16:55<04:22,  1.25s/it]

Step: 791, Loss: 0.9623649716377258


 79%|███████▉  | 792/1000 [16:56<04:19,  1.25s/it]

Step: 792, Loss: 0.9591088891029358


 79%|███████▉  | 793/1000 [16:57<04:17,  1.24s/it]

Step: 793, Loss: 0.9545046091079712


 79%|███████▉  | 794/1000 [16:58<04:15,  1.24s/it]

Step: 794, Loss: 0.9490072727203369


 80%|███████▉  | 795/1000 [17:00<04:14,  1.24s/it]

Step: 795, Loss: 0.9533092975616455


 80%|███████▉  | 796/1000 [17:01<04:12,  1.24s/it]

Step: 796, Loss: 0.953015148639679


 80%|███████▉  | 797/1000 [17:02<04:12,  1.24s/it]

Step: 797, Loss: 0.9660088419914246


 80%|███████▉  | 798/1000 [17:03<04:12,  1.25s/it]

Step: 798, Loss: 0.9417545795440674


 80%|███████▉  | 799/1000 [17:05<04:10,  1.25s/it]

Step: 799, Loss: 0.9470888376235962


 80%|████████  | 800/1000 [17:06<04:09,  1.25s/it]

Step: 800, Loss: 0.9460509419441223


 80%|████████  | 801/1000 [17:07<04:06,  1.24s/it]

Step: 801, Loss: 0.9415653347969055


 80%|████████  | 802/1000 [17:08<04:05,  1.24s/it]

Step: 802, Loss: 0.9462634921073914


 80%|████████  | 803/1000 [17:10<04:04,  1.24s/it]

Step: 803, Loss: 0.9383217096328735


 80%|████████  | 804/1000 [17:11<04:03,  1.24s/it]

Step: 804, Loss: 0.943361222743988


 80%|████████  | 805/1000 [17:12<04:02,  1.25s/it]

Step: 805, Loss: 0.9432485103607178


 81%|████████  | 806/1000 [17:13<04:01,  1.24s/it]

Step: 806, Loss: 0.9437432289123535


 81%|████████  | 807/1000 [17:15<04:01,  1.25s/it]

Step: 807, Loss: 0.933860719203949


 81%|████████  | 808/1000 [17:16<04:01,  1.26s/it]

Step: 808, Loss: 0.956642210483551


 81%|████████  | 809/1000 [17:17<03:58,  1.25s/it]

Step: 809, Loss: 0.9560471177101135


 81%|████████  | 810/1000 [17:18<03:56,  1.25s/it]

Step: 810, Loss: 0.937882661819458


 81%|████████  | 811/1000 [17:20<03:56,  1.25s/it]

Step: 811, Loss: 0.9478517770767212


 81%|████████  | 812/1000 [17:21<03:54,  1.25s/it]

Step: 812, Loss: 0.9411845207214355


 81%|████████▏ | 813/1000 [17:22<03:53,  1.25s/it]

Step: 813, Loss: 0.9390079379081726


 81%|████████▏ | 814/1000 [17:23<03:52,  1.25s/it]

Step: 814, Loss: 0.9449417591094971


 82%|████████▏ | 815/1000 [17:25<03:50,  1.25s/it]

Step: 815, Loss: 0.9346872568130493


 82%|████████▏ | 816/1000 [17:26<03:55,  1.28s/it]

Step: 816, Loss: 0.9475418925285339


 82%|████████▏ | 817/1000 [17:27<03:55,  1.29s/it]

Step: 817, Loss: 0.9370627403259277


 82%|████████▏ | 818/1000 [17:29<03:51,  1.27s/it]

Step: 818, Loss: 0.9362969994544983


 82%|████████▏ | 819/1000 [17:30<03:49,  1.27s/it]

Step: 819, Loss: 0.9370065927505493


 82%|████████▏ | 820/1000 [17:31<03:47,  1.26s/it]

Step: 820, Loss: 0.9283848404884338


 82%|████████▏ | 821/1000 [17:32<03:44,  1.26s/it]

Step: 821, Loss: 0.9290002584457397


 82%|████████▏ | 822/1000 [17:33<03:42,  1.25s/it]

Step: 822, Loss: 0.9243161082267761


 82%|████████▏ | 823/1000 [17:35<03:40,  1.25s/it]

Step: 823, Loss: 0.9255783557891846


 82%|████████▏ | 824/1000 [17:36<03:38,  1.24s/it]

Step: 824, Loss: 0.9084245562553406


 82%|████████▎ | 825/1000 [17:37<03:38,  1.25s/it]

Step: 825, Loss: 0.9217526912689209


 83%|████████▎ | 826/1000 [17:38<03:36,  1.24s/it]

Step: 826, Loss: 0.9159319996833801


 83%|████████▎ | 827/1000 [17:40<03:34,  1.24s/it]

Step: 827, Loss: 0.9213740825653076


 83%|████████▎ | 828/1000 [17:41<03:33,  1.24s/it]

Step: 828, Loss: 0.9103745818138123


 83%|████████▎ | 829/1000 [17:42<03:32,  1.24s/it]

Step: 829, Loss: 0.9148375988006592


 83%|████████▎ | 830/1000 [17:43<03:31,  1.24s/it]

Step: 830, Loss: 0.9131104350090027


 83%|████████▎ | 831/1000 [17:45<03:29,  1.24s/it]

Step: 831, Loss: 0.901049792766571


 83%|████████▎ | 832/1000 [17:46<03:29,  1.25s/it]

Step: 832, Loss: 0.9092699289321899


 83%|████████▎ | 833/1000 [17:47<03:28,  1.25s/it]

Step: 833, Loss: 0.9056602716445923


 83%|████████▎ | 834/1000 [17:48<03:27,  1.25s/it]

Step: 834, Loss: 0.895678699016571


 84%|████████▎ | 835/1000 [17:50<03:27,  1.26s/it]

Step: 835, Loss: 0.9050182104110718


 84%|████████▎ | 836/1000 [17:51<03:26,  1.26s/it]

Step: 836, Loss: 0.9034603834152222


 84%|████████▎ | 837/1000 [17:52<03:23,  1.25s/it]

Step: 837, Loss: 0.9044037461280823


 84%|████████▍ | 838/1000 [17:53<03:22,  1.25s/it]

Step: 838, Loss: 0.8979048132896423


 84%|████████▍ | 839/1000 [17:55<03:21,  1.25s/it]

Step: 839, Loss: 0.8910648822784424


 84%|████████▍ | 840/1000 [17:56<03:20,  1.26s/it]

Step: 840, Loss: 0.8962454199790955


 84%|████████▍ | 841/1000 [17:57<03:21,  1.27s/it]

Step: 841, Loss: 0.8983045816421509


 84%|████████▍ | 842/1000 [17:59<03:18,  1.26s/it]

Step: 842, Loss: 0.885368824005127


 84%|████████▍ | 843/1000 [18:00<03:16,  1.25s/it]

Step: 843, Loss: 0.9096587896347046


 84%|████████▍ | 844/1000 [18:01<03:14,  1.25s/it]

Step: 844, Loss: 0.8986586928367615


 84%|████████▍ | 845/1000 [18:02<03:14,  1.25s/it]

Step: 845, Loss: 0.8968985080718994


 85%|████████▍ | 846/1000 [18:03<03:12,  1.25s/it]

Step: 846, Loss: 0.907439112663269


 85%|████████▍ | 847/1000 [18:05<03:10,  1.25s/it]

Step: 847, Loss: 0.8897547125816345


 85%|████████▍ | 848/1000 [18:06<03:09,  1.25s/it]

Step: 848, Loss: 0.897329568862915


 85%|████████▍ | 849/1000 [18:07<03:08,  1.25s/it]

Step: 849, Loss: 0.8996961116790771


 85%|████████▌ | 850/1000 [18:08<03:07,  1.25s/it]

Step: 850, Loss: 0.8846971988677979


 85%|████████▌ | 851/1000 [18:10<03:05,  1.25s/it]

Step: 851, Loss: 0.9014211297035217


 85%|████████▌ | 852/1000 [18:11<03:04,  1.25s/it]

Step: 852, Loss: 0.8945197463035583


 85%|████████▌ | 853/1000 [18:12<03:02,  1.24s/it]

Step: 853, Loss: 0.8789049386978149


 85%|████████▌ | 854/1000 [18:13<03:01,  1.24s/it]

Step: 854, Loss: 0.8824522495269775


 86%|████████▌ | 855/1000 [18:15<03:00,  1.25s/it]

Step: 855, Loss: 0.8825408220291138


 86%|████████▌ | 856/1000 [18:16<02:59,  1.25s/it]

Step: 856, Loss: 0.8805891275405884


 86%|████████▌ | 857/1000 [18:17<02:58,  1.25s/it]

Step: 857, Loss: 0.8761386871337891


 86%|████████▌ | 858/1000 [18:18<02:57,  1.25s/it]

Step: 858, Loss: 0.8786821961402893


 86%|████████▌ | 859/1000 [18:20<03:04,  1.31s/it]

Step: 859, Loss: 0.881523847579956


 86%|████████▌ | 860/1000 [18:21<03:08,  1.35s/it]

Step: 860, Loss: 0.8781082630157471


 86%|████████▌ | 861/1000 [18:23<03:02,  1.31s/it]

Step: 861, Loss: 0.8838354349136353


 86%|████████▌ | 862/1000 [18:24<02:58,  1.29s/it]

Step: 862, Loss: 0.8763822317123413


 86%|████████▋ | 863/1000 [18:25<02:54,  1.27s/it]

Step: 863, Loss: 0.8766933679580688


 86%|████████▋ | 864/1000 [18:26<02:54,  1.29s/it]

Step: 864, Loss: 0.8780542016029358


 86%|████████▋ | 865/1000 [18:28<02:52,  1.28s/it]

Step: 865, Loss: 0.8769596219062805


 87%|████████▋ | 866/1000 [18:29<02:49,  1.27s/it]

Step: 866, Loss: 0.8759599924087524


 87%|████████▋ | 867/1000 [18:30<02:47,  1.26s/it]

Step: 867, Loss: 0.8688826560974121


 87%|████████▋ | 868/1000 [18:31<02:45,  1.25s/it]

Step: 868, Loss: 0.8717136383056641


 87%|████████▋ | 869/1000 [18:33<02:45,  1.26s/it]

Step: 869, Loss: 0.885429859161377


 87%|████████▋ | 870/1000 [18:34<02:44,  1.27s/it]

Step: 870, Loss: 0.8679357171058655


 87%|████████▋ | 871/1000 [18:35<02:42,  1.26s/it]

Step: 871, Loss: 0.8588346242904663


 87%|████████▋ | 872/1000 [18:36<02:41,  1.26s/it]

Step: 872, Loss: 0.8715548515319824


 87%|████████▋ | 873/1000 [18:38<02:39,  1.25s/it]

Step: 873, Loss: 0.8713972568511963


 87%|████████▋ | 874/1000 [18:39<02:37,  1.25s/it]

Step: 874, Loss: 0.8672282695770264


 88%|████████▊ | 875/1000 [18:40<02:36,  1.25s/it]

Step: 875, Loss: 0.8602178692817688


 88%|████████▊ | 876/1000 [18:41<02:34,  1.25s/it]

Step: 876, Loss: 0.8562865257263184


 88%|████████▊ | 877/1000 [18:43<02:32,  1.24s/it]

Step: 877, Loss: 0.8610178232192993


 88%|████████▊ | 878/1000 [18:44<02:32,  1.25s/it]

Step: 878, Loss: 0.8609071969985962


 88%|████████▊ | 879/1000 [18:45<02:31,  1.25s/it]

Step: 879, Loss: 0.8582310676574707


 88%|████████▊ | 880/1000 [18:46<02:30,  1.25s/it]

Step: 880, Loss: 0.8454148769378662


 88%|████████▊ | 881/1000 [18:48<02:30,  1.26s/it]

Step: 881, Loss: 0.8538339138031006


 88%|████████▊ | 882/1000 [18:49<02:28,  1.25s/it]

Step: 882, Loss: 0.8490076661109924


 88%|████████▊ | 883/1000 [18:50<02:26,  1.25s/it]

Step: 883, Loss: 0.8501343727111816


 88%|████████▊ | 884/1000 [18:51<02:24,  1.25s/it]

Step: 884, Loss: 0.8574284315109253


 88%|████████▊ | 885/1000 [18:53<02:23,  1.25s/it]

Step: 885, Loss: 0.8443080186843872


 89%|████████▊ | 886/1000 [18:54<02:22,  1.25s/it]

Step: 886, Loss: 0.8510221242904663


 89%|████████▊ | 887/1000 [18:55<02:21,  1.26s/it]

Step: 887, Loss: 0.8362793922424316


 89%|████████▉ | 888/1000 [18:56<02:20,  1.26s/it]

Step: 888, Loss: 0.8450357913970947


 89%|████████▉ | 889/1000 [18:58<02:19,  1.26s/it]

Step: 889, Loss: 0.8346686363220215


 89%|████████▉ | 890/1000 [18:59<02:17,  1.25s/it]

Step: 890, Loss: 0.8524775505065918


 89%|████████▉ | 891/1000 [19:00<02:16,  1.25s/it]

Step: 891, Loss: 0.8415653109550476


 89%|████████▉ | 892/1000 [19:01<02:15,  1.26s/it]

Step: 892, Loss: 0.8360649347305298


 89%|████████▉ | 893/1000 [19:03<02:14,  1.25s/it]

Step: 893, Loss: 0.8326273560523987


 89%|████████▉ | 894/1000 [19:04<02:13,  1.26s/it]

Step: 894, Loss: 0.842441737651825


 90%|████████▉ | 895/1000 [19:05<02:11,  1.26s/it]

Step: 895, Loss: 0.8398707509040833


 90%|████████▉ | 896/1000 [19:06<02:10,  1.26s/it]

Step: 896, Loss: 0.8274095058441162


 90%|████████▉ | 897/1000 [19:08<02:08,  1.25s/it]

Step: 897, Loss: 0.8230293393135071


 90%|████████▉ | 898/1000 [19:09<02:07,  1.25s/it]

Step: 898, Loss: 0.8362525105476379


 90%|████████▉ | 899/1000 [19:10<02:05,  1.24s/it]

Step: 899, Loss: 0.831880509853363


 90%|█████████ | 900/1000 [19:11<02:04,  1.24s/it]

Step: 900, Loss: 0.8251084089279175


 90%|█████████ | 901/1000 [19:13<02:02,  1.24s/it]

Step: 901, Loss: 0.8207079172134399


 90%|█████████ | 902/1000 [19:14<02:01,  1.24s/it]

Step: 902, Loss: 0.817704439163208


 90%|█████████ | 903/1000 [19:15<02:01,  1.25s/it]

Step: 903, Loss: 0.8170474767684937


 90%|█████████ | 904/1000 [19:16<01:59,  1.25s/it]

Step: 904, Loss: 0.8313863277435303


 90%|█████████ | 905/1000 [19:18<01:58,  1.25s/it]

Step: 905, Loss: 0.8283577561378479


 91%|█████████ | 906/1000 [19:19<01:56,  1.24s/it]

Step: 906, Loss: 0.841372549533844


 91%|█████████ | 907/1000 [19:20<01:55,  1.25s/it]

Step: 907, Loss: 0.8139467835426331


 91%|█████████ | 908/1000 [19:21<01:56,  1.27s/it]

Step: 908, Loss: 0.8200011253356934


 91%|█████████ | 909/1000 [19:23<01:55,  1.27s/it]

Step: 909, Loss: 0.8211482167243958


 91%|█████████ | 910/1000 [19:24<01:53,  1.26s/it]

Step: 910, Loss: 0.8216758966445923


 91%|█████████ | 911/1000 [19:25<01:52,  1.26s/it]

Step: 911, Loss: 0.8224413990974426


 91%|█████████ | 912/1000 [19:27<01:51,  1.26s/it]

Step: 912, Loss: 0.8127245306968689


 91%|█████████▏| 913/1000 [19:28<01:49,  1.26s/it]

Step: 913, Loss: 0.8080593347549438


 91%|█████████▏| 914/1000 [19:29<01:47,  1.25s/it]

Step: 914, Loss: 0.8163452744483948


 92%|█████████▏| 915/1000 [19:30<01:46,  1.25s/it]

Step: 915, Loss: 0.8202069997787476


 92%|█████████▏| 916/1000 [19:32<01:45,  1.25s/it]

Step: 916, Loss: 0.8076547980308533


 92%|█████████▏| 917/1000 [19:33<01:43,  1.25s/it]

Step: 917, Loss: 0.8145998120307922


 92%|█████████▏| 918/1000 [19:34<01:41,  1.24s/it]

Step: 918, Loss: 0.8074615001678467


 92%|█████████▏| 919/1000 [19:35<01:40,  1.24s/it]

Step: 919, Loss: 0.8133672475814819


 92%|█████████▏| 920/1000 [19:36<01:39,  1.24s/it]

Step: 920, Loss: 0.8065957427024841


 92%|█████████▏| 921/1000 [19:38<01:38,  1.25s/it]

Step: 921, Loss: 0.8258771300315857


 92%|█████████▏| 922/1000 [19:39<01:37,  1.25s/it]

Step: 922, Loss: 0.8116414546966553


 92%|█████████▏| 923/1000 [19:40<01:36,  1.25s/it]

Step: 923, Loss: 0.811454176902771


 92%|█████████▏| 924/1000 [19:41<01:34,  1.25s/it]

Step: 924, Loss: 0.8086493015289307


 92%|█████████▎| 925/1000 [19:43<01:36,  1.29s/it]

Step: 925, Loss: 0.8072537779808044


 93%|█████████▎| 926/1000 [19:44<01:35,  1.29s/it]

Step: 926, Loss: 0.8109869956970215


 93%|█████████▎| 927/1000 [19:45<01:33,  1.28s/it]

Step: 927, Loss: 0.7963687777519226


 93%|█████████▎| 928/1000 [19:47<01:32,  1.28s/it]

Step: 928, Loss: 0.8099300861358643


 93%|█████████▎| 929/1000 [19:48<01:32,  1.30s/it]

Step: 929, Loss: 0.8169599771499634


 93%|█████████▎| 930/1000 [19:49<01:31,  1.30s/it]

Step: 930, Loss: 0.7871463298797607


 93%|█████████▎| 931/1000 [19:51<01:31,  1.33s/it]

Step: 931, Loss: 0.8055940270423889


 93%|█████████▎| 932/1000 [19:52<01:30,  1.34s/it]

Step: 932, Loss: 0.8076185584068298


 93%|█████████▎| 933/1000 [19:53<01:30,  1.35s/it]

Step: 933, Loss: 0.7908028364181519


 93%|█████████▎| 934/1000 [19:55<01:28,  1.34s/it]

Step: 934, Loss: 0.7943690419197083


 94%|█████████▎| 935/1000 [19:56<01:26,  1.33s/it]

Step: 935, Loss: 0.7887682318687439


 94%|█████████▎| 936/1000 [19:58<01:27,  1.37s/it]

Step: 936, Loss: 0.7931485176086426


 94%|█████████▎| 937/1000 [19:59<01:31,  1.45s/it]

Step: 937, Loss: 0.7972042560577393


 94%|█████████▍| 938/1000 [20:01<01:30,  1.46s/it]

Step: 938, Loss: 0.7947794198989868


 94%|█████████▍| 939/1000 [20:02<01:28,  1.45s/it]

Step: 939, Loss: 0.7851731777191162


 94%|█████████▍| 940/1000 [20:04<01:28,  1.48s/it]

Step: 940, Loss: 0.7931550741195679


 94%|█████████▍| 941/1000 [20:05<01:25,  1.45s/it]

Step: 941, Loss: 0.7847809791564941


 94%|█████████▍| 942/1000 [20:06<01:24,  1.46s/it]

Step: 942, Loss: 0.7893512845039368


 94%|█████████▍| 943/1000 [20:08<01:22,  1.45s/it]

Step: 943, Loss: 0.7754977345466614


 94%|█████████▍| 944/1000 [20:09<01:20,  1.44s/it]

Step: 944, Loss: 0.8073994517326355


 94%|█████████▍| 945/1000 [20:11<01:18,  1.42s/it]

Step: 945, Loss: 0.7902112007141113


 95%|█████████▍| 946/1000 [20:12<01:17,  1.44s/it]

Step: 946, Loss: 0.7731595635414124


 95%|█████████▍| 947/1000 [20:14<01:18,  1.49s/it]

Step: 947, Loss: 0.7932403683662415


 95%|█████████▍| 948/1000 [20:15<01:17,  1.50s/it]

Step: 948, Loss: 0.7802517414093018


 95%|█████████▍| 949/1000 [20:17<01:16,  1.49s/it]

Step: 949, Loss: 0.7748913764953613


 95%|█████████▌| 950/1000 [20:18<01:14,  1.50s/it]

Step: 950, Loss: 0.7819966673851013


 95%|█████████▌| 951/1000 [20:20<01:13,  1.49s/it]

Step: 951, Loss: 0.780285656452179


 95%|█████████▌| 952/1000 [20:21<01:11,  1.48s/it]

Step: 952, Loss: 0.7831647992134094


 95%|█████████▌| 953/1000 [20:23<01:08,  1.47s/it]

Step: 953, Loss: 0.7895607948303223


 95%|█████████▌| 954/1000 [20:24<01:07,  1.46s/it]

Step: 954, Loss: 0.7710179686546326


 96%|█████████▌| 955/1000 [20:26<01:04,  1.44s/it]

Step: 955, Loss: 0.777396559715271


 96%|█████████▌| 956/1000 [20:27<01:01,  1.40s/it]

Step: 956, Loss: 0.7999210953712463


 96%|█████████▌| 957/1000 [20:28<00:59,  1.39s/it]

Step: 957, Loss: 0.7725851535797119


 96%|█████████▌| 958/1000 [20:30<00:58,  1.39s/it]

Step: 958, Loss: 0.7975892424583435


 96%|█████████▌| 959/1000 [20:31<00:56,  1.38s/it]

Step: 959, Loss: 0.7732778787612915


 96%|█████████▌| 960/1000 [20:32<00:54,  1.36s/it]

Step: 960, Loss: 0.7553507089614868


 96%|█████████▌| 961/1000 [20:34<00:52,  1.36s/it]

Step: 961, Loss: 0.7658498287200928


 96%|█████████▌| 962/1000 [20:35<00:51,  1.36s/it]

Step: 962, Loss: 0.7718735337257385


 96%|█████████▋| 963/1000 [20:36<00:49,  1.34s/it]

Step: 963, Loss: 0.7636931538581848


 96%|█████████▋| 964/1000 [20:38<00:50,  1.40s/it]

Step: 964, Loss: 0.7485014200210571


 96%|█████████▋| 965/1000 [20:39<00:48,  1.38s/it]

Step: 965, Loss: 0.7459930181503296


 97%|█████████▋| 966/1000 [20:40<00:46,  1.36s/it]

Step: 966, Loss: 0.7623921036720276


 97%|█████████▋| 967/1000 [20:42<00:44,  1.34s/it]

Step: 967, Loss: 0.7535517811775208


 97%|█████████▋| 968/1000 [20:43<00:42,  1.32s/it]

Step: 968, Loss: 0.7596067190170288


 97%|█████████▋| 969/1000 [20:44<00:40,  1.31s/it]

Step: 969, Loss: 0.7604688405990601


 97%|█████████▋| 970/1000 [20:46<00:39,  1.30s/it]

Step: 970, Loss: 0.7736424803733826


 97%|█████████▋| 971/1000 [20:47<00:38,  1.34s/it]

Step: 971, Loss: 0.7616370916366577


 97%|█████████▋| 972/1000 [20:48<00:37,  1.34s/it]

Step: 972, Loss: 0.7600029110908508


 97%|█████████▋| 973/1000 [20:50<00:36,  1.33s/it]

Step: 973, Loss: 0.7501218318939209


 97%|█████████▋| 974/1000 [20:51<00:34,  1.31s/it]

Step: 974, Loss: 0.7427173852920532


 98%|█████████▊| 975/1000 [20:52<00:32,  1.31s/it]

Step: 975, Loss: 0.7358204126358032


 98%|█████████▊| 976/1000 [20:54<00:31,  1.31s/it]

Step: 976, Loss: 0.7500829100608826


 98%|█████████▊| 977/1000 [20:55<00:30,  1.31s/it]

Step: 977, Loss: 0.7508933544158936


 98%|█████████▊| 978/1000 [20:56<00:28,  1.30s/it]

Step: 978, Loss: 0.7399460077285767


 98%|█████████▊| 979/1000 [20:57<00:27,  1.29s/it]

Step: 979, Loss: 0.7475972771644592


 98%|█████████▊| 980/1000 [20:59<00:25,  1.28s/it]

Step: 980, Loss: 0.7382412552833557


 98%|█████████▊| 981/1000 [21:00<00:24,  1.29s/it]

Step: 981, Loss: 0.7374749779701233


 98%|█████████▊| 982/1000 [21:01<00:23,  1.29s/it]

Step: 982, Loss: 0.7389360070228577


 98%|█████████▊| 983/1000 [21:03<00:21,  1.29s/it]

Step: 983, Loss: 0.7385562062263489


 98%|█████████▊| 984/1000 [21:04<00:20,  1.29s/it]

Step: 984, Loss: 0.7337948083877563


 98%|█████████▊| 985/1000 [21:05<00:19,  1.29s/it]

Step: 985, Loss: 0.7385236620903015


 99%|█████████▊| 986/1000 [21:06<00:18,  1.29s/it]

Step: 986, Loss: 0.7301033139228821


 99%|█████████▊| 987/1000 [21:08<00:16,  1.29s/it]

Step: 987, Loss: 0.7238829731941223


 99%|█████████▉| 988/1000 [21:09<00:15,  1.29s/it]

Step: 988, Loss: 0.7284152507781982


 99%|█████████▉| 989/1000 [21:10<00:14,  1.29s/it]

Step: 989, Loss: 0.7434104681015015


 99%|█████████▉| 990/1000 [21:12<00:12,  1.28s/it]

Step: 990, Loss: 0.7316339612007141


 99%|█████████▉| 991/1000 [21:13<00:11,  1.27s/it]

Step: 991, Loss: 0.7417448163032532


 99%|█████████▉| 992/1000 [21:14<00:10,  1.29s/it]

Step: 992, Loss: 0.720452070236206


 99%|█████████▉| 993/1000 [21:15<00:09,  1.29s/it]

Step: 993, Loss: 0.737459123134613


 99%|█████████▉| 994/1000 [21:17<00:07,  1.28s/it]

Step: 994, Loss: 0.743104100227356


100%|█████████▉| 995/1000 [21:18<00:06,  1.28s/it]

Step: 995, Loss: 0.7118552327156067


100%|█████████▉| 996/1000 [21:19<00:05,  1.28s/it]

Step: 996, Loss: 0.7203037142753601


100%|█████████▉| 997/1000 [21:21<00:03,  1.28s/it]

Step: 997, Loss: 0.7158747911453247


100%|█████████▉| 998/1000 [21:22<00:02,  1.28s/it]

Step: 998, Loss: 0.7192241549491882


100%|█████████▉| 999/1000 [21:23<00:01,  1.28s/it]

Step: 999, Loss: 0.718546986579895


100%|██████████| 1000/1000 [21:24<00:00,  1.28s/it]

Step: 1000, Loss: 0.7097722887992859





In [14]:
# Put the trained model into evaluation mode before the sanity check below.
transformer.eval()

# Probe sequence of length 50: token ids 833..842 in the first ten slots,
# followed by forty zeros.
# NOTE(review): 0 is presumably the padding id -- confirm against the model's masking logic.
src_sample = torch.cat([
    torch.arange(833, 843, dtype=torch.int64),
    torch.zeros(40, dtype=torch.int64),
])

In [15]:
# Display the probe sequence with a leading batch axis of size 1 (shape 1 x 50).
src_sample[None, :]

tensor([[833, 834, 835, 836, 837, 838, 839, 840, 841, 842,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]])

In [16]:
# Forward pass with the probe sequence fed as BOTH the source and the target input
# (each with a leading batch axis of size 1).
# This is inference only, so run it under no_grad(): autograd does not record a graph
# for the pass and `res` comes back with requires_grad=False.
with torch.no_grad():
    res = transformer(src_sample.unsqueeze(0), src_sample.unsqueeze(0))

In [17]:
# Greedy prediction: take the highest-scoring index along the last axis for every
# position, then drop only the leading batch axis.
# A bare `squeeze()` would remove *every* size-1 axis, so `dim=1` would stop pointing
# at the score axis if any other axis happened to have length 1.
# NOTE(review): assumes `res` is (1, seq_len, vocab_size) -- confirm against the model's output.
res.argmax(dim=-1).squeeze(0)

tensor([833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846,
        847, 848, 849, 850, 851, 852, 853, 855, 855, 856, 857, 859, 859, 860,
        861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874,
        875, 876, 877, 878, 879, 880, 881, 882])