In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

from tqdm import tqdm

In [14]:
torch.device('mps')

device(type='mps')

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [16]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [17]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [18]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        ff_output = self.feed_forward(x)
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [19]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [20]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [21]:
src_vocab_size = 1000
tgt_vocab_size = 1000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 50
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [22]:
from random import randint

In [23]:
def generate_sample():
    start = randint(1, 944)
    # src = torch.arange(start, start+50, dtype=torch.int64)
    # trg = src + 1
    trg = torch.arange(start, start+50, dtype=torch.int64)
    src = torch.zeros_like(trg, dtype=torch.int64)
    src[:10] = trg[:10]

    return src, trg

In [24]:
def generate_batch(batch_size: int = 128):
    src_batch = torch.tensor([], dtype=torch.int64)
    trg_batch = torch.tensor([], dtype=torch.int64)

    while src_batch.shape[0] < batch_size:
        src_sample, trg_sample = generate_sample()
        src_batch = torch.cat((src_batch, src_sample.unsqueeze(0)))
        trg_batch = torch.cat((trg_batch, trg_sample.unsqueeze(0)))
    
    return src_batch, trg_batch


In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for step in tqdm(range(1000)):
    src_batch, trg_batch = generate_batch(64)
    optimizer.zero_grad()
    output = transformer(src_batch, src_batch)
    loss = criterion(output.view(-1, output.size(-1)), trg_batch.view(-1))
    loss.backward()
    optimizer.step()
    print(f"Step: {step+1}, Loss: {loss.item()}")

  0%|          | 1/1000 [00:01<21:05,  1.27s/it]

Step: 1, Loss: 6.613803863525391


  0%|          | 2/1000 [00:02<16:49,  1.01s/it]

Step: 2, Loss: 6.584731578826904


  0%|          | 3/1000 [00:02<14:38,  1.13it/s]

Step: 3, Loss: 6.605838775634766


  0%|          | 4/1000 [00:03<13:39,  1.22it/s]

Step: 4, Loss: 6.578049182891846


  0%|          | 5/1000 [00:04<13:00,  1.28it/s]

Step: 5, Loss: 6.5376105308532715


  1%|          | 6/1000 [00:05<12:55,  1.28it/s]

Step: 6, Loss: 6.509527683258057


  1%|          | 7/1000 [00:05<13:16,  1.25it/s]

Step: 7, Loss: 6.511623382568359


  1%|          | 8/1000 [00:06<12:50,  1.29it/s]

Step: 8, Loss: 6.478860378265381


  1%|          | 9/1000 [00:07<12:33,  1.32it/s]

Step: 9, Loss: 6.448017120361328


  1%|          | 10/1000 [00:08<12:22,  1.33it/s]

Step: 10, Loss: 6.435214042663574


  1%|          | 11/1000 [00:08<12:09,  1.36it/s]

Step: 11, Loss: 6.389303207397461


  1%|          | 12/1000 [00:09<12:04,  1.36it/s]

Step: 12, Loss: 6.442404270172119


  1%|▏         | 13/1000 [00:10<12:02,  1.37it/s]

Step: 13, Loss: 6.395855903625488


  1%|▏         | 14/1000 [00:11<12:13,  1.34it/s]

Step: 14, Loss: 6.365206718444824


  2%|▏         | 15/1000 [00:11<12:17,  1.34it/s]

Step: 15, Loss: 6.335125923156738


  2%|▏         | 16/1000 [00:12<12:16,  1.34it/s]

Step: 16, Loss: 6.312338352203369


  2%|▏         | 17/1000 [00:13<12:09,  1.35it/s]

Step: 17, Loss: 6.320474147796631


  2%|▏         | 18/1000 [00:13<12:03,  1.36it/s]

Step: 18, Loss: 6.317385196685791


  2%|▏         | 19/1000 [00:14<11:59,  1.36it/s]

Step: 19, Loss: 6.288266181945801


  2%|▏         | 20/1000 [00:15<11:56,  1.37it/s]

Step: 20, Loss: 6.217092514038086


  2%|▏         | 21/1000 [00:16<11:53,  1.37it/s]

Step: 21, Loss: 6.227687358856201


  2%|▏         | 22/1000 [00:16<11:55,  1.37it/s]

Step: 22, Loss: 6.280582427978516


  2%|▏         | 23/1000 [00:17<12:00,  1.36it/s]

Step: 23, Loss: 6.185579299926758


  2%|▏         | 24/1000 [00:18<12:02,  1.35it/s]

Step: 24, Loss: 6.17408561706543


  2%|▎         | 25/1000 [00:19<11:53,  1.37it/s]

Step: 25, Loss: 6.192272186279297


  3%|▎         | 26/1000 [00:19<11:55,  1.36it/s]

Step: 26, Loss: 6.1695637702941895


  3%|▎         | 27/1000 [00:20<11:54,  1.36it/s]

Step: 27, Loss: 6.1459550857543945


  3%|▎         | 28/1000 [00:21<11:50,  1.37it/s]

Step: 28, Loss: 6.170865058898926


  3%|▎         | 29/1000 [00:22<11:48,  1.37it/s]

Step: 29, Loss: 6.083249092102051


  3%|▎         | 30/1000 [00:22<11:47,  1.37it/s]

Step: 30, Loss: 6.101678848266602


  3%|▎         | 31/1000 [00:23<11:43,  1.38it/s]

Step: 31, Loss: 6.052651882171631


  3%|▎         | 32/1000 [00:24<11:40,  1.38it/s]

Step: 32, Loss: 6.051452159881592


  3%|▎         | 33/1000 [00:24<11:34,  1.39it/s]

Step: 33, Loss: 6.040523529052734


  3%|▎         | 34/1000 [00:25<11:30,  1.40it/s]

Step: 34, Loss: 6.0752739906311035


  4%|▎         | 35/1000 [00:26<11:32,  1.39it/s]

Step: 35, Loss: 6.011772632598877


  4%|▎         | 36/1000 [00:27<11:28,  1.40it/s]

Step: 36, Loss: 5.999355316162109


  4%|▎         | 37/1000 [00:27<11:26,  1.40it/s]

Step: 37, Loss: 5.995185375213623


  4%|▍         | 38/1000 [00:28<11:25,  1.40it/s]

Step: 38, Loss: 5.954916477203369


  4%|▍         | 39/1000 [00:29<11:25,  1.40it/s]

Step: 39, Loss: 5.9570183753967285


  4%|▍         | 40/1000 [00:29<11:29,  1.39it/s]

Step: 40, Loss: 5.960080146789551


  4%|▍         | 41/1000 [00:30<11:28,  1.39it/s]

Step: 41, Loss: 5.9873785972595215


  4%|▍         | 42/1000 [00:31<12:11,  1.31it/s]

Step: 42, Loss: 5.980182647705078


  4%|▍         | 43/1000 [00:32<12:02,  1.33it/s]

Step: 43, Loss: 5.927926063537598


  4%|▍         | 44/1000 [00:32<11:51,  1.34it/s]

Step: 44, Loss: 5.920687198638916


  4%|▍         | 45/1000 [00:33<11:38,  1.37it/s]

Step: 45, Loss: 5.932861328125


  5%|▍         | 46/1000 [00:34<11:30,  1.38it/s]

Step: 46, Loss: 5.88180685043335


  5%|▍         | 47/1000 [00:35<11:29,  1.38it/s]

Step: 47, Loss: 5.857851982116699


  5%|▍         | 48/1000 [00:35<11:24,  1.39it/s]

Step: 48, Loss: 5.900892734527588


  5%|▍         | 49/1000 [00:36<11:19,  1.40it/s]

Step: 49, Loss: 5.886228561401367


  5%|▌         | 50/1000 [00:37<11:18,  1.40it/s]

Step: 50, Loss: 5.841921806335449


  5%|▌         | 51/1000 [00:37<11:13,  1.41it/s]

Step: 51, Loss: 5.856761455535889


  5%|▌         | 52/1000 [00:38<11:15,  1.40it/s]

Step: 52, Loss: 5.847621917724609


  5%|▌         | 53/1000 [00:39<11:23,  1.39it/s]

Step: 53, Loss: 5.83029317855835


  5%|▌         | 54/1000 [00:40<11:19,  1.39it/s]

Step: 54, Loss: 5.852931022644043


  6%|▌         | 55/1000 [00:40<11:19,  1.39it/s]

Step: 55, Loss: 5.818267345428467


  6%|▌         | 56/1000 [00:41<11:17,  1.39it/s]

Step: 56, Loss: 5.810194492340088


  6%|▌         | 57/1000 [00:42<11:14,  1.40it/s]

Step: 57, Loss: 5.825860977172852


  6%|▌         | 58/1000 [00:42<11:13,  1.40it/s]

Step: 58, Loss: 5.811642646789551


  6%|▌         | 59/1000 [00:43<11:11,  1.40it/s]

Step: 59, Loss: 5.769148349761963


  6%|▌         | 60/1000 [00:44<11:12,  1.40it/s]

Step: 60, Loss: 5.80556583404541


  6%|▌         | 61/1000 [00:45<11:10,  1.40it/s]

Step: 61, Loss: 5.7386322021484375


  6%|▌         | 62/1000 [00:45<11:07,  1.41it/s]

Step: 62, Loss: 5.779932022094727


  6%|▋         | 63/1000 [00:46<11:06,  1.41it/s]

Step: 63, Loss: 5.77781867980957


  6%|▋         | 64/1000 [00:47<11:02,  1.41it/s]

Step: 64, Loss: 5.702237606048584


  6%|▋         | 65/1000 [00:47<11:01,  1.41it/s]

Step: 65, Loss: 5.736491680145264


  7%|▋         | 66/1000 [00:48<11:01,  1.41it/s]

Step: 66, Loss: 5.726873874664307


  7%|▋         | 67/1000 [00:49<11:00,  1.41it/s]

Step: 67, Loss: 5.728427886962891


  7%|▋         | 68/1000 [00:50<11:04,  1.40it/s]

Step: 68, Loss: 5.7449116706848145


  7%|▋         | 69/1000 [00:50<11:02,  1.40it/s]

Step: 69, Loss: 5.720271587371826


  7%|▋         | 70/1000 [00:51<11:00,  1.41it/s]

Step: 70, Loss: 5.685042858123779


  7%|▋         | 71/1000 [00:52<11:01,  1.40it/s]

Step: 71, Loss: 5.674815654754639


  7%|▋         | 72/1000 [00:52<11:03,  1.40it/s]

Step: 72, Loss: 5.695188999176025


  7%|▋         | 73/1000 [00:53<11:00,  1.40it/s]

Step: 73, Loss: 5.692094802856445


  7%|▋         | 74/1000 [00:54<11:00,  1.40it/s]

Step: 74, Loss: 5.686941623687744


  8%|▊         | 75/1000 [00:55<10:58,  1.40it/s]

Step: 75, Loss: 5.679164886474609


  8%|▊         | 76/1000 [00:55<10:58,  1.40it/s]

Step: 76, Loss: 5.6348185539245605


  8%|▊         | 77/1000 [00:56<10:56,  1.41it/s]

Step: 77, Loss: 5.66097354888916


  8%|▊         | 78/1000 [00:57<10:56,  1.40it/s]

Step: 78, Loss: 5.6378278732299805


  8%|▊         | 79/1000 [00:57<10:58,  1.40it/s]

Step: 79, Loss: 5.664534091949463


  8%|▊         | 80/1000 [00:58<10:57,  1.40it/s]

Step: 80, Loss: 5.629083156585693


  8%|▊         | 81/1000 [00:59<10:59,  1.39it/s]

Step: 81, Loss: 5.618173122406006


  8%|▊         | 82/1000 [01:00<11:01,  1.39it/s]

Step: 82, Loss: 5.638242244720459


  8%|▊         | 83/1000 [01:00<11:20,  1.35it/s]

Step: 83, Loss: 5.654865264892578


  8%|▊         | 84/1000 [01:01<11:34,  1.32it/s]

Step: 84, Loss: 5.631461143493652


  8%|▊         | 85/1000 [01:02<11:45,  1.30it/s]

Step: 85, Loss: 5.607819080352783


  9%|▊         | 86/1000 [01:03<11:44,  1.30it/s]

Step: 86, Loss: 5.647684097290039


  9%|▊         | 87/1000 [01:03<11:44,  1.30it/s]

Step: 87, Loss: 5.645233631134033


  9%|▉         | 88/1000 [01:04<11:39,  1.30it/s]

Step: 88, Loss: 5.623495578765869


  9%|▉         | 89/1000 [01:05<11:37,  1.31it/s]

Step: 89, Loss: 5.6178693771362305


  9%|▉         | 90/1000 [01:06<11:29,  1.32it/s]

Step: 90, Loss: 5.627748489379883


  9%|▉         | 91/1000 [01:06<11:28,  1.32it/s]

Step: 91, Loss: 5.590860366821289


  9%|▉         | 92/1000 [01:07<11:19,  1.34it/s]

Step: 92, Loss: 5.624499320983887


  9%|▉         | 93/1000 [01:08<11:17,  1.34it/s]

Step: 93, Loss: 5.623471260070801


  9%|▉         | 94/1000 [01:09<11:11,  1.35it/s]

Step: 94, Loss: 5.606191635131836


 10%|▉         | 95/1000 [01:09<11:15,  1.34it/s]

Step: 95, Loss: 5.6105523109436035


 10%|▉         | 96/1000 [01:10<11:17,  1.33it/s]

Step: 96, Loss: 5.584841251373291


 10%|▉         | 97/1000 [01:11<11:22,  1.32it/s]

Step: 97, Loss: 5.605213165283203


 10%|▉         | 98/1000 [01:12<11:18,  1.33it/s]

Step: 98, Loss: 5.567598342895508


 10%|▉         | 99/1000 [01:12<11:15,  1.33it/s]

Step: 99, Loss: 5.5820112228393555


 10%|█         | 100/1000 [01:13<11:09,  1.34it/s]

Step: 100, Loss: 5.582563877105713


 10%|█         | 101/1000 [01:14<11:09,  1.34it/s]

Step: 101, Loss: 5.6028594970703125


 10%|█         | 102/1000 [01:15<11:08,  1.34it/s]

Step: 102, Loss: 5.533786773681641


 10%|█         | 103/1000 [01:15<11:08,  1.34it/s]

Step: 103, Loss: 5.559879302978516


 10%|█         | 104/1000 [01:16<11:10,  1.34it/s]

Step: 104, Loss: 5.582901477813721


 10%|█         | 105/1000 [01:17<11:01,  1.35it/s]

Step: 105, Loss: 5.537494659423828


 11%|█         | 106/1000 [01:18<11:00,  1.35it/s]

Step: 106, Loss: 5.537865161895752


 11%|█         | 107/1000 [01:18<11:02,  1.35it/s]

Step: 107, Loss: 5.585780143737793


 11%|█         | 108/1000 [01:19<11:06,  1.34it/s]

Step: 108, Loss: 5.522624492645264


 11%|█         | 109/1000 [01:20<11:06,  1.34it/s]

Step: 109, Loss: 5.546212673187256


 11%|█         | 110/1000 [01:21<11:08,  1.33it/s]

Step: 110, Loss: 5.556055068969727


 11%|█         | 111/1000 [01:21<11:03,  1.34it/s]

Step: 111, Loss: 5.509264945983887


 11%|█         | 112/1000 [01:22<11:36,  1.28it/s]

Step: 112, Loss: 5.497724533081055


 11%|█▏        | 113/1000 [01:23<11:26,  1.29it/s]

Step: 113, Loss: 5.50369119644165


 11%|█▏        | 114/1000 [01:24<11:28,  1.29it/s]

Step: 114, Loss: 5.472351551055908


 12%|█▏        | 115/1000 [01:25<11:18,  1.30it/s]

Step: 115, Loss: 5.471973896026611


 12%|█▏        | 116/1000 [01:25<11:12,  1.31it/s]

Step: 116, Loss: 5.478796005249023


 12%|█▏        | 117/1000 [01:26<11:09,  1.32it/s]

Step: 117, Loss: 5.557778835296631


 12%|█▏        | 118/1000 [01:27<11:05,  1.33it/s]

Step: 118, Loss: 5.483642101287842


 12%|█▏        | 119/1000 [01:28<10:58,  1.34it/s]

Step: 119, Loss: 5.454535007476807


 12%|█▏        | 120/1000 [01:28<10:54,  1.34it/s]

Step: 120, Loss: 5.433134078979492


 12%|█▏        | 121/1000 [01:29<10:58,  1.34it/s]

Step: 121, Loss: 5.485366344451904


 12%|█▏        | 122/1000 [01:30<10:54,  1.34it/s]

Step: 122, Loss: 5.4973835945129395


 12%|█▏        | 123/1000 [01:30<10:51,  1.35it/s]

Step: 123, Loss: 5.399667739868164


 12%|█▏        | 124/1000 [01:31<10:55,  1.34it/s]

Step: 124, Loss: 5.495397567749023


 12%|█▎        | 125/1000 [01:32<10:51,  1.34it/s]

Step: 125, Loss: 5.445077419281006


 13%|█▎        | 126/1000 [01:33<10:49,  1.35it/s]

Step: 126, Loss: 5.371440887451172


 13%|█▎        | 127/1000 [01:33<10:50,  1.34it/s]

Step: 127, Loss: 5.373917102813721


 13%|█▎        | 128/1000 [01:34<10:43,  1.36it/s]

Step: 128, Loss: 5.356451034545898


 13%|█▎        | 129/1000 [01:35<10:41,  1.36it/s]

Step: 129, Loss: 5.350994110107422


 13%|█▎        | 130/1000 [01:36<10:42,  1.35it/s]

Step: 130, Loss: 5.370176792144775


 13%|█▎        | 131/1000 [01:36<10:44,  1.35it/s]

Step: 131, Loss: 5.309854507446289


 13%|█▎        | 132/1000 [01:37<10:40,  1.35it/s]

Step: 132, Loss: 5.188615322113037


 13%|█▎        | 133/1000 [01:38<10:38,  1.36it/s]

Step: 133, Loss: 5.264617919921875


 13%|█▎        | 134/1000 [01:39<10:44,  1.34it/s]

Step: 134, Loss: 5.218801975250244


 14%|█▎        | 135/1000 [01:39<10:51,  1.33it/s]

Step: 135, Loss: 5.112277984619141


 14%|█▎        | 136/1000 [01:40<10:48,  1.33it/s]

Step: 136, Loss: 5.190618991851807


 14%|█▎        | 137/1000 [01:41<10:50,  1.33it/s]

Step: 137, Loss: 5.1206488609313965


 14%|█▍        | 138/1000 [01:42<10:46,  1.33it/s]

Step: 138, Loss: 5.190767288208008


 14%|█▍        | 139/1000 [01:42<10:49,  1.33it/s]

Step: 139, Loss: 5.218191623687744


 14%|█▍        | 140/1000 [01:43<10:47,  1.33it/s]

Step: 140, Loss: 5.095790386199951


 14%|█▍        | 141/1000 [01:44<10:49,  1.32it/s]

Step: 141, Loss: 5.014481067657471


 14%|█▍        | 142/1000 [01:45<10:46,  1.33it/s]

Step: 142, Loss: 5.193069934844971


 14%|█▍        | 143/1000 [01:45<10:45,  1.33it/s]

Step: 143, Loss: 5.045404434204102


 14%|█▍        | 144/1000 [01:46<10:43,  1.33it/s]

Step: 144, Loss: 5.051736354827881


 14%|█▍        | 145/1000 [01:47<10:38,  1.34it/s]

Step: 145, Loss: 4.92435884475708


 15%|█▍        | 146/1000 [01:48<10:37,  1.34it/s]

Step: 146, Loss: 4.9127726554870605


 15%|█▍        | 147/1000 [01:48<10:36,  1.34it/s]

Step: 147, Loss: 4.936267852783203


 15%|█▍        | 148/1000 [01:49<10:32,  1.35it/s]

Step: 148, Loss: 4.911624908447266


 15%|█▍        | 149/1000 [01:50<10:29,  1.35it/s]

Step: 149, Loss: 4.818757057189941


 15%|█▌        | 150/1000 [01:51<10:25,  1.36it/s]

Step: 150, Loss: 4.8406877517700195


 15%|█▌        | 151/1000 [01:51<10:28,  1.35it/s]

Step: 151, Loss: 4.8216657638549805


 15%|█▌        | 152/1000 [01:52<10:30,  1.34it/s]

Step: 152, Loss: 4.7466959953308105


 15%|█▌        | 153/1000 [01:53<10:25,  1.35it/s]

Step: 153, Loss: 4.726749420166016


 15%|█▌        | 154/1000 [01:54<10:24,  1.35it/s]

Step: 154, Loss: 4.6906657218933105


 16%|█▌        | 155/1000 [01:54<10:26,  1.35it/s]

Step: 155, Loss: 4.664632320404053


 16%|█▌        | 156/1000 [01:55<10:26,  1.35it/s]

Step: 156, Loss: 4.626828193664551


 16%|█▌        | 157/1000 [01:56<10:30,  1.34it/s]

Step: 157, Loss: 4.6435346603393555


 16%|█▌        | 158/1000 [01:57<10:29,  1.34it/s]

Step: 158, Loss: 4.6585307121276855


 16%|█▌        | 159/1000 [01:57<10:28,  1.34it/s]

Step: 159, Loss: 4.547457695007324


 16%|█▌        | 160/1000 [01:58<10:27,  1.34it/s]

Step: 160, Loss: 4.470277309417725


 16%|█▌        | 161/1000 [01:59<10:20,  1.35it/s]

Step: 161, Loss: 4.516144275665283


 16%|█▌        | 162/1000 [02:00<11:17,  1.24it/s]

Step: 162, Loss: 4.443655967712402


 16%|█▋        | 163/1000 [02:01<11:09,  1.25it/s]

Step: 163, Loss: 4.468437194824219


 16%|█▋        | 164/1000 [02:01<11:16,  1.24it/s]

Step: 164, Loss: 4.382125377655029


 16%|█▋        | 165/1000 [02:02<11:15,  1.24it/s]

Step: 165, Loss: 4.403685092926025


 17%|█▋        | 166/1000 [02:03<11:05,  1.25it/s]

Step: 166, Loss: 4.398558139801025


 17%|█▋        | 167/1000 [02:04<10:55,  1.27it/s]

Step: 167, Loss: 4.40717077255249


 17%|█▋        | 168/1000 [02:04<10:48,  1.28it/s]

Step: 168, Loss: 4.396492958068848


 17%|█▋        | 169/1000 [02:05<10:45,  1.29it/s]

Step: 169, Loss: 4.410214424133301


 17%|█▋        | 170/1000 [02:06<10:37,  1.30it/s]

Step: 170, Loss: 4.2658281326293945


 17%|█▋        | 171/1000 [02:07<10:33,  1.31it/s]

Step: 171, Loss: 4.2562642097473145


 17%|█▋        | 172/1000 [02:07<10:30,  1.31it/s]

Step: 172, Loss: 4.248936176300049


 17%|█▋        | 173/1000 [02:08<10:27,  1.32it/s]

Step: 173, Loss: 4.221890449523926


 17%|█▋        | 174/1000 [02:09<10:21,  1.33it/s]

Step: 174, Loss: 4.181827068328857


 18%|█▊        | 175/1000 [02:10<10:14,  1.34it/s]

Step: 175, Loss: 4.174830436706543


 18%|█▊        | 176/1000 [02:10<10:22,  1.32it/s]

Step: 176, Loss: 4.144412994384766


 18%|█▊        | 177/1000 [02:11<10:25,  1.32it/s]

Step: 177, Loss: 4.173820495605469


 18%|█▊        | 178/1000 [02:12<10:24,  1.32it/s]

Step: 178, Loss: 4.198276519775391


 18%|█▊        | 179/1000 [02:13<10:23,  1.32it/s]

Step: 179, Loss: 4.148850440979004


 18%|█▊        | 180/1000 [02:14<10:17,  1.33it/s]

Step: 180, Loss: 4.092001438140869


 18%|█▊        | 181/1000 [02:14<10:12,  1.34it/s]

Step: 181, Loss: 4.115855693817139


 18%|█▊        | 182/1000 [02:15<10:19,  1.32it/s]

Step: 182, Loss: 4.0552849769592285


 18%|█▊        | 183/1000 [02:16<10:21,  1.31it/s]

Step: 183, Loss: 4.086931228637695


 18%|█▊        | 184/1000 [02:17<10:18,  1.32it/s]

Step: 184, Loss: 4.018614292144775


 18%|█▊        | 185/1000 [02:17<10:15,  1.32it/s]

Step: 185, Loss: 3.9959793090820312


 19%|█▊        | 186/1000 [02:18<10:12,  1.33it/s]

Step: 186, Loss: 3.978952407836914


 19%|█▊        | 187/1000 [02:19<10:05,  1.34it/s]

Step: 187, Loss: 4.0359368324279785


 19%|█▉        | 188/1000 [02:20<10:03,  1.35it/s]

Step: 188, Loss: 3.9921646118164062


 19%|█▉        | 189/1000 [02:20<10:06,  1.34it/s]

Step: 189, Loss: 3.9714248180389404


 19%|█▉        | 190/1000 [02:21<10:04,  1.34it/s]

Step: 190, Loss: 3.955961227416992


 19%|█▉        | 191/1000 [02:22<10:05,  1.34it/s]

Step: 191, Loss: 3.930544376373291


 19%|█▉        | 192/1000 [02:23<10:00,  1.34it/s]

Step: 192, Loss: 3.9089789390563965


 19%|█▉        | 193/1000 [02:23<10:02,  1.34it/s]

Step: 193, Loss: 3.894505262374878


 19%|█▉        | 194/1000 [02:24<10:04,  1.33it/s]

Step: 194, Loss: 3.9374172687530518


 20%|█▉        | 195/1000 [02:25<10:08,  1.32it/s]

Step: 195, Loss: 3.8706307411193848


 20%|█▉        | 196/1000 [02:26<10:10,  1.32it/s]

Step: 196, Loss: 3.8662283420562744


 20%|█▉        | 197/1000 [02:26<10:10,  1.32it/s]

Step: 197, Loss: 3.869640588760376


 20%|█▉        | 198/1000 [02:27<10:08,  1.32it/s]

Step: 198, Loss: 3.826930284500122


 20%|█▉        | 199/1000 [02:28<10:06,  1.32it/s]

Step: 199, Loss: 3.8215160369873047


 20%|██        | 200/1000 [02:29<10:08,  1.31it/s]

Step: 200, Loss: 3.8138866424560547


 20%|██        | 201/1000 [02:29<10:03,  1.32it/s]

Step: 201, Loss: 3.8291332721710205


 20%|██        | 202/1000 [02:30<10:03,  1.32it/s]

Step: 202, Loss: 3.8033907413482666


 20%|██        | 203/1000 [02:31<09:59,  1.33it/s]

Step: 203, Loss: 3.7688632011413574


 20%|██        | 204/1000 [02:32<09:54,  1.34it/s]

Step: 204, Loss: 3.7719571590423584


 20%|██        | 205/1000 [02:32<09:55,  1.34it/s]

Step: 205, Loss: 3.765873908996582


 21%|██        | 206/1000 [02:33<09:57,  1.33it/s]

Step: 206, Loss: 3.7808361053466797


 21%|██        | 207/1000 [02:34<09:57,  1.33it/s]

Step: 207, Loss: 3.724367380142212


 21%|██        | 208/1000 [02:35<09:53,  1.33it/s]

Step: 208, Loss: 3.7356526851654053


 21%|██        | 209/1000 [02:35<09:51,  1.34it/s]

Step: 209, Loss: 3.7059152126312256


 21%|██        | 210/1000 [02:36<09:55,  1.33it/s]

Step: 210, Loss: 3.6948907375335693


 21%|██        | 211/1000 [02:37<09:52,  1.33it/s]

Step: 211, Loss: 3.7124764919281006


 21%|██        | 212/1000 [02:38<10:15,  1.28it/s]

Step: 212, Loss: 3.6732547283172607


 21%|██▏       | 213/1000 [02:38<10:05,  1.30it/s]

Step: 213, Loss: 3.6728475093841553


 21%|██▏       | 214/1000 [02:39<10:04,  1.30it/s]

Step: 214, Loss: 3.6805524826049805


 22%|██▏       | 215/1000 [02:40<09:59,  1.31it/s]

Step: 215, Loss: 3.666220188140869


 22%|██▏       | 216/1000 [02:41<09:54,  1.32it/s]

Step: 216, Loss: 3.6444098949432373


 22%|██▏       | 217/1000 [02:41<09:51,  1.32it/s]

Step: 217, Loss: 3.62691068649292


 22%|██▏       | 218/1000 [02:42<09:48,  1.33it/s]

Step: 218, Loss: 3.6267895698547363


 22%|██▏       | 219/1000 [02:43<09:47,  1.33it/s]

Step: 219, Loss: 3.6178653240203857


 22%|██▏       | 220/1000 [02:44<09:45,  1.33it/s]

Step: 220, Loss: 3.6022210121154785


 22%|██▏       | 221/1000 [02:44<09:45,  1.33it/s]

Step: 221, Loss: 3.6012063026428223


 22%|██▏       | 222/1000 [02:45<09:44,  1.33it/s]

Step: 222, Loss: 3.5984625816345215


 22%|██▏       | 223/1000 [02:46<09:40,  1.34it/s]

Step: 223, Loss: 3.5793216228485107


 22%|██▏       | 224/1000 [02:47<09:40,  1.34it/s]

Step: 224, Loss: 3.5618581771850586


 22%|██▎       | 225/1000 [02:47<09:44,  1.33it/s]

Step: 225, Loss: 3.575552463531494


 23%|██▎       | 226/1000 [02:48<09:39,  1.34it/s]

Step: 226, Loss: 3.526141405105591


 23%|██▎       | 227/1000 [02:49<09:34,  1.35it/s]

Step: 227, Loss: 3.5740365982055664


 23%|██▎       | 228/1000 [02:50<09:50,  1.31it/s]

Step: 228, Loss: 3.5570287704467773


 23%|██▎       | 229/1000 [02:50<09:45,  1.32it/s]

Step: 229, Loss: 3.551638126373291


 23%|██▎       | 230/1000 [02:51<09:42,  1.32it/s]

Step: 230, Loss: 3.5376813411712646


 23%|██▎       | 231/1000 [02:52<09:39,  1.33it/s]

Step: 231, Loss: 3.5300354957580566


 23%|██▎       | 232/1000 [02:53<09:37,  1.33it/s]

Step: 232, Loss: 3.5016257762908936


 23%|██▎       | 233/1000 [02:53<09:36,  1.33it/s]

Step: 233, Loss: 3.5175881385803223


 23%|██▎       | 234/1000 [02:54<09:40,  1.32it/s]

Step: 234, Loss: 3.4957382678985596


 24%|██▎       | 235/1000 [02:55<09:33,  1.34it/s]

Step: 235, Loss: 3.4935503005981445


 24%|██▎       | 236/1000 [02:56<09:32,  1.34it/s]

Step: 236, Loss: 3.513193368911743


 24%|██▎       | 237/1000 [02:56<09:34,  1.33it/s]

Step: 237, Loss: 3.5050854682922363


 24%|██▍       | 238/1000 [02:57<09:28,  1.34it/s]

Step: 238, Loss: 3.5027811527252197


 24%|██▍       | 239/1000 [02:58<09:29,  1.34it/s]

Step: 239, Loss: 3.4741015434265137


 24%|██▍       | 240/1000 [02:59<09:31,  1.33it/s]

Step: 240, Loss: 3.4685449600219727


 24%|██▍       | 241/1000 [02:59<09:25,  1.34it/s]

Step: 241, Loss: 3.4488649368286133


 24%|██▍       | 242/1000 [03:00<09:27,  1.34it/s]

Step: 242, Loss: 3.457310199737549


 24%|██▍       | 243/1000 [03:01<09:29,  1.33it/s]

Step: 243, Loss: 3.4725844860076904


 24%|██▍       | 244/1000 [03:02<09:33,  1.32it/s]

Step: 244, Loss: 3.4487805366516113


 24%|██▍       | 245/1000 [03:03<09:34,  1.31it/s]

Step: 245, Loss: 3.4215188026428223


 25%|██▍       | 246/1000 [03:03<09:33,  1.32it/s]

Step: 246, Loss: 3.4626004695892334


 25%|██▍       | 247/1000 [03:04<09:32,  1.32it/s]

Step: 247, Loss: 3.4348413944244385


 25%|██▍       | 248/1000 [03:05<09:26,  1.33it/s]

Step: 248, Loss: 3.418163537979126


 25%|██▍       | 249/1000 [03:06<09:26,  1.33it/s]

Step: 249, Loss: 3.4103286266326904


 25%|██▌       | 250/1000 [03:06<09:24,  1.33it/s]

Step: 250, Loss: 3.416696548461914


 25%|██▌       | 251/1000 [03:07<09:24,  1.33it/s]

Step: 251, Loss: 3.4040632247924805


 25%|██▌       | 252/1000 [03:08<09:23,  1.33it/s]

Step: 252, Loss: 3.400858163833618


 25%|██▌       | 253/1000 [03:09<09:24,  1.32it/s]

Step: 253, Loss: 3.379049301147461


 25%|██▌       | 254/1000 [03:09<09:24,  1.32it/s]

Step: 254, Loss: 3.40799617767334


 26%|██▌       | 255/1000 [03:10<09:24,  1.32it/s]

Step: 255, Loss: 3.387913465499878


 26%|██▌       | 256/1000 [03:11<09:20,  1.33it/s]

Step: 256, Loss: 3.3954994678497314


 26%|██▌       | 257/1000 [03:12<09:22,  1.32it/s]

Step: 257, Loss: 3.3918073177337646


 26%|██▌       | 258/1000 [03:12<09:19,  1.33it/s]

Step: 258, Loss: 3.3856472969055176


 26%|██▌       | 259/1000 [03:13<09:31,  1.30it/s]

Step: 259, Loss: 3.367710828781128


 26%|██▌       | 260/1000 [03:14<09:25,  1.31it/s]

Step: 260, Loss: 3.365992546081543


 26%|██▌       | 261/1000 [03:15<09:28,  1.30it/s]

Step: 261, Loss: 3.3566479682922363


 26%|██▌       | 262/1000 [03:15<09:26,  1.30it/s]

Step: 262, Loss: 3.3515775203704834


 26%|██▋       | 263/1000 [03:16<09:29,  1.29it/s]

Step: 263, Loss: 3.3657431602478027


 26%|██▋       | 264/1000 [03:17<09:20,  1.31it/s]

Step: 264, Loss: 3.3355743885040283


 26%|██▋       | 265/1000 [03:18<09:11,  1.33it/s]

Step: 265, Loss: 3.354562759399414


 27%|██▋       | 266/1000 [03:18<09:07,  1.34it/s]

Step: 266, Loss: 3.328047752380371


 27%|██▋       | 267/1000 [03:19<09:10,  1.33it/s]

Step: 267, Loss: 3.3465845584869385


 27%|██▋       | 268/1000 [03:20<09:03,  1.35it/s]

Step: 268, Loss: 3.3306961059570312


 27%|██▋       | 269/1000 [03:21<09:02,  1.35it/s]

Step: 269, Loss: 3.3146212100982666


 27%|██▋       | 270/1000 [03:21<09:02,  1.34it/s]

Step: 270, Loss: 3.3327317237854004


 27%|██▋       | 271/1000 [03:22<09:02,  1.34it/s]

Step: 271, Loss: 3.325444221496582


 27%|██▋       | 272/1000 [03:23<09:05,  1.34it/s]

Step: 272, Loss: 3.322183847427368


 27%|██▋       | 273/1000 [03:24<09:04,  1.33it/s]

Step: 273, Loss: 3.297830581665039


 27%|██▋       | 274/1000 [03:24<09:07,  1.33it/s]

Step: 274, Loss: 3.305049180984497


 28%|██▊       | 275/1000 [03:25<09:06,  1.33it/s]

Step: 275, Loss: 3.2999908924102783


 28%|██▊       | 276/1000 [03:26<09:02,  1.34it/s]

Step: 276, Loss: 3.269401788711548


 28%|██▊       | 277/1000 [03:27<08:57,  1.35it/s]

Step: 277, Loss: 3.286656379699707


 28%|██▊       | 278/1000 [03:27<08:54,  1.35it/s]

Step: 278, Loss: 3.275531053543091


 28%|██▊       | 279/1000 [03:28<08:52,  1.35it/s]

Step: 279, Loss: 3.271904706954956


 28%|██▊       | 280/1000 [03:29<08:50,  1.36it/s]

Step: 280, Loss: 3.2633004188537598


 28%|██▊       | 281/1000 [03:30<08:51,  1.35it/s]

Step: 281, Loss: 3.2444257736206055


 28%|██▊       | 282/1000 [03:30<08:55,  1.34it/s]

Step: 282, Loss: 3.242802619934082


 28%|██▊       | 283/1000 [03:31<08:58,  1.33it/s]

Step: 283, Loss: 3.2561113834381104


 28%|██▊       | 284/1000 [03:32<08:54,  1.34it/s]

Step: 284, Loss: 3.2622714042663574


 28%|██▊       | 285/1000 [03:33<08:51,  1.35it/s]

Step: 285, Loss: 3.2296788692474365


 29%|██▊       | 286/1000 [03:33<08:50,  1.35it/s]

Step: 286, Loss: 3.254939556121826


 29%|██▊       | 287/1000 [03:34<08:43,  1.36it/s]

Step: 287, Loss: 3.2270114421844482


 29%|██▉       | 288/1000 [03:35<08:59,  1.32it/s]

Step: 288, Loss: 3.2207894325256348


 29%|██▉       | 289/1000 [03:36<08:58,  1.32it/s]

Step: 289, Loss: 3.2322440147399902


 29%|██▉       | 290/1000 [03:36<08:59,  1.32it/s]

Step: 290, Loss: 3.241776704788208


 29%|██▉       | 291/1000 [03:37<08:54,  1.33it/s]

Step: 291, Loss: 3.2125179767608643


 29%|██▉       | 292/1000 [03:38<08:56,  1.32it/s]

Step: 292, Loss: 3.2015345096588135


 29%|██▉       | 293/1000 [03:39<08:55,  1.32it/s]

Step: 293, Loss: 3.1770851612091064


 29%|██▉       | 294/1000 [03:39<08:52,  1.33it/s]

Step: 294, Loss: 3.2130801677703857


 30%|██▉       | 295/1000 [03:40<08:51,  1.33it/s]

Step: 295, Loss: 3.1658847332000732


 30%|██▉       | 296/1000 [03:41<08:47,  1.33it/s]

Step: 296, Loss: 3.1833059787750244


 30%|██▉       | 297/1000 [03:42<08:49,  1.33it/s]

Step: 297, Loss: 3.1678502559661865


 30%|██▉       | 298/1000 [03:42<08:50,  1.32it/s]

Step: 298, Loss: 3.1585211753845215


 30%|██▉       | 299/1000 [03:43<08:49,  1.32it/s]

Step: 299, Loss: 3.169929265975952


 30%|███       | 300/1000 [03:44<08:47,  1.33it/s]

Step: 300, Loss: 3.1663012504577637


 30%|███       | 301/1000 [03:45<08:51,  1.32it/s]

Step: 301, Loss: 3.1498186588287354


 30%|███       | 302/1000 [03:45<08:49,  1.32it/s]

Step: 302, Loss: 3.1400928497314453


 30%|███       | 303/1000 [03:46<08:45,  1.33it/s]

Step: 303, Loss: 3.1048266887664795


 30%|███       | 304/1000 [03:47<08:42,  1.33it/s]

Step: 304, Loss: 3.132749080657959


 30%|███       | 305/1000 [03:48<08:40,  1.34it/s]

Step: 305, Loss: 3.1121270656585693


 31%|███       | 306/1000 [03:48<08:41,  1.33it/s]

Step: 306, Loss: 3.0878381729125977


 31%|███       | 307/1000 [03:49<08:41,  1.33it/s]

Step: 307, Loss: 3.095672369003296


 31%|███       | 308/1000 [03:50<08:38,  1.34it/s]

Step: 308, Loss: 3.0881311893463135


 31%|███       | 309/1000 [03:51<08:36,  1.34it/s]

Step: 309, Loss: 3.1025850772857666


 31%|███       | 310/1000 [03:51<08:36,  1.34it/s]

Step: 310, Loss: 3.063650608062744


 31%|███       | 311/1000 [03:52<08:32,  1.34it/s]

Step: 311, Loss: 3.077622890472412


 31%|███       | 312/1000 [03:53<08:32,  1.34it/s]

Step: 312, Loss: 3.0863237380981445


 31%|███▏      | 313/1000 [03:54<08:31,  1.34it/s]

Step: 313, Loss: 3.1178958415985107


 31%|███▏      | 314/1000 [03:54<08:43,  1.31it/s]

Step: 314, Loss: 3.075394868850708


 32%|███▏      | 315/1000 [03:55<08:37,  1.32it/s]

Step: 315, Loss: 3.0459396839141846


 32%|███▏      | 316/1000 [03:56<08:35,  1.33it/s]

Step: 316, Loss: 3.0539934635162354


 32%|███▏      | 317/1000 [03:57<08:35,  1.33it/s]

Step: 317, Loss: 3.0365052223205566


 32%|███▏      | 318/1000 [03:57<08:35,  1.32it/s]

Step: 318, Loss: 3.0289902687072754


 32%|███▏      | 319/1000 [03:58<08:30,  1.33it/s]

Step: 319, Loss: 3.020660638809204


 32%|███▏      | 320/1000 [03:59<08:32,  1.33it/s]

Step: 320, Loss: 3.027646064758301


 32%|███▏      | 321/1000 [04:00<08:27,  1.34it/s]

Step: 321, Loss: 3.001904010772705


 32%|███▏      | 322/1000 [04:00<08:26,  1.34it/s]

Step: 322, Loss: 3.010073661804199


 32%|███▏      | 323/1000 [04:01<08:24,  1.34it/s]

Step: 323, Loss: 3.0322985649108887


 32%|███▏      | 324/1000 [04:02<08:21,  1.35it/s]

Step: 324, Loss: 2.992232084274292


 32%|███▎      | 325/1000 [04:03<08:20,  1.35it/s]

Step: 325, Loss: 2.993452548980713


 33%|███▎      | 326/1000 [04:03<08:21,  1.34it/s]

Step: 326, Loss: 2.998034715652466


 33%|███▎      | 327/1000 [04:04<08:18,  1.35it/s]

Step: 327, Loss: 2.980602502822876


 33%|███▎      | 328/1000 [04:05<08:17,  1.35it/s]

Step: 328, Loss: 2.9837687015533447


 33%|███▎      | 329/1000 [04:06<08:18,  1.35it/s]

Step: 329, Loss: 2.987039566040039


 33%|███▎      | 330/1000 [04:06<08:18,  1.35it/s]

Step: 330, Loss: 2.977139949798584


 33%|███▎      | 331/1000 [04:07<08:19,  1.34it/s]

Step: 331, Loss: 2.9756503105163574


 33%|███▎      | 332/1000 [04:08<08:22,  1.33it/s]

Step: 332, Loss: 2.945819616317749


 33%|███▎      | 333/1000 [04:09<08:18,  1.34it/s]

Step: 333, Loss: 2.982332706451416


 33%|███▎      | 334/1000 [04:09<08:17,  1.34it/s]

Step: 334, Loss: 2.9613373279571533


 34%|███▎      | 335/1000 [04:10<08:15,  1.34it/s]

Step: 335, Loss: 2.9427945613861084


 34%|███▎      | 336/1000 [04:11<08:15,  1.34it/s]

Step: 336, Loss: 2.9432802200317383


 34%|███▎      | 337/1000 [04:12<08:16,  1.33it/s]

Step: 337, Loss: 2.9354026317596436


 34%|███▍      | 338/1000 [04:12<08:16,  1.33it/s]

Step: 338, Loss: 2.91813325881958


 34%|███▍      | 339/1000 [04:13<08:31,  1.29it/s]

Step: 339, Loss: 2.9276928901672363


 34%|███▍      | 340/1000 [04:14<08:25,  1.31it/s]

Step: 340, Loss: 2.920220375061035


 34%|███▍      | 341/1000 [04:15<08:20,  1.32it/s]

Step: 341, Loss: 2.9209508895874023


 34%|███▍      | 342/1000 [04:15<08:12,  1.34it/s]

Step: 342, Loss: 2.915170669555664


 34%|███▍      | 343/1000 [04:16<08:10,  1.34it/s]

Step: 343, Loss: 2.882140874862671


 34%|███▍      | 344/1000 [04:17<08:08,  1.34it/s]

Step: 344, Loss: 2.929647922515869


 34%|███▍      | 345/1000 [04:18<08:06,  1.35it/s]

Step: 345, Loss: 2.9060940742492676


 35%|███▍      | 346/1000 [04:18<08:05,  1.35it/s]

Step: 346, Loss: 2.9219162464141846


 35%|███▍      | 347/1000 [04:19<08:06,  1.34it/s]

Step: 347, Loss: 2.865006446838379


 35%|███▍      | 348/1000 [04:20<08:05,  1.34it/s]

Step: 348, Loss: 2.877755641937256


 35%|███▍      | 349/1000 [04:21<08:02,  1.35it/s]

Step: 349, Loss: 2.876915693283081


 35%|███▌      | 350/1000 [04:21<08:03,  1.35it/s]

Step: 350, Loss: 2.8743369579315186


 35%|███▌      | 351/1000 [04:22<08:01,  1.35it/s]

Step: 351, Loss: 2.876260757446289


 35%|███▌      | 352/1000 [04:23<08:02,  1.34it/s]

Step: 352, Loss: 2.9029433727264404


 35%|███▌      | 353/1000 [04:24<08:05,  1.33it/s]

Step: 353, Loss: 2.8736939430236816


 35%|███▌      | 354/1000 [04:24<08:05,  1.33it/s]

Step: 354, Loss: 2.8462142944335938


 36%|███▌      | 355/1000 [04:25<08:00,  1.34it/s]

Step: 355, Loss: 2.8645718097686768


 36%|███▌      | 356/1000 [04:26<08:00,  1.34it/s]

Step: 356, Loss: 2.830756902694702


 36%|███▌      | 357/1000 [04:27<07:58,  1.34it/s]

Step: 357, Loss: 2.8658058643341064


 36%|███▌      | 358/1000 [04:27<07:56,  1.35it/s]

Step: 358, Loss: 2.840010166168213


 36%|███▌      | 359/1000 [04:28<07:57,  1.34it/s]

Step: 359, Loss: 2.839759588241577


 36%|███▌      | 360/1000 [04:29<07:57,  1.34it/s]

Step: 360, Loss: 2.835458993911743


 36%|███▌      | 361/1000 [04:30<07:52,  1.35it/s]

Step: 361, Loss: 2.843151807785034


 36%|███▌      | 362/1000 [04:30<08:00,  1.33it/s]

Step: 362, Loss: 2.8345842361450195


 36%|███▋      | 363/1000 [04:31<07:57,  1.33it/s]

Step: 363, Loss: 2.832139253616333


 36%|███▋      | 364/1000 [04:32<07:51,  1.35it/s]

Step: 364, Loss: 2.8279953002929688


 36%|███▋      | 365/1000 [04:32<07:50,  1.35it/s]

Step: 365, Loss: 2.8043320178985596


 37%|███▋      | 366/1000 [04:33<07:49,  1.35it/s]

Step: 366, Loss: 2.792290687561035


 37%|███▋      | 367/1000 [04:34<07:48,  1.35it/s]

Step: 367, Loss: 2.815159320831299


 37%|███▋      | 368/1000 [04:35<07:48,  1.35it/s]

Step: 368, Loss: 2.7919609546661377


 37%|███▋      | 369/1000 [04:35<07:49,  1.34it/s]

Step: 369, Loss: 2.7956621646881104


 37%|███▋      | 370/1000 [04:36<07:48,  1.35it/s]

Step: 370, Loss: 2.7781472206115723


 37%|███▋      | 371/1000 [04:37<07:48,  1.34it/s]

Step: 371, Loss: 2.7667746543884277


 37%|███▋      | 372/1000 [04:38<07:51,  1.33it/s]

Step: 372, Loss: 2.7917325496673584


 37%|███▋      | 373/1000 [04:38<07:49,  1.34it/s]

Step: 373, Loss: 2.7717864513397217


 37%|███▋      | 374/1000 [04:39<07:49,  1.33it/s]

Step: 374, Loss: 2.7599282264709473


 38%|███▊      | 375/1000 [04:40<07:51,  1.32it/s]

Step: 375, Loss: 2.76509952545166


 38%|███▊      | 376/1000 [04:41<07:53,  1.32it/s]

Step: 376, Loss: 2.7784650325775146


 38%|███▊      | 377/1000 [04:42<07:53,  1.31it/s]

Step: 377, Loss: 2.775742530822754


 38%|███▊      | 378/1000 [04:42<07:55,  1.31it/s]

Step: 378, Loss: 2.7535128593444824


 38%|███▊      | 379/1000 [04:43<07:50,  1.32it/s]

Step: 379, Loss: 2.7700083255767822


 38%|███▊      | 380/1000 [04:44<07:45,  1.33it/s]

Step: 380, Loss: 2.7536895275115967


 38%|███▊      | 381/1000 [04:45<07:44,  1.33it/s]

Step: 381, Loss: 2.746912956237793


 38%|███▊      | 382/1000 [04:45<07:40,  1.34it/s]

Step: 382, Loss: 2.7487761974334717


 38%|███▊      | 383/1000 [04:46<07:51,  1.31it/s]

Step: 383, Loss: 2.7338614463806152


 38%|███▊      | 384/1000 [04:47<07:50,  1.31it/s]

Step: 384, Loss: 2.754075050354004


 38%|███▊      | 385/1000 [04:48<07:47,  1.32it/s]

Step: 385, Loss: 2.7331511974334717


 39%|███▊      | 386/1000 [04:48<07:43,  1.32it/s]

Step: 386, Loss: 2.7338497638702393


 39%|███▊      | 387/1000 [04:49<07:40,  1.33it/s]

Step: 387, Loss: 2.7267370223999023


 39%|███▉      | 388/1000 [04:50<07:37,  1.34it/s]

Step: 388, Loss: 2.7340729236602783


 39%|███▉      | 389/1000 [04:51<07:36,  1.34it/s]

Step: 389, Loss: 2.730833053588867


 39%|███▉      | 390/1000 [04:51<07:35,  1.34it/s]

Step: 390, Loss: 2.7087998390197754


 39%|███▉      | 391/1000 [04:52<07:36,  1.33it/s]

Step: 391, Loss: 2.7287333011627197


 39%|███▉      | 392/1000 [04:53<07:39,  1.32it/s]

Step: 392, Loss: 2.7096571922302246


 39%|███▉      | 393/1000 [04:54<07:39,  1.32it/s]

Step: 393, Loss: 2.7123584747314453


 39%|███▉      | 394/1000 [04:54<07:38,  1.32it/s]

Step: 394, Loss: 2.723620891571045


 40%|███▉      | 395/1000 [04:55<07:42,  1.31it/s]

Step: 395, Loss: 2.7002334594726562


 40%|███▉      | 396/1000 [04:56<07:37,  1.32it/s]

Step: 396, Loss: 2.728900194168091


 40%|███▉      | 397/1000 [04:57<07:39,  1.31it/s]

Step: 397, Loss: 2.6819403171539307


 40%|███▉      | 398/1000 [04:57<07:39,  1.31it/s]

Step: 398, Loss: 2.7215261459350586


 40%|███▉      | 399/1000 [04:58<07:37,  1.31it/s]

Step: 399, Loss: 2.694472312927246


 40%|████      | 400/1000 [04:59<07:36,  1.31it/s]

Step: 400, Loss: 2.699404001235962


 40%|████      | 401/1000 [05:00<07:33,  1.32it/s]

Step: 401, Loss: 2.686582565307617


 40%|████      | 402/1000 [05:00<07:35,  1.31it/s]

Step: 402, Loss: 2.692479133605957


 40%|████      | 403/1000 [05:01<07:46,  1.28it/s]

Step: 403, Loss: 2.680208683013916


 40%|████      | 404/1000 [05:02<09:07,  1.09it/s]

Step: 404, Loss: 2.7021470069885254


 40%|████      | 405/1000 [05:03<09:11,  1.08it/s]

Step: 405, Loss: 2.713879108428955


 41%|████      | 406/1000 [05:04<09:00,  1.10it/s]

Step: 406, Loss: 2.692692279815674


 41%|████      | 407/1000 [05:05<08:39,  1.14it/s]

Step: 407, Loss: 2.6988375186920166


 41%|████      | 408/1000 [05:06<08:19,  1.18it/s]

Step: 408, Loss: 2.6686389446258545


 41%|████      | 409/1000 [05:07<08:19,  1.18it/s]

Step: 409, Loss: 2.7206978797912598


 41%|████      | 410/1000 [05:08<08:17,  1.19it/s]

Step: 410, Loss: 2.694779634475708


 41%|████      | 411/1000 [05:08<08:06,  1.21it/s]

Step: 411, Loss: 2.688580274581909


 41%|████      | 412/1000 [05:09<08:02,  1.22it/s]

Step: 412, Loss: 2.6760807037353516


 41%|████▏     | 413/1000 [05:10<07:58,  1.23it/s]

Step: 413, Loss: 2.6571929454803467


 41%|████▏     | 414/1000 [05:11<08:00,  1.22it/s]

Step: 414, Loss: 2.6837899684906006


 42%|████▏     | 415/1000 [05:12<07:54,  1.23it/s]

Step: 415, Loss: 2.6575582027435303


 42%|████▏     | 416/1000 [05:12<08:05,  1.20it/s]

Step: 416, Loss: 2.662456750869751


 42%|████▏     | 417/1000 [05:13<08:02,  1.21it/s]

Step: 417, Loss: 2.6468496322631836


 42%|████▏     | 418/1000 [05:14<07:53,  1.23it/s]

Step: 418, Loss: 2.649049758911133


 42%|████▏     | 419/1000 [05:15<07:48,  1.24it/s]

Step: 419, Loss: 2.635679244995117


 42%|████▏     | 420/1000 [05:16<07:46,  1.24it/s]

Step: 420, Loss: 2.6375110149383545


 42%|████▏     | 421/1000 [05:16<07:41,  1.25it/s]

Step: 421, Loss: 2.624786615371704


 42%|████▏     | 422/1000 [05:17<07:43,  1.25it/s]

Step: 422, Loss: 2.6313726902008057


 42%|████▏     | 423/1000 [05:18<07:44,  1.24it/s]

Step: 423, Loss: 2.624823808670044


 42%|████▏     | 424/1000 [05:19<07:42,  1.25it/s]

Step: 424, Loss: 2.6252877712249756


 42%|████▎     | 425/1000 [05:20<07:38,  1.25it/s]

Step: 425, Loss: 2.6159539222717285


 43%|████▎     | 426/1000 [05:20<07:34,  1.26it/s]

Step: 426, Loss: 2.629178524017334


 43%|████▎     | 427/1000 [05:21<07:31,  1.27it/s]

Step: 427, Loss: 2.6316444873809814


 43%|████▎     | 428/1000 [05:22<07:27,  1.28it/s]

Step: 428, Loss: 2.6242780685424805


 43%|████▎     | 429/1000 [05:23<07:26,  1.28it/s]

Step: 429, Loss: 2.6260132789611816


 43%|████▎     | 430/1000 [05:24<07:25,  1.28it/s]

Step: 430, Loss: 2.6152215003967285


 43%|████▎     | 431/1000 [05:24<07:30,  1.26it/s]

Step: 431, Loss: 2.6308717727661133


 43%|████▎     | 432/1000 [05:25<07:29,  1.26it/s]

Step: 432, Loss: 2.6049423217773438


 43%|████▎     | 433/1000 [05:26<07:26,  1.27it/s]

Step: 433, Loss: 2.618999719619751


 43%|████▎     | 434/1000 [05:27<07:24,  1.27it/s]

Step: 434, Loss: 2.6108815670013428


 44%|████▎     | 435/1000 [05:27<07:20,  1.28it/s]

Step: 435, Loss: 2.5937628746032715


 44%|████▎     | 436/1000 [05:28<07:17,  1.29it/s]

Step: 436, Loss: 2.623764753341675


 44%|████▎     | 437/1000 [05:29<07:16,  1.29it/s]

Step: 437, Loss: 2.5932700634002686


 44%|████▍     | 438/1000 [05:30<07:18,  1.28it/s]

Step: 438, Loss: 2.5869803428649902


 44%|████▍     | 439/1000 [05:31<07:15,  1.29it/s]

Step: 439, Loss: 2.6059420108795166


 44%|████▍     | 440/1000 [05:31<07:16,  1.28it/s]

Step: 440, Loss: 2.5992560386657715


 44%|████▍     | 441/1000 [05:32<07:16,  1.28it/s]

Step: 441, Loss: 2.604485511779785


 44%|████▍     | 442/1000 [05:33<07:13,  1.29it/s]

Step: 442, Loss: 2.5828843116760254


 44%|████▍     | 443/1000 [05:34<07:08,  1.30it/s]

Step: 443, Loss: 2.5999040603637695


 44%|████▍     | 444/1000 [05:34<07:08,  1.30it/s]

Step: 444, Loss: 2.5987796783447266


 44%|████▍     | 445/1000 [05:35<07:06,  1.30it/s]

Step: 445, Loss: 2.5852439403533936


 45%|████▍     | 446/1000 [05:36<07:18,  1.26it/s]

Step: 446, Loss: 2.5973682403564453


 45%|████▍     | 447/1000 [05:37<07:13,  1.28it/s]

Step: 447, Loss: 2.574697494506836


 45%|████▍     | 448/1000 [05:38<07:12,  1.28it/s]

Step: 448, Loss: 2.6013143062591553


 45%|████▍     | 449/1000 [05:38<07:11,  1.28it/s]

Step: 449, Loss: 2.57564115524292


 45%|████▌     | 450/1000 [05:39<07:11,  1.27it/s]

Step: 450, Loss: 2.5613930225372314


 45%|████▌     | 451/1000 [05:40<07:09,  1.28it/s]

Step: 451, Loss: 2.5716986656188965


 45%|████▌     | 452/1000 [05:41<07:08,  1.28it/s]

Step: 452, Loss: 2.5650229454040527


 45%|████▌     | 453/1000 [05:41<07:03,  1.29it/s]

Step: 453, Loss: 2.5553479194641113


 45%|████▌     | 454/1000 [05:42<07:01,  1.30it/s]

Step: 454, Loss: 2.5615005493164062


 46%|████▌     | 455/1000 [05:43<06:59,  1.30it/s]

Step: 455, Loss: 2.568521738052368


 46%|████▌     | 456/1000 [05:44<06:59,  1.30it/s]

Step: 456, Loss: 2.538801670074463


 46%|████▌     | 457/1000 [05:45<06:59,  1.30it/s]

Step: 457, Loss: 2.5596277713775635


 46%|████▌     | 458/1000 [05:45<06:55,  1.30it/s]

Step: 458, Loss: 2.522244453430176


 46%|████▌     | 459/1000 [05:46<06:53,  1.31it/s]

Step: 459, Loss: 2.5596792697906494


 46%|████▌     | 460/1000 [05:47<06:50,  1.31it/s]

Step: 460, Loss: 2.5511953830718994


 46%|████▌     | 461/1000 [05:48<06:52,  1.31it/s]

Step: 461, Loss: 2.5619473457336426


 46%|████▌     | 462/1000 [05:48<06:56,  1.29it/s]

Step: 462, Loss: 2.542348861694336


 46%|████▋     | 463/1000 [05:49<06:54,  1.29it/s]

Step: 463, Loss: 2.555370330810547


 46%|████▋     | 464/1000 [05:50<06:53,  1.30it/s]

Step: 464, Loss: 2.5392603874206543


 46%|████▋     | 465/1000 [05:51<06:55,  1.29it/s]

Step: 465, Loss: 2.5538699626922607


 47%|████▋     | 466/1000 [05:51<06:52,  1.29it/s]

Step: 466, Loss: 2.5554018020629883


 47%|████▋     | 467/1000 [05:52<06:58,  1.27it/s]

Step: 467, Loss: 2.512213706970215


 47%|████▋     | 468/1000 [05:53<06:55,  1.28it/s]

Step: 468, Loss: 2.547410011291504


 47%|████▋     | 469/1000 [05:54<06:57,  1.27it/s]

Step: 469, Loss: 2.520273208618164


 47%|████▋     | 470/1000 [05:55<06:56,  1.27it/s]

Step: 470, Loss: 2.516981601715088


 47%|████▋     | 471/1000 [05:55<06:53,  1.28it/s]

Step: 471, Loss: 2.516003370285034


 47%|████▋     | 472/1000 [05:56<06:53,  1.28it/s]

Step: 472, Loss: 2.5391712188720703


 47%|████▋     | 473/1000 [05:57<06:53,  1.27it/s]

Step: 473, Loss: 2.510085344314575


 47%|████▋     | 474/1000 [05:58<06:51,  1.28it/s]

Step: 474, Loss: 2.556246280670166


 48%|████▊     | 475/1000 [05:59<06:52,  1.27it/s]

Step: 475, Loss: 2.4990053176879883


 48%|████▊     | 476/1000 [05:59<06:49,  1.28it/s]

Step: 476, Loss: 2.5254249572753906


 48%|████▊     | 477/1000 [06:00<06:50,  1.27it/s]

Step: 477, Loss: 2.518789768218994


 48%|████▊     | 478/1000 [06:01<06:49,  1.27it/s]

Step: 478, Loss: 2.5462241172790527


 48%|████▊     | 479/1000 [06:02<06:46,  1.28it/s]

Step: 479, Loss: 2.5246317386627197


 48%|████▊     | 480/1000 [06:02<06:44,  1.28it/s]

Step: 480, Loss: 2.4914612770080566


 48%|████▊     | 481/1000 [06:03<06:41,  1.29it/s]

Step: 481, Loss: 2.542119026184082


 48%|████▊     | 482/1000 [06:04<06:40,  1.29it/s]

Step: 482, Loss: 2.513476848602295


 48%|████▊     | 483/1000 [06:05<06:41,  1.29it/s]

Step: 483, Loss: 2.506762742996216


 48%|████▊     | 484/1000 [06:06<06:41,  1.28it/s]

Step: 484, Loss: 2.4962549209594727


 48%|████▊     | 485/1000 [06:06<06:38,  1.29it/s]

Step: 485, Loss: 2.5009765625


 49%|████▊     | 486/1000 [06:07<06:37,  1.29it/s]

Step: 486, Loss: 2.479825496673584


 49%|████▊     | 487/1000 [06:08<06:37,  1.29it/s]

Step: 487, Loss: 2.4835562705993652


 49%|████▉     | 488/1000 [06:09<06:34,  1.30it/s]

Step: 488, Loss: 2.4991610050201416


 49%|████▉     | 489/1000 [06:09<06:34,  1.29it/s]

Step: 489, Loss: 2.5010194778442383


 49%|████▉     | 490/1000 [06:10<06:37,  1.28it/s]

Step: 490, Loss: 2.4962213039398193


 49%|████▉     | 491/1000 [06:11<06:36,  1.28it/s]

Step: 491, Loss: 2.4989871978759766


 49%|████▉     | 492/1000 [06:12<06:35,  1.29it/s]

Step: 492, Loss: 2.4885168075561523


 49%|████▉     | 493/1000 [06:13<06:30,  1.30it/s]

Step: 493, Loss: 2.4764890670776367


 49%|████▉     | 494/1000 [06:13<06:31,  1.29it/s]

Step: 494, Loss: 2.476118326187134


 50%|████▉     | 495/1000 [06:14<06:31,  1.29it/s]

Step: 495, Loss: 2.4863698482513428


 50%|████▉     | 496/1000 [06:15<06:28,  1.30it/s]

Step: 496, Loss: 2.485459566116333


 50%|████▉     | 497/1000 [06:16<06:27,  1.30it/s]

Step: 497, Loss: 2.454885959625244


 50%|████▉     | 498/1000 [06:16<06:27,  1.30it/s]

Step: 498, Loss: 2.4669809341430664


 50%|████▉     | 499/1000 [06:17<06:23,  1.31it/s]

Step: 499, Loss: 2.483952283859253


 50%|█████     | 500/1000 [06:18<06:25,  1.30it/s]

Step: 500, Loss: 2.4713191986083984


 50%|█████     | 501/1000 [06:19<06:26,  1.29it/s]

Step: 501, Loss: 2.476964235305786


 50%|█████     | 502/1000 [06:19<06:30,  1.28it/s]

Step: 502, Loss: 2.473921298980713


 50%|█████     | 503/1000 [06:20<06:28,  1.28it/s]

Step: 503, Loss: 2.474642753601074


 50%|█████     | 504/1000 [06:21<06:26,  1.28it/s]

Step: 504, Loss: 2.466856002807617


 50%|█████     | 505/1000 [06:22<06:24,  1.29it/s]

Step: 505, Loss: 2.457545280456543


 51%|█████     | 506/1000 [06:23<06:24,  1.28it/s]

Step: 506, Loss: 2.477966547012329


 51%|█████     | 507/1000 [06:23<06:23,  1.29it/s]

Step: 507, Loss: 2.4562761783599854


 51%|█████     | 508/1000 [06:24<06:23,  1.28it/s]

Step: 508, Loss: 2.4775173664093018


 51%|█████     | 509/1000 [06:25<06:23,  1.28it/s]

Step: 509, Loss: 2.4961791038513184


 51%|█████     | 510/1000 [06:26<06:22,  1.28it/s]

Step: 510, Loss: 2.4766745567321777


 51%|█████     | 511/1000 [06:26<06:19,  1.29it/s]

Step: 511, Loss: 2.4723598957061768


 51%|█████     | 512/1000 [06:27<06:16,  1.29it/s]

Step: 512, Loss: 2.4657657146453857


 51%|█████▏    | 513/1000 [06:28<06:13,  1.30it/s]

Step: 513, Loss: 2.4905924797058105


 51%|█████▏    | 514/1000 [06:29<06:12,  1.30it/s]

Step: 514, Loss: 2.4604368209838867


 52%|█████▏    | 515/1000 [06:30<06:11,  1.31it/s]

Step: 515, Loss: 2.497800350189209


 52%|█████▏    | 516/1000 [06:30<06:12,  1.30it/s]

Step: 516, Loss: 2.5048036575317383


 52%|█████▏    | 517/1000 [06:31<06:11,  1.30it/s]

Step: 517, Loss: 2.4899308681488037


 52%|█████▏    | 518/1000 [06:32<06:11,  1.30it/s]

Step: 518, Loss: 2.4961695671081543


 52%|█████▏    | 519/1000 [06:33<06:10,  1.30it/s]

Step: 519, Loss: 2.4598894119262695


 52%|█████▏    | 520/1000 [06:33<06:06,  1.31it/s]

Step: 520, Loss: 2.473581314086914


 52%|█████▏    | 521/1000 [06:34<06:05,  1.31it/s]

Step: 521, Loss: 2.482056140899658


 52%|█████▏    | 522/1000 [06:35<06:09,  1.29it/s]

Step: 522, Loss: 2.4778387546539307


 52%|█████▏    | 523/1000 [06:36<06:07,  1.30it/s]

Step: 523, Loss: 2.44724178314209


 52%|█████▏    | 524/1000 [06:36<06:03,  1.31it/s]

Step: 524, Loss: 2.446126699447632


 52%|█████▎    | 525/1000 [06:37<06:03,  1.31it/s]

Step: 525, Loss: 2.4574575424194336


 53%|█████▎    | 526/1000 [06:38<06:03,  1.30it/s]

Step: 526, Loss: 2.415783405303955


 53%|█████▎    | 527/1000 [06:39<06:00,  1.31it/s]

Step: 527, Loss: 2.4382948875427246


 53%|█████▎    | 528/1000 [06:40<06:00,  1.31it/s]

Step: 528, Loss: 2.427962064743042


 53%|█████▎    | 529/1000 [06:40<06:00,  1.31it/s]

Step: 529, Loss: 2.440019130706787


 53%|█████▎    | 530/1000 [06:41<05:59,  1.31it/s]

Step: 530, Loss: 2.4320709705352783


 53%|█████▎    | 531/1000 [06:42<05:57,  1.31it/s]

Step: 531, Loss: 2.426828145980835


 53%|█████▎    | 532/1000 [06:43<05:56,  1.31it/s]

Step: 532, Loss: 2.431133985519409


 53%|█████▎    | 533/1000 [06:43<05:55,  1.31it/s]

Step: 533, Loss: 2.410341739654541


 53%|█████▎    | 534/1000 [06:44<05:54,  1.32it/s]

Step: 534, Loss: 2.415639638900757


 54%|█████▎    | 535/1000 [06:45<05:55,  1.31it/s]

Step: 535, Loss: 2.4236900806427


 54%|█████▎    | 536/1000 [06:46<05:57,  1.30it/s]

Step: 536, Loss: 2.4252846240997314


 54%|█████▎    | 537/1000 [06:46<05:55,  1.30it/s]

Step: 537, Loss: 2.424159049987793


 54%|█████▍    | 538/1000 [06:47<05:56,  1.30it/s]

Step: 538, Loss: 2.396852493286133


 54%|█████▍    | 539/1000 [06:48<05:51,  1.31it/s]

Step: 539, Loss: 2.425135374069214


 54%|█████▍    | 540/1000 [06:49<05:49,  1.31it/s]

Step: 540, Loss: 2.3896372318267822


 54%|█████▍    | 541/1000 [06:49<05:48,  1.32it/s]

Step: 541, Loss: 2.4058780670166016


 54%|█████▍    | 542/1000 [06:50<05:47,  1.32it/s]

Step: 542, Loss: 2.393470048904419


 54%|█████▍    | 543/1000 [06:51<05:47,  1.31it/s]

Step: 543, Loss: 2.4209885597229004


 54%|█████▍    | 544/1000 [06:52<05:48,  1.31it/s]

Step: 544, Loss: 2.401129722595215


 55%|█████▍    | 545/1000 [06:52<05:44,  1.32it/s]

Step: 545, Loss: 2.392989158630371


 55%|█████▍    | 546/1000 [06:53<05:45,  1.31it/s]

Step: 546, Loss: 2.4265198707580566


 55%|█████▍    | 547/1000 [06:54<05:45,  1.31it/s]

Step: 547, Loss: 2.3963963985443115


 55%|█████▍    | 548/1000 [06:55<05:44,  1.31it/s]

Step: 548, Loss: 2.3747429847717285


 55%|█████▍    | 549/1000 [06:56<05:43,  1.31it/s]

Step: 549, Loss: 2.418801784515381


 55%|█████▌    | 550/1000 [06:56<05:41,  1.32it/s]

Step: 550, Loss: 2.3808786869049072


 55%|█████▌    | 551/1000 [06:57<05:39,  1.32it/s]

Step: 551, Loss: 2.3682355880737305


 55%|█████▌    | 552/1000 [06:58<05:41,  1.31it/s]

Step: 552, Loss: 2.401315212249756


 55%|█████▌    | 553/1000 [06:59<05:44,  1.30it/s]

Step: 553, Loss: 2.3764824867248535


 55%|█████▌    | 554/1000 [06:59<05:43,  1.30it/s]

Step: 554, Loss: 2.3827080726623535


 56%|█████▌    | 555/1000 [07:00<05:45,  1.29it/s]

Step: 555, Loss: 2.3808116912841797


 56%|█████▌    | 556/1000 [07:01<05:44,  1.29it/s]

Step: 556, Loss: 2.3642754554748535


 56%|█████▌    | 557/1000 [07:02<05:41,  1.30it/s]

Step: 557, Loss: 2.353050947189331


 56%|█████▌    | 558/1000 [07:02<05:41,  1.29it/s]

Step: 558, Loss: 2.362933874130249


 56%|█████▌    | 559/1000 [07:03<05:41,  1.29it/s]

Step: 559, Loss: 2.350611686706543


 56%|█████▌    | 560/1000 [07:04<05:39,  1.30it/s]

Step: 560, Loss: 2.362957239151001


 56%|█████▌    | 561/1000 [07:05<05:36,  1.30it/s]

Step: 561, Loss: 2.3540337085723877


 56%|█████▌    | 562/1000 [07:06<05:37,  1.30it/s]

Step: 562, Loss: 2.345876932144165


 56%|█████▋    | 563/1000 [07:06<05:44,  1.27it/s]

Step: 563, Loss: 2.372098445892334


 56%|█████▋    | 564/1000 [07:07<05:39,  1.28it/s]

Step: 564, Loss: 2.3513550758361816


 56%|█████▋    | 565/1000 [07:08<05:39,  1.28it/s]

Step: 565, Loss: 2.3619890213012695


 57%|█████▋    | 566/1000 [07:09<05:36,  1.29it/s]

Step: 566, Loss: 2.3530516624450684


 57%|█████▋    | 567/1000 [07:09<05:34,  1.29it/s]

Step: 567, Loss: 2.347944974899292


 57%|█████▋    | 568/1000 [07:10<05:33,  1.29it/s]

Step: 568, Loss: 2.349592924118042


 57%|█████▋    | 569/1000 [07:11<05:28,  1.31it/s]

Step: 569, Loss: 2.358569860458374


 57%|█████▋    | 570/1000 [07:12<05:29,  1.31it/s]

Step: 570, Loss: 2.3491432666778564


 57%|█████▋    | 571/1000 [07:13<05:32,  1.29it/s]

Step: 571, Loss: 2.352928876876831


 57%|█████▋    | 572/1000 [07:13<05:30,  1.30it/s]

Step: 572, Loss: 2.3425278663635254


 57%|█████▋    | 573/1000 [07:14<05:28,  1.30it/s]

Step: 573, Loss: 2.3419058322906494


 57%|█████▋    | 574/1000 [07:15<05:28,  1.30it/s]

Step: 574, Loss: 2.344473123550415


 57%|█████▊    | 575/1000 [07:16<05:26,  1.30it/s]

Step: 575, Loss: 2.3371503353118896


 58%|█████▊    | 576/1000 [07:16<05:24,  1.31it/s]

Step: 576, Loss: 2.3605916500091553


 58%|█████▊    | 577/1000 [07:17<05:26,  1.29it/s]

Step: 577, Loss: 2.3408756256103516


 58%|█████▊    | 578/1000 [07:18<05:23,  1.30it/s]

Step: 578, Loss: 2.347015142440796


 58%|█████▊    | 579/1000 [07:19<05:20,  1.31it/s]

Step: 579, Loss: 2.349351167678833


 58%|█████▊    | 580/1000 [07:19<05:21,  1.31it/s]

Step: 580, Loss: 2.327606439590454


 58%|█████▊    | 581/1000 [07:20<05:18,  1.32it/s]

Step: 581, Loss: 2.3334734439849854


 58%|█████▊    | 582/1000 [07:21<05:19,  1.31it/s]

Step: 582, Loss: 2.3204143047332764


 58%|█████▊    | 583/1000 [07:22<05:19,  1.31it/s]

Step: 583, Loss: 2.3439626693725586


 58%|█████▊    | 584/1000 [07:22<05:15,  1.32it/s]

Step: 584, Loss: 2.3190999031066895


 58%|█████▊    | 585/1000 [07:23<05:17,  1.31it/s]

Step: 585, Loss: 2.342076063156128


 59%|█████▊    | 586/1000 [07:24<05:20,  1.29it/s]

Step: 586, Loss: 2.3317534923553467


 59%|█████▊    | 587/1000 [07:25<05:17,  1.30it/s]

Step: 587, Loss: 2.3052828311920166


 59%|█████▉    | 588/1000 [07:26<05:15,  1.31it/s]

Step: 588, Loss: 2.33402943611145


 59%|█████▉    | 589/1000 [07:26<05:16,  1.30it/s]

Step: 589, Loss: 2.3075428009033203


 59%|█████▉    | 590/1000 [07:27<05:17,  1.29it/s]

Step: 590, Loss: 2.3038337230682373


 59%|█████▉    | 591/1000 [07:28<05:17,  1.29it/s]

Step: 591, Loss: 2.3306820392608643


 59%|█████▉    | 592/1000 [07:29<05:14,  1.30it/s]

Step: 592, Loss: 2.3109428882598877


 59%|█████▉    | 593/1000 [07:29<05:10,  1.31it/s]

Step: 593, Loss: 2.3073699474334717


 59%|█████▉    | 594/1000 [07:30<05:09,  1.31it/s]

Step: 594, Loss: 2.3153514862060547


 60%|█████▉    | 595/1000 [07:31<05:10,  1.30it/s]

Step: 595, Loss: 2.2947702407836914


 60%|█████▉    | 596/1000 [07:32<05:07,  1.31it/s]

Step: 596, Loss: 2.296795606613159


 60%|█████▉    | 597/1000 [07:32<05:08,  1.31it/s]

Step: 597, Loss: 2.3039329051971436


 60%|█████▉    | 598/1000 [07:33<05:07,  1.31it/s]

Step: 598, Loss: 2.3126609325408936


 60%|█████▉    | 599/1000 [07:34<05:04,  1.32it/s]

Step: 599, Loss: 2.3130767345428467


 60%|██████    | 600/1000 [07:35<05:01,  1.32it/s]

Step: 600, Loss: 2.293200969696045


 60%|██████    | 601/1000 [07:35<05:03,  1.32it/s]

Step: 601, Loss: 2.320150852203369


 60%|██████    | 602/1000 [07:36<05:06,  1.30it/s]

Step: 602, Loss: 2.2968454360961914


 60%|██████    | 603/1000 [07:37<05:06,  1.30it/s]

Step: 603, Loss: 2.2767348289489746


 60%|██████    | 604/1000 [07:38<05:04,  1.30it/s]

Step: 604, Loss: 2.2802600860595703


 60%|██████    | 605/1000 [07:39<05:03,  1.30it/s]

Step: 605, Loss: 2.3007075786590576


 61%|██████    | 606/1000 [07:39<05:02,  1.30it/s]

Step: 606, Loss: 2.299942970275879


 61%|██████    | 607/1000 [07:40<05:00,  1.31it/s]

Step: 607, Loss: 2.3017590045928955


 61%|██████    | 608/1000 [07:41<04:58,  1.31it/s]

Step: 608, Loss: 2.3245606422424316


 61%|██████    | 609/1000 [07:42<05:00,  1.30it/s]

Step: 609, Loss: 2.3080360889434814


 61%|██████    | 610/1000 [07:42<04:57,  1.31it/s]

Step: 610, Loss: 2.307560920715332


 61%|██████    | 611/1000 [07:43<04:57,  1.31it/s]

Step: 611, Loss: 2.280947685241699


 61%|██████    | 612/1000 [07:44<04:55,  1.31it/s]

Step: 612, Loss: 2.311354398727417


 61%|██████▏   | 613/1000 [07:45<04:54,  1.32it/s]

Step: 613, Loss: 2.3044092655181885


 61%|██████▏   | 614/1000 [07:45<04:52,  1.32it/s]

Step: 614, Loss: 2.3023037910461426


 62%|██████▏   | 615/1000 [07:46<04:54,  1.31it/s]

Step: 615, Loss: 2.312326431274414


 62%|██████▏   | 616/1000 [07:47<04:54,  1.30it/s]

Step: 616, Loss: 2.3081021308898926


 62%|██████▏   | 617/1000 [07:48<04:51,  1.32it/s]

Step: 617, Loss: 2.301999092102051


 62%|██████▏   | 618/1000 [07:48<04:49,  1.32it/s]

Step: 618, Loss: 2.2710111141204834


 62%|██████▏   | 619/1000 [07:49<04:49,  1.31it/s]

Step: 619, Loss: 2.2990081310272217


 62%|██████▏   | 620/1000 [07:50<04:52,  1.30it/s]

Step: 620, Loss: 2.2616426944732666


 62%|██████▏   | 621/1000 [07:51<04:49,  1.31it/s]

Step: 621, Loss: 2.299445390701294


 62%|██████▏   | 622/1000 [07:52<04:49,  1.31it/s]

Step: 622, Loss: 2.2702977657318115


 62%|██████▏   | 623/1000 [07:52<04:47,  1.31it/s]

Step: 623, Loss: 2.2640204429626465


 62%|██████▏   | 624/1000 [07:53<04:45,  1.32it/s]

Step: 624, Loss: 2.2700185775756836


 62%|██████▎   | 625/1000 [07:54<04:46,  1.31it/s]

Step: 625, Loss: 2.2664687633514404


 63%|██████▎   | 626/1000 [07:55<04:45,  1.31it/s]

Step: 626, Loss: 2.2691733837127686


 63%|██████▎   | 627/1000 [07:55<04:44,  1.31it/s]

Step: 627, Loss: 2.255474090576172


 63%|██████▎   | 628/1000 [07:56<04:44,  1.31it/s]

Step: 628, Loss: 2.256714105606079


 63%|██████▎   | 629/1000 [07:57<04:43,  1.31it/s]

Step: 629, Loss: 2.255974054336548


 63%|██████▎   | 630/1000 [07:58<04:40,  1.32it/s]

Step: 630, Loss: 2.2520244121551514


 63%|██████▎   | 631/1000 [07:58<04:38,  1.33it/s]

Step: 631, Loss: 2.2518656253814697


 63%|██████▎   | 632/1000 [07:59<04:38,  1.32it/s]

Step: 632, Loss: 2.244194746017456


 63%|██████▎   | 633/1000 [08:00<04:38,  1.32it/s]

Step: 633, Loss: 2.2434356212615967


 63%|██████▎   | 634/1000 [08:01<04:39,  1.31it/s]

Step: 634, Loss: 2.2520833015441895


 64%|██████▎   | 635/1000 [08:01<04:39,  1.31it/s]

Step: 635, Loss: 2.2534399032592773


 64%|██████▎   | 636/1000 [08:02<04:38,  1.31it/s]

Step: 636, Loss: 2.2736001014709473


 64%|██████▎   | 637/1000 [08:03<04:36,  1.31it/s]

Step: 637, Loss: 2.294675588607788


 64%|██████▍   | 638/1000 [08:04<04:36,  1.31it/s]

Step: 638, Loss: 2.252131700515747


 64%|██████▍   | 639/1000 [08:04<04:36,  1.30it/s]

Step: 639, Loss: 2.2787160873413086


 64%|██████▍   | 640/1000 [08:05<04:38,  1.29it/s]

Step: 640, Loss: 2.2653281688690186


 64%|██████▍   | 641/1000 [08:06<04:37,  1.29it/s]

Step: 641, Loss: 2.2953834533691406


 64%|██████▍   | 642/1000 [08:07<04:35,  1.30it/s]

Step: 642, Loss: 2.264829158782959


 64%|██████▍   | 643/1000 [08:08<04:34,  1.30it/s]

Step: 643, Loss: 2.247789144515991


 64%|██████▍   | 644/1000 [08:08<04:33,  1.30it/s]

Step: 644, Loss: 2.286435842514038


 64%|██████▍   | 645/1000 [08:09<04:32,  1.30it/s]

Step: 645, Loss: 2.2963531017303467


 65%|██████▍   | 646/1000 [08:10<04:31,  1.30it/s]

Step: 646, Loss: 2.2467024326324463


 65%|██████▍   | 647/1000 [08:11<04:31,  1.30it/s]

Step: 647, Loss: 2.263887882232666


 65%|██████▍   | 648/1000 [08:11<04:30,  1.30it/s]

Step: 648, Loss: 2.271575689315796


 65%|██████▍   | 649/1000 [08:12<04:31,  1.29it/s]

Step: 649, Loss: 2.264374017715454


 65%|██████▌   | 650/1000 [08:13<04:29,  1.30it/s]

Step: 650, Loss: 2.261676788330078


 65%|██████▌   | 651/1000 [08:14<04:26,  1.31it/s]

Step: 651, Loss: 2.235305070877075


 65%|██████▌   | 652/1000 [08:14<04:24,  1.32it/s]

Step: 652, Loss: 2.2647526264190674


 65%|██████▌   | 653/1000 [08:15<04:22,  1.32it/s]

Step: 653, Loss: 2.2494800090789795


 65%|██████▌   | 654/1000 [08:16<04:22,  1.32it/s]

Step: 654, Loss: 2.2474684715270996


 66%|██████▌   | 655/1000 [08:17<04:24,  1.31it/s]

Step: 655, Loss: 2.2445015907287598


 66%|██████▌   | 656/1000 [08:18<04:23,  1.30it/s]

Step: 656, Loss: 2.229806900024414


 66%|██████▌   | 657/1000 [08:18<04:22,  1.31it/s]

Step: 657, Loss: 2.2436161041259766


 66%|██████▌   | 658/1000 [08:19<04:20,  1.31it/s]

Step: 658, Loss: 2.247607469558716


 66%|██████▌   | 659/1000 [08:20<04:19,  1.32it/s]

Step: 659, Loss: 2.249659299850464


 66%|██████▌   | 660/1000 [08:21<04:20,  1.30it/s]

Step: 660, Loss: 2.24623441696167


 66%|██████▌   | 661/1000 [08:21<04:18,  1.31it/s]

Step: 661, Loss: 2.251311779022217


 66%|██████▌   | 662/1000 [08:22<04:16,  1.32it/s]

Step: 662, Loss: 2.2490007877349854


 66%|██████▋   | 663/1000 [08:23<04:16,  1.32it/s]

Step: 663, Loss: 2.2362403869628906


 66%|██████▋   | 664/1000 [08:24<04:15,  1.32it/s]

Step: 664, Loss: 2.2469241619110107


 66%|██████▋   | 665/1000 [08:24<04:13,  1.32it/s]

Step: 665, Loss: 2.262960910797119


 67%|██████▋   | 666/1000 [08:25<04:19,  1.29it/s]

Step: 666, Loss: 2.233560085296631


 67%|██████▋   | 667/1000 [08:26<04:22,  1.27it/s]

Step: 667, Loss: 2.250605583190918


 67%|██████▋   | 668/1000 [08:27<04:19,  1.28it/s]

Step: 668, Loss: 2.214242696762085


 67%|██████▋   | 669/1000 [08:28<04:18,  1.28it/s]

Step: 669, Loss: 2.290212631225586


 67%|██████▋   | 670/1000 [08:28<04:18,  1.28it/s]

Step: 670, Loss: 2.2300875186920166


 67%|██████▋   | 671/1000 [08:29<04:15,  1.29it/s]

Step: 671, Loss: 2.2292592525482178


 67%|██████▋   | 672/1000 [08:30<04:11,  1.30it/s]

Step: 672, Loss: 2.243445634841919


 67%|██████▋   | 673/1000 [08:31<04:11,  1.30it/s]

Step: 673, Loss: 2.2326900959014893


 67%|██████▋   | 674/1000 [08:31<04:09,  1.30it/s]

Step: 674, Loss: 2.192863702774048


 68%|██████▊   | 675/1000 [08:32<04:08,  1.31it/s]

Step: 675, Loss: 2.233626365661621


 68%|██████▊   | 676/1000 [08:33<04:07,  1.31it/s]

Step: 676, Loss: 2.226137161254883


 68%|██████▊   | 677/1000 [08:34<04:06,  1.31it/s]

Step: 677, Loss: 2.216247320175171


 68%|██████▊   | 678/1000 [08:34<04:06,  1.31it/s]

Step: 678, Loss: 2.2237937450408936


 68%|██████▊   | 679/1000 [08:35<04:04,  1.31it/s]

Step: 679, Loss: 2.2291955947875977


 68%|██████▊   | 680/1000 [08:36<04:03,  1.31it/s]

Step: 680, Loss: 2.2140445709228516


 68%|██████▊   | 681/1000 [08:37<04:03,  1.31it/s]

Step: 681, Loss: 2.256657123565674


 68%|██████▊   | 682/1000 [08:37<04:02,  1.31it/s]

Step: 682, Loss: 2.215550422668457


 68%|██████▊   | 683/1000 [08:38<04:01,  1.31it/s]

Step: 683, Loss: 2.210951328277588


 68%|██████▊   | 684/1000 [08:39<03:59,  1.32it/s]

Step: 684, Loss: 2.1930878162384033


 68%|██████▊   | 685/1000 [08:40<04:00,  1.31it/s]

Step: 685, Loss: 2.201063394546509


 69%|██████▊   | 686/1000 [08:41<04:00,  1.31it/s]

Step: 686, Loss: 2.194653272628784


 69%|██████▊   | 687/1000 [08:41<03:59,  1.31it/s]

Step: 687, Loss: 2.200132131576538


 69%|██████▉   | 688/1000 [08:42<03:57,  1.31it/s]

Step: 688, Loss: 2.2034876346588135


 69%|██████▉   | 689/1000 [08:43<03:58,  1.31it/s]

Step: 689, Loss: 2.1949923038482666


 69%|██████▉   | 690/1000 [08:44<03:54,  1.32it/s]

Step: 690, Loss: 2.192533493041992


 69%|██████▉   | 691/1000 [08:44<03:55,  1.31it/s]

Step: 691, Loss: 2.2027769088745117


 69%|██████▉   | 692/1000 [08:45<03:54,  1.31it/s]

Step: 692, Loss: 2.2133841514587402


 69%|██████▉   | 693/1000 [08:46<03:53,  1.31it/s]

Step: 693, Loss: 2.1770431995391846


 69%|██████▉   | 694/1000 [08:47<03:53,  1.31it/s]

Step: 694, Loss: 2.1801018714904785


 70%|██████▉   | 695/1000 [08:47<03:51,  1.32it/s]

Step: 695, Loss: 2.2003777027130127


 70%|██████▉   | 696/1000 [08:48<03:49,  1.33it/s]

Step: 696, Loss: 2.196868419647217


 70%|██████▉   | 697/1000 [08:49<03:50,  1.32it/s]

Step: 697, Loss: 2.2070016860961914


 70%|██████▉   | 698/1000 [08:50<03:52,  1.30it/s]

Step: 698, Loss: 2.188483238220215


 70%|██████▉   | 699/1000 [08:50<03:49,  1.31it/s]

Step: 699, Loss: 2.1584503650665283


 70%|███████   | 700/1000 [08:51<03:49,  1.31it/s]

Step: 700, Loss: 2.166722536087036


 70%|███████   | 701/1000 [08:52<03:49,  1.30it/s]

Step: 701, Loss: 2.156562566757202


 70%|███████   | 702/1000 [08:53<03:48,  1.30it/s]

Step: 702, Loss: 2.1717162132263184


 70%|███████   | 703/1000 [08:53<03:46,  1.31it/s]

Step: 703, Loss: 2.184147596359253


 70%|███████   | 704/1000 [08:54<03:45,  1.31it/s]

Step: 704, Loss: 2.1743052005767822


 70%|███████   | 705/1000 [08:55<03:44,  1.31it/s]

Step: 705, Loss: 2.163918972015381


 71%|███████   | 706/1000 [08:56<03:45,  1.31it/s]

Step: 706, Loss: 2.179140090942383


 71%|███████   | 707/1000 [08:57<03:43,  1.31it/s]

Step: 707, Loss: 2.1661324501037598


 71%|███████   | 708/1000 [08:57<03:42,  1.31it/s]

Step: 708, Loss: 2.143240213394165


 71%|███████   | 709/1000 [08:58<03:41,  1.32it/s]

Step: 709, Loss: 2.1667754650115967


 71%|███████   | 710/1000 [08:59<03:38,  1.32it/s]

Step: 710, Loss: 2.1752138137817383


 71%|███████   | 711/1000 [09:00<03:37,  1.33it/s]

Step: 711, Loss: 2.1572649478912354


 71%|███████   | 712/1000 [09:00<03:37,  1.33it/s]

Step: 712, Loss: 2.1676900386810303


 71%|███████▏  | 713/1000 [09:01<03:37,  1.32it/s]

Step: 713, Loss: 2.1489360332489014


 71%|███████▏  | 714/1000 [09:02<03:36,  1.32it/s]

Step: 714, Loss: 2.1384575366973877


 72%|███████▏  | 715/1000 [09:03<03:34,  1.33it/s]

Step: 715, Loss: 2.170240879058838


 72%|███████▏  | 716/1000 [09:03<03:33,  1.33it/s]

Step: 716, Loss: 2.1625816822052


 72%|███████▏  | 717/1000 [09:04<03:33,  1.33it/s]

Step: 717, Loss: 2.166386365890503


 72%|███████▏  | 718/1000 [09:05<03:33,  1.32it/s]

Step: 718, Loss: 2.138607978820801


 72%|███████▏  | 719/1000 [09:06<03:33,  1.31it/s]

Step: 719, Loss: 2.1629045009613037


 72%|███████▏  | 720/1000 [09:06<03:31,  1.33it/s]

Step: 720, Loss: 2.1576592922210693


 72%|███████▏  | 721/1000 [09:07<03:31,  1.32it/s]

Step: 721, Loss: 2.1534206867218018


 72%|███████▏  | 722/1000 [09:08<03:30,  1.32it/s]

Step: 722, Loss: 2.153120756149292


 72%|███████▏  | 723/1000 [09:09<03:29,  1.32it/s]

Step: 723, Loss: 2.1468749046325684


 72%|███████▏  | 724/1000 [09:09<03:28,  1.33it/s]

Step: 724, Loss: 2.1323866844177246


 72%|███████▎  | 725/1000 [09:10<03:28,  1.32it/s]

Step: 725, Loss: 2.1356923580169678


 73%|███████▎  | 726/1000 [09:11<03:26,  1.33it/s]

Step: 726, Loss: 2.149649143218994


 73%|███████▎  | 727/1000 [09:12<03:26,  1.32it/s]

Step: 727, Loss: 2.1316025257110596


 73%|███████▎  | 728/1000 [09:12<03:26,  1.31it/s]

Step: 728, Loss: 2.140686273574829


 73%|███████▎  | 729/1000 [09:13<03:24,  1.33it/s]

Step: 729, Loss: 2.156160831451416


 73%|███████▎  | 730/1000 [09:14<03:27,  1.30it/s]

Step: 730, Loss: 2.1200804710388184


 73%|███████▎  | 731/1000 [09:15<03:32,  1.26it/s]

Step: 731, Loss: 2.160080671310425


 73%|███████▎  | 732/1000 [09:16<03:31,  1.27it/s]

Step: 732, Loss: 2.138575792312622


 73%|███████▎  | 733/1000 [09:16<03:28,  1.28it/s]

Step: 733, Loss: 2.166400194168091


 73%|███████▎  | 734/1000 [09:17<03:26,  1.29it/s]

Step: 734, Loss: 2.152920722961426


 74%|███████▎  | 735/1000 [09:18<03:27,  1.28it/s]

Step: 735, Loss: 2.1260523796081543


 74%|███████▎  | 736/1000 [09:19<03:25,  1.28it/s]

Step: 736, Loss: 2.1607325077056885


 74%|███████▎  | 737/1000 [09:19<03:25,  1.28it/s]

Step: 737, Loss: 2.110053777694702


 74%|███████▍  | 738/1000 [09:20<03:23,  1.28it/s]

Step: 738, Loss: 2.1593499183654785


 74%|███████▍  | 739/1000 [09:21<03:21,  1.29it/s]

Step: 739, Loss: 2.1128149032592773


 74%|███████▍  | 740/1000 [09:22<03:19,  1.30it/s]

Step: 740, Loss: 2.119823694229126


 74%|███████▍  | 741/1000 [09:23<03:16,  1.32it/s]

Step: 741, Loss: 2.137934684753418


 74%|███████▍  | 742/1000 [09:23<03:16,  1.31it/s]

Step: 742, Loss: 2.1076924800872803


 74%|███████▍  | 743/1000 [09:24<03:18,  1.30it/s]

Step: 743, Loss: 2.1319518089294434


 74%|███████▍  | 744/1000 [09:25<03:17,  1.29it/s]

Step: 744, Loss: 2.1050922870635986


 74%|███████▍  | 745/1000 [09:26<03:17,  1.29it/s]

Step: 745, Loss: 2.127972364425659


 75%|███████▍  | 746/1000 [09:26<03:16,  1.30it/s]

Step: 746, Loss: 2.133283853530884


 75%|███████▍  | 747/1000 [09:27<03:14,  1.30it/s]

Step: 747, Loss: 2.139683723449707


 75%|███████▍  | 748/1000 [09:28<03:12,  1.31it/s]

Step: 748, Loss: 2.151700973510742


 75%|███████▍  | 749/1000 [09:29<03:11,  1.31it/s]

Step: 749, Loss: 2.1459896564483643


 75%|███████▌  | 750/1000 [09:29<03:10,  1.31it/s]

Step: 750, Loss: 2.1505212783813477


 75%|███████▌  | 751/1000 [09:30<03:09,  1.32it/s]

Step: 751, Loss: 2.1567766666412354


 75%|███████▌  | 752/1000 [09:31<03:10,  1.30it/s]

Step: 752, Loss: 2.1370866298675537


 75%|███████▌  | 753/1000 [09:32<03:09,  1.31it/s]

Step: 753, Loss: 2.111595392227173


 75%|███████▌  | 754/1000 [09:32<03:08,  1.30it/s]

Step: 754, Loss: 2.136195182800293


 76%|███████▌  | 755/1000 [09:33<03:07,  1.30it/s]

Step: 755, Loss: 2.1794936656951904


 76%|███████▌  | 756/1000 [09:34<03:09,  1.29it/s]

Step: 756, Loss: 2.1237213611602783


 76%|███████▌  | 757/1000 [09:35<03:12,  1.26it/s]

Step: 757, Loss: 2.1417975425720215


 76%|███████▌  | 758/1000 [09:36<03:11,  1.26it/s]

Step: 758, Loss: 2.131960391998291


 76%|███████▌  | 759/1000 [09:36<03:09,  1.27it/s]

Step: 759, Loss: 2.126084089279175


 76%|███████▌  | 760/1000 [09:37<03:07,  1.28it/s]

Step: 760, Loss: 2.106743097305298


 76%|███████▌  | 761/1000 [09:38<03:05,  1.29it/s]

Step: 761, Loss: 2.075296401977539


 76%|███████▌  | 762/1000 [09:39<03:03,  1.30it/s]

Step: 762, Loss: 2.1284356117248535


 76%|███████▋  | 763/1000 [09:40<03:02,  1.30it/s]

Step: 763, Loss: 2.110891103744507


 76%|███████▋  | 764/1000 [09:40<03:01,  1.30it/s]

Step: 764, Loss: 2.109384298324585


 76%|███████▋  | 765/1000 [09:41<02:59,  1.31it/s]

Step: 765, Loss: 2.1079065799713135


 77%|███████▋  | 766/1000 [09:42<02:59,  1.31it/s]

Step: 766, Loss: 2.0992095470428467


 77%|███████▋  | 767/1000 [09:43<02:58,  1.31it/s]

Step: 767, Loss: 2.111427068710327


 77%|███████▋  | 768/1000 [09:43<02:56,  1.31it/s]

Step: 768, Loss: 2.09322452545166


 77%|███████▋  | 769/1000 [09:44<02:56,  1.31it/s]

Step: 769, Loss: 2.0952885150909424


 77%|███████▋  | 770/1000 [09:45<02:56,  1.30it/s]

Step: 770, Loss: 2.085819959640503


 77%|███████▋  | 771/1000 [09:46<02:55,  1.31it/s]

Step: 771, Loss: 2.0791192054748535


 77%|███████▋  | 772/1000 [09:46<02:54,  1.31it/s]

Step: 772, Loss: 2.0998036861419678


 77%|███████▋  | 773/1000 [09:47<02:54,  1.30it/s]

Step: 773, Loss: 2.1012723445892334


 77%|███████▋  | 774/1000 [09:48<02:53,  1.31it/s]

Step: 774, Loss: 2.088711977005005


 78%|███████▊  | 775/1000 [09:49<02:51,  1.31it/s]

Step: 775, Loss: 2.0902090072631836


 78%|███████▊  | 776/1000 [09:49<02:51,  1.31it/s]

Step: 776, Loss: 2.112565279006958


 78%|███████▊  | 777/1000 [09:50<02:50,  1.30it/s]

Step: 777, Loss: 2.0938093662261963


 78%|███████▊  | 778/1000 [09:51<02:49,  1.31it/s]

Step: 778, Loss: 2.097688674926758


 78%|███████▊  | 779/1000 [09:52<02:49,  1.31it/s]

Step: 779, Loss: 2.0908336639404297


 78%|███████▊  | 780/1000 [09:52<02:46,  1.32it/s]

Step: 780, Loss: 2.1042230129241943


 78%|███████▊  | 781/1000 [09:53<02:45,  1.33it/s]

Step: 781, Loss: 2.0855541229248047


 78%|███████▊  | 782/1000 [09:54<02:45,  1.32it/s]

Step: 782, Loss: 2.1317338943481445


 78%|███████▊  | 783/1000 [09:55<02:44,  1.32it/s]

Step: 783, Loss: 2.093247652053833


 78%|███████▊  | 784/1000 [09:56<02:44,  1.31it/s]

Step: 784, Loss: 2.100459337234497


 78%|███████▊  | 785/1000 [09:56<02:44,  1.30it/s]

Step: 785, Loss: 2.0813841819763184


 79%|███████▊  | 786/1000 [09:57<02:43,  1.31it/s]

Step: 786, Loss: 2.070427656173706


 79%|███████▊  | 787/1000 [09:58<02:42,  1.31it/s]

Step: 787, Loss: 2.0819780826568604


 79%|███████▉  | 788/1000 [09:59<02:40,  1.32it/s]

Step: 788, Loss: 2.0710480213165283


 79%|███████▉  | 789/1000 [09:59<02:39,  1.32it/s]

Step: 789, Loss: 2.0831124782562256


 79%|███████▉  | 790/1000 [10:00<02:39,  1.32it/s]

Step: 790, Loss: 2.0917365550994873


 79%|███████▉  | 791/1000 [10:01<02:38,  1.32it/s]

Step: 791, Loss: 2.0561811923980713


 79%|███████▉  | 792/1000 [10:02<02:39,  1.30it/s]

Step: 792, Loss: 2.0863900184631348


 79%|███████▉  | 793/1000 [10:02<02:38,  1.30it/s]

Step: 793, Loss: 2.0606839656829834


 79%|███████▉  | 794/1000 [10:03<02:37,  1.30it/s]

Step: 794, Loss: 2.0409200191497803


 80%|███████▉  | 795/1000 [10:04<02:35,  1.32it/s]

Step: 795, Loss: 2.0763044357299805


 80%|███████▉  | 796/1000 [10:05<02:35,  1.31it/s]

Step: 796, Loss: 2.0636653900146484


 80%|███████▉  | 797/1000 [10:05<02:35,  1.31it/s]

Step: 797, Loss: 2.0658156871795654


 80%|███████▉  | 798/1000 [10:06<02:33,  1.32it/s]

Step: 798, Loss: 2.052031993865967


 80%|███████▉  | 799/1000 [10:07<02:33,  1.31it/s]

Step: 799, Loss: 2.0477113723754883


 80%|████████  | 800/1000 [10:08<02:33,  1.30it/s]

Step: 800, Loss: 2.0650064945220947


 80%|████████  | 801/1000 [10:09<02:32,  1.31it/s]

Step: 801, Loss: 2.058936834335327


 80%|████████  | 802/1000 [10:09<02:32,  1.30it/s]

Step: 802, Loss: 2.061307191848755


 80%|████████  | 803/1000 [10:10<02:33,  1.29it/s]

Step: 803, Loss: 2.04551362991333


 80%|████████  | 804/1000 [10:11<02:31,  1.29it/s]

Step: 804, Loss: 2.0483500957489014


 80%|████████  | 805/1000 [10:12<02:29,  1.30it/s]

Step: 805, Loss: 2.063924551010132


 81%|████████  | 806/1000 [10:12<02:28,  1.30it/s]

Step: 806, Loss: 2.074361801147461


 81%|████████  | 807/1000 [10:13<02:27,  1.31it/s]

Step: 807, Loss: 2.083317756652832


 81%|████████  | 808/1000 [10:14<02:26,  1.31it/s]

Step: 808, Loss: 2.06660795211792


 81%|████████  | 809/1000 [10:15<02:25,  1.31it/s]

Step: 809, Loss: 2.052687644958496


 81%|████████  | 810/1000 [10:15<02:24,  1.32it/s]

Step: 810, Loss: 2.055379629135132


 81%|████████  | 811/1000 [10:16<02:24,  1.31it/s]

Step: 811, Loss: 2.032021999359131


 81%|████████  | 812/1000 [10:17<02:23,  1.31it/s]

Step: 812, Loss: 2.0235135555267334


 81%|████████▏ | 813/1000 [10:18<02:22,  1.31it/s]

Step: 813, Loss: 2.0526671409606934


 81%|████████▏ | 814/1000 [10:18<02:21,  1.32it/s]

Step: 814, Loss: 2.04960298538208


 82%|████████▏ | 815/1000 [10:19<02:20,  1.31it/s]

Step: 815, Loss: 2.0275893211364746


 82%|████████▏ | 816/1000 [10:20<02:19,  1.32it/s]

Step: 816, Loss: 2.0653738975524902


 82%|████████▏ | 817/1000 [10:21<02:17,  1.33it/s]

Step: 817, Loss: 2.0312438011169434


 82%|████████▏ | 818/1000 [10:21<02:18,  1.32it/s]

Step: 818, Loss: 2.0331645011901855


 82%|████████▏ | 819/1000 [10:22<02:17,  1.32it/s]

Step: 819, Loss: 2.042109489440918


 82%|████████▏ | 820/1000 [10:23<02:16,  1.32it/s]

Step: 820, Loss: 2.0241541862487793


 82%|████████▏ | 821/1000 [10:24<02:15,  1.32it/s]

Step: 821, Loss: 2.040395975112915


 82%|████████▏ | 822/1000 [10:24<02:13,  1.33it/s]

Step: 822, Loss: 2.0419232845306396


 82%|████████▏ | 823/1000 [10:25<02:14,  1.32it/s]

Step: 823, Loss: 2.016857147216797


 82%|████████▏ | 824/1000 [10:26<02:14,  1.31it/s]

Step: 824, Loss: 2.042330741882324


 82%|████████▎ | 825/1000 [10:27<02:13,  1.31it/s]

Step: 825, Loss: 2.0261621475219727


 83%|████████▎ | 826/1000 [10:28<02:12,  1.31it/s]

Step: 826, Loss: 2.0319297313690186


 83%|████████▎ | 827/1000 [10:28<02:11,  1.31it/s]

Step: 827, Loss: 2.0198676586151123


 83%|████████▎ | 828/1000 [10:29<02:10,  1.32it/s]

Step: 828, Loss: 2.0086660385131836


 83%|████████▎ | 829/1000 [10:30<02:09,  1.32it/s]

Step: 829, Loss: 2.017639398574829


 83%|████████▎ | 830/1000 [10:31<02:09,  1.31it/s]

Step: 830, Loss: 2.0199201107025146


 83%|████████▎ | 831/1000 [10:31<02:08,  1.32it/s]

Step: 831, Loss: 2.0174574851989746


 83%|████████▎ | 832/1000 [10:32<02:07,  1.32it/s]

Step: 832, Loss: 2.0093488693237305


 83%|████████▎ | 833/1000 [10:33<02:06,  1.32it/s]

Step: 833, Loss: 2.0222103595733643


 83%|████████▎ | 834/1000 [10:34<02:06,  1.32it/s]

Step: 834, Loss: 2.0096988677978516


 84%|████████▎ | 835/1000 [10:34<02:04,  1.32it/s]

Step: 835, Loss: 2.0000171661376953


 84%|████████▎ | 836/1000 [10:35<02:04,  1.32it/s]

Step: 836, Loss: 2.005389928817749


 84%|████████▎ | 837/1000 [10:36<02:03,  1.32it/s]

Step: 837, Loss: 2.027768135070801


 84%|████████▍ | 838/1000 [10:37<02:02,  1.33it/s]

Step: 838, Loss: 2.0034682750701904


 84%|████████▍ | 839/1000 [10:37<02:02,  1.32it/s]

Step: 839, Loss: 2.0130629539489746


 84%|████████▍ | 840/1000 [10:38<02:00,  1.33it/s]

Step: 840, Loss: 2.0068297386169434


 84%|████████▍ | 841/1000 [10:39<01:59,  1.33it/s]

Step: 841, Loss: 2.020010232925415


 84%|████████▍ | 842/1000 [10:40<01:59,  1.32it/s]

Step: 842, Loss: 2.01406192779541


 84%|████████▍ | 843/1000 [10:41<02:02,  1.28it/s]

Step: 843, Loss: 2.014859676361084


 84%|████████▍ | 844/1000 [10:41<02:03,  1.26it/s]

Step: 844, Loss: 2.014622449874878


 84%|████████▍ | 845/1000 [10:42<02:00,  1.28it/s]

Step: 845, Loss: 2.009286880493164


 85%|████████▍ | 846/1000 [10:43<01:59,  1.29it/s]

Step: 846, Loss: 2.014115810394287


 85%|████████▍ | 847/1000 [10:44<01:57,  1.30it/s]

Step: 847, Loss: 2.005336046218872


 85%|████████▍ | 848/1000 [10:44<01:56,  1.31it/s]

Step: 848, Loss: 2.0249381065368652


 85%|████████▍ | 849/1000 [10:45<01:55,  1.31it/s]

Step: 849, Loss: 2.0122766494750977


 85%|████████▌ | 850/1000 [10:46<01:53,  1.32it/s]

Step: 850, Loss: 2.053670883178711


 85%|████████▌ | 851/1000 [10:47<01:53,  1.31it/s]

Step: 851, Loss: 2.030198574066162


 85%|████████▌ | 852/1000 [10:47<01:52,  1.31it/s]

Step: 852, Loss: 2.0410468578338623


 85%|████████▌ | 853/1000 [10:48<01:51,  1.32it/s]

Step: 853, Loss: 1.9836175441741943


 85%|████████▌ | 854/1000 [10:49<01:51,  1.31it/s]

Step: 854, Loss: 2.056035280227661


 86%|████████▌ | 855/1000 [10:50<01:50,  1.31it/s]

Step: 855, Loss: 2.0177831649780273


 86%|████████▌ | 856/1000 [10:50<01:49,  1.31it/s]

Step: 856, Loss: 2.0554966926574707


 86%|████████▌ | 857/1000 [10:51<01:49,  1.30it/s]

Step: 857, Loss: 2.0201315879821777


 86%|████████▌ | 858/1000 [10:52<01:48,  1.31it/s]

Step: 858, Loss: 2.05061411857605


 86%|████████▌ | 859/1000 [10:53<01:47,  1.31it/s]

Step: 859, Loss: 2.047318458557129


 86%|████████▌ | 860/1000 [10:54<01:47,  1.30it/s]

Step: 860, Loss: 2.068596363067627


 86%|████████▌ | 861/1000 [10:54<01:47,  1.30it/s]

Step: 861, Loss: 2.0157182216644287


 86%|████████▌ | 862/1000 [10:55<01:46,  1.29it/s]

Step: 862, Loss: 2.057631492614746


 86%|████████▋ | 863/1000 [10:56<01:46,  1.29it/s]

Step: 863, Loss: 2.075108766555786


 86%|████████▋ | 864/1000 [10:57<01:46,  1.28it/s]

Step: 864, Loss: 2.039820671081543


 86%|████████▋ | 865/1000 [10:57<01:46,  1.27it/s]

Step: 865, Loss: 2.027880907058716


 87%|████████▋ | 866/1000 [10:58<01:44,  1.29it/s]

Step: 866, Loss: 2.0040996074676514


 87%|████████▋ | 867/1000 [10:59<01:43,  1.29it/s]

Step: 867, Loss: 2.0894079208374023


 87%|████████▋ | 868/1000 [11:00<01:40,  1.31it/s]

Step: 868, Loss: 2.0259158611297607


 87%|████████▋ | 869/1000 [11:00<01:40,  1.31it/s]

Step: 869, Loss: 1.9967483282089233


 87%|████████▋ | 870/1000 [11:01<01:39,  1.31it/s]

Step: 870, Loss: 2.0754902362823486


 87%|████████▋ | 871/1000 [11:02<01:38,  1.31it/s]

Step: 871, Loss: 2.0083835124969482


 87%|████████▋ | 872/1000 [11:03<01:37,  1.31it/s]

Step: 872, Loss: 2.0258541107177734


 87%|████████▋ | 873/1000 [11:04<01:37,  1.30it/s]

Step: 873, Loss: 2.026931047439575


 87%|████████▋ | 874/1000 [11:04<01:35,  1.31it/s]

Step: 874, Loss: 2.023102045059204


 88%|████████▊ | 875/1000 [11:05<01:35,  1.31it/s]

Step: 875, Loss: 1.9943712949752808


 88%|████████▊ | 876/1000 [11:06<01:34,  1.31it/s]

Step: 876, Loss: 2.011129856109619


 88%|████████▊ | 877/1000 [11:07<01:33,  1.32it/s]

Step: 877, Loss: 1.9906736612319946


 88%|████████▊ | 878/1000 [11:07<01:32,  1.32it/s]

Step: 878, Loss: 1.9851250648498535


 88%|████████▊ | 879/1000 [11:08<01:31,  1.33it/s]

Step: 879, Loss: 1.9798771142959595


 88%|████████▊ | 880/1000 [11:09<01:30,  1.33it/s]

Step: 880, Loss: 2.0170512199401855


 88%|████████▊ | 881/1000 [11:10<01:30,  1.32it/s]

Step: 881, Loss: 1.9893766641616821


 88%|████████▊ | 882/1000 [11:10<01:30,  1.31it/s]

Step: 882, Loss: 1.998756766319275


 88%|████████▊ | 883/1000 [11:11<01:29,  1.31it/s]

Step: 883, Loss: 1.9859977960586548


 88%|████████▊ | 884/1000 [11:12<01:27,  1.32it/s]

Step: 884, Loss: 1.9627152681350708


 88%|████████▊ | 885/1000 [11:13<01:27,  1.31it/s]

Step: 885, Loss: 1.9485445022583008


 89%|████████▊ | 886/1000 [11:13<01:26,  1.32it/s]

Step: 886, Loss: 1.9554721117019653


 89%|████████▊ | 887/1000 [11:14<01:25,  1.32it/s]

Step: 887, Loss: 1.9515918493270874


 89%|████████▉ | 888/1000 [11:15<01:24,  1.32it/s]

Step: 888, Loss: 1.9457546472549438


 89%|████████▉ | 889/1000 [11:16<01:24,  1.31it/s]

Step: 889, Loss: 1.9866746664047241


 89%|████████▉ | 890/1000 [11:16<01:23,  1.31it/s]

Step: 890, Loss: 1.9701939821243286


 89%|████████▉ | 891/1000 [11:17<01:23,  1.31it/s]

Step: 891, Loss: 1.9627760648727417


 89%|████████▉ | 892/1000 [11:18<01:22,  1.31it/s]

Step: 892, Loss: 1.9693537950515747


 89%|████████▉ | 893/1000 [11:19<01:21,  1.31it/s]

Step: 893, Loss: 1.9843347072601318


 89%|████████▉ | 894/1000 [11:19<01:20,  1.31it/s]

Step: 894, Loss: 1.9471938610076904


 90%|████████▉ | 895/1000 [11:20<01:20,  1.31it/s]

Step: 895, Loss: 1.9889241456985474


 90%|████████▉ | 896/1000 [11:21<01:19,  1.32it/s]

Step: 896, Loss: 1.9983612298965454


 90%|████████▉ | 897/1000 [11:22<01:18,  1.32it/s]

Step: 897, Loss: 1.9584839344024658


 90%|████████▉ | 898/1000 [11:23<01:16,  1.33it/s]

Step: 898, Loss: 1.9619381427764893


 90%|████████▉ | 899/1000 [11:23<01:16,  1.32it/s]

Step: 899, Loss: 1.974879503250122


 90%|█████████ | 900/1000 [11:24<01:15,  1.32it/s]

Step: 900, Loss: 1.9554226398468018


 90%|█████████ | 901/1000 [11:25<01:14,  1.33it/s]

Step: 901, Loss: 1.9723803997039795


 90%|█████████ | 902/1000 [11:26<01:13,  1.33it/s]

Step: 902, Loss: 1.9667649269104004


 90%|█████████ | 903/1000 [11:26<01:13,  1.31it/s]

Step: 903, Loss: 1.9764546155929565


 90%|█████████ | 904/1000 [11:27<01:12,  1.32it/s]

Step: 904, Loss: 1.9732667207717896


 90%|█████████ | 905/1000 [11:28<01:12,  1.31it/s]

Step: 905, Loss: 1.9603201150894165


 91%|█████████ | 906/1000 [11:29<01:11,  1.31it/s]

Step: 906, Loss: 1.946903109550476


 91%|█████████ | 907/1000 [11:29<01:10,  1.31it/s]

Step: 907, Loss: 1.9396793842315674


 91%|█████████ | 908/1000 [11:30<01:10,  1.31it/s]

Step: 908, Loss: 1.9456467628479004


 91%|█████████ | 909/1000 [11:31<01:08,  1.32it/s]

Step: 909, Loss: 1.9445899724960327


 91%|█████████ | 910/1000 [11:32<01:08,  1.32it/s]

Step: 910, Loss: 1.9530972242355347


 91%|█████████ | 911/1000 [11:32<01:07,  1.31it/s]

Step: 911, Loss: 1.9584835767745972


 91%|█████████ | 912/1000 [11:33<01:06,  1.32it/s]

Step: 912, Loss: 1.9265234470367432


 91%|█████████▏| 913/1000 [11:34<01:06,  1.32it/s]

Step: 913, Loss: 1.9568142890930176


 91%|█████████▏| 914/1000 [11:35<01:05,  1.31it/s]

Step: 914, Loss: 1.9282149076461792


 92%|█████████▏| 915/1000 [11:35<01:04,  1.32it/s]

Step: 915, Loss: 1.9514280557632446


 92%|█████████▏| 916/1000 [11:36<01:04,  1.30it/s]

Step: 916, Loss: 1.9374428987503052


 92%|█████████▏| 917/1000 [11:37<01:03,  1.31it/s]

Step: 917, Loss: 1.9318770170211792


 92%|█████████▏| 918/1000 [11:38<01:02,  1.31it/s]

Step: 918, Loss: 1.9217028617858887


 92%|█████████▏| 919/1000 [11:38<01:01,  1.32it/s]

Step: 919, Loss: 1.9095711708068848


 92%|█████████▏| 920/1000 [11:39<01:00,  1.32it/s]

Step: 920, Loss: 1.9646849632263184


 92%|█████████▏| 921/1000 [11:40<01:00,  1.31it/s]

Step: 921, Loss: 1.9327431917190552


 92%|█████████▏| 922/1000 [11:41<00:59,  1.31it/s]

Step: 922, Loss: 1.9434263706207275


 92%|█████████▏| 923/1000 [11:42<00:59,  1.29it/s]

Step: 923, Loss: 1.9222798347473145


 92%|█████████▏| 924/1000 [11:42<01:00,  1.25it/s]

Step: 924, Loss: 1.9661866426467896


 92%|█████████▎| 925/1000 [11:43<00:59,  1.27it/s]

Step: 925, Loss: 1.9353358745574951


 93%|█████████▎| 926/1000 [11:44<00:57,  1.28it/s]

Step: 926, Loss: 1.9346615076065063


 93%|█████████▎| 927/1000 [11:45<00:56,  1.29it/s]

Step: 927, Loss: 1.9692550897598267


 93%|█████████▎| 928/1000 [11:45<00:55,  1.30it/s]

Step: 928, Loss: 1.9133429527282715


 93%|█████████▎| 929/1000 [11:46<00:54,  1.29it/s]

Step: 929, Loss: 1.9459971189498901


 93%|█████████▎| 930/1000 [11:47<00:54,  1.29it/s]

Step: 930, Loss: 1.9241530895233154


 93%|█████████▎| 931/1000 [11:48<00:53,  1.30it/s]

Step: 931, Loss: 1.9100745916366577


 93%|█████████▎| 932/1000 [11:49<00:52,  1.30it/s]

Step: 932, Loss: 1.9290791749954224


 93%|█████████▎| 933/1000 [11:49<00:51,  1.29it/s]

Step: 933, Loss: 1.9293816089630127


 93%|█████████▎| 934/1000 [11:50<00:51,  1.29it/s]

Step: 934, Loss: 1.9191598892211914


 94%|█████████▎| 935/1000 [11:51<00:50,  1.29it/s]

Step: 935, Loss: 1.9192641973495483


 94%|█████████▎| 936/1000 [11:52<00:49,  1.30it/s]

Step: 936, Loss: 1.9247970581054688


 94%|█████████▎| 937/1000 [11:52<00:48,  1.29it/s]

Step: 937, Loss: 1.929683804512024


 94%|█████████▍| 938/1000 [11:53<00:47,  1.29it/s]

Step: 938, Loss: 1.9090096950531006


 94%|█████████▍| 939/1000 [11:54<00:47,  1.29it/s]

Step: 939, Loss: 1.9126651287078857


 94%|█████████▍| 940/1000 [11:55<00:45,  1.31it/s]

Step: 940, Loss: 1.9112827777862549


 94%|█████████▍| 941/1000 [11:55<00:45,  1.30it/s]

Step: 941, Loss: 1.9194178581237793


 94%|█████████▍| 942/1000 [11:56<00:44,  1.30it/s]

Step: 942, Loss: 1.9207311868667603


 94%|█████████▍| 943/1000 [11:57<00:43,  1.31it/s]

Step: 943, Loss: 1.9392906427383423


 94%|█████████▍| 944/1000 [11:58<00:43,  1.29it/s]

Step: 944, Loss: 1.9041543006896973


 94%|█████████▍| 945/1000 [11:59<00:42,  1.29it/s]

Step: 945, Loss: 1.9141353368759155


 95%|█████████▍| 946/1000 [11:59<00:41,  1.29it/s]

Step: 946, Loss: 1.9497284889221191


 95%|█████████▍| 947/1000 [12:00<00:40,  1.30it/s]

Step: 947, Loss: 1.9000450372695923


 95%|█████████▍| 948/1000 [12:01<00:39,  1.30it/s]

Step: 948, Loss: 1.9079092741012573


 95%|█████████▍| 949/1000 [12:02<00:38,  1.31it/s]

Step: 949, Loss: 1.8722152709960938


 95%|█████████▌| 950/1000 [12:02<00:38,  1.31it/s]

Step: 950, Loss: 1.9019547700881958


 95%|█████████▌| 951/1000 [12:03<00:37,  1.31it/s]

Step: 951, Loss: 1.9070048332214355


 95%|█████████▌| 952/1000 [12:04<00:36,  1.30it/s]

Step: 952, Loss: 1.9098316431045532


 95%|█████████▌| 953/1000 [12:05<00:36,  1.29it/s]

Step: 953, Loss: 1.876877784729004


 95%|█████████▌| 954/1000 [12:06<00:36,  1.26it/s]

Step: 954, Loss: 1.8904668092727661


 96%|█████████▌| 955/1000 [12:06<00:36,  1.25it/s]

Step: 955, Loss: 1.910373568534851


 96%|█████████▌| 956/1000 [12:07<00:34,  1.26it/s]

Step: 956, Loss: 1.886601448059082


 96%|█████████▌| 957/1000 [12:08<00:34,  1.26it/s]

Step: 957, Loss: 1.9327970743179321


 96%|█████████▌| 958/1000 [12:09<00:33,  1.26it/s]

Step: 958, Loss: 1.8818600177764893


 96%|█████████▌| 959/1000 [12:10<00:32,  1.28it/s]

Step: 959, Loss: 1.8846665620803833


 96%|█████████▌| 960/1000 [12:10<00:31,  1.27it/s]

Step: 960, Loss: 1.8845467567443848


 96%|█████████▌| 961/1000 [12:11<00:30,  1.27it/s]

Step: 961, Loss: 1.8984010219573975


 96%|█████████▌| 962/1000 [12:12<00:29,  1.27it/s]

Step: 962, Loss: 1.8835880756378174


 96%|█████████▋| 963/1000 [12:13<00:30,  1.19it/s]

Step: 963, Loss: 1.891579270362854


 96%|█████████▋| 964/1000 [12:14<00:30,  1.19it/s]

Step: 964, Loss: 1.8843945264816284


 96%|█████████▋| 965/1000 [12:15<00:29,  1.20it/s]

Step: 965, Loss: 1.8935147523880005


 97%|█████████▋| 966/1000 [12:15<00:27,  1.22it/s]

Step: 966, Loss: 1.875964641571045


 97%|█████████▋| 967/1000 [12:16<00:26,  1.24it/s]

Step: 967, Loss: 1.892930269241333


 97%|█████████▋| 968/1000 [12:17<00:25,  1.25it/s]

Step: 968, Loss: 1.9111992120742798


 97%|█████████▋| 969/1000 [12:18<00:24,  1.27it/s]

Step: 969, Loss: 1.9116374254226685


 97%|█████████▋| 970/1000 [12:18<00:23,  1.28it/s]

Step: 970, Loss: 1.9013092517852783


 97%|█████████▋| 971/1000 [12:19<00:22,  1.30it/s]

Step: 971, Loss: 1.9366029500961304


 97%|█████████▋| 972/1000 [12:20<00:21,  1.30it/s]

Step: 972, Loss: 1.9300259351730347


 97%|█████████▋| 973/1000 [12:21<00:20,  1.29it/s]

Step: 973, Loss: 1.9054667949676514


 97%|█████████▋| 974/1000 [12:21<00:20,  1.29it/s]

Step: 974, Loss: 1.9120850563049316


 98%|█████████▊| 975/1000 [12:22<00:19,  1.30it/s]

Step: 975, Loss: 1.908440113067627


 98%|█████████▊| 976/1000 [12:23<00:18,  1.30it/s]

Step: 976, Loss: 1.9124689102172852


 98%|█████████▊| 977/1000 [12:24<00:17,  1.30it/s]

Step: 977, Loss: 1.874609351158142


 98%|█████████▊| 978/1000 [12:25<00:16,  1.30it/s]

Step: 978, Loss: 1.8995850086212158


 98%|█████████▊| 979/1000 [12:25<00:16,  1.30it/s]

Step: 979, Loss: 1.8771694898605347


 98%|█████████▊| 980/1000 [12:26<00:15,  1.29it/s]

Step: 980, Loss: 1.9245845079421997


 98%|█████████▊| 981/1000 [12:27<00:15,  1.27it/s]

Step: 981, Loss: 1.8748245239257812


 98%|█████████▊| 982/1000 [12:28<00:14,  1.28it/s]

Step: 982, Loss: 1.8833163976669312


 98%|█████████▊| 983/1000 [12:28<00:13,  1.29it/s]

Step: 983, Loss: 1.9673718214035034


 98%|█████████▊| 984/1000 [12:29<00:12,  1.30it/s]

Step: 984, Loss: 1.9011976718902588


 98%|█████████▊| 985/1000 [12:30<00:11,  1.31it/s]

Step: 985, Loss: 1.925065279006958


 99%|█████████▊| 986/1000 [12:31<00:10,  1.31it/s]

Step: 986, Loss: 1.9079777002334595


 99%|█████████▊| 987/1000 [12:31<00:09,  1.33it/s]

Step: 987, Loss: 1.9458494186401367


 99%|█████████▉| 988/1000 [12:32<00:08,  1.34it/s]

Step: 988, Loss: 1.8768447637557983


 99%|█████████▉| 989/1000 [12:33<00:08,  1.34it/s]

Step: 989, Loss: 1.9105584621429443


 99%|█████████▉| 990/1000 [12:34<00:07,  1.35it/s]

Step: 990, Loss: 1.8779590129852295


 99%|█████████▉| 991/1000 [12:34<00:06,  1.35it/s]

Step: 991, Loss: 1.9354889392852783


 99%|█████████▉| 992/1000 [12:35<00:05,  1.36it/s]

Step: 992, Loss: 1.8810359239578247


 99%|█████████▉| 993/1000 [12:36<00:05,  1.35it/s]

Step: 993, Loss: 1.9311881065368652


 99%|█████████▉| 994/1000 [12:37<00:04,  1.35it/s]

Step: 994, Loss: 1.9081790447235107


100%|█████████▉| 995/1000 [12:37<00:03,  1.36it/s]

Step: 995, Loss: 1.9110134840011597


100%|█████████▉| 996/1000 [12:38<00:02,  1.37it/s]

Step: 996, Loss: 1.8914402723312378


100%|█████████▉| 997/1000 [12:39<00:02,  1.36it/s]

Step: 997, Loss: 1.8610883951187134


100%|█████████▉| 998/1000 [12:40<00:01,  1.37it/s]

Step: 998, Loss: 1.9090291261672974


100%|█████████▉| 999/1000 [12:40<00:00,  1.37it/s]

Step: 999, Loss: 1.9072178602218628


100%|██████████| 1000/1000 [12:41<00:00,  1.31it/s]

Step: 1000, Loss: 1.8604521751403809





In [30]:
transformer.eval()
src_sample = torch.zeros(50, dtype=torch.int64)
src_sample[:10] = torch.arange(833, 843, dtype=torch.int64)

In [31]:
src_sample.unsqueeze(0)

tensor([[833, 834, 835, 836, 837, 838, 839, 840, 841, 842,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]])

In [32]:
res = transformer(src_sample.unsqueeze(0), src_sample.unsqueeze(0))

In [33]:
res.squeeze().argmax(dim=1)

tensor([833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 841, 841, 841, 847,
        847, 847, 847, 847, 849, 853, 853, 854, 854, 854, 854, 857, 857, 859,
        862, 862, 863, 863, 863, 865, 866, 866, 869, 869, 869, 872, 873, 873,
        874, 876, 876, 878, 878, 878, 880, 880])