In [400]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from torch.utils.data import Dataset,RandomSampler
import operator
import numpy as np
%matplotlib inline

In [401]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD_token = 0
SOS_token = 1
EOS_token = 2
vocab_size = 85000
hidden_size = 256
# emb_size = 256
MAX_LENGTH_1 = 100 # since 99% source sentence is <= 100
# MAX_LENGTH_1 = max(len(pair[0].split(" ")) for pair in pairs)
# MAX_LENGTH_2 = max(len(pair[1].split(" ")) for pair in pairs)
dropout_p = 0.1
teacher_forcing_ratio = 0.5
BATCH_SIZE = 32

In [402]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [403]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"PAD", 1: "SOS", 2: "EOS",3:"UNK"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
        
            
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def readLangs(lang1, lang2):
    print("Reading lines...")

    # Read the file and split into lines
    lang1_lines = open('data/iwslt-zh-en/train.tok.zh', encoding = 'utf-8').read().\
                    strip().split('\n')
    lang2_lines = open('data/iwslt-zh-en/train.tok.en', encoding = 'utf-8').read().\
                    strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [(lang1_lines[i] +',' + lang2_lines[i]).split(',',1) for i in range(len(lang1_lines))]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


def build_topwordVocab(lang, vocab_size):
    print("Build vocabulary by top {} frequent word...".format(vocab_size))
    sorted_word2Count = sorted(lang.word2count.items(),
        key=operator.itemgetter(1),
        reverse=True)
    sorted_words = [x[0] for x in sorted_word2Count[:vocab_size]]
    
    
    lang.index2word = {}
    lang.index2word[0] = "PAD"
    lang.index2word[1] = "SOS"
    lang.index2word[2] = "EOS"
    lang.index2word[3] = "UNK"
    
    for ind, word in enumerate(sorted_words):
            lang.index2word[word] = ind + 4
            

    lang.word2index = {}
    for ind, word in enumerate(sorted_words):
        lang.word2index[ind + 4] = word
    
    lang.n_words = len(lang.index2word)
    
    print(lang.name, lang.n_words)
    return lang

input_lang, output_lang, pairs = prepareData('eng', 'chi')

input_lang = build_topwordVocab(input_lang,vocab_size)
print(random.choice(pairs))

Reading lines...
Read 213376 sentence pairs
Counting words...
Counted words:
eng 88919
chi 69127
Build vocabulary by top 85000 frequent word...
eng 85004
['热衷 热衷于 科技 的 Kevin Kelly  问    科技 要 的 是 什么     并发 发现 它 的 发展 的 繁复   跟 生命 的 进化 很 相似 ', 'Tech enthusiast Kevin Kelly asks &quot; What does technology want ? &quot; and discovers that its movement toward ubiquity and complexity is much like the evolution of life .']


In [381]:
sorted_word2Count = sorted(input_lang.word2count.items(),
    key=operator.itemgetter(1),
    reverse=True)

In [383]:
sorted_word2Count ###标点符号排第一 之后要改掉

In [405]:
def indexesFromSentence(lang, sentence):
    idxs = []
    for word in sentence.split(' '):
        try:
            idxs.append(lang.word2index[word])
        except KeyError:
            idxs.append(3)  # 3 is the id of 'UNK'
    return idxs


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)



class VocabDataset(Dataset):
    def __init__(self, pairs):
#         pairs = [tensorsFromPair(pair) for pair in pairs]
#         self.source_sent_list = [i[0] for i in pairs]
#         self.target_sent_list = [i[1] for i in pairs]

        self.source_sent_list = [indexesFromSentence(input_lang,pair[0]) for pair in pairs]
        self.target_sent_list = [indexesFromSentence(output_lang,pair[1]) for pair in pairs]
        
    def __len__(self):
        return len(self.source_sent_list)
        
    def __getitem__(self, key):
        token1_idx = self.source_sent_list[key][:MAX_LENGTH_1]
        token2_idx = self.target_sent_list[key][:MAX_LENGTH_1]
        return [token1_idx,token2_idx, len(token1_idx), len(token2_idx)]

    
def Vocab_collate_func(batch):
    source_sent_list = []
    target_sent_list = []
    source_len_list = []
    target_len_list = []

    for datum in batch:   ### batch = sample
        source_len_list.append(datum[2])
        target_len_list.append(datum[3])

    # padding
    for datum in batch:
        
        # source sentence processing
        padded_source = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH_1-datum[2])),          ### 0代表左边没有pad,右边的值代表右边pad的个数
                                mode="constant", constant_values=PAD_token)
        source_sent_list.append(padded_source)
        
        # target sentence processing
        padded_target = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_LENGTH_1-datum[3])),          ### 0代表左边没有pad,右边的值代表右边pad的个数
                                mode="constant", constant_values=PAD_token)
        target_sent_list.append(padded_target)
        
    return [torch.tensor(source_sent_list,device = device), 
            torch.tensor(target_sent_list,device = device),
            torch.LongTensor(source_len_list,device = device), 
            torch.LongTensor(target_len_list,device = device)]

train_dataset = VocabDataset(pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=Vocab_collate_func,
                                           shuffle=True)

In [466]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True) # embedding size = hidden size
        self.fc1 = nn.Linear(2*hidden_size, hidden_size)
    def initHidden(self,BATCH_SIZE):
        return torch.zeros(2, BATCH_SIZE, self.hidden_size, device=device) # return (2,1,hidden_size) 2 due to bidirection
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, BATCH_SIZE, -1)  # input is just one token at timpstep t
        output = embedded
        output, hidden = self.gru(output, hidden)  
        # (seq_len, batch, num_directions * hidden_size) and (num_layers * num_directions, batch, hidden_size)
        output = self.fc1(output)
        return output, hidden

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH_1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, BATCH_SIZE, -1)
        embedded = self.dropout(embedded)
        
#         embedded: torch.Size([1, 32, 256])
#         hidden: torch.Size([1, 32, 256])
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)   
#         attn_weights:torch.Size([32, 100])
        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs.transpose(0,1))
#         encoder_outputs: 100*32*512 attn_applied: 32*1*512
        
        output = torch.cat((embedded[0], attn_applied.transpose(0,1)[0]), 1)
        # output: 32*768
        output = self.attn_combine(output).unsqueeze(0)
        # output 1*32*256
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
#         output: torch.Size([32, 69126])
#         hidden: torch.Size([1, 32, 256])
        return output, hidden, attn_weights

In [467]:
# this is just one sentence input, could be batchlized 
def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, mask = None):
    encoder_hidden = encoder.initHidden(BATCH_SIZE)
    encoder_optimizer.zero_grad()  # zero out the accumulated gradient over mini-batch
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0) # length of source sentence
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(target_length, BATCH_SIZE, encoder.hidden_size, device=device) 
    # (seq_length, BATCH_SIZE,hidden_size*2) 2 due to bidirection

    loss = 0
    
    # feed-forward layer resulting encoder outputs, ei refers to each word token in input sentence
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)  
        # encoder_output: torch.Size([1, 32, 512]) encoder_hidden: torch.Size([2, 32, 256])
        encoder_outputs[ei] = encoder_output[0] 
    # change the shape of encoder output to fit into decoder 
    encoder_hidden = nn.Linear(2*hidden_size,hidden_size)(
        torch.cat((encoder_hidden[0],encoder_hidden[1]),dim = 1)).unsqueeze(0)
    
    decoder_input = torch.tensor([[SOS_token]*32], device=device)  # decoder_input: torch.Size([1, 32])
    # init decoder hidden 
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            
            decoder_input = target_tensor[di]  # Teacher forcing
            temp_loss = criterion(decoder_output, target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()  
            ave_loss = loss.sum()/BATCH_SIZE 
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            # decoder_input: torch.Size([1, 32])
            # decoder_hidden: torch.Size([1, 32, 256]) 1 token * batch * hidden size
            # encoder_outputs: torch.Size([100, 32, 512])
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            # topv: 32*1
            # topi: 32*1

            decoder_input = topi.squeeze().detach()  # detach from history as input
            # decoder_input: 32
            # target_tensor: 100*32
            # decoder_output: 32*69127 
            temp_loss = criterion(decoder_output, target_tensor[di])
            loss += temp_loss * mask[di:di+1].float()
            # loss size 1*32
            ave_loss = loss.sum()/BATCH_SIZE  
            
    ave_loss.backward()
    
    
    encoder_optimizer.step()   # update parameters
    decoder_optimizer.step()

    return ave_loss.item() / target_length

In [468]:
def trainIters(encoder, decoder, n_iters, print_every=100, plot_every=100, learning_rate=0.01):
    start = time.time()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(reduce = False) ##!!!!!!!!!!1 这个loss是否要换成crossentropy

    for epoch in range(1, n_iters + 1):
        plot_losses = []
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        for i, (input_sentences, target_sentences,len1,len2) in enumerate(train_loader): 
            input_tensor = input_sentences.transpose(0,1)   # 32*100 to 100*32
            target_tensor = target_sentences.transpose(0,1)
            mask = target_tensor.ge(1)   # 100 * 32
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion, mask = mask)
            print_loss_total += loss
            plot_loss_total += loss

            if i > 0 and i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('Time: {}, Epoch: [{}/{}], Step: [{}/{}], Train Loss: {}'.format(
                    timeSince(start, i + 1/len(train_loader)), epoch, n_iters, i, 
                    len(train_loader),print_loss_avg))

            if i > 0 and i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
        print(plot_losses)
        showPlot(plot_losses)

In [476]:
# def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH_1):
#     with torch.no_grad():
#         input_tensor = tensorFromSentence(input_lang, sentence)
#         input_length = input_tensor.size()[0]
#         encoder_hidden = encoder.initHidden(BATCH_SIZE)

#         encoder_outputs = torch.zeros(max_length, encoder.hidden_size*2, device=device)

#         for ei in range(input_length):
#             encoder_output, encoder_hidden = encoder(input_tensor[ei],
#                                                      encoder_hidden)
#             encoder_outputs[ei] += encoder_output[0, 0]

#         decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        
#         decoder_hidden = encoder_hidden

#         decoded_words = []
#         decoder_attentions = torch.zeros(max_length, max_length)

#         for di in range(max_length):
#             decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)
#             decoder_attentions[di] = decoder_attention.data
#             topv, topi = decoder_output.data.topk(1)
#             if topi.item() == EOS_token:
#                 decoded_words.append('<EOS>')
#                 break
#             else:
#                 decoded_words.append(output_lang.index2word[topi.item()])

#             decoder_input = topi.squeeze().detach()

#         return decoded_words, decoder_attentions[:di + 1]
    
# def evaluateRandomly(encoder, decoder, n=10):
#     for i in range(n):
#         pair = random.choice(pairs)
#         print('>', pair[0])
#         print('=', pair[1])
#         output_words, attentions = evaluate(encoder, decoder, pair[0])
#         output_sentence = ' '.join(output_words)
#         print('<', output_sentence)
#         print('')

In [471]:
encoder1 = EncoderRNN(input_lang.n_words,hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 3, print_every=1,plot_every=1)

torch.save(encoder1.state_dict(), "saved_model/encoder_hiddenSize{}".format(hidden_size))
torch.save(attn_decoder1.state_dict(), "saved_model/attn_decoder_hiddenSize{}".format(hidden_size))



Time: 0m 54s (- -1m 59s), Epoch: [1/3], Step: [1/6668], Train Loss: 3.5373974609375
Time: 1m 15s (- -1m 22s), Epoch: [1/3], Step: [2/6668], Train Loss: 1.6213475036621094
Time: 1m 42s (- -2m 51s), Epoch: [1/3], Step: [3/6668], Train Loss: 1.451114501953125
Time: 2m 8s (- -2m 23s), Epoch: [1/3], Step: [4/6668], Train Loss: 1.608426055908203
Time: 2m 31s (- -3m 58s), Epoch: [1/3], Step: [5/6668], Train Loss: 1.4026364135742186
Time: 2m 56s (- -3m 33s), Epoch: [1/3], Step: [6/6668], Train Loss: 1.6417161560058593
Time: 3m 22s (- -3m 6s), Epoch: [1/3], Step: [7/6668], Train Loss: 2.0729544067382815
Time: 3m 45s (- -4m 42s), Epoch: [1/3], Step: [8/6668], Train Loss: 1.338889617919922
Time: 4m 4s (- -4m 22s), Epoch: [1/3], Step: [9/6668], Train Loss: 1.7672528076171874
Time: 4m 24s (- -4m 1s), Epoch: [1/3], Step: [10/6668], Train Loss: 1.6693408203125
Time: 4m 46s (- -5m 39s), Epoch: [1/3], Step: [11/6668], Train Loss: 1.6399649047851563
Time: 5m 22s (- -5m 4s), Epoch: [1/3], Step: [12/6668]

Time: 34m 32s (- -35m 49s), Epoch: [1/3], Step: [94/6668], Train Loss: 1.4132022094726562
Time: 34m 50s (- -35m 31s), Epoch: [1/3], Step: [95/6668], Train Loss: 1.300406494140625
Time: 35m 9s (- -35m 12s), Epoch: [1/3], Step: [96/6668], Train Loss: 1.3192240905761718
Time: 35m 30s (- -36m 51s), Epoch: [1/3], Step: [97/6668], Train Loss: 1.7629730224609375
Time: 35m 48s (- -36m 33s), Epoch: [1/3], Step: [98/6668], Train Loss: 1.6520623779296875
Time: 36m 7s (- -36m 14s), Epoch: [1/3], Step: [99/6668], Train Loss: 1.5467344665527343
Time: 36m 25s (- -37m 56s), Epoch: [1/3], Step: [100/6668], Train Loss: 1.6968312072753906
Time: 36m 43s (- -37m 37s), Epoch: [1/3], Step: [101/6668], Train Loss: 1.4376473999023438
Time: 37m 3s (- -37m 18s), Epoch: [1/3], Step: [102/6668], Train Loss: 1.3995938110351562
Time: 37m 21s (- -37m 0s), Epoch: [1/3], Step: [103/6668], Train Loss: 1.3905220031738281
Time: 37m 38s (- -38m 42s), Epoch: [1/3], Step: [104/6668], Train Loss: 1.6971363830566406
Time: 37m 

Time: 63m 13s (- -63m 7s), Epoch: [1/3], Step: [185/6668], Train Loss: 1.8678671264648437
Time: 63m 32s (- -64m 47s), Epoch: [1/3], Step: [186/6668], Train Loss: 1.4702879333496093
Time: 63m 50s (- -64m 30s), Epoch: [1/3], Step: [187/6668], Train Loss: 1.7260762023925782
Time: 64m 9s (- -64m 10s), Epoch: [1/3], Step: [188/6668], Train Loss: 1.4184539794921875
Time: 64m 27s (- -65m 52s), Epoch: [1/3], Step: [189/6668], Train Loss: 1.699876251220703
Time: 64m 45s (- -65m 35s), Epoch: [1/3], Step: [190/6668], Train Loss: 1.6978616333007812
Time: 65m 4s (- -65m 15s), Epoch: [1/3], Step: [191/6668], Train Loss: 1.7648036193847656
Time: 65m 24s (- -66m 56s), Epoch: [1/3], Step: [192/6668], Train Loss: 1.9315994262695313
Time: 65m 41s (- -66m 38s), Epoch: [1/3], Step: [193/6668], Train Loss: 1.6023661804199218
Time: 66m 1s (- -66m 18s), Epoch: [1/3], Step: [194/6668], Train Loss: 2.120616760253906
Time: 66m 20s (- -67m 59s), Epoch: [1/3], Step: [195/6668], Train Loss: 1.9595951843261719
Time:

Time: 91m 25s (- -92m 54s), Epoch: [1/3], Step: [276/6668], Train Loss: 1.4959503173828126
Time: 91m 43s (- -92m 36s), Epoch: [1/3], Step: [277/6668], Train Loss: 1.6247982788085937
Time: 92m 2s (- -92m 16s), Epoch: [1/3], Step: [278/6668], Train Loss: 1.583105010986328
Time: 92m 22s (- -93m 57s), Epoch: [1/3], Step: [279/6668], Train Loss: 1.7925144958496093
Time: 92m 42s (- -93m 37s), Epoch: [1/3], Step: [280/6668], Train Loss: 1.5568218994140626
Time: 92m 59s (- -93m 20s), Epoch: [1/3], Step: [281/6668], Train Loss: 1.52027099609375
Time: 93m 17s (- -93m 2s), Epoch: [1/3], Step: [282/6668], Train Loss: 1.5665008544921875
Time: 93m 34s (- -94m 45s), Epoch: [1/3], Step: [283/6668], Train Loss: 1.71555419921875
Time: 93m 52s (- -94m 27s), Epoch: [1/3], Step: [284/6668], Train Loss: 1.830846405029297
Time: 94m 12s (- -94m 7s), Epoch: [1/3], Step: [285/6668], Train Loss: 1.356453399658203
Time: 94m 31s (- -95m 48s), Epoch: [1/3], Step: [286/6668], Train Loss: 1.8201727294921874
Time: 94m

Time: 119m 23s (- -120m 55s), Epoch: [1/3], Step: [366/6668], Train Loss: 1.967689208984375
Time: 119m 43s (- -120m 36s), Epoch: [1/3], Step: [367/6668], Train Loss: 1.448539276123047
Time: 120m 2s (- -120m 16s), Epoch: [1/3], Step: [368/6668], Train Loss: 1.8818609619140625
Time: 120m 20s (- -121m 59s), Epoch: [1/3], Step: [369/6668], Train Loss: 1.4718490600585938
Time: 120m 37s (- -121m 41s), Epoch: [1/3], Step: [370/6668], Train Loss: 1.5973048400878906
Time: 120m 58s (- -121m 21s), Epoch: [1/3], Step: [371/6668], Train Loss: 1.4308395385742188
Time: 121m 15s (- -121m 3s), Epoch: [1/3], Step: [372/6668], Train Loss: 1.6964529418945313
Time: 121m 33s (- -122m 45s), Epoch: [1/3], Step: [373/6668], Train Loss: 1.7737437438964845
Time: 121m 53s (- -122m 26s), Epoch: [1/3], Step: [374/6668], Train Loss: 1.6908680725097656
Time: 122m 11s (- -122m 8s), Epoch: [1/3], Step: [375/6668], Train Loss: 1.8548619079589843
Time: 122m 30s (- -123m 48s), Epoch: [1/3], Step: [376/6668], Train Loss: 1

Time: 147m 16s (- -147m 3s), Epoch: [1/3], Step: [455/6668], Train Loss: 1.843590087890625
Time: 147m 33s (- -148m 45s), Epoch: [1/3], Step: [456/6668], Train Loss: 1.7165211486816405
Time: 147m 51s (- -148m 28s), Epoch: [1/3], Step: [457/6668], Train Loss: 1.9577841186523437
Time: 148m 8s (- -148m 10s), Epoch: [1/3], Step: [458/6668], Train Loss: 1.841990966796875
Time: 148m 29s (- -149m 49s), Epoch: [1/3], Step: [459/6668], Train Loss: 1.561771240234375
Time: 148m 47s (- -149m 31s), Epoch: [1/3], Step: [460/6668], Train Loss: 1.7286647033691407
Time: 149m 7s (- -149m 12s), Epoch: [1/3], Step: [461/6668], Train Loss: 1.5310670471191405
Time: 149m 26s (- -150m 52s), Epoch: [1/3], Step: [462/6668], Train Loss: 1.5324465942382812
Time: 149m 44s (- -150m 34s), Epoch: [1/3], Step: [463/6668], Train Loss: 1.919459686279297
Time: 150m 4s (- -150m 15s), Epoch: [1/3], Step: [464/6668], Train Loss: 1.8672561645507812
Time: 150m 23s (- -151m 55s), Epoch: [1/3], Step: [465/6668], Train Loss: 2.11

Time: 174m 34s (- -175m 44s), Epoch: [1/3], Step: [544/6668], Train Loss: 1.8983291625976562
Time: 174m 54s (- -175m 24s), Epoch: [1/3], Step: [545/6668], Train Loss: 1.8819035339355468
Time: 175m 11s (- -175m 7s), Epoch: [1/3], Step: [546/6668], Train Loss: 1.7729766845703125
Time: 175m 29s (- -176m 49s), Epoch: [1/3], Step: [547/6668], Train Loss: 1.9108641052246094
Time: 175m 47s (- -176m 31s), Epoch: [1/3], Step: [548/6668], Train Loss: 1.8097640991210937
Time: 176m 7s (- -176m 11s), Epoch: [1/3], Step: [549/6668], Train Loss: 1.7979391479492188
Time: 176m 27s (- -177m 51s), Epoch: [1/3], Step: [550/6668], Train Loss: 2.0388223266601564
Time: 176m 44s (- -177m 34s), Epoch: [1/3], Step: [551/6668], Train Loss: 1.7554461669921875
Time: 177m 2s (- -177m 17s), Epoch: [1/3], Step: [552/6668], Train Loss: 1.71939208984375
Time: 177m 19s (- -178m 59s), Epoch: [1/3], Step: [553/6668], Train Loss: 1.7433401489257812
Time: 177m 39s (- -178m 40s), Epoch: [1/3], Step: [554/6668], Train Loss: 1

Time: 202m 15s (- -202m 3s), Epoch: [1/3], Step: [633/6668], Train Loss: 1.5234945678710938
Time: 202m 34s (- -203m 44s), Epoch: [1/3], Step: [634/6668], Train Loss: 1.8587930297851563
Time: 202m 54s (- -203m 24s), Epoch: [1/3], Step: [635/6668], Train Loss: 1.6561166381835937
Time: 203m 13s (- -203m 5s), Epoch: [1/3], Step: [636/6668], Train Loss: 1.6528034973144532
Time: 203m 33s (- -204m 45s), Epoch: [1/3], Step: [637/6668], Train Loss: 1.9574240112304688
Time: 203m 53s (- -204m 26s), Epoch: [1/3], Step: [638/6668], Train Loss: 1.5633697509765625
Time: 204m 12s (- -204m 6s), Epoch: [1/3], Step: [639/6668], Train Loss: 2.0767333984375
Time: 204m 29s (- -205m 49s), Epoch: [1/3], Step: [640/6668], Train Loss: 1.9312225341796876
Time: 204m 49s (- -205m 29s), Epoch: [1/3], Step: [641/6668], Train Loss: 1.8741275024414064
Time: 205m 7s (- -205m 11s), Epoch: [1/3], Step: [642/6668], Train Loss: 1.4528022766113282
Time: 205m 25s (- -206m 53s), Epoch: [1/3], Step: [643/6668], Train Loss: 2.0

Time: 229m 51s (- -230m 28s), Epoch: [1/3], Step: [722/6668], Train Loss: 1.540093231201172
Time: 230m 8s (- -230m 10s), Epoch: [1/3], Step: [723/6668], Train Loss: 1.5790687561035157
Time: 230m 28s (- -231m 51s), Epoch: [1/3], Step: [724/6668], Train Loss: 1.8149642944335938
Time: 230m 45s (- -231m 33s), Epoch: [1/3], Step: [725/6668], Train Loss: 1.6631248474121094
Time: 231m 3s (- -231m 15s), Epoch: [1/3], Step: [726/6668], Train Loss: 1.6074737548828124
Time: 231m 22s (- -232m 56s), Epoch: [1/3], Step: [727/6668], Train Loss: 1.9588938903808595
Time: 231m 40s (- -232m 38s), Epoch: [1/3], Step: [728/6668], Train Loss: 1.803288116455078
Time: 231m 59s (- -232m 19s), Epoch: [1/3], Step: [729/6668], Train Loss: 1.899918670654297
Time: 232m 17s (- -232m 1s), Epoch: [1/3], Step: [730/6668], Train Loss: 2.0578599548339844
Time: 232m 34s (- -233m 44s), Epoch: [1/3], Step: [731/6668], Train Loss: 2.036624755859375
Time: 232m 52s (- -233m 26s), Epoch: [1/3], Step: [732/6668], Train Loss: 1.3

Time: 257m 38s (- -258m 40s), Epoch: [1/3], Step: [811/6668], Train Loss: 2.0038180541992188
Time: 257m 56s (- -258m 22s), Epoch: [1/3], Step: [812/6668], Train Loss: 2.1654908752441404
Time: 258m 13s (- -258m 5s), Epoch: [1/3], Step: [813/6668], Train Loss: 1.6643614196777343
Time: 258m 33s (- -259m 45s), Epoch: [1/3], Step: [814/6668], Train Loss: 1.6232528686523438
Time: 258m 52s (- -259m 26s), Epoch: [1/3], Step: [815/6668], Train Loss: 1.8224990844726563
Time: 259m 10s (- -259m 9s), Epoch: [1/3], Step: [816/6668], Train Loss: 1.7843804931640626
Time: 259m 29s (- -260m 49s), Epoch: [1/3], Step: [817/6668], Train Loss: 2.010695495605469
Time: 259m 49s (- -260m 29s), Epoch: [1/3], Step: [818/6668], Train Loss: 2.4595600891113283
Time: 260m 9s (- -260m 9s), Epoch: [1/3], Step: [819/6668], Train Loss: 1.9231622314453125
Time: 260m 26s (- -261m 52s), Epoch: [1/3], Step: [820/6668], Train Loss: 1.7854887390136718
Time: 260m 46s (- -261m 32s), Epoch: [1/3], Step: [821/6668], Train Loss: 1

Time: 285m 27s (- -286m 52s), Epoch: [1/3], Step: [900/6668], Train Loss: 1.9199696350097657
Time: 285m 44s (- -286m 34s), Epoch: [1/3], Step: [901/6668], Train Loss: 2.233931121826172
Time: 286m 3s (- -286m 15s), Epoch: [1/3], Step: [902/6668], Train Loss: 2.233282012939453
Time: 286m 23s (- -287m 55s), Epoch: [1/3], Step: [903/6668], Train Loss: 1.6466780090332032
Time: 286m 43s (- -287m 35s), Epoch: [1/3], Step: [904/6668], Train Loss: 2.116475372314453
Time: 287m 3s (- -287m 16s), Epoch: [1/3], Step: [905/6668], Train Loss: 2.272687530517578
Time: 287m 20s (- -288m 58s), Epoch: [1/3], Step: [906/6668], Train Loss: 1.958985595703125
Time: 287m 40s (- -288m 38s), Epoch: [1/3], Step: [907/6668], Train Loss: 1.6639521789550782
Time: 287m 58s (- -288m 20s), Epoch: [1/3], Step: [908/6668], Train Loss: 1.603085479736328
Time: 288m 17s (- -288m 1s), Epoch: [1/3], Step: [909/6668], Train Loss: 1.8957232666015624
Time: 288m 36s (- -289m 42s), Epoch: [1/3], Step: [910/6668], Train Loss: 1.810

Time: 313m 2s (- -313m 16s), Epoch: [1/3], Step: [989/6668], Train Loss: 1.9140226745605469
Time: 313m 21s (- -314m 57s), Epoch: [1/3], Step: [990/6668], Train Loss: 2.116273193359375
Time: 313m 41s (- -314m 37s), Epoch: [1/3], Step: [991/6668], Train Loss: 2.1886410522460937
Time: 313m 58s (- -314m 20s), Epoch: [1/3], Step: [992/6668], Train Loss: 2.04521728515625
Time: 314m 16s (- -314m 2s), Epoch: [1/3], Step: [993/6668], Train Loss: 1.7612115478515624
Time: 314m 35s (- -315m 43s), Epoch: [1/3], Step: [994/6668], Train Loss: 1.7063851928710938
Time: 314m 53s (- -315m 25s), Epoch: [1/3], Step: [995/6668], Train Loss: 1.6263760375976561
Time: 315m 12s (- -315m 6s), Epoch: [1/3], Step: [996/6668], Train Loss: 1.7301412963867187
Time: 315m 32s (- -316m 46s), Epoch: [1/3], Step: [997/6668], Train Loss: 1.8945343017578125
Time: 315m 50s (- -316m 28s), Epoch: [1/3], Step: [998/6668], Train Loss: 1.56754150390625
Time: 316m 7s (- -316m 11s), Epoch: [1/3], Step: [999/6668], Train Loss: 2.062

Time: 340m 31s (- -341m 47s), Epoch: [1/3], Step: [1078/6668], Train Loss: 2.4685678100585937
Time: 340m 49s (- -341m 29s), Epoch: [1/3], Step: [1079/6668], Train Loss: 1.925594024658203
Time: 341m 6s (- -341m 12s), Epoch: [1/3], Step: [1080/6668], Train Loss: 1.9351699829101563
Time: 341m 24s (- -342m 54s), Epoch: [1/3], Step: [1081/6668], Train Loss: 1.7206259155273438
Time: 341m 41s (- -342m 37s), Epoch: [1/3], Step: [1082/6668], Train Loss: 1.3568260192871093
Time: 342m 0s (- -342m 18s), Epoch: [1/3], Step: [1083/6668], Train Loss: 2.043794860839844
Time: 342m 17s (- -342m 1s), Epoch: [1/3], Step: [1084/6668], Train Loss: 1.8791023254394532
Time: 342m 37s (- -343m 41s), Epoch: [1/3], Step: [1085/6668], Train Loss: 1.7465916442871094
Time: 342m 56s (- -343m 22s), Epoch: [1/3], Step: [1086/6668], Train Loss: 1.6403182983398437
Time: 343m 13s (- -343m 5s), Epoch: [1/3], Step: [1087/6668], Train Loss: 2.3492414855957033
Time: 343m 31s (- -344m 47s), Epoch: [1/3], Step: [1088/6668], Tra

Time: 367m 46s (- -368m 32s), Epoch: [1/3], Step: [1166/6668], Train Loss: 2.1760015869140625
Time: 368m 5s (- -368m 12s), Epoch: [1/3], Step: [1167/6668], Train Loss: 1.7267547607421876
Time: 368m 23s (- -369m 55s), Epoch: [1/3], Step: [1168/6668], Train Loss: 1.9412924194335937
Time: 368m 41s (- -369m 37s), Epoch: [1/3], Step: [1169/6668], Train Loss: 2.18977783203125
Time: 369m 0s (- -369m 18s), Epoch: [1/3], Step: [1170/6668], Train Loss: 1.8739230346679687
Time: 369m 19s (- -370m 58s), Epoch: [1/3], Step: [1171/6668], Train Loss: 2.27038330078125
Time: 369m 37s (- -370m 41s), Epoch: [1/3], Step: [1172/6668], Train Loss: 2.3267500305175783
Time: 369m 55s (- -370m 23s), Epoch: [1/3], Step: [1173/6668], Train Loss: 1.8754090881347656
Time: 370m 12s (- -370m 5s), Epoch: [1/3], Step: [1174/6668], Train Loss: 1.8428948974609376
Time: 370m 30s (- -371m 48s), Epoch: [1/3], Step: [1175/6668], Train Loss: 1.716855010986328
Time: 370m 48s (- -371m 30s), Epoch: [1/3], Step: [1176/6668], Train

Time: 395m 5s (- -395m 13s), Epoch: [1/3], Step: [1254/6668], Train Loss: 1.9827784729003906
Time: 395m 23s (- -396m 55s), Epoch: [1/3], Step: [1255/6668], Train Loss: 1.893043212890625
Time: 395m 41s (- -396m 37s), Epoch: [1/3], Step: [1256/6668], Train Loss: 2.0154571533203125
Time: 396m 1s (- -396m 17s), Epoch: [1/3], Step: [1257/6668], Train Loss: 2.087542724609375
Time: 396m 20s (- -397m 58s), Epoch: [1/3], Step: [1258/6668], Train Loss: 2.2571832275390626
Time: 396m 38s (- -397m 40s), Epoch: [1/3], Step: [1259/6668], Train Loss: 1.8148146057128907
Time: 396m 56s (- -397m 22s), Epoch: [1/3], Step: [1260/6668], Train Loss: 1.8611041259765626
Time: 397m 15s (- -397m 3s), Epoch: [1/3], Step: [1261/6668], Train Loss: 2.089020080566406
Time: 397m 35s (- -398m 43s), Epoch: [1/3], Step: [1262/6668], Train Loss: 1.7881573486328124
Time: 397m 53s (- -398m 25s), Epoch: [1/3], Step: [1263/6668], Train Loss: 1.6455863952636718
Time: 398m 10s (- -398m 8s), Epoch: [1/3], Step: [1264/6668], Trai

Time: 422m 56s (- -423m 22s), Epoch: [1/3], Step: [1342/6668], Train Loss: 2.027724609375


KeyboardInterrupt: 