In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [4]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [5]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [6]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [7]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [8]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [9]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    tie_weight=True
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [1]:
!nvidia-smi

Sun Nov 29 17:43:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 26.6740 | lr 0.00100 | ngrams/sec 39485.8 | eta 0h0m46s
| epoch 1 | step 1000/4071 | loss 12.2975 | lr 0.00100 | ngrams/sec 41010.4 | eta 0h0m38s
| epoch 1 | step 1500/4071 | loss 10.6890 | lr 0.00100 | ngrams/sec 40901.2 | eta 0h0m32s
| epoch 1 | step 2000/4071 | loss 10.7191 | lr 0.00100 | ngrams/sec 40559.6 | eta 0h0m26s
| epoch 1 | step 2500/4071 | loss 11.3566 | lr 0.00100 | ngrams/sec 40402.4 | eta 0h0m19s
| epoch 1 | step 3000/4071 | loss 11.4819 | lr 0.00100 | ngrams/sec 40136.7 | eta 0h0m13s
| epoch 1 | step 3500/4071 | loss 11.4476 | lr 0.00100 | ngrams/sec 39964.6 | eta 0h0m7s
| epoch 1 | step 4000/4071 | loss 11.3487 | lr 0.00100 | ngrams/sec 39676.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1174.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 53.20s | valid loss  7.84 | valid ppl  2536.13
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 11.1717 | lr 0.00100 | ngrams/sec 27224.8 | eta 0h1m7s
| epoch 2 | step 1000/4071 | loss 11.1120 | lr 0.00100 | ngrams/sec 38111.3 | eta 0h0m41s
| epoch 2 | step 1500/4071 | loss 10.9933 | lr 0.00100 | ngrams/sec 37873.2 | eta 0h0m34s
| epoch 2 | step 2000/4071 | loss 10.7577 | lr 0.00100 | ngrams/sec 37740.2 | eta 0h0m28s
| epoch 2 | step 2500/4071 | loss 10.3265 | lr 0.00100 | ngrams/sec 36887.4 | eta 0h0m21s
| epoch 2 | step 3000/4071 | loss 9.8424 | lr 0.00100 | ngrams/sec 37670.3 | eta 0h0m14s
| epoch 2 | step 3500/4071 | loss 9.5802 | lr 0.00100 | ngrams/sec 37921.8 | eta 0h0m7s
| epoch 2 | step 4000/4071 | loss 9.4327 | lr 0.00100 | ngrams/sec 38155.9 | eta 0h0m0

 29%|██▊       | 119/417 [00:00<00:00, 1157.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 56.44s | valid loss  7.62 | valid ppl  2031.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 9.2825 | lr 0.00100 | ngrams/sec 27258.8 | eta 0h1m7s
| epoch 3 | step 1000/4071 | loss 9.2129 | lr 0.00100 | ngrams/sec 39464.9 | eta 0h0m39s
| epoch 3 | step 1500/4071 | loss 9.1771 | lr 0.00100 | ngrams/sec 39535.3 | eta 0h0m33s
| epoch 3 | step 2000/4071 | loss 9.1248 | lr 0.00100 | ngrams/sec 39460.5 | eta 0h0m26s
| epoch 3 | step 2500/4071 | loss 9.1028 | lr 0.00100 | ngrams/sec 39251.3 | eta 0h0m20s
| epoch 3 | step 3000/4071 | loss 9.0674 | lr 0.00100 | ngrams/sec 38633.8 | eta 0h0m14s
| epoch 3 | step 3500/4071 | loss 9.0436 | lr 0.00100 | ngrams/sec 38557.0 | eta 0h0m7s
| epoch 3 | step 4000/4071 | loss 9.0464 | lr 0.00100 | ngrams/sec 38342.8 | eta 0h0m0s


 28%|██▊       | 117/417 [00:00<00:00, 1159.02it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 54.88s | valid loss  7.45 | valid ppl  1719.12
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 8.9531 | lr 0.00100 | ngrams/sec 26769.6 | eta 0h1m8s
| epoch 4 | step 1000/4071 | loss 8.9330 | lr 0.00100 | ngrams/sec 38253.4 | eta 0h0m41s
| epoch 4 | step 1500/4071 | loss 8.9476 | lr 0.00100 | ngrams/sec 38226.3 | eta 0h0m34s
| epoch 4 | step 2000/4071 | loss 8.9277 | lr 0.00100 | ngrams/sec 38470.6 | eta 0h0m27s
| epoch 4 | step 2500/4071 | loss 8.9268 | lr 0.00100 | ngrams/sec 38766.4 | eta 0h0m20s
| epoch 4 | step 3000/4071 | loss 8.8866 | lr 0.00100 | ngrams/sec 38652.0 | eta 0h0m14s
| epoch 4 | step 3500/4071 | loss 8.8915 | lr 0.00100 | ngrams/sec 38686.8 | eta 0h0m7s
| epoch 4 | step 4000/4071 | loss 8.8674 | lr 0.00100 | ngrams/sec 38843.2 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1163.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 55.51s | valid loss  7.37 | valid ppl  1581.34
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 8.8219 | lr 0.00100 | ngrams/sec 27139.5 | eta 0h1m7s
| epoch 5 | step 1000/4071 | loss 8.8352 | lr 0.00100 | ngrams/sec 38609.6 | eta 0h0m40s
| epoch 5 | step 1500/4071 | loss 8.8317 | lr 0.00100 | ngrams/sec 38780.6 | eta 0h0m33s
| epoch 5 | step 2000/4071 | loss 8.8134 | lr 0.00100 | ngrams/sec 38611.5 | eta 0h0m27s
| epoch 5 | step 2500/4071 | loss 8.8002 | lr 0.00100 | ngrams/sec 38659.6 | eta 0h0m20s
| epoch 5 | step 3000/4071 | loss 8.8090 | lr 0.00100 | ngrams/sec 38757.5 | eta 0h0m14s
| epoch 5 | step 3500/4071 | loss 8.8046 | lr 0.00100 | ngrams/sec 38761.9 | eta 0h0m7s
| epoch 5 | step 4000/4071 | loss 8.7929 | lr 0.00100 | ngrams/sec 38610.3 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1158.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 55.28s | valid loss  7.30 | valid ppl  1474.38
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 8.7426 | lr 0.00100 | ngrams/sec 27136.7 | eta 0h1m7s
| epoch 6 | step 1000/4071 | loss 8.7448 | lr 0.00100 | ngrams/sec 38659.9 | eta 0h0m40s
| epoch 6 | step 1500/4071 | loss 8.7533 | lr 0.00100 | ngrams/sec 38760.3 | eta 0h0m33s
| epoch 6 | step 2000/4071 | loss 8.7485 | lr 0.00100 | ngrams/sec 38662.0 | eta 0h0m27s
| epoch 6 | step 2500/4071 | loss 8.7470 | lr 0.00100 | ngrams/sec 38486.5 | eta 0h0m20s
| epoch 6 | step 3000/4071 | loss 8.7488 | lr 0.00100 | ngrams/sec 38682.3 | eta 0h0m14s
| epoch 6 | step 3500/4071 | loss 8.7417 | lr 0.00100 | ngrams/sec 38674.6 | eta 0h0m7s
| epoch 6 | step 4000/4071 | loss 8.7280 | lr 0.00100 | ngrams/sec 38575.5 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1151.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 55.34s | valid loss  7.24 | valid ppl  1396.07
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/4071 | loss 8.6995 | lr 0.00100 | ngrams/sec 27118.6 | eta 0h1m7s
| epoch 7 | step 1000/4071 | loss 8.7128 | lr 0.00100 | ngrams/sec 38603.2 | eta 0h0m40s
| epoch 7 | step 1500/4071 | loss 8.7090 | lr 0.00100 | ngrams/sec 38451.1 | eta 0h0m34s
| epoch 7 | step 2000/4071 | loss 8.6941 | lr 0.00100 | ngrams/sec 38591.3 | eta 0h0m27s
| epoch 7 | step 2500/4071 | loss 8.7109 | lr 0.00100 | ngrams/sec 38615.5 | eta 0h0m20s
| epoch 7 | step 3000/4071 | loss 8.7031 | lr 0.00100 | ngrams/sec 38582.9 | eta 0h0m14s
| epoch 7 | step 3500/4071 | loss 8.7012 | lr 0.00100 | ngrams/sec 38609.0 | eta 0h0m7s
| epoch 7 | step 4000/4071 | loss 8.6907 | lr 0.00100 | ngrams/sec 38506.9 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1175.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 55.43s | valid loss  7.21 | valid ppl  1349.50
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 8.6629 | lr 0.00100 | ngrams/sec 27122.8 | eta 0h1m7s
| epoch 8 | step 1000/4071 | loss 8.6634 | lr 0.00100 | ngrams/sec 38547.7 | eta 0h0m40s
| epoch 8 | step 1500/4071 | loss 8.6688 | lr 0.00100 | ngrams/sec 38494.5 | eta 0h0m34s
| epoch 8 | step 2000/4071 | loss 8.6666 | lr 0.00100 | ngrams/sec 38813.9 | eta 0h0m27s
| epoch 8 | step 2500/4071 | loss 8.6676 | lr 0.00100 | ngrams/sec 38739.6 | eta 0h0m20s
| epoch 8 | step 3000/4071 | loss 8.6747 | lr 0.00100 | ngrams/sec 38893.9 | eta 0h0m14s
| epoch 8 | step 3500/4071 | loss 8.6648 | lr 0.00100 | ngrams/sec 38625.7 | eta 0h0m7s
| epoch 8 | step 4000/4071 | loss 8.6638 | lr 0.00100 | ngrams/sec 38761.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1137.55it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 55.29s | valid loss  7.15 | valid ppl  1280.05
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 8.6431 | lr 0.00100 | ngrams/sec 27048.5 | eta 0h1m7s
| epoch 9 | step 1000/4071 | loss 8.6341 | lr 0.00100 | ngrams/sec 38811.2 | eta 0h0m40s
| epoch 9 | step 1500/4071 | loss 8.6381 | lr 0.00100 | ngrams/sec 38575.6 | eta 0h0m34s
| epoch 9 | step 2000/4071 | loss 8.6451 | lr 0.00100 | ngrams/sec 38632.7 | eta 0h0m27s
| epoch 9 | step 2500/4071 | loss 8.6362 | lr 0.00100 | ngrams/sec 38717.1 | eta 0h0m20s
| epoch 9 | step 3000/4071 | loss 8.6483 | lr 0.00100 | ngrams/sec 38574.4 | eta 0h0m14s
| epoch 9 | step 3500/4071 | loss 8.6407 | lr 0.00100 | ngrams/sec 38569.2 | eta 0h0m7s
| epoch 9 | step 4000/4071 | loss 8.6499 | lr 0.00100 | ngrams/sec 38421.0 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1171.04it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 285.86it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 55.40s | valid loss  7.11 | valid ppl  1225.09
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 8.6108 | lr 0.00100 | ngrams/sec 26990.5 | eta 0h1m7s
| epoch 10 | step 1000/4071 | loss 8.6155 | lr 0.00100 | ngrams/sec 38588.2 | eta 0h0m40s
| epoch 10 | step 1500/4071 | loss 8.6103 | lr 0.00100 | ngrams/sec 38682.2 | eta 0h0m34s
| epoch 10 | step 2000/4071 | loss 8.6174 | lr 0.00100 | ngrams/sec 38628.2 | eta 0h0m27s
| epoch 10 | step 2500/4071 | loss 8.6278 | lr 0.00100 | ngrams/sec 38545.1 | eta 0h0m20s
| epoch 10 | step 3000/4071 | loss 8.6203 | lr 0.00100 | ngrams/sec 38641.0 | eta 0h0m14s
| epoch 10 | step 3500/4071 | loss 8.6203 | lr 0.00100 | ngrams/sec 38680.7 | eta 0h0m7s
| epoch 10 | step 4000/4071 | loss 8.6345 | lr 0.00100 | ngrams/sec 38647.8 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1161.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 286.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 55.38s | valid loss  7.09 | valid ppl  1200.26
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 8.5901 | lr 0.00100 | ngrams/sec 27114.0 | eta 0h1m7s
| epoch 11 | step 1000/4071 | loss 8.6074 | lr 0.00100 | ngrams/sec 38753.7 | eta 0h0m40s
| epoch 11 | step 1500/4071 | loss 8.6177 | lr 0.00100 | ngrams/sec 38527.9 | eta 0h0m34s
| epoch 11 | step 2000/4071 | loss 8.6059 | lr 0.00100 | ngrams/sec 38685.6 | eta 0h0m27s
| epoch 11 | step 2500/4071 | loss 8.6112 | lr 0.00100 | ngrams/sec 38480.7 | eta 0h0m20s
| epoch 11 | step 3000/4071 | loss 8.5844 | lr 0.00100 | ngrams/sec 38551.4 | eta 0h0m14s
| epoch 11 | step 3500/4071 | loss 8.6045 | lr 0.00100 | ngrams/sec 38674.1 | eta 0h0m7s
| epoch 11 | step 4000/4071 | loss 8.6057 | lr 0.00100 | ngrams/sec 38578.8 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1157.08it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 286.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 55.38s | valid loss  7.06 | valid ppl  1160.82
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 8.5763 | lr 0.00100 | ngrams/sec 27137.9 | eta 0h1m7s
| epoch 12 | step 1000/4071 | loss 8.5958 | lr 0.00100 | ngrams/sec 38828.9 | eta 0h0m40s
| epoch 12 | step 1500/4071 | loss 8.5833 | lr 0.00100 | ngrams/sec 38668.5 | eta 0h0m34s
| epoch 12 | step 2000/4071 | loss 8.5765 | lr 0.00100 | ngrams/sec 38670.7 | eta 0h0m27s
| epoch 12 | step 2500/4071 | loss 8.5855 | lr 0.00100 | ngrams/sec 38812.8 | eta 0h0m20s
| epoch 12 | step 3000/4071 | loss 8.5945 | lr 0.00100 | ngrams/sec 38869.0 | eta 0h0m14s
| epoch 12 | step 3500/4071 | loss 8.5929 | lr 0.00100 | ngrams/sec 39000.0 | eta 0h0m7s
| epoch 12 | step 4000/4071 | loss 8.5955 | lr 0.00100 | ngrams/sec 38984.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1163.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 55.09s | valid loss  7.07 | valid ppl  1175.77
-----------------------------------------------------------------------------------------
| epoch 13 | step 500/4071 | loss 8.5588 | lr 0.00100 | ngrams/sec 27353.4 | eta 0h1m6s
| epoch 13 | step 1000/4071 | loss 8.5660 | lr 0.00100 | ngrams/sec 38934.9 | eta 0h0m40s
| epoch 13 | step 1500/4071 | loss 8.5601 | lr 0.00100 | ngrams/sec 39103.8 | eta 0h0m33s
| epoch 13 | step 2000/4071 | loss 8.5804 | lr 0.00100 | ngrams/sec 39059.2 | eta 0h0m27s
| epoch 13 | step 2500/4071 | loss 8.5786 | lr 0.00100 | ngrams/sec 39121.0 | eta 0h0m20s
| epoch 13 | step 3000/4071 | loss 8.5608 | lr 0.00100 | ngrams/sec 39077.8 | eta 0h0m14s
| epoch 13 | step 3500/4071 | loss 8.5689 | lr 0.00100 | ngrams/sec 39006.6 | eta 0h0m7s
| epoch 13 | step 4000/4071 | loss 8.5798 | lr 0.00100 | ngrams/sec 39086.3 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1143.86it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 54.81s | valid loss  7.08 | valid ppl  1191.82
-----------------------------------------------------------------------------------------
| epoch 14 | step 500/4071 | loss 8.5372 | lr 0.00100 | ngrams/sec 27450.9 | eta 0h1m6s
| epoch 14 | step 1000/4071 | loss 8.5536 | lr 0.00100 | ngrams/sec 39158.3 | eta 0h0m40s
| epoch 14 | step 1500/4071 | loss 8.5671 | lr 0.00100 | ngrams/sec 39184.8 | eta 0h0m33s
| epoch 14 | step 2000/4071 | loss 8.5603 | lr 0.00100 | ngrams/sec 39141.7 | eta 0h0m27s
| epoch 14 | step 2500/4071 | loss 8.5542 | lr 0.00100 | ngrams/sec 39374.3 | eta 0h0m20s
| epoch 14 | step 3000/4071 | loss 8.5627 | lr 0.00100 | ngrams/sec 39270.8 | eta 0h0m13s
| epoch 14 | step 3500/4071 | loss 8.5428 | lr 0.00100 | ngrams/sec 39230.7 | eta 0h0m7s
| epoch 14 | step 4000/4071 | loss 8.5518 | lr 0.00100 | ngrams/sec 39103.2 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1155.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 54.61s | valid loss  7.00 | valid ppl  1094.72
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 8.5237 | lr 0.00100 | ngrams/sec 27393.7 | eta 0h1m6s
| epoch 15 | step 1000/4071 | loss 8.5409 | lr 0.00100 | ngrams/sec 39279.7 | eta 0h0m40s
| epoch 15 | step 1500/4071 | loss 8.5360 | lr 0.00100 | ngrams/sec 39192.2 | eta 0h0m33s
| epoch 15 | step 2000/4071 | loss 8.5382 | lr 0.00100 | ngrams/sec 39162.1 | eta 0h0m27s
| epoch 15 | step 2500/4071 | loss 8.5305 | lr 0.00100 | ngrams/sec 39286.9 | eta 0h0m20s
| epoch 15 | step 3000/4071 | loss 8.5479 | lr 0.00100 | ngrams/sec 39225.9 | eta 0h0m13s
| epoch 15 | step 3500/4071 | loss 8.5539 | lr 0.00100 | ngrams/sec 39277.7 | eta 0h0m7s
| epoch 15 | step 4000/4071 | loss 8.5559 | lr 0.00100 | ngrams/sec 39227.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1142.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 54.55s | valid loss  6.98 | valid ppl  1076.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 8.5119 | lr 0.00100 | ngrams/sec 27319.7 | eta 0h1m6s
| epoch 16 | step 1000/4071 | loss 8.5345 | lr 0.00100 | ngrams/sec 39302.7 | eta 0h0m40s
| epoch 16 | step 1500/4071 | loss 8.5301 | lr 0.00100 | ngrams/sec 39379.4 | eta 0h0m33s
| epoch 16 | step 2000/4071 | loss 8.5229 | lr 0.00100 | ngrams/sec 39313.2 | eta 0h0m26s
| epoch 16 | step 2500/4071 | loss 8.5319 | lr 0.00100 | ngrams/sec 39193.5 | eta 0h0m20s
| epoch 16 | step 3000/4071 | loss 8.5345 | lr 0.00100 | ngrams/sec 39311.1 | eta 0h0m13s
| epoch 16 | step 3500/4071 | loss 8.5349 | lr 0.00100 | ngrams/sec 39388.7 | eta 0h0m7s
| epoch 16 | step 4000/4071 | loss 8.5386 | lr 0.00100 | ngrams/sec 39199.6 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1154.63it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 54.51s | valid loss  6.99 | valid ppl  1081.70
-----------------------------------------------------------------------------------------
| epoch 17 | step 500/4071 | loss 8.5001 | lr 0.00100 | ngrams/sec 27452.4 | eta 0h1m6s
| epoch 17 | step 1000/4071 | loss 8.5076 | lr 0.00100 | ngrams/sec 39376.6 | eta 0h0m39s
| epoch 17 | step 1500/4071 | loss 8.5172 | lr 0.00100 | ngrams/sec 39338.4 | eta 0h0m33s
| epoch 17 | step 2000/4071 | loss 8.5226 | lr 0.00100 | ngrams/sec 39231.5 | eta 0h0m27s
| epoch 17 | step 2500/4071 | loss 8.5166 | lr 0.00100 | ngrams/sec 39220.4 | eta 0h0m20s
| epoch 17 | step 3000/4071 | loss 8.5326 | lr 0.00100 | ngrams/sec 39403.2 | eta 0h0m13s
| epoch 17 | step 3500/4071 | loss 8.5127 | lr 0.00100 | ngrams/sec 39231.7 | eta 0h0m7s
| epoch 17 | step 4000/4071 | loss 8.5206 | lr 0.00100 | ngrams/sec 39330.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1146.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 54.50s | valid loss  6.93 | valid ppl  1024.66
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/4071 | loss 8.4924 | lr 0.00100 | ngrams/sec 27339.3 | eta 0h1m6s
| epoch 18 | step 1000/4071 | loss 8.4983 | lr 0.00100 | ngrams/sec 39267.8 | eta 0h0m40s
| epoch 18 | step 1500/4071 | loss 8.5187 | lr 0.00100 | ngrams/sec 39322.7 | eta 0h0m33s
| epoch 18 | step 2000/4071 | loss 8.5111 | lr 0.00100 | ngrams/sec 39295.8 | eta 0h0m26s
| epoch 18 | step 2500/4071 | loss 8.5037 | lr 0.00100 | ngrams/sec 39427.2 | eta 0h0m20s
| epoch 18 | step 3000/4071 | loss 8.5137 | lr 0.00100 | ngrams/sec 39364.1 | eta 0h0m13s
| epoch 18 | step 3500/4071 | loss 8.5167 | lr 0.00100 | ngrams/sec 39449.0 | eta 0h0m7s
| epoch 18 | step 4000/4071 | loss 8.5154 | lr 0.00100 | ngrams/sec 39424.8 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1155.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 54.40s | valid loss  6.92 | valid ppl  1011.60
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/4071 | loss 8.4888 | lr 0.00100 | ngrams/sec 27458.5 | eta 0h1m6s
| epoch 19 | step 1000/4071 | loss 8.4968 | lr 0.00100 | ngrams/sec 39344.0 | eta 0h0m39s
| epoch 19 | step 1500/4071 | loss 8.5131 | lr 0.00100 | ngrams/sec 39365.6 | eta 0h0m33s
| epoch 19 | step 2000/4071 | loss 8.5043 | lr 0.00100 | ngrams/sec 39424.1 | eta 0h0m26s
| epoch 19 | step 2500/4071 | loss 8.4895 | lr 0.00100 | ngrams/sec 39289.7 | eta 0h0m20s
| epoch 19 | step 3000/4071 | loss 8.4971 | lr 0.00100 | ngrams/sec 39422.0 | eta 0h0m13s
| epoch 19 | step 3500/4071 | loss 8.4965 | lr 0.00100 | ngrams/sec 39543.4 | eta 0h0m7s
| epoch 19 | step 4000/4071 | loss 8.5040 | lr 0.00100 | ngrams/sec 39569.7 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1143.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 54.30s | valid loss  6.87 | valid ppl   961.84
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 8.4686 | lr 0.00100 | ngrams/sec 27501.8 | eta 0h1m6s
| epoch 20 | step 1000/4071 | loss 8.4808 | lr 0.00100 | ngrams/sec 39185.5 | eta 0h0m40s
| epoch 20 | step 1500/4071 | loss 8.4847 | lr 0.00100 | ngrams/sec 39516.3 | eta 0h0m33s
| epoch 20 | step 2000/4071 | loss 8.4887 | lr 0.00100 | ngrams/sec 39458.8 | eta 0h0m26s
| epoch 20 | step 2500/4071 | loss 8.5008 | lr 0.00100 | ngrams/sec 39538.2 | eta 0h0m20s
| epoch 20 | step 3000/4071 | loss 8.5187 | lr 0.00100 | ngrams/sec 39508.6 | eta 0h0m13s
| epoch 20 | step 3500/4071 | loss 8.4920 | lr 0.00100 | ngrams/sec 39476.2 | eta 0h0m7s
| epoch 20 | step 4000/4071 | loss 8.5000 | lr 0.00100 | ngrams/sec 39448.5 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1167.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 54.27s | valid loss  6.87 | valid ppl   958.24
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 8.4632 | lr 0.00100 | ngrams/sec 27427.7 | eta 0h1m6s
| epoch 21 | step 1000/4071 | loss 8.4723 | lr 0.00100 | ngrams/sec 39506.5 | eta 0h0m39s
| epoch 21 | step 1500/4071 | loss 8.4888 | lr 0.00100 | ngrams/sec 39447.2 | eta 0h0m33s
| epoch 21 | step 2000/4071 | loss 8.4678 | lr 0.00100 | ngrams/sec 39552.8 | eta 0h0m26s
| epoch 21 | step 2500/4071 | loss 8.4930 | lr 0.00100 | ngrams/sec 39518.4 | eta 0h0m20s
| epoch 21 | step 3000/4071 | loss 8.4925 | lr 0.00100 | ngrams/sec 39495.1 | eta 0h0m13s
| epoch 21 | step 3500/4071 | loss 8.4843 | lr 0.00100 | ngrams/sec 39567.0 | eta 0h0m7s
| epoch 21 | step 4000/4071 | loss 8.4845 | lr 0.00100 | ngrams/sec 39426.1 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1141.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 54.23s | valid loss  6.83 | valid ppl   926.50
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 22 | step 500/4071 | loss 8.4715 | lr 0.00100 | ngrams/sec 27490.2 | eta 0h1m6s
| epoch 22 | step 1000/4071 | loss 8.4634 | lr 0.00100 | ngrams/sec 39446.0 | eta 0h0m39s
| epoch 22 | step 1500/4071 | loss 8.4792 | lr 0.00100 | ngrams/sec 39499.8 | eta 0h0m33s
| epoch 22 | step 2000/4071 | loss 8.4693 | lr 0.00100 | ngrams/sec 39427.2 | eta 0h0m26s
| epoch 22 | step 2500/4071 | loss 8.4896 | lr 0.00100 | ngrams/sec 39404.6 | eta 0h0m20s
| epoch 22 | step 3000/4071 | loss 8.4883 | lr 0.00100 | ngrams/sec 39498.3 | eta 0h0m13s
| epoch 22 | step 3500/4071 | loss 8.4694 | lr 0.00100 | ngrams/sec 39413.6 | eta 0h0m7s
| epoch 22 | step 4000/4071 | loss 8.4942 | lr 0.00100 | ngrams/sec 39371.5 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1139.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 54.29s | valid loss  6.82 | valid ppl   917.79
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 23 | step 500/4071 | loss 8.4607 | lr 0.00100 | ngrams/sec 27476.1 | eta 0h1m6s
| epoch 23 | step 1000/4071 | loss 8.4791 | lr 0.00100 | ngrams/sec 39401.1 | eta 0h0m39s
| epoch 23 | step 1500/4071 | loss 8.4777 | lr 0.00100 | ngrams/sec 39418.0 | eta 0h0m33s
| epoch 23 | step 2000/4071 | loss 8.4681 | lr 0.00100 | ngrams/sec 39456.7 | eta 0h0m26s
| epoch 23 | step 2500/4071 | loss 8.4537 | lr 0.00100 | ngrams/sec 39356.0 | eta 0h0m20s
| epoch 23 | step 3000/4071 | loss 8.4863 | lr 0.00100 | ngrams/sec 39415.4 | eta 0h0m13s
| epoch 23 | step 3500/4071 | loss 8.4812 | lr 0.00100 | ngrams/sec 39529.6 | eta 0h0m7s
| epoch 23 | step 4000/4071 | loss 8.4774 | lr 0.00100 | ngrams/sec 39517.7 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1152.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 54.29s | valid loss  6.79 | valid ppl   892.73
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 24 | step 500/4071 | loss 8.4547 | lr 0.00100 | ngrams/sec 27459.3 | eta 0h1m6s
| epoch 24 | step 1000/4071 | loss 8.4427 | lr 0.00100 | ngrams/sec 39546.6 | eta 0h0m39s
| epoch 24 | step 1500/4071 | loss 8.4575 | lr 0.00100 | ngrams/sec 39585.6 | eta 0h0m33s
| epoch 24 | step 2000/4071 | loss 8.4500 | lr 0.00100 | ngrams/sec 39550.3 | eta 0h0m26s
| epoch 24 | step 2500/4071 | loss 8.4583 | lr 0.00100 | ngrams/sec 39565.3 | eta 0h0m20s
| epoch 24 | step 3000/4071 | loss 8.4692 | lr 0.00100 | ngrams/sec 39562.1 | eta 0h0m13s
| epoch 24 | step 3500/4071 | loss 8.4597 | lr 0.00100 | ngrams/sec 39078.7 | eta 0h0m7s
| epoch 24 | step 4000/4071 | loss 8.4556 | lr 0.00100 | ngrams/sec 39409.2 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1148.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 54.24s | valid loss  6.80 | valid ppl   897.76
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/4071 | loss 8.4320 | lr 0.00100 | ngrams/sec 27701.1 | eta 0h1m6s
| epoch 25 | step 1000/4071 | loss 8.4485 | lr 0.00100 | ngrams/sec 39543.6 | eta 0h0m39s
| epoch 25 | step 1500/4071 | loss 8.4508 | lr 0.00100 | ngrams/sec 39607.5 | eta 0h0m33s
| epoch 25 | step 2000/4071 | loss 8.4452 | lr 0.00100 | ngrams/sec 39580.2 | eta 0h0m26s
| epoch 25 | step 2500/4071 | loss 8.4435 | lr 0.00100 | ngrams/sec 39679.7 | eta 0h0m20s
| epoch 25 | step 3000/4071 | loss 8.4520 | lr 0.00100 | ngrams/sec 39642.2 | eta 0h0m13s
| epoch 25 | step 3500/4071 | loss 8.4480 | lr 0.00100 | ngrams/sec 39624.1 | eta 0h0m7s
| epoch 25 | step 4000/4071 | loss 8.4474 | lr 0.00100 | ngrams/sec 39639.7 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1164.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.95it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 54.06s | valid loss  6.78 | valid ppl   881.12
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/4071 | loss 8.4168 | lr 0.00100 | ngrams/sec 27568.7 | eta 0h1m6s
| epoch 26 | step 1000/4071 | loss 8.4467 | lr 0.00100 | ngrams/sec 39664.4 | eta 0h0m39s
| epoch 26 | step 1500/4071 | loss 8.4357 | lr 0.00100 | ngrams/sec 39713.0 | eta 0h0m33s
| epoch 26 | step 2000/4071 | loss 8.4336 | lr 0.00100 | ngrams/sec 39634.2 | eta 0h0m26s
| epoch 26 | step 2500/4071 | loss 8.4241 | lr 0.00100 | ngrams/sec 39591.0 | eta 0h0m20s
| epoch 26 | step 3000/4071 | loss 8.4258 | lr 0.00100 | ngrams/sec 39656.1 | eta 0h0m13s
| epoch 26 | step 3500/4071 | loss 8.4338 | lr 0.00100 | ngrams/sec 39684.5 | eta 0h0m7s
| epoch 26 | step 4000/4071 | loss 8.4377 | lr 0.00100 | ngrams/sec 39614.5 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1169.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 54.02s | valid loss  6.78 | valid ppl   877.78
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 27 | step 500/4071 | loss 8.4241 | lr 0.00100 | ngrams/sec 27555.5 | eta 0h1m6s
| epoch 27 | step 1000/4071 | loss 8.4301 | lr 0.00100 | ngrams/sec 39637.8 | eta 0h0m39s
| epoch 27 | step 1500/4071 | loss 8.4154 | lr 0.00100 | ngrams/sec 39754.3 | eta 0h0m33s
| epoch 27 | step 2000/4071 | loss 8.4293 | lr 0.00100 | ngrams/sec 39583.8 | eta 0h0m26s
| epoch 27 | step 2500/4071 | loss 8.4101 | lr 0.00100 | ngrams/sec 39716.1 | eta 0h0m20s
| epoch 27 | step 3000/4071 | loss 8.4216 | lr 0.00100 | ngrams/sec 39583.6 | eta 0h0m13s
| epoch 27 | step 3500/4071 | loss 8.4236 | lr 0.00100 | ngrams/sec 39736.6 | eta 0h0m7s
| epoch 27 | step 4000/4071 | loss 8.4313 | lr 0.00100 | ngrams/sec 39726.1 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1171.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 53.99s | valid loss  6.75 | valid ppl   858.30
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 8.3958 | lr 0.00100 | ngrams/sec 27677.0 | eta 0h1m6s
| epoch 28 | step 1000/4071 | loss 8.4136 | lr 0.00100 | ngrams/sec 39732.2 | eta 0h0m39s
| epoch 28 | step 1500/4071 | loss 8.4002 | lr 0.00100 | ngrams/sec 39692.6 | eta 0h0m33s
| epoch 28 | step 2000/4071 | loss 8.4191 | lr 0.00100 | ngrams/sec 39734.6 | eta 0h0m26s
| epoch 28 | step 2500/4071 | loss 8.4171 | lr 0.00100 | ngrams/sec 39692.6 | eta 0h0m20s
| epoch 28 | step 3000/4071 | loss 8.4200 | lr 0.00100 | ngrams/sec 39760.8 | eta 0h0m13s
| epoch 28 | step 3500/4071 | loss 8.4317 | lr 0.00100 | ngrams/sec 39834.5 | eta 0h0m7s
| epoch 28 | step 4000/4071 | loss 8.4153 | lr 0.00100 | ngrams/sec 39754.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1166.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 53.87s | valid loss  6.76 | valid ppl   859.90
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/4071 | loss 8.3822 | lr 0.00100 | ngrams/sec 27845.0 | eta 0h1m5s
| epoch 29 | step 1000/4071 | loss 8.4036 | lr 0.00100 | ngrams/sec 39796.0 | eta 0h0m39s
| epoch 29 | step 1500/4071 | loss 8.4087 | lr 0.00100 | ngrams/sec 39796.6 | eta 0h0m33s
| epoch 29 | step 2000/4071 | loss 8.4007 | lr 0.00100 | ngrams/sec 39690.9 | eta 0h0m26s
| epoch 29 | step 2500/4071 | loss 8.4198 | lr 0.00100 | ngrams/sec 39745.3 | eta 0h0m20s
| epoch 29 | step 3000/4071 | loss 8.4142 | lr 0.00100 | ngrams/sec 39759.5 | eta 0h0m13s
| epoch 29 | step 3500/4071 | loss 8.4256 | lr 0.00100 | ngrams/sec 39840.2 | eta 0h0m7s
| epoch 29 | step 4000/4071 | loss 8.4107 | lr 0.00100 | ngrams/sec 39817.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 53.84s | valid loss  6.76 | valid ppl   859.56
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 8.3838 | lr 0.00100 | ngrams/sec 27848.2 | eta 0h1m5s
| epoch 30 | step 1000/4071 | loss 8.3728 | lr 0.00100 | ngrams/sec 39853.5 | eta 0h0m39s
| epoch 30 | step 1500/4071 | loss 8.4065 | lr 0.00100 | ngrams/sec 39826.8 | eta 0h0m33s
| epoch 30 | step 2000/4071 | loss 8.3862 | lr 0.00100 | ngrams/sec 39902.2 | eta 0h0m26s
| epoch 30 | step 2500/4071 | loss 8.4080 | lr 0.00100 | ngrams/sec 39903.6 | eta 0h0m20s
| epoch 30 | step 3000/4071 | loss 8.3910 | lr 0.00100 | ngrams/sec 39791.0 | eta 0h0m13s
| epoch 30 | step 3500/4071 | loss 8.3929 | lr 0.00100 | ngrams/sec 39793.5 | eta 0h0m7s
| epoch 30 | step 4000/4071 | loss 8.3928 | lr 0.00100 | ngrams/sec 39773.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 53.78s | valid loss  6.76 | valid ppl   863.40
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 8.3562 | lr 0.00100 | ngrams/sec 27829.8 | eta 0h1m5s
| epoch 31 | step 1000/4071 | loss 8.3752 | lr 0.00100 | ngrams/sec 39709.2 | eta 0h0m39s
| epoch 31 | step 1500/4071 | loss 8.3694 | lr 0.00100 | ngrams/sec 39656.0 | eta 0h0m33s
| epoch 31 | step 2000/4071 | loss 8.3795 | lr 0.00100 | ngrams/sec 39668.1 | eta 0h0m26s
| epoch 31 | step 2500/4071 | loss 8.3697 | lr 0.00100 | ngrams/sec 39589.1 | eta 0h0m20s
| epoch 31 | step 3000/4071 | loss 8.3902 | lr 0.00100 | ngrams/sec 39633.9 | eta 0h0m13s
| epoch 31 | step 3500/4071 | loss 8.3737 | lr 0.00100 | ngrams/sec 39598.8 | eta 0h0m7s
| epoch 31 | step 4000/4071 | loss 8.3743 | lr 0.00100 | ngrams/sec 39625.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1170.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 54.02s | valid loss  6.75 | valid ppl   854.90
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 32 | step 500/4071 | loss 8.3450 | lr 0.00100 | ngrams/sec 27526.8 | eta 0h1m6s
| epoch 32 | step 1000/4071 | loss 8.3609 | lr 0.00100 | ngrams/sec 39529.0 | eta 0h0m39s
| epoch 32 | step 1500/4071 | loss 8.3641 | lr 0.00100 | ngrams/sec 39599.4 | eta 0h0m33s
| epoch 32 | step 2000/4071 | loss 8.3534 | lr 0.00100 | ngrams/sec 39531.9 | eta 0h0m26s
| epoch 32 | step 2500/4071 | loss 8.3642 | lr 0.00100 | ngrams/sec 39554.5 | eta 0h0m20s
| epoch 32 | step 3000/4071 | loss 8.3689 | lr 0.00100 | ngrams/sec 39556.2 | eta 0h0m13s
| epoch 32 | step 3500/4071 | loss 8.3615 | lr 0.00100 | ngrams/sec 39663.6 | eta 0h0m7s
| epoch 32 | step 4000/4071 | loss 8.3899 | lr 0.00100 | ngrams/sec 39570.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1151.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 54.11s | valid loss  6.74 | valid ppl   843.69
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 33 | step 500/4071 | loss 8.3586 | lr 0.00100 | ngrams/sec 27558.5 | eta 0h1m6s
| epoch 33 | step 1000/4071 | loss 8.3538 | lr 0.00100 | ngrams/sec 39591.5 | eta 0h0m39s
| epoch 33 | step 1500/4071 | loss 8.3476 | lr 0.00100 | ngrams/sec 39664.5 | eta 0h0m33s
| epoch 33 | step 2000/4071 | loss 8.3764 | lr 0.00100 | ngrams/sec 39643.7 | eta 0h0m26s
| epoch 33 | step 2500/4071 | loss 8.3468 | lr 0.00100 | ngrams/sec 39618.8 | eta 0h0m20s
| epoch 33 | step 3000/4071 | loss 8.3783 | lr 0.00100 | ngrams/sec 39636.4 | eta 0h0m13s
| epoch 33 | step 3500/4071 | loss 8.3515 | lr 0.00100 | ngrams/sec 39716.5 | eta 0h0m7s
| epoch 33 | step 4000/4071 | loss 8.3555 | lr 0.00100 | ngrams/sec 39694.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1167.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.57it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 54.01s | valid loss  6.73 | valid ppl   839.18
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 34 | step 500/4071 | loss 8.3466 | lr 0.00100 | ngrams/sec 27598.3 | eta 0h1m6s
| epoch 34 | step 1000/4071 | loss 8.3379 | lr 0.00100 | ngrams/sec 39673.1 | eta 0h0m39s
| epoch 34 | step 1500/4071 | loss 8.3473 | lr 0.00100 | ngrams/sec 39694.6 | eta 0h0m33s
| epoch 34 | step 2000/4071 | loss 8.3564 | lr 0.00100 | ngrams/sec 39699.9 | eta 0h0m26s
| epoch 34 | step 2500/4071 | loss 8.3539 | lr 0.00100 | ngrams/sec 39733.8 | eta 0h0m20s
| epoch 34 | step 3000/4071 | loss 8.3726 | lr 0.00100 | ngrams/sec 39697.8 | eta 0h0m13s
| epoch 34 | step 3500/4071 | loss 8.3554 | lr 0.00100 | ngrams/sec 39668.2 | eta 0h0m7s
| epoch 34 | step 4000/4071 | loss 8.3518 | lr 0.00100 | ngrams/sec 39689.8 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1177.73it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 53.95s | valid loss  6.72 | valid ppl   825.09
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 35 | step 500/4071 | loss 8.3082 | lr 0.00100 | ngrams/sec 27578.7 | eta 0h1m6s
| epoch 35 | step 1000/4071 | loss 8.3417 | lr 0.00100 | ngrams/sec 39716.3 | eta 0h0m39s
| epoch 35 | step 1500/4071 | loss 8.3319 | lr 0.00100 | ngrams/sec 39662.1 | eta 0h0m33s
| epoch 35 | step 2000/4071 | loss 8.3456 | lr 0.00100 | ngrams/sec 39659.0 | eta 0h0m26s
| epoch 35 | step 2500/4071 | loss 8.3347 | lr 0.00100 | ngrams/sec 39717.0 | eta 0h0m20s
| epoch 35 | step 3000/4071 | loss 8.3485 | lr 0.00100 | ngrams/sec 39690.2 | eta 0h0m13s
| epoch 35 | step 3500/4071 | loss 8.3484 | lr 0.00100 | ngrams/sec 39682.5 | eta 0h0m7s
| epoch 35 | step 4000/4071 | loss 8.3350 | lr 0.00100 | ngrams/sec 39723.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1170.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 53.97s | valid loss  6.71 | valid ppl   822.72
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/4071 | loss 8.3198 | lr 0.00100 | ngrams/sec 27541.8 | eta 0h1m6s
| epoch 36 | step 1000/4071 | loss 8.3205 | lr 0.00100 | ngrams/sec 39653.6 | eta 0h0m39s
| epoch 36 | step 1500/4071 | loss 8.3371 | lr 0.00100 | ngrams/sec 39697.9 | eta 0h0m33s
| epoch 36 | step 2000/4071 | loss 8.3340 | lr 0.00100 | ngrams/sec 39699.1 | eta 0h0m26s
| epoch 36 | step 2500/4071 | loss 8.3382 | lr 0.00100 | ngrams/sec 39697.1 | eta 0h0m20s
| epoch 36 | step 3000/4071 | loss 8.3411 | lr 0.00100 | ngrams/sec 39627.0 | eta 0h0m13s
| epoch 36 | step 3500/4071 | loss 8.3395 | lr 0.00100 | ngrams/sec 39629.8 | eta 0h0m7s
| epoch 36 | step 4000/4071 | loss 8.3446 | lr 0.00100 | ngrams/sec 39630.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1178.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.39it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 54.01s | valid loss  6.70 | valid ppl   814.63
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 37 | step 500/4071 | loss 8.3084 | lr 0.00100 | ngrams/sec 27550.4 | eta 0h1m6s
| epoch 37 | step 1000/4071 | loss 8.3272 | lr 0.00100 | ngrams/sec 39670.8 | eta 0h0m39s
| epoch 37 | step 1500/4071 | loss 8.3272 | lr 0.00100 | ngrams/sec 39720.4 | eta 0h0m33s
| epoch 37 | step 2000/4071 | loss 8.3237 | lr 0.00100 | ngrams/sec 39663.2 | eta 0h0m26s
| epoch 37 | step 2500/4071 | loss 8.3295 | lr 0.00100 | ngrams/sec 39648.7 | eta 0h0m20s
| epoch 37 | step 3000/4071 | loss 8.3295 | lr 0.00100 | ngrams/sec 39642.9 | eta 0h0m13s
| epoch 37 | step 3500/4071 | loss 8.3407 | lr 0.00100 | ngrams/sec 39708.1 | eta 0h0m7s
| epoch 37 | step 4000/4071 | loss 8.3309 | lr 0.00100 | ngrams/sec 39637.5 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1167.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 54.00s | valid loss  6.71 | valid ppl   816.69
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 8.2958 | lr 0.00100 | ngrams/sec 27766.3 | eta 0h1m5s
| epoch 38 | step 1000/4071 | loss 8.3024 | lr 0.00100 | ngrams/sec 39648.3 | eta 0h0m39s
| epoch 38 | step 1500/4071 | loss 8.3233 | lr 0.00100 | ngrams/sec 39662.3 | eta 0h0m33s
| epoch 38 | step 2000/4071 | loss 8.3307 | lr 0.00100 | ngrams/sec 39623.3 | eta 0h0m26s
| epoch 38 | step 2500/4071 | loss 8.3295 | lr 0.00100 | ngrams/sec 39553.4 | eta 0h0m20s
| epoch 38 | step 3000/4071 | loss 8.3278 | lr 0.00100 | ngrams/sec 39610.2 | eta 0h0m13s
| epoch 38 | step 3500/4071 | loss 8.3250 | lr 0.00100 | ngrams/sec 39532.9 | eta 0h0m7s
| epoch 38 | step 4000/4071 | loss 8.3207 | lr 0.00100 | ngrams/sec 39583.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1154.88it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 54.07s | valid loss  6.70 | valid ppl   813.99
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 39 | step 500/4071 | loss 8.3012 | lr 0.00100 | ngrams/sec 27498.5 | eta 0h1m6s
| epoch 39 | step 1000/4071 | loss 8.3050 | lr 0.00100 | ngrams/sec 39592.0 | eta 0h0m39s
| epoch 39 | step 1500/4071 | loss 8.3038 | lr 0.00100 | ngrams/sec 39543.5 | eta 0h0m33s
| epoch 39 | step 2000/4071 | loss 8.3173 | lr 0.00100 | ngrams/sec 39592.3 | eta 0h0m26s
| epoch 39 | step 2500/4071 | loss 8.3121 | lr 0.00100 | ngrams/sec 39594.0 | eta 0h0m20s
| epoch 39 | step 3000/4071 | loss 8.3241 | lr 0.00100 | ngrams/sec 39519.9 | eta 0h0m13s
| epoch 39 | step 3500/4071 | loss 8.3194 | lr 0.00100 | ngrams/sec 39623.9 | eta 0h0m7s
| epoch 39 | step 4000/4071 | loss 8.3166 | lr 0.00100 | ngrams/sec 39483.4 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1177.02it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 54.12s | valid loss  6.69 | valid ppl   806.63
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 40 | step 500/4071 | loss 8.2844 | lr 0.00100 | ngrams/sec 27608.3 | eta 0h1m6s
| epoch 40 | step 1000/4071 | loss 8.2922 | lr 0.00100 | ngrams/sec 39666.1 | eta 0h0m39s
| epoch 40 | step 1500/4071 | loss 8.3244 | lr 0.00100 | ngrams/sec 39655.8 | eta 0h0m33s
| epoch 40 | step 2000/4071 | loss 8.3173 | lr 0.00100 | ngrams/sec 39540.1 | eta 0h0m26s
| epoch 40 | step 2500/4071 | loss 8.3019 | lr 0.00100 | ngrams/sec 39571.4 | eta 0h0m20s
| epoch 40 | step 3000/4071 | loss 8.2988 | lr 0.00100 | ngrams/sec 39626.7 | eta 0h0m13s
| epoch 40 | step 3500/4071 | loss 8.2995 | lr 0.00100 | ngrams/sec 39569.6 | eta 0h0m7s
| epoch 40 | step 4000/4071 | loss 8.3242 | lr 0.00100 | ngrams/sec 39503.3 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1171.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 54.07s | valid loss  6.68 | valid ppl   798.70
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 41 | step 500/4071 | loss 8.2770 | lr 0.00100 | ngrams/sec 27561.2 | eta 0h1m6s
| epoch 41 | step 1000/4071 | loss 8.2936 | lr 0.00100 | ngrams/sec 39585.0 | eta 0h0m39s
| epoch 41 | step 1500/4071 | loss 8.2940 | lr 0.00100 | ngrams/sec 39620.3 | eta 0h0m33s
| epoch 41 | step 2000/4071 | loss 8.2982 | lr 0.00100 | ngrams/sec 39684.2 | eta 0h0m26s
| epoch 41 | step 2500/4071 | loss 8.3070 | lr 0.00100 | ngrams/sec 39700.4 | eta 0h0m20s
| epoch 41 | step 3000/4071 | loss 8.3002 | lr 0.00100 | ngrams/sec 39807.5 | eta 0h0m13s
| epoch 41 | step 3500/4071 | loss 8.3031 | lr 0.00100 | ngrams/sec 39834.9 | eta 0h0m7s
| epoch 41 | step 4000/4071 | loss 8.3121 | lr 0.00100 | ngrams/sec 39894.7 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1154.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 53.90s | valid loss  6.68 | valid ppl   796.98
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 42 | step 500/4071 | loss 8.2851 | lr 0.00100 | ngrams/sec 27729.5 | eta 0h1m5s
| epoch 42 | step 1000/4071 | loss 8.2816 | lr 0.00100 | ngrams/sec 39913.2 | eta 0h0m39s
| epoch 42 | step 1500/4071 | loss 8.2969 | lr 0.00100 | ngrams/sec 39912.3 | eta 0h0m32s
| epoch 42 | step 2000/4071 | loss 8.2964 | lr 0.00100 | ngrams/sec 39892.6 | eta 0h0m26s
| epoch 42 | step 2500/4071 | loss 8.2943 | lr 0.00100 | ngrams/sec 39896.9 | eta 0h0m20s
| epoch 42 | step 3000/4071 | loss 8.3029 | lr 0.00100 | ngrams/sec 39929.3 | eta 0h0m13s
| epoch 42 | step 3500/4071 | loss 8.3008 | lr 0.00100 | ngrams/sec 39839.7 | eta 0h0m7s
| epoch 42 | step 4000/4071 | loss 8.3001 | lr 0.00100 | ngrams/sec 39867.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1184.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 53.69s | valid loss  6.69 | valid ppl   804.12
-----------------------------------------------------------------------------------------
| epoch 43 | step 500/4071 | loss 8.2585 | lr 0.00100 | ngrams/sec 27938.7 | eta 0h1m5s
| epoch 43 | step 1000/4071 | loss 8.2843 | lr 0.00100 | ngrams/sec 39829.0 | eta 0h0m39s
| epoch 43 | step 1500/4071 | loss 8.2881 | lr 0.00100 | ngrams/sec 39871.8 | eta 0h0m33s
| epoch 43 | step 2000/4071 | loss 8.2952 | lr 0.00100 | ngrams/sec 39751.6 | eta 0h0m26s
| epoch 43 | step 2500/4071 | loss 8.2974 | lr 0.00100 | ngrams/sec 39816.6 | eta 0h0m20s
| epoch 43 | step 3000/4071 | loss 8.2921 | lr 0.00100 | ngrams/sec 39868.8 | eta 0h0m13s
| epoch 43 | step 3500/4071 | loss 8.3042 | lr 0.00100 | ngrams/sec 39778.4 | eta 0h0m7s
| epoch 43 | step 4000/4071 | loss 8.2934 | lr 0.00100 | ngrams/sec 39787.2 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1151.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 53.79s | valid loss  6.65 | valid ppl   771.50
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 44 | step 500/4071 | loss 8.2705 | lr 0.00100 | ngrams/sec 27627.9 | eta 0h1m6s
| epoch 44 | step 1000/4071 | loss 8.2676 | lr 0.00100 | ngrams/sec 39724.1 | eta 0h0m39s
| epoch 44 | step 1500/4071 | loss 8.2876 | lr 0.00100 | ngrams/sec 39676.3 | eta 0h0m33s
| epoch 44 | step 2000/4071 | loss 8.2797 | lr 0.00100 | ngrams/sec 39709.4 | eta 0h0m26s
| epoch 44 | step 2500/4071 | loss 8.2902 | lr 0.00100 | ngrams/sec 39592.1 | eta 0h0m20s
| epoch 44 | step 3000/4071 | loss 8.2841 | lr 0.00100 | ngrams/sec 39633.9 | eta 0h0m13s
| epoch 44 | step 3500/4071 | loss 8.2945 | lr 0.00100 | ngrams/sec 39649.0 | eta 0h0m7s
| epoch 44 | step 4000/4071 | loss 8.2937 | lr 0.00100 | ngrams/sec 39648.2 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1157.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 53.99s | valid loss  6.66 | valid ppl   780.44
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 8.2581 | lr 0.00100 | ngrams/sec 27755.3 | eta 0h1m5s
| epoch 45 | step 1000/4071 | loss 8.2799 | lr 0.00100 | ngrams/sec 39691.9 | eta 0h0m39s
| epoch 45 | step 1500/4071 | loss 8.2666 | lr 0.00100 | ngrams/sec 39764.6 | eta 0h0m33s
| epoch 45 | step 2000/4071 | loss 8.2707 | lr 0.00100 | ngrams/sec 39763.8 | eta 0h0m26s
| epoch 45 | step 2500/4071 | loss 8.2742 | lr 0.00100 | ngrams/sec 39740.1 | eta 0h0m20s
| epoch 45 | step 3000/4071 | loss 8.2885 | lr 0.00100 | ngrams/sec 39798.3 | eta 0h0m13s
| epoch 45 | step 3500/4071 | loss 8.2793 | lr 0.00100 | ngrams/sec 39856.5 | eta 0h0m7s
| epoch 45 | step 4000/4071 | loss 8.2840 | lr 0.00100 | ngrams/sec 39908.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 53.84s | valid loss  6.65 | valid ppl   776.61
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 8.2627 | lr 0.00100 | ngrams/sec 27897.7 | eta 0h1m5s
| epoch 46 | step 1000/4071 | loss 8.2544 | lr 0.00100 | ngrams/sec 39873.2 | eta 0h0m39s
| epoch 46 | step 1500/4071 | loss 8.2566 | lr 0.00100 | ngrams/sec 39855.0 | eta 0h0m33s
| epoch 46 | step 2000/4071 | loss 8.2779 | lr 0.00100 | ngrams/sec 39915.4 | eta 0h0m26s
| epoch 46 | step 2500/4071 | loss 8.2743 | lr 0.00100 | ngrams/sec 39892.1 | eta 0h0m20s
| epoch 46 | step 3000/4071 | loss 8.2867 | lr 0.00100 | ngrams/sec 39955.9 | eta 0h0m13s
| epoch 46 | step 3500/4071 | loss 8.2730 | lr 0.00100 | ngrams/sec 40008.7 | eta 0h0m7s
| epoch 46 | step 4000/4071 | loss 8.2817 | lr 0.00100 | ngrams/sec 39597.6 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1140.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 53.74s | valid loss  6.66 | valid ppl   778.69
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 8.2653 | lr 0.00100 | ngrams/sec 27970.1 | eta 0h1m5s
| epoch 47 | step 1000/4071 | loss 8.2466 | lr 0.00100 | ngrams/sec 39979.3 | eta 0h0m39s
| epoch 47 | step 1500/4071 | loss 8.2605 | lr 0.00100 | ngrams/sec 39881.3 | eta 0h0m33s
| epoch 47 | step 2000/4071 | loss 8.2704 | lr 0.00100 | ngrams/sec 39738.5 | eta 0h0m26s
| epoch 47 | step 2500/4071 | loss 8.2749 | lr 0.00100 | ngrams/sec 39694.3 | eta 0h0m20s
| epoch 47 | step 3000/4071 | loss 8.2743 | lr 0.00100 | ngrams/sec 39629.0 | eta 0h0m13s
| epoch 47 | step 3500/4071 | loss 8.2778 | lr 0.00100 | ngrams/sec 39578.2 | eta 0h0m7s
| epoch 47 | step 4000/4071 | loss 8.2739 | lr 0.00100 | ngrams/sec 39539.5 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1163.53it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.95it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 53.87s | valid loss  6.64 | valid ppl   762.51
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 48 | step 500/4071 | loss 8.2460 | lr 0.00100 | ngrams/sec 27533.4 | eta 0h1m6s
| epoch 48 | step 1000/4071 | loss 8.2513 | lr 0.00100 | ngrams/sec 39634.8 | eta 0h0m39s
| epoch 48 | step 1500/4071 | loss 8.2547 | lr 0.00100 | ngrams/sec 39695.0 | eta 0h0m33s
| epoch 48 | step 2000/4071 | loss 8.2666 | lr 0.00100 | ngrams/sec 39653.9 | eta 0h0m26s
| epoch 48 | step 2500/4071 | loss 8.2574 | lr 0.00100 | ngrams/sec 39671.5 | eta 0h0m20s
| epoch 48 | step 3000/4071 | loss 8.2732 | lr 0.00100 | ngrams/sec 39782.9 | eta 0h0m13s
| epoch 48 | step 3500/4071 | loss 8.2722 | lr 0.00100 | ngrams/sec 39819.1 | eta 0h0m7s
| epoch 48 | step 4000/4071 | loss 8.2693 | lr 0.00100 | ngrams/sec 39769.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1152.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 53.93s | valid loss  6.64 | valid ppl   765.52
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 8.2194 | lr 0.00100 | ngrams/sec 27934.3 | eta 0h1m5s
| epoch 49 | step 1000/4071 | loss 8.2514 | lr 0.00100 | ngrams/sec 39803.6 | eta 0h0m39s
| epoch 49 | step 1500/4071 | loss 8.2586 | lr 0.00100 | ngrams/sec 39855.1 | eta 0h0m33s
| epoch 49 | step 2000/4071 | loss 8.2505 | lr 0.00100 | ngrams/sec 39895.5 | eta 0h0m26s
| epoch 49 | step 2500/4071 | loss 8.2669 | lr 0.00100 | ngrams/sec 39795.7 | eta 0h0m20s
| epoch 49 | step 3000/4071 | loss 8.2597 | lr 0.00100 | ngrams/sec 39889.5 | eta 0h0m13s
| epoch 49 | step 3500/4071 | loss 8.2600 | lr 0.00100 | ngrams/sec 39874.4 | eta 0h0m7s
| epoch 49 | step 4000/4071 | loss 8.2592 | lr 0.00100 | ngrams/sec 39937.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 53.70s | valid loss  6.63 | valid ppl   755.98
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 50 | step 500/4071 | loss 8.2492 | lr 0.00100 | ngrams/sec 27766.5 | eta 0h1m5s
| epoch 50 | step 1000/4071 | loss 8.2270 | lr 0.00100 | ngrams/sec 39782.4 | eta 0h0m39s
| epoch 50 | step 1500/4071 | loss 8.2677 | lr 0.00100 | ngrams/sec 39805.7 | eta 0h0m33s
| epoch 50 | step 2000/4071 | loss 8.2494 | lr 0.00100 | ngrams/sec 39762.9 | eta 0h0m26s
| epoch 50 | step 2500/4071 | loss 8.2515 | lr 0.00100 | ngrams/sec 39775.6 | eta 0h0m20s
| epoch 50 | step 3000/4071 | loss 8.2506 | lr 0.00100 | ngrams/sec 39757.1 | eta 0h0m13s
| epoch 50 | step 3500/4071 | loss 8.2565 | lr 0.00100 | ngrams/sec 39733.0 | eta 0h0m7s
| epoch 50 | step 4000/4071 | loss 8.2537 | lr 0.00100 | ngrams/sec 39691.1 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1148.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 53.85s | valid loss  6.67 | valid ppl   790.87
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 8.2395 | lr 0.00100 | ngrams/sec 27761.2 | eta 0h1m5s
| epoch 51 | step 1000/4071 | loss 8.2443 | lr 0.00100 | ngrams/sec 39673.1 | eta 0h0m39s
| epoch 51 | step 1500/4071 | loss 8.2421 | lr 0.00100 | ngrams/sec 39682.6 | eta 0h0m33s
| epoch 51 | step 2000/4071 | loss 8.2596 | lr 0.00100 | ngrams/sec 39652.6 | eta 0h0m26s
| epoch 51 | step 2500/4071 | loss 8.2610 | lr 0.00100 | ngrams/sec 39608.4 | eta 0h0m20s
| epoch 51 | step 3000/4071 | loss 8.2381 | lr 0.00100 | ngrams/sec 39590.1 | eta 0h0m13s
| epoch 51 | step 3500/4071 | loss 8.2411 | lr 0.00100 | ngrams/sec 39633.4 | eta 0h0m7s
| epoch 51 | step 4000/4071 | loss 8.2517 | lr 0.00100 | ngrams/sec 39577.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.59it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 54.04s | valid loss  6.63 | valid ppl   754.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 52 | step 500/4071 | loss 8.2271 | lr 0.00100 | ngrams/sec 27517.3 | eta 0h1m6s
| epoch 52 | step 1000/4071 | loss 8.2298 | lr 0.00100 | ngrams/sec 39734.7 | eta 0h0m39s
| epoch 52 | step 1500/4071 | loss 8.2305 | lr 0.00100 | ngrams/sec 39690.2 | eta 0h0m33s
| epoch 52 | step 2000/4071 | loss 8.2331 | lr 0.00100 | ngrams/sec 39744.7 | eta 0h0m26s
| epoch 52 | step 2500/4071 | loss 8.2351 | lr 0.00100 | ngrams/sec 39766.7 | eta 0h0m20s
| epoch 52 | step 3000/4071 | loss 8.2422 | lr 0.00100 | ngrams/sec 39785.6 | eta 0h0m13s
| epoch 52 | step 3500/4071 | loss 8.2667 | lr 0.00100 | ngrams/sec 39721.4 | eta 0h0m7s
| epoch 52 | step 4000/4071 | loss 8.2469 | lr 0.00100 | ngrams/sec 39746.1 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1178.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 53.89s | valid loss  6.65 | valid ppl   772.40
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 8.2077 | lr 0.00100 | ngrams/sec 27915.2 | eta 0h1m5s
| epoch 53 | step 1000/4071 | loss 8.2170 | lr 0.00100 | ngrams/sec 39835.5 | eta 0h0m39s
| epoch 53 | step 1500/4071 | loss 8.2403 | lr 0.00100 | ngrams/sec 39862.3 | eta 0h0m33s
| epoch 53 | step 2000/4071 | loss 8.2450 | lr 0.00100 | ngrams/sec 39963.3 | eta 0h0m26s
| epoch 53 | step 2500/4071 | loss 8.2531 | lr 0.00100 | ngrams/sec 39914.8 | eta 0h0m20s
| epoch 53 | step 3000/4071 | loss 8.2494 | lr 0.00100 | ngrams/sec 39943.0 | eta 0h0m13s
| epoch 53 | step 3500/4071 | loss 8.2328 | lr 0.00100 | ngrams/sec 39919.8 | eta 0h0m7s
| epoch 53 | step 4000/4071 | loss 8.2380 | lr 0.00100 | ngrams/sec 39952.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 53.66s | valid loss  6.62 | valid ppl   747.07
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 54 | step 500/4071 | loss 8.2189 | lr 0.00100 | ngrams/sec 27755.9 | eta 0h1m5s
| epoch 54 | step 1000/4071 | loss 8.2101 | lr 0.00100 | ngrams/sec 39887.0 | eta 0h0m39s
| epoch 54 | step 1500/4071 | loss 8.2323 | lr 0.00100 | ngrams/sec 39937.3 | eta 0h0m32s
| epoch 54 | step 2000/4071 | loss 8.2409 | lr 0.00100 | ngrams/sec 39953.6 | eta 0h0m26s
| epoch 54 | step 2500/4071 | loss 8.2361 | lr 0.00100 | ngrams/sec 39888.7 | eta 0h0m20s
| epoch 54 | step 3000/4071 | loss 8.2313 | lr 0.00100 | ngrams/sec 39900.7 | eta 0h0m13s
| epoch 54 | step 3500/4071 | loss 8.2318 | lr 0.00100 | ngrams/sec 39896.8 | eta 0h0m7s
| epoch 54 | step 4000/4071 | loss 8.2327 | lr 0.00100 | ngrams/sec 39791.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1162.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 53.68s | valid loss  6.62 | valid ppl   753.35
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 8.2134 | lr 0.00100 | ngrams/sec 27881.7 | eta 0h1m5s
| epoch 55 | step 1000/4071 | loss 8.2217 | lr 0.00100 | ngrams/sec 39889.9 | eta 0h0m39s
| epoch 55 | step 1500/4071 | loss 8.2292 | lr 0.00100 | ngrams/sec 39853.5 | eta 0h0m33s
| epoch 55 | step 2000/4071 | loss 8.2106 | lr 0.00100 | ngrams/sec 39855.4 | eta 0h0m26s
| epoch 55 | step 2500/4071 | loss 8.2306 | lr 0.00100 | ngrams/sec 39865.4 | eta 0h0m20s
| epoch 55 | step 3000/4071 | loss 8.2262 | lr 0.00100 | ngrams/sec 39848.4 | eta 0h0m13s
| epoch 55 | step 3500/4071 | loss 8.2417 | lr 0.00100 | ngrams/sec 39750.7 | eta 0h0m7s
| epoch 55 | step 4000/4071 | loss 8.2173 | lr 0.00100 | ngrams/sec 39814.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1156.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 53.75s | valid loss  6.63 | valid ppl   759.81
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 8.2035 | lr 0.00100 | ngrams/sec 27895.0 | eta 0h1m5s
| epoch 56 | step 1000/4071 | loss 8.2088 | lr 0.00100 | ngrams/sec 39857.9 | eta 0h0m39s
| epoch 56 | step 1500/4071 | loss 8.2229 | lr 0.00100 | ngrams/sec 39713.7 | eta 0h0m33s
| epoch 56 | step 2000/4071 | loss 8.2186 | lr 0.00100 | ngrams/sec 39839.1 | eta 0h0m26s
| epoch 56 | step 2500/4071 | loss 8.2225 | lr 0.00100 | ngrams/sec 39854.8 | eta 0h0m20s
| epoch 56 | step 3000/4071 | loss 8.2226 | lr 0.00100 | ngrams/sec 39818.6 | eta 0h0m13s
| epoch 56 | step 3500/4071 | loss 8.2358 | lr 0.00100 | ngrams/sec 39831.4 | eta 0h0m7s
| epoch 56 | step 4000/4071 | loss 8.2258 | lr 0.00100 | ngrams/sec 39767.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 53.79s | valid loss  6.65 | valid ppl   770.64
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 8.1991 | lr 0.00100 | ngrams/sec 27877.9 | eta 0h1m5s
| epoch 57 | step 1000/4071 | loss 8.2081 | lr 0.00100 | ngrams/sec 39851.2 | eta 0h0m39s
| epoch 57 | step 1500/4071 | loss 8.2183 | lr 0.00100 | ngrams/sec 39805.8 | eta 0h0m33s
| epoch 57 | step 2000/4071 | loss 8.2271 | lr 0.00100 | ngrams/sec 39853.9 | eta 0h0m26s
| epoch 57 | step 2500/4071 | loss 8.2276 | lr 0.00100 | ngrams/sec 39851.6 | eta 0h0m20s
| epoch 57 | step 3000/4071 | loss 8.2202 | lr 0.00100 | ngrams/sec 39806.7 | eta 0h0m13s
| epoch 57 | step 3500/4071 | loss 8.2179 | lr 0.00100 | ngrams/sec 39838.2 | eta 0h0m7s
| epoch 57 | step 4000/4071 | loss 8.2264 | lr 0.00100 | ngrams/sec 39842.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 53.76s | valid loss  6.61 | valid ppl   745.71
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 58 | step 500/4071 | loss 8.2065 | lr 0.00100 | ngrams/sec 27667.7 | eta 0h1m6s
| epoch 58 | step 1000/4071 | loss 8.2181 | lr 0.00100 | ngrams/sec 39851.6 | eta 0h0m39s
| epoch 58 | step 1500/4071 | loss 8.2133 | lr 0.00100 | ngrams/sec 39842.0 | eta 0h0m33s
| epoch 58 | step 2000/4071 | loss 8.2113 | lr 0.00100 | ngrams/sec 39859.7 | eta 0h0m26s
| epoch 58 | step 2500/4071 | loss 8.2322 | lr 0.00100 | ngrams/sec 39835.9 | eta 0h0m20s
| epoch 58 | step 3000/4071 | loss 8.2089 | lr 0.00100 | ngrams/sec 39657.9 | eta 0h0m13s
| epoch 58 | step 3500/4071 | loss 8.2117 | lr 0.00100 | ngrams/sec 39875.7 | eta 0h0m7s
| epoch 58 | step 4000/4071 | loss 8.2132 | lr 0.00100 | ngrams/sec 39831.2 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1162.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 53.79s | valid loss  6.60 | valid ppl   733.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 59 | step 500/4071 | loss 8.1998 | lr 0.00100 | ngrams/sec 27665.3 | eta 0h1m6s
| epoch 59 | step 1000/4071 | loss 8.1958 | lr 0.00100 | ngrams/sec 39811.9 | eta 0h0m39s
| epoch 59 | step 1500/4071 | loss 8.2096 | lr 0.00100 | ngrams/sec 39795.8 | eta 0h0m33s
| epoch 59 | step 2000/4071 | loss 8.2055 | lr 0.00100 | ngrams/sec 39827.6 | eta 0h0m26s
| epoch 59 | step 2500/4071 | loss 8.2032 | lr 0.00100 | ngrams/sec 39760.1 | eta 0h0m20s
| epoch 59 | step 3000/4071 | loss 8.2111 | lr 0.00100 | ngrams/sec 39802.6 | eta 0h0m13s
| epoch 59 | step 3500/4071 | loss 8.2090 | lr 0.00100 | ngrams/sec 39783.6 | eta 0h0m7s
| epoch 59 | step 4000/4071 | loss 8.2190 | lr 0.00100 | ngrams/sec 39864.6 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1166.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 53.80s | valid loss  6.57 | valid ppl   715.02
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 60 | step 500/4071 | loss 8.1955 | lr 0.00100 | ngrams/sec 27666.1 | eta 0h1m6s
| epoch 60 | step 1000/4071 | loss 8.1952 | lr 0.00100 | ngrams/sec 39794.6 | eta 0h0m39s
| epoch 60 | step 1500/4071 | loss 8.2100 | lr 0.00100 | ngrams/sec 39853.3 | eta 0h0m33s
| epoch 60 | step 2000/4071 | loss 8.2120 | lr 0.00100 | ngrams/sec 39821.4 | eta 0h0m26s
| epoch 60 | step 2500/4071 | loss 8.2040 | lr 0.00100 | ngrams/sec 39837.3 | eta 0h0m20s
| epoch 60 | step 3000/4071 | loss 8.2110 | lr 0.00100 | ngrams/sec 39835.3 | eta 0h0m13s
| epoch 60 | step 3500/4071 | loss 8.2105 | lr 0.00100 | ngrams/sec 39805.9 | eta 0h0m7s
| epoch 60 | step 4000/4071 | loss 8.1995 | lr 0.00100 | ngrams/sec 39782.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1144.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 53.80s | valid loss  6.59 | valid ppl   729.56
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 8.1806 | lr 0.00100 | ngrams/sec 27870.8 | eta 0h1m5s
| epoch 61 | step 1000/4071 | loss 8.1819 | lr 0.00100 | ngrams/sec 39865.5 | eta 0h0m39s
| epoch 61 | step 1500/4071 | loss 8.1988 | lr 0.00100 | ngrams/sec 39847.0 | eta 0h0m33s
| epoch 61 | step 2000/4071 | loss 8.1979 | lr 0.00100 | ngrams/sec 39824.2 | eta 0h0m26s
| epoch 61 | step 2500/4071 | loss 8.1993 | lr 0.00100 | ngrams/sec 39817.9 | eta 0h0m20s
| epoch 61 | step 3000/4071 | loss 8.2087 | lr 0.00100 | ngrams/sec 39778.9 | eta 0h0m13s
| epoch 61 | step 3500/4071 | loss 8.2139 | lr 0.00100 | ngrams/sec 39824.9 | eta 0h0m7s
| epoch 61 | step 4000/4071 | loss 8.2031 | lr 0.00100 | ngrams/sec 39761.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1187.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 53.79s | valid loss  6.60 | valid ppl   736.66
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 8.1746 | lr 0.00100 | ngrams/sec 27782.7 | eta 0h1m5s
| epoch 62 | step 1000/4071 | loss 8.1857 | lr 0.00100 | ngrams/sec 39770.5 | eta 0h0m39s
| epoch 62 | step 1500/4071 | loss 8.1919 | lr 0.00100 | ngrams/sec 39795.7 | eta 0h0m33s
| epoch 62 | step 2000/4071 | loss 8.1918 | lr 0.00100 | ngrams/sec 39769.7 | eta 0h0m26s
| epoch 62 | step 2500/4071 | loss 8.1995 | lr 0.00100 | ngrams/sec 39725.1 | eta 0h0m20s
| epoch 62 | step 3000/4071 | loss 8.2186 | lr 0.00100 | ngrams/sec 39614.3 | eta 0h0m13s
| epoch 62 | step 3500/4071 | loss 8.1999 | lr 0.00100 | ngrams/sec 39571.5 | eta 0h0m7s
| epoch 62 | step 4000/4071 | loss 8.1908 | lr 0.00100 | ngrams/sec 39591.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 53.97s | valid loss  6.59 | valid ppl   729.18
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/4071 | loss 8.1642 | lr 0.00100 | ngrams/sec 27774.9 | eta 0h1m5s
| epoch 63 | step 1000/4071 | loss 8.1961 | lr 0.00100 | ngrams/sec 39686.3 | eta 0h0m39s
| epoch 63 | step 1500/4071 | loss 8.1864 | lr 0.00100 | ngrams/sec 39805.1 | eta 0h0m33s
| epoch 63 | step 2000/4071 | loss 8.1833 | lr 0.00100 | ngrams/sec 39843.9 | eta 0h0m26s
| epoch 63 | step 2500/4071 | loss 8.1862 | lr 0.00100 | ngrams/sec 39911.1 | eta 0h0m20s
| epoch 63 | step 3000/4071 | loss 8.1914 | lr 0.00100 | ngrams/sec 39899.5 | eta 0h0m13s
| epoch 63 | step 3500/4071 | loss 8.1917 | lr 0.00100 | ngrams/sec 39959.7 | eta 0h0m7s
| epoch 63 | step 4000/4071 | loss 8.1940 | lr 0.00100 | ngrams/sec 39955.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1176.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 53.74s | valid loss  6.60 | valid ppl   732.64
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 8.1626 | lr 0.00100 | ngrams/sec 27934.7 | eta 0h1m5s
| epoch 64 | step 1000/4071 | loss 8.1874 | lr 0.00100 | ngrams/sec 39842.4 | eta 0h0m39s
| epoch 64 | step 1500/4071 | loss 8.1753 | lr 0.00100 | ngrams/sec 39833.2 | eta 0h0m33s
| epoch 64 | step 2000/4071 | loss 8.1806 | lr 0.00100 | ngrams/sec 39806.8 | eta 0h0m26s
| epoch 64 | step 2500/4071 | loss 8.1866 | lr 0.00100 | ngrams/sec 39713.8 | eta 0h0m20s
| epoch 64 | step 3000/4071 | loss 8.1854 | lr 0.00100 | ngrams/sec 39668.8 | eta 0h0m13s
| epoch 64 | step 3500/4071 | loss 8.1887 | lr 0.00100 | ngrams/sec 39696.5 | eta 0h0m7s
| epoch 64 | step 4000/4071 | loss 8.2084 | lr 0.00100 | ngrams/sec 39685.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 53.86s | valid loss  6.59 | valid ppl   728.28
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/4071 | loss 8.1710 | lr 0.00100 | ngrams/sec 27804.4 | eta 0h1m5s
| epoch 65 | step 1000/4071 | loss 8.1813 | lr 0.00100 | ngrams/sec 39687.9 | eta 0h0m39s
| epoch 65 | step 1500/4071 | loss 8.1840 | lr 0.00100 | ngrams/sec 39650.7 | eta 0h0m33s
| epoch 65 | step 2000/4071 | loss 8.1884 | lr 0.00100 | ngrams/sec 39637.9 | eta 0h0m26s
| epoch 65 | step 2500/4071 | loss 8.1733 | lr 0.00100 | ngrams/sec 39623.3 | eta 0h0m20s
| epoch 65 | step 3000/4071 | loss 8.1864 | lr 0.00100 | ngrams/sec 39612.1 | eta 0h0m13s
| epoch 65 | step 3500/4071 | loss 8.1675 | lr 0.00100 | ngrams/sec 39672.9 | eta 0h0m7s
| epoch 65 | step 4000/4071 | loss 8.1920 | lr 0.00100 | ngrams/sec 39656.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 53.99s | valid loss  6.58 | valid ppl   718.68
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 8.1651 | lr 0.00100 | ngrams/sec 27853.6 | eta 0h1m5s
| epoch 66 | step 1000/4071 | loss 8.1751 | lr 0.00100 | ngrams/sec 39700.4 | eta 0h0m39s
| epoch 66 | step 1500/4071 | loss 8.1683 | lr 0.00100 | ngrams/sec 39694.0 | eta 0h0m33s
| epoch 66 | step 2000/4071 | loss 8.1737 | lr 0.00100 | ngrams/sec 39654.5 | eta 0h0m26s
| epoch 66 | step 2500/4071 | loss 8.1866 | lr 0.00100 | ngrams/sec 39663.4 | eta 0h0m20s
| epoch 66 | step 3000/4071 | loss 8.1822 | lr 0.00100 | ngrams/sec 39654.9 | eta 0h0m13s
| epoch 66 | step 3500/4071 | loss 8.1883 | lr 0.00100 | ngrams/sec 39624.0 | eta 0h0m7s
| epoch 66 | step 4000/4071 | loss 8.1841 | lr 0.00100 | ngrams/sec 39712.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 53.96s | valid loss  6.57 | valid ppl   711.20
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 67 | step 500/4071 | loss 8.1687 | lr 0.00100 | ngrams/sec 27559.8 | eta 0h1m6s
| epoch 67 | step 1000/4071 | loss 8.1586 | lr 0.00100 | ngrams/sec 39682.2 | eta 0h0m39s
| epoch 67 | step 1500/4071 | loss 8.1691 | lr 0.00100 | ngrams/sec 39649.9 | eta 0h0m33s
| epoch 67 | step 2000/4071 | loss 8.1725 | lr 0.00100 | ngrams/sec 39701.1 | eta 0h0m26s
| epoch 67 | step 2500/4071 | loss 8.1846 | lr 0.00100 | ngrams/sec 39713.2 | eta 0h0m20s
| epoch 67 | step 3000/4071 | loss 8.1838 | lr 0.00100 | ngrams/sec 39638.9 | eta 0h0m13s
| epoch 67 | step 3500/4071 | loss 8.1815 | lr 0.00100 | ngrams/sec 39655.8 | eta 0h0m7s
| epoch 67 | step 4000/4071 | loss 8.1803 | lr 0.00100 | ngrams/sec 39679.4 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1189.71it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 53.99s | valid loss  6.54 | valid ppl   695.41
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 68 | step 500/4071 | loss 8.1613 | lr 0.00100 | ngrams/sec 27585.0 | eta 0h1m6s
| epoch 68 | step 1000/4071 | loss 8.1536 | lr 0.00100 | ngrams/sec 39578.6 | eta 0h0m39s
| epoch 68 | step 1500/4071 | loss 8.1688 | lr 0.00100 | ngrams/sec 39551.2 | eta 0h0m33s
| epoch 68 | step 2000/4071 | loss 8.1866 | lr 0.00100 | ngrams/sec 39533.0 | eta 0h0m26s
| epoch 68 | step 2500/4071 | loss 8.1874 | lr 0.00100 | ngrams/sec 39646.8 | eta 0h0m20s
| epoch 68 | step 3000/4071 | loss 8.1735 | lr 0.00100 | ngrams/sec 39640.3 | eta 0h0m13s
| epoch 68 | step 3500/4071 | loss 8.1796 | lr 0.00100 | ngrams/sec 39623.4 | eta 0h0m7s
| epoch 68 | step 4000/4071 | loss 8.1662 | lr 0.00100 | ngrams/sec 39658.4 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1181.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 54.07s | valid loss  6.55 | valid ppl   702.00
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 8.1384 | lr 0.00100 | ngrams/sec 27710.0 | eta 0h1m5s
| epoch 69 | step 1000/4071 | loss 8.1493 | lr 0.00100 | ngrams/sec 38833.5 | eta 0h0m40s
| epoch 69 | step 1500/4071 | loss 8.1613 | lr 0.00100 | ngrams/sec 39490.3 | eta 0h0m33s
| epoch 69 | step 2000/4071 | loss 8.1658 | lr 0.00100 | ngrams/sec 39663.6 | eta 0h0m26s
| epoch 69 | step 2500/4071 | loss 8.1627 | lr 0.00100 | ngrams/sec 39706.4 | eta 0h0m20s
| epoch 69 | step 3000/4071 | loss 8.1790 | lr 0.00100 | ngrams/sec 39777.0 | eta 0h0m13s
| epoch 69 | step 3500/4071 | loss 8.1677 | lr 0.00100 | ngrams/sec 39709.2 | eta 0h0m7s
| epoch 69 | step 4000/4071 | loss 8.1719 | lr 0.00100 | ngrams/sec 39698.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1146.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 54.12s | valid loss  6.56 | valid ppl   705.22
-----------------------------------------------------------------------------------------
| epoch 70 | step 500/4071 | loss 8.1362 | lr 0.00100 | ngrams/sec 27809.9 | eta 0h1m5s
| epoch 70 | step 1000/4071 | loss 8.1633 | lr 0.00100 | ngrams/sec 39752.0 | eta 0h0m39s
| epoch 70 | step 1500/4071 | loss 8.1621 | lr 0.00100 | ngrams/sec 39750.7 | eta 0h0m33s
| epoch 70 | step 2000/4071 | loss 8.1655 | lr 0.00100 | ngrams/sec 39676.4 | eta 0h0m26s
| epoch 70 | step 2500/4071 | loss 8.1640 | lr 0.00100 | ngrams/sec 39715.7 | eta 0h0m20s
| epoch 70 | step 3000/4071 | loss 8.1569 | lr 0.00100 | ngrams/sec 39748.6 | eta 0h0m13s
| epoch 70 | step 3500/4071 | loss 8.1737 | lr 0.00100 | ngrams/sec 39701.0 | eta 0h0m7s
| epoch 70 | step 4000/4071 | loss 8.1665 | lr 0.00100 | ngrams/sec 39743.0 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1172.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 53.90s | valid loss  6.58 | valid ppl   723.51
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 8.1334 | lr 0.00100 | ngrams/sec 27844.5 | eta 0h1m5s
| epoch 71 | step 1000/4071 | loss 8.1496 | lr 0.00100 | ngrams/sec 39674.5 | eta 0h0m39s
| epoch 71 | step 1500/4071 | loss 8.1512 | lr 0.00100 | ngrams/sec 39784.4 | eta 0h0m33s
| epoch 71 | step 2000/4071 | loss 8.1585 | lr 0.00100 | ngrams/sec 39764.5 | eta 0h0m26s
| epoch 71 | step 2500/4071 | loss 8.1481 | lr 0.00100 | ngrams/sec 39759.6 | eta 0h0m20s
| epoch 71 | step 3000/4071 | loss 8.1781 | lr 0.00100 | ngrams/sec 39769.3 | eta 0h0m13s
| epoch 71 | step 3500/4071 | loss 8.1491 | lr 0.00100 | ngrams/sec 39743.8 | eta 0h0m7s
| epoch 71 | step 4000/4071 | loss 8.1602 | lr 0.00100 | ngrams/sec 39764.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1142.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 53.87s | valid loss  6.54 | valid ppl   691.49
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 72 | step 500/4071 | loss 8.1364 | lr 0.00100 | ngrams/sec 27612.0 | eta 0h1m6s
| epoch 72 | step 1000/4071 | loss 8.1386 | lr 0.00100 | ngrams/sec 39788.7 | eta 0h0m39s
| epoch 72 | step 1500/4071 | loss 8.1413 | lr 0.00100 | ngrams/sec 39759.3 | eta 0h0m33s
| epoch 72 | step 2000/4071 | loss 8.1422 | lr 0.00100 | ngrams/sec 39743.9 | eta 0h0m26s
| epoch 72 | step 2500/4071 | loss 8.1373 | lr 0.00100 | ngrams/sec 39750.8 | eta 0h0m20s
| epoch 72 | step 3000/4071 | loss 8.1817 | lr 0.00100 | ngrams/sec 39761.2 | eta 0h0m13s
| epoch 72 | step 3500/4071 | loss 8.1754 | lr 0.00100 | ngrams/sec 39711.7 | eta 0h0m7s
| epoch 72 | step 4000/4071 | loss 8.1663 | lr 0.00100 | ngrams/sec 39798.6 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1162.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 53.87s | valid loss  6.57 | valid ppl   713.73
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 8.1313 | lr 0.00100 | ngrams/sec 27857.8 | eta 0h1m5s
| epoch 73 | step 1000/4071 | loss 8.1438 | lr 0.00100 | ngrams/sec 39762.6 | eta 0h0m39s
| epoch 73 | step 1500/4071 | loss 8.1354 | lr 0.00100 | ngrams/sec 39723.0 | eta 0h0m33s
| epoch 73 | step 2000/4071 | loss 8.1543 | lr 0.00100 | ngrams/sec 39788.1 | eta 0h0m26s
| epoch 73 | step 2500/4071 | loss 8.1422 | lr 0.00100 | ngrams/sec 39765.0 | eta 0h0m20s
| epoch 73 | step 3000/4071 | loss 8.1535 | lr 0.00100 | ngrams/sec 39782.7 | eta 0h0m13s
| epoch 73 | step 3500/4071 | loss 8.1569 | lr 0.00100 | ngrams/sec 39802.4 | eta 0h0m7s
| epoch 73 | step 4000/4071 | loss 8.1465 | lr 0.00100 | ngrams/sec 39779.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1189.26it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 53.83s | valid loss  6.58 | valid ppl   719.80
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 8.1234 | lr 0.00100 | ngrams/sec 27884.3 | eta 0h1m5s
| epoch 74 | step 1000/4071 | loss 8.1371 | lr 0.00100 | ngrams/sec 39837.6 | eta 0h0m39s
| epoch 74 | step 1500/4071 | loss 8.1369 | lr 0.00100 | ngrams/sec 39788.4 | eta 0h0m33s
| epoch 74 | step 2000/4071 | loss 8.1575 | lr 0.00100 | ngrams/sec 39842.3 | eta 0h0m26s
| epoch 74 | step 2500/4071 | loss 8.1497 | lr 0.00100 | ngrams/sec 39848.4 | eta 0h0m20s
| epoch 74 | step 3000/4071 | loss 8.1472 | lr 0.00100 | ngrams/sec 39845.4 | eta 0h0m13s
| epoch 74 | step 3500/4071 | loss 8.1584 | lr 0.00100 | ngrams/sec 39858.7 | eta 0h0m7s
| epoch 74 | step 4000/4071 | loss 8.1568 | lr 0.00100 | ngrams/sec 39818.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.73it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 53.77s | valid loss  6.54 | valid ppl   691.68
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 8.1175 | lr 0.00100 | ngrams/sec 27920.4 | eta 0h1m5s
| epoch 75 | step 1000/4071 | loss 8.1419 | lr 0.00100 | ngrams/sec 39900.6 | eta 0h0m39s
| epoch 75 | step 1500/4071 | loss 8.1434 | lr 0.00100 | ngrams/sec 39879.9 | eta 0h0m33s
| epoch 75 | step 2000/4071 | loss 8.1323 | lr 0.00100 | ngrams/sec 39823.2 | eta 0h0m26s
| epoch 75 | step 2500/4071 | loss 8.1522 | lr 0.00100 | ngrams/sec 39882.9 | eta 0h0m20s
| epoch 75 | step 3000/4071 | loss 8.1531 | lr 0.00100 | ngrams/sec 39837.5 | eta 0h0m13s
| epoch 75 | step 3500/4071 | loss 8.1630 | lr 0.00100 | ngrams/sec 39870.1 | eta 0h0m7s
| epoch 75 | step 4000/4071 | loss 8.1534 | lr 0.00100 | ngrams/sec 39925.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1171.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 53.70s | valid loss  6.52 | valid ppl   681.80
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 76 | step 500/4071 | loss 8.1170 | lr 0.00100 | ngrams/sec 27712.5 | eta 0h1m5s
| epoch 76 | step 1000/4071 | loss 8.1467 | lr 0.00100 | ngrams/sec 39833.4 | eta 0h0m39s
| epoch 76 | step 1500/4071 | loss 8.1431 | lr 0.00100 | ngrams/sec 39974.5 | eta 0h0m32s
| epoch 76 | step 2000/4071 | loss 8.1327 | lr 0.00100 | ngrams/sec 39942.7 | eta 0h0m26s
| epoch 76 | step 2500/4071 | loss 8.1455 | lr 0.00100 | ngrams/sec 39929.1 | eta 0h0m20s
| epoch 76 | step 3000/4071 | loss 8.1478 | lr 0.00100 | ngrams/sec 39971.9 | eta 0h0m13s
| epoch 76 | step 3500/4071 | loss 8.1574 | lr 0.00100 | ngrams/sec 39947.8 | eta 0h0m7s
| epoch 76 | step 4000/4071 | loss 8.1295 | lr 0.00100 | ngrams/sec 39937.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1170.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 53.65s | valid loss  6.59 | valid ppl   726.21
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 8.1217 | lr 0.00100 | ngrams/sec 27925.4 | eta 0h1m5s
| epoch 77 | step 1000/4071 | loss 8.1285 | lr 0.00100 | ngrams/sec 39973.7 | eta 0h0m39s
| epoch 77 | step 1500/4071 | loss 8.1269 | lr 0.00100 | ngrams/sec 39941.2 | eta 0h0m32s
| epoch 77 | step 2000/4071 | loss 8.1286 | lr 0.00100 | ngrams/sec 39888.6 | eta 0h0m26s
| epoch 77 | step 2500/4071 | loss 8.1256 | lr 0.00100 | ngrams/sec 39895.3 | eta 0h0m20s
| epoch 77 | step 3000/4071 | loss 8.1503 | lr 0.00100 | ngrams/sec 39922.4 | eta 0h0m13s
| epoch 77 | step 3500/4071 | loss 8.1399 | lr 0.00100 | ngrams/sec 39858.2 | eta 0h0m7s
| epoch 77 | step 4000/4071 | loss 8.1199 | lr 0.00100 | ngrams/sec 39781.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.93it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 53.69s | valid loss  6.55 | valid ppl   699.22
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/4071 | loss 8.1176 | lr 0.00100 | ngrams/sec 27879.7 | eta 0h1m5s
| epoch 78 | step 1000/4071 | loss 8.1054 | lr 0.00100 | ngrams/sec 39923.2 | eta 0h0m39s
| epoch 78 | step 1500/4071 | loss 8.1155 | lr 0.00100 | ngrams/sec 39831.8 | eta 0h0m33s
| epoch 78 | step 2000/4071 | loss 8.1345 | lr 0.00100 | ngrams/sec 39884.8 | eta 0h0m26s
| epoch 78 | step 2500/4071 | loss 8.1314 | lr 0.00100 | ngrams/sec 39813.5 | eta 0h0m20s
| epoch 78 | step 3000/4071 | loss 8.1256 | lr 0.00100 | ngrams/sec 39845.2 | eta 0h0m13s
| epoch 78 | step 3500/4071 | loss 8.1382 | lr 0.00100 | ngrams/sec 39876.3 | eta 0h0m7s
| epoch 78 | step 4000/4071 | loss 8.1350 | lr 0.00100 | ngrams/sec 39860.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1152.88it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 53.73s | valid loss  6.57 | valid ppl   713.30
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 8.0965 | lr 0.00100 | ngrams/sec 27900.6 | eta 0h1m5s
| epoch 79 | step 1000/4071 | loss 8.1114 | lr 0.00100 | ngrams/sec 39756.8 | eta 0h0m39s
| epoch 79 | step 1500/4071 | loss 8.1301 | lr 0.00100 | ngrams/sec 39839.0 | eta 0h0m33s
| epoch 79 | step 2000/4071 | loss 8.1153 | lr 0.00100 | ngrams/sec 39865.5 | eta 0h0m26s
| epoch 79 | step 2500/4071 | loss 8.1329 | lr 0.00100 | ngrams/sec 39844.9 | eta 0h0m20s
| epoch 79 | step 3000/4071 | loss 8.1271 | lr 0.00100 | ngrams/sec 39783.5 | eta 0h0m13s
| epoch 79 | step 3500/4071 | loss 8.1210 | lr 0.00100 | ngrams/sec 39760.4 | eta 0h0m7s
| epoch 79 | step 4000/4071 | loss 8.1317 | lr 0.00100 | ngrams/sec 39817.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1168.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 53.79s | valid loss  6.56 | valid ppl   707.23
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 8.1083 | lr 0.00100 | ngrams/sec 27816.2 | eta 0h1m5s
| epoch 80 | step 1000/4071 | loss 8.0926 | lr 0.00100 | ngrams/sec 39837.2 | eta 0h0m39s
| epoch 80 | step 1500/4071 | loss 8.1179 | lr 0.00100 | ngrams/sec 39801.6 | eta 0h0m33s
| epoch 80 | step 2000/4071 | loss 8.1158 | lr 0.00100 | ngrams/sec 39781.7 | eta 0h0m26s
| epoch 80 | step 2500/4071 | loss 8.1329 | lr 0.00100 | ngrams/sec 39807.2 | eta 0h0m20s
| epoch 80 | step 3000/4071 | loss 8.1270 | lr 0.00100 | ngrams/sec 39759.3 | eta 0h0m13s
| epoch 80 | step 3500/4071 | loss 8.1265 | lr 0.00100 | ngrams/sec 39787.1 | eta 0h0m7s
| epoch 80 | step 4000/4071 | loss 8.1175 | lr 0.00100 | ngrams/sec 39777.7 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1179.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 53.83s | valid loss  6.55 | valid ppl   698.05
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 8.1011 | lr 0.00100 | ngrams/sec 27853.5 | eta 0h1m5s
| epoch 81 | step 1000/4071 | loss 8.1194 | lr 0.00100 | ngrams/sec 39705.5 | eta 0h0m39s
| epoch 81 | step 1500/4071 | loss 8.1146 | lr 0.00100 | ngrams/sec 39759.5 | eta 0h0m33s
| epoch 81 | step 2000/4071 | loss 8.0992 | lr 0.00100 | ngrams/sec 39667.3 | eta 0h0m26s
| epoch 81 | step 2500/4071 | loss 8.1211 | lr 0.00100 | ngrams/sec 39726.1 | eta 0h0m20s
| epoch 81 | step 3000/4071 | loss 8.1244 | lr 0.00100 | ngrams/sec 39720.0 | eta 0h0m13s
| epoch 81 | step 3500/4071 | loss 8.1149 | lr 0.00100 | ngrams/sec 39717.0 | eta 0h0m7s
| epoch 81 | step 4000/4071 | loss 8.1026 | lr 0.00100 | ngrams/sec 39634.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.11it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 53.92s | valid loss  6.55 | valid ppl   698.72
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 8.0937 | lr 0.00100 | ngrams/sec 27794.4 | eta 0h1m5s
| epoch 82 | step 1000/4071 | loss 8.1001 | lr 0.00100 | ngrams/sec 39732.8 | eta 0h0m39s
| epoch 82 | step 1500/4071 | loss 8.1230 | lr 0.00100 | ngrams/sec 39709.5 | eta 0h0m33s
| epoch 82 | step 2000/4071 | loss 8.1171 | lr 0.00100 | ngrams/sec 39681.8 | eta 0h0m26s
| epoch 82 | step 2500/4071 | loss 8.1014 | lr 0.00100 | ngrams/sec 39677.6 | eta 0h0m20s
| epoch 82 | step 3000/4071 | loss 8.0996 | lr 0.00100 | ngrams/sec 39690.5 | eta 0h0m13s
| epoch 82 | step 3500/4071 | loss 8.1129 | lr 0.00100 | ngrams/sec 39673.8 | eta 0h0m7s
| epoch 82 | step 4000/4071 | loss 8.1069 | lr 0.00100 | ngrams/sec 39687.0 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1158.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 53.96s | valid loss  6.54 | valid ppl   692.81
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 8.0882 | lr 0.00100 | ngrams/sec 27766.6 | eta 0h1m5s
| epoch 83 | step 1000/4071 | loss 8.0807 | lr 0.00100 | ngrams/sec 39675.0 | eta 0h0m39s
| epoch 83 | step 1500/4071 | loss 8.1055 | lr 0.00100 | ngrams/sec 39708.8 | eta 0h0m33s
| epoch 83 | step 2000/4071 | loss 8.1102 | lr 0.00100 | ngrams/sec 39677.8 | eta 0h0m26s
| epoch 83 | step 2500/4071 | loss 8.1023 | lr 0.00100 | ngrams/sec 39677.6 | eta 0h0m20s
| epoch 83 | step 3000/4071 | loss 8.1050 | lr 0.00100 | ngrams/sec 39697.0 | eta 0h0m13s
| epoch 83 | step 3500/4071 | loss 8.1264 | lr 0.00100 | ngrams/sec 39721.5 | eta 0h0m7s
| epoch 83 | step 4000/4071 | loss 8.1193 | lr 0.00100 | ngrams/sec 39714.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1175.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 53.95s | valid loss  6.55 | valid ppl   698.31
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/4071 | loss 8.0941 | lr 0.00100 | ngrams/sec 27808.4 | eta 0h1m5s
| epoch 84 | step 1000/4071 | loss 8.0901 | lr 0.00100 | ngrams/sec 39744.6 | eta 0h0m39s
| epoch 84 | step 1500/4071 | loss 8.1098 | lr 0.00100 | ngrams/sec 39781.0 | eta 0h0m33s
| epoch 84 | step 2000/4071 | loss 8.1006 | lr 0.00100 | ngrams/sec 39823.1 | eta 0h0m26s
| epoch 84 | step 2500/4071 | loss 8.0969 | lr 0.00100 | ngrams/sec 39760.9 | eta 0h0m20s
| epoch 84 | step 3000/4071 | loss 8.0963 | lr 0.00100 | ngrams/sec 39845.3 | eta 0h0m13s
| epoch 84 | step 3500/4071 | loss 8.1083 | lr 0.00100 | ngrams/sec 39859.8 | eta 0h0m7s
| epoch 84 | step 4000/4071 | loss 8.1089 | lr 0.00100 | ngrams/sec 39812.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1182.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 53.81s | valid loss  6.53 | valid ppl   687.44
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 8.0775 | lr 0.00100 | ngrams/sec 27904.4 | eta 0h1m5s
| epoch 85 | step 1000/4071 | loss 8.0948 | lr 0.00100 | ngrams/sec 39846.1 | eta 0h0m39s
| epoch 85 | step 1500/4071 | loss 8.0951 | lr 0.00100 | ngrams/sec 39831.8 | eta 0h0m33s
| epoch 85 | step 2000/4071 | loss 8.1127 | lr 0.00100 | ngrams/sec 39853.0 | eta 0h0m26s
| epoch 85 | step 2500/4071 | loss 8.0964 | lr 0.00100 | ngrams/sec 39859.7 | eta 0h0m20s
| epoch 85 | step 3000/4071 | loss 8.1008 | lr 0.00100 | ngrams/sec 39812.2 | eta 0h0m13s
| epoch 85 | step 3500/4071 | loss 8.1017 | lr 0.00100 | ngrams/sec 39837.5 | eta 0h0m7s
| epoch 85 | step 4000/4071 | loss 8.1053 | lr 0.00100 | ngrams/sec 39927.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1141.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 53.74s | valid loss  6.55 | valid ppl   696.92
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 8.0539 | lr 0.00100 | ngrams/sec 27940.7 | eta 0h1m5s
| epoch 86 | step 1000/4071 | loss 8.0994 | lr 0.00100 | ngrams/sec 39885.8 | eta 0h0m39s
| epoch 86 | step 1500/4071 | loss 8.0952 | lr 0.00100 | ngrams/sec 39911.3 | eta 0h0m32s
| epoch 86 | step 2000/4071 | loss 8.0973 | lr 0.00100 | ngrams/sec 39922.6 | eta 0h0m26s
| epoch 86 | step 2500/4071 | loss 8.1009 | lr 0.00100 | ngrams/sec 39943.7 | eta 0h0m20s
| epoch 86 | step 3000/4071 | loss 8.0998 | lr 0.00100 | ngrams/sec 39952.8 | eta 0h0m13s
| epoch 86 | step 3500/4071 | loss 8.0927 | lr 0.00100 | ngrams/sec 39997.9 | eta 0h0m7s
| epoch 86 | step 4000/4071 | loss 8.0984 | lr 0.00100 | ngrams/sec 39917.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 53.63s | valid loss  6.52 | valid ppl   681.70
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 87 | step 500/4071 | loss 8.0787 | lr 0.00100 | ngrams/sec 27747.8 | eta 0h1m5s
| epoch 87 | step 1000/4071 | loss 8.0618 | lr 0.00100 | ngrams/sec 39933.7 | eta 0h0m39s
| epoch 87 | step 1500/4071 | loss 8.0819 | lr 0.00100 | ngrams/sec 39921.2 | eta 0h0m32s
| epoch 87 | step 2000/4071 | loss 8.0885 | lr 0.00100 | ngrams/sec 39962.6 | eta 0h0m26s
| epoch 87 | step 2500/4071 | loss 8.0809 | lr 0.00100 | ngrams/sec 39963.9 | eta 0h0m20s
| epoch 87 | step 3000/4071 | loss 8.0842 | lr 0.00100 | ngrams/sec 39894.9 | eta 0h0m13s
| epoch 87 | step 3500/4071 | loss 8.0966 | lr 0.00100 | ngrams/sec 39932.1 | eta 0h0m7s
| epoch 87 | step 4000/4071 | loss 8.0875 | lr 0.00100 | ngrams/sec 39962.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1174.50it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 53.63s | valid loss  6.52 | valid ppl   677.64
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 88 | step 500/4071 | loss 8.0630 | lr 0.00100 | ngrams/sec 27724.4 | eta 0h1m5s
| epoch 88 | step 1000/4071 | loss 8.0569 | lr 0.00100 | ngrams/sec 39957.2 | eta 0h0m39s
| epoch 88 | step 1500/4071 | loss 8.0757 | lr 0.00100 | ngrams/sec 39964.8 | eta 0h0m32s
| epoch 88 | step 2000/4071 | loss 8.0827 | lr 0.00100 | ngrams/sec 39884.5 | eta 0h0m26s
| epoch 88 | step 2500/4071 | loss 8.0732 | lr 0.00100 | ngrams/sec 39977.3 | eta 0h0m20s
| epoch 88 | step 3000/4071 | loss 8.0919 | lr 0.00100 | ngrams/sec 39873.5 | eta 0h0m13s
| epoch 88 | step 3500/4071 | loss 8.0931 | lr 0.00100 | ngrams/sec 39978.8 | eta 0h0m7s
| epoch 88 | step 4000/4071 | loss 8.0887 | lr 0.00100 | ngrams/sec 39894.2 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1167.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 53.65s | valid loss  6.54 | valid ppl   688.85
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 8.0550 | lr 0.00100 | ngrams/sec 27902.4 | eta 0h1m5s
| epoch 89 | step 1000/4071 | loss 8.0613 | lr 0.00100 | ngrams/sec 39918.6 | eta 0h0m39s
| epoch 89 | step 1500/4071 | loss 8.0822 | lr 0.00100 | ngrams/sec 39852.8 | eta 0h0m33s
| epoch 89 | step 2000/4071 | loss 8.0607 | lr 0.00100 | ngrams/sec 39928.6 | eta 0h0m26s
| epoch 89 | step 2500/4071 | loss 8.0803 | lr 0.00100 | ngrams/sec 39867.7 | eta 0h0m20s
| epoch 89 | step 3000/4071 | loss 8.0780 | lr 0.00100 | ngrams/sec 39836.5 | eta 0h0m13s
| epoch 89 | step 3500/4071 | loss 8.0743 | lr 0.00100 | ngrams/sec 39898.5 | eta 0h0m7s
| epoch 89 | step 4000/4071 | loss 8.0850 | lr 0.00100 | ngrams/sec 39869.4 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1170.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 53.70s | valid loss  6.50 | valid ppl   667.03
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 90 | step 500/4071 | loss 8.0479 | lr 0.00100 | ngrams/sec 27709.0 | eta 0h1m5s
| epoch 90 | step 1000/4071 | loss 8.0616 | lr 0.00100 | ngrams/sec 39871.8 | eta 0h0m39s
| epoch 90 | step 1500/4071 | loss 8.0625 | lr 0.00100 | ngrams/sec 39851.9 | eta 0h0m33s
| epoch 90 | step 2000/4071 | loss 8.0528 | lr 0.00100 | ngrams/sec 39845.1 | eta 0h0m26s
| epoch 90 | step 2500/4071 | loss 8.0749 | lr 0.00100 | ngrams/sec 39830.7 | eta 0h0m20s
| epoch 90 | step 3000/4071 | loss 8.0713 | lr 0.00100 | ngrams/sec 39750.9 | eta 0h0m13s
| epoch 90 | step 3500/4071 | loss 8.0742 | lr 0.00100 | ngrams/sec 39729.8 | eta 0h0m7s
| epoch 90 | step 4000/4071 | loss 8.0698 | lr 0.00100 | ngrams/sec 39845.6 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1173.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 53.78s | valid loss  6.51 | valid ppl   672.47
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 8.0304 | lr 0.00100 | ngrams/sec 27898.9 | eta 0h1m5s
| epoch 91 | step 1000/4071 | loss 8.0540 | lr 0.00100 | ngrams/sec 39827.8 | eta 0h0m39s
| epoch 91 | step 1500/4071 | loss 8.0735 | lr 0.00100 | ngrams/sec 39810.8 | eta 0h0m33s
| epoch 91 | step 2000/4071 | loss 8.0434 | lr 0.00100 | ngrams/sec 39869.4 | eta 0h0m26s
| epoch 91 | step 2500/4071 | loss 8.0562 | lr 0.00100 | ngrams/sec 39792.6 | eta 0h0m20s
| epoch 91 | step 3000/4071 | loss 8.0567 | lr 0.00100 | ngrams/sec 39744.4 | eta 0h0m13s
| epoch 91 | step 3500/4071 | loss 8.0709 | lr 0.00100 | ngrams/sec 39838.7 | eta 0h0m7s
| epoch 91 | step 4000/4071 | loss 8.0668 | lr 0.00100 | ngrams/sec 39807.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.73it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 53.78s | valid loss  6.50 | valid ppl   668.06
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 8.0381 | lr 0.00100 | ngrams/sec 27900.5 | eta 0h1m5s
| epoch 92 | step 1000/4071 | loss 8.0444 | lr 0.00100 | ngrams/sec 39836.8 | eta 0h0m39s
| epoch 92 | step 1500/4071 | loss 8.0520 | lr 0.00100 | ngrams/sec 39792.5 | eta 0h0m33s
| epoch 92 | step 2000/4071 | loss 8.0377 | lr 0.00100 | ngrams/sec 39846.6 | eta 0h0m26s
| epoch 92 | step 2500/4071 | loss 8.0589 | lr 0.00100 | ngrams/sec 39828.8 | eta 0h0m20s
| epoch 92 | step 3000/4071 | loss 8.0493 | lr 0.00100 | ngrams/sec 39812.2 | eta 0h0m13s
| epoch 92 | step 3500/4071 | loss 8.0573 | lr 0.00100 | ngrams/sec 39751.0 | eta 0h0m7s
| epoch 92 | step 4000/4071 | loss 8.0574 | lr 0.00100 | ngrams/sec 39858.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1164.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 53.78s | valid loss  6.50 | valid ppl   665.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 93 | step 500/4071 | loss 8.0293 | lr 0.00100 | ngrams/sec 27690.9 | eta 0h1m6s
| epoch 93 | step 1000/4071 | loss 8.0277 | lr 0.00100 | ngrams/sec 39870.4 | eta 0h0m39s
| epoch 93 | step 1500/4071 | loss 8.0534 | lr 0.00100 | ngrams/sec 39835.1 | eta 0h0m33s
| epoch 93 | step 2000/4071 | loss 8.0502 | lr 0.00100 | ngrams/sec 39874.9 | eta 0h0m26s
| epoch 93 | step 2500/4071 | loss 8.0587 | lr 0.00100 | ngrams/sec 39959.3 | eta 0h0m20s
| epoch 93 | step 3000/4071 | loss 8.0479 | lr 0.00100 | ngrams/sec 39918.1 | eta 0h0m13s
| epoch 93 | step 3500/4071 | loss 8.0527 | lr 0.00100 | ngrams/sec 39876.5 | eta 0h0m7s
| epoch 93 | step 4000/4071 | loss 8.0578 | lr 0.00100 | ngrams/sec 39856.0 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1169.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 53.71s | valid loss  6.52 | valid ppl   679.02
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 8.0155 | lr 0.00100 | ngrams/sec 27956.6 | eta 0h1m5s
| epoch 94 | step 1000/4071 | loss 8.0375 | lr 0.00100 | ngrams/sec 39868.9 | eta 0h0m39s
| epoch 94 | step 1500/4071 | loss 8.0431 | lr 0.00100 | ngrams/sec 39891.1 | eta 0h0m32s
| epoch 94 | step 2000/4071 | loss 8.0589 | lr 0.00100 | ngrams/sec 39885.0 | eta 0h0m26s
| epoch 94 | step 2500/4071 | loss 8.0399 | lr 0.00100 | ngrams/sec 39906.8 | eta 0h0m20s
| epoch 94 | step 3000/4071 | loss 8.0507 | lr 0.00100 | ngrams/sec 39977.2 | eta 0h0m13s
| epoch 94 | step 3500/4071 | loss 8.0500 | lr 0.00100 | ngrams/sec 39935.8 | eta 0h0m7s
| epoch 94 | step 4000/4071 | loss 8.0627 | lr 0.00100 | ngrams/sec 39938.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1161.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 53.64s | valid loss  6.48 | valid ppl   649.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 95 | step 500/4071 | loss 8.0239 | lr 0.00100 | ngrams/sec 27783.9 | eta 0h1m5s
| epoch 95 | step 1000/4071 | loss 8.0183 | lr 0.00100 | ngrams/sec 39947.4 | eta 0h0m39s
| epoch 95 | step 1500/4071 | loss 8.0316 | lr 0.00100 | ngrams/sec 39248.4 | eta 0h0m33s
| epoch 95 | step 2000/4071 | loss 8.0466 | lr 0.00100 | ngrams/sec 39725.7 | eta 0h0m26s
| epoch 95 | step 2500/4071 | loss 8.0385 | lr 0.00100 | ngrams/sec 39891.5 | eta 0h0m20s
| epoch 95 | step 3000/4071 | loss 8.0460 | lr 0.00100 | ngrams/sec 39986.8 | eta 0h0m13s
| epoch 95 | step 3500/4071 | loss 8.0374 | lr 0.00100 | ngrams/sec 40022.1 | eta 0h0m7s
| epoch 95 | step 4000/4071 | loss 8.0640 | lr 0.00100 | ngrams/sec 40010.8 | eta 0h

 29%|██▉       | 120/417 [00:00<00:00, 1148.11it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 53.74s | valid loss  6.50 | valid ppl   665.60
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 8.0196 | lr 0.00100 | ngrams/sec 27964.7 | eta 0h1m5s
| epoch 96 | step 1000/4071 | loss 8.0091 | lr 0.00100 | ngrams/sec 39824.8 | eta 0h0m39s
| epoch 96 | step 1500/4071 | loss 8.0391 | lr 0.00100 | ngrams/sec 39953.2 | eta 0h0m32s
| epoch 96 | step 2000/4071 | loss 8.0448 | lr 0.00100 | ngrams/sec 39867.4 | eta 0h0m26s
| epoch 96 | step 2500/4071 | loss 8.0335 | lr 0.00100 | ngrams/sec 39821.7 | eta 0h0m20s
| epoch 96 | step 3000/4071 | loss 8.0400 | lr 0.00100 | ngrams/sec 39843.4 | eta 0h0m13s
| epoch 96 | step 3500/4071 | loss 8.0475 | lr 0.00100 | ngrams/sec 39727.6 | eta 0h0m7s
| epoch 96 | step 4000/4071 | loss 8.0420 | lr 0.00100 | ngrams/sec 39726.9 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1160.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 53.76s | valid loss  6.51 | valid ppl   668.62
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 8.0079 | lr 0.00100 | ngrams/sec 27859.4 | eta 0h1m5s
| epoch 97 | step 1000/4071 | loss 8.0103 | lr 0.00100 | ngrams/sec 39741.5 | eta 0h0m39s
| epoch 97 | step 1500/4071 | loss 8.0267 | lr 0.00100 | ngrams/sec 39728.1 | eta 0h0m33s
| epoch 97 | step 2000/4071 | loss 8.0328 | lr 0.00100 | ngrams/sec 39665.8 | eta 0h0m26s
| epoch 97 | step 2500/4071 | loss 8.0400 | lr 0.00100 | ngrams/sec 39691.9 | eta 0h0m20s
| epoch 97 | step 3000/4071 | loss 8.0368 | lr 0.00100 | ngrams/sec 39754.5 | eta 0h0m13s
| epoch 97 | step 3500/4071 | loss 8.0246 | lr 0.00100 | ngrams/sec 39745.6 | eta 0h0m7s
| epoch 97 | step 4000/4071 | loss 8.0381 | lr 0.00100 | ngrams/sec 39721.9 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1177.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 53.90s | valid loss  6.54 | valid ppl   690.74
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 8.0037 | lr 0.00100 | ngrams/sec 27863.1 | eta 0h1m5s
| epoch 98 | step 1000/4071 | loss 8.0138 | lr 0.00100 | ngrams/sec 39867.0 | eta 0h0m39s
| epoch 98 | step 1500/4071 | loss 8.0218 | lr 0.00100 | ngrams/sec 39861.3 | eta 0h0m33s
| epoch 98 | step 2000/4071 | loss 8.0167 | lr 0.00100 | ngrams/sec 39819.7 | eta 0h0m26s
| epoch 98 | step 2500/4071 | loss 8.0304 | lr 0.00100 | ngrams/sec 39807.4 | eta 0h0m20s
| epoch 98 | step 3000/4071 | loss 8.0259 | lr 0.00100 | ngrams/sec 39867.7 | eta 0h0m13s
| epoch 98 | step 3500/4071 | loss 8.0182 | lr 0.00100 | ngrams/sec 39857.5 | eta 0h0m7s
| epoch 98 | step 4000/4071 | loss 8.0503 | lr 0.00100 | ngrams/sec 39886.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1153.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 53.74s | valid loss  6.53 | valid ppl   683.05
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 8.0043 | lr 0.00100 | ngrams/sec 27866.0 | eta 0h1m5s
| epoch 99 | step 1000/4071 | loss 8.0093 | lr 0.00100 | ngrams/sec 39904.2 | eta 0h0m39s
| epoch 99 | step 1500/4071 | loss 8.0130 | lr 0.00100 | ngrams/sec 39803.2 | eta 0h0m33s
| epoch 99 | step 2000/4071 | loss 8.0119 | lr 0.00100 | ngrams/sec 39876.5 | eta 0h0m26s
| epoch 99 | step 2500/4071 | loss 8.0063 | lr 0.00100 | ngrams/sec 39839.9 | eta 0h0m20s
| epoch 99 | step 3000/4071 | loss 8.0077 | lr 0.00100 | ngrams/sec 39948.0 | eta 0h0m13s
| epoch 99 | step 3500/4071 | loss 8.0180 | lr 0.00100 | ngrams/sec 39834.6 | eta 0h0m7s
| epoch 99 | step 4000/4071 | loss 8.0256 | lr 0.00100 | ngrams/sec 39856.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1177.51it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 53.73s | valid loss  6.50 | valid ppl   664.44
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 7.9944 | lr 0.00100 | ngrams/sec 27920.7 | eta 0h1m5s
| epoch 100 | step 1000/4071 | loss 8.0051 | lr 0.00100 | ngrams/sec 39927.5 | eta 0h0m39s
| epoch 100 | step 1500/4071 | loss 8.0085 | lr 0.00100 | ngrams/sec 39867.4 | eta 0h0m33s
| epoch 100 | step 2000/4071 | loss 8.0168 | lr 0.00100 | ngrams/sec 39967.4 | eta 0h0m26s
| epoch 100 | step 2500/4071 | loss 8.0195 | lr 0.00100 | ngrams/sec 39870.9 | eta 0h0m20s
| epoch 100 | step 3000/4071 | loss 8.0299 | lr 0.00100 | ngrams/sec 39874.1 | eta 0h0m13s
| epoch 100 | step 3500/4071 | loss 8.0200 | lr 0.00100 | ngrams/sec 39878.1 | eta 0h0m7s
| epoch 100 | step 4000/4071 | loss 8.0223 | lr 0.00100 | ngrams/sec 39937.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1161.53it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.05it/s]


-----------------------------------------------------------------------------------------


 25%|██▍       | 117/471 [00:00<00:00, 1166.77it/s]

| end of epoch 100 | time 53.67s | valid loss  6.51 | valid ppl   671.17
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 287.19it/s]


| End of training | test loss  6.45 | test ppl   633.41


In [None]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [12]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint-tied-512.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

ntokens = n_class
input_idx = 104#torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
# input_idx = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
input_words = [corpus.dictionary.idx2word[i] for i in train_data[input_idx:order+input_idx, 0]]
input = torch.tensor([i for i in train_data[input_idx:order+input_idx, 0]]).to(device)
print(input)
print(input_words)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([27, 63, 64, 65, 66, 17, 67], device='cuda:0')
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation']


In [13]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        input.fill_(word_idx)
        input = input.cuda() if cuda else input
        # print(input)
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

relief
| Generated 0/100 words
flights
became
most
delicate
glow
overabundance
recreational
experience
concepts
representation
| Generated 10/100 words
it
almost
180
2005
between
1974
,
respected
extremely
historical
| Generated 20/100 words
cure
silence
as
jazz
effort
from
while
better
solitaire
deteriorated
| Generated 30/100 words
deemed
believed
cue
same
underneath
parallel
10
the
play
producer
| Generated 40/100 words
fey
12
continued
independence
;
atp
houston
atop
without
veto
| Generated 50/100 words
colourful
adriatic
just
opening
the
35th
string
reception
usually
apparent
| Generated 60/100 words
durand
finished
delayed
differences
dining
wore
grown
to
date
rejected
| Generated 70/100 words
,
experienced
observations
documented
funds
teams
's
planned
coach
watched
| Generated 80/100 words
relay
hockey
named
this
application
continued
population
produced
performing
humanity
| Generated 90/100 words
<unk>
distinct
epic
puzzles
dương
decides
population
gave
interest
