In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [3]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [4]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [5]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [6]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [7]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [8]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    tie_weight=False
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [9]:
!nvidia-smi

Fri Nov 27 12:53:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    31W /  70W |   1035MiB / 15079MiB |      3%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 8.6664 | lr 0.00100 | ngrams/sec 36137.9 | eta 0h0m50s
| epoch 1 | step 1000/4071 | loss 8.4106 | lr 0.00100 | ngrams/sec 37248.5 | eta 0h0m42s
| epoch 1 | step 1500/4071 | loss 8.3506 | lr 0.00100 | ngrams/sec 37028.3 | eta 0h0m35s
| epoch 1 | step 2000/4071 | loss 8.3293 | lr 0.00100 | ngrams/sec 36757.7 | eta 0h0m28s
| epoch 1 | step 2500/4071 | loss 8.2924 | lr 0.00100 | ngrams/sec 36352.9 | eta 0h0m22s
| epoch 1 | step 3000/4071 | loss 8.2649 | lr 0.00100 | ngrams/sec 35794.8 | eta 0h0m15s
| epoch 1 | step 3500/4071 | loss 8.2579 | lr 0.00100 | ngrams/sec 35137.0 | eta 0h0m8s
| epoch 1 | step 4000/4071 | loss 8.2201 | lr 0.00100 | ngrams/sec 34337.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1131.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 262.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 59.43s | valid loss  6.84 | valid ppl   938.10
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 8.1124 | lr 0.00100 | ngrams/sec 23716.4 | eta 0h1m17s
| epoch 2 | step 1000/4071 | loss 8.1403 | lr 0.00100 | ngrams/sec 34493.2 | eta 0h0m45s
| epoch 2 | step 1500/4071 | loss 8.1596 | lr 0.00100 | ngrams/sec 35310.9 | eta 0h0m37s
| epoch 2 | step 2000/4071 | loss 8.1401 | lr 0.00100 | ngrams/sec 35690.8 | eta 0h0m29s
| epoch 2 | step 2500/4071 | loss 8.1222 | lr 0.00100 | ngrams/sec 35741.2 | eta 0h0m22s
| epoch 2 | step 3000/4071 | loss 8.1277 | lr 0.00100 | ngrams/sec 36008.2 | eta 0h0m15s
| epoch 2 | step 3500/4071 | loss 8.1417 | lr 0.00100 | ngrams/sec 36062.5 | eta 0h0m8s
| epoch 2 | step 4000/4071 | loss 8.1499 | lr 0.00100 | ngrams/sec 35836.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1147.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 60.42s | valid loss  6.57 | valid ppl   714.60
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 8.0164 | lr 0.00100 | ngrams/sec 24841.3 | eta 0h1m13s
| epoch 3 | step 1000/4071 | loss 8.0313 | lr 0.00100 | ngrams/sec 35077.1 | eta 0h0m44s
| epoch 3 | step 1500/4071 | loss 8.0573 | lr 0.00100 | ngrams/sec 35058.3 | eta 0h0m37s
| epoch 3 | step 2000/4071 | loss 8.0652 | lr 0.00100 | ngrams/sec 35077.8 | eta 0h0m30s
| epoch 3 | step 2500/4071 | loss 8.0857 | lr 0.00100 | ngrams/sec 35305.3 | eta 0h0m22s
| epoch 3 | step 3000/4071 | loss 8.0686 | lr 0.00100 | ngrams/sec 35470.6 | eta 0h0m15s
| epoch 3 | step 3500/4071 | loss 8.0193 | lr 0.00100 | ngrams/sec 35610.7 | eta 0h0m8s
| epoch 3 | step 4000/4071 | loss 8.0394 | lr 0.00100 | ngrams/sec 35629.6 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1151.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 60.50s | valid loss  6.62 | valid ppl   748.00
-----------------------------------------------------------------------------------------
| epoch 4 | step 500/4071 | loss 7.8857 | lr 0.00100 | ngrams/sec 25189.6 | eta 0h1m12s
| epoch 4 | step 1000/4071 | loss 7.9179 | lr 0.00100 | ngrams/sec 35430.8 | eta 0h0m44s
| epoch 4 | step 1500/4071 | loss 7.9641 | lr 0.00100 | ngrams/sec 35264.5 | eta 0h0m37s
| epoch 4 | step 2000/4071 | loss 7.9866 | lr 0.00100 | ngrams/sec 35188.8 | eta 0h0m30s
| epoch 4 | step 2500/4071 | loss 8.0082 | lr 0.00100 | ngrams/sec 35208.9 | eta 0h0m22s
| epoch 4 | step 3000/4071 | loss 8.0079 | lr 0.00100 | ngrams/sec 35285.0 | eta 0h0m15s
| epoch 4 | step 3500/4071 | loss 8.0158 | lr 0.00100 | ngrams/sec 35354.9 | eta 0h0m8s
| epoch 4 | step 4000/4071 | loss 8.0355 | lr 0.00100 | ngrams/sec 35478.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1141.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 60.49s | valid loss  6.38 | valid ppl   592.87
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 7.8739 | lr 0.00100 | ngrams/sec 24902.1 | eta 0h1m13s
| epoch 5 | step 1000/4071 | loss 7.9193 | lr 0.00100 | ngrams/sec 35650.6 | eta 0h0m44s
| epoch 5 | step 1500/4071 | loss 7.9318 | lr 0.00100 | ngrams/sec 35598.6 | eta 0h0m36s
| epoch 5 | step 2000/4071 | loss 7.9340 | lr 0.00100 | ngrams/sec 35496.6 | eta 0h0m29s
| epoch 5 | step 2500/4071 | loss 7.9481 | lr 0.00100 | ngrams/sec 35326.8 | eta 0h0m22s
| epoch 5 | step 3000/4071 | loss 7.9737 | lr 0.00100 | ngrams/sec 35286.9 | eta 0h0m15s
| epoch 5 | step 3500/4071 | loss 7.9780 | lr 0.00100 | ngrams/sec 35194.3 | eta 0h0m8s
| epoch 5 | step 4000/4071 | loss 7.9187 | lr 0.00100 | ngrams/sec 35322.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1122.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 275.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 60.36s | valid loss  6.71 | valid ppl   824.16
-----------------------------------------------------------------------------------------
| epoch 6 | step 500/4071 | loss 7.7809 | lr 0.00100 | ngrams/sec 25063.4 | eta 0h1m12s
| epoch 6 | step 1000/4071 | loss 7.8260 | lr 0.00100 | ngrams/sec 35522.9 | eta 0h0m44s
| epoch 6 | step 1500/4071 | loss 7.8585 | lr 0.00100 | ngrams/sec 35549.1 | eta 0h0m37s
| epoch 6 | step 2000/4071 | loss 7.8836 | lr 0.00100 | ngrams/sec 35595.9 | eta 0h0m29s
| epoch 6 | step 2500/4071 | loss 7.9028 | lr 0.00100 | ngrams/sec 35651.7 | eta 0h0m22s
| epoch 6 | step 3000/4071 | loss 7.9287 | lr 0.00100 | ngrams/sec 35552.7 | eta 0h0m15s
| epoch 6 | step 3500/4071 | loss 7.9308 | lr 0.00100 | ngrams/sec 35471.9 | eta 0h0m8s
| epoch 6 | step 4000/4071 | loss 7.9318 | lr 0.00100 | ngrams/sec 35439.6 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1163.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 60.19s | valid loss  6.40 | valid ppl   600.08
-----------------------------------------------------------------------------------------
| epoch 7 | step 500/4071 | loss 7.7809 | lr 0.00100 | ngrams/sec 25078.7 | eta 0h1m12s
| epoch 7 | step 1000/4071 | loss 7.8156 | lr 0.00100 | ngrams/sec 35361.0 | eta 0h0m44s
| epoch 7 | step 1500/4071 | loss 7.8431 | lr 0.00100 | ngrams/sec 35335.1 | eta 0h0m37s
| epoch 7 | step 2000/4071 | loss 7.8470 | lr 0.00100 | ngrams/sec 35283.3 | eta 0h0m30s
| epoch 7 | step 2500/4071 | loss 7.8775 | lr 0.00100 | ngrams/sec 35304.5 | eta 0h0m22s
| epoch 7 | step 3000/4071 | loss 7.8889 | lr 0.00100 | ngrams/sec 35291.9 | eta 0h0m15s
| epoch 7 | step 3500/4071 | loss 7.9087 | lr 0.00100 | ngrams/sec 35242.1 | eta 0h0m8s
| epoch 7 | step 4000/4071 | loss 7.9024 | lr 0.00100 | ngrams/sec 35293.9 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1163.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 60.55s | valid loss  6.28 | valid ppl   534.90
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 7.7417 | lr 0.00100 | ngrams/sec 24856.3 | eta 0h1m13s
| epoch 8 | step 1000/4071 | loss 7.7642 | lr 0.00100 | ngrams/sec 35623.0 | eta 0h0m44s
| epoch 8 | step 1500/4071 | loss 7.7838 | lr 0.00100 | ngrams/sec 35689.5 | eta 0h0m36s
| epoch 8 | step 2000/4071 | loss 7.8028 | lr 0.00100 | ngrams/sec 35669.5 | eta 0h0m29s
| epoch 8 | step 2500/4071 | loss 7.7922 | lr 0.00100 | ngrams/sec 35611.3 | eta 0h0m22s
| epoch 8 | step 3000/4071 | loss 7.8093 | lr 0.00100 | ngrams/sec 35506.4 | eta 0h0m15s
| epoch 8 | step 3500/4071 | loss 7.8188 | lr 0.00100 | ngrams/sec 35542.5 | eta 0h0m8s
| epoch 8 | step 4000/4071 | loss 7.8406 | lr 0.00100 | ngrams/sec 35414.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1157.47it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.31it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 60.11s | valid loss  6.29 | valid ppl   539.99
-----------------------------------------------------------------------------------------
| epoch 9 | step 500/4071 | loss 7.6795 | lr 0.00100 | ngrams/sec 25047.8 | eta 0h1m12s
| epoch 9 | step 1000/4071 | loss 7.7050 | lr 0.00100 | ngrams/sec 35266.1 | eta 0h0m44s
| epoch 9 | step 1500/4071 | loss 7.7273 | lr 0.00100 | ngrams/sec 35387.1 | eta 0h0m37s
| epoch 9 | step 2000/4071 | loss 7.7523 | lr 0.00100 | ngrams/sec 35496.9 | eta 0h0m29s
| epoch 9 | step 2500/4071 | loss 7.7688 | lr 0.00100 | ngrams/sec 35512.6 | eta 0h0m22s
| epoch 9 | step 3000/4071 | loss 7.7992 | lr 0.00100 | ngrams/sec 35654.7 | eta 0h0m15s
| epoch 9 | step 3500/4071 | loss 7.8055 | lr 0.00100 | ngrams/sec 35706.1 | eta 0h0m8s
| epoch 9 | step 4000/4071 | loss 7.8236 | lr 0.00100 | ngrams/sec 35568.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1130.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 60.23s | valid loss  6.22 | valid ppl   502.95
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 7.6378 | lr 0.00100 | ngrams/sec 24910.6 | eta 0h1m13s
| epoch 10 | step 1000/4071 | loss 7.6721 | lr 0.00100 | ngrams/sec 35395.6 | eta 0h0m44s
| epoch 10 | step 1500/4071 | loss 7.6856 | lr 0.00100 | ngrams/sec 35346.4 | eta 0h0m37s
| epoch 10 | step 2000/4071 | loss 7.7057 | lr 0.00100 | ngrams/sec 35300.6 | eta 0h0m30s
| epoch 10 | step 2500/4071 | loss 7.7396 | lr 0.00100 | ngrams/sec 35318.6 | eta 0h0m22s
| epoch 10 | step 3000/4071 | loss 7.7482 | lr 0.00100 | ngrams/sec 35410.0 | eta 0h0m15s
| epoch 10 | step 3500/4071 | loss 7.7647 | lr 0.00100 | ngrams/sec 35494.8 | eta 0h0m8s
| epoch 10 | step 4000/4071 | loss 7.7891 | lr 0.00100 | ngrams/sec 35564.7 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1174.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 60.35s | valid loss  6.18 | valid ppl   481.50
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 7.5910 | lr 0.00100 | ngrams/sec 24944.1 | eta 0h1m13s
| epoch 11 | step 1000/4071 | loss 7.6352 | lr 0.00100 | ngrams/sec 35650.1 | eta 0h0m44s
| epoch 11 | step 1500/4071 | loss 7.6637 | lr 0.00100 | ngrams/sec 35673.6 | eta 0h0m36s
| epoch 11 | step 2000/4071 | loss 7.6712 | lr 0.00100 | ngrams/sec 35655.5 | eta 0h0m29s
| epoch 11 | step 2500/4071 | loss 7.6901 | lr 0.00100 | ngrams/sec 35695.4 | eta 0h0m22s
| epoch 11 | step 3000/4071 | loss 7.6925 | lr 0.00100 | ngrams/sec 35767.0 | eta 0h0m15s
| epoch 11 | step 3500/4071 | loss 7.7223 | lr 0.00100 | ngrams/sec 35684.3 | eta 0h0m8s
| epoch 11 | step 4000/4071 | loss 7.7385 | lr 0.00100 | ngrams/sec 35671.3 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1141.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 59.93s | valid loss  6.15 | valid ppl   469.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 7.5450 | lr 0.00100 | ngrams/sec 24996.7 | eta 0h1m13s
| epoch 12 | step 1000/4071 | loss 7.5939 | lr 0.00100 | ngrams/sec 35600.5 | eta 0h0m44s
| epoch 12 | step 1500/4071 | loss 7.6095 | lr 0.00100 | ngrams/sec 35625.9 | eta 0h0m36s
| epoch 12 | step 2000/4071 | loss 7.6151 | lr 0.00100 | ngrams/sec 35527.2 | eta 0h0m29s
| epoch 12 | step 2500/4071 | loss 7.6580 | lr 0.00100 | ngrams/sec 35530.7 | eta 0h0m22s
| epoch 12 | step 3000/4071 | loss 7.6745 | lr 0.00100 | ngrams/sec 35480.7 | eta 0h0m15s
| epoch 12 | step 3500/4071 | loss 7.6846 | lr 0.00100 | ngrams/sec 35504.0 | eta 0h0m8s
| epoch 12 | step 4000/4071 | loss 7.7141 | lr 0.00100 | ngrams/sec 35398.9 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1122.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 275.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 60.17s | valid loss  6.14 | valid ppl   462.46
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 7.5064 | lr 0.00100 | ngrams/sec 24774.0 | eta 0h1m13s
| epoch 13 | step 1000/4071 | loss 7.5487 | lr 0.00100 | ngrams/sec 35397.7 | eta 0h0m44s
| epoch 13 | step 1500/4071 | loss 7.5731 | lr 0.00100 | ngrams/sec 35399.3 | eta 0h0m37s
| epoch 13 | step 2000/4071 | loss 7.6024 | lr 0.00100 | ngrams/sec 35395.0 | eta 0h0m29s
| epoch 13 | step 2500/4071 | loss 7.6160 | lr 0.00100 | ngrams/sec 35388.0 | eta 0h0m22s
| epoch 13 | step 3000/4071 | loss 7.6181 | lr 0.00100 | ngrams/sec 35420.3 | eta 0h0m15s
| epoch 13 | step 3500/4071 | loss 7.6429 | lr 0.00100 | ngrams/sec 35440.8 | eta 0h0m8s
| epoch 13 | step 4000/4071 | loss 7.6629 | lr 0.00100 | ngrams/sec 35403.2 | eta 0

 28%|██▊       | 117/417 [00:00<00:00, 1144.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 274.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 60.41s | valid loss  6.13 | valid ppl   458.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 14 | step 500/4071 | loss 7.4641 | lr 0.00100 | ngrams/sec 24767.3 | eta 0h1m13s
| epoch 14 | step 1000/4071 | loss 7.5027 | lr 0.00100 | ngrams/sec 35407.1 | eta 0h0m44s
| epoch 14 | step 1500/4071 | loss 7.5409 | lr 0.00100 | ngrams/sec 35430.5 | eta 0h0m37s
| epoch 14 | step 2000/4071 | loss 7.5539 | lr 0.00100 | ngrams/sec 35433.9 | eta 0h0m29s
| epoch 14 | step 2500/4071 | loss 7.5755 | lr 0.00100 | ngrams/sec 35529.8 | eta 0h0m22s
| epoch 14 | step 3000/4071 | loss 7.6008 | lr 0.00100 | ngrams/sec 35617.4 | eta 0h0m15s
| epoch 14 | step 3500/4071 | loss 7.5988 | lr 0.00100 | ngrams/sec 35566.0 | eta 0h0m8s
| epoch 14 | step 4000/4071 | loss 7.6204 | lr 0.00100 | ngrams/sec 35562.1 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1130.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 60.25s | valid loss  6.10 | valid ppl   445.94
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 7.4251 | lr 0.00100 | ngrams/sec 24994.4 | eta 0h1m13s
| epoch 15 | step 1000/4071 | loss 7.4598 | lr 0.00100 | ngrams/sec 35676.2 | eta 0h0m44s
| epoch 15 | step 1500/4071 | loss 7.5029 | lr 0.00100 | ngrams/sec 35727.6 | eta 0h0m36s
| epoch 15 | step 2000/4071 | loss 7.5158 | lr 0.00100 | ngrams/sec 35698.0 | eta 0h0m29s
| epoch 15 | step 2500/4071 | loss 7.5351 | lr 0.00100 | ngrams/sec 35735.6 | eta 0h0m22s
| epoch 15 | step 3000/4071 | loss 7.5543 | lr 0.00100 | ngrams/sec 35763.0 | eta 0h0m15s
| epoch 15 | step 3500/4071 | loss 7.5871 | lr 0.00100 | ngrams/sec 35735.3 | eta 0h0m8s
| epoch 15 | step 4000/4071 | loss 7.5996 | lr 0.00100 | ngrams/sec 35703.1 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1144.88it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 59.85s | valid loss  6.08 | valid ppl   439.07
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 7.3894 | lr 0.00100 | ngrams/sec 25057.4 | eta 0h1m12s
| epoch 16 | step 1000/4071 | loss 7.4475 | lr 0.00100 | ngrams/sec 35696.9 | eta 0h0m44s
| epoch 16 | step 1500/4071 | loss 7.4615 | lr 0.00100 | ngrams/sec 35652.0 | eta 0h0m36s
| epoch 16 | step 2000/4071 | loss 7.4745 | lr 0.00100 | ngrams/sec 35524.6 | eta 0h0m29s
| epoch 16 | step 2500/4071 | loss 7.5062 | lr 0.00100 | ngrams/sec 35527.3 | eta 0h0m22s
| epoch 16 | step 3000/4071 | loss 7.5234 | lr 0.00100 | ngrams/sec 35545.1 | eta 0h0m15s
| epoch 16 | step 3500/4071 | loss 7.5508 | lr 0.00100 | ngrams/sec 35477.4 | eta 0h0m8s
| epoch 16 | step 4000/4071 | loss 7.5525 | lr 0.00100 | ngrams/sec 35477.7 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1144.09it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 275.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 60.11s | valid loss  6.07 | valid ppl   433.54
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 17 | step 500/4071 | loss 7.3616 | lr 0.00100 | ngrams/sec 24824.3 | eta 0h1m13s
| epoch 17 | step 1000/4071 | loss 7.3968 | lr 0.00100 | ngrams/sec 35401.9 | eta 0h0m44s
| epoch 17 | step 1500/4071 | loss 7.4282 | lr 0.00100 | ngrams/sec 35431.2 | eta 0h0m37s
| epoch 17 | step 2000/4071 | loss 7.4533 | lr 0.00100 | ngrams/sec 35400.8 | eta 0h0m29s
| epoch 17 | step 2500/4071 | loss 7.4684 | lr 0.00100 | ngrams/sec 35440.6 | eta 0h0m22s
| epoch 17 | step 3000/4071 | loss 7.4954 | lr 0.00100 | ngrams/sec 35431.8 | eta 0h0m15s
| epoch 17 | step 3500/4071 | loss 7.5031 | lr 0.00100 | ngrams/sec 35444.3 | eta 0h0m8s
| epoch 17 | step 4000/4071 | loss 7.5085 | lr 0.00100 | ngrams/sec 35474.5 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1162.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 60.34s | valid loss  6.08 | valid ppl   436.17
-----------------------------------------------------------------------------------------
| epoch 18 | step 500/4071 | loss 7.3217 | lr 0.00100 | ngrams/sec 25075.7 | eta 0h1m12s
| epoch 18 | step 1000/4071 | loss 7.3544 | lr 0.00100 | ngrams/sec 35422.4 | eta 0h0m44s
| epoch 18 | step 1500/4071 | loss 7.4083 | lr 0.00100 | ngrams/sec 35365.8 | eta 0h0m37s
| epoch 18 | step 2000/4071 | loss 7.4175 | lr 0.00100 | ngrams/sec 35420.2 | eta 0h0m29s
| epoch 18 | step 2500/4071 | loss 7.4332 | lr 0.00100 | ngrams/sec 35367.9 | eta 0h0m22s
| epoch 18 | step 3000/4071 | loss 7.4586 | lr 0.00100 | ngrams/sec 35471.4 | eta 0h0m15s
| epoch 18 | step 3500/4071 | loss 7.4793 | lr 0.00100 | ngrams/sec 35476.1 | eta 0h0m8s
| epoch 18 | step 4000/4071 | loss 7.4886 | lr 0.00100 | ngrams/sec 35477.4 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1159.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 60.36s | valid loss  6.06 | valid ppl   427.84
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/4071 | loss 7.2967 | lr 0.00100 | ngrams/sec 24904.5 | eta 0h1m13s
| epoch 19 | step 1000/4071 | loss 7.3326 | lr 0.00100 | ngrams/sec 35518.2 | eta 0h0m44s
| epoch 19 | step 1500/4071 | loss 7.3708 | lr 0.00100 | ngrams/sec 35569.3 | eta 0h0m37s
| epoch 19 | step 2000/4071 | loss 7.3777 | lr 0.00100 | ngrams/sec 35665.7 | eta 0h0m29s
| epoch 19 | step 2500/4071 | loss 7.4054 | lr 0.00100 | ngrams/sec 35674.0 | eta 0h0m22s
| epoch 19 | step 3000/4071 | loss 7.4264 | lr 0.00100 | ngrams/sec 35564.6 | eta 0h0m15s
| epoch 19 | step 3500/4071 | loss 7.4354 | lr 0.00100 | ngrams/sec 35472.2 | eta 0h0m8s
| epoch 19 | step 4000/4071 | loss 7.4606 | lr 0.00100 | ngrams/sec 35381.4 | eta 0

 28%|██▊       | 117/417 [00:00<00:00, 1156.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 274.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 60.16s | valid loss  6.05 | valid ppl   425.83
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 7.2601 | lr 0.00100 | ngrams/sec 24802.3 | eta 0h1m13s
| epoch 20 | step 1000/4071 | loss 7.3020 | lr 0.00100 | ngrams/sec 35326.3 | eta 0h0m44s
| epoch 20 | step 1500/4071 | loss 7.3301 | lr 0.00100 | ngrams/sec 35459.2 | eta 0h0m37s
| epoch 20 | step 2000/4071 | loss 7.3598 | lr 0.00100 | ngrams/sec 35517.2 | eta 0h0m29s
| epoch 20 | step 2500/4071 | loss 7.3888 | lr 0.00100 | ngrams/sec 35562.4 | eta 0h0m22s
| epoch 20 | step 3000/4071 | loss 7.4126 | lr 0.00100 | ngrams/sec 35599.1 | eta 0h0m15s
| epoch 20 | step 3500/4071 | loss 7.4133 | lr 0.00100 | ngrams/sec 35615.1 | eta 0h0m8s
| epoch 20 | step 4000/4071 | loss 7.4402 | lr 0.00100 | ngrams/sec 35669.6 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1162.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 60.18s | valid loss  6.05 | valid ppl   423.78
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 7.2423 | lr 0.00100 | ngrams/sec 25023.0 | eta 0h1m13s
| epoch 21 | step 1000/4071 | loss 7.2774 | lr 0.00100 | ngrams/sec 35771.6 | eta 0h0m43s
| epoch 21 | step 1500/4071 | loss 7.3074 | lr 0.00100 | ngrams/sec 35769.4 | eta 0h0m36s
| epoch 21 | step 2000/4071 | loss 7.3160 | lr 0.00100 | ngrams/sec 35812.4 | eta 0h0m29s
| epoch 21 | step 2500/4071 | loss 7.3578 | lr 0.00100 | ngrams/sec 35782.7 | eta 0h0m22s
| epoch 21 | step 3000/4071 | loss 7.3703 | lr 0.00100 | ngrams/sec 35751.8 | eta 0h0m15s
| epoch 21 | step 3500/4071 | loss 7.3922 | lr 0.00100 | ngrams/sec 35749.7 | eta 0h0m8s
| epoch 21 | step 4000/4071 | loss 7.4054 | lr 0.00100 | ngrams/sec 35696.5 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1153.02it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 59.80s | valid loss  6.05 | valid ppl   422.83
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 22 | step 500/4071 | loss 7.2176 | lr 0.00100 | ngrams/sec 25028.1 | eta 0h1m13s
| epoch 22 | step 1000/4071 | loss 7.2490 | lr 0.00100 | ngrams/sec 35746.0 | eta 0h0m43s
| epoch 22 | step 1500/4071 | loss 7.2862 | lr 0.00100 | ngrams/sec 35702.5 | eta 0h0m36s
| epoch 22 | step 2000/4071 | loss 7.2965 | lr 0.00100 | ngrams/sec 35652.0 | eta 0h0m29s
| epoch 22 | step 2500/4071 | loss 7.3367 | lr 0.00100 | ngrams/sec 35662.1 | eta 0h0m22s
| epoch 22 | step 3000/4071 | loss 7.3556 | lr 0.00100 | ngrams/sec 35696.2 | eta 0h0m15s
| epoch 22 | step 3500/4071 | loss 7.3473 | lr 0.00100 | ngrams/sec 35696.4 | eta 0h0m8s
| epoch 22 | step 4000/4071 | loss 7.3925 | lr 0.00100 | ngrams/sec 35629.5 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1146.28it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 59.91s | valid loss  6.05 | valid ppl   425.44
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 7.1889 | lr 0.00100 | ngrams/sec 25270.3 | eta 0h1m12s
| epoch 23 | step 1000/4071 | loss 7.2426 | lr 0.00100 | ngrams/sec 35663.3 | eta 0h0m44s
| epoch 23 | step 1500/4071 | loss 7.2687 | lr 0.00100 | ngrams/sec 35631.3 | eta 0h0m36s
| epoch 23 | step 2000/4071 | loss 7.2807 | lr 0.00100 | ngrams/sec 35629.2 | eta 0h0m29s
| epoch 23 | step 2500/4071 | loss 7.2921 | lr 0.00100 | ngrams/sec 35686.4 | eta 0h0m22s
| epoch 23 | step 3000/4071 | loss 7.3326 | lr 0.00100 | ngrams/sec 35606.2 | eta 0h0m15s
| epoch 23 | step 3500/4071 | loss 7.3354 | lr 0.00100 | ngrams/sec 35648.8 | eta 0h0m8s
| epoch 23 | step 4000/4071 | loss 7.3593 | lr 0.00100 | ngrams/sec 35660.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1152.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 59.99s | valid loss  6.04 | valid ppl   419.04
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 24 | step 500/4071 | loss 7.1632 | lr 0.00100 | ngrams/sec 24971.6 | eta 0h1m13s
| epoch 24 | step 1000/4071 | loss 7.1960 | lr 0.00100 | ngrams/sec 35554.9 | eta 0h0m44s
| epoch 24 | step 1500/4071 | loss 7.2322 | lr 0.00100 | ngrams/sec 35620.1 | eta 0h0m36s
| epoch 24 | step 2000/4071 | loss 7.2545 | lr 0.00100 | ngrams/sec 35568.1 | eta 0h0m29s
| epoch 24 | step 2500/4071 | loss 7.2776 | lr 0.00100 | ngrams/sec 35568.7 | eta 0h0m22s
| epoch 24 | step 3000/4071 | loss 7.3013 | lr 0.00100 | ngrams/sec 35524.7 | eta 0h0m15s
| epoch 24 | step 3500/4071 | loss 7.3192 | lr 0.00100 | ngrams/sec 35227.1 | eta 0h0m8s
| epoch 24 | step 4000/4071 | loss 7.3241 | lr 0.00100 | ngrams/sec 35529.8 | eta 0

 28%|██▊       | 117/417 [00:00<00:00, 1160.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 60.18s | valid loss  6.04 | valid ppl   418.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 25 | step 500/4071 | loss 7.1364 | lr 0.00100 | ngrams/sec 24943.3 | eta 0h1m13s
| epoch 25 | step 1000/4071 | loss 7.1820 | lr 0.00100 | ngrams/sec 35579.5 | eta 0h0m44s
| epoch 25 | step 1500/4071 | loss 7.2079 | lr 0.00100 | ngrams/sec 35614.2 | eta 0h0m36s
| epoch 25 | step 2000/4071 | loss 7.2283 | lr 0.00100 | ngrams/sec 35581.1 | eta 0h0m29s
| epoch 25 | step 2500/4071 | loss 7.2551 | lr 0.00100 | ngrams/sec 35530.5 | eta 0h0m22s
| epoch 25 | step 3000/4071 | loss 7.2829 | lr 0.00100 | ngrams/sec 35519.2 | eta 0h0m15s
| epoch 25 | step 3500/4071 | loss 7.2933 | lr 0.00100 | ngrams/sec 35543.1 | eta 0h0m8s
| epoch 25 | step 4000/4071 | loss 7.3033 | lr 0.00100 | ngrams/sec 35548.4 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1129.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 60.13s | valid loss  6.03 | valid ppl   416.97
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/4071 | loss 7.1200 | lr 0.00100 | ngrams/sec 24914.7 | eta 0h1m13s
| epoch 26 | step 1000/4071 | loss 7.1579 | lr 0.00100 | ngrams/sec 35592.8 | eta 0h0m44s
| epoch 26 | step 1500/4071 | loss 7.1878 | lr 0.00100 | ngrams/sec 35624.1 | eta 0h0m36s
| epoch 26 | step 2000/4071 | loss 7.2063 | lr 0.00100 | ngrams/sec 35565.6 | eta 0h0m29s
| epoch 26 | step 2500/4071 | loss 7.2163 | lr 0.00100 | ngrams/sec 35552.5 | eta 0h0m22s
| epoch 26 | step 3000/4071 | loss 7.2351 | lr 0.00100 | ngrams/sec 35555.6 | eta 0h0m15s
| epoch 26 | step 3500/4071 | loss 7.2589 | lr 0.00100 | ngrams/sec 35542.7 | eta 0h0m8s
| epoch 26 | step 4000/4071 | loss 7.2817 | lr 0.00100 | ngrams/sec 35561.2 | eta 0

 28%|██▊       | 117/417 [00:00<00:00, 1147.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 60.12s | valid loss  6.04 | valid ppl   418.37
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/4071 | loss 7.1004 | lr 0.00100 | ngrams/sec 25191.6 | eta 0h1m12s
| epoch 27 | step 1000/4071 | loss 7.1454 | lr 0.00100 | ngrams/sec 35530.9 | eta 0h0m44s
| epoch 27 | step 1500/4071 | loss 7.1579 | lr 0.00100 | ngrams/sec 35513.5 | eta 0h0m37s
| epoch 27 | step 2000/4071 | loss 7.1833 | lr 0.00100 | ngrams/sec 35532.5 | eta 0h0m29s
| epoch 27 | step 2500/4071 | loss 7.1961 | lr 0.00100 | ngrams/sec 35536.9 | eta 0h0m22s
| epoch 27 | step 3000/4071 | loss 7.2280 | lr 0.00100 | ngrams/sec 35553.9 | eta 0h0m15s
| epoch 27 | step 3500/4071 | loss 7.2410 | lr 0.00100 | ngrams/sec 35541.8 | eta 0h0m8s
| epoch 27 | step 4000/4071 | loss 7.2655 | lr 0.00100 | ngrams/sec 35494.8 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1145.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 60.17s | valid loss  6.03 | valid ppl   414.37
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 7.0656 | lr 0.00100 | ngrams/sec 24921.8 | eta 0h1m13s
| epoch 28 | step 1000/4071 | loss 7.1202 | lr 0.00100 | ngrams/sec 35645.8 | eta 0h0m44s
| epoch 28 | step 1500/4071 | loss 7.1235 | lr 0.00100 | ngrams/sec 35611.8 | eta 0h0m36s
| epoch 28 | step 2000/4071 | loss 7.1748 | lr 0.00100 | ngrams/sec 35534.3 | eta 0h0m29s
| epoch 28 | step 2500/4071 | loss 7.1885 | lr 0.00100 | ngrams/sec 35617.8 | eta 0h0m22s
| epoch 28 | step 3000/4071 | loss 7.2143 | lr 0.00100 | ngrams/sec 35599.7 | eta 0h0m15s
| epoch 28 | step 3500/4071 | loss 7.2520 | lr 0.00100 | ngrams/sec 35603.9 | eta 0h0m8s
| epoch 28 | step 4000/4071 | loss 7.2369 | lr 0.00100 | ngrams/sec 35598.3 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1146.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 60.05s | valid loss  6.02 | valid ppl   412.75
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 29 | step 500/4071 | loss 7.0513 | lr 0.00100 | ngrams/sec 24973.6 | eta 0h1m13s
| epoch 29 | step 1000/4071 | loss 7.0991 | lr 0.00100 | ngrams/sec 35608.6 | eta 0h0m44s
| epoch 29 | step 1500/4071 | loss 7.1285 | lr 0.00100 | ngrams/sec 35571.7 | eta 0h0m37s
| epoch 29 | step 2000/4071 | loss 7.1481 | lr 0.00100 | ngrams/sec 35593.4 | eta 0h0m29s
| epoch 29 | step 2500/4071 | loss 7.1704 | lr 0.00100 | ngrams/sec 35564.7 | eta 0h0m22s
| epoch 29 | step 3000/4071 | loss 7.1967 | lr 0.00100 | ngrams/sec 35658.8 | eta 0h0m15s
| epoch 29 | step 3500/4071 | loss 7.2193 | lr 0.00100 | ngrams/sec 35554.3 | eta 0h0m8s
| epoch 29 | step 4000/4071 | loss 7.2149 | lr 0.00100 | ngrams/sec 35623.7 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1178.26it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 60.06s | valid loss  6.02 | valid ppl   412.82
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 7.0243 | lr 0.00100 | ngrams/sec 25224.5 | eta 0h1m12s
| epoch 30 | step 1000/4071 | loss 7.0679 | lr 0.00100 | ngrams/sec 35663.4 | eta 0h0m44s
| epoch 30 | step 1500/4071 | loss 7.1095 | lr 0.00100 | ngrams/sec 35601.3 | eta 0h0m36s
| epoch 30 | step 2000/4071 | loss 7.1253 | lr 0.00100 | ngrams/sec 35632.0 | eta 0h0m29s
| epoch 30 | step 2500/4071 | loss 7.1601 | lr 0.00100 | ngrams/sec 35544.3 | eta 0h0m22s
| epoch 30 | step 3000/4071 | loss 7.1734 | lr 0.00100 | ngrams/sec 35540.1 | eta 0h0m15s
| epoch 30 | step 3500/4071 | loss 7.1830 | lr 0.00100 | ngrams/sec 35613.4 | eta 0h0m8s
| epoch 30 | step 4000/4071 | loss 7.2040 | lr 0.00100 | ngrams/sec 35582.8 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1164.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 60.06s | valid loss  6.03 | valid ppl   414.60
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 7.0047 | lr 0.00100 | ngrams/sec 25210.7 | eta 0h1m12s
| epoch 31 | step 1000/4071 | loss 7.0443 | lr 0.00100 | ngrams/sec 35610.6 | eta 0h0m44s
| epoch 31 | step 1500/4071 | loss 7.0791 | lr 0.00100 | ngrams/sec 35607.9 | eta 0h0m36s
| epoch 31 | step 2000/4071 | loss 7.1038 | lr 0.00100 | ngrams/sec 35621.0 | eta 0h0m29s
| epoch 31 | step 2500/4071 | loss 7.1377 | lr 0.00100 | ngrams/sec 35621.2 | eta 0h0m22s
| epoch 31 | step 3000/4071 | loss 7.1650 | lr 0.00100 | ngrams/sec 35617.8 | eta 0h0m15s
| epoch 31 | step 3500/4071 | loss 7.1703 | lr 0.00100 | ngrams/sec 35533.4 | eta 0h0m8s
| epoch 31 | step 4000/4071 | loss 7.1768 | lr 0.00100 | ngrams/sec 35551.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1154.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 60.07s | valid loss  6.03 | valid ppl   416.11
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 6.9904 | lr 0.00100 | ngrams/sec 25218.9 | eta 0h1m12s
| epoch 32 | step 1000/4071 | loss 7.0343 | lr 0.00100 | ngrams/sec 35606.5 | eta 0h0m44s
| epoch 32 | step 1500/4071 | loss 7.0706 | lr 0.00100 | ngrams/sec 35577.6 | eta 0h0m36s
| epoch 32 | step 2000/4071 | loss 7.0833 | lr 0.00100 | ngrams/sec 35576.8 | eta 0h0m29s
| epoch 32 | step 2500/4071 | loss 7.1149 | lr 0.00100 | ngrams/sec 35588.9 | eta 0h0m22s
| epoch 32 | step 3000/4071 | loss 7.1429 | lr 0.00100 | ngrams/sec 35581.9 | eta 0h0m15s
| epoch 32 | step 3500/4071 | loss 7.1440 | lr 0.00100 | ngrams/sec 35571.3 | eta 0h0m8s
| epoch 32 | step 4000/4071 | loss 7.1836 | lr 0.00100 | ngrams/sec 35483.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1177.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 60.10s | valid loss  6.03 | valid ppl   416.20
-----------------------------------------------------------------------------------------
| epoch 33 | step 500/4071 | loss 6.9864 | lr 0.00100 | ngrams/sec 25188.1 | eta 0h1m12s
| epoch 33 | step 1000/4071 | loss 7.0181 | lr 0.00100 | ngrams/sec 35565.5 | eta 0h0m44s
| epoch 33 | step 1500/4071 | loss 7.0445 | lr 0.00100 | ngrams/sec 35524.7 | eta 0h0m37s
| epoch 33 | step 2000/4071 | loss 7.0799 | lr 0.00100 | ngrams/sec 35574.3 | eta 0h0m29s
| epoch 33 | step 2500/4071 | loss 7.0868 | lr 0.00100 | ngrams/sec 35572.4 | eta 0h0m22s
| epoch 33 | step 3000/4071 | loss 7.1188 | lr 0.00100 | ngrams/sec 35542.7 | eta 0h0m15s
| epoch 33 | step 3500/4071 | loss 7.1318 | lr 0.00100 | ngrams/sec 35576.9 | eta 0h0m8s
| epoch 33 | step 4000/4071 | loss 7.1489 | lr 0.00100 | ngrams/sec 35484.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1157.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 60.15s | valid loss  6.03 | valid ppl   417.19
-----------------------------------------------------------------------------------------
| epoch 34 | step 500/4071 | loss 6.9754 | lr 0.00100 | ngrams/sec 25200.6 | eta 0h1m12s
| epoch 34 | step 1000/4071 | loss 6.9972 | lr 0.00100 | ngrams/sec 35583.1 | eta 0h0m44s
| epoch 34 | step 1500/4071 | loss 7.0341 | lr 0.00100 | ngrams/sec 35583.2 | eta 0h0m36s
| epoch 34 | step 2000/4071 | loss 7.0580 | lr 0.00100 | ngrams/sec 35557.4 | eta 0h0m29s
| epoch 34 | step 2500/4071 | loss 7.0983 | lr 0.00100 | ngrams/sec 35503.8 | eta 0h0m22s
| epoch 34 | step 3000/4071 | loss 7.1169 | lr 0.00100 | ngrams/sec 35522.7 | eta 0h0m15s
| epoch 34 | step 3500/4071 | loss 7.1235 | lr 0.00100 | ngrams/sec 35574.7 | eta 0h0m8s
| epoch 34 | step 4000/4071 | loss 7.1361 | lr 0.00100 | ngrams/sec 35527.5 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1161.11it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 276.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 60.14s | valid loss  6.03 | valid ppl   415.48
-----------------------------------------------------------------------------------------
| epoch 35 | step 500/4071 | loss 6.9272 | lr 0.00100 | ngrams/sec 25179.8 | eta 0h1m12s
| epoch 35 | step 1000/4071 | loss 6.9876 | lr 0.00100 | ngrams/sec 35509.2 | eta 0h0m44s
| epoch 35 | step 1500/4071 | loss 7.0144 | lr 0.00100 | ngrams/sec 35603.1 | eta 0h0m36s
| epoch 35 | step 2000/4071 | loss 7.0452 | lr 0.00100 | ngrams/sec 35515.7 | eta 0h0m29s
| epoch 35 | step 2500/4071 | loss 7.0601 | lr 0.00100 | ngrams/sec 35522.8 | eta 0h0m22s
| epoch 35 | step 3000/4071 | loss 7.0861 | lr 0.00100 | ngrams/sec 35532.7 | eta 0h0m15s
| epoch 35 | step 3500/4071 | loss 7.1125 | lr 0.00100 | ngrams/sec 35509.6 | eta 0h0m8s
| epoch 35 | step 4000/4071 | loss 7.1099 | lr 0.00100 | ngrams/sec 35563.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1150.86it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 277.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 60.16s | valid loss  6.02 | valid ppl   412.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/4071 | loss 6.9366 | lr 0.00100 | ngrams/sec 24912.2 | eta 0h1m13s
| epoch 36 | step 1000/4071 | loss 6.9690 | lr 0.00100 | ngrams/sec 35630.7 | eta 0h0m44s
| epoch 36 | step 1500/4071 | loss 7.0074 | lr 0.00100 | ngrams/sec 35518.3 | eta 0h0m37s
| epoch 36 | step 2000/4071 | loss 7.0182 | lr 0.00100 | ngrams/sec 35583.9 | eta 0h0m29s
| epoch 36 | step 2500/4071 | loss 7.0572 | lr 0.00100 | ngrams/sec 35574.7 | eta 0h0m22s
| epoch 36 | step 3000/4071 | loss 7.0731 | lr 0.00100 | ngrams/sec 35612.6 | eta 0h0m15s
| epoch 36 | step 3500/4071 | loss 7.0863 | lr 0.00100 | ngrams/sec 35538.4 | eta 0h0m8s
| epoch 36 | step 4000/4071 | loss 7.1065 | lr 0.00100 | ngrams/sec 35585.3 | eta 0

 28%|██▊       | 118/417 [00:00<00:00, 1151.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 60.10s | valid loss  6.03 | valid ppl   416.36
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 6.9164 | lr 0.00100 | ngrams/sec 25202.0 | eta 0h1m12s
| epoch 37 | step 1000/4071 | loss 6.9672 | lr 0.00100 | ngrams/sec 35571.6 | eta 0h0m44s
| epoch 37 | step 1500/4071 | loss 6.9845 | lr 0.00100 | ngrams/sec 35603.4 | eta 0h0m36s
| epoch 37 | step 2000/4071 | loss 7.0069 | lr 0.00100 | ngrams/sec 35591.7 | eta 0h0m29s
| epoch 37 | step 2500/4071 | loss 7.0466 | lr 0.00100 | ngrams/sec 35611.6 | eta 0h0m22s
| epoch 37 | step 3000/4071 | loss 7.0520 | lr 0.00100 | ngrams/sec 35646.9 | eta 0h0m15s
| epoch 37 | step 3500/4071 | loss 7.0756 | lr 0.00100 | ngrams/sec 35592.2 | eta 0h0m8s
| epoch 37 | step 4000/4071 | loss 7.0863 | lr 0.00100 | ngrams/sec 35637.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1144.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 60.04s | valid loss  6.04 | valid ppl   418.27
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 6.9010 | lr 0.00100 | ngrams/sec 25230.5 | eta 0h1m12s
| epoch 38 | step 1000/4071 | loss 6.9317 | lr 0.00100 | ngrams/sec 35622.8 | eta 0h0m44s
| epoch 38 | step 1500/4071 | loss 6.9727 | lr 0.00100 | ngrams/sec 35665.7 | eta 0h0m36s
| epoch 38 | step 2000/4071 | loss 7.0033 | lr 0.00100 | ngrams/sec 35574.9 | eta 0h0m29s
| epoch 38 | step 2500/4071 | loss 7.0370 | lr 0.00100 | ngrams/sec 35664.2 | eta 0h0m22s
| epoch 38 | step 3000/4071 | loss 7.0393 | lr 0.00100 | ngrams/sec 35683.4 | eta 0h0m15s
| epoch 38 | step 3500/4071 | loss 7.0469 | lr 0.00100 | ngrams/sec 35696.6 | eta 0h0m8s
| epoch 38 | step 4000/4071 | loss 7.0694 | lr 0.00100 | ngrams/sec 35673.0 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1140.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 59.97s | valid loss  6.04 | valid ppl   418.73
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/4071 | loss 6.8832 | lr 0.00100 | ngrams/sec 25278.7 | eta 0h1m12s
| epoch 39 | step 1000/4071 | loss 6.9279 | lr 0.00100 | ngrams/sec 35625.5 | eta 0h0m44s
| epoch 39 | step 1500/4071 | loss 6.9594 | lr 0.00100 | ngrams/sec 35648.8 | eta 0h0m36s
| epoch 39 | step 2000/4071 | loss 6.9908 | lr 0.00100 | ngrams/sec 35650.9 | eta 0h0m29s
| epoch 39 | step 2500/4071 | loss 7.0107 | lr 0.00100 | ngrams/sec 35659.1 | eta 0h0m22s
| epoch 39 | step 3000/4071 | loss 7.0200 | lr 0.00100 | ngrams/sec 35620.9 | eta 0h0m15s
| epoch 39 | step 3500/4071 | loss 7.0489 | lr 0.00100 | ngrams/sec 35640.3 | eta 0h0m8s
| epoch 39 | step 4000/4071 | loss 7.0584 | lr 0.00100 | ngrams/sec 35736.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1169.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 59.96s | valid loss  6.03 | valid ppl   417.64
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/4071 | loss 6.8671 | lr 0.00100 | ngrams/sec 25303.5 | eta 0h1m12s
| epoch 40 | step 1000/4071 | loss 6.9104 | lr 0.00100 | ngrams/sec 35696.0 | eta 0h0m44s
| epoch 40 | step 1500/4071 | loss 6.9490 | lr 0.00100 | ngrams/sec 35695.0 | eta 0h0m36s
| epoch 40 | step 2000/4071 | loss 6.9834 | lr 0.00100 | ngrams/sec 35684.1 | eta 0h0m29s
| epoch 40 | step 2500/4071 | loss 6.9840 | lr 0.00100 | ngrams/sec 35719.8 | eta 0h0m22s
| epoch 40 | step 3000/4071 | loss 7.0118 | lr 0.00100 | ngrams/sec 35722.7 | eta 0h0m15s
| epoch 40 | step 3500/4071 | loss 7.0208 | lr 0.00100 | ngrams/sec 35645.4 | eta 0h0m8s
| epoch 40 | step 4000/4071 | loss 7.0657 | lr 0.00100 | ngrams/sec 35682.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1175.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 59.90s | valid loss  6.03 | valid ppl   417.08
-----------------------------------------------------------------------------------------
| epoch 41 | step 500/4071 | loss 6.8631 | lr 0.00100 | ngrams/sec 25290.3 | eta 0h1m12s
| epoch 41 | step 1000/4071 | loss 6.9039 | lr 0.00100 | ngrams/sec 35660.5 | eta 0h0m44s
| epoch 41 | step 1500/4071 | loss 6.9285 | lr 0.00100 | ngrams/sec 35681.7 | eta 0h0m36s
| epoch 41 | step 2000/4071 | loss 6.9512 | lr 0.00100 | ngrams/sec 35672.2 | eta 0h0m29s
| epoch 41 | step 2500/4071 | loss 6.9768 | lr 0.00100 | ngrams/sec 35664.2 | eta 0h0m22s
| epoch 41 | step 3000/4071 | loss 7.0013 | lr 0.00100 | ngrams/sec 35681.4 | eta 0h0m15s
| epoch 41 | step 3500/4071 | loss 7.0060 | lr 0.00100 | ngrams/sec 35671.5 | eta 0h0m8s
| epoch 41 | step 4000/4071 | loss 7.0453 | lr 0.00100 | ngrams/sec 35648.2 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1172.02it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 59.94s | valid loss  6.03 | valid ppl   417.11
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 6.8520 | lr 0.00100 | ngrams/sec 25269.3 | eta 0h1m12s
| epoch 42 | step 1000/4071 | loss 6.8932 | lr 0.00100 | ngrams/sec 35685.5 | eta 0h0m44s
| epoch 42 | step 1500/4071 | loss 6.9263 | lr 0.00100 | ngrams/sec 35633.6 | eta 0h0m36s
| epoch 42 | step 2000/4071 | loss 6.9387 | lr 0.00100 | ngrams/sec 35652.9 | eta 0h0m29s
| epoch 42 | step 2500/4071 | loss 6.9632 | lr 0.00100 | ngrams/sec 35702.4 | eta 0h0m22s
| epoch 42 | step 3000/4071 | loss 6.9773 | lr 0.00100 | ngrams/sec 35618.7 | eta 0h0m15s
| epoch 42 | step 3500/4071 | loss 7.0015 | lr 0.00100 | ngrams/sec 35651.7 | eta 0h0m8s
| epoch 42 | step 4000/4071 | loss 7.0258 | lr 0.00100 | ngrams/sec 35629.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1147.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 59.96s | valid loss  6.04 | valid ppl   420.17
-----------------------------------------------------------------------------------------
| epoch 43 | step 500/4071 | loss 6.8306 | lr 0.00100 | ngrams/sec 25270.3 | eta 0h1m12s
| epoch 43 | step 1000/4071 | loss 6.8797 | lr 0.00100 | ngrams/sec 35703.0 | eta 0h0m44s
| epoch 43 | step 1500/4071 | loss 6.9117 | lr 0.00100 | ngrams/sec 35696.8 | eta 0h0m36s
| epoch 43 | step 2000/4071 | loss 6.9367 | lr 0.00100 | ngrams/sec 35669.7 | eta 0h0m29s
| epoch 43 | step 2500/4071 | loss 6.9579 | lr 0.00100 | ngrams/sec 35636.3 | eta 0h0m22s
| epoch 43 | step 3000/4071 | loss 6.9720 | lr 0.00100 | ngrams/sec 35651.6 | eta 0h0m15s
| epoch 43 | step 3500/4071 | loss 6.9897 | lr 0.00100 | ngrams/sec 35676.4 | eta 0h0m8s
| epoch 43 | step 4000/4071 | loss 7.0120 | lr 0.00100 | ngrams/sec 35663.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1173.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 59.94s | valid loss  6.04 | valid ppl   421.26
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 6.8305 | lr 0.00100 | ngrams/sec 25292.8 | eta 0h1m12s
| epoch 44 | step 1000/4071 | loss 6.8576 | lr 0.00100 | ngrams/sec 35680.4 | eta 0h0m44s
| epoch 44 | step 1500/4071 | loss 6.8978 | lr 0.00100 | ngrams/sec 35668.8 | eta 0h0m36s
| epoch 44 | step 2000/4071 | loss 6.9135 | lr 0.00100 | ngrams/sec 35653.4 | eta 0h0m29s
| epoch 44 | step 2500/4071 | loss 6.9455 | lr 0.00100 | ngrams/sec 35657.7 | eta 0h0m22s
| epoch 44 | step 3000/4071 | loss 6.9564 | lr 0.00100 | ngrams/sec 35661.7 | eta 0h0m15s
| epoch 44 | step 3500/4071 | loss 6.9807 | lr 0.00100 | ngrams/sec 35706.3 | eta 0h0m8s
| epoch 44 | step 4000/4071 | loss 6.9947 | lr 0.00100 | ngrams/sec 35674.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1143.05it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 59.94s | valid loss  6.04 | valid ppl   421.19
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 6.8242 | lr 0.00100 | ngrams/sec 25282.0 | eta 0h1m12s
| epoch 45 | step 1000/4071 | loss 6.8628 | lr 0.00100 | ngrams/sec 35650.9 | eta 0h0m44s
| epoch 45 | step 1500/4071 | loss 6.8840 | lr 0.00100 | ngrams/sec 35623.6 | eta 0h0m36s
| epoch 45 | step 2000/4071 | loss 6.9096 | lr 0.00100 | ngrams/sec 35637.6 | eta 0h0m29s
| epoch 45 | step 2500/4071 | loss 6.9291 | lr 0.00100 | ngrams/sec 35639.6 | eta 0h0m22s
| epoch 45 | step 3000/4071 | loss 6.9537 | lr 0.00100 | ngrams/sec 35640.2 | eta 0h0m15s
| epoch 45 | step 3500/4071 | loss 6.9611 | lr 0.00100 | ngrams/sec 35622.4 | eta 0h0m8s
| epoch 45 | step 4000/4071 | loss 6.9727 | lr 0.00100 | ngrams/sec 35636.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1157.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 59.98s | valid loss  6.04 | valid ppl   421.19
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 6.8023 | lr 0.00100 | ngrams/sec 25287.7 | eta 0h1m12s
| epoch 46 | step 1000/4071 | loss 6.8424 | lr 0.00100 | ngrams/sec 35660.8 | eta 0h0m44s
| epoch 46 | step 1500/4071 | loss 6.8661 | lr 0.00100 | ngrams/sec 35707.0 | eta 0h0m36s
| epoch 46 | step 2000/4071 | loss 6.9002 | lr 0.00100 | ngrams/sec 35706.7 | eta 0h0m29s
| epoch 46 | step 2500/4071 | loss 6.9118 | lr 0.00100 | ngrams/sec 35719.7 | eta 0h0m22s
| epoch 46 | step 3000/4071 | loss 6.9472 | lr 0.00100 | ngrams/sec 35720.7 | eta 0h0m15s
| epoch 46 | step 3500/4071 | loss 6.9579 | lr 0.00100 | ngrams/sec 35743.0 | eta 0h0m8s
| epoch 46 | step 4000/4071 | loss 6.9622 | lr 0.00100 | ngrams/sec 35267.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1159.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 59.95s | valid loss  6.05 | valid ppl   424.86
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 6.8144 | lr 0.00100 | ngrams/sec 25293.5 | eta 0h1m12s
| epoch 47 | step 1000/4071 | loss 6.8262 | lr 0.00100 | ngrams/sec 35762.9 | eta 0h0m43s
| epoch 47 | step 1500/4071 | loss 6.8660 | lr 0.00100 | ngrams/sec 35734.4 | eta 0h0m36s
| epoch 47 | step 2000/4071 | loss 6.8965 | lr 0.00100 | ngrams/sec 35785.1 | eta 0h0m29s
| epoch 47 | step 2500/4071 | loss 6.9157 | lr 0.00100 | ngrams/sec 35746.0 | eta 0h0m22s
| epoch 47 | step 3000/4071 | loss 6.9314 | lr 0.00100 | ngrams/sec 35715.6 | eta 0h0m15s
| epoch 47 | step 3500/4071 | loss 6.9369 | lr 0.00100 | ngrams/sec 35720.6 | eta 0h0m8s
| epoch 47 | step 4000/4071 | loss 6.9622 | lr 0.00100 | ngrams/sec 35862.6 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1148.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 59.80s | valid loss  6.06 | valid ppl   426.90
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 6.7848 | lr 0.00100 | ngrams/sec 25355.8 | eta 0h1m12s
| epoch 48 | step 1000/4071 | loss 6.8305 | lr 0.00100 | ngrams/sec 35819.0 | eta 0h0m43s
| epoch 48 | step 1500/4071 | loss 6.8464 | lr 0.00100 | ngrams/sec 35815.5 | eta 0h0m36s
| epoch 48 | step 2000/4071 | loss 6.8781 | lr 0.00100 | ngrams/sec 35792.2 | eta 0h0m29s
| epoch 48 | step 2500/4071 | loss 6.8899 | lr 0.00100 | ngrams/sec 35755.6 | eta 0h0m22s
| epoch 48 | step 3000/4071 | loss 6.9237 | lr 0.00100 | ngrams/sec 35792.9 | eta 0h0m15s
| epoch 48 | step 3500/4071 | loss 6.9428 | lr 0.00100 | ngrams/sec 35765.6 | eta 0h0m8s
| epoch 48 | step 4000/4071 | loss 6.9406 | lr 0.00100 | ngrams/sec 35756.0 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1168.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 59.74s | valid loss  6.06 | valid ppl   428.47
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 6.7593 | lr 0.00100 | ngrams/sec 25358.8 | eta 0h1m12s
| epoch 49 | step 1000/4071 | loss 6.8079 | lr 0.00100 | ngrams/sec 35780.0 | eta 0h0m43s
| epoch 49 | step 1500/4071 | loss 6.8484 | lr 0.00100 | ngrams/sec 35798.6 | eta 0h0m36s
| epoch 49 | step 2000/4071 | loss 6.8592 | lr 0.00100 | ngrams/sec 35803.0 | eta 0h0m29s
| epoch 49 | step 2500/4071 | loss 6.8859 | lr 0.00100 | ngrams/sec 35818.8 | eta 0h0m22s
| epoch 49 | step 3000/4071 | loss 6.9089 | lr 0.00100 | ngrams/sec 35715.7 | eta 0h0m15s
| epoch 49 | step 3500/4071 | loss 6.9204 | lr 0.00100 | ngrams/sec 35781.8 | eta 0h0m8s
| epoch 49 | step 4000/4071 | loss 6.9288 | lr 0.00100 | ngrams/sec 35806.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1146.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 59.75s | valid loss  6.06 | valid ppl   429.09
-----------------------------------------------------------------------------------------
| epoch 50 | step 500/4071 | loss 6.7701 | lr 0.00100 | ngrams/sec 25365.7 | eta 0h1m12s
| epoch 50 | step 1000/4071 | loss 6.7905 | lr 0.00100 | ngrams/sec 35720.2 | eta 0h0m44s
| epoch 50 | step 1500/4071 | loss 6.8377 | lr 0.00100 | ngrams/sec 35796.5 | eta 0h0m36s
| epoch 50 | step 2000/4071 | loss 6.8543 | lr 0.00100 | ngrams/sec 35806.8 | eta 0h0m29s
| epoch 50 | step 2500/4071 | loss 6.8857 | lr 0.00100 | ngrams/sec 35753.2 | eta 0h0m22s
| epoch 50 | step 3000/4071 | loss 6.8879 | lr 0.00100 | ngrams/sec 35768.0 | eta 0h0m15s
| epoch 50 | step 3500/4071 | loss 6.9079 | lr 0.00100 | ngrams/sec 35718.8 | eta 0h0m8s
| epoch 50 | step 4000/4071 | loss 6.9342 | lr 0.00100 | ngrams/sec 35679.2 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1173.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 59.79s | valid loss  6.07 | valid ppl   431.01
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 6.7626 | lr 0.00100 | ngrams/sec 25323.6 | eta 0h1m12s
| epoch 51 | step 1000/4071 | loss 6.8006 | lr 0.00100 | ngrams/sec 35742.0 | eta 0h0m43s
| epoch 51 | step 1500/4071 | loss 6.8156 | lr 0.00100 | ngrams/sec 35741.9 | eta 0h0m36s
| epoch 51 | step 2000/4071 | loss 6.8538 | lr 0.00100 | ngrams/sec 35737.3 | eta 0h0m29s
| epoch 51 | step 2500/4071 | loss 6.8786 | lr 0.00100 | ngrams/sec 35740.6 | eta 0h0m22s
| epoch 51 | step 3000/4071 | loss 6.8715 | lr 0.00100 | ngrams/sec 35757.8 | eta 0h0m15s
| epoch 51 | step 3500/4071 | loss 6.8960 | lr 0.00100 | ngrams/sec 35716.3 | eta 0h0m8s
| epoch 51 | step 4000/4071 | loss 6.9192 | lr 0.00100 | ngrams/sec 35728.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1172.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.11it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 59.83s | valid loss  6.06 | valid ppl   429.57
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 6.7443 | lr 0.00100 | ngrams/sec 25328.1 | eta 0h1m12s
| epoch 52 | step 1000/4071 | loss 6.7817 | lr 0.00100 | ngrams/sec 35703.2 | eta 0h0m44s
| epoch 52 | step 1500/4071 | loss 6.8014 | lr 0.00100 | ngrams/sec 35748.0 | eta 0h0m36s
| epoch 52 | step 2000/4071 | loss 6.8270 | lr 0.00100 | ngrams/sec 35719.0 | eta 0h0m29s
| epoch 52 | step 2500/4071 | loss 6.8479 | lr 0.00100 | ngrams/sec 35719.6 | eta 0h0m22s
| epoch 52 | step 3000/4071 | loss 6.8770 | lr 0.00100 | ngrams/sec 35682.5 | eta 0h0m15s
| epoch 52 | step 3500/4071 | loss 6.9067 | lr 0.00100 | ngrams/sec 35656.7 | eta 0h0m8s
| epoch 52 | step 4000/4071 | loss 6.9157 | lr 0.00100 | ngrams/sec 35652.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1169.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 59.88s | valid loss  6.08 | valid ppl   436.84
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 6.7218 | lr 0.00100 | ngrams/sec 25287.7 | eta 0h1m12s
| epoch 53 | step 1000/4071 | loss 6.7582 | lr 0.00100 | ngrams/sec 35742.2 | eta 0h0m43s
| epoch 53 | step 1500/4071 | loss 6.8040 | lr 0.00100 | ngrams/sec 35673.5 | eta 0h0m36s
| epoch 53 | step 2000/4071 | loss 6.8346 | lr 0.00100 | ngrams/sec 35672.1 | eta 0h0m29s
| epoch 53 | step 2500/4071 | loss 6.8626 | lr 0.00100 | ngrams/sec 35694.2 | eta 0h0m22s
| epoch 53 | step 3000/4071 | loss 6.8734 | lr 0.00100 | ngrams/sec 35708.3 | eta 0h0m15s
| epoch 53 | step 3500/4071 | loss 6.8848 | lr 0.00100 | ngrams/sec 35715.4 | eta 0h0m8s
| epoch 53 | step 4000/4071 | loss 6.8994 | lr 0.00100 | ngrams/sec 35691.0 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1165.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 278.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 59.90s | valid loss  6.07 | valid ppl   430.57
-----------------------------------------------------------------------------------------
| epoch 54 | step 500/4071 | loss 6.7272 | lr 0.00100 | ngrams/sec 25298.6 | eta 0h1m12s
| epoch 54 | step 1000/4071 | loss 6.7525 | lr 0.00100 | ngrams/sec 35604.8 | eta 0h0m44s
| epoch 54 | step 1500/4071 | loss 6.7762 | lr 0.00100 | ngrams/sec 35655.5 | eta 0h0m36s
| epoch 54 | step 2000/4071 | loss 6.8260 | lr 0.00100 | ngrams/sec 35676.1 | eta 0h0m29s
| epoch 54 | step 2500/4071 | loss 6.8334 | lr 0.00100 | ngrams/sec 35665.8 | eta 0h0m22s
| epoch 54 | step 3000/4071 | loss 6.8618 | lr 0.00100 | ngrams/sec 35628.1 | eta 0h0m15s
| epoch 54 | step 3500/4071 | loss 6.8625 | lr 0.00100 | ngrams/sec 35666.8 | eta 0h0m8s
| epoch 54 | step 4000/4071 | loss 6.8965 | lr 0.00100 | ngrams/sec 35615.7 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1174.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 59.97s | valid loss  6.08 | valid ppl   435.47
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 6.7174 | lr 0.00100 | ngrams/sec 25276.0 | eta 0h1m12s
| epoch 55 | step 1000/4071 | loss 6.7623 | lr 0.00100 | ngrams/sec 35695.6 | eta 0h0m44s
| epoch 55 | step 1500/4071 | loss 6.7907 | lr 0.00100 | ngrams/sec 35590.7 | eta 0h0m36s
| epoch 55 | step 2000/4071 | loss 6.8033 | lr 0.00100 | ngrams/sec 35693.0 | eta 0h0m29s
| epoch 55 | step 2500/4071 | loss 6.8404 | lr 0.00100 | ngrams/sec 35668.9 | eta 0h0m22s
| epoch 55 | step 3000/4071 | loss 6.8479 | lr 0.00100 | ngrams/sec 35652.2 | eta 0h0m15s
| epoch 55 | step 3500/4071 | loss 6.8663 | lr 0.00100 | ngrams/sec 35685.8 | eta 0h0m8s
| epoch 55 | step 4000/4071 | loss 6.8756 | lr 0.00100 | ngrams/sec 35692.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1139.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 59.93s | valid loss  6.08 | valid ppl   436.42
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 6.7142 | lr 0.00100 | ngrams/sec 25300.8 | eta 0h1m12s
| epoch 56 | step 1000/4071 | loss 6.7579 | lr 0.00100 | ngrams/sec 35723.3 | eta 0h0m44s
| epoch 56 | step 1500/4071 | loss 6.7852 | lr 0.00100 | ngrams/sec 35698.8 | eta 0h0m36s
| epoch 56 | step 2000/4071 | loss 6.8070 | lr 0.00100 | ngrams/sec 35722.9 | eta 0h0m29s
| epoch 56 | step 2500/4071 | loss 6.8234 | lr 0.00100 | ngrams/sec 35736.6 | eta 0h0m22s
| epoch 56 | step 3000/4071 | loss 6.8376 | lr 0.00100 | ngrams/sec 35674.2 | eta 0h0m15s
| epoch 56 | step 3500/4071 | loss 6.8649 | lr 0.00100 | ngrams/sec 35697.6 | eta 0h0m8s
| epoch 56 | step 4000/4071 | loss 6.8589 | lr 0.00100 | ngrams/sec 35719.7 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1161.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 59.88s | valid loss  6.08 | valid ppl   439.13
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 6.6976 | lr 0.00100 | ngrams/sec 25328.9 | eta 0h1m12s
| epoch 57 | step 1000/4071 | loss 6.7454 | lr 0.00100 | ngrams/sec 35732.8 | eta 0h0m44s
| epoch 57 | step 1500/4071 | loss 6.7716 | lr 0.00100 | ngrams/sec 35772.2 | eta 0h0m36s
| epoch 57 | step 2000/4071 | loss 6.8008 | lr 0.00100 | ngrams/sec 35784.7 | eta 0h0m29s
| epoch 57 | step 2500/4071 | loss 6.8159 | lr 0.00100 | ngrams/sec 35750.1 | eta 0h0m22s
| epoch 57 | step 3000/4071 | loss 6.8300 | lr 0.00100 | ngrams/sec 35633.9 | eta 0h0m15s
| epoch 57 | step 3500/4071 | loss 6.8502 | lr 0.00100 | ngrams/sec 35700.2 | eta 0h0m8s
| epoch 57 | step 4000/4071 | loss 6.8554 | lr 0.00100 | ngrams/sec 35741.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1145.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 59.83s | valid loss  6.08 | valid ppl   438.20
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/4071 | loss 6.6950 | lr 0.00100 | ngrams/sec 25322.1 | eta 0h1m12s
| epoch 58 | step 1000/4071 | loss 6.7410 | lr 0.00100 | ngrams/sec 35710.6 | eta 0h0m44s
| epoch 58 | step 1500/4071 | loss 6.7683 | lr 0.00100 | ngrams/sec 35758.1 | eta 0h0m36s
| epoch 58 | step 2000/4071 | loss 6.7808 | lr 0.00100 | ngrams/sec 35761.2 | eta 0h0m29s
| epoch 58 | step 2500/4071 | loss 6.8204 | lr 0.00100 | ngrams/sec 35778.4 | eta 0h0m22s
| epoch 58 | step 3000/4071 | loss 6.8168 | lr 0.00100 | ngrams/sec 35712.1 | eta 0h0m15s
| epoch 58 | step 3500/4071 | loss 6.8394 | lr 0.00100 | ngrams/sec 35733.1 | eta 0h0m8s
| epoch 58 | step 4000/4071 | loss 6.8621 | lr 0.00100 | ngrams/sec 35755.7 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1154.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 59.82s | valid loss  6.09 | valid ppl   439.80
-----------------------------------------------------------------------------------------
| epoch 59 | step 500/4071 | loss 6.6913 | lr 0.00100 | ngrams/sec 25329.7 | eta 0h1m12s
| epoch 59 | step 1000/4071 | loss 6.7229 | lr 0.00100 | ngrams/sec 35741.8 | eta 0h0m43s
| epoch 59 | step 1500/4071 | loss 6.7552 | lr 0.00100 | ngrams/sec 35714.0 | eta 0h0m36s
| epoch 59 | step 2000/4071 | loss 6.7677 | lr 0.00100 | ngrams/sec 35698.2 | eta 0h0m29s
| epoch 59 | step 2500/4071 | loss 6.7997 | lr 0.00100 | ngrams/sec 35767.6 | eta 0h0m22s
| epoch 59 | step 3000/4071 | loss 6.8192 | lr 0.00100 | ngrams/sec 35790.3 | eta 0h0m15s
| epoch 59 | step 3500/4071 | loss 6.8273 | lr 0.00100 | ngrams/sec 35718.2 | eta 0h0m8s
| epoch 59 | step 4000/4071 | loss 6.8552 | lr 0.00100 | ngrams/sec 35755.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1155.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 59.82s | valid loss  6.08 | valid ppl   437.66
-----------------------------------------------------------------------------------------
| epoch 60 | step 500/4071 | loss 6.6883 | lr 0.00100 | ngrams/sec 25301.4 | eta 0h1m12s
| epoch 60 | step 1000/4071 | loss 6.7211 | lr 0.00100 | ngrams/sec 35713.9 | eta 0h0m44s
| epoch 60 | step 1500/4071 | loss 6.7556 | lr 0.00100 | ngrams/sec 35705.4 | eta 0h0m36s
| epoch 60 | step 2000/4071 | loss 6.7763 | lr 0.00100 | ngrams/sec 35770.8 | eta 0h0m29s
| epoch 60 | step 2500/4071 | loss 6.7981 | lr 0.00100 | ngrams/sec 35716.6 | eta 0h0m22s
| epoch 60 | step 3000/4071 | loss 6.8122 | lr 0.00100 | ngrams/sec 35762.1 | eta 0h0m15s
| epoch 60 | step 3500/4071 | loss 6.8337 | lr 0.00100 | ngrams/sec 35736.8 | eta 0h0m8s
| epoch 60 | step 4000/4071 | loss 6.8294 | lr 0.00100 | ngrams/sec 35732.0 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1146.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 279.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 59.84s | valid loss  6.10 | valid ppl   446.65
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 6.6674 | lr 0.00100 | ngrams/sec 25307.0 | eta 0h1m12s
| epoch 61 | step 1000/4071 | loss 6.7031 | lr 0.00100 | ngrams/sec 35734.9 | eta 0h0m44s
| epoch 61 | step 1500/4071 | loss 6.7434 | lr 0.00100 | ngrams/sec 35755.9 | eta 0h0m36s
| epoch 61 | step 2000/4071 | loss 6.7551 | lr 0.00100 | ngrams/sec 35715.8 | eta 0h0m29s
| epoch 61 | step 2500/4071 | loss 6.7863 | lr 0.00100 | ngrams/sec 35727.4 | eta 0h0m22s
| epoch 61 | step 3000/4071 | loss 6.7970 | lr 0.00100 | ngrams/sec 35737.0 | eta 0h0m15s
| epoch 61 | step 3500/4071 | loss 6.8253 | lr 0.00100 | ngrams/sec 35769.2 | eta 0h0m8s
| epoch 61 | step 4000/4071 | loss 6.8278 | lr 0.00100 | ngrams/sec 35801.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1148.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 59.82s | valid loss  6.10 | valid ppl   445.30
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 6.6607 | lr 0.00100 | ngrams/sec 25355.1 | eta 0h1m12s
| epoch 62 | step 1000/4071 | loss 6.6996 | lr 0.00100 | ngrams/sec 35797.3 | eta 0h0m43s
| epoch 62 | step 1500/4071 | loss 6.7270 | lr 0.00100 | ngrams/sec 35791.3 | eta 0h0m36s
| epoch 62 | step 2000/4071 | loss 6.7445 | lr 0.00100 | ngrams/sec 35764.3 | eta 0h0m29s
| epoch 62 | step 2500/4071 | loss 6.7748 | lr 0.00100 | ngrams/sec 35785.0 | eta 0h0m22s
| epoch 62 | step 3000/4071 | loss 6.8208 | lr 0.00100 | ngrams/sec 35763.2 | eta 0h0m15s
| epoch 62 | step 3500/4071 | loss 6.8118 | lr 0.00100 | ngrams/sec 35756.4 | eta 0h0m8s
| epoch 62 | step 4000/4071 | loss 6.8207 | lr 0.00100 | ngrams/sec 35832.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1137.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 59.75s | valid loss  6.10 | valid ppl   447.34
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/4071 | loss 6.6571 | lr 0.00100 | ngrams/sec 25373.6 | eta 0h1m12s
| epoch 63 | step 1000/4071 | loss 6.7062 | lr 0.00100 | ngrams/sec 35809.2 | eta 0h0m43s
| epoch 63 | step 1500/4071 | loss 6.7252 | lr 0.00100 | ngrams/sec 35803.4 | eta 0h0m36s
| epoch 63 | step 2000/4071 | loss 6.7458 | lr 0.00100 | ngrams/sec 35851.7 | eta 0h0m29s
| epoch 63 | step 2500/4071 | loss 6.7515 | lr 0.00100 | ngrams/sec 35811.4 | eta 0h0m22s
| epoch 63 | step 3000/4071 | loss 6.7866 | lr 0.00100 | ngrams/sec 35797.4 | eta 0h0m15s
| epoch 63 | step 3500/4071 | loss 6.7970 | lr 0.00100 | ngrams/sec 35865.5 | eta 0h0m8s
| epoch 63 | step 4000/4071 | loss 6.8212 | lr 0.00100 | ngrams/sec 35854.2 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1144.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 59.68s | valid loss  6.10 | valid ppl   444.85
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 6.6526 | lr 0.00100 | ngrams/sec 25433.7 | eta 0h1m11s
| epoch 64 | step 1000/4071 | loss 6.7013 | lr 0.00100 | ngrams/sec 35796.1 | eta 0h0m43s
| epoch 64 | step 1500/4071 | loss 6.7143 | lr 0.00100 | ngrams/sec 35869.1 | eta 0h0m36s
| epoch 64 | step 2000/4071 | loss 6.7288 | lr 0.00100 | ngrams/sec 35883.4 | eta 0h0m29s
| epoch 64 | step 2500/4071 | loss 6.7658 | lr 0.00100 | ngrams/sec 35917.9 | eta 0h0m22s
| epoch 64 | step 3000/4071 | loss 6.7714 | lr 0.00100 | ngrams/sec 35908.2 | eta 0h0m15s
| epoch 64 | step 3500/4071 | loss 6.7960 | lr 0.00100 | ngrams/sec 35922.7 | eta 0h0m8s
| epoch 64 | step 4000/4071 | loss 6.8233 | lr 0.00100 | ngrams/sec 35906.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1144.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 59.58s | valid loss  6.11 | valid ppl   450.63
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/4071 | loss 6.6619 | lr 0.00100 | ngrams/sec 25426.6 | eta 0h1m11s
| epoch 65 | step 1000/4071 | loss 6.6914 | lr 0.00100 | ngrams/sec 35888.1 | eta 0h0m43s
| epoch 65 | step 1500/4071 | loss 6.7159 | lr 0.00100 | ngrams/sec 35906.8 | eta 0h0m36s
| epoch 65 | step 2000/4071 | loss 6.7430 | lr 0.00100 | ngrams/sec 35928.8 | eta 0h0m29s
| epoch 65 | step 2500/4071 | loss 6.7370 | lr 0.00100 | ngrams/sec 35913.0 | eta 0h0m22s
| epoch 65 | step 3000/4071 | loss 6.7729 | lr 0.00100 | ngrams/sec 35940.4 | eta 0h0m15s
| epoch 65 | step 3500/4071 | loss 6.7864 | lr 0.00100 | ngrams/sec 35946.2 | eta 0h0m8s
| epoch 65 | step 4000/4071 | loss 6.7959 | lr 0.00100 | ngrams/sec 35926.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1135.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 59.52s | valid loss  6.11 | valid ppl   451.94
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 6.6478 | lr 0.00100 | ngrams/sec 25505.7 | eta 0h1m11s
| epoch 66 | step 1000/4071 | loss 6.6773 | lr 0.00100 | ngrams/sec 35925.1 | eta 0h0m43s
| epoch 66 | step 1500/4071 | loss 6.7096 | lr 0.00100 | ngrams/sec 35883.9 | eta 0h0m36s
| epoch 66 | step 2000/4071 | loss 6.7314 | lr 0.00100 | ngrams/sec 35940.2 | eta 0h0m29s
| epoch 66 | step 2500/4071 | loss 6.7516 | lr 0.00100 | ngrams/sec 35873.7 | eta 0h0m22s
| epoch 66 | step 3000/4071 | loss 6.7666 | lr 0.00100 | ngrams/sec 35899.6 | eta 0h0m15s
| epoch 66 | step 3500/4071 | loss 6.7917 | lr 0.00100 | ngrams/sec 35894.3 | eta 0h0m8s
| epoch 66 | step 4000/4071 | loss 6.8065 | lr 0.00100 | ngrams/sec 35917.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1151.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 59.54s | valid loss  6.11 | valid ppl   450.87
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 6.6410 | lr 0.00100 | ngrams/sec 25440.1 | eta 0h1m11s
| epoch 67 | step 1000/4071 | loss 6.6631 | lr 0.00100 | ngrams/sec 35918.2 | eta 0h0m43s
| epoch 67 | step 1500/4071 | loss 6.6938 | lr 0.00100 | ngrams/sec 35915.1 | eta 0h0m36s
| epoch 67 | step 2000/4071 | loss 6.7185 | lr 0.00100 | ngrams/sec 35896.8 | eta 0h0m29s
| epoch 67 | step 2500/4071 | loss 6.7376 | lr 0.00100 | ngrams/sec 35882.3 | eta 0h0m22s
| epoch 67 | step 3000/4071 | loss 6.7520 | lr 0.00100 | ngrams/sec 35755.9 | eta 0h0m15s
| epoch 67 | step 3500/4071 | loss 6.7800 | lr 0.00100 | ngrams/sec 35896.1 | eta 0h0m8s
| epoch 67 | step 4000/4071 | loss 6.7920 | lr 0.00100 | ngrams/sec 35902.8 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1167.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 59.58s | valid loss  6.11 | valid ppl   451.09
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 6.6422 | lr 0.00100 | ngrams/sec 25413.5 | eta 0h1m11s
| epoch 68 | step 1000/4071 | loss 6.6487 | lr 0.00100 | ngrams/sec 35906.0 | eta 0h0m43s
| epoch 68 | step 1500/4071 | loss 6.7012 | lr 0.00100 | ngrams/sec 35914.0 | eta 0h0m36s
| epoch 68 | step 2000/4071 | loss 6.7303 | lr 0.00100 | ngrams/sec 35907.8 | eta 0h0m29s
| epoch 68 | step 2500/4071 | loss 6.7418 | lr 0.00100 | ngrams/sec 35910.6 | eta 0h0m22s
| epoch 68 | step 3000/4071 | loss 6.7516 | lr 0.00100 | ngrams/sec 35926.2 | eta 0h0m15s
| epoch 68 | step 3500/4071 | loss 6.7755 | lr 0.00100 | ngrams/sec 35842.5 | eta 0h0m8s
| epoch 68 | step 4000/4071 | loss 6.7728 | lr 0.00100 | ngrams/sec 35919.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1151.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 59.56s | valid loss  6.12 | valid ppl   455.80
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 6.6111 | lr 0.00100 | ngrams/sec 25434.7 | eta 0h1m11s
| epoch 69 | step 1000/4071 | loss 6.6512 | lr 0.00100 | ngrams/sec 35322.0 | eta 0h0m44s
| epoch 69 | step 1500/4071 | loss 6.6970 | lr 0.00100 | ngrams/sec 35929.6 | eta 0h0m36s
| epoch 69 | step 2000/4071 | loss 6.7057 | lr 0.00100 | ngrams/sec 35907.3 | eta 0h0m29s
| epoch 69 | step 2500/4071 | loss 6.7223 | lr 0.00100 | ngrams/sec 35916.1 | eta 0h0m22s
| epoch 69 | step 3000/4071 | loss 6.7617 | lr 0.00100 | ngrams/sec 35931.7 | eta 0h0m15s
| epoch 69 | step 3500/4071 | loss 6.7626 | lr 0.00100 | ngrams/sec 35937.5 | eta 0h0m8s
| epoch 69 | step 4000/4071 | loss 6.7741 | lr 0.00100 | ngrams/sec 35903.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1138.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 59.65s | valid loss  6.13 | valid ppl   457.93
-----------------------------------------------------------------------------------------
| epoch 70 | step 500/4071 | loss 6.6004 | lr 0.00100 | ngrams/sec 25483.3 | eta 0h1m11s
| epoch 70 | step 1000/4071 | loss 6.6589 | lr 0.00100 | ngrams/sec 35951.8 | eta 0h0m43s
| epoch 70 | step 1500/4071 | loss 6.6847 | lr 0.00100 | ngrams/sec 35978.0 | eta 0h0m36s
| epoch 70 | step 2000/4071 | loss 6.7125 | lr 0.00100 | ngrams/sec 35913.9 | eta 0h0m29s
| epoch 70 | step 2500/4071 | loss 6.7189 | lr 0.00100 | ngrams/sec 35909.3 | eta 0h0m22s
| epoch 70 | step 3000/4071 | loss 6.7294 | lr 0.00100 | ngrams/sec 35954.5 | eta 0h0m15s
| epoch 70 | step 3500/4071 | loss 6.7713 | lr 0.00100 | ngrams/sec 35969.0 | eta 0h0m8s
| epoch 70 | step 4000/4071 | loss 6.7734 | lr 0.00100 | ngrams/sec 35976.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1151.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 59.47s | valid loss  6.13 | valid ppl   459.99
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 6.5906 | lr 0.00100 | ngrams/sec 25493.9 | eta 0h1m11s
| epoch 71 | step 1000/4071 | loss 6.6552 | lr 0.00100 | ngrams/sec 35953.3 | eta 0h0m43s
| epoch 71 | step 1500/4071 | loss 6.6761 | lr 0.00100 | ngrams/sec 35980.0 | eta 0h0m36s
| epoch 71 | step 2000/4071 | loss 6.6922 | lr 0.00100 | ngrams/sec 35950.1 | eta 0h0m29s
| epoch 71 | step 2500/4071 | loss 6.7126 | lr 0.00100 | ngrams/sec 35968.5 | eta 0h0m22s
| epoch 71 | step 3000/4071 | loss 6.7451 | lr 0.00100 | ngrams/sec 35994.5 | eta 0h0m15s
| epoch 71 | step 3500/4071 | loss 6.7394 | lr 0.00100 | ngrams/sec 36009.7 | eta 0h0m8s
| epoch 71 | step 4000/4071 | loss 6.7636 | lr 0.00100 | ngrams/sec 35950.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1174.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 59.43s | valid loss  6.13 | valid ppl   460.09
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 6.5997 | lr 0.00100 | ngrams/sec 25518.1 | eta 0h1m11s
| epoch 72 | step 1000/4071 | loss 6.6279 | lr 0.00100 | ngrams/sec 35943.3 | eta 0h0m43s
| epoch 72 | step 1500/4071 | loss 6.6704 | lr 0.00100 | ngrams/sec 35950.8 | eta 0h0m36s
| epoch 72 | step 2000/4071 | loss 6.6922 | lr 0.00100 | ngrams/sec 35972.1 | eta 0h0m29s
| epoch 72 | step 2500/4071 | loss 6.6942 | lr 0.00100 | ngrams/sec 35989.1 | eta 0h0m22s
| epoch 72 | step 3000/4071 | loss 6.7405 | lr 0.00100 | ngrams/sec 35964.6 | eta 0h0m15s
| epoch 72 | step 3500/4071 | loss 6.7535 | lr 0.00100 | ngrams/sec 35976.4 | eta 0h0m8s
| epoch 72 | step 4000/4071 | loss 6.7587 | lr 0.00100 | ngrams/sec 35971.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1167.13it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 59.44s | valid loss  6.14 | valid ppl   463.35
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 6.6047 | lr 0.00100 | ngrams/sec 25503.0 | eta 0h1m11s
| epoch 73 | step 1000/4071 | loss 6.6368 | lr 0.00100 | ngrams/sec 35990.5 | eta 0h0m43s
| epoch 73 | step 1500/4071 | loss 6.6518 | lr 0.00100 | ngrams/sec 35951.0 | eta 0h0m36s
| epoch 73 | step 2000/4071 | loss 6.6779 | lr 0.00100 | ngrams/sec 35924.3 | eta 0h0m29s
| epoch 73 | step 2500/4071 | loss 6.6952 | lr 0.00100 | ngrams/sec 35944.8 | eta 0h0m22s
| epoch 73 | step 3000/4071 | loss 6.7313 | lr 0.00100 | ngrams/sec 35961.5 | eta 0h0m15s
| epoch 73 | step 3500/4071 | loss 6.7473 | lr 0.00100 | ngrams/sec 35942.2 | eta 0h0m8s
| epoch 73 | step 4000/4071 | loss 6.7456 | lr 0.00100 | ngrams/sec 35956.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1128.11it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 59.46s | valid loss  6.14 | valid ppl   465.63
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 6.5815 | lr 0.00100 | ngrams/sec 25469.2 | eta 0h1m11s
| epoch 74 | step 1000/4071 | loss 6.6291 | lr 0.00100 | ngrams/sec 35985.3 | eta 0h0m43s
| epoch 74 | step 1500/4071 | loss 6.6379 | lr 0.00100 | ngrams/sec 35925.5 | eta 0h0m36s
| epoch 74 | step 2000/4071 | loss 6.6956 | lr 0.00100 | ngrams/sec 35937.6 | eta 0h0m29s
| epoch 74 | step 2500/4071 | loss 6.6952 | lr 0.00100 | ngrams/sec 35936.9 | eta 0h0m22s
| epoch 74 | step 3000/4071 | loss 6.7078 | lr 0.00100 | ngrams/sec 35961.1 | eta 0h0m15s
| epoch 74 | step 3500/4071 | loss 6.7327 | lr 0.00100 | ngrams/sec 35952.4 | eta 0h0m8s
| epoch 74 | step 4000/4071 | loss 6.7416 | lr 0.00100 | ngrams/sec 35961.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1137.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 59.47s | valid loss  6.15 | valid ppl   467.34
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 6.5849 | lr 0.00100 | ngrams/sec 25493.7 | eta 0h1m11s
| epoch 75 | step 1000/4071 | loss 6.6192 | lr 0.00100 | ngrams/sec 35955.8 | eta 0h0m43s
| epoch 75 | step 1500/4071 | loss 6.6518 | lr 0.00100 | ngrams/sec 35959.4 | eta 0h0m36s
| epoch 75 | step 2000/4071 | loss 6.6678 | lr 0.00100 | ngrams/sec 35960.9 | eta 0h0m29s
| epoch 75 | step 2500/4071 | loss 6.7068 | lr 0.00100 | ngrams/sec 35937.3 | eta 0h0m22s
| epoch 75 | step 3000/4071 | loss 6.7258 | lr 0.00100 | ngrams/sec 35919.4 | eta 0h0m15s
| epoch 75 | step 3500/4071 | loss 6.7359 | lr 0.00100 | ngrams/sec 35979.0 | eta 0h0m8s
| epoch 75 | step 4000/4071 | loss 6.7373 | lr 0.00100 | ngrams/sec 35903.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1168.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 59.47s | valid loss  6.15 | valid ppl   469.24
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/4071 | loss 6.5785 | lr 0.00100 | ngrams/sec 25432.2 | eta 0h1m11s
| epoch 76 | step 1000/4071 | loss 6.6360 | lr 0.00100 | ngrams/sec 36026.5 | eta 0h0m43s
| epoch 76 | step 1500/4071 | loss 6.6430 | lr 0.00100 | ngrams/sec 35928.2 | eta 0h0m36s
| epoch 76 | step 2000/4071 | loss 6.6631 | lr 0.00100 | ngrams/sec 35963.0 | eta 0h0m29s
| epoch 76 | step 2500/4071 | loss 6.6972 | lr 0.00100 | ngrams/sec 35940.1 | eta 0h0m22s
| epoch 76 | step 3000/4071 | loss 6.7160 | lr 0.00100 | ngrams/sec 35891.3 | eta 0h0m15s
| epoch 76 | step 3500/4071 | loss 6.7181 | lr 0.00100 | ngrams/sec 35907.4 | eta 0h0m8s
| epoch 76 | step 4000/4071 | loss 6.7178 | lr 0.00100 | ngrams/sec 35768.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1125.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 59.54s | valid loss  6.15 | valid ppl   470.20
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 6.5830 | lr 0.00100 | ngrams/sec 25484.7 | eta 0h1m11s
| epoch 77 | step 1000/4071 | loss 6.6052 | lr 0.00100 | ngrams/sec 35924.9 | eta 0h0m43s
| epoch 77 | step 1500/4071 | loss 6.6333 | lr 0.00100 | ngrams/sec 35909.1 | eta 0h0m36s
| epoch 77 | step 2000/4071 | loss 6.6644 | lr 0.00100 | ngrams/sec 35905.1 | eta 0h0m29s
| epoch 77 | step 2500/4071 | loss 6.6798 | lr 0.00100 | ngrams/sec 35950.4 | eta 0h0m22s
| epoch 77 | step 3000/4071 | loss 6.7205 | lr 0.00100 | ngrams/sec 35914.2 | eta 0h0m15s
| epoch 77 | step 3500/4071 | loss 6.7180 | lr 0.00100 | ngrams/sec 35889.7 | eta 0h0m8s
| epoch 77 | step 4000/4071 | loss 6.7252 | lr 0.00100 | ngrams/sec 35850.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1147.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 59.54s | valid loss  6.15 | valid ppl   470.54
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/4071 | loss 6.5793 | lr 0.00100 | ngrams/sec 25442.7 | eta 0h1m11s
| epoch 78 | step 1000/4071 | loss 6.5973 | lr 0.00100 | ngrams/sec 35857.8 | eta 0h0m43s
| epoch 78 | step 1500/4071 | loss 6.6234 | lr 0.00100 | ngrams/sec 35855.1 | eta 0h0m36s
| epoch 78 | step 2000/4071 | loss 6.6702 | lr 0.00100 | ngrams/sec 35913.1 | eta 0h0m29s
| epoch 78 | step 2500/4071 | loss 6.6816 | lr 0.00100 | ngrams/sec 35905.2 | eta 0h0m22s
| epoch 78 | step 3000/4071 | loss 6.6873 | lr 0.00100 | ngrams/sec 35908.4 | eta 0h0m15s
| epoch 78 | step 3500/4071 | loss 6.7172 | lr 0.00100 | ngrams/sec 35880.1 | eta 0h0m8s
| epoch 78 | step 4000/4071 | loss 6.7253 | lr 0.00100 | ngrams/sec 35882.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1177.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 59.57s | valid loss  6.16 | valid ppl   473.95
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 6.5626 | lr 0.00100 | ngrams/sec 25452.1 | eta 0h1m11s
| epoch 79 | step 1000/4071 | loss 6.6062 | lr 0.00100 | ngrams/sec 35882.5 | eta 0h0m43s
| epoch 79 | step 1500/4071 | loss 6.6438 | lr 0.00100 | ngrams/sec 35879.7 | eta 0h0m36s
| epoch 79 | step 2000/4071 | loss 6.6525 | lr 0.00100 | ngrams/sec 35902.3 | eta 0h0m29s
| epoch 79 | step 2500/4071 | loss 6.6729 | lr 0.00100 | ngrams/sec 35932.8 | eta 0h0m22s
| epoch 79 | step 3000/4071 | loss 6.6883 | lr 0.00100 | ngrams/sec 35903.4 | eta 0h0m15s
| epoch 79 | step 3500/4071 | loss 6.7031 | lr 0.00100 | ngrams/sec 35962.3 | eta 0h0m8s
| epoch 79 | step 4000/4071 | loss 6.7230 | lr 0.00100 | ngrams/sec 35927.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1179.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 59.53s | valid loss  6.17 | valid ppl   476.38
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 6.5736 | lr 0.00100 | ngrams/sec 25453.0 | eta 0h1m11s
| epoch 80 | step 1000/4071 | loss 6.5914 | lr 0.00100 | ngrams/sec 35966.0 | eta 0h0m43s
| epoch 80 | step 1500/4071 | loss 6.6204 | lr 0.00100 | ngrams/sec 35922.9 | eta 0h0m36s
| epoch 80 | step 2000/4071 | loss 6.6490 | lr 0.00100 | ngrams/sec 35920.1 | eta 0h0m29s
| epoch 80 | step 2500/4071 | loss 6.6676 | lr 0.00100 | ngrams/sec 35947.6 | eta 0h0m22s
| epoch 80 | step 3000/4071 | loss 6.6999 | lr 0.00100 | ngrams/sec 35978.2 | eta 0h0m15s
| epoch 80 | step 3500/4071 | loss 6.7004 | lr 0.00100 | ngrams/sec 35935.1 | eta 0h0m8s
| epoch 80 | step 4000/4071 | loss 6.6964 | lr 0.00100 | ngrams/sec 35914.1 | eta 0h0m1s


 28%|██▊       | 117/417 [00:00<00:00, 1166.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 59.50s | valid loss  6.17 | valid ppl   479.25
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 6.5646 | lr 0.00100 | ngrams/sec 25486.3 | eta 0h1m11s
| epoch 81 | step 1000/4071 | loss 6.6063 | lr 0.00100 | ngrams/sec 35933.7 | eta 0h0m43s
| epoch 81 | step 1500/4071 | loss 6.6263 | lr 0.00100 | ngrams/sec 35980.4 | eta 0h0m36s
| epoch 81 | step 2000/4071 | loss 6.6269 | lr 0.00100 | ngrams/sec 35976.9 | eta 0h0m29s
| epoch 81 | step 2500/4071 | loss 6.6696 | lr 0.00100 | ngrams/sec 35969.9 | eta 0h0m22s
| epoch 81 | step 3000/4071 | loss 6.6892 | lr 0.00100 | ngrams/sec 35943.3 | eta 0h0m15s
| epoch 81 | step 3500/4071 | loss 6.6971 | lr 0.00100 | ngrams/sec 35942.1 | eta 0h0m8s
| epoch 81 | step 4000/4071 | loss 6.6976 | lr 0.00100 | ngrams/sec 35928.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1156.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 59.46s | valid loss  6.17 | valid ppl   475.94
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 6.5633 | lr 0.00100 | ngrams/sec 25496.4 | eta 0h1m11s
| epoch 82 | step 1000/4071 | loss 6.5833 | lr 0.00100 | ngrams/sec 35945.5 | eta 0h0m43s
| epoch 82 | step 1500/4071 | loss 6.6242 | lr 0.00100 | ngrams/sec 35894.9 | eta 0h0m36s
| epoch 82 | step 2000/4071 | loss 6.6362 | lr 0.00100 | ngrams/sec 35915.7 | eta 0h0m29s
| epoch 82 | step 2500/4071 | loss 6.6459 | lr 0.00100 | ngrams/sec 35951.6 | eta 0h0m22s
| epoch 82 | step 3000/4071 | loss 6.6626 | lr 0.00100 | ngrams/sec 35988.7 | eta 0h0m15s
| epoch 82 | step 3500/4071 | loss 6.6944 | lr 0.00100 | ngrams/sec 35973.3 | eta 0h0m8s
| epoch 82 | step 4000/4071 | loss 6.7050 | lr 0.00100 | ngrams/sec 35979.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1167.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 59.46s | valid loss  6.18 | valid ppl   483.50
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 6.5602 | lr 0.00100 | ngrams/sec 25516.6 | eta 0h1m11s
| epoch 83 | step 1000/4071 | loss 6.5627 | lr 0.00100 | ngrams/sec 35930.8 | eta 0h0m43s
| epoch 83 | step 1500/4071 | loss 6.6116 | lr 0.00100 | ngrams/sec 35987.0 | eta 0h0m36s
| epoch 83 | step 2000/4071 | loss 6.6416 | lr 0.00100 | ngrams/sec 35925.2 | eta 0h0m29s
| epoch 83 | step 2500/4071 | loss 6.6591 | lr 0.00100 | ngrams/sec 35987.4 | eta 0h0m22s
| epoch 83 | step 3000/4071 | loss 6.6729 | lr 0.00100 | ngrams/sec 35951.4 | eta 0h0m15s
| epoch 83 | step 3500/4071 | loss 6.6972 | lr 0.00100 | ngrams/sec 35915.4 | eta 0h0m8s
| epoch 83 | step 4000/4071 | loss 6.7172 | lr 0.00100 | ngrams/sec 35951.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1147.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 59.45s | valid loss  6.19 | valid ppl   485.50
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/4071 | loss 6.5506 | lr 0.00100 | ngrams/sec 25490.6 | eta 0h1m11s
| epoch 84 | step 1000/4071 | loss 6.5786 | lr 0.00100 | ngrams/sec 35919.4 | eta 0h0m43s
| epoch 84 | step 1500/4071 | loss 6.6105 | lr 0.00100 | ngrams/sec 35953.3 | eta 0h0m36s
| epoch 84 | step 2000/4071 | loss 6.6267 | lr 0.00100 | ngrams/sec 35942.1 | eta 0h0m29s
| epoch 84 | step 2500/4071 | loss 6.6495 | lr 0.00100 | ngrams/sec 35940.1 | eta 0h0m22s
| epoch 84 | step 3000/4071 | loss 6.6661 | lr 0.00100 | ngrams/sec 35902.5 | eta 0h0m15s
| epoch 84 | step 3500/4071 | loss 6.6780 | lr 0.00100 | ngrams/sec 35900.8 | eta 0h0m8s
| epoch 84 | step 4000/4071 | loss 6.6985 | lr 0.00100 | ngrams/sec 35932.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1153.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 59.50s | valid loss  6.18 | valid ppl   485.40
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 6.5404 | lr 0.00100 | ngrams/sec 25475.1 | eta 0h1m11s
| epoch 85 | step 1000/4071 | loss 6.5831 | lr 0.00100 | ngrams/sec 35924.9 | eta 0h0m43s
| epoch 85 | step 1500/4071 | loss 6.5982 | lr 0.00100 | ngrams/sec 35910.8 | eta 0h0m36s
| epoch 85 | step 2000/4071 | loss 6.6388 | lr 0.00100 | ngrams/sec 35931.6 | eta 0h0m29s
| epoch 85 | step 2500/4071 | loss 6.6502 | lr 0.00100 | ngrams/sec 35935.9 | eta 0h0m22s
| epoch 85 | step 3000/4071 | loss 6.6578 | lr 0.00100 | ngrams/sec 35848.3 | eta 0h0m15s
| epoch 85 | step 3500/4071 | loss 6.6801 | lr 0.00100 | ngrams/sec 35865.5 | eta 0h0m8s
| epoch 85 | step 4000/4071 | loss 6.6983 | lr 0.00100 | ngrams/sec 35842.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1151.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 59.56s | valid loss  6.19 | valid ppl   486.67
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 6.5148 | lr 0.00100 | ngrams/sec 25397.7 | eta 0h1m11s
| epoch 86 | step 1000/4071 | loss 6.5943 | lr 0.00100 | ngrams/sec 35877.2 | eta 0h0m43s
| epoch 86 | step 1500/4071 | loss 6.6070 | lr 0.00100 | ngrams/sec 35868.4 | eta 0h0m36s
| epoch 86 | step 2000/4071 | loss 6.6223 | lr 0.00100 | ngrams/sec 35861.2 | eta 0h0m29s
| epoch 86 | step 2500/4071 | loss 6.6509 | lr 0.00100 | ngrams/sec 35880.2 | eta 0h0m22s
| epoch 86 | step 3000/4071 | loss 6.6675 | lr 0.00100 | ngrams/sec 35876.2 | eta 0h0m15s
| epoch 86 | step 3500/4071 | loss 6.6739 | lr 0.00100 | ngrams/sec 35834.5 | eta 0h0m8s
| epoch 86 | step 4000/4071 | loss 6.6814 | lr 0.00100 | ngrams/sec 35822.3 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1151.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 59.63s | valid loss  6.19 | valid ppl   488.95
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 6.5386 | lr 0.00100 | ngrams/sec 25399.3 | eta 0h1m11s
| epoch 87 | step 1000/4071 | loss 6.5532 | lr 0.00100 | ngrams/sec 35805.8 | eta 0h0m43s
| epoch 87 | step 1500/4071 | loss 6.5954 | lr 0.00100 | ngrams/sec 35858.2 | eta 0h0m36s
| epoch 87 | step 2000/4071 | loss 6.6137 | lr 0.00100 | ngrams/sec 35817.7 | eta 0h0m29s
| epoch 87 | step 2500/4071 | loss 6.6413 | lr 0.00100 | ngrams/sec 35802.8 | eta 0h0m22s
| epoch 87 | step 3000/4071 | loss 6.6627 | lr 0.00100 | ngrams/sec 35800.3 | eta 0h0m15s
| epoch 87 | step 3500/4071 | loss 6.6690 | lr 0.00100 | ngrams/sec 35815.9 | eta 0h0m8s
| epoch 87 | step 4000/4071 | loss 6.6786 | lr 0.00100 | ngrams/sec 35826.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1139.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 59.69s | valid loss  6.20 | valid ppl   493.37
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 6.5277 | lr 0.00100 | ngrams/sec 25369.2 | eta 0h1m12s
| epoch 88 | step 1000/4071 | loss 6.5476 | lr 0.00100 | ngrams/sec 35801.1 | eta 0h0m43s
| epoch 88 | step 1500/4071 | loss 6.5930 | lr 0.00100 | ngrams/sec 35834.9 | eta 0h0m36s
| epoch 88 | step 2000/4071 | loss 6.6027 | lr 0.00100 | ngrams/sec 35849.4 | eta 0h0m29s
| epoch 88 | step 2500/4071 | loss 6.6264 | lr 0.00100 | ngrams/sec 35817.9 | eta 0h0m22s
| epoch 88 | step 3000/4071 | loss 6.6554 | lr 0.00100 | ngrams/sec 35870.1 | eta 0h0m15s
| epoch 88 | step 3500/4071 | loss 6.6724 | lr 0.00100 | ngrams/sec 35843.0 | eta 0h0m8s
| epoch 88 | step 4000/4071 | loss 6.6748 | lr 0.00100 | ngrams/sec 35850.2 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1144.00it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 59.67s | valid loss  6.20 | valid ppl   493.24
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 6.5325 | lr 0.00100 | ngrams/sec 25412.0 | eta 0h1m11s
| epoch 89 | step 1000/4071 | loss 6.5538 | lr 0.00100 | ngrams/sec 35851.2 | eta 0h0m43s
| epoch 89 | step 1500/4071 | loss 6.5844 | lr 0.00100 | ngrams/sec 35893.1 | eta 0h0m36s
| epoch 89 | step 2000/4071 | loss 6.6039 | lr 0.00100 | ngrams/sec 35878.1 | eta 0h0m29s
| epoch 89 | step 2500/4071 | loss 6.6275 | lr 0.00100 | ngrams/sec 35850.3 | eta 0h0m22s
| epoch 89 | step 3000/4071 | loss 6.6469 | lr 0.00100 | ngrams/sec 35890.9 | eta 0h0m15s
| epoch 89 | step 3500/4071 | loss 6.6517 | lr 0.00100 | ngrams/sec 35876.6 | eta 0h0m8s
| epoch 89 | step 4000/4071 | loss 6.6739 | lr 0.00100 | ngrams/sec 35837.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1160.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 59.61s | valid loss  6.20 | valid ppl   494.33
-----------------------------------------------------------------------------------------
| epoch 90 | step 500/4071 | loss 6.5139 | lr 0.00100 | ngrams/sec 25426.2 | eta 0h1m11s
| epoch 90 | step 1000/4071 | loss 6.5530 | lr 0.00100 | ngrams/sec 35915.8 | eta 0h0m43s
| epoch 90 | step 1500/4071 | loss 6.5913 | lr 0.00100 | ngrams/sec 35881.7 | eta 0h0m36s
| epoch 90 | step 2000/4071 | loss 6.5912 | lr 0.00100 | ngrams/sec 35914.7 | eta 0h0m29s
| epoch 90 | step 2500/4071 | loss 6.6275 | lr 0.00100 | ngrams/sec 35890.9 | eta 0h0m22s
| epoch 90 | step 3000/4071 | loss 6.6352 | lr 0.00100 | ngrams/sec 35862.4 | eta 0h0m15s
| epoch 90 | step 3500/4071 | loss 6.6624 | lr 0.00100 | ngrams/sec 35900.3 | eta 0h0m8s
| epoch 90 | step 4000/4071 | loss 6.6597 | lr 0.00100 | ngrams/sec 35909.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1134.93it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 59.56s | valid loss  6.21 | valid ppl   496.73
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 6.5102 | lr 0.00100 | ngrams/sec 25461.9 | eta 0h1m11s
| epoch 91 | step 1000/4071 | loss 6.5546 | lr 0.00100 | ngrams/sec 35906.9 | eta 0h0m43s
| epoch 91 | step 1500/4071 | loss 6.5931 | lr 0.00100 | ngrams/sec 35853.1 | eta 0h0m36s
| epoch 91 | step 2000/4071 | loss 6.5929 | lr 0.00100 | ngrams/sec 35877.5 | eta 0h0m29s
| epoch 91 | step 2500/4071 | loss 6.6107 | lr 0.00100 | ngrams/sec 35911.0 | eta 0h0m22s
| epoch 91 | step 3000/4071 | loss 6.6247 | lr 0.00100 | ngrams/sec 35883.6 | eta 0h0m15s
| epoch 91 | step 3500/4071 | loss 6.6603 | lr 0.00100 | ngrams/sec 35932.0 | eta 0h0m8s
| epoch 91 | step 4000/4071 | loss 6.6671 | lr 0.00100 | ngrams/sec 35931.0 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1176.51it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 59.55s | valid loss  6.22 | valid ppl   501.78
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 6.5089 | lr 0.00100 | ngrams/sec 25455.6 | eta 0h1m11s
| epoch 92 | step 1000/4071 | loss 6.5561 | lr 0.00100 | ngrams/sec 35906.7 | eta 0h0m43s
| epoch 92 | step 1500/4071 | loss 6.5665 | lr 0.00100 | ngrams/sec 35926.8 | eta 0h0m36s
| epoch 92 | step 2000/4071 | loss 6.5866 | lr 0.00100 | ngrams/sec 35840.7 | eta 0h0m29s
| epoch 92 | step 2500/4071 | loss 6.6180 | lr 0.00100 | ngrams/sec 35897.1 | eta 0h0m22s
| epoch 92 | step 3000/4071 | loss 6.6194 | lr 0.00100 | ngrams/sec 35952.0 | eta 0h0m15s
| epoch 92 | step 3500/4071 | loss 6.6403 | lr 0.00100 | ngrams/sec 35915.6 | eta 0h0m8s
| epoch 92 | step 4000/4071 | loss 6.6548 | lr 0.00100 | ngrams/sec 35895.7 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1174.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 59.54s | valid loss  6.21 | valid ppl   500.08
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 6.5021 | lr 0.00100 | ngrams/sec 25455.8 | eta 0h1m11s
| epoch 93 | step 1000/4071 | loss 6.5382 | lr 0.00100 | ngrams/sec 35936.3 | eta 0h0m43s
| epoch 93 | step 1500/4071 | loss 6.5641 | lr 0.00100 | ngrams/sec 35933.6 | eta 0h0m36s
| epoch 93 | step 2000/4071 | loss 6.5922 | lr 0.00100 | ngrams/sec 35924.3 | eta 0h0m29s
| epoch 93 | step 2500/4071 | loss 6.6202 | lr 0.00100 | ngrams/sec 35921.5 | eta 0h0m22s
| epoch 93 | step 3000/4071 | loss 6.6177 | lr 0.00100 | ngrams/sec 35942.7 | eta 0h0m15s
| epoch 93 | step 3500/4071 | loss 6.6360 | lr 0.00100 | ngrams/sec 35906.3 | eta 0h0m8s
| epoch 93 | step 4000/4071 | loss 6.6505 | lr 0.00100 | ngrams/sec 35953.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1173.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 283.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 59.50s | valid loss  6.23 | valid ppl   505.34
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 6.4946 | lr 0.00100 | ngrams/sec 25483.9 | eta 0h1m11s
| epoch 94 | step 1000/4071 | loss 6.5364 | lr 0.00100 | ngrams/sec 35924.1 | eta 0h0m43s
| epoch 94 | step 1500/4071 | loss 6.5741 | lr 0.00100 | ngrams/sec 35963.7 | eta 0h0m36s
| epoch 94 | step 2000/4071 | loss 6.5951 | lr 0.00100 | ngrams/sec 35913.5 | eta 0h0m29s
| epoch 94 | step 2500/4071 | loss 6.5963 | lr 0.00100 | ngrams/sec 35942.7 | eta 0h0m22s
| epoch 94 | step 3000/4071 | loss 6.6203 | lr 0.00100 | ngrams/sec 35898.5 | eta 0h0m15s
| epoch 94 | step 3500/4071 | loss 6.6402 | lr 0.00100 | ngrams/sec 35858.5 | eta 0h0m8s
| epoch 94 | step 4000/4071 | loss 6.6641 | lr 0.00100 | ngrams/sec 35878.8 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1142.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 282.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 59.53s | valid loss  6.22 | valid ppl   504.44
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 6.5011 | lr 0.00100 | ngrams/sec 25423.1 | eta 0h1m11s
| epoch 95 | step 1000/4071 | loss 6.5325 | lr 0.00100 | ngrams/sec 35834.0 | eta 0h0m43s
| epoch 95 | step 1500/4071 | loss 6.5642 | lr 0.00100 | ngrams/sec 35151.1 | eta 0h0m37s
| epoch 95 | step 2000/4071 | loss 6.5987 | lr 0.00100 | ngrams/sec 35789.7 | eta 0h0m29s
| epoch 95 | step 2500/4071 | loss 6.6005 | lr 0.00100 | ngrams/sec 35824.5 | eta 0h0m22s
| epoch 95 | step 3000/4071 | loss 6.6316 | lr 0.00100 | ngrams/sec 35857.1 | eta 0h0m15s
| epoch 95 | step 3500/4071 | loss 6.6224 | lr 0.00100 | ngrams/sec 35826.0 | eta 0h0m8s
| epoch 95 | step 4000/4071 | loss 6.6643 | lr 0.00100 | ngrams/sec 35852.1 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1152.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 59.80s | valid loss  6.23 | valid ppl   505.24
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 6.5134 | lr 0.00100 | ngrams/sec 25434.9 | eta 0h1m11s
| epoch 96 | step 1000/4071 | loss 6.5246 | lr 0.00100 | ngrams/sec 35824.7 | eta 0h0m43s
| epoch 96 | step 1500/4071 | loss 6.5596 | lr 0.00100 | ngrams/sec 35826.5 | eta 0h0m36s
| epoch 96 | step 2000/4071 | loss 6.5870 | lr 0.00100 | ngrams/sec 35806.6 | eta 0h0m29s
| epoch 96 | step 2500/4071 | loss 6.5960 | lr 0.00100 | ngrams/sec 35795.6 | eta 0h0m22s
| epoch 96 | step 3000/4071 | loss 6.6181 | lr 0.00100 | ngrams/sec 35809.5 | eta 0h0m15s
| epoch 96 | step 3500/4071 | loss 6.6342 | lr 0.00100 | ngrams/sec 35808.0 | eta 0h0m8s
| epoch 96 | step 4000/4071 | loss 6.6425 | lr 0.00100 | ngrams/sec 35788.2 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1125.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 59.69s | valid loss  6.23 | valid ppl   507.92
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 6.4952 | lr 0.00100 | ngrams/sec 25362.2 | eta 0h1m12s
| epoch 97 | step 1000/4071 | loss 6.5273 | lr 0.00100 | ngrams/sec 35815.1 | eta 0h0m43s
| epoch 97 | step 1500/4071 | loss 6.5496 | lr 0.00100 | ngrams/sec 35803.8 | eta 0h0m36s
| epoch 97 | step 2000/4071 | loss 6.5677 | lr 0.00100 | ngrams/sec 35714.3 | eta 0h0m29s
| epoch 97 | step 2500/4071 | loss 6.6045 | lr 0.00100 | ngrams/sec 35749.4 | eta 0h0m22s
| epoch 97 | step 3000/4071 | loss 6.6027 | lr 0.00100 | ngrams/sec 35786.4 | eta 0h0m15s
| epoch 97 | step 3500/4071 | loss 6.6271 | lr 0.00100 | ngrams/sec 35747.7 | eta 0h0m8s
| epoch 97 | step 4000/4071 | loss 6.6415 | lr 0.00100 | ngrams/sec 35715.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1139.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 59.78s | valid loss  6.24 | valid ppl   514.79
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 6.4853 | lr 0.00100 | ngrams/sec 25319.5 | eta 0h1m12s
| epoch 98 | step 1000/4071 | loss 6.5174 | lr 0.00100 | ngrams/sec 35760.6 | eta 0h0m43s
| epoch 98 | step 1500/4071 | loss 6.5527 | lr 0.00100 | ngrams/sec 35747.5 | eta 0h0m36s
| epoch 98 | step 2000/4071 | loss 6.5757 | lr 0.00100 | ngrams/sec 35708.3 | eta 0h0m29s
| epoch 98 | step 2500/4071 | loss 6.5982 | lr 0.00100 | ngrams/sec 35691.9 | eta 0h0m22s
| epoch 98 | step 3000/4071 | loss 6.6063 | lr 0.00100 | ngrams/sec 35784.6 | eta 0h0m15s
| epoch 98 | step 3500/4071 | loss 6.6218 | lr 0.00100 | ngrams/sec 35690.2 | eta 0h0m8s
| epoch 98 | step 4000/4071 | loss 6.6499 | lr 0.00100 | ngrams/sec 35700.7 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1141.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.83it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 59.84s | valid loss  6.24 | valid ppl   513.87
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 6.4925 | lr 0.00100 | ngrams/sec 25354.0 | eta 0h1m12s
| epoch 99 | step 1000/4071 | loss 6.5222 | lr 0.00100 | ngrams/sec 35787.3 | eta 0h0m43s
| epoch 99 | step 1500/4071 | loss 6.5441 | lr 0.00100 | ngrams/sec 35696.7 | eta 0h0m36s
| epoch 99 | step 2000/4071 | loss 6.5682 | lr 0.00100 | ngrams/sec 35737.5 | eta 0h0m29s
| epoch 99 | step 2500/4071 | loss 6.5743 | lr 0.00100 | ngrams/sec 35732.0 | eta 0h0m22s
| epoch 99 | step 3000/4071 | loss 6.5863 | lr 0.00100 | ngrams/sec 35738.3 | eta 0h0m15s
| epoch 99 | step 3500/4071 | loss 6.6087 | lr 0.00100 | ngrams/sec 35771.8 | eta 0h0m8s
| epoch 99 | step 4000/4071 | loss 6.6368 | lr 0.00100 | ngrams/sec 35771.9 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1157.51it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 281.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 59.80s | valid loss  6.25 | valid ppl   515.45
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 6.4725 | lr 0.00100 | ngrams/sec 25341.3 | eta 0h1m12s
| epoch 100 | step 1000/4071 | loss 6.5183 | lr 0.00100 | ngrams/sec 35802.0 | eta 0h0m43s
| epoch 100 | step 1500/4071 | loss 6.5489 | lr 0.00100 | ngrams/sec 35797.6 | eta 0h0m36s
| epoch 100 | step 2000/4071 | loss 6.5718 | lr 0.00100 | ngrams/sec 35770.4 | eta 0h0m29s
| epoch 100 | step 2500/4071 | loss 6.5982 | lr 0.00100 | ngrams/sec 35798.3 | eta 0h0m22s
| epoch 100 | step 3000/4071 | loss 6.6060 | lr 0.00100 | ngrams/sec 35796.9 | eta 0h0m15s
| epoch 100 | step 3500/4071 | loss 6.6187 | lr 0.00100 | ngrams/sec 35822.1 | eta 0h0m8s
| epoch 100 | step 4000/4071 | loss 6.6323 | lr 0.00100 | ngrams/sec 35831.5 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1141.55it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 280.88it/s]


-----------------------------------------------------------------------------------------


 25%|██▍       | 117/471 [00:00<00:00, 1143.49it/s]

| end of epoch 100 | time 59.73s | valid loss  6.25 | valid ppl   518.62
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 272.98it/s]


| End of training | test loss  6.20 | test ppl   493.85


In [11]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [12]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

ntokens = n_class
# input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
print(input)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([[ 2774],
        [26931],
        [16204],
        [23326],
        [28058],
        [14935],
        [16636]], device='cuda:0')


In [13]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        input.fill_(word_idx)
        input = input.cuda() if cuda else input
        # print(input)
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

nl
| Generated 0/100 words
1801
lines
animal
targets
approval
study
spacing
class
places
course
| Generated 10/100 words
petrus
sparrow
banai
past
effect
emilio
minute
episodes
positively
decision
| Generated 20/100 words
to
escape
symons
del
:
anglo
magazine
five
riaa
by
| Generated 30/100 words
paul
radio
america
released
i
real
us
against
spain
raged
| Generated 40/100 words
had
been
served
he
struck
naval
term
slavery
overwhelming
13
| Generated 50/100 words
that
i
saying
will
attempts
disappeared
<eos>
retirement
rick
for
| Generated 60/100 words
blocked
bay
southeastern
anderson
revising
cinema
hardcore
.
moving
region
| Generated 70/100 words
1
both
government
serves
total
casualties
later
charged
greater
project
| Generated 80/100 words
opening
part
then
his
still
sexually
the
nile
terms
feather
| Generated 90/100 words
my
feelings
maryada
coast
through
january
april
specially
major
