In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [47]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [48]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [49]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [50]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [51]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [52]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [53]:
!nvidia-smi

Fri Nov 27 08:29:33 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    39W / 250W |   1597MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [54]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 26.2183 | lr 0.00100 | ngrams/sec 66440.5 | eta 0h0m27s
| epoch 1 | step 1000/4071 | loss 12.1935 | lr 0.00100 | ngrams/sec 66660.5 | eta 0h0m23s
| epoch 1 | step 1500/4071 | loss 10.6389 | lr 0.00100 | ngrams/sec 66650.8 | eta 0h0m19s
| epoch 1 | step 2000/4071 | loss 10.7796 | lr 0.00100 | ngrams/sec 66595.3 | eta 0h0m15s
| epoch 1 | step 2500/4071 | loss 11.3941 | lr 0.00100 | ngrams/sec 66338.3 | eta 0h0m12s
| epoch 1 | step 3000/4071 | loss 11.4819 | lr 0.00100 | ngrams/sec 66864.2 | eta 0h0m8s
| epoch 1 | step 3500/4071 | loss 11.4490 | lr 0.00100 | ngrams/sec 66928.5 | eta 0h0m4s
| epoch 1 | step 4000/4071 | loss 11.3583 | lr 0.00100 | ngrams/sec 66917.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1543.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 31.84s | valid loss  7.82 | valid ppl  2488.81
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 11.1883 | lr 0.00100 | ngrams/sec 48912.8 | eta 0h0m37s
| epoch 2 | step 1000/4071 | loss 11.0873 | lr 0.00100 | ngrams/sec 66543.5 | eta 0h0m23s
| epoch 2 | step 1500/4071 | loss 10.8169 | lr 0.00100 | ngrams/sec 67120.8 | eta 0h0m19s
| epoch 2 | step 2000/4071 | loss 10.3143 | lr 0.00100 | ngrams/sec 66867.2 | eta 0h0m15s
| epoch 2 | step 2500/4071 | loss 9.9062 | lr 0.00100 | ngrams/sec 66860.2 | eta 0h0m12s
| epoch 2 | step 3000/4071 | loss 9.6588 | lr 0.00100 | ngrams/sec 66545.2 | eta 0h0m8s
| epoch 2 | step 3500/4071 | loss 9.4935 | lr 0.00100 | ngrams/sec 67474.3 | eta 0h0m4s
| epoch 2 | step 4000/4071 | loss 9.4051 | lr 0.00100 | ngrams/sec 67820.4 | eta 0h0m0s

 37%|███▋      | 156/417 [00:00<00:00, 1547.68it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 31.70s | valid loss  7.63 | valid ppl  2051.42
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 9.2542 | lr 0.00100 | ngrams/sec 50161.6 | eta 0h0m36s
| epoch 3 | step 1000/4071 | loss 9.2476 | lr 0.00100 | ngrams/sec 67949.1 | eta 0h0m23s
| epoch 3 | step 1500/4071 | loss 9.1891 | lr 0.00100 | ngrams/sec 68148.0 | eta 0h0m19s
| epoch 3 | step 2000/4071 | loss 9.1356 | lr 0.00100 | ngrams/sec 68139.2 | eta 0h0m15s
| epoch 3 | step 2500/4071 | loss 9.1174 | lr 0.00100 | ngrams/sec 68147.6 | eta 0h0m11s
| epoch 3 | step 3000/4071 | loss 9.0853 | lr 0.00100 | ngrams/sec 68618.7 | eta 0h0m7s
| epoch 3 | step 3500/4071 | loss 9.0630 | lr 0.00100 | ngrams/sec 67922.4 | eta 0h0m4s
| epoch 3 | step 4000/4071 | loss 9.0382 | lr 0.00100 | ngrams/sec 68124.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1557.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 31.15s | valid loss  7.47 | valid ppl  1751.83
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 8.9686 | lr 0.00100 | ngrams/sec 50042.0 | eta 0h0m36s
| epoch 4 | step 1000/4071 | loss 8.9639 | lr 0.00100 | ngrams/sec 68461.9 | eta 0h0m22s
| epoch 4 | step 1500/4071 | loss 8.9518 | lr 0.00100 | ngrams/sec 67586.1 | eta 0h0m19s
| epoch 4 | step 2000/4071 | loss 8.9424 | lr 0.00100 | ngrams/sec 68461.6 | eta 0h0m15s
| epoch 4 | step 2500/4071 | loss 8.9468 | lr 0.00100 | ngrams/sec 67845.8 | eta 0h0m11s
| epoch 4 | step 3000/4071 | loss 8.9035 | lr 0.00100 | ngrams/sec 67680.9 | eta 0h0m8s
| epoch 4 | step 3500/4071 | loss 8.9132 | lr 0.00100 | ngrams/sec 68125.4 | eta 0h0m4s
| epoch 4 | step 4000/4071 | loss 8.8928 | lr 0.00100 | ngrams/sec 67996.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1542.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 31.22s | valid loss  7.37 | valid ppl  1582.53
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 8.8363 | lr 0.00100 | ngrams/sec 49622.0 | eta 0h0m36s
| epoch 5 | step 1000/4071 | loss 8.8553 | lr 0.00100 | ngrams/sec 67659.0 | eta 0h0m23s
| epoch 5 | step 1500/4071 | loss 8.8307 | lr 0.00100 | ngrams/sec 68117.0 | eta 0h0m19s
| epoch 5 | step 2000/4071 | loss 8.8313 | lr 0.00100 | ngrams/sec 68292.3 | eta 0h0m15s
| epoch 5 | step 2500/4071 | loss 8.8245 | lr 0.00100 | ngrams/sec 68069.3 | eta 0h0m11s
| epoch 5 | step 3000/4071 | loss 8.8093 | lr 0.00100 | ngrams/sec 68059.7 | eta 0h0m8s
| epoch 5 | step 3500/4071 | loss 8.8171 | lr 0.00100 | ngrams/sec 67957.4 | eta 0h0m4s
| epoch 5 | step 4000/4071 | loss 8.8199 | lr 0.00100 | ngrams/sec 67931.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.72it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 31.25s | valid loss  7.35 | valid ppl  1554.20
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 8.7687 | lr 0.00100 | ngrams/sec 49691.8 | eta 0h0m36s
| epoch 6 | step 1000/4071 | loss 8.7587 | lr 0.00100 | ngrams/sec 67671.5 | eta 0h0m23s
| epoch 6 | step 1500/4071 | loss 8.7535 | lr 0.00100 | ngrams/sec 67874.0 | eta 0h0m19s
| epoch 6 | step 2000/4071 | loss 8.7607 | lr 0.00100 | ngrams/sec 68171.8 | eta 0h0m15s
| epoch 6 | step 2500/4071 | loss 8.7532 | lr 0.00100 | ngrams/sec 68220.6 | eta 0h0m11s
| epoch 6 | step 3000/4071 | loss 8.7571 | lr 0.00100 | ngrams/sec 67882.5 | eta 0h0m8s
| epoch 6 | step 3500/4071 | loss 8.7436 | lr 0.00100 | ngrams/sec 67923.0 | eta 0h0m4s
| epoch 6 | step 4000/4071 | loss 8.7411 | lr 0.00100 | ngrams/sec 68260.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1543.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 31.26s | valid loss  7.24 | valid ppl  1395.78
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/4071 | loss 8.7078 | lr 0.00100 | ngrams/sec 49861.4 | eta 0h0m36s
| epoch 7 | step 1000/4071 | loss 8.7106 | lr 0.00100 | ngrams/sec 68187.3 | eta 0h0m23s
| epoch 7 | step 1500/4071 | loss 8.7114 | lr 0.00100 | ngrams/sec 67863.7 | eta 0h0m19s
| epoch 7 | step 2000/4071 | loss 8.7035 | lr 0.00100 | ngrams/sec 68298.6 | eta 0h0m15s
| epoch 7 | step 2500/4071 | loss 8.7126 | lr 0.00100 | ngrams/sec 67937.2 | eta 0h0m11s
| epoch 7 | step 3000/4071 | loss 8.7046 | lr 0.00100 | ngrams/sec 68130.0 | eta 0h0m8s
| epoch 7 | step 3500/4071 | loss 8.6942 | lr 0.00100 | ngrams/sec 68263.6 | eta 0h0m4s
| epoch 7 | step 4000/4071 | loss 8.6966 | lr 0.00100 | ngrams/sec 68105.4 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1547.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 31.20s | valid loss  7.23 | valid ppl  1374.14
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 8.6723 | lr 0.00100 | ngrams/sec 49868.2 | eta 0h0m36s
| epoch 8 | step 1000/4071 | loss 8.6521 | lr 0.00100 | ngrams/sec 68350.5 | eta 0h0m23s
| epoch 8 | step 1500/4071 | loss 8.6761 | lr 0.00100 | ngrams/sec 68314.5 | eta 0h0m19s
| epoch 8 | step 2000/4071 | loss 8.6721 | lr 0.00100 | ngrams/sec 68449.3 | eta 0h0m15s
| epoch 8 | step 2500/4071 | loss 8.6648 | lr 0.00100 | ngrams/sec 68268.4 | eta 0h0m11s
| epoch 8 | step 3000/4071 | loss 8.6577 | lr 0.00100 | ngrams/sec 68124.7 | eta 0h0m8s
| epoch 8 | step 3500/4071 | loss 8.6646 | lr 0.00100 | ngrams/sec 68043.9 | eta 0h0m4s
| epoch 8 | step 4000/4071 | loss 8.6699 | lr 0.00100 | ngrams/sec 68057.0 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1542.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 31.15s | valid loss  7.20 | valid ppl  1336.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 8.6362 | lr 0.00100 | ngrams/sec 49901.4 | eta 0h0m36s
| epoch 9 | step 1000/4071 | loss 8.6293 | lr 0.00100 | ngrams/sec 67868.6 | eta 0h0m23s
| epoch 9 | step 1500/4071 | loss 8.6466 | lr 0.00100 | ngrams/sec 68047.2 | eta 0h0m19s
| epoch 9 | step 2000/4071 | loss 8.6423 | lr 0.00100 | ngrams/sec 68095.5 | eta 0h0m15s
| epoch 9 | step 2500/4071 | loss 8.6535 | lr 0.00100 | ngrams/sec 68423.0 | eta 0h0m11s
| epoch 9 | step 3000/4071 | loss 8.6373 | lr 0.00100 | ngrams/sec 67999.1 | eta 0h0m8s
| epoch 9 | step 3500/4071 | loss 8.6381 | lr 0.00100 | ngrams/sec 68215.6 | eta 0h0m4s
| epoch 9 | step 4000/4071 | loss 8.6439 | lr 0.00100 | ngrams/sec 67857.3 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1536.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 31.21s | valid loss  7.14 | valid ppl  1264.72
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 8.6111 | lr 0.00100 | ngrams/sec 50008.7 | eta 0h0m36s
| epoch 10 | step 1000/4071 | loss 8.6110 | lr 0.00100 | ngrams/sec 68118.1 | eta 0h0m23s
| epoch 10 | step 1500/4071 | loss 8.6238 | lr 0.00100 | ngrams/sec 68122.2 | eta 0h0m19s
| epoch 10 | step 2000/4071 | loss 8.6162 | lr 0.00100 | ngrams/sec 68314.8 | eta 0h0m15s
| epoch 10 | step 2500/4071 | loss 8.6215 | lr 0.00100 | ngrams/sec 67743.9 | eta 0h0m11s
| epoch 10 | step 3000/4071 | loss 8.6220 | lr 0.00100 | ngrams/sec 68189.3 | eta 0h0m8s
| epoch 10 | step 3500/4071 | loss 8.6083 | lr 0.00100 | ngrams/sec 68262.3 | eta 0h0m4s
| epoch 10 | step 4000/4071 | loss 8.6208 | lr 0.00100 | ngrams/sec 68262.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1550.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 31.17s | valid loss  7.10 | valid ppl  1212.35
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 8.5944 | lr 0.00100 | ngrams/sec 49880.6 | eta 0h0m36s
| epoch 11 | step 1000/4071 | loss 8.5868 | lr 0.00100 | ngrams/sec 67970.0 | eta 0h0m23s
| epoch 11 | step 1500/4071 | loss 8.6076 | lr 0.00100 | ngrams/sec 68205.6 | eta 0h0m19s
| epoch 11 | step 2000/4071 | loss 8.6016 | lr 0.00100 | ngrams/sec 67959.1 | eta 0h0m15s
| epoch 11 | step 2500/4071 | loss 8.5961 | lr 0.00100 | ngrams/sec 68616.9 | eta 0h0m11s
| epoch 11 | step 3000/4071 | loss 8.5930 | lr 0.00100 | ngrams/sec 68181.4 | eta 0h0m8s
| epoch 11 | step 3500/4071 | loss 8.5944 | lr 0.00100 | ngrams/sec 68374.9 | eta 0h0m4s
| epoch 11 | step 4000/4071 | loss 8.5971 | lr 0.00100 | ngrams/sec 68345.1 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1557.13it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 31.14s | valid loss  7.08 | valid ppl  1192.58
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 8.5654 | lr 0.00100 | ngrams/sec 50024.5 | eta 0h0m36s
| epoch 12 | step 1000/4071 | loss 8.5935 | lr 0.00100 | ngrams/sec 68025.3 | eta 0h0m23s
| epoch 12 | step 1500/4071 | loss 8.5762 | lr 0.00100 | ngrams/sec 68268.3 | eta 0h0m19s
| epoch 12 | step 2000/4071 | loss 8.5848 | lr 0.00100 | ngrams/sec 67875.9 | eta 0h0m15s
| epoch 12 | step 2500/4071 | loss 8.5759 | lr 0.00100 | ngrams/sec 68098.4 | eta 0h0m11s
| epoch 12 | step 3000/4071 | loss 8.5937 | lr 0.00100 | ngrams/sec 68212.3 | eta 0h0m8s
| epoch 12 | step 3500/4071 | loss 8.5812 | lr 0.00100 | ngrams/sec 68286.2 | eta 0h0m4s
| epoch 12 | step 4000/4071 | loss 8.5907 | lr 0.00100 | ngrams/sec 68263.2 | eta 0h

 37%|███▋      | 155/417 [00:00<00:00, 1541.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 718.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 31.17s | valid loss  7.04 | valid ppl  1141.18
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 8.5658 | lr 0.00100 | ngrams/sec 50070.7 | eta 0h0m36s
| epoch 13 | step 1000/4071 | loss 8.5569 | lr 0.00100 | ngrams/sec 68021.3 | eta 0h0m23s
| epoch 13 | step 1500/4071 | loss 8.5674 | lr 0.00100 | ngrams/sec 68223.8 | eta 0h0m19s
| epoch 13 | step 2000/4071 | loss 8.5764 | lr 0.00100 | ngrams/sec 68146.1 | eta 0h0m15s
| epoch 13 | step 2500/4071 | loss 8.5651 | lr 0.00100 | ngrams/sec 67856.8 | eta 0h0m11s
| epoch 13 | step 3000/4071 | loss 8.5611 | lr 0.00100 | ngrams/sec 68342.5 | eta 0h0m8s
| epoch 13 | step 3500/4071 | loss 8.5566 | lr 0.00100 | ngrams/sec 67689.5 | eta 0h0m4s
| epoch 13 | step 4000/4071 | loss 8.5792 | lr 0.00100 | ngrams/sec 68344.8 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1555.00it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 31.20s | valid loss  7.10 | valid ppl  1208.30
-----------------------------------------------------------------------------------------
| epoch 14 | step 500/4071 | loss 8.5409 | lr 0.00100 | ngrams/sec 50495.3 | eta 0h0m36s
| epoch 14 | step 1000/4071 | loss 8.5528 | lr 0.00100 | ngrams/sec 68285.6 | eta 0h0m23s
| epoch 14 | step 1500/4071 | loss 8.5486 | lr 0.00100 | ngrams/sec 68065.6 | eta 0h0m19s
| epoch 14 | step 2000/4071 | loss 8.5601 | lr 0.00100 | ngrams/sec 68117.8 | eta 0h0m15s
| epoch 14 | step 2500/4071 | loss 8.5596 | lr 0.00100 | ngrams/sec 68058.7 | eta 0h0m11s
| epoch 14 | step 3000/4071 | loss 8.5603 | lr 0.00100 | ngrams/sec 68132.7 | eta 0h0m8s
| epoch 14 | step 3500/4071 | loss 8.5401 | lr 0.00100 | ngrams/sec 68132.6 | eta 0h0m4s
| epoch 14 | step 4000/4071 | loss 8.5621 | lr 0.00100 | ngrams/sec 67563.1 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1545.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 31.21s | valid loss  7.01 | valid ppl  1112.05
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 8.5250 | lr 0.00100 | ngrams/sec 49570.4 | eta 0h0m36s
| epoch 15 | step 1000/4071 | loss 8.5414 | lr 0.00100 | ngrams/sec 68132.3 | eta 0h0m23s
| epoch 15 | step 1500/4071 | loss 8.5320 | lr 0.00100 | ngrams/sec 67830.3 | eta 0h0m19s
| epoch 15 | step 2000/4071 | loss 8.5410 | lr 0.00100 | ngrams/sec 68295.6 | eta 0h0m15s
| epoch 15 | step 2500/4071 | loss 8.5354 | lr 0.00100 | ngrams/sec 68016.9 | eta 0h0m11s
| epoch 15 | step 3000/4071 | loss 8.5442 | lr 0.00100 | ngrams/sec 67712.5 | eta 0h0m8s
| epoch 15 | step 3500/4071 | loss 8.5477 | lr 0.00100 | ngrams/sec 67834.6 | eta 0h0m4s
| epoch 15 | step 4000/4071 | loss 8.5426 | lr 0.00100 | ngrams/sec 67687.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1558.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.99it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 31.30s | valid loss  6.97 | valid ppl  1066.71
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 8.5125 | lr 0.00100 | ngrams/sec 49980.6 | eta 0h0m36s
| epoch 16 | step 1000/4071 | loss 8.5261 | lr 0.00100 | ngrams/sec 67696.0 | eta 0h0m23s
| epoch 16 | step 1500/4071 | loss 8.5439 | lr 0.00100 | ngrams/sec 68071.9 | eta 0h0m19s
| epoch 16 | step 2000/4071 | loss 8.5309 | lr 0.00100 | ngrams/sec 67979.7 | eta 0h0m15s
| epoch 16 | step 2500/4071 | loss 8.5344 | lr 0.00100 | ngrams/sec 68151.5 | eta 0h0m11s
| epoch 16 | step 3000/4071 | loss 8.5361 | lr 0.00100 | ngrams/sec 68080.3 | eta 0h0m8s
| epoch 16 | step 3500/4071 | loss 8.5352 | lr 0.00100 | ngrams/sec 68232.7 | eta 0h0m4s
| epoch 16 | step 4000/4071 | loss 8.5285 | lr 0.00100 | ngrams/sec 68169.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1552.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 31.21s | valid loss  6.97 | valid ppl  1060.12
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 17 | step 500/4071 | loss 8.5042 | lr 0.00100 | ngrams/sec 49909.6 | eta 0h0m36s
| epoch 17 | step 1000/4071 | loss 8.5084 | lr 0.00100 | ngrams/sec 68059.7 | eta 0h0m23s
| epoch 17 | step 1500/4071 | loss 8.5279 | lr 0.00100 | ngrams/sec 68228.6 | eta 0h0m19s
| epoch 17 | step 2000/4071 | loss 8.5240 | lr 0.00100 | ngrams/sec 68215.8 | eta 0h0m15s
| epoch 17 | step 2500/4071 | loss 8.5088 | lr 0.00100 | ngrams/sec 68255.0 | eta 0h0m11s
| epoch 17 | step 3000/4071 | loss 8.5387 | lr 0.00100 | ngrams/sec 68165.6 | eta 0h0m8s
| epoch 17 | step 3500/4071 | loss 8.5173 | lr 0.00100 | ngrams/sec 68119.1 | eta 0h0m4s
| epoch 17 | step 4000/4071 | loss 8.5195 | lr 0.00100 | ngrams/sec 67988.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1547.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 31.17s | valid loss  6.92 | valid ppl  1011.84
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/4071 | loss 8.4992 | lr 0.00100 | ngrams/sec 49941.2 | eta 0h0m36s
| epoch 18 | step 1000/4071 | loss 8.5050 | lr 0.00100 | ngrams/sec 67966.5 | eta 0h0m23s
| epoch 18 | step 1500/4071 | loss 8.5174 | lr 0.00100 | ngrams/sec 68432.6 | eta 0h0m19s
| epoch 18 | step 2000/4071 | loss 8.5089 | lr 0.00100 | ngrams/sec 68269.8 | eta 0h0m15s
| epoch 18 | step 2500/4071 | loss 8.5008 | lr 0.00100 | ngrams/sec 68201.4 | eta 0h0m11s
| epoch 18 | step 3000/4071 | loss 8.5111 | lr 0.00100 | ngrams/sec 68039.8 | eta 0h0m8s
| epoch 18 | step 3500/4071 | loss 8.5015 | lr 0.00100 | ngrams/sec 67708.6 | eta 0h0m4s
| epoch 18 | step 4000/4071 | loss 8.5218 | lr 0.00100 | ngrams/sec 68247.3 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1548.73it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 31.18s | valid loss  6.88 | valid ppl   971.25
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/4071 | loss 8.4946 | lr 0.00100 | ngrams/sec 50166.7 | eta 0h0m36s
| epoch 19 | step 1000/4071 | loss 8.5029 | lr 0.00100 | ngrams/sec 68184.1 | eta 0h0m23s
| epoch 19 | step 1500/4071 | loss 8.5029 | lr 0.00100 | ngrams/sec 68473.9 | eta 0h0m19s
| epoch 19 | step 2000/4071 | loss 8.5151 | lr 0.00100 | ngrams/sec 68264.3 | eta 0h0m15s
| epoch 19 | step 2500/4071 | loss 8.5030 | lr 0.00100 | ngrams/sec 67969.0 | eta 0h0m11s
| epoch 19 | step 3000/4071 | loss 8.5062 | lr 0.00100 | ngrams/sec 68241.7 | eta 0h0m8s
| epoch 19 | step 3500/4071 | loss 8.5092 | lr 0.00100 | ngrams/sec 68343.1 | eta 0h0m4s
| epoch 19 | step 4000/4071 | loss 8.5016 | lr 0.00100 | ngrams/sec 68437.2 | eta 0h

 37%|███▋      | 155/417 [00:00<00:00, 1543.51it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 31.11s | valid loss  6.86 | valid ppl   950.57
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 8.4819 | lr 0.00100 | ngrams/sec 49867.5 | eta 0h0m36s
| epoch 20 | step 1000/4071 | loss 8.4763 | lr 0.00100 | ngrams/sec 68438.9 | eta 0h0m22s
| epoch 20 | step 1500/4071 | loss 8.4807 | lr 0.00100 | ngrams/sec 68209.2 | eta 0h0m19s
| epoch 20 | step 2000/4071 | loss 8.4907 | lr 0.00100 | ngrams/sec 68437.7 | eta 0h0m15s
| epoch 20 | step 2500/4071 | loss 8.4932 | lr 0.00100 | ngrams/sec 68125.6 | eta 0h0m11s
| epoch 20 | step 3000/4071 | loss 8.5069 | lr 0.00100 | ngrams/sec 68019.8 | eta 0h0m8s
| epoch 20 | step 3500/4071 | loss 8.4965 | lr 0.00100 | ngrams/sec 68194.4 | eta 0h0m4s
| epoch 20 | step 4000/4071 | loss 8.4979 | lr 0.00100 | ngrams/sec 68288.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1547.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 31.15s | valid loss  6.84 | valid ppl   936.74
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 8.4613 | lr 0.00100 | ngrams/sec 49888.3 | eta 0h0m36s
| epoch 21 | step 1000/4071 | loss 8.4789 | lr 0.00100 | ngrams/sec 68114.3 | eta 0h0m23s
| epoch 21 | step 1500/4071 | loss 8.4811 | lr 0.00100 | ngrams/sec 68183.1 | eta 0h0m19s
| epoch 21 | step 2000/4071 | loss 8.4783 | lr 0.00100 | ngrams/sec 68172.2 | eta 0h0m15s
| epoch 21 | step 2500/4071 | loss 8.4916 | lr 0.00100 | ngrams/sec 68284.0 | eta 0h0m11s
| epoch 21 | step 3000/4071 | loss 8.4918 | lr 0.00100 | ngrams/sec 68252.1 | eta 0h0m8s
| epoch 21 | step 3500/4071 | loss 8.4878 | lr 0.00100 | ngrams/sec 68172.9 | eta 0h0m4s
| epoch 21 | step 4000/4071 | loss 8.4951 | lr 0.00100 | ngrams/sec 68102.3 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1553.05it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 31.16s | valid loss  6.82 | valid ppl   916.86
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 22 | step 500/4071 | loss 8.4800 | lr 0.00100 | ngrams/sec 49984.9 | eta 0h0m36s
| epoch 22 | step 1000/4071 | loss 8.4645 | lr 0.00100 | ngrams/sec 68035.1 | eta 0h0m23s
| epoch 22 | step 1500/4071 | loss 8.4762 | lr 0.00100 | ngrams/sec 67933.4 | eta 0h0m19s
| epoch 22 | step 2000/4071 | loss 8.4897 | lr 0.00100 | ngrams/sec 68257.1 | eta 0h0m15s
| epoch 22 | step 2500/4071 | loss 8.4890 | lr 0.00100 | ngrams/sec 67770.0 | eta 0h0m11s
| epoch 22 | step 3000/4071 | loss 8.4916 | lr 0.00100 | ngrams/sec 68127.8 | eta 0h0m8s
| epoch 22 | step 3500/4071 | loss 8.4742 | lr 0.00100 | ngrams/sec 67878.5 | eta 0h0m4s
| epoch 22 | step 4000/4071 | loss 8.4893 | lr 0.00100 | ngrams/sec 68106.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1548.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 31.22s | valid loss  6.85 | valid ppl   943.03
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 8.4674 | lr 0.00100 | ngrams/sec 50680.9 | eta 0h0m36s
| epoch 23 | step 1000/4071 | loss 8.4649 | lr 0.00100 | ngrams/sec 68074.8 | eta 0h0m23s
| epoch 23 | step 1500/4071 | loss 8.4813 | lr 0.00100 | ngrams/sec 68186.4 | eta 0h0m19s
| epoch 23 | step 2000/4071 | loss 8.4746 | lr 0.00100 | ngrams/sec 68443.1 | eta 0h0m15s
| epoch 23 | step 2500/4071 | loss 8.4549 | lr 0.00100 | ngrams/sec 67972.5 | eta 0h0m11s
| epoch 23 | step 3000/4071 | loss 8.4867 | lr 0.00100 | ngrams/sec 68039.9 | eta 0h0m8s
| epoch 23 | step 3500/4071 | loss 8.4744 | lr 0.00100 | ngrams/sec 67968.9 | eta 0h0m4s
| epoch 23 | step 4000/4071 | loss 8.4836 | lr 0.00100 | ngrams/sec 67874.1 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1529.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 718.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 31.19s | valid loss  6.84 | valid ppl   933.62
-----------------------------------------------------------------------------------------
| epoch 24 | step 500/4071 | loss 8.4542 | lr 0.00100 | ngrams/sec 50570.6 | eta 0h0m36s
| epoch 24 | step 1000/4071 | loss 8.4700 | lr 0.00100 | ngrams/sec 67891.4 | eta 0h0m23s
| epoch 24 | step 1500/4071 | loss 8.4583 | lr 0.00100 | ngrams/sec 68012.7 | eta 0h0m19s
| epoch 24 | step 2000/4071 | loss 8.4601 | lr 0.00100 | ngrams/sec 68106.8 | eta 0h0m15s
| epoch 24 | step 2500/4071 | loss 8.4652 | lr 0.00100 | ngrams/sec 68137.7 | eta 0h0m11s
| epoch 24 | step 3000/4071 | loss 8.4661 | lr 0.00100 | ngrams/sec 67925.0 | eta 0h0m8s
| epoch 24 | step 3500/4071 | loss 8.4709 | lr 0.00100 | ngrams/sec 67601.0 | eta 0h0m4s
| epoch 24 | step 4000/4071 | loss 8.4798 | lr 0.00100 | ngrams/sec 67386.0 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1548.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 31.29s | valid loss  6.83 | valid ppl   921.78
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/4071 | loss 8.4504 | lr 0.00100 | ngrams/sec 50516.4 | eta 0h0m36s
| epoch 25 | step 1000/4071 | loss 8.4655 | lr 0.00100 | ngrams/sec 68170.6 | eta 0h0m23s
| epoch 25 | step 1500/4071 | loss 8.4587 | lr 0.00100 | ngrams/sec 68227.6 | eta 0h0m19s
| epoch 25 | step 2000/4071 | loss 8.4566 | lr 0.00100 | ngrams/sec 68231.5 | eta 0h0m15s
| epoch 25 | step 2500/4071 | loss 8.4702 | lr 0.00100 | ngrams/sec 67780.3 | eta 0h0m11s
| epoch 25 | step 3000/4071 | loss 8.4688 | lr 0.00100 | ngrams/sec 67846.7 | eta 0h0m8s
| epoch 25 | step 3500/4071 | loss 8.4608 | lr 0.00100 | ngrams/sec 68201.1 | eta 0h0m4s
| epoch 25 | step 4000/4071 | loss 8.4626 | lr 0.00100 | ngrams/sec 67997.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1550.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 31.20s | valid loss  6.80 | valid ppl   899.19
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/4071 | loss 8.4319 | lr 0.00100 | ngrams/sec 49820.2 | eta 0h0m36s
| epoch 26 | step 1000/4071 | loss 8.4622 | lr 0.00100 | ngrams/sec 68407.2 | eta 0h0m22s
| epoch 26 | step 1500/4071 | loss 8.4410 | lr 0.00100 | ngrams/sec 68337.4 | eta 0h0m19s
| epoch 26 | step 2000/4071 | loss 8.4595 | lr 0.00100 | ngrams/sec 67954.3 | eta 0h0m15s
| epoch 26 | step 2500/4071 | loss 8.4532 | lr 0.00100 | ngrams/sec 67760.8 | eta 0h0m11s
| epoch 26 | step 3000/4071 | loss 8.4444 | lr 0.00100 | ngrams/sec 68112.8 | eta 0h0m8s
| epoch 26 | step 3500/4071 | loss 8.4512 | lr 0.00100 | ngrams/sec 67912.6 | eta 0h0m4s
| epoch 26 | step 4000/4071 | loss 8.4505 | lr 0.00100 | ngrams/sec 68309.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1556.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 31.19s | valid loss  6.83 | valid ppl   928.03
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/4071 | loss 8.4323 | lr 0.00100 | ngrams/sec 50653.7 | eta 0h0m36s
| epoch 27 | step 1000/4071 | loss 8.4295 | lr 0.00100 | ngrams/sec 67853.3 | eta 0h0m23s
| epoch 27 | step 1500/4071 | loss 8.4252 | lr 0.00100 | ngrams/sec 68482.9 | eta 0h0m19s
| epoch 27 | step 2000/4071 | loss 8.4441 | lr 0.00100 | ngrams/sec 67051.9 | eta 0h0m15s
| epoch 27 | step 2500/4071 | loss 8.4417 | lr 0.00100 | ngrams/sec 67965.4 | eta 0h0m11s
| epoch 27 | step 3000/4071 | loss 8.4329 | lr 0.00100 | ngrams/sec 68159.1 | eta 0h0m8s
| epoch 27 | step 3500/4071 | loss 8.4397 | lr 0.00100 | ngrams/sec 68009.7 | eta 0h0m4s
| epoch 27 | step 4000/4071 | loss 8.4351 | lr 0.00100 | ngrams/sec 68308.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 31.24s | valid loss  6.76 | valid ppl   862.19
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 8.4089 | lr 0.00100 | ngrams/sec 49889.1 | eta 0h0m36s
| epoch 28 | step 1000/4071 | loss 8.4176 | lr 0.00100 | ngrams/sec 66851.7 | eta 0h0m23s
| epoch 28 | step 1500/4071 | loss 8.4137 | lr 0.00100 | ngrams/sec 68452.8 | eta 0h0m19s
| epoch 28 | step 2000/4071 | loss 8.4332 | lr 0.00100 | ngrams/sec 68391.4 | eta 0h0m15s
| epoch 28 | step 2500/4071 | loss 8.4307 | lr 0.00100 | ngrams/sec 67987.7 | eta 0h0m11s
| epoch 28 | step 3000/4071 | loss 8.4248 | lr 0.00100 | ngrams/sec 67936.4 | eta 0h0m8s
| epoch 28 | step 3500/4071 | loss 8.4269 | lr 0.00100 | ngrams/sec 68134.2 | eta 0h0m4s
| epoch 28 | step 4000/4071 | loss 8.4271 | lr 0.00100 | ngrams/sec 68106.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1550.85it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 31.23s | valid loss  6.78 | valid ppl   881.90
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/4071 | loss 8.4010 | lr 0.00100 | ngrams/sec 50661.8 | eta 0h0m36s
| epoch 29 | step 1000/4071 | loss 8.4041 | lr 0.00100 | ngrams/sec 67864.7 | eta 0h0m23s
| epoch 29 | step 1500/4071 | loss 8.4192 | lr 0.00100 | ngrams/sec 68504.9 | eta 0h0m19s
| epoch 29 | step 2000/4071 | loss 8.4251 | lr 0.00100 | ngrams/sec 68130.7 | eta 0h0m15s
| epoch 29 | step 2500/4071 | loss 8.4122 | lr 0.00100 | ngrams/sec 67532.7 | eta 0h0m11s
| epoch 29 | step 3000/4071 | loss 8.4150 | lr 0.00100 | ngrams/sec 67875.7 | eta 0h0m8s
| epoch 29 | step 3500/4071 | loss 8.4195 | lr 0.00100 | ngrams/sec 68214.8 | eta 0h0m4s
| epoch 29 | step 4000/4071 | loss 8.4162 | lr 0.00100 | ngrams/sec 68351.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 31.20s | valid loss  6.78 | valid ppl   880.91
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 8.3927 | lr 0.00100 | ngrams/sec 50551.7 | eta 0h0m36s
| epoch 30 | step 1000/4071 | loss 8.3841 | lr 0.00100 | ngrams/sec 68225.6 | eta 0h0m23s
| epoch 30 | step 1500/4071 | loss 8.4155 | lr 0.00100 | ngrams/sec 68037.0 | eta 0h0m19s
| epoch 30 | step 2000/4071 | loss 8.4136 | lr 0.00100 | ngrams/sec 68114.4 | eta 0h0m15s
| epoch 30 | step 2500/4071 | loss 8.4162 | lr 0.00100 | ngrams/sec 68152.6 | eta 0h0m11s
| epoch 30 | step 3000/4071 | loss 8.4002 | lr 0.00100 | ngrams/sec 68193.6 | eta 0h0m8s
| epoch 30 | step 3500/4071 | loss 8.4084 | lr 0.00100 | ngrams/sec 68170.1 | eta 0h0m4s
| epoch 30 | step 4000/4071 | loss 8.4065 | lr 0.00100 | ngrams/sec 67982.8 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1537.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 31.19s | valid loss  6.79 | valid ppl   886.46
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 8.3760 | lr 0.00100 | ngrams/sec 50580.2 | eta 0h0m36s
| epoch 31 | step 1000/4071 | loss 8.3935 | lr 0.00100 | ngrams/sec 68208.1 | eta 0h0m23s
| epoch 31 | step 1500/4071 | loss 8.3890 | lr 0.00100 | ngrams/sec 68484.1 | eta 0h0m19s
| epoch 31 | step 2000/4071 | loss 8.4068 | lr 0.00100 | ngrams/sec 67838.0 | eta 0h0m15s
| epoch 31 | step 2500/4071 | loss 8.3815 | lr 0.00100 | ngrams/sec 67533.7 | eta 0h0m11s
| epoch 31 | step 3000/4071 | loss 8.4025 | lr 0.00100 | ngrams/sec 68182.0 | eta 0h0m8s
| epoch 31 | step 3500/4071 | loss 8.4003 | lr 0.00100 | ngrams/sec 68339.1 | eta 0h0m4s
| epoch 31 | step 4000/4071 | loss 8.3831 | lr 0.00100 | ngrams/sec 67981.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 31.20s | valid loss  6.77 | valid ppl   874.26
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 8.3742 | lr 0.00100 | ngrams/sec 50531.4 | eta 0h0m36s
| epoch 32 | step 1000/4071 | loss 8.3764 | lr 0.00100 | ngrams/sec 67966.4 | eta 0h0m23s
| epoch 32 | step 1500/4071 | loss 8.3833 | lr 0.00100 | ngrams/sec 68287.1 | eta 0h0m19s
| epoch 32 | step 2000/4071 | loss 8.3935 | lr 0.00100 | ngrams/sec 68174.3 | eta 0h0m15s
| epoch 32 | step 2500/4071 | loss 8.3925 | lr 0.00100 | ngrams/sec 67861.0 | eta 0h0m11s
| epoch 32 | step 3000/4071 | loss 8.3830 | lr 0.00100 | ngrams/sec 68073.8 | eta 0h0m8s
| epoch 32 | step 3500/4071 | loss 8.3990 | lr 0.00100 | ngrams/sec 68381.0 | eta 0h0m4s
| epoch 32 | step 4000/4071 | loss 8.3866 | lr 0.00100 | ngrams/sec 68226.4 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1548.13it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 31.18s | valid loss  6.75 | valid ppl   855.61
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 33 | step 500/4071 | loss 8.3775 | lr 0.00100 | ngrams/sec 49781.4 | eta 0h0m36s
| epoch 33 | step 1000/4071 | loss 8.3745 | lr 0.00100 | ngrams/sec 68219.6 | eta 0h0m23s
| epoch 33 | step 1500/4071 | loss 8.3588 | lr 0.00100 | ngrams/sec 68042.5 | eta 0h0m19s
| epoch 33 | step 2000/4071 | loss 8.3905 | lr 0.00100 | ngrams/sec 67930.6 | eta 0h0m15s
| epoch 33 | step 2500/4071 | loss 8.3733 | lr 0.00100 | ngrams/sec 67575.3 | eta 0h0m11s
| epoch 33 | step 3000/4071 | loss 8.3922 | lr 0.00100 | ngrams/sec 68472.6 | eta 0h0m8s
| epoch 33 | step 3500/4071 | loss 8.3749 | lr 0.00100 | ngrams/sec 68356.9 | eta 0h0m4s
| epoch 33 | step 4000/4071 | loss 8.3790 | lr 0.00100 | ngrams/sec 68294.5 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1541.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 31.19s | valid loss  6.75 | valid ppl   852.73
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 34 | step 500/4071 | loss 8.3637 | lr 0.00100 | ngrams/sec 49778.9 | eta 0h0m36s
| epoch 34 | step 1000/4071 | loss 8.3770 | lr 0.00100 | ngrams/sec 68468.5 | eta 0h0m22s
| epoch 34 | step 1500/4071 | loss 8.3789 | lr 0.00100 | ngrams/sec 68357.2 | eta 0h0m19s
| epoch 34 | step 2000/4071 | loss 8.3809 | lr 0.00100 | ngrams/sec 68104.4 | eta 0h0m15s
| epoch 34 | step 2500/4071 | loss 8.3689 | lr 0.00100 | ngrams/sec 68113.5 | eta 0h0m11s
| epoch 34 | step 3000/4071 | loss 8.3869 | lr 0.00100 | ngrams/sec 67520.4 | eta 0h0m8s
| epoch 34 | step 3500/4071 | loss 8.3830 | lr 0.00100 | ngrams/sec 67626.3 | eta 0h0m4s
| epoch 34 | step 4000/4071 | loss 8.3741 | lr 0.00100 | ngrams/sec 67562.6 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1558.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 31.28s | valid loss  6.73 | valid ppl   834.80
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 35 | step 500/4071 | loss 8.3461 | lr 0.00100 | ngrams/sec 49929.4 | eta 0h0m36s
| epoch 35 | step 1000/4071 | loss 8.3698 | lr 0.00100 | ngrams/sec 68146.9 | eta 0h0m23s
| epoch 35 | step 1500/4071 | loss 8.3478 | lr 0.00100 | ngrams/sec 67753.2 | eta 0h0m19s
| epoch 35 | step 2000/4071 | loss 8.3767 | lr 0.00100 | ngrams/sec 67919.7 | eta 0h0m15s
| epoch 35 | step 2500/4071 | loss 8.3621 | lr 0.00100 | ngrams/sec 68076.0 | eta 0h0m11s
| epoch 35 | step 3000/4071 | loss 8.3756 | lr 0.00100 | ngrams/sec 68316.3 | eta 0h0m8s
| epoch 35 | step 3500/4071 | loss 8.3670 | lr 0.00100 | ngrams/sec 68619.0 | eta 0h0m4s
| epoch 35 | step 4000/4071 | loss 8.3647 | lr 0.00100 | ngrams/sec 67996.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1557.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 31.18s | valid loss  6.72 | valid ppl   825.25
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/4071 | loss 8.3417 | lr 0.00100 | ngrams/sec 50049.5 | eta 0h0m36s
| epoch 36 | step 1000/4071 | loss 8.3460 | lr 0.00100 | ngrams/sec 67930.6 | eta 0h0m23s
| epoch 36 | step 1500/4071 | loss 8.3613 | lr 0.00100 | ngrams/sec 68690.2 | eta 0h0m19s
| epoch 36 | step 2000/4071 | loss 8.3648 | lr 0.00100 | ngrams/sec 67920.0 | eta 0h0m15s
| epoch 36 | step 2500/4071 | loss 8.3633 | lr 0.00100 | ngrams/sec 68207.2 | eta 0h0m11s
| epoch 36 | step 3000/4071 | loss 8.3633 | lr 0.00100 | ngrams/sec 68372.8 | eta 0h0m8s
| epoch 36 | step 3500/4071 | loss 8.3649 | lr 0.00100 | ngrams/sec 68208.3 | eta 0h0m4s
| epoch 36 | step 4000/4071 | loss 8.3573 | lr 0.00100 | ngrams/sec 67580.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1549.47it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 31.18s | valid loss  6.75 | valid ppl   855.07
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 8.3296 | lr 0.00100 | ngrams/sec 50728.5 | eta 0h0m36s
| epoch 37 | step 1000/4071 | loss 8.3527 | lr 0.00100 | ngrams/sec 68057.4 | eta 0h0m23s
| epoch 37 | step 1500/4071 | loss 8.3597 | lr 0.00100 | ngrams/sec 68078.0 | eta 0h0m19s
| epoch 37 | step 2000/4071 | loss 8.3526 | lr 0.00100 | ngrams/sec 67904.3 | eta 0h0m15s
| epoch 37 | step 2500/4071 | loss 8.3484 | lr 0.00100 | ngrams/sec 68266.5 | eta 0h0m11s
| epoch 37 | step 3000/4071 | loss 8.3662 | lr 0.00100 | ngrams/sec 68046.6 | eta 0h0m8s
| epoch 37 | step 3500/4071 | loss 8.3727 | lr 0.00100 | ngrams/sec 68373.7 | eta 0h0m4s
| epoch 37 | step 4000/4071 | loss 8.3634 | lr 0.00100 | ngrams/sec 67941.1 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1548.71it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 31.18s | valid loss  6.71 | valid ppl   820.50
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 38 | step 500/4071 | loss 8.3163 | lr 0.00100 | ngrams/sec 50127.2 | eta 0h0m36s
| epoch 38 | step 1000/4071 | loss 8.3520 | lr 0.00100 | ngrams/sec 68154.0 | eta 0h0m23s
| epoch 38 | step 1500/4071 | loss 8.3534 | lr 0.00100 | ngrams/sec 67871.8 | eta 0h0m19s
| epoch 38 | step 2000/4071 | loss 8.3464 | lr 0.00100 | ngrams/sec 68011.5 | eta 0h0m15s
| epoch 38 | step 2500/4071 | loss 8.3509 | lr 0.00100 | ngrams/sec 68034.9 | eta 0h0m11s
| epoch 38 | step 3000/4071 | loss 8.3489 | lr 0.00100 | ngrams/sec 68287.7 | eta 0h0m8s
| epoch 38 | step 3500/4071 | loss 8.3684 | lr 0.00100 | ngrams/sec 68087.9 | eta 0h0m4s
| epoch 38 | step 4000/4071 | loss 8.3475 | lr 0.00100 | ngrams/sec 68232.6 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1533.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 31.18s | valid loss  6.71 | valid ppl   820.31
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 39 | step 500/4071 | loss 8.3324 | lr 0.00100 | ngrams/sec 50010.9 | eta 0h0m36s
| epoch 39 | step 1000/4071 | loss 8.3398 | lr 0.00100 | ngrams/sec 68153.4 | eta 0h0m23s
| epoch 39 | step 1500/4071 | loss 8.3399 | lr 0.00100 | ngrams/sec 68022.0 | eta 0h0m19s
| epoch 39 | step 2000/4071 | loss 8.3496 | lr 0.00100 | ngrams/sec 68156.3 | eta 0h0m15s
| epoch 39 | step 2500/4071 | loss 8.3469 | lr 0.00100 | ngrams/sec 68157.1 | eta 0h0m11s
| epoch 39 | step 3000/4071 | loss 8.3538 | lr 0.00100 | ngrams/sec 68568.9 | eta 0h0m7s
| epoch 39 | step 3500/4071 | loss 8.3571 | lr 0.00100 | ngrams/sec 68027.0 | eta 0h0m4s
| epoch 39 | step 4000/4071 | loss 8.3721 | lr 0.00100 | ngrams/sec 68072.7 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1548.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.15it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 31.16s | valid loss  6.71 | valid ppl   818.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 40 | step 500/4071 | loss 8.3322 | lr 0.00100 | ngrams/sec 49821.3 | eta 0h0m36s
| epoch 40 | step 1000/4071 | loss 8.3251 | lr 0.00100 | ngrams/sec 68315.5 | eta 0h0m23s
| epoch 40 | step 1500/4071 | loss 8.3418 | lr 0.00100 | ngrams/sec 67967.9 | eta 0h0m19s
| epoch 40 | step 2000/4071 | loss 8.3392 | lr 0.00100 | ngrams/sec 68290.3 | eta 0h0m15s
| epoch 40 | step 2500/4071 | loss 8.3326 | lr 0.00100 | ngrams/sec 68417.5 | eta 0h0m11s
| epoch 40 | step 3000/4071 | loss 8.3348 | lr 0.00100 | ngrams/sec 68389.7 | eta 0h0m8s
| epoch 40 | step 3500/4071 | loss 8.3341 | lr 0.00100 | ngrams/sec 68195.9 | eta 0h0m4s
| epoch 40 | step 4000/4071 | loss 8.3612 | lr 0.00100 | ngrams/sec 67746.3 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1541.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 31.18s | valid loss  6.68 | valid ppl   797.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 41 | step 500/4071 | loss 8.3232 | lr 0.00100 | ngrams/sec 49982.2 | eta 0h0m36s
| epoch 41 | step 1000/4071 | loss 8.3351 | lr 0.00100 | ngrams/sec 67924.9 | eta 0h0m23s
| epoch 41 | step 1500/4071 | loss 8.3353 | lr 0.00100 | ngrams/sec 68113.8 | eta 0h0m19s
| epoch 41 | step 2000/4071 | loss 8.3343 | lr 0.00100 | ngrams/sec 68308.4 | eta 0h0m15s
| epoch 41 | step 2500/4071 | loss 8.3443 | lr 0.00100 | ngrams/sec 66179.1 | eta 0h0m12s
| epoch 41 | step 3000/4071 | loss 8.3492 | lr 0.00100 | ngrams/sec 68085.7 | eta 0h0m8s
| epoch 41 | step 3500/4071 | loss 8.3427 | lr 0.00100 | ngrams/sec 68143.4 | eta 0h0m4s
| epoch 41 | step 4000/4071 | loss 8.3386 | lr 0.00100 | ngrams/sec 68001.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1549.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 31.30s | valid loss  6.69 | valid ppl   805.76
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 8.3091 | lr 0.00100 | ngrams/sec 50248.1 | eta 0h0m36s
| epoch 42 | step 1000/4071 | loss 8.3187 | lr 0.00100 | ngrams/sec 68027.7 | eta 0h0m23s
| epoch 42 | step 1500/4071 | loss 8.3247 | lr 0.00100 | ngrams/sec 67738.1 | eta 0h0m19s
| epoch 42 | step 2000/4071 | loss 8.3340 | lr 0.00100 | ngrams/sec 67500.7 | eta 0h0m15s
| epoch 42 | step 2500/4071 | loss 8.3274 | lr 0.00100 | ngrams/sec 67817.9 | eta 0h0m11s
| epoch 42 | step 3000/4071 | loss 8.3199 | lr 0.00100 | ngrams/sec 67878.0 | eta 0h0m8s
| epoch 42 | step 3500/4071 | loss 8.3448 | lr 0.00100 | ngrams/sec 68073.3 | eta 0h0m4s
| epoch 42 | step 4000/4071 | loss 8.3387 | lr 0.00100 | ngrams/sec 67434.3 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1543.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 31.35s | valid loss  6.68 | valid ppl   793.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 43 | step 500/4071 | loss 8.3071 | lr 0.00100 | ngrams/sec 49583.8 | eta 0h0m36s
| epoch 43 | step 1000/4071 | loss 8.3201 | lr 0.00100 | ngrams/sec 67347.1 | eta 0h0m23s
| epoch 43 | step 1500/4071 | loss 8.3267 | lr 0.00100 | ngrams/sec 68135.1 | eta 0h0m19s
| epoch 43 | step 2000/4071 | loss 8.3376 | lr 0.00100 | ngrams/sec 67979.9 | eta 0h0m15s
| epoch 43 | step 2500/4071 | loss 8.3200 | lr 0.00100 | ngrams/sec 67812.6 | eta 0h0m11s
| epoch 43 | step 3000/4071 | loss 8.3269 | lr 0.00100 | ngrams/sec 68072.8 | eta 0h0m8s
| epoch 43 | step 3500/4071 | loss 8.3353 | lr 0.00100 | ngrams/sec 68429.5 | eta 0h0m4s
| epoch 43 | step 4000/4071 | loss 8.3223 | lr 0.00100 | ngrams/sec 67959.1 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1533.76it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 31.28s | valid loss  6.69 | valid ppl   807.04
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 8.3042 | lr 0.00100 | ngrams/sec 50509.9 | eta 0h0m36s
| epoch 44 | step 1000/4071 | loss 8.3069 | lr 0.00100 | ngrams/sec 68349.0 | eta 0h0m23s
| epoch 44 | step 1500/4071 | loss 8.3175 | lr 0.00100 | ngrams/sec 68071.6 | eta 0h0m19s
| epoch 44 | step 2000/4071 | loss 8.3327 | lr 0.00100 | ngrams/sec 68585.8 | eta 0h0m15s
| epoch 44 | step 2500/4071 | loss 8.3178 | lr 0.00100 | ngrams/sec 67440.8 | eta 0h0m11s
| epoch 44 | step 3000/4071 | loss 8.3174 | lr 0.00100 | ngrams/sec 67658.8 | eta 0h0m8s
| epoch 44 | step 3500/4071 | loss 8.3172 | lr 0.00100 | ngrams/sec 67741.3 | eta 0h0m4s
| epoch 44 | step 4000/4071 | loss 8.3296 | lr 0.00100 | ngrams/sec 68025.9 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1540.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 31.25s | valid loss  6.71 | valid ppl   819.98
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 8.2856 | lr 0.00100 | ngrams/sec 50166.8 | eta 0h0m36s
| epoch 45 | step 1000/4071 | loss 8.3113 | lr 0.00100 | ngrams/sec 68305.4 | eta 0h0m23s
| epoch 45 | step 1500/4071 | loss 8.3041 | lr 0.00100 | ngrams/sec 68408.3 | eta 0h0m19s
| epoch 45 | step 2000/4071 | loss 8.3015 | lr 0.00100 | ngrams/sec 68161.7 | eta 0h0m15s
| epoch 45 | step 2500/4071 | loss 8.3027 | lr 0.00100 | ngrams/sec 67671.8 | eta 0h0m11s
| epoch 45 | step 3000/4071 | loss 8.3138 | lr 0.00100 | ngrams/sec 68555.3 | eta 0h0m7s
| epoch 45 | step 3500/4071 | loss 8.3151 | lr 0.00100 | ngrams/sec 68207.4 | eta 0h0m4s
| epoch 45 | step 4000/4071 | loss 8.3188 | lr 0.00100 | ngrams/sec 68025.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1546.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 31.19s | valid loss  6.65 | valid ppl   776.42
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 46 | step 500/4071 | loss 8.2940 | lr 0.00100 | ngrams/sec 50088.7 | eta 0h0m36s
| epoch 46 | step 1000/4071 | loss 8.2851 | lr 0.00100 | ngrams/sec 68105.4 | eta 0h0m23s
| epoch 46 | step 1500/4071 | loss 8.2975 | lr 0.00100 | ngrams/sec 68400.9 | eta 0h0m19s
| epoch 46 | step 2000/4071 | loss 8.3020 | lr 0.00100 | ngrams/sec 68111.1 | eta 0h0m15s
| epoch 46 | step 2500/4071 | loss 8.3164 | lr 0.00100 | ngrams/sec 68123.6 | eta 0h0m11s
| epoch 46 | step 3000/4071 | loss 8.3095 | lr 0.00100 | ngrams/sec 66729.1 | eta 0h0m8s
| epoch 46 | step 3500/4071 | loss 8.2994 | lr 0.00100 | ngrams/sec 68502.1 | eta 0h0m4s
| epoch 46 | step 4000/4071 | loss 8.3119 | lr 0.00100 | ngrams/sec 68321.2 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1549.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 31.19s | valid loss  6.67 | valid ppl   791.46
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 8.2872 | lr 0.00100 | ngrams/sec 50648.9 | eta 0h0m36s
| epoch 47 | step 1000/4071 | loss 8.2860 | lr 0.00100 | ngrams/sec 68208.7 | eta 0h0m23s
| epoch 47 | step 1500/4071 | loss 8.2707 | lr 0.00100 | ngrams/sec 68501.9 | eta 0h0m19s
| epoch 47 | step 2000/4071 | loss 8.3001 | lr 0.00100 | ngrams/sec 68397.7 | eta 0h0m15s
| epoch 47 | step 2500/4071 | loss 8.3014 | lr 0.00100 | ngrams/sec 68349.2 | eta 0h0m11s
| epoch 47 | step 3000/4071 | loss 8.3041 | lr 0.00100 | ngrams/sec 68151.0 | eta 0h0m8s
| epoch 47 | step 3500/4071 | loss 8.3072 | lr 0.00100 | ngrams/sec 68013.9 | eta 0h0m4s
| epoch 47 | step 4000/4071 | loss 8.3088 | lr 0.00100 | ngrams/sec 68239.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 31.13s | valid loss  6.68 | valid ppl   793.50
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 8.2773 | lr 0.00100 | ngrams/sec 50746.1 | eta 0h0m36s
| epoch 48 | step 1000/4071 | loss 8.2841 | lr 0.00100 | ngrams/sec 68142.7 | eta 0h0m23s
| epoch 48 | step 1500/4071 | loss 8.2918 | lr 0.00100 | ngrams/sec 68488.1 | eta 0h0m19s
| epoch 48 | step 2000/4071 | loss 8.2844 | lr 0.00100 | ngrams/sec 68320.4 | eta 0h0m15s
| epoch 48 | step 2500/4071 | loss 8.2923 | lr 0.00100 | ngrams/sec 68065.6 | eta 0h0m11s
| epoch 48 | step 3000/4071 | loss 8.2917 | lr 0.00100 | ngrams/sec 68245.3 | eta 0h0m8s
| epoch 48 | step 3500/4071 | loss 8.2895 | lr 0.00100 | ngrams/sec 68094.3 | eta 0h0m4s
| epoch 48 | step 4000/4071 | loss 8.2997 | lr 0.00100 | ngrams/sec 68149.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.54it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 31.13s | valid loss  6.70 | valid ppl   816.23
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 8.2625 | lr 0.00100 | ngrams/sec 50617.4 | eta 0h0m36s
| epoch 49 | step 1000/4071 | loss 8.2737 | lr 0.00100 | ngrams/sec 68386.1 | eta 0h0m22s
| epoch 49 | step 1500/4071 | loss 8.2847 | lr 0.00100 | ngrams/sec 68223.6 | eta 0h0m19s
| epoch 49 | step 2000/4071 | loss 8.2897 | lr 0.00100 | ngrams/sec 68104.1 | eta 0h0m15s
| epoch 49 | step 2500/4071 | loss 8.2854 | lr 0.00100 | ngrams/sec 68256.0 | eta 0h0m11s
| epoch 49 | step 3000/4071 | loss 8.2936 | lr 0.00100 | ngrams/sec 68398.0 | eta 0h0m8s
| epoch 49 | step 3500/4071 | loss 8.2925 | lr 0.00100 | ngrams/sec 68216.6 | eta 0h0m4s
| epoch 49 | step 4000/4071 | loss 8.2920 | lr 0.00100 | ngrams/sec 68084.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.93it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.15it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 31.13s | valid loss  6.66 | valid ppl   783.47
-----------------------------------------------------------------------------------------
| epoch 50 | step 500/4071 | loss 8.2577 | lr 0.00100 | ngrams/sec 50728.9 | eta 0h0m36s
| epoch 50 | step 1000/4071 | loss 8.2635 | lr 0.00100 | ngrams/sec 68165.8 | eta 0h0m23s
| epoch 50 | step 1500/4071 | loss 8.2840 | lr 0.00100 | ngrams/sec 66859.7 | eta 0h0m19s
| epoch 50 | step 2000/4071 | loss 8.2699 | lr 0.00100 | ngrams/sec 68108.9 | eta 0h0m15s
| epoch 50 | step 2500/4071 | loss 8.2688 | lr 0.00100 | ngrams/sec 67694.1 | eta 0h0m11s
| epoch 50 | step 3000/4071 | loss 8.2828 | lr 0.00100 | ngrams/sec 68279.6 | eta 0h0m8s
| epoch 50 | step 3500/4071 | loss 8.2906 | lr 0.00100 | ngrams/sec 68137.6 | eta 0h0m4s
| epoch 50 | step 4000/4071 | loss 8.2821 | lr 0.00100 | ngrams/sec 68547.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 31.24s | valid loss  6.66 | valid ppl   779.42
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 8.2577 | lr 0.00100 | ngrams/sec 50644.2 | eta 0h0m36s
| epoch 51 | step 1000/4071 | loss 8.2668 | lr 0.00100 | ngrams/sec 68117.6 | eta 0h0m23s
| epoch 51 | step 1500/4071 | loss 8.2681 | lr 0.00100 | ngrams/sec 68204.1 | eta 0h0m19s
| epoch 51 | step 2000/4071 | loss 8.2841 | lr 0.00100 | ngrams/sec 68333.3 | eta 0h0m15s
| epoch 51 | step 2500/4071 | loss 8.2744 | lr 0.00100 | ngrams/sec 68139.6 | eta 0h0m11s
| epoch 51 | step 3000/4071 | loss 8.2812 | lr 0.00100 | ngrams/sec 68147.9 | eta 0h0m8s
| epoch 51 | step 3500/4071 | loss 8.2730 | lr 0.00100 | ngrams/sec 68438.5 | eta 0h0m4s
| epoch 51 | step 4000/4071 | loss 8.2774 | lr 0.00100 | ngrams/sec 67684.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 31.16s | valid loss  6.66 | valid ppl   777.95
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 8.2448 | lr 0.00100 | ngrams/sec 50639.6 | eta 0h0m36s
| epoch 52 | step 1000/4071 | loss 8.2576 | lr 0.00100 | ngrams/sec 68419.5 | eta 0h0m22s
| epoch 52 | step 1500/4071 | loss 8.2711 | lr 0.00100 | ngrams/sec 67821.7 | eta 0h0m19s
| epoch 52 | step 2000/4071 | loss 8.2639 | lr 0.00100 | ngrams/sec 68365.6 | eta 0h0m15s
| epoch 52 | step 2500/4071 | loss 8.2780 | lr 0.00100 | ngrams/sec 67796.2 | eta 0h0m11s
| epoch 52 | step 3000/4071 | loss 8.2598 | lr 0.00100 | ngrams/sec 68010.3 | eta 0h0m8s
| epoch 52 | step 3500/4071 | loss 8.2865 | lr 0.00100 | ngrams/sec 68315.7 | eta 0h0m4s
| epoch 52 | step 4000/4071 | loss 8.2717 | lr 0.00100 | ngrams/sec 68356.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 31.16s | valid loss  6.67 | valid ppl   789.19
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 8.2359 | lr 0.00100 | ngrams/sec 50687.0 | eta 0h0m36s
| epoch 53 | step 1000/4071 | loss 8.2467 | lr 0.00100 | ngrams/sec 67632.7 | eta 0h0m23s
| epoch 53 | step 1500/4071 | loss 8.2498 | lr 0.00100 | ngrams/sec 68552.8 | eta 0h0m19s
| epoch 53 | step 2000/4071 | loss 8.2519 | lr 0.00100 | ngrams/sec 68210.1 | eta 0h0m15s
| epoch 53 | step 2500/4071 | loss 8.2756 | lr 0.00100 | ngrams/sec 68228.3 | eta 0h0m11s
| epoch 53 | step 3000/4071 | loss 8.2723 | lr 0.00100 | ngrams/sec 67825.4 | eta 0h0m8s
| epoch 53 | step 3500/4071 | loss 8.2718 | lr 0.00100 | ngrams/sec 68129.5 | eta 0h0m4s
| epoch 53 | step 4000/4071 | loss 8.2576 | lr 0.00100 | ngrams/sec 68216.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 31.18s | valid loss  6.61 | valid ppl   739.17
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 54 | step 500/4071 | loss 8.2421 | lr 0.00100 | ngrams/sec 50119.1 | eta 0h0m36s
| epoch 54 | step 1000/4071 | loss 8.2462 | lr 0.00100 | ngrams/sec 67973.1 | eta 0h0m23s
| epoch 54 | step 1500/4071 | loss 8.2625 | lr 0.00100 | ngrams/sec 68298.9 | eta 0h0m19s
| epoch 54 | step 2000/4071 | loss 8.2584 | lr 0.00100 | ngrams/sec 67700.5 | eta 0h0m15s
| epoch 54 | step 2500/4071 | loss 8.2608 | lr 0.00100 | ngrams/sec 67457.6 | eta 0h0m11s
| epoch 54 | step 3000/4071 | loss 8.2504 | lr 0.00100 | ngrams/sec 67516.8 | eta 0h0m8s
| epoch 54 | step 3500/4071 | loss 8.2658 | lr 0.00100 | ngrams/sec 68142.9 | eta 0h0m4s
| epoch 54 | step 4000/4071 | loss 8.2598 | lr 0.00100 | ngrams/sec 68322.1 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1549.08it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 31.24s | valid loss  6.63 | valid ppl   760.72
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 8.2423 | lr 0.00100 | ngrams/sec 50818.4 | eta 0h0m35s
| epoch 55 | step 1000/4071 | loss 8.2461 | lr 0.00100 | ngrams/sec 67886.2 | eta 0h0m23s
| epoch 55 | step 1500/4071 | loss 8.2535 | lr 0.00100 | ngrams/sec 68301.2 | eta 0h0m19s
| epoch 55 | step 2000/4071 | loss 8.2448 | lr 0.00100 | ngrams/sec 68538.9 | eta 0h0m15s
| epoch 55 | step 2500/4071 | loss 8.2630 | lr 0.00100 | ngrams/sec 67836.0 | eta 0h0m11s
| epoch 55 | step 3000/4071 | loss 8.2731 | lr 0.00100 | ngrams/sec 68389.2 | eta 0h0m8s
| epoch 55 | step 3500/4071 | loss 8.2681 | lr 0.00100 | ngrams/sec 67815.3 | eta 0h0m4s
| epoch 55 | step 4000/4071 | loss 8.2529 | lr 0.00100 | ngrams/sec 68200.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 31.15s | valid loss  6.62 | valid ppl   752.31
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 8.2248 | lr 0.00100 | ngrams/sec 50690.2 | eta 0h0m36s
| epoch 56 | step 1000/4071 | loss 8.2414 | lr 0.00100 | ngrams/sec 67930.9 | eta 0h0m23s
| epoch 56 | step 1500/4071 | loss 8.2497 | lr 0.00100 | ngrams/sec 68162.6 | eta 0h0m19s
| epoch 56 | step 2000/4071 | loss 8.2546 | lr 0.00100 | ngrams/sec 68329.0 | eta 0h0m15s
| epoch 56 | step 2500/4071 | loss 8.2485 | lr 0.00100 | ngrams/sec 68000.2 | eta 0h0m11s
| epoch 56 | step 3000/4071 | loss 8.2462 | lr 0.00100 | ngrams/sec 68046.5 | eta 0h0m8s
| epoch 56 | step 3500/4071 | loss 8.2506 | lr 0.00100 | ngrams/sec 67916.3 | eta 0h0m4s
| epoch 56 | step 4000/4071 | loss 8.2569 | lr 0.00100 | ngrams/sec 68277.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 31.19s | valid loss  6.64 | valid ppl   766.85
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 8.2182 | lr 0.00100 | ngrams/sec 50658.1 | eta 0h0m36s
| epoch 57 | step 1000/4071 | loss 8.2353 | lr 0.00100 | ngrams/sec 68185.8 | eta 0h0m23s
| epoch 57 | step 1500/4071 | loss 8.2262 | lr 0.00100 | ngrams/sec 68164.7 | eta 0h0m19s
| epoch 57 | step 2000/4071 | loss 8.2490 | lr 0.00100 | ngrams/sec 68086.5 | eta 0h0m15s
| epoch 57 | step 2500/4071 | loss 8.2659 | lr 0.00100 | ngrams/sec 67902.9 | eta 0h0m11s
| epoch 57 | step 3000/4071 | loss 8.2447 | lr 0.00100 | ngrams/sec 68244.9 | eta 0h0m8s
| epoch 57 | step 3500/4071 | loss 8.2518 | lr 0.00100 | ngrams/sec 68124.0 | eta 0h0m4s
| epoch 57 | step 4000/4071 | loss 8.2444 | lr 0.00100 | ngrams/sec 68011.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1543.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 31.18s | valid loss  6.62 | valid ppl   752.35
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/4071 | loss 8.2282 | lr 0.00100 | ngrams/sec 50480.7 | eta 0h0m36s
| epoch 58 | step 1000/4071 | loss 8.2296 | lr 0.00100 | ngrams/sec 68495.4 | eta 0h0m22s
| epoch 58 | step 1500/4071 | loss 8.2308 | lr 0.00100 | ngrams/sec 68425.6 | eta 0h0m19s
| epoch 58 | step 2000/4071 | loss 8.2384 | lr 0.00100 | ngrams/sec 67761.6 | eta 0h0m15s
| epoch 58 | step 2500/4071 | loss 8.2449 | lr 0.00100 | ngrams/sec 68015.3 | eta 0h0m11s
| epoch 58 | step 3000/4071 | loss 8.2381 | lr 0.00100 | ngrams/sec 68255.5 | eta 0h0m8s
| epoch 58 | step 3500/4071 | loss 8.2371 | lr 0.00100 | ngrams/sec 68138.2 | eta 0h0m4s
| epoch 58 | step 4000/4071 | loss 8.2274 | lr 0.00100 | ngrams/sec 68148.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1559.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 31.17s | valid loss  6.60 | valid ppl   734.39
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 59 | step 500/4071 | loss 8.2271 | lr 0.00100 | ngrams/sec 50035.9 | eta 0h0m36s
| epoch 59 | step 1000/4071 | loss 8.2341 | lr 0.00100 | ngrams/sec 68086.3 | eta 0h0m23s
| epoch 59 | step 1500/4071 | loss 8.2217 | lr 0.00100 | ngrams/sec 68254.3 | eta 0h0m19s
| epoch 59 | step 2000/4071 | loss 8.2276 | lr 0.00100 | ngrams/sec 68146.5 | eta 0h0m15s
| epoch 59 | step 2500/4071 | loss 8.2272 | lr 0.00100 | ngrams/sec 68331.2 | eta 0h0m11s
| epoch 59 | step 3000/4071 | loss 8.2449 | lr 0.00100 | ngrams/sec 68157.4 | eta 0h0m8s
| epoch 59 | step 3500/4071 | loss 8.2385 | lr 0.00100 | ngrams/sec 68239.4 | eta 0h0m4s
| epoch 59 | step 4000/4071 | loss 8.2380 | lr 0.00100 | ngrams/sec 67925.7 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1551.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 31.15s | valid loss  6.59 | valid ppl   729.40
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 60 | step 500/4071 | loss 8.2097 | lr 0.00100 | ngrams/sec 49795.7 | eta 0h0m36s
| epoch 60 | step 1000/4071 | loss 8.2217 | lr 0.00100 | ngrams/sec 68204.9 | eta 0h0m23s
| epoch 60 | step 1500/4071 | loss 8.2178 | lr 0.00100 | ngrams/sec 68308.8 | eta 0h0m19s
| epoch 60 | step 2000/4071 | loss 8.2136 | lr 0.00100 | ngrams/sec 68285.4 | eta 0h0m15s
| epoch 60 | step 2500/4071 | loss 8.2239 | lr 0.00100 | ngrams/sec 67941.7 | eta 0h0m11s
| epoch 60 | step 3000/4071 | loss 8.2285 | lr 0.00100 | ngrams/sec 68172.2 | eta 0h0m8s
| epoch 60 | step 3500/4071 | loss 8.2298 | lr 0.00100 | ngrams/sec 67966.5 | eta 0h0m4s
| epoch 60 | step 4000/4071 | loss 8.2371 | lr 0.00100 | ngrams/sec 68127.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1550.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 31.19s | valid loss  6.60 | valid ppl   738.06
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 8.2089 | lr 0.00100 | ngrams/sec 50625.3 | eta 0h0m36s
| epoch 61 | step 1000/4071 | loss 8.2208 | lr 0.00100 | ngrams/sec 67932.8 | eta 0h0m23s
| epoch 61 | step 1500/4071 | loss 8.2178 | lr 0.00100 | ngrams/sec 68216.5 | eta 0h0m19s
| epoch 61 | step 2000/4071 | loss 8.2361 | lr 0.00100 | ngrams/sec 68348.6 | eta 0h0m15s
| epoch 61 | step 2500/4071 | loss 8.2288 | lr 0.00100 | ngrams/sec 68207.8 | eta 0h0m11s
| epoch 61 | step 3000/4071 | loss 8.2299 | lr 0.00100 | ngrams/sec 68216.3 | eta 0h0m8s
| epoch 61 | step 3500/4071 | loss 8.2399 | lr 0.00100 | ngrams/sec 68262.8 | eta 0h0m4s
| epoch 61 | step 4000/4071 | loss 8.2218 | lr 0.00100 | ngrams/sec 67852.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.02it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 31.17s | valid loss  6.62 | valid ppl   748.72
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 8.2012 | lr 0.00100 | ngrams/sec 50641.0 | eta 0h0m36s
| epoch 62 | step 1000/4071 | loss 8.2093 | lr 0.00100 | ngrams/sec 67913.9 | eta 0h0m23s
| epoch 62 | step 1500/4071 | loss 8.2168 | lr 0.00100 | ngrams/sec 68426.1 | eta 0h0m19s
| epoch 62 | step 2000/4071 | loss 8.2148 | lr 0.00100 | ngrams/sec 68167.1 | eta 0h0m15s
| epoch 62 | step 2500/4071 | loss 8.2305 | lr 0.00100 | ngrams/sec 68090.7 | eta 0h0m11s
| epoch 62 | step 3000/4071 | loss 8.2346 | lr 0.00100 | ngrams/sec 68128.2 | eta 0h0m8s
| epoch 62 | step 3500/4071 | loss 8.2320 | lr 0.00100 | ngrams/sec 68363.2 | eta 0h0m4s
| epoch 62 | step 4000/4071 | loss 8.2272 | lr 0.00100 | ngrams/sec 68199.9 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1552.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 31.15s | valid loss  6.59 | valid ppl   729.13
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 63 | step 500/4071 | loss 8.1935 | lr 0.00100 | ngrams/sec 50014.4 | eta 0h0m36s
| epoch 63 | step 1000/4071 | loss 8.2142 | lr 0.00100 | ngrams/sec 67942.7 | eta 0h0m23s
| epoch 63 | step 1500/4071 | loss 8.2038 | lr 0.00100 | ngrams/sec 68087.0 | eta 0h0m19s
| epoch 63 | step 2000/4071 | loss 8.2166 | lr 0.00100 | ngrams/sec 67999.3 | eta 0h0m15s
| epoch 63 | step 2500/4071 | loss 8.2281 | lr 0.00100 | ngrams/sec 68013.1 | eta 0h0m11s
| epoch 63 | step 3000/4071 | loss 8.2231 | lr 0.00100 | ngrams/sec 67952.6 | eta 0h0m8s
| epoch 63 | step 3500/4071 | loss 8.2226 | lr 0.00100 | ngrams/sec 68518.8 | eta 0h0m4s
| epoch 63 | step 4000/4071 | loss 8.2116 | lr 0.00100 | ngrams/sec 67986.9 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1546.86it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 31.20s | valid loss  6.61 | valid ppl   743.10
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 8.1897 | lr 0.00100 | ngrams/sec 50567.5 | eta 0h0m36s
| epoch 64 | step 1000/4071 | loss 8.1998 | lr 0.00100 | ngrams/sec 67986.4 | eta 0h0m23s
| epoch 64 | step 1500/4071 | loss 8.2099 | lr 0.00100 | ngrams/sec 67933.6 | eta 0h0m19s
| epoch 64 | step 2000/4071 | loss 8.2014 | lr 0.00100 | ngrams/sec 67550.3 | eta 0h0m15s
| epoch 64 | step 2500/4071 | loss 8.2114 | lr 0.00100 | ngrams/sec 67185.1 | eta 0h0m11s
| epoch 64 | step 3000/4071 | loss 8.2143 | lr 0.00100 | ngrams/sec 68007.9 | eta 0h0m8s
| epoch 64 | step 3500/4071 | loss 8.2178 | lr 0.00100 | ngrams/sec 67966.4 | eta 0h0m4s
| epoch 64 | step 4000/4071 | loss 8.2211 | lr 0.00100 | ngrams/sec 67993.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.99it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 31.31s | valid loss  6.57 | valid ppl   712.38
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 65 | step 500/4071 | loss 8.1900 | lr 0.00100 | ngrams/sec 49106.0 | eta 0h0m37s
| epoch 65 | step 1000/4071 | loss 8.2116 | lr 0.00100 | ngrams/sec 67942.3 | eta 0h0m23s
| epoch 65 | step 1500/4071 | loss 8.2120 | lr 0.00100 | ngrams/sec 68192.4 | eta 0h0m19s
| epoch 65 | step 2000/4071 | loss 8.1990 | lr 0.00100 | ngrams/sec 68260.1 | eta 0h0m15s
| epoch 65 | step 2500/4071 | loss 8.2060 | lr 0.00100 | ngrams/sec 68033.7 | eta 0h0m11s
| epoch 65 | step 3000/4071 | loss 8.2137 | lr 0.00100 | ngrams/sec 68061.8 | eta 0h0m8s
| epoch 65 | step 3500/4071 | loss 8.2076 | lr 0.00100 | ngrams/sec 68212.4 | eta 0h0m4s
| epoch 65 | step 4000/4071 | loss 8.2200 | lr 0.00100 | ngrams/sec 67046.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 31.35s | valid loss  6.59 | valid ppl   730.85
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 8.1716 | lr 0.00100 | ngrams/sec 50468.0 | eta 0h0m36s
| epoch 66 | step 1000/4071 | loss 8.1975 | lr 0.00100 | ngrams/sec 68052.1 | eta 0h0m23s
| epoch 66 | step 1500/4071 | loss 8.1915 | lr 0.00100 | ngrams/sec 67973.1 | eta 0h0m19s
| epoch 66 | step 2000/4071 | loss 8.2174 | lr 0.00100 | ngrams/sec 68187.1 | eta 0h0m15s
| epoch 66 | step 2500/4071 | loss 8.2194 | lr 0.00100 | ngrams/sec 68034.0 | eta 0h0m11s
| epoch 66 | step 3000/4071 | loss 8.2061 | lr 0.00100 | ngrams/sec 67986.6 | eta 0h0m8s
| epoch 66 | step 3500/4071 | loss 8.2121 | lr 0.00100 | ngrams/sec 67939.8 | eta 0h0m4s
| epoch 66 | step 4000/4071 | loss 8.2132 | lr 0.00100 | ngrams/sec 68183.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1539.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 31.21s | valid loss  6.58 | valid ppl   719.86
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 8.1832 | lr 0.00100 | ngrams/sec 50452.0 | eta 0h0m36s
| epoch 67 | step 1000/4071 | loss 8.1854 | lr 0.00100 | ngrams/sec 67983.8 | eta 0h0m23s
| epoch 67 | step 1500/4071 | loss 8.1943 | lr 0.00100 | ngrams/sec 68124.0 | eta 0h0m19s
| epoch 67 | step 2000/4071 | loss 8.2063 | lr 0.00100 | ngrams/sec 67912.2 | eta 0h0m15s
| epoch 67 | step 2500/4071 | loss 8.2092 | lr 0.00100 | ngrams/sec 67572.6 | eta 0h0m11s
| epoch 67 | step 3000/4071 | loss 8.2139 | lr 0.00100 | ngrams/sec 67961.8 | eta 0h0m8s
| epoch 67 | step 3500/4071 | loss 8.2131 | lr 0.00100 | ngrams/sec 67964.3 | eta 0h0m4s
| epoch 67 | step 4000/4071 | loss 8.2076 | lr 0.00100 | ngrams/sec 67997.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.28it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 31.28s | valid loss  6.60 | valid ppl   737.55
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 8.1824 | lr 0.00100 | ngrams/sec 50395.8 | eta 0h0m36s
| epoch 68 | step 1000/4071 | loss 8.1815 | lr 0.00100 | ngrams/sec 67817.8 | eta 0h0m23s
| epoch 68 | step 1500/4071 | loss 8.1970 | lr 0.00100 | ngrams/sec 68424.4 | eta 0h0m19s
| epoch 68 | step 2000/4071 | loss 8.1955 | lr 0.00100 | ngrams/sec 67939.1 | eta 0h0m15s
| epoch 68 | step 2500/4071 | loss 8.1915 | lr 0.00100 | ngrams/sec 67679.9 | eta 0h0m11s
| epoch 68 | step 3000/4071 | loss 8.1928 | lr 0.00100 | ngrams/sec 68036.1 | eta 0h0m8s
| epoch 68 | step 3500/4071 | loss 8.1863 | lr 0.00100 | ngrams/sec 67991.1 | eta 0h0m4s
| epoch 68 | step 4000/4071 | loss 8.2110 | lr 0.00100 | ngrams/sec 67863.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 31.26s | valid loss  6.59 | valid ppl   725.00
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 8.1729 | lr 0.00100 | ngrams/sec 50450.7 | eta 0h0m36s
| epoch 69 | step 1000/4071 | loss 8.1812 | lr 0.00100 | ngrams/sec 67776.8 | eta 0h0m23s
| epoch 69 | step 1500/4071 | loss 8.1986 | lr 0.00100 | ngrams/sec 68488.9 | eta 0h0m19s
| epoch 69 | step 2000/4071 | loss 8.1949 | lr 0.00100 | ngrams/sec 68224.0 | eta 0h0m15s
| epoch 69 | step 2500/4071 | loss 8.1922 | lr 0.00100 | ngrams/sec 67877.2 | eta 0h0m11s
| epoch 69 | step 3000/4071 | loss 8.1950 | lr 0.00100 | ngrams/sec 68121.8 | eta 0h0m8s
| epoch 69 | step 3500/4071 | loss 8.2040 | lr 0.00100 | ngrams/sec 68137.4 | eta 0h0m4s
| epoch 69 | step 4000/4071 | loss 8.2013 | lr 0.00100 | ngrams/sec 67847.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 31.22s | valid loss  6.57 | valid ppl   709.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 70 | step 500/4071 | loss 8.1786 | lr 0.00100 | ngrams/sec 49994.2 | eta 0h0m36s
| epoch 70 | step 1000/4071 | loss 8.1914 | lr 0.00100 | ngrams/sec 67824.8 | eta 0h0m23s
| epoch 70 | step 1500/4071 | loss 8.1862 | lr 0.00100 | ngrams/sec 68404.2 | eta 0h0m19s
| epoch 70 | step 2000/4071 | loss 8.1886 | lr 0.00100 | ngrams/sec 68125.4 | eta 0h0m15s
| epoch 70 | step 2500/4071 | loss 8.1904 | lr 0.00100 | ngrams/sec 67915.2 | eta 0h0m11s
| epoch 70 | step 3000/4071 | loss 8.1890 | lr 0.00100 | ngrams/sec 68303.9 | eta 0h0m8s
| epoch 70 | step 3500/4071 | loss 8.1902 | lr 0.00100 | ngrams/sec 68001.3 | eta 0h0m4s
| epoch 70 | step 4000/4071 | loss 8.1879 | lr 0.00100 | ngrams/sec 67734.3 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1554.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 31.21s | valid loss  6.61 | valid ppl   745.96
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 8.1661 | lr 0.00100 | ngrams/sec 50426.6 | eta 0h0m36s
| epoch 71 | step 1000/4071 | loss 8.1727 | lr 0.00100 | ngrams/sec 67940.8 | eta 0h0m23s
| epoch 71 | step 1500/4071 | loss 8.1815 | lr 0.00100 | ngrams/sec 68286.3 | eta 0h0m19s
| epoch 71 | step 2000/4071 | loss 8.1777 | lr 0.00100 | ngrams/sec 68417.1 | eta 0h0m15s
| epoch 71 | step 2500/4071 | loss 8.1733 | lr 0.00100 | ngrams/sec 68006.3 | eta 0h0m11s
| epoch 71 | step 3000/4071 | loss 8.1888 | lr 0.00100 | ngrams/sec 68178.0 | eta 0h0m8s
| epoch 71 | step 3500/4071 | loss 8.1819 | lr 0.00100 | ngrams/sec 67877.3 | eta 0h0m4s
| epoch 71 | step 4000/4071 | loss 8.1866 | lr 0.00100 | ngrams/sec 68067.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1539.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 31.20s | valid loss  6.57 | valid ppl   713.23
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 8.1711 | lr 0.00100 | ngrams/sec 50601.3 | eta 0h0m36s
| epoch 72 | step 1000/4071 | loss 8.1693 | lr 0.00100 | ngrams/sec 68242.2 | eta 0h0m23s
| epoch 72 | step 1500/4071 | loss 8.1556 | lr 0.00100 | ngrams/sec 67867.2 | eta 0h0m19s
| epoch 72 | step 2000/4071 | loss 8.1622 | lr 0.00100 | ngrams/sec 68142.1 | eta 0h0m15s
| epoch 72 | step 2500/4071 | loss 8.1691 | lr 0.00100 | ngrams/sec 66207.4 | eta 0h0m12s
| epoch 72 | step 3000/4071 | loss 8.1977 | lr 0.00100 | ngrams/sec 68118.6 | eta 0h0m8s
| epoch 72 | step 3500/4071 | loss 8.1735 | lr 0.00100 | ngrams/sec 68197.3 | eta 0h0m4s
| epoch 72 | step 4000/4071 | loss 8.1803 | lr 0.00100 | ngrams/sec 67973.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1547.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 31.30s | valid loss  6.61 | valid ppl   740.48
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 8.1542 | lr 0.00100 | ngrams/sec 50826.1 | eta 0h0m35s
| epoch 73 | step 1000/4071 | loss 8.1645 | lr 0.00100 | ngrams/sec 68233.5 | eta 0h0m23s
| epoch 73 | step 1500/4071 | loss 8.1687 | lr 0.00100 | ngrams/sec 68319.6 | eta 0h0m19s
| epoch 73 | step 2000/4071 | loss 8.1771 | lr 0.00100 | ngrams/sec 67937.9 | eta 0h0m15s
| epoch 73 | step 2500/4071 | loss 8.1767 | lr 0.00100 | ngrams/sec 67939.6 | eta 0h0m11s
| epoch 73 | step 3000/4071 | loss 8.1789 | lr 0.00100 | ngrams/sec 68304.6 | eta 0h0m8s
| epoch 73 | step 3500/4071 | loss 8.1724 | lr 0.00100 | ngrams/sec 68033.7 | eta 0h0m4s
| epoch 73 | step 4000/4071 | loss 8.1755 | lr 0.00100 | ngrams/sec 67743.5 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 31.18s | valid loss  6.60 | valid ppl   734.45
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 8.1556 | lr 0.00100 | ngrams/sec 50557.3 | eta 0h0m36s
| epoch 74 | step 1000/4071 | loss 8.1711 | lr 0.00100 | ngrams/sec 67205.6 | eta 0h0m23s
| epoch 74 | step 1500/4071 | loss 8.1620 | lr 0.00100 | ngrams/sec 67190.9 | eta 0h0m19s
| epoch 74 | step 2000/4071 | loss 8.1692 | lr 0.00100 | ngrams/sec 67555.8 | eta 0h0m15s
| epoch 74 | step 2500/4071 | loss 8.1734 | lr 0.00100 | ngrams/sec 68001.3 | eta 0h0m11s
| epoch 74 | step 3000/4071 | loss 8.1768 | lr 0.00100 | ngrams/sec 68063.2 | eta 0h0m8s
| epoch 74 | step 3500/4071 | loss 8.1812 | lr 0.00100 | ngrams/sec 68455.3 | eta 0h0m4s
| epoch 74 | step 4000/4071 | loss 8.1739 | lr 0.00100 | ngrams/sec 67788.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.36it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 31.34s | valid loss  6.60 | valid ppl   732.60
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 8.1378 | lr 0.00100 | ngrams/sec 50525.3 | eta 0h0m36s
| epoch 75 | step 1000/4071 | loss 8.1652 | lr 0.00100 | ngrams/sec 68181.1 | eta 0h0m23s
| epoch 75 | step 1500/4071 | loss 8.1542 | lr 0.00100 | ngrams/sec 68343.3 | eta 0h0m19s
| epoch 75 | step 2000/4071 | loss 8.1663 | lr 0.00100 | ngrams/sec 68074.4 | eta 0h0m15s
| epoch 75 | step 2500/4071 | loss 8.1669 | lr 0.00100 | ngrams/sec 67885.7 | eta 0h0m11s
| epoch 75 | step 3000/4071 | loss 8.1603 | lr 0.00100 | ngrams/sec 68167.9 | eta 0h0m8s
| epoch 75 | step 3500/4071 | loss 8.1782 | lr 0.00100 | ngrams/sec 68225.4 | eta 0h0m4s
| epoch 75 | step 4000/4071 | loss 8.1716 | lr 0.00100 | ngrams/sec 67950.9 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1545.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 31.18s | valid loss  6.57 | valid ppl   716.90
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/4071 | loss 8.1302 | lr 0.00100 | ngrams/sec 50587.8 | eta 0h0m36s
| epoch 76 | step 1000/4071 | loss 8.1609 | lr 0.00100 | ngrams/sec 67857.3 | eta 0h0m23s
| epoch 76 | step 1500/4071 | loss 8.1636 | lr 0.00100 | ngrams/sec 67990.1 | eta 0h0m19s
| epoch 76 | step 2000/4071 | loss 8.1626 | lr 0.00100 | ngrams/sec 68177.7 | eta 0h0m15s
| epoch 76 | step 2500/4071 | loss 8.1603 | lr 0.00100 | ngrams/sec 68005.9 | eta 0h0m11s
| epoch 76 | step 3000/4071 | loss 8.1711 | lr 0.00100 | ngrams/sec 67803.3 | eta 0h0m8s
| epoch 76 | step 3500/4071 | loss 8.1772 | lr 0.00100 | ngrams/sec 68090.0 | eta 0h0m4s
| epoch 76 | step 4000/4071 | loss 8.1750 | lr 0.00100 | ngrams/sec 68040.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 31.23s | valid loss  6.58 | valid ppl   724.02
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 8.1377 | lr 0.00100 | ngrams/sec 50495.3 | eta 0h0m36s
| epoch 77 | step 1000/4071 | loss 8.1490 | lr 0.00100 | ngrams/sec 67381.5 | eta 0h0m23s
| epoch 77 | step 1500/4071 | loss 8.1504 | lr 0.00100 | ngrams/sec 68032.7 | eta 0h0m19s
| epoch 77 | step 2000/4071 | loss 8.1571 | lr 0.00100 | ngrams/sec 67921.7 | eta 0h0m15s
| epoch 77 | step 2500/4071 | loss 8.1617 | lr 0.00100 | ngrams/sec 67612.2 | eta 0h0m11s
| epoch 77 | step 3000/4071 | loss 8.1687 | lr 0.00100 | ngrams/sec 67790.5 | eta 0h0m8s
| epoch 77 | step 3500/4071 | loss 8.1637 | lr 0.00100 | ngrams/sec 68039.2 | eta 0h0m4s
| epoch 77 | step 4000/4071 | loss 8.1490 | lr 0.00100 | ngrams/sec 67789.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 31.32s | valid loss  6.56 | valid ppl   704.67
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 78 | step 500/4071 | loss 8.1346 | lr 0.00100 | ngrams/sec 49802.3 | eta 0h0m36s
| epoch 78 | step 1000/4071 | loss 8.1251 | lr 0.00100 | ngrams/sec 67932.0 | eta 0h0m23s
| epoch 78 | step 1500/4071 | loss 8.1437 | lr 0.00100 | ngrams/sec 68026.9 | eta 0h0m19s
| epoch 78 | step 2000/4071 | loss 8.1576 | lr 0.00100 | ngrams/sec 67915.3 | eta 0h0m15s
| epoch 78 | step 2500/4071 | loss 8.1562 | lr 0.00100 | ngrams/sec 68272.8 | eta 0h0m11s
| epoch 78 | step 3000/4071 | loss 8.1494 | lr 0.00100 | ngrams/sec 68046.6 | eta 0h0m8s
| epoch 78 | step 3500/4071 | loss 8.1664 | lr 0.00100 | ngrams/sec 68346.4 | eta 0h0m4s
| epoch 78 | step 4000/4071 | loss 8.1602 | lr 0.00100 | ngrams/sec 67820.2 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 31.21s | valid loss  6.57 | valid ppl   714.08
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 8.1236 | lr 0.00100 | ngrams/sec 50484.1 | eta 0h0m36s
| epoch 79 | step 1000/4071 | loss 8.1433 | lr 0.00100 | ngrams/sec 68163.4 | eta 0h0m23s
| epoch 79 | step 1500/4071 | loss 8.1565 | lr 0.00100 | ngrams/sec 67923.6 | eta 0h0m19s
| epoch 79 | step 2000/4071 | loss 8.1542 | lr 0.00100 | ngrams/sec 68139.5 | eta 0h0m15s
| epoch 79 | step 2500/4071 | loss 8.1485 | lr 0.00100 | ngrams/sec 67999.2 | eta 0h0m11s
| epoch 79 | step 3000/4071 | loss 8.1606 | lr 0.00100 | ngrams/sec 67757.5 | eta 0h0m8s
| epoch 79 | step 3500/4071 | loss 8.1509 | lr 0.00100 | ngrams/sec 68152.3 | eta 0h0m4s
| epoch 79 | step 4000/4071 | loss 8.1641 | lr 0.00100 | ngrams/sec 67865.4 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1543.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 719.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 31.25s | valid loss  6.58 | valid ppl   723.83
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 8.1323 | lr 0.00100 | ngrams/sec 50578.5 | eta 0h0m36s
| epoch 80 | step 1000/4071 | loss 8.1308 | lr 0.00100 | ngrams/sec 68190.5 | eta 0h0m23s
| epoch 80 | step 1500/4071 | loss 8.1449 | lr 0.00100 | ngrams/sec 67905.9 | eta 0h0m19s
| epoch 80 | step 2000/4071 | loss 8.1434 | lr 0.00100 | ngrams/sec 68183.2 | eta 0h0m15s
| epoch 80 | step 2500/4071 | loss 8.1541 | lr 0.00100 | ngrams/sec 67656.3 | eta 0h0m11s
| epoch 80 | step 3000/4071 | loss 8.1345 | lr 0.00100 | ngrams/sec 68026.9 | eta 0h0m8s
| epoch 80 | step 3500/4071 | loss 8.1405 | lr 0.00100 | ngrams/sec 68281.3 | eta 0h0m4s
| epoch 80 | step 4000/4071 | loss 8.1463 | lr 0.00100 | ngrams/sec 68058.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1537.47it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 31.20s | valid loss  6.59 | valid ppl   724.80
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 8.1360 | lr 0.00100 | ngrams/sec 50593.9 | eta 0h0m36s
| epoch 81 | step 1000/4071 | loss 8.1362 | lr 0.00100 | ngrams/sec 67972.8 | eta 0h0m23s
| epoch 81 | step 1500/4071 | loss 8.1446 | lr 0.00100 | ngrams/sec 67755.9 | eta 0h0m19s
| epoch 81 | step 2000/4071 | loss 8.1246 | lr 0.00100 | ngrams/sec 67880.0 | eta 0h0m15s
| epoch 81 | step 2500/4071 | loss 8.1343 | lr 0.00100 | ngrams/sec 68039.6 | eta 0h0m11s
| epoch 81 | step 3000/4071 | loss 8.1628 | lr 0.00100 | ngrams/sec 68207.8 | eta 0h0m8s
| epoch 81 | step 3500/4071 | loss 8.1409 | lr 0.00100 | ngrams/sec 68171.9 | eta 0h0m4s
| epoch 81 | step 4000/4071 | loss 8.1393 | lr 0.00100 | ngrams/sec 68024.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 31.22s | valid loss  6.58 | valid ppl   720.39
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 8.1235 | lr 0.00100 | ngrams/sec 50521.9 | eta 0h0m36s
| epoch 82 | step 1000/4071 | loss 8.1284 | lr 0.00100 | ngrams/sec 68101.6 | eta 0h0m23s
| epoch 82 | step 1500/4071 | loss 8.1561 | lr 0.00100 | ngrams/sec 67947.1 | eta 0h0m19s
| epoch 82 | step 2000/4071 | loss 8.1297 | lr 0.00100 | ngrams/sec 67877.8 | eta 0h0m15s
| epoch 82 | step 2500/4071 | loss 8.1418 | lr 0.00100 | ngrams/sec 68234.3 | eta 0h0m11s
| epoch 82 | step 3000/4071 | loss 8.1406 | lr 0.00100 | ngrams/sec 67868.3 | eta 0h0m8s
| epoch 82 | step 3500/4071 | loss 8.1418 | lr 0.00100 | ngrams/sec 68177.6 | eta 0h0m4s
| epoch 82 | step 4000/4071 | loss 8.1259 | lr 0.00100 | ngrams/sec 68435.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1549.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 31.21s | valid loss  6.59 | valid ppl   727.85
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 8.1097 | lr 0.00100 | ngrams/sec 50719.0 | eta 0h0m36s
| epoch 83 | step 1000/4071 | loss 8.1254 | lr 0.00100 | ngrams/sec 67612.4 | eta 0h0m23s
| epoch 83 | step 1500/4071 | loss 8.1189 | lr 0.00100 | ngrams/sec 68002.6 | eta 0h0m19s
| epoch 83 | step 2000/4071 | loss 8.1381 | lr 0.00100 | ngrams/sec 68141.0 | eta 0h0m15s
| epoch 83 | step 2500/4071 | loss 8.1330 | lr 0.00100 | ngrams/sec 68180.3 | eta 0h0m11s
| epoch 83 | step 3000/4071 | loss 8.1338 | lr 0.00100 | ngrams/sec 68009.0 | eta 0h0m8s
| epoch 83 | step 3500/4071 | loss 8.1508 | lr 0.00100 | ngrams/sec 67775.9 | eta 0h0m4s
| epoch 83 | step 4000/4071 | loss 8.1413 | lr 0.00100 | ngrams/sec 67736.9 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 31.25s | valid loss  6.55 | valid ppl   700.41
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 84 | step 500/4071 | loss 8.1198 | lr 0.00100 | ngrams/sec 49412.5 | eta 0h0m37s
| epoch 84 | step 1000/4071 | loss 8.1075 | lr 0.00100 | ngrams/sec 67333.1 | eta 0h0m23s
| epoch 84 | step 1500/4071 | loss 8.1359 | lr 0.00100 | ngrams/sec 66125.4 | eta 0h0m19s
| epoch 84 | step 2000/4071 | loss 8.1331 | lr 0.00100 | ngrams/sec 67741.4 | eta 0h0m15s
| epoch 84 | step 2500/4071 | loss 8.1199 | lr 0.00100 | ngrams/sec 67756.5 | eta 0h0m11s
| epoch 84 | step 3000/4071 | loss 8.1285 | lr 0.00100 | ngrams/sec 68127.8 | eta 0h0m8s
| epoch 84 | step 3500/4071 | loss 8.1345 | lr 0.00100 | ngrams/sec 68005.7 | eta 0h0m4s
| epoch 84 | step 4000/4071 | loss 8.1328 | lr 0.00100 | ngrams/sec 68071.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1548.00it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 31.43s | valid loss  6.59 | valid ppl   726.24
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 8.1064 | lr 0.00100 | ngrams/sec 50364.9 | eta 0h0m36s
| epoch 85 | step 1000/4071 | loss 8.1133 | lr 0.00100 | ngrams/sec 67875.7 | eta 0h0m23s
| epoch 85 | step 1500/4071 | loss 8.1226 | lr 0.00100 | ngrams/sec 67490.7 | eta 0h0m19s
| epoch 85 | step 2000/4071 | loss 8.1238 | lr 0.00100 | ngrams/sec 67724.5 | eta 0h0m15s
| epoch 85 | step 2500/4071 | loss 8.1275 | lr 0.00100 | ngrams/sec 67518.9 | eta 0h0m11s
| epoch 85 | step 3000/4071 | loss 8.1213 | lr 0.00100 | ngrams/sec 67840.1 | eta 0h0m8s
| epoch 85 | step 3500/4071 | loss 8.1314 | lr 0.00100 | ngrams/sec 68249.5 | eta 0h0m4s
| epoch 85 | step 4000/4071 | loss 8.1298 | lr 0.00100 | ngrams/sec 67813.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1545.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 31.34s | valid loss  6.56 | valid ppl   706.19
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 8.0880 | lr 0.00100 | ngrams/sec 50424.0 | eta 0h0m36s
| epoch 86 | step 1000/4071 | loss 8.1270 | lr 0.00100 | ngrams/sec 67824.3 | eta 0h0m23s
| epoch 86 | step 1500/4071 | loss 8.1242 | lr 0.00100 | ngrams/sec 67871.1 | eta 0h0m19s
| epoch 86 | step 2000/4071 | loss 8.1235 | lr 0.00100 | ngrams/sec 67997.6 | eta 0h0m15s
| epoch 86 | step 2500/4071 | loss 8.1153 | lr 0.00100 | ngrams/sec 67940.1 | eta 0h0m11s
| epoch 86 | step 3000/4071 | loss 8.1195 | lr 0.00100 | ngrams/sec 68010.1 | eta 0h0m8s
| epoch 86 | step 3500/4071 | loss 8.1256 | lr 0.00100 | ngrams/sec 68476.2 | eta 0h0m4s
| epoch 86 | step 4000/4071 | loss 8.1302 | lr 0.00100 | ngrams/sec 67965.9 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.50it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 31.23s | valid loss  6.56 | valid ppl   707.34
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 8.1021 | lr 0.00100 | ngrams/sec 50761.8 | eta 0h0m36s
| epoch 87 | step 1000/4071 | loss 8.1121 | lr 0.00100 | ngrams/sec 68281.1 | eta 0h0m23s
| epoch 87 | step 1500/4071 | loss 8.1127 | lr 0.00100 | ngrams/sec 67958.8 | eta 0h0m19s
| epoch 87 | step 2000/4071 | loss 8.1178 | lr 0.00100 | ngrams/sec 68259.4 | eta 0h0m15s
| epoch 87 | step 2500/4071 | loss 8.1056 | lr 0.00100 | ngrams/sec 68119.6 | eta 0h0m11s
| epoch 87 | step 3000/4071 | loss 8.1181 | lr 0.00100 | ngrams/sec 67938.3 | eta 0h0m8s
| epoch 87 | step 3500/4071 | loss 8.1119 | lr 0.00100 | ngrams/sec 68408.5 | eta 0h0m4s
| epoch 87 | step 4000/4071 | loss 8.1274 | lr 0.00100 | ngrams/sec 68185.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1542.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 31.15s | valid loss  6.59 | valid ppl   724.99
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 8.0934 | lr 0.00100 | ngrams/sec 50483.5 | eta 0h0m36s
| epoch 88 | step 1000/4071 | loss 8.1016 | lr 0.00100 | ngrams/sec 68060.6 | eta 0h0m23s
| epoch 88 | step 1500/4071 | loss 8.1026 | lr 0.00100 | ngrams/sec 68084.7 | eta 0h0m19s
| epoch 88 | step 2000/4071 | loss 8.1285 | lr 0.00100 | ngrams/sec 68184.2 | eta 0h0m15s
| epoch 88 | step 2500/4071 | loss 8.1213 | lr 0.00100 | ngrams/sec 68094.1 | eta 0h0m11s
| epoch 88 | step 3000/4071 | loss 8.1080 | lr 0.00100 | ngrams/sec 67617.5 | eta 0h0m8s
| epoch 88 | step 3500/4071 | loss 8.1150 | lr 0.00100 | ngrams/sec 68131.3 | eta 0h0m4s
| epoch 88 | step 4000/4071 | loss 8.1305 | lr 0.00100 | ngrams/sec 67693.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1546.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 31.25s | valid loss  6.59 | valid ppl   728.37
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 8.0986 | lr 0.00100 | ngrams/sec 50537.9 | eta 0h0m36s
| epoch 89 | step 1000/4071 | loss 8.1015 | lr 0.00100 | ngrams/sec 67952.6 | eta 0h0m23s
| epoch 89 | step 1500/4071 | loss 8.1195 | lr 0.00100 | ngrams/sec 68027.2 | eta 0h0m19s
| epoch 89 | step 2000/4071 | loss 8.1019 | lr 0.00100 | ngrams/sec 68100.1 | eta 0h0m15s
| epoch 89 | step 2500/4071 | loss 8.1005 | lr 0.00100 | ngrams/sec 68246.3 | eta 0h0m11s
| epoch 89 | step 3000/4071 | loss 8.1030 | lr 0.00100 | ngrams/sec 68016.6 | eta 0h0m8s
| epoch 89 | step 3500/4071 | loss 8.1230 | lr 0.00100 | ngrams/sec 68117.9 | eta 0h0m4s
| epoch 89 | step 4000/4071 | loss 8.1123 | lr 0.00100 | ngrams/sec 68093.4 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1546.85it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 31.20s | valid loss  6.54 | valid ppl   689.31
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 90 | step 500/4071 | loss 8.0838 | lr 0.00100 | ngrams/sec 49988.4 | eta 0h0m36s
| epoch 90 | step 1000/4071 | loss 8.0992 | lr 0.00100 | ngrams/sec 67992.6 | eta 0h0m23s
| epoch 90 | step 1500/4071 | loss 8.1015 | lr 0.00100 | ngrams/sec 68047.4 | eta 0h0m19s
| epoch 90 | step 2000/4071 | loss 8.1004 | lr 0.00100 | ngrams/sec 68179.1 | eta 0h0m15s
| epoch 90 | step 2500/4071 | loss 8.1049 | lr 0.00100 | ngrams/sec 67832.4 | eta 0h0m11s
| epoch 90 | step 3000/4071 | loss 8.1208 | lr 0.00100 | ngrams/sec 68065.5 | eta 0h0m8s
| epoch 90 | step 3500/4071 | loss 8.1051 | lr 0.00100 | ngrams/sec 68030.8 | eta 0h0m4s
| epoch 90 | step 4000/4071 | loss 8.0978 | lr 0.00100 | ngrams/sec 68057.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 31.22s | valid loss  6.54 | valid ppl   694.27
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 8.0790 | lr 0.00100 | ngrams/sec 50530.5 | eta 0h0m36s
| epoch 91 | step 1000/4071 | loss 8.0954 | lr 0.00100 | ngrams/sec 68325.2 | eta 0h0m23s
| epoch 91 | step 1500/4071 | loss 8.1047 | lr 0.00100 | ngrams/sec 67904.1 | eta 0h0m19s
| epoch 91 | step 2000/4071 | loss 8.0957 | lr 0.00100 | ngrams/sec 68097.2 | eta 0h0m15s
| epoch 91 | step 2500/4071 | loss 8.1019 | lr 0.00100 | ngrams/sec 68323.7 | eta 0h0m11s
| epoch 91 | step 3000/4071 | loss 8.0977 | lr 0.00100 | ngrams/sec 68230.3 | eta 0h0m8s
| epoch 91 | step 3500/4071 | loss 8.1096 | lr 0.00100 | ngrams/sec 68186.5 | eta 0h0m4s
| epoch 91 | step 4000/4071 | loss 8.1065 | lr 0.00100 | ngrams/sec 67823.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1545.73it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 31.17s | valid loss  6.58 | valid ppl   718.67
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 8.0832 | lr 0.00100 | ngrams/sec 50601.8 | eta 0h0m36s
| epoch 92 | step 1000/4071 | loss 8.1022 | lr 0.00100 | ngrams/sec 68216.6 | eta 0h0m23s
| epoch 92 | step 1500/4071 | loss 8.1077 | lr 0.00100 | ngrams/sec 67994.7 | eta 0h0m19s
| epoch 92 | step 2000/4071 | loss 8.0922 | lr 0.00100 | ngrams/sec 68432.1 | eta 0h0m15s
| epoch 92 | step 2500/4071 | loss 8.1094 | lr 0.00100 | ngrams/sec 67953.2 | eta 0h0m11s
| epoch 92 | step 3000/4071 | loss 8.1011 | lr 0.00100 | ngrams/sec 68126.5 | eta 0h0m8s
| epoch 92 | step 3500/4071 | loss 8.1075 | lr 0.00100 | ngrams/sec 67975.9 | eta 0h0m4s
| epoch 92 | step 4000/4071 | loss 8.1012 | lr 0.00100 | ngrams/sec 68000.2 | eta 0h0m0s


 37%|███▋      | 155/417 [00:00<00:00, 1549.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 31.20s | valid loss  6.54 | valid ppl   693.67
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 8.0660 | lr 0.00100 | ngrams/sec 50658.7 | eta 0h0m36s
| epoch 93 | step 1000/4071 | loss 8.0780 | lr 0.00100 | ngrams/sec 68025.6 | eta 0h0m23s
| epoch 93 | step 1500/4071 | loss 8.1018 | lr 0.00100 | ngrams/sec 68315.3 | eta 0h0m19s
| epoch 93 | step 2000/4071 | loss 8.0949 | lr 0.00100 | ngrams/sec 68220.4 | eta 0h0m15s
| epoch 93 | step 2500/4071 | loss 8.1041 | lr 0.00100 | ngrams/sec 67968.2 | eta 0h0m11s
| epoch 93 | step 3000/4071 | loss 8.0942 | lr 0.00100 | ngrams/sec 67463.1 | eta 0h0m8s
| epoch 93 | step 3500/4071 | loss 8.1144 | lr 0.00100 | ngrams/sec 67983.1 | eta 0h0m4s
| epoch 93 | step 4000/4071 | loss 8.1059 | lr 0.00100 | ngrams/sec 68086.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1543.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 31.23s | valid loss  6.56 | valid ppl   709.56
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 8.0691 | lr 0.00100 | ngrams/sec 50290.5 | eta 0h0m36s
| epoch 94 | step 1000/4071 | loss 8.0903 | lr 0.00100 | ngrams/sec 67414.4 | eta 0h0m23s
| epoch 94 | step 1500/4071 | loss 8.0834 | lr 0.00100 | ngrams/sec 67911.6 | eta 0h0m19s
| epoch 94 | step 2000/4071 | loss 8.0818 | lr 0.00100 | ngrams/sec 68011.5 | eta 0h0m15s
| epoch 94 | step 2500/4071 | loss 8.0846 | lr 0.00100 | ngrams/sec 68009.2 | eta 0h0m11s
| epoch 94 | step 3000/4071 | loss 8.1004 | lr 0.00100 | ngrams/sec 68141.7 | eta 0h0m8s
| epoch 94 | step 3500/4071 | loss 8.0841 | lr 0.00100 | ngrams/sec 68107.5 | eta 0h0m4s
| epoch 94 | step 4000/4071 | loss 8.0967 | lr 0.00100 | ngrams/sec 68323.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.29it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 31.26s | valid loss  6.56 | valid ppl   707.50
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 8.0666 | lr 0.00100 | ngrams/sec 50449.5 | eta 0h0m36s
| epoch 95 | step 1000/4071 | loss 8.0771 | lr 0.00100 | ngrams/sec 67992.7 | eta 0h0m23s
| epoch 95 | step 1500/4071 | loss 8.0838 | lr 0.00100 | ngrams/sec 68030.2 | eta 0h0m19s
| epoch 95 | step 2000/4071 | loss 8.0901 | lr 0.00100 | ngrams/sec 68014.6 | eta 0h0m15s
| epoch 95 | step 2500/4071 | loss 8.0875 | lr 0.00100 | ngrams/sec 68041.8 | eta 0h0m11s
| epoch 95 | step 3000/4071 | loss 8.0804 | lr 0.00100 | ngrams/sec 68017.8 | eta 0h0m8s
| epoch 95 | step 3500/4071 | loss 8.0917 | lr 0.00100 | ngrams/sec 68069.4 | eta 0h0m4s
| epoch 95 | step 4000/4071 | loss 8.1138 | lr 0.00100 | ngrams/sec 68103.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.86it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 31.22s | valid loss  6.54 | valid ppl   693.52
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 8.0592 | lr 0.00100 | ngrams/sec 50490.3 | eta 0h0m36s
| epoch 96 | step 1000/4071 | loss 8.0726 | lr 0.00100 | ngrams/sec 67923.0 | eta 0h0m23s
| epoch 96 | step 1500/4071 | loss 8.0823 | lr 0.00100 | ngrams/sec 67856.2 | eta 0h0m19s
| epoch 96 | step 2000/4071 | loss 8.0790 | lr 0.00100 | ngrams/sec 68151.9 | eta 0h0m15s
| epoch 96 | step 2500/4071 | loss 8.0807 | lr 0.00100 | ngrams/sec 68193.4 | eta 0h0m11s
| epoch 96 | step 3000/4071 | loss 8.0913 | lr 0.00100 | ngrams/sec 67842.7 | eta 0h0m8s
| epoch 96 | step 3500/4071 | loss 8.0912 | lr 0.00100 | ngrams/sec 68219.7 | eta 0h0m4s
| epoch 96 | step 4000/4071 | loss 8.0825 | lr 0.00100 | ngrams/sec 67895.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1535.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 31.24s | valid loss  6.56 | valid ppl   706.92
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 8.0630 | lr 0.00100 | ngrams/sec 50482.1 | eta 0h0m36s
| epoch 97 | step 1000/4071 | loss 8.0721 | lr 0.00100 | ngrams/sec 68227.9 | eta 0h0m23s
| epoch 97 | step 1500/4071 | loss 8.0771 | lr 0.00100 | ngrams/sec 67616.4 | eta 0h0m19s
| epoch 97 | step 2000/4071 | loss 8.0772 | lr 0.00100 | ngrams/sec 68042.1 | eta 0h0m15s
| epoch 97 | step 2500/4071 | loss 8.0767 | lr 0.00100 | ngrams/sec 67977.7 | eta 0h0m11s
| epoch 97 | step 3000/4071 | loss 8.0823 | lr 0.00100 | ngrams/sec 67555.0 | eta 0h0m8s
| epoch 97 | step 3500/4071 | loss 8.0881 | lr 0.00100 | ngrams/sec 67934.8 | eta 0h0m4s
| epoch 97 | step 4000/4071 | loss 8.0883 | lr 0.00100 | ngrams/sec 68141.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1554.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 31.26s | valid loss  6.62 | valid ppl   746.69
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 8.0440 | lr 0.00100 | ngrams/sec 50518.3 | eta 0h0m36s
| epoch 98 | step 1000/4071 | loss 8.0657 | lr 0.00100 | ngrams/sec 67878.5 | eta 0h0m23s
| epoch 98 | step 1500/4071 | loss 8.0689 | lr 0.00100 | ngrams/sec 67736.5 | eta 0h0m19s
| epoch 98 | step 2000/4071 | loss 8.0693 | lr 0.00100 | ngrams/sec 68062.5 | eta 0h0m15s
| epoch 98 | step 2500/4071 | loss 8.0814 | lr 0.00100 | ngrams/sec 67999.3 | eta 0h0m11s
| epoch 98 | step 3000/4071 | loss 8.0753 | lr 0.00100 | ngrams/sec 68057.1 | eta 0h0m8s
| epoch 98 | step 3500/4071 | loss 8.0681 | lr 0.00100 | ngrams/sec 67934.8 | eta 0h0m4s
| epoch 98 | step 4000/4071 | loss 8.0923 | lr 0.00100 | ngrams/sec 67963.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 31.25s | valid loss  6.58 | valid ppl   722.72
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 8.0578 | lr 0.00100 | ngrams/sec 50378.7 | eta 0h0m36s
| epoch 99 | step 1000/4071 | loss 8.0660 | lr 0.00100 | ngrams/sec 68109.1 | eta 0h0m23s
| epoch 99 | step 1500/4071 | loss 8.0722 | lr 0.00100 | ngrams/sec 67961.6 | eta 0h0m19s
| epoch 99 | step 2000/4071 | loss 8.0716 | lr 0.00100 | ngrams/sec 67790.0 | eta 0h0m15s
| epoch 99 | step 2500/4071 | loss 8.0783 | lr 0.00100 | ngrams/sec 67984.2 | eta 0h0m11s
| epoch 99 | step 3000/4071 | loss 8.0829 | lr 0.00100 | ngrams/sec 68021.3 | eta 0h0m8s
| epoch 99 | step 3500/4071 | loss 8.0730 | lr 0.00100 | ngrams/sec 67911.4 | eta 0h0m4s
| epoch 99 | step 4000/4071 | loss 8.0813 | lr 0.00100 | ngrams/sec 68175.1 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.28it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.57it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 31.26s | valid loss  6.55 | valid ppl   701.74
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 8.0636 | lr 0.00100 | ngrams/sec 50460.2 | eta 0h0m36s
| epoch 100 | step 1000/4071 | loss 8.0581 | lr 0.00100 | ngrams/sec 68010.1 | eta 0h0m23s
| epoch 100 | step 1500/4071 | loss 8.0660 | lr 0.00100 | ngrams/sec 68210.2 | eta 0h0m19s
| epoch 100 | step 2000/4071 | loss 8.0652 | lr 0.00100 | ngrams/sec 67875.9 | eta 0h0m15s
| epoch 100 | step 2500/4071 | loss 8.0756 | lr 0.00100 | ngrams/sec 67965.5 | eta 0h0m11s
| epoch 100 | step 3000/4071 | loss 8.0706 | lr 0.00100 | ngrams/sec 65316.4 | eta 0h0m8s
| epoch 100 | step 3500/4071 | loss 8.0771 | lr 0.00100 | ngrams/sec 67915.9 | eta 0h0m4s
| epoch 100 | step 4000/4071 | loss 8.0610 | lr 0.00100 | ngrams/sec 67862.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.12it/s]
  0%|          | 0/471 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| end of epoch 100 | time 31.41s | valid loss  6.56 | valid ppl   704.23
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:00<00:00, 697.49it/s]


| End of training | test loss  6.51 | test ppl   671.21


In [56]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [57]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

ntokens = n_class
# input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
print(input)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([[ 2774],
        [26931],
        [16204],
        [23326],
        [28058],
        [14935],
        [16636]], device='cuda:0')


In [58]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        input = input.cuda() if cuda else input
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

making
| Generated 0/100 words
usually
commissioned
animal
targets
mccall
study
others
during
places
course
| Generated 10/100 words
for
while
armor
areas
published
off
minute
emperor
that
decision
| Generated 20/100 words
flight
and
of
based
combined
adding
magazine
going
donated
parish
| Generated 30/100 words
faculty
had
spring
crimes
i
,
iii
(
influence
,
| Generated 40/100 words
had
apparently
-
he
programs
mass
just
slope
o
13
| Generated 50/100 words
that
and
by
at
attempts
jordan
important
adriatic
contain
for
| Generated 60/100 words
blocked
film
transport
anderson
this
lifestyle
occupied
.
to
of
| Generated 70/100 words
jack
both
such
serves
–
meyerbeer
later
develop
krishna
project
| Generated 80/100 words
heroes
part
then
adam
still
on
lies
on
terms
<unk>
| Generated 90/100 words
8
port
raj
civilian
any
using
,
,
manager
