In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [16]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [17]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [18]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [19]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [20]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [21]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [22]:
!nvidia-smi

Fri Nov 27 11:12:15 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    32W / 250W |   1259MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 25.9473 | lr 0.00100 | ngrams/sec 71559.9 | eta 0h0m25s
| epoch 1 | step 1000/4071 | loss 12.1431 | lr 0.00100 | ngrams/sec 71737.8 | eta 0h0m21s
| epoch 1 | step 1500/4071 | loss 10.6067 | lr 0.00100 | ngrams/sec 71974.8 | eta 0h0m18s
| epoch 1 | step 2000/4071 | loss 10.7713 | lr 0.00100 | ngrams/sec 71977.9 | eta 0h0m14s
| epoch 1 | step 2500/4071 | loss 11.3857 | lr 0.00100 | ngrams/sec 71700.4 | eta 0h0m11s
| epoch 1 | step 3000/4071 | loss 11.4669 | lr 0.00100 | ngrams/sec 71847.5 | eta 0h0m7s
| epoch 1 | step 3500/4071 | loss 11.4311 | lr 0.00100 | ngrams/sec 71944.5 | eta 0h0m4s
| epoch 1 | step 4000/4071 | loss 11.3315 | lr 0.00100 | ngrams/sec 71736.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 29.61s | valid loss  7.82 | valid ppl  2489.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 11.1537 | lr 0.00100 | ngrams/sec 52074.9 | eta 0h0m35s
| epoch 2 | step 1000/4071 | loss 11.0904 | lr 0.00100 | ngrams/sec 71783.1 | eta 0h0m21s
| epoch 2 | step 1500/4071 | loss 10.9442 | lr 0.00100 | ngrams/sec 71934.1 | eta 0h0m18s
| epoch 2 | step 2000/4071 | loss 10.6070 | lr 0.00100 | ngrams/sec 71700.8 | eta 0h0m14s
| epoch 2 | step 2500/4071 | loss 10.0507 | lr 0.00100 | ngrams/sec 71923.5 | eta 0h0m11s
| epoch 2 | step 3000/4071 | loss 9.6777 | lr 0.00100 | ngrams/sec 71942.8 | eta 0h0m7s
| epoch 2 | step 3500/4071 | loss 9.4967 | lr 0.00100 | ngrams/sec 72556.4 | eta 0h0m4s
| epoch 2 | step 4000/4071 | loss 9.4099 | lr 0.00100 | ngrams/sec 72695.2 | eta 0h0m0

 38%|███▊      | 157/417 [00:00<00:00, 1549.71it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 29.51s | valid loss  7.60 | valid ppl  1996.57
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 9.2508 | lr 0.00100 | ngrams/sec 52801.9 | eta 0h0m34s
| epoch 3 | step 1000/4071 | loss 9.2322 | lr 0.00100 | ngrams/sec 72683.4 | eta 0h0m21s
| epoch 3 | step 1500/4071 | loss 9.1715 | lr 0.00100 | ngrams/sec 72536.6 | eta 0h0m18s
| epoch 3 | step 2000/4071 | loss 9.1264 | lr 0.00100 | ngrams/sec 72640.9 | eta 0h0m14s
| epoch 3 | step 2500/4071 | loss 9.1034 | lr 0.00100 | ngrams/sec 72544.0 | eta 0h0m11s
| epoch 3 | step 3000/4071 | loss 9.0716 | lr 0.00100 | ngrams/sec 72447.6 | eta 0h0m7s
| epoch 3 | step 3500/4071 | loss 9.0474 | lr 0.00100 | ngrams/sec 72730.5 | eta 0h0m4s
| epoch 3 | step 4000/4071 | loss 9.0133 | lr 0.00100 | ngrams/sec 72557.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1548.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.99it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 29.29s | valid loss  7.44 | valid ppl  1695.70
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 8.9621 | lr 0.00100 | ngrams/sec 52710.2 | eta 0h0m34s
| epoch 4 | step 1000/4071 | loss 8.9381 | lr 0.00100 | ngrams/sec 72690.1 | eta 0h0m21s
| epoch 4 | step 1500/4071 | loss 8.9247 | lr 0.00100 | ngrams/sec 72865.7 | eta 0h0m18s
| epoch 4 | step 2000/4071 | loss 8.9228 | lr 0.00100 | ngrams/sec 72763.2 | eta 0h0m14s
| epoch 4 | step 2500/4071 | loss 8.9254 | lr 0.00100 | ngrams/sec 72861.8 | eta 0h0m11s
| epoch 4 | step 3000/4071 | loss 8.8768 | lr 0.00100 | ngrams/sec 72892.6 | eta 0h0m7s
| epoch 4 | step 3500/4071 | loss 8.8902 | lr 0.00100 | ngrams/sec 72834.0 | eta 0h0m4s
| epoch 4 | step 4000/4071 | loss 8.8738 | lr 0.00100 | ngrams/sec 72953.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 29.21s | valid loss  7.34 | valid ppl  1547.99
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 8.8169 | lr 0.00100 | ngrams/sec 52806.5 | eta 0h0m34s
| epoch 5 | step 1000/4071 | loss 8.8372 | lr 0.00100 | ngrams/sec 72771.7 | eta 0h0m21s
| epoch 5 | step 1500/4071 | loss 8.8091 | lr 0.00100 | ngrams/sec 72895.5 | eta 0h0m18s
| epoch 5 | step 2000/4071 | loss 8.8111 | lr 0.00100 | ngrams/sec 72642.0 | eta 0h0m14s
| epoch 5 | step 2500/4071 | loss 8.8078 | lr 0.00100 | ngrams/sec 72790.7 | eta 0h0m11s
| epoch 5 | step 3000/4071 | loss 8.7879 | lr 0.00100 | ngrams/sec 72858.3 | eta 0h0m7s
| epoch 5 | step 3500/4071 | loss 8.8008 | lr 0.00100 | ngrams/sec 72902.0 | eta 0h0m4s
| epoch 5 | step 4000/4071 | loss 8.8021 | lr 0.00100 | ngrams/sec 72985.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.28it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 29.21s | valid loss  7.32 | valid ppl  1512.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 8.7432 | lr 0.00100 | ngrams/sec 52807.8 | eta 0h0m34s
| epoch 6 | step 1000/4071 | loss 8.7407 | lr 0.00100 | ngrams/sec 72737.9 | eta 0h0m21s
| epoch 6 | step 1500/4071 | loss 8.7415 | lr 0.00100 | ngrams/sec 72938.3 | eta 0h0m18s
| epoch 6 | step 2000/4071 | loss 8.7452 | lr 0.00100 | ngrams/sec 72726.6 | eta 0h0m14s
| epoch 6 | step 2500/4071 | loss 8.7372 | lr 0.00100 | ngrams/sec 72730.7 | eta 0h0m11s
| epoch 6 | step 3000/4071 | loss 8.7441 | lr 0.00100 | ngrams/sec 72852.5 | eta 0h0m7s
| epoch 6 | step 3500/4071 | loss 8.7301 | lr 0.00100 | ngrams/sec 72999.8 | eta 0h0m4s
| epoch 6 | step 4000/4071 | loss 8.7287 | lr 0.00100 | ngrams/sec 73068.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1545.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 29.19s | valid loss  7.22 | valid ppl  1360.10
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/4071 | loss 8.6956 | lr 0.00100 | ngrams/sec 52899.4 | eta 0h0m34s
| epoch 7 | step 1000/4071 | loss 8.6946 | lr 0.00100 | ngrams/sec 72779.1 | eta 0h0m21s
| epoch 7 | step 1500/4071 | loss 8.6956 | lr 0.00100 | ngrams/sec 72962.9 | eta 0h0m18s
| epoch 7 | step 2000/4071 | loss 8.6888 | lr 0.00100 | ngrams/sec 72770.2 | eta 0h0m14s
| epoch 7 | step 2500/4071 | loss 8.6988 | lr 0.00100 | ngrams/sec 72978.7 | eta 0h0m11s
| epoch 7 | step 3000/4071 | loss 8.6950 | lr 0.00100 | ngrams/sec 72863.0 | eta 0h0m7s
| epoch 7 | step 3500/4071 | loss 8.6846 | lr 0.00100 | ngrams/sec 72994.4 | eta 0h0m4s
| epoch 7 | step 4000/4071 | loss 8.6881 | lr 0.00100 | ngrams/sec 72828.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1549.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 29.18s | valid loss  7.20 | valid ppl  1345.46
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 8.6638 | lr 0.00100 | ngrams/sec 52525.2 | eta 0h0m34s
| epoch 8 | step 1000/4071 | loss 8.6417 | lr 0.00100 | ngrams/sec 72540.4 | eta 0h0m21s
| epoch 8 | step 1500/4071 | loss 8.6650 | lr 0.00100 | ngrams/sec 71337.9 | eta 0h0m18s
| epoch 8 | step 2000/4071 | loss 8.6645 | lr 0.00100 | ngrams/sec 72979.5 | eta 0h0m14s
| epoch 8 | step 2500/4071 | loss 8.6587 | lr 0.00100 | ngrams/sec 72820.8 | eta 0h0m11s
| epoch 8 | step 3000/4071 | loss 8.6488 | lr 0.00100 | ngrams/sec 72944.3 | eta 0h0m7s
| epoch 8 | step 3500/4071 | loss 8.6594 | lr 0.00100 | ngrams/sec 72885.1 | eta 0h0m4s
| epoch 8 | step 4000/4071 | loss 8.6614 | lr 0.00100 | ngrams/sec 72917.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 29.30s | valid loss  7.17 | valid ppl  1293.81
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 8.6275 | lr 0.00100 | ngrams/sec 52782.5 | eta 0h0m34s
| epoch 9 | step 1000/4071 | loss 8.6217 | lr 0.00100 | ngrams/sec 72864.1 | eta 0h0m21s
| epoch 9 | step 1500/4071 | loss 8.6410 | lr 0.00100 | ngrams/sec 72638.5 | eta 0h0m18s
| epoch 9 | step 2000/4071 | loss 8.6352 | lr 0.00100 | ngrams/sec 72669.3 | eta 0h0m14s
| epoch 9 | step 2500/4071 | loss 8.6462 | lr 0.00100 | ngrams/sec 73018.5 | eta 0h0m11s
| epoch 9 | step 3000/4071 | loss 8.6330 | lr 0.00100 | ngrams/sec 73003.7 | eta 0h0m7s
| epoch 9 | step 3500/4071 | loss 8.6297 | lr 0.00100 | ngrams/sec 72960.6 | eta 0h0m4s
| epoch 9 | step 4000/4071 | loss 8.6349 | lr 0.00100 | ngrams/sec 72931.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1557.53it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 29.20s | valid loss  7.10 | valid ppl  1207.61
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 8.6049 | lr 0.00100 | ngrams/sec 52785.6 | eta 0h0m34s
| epoch 10 | step 1000/4071 | loss 8.6047 | lr 0.00100 | ngrams/sec 73046.2 | eta 0h0m21s
| epoch 10 | step 1500/4071 | loss 8.6185 | lr 0.00100 | ngrams/sec 72793.2 | eta 0h0m18s
| epoch 10 | step 2000/4071 | loss 8.6072 | lr 0.00100 | ngrams/sec 72893.3 | eta 0h0m14s
| epoch 10 | step 2500/4071 | loss 8.6128 | lr 0.00100 | ngrams/sec 73022.7 | eta 0h0m11s
| epoch 10 | step 3000/4071 | loss 8.6131 | lr 0.00100 | ngrams/sec 72969.7 | eta 0h0m7s
| epoch 10 | step 3500/4071 | loss 8.6029 | lr 0.00100 | ngrams/sec 72732.3 | eta 0h0m4s
| epoch 10 | step 4000/4071 | loss 8.6169 | lr 0.00100 | ngrams/sec 73050.9 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1559.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 29.18s | valid loss  7.05 | valid ppl  1154.41
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 8.5892 | lr 0.00100 | ngrams/sec 52790.0 | eta 0h0m34s
| epoch 11 | step 1000/4071 | loss 8.5781 | lr 0.00100 | ngrams/sec 72776.4 | eta 0h0m21s
| epoch 11 | step 1500/4071 | loss 8.5977 | lr 0.00100 | ngrams/sec 73012.8 | eta 0h0m18s
| epoch 11 | step 2000/4071 | loss 8.5983 | lr 0.00100 | ngrams/sec 72815.7 | eta 0h0m14s
| epoch 11 | step 2500/4071 | loss 8.5888 | lr 0.00100 | ngrams/sec 72616.7 | eta 0h0m11s
| epoch 11 | step 3000/4071 | loss 8.5870 | lr 0.00100 | ngrams/sec 73004.9 | eta 0h0m7s
| epoch 11 | step 3500/4071 | loss 8.5859 | lr 0.00100 | ngrams/sec 72842.3 | eta 0h0m4s
| epoch 11 | step 4000/4071 | loss 8.5931 | lr 0.00100 | ngrams/sec 72632.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1560.08it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 29.21s | valid loss  7.03 | valid ppl  1133.19
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 8.5562 | lr 0.00100 | ngrams/sec 52790.4 | eta 0h0m34s
| epoch 12 | step 1000/4071 | loss 8.5888 | lr 0.00100 | ngrams/sec 72737.1 | eta 0h0m21s
| epoch 12 | step 1500/4071 | loss 8.5668 | lr 0.00100 | ngrams/sec 72618.3 | eta 0h0m18s
| epoch 12 | step 2000/4071 | loss 8.5806 | lr 0.00100 | ngrams/sec 72672.3 | eta 0h0m14s
| epoch 12 | step 2500/4071 | loss 8.5726 | lr 0.00100 | ngrams/sec 72962.5 | eta 0h0m11s
| epoch 12 | step 3000/4071 | loss 8.5921 | lr 0.00100 | ngrams/sec 72695.5 | eta 0h0m7s
| epoch 12 | step 3500/4071 | loss 8.5757 | lr 0.00100 | ngrams/sec 72576.0 | eta 0h0m4s
| epoch 12 | step 4000/4071 | loss 8.5828 | lr 0.00100 | ngrams/sec 72800.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1558.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 29.24s | valid loss  6.98 | valid ppl  1074.27
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 8.5587 | lr 0.00100 | ngrams/sec 52923.5 | eta 0h0m34s
| epoch 13 | step 1000/4071 | loss 8.5458 | lr 0.00100 | ngrams/sec 72725.9 | eta 0h0m21s
| epoch 13 | step 1500/4071 | loss 8.5611 | lr 0.00100 | ngrams/sec 72817.6 | eta 0h0m18s
| epoch 13 | step 2000/4071 | loss 8.5713 | lr 0.00100 | ngrams/sec 72977.3 | eta 0h0m14s
| epoch 13 | step 2500/4071 | loss 8.5579 | lr 0.00100 | ngrams/sec 73042.1 | eta 0h0m11s
| epoch 13 | step 3000/4071 | loss 8.5585 | lr 0.00100 | ngrams/sec 72917.2 | eta 0h0m7s
| epoch 13 | step 3500/4071 | loss 8.5481 | lr 0.00100 | ngrams/sec 72979.2 | eta 0h0m4s
| epoch 13 | step 4000/4071 | loss 8.5719 | lr 0.00100 | ngrams/sec 73000.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1553.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 29.17s | valid loss  7.05 | valid ppl  1151.02
-----------------------------------------------------------------------------------------
| epoch 14 | step 500/4071 | loss 8.5343 | lr 0.00100 | ngrams/sec 53500.5 | eta 0h0m34s
| epoch 14 | step 1000/4071 | loss 8.5423 | lr 0.00100 | ngrams/sec 72732.6 | eta 0h0m21s
| epoch 14 | step 1500/4071 | loss 8.5427 | lr 0.00100 | ngrams/sec 72700.8 | eta 0h0m18s
| epoch 14 | step 2000/4071 | loss 8.5494 | lr 0.00100 | ngrams/sec 72868.9 | eta 0h0m14s
| epoch 14 | step 2500/4071 | loss 8.5511 | lr 0.00100 | ngrams/sec 72878.1 | eta 0h0m11s
| epoch 14 | step 3000/4071 | loss 8.5468 | lr 0.00100 | ngrams/sec 72899.9 | eta 0h0m7s
| epoch 14 | step 3500/4071 | loss 8.5266 | lr 0.00100 | ngrams/sec 72783.5 | eta 0h0m4s
| epoch 14 | step 4000/4071 | loss 8.5490 | lr 0.00100 | ngrams/sec 73140.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 29.19s | valid loss  6.96 | valid ppl  1057.72
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 8.5103 | lr 0.00100 | ngrams/sec 52812.4 | eta 0h0m34s
| epoch 15 | step 1000/4071 | loss 8.5279 | lr 0.00100 | ngrams/sec 72796.1 | eta 0h0m21s
| epoch 15 | step 1500/4071 | loss 8.5183 | lr 0.00100 | ngrams/sec 72868.4 | eta 0h0m18s
| epoch 15 | step 2000/4071 | loss 8.5238 | lr 0.00100 | ngrams/sec 72804.5 | eta 0h0m14s
| epoch 15 | step 2500/4071 | loss 8.5259 | lr 0.00100 | ngrams/sec 72842.4 | eta 0h0m11s
| epoch 15 | step 3000/4071 | loss 8.5336 | lr 0.00100 | ngrams/sec 72939.5 | eta 0h0m7s
| epoch 15 | step 3500/4071 | loss 8.5312 | lr 0.00100 | ngrams/sec 72959.3 | eta 0h0m4s
| epoch 15 | step 4000/4071 | loss 8.5282 | lr 0.00100 | ngrams/sec 72295.7 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1551.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 29.23s | valid loss  6.94 | valid ppl  1034.66
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 8.4960 | lr 0.00100 | ngrams/sec 52841.6 | eta 0h0m34s
| epoch 16 | step 1000/4071 | loss 8.5083 | lr 0.00100 | ngrams/sec 72743.2 | eta 0h0m21s
| epoch 16 | step 1500/4071 | loss 8.5237 | lr 0.00100 | ngrams/sec 72877.0 | eta 0h0m18s
| epoch 16 | step 2000/4071 | loss 8.5179 | lr 0.00100 | ngrams/sec 72765.5 | eta 0h0m14s
| epoch 16 | step 2500/4071 | loss 8.5182 | lr 0.00100 | ngrams/sec 72766.3 | eta 0h0m11s
| epoch 16 | step 3000/4071 | loss 8.5160 | lr 0.00100 | ngrams/sec 72839.0 | eta 0h0m7s
| epoch 16 | step 3500/4071 | loss 8.5183 | lr 0.00100 | ngrams/sec 72779.8 | eta 0h0m4s
| epoch 16 | step 4000/4071 | loss 8.5130 | lr 0.00100 | ngrams/sec 72717.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 29.22s | valid loss  6.95 | valid ppl  1047.68
-----------------------------------------------------------------------------------------
| epoch 17 | step 500/4071 | loss 8.4884 | lr 0.00100 | ngrams/sec 53567.5 | eta 0h0m34s
| epoch 17 | step 1000/4071 | loss 8.4876 | lr 0.00100 | ngrams/sec 72939.6 | eta 0h0m21s
| epoch 17 | step 1500/4071 | loss 8.5075 | lr 0.00100 | ngrams/sec 72425.0 | eta 0h0m18s
| epoch 17 | step 2000/4071 | loss 8.5067 | lr 0.00100 | ngrams/sec 72758.3 | eta 0h0m14s
| epoch 17 | step 2500/4071 | loss 8.4913 | lr 0.00100 | ngrams/sec 72696.3 | eta 0h0m11s
| epoch 17 | step 3000/4071 | loss 8.5228 | lr 0.00100 | ngrams/sec 72480.1 | eta 0h0m7s
| epoch 17 | step 3500/4071 | loss 8.5023 | lr 0.00100 | ngrams/sec 72920.5 | eta 0h0m4s
| epoch 17 | step 4000/4071 | loss 8.5004 | lr 0.00100 | ngrams/sec 72880.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 29.23s | valid loss  6.92 | valid ppl  1009.11
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/4071 | loss 8.4804 | lr 0.00100 | ngrams/sec 52893.3 | eta 0h0m34s
| epoch 18 | step 1000/4071 | loss 8.4861 | lr 0.00100 | ngrams/sec 73040.3 | eta 0h0m21s
| epoch 18 | step 1500/4071 | loss 8.4996 | lr 0.00100 | ngrams/sec 72669.8 | eta 0h0m18s
| epoch 18 | step 2000/4071 | loss 8.4909 | lr 0.00100 | ngrams/sec 72748.6 | eta 0h0m14s
| epoch 18 | step 2500/4071 | loss 8.4828 | lr 0.00100 | ngrams/sec 72721.5 | eta 0h0m11s
| epoch 18 | step 3000/4071 | loss 8.4922 | lr 0.00100 | ngrams/sec 72460.7 | eta 0h0m7s
| epoch 18 | step 3500/4071 | loss 8.4896 | lr 0.00100 | ngrams/sec 72651.9 | eta 0h0m4s
| epoch 18 | step 4000/4071 | loss 8.5041 | lr 0.00100 | ngrams/sec 72701.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1553.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.72it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 29.24s | valid loss  6.89 | valid ppl   985.48
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/4071 | loss 8.4778 | lr 0.00100 | ngrams/sec 52836.2 | eta 0h0m34s
| epoch 19 | step 1000/4071 | loss 8.4874 | lr 0.00100 | ngrams/sec 72708.7 | eta 0h0m21s
| epoch 19 | step 1500/4071 | loss 8.4840 | lr 0.00100 | ngrams/sec 72923.0 | eta 0h0m18s
| epoch 19 | step 2000/4071 | loss 8.5002 | lr 0.00100 | ngrams/sec 72809.8 | eta 0h0m14s
| epoch 19 | step 2500/4071 | loss 8.4889 | lr 0.00100 | ngrams/sec 72747.0 | eta 0h0m11s
| epoch 19 | step 3000/4071 | loss 8.4881 | lr 0.00100 | ngrams/sec 72925.7 | eta 0h0m7s
| epoch 19 | step 3500/4071 | loss 8.4947 | lr 0.00100 | ngrams/sec 72885.7 | eta 0h0m4s
| epoch 19 | step 4000/4071 | loss 8.4822 | lr 0.00100 | ngrams/sec 72814.7 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1556.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 29.20s | valid loss  6.89 | valid ppl   981.71
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 8.4652 | lr 0.00100 | ngrams/sec 52772.7 | eta 0h0m34s
| epoch 20 | step 1000/4071 | loss 8.4612 | lr 0.00100 | ngrams/sec 72753.5 | eta 0h0m21s
| epoch 20 | step 1500/4071 | loss 8.4675 | lr 0.00100 | ngrams/sec 72592.8 | eta 0h0m18s
| epoch 20 | step 2000/4071 | loss 8.4744 | lr 0.00100 | ngrams/sec 72832.7 | eta 0h0m14s
| epoch 20 | step 2500/4071 | loss 8.4754 | lr 0.00100 | ngrams/sec 72793.9 | eta 0h0m11s
| epoch 20 | step 3000/4071 | loss 8.4925 | lr 0.00100 | ngrams/sec 72848.3 | eta 0h0m7s
| epoch 20 | step 3500/4071 | loss 8.4806 | lr 0.00100 | ngrams/sec 72817.9 | eta 0h0m4s
| epoch 20 | step 4000/4071 | loss 8.4819 | lr 0.00100 | ngrams/sec 72892.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1556.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 29.23s | valid loss  6.86 | valid ppl   955.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 8.4465 | lr 0.00100 | ngrams/sec 52862.8 | eta 0h0m34s
| epoch 21 | step 1000/4071 | loss 8.4623 | lr 0.00100 | ngrams/sec 72872.2 | eta 0h0m21s
| epoch 21 | step 1500/4071 | loss 8.4629 | lr 0.00100 | ngrams/sec 73047.7 | eta 0h0m18s
| epoch 21 | step 2000/4071 | loss 8.4594 | lr 0.00100 | ngrams/sec 72912.9 | eta 0h0m14s
| epoch 21 | step 2500/4071 | loss 8.4742 | lr 0.00100 | ngrams/sec 72845.3 | eta 0h0m11s
| epoch 21 | step 3000/4071 | loss 8.4723 | lr 0.00100 | ngrams/sec 73048.7 | eta 0h0m7s
| epoch 21 | step 3500/4071 | loss 8.4695 | lr 0.00100 | ngrams/sec 73067.2 | eta 0h0m4s
| epoch 21 | step 4000/4071 | loss 8.4754 | lr 0.00100 | ngrams/sec 73039.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1538.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 29.15s | valid loss  6.87 | valid ppl   965.31
-----------------------------------------------------------------------------------------
| epoch 22 | step 500/4071 | loss 8.4578 | lr 0.00100 | ngrams/sec 53727.7 | eta 0h0m34s
| epoch 22 | step 1000/4071 | loss 8.4451 | lr 0.00100 | ngrams/sec 73200.1 | eta 0h0m21s
| epoch 22 | step 1500/4071 | loss 8.4554 | lr 0.00100 | ngrams/sec 72995.3 | eta 0h0m18s
| epoch 22 | step 2000/4071 | loss 8.4670 | lr 0.00100 | ngrams/sec 72814.1 | eta 0h0m14s
| epoch 22 | step 2500/4071 | loss 8.4701 | lr 0.00100 | ngrams/sec 73114.8 | eta 0h0m11s
| epoch 22 | step 3000/4071 | loss 8.4742 | lr 0.00100 | ngrams/sec 72893.2 | eta 0h0m7s
| epoch 22 | step 3500/4071 | loss 8.4568 | lr 0.00100 | ngrams/sec 72987.2 | eta 0h0m4s
| epoch 22 | step 4000/4071 | loss 8.4728 | lr 0.00100 | ngrams/sec 72688.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1548.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 29.14s | valid loss  6.87 | valid ppl   964.94
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 8.4470 | lr 0.00100 | ngrams/sec 53545.6 | eta 0h0m34s
| epoch 23 | step 1000/4071 | loss 8.4438 | lr 0.00100 | ngrams/sec 72719.5 | eta 0h0m21s
| epoch 23 | step 1500/4071 | loss 8.4607 | lr 0.00100 | ngrams/sec 72785.4 | eta 0h0m18s
| epoch 23 | step 2000/4071 | loss 8.4556 | lr 0.00100 | ngrams/sec 72730.5 | eta 0h0m14s
| epoch 23 | step 2500/4071 | loss 8.4346 | lr 0.00100 | ngrams/sec 72850.8 | eta 0h0m11s
| epoch 23 | step 3000/4071 | loss 8.4701 | lr 0.00100 | ngrams/sec 72627.5 | eta 0h0m7s
| epoch 23 | step 3500/4071 | loss 8.4578 | lr 0.00100 | ngrams/sec 72987.9 | eta 0h0m4s
| epoch 23 | step 4000/4071 | loss 8.4620 | lr 0.00100 | ngrams/sec 72768.9 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 29.22s | valid loss  6.86 | valid ppl   958.06
-----------------------------------------------------------------------------------------
| epoch 24 | step 500/4071 | loss 8.4369 | lr 0.00100 | ngrams/sec 53535.3 | eta 0h0m34s
| epoch 24 | step 1000/4071 | loss 8.4497 | lr 0.00100 | ngrams/sec 72866.1 | eta 0h0m21s
| epoch 24 | step 1500/4071 | loss 8.4428 | lr 0.00100 | ngrams/sec 72870.6 | eta 0h0m18s
| epoch 24 | step 2000/4071 | loss 8.4383 | lr 0.00100 | ngrams/sec 72749.6 | eta 0h0m14s
| epoch 24 | step 2500/4071 | loss 8.4454 | lr 0.00100 | ngrams/sec 72828.4 | eta 0h0m11s
| epoch 24 | step 3000/4071 | loss 8.4476 | lr 0.00100 | ngrams/sec 72682.7 | eta 0h0m7s
| epoch 24 | step 3500/4071 | loss 8.4515 | lr 0.00100 | ngrams/sec 72746.9 | eta 0h0m4s
| epoch 24 | step 4000/4071 | loss 8.4593 | lr 0.00100 | ngrams/sec 72731.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 29.22s | valid loss  6.85 | valid ppl   946.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 25 | step 500/4071 | loss 8.4295 | lr 0.00100 | ngrams/sec 52810.8 | eta 0h0m34s
| epoch 25 | step 1000/4071 | loss 8.4462 | lr 0.00100 | ngrams/sec 72988.7 | eta 0h0m21s
| epoch 25 | step 1500/4071 | loss 8.4412 | lr 0.00100 | ngrams/sec 72799.8 | eta 0h0m18s
| epoch 25 | step 2000/4071 | loss 8.4371 | lr 0.00100 | ngrams/sec 72851.6 | eta 0h0m14s
| epoch 25 | step 2500/4071 | loss 8.4501 | lr 0.00100 | ngrams/sec 72279.1 | eta 0h0m11s
| epoch 25 | step 3000/4071 | loss 8.4540 | lr 0.00100 | ngrams/sec 72670.5 | eta 0h0m7s
| epoch 25 | step 3500/4071 | loss 8.4452 | lr 0.00100 | ngrams/sec 72709.7 | eta 0h0m4s
| epoch 25 | step 4000/4071 | loss 8.4474 | lr 0.00100 | ngrams/sec 72727.6 | eta 0h

 37%|███▋      | 155/417 [00:00<00:00, 1544.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 29.24s | valid loss  6.84 | valid ppl   938.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/4071 | loss 8.4142 | lr 0.00100 | ngrams/sec 52883.0 | eta 0h0m34s
| epoch 26 | step 1000/4071 | loss 8.4493 | lr 0.00100 | ngrams/sec 73021.7 | eta 0h0m21s
| epoch 26 | step 1500/4071 | loss 8.4317 | lr 0.00100 | ngrams/sec 72475.5 | eta 0h0m18s
| epoch 26 | step 2000/4071 | loss 8.4517 | lr 0.00100 | ngrams/sec 72755.2 | eta 0h0m14s
| epoch 26 | step 2500/4071 | loss 8.4423 | lr 0.00100 | ngrams/sec 72716.0 | eta 0h0m11s
| epoch 26 | step 3000/4071 | loss 8.4324 | lr 0.00100 | ngrams/sec 72602.9 | eta 0h0m7s
| epoch 26 | step 3500/4071 | loss 8.4443 | lr 0.00100 | ngrams/sec 72815.6 | eta 0h0m4s
| epoch 26 | step 4000/4071 | loss 8.4406 | lr 0.00100 | ngrams/sec 72751.0 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1534.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 29.23s | valid loss  6.86 | valid ppl   957.93
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/4071 | loss 8.4260 | lr 0.00100 | ngrams/sec 53592.5 | eta 0h0m34s
| epoch 27 | step 1000/4071 | loss 8.4238 | lr 0.00100 | ngrams/sec 72764.8 | eta 0h0m21s
| epoch 27 | step 1500/4071 | loss 8.4209 | lr 0.00100 | ngrams/sec 72705.2 | eta 0h0m18s
| epoch 27 | step 2000/4071 | loss 8.4370 | lr 0.00100 | ngrams/sec 72911.6 | eta 0h0m14s
| epoch 27 | step 2500/4071 | loss 8.4381 | lr 0.00100 | ngrams/sec 72935.8 | eta 0h0m11s
| epoch 27 | step 3000/4071 | loss 8.4316 | lr 0.00100 | ngrams/sec 72751.9 | eta 0h0m7s
| epoch 27 | step 3500/4071 | loss 8.4363 | lr 0.00100 | ngrams/sec 72738.1 | eta 0h0m4s
| epoch 27 | step 4000/4071 | loss 8.4356 | lr 0.00100 | ngrams/sec 72896.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.88it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 29.20s | valid loss  6.79 | valid ppl   890.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 8.4069 | lr 0.00100 | ngrams/sec 52901.5 | eta 0h0m34s
| epoch 28 | step 1000/4071 | loss 8.4163 | lr 0.00100 | ngrams/sec 72700.7 | eta 0h0m21s
| epoch 28 | step 1500/4071 | loss 8.4143 | lr 0.00100 | ngrams/sec 72850.7 | eta 0h0m18s
| epoch 28 | step 2000/4071 | loss 8.4350 | lr 0.00100 | ngrams/sec 72823.4 | eta 0h0m14s
| epoch 28 | step 2500/4071 | loss 8.4337 | lr 0.00100 | ngrams/sec 72810.2 | eta 0h0m11s
| epoch 28 | step 3000/4071 | loss 8.4316 | lr 0.00100 | ngrams/sec 72824.1 | eta 0h0m7s
| epoch 28 | step 3500/4071 | loss 8.4322 | lr 0.00100 | ngrams/sec 72858.9 | eta 0h0m4s
| epoch 28 | step 4000/4071 | loss 8.4318 | lr 0.00100 | ngrams/sec 72958.6 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 29.20s | valid loss  6.82 | valid ppl   913.76
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/4071 | loss 8.4033 | lr 0.00100 | ngrams/sec 53534.9 | eta 0h0m34s
| epoch 29 | step 1000/4071 | loss 8.4126 | lr 0.00100 | ngrams/sec 72358.2 | eta 0h0m21s
| epoch 29 | step 1500/4071 | loss 8.4244 | lr 0.00100 | ngrams/sec 72314.3 | eta 0h0m18s
| epoch 29 | step 2000/4071 | loss 8.4338 | lr 0.00100 | ngrams/sec 72524.4 | eta 0h0m14s
| epoch 29 | step 2500/4071 | loss 8.4221 | lr 0.00100 | ngrams/sec 72699.4 | eta 0h0m11s
| epoch 29 | step 3000/4071 | loss 8.4226 | lr 0.00100 | ngrams/sec 72893.5 | eta 0h0m7s
| epoch 29 | step 3500/4071 | loss 8.4294 | lr 0.00100 | ngrams/sec 72949.7 | eta 0h0m4s
| epoch 29 | step 4000/4071 | loss 8.4277 | lr 0.00100 | ngrams/sec 72861.4 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1535.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 29.26s | valid loss  6.81 | valid ppl   909.83
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 8.4028 | lr 0.00100 | ngrams/sec 53613.5 | eta 0h0m34s
| epoch 30 | step 1000/4071 | loss 8.3927 | lr 0.00100 | ngrams/sec 72954.3 | eta 0h0m21s
| epoch 30 | step 1500/4071 | loss 8.4260 | lr 0.00100 | ngrams/sec 72938.3 | eta 0h0m18s
| epoch 30 | step 2000/4071 | loss 8.4254 | lr 0.00100 | ngrams/sec 72872.5 | eta 0h0m14s
| epoch 30 | step 2500/4071 | loss 8.4291 | lr 0.00100 | ngrams/sec 71417.5 | eta 0h0m11s
| epoch 30 | step 3000/4071 | loss 8.4084 | lr 0.00100 | ngrams/sec 72913.0 | eta 0h0m7s
| epoch 30 | step 3500/4071 | loss 8.4217 | lr 0.00100 | ngrams/sec 72826.6 | eta 0h0m4s
| epoch 30 | step 4000/4071 | loss 8.4250 | lr 0.00100 | ngrams/sec 72908.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1537.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 29.25s | valid loss  6.82 | valid ppl   920.21
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 8.3909 | lr 0.00100 | ngrams/sec 53499.7 | eta 0h0m34s
| epoch 31 | step 1000/4071 | loss 8.4071 | lr 0.00100 | ngrams/sec 72610.4 | eta 0h0m21s
| epoch 31 | step 1500/4071 | loss 8.4001 | lr 0.00100 | ngrams/sec 72803.9 | eta 0h0m18s
| epoch 31 | step 2000/4071 | loss 8.4228 | lr 0.00100 | ngrams/sec 72966.0 | eta 0h0m14s
| epoch 31 | step 2500/4071 | loss 8.4021 | lr 0.00100 | ngrams/sec 72966.3 | eta 0h0m11s
| epoch 31 | step 3000/4071 | loss 8.4186 | lr 0.00100 | ngrams/sec 72813.5 | eta 0h0m7s
| epoch 31 | step 3500/4071 | loss 8.4191 | lr 0.00100 | ngrams/sec 72932.7 | eta 0h0m4s
| epoch 31 | step 4000/4071 | loss 8.4038 | lr 0.00100 | ngrams/sec 72805.4 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1537.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.72it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 29.20s | valid loss  6.80 | valid ppl   897.54
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 8.3952 | lr 0.00100 | ngrams/sec 53651.2 | eta 0h0m34s
| epoch 32 | step 1000/4071 | loss 8.3920 | lr 0.00100 | ngrams/sec 72918.5 | eta 0h0m21s
| epoch 32 | step 1500/4071 | loss 8.4031 | lr 0.00100 | ngrams/sec 72740.6 | eta 0h0m18s
| epoch 32 | step 2000/4071 | loss 8.4122 | lr 0.00100 | ngrams/sec 72905.5 | eta 0h0m14s
| epoch 32 | step 2500/4071 | loss 8.4136 | lr 0.00100 | ngrams/sec 72752.9 | eta 0h0m11s
| epoch 32 | step 3000/4071 | loss 8.4076 | lr 0.00100 | ngrams/sec 72752.4 | eta 0h0m7s
| epoch 32 | step 3500/4071 | loss 8.4216 | lr 0.00100 | ngrams/sec 72912.5 | eta 0h0m4s
| epoch 32 | step 4000/4071 | loss 8.4134 | lr 0.00100 | ngrams/sec 72907.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 29.19s | valid loss  6.78 | valid ppl   880.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 33 | step 500/4071 | loss 8.4018 | lr 0.00100 | ngrams/sec 52866.5 | eta 0h0m34s
| epoch 33 | step 1000/4071 | loss 8.4029 | lr 0.00100 | ngrams/sec 73116.4 | eta 0h0m21s
| epoch 33 | step 1500/4071 | loss 8.3859 | lr 0.00100 | ngrams/sec 73129.8 | eta 0h0m18s
| epoch 33 | step 2000/4071 | loss 8.4095 | lr 0.00100 | ngrams/sec 72915.1 | eta 0h0m14s
| epoch 33 | step 2500/4071 | loss 8.3999 | lr 0.00100 | ngrams/sec 73096.9 | eta 0h0m11s
| epoch 33 | step 3000/4071 | loss 8.4145 | lr 0.00100 | ngrams/sec 72904.6 | eta 0h0m7s
| epoch 33 | step 3500/4071 | loss 8.4011 | lr 0.00100 | ngrams/sec 72791.0 | eta 0h0m4s
| epoch 33 | step 4000/4071 | loss 8.4025 | lr 0.00100 | ngrams/sec 72804.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1557.76it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 29.15s | valid loss  6.78 | valid ppl   878.02
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 34 | step 500/4071 | loss 8.3881 | lr 0.00100 | ngrams/sec 52953.6 | eta 0h0m34s
| epoch 34 | step 1000/4071 | loss 8.4001 | lr 0.00100 | ngrams/sec 72916.5 | eta 0h0m21s
| epoch 34 | step 1500/4071 | loss 8.4044 | lr 0.00100 | ngrams/sec 73058.3 | eta 0h0m18s
| epoch 34 | step 2000/4071 | loss 8.4016 | lr 0.00100 | ngrams/sec 73047.0 | eta 0h0m14s
| epoch 34 | step 2500/4071 | loss 8.3957 | lr 0.00100 | ngrams/sec 72876.7 | eta 0h0m11s
| epoch 34 | step 3000/4071 | loss 8.4134 | lr 0.00100 | ngrams/sec 72996.2 | eta 0h0m7s
| epoch 34 | step 3500/4071 | loss 8.4062 | lr 0.00100 | ngrams/sec 72877.3 | eta 0h0m4s
| epoch 34 | step 4000/4071 | loss 8.3947 | lr 0.00100 | ngrams/sec 72753.3 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1541.55it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 29.16s | valid loss  6.77 | valid ppl   868.09
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 35 | step 500/4071 | loss 8.3722 | lr 0.00100 | ngrams/sec 52884.8 | eta 0h0m34s
| epoch 35 | step 1000/4071 | loss 8.3925 | lr 0.00100 | ngrams/sec 72871.6 | eta 0h0m21s
| epoch 35 | step 1500/4071 | loss 8.3726 | lr 0.00100 | ngrams/sec 72955.8 | eta 0h0m18s
| epoch 35 | step 2000/4071 | loss 8.4001 | lr 0.00100 | ngrams/sec 72865.1 | eta 0h0m14s
| epoch 35 | step 2500/4071 | loss 8.3842 | lr 0.00100 | ngrams/sec 72906.1 | eta 0h0m11s
| epoch 35 | step 3000/4071 | loss 8.3936 | lr 0.00100 | ngrams/sec 73092.0 | eta 0h0m7s
| epoch 35 | step 3500/4071 | loss 8.3906 | lr 0.00100 | ngrams/sec 72906.4 | eta 0h0m4s
| epoch 35 | step 4000/4071 | loss 8.3900 | lr 0.00100 | ngrams/sec 72764.7 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1552.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 29.17s | valid loss  6.73 | valid ppl   836.71
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/4071 | loss 8.3611 | lr 0.00100 | ngrams/sec 52767.1 | eta 0h0m34s
| epoch 36 | step 1000/4071 | loss 8.3695 | lr 0.00100 | ngrams/sec 72906.6 | eta 0h0m21s
| epoch 36 | step 1500/4071 | loss 8.3834 | lr 0.00100 | ngrams/sec 72569.6 | eta 0h0m18s
| epoch 36 | step 2000/4071 | loss 8.3872 | lr 0.00100 | ngrams/sec 72513.0 | eta 0h0m14s
| epoch 36 | step 2500/4071 | loss 8.3895 | lr 0.00100 | ngrams/sec 71961.7 | eta 0h0m11s
| epoch 36 | step 3000/4071 | loss 8.3829 | lr 0.00100 | ngrams/sec 72657.5 | eta 0h0m7s
| epoch 36 | step 3500/4071 | loss 8.3855 | lr 0.00100 | ngrams/sec 72718.9 | eta 0h0m4s
| epoch 36 | step 4000/4071 | loss 8.3819 | lr 0.00100 | ngrams/sec 72730.3 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1541.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 29.30s | valid loss  6.78 | valid ppl   883.03
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 8.3503 | lr 0.00100 | ngrams/sec 53488.2 | eta 0h0m34s
| epoch 37 | step 1000/4071 | loss 8.3757 | lr 0.00100 | ngrams/sec 72800.5 | eta 0h0m21s
| epoch 37 | step 1500/4071 | loss 8.3813 | lr 0.00100 | ngrams/sec 72883.2 | eta 0h0m18s
| epoch 37 | step 2000/4071 | loss 8.3764 | lr 0.00100 | ngrams/sec 72845.8 | eta 0h0m14s
| epoch 37 | step 2500/4071 | loss 8.3757 | lr 0.00100 | ngrams/sec 72901.4 | eta 0h0m11s
| epoch 37 | step 3000/4071 | loss 8.3871 | lr 0.00100 | ngrams/sec 72886.7 | eta 0h0m7s
| epoch 37 | step 3500/4071 | loss 8.3958 | lr 0.00100 | ngrams/sec 72858.4 | eta 0h0m4s
| epoch 37 | step 4000/4071 | loss 8.3827 | lr 0.00100 | ngrams/sec 72681.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1557.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 29.20s | valid loss  6.76 | valid ppl   862.04
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 8.3419 | lr 0.00100 | ngrams/sec 53565.6 | eta 0h0m34s
| epoch 38 | step 1000/4071 | loss 8.3677 | lr 0.00100 | ngrams/sec 72931.7 | eta 0h0m21s
| epoch 38 | step 1500/4071 | loss 8.3739 | lr 0.00100 | ngrams/sec 72917.4 | eta 0h0m18s
| epoch 38 | step 2000/4071 | loss 8.3694 | lr 0.00100 | ngrams/sec 72896.3 | eta 0h0m14s
| epoch 38 | step 2500/4071 | loss 8.3704 | lr 0.00100 | ngrams/sec 72996.4 | eta 0h0m11s
| epoch 38 | step 3000/4071 | loss 8.3717 | lr 0.00100 | ngrams/sec 72836.8 | eta 0h0m7s
| epoch 38 | step 3500/4071 | loss 8.3896 | lr 0.00100 | ngrams/sec 72743.5 | eta 0h0m4s
| epoch 38 | step 4000/4071 | loss 8.3672 | lr 0.00100 | ngrams/sec 72753.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 29.19s | valid loss  6.75 | valid ppl   854.32
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/4071 | loss 8.3503 | lr 0.00100 | ngrams/sec 53605.0 | eta 0h0m34s
| epoch 39 | step 1000/4071 | loss 8.3579 | lr 0.00100 | ngrams/sec 72624.6 | eta 0h0m21s
| epoch 39 | step 1500/4071 | loss 8.3600 | lr 0.00100 | ngrams/sec 72957.2 | eta 0h0m18s
| epoch 39 | step 2000/4071 | loss 8.3638 | lr 0.00100 | ngrams/sec 72949.3 | eta 0h0m14s
| epoch 39 | step 2500/4071 | loss 8.3620 | lr 0.00100 | ngrams/sec 72761.8 | eta 0h0m11s
| epoch 39 | step 3000/4071 | loss 8.3706 | lr 0.00100 | ngrams/sec 72662.9 | eta 0h0m7s
| epoch 39 | step 3500/4071 | loss 8.3804 | lr 0.00100 | ngrams/sec 72593.9 | eta 0h0m4s
| epoch 39 | step 4000/4071 | loss 8.3892 | lr 0.00100 | ngrams/sec 72676.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.11it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 29.23s | valid loss  6.75 | valid ppl   854.87
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/4071 | loss 8.3486 | lr 0.00100 | ngrams/sec 53350.7 | eta 0h0m34s
| epoch 40 | step 1000/4071 | loss 8.3410 | lr 0.00100 | ngrams/sec 72854.4 | eta 0h0m21s
| epoch 40 | step 1500/4071 | loss 8.3574 | lr 0.00100 | ngrams/sec 72997.0 | eta 0h0m18s
| epoch 40 | step 2000/4071 | loss 8.3509 | lr 0.00100 | ngrams/sec 72858.1 | eta 0h0m14s
| epoch 40 | step 2500/4071 | loss 8.3542 | lr 0.00100 | ngrams/sec 72895.3 | eta 0h0m11s
| epoch 40 | step 3000/4071 | loss 8.3490 | lr 0.00100 | ngrams/sec 72770.9 | eta 0h0m7s
| epoch 40 | step 3500/4071 | loss 8.3502 | lr 0.00100 | ngrams/sec 72973.1 | eta 0h0m4s
| epoch 40 | step 4000/4071 | loss 8.3770 | lr 0.00100 | ngrams/sec 72651.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1537.71it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 29.21s | valid loss  6.73 | valid ppl   835.70
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 41 | step 500/4071 | loss 8.3428 | lr 0.00100 | ngrams/sec 52655.1 | eta 0h0m34s
| epoch 41 | step 1000/4071 | loss 8.3527 | lr 0.00100 | ngrams/sec 72770.0 | eta 0h0m21s
| epoch 41 | step 1500/4071 | loss 8.3529 | lr 0.00100 | ngrams/sec 72756.2 | eta 0h0m18s
| epoch 41 | step 2000/4071 | loss 8.3512 | lr 0.00100 | ngrams/sec 72989.6 | eta 0h0m14s
| epoch 41 | step 2500/4071 | loss 8.3604 | lr 0.00100 | ngrams/sec 72793.0 | eta 0h0m11s
| epoch 41 | step 3000/4071 | loss 8.3630 | lr 0.00100 | ngrams/sec 72505.9 | eta 0h0m7s
| epoch 41 | step 3500/4071 | loss 8.3601 | lr 0.00100 | ngrams/sec 72670.0 | eta 0h0m4s
| epoch 41 | step 4000/4071 | loss 8.3529 | lr 0.00100 | ngrams/sec 72687.2 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1551.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 29.24s | valid loss  6.75 | valid ppl   855.27
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 8.3303 | lr 0.00100 | ngrams/sec 53404.1 | eta 0h0m34s
| epoch 42 | step 1000/4071 | loss 8.3359 | lr 0.00100 | ngrams/sec 72594.7 | eta 0h0m21s
| epoch 42 | step 1500/4071 | loss 8.3439 | lr 0.00100 | ngrams/sec 72865.0 | eta 0h0m18s
| epoch 42 | step 2000/4071 | loss 8.3532 | lr 0.00100 | ngrams/sec 72697.0 | eta 0h0m14s
| epoch 42 | step 2500/4071 | loss 8.3524 | lr 0.00100 | ngrams/sec 72804.6 | eta 0h0m11s
| epoch 42 | step 3000/4071 | loss 8.3399 | lr 0.00100 | ngrams/sec 72728.2 | eta 0h0m7s
| epoch 42 | step 3500/4071 | loss 8.3648 | lr 0.00100 | ngrams/sec 72688.9 | eta 0h0m4s
| epoch 42 | step 4000/4071 | loss 8.3624 | lr 0.00100 | ngrams/sec 72872.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 29.24s | valid loss  6.72 | valid ppl   829.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 43 | step 500/4071 | loss 8.3309 | lr 0.00100 | ngrams/sec 52699.8 | eta 0h0m34s
| epoch 43 | step 1000/4071 | loss 8.3381 | lr 0.00100 | ngrams/sec 72652.8 | eta 0h0m21s
| epoch 43 | step 1500/4071 | loss 8.3543 | lr 0.00100 | ngrams/sec 72827.4 | eta 0h0m18s
| epoch 43 | step 2000/4071 | loss 8.3654 | lr 0.00100 | ngrams/sec 72751.9 | eta 0h0m14s
| epoch 43 | step 2500/4071 | loss 8.3439 | lr 0.00100 | ngrams/sec 72774.5 | eta 0h0m11s
| epoch 43 | step 3000/4071 | loss 8.3504 | lr 0.00100 | ngrams/sec 72935.5 | eta 0h0m7s
| epoch 43 | step 3500/4071 | loss 8.3574 | lr 0.00100 | ngrams/sec 72893.8 | eta 0h0m4s
| epoch 43 | step 4000/4071 | loss 8.3462 | lr 0.00100 | ngrams/sec 72608.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1541.53it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 29.23s | valid loss  6.75 | valid ppl   853.77
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 8.3294 | lr 0.00100 | ngrams/sec 53467.5 | eta 0h0m34s
| epoch 44 | step 1000/4071 | loss 8.3308 | lr 0.00100 | ngrams/sec 72745.1 | eta 0h0m21s
| epoch 44 | step 1500/4071 | loss 8.3410 | lr 0.00100 | ngrams/sec 72761.5 | eta 0h0m18s
| epoch 44 | step 2000/4071 | loss 8.3577 | lr 0.00100 | ngrams/sec 72696.4 | eta 0h0m14s
| epoch 44 | step 2500/4071 | loss 8.3441 | lr 0.00100 | ngrams/sec 72659.5 | eta 0h0m11s
| epoch 44 | step 3000/4071 | loss 8.3425 | lr 0.00100 | ngrams/sec 72682.7 | eta 0h0m7s
| epoch 44 | step 3500/4071 | loss 8.3424 | lr 0.00100 | ngrams/sec 72645.5 | eta 0h0m4s
| epoch 44 | step 4000/4071 | loss 8.3534 | lr 0.00100 | ngrams/sec 72678.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 29.25s | valid loss  6.77 | valid ppl   870.14
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 8.3111 | lr 0.00100 | ngrams/sec 53539.1 | eta 0h0m34s
| epoch 45 | step 1000/4071 | loss 8.3347 | lr 0.00100 | ngrams/sec 72479.5 | eta 0h0m21s
| epoch 45 | step 1500/4071 | loss 8.3269 | lr 0.00100 | ngrams/sec 72755.1 | eta 0h0m18s
| epoch 45 | step 2000/4071 | loss 8.3246 | lr 0.00100 | ngrams/sec 72915.5 | eta 0h0m14s
| epoch 45 | step 2500/4071 | loss 8.3248 | lr 0.00100 | ngrams/sec 72812.7 | eta 0h0m11s
| epoch 45 | step 3000/4071 | loss 8.3382 | lr 0.00100 | ngrams/sec 72572.1 | eta 0h0m7s
| epoch 45 | step 3500/4071 | loss 8.3340 | lr 0.00100 | ngrams/sec 72974.4 | eta 0h0m4s
| epoch 45 | step 4000/4071 | loss 8.3427 | lr 0.00100 | ngrams/sec 72853.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 29.22s | valid loss  6.72 | valid ppl   830.62
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 8.3156 | lr 0.00100 | ngrams/sec 53486.3 | eta 0h0m34s
| epoch 46 | step 1000/4071 | loss 8.3126 | lr 0.00100 | ngrams/sec 72782.7 | eta 0h0m21s
| epoch 46 | step 1500/4071 | loss 8.3194 | lr 0.00100 | ngrams/sec 72915.0 | eta 0h0m18s
| epoch 46 | step 2000/4071 | loss 8.3304 | lr 0.00100 | ngrams/sec 72825.1 | eta 0h0m14s
| epoch 46 | step 2500/4071 | loss 8.3356 | lr 0.00100 | ngrams/sec 72711.0 | eta 0h0m11s
| epoch 46 | step 3000/4071 | loss 8.3318 | lr 0.00100 | ngrams/sec 72921.5 | eta 0h0m7s
| epoch 46 | step 3500/4071 | loss 8.3225 | lr 0.00100 | ngrams/sec 72641.4 | eta 0h0m4s
| epoch 46 | step 4000/4071 | loss 8.3387 | lr 0.00100 | ngrams/sec 72690.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.50it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 29.23s | valid loss  6.72 | valid ppl   826.53
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 47 | step 500/4071 | loss 8.3110 | lr 0.00100 | ngrams/sec 52582.5 | eta 0h0m34s
| epoch 47 | step 1000/4071 | loss 8.3130 | lr 0.00100 | ngrams/sec 72950.8 | eta 0h0m21s
| epoch 47 | step 1500/4071 | loss 8.2977 | lr 0.00100 | ngrams/sec 72821.0 | eta 0h0m18s
| epoch 47 | step 2000/4071 | loss 8.3290 | lr 0.00100 | ngrams/sec 72681.4 | eta 0h0m14s
| epoch 47 | step 2500/4071 | loss 8.3235 | lr 0.00100 | ngrams/sec 72663.6 | eta 0h0m11s
| epoch 47 | step 3000/4071 | loss 8.3300 | lr 0.00100 | ngrams/sec 72796.7 | eta 0h0m7s
| epoch 47 | step 3500/4071 | loss 8.3327 | lr 0.00100 | ngrams/sec 72754.7 | eta 0h0m4s
| epoch 47 | step 4000/4071 | loss 8.3322 | lr 0.00100 | ngrams/sec 72856.9 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.15it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 29.23s | valid loss  6.73 | valid ppl   836.44
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 8.3080 | lr 0.00100 | ngrams/sec 53445.7 | eta 0h0m34s
| epoch 48 | step 1000/4071 | loss 8.3080 | lr 0.00100 | ngrams/sec 72687.5 | eta 0h0m21s
| epoch 48 | step 1500/4071 | loss 8.3158 | lr 0.00100 | ngrams/sec 72737.3 | eta 0h0m18s
| epoch 48 | step 2000/4071 | loss 8.3124 | lr 0.00100 | ngrams/sec 72748.9 | eta 0h0m14s
| epoch 48 | step 2500/4071 | loss 8.3166 | lr 0.00100 | ngrams/sec 72505.3 | eta 0h0m11s
| epoch 48 | step 3000/4071 | loss 8.3178 | lr 0.00100 | ngrams/sec 72933.7 | eta 0h0m7s
| epoch 48 | step 3500/4071 | loss 8.3171 | lr 0.00100 | ngrams/sec 72535.8 | eta 0h0m4s
| epoch 48 | step 4000/4071 | loss 8.3275 | lr 0.00100 | ngrams/sec 72540.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 29.27s | valid loss  6.75 | valid ppl   854.97
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 8.2898 | lr 0.00100 | ngrams/sec 53357.6 | eta 0h0m34s
| epoch 49 | step 1000/4071 | loss 8.2997 | lr 0.00100 | ngrams/sec 72565.7 | eta 0h0m21s
| epoch 49 | step 1500/4071 | loss 8.3106 | lr 0.00100 | ngrams/sec 72744.8 | eta 0h0m18s
| epoch 49 | step 2000/4071 | loss 8.3160 | lr 0.00100 | ngrams/sec 72782.9 | eta 0h0m14s
| epoch 49 | step 2500/4071 | loss 8.3157 | lr 0.00100 | ngrams/sec 72685.4 | eta 0h0m11s
| epoch 49 | step 3000/4071 | loss 8.3166 | lr 0.00100 | ngrams/sec 72768.9 | eta 0h0m7s
| epoch 49 | step 3500/4071 | loss 8.3219 | lr 0.00100 | ngrams/sec 72799.4 | eta 0h0m4s
| epoch 49 | step 4000/4071 | loss 8.3197 | lr 0.00100 | ngrams/sec 72575.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.50it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 29.26s | valid loss  6.71 | valid ppl   822.33
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 50 | step 500/4071 | loss 8.2878 | lr 0.00100 | ngrams/sec 52792.6 | eta 0h0m34s
| epoch 50 | step 1000/4071 | loss 8.2922 | lr 0.00100 | ngrams/sec 72310.6 | eta 0h0m21s
| epoch 50 | step 1500/4071 | loss 8.3128 | lr 0.00100 | ngrams/sec 72270.1 | eta 0h0m18s
| epoch 50 | step 2000/4071 | loss 8.2967 | lr 0.00100 | ngrams/sec 72022.5 | eta 0h0m14s
| epoch 50 | step 2500/4071 | loss 8.2971 | lr 0.00100 | ngrams/sec 72430.5 | eta 0h0m11s
| epoch 50 | step 3000/4071 | loss 8.3114 | lr 0.00100 | ngrams/sec 72766.0 | eta 0h0m7s
| epoch 50 | step 3500/4071 | loss 8.3192 | lr 0.00100 | ngrams/sec 72914.2 | eta 0h0m4s
| epoch 50 | step 4000/4071 | loss 8.3080 | lr 0.00100 | ngrams/sec 72852.9 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 29.32s | valid loss  6.72 | valid ppl   831.98
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 8.2886 | lr 0.00100 | ngrams/sec 53346.0 | eta 0h0m34s
| epoch 51 | step 1000/4071 | loss 8.2956 | lr 0.00100 | ngrams/sec 72784.9 | eta 0h0m21s
| epoch 51 | step 1500/4071 | loss 8.2998 | lr 0.00100 | ngrams/sec 72967.4 | eta 0h0m18s
| epoch 51 | step 2000/4071 | loss 8.3134 | lr 0.00100 | ngrams/sec 72859.6 | eta 0h0m14s
| epoch 51 | step 2500/4071 | loss 8.3018 | lr 0.00100 | ngrams/sec 72809.4 | eta 0h0m11s
| epoch 51 | step 3000/4071 | loss 8.3079 | lr 0.00100 | ngrams/sec 72879.5 | eta 0h0m7s
| epoch 51 | step 3500/4071 | loss 8.3009 | lr 0.00100 | ngrams/sec 72585.9 | eta 0h0m4s
| epoch 51 | step 4000/4071 | loss 8.3052 | lr 0.00100 | ngrams/sec 72948.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 29.21s | valid loss  6.71 | valid ppl   823.91
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 8.2729 | lr 0.00100 | ngrams/sec 53561.6 | eta 0h0m34s
| epoch 52 | step 1000/4071 | loss 8.2920 | lr 0.00100 | ngrams/sec 72510.6 | eta 0h0m21s
| epoch 52 | step 1500/4071 | loss 8.3035 | lr 0.00100 | ngrams/sec 72942.4 | eta 0h0m18s
| epoch 52 | step 2000/4071 | loss 8.2963 | lr 0.00100 | ngrams/sec 72976.8 | eta 0h0m14s
| epoch 52 | step 2500/4071 | loss 8.3068 | lr 0.00100 | ngrams/sec 72724.4 | eta 0h0m11s
| epoch 52 | step 3000/4071 | loss 8.2913 | lr 0.00100 | ngrams/sec 71048.9 | eta 0h0m7s
| epoch 52 | step 3500/4071 | loss 8.3166 | lr 0.00100 | ngrams/sec 72903.2 | eta 0h0m4s
| epoch 52 | step 4000/4071 | loss 8.3036 | lr 0.00100 | ngrams/sec 72648.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1555.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 29.31s | valid loss  6.72 | valid ppl   826.69
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 8.2677 | lr 0.00100 | ngrams/sec 53455.2 | eta 0h0m34s
| epoch 53 | step 1000/4071 | loss 8.2763 | lr 0.00100 | ngrams/sec 72719.3 | eta 0h0m21s
| epoch 53 | step 1500/4071 | loss 8.2782 | lr 0.00100 | ngrams/sec 72892.6 | eta 0h0m18s
| epoch 53 | step 2000/4071 | loss 8.2852 | lr 0.00100 | ngrams/sec 72790.8 | eta 0h0m14s
| epoch 53 | step 2500/4071 | loss 8.3044 | lr 0.00100 | ngrams/sec 72669.7 | eta 0h0m11s
| epoch 53 | step 3000/4071 | loss 8.3012 | lr 0.00100 | ngrams/sec 73049.3 | eta 0h0m7s
| epoch 53 | step 3500/4071 | loss 8.3028 | lr 0.00100 | ngrams/sec 73012.3 | eta 0h0m4s
| epoch 53 | step 4000/4071 | loss 8.2883 | lr 0.00100 | ngrams/sec 72914.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 29.20s | valid loss  6.66 | valid ppl   782.18
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 54 | step 500/4071 | loss 8.2741 | lr 0.00100 | ngrams/sec 52810.0 | eta 0h0m34s
| epoch 54 | step 1000/4071 | loss 8.2800 | lr 0.00100 | ngrams/sec 72768.0 | eta 0h0m21s
| epoch 54 | step 1500/4071 | loss 8.2904 | lr 0.00100 | ngrams/sec 72754.0 | eta 0h0m18s
| epoch 54 | step 2000/4071 | loss 8.2866 | lr 0.00100 | ngrams/sec 72845.1 | eta 0h0m14s
| epoch 54 | step 2500/4071 | loss 8.2870 | lr 0.00100 | ngrams/sec 72921.5 | eta 0h0m11s
| epoch 54 | step 3000/4071 | loss 8.2833 | lr 0.00100 | ngrams/sec 72700.8 | eta 0h0m7s
| epoch 54 | step 3500/4071 | loss 8.2966 | lr 0.00100 | ngrams/sec 72866.7 | eta 0h0m4s
| epoch 54 | step 4000/4071 | loss 8.2937 | lr 0.00100 | ngrams/sec 72865.3 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1542.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 29.21s | valid loss  6.69 | valid ppl   801.55
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 8.2768 | lr 0.00100 | ngrams/sec 53639.0 | eta 0h0m34s
| epoch 55 | step 1000/4071 | loss 8.2756 | lr 0.00100 | ngrams/sec 72751.0 | eta 0h0m21s
| epoch 55 | step 1500/4071 | loss 8.2841 | lr 0.00100 | ngrams/sec 72736.7 | eta 0h0m18s
| epoch 55 | step 2000/4071 | loss 8.2775 | lr 0.00100 | ngrams/sec 72745.7 | eta 0h0m14s
| epoch 55 | step 2500/4071 | loss 8.2983 | lr 0.00100 | ngrams/sec 72966.8 | eta 0h0m11s
| epoch 55 | step 3000/4071 | loss 8.3031 | lr 0.00100 | ngrams/sec 72762.0 | eta 0h0m7s
| epoch 55 | step 3500/4071 | loss 8.3007 | lr 0.00100 | ngrams/sec 72736.9 | eta 0h0m4s
| epoch 55 | step 4000/4071 | loss 8.2905 | lr 0.00100 | ngrams/sec 72851.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1556.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 29.20s | valid loss  6.68 | valid ppl   793.52
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 8.2585 | lr 0.00100 | ngrams/sec 53616.2 | eta 0h0m34s
| epoch 56 | step 1000/4071 | loss 8.2749 | lr 0.00100 | ngrams/sec 72870.5 | eta 0h0m21s
| epoch 56 | step 1500/4071 | loss 8.2817 | lr 0.00100 | ngrams/sec 72747.4 | eta 0h0m18s
| epoch 56 | step 2000/4071 | loss 8.2872 | lr 0.00100 | ngrams/sec 72821.0 | eta 0h0m14s
| epoch 56 | step 2500/4071 | loss 8.2825 | lr 0.00100 | ngrams/sec 72914.5 | eta 0h0m11s
| epoch 56 | step 3000/4071 | loss 8.2803 | lr 0.00100 | ngrams/sec 72804.1 | eta 0h0m7s
| epoch 56 | step 3500/4071 | loss 8.2850 | lr 0.00100 | ngrams/sec 72960.9 | eta 0h0m4s
| epoch 56 | step 4000/4071 | loss 8.2885 | lr 0.00100 | ngrams/sec 72884.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 29.19s | valid loss  6.68 | valid ppl   798.43
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 8.2512 | lr 0.00100 | ngrams/sec 53499.7 | eta 0h0m34s
| epoch 57 | step 1000/4071 | loss 8.2701 | lr 0.00100 | ngrams/sec 72540.5 | eta 0h0m21s
| epoch 57 | step 1500/4071 | loss 8.2574 | lr 0.00100 | ngrams/sec 72647.5 | eta 0h0m18s
| epoch 57 | step 2000/4071 | loss 8.2825 | lr 0.00100 | ngrams/sec 72768.4 | eta 0h0m14s
| epoch 57 | step 2500/4071 | loss 8.2972 | lr 0.00100 | ngrams/sec 72899.4 | eta 0h0m11s
| epoch 57 | step 3000/4071 | loss 8.2791 | lr 0.00100 | ngrams/sec 72971.5 | eta 0h0m7s
| epoch 57 | step 3500/4071 | loss 8.2855 | lr 0.00100 | ngrams/sec 72925.0 | eta 0h0m4s
| epoch 57 | step 4000/4071 | loss 8.2788 | lr 0.00100 | ngrams/sec 72768.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 29.22s | valid loss  6.66 | valid ppl   776.79
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 58 | step 500/4071 | loss 8.2619 | lr 0.00100 | ngrams/sec 52865.7 | eta 0h0m34s
| epoch 58 | step 1000/4071 | loss 8.2649 | lr 0.00100 | ngrams/sec 72810.6 | eta 0h0m21s
| epoch 58 | step 1500/4071 | loss 8.2656 | lr 0.00100 | ngrams/sec 72765.9 | eta 0h0m18s
| epoch 58 | step 2000/4071 | loss 8.2721 | lr 0.00100 | ngrams/sec 72713.3 | eta 0h0m14s
| epoch 58 | step 2500/4071 | loss 8.2782 | lr 0.00100 | ngrams/sec 72608.9 | eta 0h0m11s
| epoch 58 | step 3000/4071 | loss 8.2665 | lr 0.00100 | ngrams/sec 72789.6 | eta 0h0m7s
| epoch 58 | step 3500/4071 | loss 8.2745 | lr 0.00100 | ngrams/sec 72812.1 | eta 0h0m4s
| epoch 58 | step 4000/4071 | loss 8.2611 | lr 0.00100 | ngrams/sec 72729.4 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1539.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 29.23s | valid loss  6.64 | valid ppl   765.46
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 59 | step 500/4071 | loss 8.2595 | lr 0.00100 | ngrams/sec 52796.2 | eta 0h0m34s
| epoch 59 | step 1000/4071 | loss 8.2659 | lr 0.00100 | ngrams/sec 72732.4 | eta 0h0m21s
| epoch 59 | step 1500/4071 | loss 8.2571 | lr 0.00100 | ngrams/sec 72853.6 | eta 0h0m18s
| epoch 59 | step 2000/4071 | loss 8.2647 | lr 0.00100 | ngrams/sec 72854.9 | eta 0h0m14s
| epoch 59 | step 2500/4071 | loss 8.2571 | lr 0.00100 | ngrams/sec 72691.2 | eta 0h0m11s
| epoch 59 | step 3000/4071 | loss 8.2781 | lr 0.00100 | ngrams/sec 72761.7 | eta 0h0m7s
| epoch 59 | step 3500/4071 | loss 8.2763 | lr 0.00100 | ngrams/sec 72779.1 | eta 0h0m4s
| epoch 59 | step 4000/4071 | loss 8.2767 | lr 0.00100 | ngrams/sec 72972.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1547.54it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 29.21s | valid loss  6.63 | valid ppl   757.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 60 | step 500/4071 | loss 8.2462 | lr 0.00100 | ngrams/sec 52852.8 | eta 0h0m34s
| epoch 60 | step 1000/4071 | loss 8.2574 | lr 0.00100 | ngrams/sec 72852.9 | eta 0h0m21s
| epoch 60 | step 1500/4071 | loss 8.2523 | lr 0.00100 | ngrams/sec 72743.7 | eta 0h0m18s
| epoch 60 | step 2000/4071 | loss 8.2478 | lr 0.00100 | ngrams/sec 72691.0 | eta 0h0m14s
| epoch 60 | step 2500/4071 | loss 8.2579 | lr 0.00100 | ngrams/sec 72789.0 | eta 0h0m11s
| epoch 60 | step 3000/4071 | loss 8.2627 | lr 0.00100 | ngrams/sec 72822.7 | eta 0h0m7s
| epoch 60 | step 3500/4071 | loss 8.2661 | lr 0.00100 | ngrams/sec 72049.3 | eta 0h0m4s
| epoch 60 | step 4000/4071 | loss 8.2698 | lr 0.00100 | ngrams/sec 72534.2 | eta 0h

 37%|███▋      | 156/417 [00:00<00:00, 1537.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 29.27s | valid loss  6.65 | valid ppl   773.31
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 8.2471 | lr 0.00100 | ngrams/sec 53127.9 | eta 0h0m34s
| epoch 61 | step 1000/4071 | loss 8.2558 | lr 0.00100 | ngrams/sec 72657.5 | eta 0h0m21s
| epoch 61 | step 1500/4071 | loss 8.2514 | lr 0.00100 | ngrams/sec 72639.5 | eta 0h0m18s
| epoch 61 | step 2000/4071 | loss 8.2699 | lr 0.00100 | ngrams/sec 72548.7 | eta 0h0m14s
| epoch 61 | step 2500/4071 | loss 8.2626 | lr 0.00100 | ngrams/sec 72892.0 | eta 0h0m11s
| epoch 61 | step 3000/4071 | loss 8.2662 | lr 0.00100 | ngrams/sec 72915.4 | eta 0h0m7s
| epoch 61 | step 3500/4071 | loss 8.2741 | lr 0.00100 | ngrams/sec 72638.9 | eta 0h0m4s
| epoch 61 | step 4000/4071 | loss 8.2585 | lr 0.00100 | ngrams/sec 72841.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 29.27s | valid loss  6.65 | valid ppl   774.52
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 8.2347 | lr 0.00100 | ngrams/sec 53470.1 | eta 0h0m34s
| epoch 62 | step 1000/4071 | loss 8.2455 | lr 0.00100 | ngrams/sec 72623.3 | eta 0h0m21s
| epoch 62 | step 1500/4071 | loss 8.2517 | lr 0.00100 | ngrams/sec 72710.7 | eta 0h0m18s
| epoch 62 | step 2000/4071 | loss 8.2462 | lr 0.00100 | ngrams/sec 72609.6 | eta 0h0m14s
| epoch 62 | step 2500/4071 | loss 8.2621 | lr 0.00100 | ngrams/sec 72698.2 | eta 0h0m11s
| epoch 62 | step 3000/4071 | loss 8.2674 | lr 0.00100 | ngrams/sec 72847.6 | eta 0h0m7s
| epoch 62 | step 3500/4071 | loss 8.2647 | lr 0.00100 | ngrams/sec 72753.6 | eta 0h0m4s
| epoch 62 | step 4000/4071 | loss 8.2602 | lr 0.00100 | ngrams/sec 72831.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 29.24s | valid loss  6.63 | valid ppl   754.42
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 63 | step 500/4071 | loss 8.2289 | lr 0.00100 | ngrams/sec 52806.0 | eta 0h0m34s
| epoch 63 | step 1000/4071 | loss 8.2471 | lr 0.00100 | ngrams/sec 72805.9 | eta 0h0m21s
| epoch 63 | step 1500/4071 | loss 8.2343 | lr 0.00100 | ngrams/sec 72677.5 | eta 0h0m18s
| epoch 63 | step 2000/4071 | loss 8.2537 | lr 0.00100 | ngrams/sec 72854.4 | eta 0h0m14s
| epoch 63 | step 2500/4071 | loss 8.2624 | lr 0.00100 | ngrams/sec 72770.0 | eta 0h0m11s
| epoch 63 | step 3000/4071 | loss 8.2573 | lr 0.00100 | ngrams/sec 72661.4 | eta 0h0m7s
| epoch 63 | step 3500/4071 | loss 8.2569 | lr 0.00100 | ngrams/sec 72774.5 | eta 0h0m4s
| epoch 63 | step 4000/4071 | loss 8.2451 | lr 0.00100 | ngrams/sec 72821.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1557.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 29.22s | valid loss  6.64 | valid ppl   762.70
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 8.2210 | lr 0.00100 | ngrams/sec 53444.2 | eta 0h0m34s
| epoch 64 | step 1000/4071 | loss 8.2348 | lr 0.00100 | ngrams/sec 72853.4 | eta 0h0m21s
| epoch 64 | step 1500/4071 | loss 8.2456 | lr 0.00100 | ngrams/sec 72898.3 | eta 0h0m18s
| epoch 64 | step 2000/4071 | loss 8.2349 | lr 0.00100 | ngrams/sec 72705.1 | eta 0h0m14s
| epoch 64 | step 2500/4071 | loss 8.2399 | lr 0.00100 | ngrams/sec 72790.2 | eta 0h0m11s
| epoch 64 | step 3000/4071 | loss 8.2474 | lr 0.00100 | ngrams/sec 72835.9 | eta 0h0m7s
| epoch 64 | step 3500/4071 | loss 8.2491 | lr 0.00100 | ngrams/sec 72957.0 | eta 0h0m4s
| epoch 64 | step 4000/4071 | loss 8.2538 | lr 0.00100 | ngrams/sec 72625.6 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 29.22s | valid loss  6.61 | valid ppl   740.55
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 65 | step 500/4071 | loss 8.2239 | lr 0.00100 | ngrams/sec 52708.1 | eta 0h0m34s
| epoch 65 | step 1000/4071 | loss 8.2448 | lr 0.00100 | ngrams/sec 72610.7 | eta 0h0m21s
| epoch 65 | step 1500/4071 | loss 8.2415 | lr 0.00100 | ngrams/sec 72839.0 | eta 0h0m18s
| epoch 65 | step 2000/4071 | loss 8.2329 | lr 0.00100 | ngrams/sec 72804.9 | eta 0h0m14s
| epoch 65 | step 2500/4071 | loss 8.2359 | lr 0.00100 | ngrams/sec 72764.8 | eta 0h0m11s
| epoch 65 | step 3000/4071 | loss 8.2430 | lr 0.00100 | ngrams/sec 72782.7 | eta 0h0m7s
| epoch 65 | step 3500/4071 | loss 8.2374 | lr 0.00100 | ngrams/sec 72806.4 | eta 0h0m4s
| epoch 65 | step 4000/4071 | loss 8.2494 | lr 0.00100 | ngrams/sec 72760.9 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1551.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 29.23s | valid loss  6.63 | valid ppl   760.53
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 8.2018 | lr 0.00100 | ngrams/sec 53532.9 | eta 0h0m34s
| epoch 66 | step 1000/4071 | loss 8.2297 | lr 0.00100 | ngrams/sec 72866.3 | eta 0h0m21s
| epoch 66 | step 1500/4071 | loss 8.2227 | lr 0.00100 | ngrams/sec 73046.2 | eta 0h0m18s
| epoch 66 | step 2000/4071 | loss 8.2482 | lr 0.00100 | ngrams/sec 72890.4 | eta 0h0m14s
| epoch 66 | step 2500/4071 | loss 8.2559 | lr 0.00100 | ngrams/sec 72661.1 | eta 0h0m11s
| epoch 66 | step 3000/4071 | loss 8.2345 | lr 0.00100 | ngrams/sec 72750.1 | eta 0h0m7s
| epoch 66 | step 3500/4071 | loss 8.2421 | lr 0.00100 | ngrams/sec 72815.1 | eta 0h0m4s
| epoch 66 | step 4000/4071 | loss 8.2440 | lr 0.00100 | ngrams/sec 72770.1 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 29.20s | valid loss  6.62 | valid ppl   746.26
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 8.2149 | lr 0.00100 | ngrams/sec 53572.7 | eta 0h0m34s
| epoch 67 | step 1000/4071 | loss 8.2143 | lr 0.00100 | ngrams/sec 72905.3 | eta 0h0m21s
| epoch 67 | step 1500/4071 | loss 8.2223 | lr 0.00100 | ngrams/sec 72524.0 | eta 0h0m18s
| epoch 67 | step 2000/4071 | loss 8.2359 | lr 0.00100 | ngrams/sec 72863.0 | eta 0h0m14s
| epoch 67 | step 2500/4071 | loss 8.2389 | lr 0.00100 | ngrams/sec 72813.1 | eta 0h0m11s
| epoch 67 | step 3000/4071 | loss 8.2453 | lr 0.00100 | ngrams/sec 72445.9 | eta 0h0m7s
| epoch 67 | step 3500/4071 | loss 8.2423 | lr 0.00100 | ngrams/sec 72534.4 | eta 0h0m4s
| epoch 67 | step 4000/4071 | loss 8.2347 | lr 0.00100 | ngrams/sec 72674.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1559.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 29.25s | valid loss  6.65 | valid ppl   772.61
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 8.2109 | lr 0.00100 | ngrams/sec 53530.9 | eta 0h0m34s
| epoch 68 | step 1000/4071 | loss 8.2110 | lr 0.00100 | ngrams/sec 72681.8 | eta 0h0m21s
| epoch 68 | step 1500/4071 | loss 8.2271 | lr 0.00100 | ngrams/sec 72811.4 | eta 0h0m18s
| epoch 68 | step 2000/4071 | loss 8.2229 | lr 0.00100 | ngrams/sec 72837.1 | eta 0h0m14s
| epoch 68 | step 2500/4071 | loss 8.2183 | lr 0.00100 | ngrams/sec 72547.6 | eta 0h0m11s
| epoch 68 | step 3000/4071 | loss 8.2249 | lr 0.00100 | ngrams/sec 72712.4 | eta 0h0m7s
| epoch 68 | step 3500/4071 | loss 8.2155 | lr 0.00100 | ngrams/sec 72910.5 | eta 0h0m4s
| epoch 68 | step 4000/4071 | loss 8.2411 | lr 0.00100 | ngrams/sec 72557.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1549.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 29.24s | valid loss  6.62 | valid ppl   750.22
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 8.2023 | lr 0.00100 | ngrams/sec 53512.0 | eta 0h0m34s
| epoch 69 | step 1000/4071 | loss 8.2104 | lr 0.00100 | ngrams/sec 72522.9 | eta 0h0m21s
| epoch 69 | step 1500/4071 | loss 8.2267 | lr 0.00100 | ngrams/sec 72831.7 | eta 0h0m18s
| epoch 69 | step 2000/4071 | loss 8.2229 | lr 0.00100 | ngrams/sec 72663.9 | eta 0h0m14s
| epoch 69 | step 2500/4071 | loss 8.2214 | lr 0.00100 | ngrams/sec 72611.0 | eta 0h0m11s
| epoch 69 | step 3000/4071 | loss 8.2218 | lr 0.00100 | ngrams/sec 72811.5 | eta 0h0m7s
| epoch 69 | step 3500/4071 | loss 8.2307 | lr 0.00100 | ngrams/sec 72841.0 | eta 0h0m4s
| epoch 69 | step 4000/4071 | loss 8.2264 | lr 0.00100 | ngrams/sec 72620.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1550.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 29.25s | valid loss  6.60 | valid ppl   735.34
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 70 | step 500/4071 | loss 8.2060 | lr 0.00100 | ngrams/sec 52767.1 | eta 0h0m34s
| epoch 70 | step 1000/4071 | loss 8.2196 | lr 0.00100 | ngrams/sec 72618.6 | eta 0h0m21s
| epoch 70 | step 1500/4071 | loss 8.2167 | lr 0.00100 | ngrams/sec 72712.5 | eta 0h0m18s
| epoch 70 | step 2000/4071 | loss 8.2182 | lr 0.00100 | ngrams/sec 72793.7 | eta 0h0m14s
| epoch 70 | step 2500/4071 | loss 8.2177 | lr 0.00100 | ngrams/sec 72636.9 | eta 0h0m11s
| epoch 70 | step 3000/4071 | loss 8.2184 | lr 0.00100 | ngrams/sec 72425.5 | eta 0h0m7s
| epoch 70 | step 3500/4071 | loss 8.2204 | lr 0.00100 | ngrams/sec 72793.4 | eta 0h0m4s
| epoch 70 | step 4000/4071 | loss 8.2184 | lr 0.00100 | ngrams/sec 72818.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1552.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 29.26s | valid loss  6.65 | valid ppl   774.61
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 8.1920 | lr 0.00100 | ngrams/sec 53446.9 | eta 0h0m34s
| epoch 71 | step 1000/4071 | loss 8.1946 | lr 0.00100 | ngrams/sec 72754.3 | eta 0h0m21s
| epoch 71 | step 1500/4071 | loss 8.2102 | lr 0.00100 | ngrams/sec 72573.3 | eta 0h0m18s
| epoch 71 | step 2000/4071 | loss 8.2075 | lr 0.00100 | ngrams/sec 72289.1 | eta 0h0m14s
| epoch 71 | step 2500/4071 | loss 8.2054 | lr 0.00100 | ngrams/sec 72533.1 | eta 0h0m11s
| epoch 71 | step 3000/4071 | loss 8.2189 | lr 0.00100 | ngrams/sec 72389.1 | eta 0h0m7s
| epoch 71 | step 3500/4071 | loss 8.2117 | lr 0.00100 | ngrams/sec 72730.5 | eta 0h0m4s
| epoch 71 | step 4000/4071 | loss 8.2157 | lr 0.00100 | ngrams/sec 72849.0 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1539.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 29.29s | valid loss  6.60 | valid ppl   737.35
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 8.1982 | lr 0.00100 | ngrams/sec 53323.6 | eta 0h0m34s
| epoch 72 | step 1000/4071 | loss 8.1970 | lr 0.00100 | ngrams/sec 72844.3 | eta 0h0m21s
| epoch 72 | step 1500/4071 | loss 8.1859 | lr 0.00100 | ngrams/sec 72768.3 | eta 0h0m18s
| epoch 72 | step 2000/4071 | loss 8.1889 | lr 0.00100 | ngrams/sec 72802.8 | eta 0h0m14s
| epoch 72 | step 2500/4071 | loss 8.1976 | lr 0.00100 | ngrams/sec 72928.4 | eta 0h0m11s
| epoch 72 | step 3000/4071 | loss 8.2305 | lr 0.00100 | ngrams/sec 72766.9 | eta 0h0m7s
| epoch 72 | step 3500/4071 | loss 8.2053 | lr 0.00100 | ngrams/sec 72455.5 | eta 0h0m4s
| epoch 72 | step 4000/4071 | loss 8.2100 | lr 0.00100 | ngrams/sec 72802.2 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 29.25s | valid loss  6.64 | valid ppl   766.50
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 8.1832 | lr 0.00100 | ngrams/sec 53421.3 | eta 0h0m34s
| epoch 73 | step 1000/4071 | loss 8.1919 | lr 0.00100 | ngrams/sec 72529.5 | eta 0h0m21s
| epoch 73 | step 1500/4071 | loss 8.2039 | lr 0.00100 | ngrams/sec 72539.2 | eta 0h0m18s
| epoch 73 | step 2000/4071 | loss 8.2085 | lr 0.00100 | ngrams/sec 72818.1 | eta 0h0m14s
| epoch 73 | step 2500/4071 | loss 8.2067 | lr 0.00100 | ngrams/sec 72545.6 | eta 0h0m11s
| epoch 73 | step 3000/4071 | loss 8.2101 | lr 0.00100 | ngrams/sec 72785.7 | eta 0h0m7s
| epoch 73 | step 3500/4071 | loss 8.2026 | lr 0.00100 | ngrams/sec 72693.2 | eta 0h0m4s
| epoch 73 | step 4000/4071 | loss 8.2040 | lr 0.00100 | ngrams/sec 72791.8 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1538.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 29.26s | valid loss  6.62 | valid ppl   750.23
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 8.1892 | lr 0.00100 | ngrams/sec 53490.4 | eta 0h0m34s
| epoch 74 | step 1000/4071 | loss 8.2013 | lr 0.00100 | ngrams/sec 72832.4 | eta 0h0m21s
| epoch 74 | step 1500/4071 | loss 8.1925 | lr 0.00100 | ngrams/sec 72709.7 | eta 0h0m18s
| epoch 74 | step 2000/4071 | loss 8.2010 | lr 0.00100 | ngrams/sec 72792.0 | eta 0h0m14s
| epoch 74 | step 2500/4071 | loss 8.2023 | lr 0.00100 | ngrams/sec 72798.9 | eta 0h0m11s
| epoch 74 | step 3000/4071 | loss 8.2090 | lr 0.00100 | ngrams/sec 72522.7 | eta 0h0m7s
| epoch 74 | step 3500/4071 | loss 8.2112 | lr 0.00100 | ngrams/sec 70135.2 | eta 0h0m4s
| epoch 74 | step 4000/4071 | loss 8.2019 | lr 0.00100 | ngrams/sec 72731.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 29.37s | valid loss  6.62 | valid ppl   750.42
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 8.1640 | lr 0.00100 | ngrams/sec 53378.1 | eta 0h0m34s
| epoch 75 | step 1000/4071 | loss 8.1967 | lr 0.00100 | ngrams/sec 72625.3 | eta 0h0m21s
| epoch 75 | step 1500/4071 | loss 8.1844 | lr 0.00100 | ngrams/sec 72671.8 | eta 0h0m18s
| epoch 75 | step 2000/4071 | loss 8.1964 | lr 0.00100 | ngrams/sec 72765.6 | eta 0h0m14s
| epoch 75 | step 2500/4071 | loss 8.1971 | lr 0.00100 | ngrams/sec 72527.3 | eta 0h0m11s
| epoch 75 | step 3000/4071 | loss 8.1908 | lr 0.00100 | ngrams/sec 72758.6 | eta 0h0m7s
| epoch 75 | step 3500/4071 | loss 8.2056 | lr 0.00100 | ngrams/sec 72604.3 | eta 0h0m4s
| epoch 75 | step 4000/4071 | loss 8.2058 | lr 0.00100 | ngrams/sec 72849.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1557.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.15it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 29.26s | valid loss  6.60 | valid ppl   733.53
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 76 | step 500/4071 | loss 8.1634 | lr 0.00100 | ngrams/sec 52675.7 | eta 0h0m34s
| epoch 76 | step 1000/4071 | loss 8.1907 | lr 0.00100 | ngrams/sec 72820.7 | eta 0h0m21s
| epoch 76 | step 1500/4071 | loss 8.1934 | lr 0.00100 | ngrams/sec 72656.2 | eta 0h0m18s
| epoch 76 | step 2000/4071 | loss 8.1940 | lr 0.00100 | ngrams/sec 72637.0 | eta 0h0m14s
| epoch 76 | step 2500/4071 | loss 8.1905 | lr 0.00100 | ngrams/sec 72828.3 | eta 0h0m11s
| epoch 76 | step 3000/4071 | loss 8.2019 | lr 0.00100 | ngrams/sec 72673.8 | eta 0h0m7s
| epoch 76 | step 3500/4071 | loss 8.2082 | lr 0.00100 | ngrams/sec 72893.6 | eta 0h0m4s
| epoch 76 | step 4000/4071 | loss 8.2082 | lr 0.00100 | ngrams/sec 72727.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1553.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 29.24s | valid loss  6.61 | valid ppl   745.52
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 8.1704 | lr 0.00100 | ngrams/sec 53417.1 | eta 0h0m34s
| epoch 77 | step 1000/4071 | loss 8.1791 | lr 0.00100 | ngrams/sec 72701.8 | eta 0h0m21s
| epoch 77 | step 1500/4071 | loss 8.1820 | lr 0.00100 | ngrams/sec 72696.8 | eta 0h0m18s
| epoch 77 | step 2000/4071 | loss 8.1907 | lr 0.00100 | ngrams/sec 72752.4 | eta 0h0m14s
| epoch 77 | step 2500/4071 | loss 8.1938 | lr 0.00100 | ngrams/sec 72616.3 | eta 0h0m11s
| epoch 77 | step 3000/4071 | loss 8.2034 | lr 0.00100 | ngrams/sec 72547.8 | eta 0h0m7s
| epoch 77 | step 3500/4071 | loss 8.1959 | lr 0.00100 | ngrams/sec 72654.1 | eta 0h0m4s
| epoch 77 | step 4000/4071 | loss 8.1779 | lr 0.00100 | ngrams/sec 72702.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1558.69it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 29.27s | valid loss  6.59 | valid ppl   725.96
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 78 | step 500/4071 | loss 8.1666 | lr 0.00100 | ngrams/sec 52554.2 | eta 0h0m34s
| epoch 78 | step 1000/4071 | loss 8.1555 | lr 0.00100 | ngrams/sec 72110.3 | eta 0h0m21s
| epoch 78 | step 1500/4071 | loss 8.1750 | lr 0.00100 | ngrams/sec 72914.0 | eta 0h0m18s
| epoch 78 | step 2000/4071 | loss 8.1867 | lr 0.00100 | ngrams/sec 72723.0 | eta 0h0m14s
| epoch 78 | step 2500/4071 | loss 8.1897 | lr 0.00100 | ngrams/sec 72427.8 | eta 0h0m11s
| epoch 78 | step 3000/4071 | loss 8.1788 | lr 0.00100 | ngrams/sec 72607.0 | eta 0h0m7s
| epoch 78 | step 3500/4071 | loss 8.2003 | lr 0.00100 | ngrams/sec 72712.8 | eta 0h0m4s
| epoch 78 | step 4000/4071 | loss 8.1956 | lr 0.00100 | ngrams/sec 72557.0 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1552.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 29.31s | valid loss  6.60 | valid ppl   731.46
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 8.1522 | lr 0.00100 | ngrams/sec 53480.2 | eta 0h0m34s
| epoch 79 | step 1000/4071 | loss 8.1762 | lr 0.00100 | ngrams/sec 72921.2 | eta 0h0m21s
| epoch 79 | step 1500/4071 | loss 8.1905 | lr 0.00100 | ngrams/sec 72606.8 | eta 0h0m18s
| epoch 79 | step 2000/4071 | loss 8.1815 | lr 0.00100 | ngrams/sec 72832.4 | eta 0h0m14s
| epoch 79 | step 2500/4071 | loss 8.1808 | lr 0.00100 | ngrams/sec 72766.0 | eta 0h0m11s
| epoch 79 | step 3000/4071 | loss 8.1916 | lr 0.00100 | ngrams/sec 72504.3 | eta 0h0m7s
| epoch 79 | step 3500/4071 | loss 8.1830 | lr 0.00100 | ngrams/sec 72629.4 | eta 0h0m4s
| epoch 79 | step 4000/4071 | loss 8.1965 | lr 0.00100 | ngrams/sec 72632.1 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1539.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 29.25s | valid loss  6.60 | valid ppl   735.20
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 8.1585 | lr 0.00100 | ngrams/sec 53498.4 | eta 0h0m34s
| epoch 80 | step 1000/4071 | loss 8.1589 | lr 0.00100 | ngrams/sec 72525.4 | eta 0h0m21s
| epoch 80 | step 1500/4071 | loss 8.1783 | lr 0.00100 | ngrams/sec 72596.3 | eta 0h0m18s
| epoch 80 | step 2000/4071 | loss 8.1774 | lr 0.00100 | ngrams/sec 72654.9 | eta 0h0m14s
| epoch 80 | step 2500/4071 | loss 8.1857 | lr 0.00100 | ngrams/sec 72572.0 | eta 0h0m11s
| epoch 80 | step 3000/4071 | loss 8.1630 | lr 0.00100 | ngrams/sec 72585.8 | eta 0h0m7s
| epoch 80 | step 3500/4071 | loss 8.1713 | lr 0.00100 | ngrams/sec 72523.0 | eta 0h0m4s
| epoch 80 | step 4000/4071 | loss 8.1784 | lr 0.00100 | ngrams/sec 72640.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 29.29s | valid loss  6.60 | valid ppl   734.66
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 8.1705 | lr 0.00100 | ngrams/sec 53500.2 | eta 0h0m34s
| epoch 81 | step 1000/4071 | loss 8.1665 | lr 0.00100 | ngrams/sec 72783.8 | eta 0h0m21s
| epoch 81 | step 1500/4071 | loss 8.1786 | lr 0.00100 | ngrams/sec 72942.8 | eta 0h0m18s
| epoch 81 | step 2000/4071 | loss 8.1592 | lr 0.00100 | ngrams/sec 72619.8 | eta 0h0m14s
| epoch 81 | step 2500/4071 | loss 8.1650 | lr 0.00100 | ngrams/sec 72781.3 | eta 0h0m11s
| epoch 81 | step 3000/4071 | loss 8.1943 | lr 0.00100 | ngrams/sec 72543.3 | eta 0h0m7s
| epoch 81 | step 3500/4071 | loss 8.1709 | lr 0.00100 | ngrams/sec 72380.0 | eta 0h0m4s
| epoch 81 | step 4000/4071 | loss 8.1736 | lr 0.00100 | ngrams/sec 72420.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 29.27s | valid loss  6.60 | valid ppl   733.61
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 8.1519 | lr 0.00100 | ngrams/sec 53167.6 | eta 0h0m34s
| epoch 82 | step 1000/4071 | loss 8.1596 | lr 0.00100 | ngrams/sec 72461.7 | eta 0h0m21s
| epoch 82 | step 1500/4071 | loss 8.1908 | lr 0.00100 | ngrams/sec 72810.5 | eta 0h0m18s
| epoch 82 | step 2000/4071 | loss 8.1600 | lr 0.00100 | ngrams/sec 72841.1 | eta 0h0m14s
| epoch 82 | step 2500/4071 | loss 8.1748 | lr 0.00100 | ngrams/sec 72580.8 | eta 0h0m11s
| epoch 82 | step 3000/4071 | loss 8.1723 | lr 0.00100 | ngrams/sec 72817.5 | eta 0h0m7s
| epoch 82 | step 3500/4071 | loss 8.1745 | lr 0.00100 | ngrams/sec 72587.4 | eta 0h0m4s
| epoch 82 | step 4000/4071 | loss 8.1535 | lr 0.00100 | ngrams/sec 72725.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1551.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 29.28s | valid loss  6.61 | valid ppl   740.13
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 8.1373 | lr 0.00100 | ngrams/sec 53449.0 | eta 0h0m34s
| epoch 83 | step 1000/4071 | loss 8.1590 | lr 0.00100 | ngrams/sec 72605.3 | eta 0h0m21s
| epoch 83 | step 1500/4071 | loss 8.1531 | lr 0.00100 | ngrams/sec 72850.0 | eta 0h0m18s
| epoch 83 | step 2000/4071 | loss 8.1681 | lr 0.00100 | ngrams/sec 72818.6 | eta 0h0m14s
| epoch 83 | step 2500/4071 | loss 8.1620 | lr 0.00100 | ngrams/sec 72511.7 | eta 0h0m11s
| epoch 83 | step 3000/4071 | loss 8.1697 | lr 0.00100 | ngrams/sec 72660.4 | eta 0h0m7s
| epoch 83 | step 3500/4071 | loss 8.1822 | lr 0.00100 | ngrams/sec 72689.4 | eta 0h0m4s
| epoch 83 | step 4000/4071 | loss 8.1721 | lr 0.00100 | ngrams/sec 72679.3 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 29.26s | valid loss  6.56 | valid ppl   709.11
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 84 | step 500/4071 | loss 8.1519 | lr 0.00100 | ngrams/sec 52689.1 | eta 0h0m34s
| epoch 84 | step 1000/4071 | loss 8.1381 | lr 0.00100 | ngrams/sec 72767.7 | eta 0h0m21s
| epoch 84 | step 1500/4071 | loss 8.1651 | lr 0.00100 | ngrams/sec 72510.4 | eta 0h0m18s
| epoch 84 | step 2000/4071 | loss 8.1621 | lr 0.00100 | ngrams/sec 72682.2 | eta 0h0m14s
| epoch 84 | step 2500/4071 | loss 8.1511 | lr 0.00100 | ngrams/sec 72596.5 | eta 0h0m11s
| epoch 84 | step 3000/4071 | loss 8.1598 | lr 0.00100 | ngrams/sec 72643.5 | eta 0h0m7s
| epoch 84 | step 3500/4071 | loss 8.1641 | lr 0.00100 | ngrams/sec 72548.9 | eta 0h0m4s
| epoch 84 | step 4000/4071 | loss 8.1659 | lr 0.00100 | ngrams/sec 72544.4 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1553.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 29.28s | valid loss  6.60 | valid ppl   737.05
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 8.1403 | lr 0.00100 | ngrams/sec 53535.4 | eta 0h0m34s
| epoch 85 | step 1000/4071 | loss 8.1432 | lr 0.00100 | ngrams/sec 72850.7 | eta 0h0m21s
| epoch 85 | step 1500/4071 | loss 8.1578 | lr 0.00100 | ngrams/sec 72673.9 | eta 0h0m18s
| epoch 85 | step 2000/4071 | loss 8.1570 | lr 0.00100 | ngrams/sec 72787.5 | eta 0h0m14s
| epoch 85 | step 2500/4071 | loss 8.1563 | lr 0.00100 | ngrams/sec 72710.7 | eta 0h0m11s
| epoch 85 | step 3000/4071 | loss 8.1545 | lr 0.00100 | ngrams/sec 72503.7 | eta 0h0m7s
| epoch 85 | step 3500/4071 | loss 8.1662 | lr 0.00100 | ngrams/sec 72563.4 | eta 0h0m4s
| epoch 85 | step 4000/4071 | loss 8.1560 | lr 0.00100 | ngrams/sec 72666.3 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 29.25s | valid loss  6.57 | valid ppl   710.87
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 8.1185 | lr 0.00100 | ngrams/sec 53472.9 | eta 0h0m34s
| epoch 86 | step 1000/4071 | loss 8.1566 | lr 0.00100 | ngrams/sec 72754.1 | eta 0h0m21s
| epoch 86 | step 1500/4071 | loss 8.1550 | lr 0.00100 | ngrams/sec 72787.3 | eta 0h0m18s
| epoch 86 | step 2000/4071 | loss 8.1548 | lr 0.00100 | ngrams/sec 72674.6 | eta 0h0m14s
| epoch 86 | step 2500/4071 | loss 8.1487 | lr 0.00100 | ngrams/sec 72767.0 | eta 0h0m11s
| epoch 86 | step 3000/4071 | loss 8.1531 | lr 0.00100 | ngrams/sec 72786.8 | eta 0h0m7s
| epoch 86 | step 3500/4071 | loss 8.1570 | lr 0.00100 | ngrams/sec 72647.8 | eta 0h0m4s
| epoch 86 | step 4000/4071 | loss 8.1590 | lr 0.00100 | ngrams/sec 72793.0 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1550.68it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 29.24s | valid loss  6.58 | valid ppl   717.17
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 8.1323 | lr 0.00100 | ngrams/sec 53326.0 | eta 0h0m34s
| epoch 87 | step 1000/4071 | loss 8.1381 | lr 0.00100 | ngrams/sec 72801.1 | eta 0h0m21s
| epoch 87 | step 1500/4071 | loss 8.1408 | lr 0.00100 | ngrams/sec 72744.7 | eta 0h0m18s
| epoch 87 | step 2000/4071 | loss 8.1446 | lr 0.00100 | ngrams/sec 72771.9 | eta 0h0m14s
| epoch 87 | step 2500/4071 | loss 8.1389 | lr 0.00100 | ngrams/sec 72596.2 | eta 0h0m11s
| epoch 87 | step 3000/4071 | loss 8.1518 | lr 0.00100 | ngrams/sec 72943.0 | eta 0h0m7s
| epoch 87 | step 3500/4071 | loss 8.1397 | lr 0.00100 | ngrams/sec 72535.6 | eta 0h0m4s
| epoch 87 | step 4000/4071 | loss 8.1580 | lr 0.00100 | ngrams/sec 72716.8 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1554.13it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 29.25s | valid loss  6.60 | valid ppl   731.93
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 8.1229 | lr 0.00100 | ngrams/sec 53478.7 | eta 0h0m34s
| epoch 88 | step 1000/4071 | loss 8.1285 | lr 0.00100 | ngrams/sec 72569.2 | eta 0h0m21s
| epoch 88 | step 1500/4071 | loss 8.1365 | lr 0.00100 | ngrams/sec 72827.9 | eta 0h0m18s
| epoch 88 | step 2000/4071 | loss 8.1578 | lr 0.00100 | ngrams/sec 72560.4 | eta 0h0m14s
| epoch 88 | step 2500/4071 | loss 8.1493 | lr 0.00100 | ngrams/sec 72309.0 | eta 0h0m11s
| epoch 88 | step 3000/4071 | loss 8.1395 | lr 0.00100 | ngrams/sec 72419.5 | eta 0h0m7s
| epoch 88 | step 3500/4071 | loss 8.1481 | lr 0.00100 | ngrams/sec 72700.2 | eta 0h0m4s
| epoch 88 | step 4000/4071 | loss 8.1628 | lr 0.00100 | ngrams/sec 72633.6 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1542.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 720.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 29.30s | valid loss  6.60 | valid ppl   734.54
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 8.1282 | lr 0.00100 | ngrams/sec 53529.4 | eta 0h0m34s
| epoch 89 | step 1000/4071 | loss 8.1302 | lr 0.00100 | ngrams/sec 72668.9 | eta 0h0m21s
| epoch 89 | step 1500/4071 | loss 8.1516 | lr 0.00100 | ngrams/sec 72800.2 | eta 0h0m18s
| epoch 89 | step 2000/4071 | loss 8.1300 | lr 0.00100 | ngrams/sec 72810.5 | eta 0h0m14s
| epoch 89 | step 2500/4071 | loss 8.1365 | lr 0.00100 | ngrams/sec 72819.8 | eta 0h0m11s
| epoch 89 | step 3000/4071 | loss 8.1312 | lr 0.00100 | ngrams/sec 72670.4 | eta 0h0m7s
| epoch 89 | step 3500/4071 | loss 8.1522 | lr 0.00100 | ngrams/sec 72784.5 | eta 0h0m4s
| epoch 89 | step 4000/4071 | loss 8.1411 | lr 0.00100 | ngrams/sec 72689.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1552.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 29.23s | valid loss  6.55 | valid ppl   697.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 90 | step 500/4071 | loss 8.1102 | lr 0.00100 | ngrams/sec 52741.9 | eta 0h0m34s
| epoch 90 | step 1000/4071 | loss 8.1273 | lr 0.00100 | ngrams/sec 72742.9 | eta 0h0m21s
| epoch 90 | step 1500/4071 | loss 8.1327 | lr 0.00100 | ngrams/sec 72688.7 | eta 0h0m18s
| epoch 90 | step 2000/4071 | loss 8.1334 | lr 0.00100 | ngrams/sec 72798.7 | eta 0h0m14s
| epoch 90 | step 2500/4071 | loss 8.1358 | lr 0.00100 | ngrams/sec 72792.8 | eta 0h0m11s
| epoch 90 | step 3000/4071 | loss 8.1527 | lr 0.00100 | ngrams/sec 72748.8 | eta 0h0m7s
| epoch 90 | step 3500/4071 | loss 8.1352 | lr 0.00100 | ngrams/sec 72875.2 | eta 0h0m4s
| epoch 90 | step 4000/4071 | loss 8.1292 | lr 0.00100 | ngrams/sec 72724.8 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 29.23s | valid loss  6.55 | valid ppl   700.56
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 8.1135 | lr 0.00100 | ngrams/sec 53469.1 | eta 0h0m34s
| epoch 91 | step 1000/4071 | loss 8.1240 | lr 0.00100 | ngrams/sec 72588.4 | eta 0h0m21s
| epoch 91 | step 1500/4071 | loss 8.1374 | lr 0.00100 | ngrams/sec 72727.2 | eta 0h0m18s
| epoch 91 | step 2000/4071 | loss 8.1289 | lr 0.00100 | ngrams/sec 72776.2 | eta 0h0m14s
| epoch 91 | step 2500/4071 | loss 8.1299 | lr 0.00100 | ngrams/sec 72767.8 | eta 0h0m11s
| epoch 91 | step 3000/4071 | loss 8.1302 | lr 0.00100 | ngrams/sec 72734.4 | eta 0h0m7s
| epoch 91 | step 3500/4071 | loss 8.1426 | lr 0.00100 | ngrams/sec 72546.2 | eta 0h0m4s
| epoch 91 | step 4000/4071 | loss 8.1400 | lr 0.00100 | ngrams/sec 72875.7 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 29.25s | valid loss  6.58 | valid ppl   723.60
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 8.1115 | lr 0.00100 | ngrams/sec 53474.3 | eta 0h0m34s
| epoch 92 | step 1000/4071 | loss 8.1320 | lr 0.00100 | ngrams/sec 72569.5 | eta 0h0m21s
| epoch 92 | step 1500/4071 | loss 8.1356 | lr 0.00100 | ngrams/sec 72739.3 | eta 0h0m18s
| epoch 92 | step 2000/4071 | loss 8.1227 | lr 0.00100 | ngrams/sec 72314.3 | eta 0h0m14s
| epoch 92 | step 2500/4071 | loss 8.1398 | lr 0.00100 | ngrams/sec 72226.9 | eta 0h0m11s
| epoch 92 | step 3000/4071 | loss 8.1339 | lr 0.00100 | ngrams/sec 72483.2 | eta 0h0m7s
| epoch 92 | step 3500/4071 | loss 8.1394 | lr 0.00100 | ngrams/sec 72655.0 | eta 0h0m4s
| epoch 92 | step 4000/4071 | loss 8.1395 | lr 0.00100 | ngrams/sec 72647.8 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1550.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 29.32s | valid loss  6.56 | valid ppl   706.71
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 8.0957 | lr 0.00100 | ngrams/sec 53417.0 | eta 0h0m34s
| epoch 93 | step 1000/4071 | loss 8.1069 | lr 0.00100 | ngrams/sec 72768.4 | eta 0h0m21s
| epoch 93 | step 1500/4071 | loss 8.1333 | lr 0.00100 | ngrams/sec 72805.6 | eta 0h0m18s
| epoch 93 | step 2000/4071 | loss 8.1272 | lr 0.00100 | ngrams/sec 72663.9 | eta 0h0m14s
| epoch 93 | step 2500/4071 | loss 8.1367 | lr 0.00100 | ngrams/sec 72715.9 | eta 0h0m11s
| epoch 93 | step 3000/4071 | loss 8.1255 | lr 0.00100 | ngrams/sec 72795.3 | eta 0h0m7s
| epoch 93 | step 3500/4071 | loss 8.1440 | lr 0.00100 | ngrams/sec 72737.3 | eta 0h0m4s
| epoch 93 | step 4000/4071 | loss 8.1387 | lr 0.00100 | ngrams/sec 72317.7 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1553.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 29.26s | valid loss  6.57 | valid ppl   711.15
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 8.0972 | lr 0.00100 | ngrams/sec 53465.2 | eta 0h0m34s
| epoch 94 | step 1000/4071 | loss 8.1215 | lr 0.00100 | ngrams/sec 72690.2 | eta 0h0m21s
| epoch 94 | step 1500/4071 | loss 8.1173 | lr 0.00100 | ngrams/sec 72632.2 | eta 0h0m18s
| epoch 94 | step 2000/4071 | loss 8.1121 | lr 0.00100 | ngrams/sec 72855.6 | eta 0h0m14s
| epoch 94 | step 2500/4071 | loss 8.1175 | lr 0.00100 | ngrams/sec 72670.0 | eta 0h0m11s
| epoch 94 | step 3000/4071 | loss 8.1310 | lr 0.00100 | ngrams/sec 72646.9 | eta 0h0m7s
| epoch 94 | step 3500/4071 | loss 8.1161 | lr 0.00100 | ngrams/sec 72619.9 | eta 0h0m4s
| epoch 94 | step 4000/4071 | loss 8.1283 | lr 0.00100 | ngrams/sec 72876.5 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1551.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 723.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 29.25s | valid loss  6.58 | valid ppl   718.77
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 8.0993 | lr 0.00100 | ngrams/sec 53391.4 | eta 0h0m34s
| epoch 95 | step 1000/4071 | loss 8.1086 | lr 0.00100 | ngrams/sec 72745.8 | eta 0h0m21s
| epoch 95 | step 1500/4071 | loss 8.1128 | lr 0.00100 | ngrams/sec 72359.1 | eta 0h0m18s
| epoch 95 | step 2000/4071 | loss 8.1249 | lr 0.00100 | ngrams/sec 72839.2 | eta 0h0m14s
| epoch 95 | step 2500/4071 | loss 8.1161 | lr 0.00100 | ngrams/sec 72804.5 | eta 0h0m11s
| epoch 95 | step 3000/4071 | loss 8.1113 | lr 0.00100 | ngrams/sec 72777.4 | eta 0h0m7s
| epoch 95 | step 3500/4071 | loss 8.1211 | lr 0.00100 | ngrams/sec 72605.2 | eta 0h0m4s
| epoch 95 | step 4000/4071 | loss 8.1453 | lr 0.00100 | ngrams/sec 72746.4 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 29.26s | valid loss  6.55 | valid ppl   696.86
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 96 | step 500/4071 | loss 8.0900 | lr 0.00100 | ngrams/sec 52646.9 | eta 0h0m34s
| epoch 96 | step 1000/4071 | loss 8.1022 | lr 0.00100 | ngrams/sec 72749.6 | eta 0h0m21s
| epoch 96 | step 1500/4071 | loss 8.1178 | lr 0.00100 | ngrams/sec 72663.9 | eta 0h0m18s
| epoch 96 | step 2000/4071 | loss 8.1075 | lr 0.00100 | ngrams/sec 72720.4 | eta 0h0m14s
| epoch 96 | step 2500/4071 | loss 8.1113 | lr 0.00100 | ngrams/sec 72666.3 | eta 0h0m11s
| epoch 96 | step 3000/4071 | loss 8.1212 | lr 0.00100 | ngrams/sec 72664.1 | eta 0h0m7s
| epoch 96 | step 3500/4071 | loss 8.1246 | lr 0.00100 | ngrams/sec 72647.4 | eta 0h0m4s
| epoch 96 | step 4000/4071 | loss 8.1118 | lr 0.00100 | ngrams/sec 72819.5 | eta 0h

 38%|███▊      | 157/417 [00:00<00:00, 1555.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.31it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 29.26s | valid loss  6.56 | valid ppl   709.38
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 8.0901 | lr 0.00100 | ngrams/sec 53547.1 | eta 0h0m34s
| epoch 97 | step 1000/4071 | loss 8.0990 | lr 0.00100 | ngrams/sec 72978.5 | eta 0h0m21s
| epoch 97 | step 1500/4071 | loss 8.1087 | lr 0.00100 | ngrams/sec 72938.6 | eta 0h0m18s
| epoch 97 | step 2000/4071 | loss 8.1119 | lr 0.00100 | ngrams/sec 72510.1 | eta 0h0m14s
| epoch 97 | step 2500/4071 | loss 8.1112 | lr 0.00100 | ngrams/sec 72657.7 | eta 0h0m11s
| epoch 97 | step 3000/4071 | loss 8.1141 | lr 0.00100 | ngrams/sec 72718.7 | eta 0h0m7s
| epoch 97 | step 3500/4071 | loss 8.1212 | lr 0.00100 | ngrams/sec 72865.1 | eta 0h0m4s
| epoch 97 | step 4000/4071 | loss 8.1188 | lr 0.00100 | ngrams/sec 72700.1 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1555.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 29.22s | valid loss  6.62 | valid ppl   751.73
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 8.0714 | lr 0.00100 | ngrams/sec 53476.2 | eta 0h0m34s
| epoch 98 | step 1000/4071 | loss 8.0947 | lr 0.00100 | ngrams/sec 71635.0 | eta 0h0m21s
| epoch 98 | step 1500/4071 | loss 8.0997 | lr 0.00100 | ngrams/sec 72834.2 | eta 0h0m18s
| epoch 98 | step 2000/4071 | loss 8.1000 | lr 0.00100 | ngrams/sec 72413.7 | eta 0h0m14s
| epoch 98 | step 2500/4071 | loss 8.1103 | lr 0.00100 | ngrams/sec 72297.5 | eta 0h0m11s
| epoch 98 | step 3000/4071 | loss 8.1057 | lr 0.00100 | ngrams/sec 72431.5 | eta 0h0m7s
| epoch 98 | step 3500/4071 | loss 8.0964 | lr 0.00100 | ngrams/sec 72375.8 | eta 0h0m4s
| epoch 98 | step 4000/4071 | loss 8.1230 | lr 0.00100 | ngrams/sec 72092.9 | eta 0h0m0s


 38%|███▊      | 157/417 [00:00<00:00, 1553.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 724.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 29.40s | valid loss  6.58 | valid ppl   721.21
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 8.0866 | lr 0.00100 | ngrams/sec 53097.7 | eta 0h0m34s
| epoch 99 | step 1000/4071 | loss 8.0967 | lr 0.00100 | ngrams/sec 72309.9 | eta 0h0m21s
| epoch 99 | step 1500/4071 | loss 8.0990 | lr 0.00100 | ngrams/sec 72355.2 | eta 0h0m18s
| epoch 99 | step 2000/4071 | loss 8.0983 | lr 0.00100 | ngrams/sec 72190.8 | eta 0h0m14s
| epoch 99 | step 2500/4071 | loss 8.1065 | lr 0.00100 | ngrams/sec 72414.8 | eta 0h0m11s
| epoch 99 | step 3000/4071 | loss 8.1048 | lr 0.00100 | ngrams/sec 72358.3 | eta 0h0m7s
| epoch 99 | step 3500/4071 | loss 8.1015 | lr 0.00100 | ngrams/sec 72501.3 | eta 0h0m4s
| epoch 99 | step 4000/4071 | loss 8.1081 | lr 0.00100 | ngrams/sec 72606.1 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1539.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 721.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 29.39s | valid loss  6.56 | valid ppl   707.38
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 8.0904 | lr 0.00100 | ngrams/sec 53514.8 | eta 0h0m34s
| epoch 100 | step 1000/4071 | loss 8.0862 | lr 0.00100 | ngrams/sec 72736.8 | eta 0h0m21s
| epoch 100 | step 1500/4071 | loss 8.0927 | lr 0.00100 | ngrams/sec 72660.7 | eta 0h0m18s
| epoch 100 | step 2000/4071 | loss 8.0888 | lr 0.00100 | ngrams/sec 72077.0 | eta 0h0m14s
| epoch 100 | step 2500/4071 | loss 8.0968 | lr 0.00100 | ngrams/sec 72555.1 | eta 0h0m11s
| epoch 100 | step 3000/4071 | loss 8.1014 | lr 0.00100 | ngrams/sec 72951.1 | eta 0h0m7s
| epoch 100 | step 3500/4071 | loss 8.1014 | lr 0.00100 | ngrams/sec 72567.8 | eta 0h0m4s
| epoch 100 | step 4000/4071 | loss 8.0879 | lr 0.00100 | ngrams/sec 72538.2 | eta 0h0m0s


 37%|███▋      | 156/417 [00:00<00:00, 1549.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:00<00:00, 722.23it/s]
  0%|          | 0/471 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| end of epoch 100 | time 29.29s | valid loss  6.57 | valid ppl   712.73
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:00<00:00, 698.46it/s]


| End of training | test loss  6.52 | test ppl   679.87


In [24]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [25]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

ntokens = n_class
# input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
print(input)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([[ 2774],
        [26931],
        [16204],
        [23326],
        [28058],
        [14935],
        [16636]], device='cuda:0')


In [26]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        input.fill_(word_idx)
        input = input.cuda() if cuda else input
        # print(input)
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

making
| Generated 0/100 words
early
because
rachel
focuses
mccall
drains
others
delivering
lunch
course
| Generated 10/100 words
for
while
he
perfect
effect
off
minute
than
her
decision
| Generated 20/100 words
<eos>
finds
of
inari
telling
adding
magazine
breaking
anderson
legendary
| Generated 30/100 words
conformity
participating
there
released
i
real
iii
logo
influence
persisted
| Generated 40/100 words
had
apparently
-
he
perceived
mass
term
teeth
o
nominations
| Generated 50/100 words
that
zrínyi
1963
hospitals
directly
jordan
hollow
plates
contain
weapons
| Generated 60/100 words
blocked
both
transport
anderson
this
helps
unaltered
midge
year
of
| Generated 70/100 words
jack
]
government
serves
medical
meyerbeer
winner
anchors
protestant
project
| Generated 80/100 words
heroes
part
then
dover
festival
on
july
he
tried
visitors
| Generated 90/100 words
8
2001
1973
chapel
tuy
1969
–
national
undisclosed
