In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the
# dataset files stored there can be copied into the working directory.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!ls "gdrive/MyDrive/Colab_Files/wikitext-2" # check that it has successfully connected
# The files are expected in your Google Drive under MyDrive/Colab_Files/wikitext-2
!cp "gdrive/MyDrive/Colab_Files/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/Colab_Files/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/Colab_Files/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [None]:
import os
from io import open
import torch

class Dictionary(object):
    """Two-way vocabulary mapping between words and integer ids.

    Ids are handed out in insertion order starting at 0, so ``idx2word[i]``
    is the word whose id is ``i`` and ``word2idx[w]`` is the id of ``w``.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its id."""
        known_id = self.word2idx.get(word)
        if known_id is not None:
            return known_id
        # A new word takes the next free position at the end of the list.
        new_id = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = new_id
        return new_id

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Train/valid/test token streams that share a single vocabulary.

    Attributes:
        dictionary: vocabulary filled while the three files are read
            (train first, then valid, then test).
        train, valid, test: 1-D ``torch.int64`` tensors of word ids.
    """

    def __init__(self, path):
        """Read ``train.txt``, ``valid.txt`` and ``test.txt`` from directory ``path``."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Each kept line is split on whitespace, lower-cased and terminated
        with an ``<eos>`` token. Every word is added to ``self.dictionary``
        (if new) and replaced by its id.

        Lines whose first character is ``=`` are treated as section headers
        and dropped. The file is processed in a single pass so the same skip
        rule applies to the vocabulary and to the returned ids; previously
        headers were skipped only while building the vocabulary, so a header
        word that appeared nowhere else raised ``KeyError`` during lookup.

        Args:
            path: path of the UTF-8 text file to read.

        Returns:
            A 1-D ``torch.int64`` tensor with one id per token (empty if the
            file contains no kept lines).

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        assert os.path.exists(path)
        ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = =
                # NOTE(review): only lines that literally begin with '=' match;
                # a header preceded by whitespace is kept. Verify against the
                # layout of the data files.
                if line.startswith('='):
                    continue
                for word in line.split() + ['<eos>']:
                    # add_word returns the id, whether the word is new or known
                    ids.append(self.dictionary.add_word(word.lower()))

        return torch.tensor(ids, dtype=torch.int64)

    @property
    def vocab_size(self):
        """Number of distinct tokens currently in the dictionary."""
        return len(self.dictionary.idx2word)

# Params

In [None]:
#=== params
# Run configuration and hyperparameters shared by the cells below.
# "paper" refers to the paper whose notation (n, h, m) is used here --
# presumably Bengio et al. (2003); confirm.
corpus = Corpus('/content')  # reads train.txt / valid.txt / test.txt from /content
n_class = corpus.vocab_size  # number of output classes = vocabulary size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512  # number of parallel token columns produced by batchify
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0  # max gradient norm passed to clip_grad_norm_ in the training loop
#===

# Model

In [None]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    The neural model learns a distributed representation of each word
    (the embedding matrix C) together with the probability function of a
    word sequence, expressed as a function of those representations.
    It has a hidden layer with tanh activation, and the output layer is a
    softmax layer.
    For each input of (n - 1) previous words, the output of the model is the
    (log-)probabilities over the |V| words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    """Feed-forward n-gram language model.

    The ``context_size`` input word ids are embedded, the embeddings are
    concatenated, passed through a linear layer with tanh activation and
    dropout, and projected to ``vocab_size`` scores that are normalised with
    ``log_softmax``.

    Args:
        vocab_size (int): number of words in the vocabulary (output classes).
        embed_size (int): size of each embedding vector.
        context_size (int): number of preceding words fed to the model,
            i.e. n - 1 for an n-gram model (``context_size=1`` is a bigram).
        no_hidden (int): number of units in the hidden layer.
        dropout (float): dropout probability applied after the tanh.
        tie_weight (bool): if True, the output projection shares its weight
            matrix with the embedding table. Both matrices must then have the
            same shape, which requires ``no_hidden == embed_size``.

    Raises:
        ValueError: if ``tie_weight`` is True and ``no_hidden != embed_size``.
    """

    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel, self).__init__()
        # Fail at construction time with a clear message instead of letting a
        # mis-shaped shared weight blow up inside forward().
        if tie_weight and no_hidden != embed_size:
            raise ValueError(
                'tie_weight=True requires no_hidden == embed_size '
                '(got no_hidden={}, embed_size={})'.format(no_hidden, embed_size))
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            # linear2.weight is (vocab_size, no_hidden); the embedding table is
            # (vocab_size, embed_size) -- identical shapes after the check above.
            self.linear2.weight = self.embeddings.weight

    def forward(self, inputs):
        """Return next-word log-probabilities for a batch of contexts.

        Args:
            inputs: integer tensor of word ids holding ``context_size`` ids
                per example; it is flattened to one row per example after the
                embedding lookup.

        Returns:
            Tensor with one row per example and ``vocab_size`` columns of
            log-probabilities (``log_softmax`` over dim 1).
        """
        # Concatenate the context embeddings of each example into one row.
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        out = hidden_output.tanh()
        out = self.dropout(out)
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

# Data Loading

In [None]:
def batchify(data, batch_size):
    """Lay a token stream out as ``batch_size`` parallel columns.

    The stream (a 1-D tensor in this notebook's usage) is cut into
    ``batch_size`` equal consecutive pieces; trailing elements that do not
    fill a complete piece are dropped. Piece ``j`` becomes column ``j`` of
    the result, so reading down a column follows the original token order.

    Returns:
        A contiguous tensor with ``data.size(0) // batch_size`` rows and
        ``batch_size`` columns.
    """
    # Largest prefix length that divides evenly into batch_size pieces.
    usable = (data.size(0) // batch_size) * batch_size
    trimmed = data[:usable]
    # One piece per row, then transpose so each piece runs down a column.
    columns = torch.t(trimmed.view(batch_size, -1))
    return columns.contiguous()
def get_batch(data, i, order):
    """Build one batch of n-gram contexts and next-word targets.

    Args:
        data: 2-D tensor as produced by ``batchify`` (rows are time steps,
            columns are independent token streams).
        i: index of the first context row.
        order: number of context rows (history length).

    Returns:
        ``(x, y)`` where ``x`` is rows ``i .. i+order-1`` transposed, so each
        row of ``x`` holds the ``order`` context ids of one stream, and ``y``
        is row ``i+order`` flattened to 1-D (the word that follows each
        context).
    """
    # Plain tensors are used directly: the torch.autograd.Variable wrapper is
    # deprecated and no longer needed.
    x = data[i:i + order].t()
    y = data[i + order].view(-1)
    return x, y
def evaluate(data, model, criterion):
    """Average ``criterion`` loss of ``model`` over the n-gram windows of ``data``.

    Relies on the notebook-level names ``order`` (history length),
    ``get_batch`` and ``tqdm``.

    Args:
        data: 2-D tensor as produced by ``batchify``.
        model: module called as ``model(x)`` on each context batch.
        criterion: loss called as ``criterion(model(x), y)``.

    Returns:
        The mean of the per-window losses (a tensor, so callers can pass it
        to ``torch.exp`` to obtain perplexity).

    Raises:
        ValueError: if ``data`` has too few rows to form a single window.

    Note:
        The model is switched to eval mode and left there; callers must call
        ``model.train()`` before resuming training.
    """
    model.eval()
    # NOTE(review): this count stops one window short -- the last row of
    # ``data`` is never used as a target. Kept as is so it stays consistent
    # with the step count used by the training loop.
    n_steps = data.size(0) - order - 1
    if n_steps <= 0:
        raise ValueError(
            'evaluate() needs more than {} rows of data, got {}'.format(
                order + 1, data.size(0)))
    total_loss = 0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for i in tqdm(range(n_steps)):
            x, y = get_batch(data, i, order)
            total_loss += criterion(model(x), y).detach()
    return total_loss / n_steps

In [None]:
def clock_time(s):
    """Split a duration in seconds into whole ``(hours, minutes, seconds)``.

    Each component is truncated to ``int``; any fractional part of the
    seconds is discarded.
    """
    whole_hours = s // 3600
    leftover = s % 3600
    whole_minutes = leftover // 60
    leftover_seconds = leftover % 60
    return int(whole_hours), int(whole_minutes), int(leftover_seconds)

In [None]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


# Arrange every split into batch_size parallel token columns.
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
    train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
# Show a few (context -> next word) pairs taken from the first column.
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
# Seed before the layers are created: their weights are drawn from the random
# number generator at construction time, so seeding only later (in the
# training cell) leaves the initial weights different on every run.
torch.manual_seed(seed)
model = FNNModel(
    n_class,
    embed_size,
    n_step,
    n_hidden,
    tie_weight=False
    )
if cuda:
    model.cuda()
# Display the model's architecture
print('Model: \n', model)
# The model returns log-probabilities, which is what NLLLoss expects.
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
!nvidia-smi

Mon Nov 30 07:58:51 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    28W /  70W |   1035MiB / 15079MiB |      3%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Set seed for reproducibility.
# This seeds batch shuffling and dropout; the model's weights were already
# initialised when the model was constructed in an earlier cell.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
# NOTE(review): the counter is only ever reset below and never incremented,
# so no early stopping takes place in this cell.
stop_counter = 0

# Kept in a separate name so it can be altered later; in this cell it is only
# used for logging (the optimizer was built with learning_rate).
lr = learning_rate
best_val_loss = None

# One step per window start row; same window count formula as evaluate().
num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)
print_every = 500  # report the running training loss every this many steps

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()  # evaluate() leaves the model in eval mode
        epoch_start_time = time.time()
        # Restart the throughput timer so the first report of an epoch does
        # not include the time spent on the previous epoch's validation.
        t0 = epoch_start_time
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.detach().cpu())
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()

        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        # Compare against None explicitly: a truthiness test would also fire
        # for a stored loss of exactly 0.
        if best_val_loss is None or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

# Report test results for the weights with the best validation loss, not for
# whatever state the last (possibly interrupted or worse) epoch left behind.
# Only reload if a checkpoint was written during this run.
if best_val_loss is not None:
    model.load_state_dict(torch.load('checkpoint.pth'))

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 6.9354 | lr 0.00100 | ngrams/sec 39885.9 | eta 0h0m45s
| epoch 1 | step 1000/4071 | loss 6.6793 | lr 0.00100 | ngrams/sec 41547.3 | eta 0h0m37s
| epoch 1 | step 1500/4071 | loss 6.5985 | lr 0.00100 | ngrams/sec 41345.5 | eta 0h0m31s
| epoch 1 | step 2000/4071 | loss 6.5587 | lr 0.00100 | ngrams/sec 41172.8 | eta 0h0m25s
| epoch 1 | step 2500/4071 | loss 6.5265 | lr 0.00100 | ngrams/sec 40907.0 | eta 0h0m19s
| epoch 1 | step 3000/4071 | loss 6.4907 | lr 0.00100 | ngrams/sec 40646.2 | eta 0h0m13s
| epoch 1 | step 3500/4071 | loss 6.4688 | lr 0.00100 | ngrams/sec 40481.1 | eta 0h0m7s
| epoch 1 | step 4000/4071 | loss 6.4302 | lr 0.00100 | ngrams/sec 40246.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1189.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 315.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 52.46s | valid loss  5.88 | valid ppl   358.63
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 6.2749 | lr 0.00100 | ngrams/sec 28027.9 | eta 0h1m5s
| epoch 2 | step 1000/4071 | loss 6.2735 | lr 0.00100 | ngrams/sec 39607.3 | eta 0h0m39s
| epoch 2 | step 1500/4071 | loss 6.2881 | lr 0.00100 | ngrams/sec 38953.6 | eta 0h0m33s
| epoch 2 | step 2000/4071 | loss 6.2472 | lr 0.00100 | ngrams/sec 38213.9 | eta 0h0m27s
| epoch 2 | step 2500/4071 | loss 6.2363 | lr 0.00100 | ngrams/sec 37580.9 | eta 0h0m21s
| epoch 2 | step 3000/4071 | loss 6.1945 | lr 0.00100 | ngrams/sec 37567.9 | eta 0h0m14s
| epoch 2 | step 3500/4071 | loss 6.1717 | lr 0.00100 | ngrams/sec 37758.0 | eta 0h0m7s
| epoch 2 | step 4000/4071 | loss 6.1582 | lr 0.00100 | ngrams/sec 37963.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1140.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 55.70s | valid loss  5.70 | valid ppl   298.81
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 6.0167 | lr 0.00100 | ngrams/sec 26846.6 | eta 0h1m8s
| epoch 3 | step 1000/4071 | loss 6.0285 | lr 0.00100 | ngrams/sec 39001.9 | eta 0h0m40s
| epoch 3 | step 1500/4071 | loss 6.0192 | lr 0.00100 | ngrams/sec 39181.9 | eta 0h0m33s
| epoch 3 | step 2000/4071 | loss 5.9845 | lr 0.00100 | ngrams/sec 39102.8 | eta 0h0m27s
| epoch 3 | step 2500/4071 | loss 5.9900 | lr 0.00100 | ngrams/sec 38876.0 | eta 0h0m20s
| epoch 3 | step 3000/4071 | loss 5.9565 | lr 0.00100 | ngrams/sec 38532.1 | eta 0h0m14s
| epoch 3 | step 3500/4071 | loss 5.9304 | lr 0.00100 | ngrams/sec 38304.3 | eta 0h0m7s
| epoch 3 | step 4000/4071 | loss 5.9135 | lr 0.00100 | ngrams/sec 38189.7 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1148.69it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 285.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 55.28s | valid loss  5.56 | valid ppl   259.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 5.7691 | lr 0.00100 | ngrams/sec 26519.7 | eta 0h1m8s
| epoch 4 | step 1000/4071 | loss 5.7767 | lr 0.00100 | ngrams/sec 37965.9 | eta 0h0m41s
| epoch 4 | step 1500/4071 | loss 5.7925 | lr 0.00100 | ngrams/sec 38025.3 | eta 0h0m34s
| epoch 4 | step 2000/4071 | loss 5.7820 | lr 0.00100 | ngrams/sec 38082.1 | eta 0h0m27s
| epoch 4 | step 2500/4071 | loss 5.7947 | lr 0.00100 | ngrams/sec 38380.4 | eta 0h0m20s
| epoch 4 | step 3000/4071 | loss 5.7733 | lr 0.00100 | ngrams/sec 38452.3 | eta 0h0m14s
| epoch 4 | step 3500/4071 | loss 5.7806 | lr 0.00100 | ngrams/sec 38550.1 | eta 0h0m7s
| epoch 4 | step 4000/4071 | loss 5.7699 | lr 0.00100 | ngrams/sec 38432.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 55.92s | valid loss  5.50 | valid ppl   245.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 5.6475 | lr 0.00100 | ngrams/sec 26804.5 | eta 0h1m8s
| epoch 5 | step 1000/4071 | loss 5.6724 | lr 0.00100 | ngrams/sec 38355.8 | eta 0h0m40s
| epoch 5 | step 1500/4071 | loss 5.6871 | lr 0.00100 | ngrams/sec 38262.7 | eta 0h0m34s
| epoch 5 | step 2000/4071 | loss 5.6889 | lr 0.00100 | ngrams/sec 38200.7 | eta 0h0m27s
| epoch 5 | step 2500/4071 | loss 5.6916 | lr 0.00100 | ngrams/sec 38204.4 | eta 0h0m21s
| epoch 5 | step 3000/4071 | loss 5.6925 | lr 0.00100 | ngrams/sec 38163.9 | eta 0h0m14s
| epoch 5 | step 3500/4071 | loss 5.7158 | lr 0.00100 | ngrams/sec 38175.1 | eta 0h0m7s
| epoch 5 | step 4000/4071 | loss 5.7185 | lr 0.00100 | ngrams/sec 38211.6 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1169.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.45it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 55.93s | valid loss  5.48 | valid ppl   238.85
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 5.5847 | lr 0.00100 | ngrams/sec 26689.3 | eta 0h1m8s
| epoch 6 | step 1000/4071 | loss 5.6038 | lr 0.00100 | ngrams/sec 38163.5 | eta 0h0m41s
| epoch 6 | step 1500/4071 | loss 5.6321 | lr 0.00100 | ngrams/sec 38157.7 | eta 0h0m34s
| epoch 6 | step 2000/4071 | loss 5.6460 | lr 0.00100 | ngrams/sec 38136.3 | eta 0h0m27s
| epoch 6 | step 2500/4071 | loss 5.6512 | lr 0.00100 | ngrams/sec 38108.4 | eta 0h0m21s
| epoch 6 | step 3000/4071 | loss 5.6665 | lr 0.00100 | ngrams/sec 38125.9 | eta 0h0m14s
| epoch 6 | step 3500/4071 | loss 5.6770 | lr 0.00100 | ngrams/sec 38087.6 | eta 0h0m7s
| epoch 6 | step 4000/4071 | loss 5.6713 | lr 0.00100 | ngrams/sec 38026.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1139.93it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 286.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 56.09s | valid loss  5.46 | valid ppl   235.74
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/4071 | loss 5.5584 | lr 0.00100 | ngrams/sec 26730.1 | eta 0h1m8s
| epoch 7 | step 1000/4071 | loss 5.5833 | lr 0.00100 | ngrams/sec 38169.5 | eta 0h0m41s
| epoch 7 | step 1500/4071 | loss 5.6072 | lr 0.00100 | ngrams/sec 38188.5 | eta 0h0m34s
| epoch 7 | step 2000/4071 | loss 5.6051 | lr 0.00100 | ngrams/sec 38142.5 | eta 0h0m27s
| epoch 7 | step 2500/4071 | loss 5.6379 | lr 0.00100 | ngrams/sec 38174.0 | eta 0h0m21s
| epoch 7 | step 3000/4071 | loss 5.6362 | lr 0.00100 | ngrams/sec 38197.4 | eta 0h0m14s
| epoch 7 | step 3500/4071 | loss 5.6501 | lr 0.00100 | ngrams/sec 38124.3 | eta 0h0m7s
| epoch 7 | step 4000/4071 | loss 5.6470 | lr 0.00100 | ngrams/sec 38145.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1137.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 56.02s | valid loss  5.46 | valid ppl   234.23
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 5.5321 | lr 0.00100 | ngrams/sec 26702.2 | eta 0h1m8s
| epoch 8 | step 1000/4071 | loss 5.5492 | lr 0.00100 | ngrams/sec 38363.5 | eta 0h0m40s
| epoch 8 | step 1500/4071 | loss 5.5829 | lr 0.00100 | ngrams/sec 38239.9 | eta 0h0m34s
| epoch 8 | step 2000/4071 | loss 5.5976 | lr 0.00100 | ngrams/sec 38245.8 | eta 0h0m27s
| epoch 8 | step 2500/4071 | loss 5.6081 | lr 0.00100 | ngrams/sec 38336.4 | eta 0h0m20s
| epoch 8 | step 3000/4071 | loss 5.6190 | lr 0.00100 | ngrams/sec 38232.0 | eta 0h0m14s
| epoch 8 | step 3500/4071 | loss 5.6225 | lr 0.00100 | ngrams/sec 38077.5 | eta 0h0m7s
| epoch 8 | step 4000/4071 | loss 5.6275 | lr 0.00100 | ngrams/sec 38090.7 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1169.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 286.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 55.93s | valid loss  5.45 | valid ppl   232.35
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 5.4980 | lr 0.00100 | ngrams/sec 26738.3 | eta 0h1m8s
| epoch 9 | step 1000/4071 | loss 5.5328 | lr 0.00100 | ngrams/sec 38326.4 | eta 0h0m41s
| epoch 9 | step 1500/4071 | loss 5.5598 | lr 0.00100 | ngrams/sec 38218.6 | eta 0h0m34s
| epoch 9 | step 2000/4071 | loss 5.5710 | lr 0.00100 | ngrams/sec 38169.9 | eta 0h0m27s
| epoch 9 | step 2500/4071 | loss 5.5913 | lr 0.00100 | ngrams/sec 38147.2 | eta 0h0m21s
| epoch 9 | step 3000/4071 | loss 5.5979 | lr 0.00100 | ngrams/sec 38262.1 | eta 0h0m14s
| epoch 9 | step 3500/4071 | loss 5.6012 | lr 0.00100 | ngrams/sec 38220.9 | eta 0h0m7s
| epoch 9 | step 4000/4071 | loss 5.6189 | lr 0.00100 | ngrams/sec 38346.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1143.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 55.91s | valid loss  5.44 | valid ppl   231.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 5.4837 | lr 0.00100 | ngrams/sec 26755.5 | eta 0h1m8s
| epoch 10 | step 1000/4071 | loss 5.5118 | lr 0.00100 | ngrams/sec 38130.7 | eta 0h0m41s
| epoch 10 | step 1500/4071 | loss 5.5345 | lr 0.00100 | ngrams/sec 38343.1 | eta 0h0m34s
| epoch 10 | step 2000/4071 | loss 5.5510 | lr 0.00100 | ngrams/sec 38170.7 | eta 0h0m27s
| epoch 10 | step 2500/4071 | loss 5.5713 | lr 0.00100 | ngrams/sec 38146.5 | eta 0h0m21s
| epoch 10 | step 3000/4071 | loss 5.5867 | lr 0.00100 | ngrams/sec 38119.3 | eta 0h0m14s
| epoch 10 | step 3500/4071 | loss 5.5830 | lr 0.00100 | ngrams/sec 38377.2 | eta 0h0m7s
| epoch 10 | step 4000/4071 | loss 5.6089 | lr 0.00100 | ngrams/sec 38371.6 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1171.85it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 55.91s | valid loss  5.44 | valid ppl   229.96
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 5.4572 | lr 0.00100 | ngrams/sec 26788.1 | eta 0h1m8s
| epoch 11 | step 1000/4071 | loss 5.4949 | lr 0.00100 | ngrams/sec 38320.8 | eta 0h0m41s
| epoch 11 | step 1500/4071 | loss 5.5275 | lr 0.00100 | ngrams/sec 38400.0 | eta 0h0m34s
| epoch 11 | step 2000/4071 | loss 5.5384 | lr 0.00100 | ngrams/sec 38259.9 | eta 0h0m27s
| epoch 11 | step 2500/4071 | loss 5.5447 | lr 0.00100 | ngrams/sec 38305.7 | eta 0h0m20s
| epoch 11 | step 3000/4071 | loss 5.5520 | lr 0.00100 | ngrams/sec 38297.1 | eta 0h0m14s
| epoch 11 | step 3500/4071 | loss 5.5691 | lr 0.00100 | ngrams/sec 38441.0 | eta 0h0m7s
| epoch 11 | step 4000/4071 | loss 5.5836 | lr 0.00100 | ngrams/sec 38267.7 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1149.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 55.78s | valid loss  5.43 | valid ppl   229.15
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 5.4398 | lr 0.00100 | ngrams/sec 26822.2 | eta 0h1m8s
| epoch 12 | step 1000/4071 | loss 5.4886 | lr 0.00100 | ngrams/sec 38409.3 | eta 0h0m40s
| epoch 12 | step 1500/4071 | loss 5.4947 | lr 0.00100 | ngrams/sec 38436.0 | eta 0h0m34s
| epoch 12 | step 2000/4071 | loss 5.5112 | lr 0.00100 | ngrams/sec 38289.5 | eta 0h0m27s
| epoch 12 | step 2500/4071 | loss 5.5279 | lr 0.00100 | ngrams/sec 38386.3 | eta 0h0m20s
| epoch 12 | step 3000/4071 | loss 5.5514 | lr 0.00100 | ngrams/sec 38391.2 | eta 0h0m14s
| epoch 12 | step 3500/4071 | loss 5.5588 | lr 0.00100 | ngrams/sec 38412.4 | eta 0h0m7s
| epoch 12 | step 4000/4071 | loss 5.5661 | lr 0.00100 | ngrams/sec 38330.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1160.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 55.72s | valid loss  5.43 | valid ppl   228.70
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 5.4319 | lr 0.00100 | ngrams/sec 26790.9 | eta 0h1m8s
| epoch 13 | step 1000/4071 | loss 5.4542 | lr 0.00100 | ngrams/sec 38268.8 | eta 0h0m41s
| epoch 13 | step 1500/4071 | loss 5.4861 | lr 0.00100 | ngrams/sec 38416.2 | eta 0h0m34s
| epoch 13 | step 2000/4071 | loss 5.5079 | lr 0.00100 | ngrams/sec 38234.4 | eta 0h0m27s
| epoch 13 | step 2500/4071 | loss 5.5179 | lr 0.00100 | ngrams/sec 38425.4 | eta 0h0m20s
| epoch 13 | step 3000/4071 | loss 5.5274 | lr 0.00100 | ngrams/sec 38229.2 | eta 0h0m14s
| epoch 13 | step 3500/4071 | loss 5.5321 | lr 0.00100 | ngrams/sec 38451.3 | eta 0h0m7s
| epoch 13 | step 4000/4071 | loss 5.5458 | lr 0.00100 | ngrams/sec 38444.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1144.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 55.74s | valid loss  5.43 | valid ppl   227.91
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 14 | step 500/4071 | loss 5.4096 | lr 0.00100 | ngrams/sec 26835.8 | eta 0h1m8s
| epoch 14 | step 1000/4071 | loss 5.4465 | lr 0.00100 | ngrams/sec 38399.3 | eta 0h0m40s
| epoch 14 | step 1500/4071 | loss 5.4757 | lr 0.00100 | ngrams/sec 38440.5 | eta 0h0m34s
| epoch 14 | step 2000/4071 | loss 5.4825 | lr 0.00100 | ngrams/sec 38354.2 | eta 0h0m27s
| epoch 14 | step 2500/4071 | loss 5.5029 | lr 0.00100 | ngrams/sec 38294.8 | eta 0h0m21s
| epoch 14 | step 3000/4071 | loss 5.5129 | lr 0.00100 | ngrams/sec 38350.0 | eta 0h0m14s
| epoch 14 | step 3500/4071 | loss 5.5066 | lr 0.00100 | ngrams/sec 38465.7 | eta 0h0m7s
| epoch 14 | step 4000/4071 | loss 5.5396 | lr 0.00100 | ngrams/sec 38379.4 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1143.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 55.71s | valid loss  5.43 | valid ppl   227.42
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 5.3874 | lr 0.00100 | ngrams/sec 26866.5 | eta 0h1m8s
| epoch 15 | step 1000/4071 | loss 5.4257 | lr 0.00100 | ngrams/sec 38313.5 | eta 0h0m41s
| epoch 15 | step 1500/4071 | loss 5.4496 | lr 0.00100 | ngrams/sec 38427.2 | eta 0h0m34s
| epoch 15 | step 2000/4071 | loss 5.4632 | lr 0.00100 | ngrams/sec 38473.7 | eta 0h0m27s
| epoch 15 | step 2500/4071 | loss 5.4851 | lr 0.00100 | ngrams/sec 38560.3 | eta 0h0m20s
| epoch 15 | step 3000/4071 | loss 5.5047 | lr 0.00100 | ngrams/sec 38445.6 | eta 0h0m14s
| epoch 15 | step 3500/4071 | loss 5.5119 | lr 0.00100 | ngrams/sec 38480.8 | eta 0h0m7s
| epoch 15 | step 4000/4071 | loss 5.5168 | lr 0.00100 | ngrams/sec 38494.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1139.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 55.60s | valid loss  5.42 | valid ppl   226.02
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 5.3701 | lr 0.00100 | ngrams/sec 26889.3 | eta 0h1m7s
| epoch 16 | step 1000/4071 | loss 5.4128 | lr 0.00100 | ngrams/sec 38490.8 | eta 0h0m40s
| epoch 16 | step 1500/4071 | loss 5.4514 | lr 0.00100 | ngrams/sec 38527.8 | eta 0h0m34s
| epoch 16 | step 2000/4071 | loss 5.4502 | lr 0.00100 | ngrams/sec 38462.4 | eta 0h0m27s
| epoch 16 | step 2500/4071 | loss 5.4704 | lr 0.00100 | ngrams/sec 38492.3 | eta 0h0m20s
| epoch 16 | step 3000/4071 | loss 5.4816 | lr 0.00100 | ngrams/sec 38384.2 | eta 0h0m14s
| epoch 16 | step 3500/4071 | loss 5.4965 | lr 0.00100 | ngrams/sec 38530.3 | eta 0h0m7s
| epoch 16 | step 4000/4071 | loss 5.5014 | lr 0.00100 | ngrams/sec 38491.6 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1175.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 55.58s | valid loss  5.42 | valid ppl   226.09
-----------------------------------------------------------------------------------------
| epoch 17 | step 500/4071 | loss 5.3609 | lr 0.00100 | ngrams/sec 27129.3 | eta 0h1m7s
| epoch 17 | step 1000/4071 | loss 5.3948 | lr 0.00100 | ngrams/sec 38480.0 | eta 0h0m40s
| epoch 17 | step 1500/4071 | loss 5.4252 | lr 0.00100 | ngrams/sec 38382.6 | eta 0h0m34s
| epoch 17 | step 2000/4071 | loss 5.4478 | lr 0.00100 | ngrams/sec 38545.1 | eta 0h0m27s
| epoch 17 | step 2500/4071 | loss 5.4565 | lr 0.00100 | ngrams/sec 38472.8 | eta 0h0m20s
| epoch 17 | step 3000/4071 | loss 5.4837 | lr 0.00100 | ngrams/sec 38627.3 | eta 0h0m14s
| epoch 17 | step 3500/4071 | loss 5.4682 | lr 0.00100 | ngrams/sec 38589.0 | eta 0h0m7s
| epoch 17 | step 4000/4071 | loss 5.4814 | lr 0.00100 | ngrams/sec 38555.4 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1130.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 55.54s | valid loss  5.42 | valid ppl   225.41
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/4071 | loss 5.3466 | lr 0.00100 | ngrams/sec 26964.8 | eta 0h1m7s
| epoch 18 | step 1000/4071 | loss 5.3764 | lr 0.00100 | ngrams/sec 38479.2 | eta 0h0m40s
| epoch 18 | step 1500/4071 | loss 5.4193 | lr 0.00100 | ngrams/sec 38422.1 | eta 0h0m34s
| epoch 18 | step 2000/4071 | loss 5.4220 | lr 0.00100 | ngrams/sec 38504.8 | eta 0h0m27s
| epoch 18 | step 2500/4071 | loss 5.4374 | lr 0.00100 | ngrams/sec 38647.4 | eta 0h0m20s
| epoch 18 | step 3000/4071 | loss 5.4586 | lr 0.00100 | ngrams/sec 38472.5 | eta 0h0m14s
| epoch 18 | step 3500/4071 | loss 5.4498 | lr 0.00100 | ngrams/sec 38642.3 | eta 0h0m7s
| epoch 18 | step 4000/4071 | loss 5.4830 | lr 0.00100 | ngrams/sec 38650.8 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1156.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 55.46s | valid loss  5.42 | valid ppl   225.76
-----------------------------------------------------------------------------------------
| epoch 19 | step 500/4071 | loss 5.3296 | lr 0.00100 | ngrams/sec 27199.7 | eta 0h1m7s
| epoch 19 | step 1000/4071 | loss 5.3827 | lr 0.00100 | ngrams/sec 38500.7 | eta 0h0m40s
| epoch 19 | step 1500/4071 | loss 5.4034 | lr 0.00100 | ngrams/sec 38696.9 | eta 0h0m34s
| epoch 19 | step 2000/4071 | loss 5.4223 | lr 0.00100 | ngrams/sec 38449.2 | eta 0h0m27s
| epoch 19 | step 2500/4071 | loss 5.4208 | lr 0.00100 | ngrams/sec 38646.8 | eta 0h0m20s
| epoch 19 | step 3000/4071 | loss 5.4340 | lr 0.00100 | ngrams/sec 38568.7 | eta 0h0m14s
| epoch 19 | step 3500/4071 | loss 5.4466 | lr 0.00100 | ngrams/sec 38690.3 | eta 0h0m7s
| epoch 19 | step 4000/4071 | loss 5.4514 | lr 0.00100 | ngrams/sec 38597.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1136.76it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 55.44s | valid loss  5.41 | valid ppl   224.12
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 5.3144 | lr 0.00100 | ngrams/sec 26894.7 | eta 0h1m7s
| epoch 20 | step 1000/4071 | loss 5.3533 | lr 0.00100 | ngrams/sec 38514.3 | eta 0h0m40s
| epoch 20 | step 1500/4071 | loss 5.3756 | lr 0.00100 | ngrams/sec 38349.1 | eta 0h0m34s
| epoch 20 | step 2000/4071 | loss 5.3899 | lr 0.00100 | ngrams/sec 38405.0 | eta 0h0m27s
| epoch 20 | step 2500/4071 | loss 5.4240 | lr 0.00100 | ngrams/sec 38438.2 | eta 0h0m20s
| epoch 20 | step 3000/4071 | loss 5.4423 | lr 0.00100 | ngrams/sec 38432.7 | eta 0h0m14s
| epoch 20 | step 3500/4071 | loss 5.4380 | lr 0.00100 | ngrams/sec 38367.8 | eta 0h0m7s
| epoch 20 | step 4000/4071 | loss 5.4461 | lr 0.00100 | ngrams/sec 38249.6 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1147.44it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 55.68s | valid loss  5.42 | valid ppl   225.53
-----------------------------------------------------------------------------------------
| epoch 21 | step 500/4071 | loss 5.3077 | lr 0.00100 | ngrams/sec 27083.3 | eta 0h1m7s
| epoch 21 | step 1000/4071 | loss 5.3516 | lr 0.00100 | ngrams/sec 38336.7 | eta 0h0m41s
| epoch 21 | step 1500/4071 | loss 5.3692 | lr 0.00100 | ngrams/sec 38363.2 | eta 0h0m34s
| epoch 21 | step 2000/4071 | loss 5.3761 | lr 0.00100 | ngrams/sec 38294.2 | eta 0h0m27s
| epoch 21 | step 2500/4071 | loss 5.4014 | lr 0.00100 | ngrams/sec 38202.0 | eta 0h0m21s
| epoch 21 | step 3000/4071 | loss 5.4200 | lr 0.00100 | ngrams/sec 38182.5 | eta 0h0m14s
| epoch 21 | step 3500/4071 | loss 5.4087 | lr 0.00100 | ngrams/sec 38173.4 | eta 0h0m7s
| epoch 21 | step 4000/4071 | loss 5.4256 | lr 0.00100 | ngrams/sec 38172.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1141.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 287.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 55.90s | valid loss  5.41 | valid ppl   224.54
-----------------------------------------------------------------------------------------
| epoch 22 | step 500/4071 | loss 5.3040 | lr 0.00100 | ngrams/sec 26969.6 | eta 0h1m7s
| epoch 22 | step 1000/4071 | loss 5.3205 | lr 0.00100 | ngrams/sec 38250.2 | eta 0h0m41s
| epoch 22 | step 1500/4071 | loss 5.3562 | lr 0.00100 | ngrams/sec 38395.7 | eta 0h0m34s
| epoch 22 | step 2000/4071 | loss 5.3617 | lr 0.00100 | ngrams/sec 38368.6 | eta 0h0m27s
| epoch 22 | step 2500/4071 | loss 5.3866 | lr 0.00100 | ngrams/sec 38357.1 | eta 0h0m20s
| epoch 22 | step 3000/4071 | loss 5.4178 | lr 0.00100 | ngrams/sec 38363.0 | eta 0h0m14s
| epoch 22 | step 3500/4071 | loss 5.3940 | lr 0.00100 | ngrams/sec 38366.8 | eta 0h0m7s
| epoch 22 | step 4000/4071 | loss 5.4277 | lr 0.00100 | ngrams/sec 38590.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.26it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 55.74s | valid loss  5.41 | valid ppl   224.73
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 5.2913 | lr 0.00100 | ngrams/sec 27122.3 | eta 0h1m7s
| epoch 23 | step 1000/4071 | loss 5.3130 | lr 0.00100 | ngrams/sec 38492.6 | eta 0h0m40s
| epoch 23 | step 1500/4071 | loss 5.3522 | lr 0.00100 | ngrams/sec 38517.8 | eta 0h0m34s
| epoch 23 | step 2000/4071 | loss 5.3522 | lr 0.00100 | ngrams/sec 38369.5 | eta 0h0m27s
| epoch 23 | step 2500/4071 | loss 5.3531 | lr 0.00100 | ngrams/sec 38484.8 | eta 0h0m20s
| epoch 23 | step 3000/4071 | loss 5.4041 | lr 0.00100 | ngrams/sec 38512.9 | eta 0h0m14s
| epoch 23 | step 3500/4071 | loss 5.4108 | lr 0.00100 | ngrams/sec 38562.6 | eta 0h0m7s
| epoch 23 | step 4000/4071 | loss 5.4036 | lr 0.00100 | ngrams/sec 38531.6 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1162.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 55.57s | valid loss  5.41 | valid ppl   224.49
-----------------------------------------------------------------------------------------
| epoch 24 | step 500/4071 | loss 5.2883 | lr 0.00100 | ngrams/sec 27199.3 | eta 0h1m7s
| epoch 24 | step 1000/4071 | loss 5.3048 | lr 0.00100 | ngrams/sec 38645.2 | eta 0h0m40s
| epoch 24 | step 1500/4071 | loss 5.3278 | lr 0.00100 | ngrams/sec 38636.7 | eta 0h0m34s
| epoch 24 | step 2000/4071 | loss 5.3418 | lr 0.00100 | ngrams/sec 38622.2 | eta 0h0m27s
| epoch 24 | step 2500/4071 | loss 5.3624 | lr 0.00100 | ngrams/sec 38736.3 | eta 0h0m20s
| epoch 24 | step 3000/4071 | loss 5.3694 | lr 0.00100 | ngrams/sec 38763.5 | eta 0h0m14s
| epoch 24 | step 3500/4071 | loss 5.3891 | lr 0.00100 | ngrams/sec 38498.3 | eta 0h0m7s
| epoch 24 | step 4000/4071 | loss 5.3972 | lr 0.00100 | ngrams/sec 38839.2 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1178.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 55.33s | valid loss  5.41 | valid ppl   224.60
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/4071 | loss 5.2530 | lr 0.00100 | ngrams/sec 27295.2 | eta 0h1m6s
| epoch 25 | step 1000/4071 | loss 5.2964 | lr 0.00100 | ngrams/sec 38681.3 | eta 0h0m40s
| epoch 25 | step 1500/4071 | loss 5.3247 | lr 0.00100 | ngrams/sec 38729.9 | eta 0h0m33s
| epoch 25 | step 2000/4071 | loss 5.3375 | lr 0.00100 | ngrams/sec 38681.7 | eta 0h0m27s
| epoch 25 | step 2500/4071 | loss 5.3469 | lr 0.00100 | ngrams/sec 38770.1 | eta 0h0m20s
| epoch 25 | step 3000/4071 | loss 5.3592 | lr 0.00100 | ngrams/sec 38834.9 | eta 0h0m14s
| epoch 25 | step 3500/4071 | loss 5.3695 | lr 0.00100 | ngrams/sec 38725.5 | eta 0h0m7s
| epoch 25 | step 4000/4071 | loss 5.3883 | lr 0.00100 | ngrams/sec 38784.4 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1175.43it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 55.22s | valid loss  5.41 | valid ppl   224.47
-----------------------------------------------------------------------------------------
| epoch 26 | step 500/4071 | loss 5.2421 | lr 0.00100 | ngrams/sec 27252.1 | eta 0h1m7s
| epoch 26 | step 1000/4071 | loss 5.2902 | lr 0.00100 | ngrams/sec 38866.4 | eta 0h0m40s
| epoch 26 | step 1500/4071 | loss 5.3056 | lr 0.00100 | ngrams/sec 38609.4 | eta 0h0m34s
| epoch 26 | step 2000/4071 | loss 5.3352 | lr 0.00100 | ngrams/sec 38646.4 | eta 0h0m27s
| epoch 26 | step 2500/4071 | loss 5.3373 | lr 0.00100 | ngrams/sec 38492.2 | eta 0h0m20s
| epoch 26 | step 3000/4071 | loss 5.3452 | lr 0.00100 | ngrams/sec 38708.2 | eta 0h0m14s
| epoch 26 | step 3500/4071 | loss 5.3556 | lr 0.00100 | ngrams/sec 38526.2 | eta 0h0m7s
| epoch 26 | step 4000/4071 | loss 5.3627 | lr 0.00100 | ngrams/sec 38675.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1147.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 55.36s | valid loss  5.41 | valid ppl   223.89
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 27 | step 500/4071 | loss 5.2396 | lr 0.00100 | ngrams/sec 26967.1 | eta 0h1m7s
| epoch 27 | step 1000/4071 | loss 5.2805 | lr 0.00100 | ngrams/sec 38502.3 | eta 0h0m40s
| epoch 27 | step 1500/4071 | loss 5.2833 | lr 0.00100 | ngrams/sec 38540.8 | eta 0h0m34s
| epoch 27 | step 2000/4071 | loss 5.3154 | lr 0.00100 | ngrams/sec 38614.6 | eta 0h0m27s
| epoch 27 | step 2500/4071 | loss 5.3163 | lr 0.00100 | ngrams/sec 38572.0 | eta 0h0m20s
| epoch 27 | step 3000/4071 | loss 5.3382 | lr 0.00100 | ngrams/sec 38559.0 | eta 0h0m14s
| epoch 27 | step 3500/4071 | loss 5.3541 | lr 0.00100 | ngrams/sec 38499.6 | eta 0h0m7s
| epoch 27 | step 4000/4071 | loss 5.3582 | lr 0.00100 | ngrams/sec 38502.4 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1142.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 55.49s | valid loss  5.41 | valid ppl   223.31
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 5.2288 | lr 0.00100 | ngrams/sec 26899.2 | eta 0h1m7s
| epoch 28 | step 1000/4071 | loss 5.2601 | lr 0.00100 | ngrams/sec 38350.5 | eta 0h0m40s
| epoch 28 | step 1500/4071 | loss 5.2676 | lr 0.00100 | ngrams/sec 38494.5 | eta 0h0m34s
| epoch 28 | step 2000/4071 | loss 5.3189 | lr 0.00100 | ngrams/sec 38557.4 | eta 0h0m27s
| epoch 28 | step 2500/4071 | loss 5.3130 | lr 0.00100 | ngrams/sec 38417.5 | eta 0h0m20s
| epoch 28 | step 3000/4071 | loss 5.3342 | lr 0.00100 | ngrams/sec 38539.3 | eta 0h0m14s
| epoch 28 | step 3500/4071 | loss 5.3390 | lr 0.00100 | ngrams/sec 38385.9 | eta 0h0m7s
| epoch 28 | step 4000/4071 | loss 5.3399 | lr 0.00100 | ngrams/sec 38431.5 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1144.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 55.60s | valid loss  5.41 | valid ppl   222.99
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 29 | step 500/4071 | loss 5.2107 | lr 0.00100 | ngrams/sec 26846.4 | eta 0h1m8s
| epoch 29 | step 1000/4071 | loss 5.2459 | lr 0.00100 | ngrams/sec 38350.2 | eta 0h0m40s
| epoch 29 | step 1500/4071 | loss 5.2776 | lr 0.00100 | ngrams/sec 38391.2 | eta 0h0m34s
| epoch 29 | step 2000/4071 | loss 5.2825 | lr 0.00100 | ngrams/sec 38391.5 | eta 0h0m27s
| epoch 29 | step 2500/4071 | loss 5.3053 | lr 0.00100 | ngrams/sec 38335.9 | eta 0h0m20s
| epoch 29 | step 3000/4071 | loss 5.3257 | lr 0.00100 | ngrams/sec 38372.5 | eta 0h0m14s
| epoch 29 | step 3500/4071 | loss 5.3386 | lr 0.00100 | ngrams/sec 38405.3 | eta 0h0m7s
| epoch 29 | step 4000/4071 | loss 5.3324 | lr 0.00100 | ngrams/sec 38495.9 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1165.55it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 55.68s | valid loss  5.41 | valid ppl   222.72
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 30 | step 500/4071 | loss 5.1936 | lr 0.00100 | ngrams/sec 26949.9 | eta 0h1m7s
| epoch 30 | step 1000/4071 | loss 5.2219 | lr 0.00100 | ngrams/sec 38696.6 | eta 0h0m40s
| epoch 30 | step 1500/4071 | loss 5.2747 | lr 0.00100 | ngrams/sec 38594.2 | eta 0h0m34s
| epoch 30 | step 2000/4071 | loss 5.2827 | lr 0.00100 | ngrams/sec 38702.8 | eta 0h0m27s
| epoch 30 | step 2500/4071 | loss 5.3061 | lr 0.00100 | ngrams/sec 38902.6 | eta 0h0m20s
| epoch 30 | step 3000/4071 | loss 5.3043 | lr 0.00100 | ngrams/sec 38832.1 | eta 0h0m14s
| epoch 30 | step 3500/4071 | loss 5.3183 | lr 0.00100 | ngrams/sec 38872.3 | eta 0h0m7s
| epoch 30 | step 4000/4071 | loss 5.3279 | lr 0.00100 | ngrams/sec 38875.7 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1170.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 55.17s | valid loss  5.41 | valid ppl   223.00
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 5.1800 | lr 0.00100 | ngrams/sec 27368.9 | eta 0h1m6s
| epoch 31 | step 1000/4071 | loss 5.2276 | lr 0.00100 | ngrams/sec 38877.0 | eta 0h0m40s
| epoch 31 | step 1500/4071 | loss 5.2492 | lr 0.00100 | ngrams/sec 38821.4 | eta 0h0m33s
| epoch 31 | step 2000/4071 | loss 5.2822 | lr 0.00100 | ngrams/sec 38747.6 | eta 0h0m27s
| epoch 31 | step 2500/4071 | loss 5.2703 | lr 0.00100 | ngrams/sec 38752.6 | eta 0h0m20s
| epoch 31 | step 3000/4071 | loss 5.3089 | lr 0.00100 | ngrams/sec 38742.6 | eta 0h0m14s
| epoch 31 | step 3500/4071 | loss 5.3086 | lr 0.00100 | ngrams/sec 38656.7 | eta 0h0m7s
| epoch 31 | step 4000/4071 | loss 5.3050 | lr 0.00100 | ngrams/sec 38601.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1137.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 55.21s | valid loss  5.41 | valid ppl   222.97
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 5.1652 | lr 0.00100 | ngrams/sec 27205.8 | eta 0h1m7s
| epoch 32 | step 1000/4071 | loss 5.2120 | lr 0.00100 | ngrams/sec 38577.9 | eta 0h0m40s
| epoch 32 | step 1500/4071 | loss 5.2377 | lr 0.00100 | ngrams/sec 38551.5 | eta 0h0m34s
| epoch 32 | step 2000/4071 | loss 5.2666 | lr 0.00100 | ngrams/sec 38501.4 | eta 0h0m27s
| epoch 32 | step 2500/4071 | loss 5.2751 | lr 0.00100 | ngrams/sec 38556.2 | eta 0h0m20s
| epoch 32 | step 3000/4071 | loss 5.2952 | lr 0.00100 | ngrams/sec 38436.3 | eta 0h0m14s
| epoch 32 | step 3500/4071 | loss 5.2893 | lr 0.00100 | ngrams/sec 38471.9 | eta 0h0m7s
| epoch 32 | step 4000/4071 | loss 5.3146 | lr 0.00100 | ngrams/sec 38561.5 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1161.22it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 55.52s | valid loss  5.41 | valid ppl   223.14
-----------------------------------------------------------------------------------------
| epoch 33 | step 500/4071 | loss 5.1731 | lr 0.00100 | ngrams/sec 27091.2 | eta 0h1m7s
| epoch 33 | step 1000/4071 | loss 5.2041 | lr 0.00100 | ngrams/sec 38422.4 | eta 0h0m40s
| epoch 33 | step 1500/4071 | loss 5.2207 | lr 0.00100 | ngrams/sec 38366.3 | eta 0h0m34s
| epoch 33 | step 2000/4071 | loss 5.2589 | lr 0.00100 | ngrams/sec 38405.5 | eta 0h0m27s
| epoch 33 | step 2500/4071 | loss 5.2571 | lr 0.00100 | ngrams/sec 38500.5 | eta 0h0m20s
| epoch 33 | step 3000/4071 | loss 5.3008 | lr 0.00100 | ngrams/sec 38497.7 | eta 0h0m14s
| epoch 33 | step 3500/4071 | loss 5.2843 | lr 0.00100 | ngrams/sec 38494.9 | eta 0h0m7s
| epoch 33 | step 4000/4071 | loss 5.2875 | lr 0.00100 | ngrams/sec 38581.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1151.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 55.61s | valid loss  5.41 | valid ppl   223.11
-----------------------------------------------------------------------------------------
| epoch 34 | step 500/4071 | loss 5.1639 | lr 0.00100 | ngrams/sec 27202.7 | eta 0h1m7s
| epoch 34 | step 1000/4071 | loss 5.1887 | lr 0.00100 | ngrams/sec 38615.1 | eta 0h0m40s
| epoch 34 | step 1500/4071 | loss 5.2177 | lr 0.00100 | ngrams/sec 38657.5 | eta 0h0m34s
| epoch 34 | step 2000/4071 | loss 5.2388 | lr 0.00100 | ngrams/sec 38772.9 | eta 0h0m27s
| epoch 34 | step 2500/4071 | loss 5.2517 | lr 0.00100 | ngrams/sec 38700.8 | eta 0h0m20s
| epoch 34 | step 3000/4071 | loss 5.2806 | lr 0.00100 | ngrams/sec 38843.3 | eta 0h0m14s
| epoch 34 | step 3500/4071 | loss 5.2795 | lr 0.00100 | ngrams/sec 38773.9 | eta 0h0m7s
| epoch 34 | step 4000/4071 | loss 5.2740 | lr 0.00100 | ngrams/sec 38820.0 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1149.69it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 55.25s | valid loss  5.41 | valid ppl   223.22
-----------------------------------------------------------------------------------------
| epoch 35 | step 500/4071 | loss 5.1459 | lr 0.00100 | ngrams/sec 27348.5 | eta 0h1m6s
| epoch 35 | step 1000/4071 | loss 5.1910 | lr 0.00100 | ngrams/sec 38782.0 | eta 0h0m40s
| epoch 35 | step 1500/4071 | loss 5.2123 | lr 0.00100 | ngrams/sec 38938.4 | eta 0h0m33s
| epoch 35 | step 2000/4071 | loss 5.2375 | lr 0.00100 | ngrams/sec 38915.4 | eta 0h0m27s
| epoch 35 | step 2500/4071 | loss 5.2486 | lr 0.00100 | ngrams/sec 38897.3 | eta 0h0m20s
| epoch 35 | step 3000/4071 | loss 5.2604 | lr 0.00100 | ngrams/sec 38915.2 | eta 0h0m14s
| epoch 35 | step 3500/4071 | loss 5.2693 | lr 0.00100 | ngrams/sec 38886.7 | eta 0h0m7s
| epoch 35 | step 4000/4071 | loss 5.2677 | lr 0.00100 | ngrams/sec 38908.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1159.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 55.03s | valid loss  5.41 | valid ppl   223.46
-----------------------------------------------------------------------------------------
| epoch 36 | step 500/4071 | loss 5.1313 | lr 0.00100 | ngrams/sec 27320.0 | eta 0h1m6s
| epoch 36 | step 1000/4071 | loss 5.1707 | lr 0.00100 | ngrams/sec 39022.6 | eta 0h0m40s
| epoch 36 | step 1500/4071 | loss 5.2004 | lr 0.00100 | ngrams/sec 38827.3 | eta 0h0m33s
| epoch 36 | step 2000/4071 | loss 5.2325 | lr 0.00100 | ngrams/sec 38926.2 | eta 0h0m27s
| epoch 36 | step 2500/4071 | loss 5.2350 | lr 0.00100 | ngrams/sec 38928.4 | eta 0h0m20s
| epoch 36 | step 3000/4071 | loss 5.2528 | lr 0.00100 | ngrams/sec 38876.8 | eta 0h0m14s
| epoch 36 | step 3500/4071 | loss 5.2669 | lr 0.00100 | ngrams/sec 38940.1 | eta 0h0m7s
| epoch 36 | step 4000/4071 | loss 5.2678 | lr 0.00100 | ngrams/sec 38875.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 55.00s | valid loss  5.41 | valid ppl   224.06
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 5.1260 | lr 0.00100 | ngrams/sec 27450.4 | eta 0h1m6s
| epoch 37 | step 1000/4071 | loss 5.1626 | lr 0.00100 | ngrams/sec 38885.6 | eta 0h0m40s
| epoch 37 | step 1500/4071 | loss 5.2025 | lr 0.00100 | ngrams/sec 38964.7 | eta 0h0m33s
| epoch 37 | step 2000/4071 | loss 5.2177 | lr 0.00100 | ngrams/sec 38883.5 | eta 0h0m27s
| epoch 37 | step 2500/4071 | loss 5.2220 | lr 0.00100 | ngrams/sec 38934.9 | eta 0h0m20s
| epoch 37 | step 3000/4071 | loss 5.2355 | lr 0.00100 | ngrams/sec 38885.5 | eta 0h0m14s
| epoch 37 | step 3500/4071 | loss 5.2613 | lr 0.00100 | ngrams/sec 38914.8 | eta 0h0m7s
| epoch 37 | step 4000/4071 | loss 5.2544 | lr 0.00100 | ngrams/sec 38889.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 54.98s | valid loss  5.41 | valid ppl   223.36
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 5.1170 | lr 0.00100 | ngrams/sec 27386.4 | eta 0h1m6s
| epoch 38 | step 1000/4071 | loss 5.1542 | lr 0.00100 | ngrams/sec 38902.1 | eta 0h0m40s
| epoch 38 | step 1500/4071 | loss 5.1864 | lr 0.00100 | ngrams/sec 38881.6 | eta 0h0m33s
| epoch 38 | step 2000/4071 | loss 5.1999 | lr 0.00100 | ngrams/sec 38687.1 | eta 0h0m27s
| epoch 38 | step 2500/4071 | loss 5.2147 | lr 0.00100 | ngrams/sec 38869.7 | eta 0h0m20s
| epoch 38 | step 3000/4071 | loss 5.2363 | lr 0.00100 | ngrams/sec 38899.1 | eta 0h0m14s
| epoch 38 | step 3500/4071 | loss 5.2478 | lr 0.00100 | ngrams/sec 38813.7 | eta 0h0m7s
| epoch 38 | step 4000/4071 | loss 5.2402 | lr 0.00100 | ngrams/sec 38862.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1160.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 55.07s | valid loss  5.41 | valid ppl   223.26
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/4071 | loss 5.1112 | lr 0.00100 | ngrams/sec 27379.8 | eta 0h1m6s
| epoch 39 | step 1000/4071 | loss 5.1423 | lr 0.00100 | ngrams/sec 38862.0 | eta 0h0m40s
| epoch 39 | step 1500/4071 | loss 5.1714 | lr 0.00100 | ngrams/sec 38796.1 | eta 0h0m33s
| epoch 39 | step 2000/4071 | loss 5.1971 | lr 0.00100 | ngrams/sec 38666.6 | eta 0h0m27s
| epoch 39 | step 2500/4071 | loss 5.2021 | lr 0.00100 | ngrams/sec 38874.2 | eta 0h0m20s
| epoch 39 | step 3000/4071 | loss 5.2261 | lr 0.00100 | ngrams/sec 38815.1 | eta 0h0m14s
| epoch 39 | step 3500/4071 | loss 5.2400 | lr 0.00100 | ngrams/sec 38814.3 | eta 0h0m7s
| epoch 39 | step 4000/4071 | loss 5.2561 | lr 0.00100 | ngrams/sec 38821.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1171.53it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 55.12s | valid loss  5.41 | valid ppl   224.63
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/4071 | loss 5.1060 | lr 0.00100 | ngrams/sec 27357.1 | eta 0h1m6s
| epoch 40 | step 1000/4071 | loss 5.1229 | lr 0.00100 | ngrams/sec 38918.3 | eta 0h0m40s
| epoch 40 | step 1500/4071 | loss 5.1728 | lr 0.00100 | ngrams/sec 38783.8 | eta 0h0m33s
| epoch 40 | step 2000/4071 | loss 5.1860 | lr 0.00100 | ngrams/sec 38857.5 | eta 0h0m27s
| epoch 40 | step 2500/4071 | loss 5.1969 | lr 0.00100 | ngrams/sec 38950.9 | eta 0h0m20s
| epoch 40 | step 3000/4071 | loss 5.2177 | lr 0.00100 | ngrams/sec 39070.0 | eta 0h0m14s
| epoch 40 | step 3500/4071 | loss 5.2146 | lr 0.00100 | ngrams/sec 38900.3 | eta 0h0m7s
| epoch 40 | step 4000/4071 | loss 5.2536 | lr 0.00100 | ngrams/sec 38904.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1170.04it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 55.01s | valid loss  5.41 | valid ppl   223.84
-----------------------------------------------------------------------------------------
| epoch 41 | step 500/4071 | loss 5.1026 | lr 0.00100 | ngrams/sec 27410.3 | eta 0h1m6s
| epoch 41 | step 1000/4071 | loss 5.1199 | lr 0.00100 | ngrams/sec 38934.4 | eta 0h0m40s
| epoch 41 | step 1500/4071 | loss 5.1565 | lr 0.00100 | ngrams/sec 38955.4 | eta 0h0m33s
| epoch 41 | step 2000/4071 | loss 5.1734 | lr 0.00100 | ngrams/sec 39058.9 | eta 0h0m27s
| epoch 41 | step 2500/4071 | loss 5.1974 | lr 0.00100 | ngrams/sec 38974.0 | eta 0h0m20s
| epoch 41 | step 3000/4071 | loss 5.2077 | lr 0.00100 | ngrams/sec 38903.1 | eta 0h0m14s
| epoch 41 | step 3500/4071 | loss 5.2273 | lr 0.00100 | ngrams/sec 38947.1 | eta 0h0m7s
| epoch 41 | step 4000/4071 | loss 5.2312 | lr 0.00100 | ngrams/sec 39007.6 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1174.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.55it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 54.91s | valid loss  5.41 | valid ppl   223.55
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 5.0952 | lr 0.00100 | ngrams/sec 27434.1 | eta 0h1m6s
| epoch 42 | step 1000/4071 | loss 5.1160 | lr 0.00100 | ngrams/sec 39012.6 | eta 0h0m40s
| epoch 42 | step 1500/4071 | loss 5.1625 | lr 0.00100 | ngrams/sec 38990.2 | eta 0h0m33s
| epoch 42 | step 2000/4071 | loss 5.1706 | lr 0.00100 | ngrams/sec 38880.2 | eta 0h0m27s
| epoch 42 | step 2500/4071 | loss 5.1811 | lr 0.00100 | ngrams/sec 38716.2 | eta 0h0m20s
| epoch 42 | step 3000/4071 | loss 5.1999 | lr 0.00100 | ngrams/sec 38821.1 | eta 0h0m14s
| epoch 42 | step 3500/4071 | loss 5.2160 | lr 0.00100 | ngrams/sec 38691.0 | eta 0h0m7s
| epoch 42 | step 4000/4071 | loss 5.2154 | lr 0.00100 | ngrams/sec 38643.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 55.09s | valid loss  5.41 | valid ppl   223.96
-----------------------------------------------------------------------------------------
| epoch 43 | step 500/4071 | loss 5.0656 | lr 0.00100 | ngrams/sec 27267.9 | eta 0h1m7s
| epoch 43 | step 1000/4071 | loss 5.1132 | lr 0.00100 | ngrams/sec 38673.0 | eta 0h0m40s
| epoch 43 | step 1500/4071 | loss 5.1440 | lr 0.00100 | ngrams/sec 38666.0 | eta 0h0m34s
| epoch 43 | step 2000/4071 | loss 5.1711 | lr 0.00100 | ngrams/sec 38602.7 | eta 0h0m27s
| epoch 43 | step 2500/4071 | loss 5.1769 | lr 0.00100 | ngrams/sec 38617.4 | eta 0h0m20s
| epoch 43 | step 3000/4071 | loss 5.1968 | lr 0.00100 | ngrams/sec 38608.3 | eta 0h0m14s
| epoch 43 | step 3500/4071 | loss 5.2177 | lr 0.00100 | ngrams/sec 38480.9 | eta 0h0m7s
| epoch 43 | step 4000/4071 | loss 5.2088 | lr 0.00100 | ngrams/sec 38685.4 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1162.00it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.51it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 55.40s | valid loss  5.41 | valid ppl   224.52
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 5.0707 | lr 0.00100 | ngrams/sec 27216.3 | eta 0h1m7s
| epoch 44 | step 1000/4071 | loss 5.0993 | lr 0.00100 | ngrams/sec 38406.4 | eta 0h0m40s
| epoch 44 | step 1500/4071 | loss 5.1397 | lr 0.00100 | ngrams/sec 38514.4 | eta 0h0m34s
| epoch 44 | step 2000/4071 | loss 5.1545 | lr 0.00100 | ngrams/sec 38512.4 | eta 0h0m27s
| epoch 44 | step 2500/4071 | loss 5.1727 | lr 0.00100 | ngrams/sec 38478.2 | eta 0h0m20s
| epoch 44 | step 3000/4071 | loss 5.1775 | lr 0.00100 | ngrams/sec 38453.0 | eta 0h0m14s
| epoch 44 | step 3500/4071 | loss 5.1995 | lr 0.00100 | ngrams/sec 38474.3 | eta 0h0m7s
| epoch 44 | step 4000/4071 | loss 5.2109 | lr 0.00100 | ngrams/sec 38485.0 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1148.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 55.58s | valid loss  5.41 | valid ppl   223.45
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 5.0677 | lr 0.00100 | ngrams/sec 27102.5 | eta 0h1m7s
| epoch 45 | step 1000/4071 | loss 5.1076 | lr 0.00100 | ngrams/sec 38611.2 | eta 0h0m40s
| epoch 45 | step 1500/4071 | loss 5.1307 | lr 0.00100 | ngrams/sec 38589.7 | eta 0h0m34s
| epoch 45 | step 2000/4071 | loss 5.1320 | lr 0.00100 | ngrams/sec 38603.9 | eta 0h0m27s
| epoch 45 | step 2500/4071 | loss 5.1552 | lr 0.00100 | ngrams/sec 38641.5 | eta 0h0m20s
| epoch 45 | step 3000/4071 | loss 5.1832 | lr 0.00100 | ngrams/sec 38721.6 | eta 0h0m14s
| epoch 45 | step 3500/4071 | loss 5.1867 | lr 0.00100 | ngrams/sec 38680.3 | eta 0h0m7s
| epoch 45 | step 4000/4071 | loss 5.1985 | lr 0.00100 | ngrams/sec 38834.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1144.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 55.36s | valid loss  5.41 | valid ppl   224.21
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 5.0720 | lr 0.00100 | ngrams/sec 27352.4 | eta 0h1m6s
| epoch 46 | step 1000/4071 | loss 5.0833 | lr 0.00100 | ngrams/sec 38827.5 | eta 0h0m40s
| epoch 46 | step 1500/4071 | loss 5.1105 | lr 0.00100 | ngrams/sec 38840.6 | eta 0h0m33s
| epoch 46 | step 2000/4071 | loss 5.1362 | lr 0.00100 | ngrams/sec 38944.1 | eta 0h0m27s
| epoch 46 | step 2500/4071 | loss 5.1575 | lr 0.00100 | ngrams/sec 38846.4 | eta 0h0m20s
| epoch 46 | step 3000/4071 | loss 5.1691 | lr 0.00100 | ngrams/sec 38858.2 | eta 0h0m14s
| epoch 46 | step 3500/4071 | loss 5.1843 | lr 0.00100 | ngrams/sec 38963.2 | eta 0h0m7s
| epoch 46 | step 4000/4071 | loss 5.1953 | lr 0.00100 | ngrams/sec 38448.1 | eta 0h0m0s


 28%|██▊       | 117/417 [00:00<00:00, 1120.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 55.10s | valid loss  5.41 | valid ppl   224.43
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 5.0696 | lr 0.00100 | ngrams/sec 27415.8 | eta 0h1m6s
| epoch 47 | step 1000/4071 | loss 5.0734 | lr 0.00100 | ngrams/sec 39007.9 | eta 0h0m40s
| epoch 47 | step 1500/4071 | loss 5.0997 | lr 0.00100 | ngrams/sec 39013.2 | eta 0h0m33s
| epoch 47 | step 2000/4071 | loss 5.1353 | lr 0.00100 | ngrams/sec 39061.1 | eta 0h0m27s
| epoch 47 | step 2500/4071 | loss 5.1419 | lr 0.00100 | ngrams/sec 39056.2 | eta 0h0m20s
| epoch 47 | step 3000/4071 | loss 5.1752 | lr 0.00100 | ngrams/sec 38998.1 | eta 0h0m14s
| epoch 47 | step 3500/4071 | loss 5.1887 | lr 0.00100 | ngrams/sec 39092.8 | eta 0h0m7s
| epoch 47 | step 4000/4071 | loss 5.1874 | lr 0.00100 | ngrams/sec 39113.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1162.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 54.83s | valid loss  5.41 | valid ppl   224.61
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 5.0379 | lr 0.00100 | ngrams/sec 27441.0 | eta 0h1m6s
| epoch 48 | step 1000/4071 | loss 5.0751 | lr 0.00100 | ngrams/sec 38888.7 | eta 0h0m40s
| epoch 48 | step 1500/4071 | loss 5.1089 | lr 0.00100 | ngrams/sec 38967.2 | eta 0h0m33s
| epoch 48 | step 2000/4071 | loss 5.1319 | lr 0.00100 | ngrams/sec 38757.9 | eta 0h0m27s
| epoch 48 | step 2500/4071 | loss 5.1424 | lr 0.00100 | ngrams/sec 38914.4 | eta 0h0m20s
| epoch 48 | step 3000/4071 | loss 5.1594 | lr 0.00100 | ngrams/sec 38792.5 | eta 0h0m14s
| epoch 48 | step 3500/4071 | loss 5.1618 | lr 0.00100 | ngrams/sec 38777.4 | eta 0h0m7s
| epoch 48 | step 4000/4071 | loss 5.1839 | lr 0.00100 | ngrams/sec 38772.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1149.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 55.08s | valid loss  5.41 | valid ppl   224.65
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 5.0225 | lr 0.00100 | ngrams/sec 27304.6 | eta 0h1m6s
| epoch 49 | step 1000/4071 | loss 5.0688 | lr 0.00100 | ngrams/sec 38761.0 | eta 0h0m40s
| epoch 49 | step 1500/4071 | loss 5.1017 | lr 0.00100 | ngrams/sec 38676.7 | eta 0h0m34s
| epoch 49 | step 2000/4071 | loss 5.1173 | lr 0.00100 | ngrams/sec 38587.0 | eta 0h0m27s
| epoch 49 | step 2500/4071 | loss 5.1556 | lr 0.00100 | ngrams/sec 38679.1 | eta 0h0m20s
| epoch 49 | step 3000/4071 | loss 5.1560 | lr 0.00100 | ngrams/sec 38674.3 | eta 0h0m14s
| epoch 49 | step 3500/4071 | loss 5.1566 | lr 0.00100 | ngrams/sec 38494.9 | eta 0h0m7s
| epoch 49 | step 4000/4071 | loss 5.1746 | lr 0.00100 | ngrams/sec 38466.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1158.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 55.38s | valid loss  5.41 | valid ppl   224.61
-----------------------------------------------------------------------------------------
| epoch 50 | step 500/4071 | loss 5.0537 | lr 0.00100 | ngrams/sec 27227.0 | eta 0h1m7s
| epoch 50 | step 1000/4071 | loss 5.0477 | lr 0.00100 | ngrams/sec 38547.7 | eta 0h0m40s
| epoch 50 | step 1500/4071 | loss 5.1017 | lr 0.00100 | ngrams/sec 38551.6 | eta 0h0m34s
| epoch 50 | step 2000/4071 | loss 5.1051 | lr 0.00100 | ngrams/sec 38541.7 | eta 0h0m27s
| epoch 50 | step 2500/4071 | loss 5.1258 | lr 0.00100 | ngrams/sec 38493.3 | eta 0h0m20s
| epoch 50 | step 3000/4071 | loss 5.1393 | lr 0.00100 | ngrams/sec 38542.0 | eta 0h0m14s
| epoch 50 | step 3500/4071 | loss 5.1525 | lr 0.00100 | ngrams/sec 38551.1 | eta 0h0m7s
| epoch 50 | step 4000/4071 | loss 5.1600 | lr 0.00100 | ngrams/sec 38529.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1165.44it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 55.49s | valid loss  5.42 | valid ppl   224.77
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 5.0257 | lr 0.00100 | ngrams/sec 27182.4 | eta 0h1m7s
| epoch 51 | step 1000/4071 | loss 5.0566 | lr 0.00100 | ngrams/sec 38500.4 | eta 0h0m40s
| epoch 51 | step 1500/4071 | loss 5.0833 | lr 0.00100 | ngrams/sec 38503.1 | eta 0h0m34s
| epoch 51 | step 2000/4071 | loss 5.1132 | lr 0.00100 | ngrams/sec 38614.0 | eta 0h0m27s
| epoch 51 | step 2500/4071 | loss 5.1342 | lr 0.00100 | ngrams/sec 38558.8 | eta 0h0m20s
| epoch 51 | step 3000/4071 | loss 5.1310 | lr 0.00100 | ngrams/sec 38751.2 | eta 0h0m14s
| epoch 51 | step 3500/4071 | loss 5.1411 | lr 0.00100 | ngrams/sec 38776.4 | eta 0h0m7s
| epoch 51 | step 4000/4071 | loss 5.1550 | lr 0.00100 | ngrams/sec 38822.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1163.08it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 55.36s | valid loss  5.42 | valid ppl   225.02
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 5.0171 | lr 0.00100 | ngrams/sec 27387.0 | eta 0h1m6s
| epoch 52 | step 1000/4071 | loss 5.0574 | lr 0.00100 | ngrams/sec 38904.7 | eta 0h0m40s
| epoch 52 | step 1500/4071 | loss 5.0785 | lr 0.00100 | ngrams/sec 38921.7 | eta 0h0m33s
| epoch 52 | step 2000/4071 | loss 5.1023 | lr 0.00100 | ngrams/sec 39082.0 | eta 0h0m27s
| epoch 52 | step 2500/4071 | loss 5.1146 | lr 0.00100 | ngrams/sec 38954.4 | eta 0h0m20s
| epoch 52 | step 3000/4071 | loss 5.1197 | lr 0.00100 | ngrams/sec 38909.4 | eta 0h0m14s
| epoch 52 | step 3500/4071 | loss 5.1562 | lr 0.00100 | ngrams/sec 39035.5 | eta 0h0m7s
| epoch 52 | step 4000/4071 | loss 5.1470 | lr 0.00100 | ngrams/sec 38949.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1174.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.99it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 54.92s | valid loss  5.41 | valid ppl   224.72
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 5.0154 | lr 0.00100 | ngrams/sec 27487.9 | eta 0h1m6s
| epoch 53 | step 1000/4071 | loss 5.0344 | lr 0.00100 | ngrams/sec 38954.3 | eta 0h0m40s
| epoch 53 | step 1500/4071 | loss 5.0732 | lr 0.00100 | ngrams/sec 38939.0 | eta 0h0m33s
| epoch 53 | step 2000/4071 | loss 5.0948 | lr 0.00100 | ngrams/sec 39060.2 | eta 0h0m27s
| epoch 53 | step 2500/4071 | loss 5.1225 | lr 0.00100 | ngrams/sec 38940.3 | eta 0h0m20s
| epoch 53 | step 3000/4071 | loss 5.1309 | lr 0.00100 | ngrams/sec 38824.3 | eta 0h0m14s
| epoch 53 | step 3500/4071 | loss 5.1317 | lr 0.00100 | ngrams/sec 38879.9 | eta 0h0m7s
| epoch 53 | step 4000/4071 | loss 5.1417 | lr 0.00100 | ngrams/sec 38795.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 54.97s | valid loss  5.42 | valid ppl   225.41
-----------------------------------------------------------------------------------------
| epoch 54 | step 500/4071 | loss 5.0037 | lr 0.00100 | ngrams/sec 27392.4 | eta 0h1m6s
| epoch 54 | step 1000/4071 | loss 5.0325 | lr 0.00100 | ngrams/sec 38739.1 | eta 0h0m40s
| epoch 54 | step 1500/4071 | loss 5.0718 | lr 0.00100 | ngrams/sec 38652.6 | eta 0h0m34s
| epoch 54 | step 2000/4071 | loss 5.0910 | lr 0.00100 | ngrams/sec 38671.8 | eta 0h0m27s
| epoch 54 | step 2500/4071 | loss 5.1159 | lr 0.00100 | ngrams/sec 38662.3 | eta 0h0m20s
| epoch 54 | step 3000/4071 | loss 5.1160 | lr 0.00100 | ngrams/sec 38752.8 | eta 0h0m14s
| epoch 54 | step 3500/4071 | loss 5.1227 | lr 0.00100 | ngrams/sec 38733.9 | eta 0h0m7s
| epoch 54 | step 4000/4071 | loss 5.1347 | lr 0.00100 | ngrams/sec 38710.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1173.44it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 55.27s | valid loss  5.42 | valid ppl   225.28
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 5.0002 | lr 0.00100 | ngrams/sec 27210.7 | eta 0h1m7s
| epoch 55 | step 1000/4071 | loss 5.0361 | lr 0.00100 | ngrams/sec 38728.0 | eta 0h0m40s
| epoch 55 | step 1500/4071 | loss 5.0539 | lr 0.00100 | ngrams/sec 38620.0 | eta 0h0m34s
| epoch 55 | step 2000/4071 | loss 5.0675 | lr 0.00100 | ngrams/sec 38605.6 | eta 0h0m27s
| epoch 55 | step 2500/4071 | loss 5.1022 | lr 0.00100 | ngrams/sec 38730.0 | eta 0h0m20s
| epoch 55 | step 3000/4071 | loss 5.1218 | lr 0.00100 | ngrams/sec 38576.5 | eta 0h0m14s
| epoch 55 | step 3500/4071 | loss 5.1330 | lr 0.00100 | ngrams/sec 38541.5 | eta 0h0m7s
| epoch 55 | step 4000/4071 | loss 5.1307 | lr 0.00100 | ngrams/sec 38496.8 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1173.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.19it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 55.40s | valid loss  5.42 | valid ppl   225.39
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 4.9797 | lr 0.00100 | ngrams/sec 27196.1 | eta 0h1m7s
| epoch 56 | step 1000/4071 | loss 5.0220 | lr 0.00100 | ngrams/sec 38557.4 | eta 0h0m40s
| epoch 56 | step 1500/4071 | loss 5.0603 | lr 0.00100 | ngrams/sec 38477.0 | eta 0h0m34s
| epoch 56 | step 2000/4071 | loss 5.0809 | lr 0.00100 | ngrams/sec 38515.1 | eta 0h0m27s
| epoch 56 | step 2500/4071 | loss 5.0958 | lr 0.00100 | ngrams/sec 38496.5 | eta 0h0m20s
| epoch 56 | step 3000/4071 | loss 5.0979 | lr 0.00100 | ngrams/sec 38584.2 | eta 0h0m14s
| epoch 56 | step 3500/4071 | loss 5.1201 | lr 0.00100 | ngrams/sec 38494.3 | eta 0h0m7s
| epoch 56 | step 4000/4071 | loss 5.1404 | lr 0.00100 | ngrams/sec 38579.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1145.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 55.51s | valid loss  5.42 | valid ppl   224.87
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 4.9777 | lr 0.00100 | ngrams/sec 27172.0 | eta 0h1m7s
| epoch 57 | step 1000/4071 | loss 5.0283 | lr 0.00100 | ngrams/sec 38645.5 | eta 0h0m40s
| epoch 57 | step 1500/4071 | loss 5.0370 | lr 0.00100 | ngrams/sec 38724.7 | eta 0h0m33s
| epoch 57 | step 2000/4071 | loss 5.0769 | lr 0.00100 | ngrams/sec 38751.6 | eta 0h0m27s
| epoch 57 | step 2500/4071 | loss 5.0917 | lr 0.00100 | ngrams/sec 38664.7 | eta 0h0m20s
| epoch 57 | step 3000/4071 | loss 5.0939 | lr 0.00100 | ngrams/sec 38555.6 | eta 0h0m14s
| epoch 57 | step 3500/4071 | loss 5.1166 | lr 0.00100 | ngrams/sec 38782.1 | eta 0h0m7s
| epoch 57 | step 4000/4071 | loss 5.1182 | lr 0.00100 | ngrams/sec 38809.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1149.25it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 55.29s | valid loss  5.42 | valid ppl   225.33
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/4071 | loss 4.9990 | lr 0.00100 | ngrams/sec 27297.0 | eta 0h1m6s
| epoch 58 | step 1000/4071 | loss 5.0318 | lr 0.00100 | ngrams/sec 38853.4 | eta 0h0m40s
| epoch 58 | step 1500/4071 | loss 5.0550 | lr 0.00100 | ngrams/sec 38916.4 | eta 0h0m33s
| epoch 58 | step 2000/4071 | loss 5.0606 | lr 0.00100 | ngrams/sec 38898.8 | eta 0h0m27s
| epoch 58 | step 2500/4071 | loss 5.0843 | lr 0.00100 | ngrams/sec 38753.4 | eta 0h0m20s
| epoch 58 | step 3000/4071 | loss 5.0786 | lr 0.00100 | ngrams/sec 38871.8 | eta 0h0m14s
| epoch 58 | step 3500/4071 | loss 5.1028 | lr 0.00100 | ngrams/sec 38932.0 | eta 0h0m7s
| epoch 58 | step 4000/4071 | loss 5.1036 | lr 0.00100 | ngrams/sec 38864.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1142.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 55.07s | valid loss  5.42 | valid ppl   225.74
-----------------------------------------------------------------------------------------
| epoch 59 | step 500/4071 | loss 4.9803 | lr 0.00100 | ngrams/sec 27406.1 | eta 0h1m6s
| epoch 59 | step 1000/4071 | loss 5.0167 | lr 0.00100 | ngrams/sec 38890.0 | eta 0h0m40s
| epoch 59 | step 1500/4071 | loss 5.0406 | lr 0.00100 | ngrams/sec 38906.9 | eta 0h0m33s
| epoch 59 | step 2000/4071 | loss 5.0608 | lr 0.00100 | ngrams/sec 38821.6 | eta 0h0m27s
| epoch 59 | step 2500/4071 | loss 5.0722 | lr 0.00100 | ngrams/sec 38930.5 | eta 0h0m20s
| epoch 59 | step 3000/4071 | loss 5.0876 | lr 0.00100 | ngrams/sec 38940.7 | eta 0h0m14s
| epoch 59 | step 3500/4071 | loss 5.0974 | lr 0.00100 | ngrams/sec 38800.4 | eta 0h0m7s
| epoch 59 | step 4000/4071 | loss 5.1226 | lr 0.00100 | ngrams/sec 38983.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.76it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 55.00s | valid loss  5.42 | valid ppl   225.45
-----------------------------------------------------------------------------------------
| epoch 60 | step 500/4071 | loss 4.9736 | lr 0.00100 | ngrams/sec 27415.9 | eta 0h1m6s
| epoch 60 | step 1000/4071 | loss 5.0052 | lr 0.00100 | ngrams/sec 38899.3 | eta 0h0m40s
| epoch 60 | step 1500/4071 | loss 5.0403 | lr 0.00100 | ngrams/sec 38931.0 | eta 0h0m33s
| epoch 60 | step 2000/4071 | loss 5.0599 | lr 0.00100 | ngrams/sec 38916.8 | eta 0h0m27s
| epoch 60 | step 2500/4071 | loss 5.0700 | lr 0.00100 | ngrams/sec 39046.8 | eta 0h0m20s
| epoch 60 | step 3000/4071 | loss 5.0858 | lr 0.00100 | ngrams/sec 38998.6 | eta 0h0m14s
| epoch 60 | step 3500/4071 | loss 5.1002 | lr 0.00100 | ngrams/sec 39005.4 | eta 0h0m7s
| epoch 60 | step 4000/4071 | loss 5.0982 | lr 0.00100 | ngrams/sec 38971.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 54.92s | valid loss  5.42 | valid ppl   225.90
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 4.9569 | lr 0.00100 | ngrams/sec 27416.0 | eta 0h1m6s
| epoch 61 | step 1000/4071 | loss 5.0063 | lr 0.00100 | ngrams/sec 38917.1 | eta 0h0m40s
| epoch 61 | step 1500/4071 | loss 5.0239 | lr 0.00100 | ngrams/sec 39030.8 | eta 0h0m33s
| epoch 61 | step 2000/4071 | loss 5.0489 | lr 0.00100 | ngrams/sec 39052.1 | eta 0h0m27s
| epoch 61 | step 2500/4071 | loss 5.0679 | lr 0.00100 | ngrams/sec 38987.5 | eta 0h0m20s
| epoch 61 | step 3000/4071 | loss 5.0908 | lr 0.00100 | ngrams/sec 38911.1 | eta 0h0m14s
| epoch 61 | step 3500/4071 | loss 5.0953 | lr 0.00100 | ngrams/sec 39040.6 | eta 0h0m7s
| epoch 61 | step 4000/4071 | loss 5.1015 | lr 0.00100 | ngrams/sec 39070.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1162.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 54.87s | valid loss  5.42 | valid ppl   225.71
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 4.9464 | lr 0.00100 | ngrams/sec 27432.2 | eta 0h1m6s
| epoch 62 | step 1000/4071 | loss 4.9982 | lr 0.00100 | ngrams/sec 39034.5 | eta 0h0m40s
| epoch 62 | step 1500/4071 | loss 5.0334 | lr 0.00100 | ngrams/sec 39008.7 | eta 0h0m33s
| epoch 62 | step 2000/4071 | loss 5.0394 | lr 0.00100 | ngrams/sec 39011.5 | eta 0h0m27s
| epoch 62 | step 2500/4071 | loss 5.0740 | lr 0.00100 | ngrams/sec 39035.0 | eta 0h0m20s
| epoch 62 | step 3000/4071 | loss 5.0734 | lr 0.00100 | ngrams/sec 39008.6 | eta 0h0m14s
| epoch 62 | step 3500/4071 | loss 5.0924 | lr 0.00100 | ngrams/sec 39057.4 | eta 0h0m7s
| epoch 62 | step 4000/4071 | loss 5.0909 | lr 0.00100 | ngrams/sec 39096.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1177.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 54.83s | valid loss  5.42 | valid ppl   225.73
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/4071 | loss 4.9547 | lr 0.00100 | ngrams/sec 27498.0 | eta 0h1m6s
| epoch 63 | step 1000/4071 | loss 4.9956 | lr 0.00100 | ngrams/sec 38917.8 | eta 0h0m40s
| epoch 63 | step 1500/4071 | loss 5.0069 | lr 0.00100 | ngrams/sec 38795.3 | eta 0h0m33s
| epoch 63 | step 2000/4071 | loss 5.0444 | lr 0.00100 | ngrams/sec 38871.4 | eta 0h0m27s
| epoch 63 | step 2500/4071 | loss 5.0588 | lr 0.00100 | ngrams/sec 38795.8 | eta 0h0m20s
| epoch 63 | step 3000/4071 | loss 5.0704 | lr 0.00100 | ngrams/sec 38738.0 | eta 0h0m14s
| epoch 63 | step 3500/4071 | loss 5.0914 | lr 0.00100 | ngrams/sec 38775.9 | eta 0h0m7s
| epoch 63 | step 4000/4071 | loss 5.0813 | lr 0.00100 | ngrams/sec 38687.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 55.11s | valid loss  5.42 | valid ppl   225.88
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 4.9489 | lr 0.00100 | ngrams/sec 27267.9 | eta 0h1m7s
| epoch 64 | step 1000/4071 | loss 4.9789 | lr 0.00100 | ngrams/sec 38644.9 | eta 0h0m40s
| epoch 64 | step 1500/4071 | loss 5.0146 | lr 0.00100 | ngrams/sec 38560.3 | eta 0h0m34s
| epoch 64 | step 2000/4071 | loss 5.0284 | lr 0.00100 | ngrams/sec 38519.9 | eta 0h0m27s
| epoch 64 | step 2500/4071 | loss 5.0516 | lr 0.00100 | ngrams/sec 38640.7 | eta 0h0m20s
| epoch 64 | step 3000/4071 | loss 5.0636 | lr 0.00100 | ngrams/sec 38557.6 | eta 0h0m14s
| epoch 64 | step 3500/4071 | loss 5.0786 | lr 0.00100 | ngrams/sec 38697.7 | eta 0h0m7s
| epoch 64 | step 4000/4071 | loss 5.0977 | lr 0.00100 | ngrams/sec 38475.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1164.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 55.43s | valid loss  5.42 | valid ppl   226.66
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/4071 | loss 4.9552 | lr 0.00100 | ngrams/sec 27287.8 | eta 0h1m7s
| epoch 65 | step 1000/4071 | loss 4.9925 | lr 0.00100 | ngrams/sec 38806.9 | eta 0h0m40s
| epoch 65 | step 1500/4071 | loss 5.0102 | lr 0.00100 | ngrams/sec 38775.3 | eta 0h0m33s
| epoch 65 | step 2000/4071 | loss 5.0418 | lr 0.00100 | ngrams/sec 38677.6 | eta 0h0m27s
| epoch 65 | step 2500/4071 | loss 5.0306 | lr 0.00100 | ngrams/sec 38827.8 | eta 0h0m20s
| epoch 65 | step 3000/4071 | loss 5.0585 | lr 0.00100 | ngrams/sec 38888.7 | eta 0h0m14s
| epoch 65 | step 3500/4071 | loss 5.0577 | lr 0.00100 | ngrams/sec 38901.4 | eta 0h0m7s
| epoch 65 | step 4000/4071 | loss 5.0892 | lr 0.00100 | ngrams/sec 38989.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1182.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 55.09s | valid loss  5.42 | valid ppl   226.67
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 4.9389 | lr 0.00100 | ngrams/sec 27422.0 | eta 0h1m6s
| epoch 66 | step 1000/4071 | loss 4.9730 | lr 0.00100 | ngrams/sec 38927.6 | eta 0h0m40s
| epoch 66 | step 1500/4071 | loss 4.9959 | lr 0.00100 | ngrams/sec 38993.1 | eta 0h0m33s
| epoch 66 | step 2000/4071 | loss 5.0319 | lr 0.00100 | ngrams/sec 39001.2 | eta 0h0m27s
| epoch 66 | step 2500/4071 | loss 5.0416 | lr 0.00100 | ngrams/sec 38972.9 | eta 0h0m20s
| epoch 66 | step 3000/4071 | loss 5.0509 | lr 0.00100 | ngrams/sec 38986.3 | eta 0h0m14s
| epoch 66 | step 3500/4071 | loss 5.0680 | lr 0.00100 | ngrams/sec 39017.2 | eta 0h0m7s
| epoch 66 | step 4000/4071 | loss 5.0816 | lr 0.00100 | ngrams/sec 38985.3 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1173.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 54.90s | valid loss  5.42 | valid ppl   226.89
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 4.9472 | lr 0.00100 | ngrams/sec 27469.4 | eta 0h1m6s
| epoch 67 | step 1000/4071 | loss 4.9584 | lr 0.00100 | ngrams/sec 39054.5 | eta 0h0m40s
| epoch 67 | step 1500/4071 | loss 4.9878 | lr 0.00100 | ngrams/sec 38951.2 | eta 0h0m33s
| epoch 67 | step 2000/4071 | loss 5.0156 | lr 0.00100 | ngrams/sec 38950.1 | eta 0h0m27s
| epoch 67 | step 2500/4071 | loss 5.0461 | lr 0.00100 | ngrams/sec 39019.3 | eta 0h0m20s
| epoch 67 | step 3000/4071 | loss 5.0652 | lr 0.00100 | ngrams/sec 39028.7 | eta 0h0m14s
| epoch 67 | step 3500/4071 | loss 5.0737 | lr 0.00100 | ngrams/sec 39019.5 | eta 0h0m7s
| epoch 67 | step 4000/4071 | loss 5.0692 | lr 0.00100 | ngrams/sec 38920.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1168.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 54.86s | valid loss  5.43 | valid ppl   227.68
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 4.9329 | lr 0.00100 | ngrams/sec 27465.1 | eta 0h1m6s
| epoch 68 | step 1000/4071 | loss 4.9613 | lr 0.00100 | ngrams/sec 38900.0 | eta 0h0m40s
| epoch 68 | step 1500/4071 | loss 4.9965 | lr 0.00100 | ngrams/sec 38953.6 | eta 0h0m33s
| epoch 68 | step 2000/4071 | loss 5.0286 | lr 0.00100 | ngrams/sec 38911.6 | eta 0h0m27s
| epoch 68 | step 2500/4071 | loss 5.0323 | lr 0.00100 | ngrams/sec 38825.8 | eta 0h0m20s
| epoch 68 | step 3000/4071 | loss 5.0436 | lr 0.00100 | ngrams/sec 38950.8 | eta 0h0m14s
| epoch 68 | step 3500/4071 | loss 5.0620 | lr 0.00100 | ngrams/sec 38952.0 | eta 0h0m7s
| epoch 68 | step 4000/4071 | loss 5.0730 | lr 0.00100 | ngrams/sec 38915.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 54.97s | valid loss  5.42 | valid ppl   226.62
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 4.9211 | lr 0.00100 | ngrams/sec 27428.6 | eta 0h1m6s
| epoch 69 | step 1000/4071 | loss 4.9570 | lr 0.00100 | ngrams/sec 38299.2 | eta 0h0m41s
| epoch 69 | step 1500/4071 | loss 5.0019 | lr 0.00100 | ngrams/sec 38959.6 | eta 0h0m33s
| epoch 69 | step 2000/4071 | loss 5.0160 | lr 0.00100 | ngrams/sec 38889.5 | eta 0h0m27s
| epoch 69 | step 2500/4071 | loss 5.0261 | lr 0.00100 | ngrams/sec 38993.4 | eta 0h0m20s
| epoch 69 | step 3000/4071 | loss 5.0463 | lr 0.00100 | ngrams/sec 38937.1 | eta 0h0m14s
| epoch 69 | step 3500/4071 | loss 5.0677 | lr 0.00100 | ngrams/sec 38994.7 | eta 0h0m7s
| epoch 69 | step 4000/4071 | loss 5.0617 | lr 0.00100 | ngrams/sec 38937.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1163.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 55.05s | valid loss  5.43 | valid ppl   227.74
-----------------------------------------------------------------------------------------
| epoch 70 | step 500/4071 | loss 4.9179 | lr 0.00100 | ngrams/sec 27408.8 | eta 0h1m6s
| epoch 70 | step 1000/4071 | loss 4.9666 | lr 0.00100 | ngrams/sec 38891.5 | eta 0h0m40s
| epoch 70 | step 1500/4071 | loss 4.9971 | lr 0.00100 | ngrams/sec 38881.3 | eta 0h0m33s
| epoch 70 | step 2000/4071 | loss 5.0143 | lr 0.00100 | ngrams/sec 38778.3 | eta 0h0m27s
| epoch 70 | step 2500/4071 | loss 5.0228 | lr 0.00100 | ngrams/sec 38988.2 | eta 0h0m20s
| epoch 70 | step 3000/4071 | loss 5.0404 | lr 0.00100 | ngrams/sec 38995.6 | eta 0h0m14s
| epoch 70 | step 3500/4071 | loss 5.0377 | lr 0.00100 | ngrams/sec 38987.5 | eta 0h0m7s
| epoch 70 | step 4000/4071 | loss 5.0553 | lr 0.00100 | ngrams/sec 39001.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1140.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 54.96s | valid loss  5.42 | valid ppl   226.57
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 4.9151 | lr 0.00100 | ngrams/sec 27492.8 | eta 0h1m6s
| epoch 71 | step 1000/4071 | loss 4.9528 | lr 0.00100 | ngrams/sec 38951.3 | eta 0h0m40s
| epoch 71 | step 1500/4071 | loss 4.9897 | lr 0.00100 | ngrams/sec 38979.7 | eta 0h0m33s
| epoch 71 | step 2000/4071 | loss 5.0111 | lr 0.00100 | ngrams/sec 38967.2 | eta 0h0m27s
| epoch 71 | step 2500/4071 | loss 5.0132 | lr 0.00100 | ngrams/sec 38979.9 | eta 0h0m20s
| epoch 71 | step 3000/4071 | loss 5.0416 | lr 0.00100 | ngrams/sec 38945.0 | eta 0h0m14s
| epoch 71 | step 3500/4071 | loss 5.0377 | lr 0.00100 | ngrams/sec 38971.5 | eta 0h0m7s
| epoch 71 | step 4000/4071 | loss 5.0557 | lr 0.00100 | ngrams/sec 39000.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1189.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 54.89s | valid loss  5.43 | valid ppl   227.86
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 4.9225 | lr 0.00100 | ngrams/sec 27465.6 | eta 0h1m6s
| epoch 72 | step 1000/4071 | loss 4.9479 | lr 0.00100 | ngrams/sec 39116.0 | eta 0h0m40s
| epoch 72 | step 1500/4071 | loss 4.9656 | lr 0.00100 | ngrams/sec 39071.2 | eta 0h0m33s
| epoch 72 | step 2000/4071 | loss 4.9817 | lr 0.00100 | ngrams/sec 39042.9 | eta 0h0m27s
| epoch 72 | step 2500/4071 | loss 5.0013 | lr 0.00100 | ngrams/sec 39070.1 | eta 0h0m20s
| epoch 72 | step 3000/4071 | loss 5.0540 | lr 0.00100 | ngrams/sec 39073.5 | eta 0h0m14s
| epoch 72 | step 3500/4071 | loss 5.0496 | lr 0.00100 | ngrams/sec 39117.7 | eta 0h0m7s
| epoch 72 | step 4000/4071 | loss 5.0601 | lr 0.00100 | ngrams/sec 39009.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1187.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 54.78s | valid loss  5.43 | valid ppl   227.40
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 4.9053 | lr 0.00100 | ngrams/sec 27461.4 | eta 0h1m6s
| epoch 73 | step 1000/4071 | loss 4.9556 | lr 0.00100 | ngrams/sec 39073.7 | eta 0h0m40s
| epoch 73 | step 1500/4071 | loss 4.9699 | lr 0.00100 | ngrams/sec 39023.2 | eta 0h0m33s
| epoch 73 | step 2000/4071 | loss 5.0050 | lr 0.00100 | ngrams/sec 38983.2 | eta 0h0m27s
| epoch 73 | step 2500/4071 | loss 5.0097 | lr 0.00100 | ngrams/sec 38939.9 | eta 0h0m20s
| epoch 73 | step 3000/4071 | loss 5.0275 | lr 0.00100 | ngrams/sec 38899.1 | eta 0h0m14s
| epoch 73 | step 3500/4071 | loss 5.0486 | lr 0.00100 | ngrams/sec 38984.4 | eta 0h0m7s
| epoch 73 | step 4000/4071 | loss 5.0346 | lr 0.00100 | ngrams/sec 38922.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.68it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 54.90s | valid loss  5.43 | valid ppl   227.81
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 4.8979 | lr 0.00100 | ngrams/sec 27417.2 | eta 0h1m6s
| epoch 74 | step 1000/4071 | loss 4.9486 | lr 0.00100 | ngrams/sec 38876.9 | eta 0h0m40s
| epoch 74 | step 1500/4071 | loss 4.9645 | lr 0.00100 | ngrams/sec 38962.2 | eta 0h0m33s
| epoch 74 | step 2000/4071 | loss 4.9945 | lr 0.00100 | ngrams/sec 38946.1 | eta 0h0m27s
| epoch 74 | step 2500/4071 | loss 5.0148 | lr 0.00100 | ngrams/sec 38895.2 | eta 0h0m20s
| epoch 74 | step 3000/4071 | loss 5.0336 | lr 0.00100 | ngrams/sec 38905.5 | eta 0h0m14s
| epoch 74 | step 3500/4071 | loss 5.0275 | lr 0.00100 | ngrams/sec 38906.7 | eta 0h0m7s
| epoch 74 | step 4000/4071 | loss 5.0476 | lr 0.00100 | ngrams/sec 38915.1 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1151.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 54.98s | valid loss  5.43 | valid ppl   227.60
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 4.8898 | lr 0.00100 | ngrams/sec 27357.1 | eta 0h1m6s
| epoch 75 | step 1000/4071 | loss 4.9449 | lr 0.00100 | ngrams/sec 38872.9 | eta 0h0m40s
| epoch 75 | step 1500/4071 | loss 4.9615 | lr 0.00100 | ngrams/sec 38813.1 | eta 0h0m33s
| epoch 75 | step 2000/4071 | loss 4.9872 | lr 0.00100 | ngrams/sec 38794.8 | eta 0h0m27s
| epoch 75 | step 2500/4071 | loss 5.0045 | lr 0.00100 | ngrams/sec 38776.8 | eta 0h0m20s
| epoch 75 | step 3000/4071 | loss 5.0175 | lr 0.00100 | ngrams/sec 38924.2 | eta 0h0m14s
| epoch 75 | step 3500/4071 | loss 5.0390 | lr 0.00100 | ngrams/sec 38870.0 | eta 0h0m7s
| epoch 75 | step 4000/4071 | loss 5.0452 | lr 0.00100 | ngrams/sec 38913.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1167.85it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 55.08s | valid loss  5.43 | valid ppl   227.74
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/4071 | loss 4.8816 | lr 0.00100 | ngrams/sec 27235.0 | eta 0h1m7s
| epoch 76 | step 1000/4071 | loss 4.9394 | lr 0.00100 | ngrams/sec 38877.7 | eta 0h0m40s
| epoch 76 | step 1500/4071 | loss 4.9660 | lr 0.00100 | ngrams/sec 38904.1 | eta 0h0m33s
| epoch 76 | step 2000/4071 | loss 4.9838 | lr 0.00100 | ngrams/sec 38844.3 | eta 0h0m27s
| epoch 76 | step 2500/4071 | loss 5.0015 | lr 0.00100 | ngrams/sec 38837.8 | eta 0h0m20s
| epoch 76 | step 3000/4071 | loss 5.0187 | lr 0.00100 | ngrams/sec 38900.2 | eta 0h0m14s
| epoch 76 | step 3500/4071 | loss 5.0278 | lr 0.00100 | ngrams/sec 38837.4 | eta 0h0m7s
| epoch 76 | step 4000/4071 | loss 5.0439 | lr 0.00100 | ngrams/sec 38696.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1147.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 55.13s | valid loss  5.43 | valid ppl   227.89
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 4.8944 | lr 0.00100 | ngrams/sec 27316.2 | eta 0h1m6s
| epoch 77 | step 1000/4071 | loss 4.9356 | lr 0.00100 | ngrams/sec 38797.8 | eta 0h0m40s
| epoch 77 | step 1500/4071 | loss 4.9724 | lr 0.00100 | ngrams/sec 38829.7 | eta 0h0m33s
| epoch 77 | step 2000/4071 | loss 4.9811 | lr 0.00100 | ngrams/sec 38821.2 | eta 0h0m27s
| epoch 77 | step 2500/4071 | loss 4.9973 | lr 0.00100 | ngrams/sec 38873.9 | eta 0h0m20s
| epoch 77 | step 3000/4071 | loss 5.0189 | lr 0.00100 | ngrams/sec 38927.6 | eta 0h0m14s
| epoch 77 | step 3500/4071 | loss 5.0234 | lr 0.00100 | ngrams/sec 38795.3 | eta 0h0m7s
| epoch 77 | step 4000/4071 | loss 5.0175 | lr 0.00100 | ngrams/sec 38871.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1168.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 55.08s | valid loss  5.43 | valid ppl   227.68
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/4071 | loss 4.9063 | lr 0.00100 | ngrams/sec 27343.2 | eta 0h1m6s
| epoch 78 | step 1000/4071 | loss 4.9178 | lr 0.00100 | ngrams/sec 38996.3 | eta 0h0m40s
| epoch 78 | step 1500/4071 | loss 4.9549 | lr 0.00100 | ngrams/sec 38932.8 | eta 0h0m33s
| epoch 78 | step 2000/4071 | loss 4.9755 | lr 0.00100 | ngrams/sec 38880.3 | eta 0h0m27s
| epoch 78 | step 2500/4071 | loss 5.0020 | lr 0.00100 | ngrams/sec 38886.0 | eta 0h0m20s
| epoch 78 | step 3000/4071 | loss 5.0116 | lr 0.00100 | ngrams/sec 38852.5 | eta 0h0m14s
| epoch 78 | step 3500/4071 | loss 5.0205 | lr 0.00100 | ngrams/sec 38820.4 | eta 0h0m7s
| epoch 78 | step 4000/4071 | loss 5.0274 | lr 0.00100 | ngrams/sec 38756.9 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1168.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 55.06s | valid loss  5.43 | valid ppl   228.58
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 4.8798 | lr 0.00100 | ngrams/sec 27371.1 | eta 0h1m6s
| epoch 79 | step 1000/4071 | loss 4.9285 | lr 0.00100 | ngrams/sec 38964.2 | eta 0h0m40s
| epoch 79 | step 1500/4071 | loss 4.9530 | lr 0.00100 | ngrams/sec 38893.3 | eta 0h0m33s
| epoch 79 | step 2000/4071 | loss 4.9653 | lr 0.00100 | ngrams/sec 38921.3 | eta 0h0m27s
| epoch 79 | step 2500/4071 | loss 4.9955 | lr 0.00100 | ngrams/sec 38810.6 | eta 0h0m20s
| epoch 79 | step 3000/4071 | loss 5.0145 | lr 0.00100 | ngrams/sec 38800.6 | eta 0h0m14s
| epoch 79 | step 3500/4071 | loss 5.0121 | lr 0.00100 | ngrams/sec 38785.0 | eta 0h0m7s
| epoch 79 | step 4000/4071 | loss 5.0421 | lr 0.00100 | ngrams/sec 38691.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1177.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 55.09s | valid loss  5.43 | valid ppl   228.43
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 4.8926 | lr 0.00100 | ngrams/sec 27362.2 | eta 0h1m6s
| epoch 80 | step 1000/4071 | loss 4.9076 | lr 0.00100 | ngrams/sec 38903.4 | eta 0h0m40s
| epoch 80 | step 1500/4071 | loss 4.9581 | lr 0.00100 | ngrams/sec 38821.9 | eta 0h0m33s
| epoch 80 | step 2000/4071 | loss 4.9621 | lr 0.00100 | ngrams/sec 38783.3 | eta 0h0m27s
| epoch 80 | step 2500/4071 | loss 4.9994 | lr 0.00100 | ngrams/sec 38879.9 | eta 0h0m20s
| epoch 80 | step 3000/4071 | loss 4.9860 | lr 0.00100 | ngrams/sec 38804.4 | eta 0h0m14s
| epoch 80 | step 3500/4071 | loss 5.0216 | lr 0.00100 | ngrams/sec 38801.2 | eta 0h0m7s
| epoch 80 | step 4000/4071 | loss 5.0198 | lr 0.00100 | ngrams/sec 38915.0 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1176.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 55.08s | valid loss  5.43 | valid ppl   228.27
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 4.8905 | lr 0.00100 | ngrams/sec 27357.6 | eta 0h1m6s
| epoch 81 | step 1000/4071 | loss 4.9201 | lr 0.00100 | ngrams/sec 38913.8 | eta 0h0m40s
| epoch 81 | step 1500/4071 | loss 4.9584 | lr 0.00100 | ngrams/sec 38834.1 | eta 0h0m33s
| epoch 81 | step 2000/4071 | loss 4.9605 | lr 0.00100 | ngrams/sec 38826.6 | eta 0h0m27s
| epoch 81 | step 2500/4071 | loss 4.9842 | lr 0.00100 | ngrams/sec 38762.9 | eta 0h0m20s
| epoch 81 | step 3000/4071 | loss 5.0037 | lr 0.00100 | ngrams/sec 38854.6 | eta 0h0m14s
| epoch 81 | step 3500/4071 | loss 5.0073 | lr 0.00100 | ngrams/sec 38791.7 | eta 0h0m7s
| epoch 81 | step 4000/4071 | loss 4.9981 | lr 0.00100 | ngrams/sec 38808.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1182.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.39it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 55.11s | valid loss  5.43 | valid ppl   228.60
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 4.8776 | lr 0.00100 | ngrams/sec 27342.5 | eta 0h1m6s
| epoch 82 | step 1000/4071 | loss 4.9217 | lr 0.00100 | ngrams/sec 38896.2 | eta 0h0m40s
| epoch 82 | step 1500/4071 | loss 4.9552 | lr 0.00100 | ngrams/sec 38790.7 | eta 0h0m33s
| epoch 82 | step 2000/4071 | loss 4.9771 | lr 0.00100 | ngrams/sec 38747.6 | eta 0h0m27s
| epoch 82 | step 2500/4071 | loss 4.9815 | lr 0.00100 | ngrams/sec 38845.5 | eta 0h0m20s
| epoch 82 | step 3000/4071 | loss 4.9823 | lr 0.00100 | ngrams/sec 38791.4 | eta 0h0m14s
| epoch 82 | step 3500/4071 | loss 5.0080 | lr 0.00100 | ngrams/sec 38750.7 | eta 0h0m7s
| epoch 82 | step 4000/4071 | loss 5.0024 | lr 0.00100 | ngrams/sec 38834.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1170.28it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 55.13s | valid loss  5.43 | valid ppl   228.58
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 4.8863 | lr 0.00100 | ngrams/sec 27357.9 | eta 0h1m6s
| epoch 83 | step 1000/4071 | loss 4.9081 | lr 0.00100 | ngrams/sec 38819.9 | eta 0h0m40s
| epoch 83 | step 1500/4071 | loss 4.9452 | lr 0.00100 | ngrams/sec 38813.6 | eta 0h0m33s
| epoch 83 | step 2000/4071 | loss 4.9580 | lr 0.00100 | ngrams/sec 38759.3 | eta 0h0m27s
| epoch 83 | step 2500/4071 | loss 4.9686 | lr 0.00100 | ngrams/sec 38841.3 | eta 0h0m20s
| epoch 83 | step 3000/4071 | loss 4.9857 | lr 0.00100 | ngrams/sec 38809.2 | eta 0h0m14s
| epoch 83 | step 3500/4071 | loss 5.0142 | lr 0.00100 | ngrams/sec 38808.6 | eta 0h0m7s
| epoch 83 | step 4000/4071 | loss 5.0084 | lr 0.00100 | ngrams/sec 38777.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 55.13s | valid loss  5.43 | valid ppl   228.70
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/4071 | loss 4.8812 | lr 0.00100 | ngrams/sec 27374.7 | eta 0h1m6s
| epoch 84 | step 1000/4071 | loss 4.8945 | lr 0.00100 | ngrams/sec 38816.3 | eta 0h0m40s
| epoch 84 | step 1500/4071 | loss 4.9576 | lr 0.00100 | ngrams/sec 38773.0 | eta 0h0m33s
| epoch 84 | step 2000/4071 | loss 4.9685 | lr 0.00100 | ngrams/sec 38837.6 | eta 0h0m27s
| epoch 84 | step 2500/4071 | loss 4.9633 | lr 0.00100 | ngrams/sec 38873.7 | eta 0h0m20s
| epoch 84 | step 3000/4071 | loss 4.9719 | lr 0.00100 | ngrams/sec 38781.5 | eta 0h0m14s
| epoch 84 | step 3500/4071 | loss 5.0091 | lr 0.00100 | ngrams/sec 38711.4 | eta 0h0m7s
| epoch 84 | step 4000/4071 | loss 5.0040 | lr 0.00100 | ngrams/sec 38899.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1154.89it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.59it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 55.11s | valid loss  5.43 | valid ppl   229.05
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 4.8741 | lr 0.00100 | ngrams/sec 27385.3 | eta 0h1m6s
| epoch 85 | step 1000/4071 | loss 4.9030 | lr 0.00100 | ngrams/sec 38752.0 | eta 0h0m40s
| epoch 85 | step 1500/4071 | loss 4.9358 | lr 0.00100 | ngrams/sec 38912.8 | eta 0h0m33s
| epoch 85 | step 2000/4071 | loss 4.9569 | lr 0.00100 | ngrams/sec 38822.9 | eta 0h0m27s
| epoch 85 | step 2500/4071 | loss 4.9718 | lr 0.00100 | ngrams/sec 38796.2 | eta 0h0m20s
| epoch 85 | step 3000/4071 | loss 4.9761 | lr 0.00100 | ngrams/sec 38841.1 | eta 0h0m14s
| epoch 85 | step 3500/4071 | loss 5.0016 | lr 0.00100 | ngrams/sec 38846.7 | eta 0h0m7s
| epoch 85 | step 4000/4071 | loss 5.0074 | lr 0.00100 | ngrams/sec 38956.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 55.07s | valid loss  5.44 | valid ppl   229.70
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 4.8483 | lr 0.00100 | ngrams/sec 27447.8 | eta 0h1m6s
| epoch 86 | step 1000/4071 | loss 4.9054 | lr 0.00100 | ngrams/sec 38834.8 | eta 0h0m40s
| epoch 86 | step 1500/4071 | loss 4.9343 | lr 0.00100 | ngrams/sec 38901.8 | eta 0h0m33s
| epoch 86 | step 2000/4071 | loss 4.9566 | lr 0.00100 | ngrams/sec 38909.9 | eta 0h0m27s
| epoch 86 | step 2500/4071 | loss 4.9712 | lr 0.00100 | ngrams/sec 38867.3 | eta 0h0m20s
| epoch 86 | step 3000/4071 | loss 4.9809 | lr 0.00100 | ngrams/sec 38939.5 | eta 0h0m14s
| epoch 86 | step 3500/4071 | loss 5.0012 | lr 0.00100 | ngrams/sec 38917.6 | eta 0h0m7s
| epoch 86 | step 4000/4071 | loss 5.0119 | lr 0.00100 | ngrams/sec 38883.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1140.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 55.01s | valid loss  5.43 | valid ppl   229.26
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 4.8792 | lr 0.00100 | ngrams/sec 27422.4 | eta 0h1m6s
| epoch 87 | step 1000/4071 | loss 4.8980 | lr 0.00100 | ngrams/sec 38962.7 | eta 0h0m40s
| epoch 87 | step 1500/4071 | loss 4.9259 | lr 0.00100 | ngrams/sec 38948.5 | eta 0h0m33s
| epoch 87 | step 2000/4071 | loss 4.9627 | lr 0.00100 | ngrams/sec 38854.0 | eta 0h0m27s
| epoch 87 | step 2500/4071 | loss 4.9455 | lr 0.00100 | ngrams/sec 38942.9 | eta 0h0m20s
| epoch 87 | step 3000/4071 | loss 4.9684 | lr 0.00100 | ngrams/sec 38888.7 | eta 0h0m14s
| epoch 87 | step 3500/4071 | loss 4.9957 | lr 0.00100 | ngrams/sec 38905.2 | eta 0h0m7s
| epoch 87 | step 4000/4071 | loss 4.9984 | lr 0.00100 | ngrams/sec 38843.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 54.98s | valid loss  5.44 | valid ppl   229.62
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 4.8565 | lr 0.00100 | ngrams/sec 27378.0 | eta 0h1m6s
| epoch 88 | step 1000/4071 | loss 4.8936 | lr 0.00100 | ngrams/sec 38924.0 | eta 0h0m40s
| epoch 88 | step 1500/4071 | loss 4.9195 | lr 0.00100 | ngrams/sec 38674.8 | eta 0h0m34s
| epoch 88 | step 2000/4071 | loss 4.9608 | lr 0.00100 | ngrams/sec 38832.8 | eta 0h0m27s
| epoch 88 | step 2500/4071 | loss 4.9674 | lr 0.00100 | ngrams/sec 38854.9 | eta 0h0m20s
| epoch 88 | step 3000/4071 | loss 4.9769 | lr 0.00100 | ngrams/sec 38837.9 | eta 0h0m14s
| epoch 88 | step 3500/4071 | loss 4.9826 | lr 0.00100 | ngrams/sec 38861.7 | eta 0h0m7s
| epoch 88 | step 4000/4071 | loss 5.0038 | lr 0.00100 | ngrams/sec 38786.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.63it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 55.11s | valid loss  5.44 | valid ppl   229.72
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 4.8556 | lr 0.00100 | ngrams/sec 27370.8 | eta 0h1m6s
| epoch 89 | step 1000/4071 | loss 4.8909 | lr 0.00100 | ngrams/sec 38822.2 | eta 0h0m40s
| epoch 89 | step 1500/4071 | loss 4.9346 | lr 0.00100 | ngrams/sec 38895.2 | eta 0h0m33s
| epoch 89 | step 2000/4071 | loss 4.9325 | lr 0.00100 | ngrams/sec 38864.9 | eta 0h0m27s
| epoch 89 | step 2500/4071 | loss 4.9560 | lr 0.00100 | ngrams/sec 38892.7 | eta 0h0m20s
| epoch 89 | step 3000/4071 | loss 4.9732 | lr 0.00100 | ngrams/sec 38884.4 | eta 0h0m14s
| epoch 89 | step 3500/4071 | loss 4.9851 | lr 0.00100 | ngrams/sec 38921.3 | eta 0h0m7s
| epoch 89 | step 4000/4071 | loss 5.0102 | lr 0.00100 | ngrams/sec 38860.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1150.86it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 55.04s | valid loss  5.44 | valid ppl   229.32
-----------------------------------------------------------------------------------------
| epoch 90 | step 500/4071 | loss 4.8572 | lr 0.00100 | ngrams/sec 27414.1 | eta 0h1m6s
| epoch 90 | step 1000/4071 | loss 4.8919 | lr 0.00100 | ngrams/sec 38832.7 | eta 0h0m40s
| epoch 90 | step 1500/4071 | loss 4.9159 | lr 0.00100 | ngrams/sec 38866.0 | eta 0h0m33s
| epoch 90 | step 2000/4071 | loss 4.9405 | lr 0.00100 | ngrams/sec 38771.3 | eta 0h0m27s
| epoch 90 | step 2500/4071 | loss 4.9571 | lr 0.00100 | ngrams/sec 38798.0 | eta 0h0m20s
| epoch 90 | step 3000/4071 | loss 4.9745 | lr 0.00100 | ngrams/sec 38823.4 | eta 0h0m14s
| epoch 90 | step 3500/4071 | loss 4.9774 | lr 0.00100 | ngrams/sec 38859.9 | eta 0h0m7s
| epoch 90 | step 4000/4071 | loss 4.9892 | lr 0.00100 | ngrams/sec 38869.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.72it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 55.07s | valid loss  5.44 | valid ppl   229.36
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 4.8533 | lr 0.00100 | ngrams/sec 27406.7 | eta 0h1m6s
| epoch 91 | step 1000/4071 | loss 4.8711 | lr 0.00100 | ngrams/sec 38885.6 | eta 0h0m40s
| epoch 91 | step 1500/4071 | loss 4.9367 | lr 0.00100 | ngrams/sec 38871.3 | eta 0h0m33s
| epoch 91 | step 2000/4071 | loss 4.9318 | lr 0.00100 | ngrams/sec 38807.0 | eta 0h0m27s
| epoch 91 | step 2500/4071 | loss 4.9436 | lr 0.00100 | ngrams/sec 38843.5 | eta 0h0m20s
| epoch 91 | step 3000/4071 | loss 4.9659 | lr 0.00100 | ngrams/sec 38783.7 | eta 0h0m14s
| epoch 91 | step 3500/4071 | loss 4.9855 | lr 0.00100 | ngrams/sec 38800.1 | eta 0h0m7s
| epoch 91 | step 4000/4071 | loss 4.9940 | lr 0.00100 | ngrams/sec 38769.1 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1172.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 55.10s | valid loss  5.44 | valid ppl   229.54
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 4.8474 | lr 0.00100 | ngrams/sec 27368.0 | eta 0h1m6s
| epoch 92 | step 1000/4071 | loss 4.8961 | lr 0.00100 | ngrams/sec 38820.8 | eta 0h0m40s
| epoch 92 | step 1500/4071 | loss 4.9232 | lr 0.00100 | ngrams/sec 38788.9 | eta 0h0m33s
| epoch 92 | step 2000/4071 | loss 4.9294 | lr 0.00100 | ngrams/sec 38904.1 | eta 0h0m27s
| epoch 92 | step 2500/4071 | loss 4.9580 | lr 0.00100 | ngrams/sec 38855.3 | eta 0h0m20s
| epoch 92 | step 3000/4071 | loss 4.9542 | lr 0.00100 | ngrams/sec 38891.5 | eta 0h0m14s
| epoch 92 | step 3500/4071 | loss 4.9738 | lr 0.00100 | ngrams/sec 38911.3 | eta 0h0m7s
| epoch 92 | step 4000/4071 | loss 4.9823 | lr 0.00100 | ngrams/sec 38891.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 55.06s | valid loss  5.44 | valid ppl   229.36
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 4.8433 | lr 0.00100 | ngrams/sec 27385.7 | eta 0h1m6s
| epoch 93 | step 1000/4071 | loss 4.8687 | lr 0.00100 | ngrams/sec 38822.4 | eta 0h0m40s
| epoch 93 | step 1500/4071 | loss 4.9142 | lr 0.00100 | ngrams/sec 38886.0 | eta 0h0m33s
| epoch 93 | step 2000/4071 | loss 4.9274 | lr 0.00100 | ngrams/sec 38865.0 | eta 0h0m27s
| epoch 93 | step 2500/4071 | loss 4.9547 | lr 0.00100 | ngrams/sec 38859.3 | eta 0h0m20s
| epoch 93 | step 3000/4071 | loss 4.9541 | lr 0.00100 | ngrams/sec 38818.2 | eta 0h0m14s
| epoch 93 | step 3500/4071 | loss 4.9853 | lr 0.00100 | ngrams/sec 38889.0 | eta 0h0m7s
| epoch 93 | step 4000/4071 | loss 4.9974 | lr 0.00100 | ngrams/sec 38774.9 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1169.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 55.08s | valid loss  5.44 | valid ppl   229.43
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 4.8380 | lr 0.00100 | ngrams/sec 27396.2 | eta 0h1m6s
| epoch 94 | step 1000/4071 | loss 4.8844 | lr 0.00100 | ngrams/sec 38804.1 | eta 0h0m40s
| epoch 94 | step 1500/4071 | loss 4.9122 | lr 0.00100 | ngrams/sec 38899.2 | eta 0h0m33s
| epoch 94 | step 2000/4071 | loss 4.9252 | lr 0.00100 | ngrams/sec 38919.0 | eta 0h0m27s
| epoch 94 | step 2500/4071 | loss 4.9327 | lr 0.00100 | ngrams/sec 38851.7 | eta 0h0m20s
| epoch 94 | step 3000/4071 | loss 4.9598 | lr 0.00100 | ngrams/sec 38837.9 | eta 0h0m14s
| epoch 94 | step 3500/4071 | loss 4.9663 | lr 0.00100 | ngrams/sec 38943.7 | eta 0h0m7s
| epoch 94 | step 4000/4071 | loss 4.9992 | lr 0.00100 | ngrams/sec 38815.9 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1178.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 55.03s | valid loss  5.44 | valid ppl   229.63
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 4.8346 | lr 0.00100 | ngrams/sec 27412.5 | eta 0h1m6s
| epoch 95 | step 1000/4071 | loss 4.8703 | lr 0.00100 | ngrams/sec 38804.8 | eta 0h0m40s
| epoch 95 | step 1500/4071 | loss 4.9098 | lr 0.00100 | ngrams/sec 38084.4 | eta 0h0m34s
| epoch 95 | step 2000/4071 | loss 4.9311 | lr 0.00100 | ngrams/sec 38858.2 | eta 0h0m27s
| epoch 95 | step 2500/4071 | loss 4.9465 | lr 0.00100 | ngrams/sec 38888.2 | eta 0h0m20s
| epoch 95 | step 3000/4071 | loss 4.9449 | lr 0.00100 | ngrams/sec 38869.6 | eta 0h0m14s
| epoch 95 | step 3500/4071 | loss 4.9729 | lr 0.00100 | ngrams/sec 38863.1 | eta 0h0m7s
| epoch 95 | step 4000/4071 | loss 4.9967 | lr 0.00100 | ngrams/sec 38808.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 55.20s | valid loss  5.44 | valid ppl   229.72
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 4.8293 | lr 0.00100 | ngrams/sec 27393.7 | eta 0h1m6s
| epoch 96 | step 1000/4071 | loss 4.8619 | lr 0.00100 | ngrams/sec 38911.6 | eta 0h0m40s
| epoch 96 | step 1500/4071 | loss 4.9252 | lr 0.00100 | ngrams/sec 38859.5 | eta 0h0m33s
| epoch 96 | step 2000/4071 | loss 4.9262 | lr 0.00100 | ngrams/sec 38821.2 | eta 0h0m27s
| epoch 96 | step 2500/4071 | loss 4.9348 | lr 0.00100 | ngrams/sec 38822.4 | eta 0h0m20s
| epoch 96 | step 3000/4071 | loss 4.9598 | lr 0.00100 | ngrams/sec 38838.5 | eta 0h0m14s
| epoch 96 | step 3500/4071 | loss 4.9723 | lr 0.00100 | ngrams/sec 38883.3 | eta 0h0m7s
| epoch 96 | step 4000/4071 | loss 4.9743 | lr 0.00100 | ngrams/sec 38868.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1158.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 55.05s | valid loss  5.44 | valid ppl   229.81
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 4.8291 | lr 0.00100 | ngrams/sec 27409.1 | eta 0h1m6s
| epoch 97 | step 1000/4071 | loss 4.8706 | lr 0.00100 | ngrams/sec 38864.4 | eta 0h0m40s
| epoch 97 | step 1500/4071 | loss 4.8966 | lr 0.00100 | ngrams/sec 38841.6 | eta 0h0m33s
| epoch 97 | step 2000/4071 | loss 4.9326 | lr 0.00100 | ngrams/sec 38817.8 | eta 0h0m27s
| epoch 97 | step 2500/4071 | loss 4.9367 | lr 0.00100 | ngrams/sec 38935.7 | eta 0h0m20s
| epoch 97 | step 3000/4071 | loss 4.9534 | lr 0.00100 | ngrams/sec 38855.3 | eta 0h0m14s
| epoch 97 | step 3500/4071 | loss 4.9600 | lr 0.00100 | ngrams/sec 38877.5 | eta 0h0m7s
| epoch 97 | step 4000/4071 | loss 4.9720 | lr 0.00100 | ngrams/sec 38899.7 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.26it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 55.04s | valid loss  5.44 | valid ppl   229.34
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 4.8170 | lr 0.00100 | ngrams/sec 27403.7 | eta 0h1m6s
| epoch 98 | step 1000/4071 | loss 4.8678 | lr 0.00100 | ngrams/sec 38818.3 | eta 0h0m40s
| epoch 98 | step 1500/4071 | loss 4.9040 | lr 0.00100 | ngrams/sec 38814.8 | eta 0h0m33s
| epoch 98 | step 2000/4071 | loss 4.9174 | lr 0.00100 | ngrams/sec 38931.1 | eta 0h0m27s
| epoch 98 | step 2500/4071 | loss 4.9405 | lr 0.00100 | ngrams/sec 38825.6 | eta 0h0m20s
| epoch 98 | step 3000/4071 | loss 4.9509 | lr 0.00100 | ngrams/sec 38899.8 | eta 0h0m14s
| epoch 98 | step 3500/4071 | loss 4.9454 | lr 0.00100 | ngrams/sec 38848.0 | eta 0h0m7s
| epoch 98 | step 4000/4071 | loss 4.9908 | lr 0.00100 | ngrams/sec 38897.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1174.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 55.05s | valid loss  5.44 | valid ppl   229.62
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 4.8336 | lr 0.00100 | ngrams/sec 27390.1 | eta 0h1m6s
| epoch 99 | step 1000/4071 | loss 4.8717 | lr 0.00100 | ngrams/sec 38802.4 | eta 0h0m40s
| epoch 99 | step 1500/4071 | loss 4.8921 | lr 0.00100 | ngrams/sec 38748.0 | eta 0h0m33s
| epoch 99 | step 2000/4071 | loss 4.9145 | lr 0.00100 | ngrams/sec 38837.5 | eta 0h0m27s
| epoch 99 | step 2500/4071 | loss 4.9339 | lr 0.00100 | ngrams/sec 38846.8 | eta 0h0m20s
| epoch 99 | step 3000/4071 | loss 4.9394 | lr 0.00100 | ngrams/sec 38782.2 | eta 0h0m14s
| epoch 99 | step 3500/4071 | loss 4.9559 | lr 0.00100 | ngrams/sec 38786.5 | eta 0h0m7s
| epoch 99 | step 4000/4071 | loss 4.9736 | lr 0.00100 | ngrams/sec 38844.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 55.12s | valid loss  5.44 | valid ppl   229.60
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 4.8461 | lr 0.00100 | ngrams/sec 27351.7 | eta 0h1m6s
| epoch 100 | step 1000/4071 | loss 4.8739 | lr 0.00100 | ngrams/sec 38864.8 | eta 0h0m40s
| epoch 100 | step 1500/4071 | loss 4.8834 | lr 0.00100 | ngrams/sec 38806.9 | eta 0h0m33s
| epoch 100 | step 2000/4071 | loss 4.9016 | lr 0.00100 | ngrams/sec 38922.9 | eta 0h0m27s
| epoch 100 | step 2500/4071 | loss 4.9242 | lr 0.00100 | ngrams/sec 38918.1 | eta 0h0m20s
| epoch 100 | step 3000/4071 | loss 4.9624 | lr 0.00100 | ngrams/sec 38827.8 | eta 0h0m14s
| epoch 100 | step 3500/4071 | loss 4.9493 | lr 0.00100 | ngrams/sec 38833.6 | eta 0h0m7s
| epoch 100 | step 4000/4071 | loss 4.9571 | lr 0.00100 | ngrams/sec 38910.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1187.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.55it/s]


-----------------------------------------------------------------------------------------


 25%|██▍       | 117/471 [00:00<00:00, 1163.05it/s]

| end of epoch 100 | time 55.06s | valid loss  5.44 | valid ppl   230.34
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 287.31it/s]


| End of training | test loss  5.37 | test ppl   214.96


In [None]:
# Export the trained weights out of the Colab runtime in two ways:
#   1. files.download() hands 'checkpoint.pth' to the browser as a download.
#   2. the shell copy stores it on the mounted Google Drive as 'checkpoint-true.pth'.
# The Drive path is relative, so this relies on the working directory being
# /content (Drive was mounted at /content/gdrive at the top of the notebook).
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/Colab_Files/checkpoint-true.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
# Copy the checkpoint stored on Google Drive back into the working directory as
# 'checkpoint.pth', which is the path the generation settings (Args.checkpoint) load from.
!cp "gdrive/MyDrive/Colab_Files/checkpoint-true.pth" "checkpoint.pth" 

In [None]:
import torch
# Model parameters.
class Args:
    """Plain namespace holding the settings used by the text-generation cells."""
    data = 'gdrive/MyDrive/wikitext-2'  # corpus location (not read in this cell)
    checkpoint = 'checkpoint.pth'  # weights file restored into `model` below
    outf = 'generated.txt'  # file the generated text is written to
    seed = 42  # RNG seed, so sampling is repeatable
    cuda = True  # request the GPU; falls back to CPU when none is available
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100  # number of words to generate (raise, e.g. to 1000, for a longer sample)
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

# Only select the GPU when it is both requested and actually present, so the
# cell still runs (on CPU) in a runtime without an accelerator.
use_cuda = args.cuda and torch.cuda.is_available()
if args.cuda and not use_cuda:
    print("WARNING: CUDA was requested but is not available, falling back to CPU")
device = torch.device("cuda" if use_cuda else "cpu")

# Very small temperatures make the exp() used during sampling blow up.
# (The original called `parser.error`, but no `parser` is defined in the
# visible cells, so the guard itself would have raised NameError.)
if args.temperature < 1e-3:
    raise ValueError("temperature has to be greater or equal 1e-3")

# map_location lets a checkpoint that was saved from GPU tensors be restored
# on whichever device was selected above.
model.load_state_dict(torch.load(args.checkpoint, map_location=device))
print(model)
model.eval()  # inference mode: disables dropout

ntokens = n_class
# Fixed offset into the training data where the seed context starts
# (a fixed index rather than a random one keeps runs comparable).
input_idx = 104
# Take `order` consecutive token ids from the first column of train_data as
# the seed context; slice once and reuse it for both the ids and the words.
seed_ids = train_data[input_idx:input_idx + order, 0].tolist()
input_words = [corpus.dictionary.idx2word[i] for i in seed_ids]
# NOTE: `input` shadows the Python builtin, but the sampling cell reads this
# exact name, so it is kept.
input = torch.tensor(seed_ids, dtype=torch.long, device=device)
print(input)
print(input_words)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([27, 63, 64, 65, 66, 17, 67], device='cuda:0')
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation']


In [None]:
# Sample `args.words` tokens from the model, one at a time, sliding the
# context window forward by one token after every draw. Each sampled word is
# echoed to the cell output and appended to `args.outf`.
glue = ' '  # separator written after every regular word
start = None  # not used by this cell; kept in case another cell reads it
# utf8 matches the encoding used when the corpus files are read, so any
# non-ASCII token can be written regardless of the runtime's default locale.
# no_grad: we only sample here, so no autograd bookkeeping is needed.
with open(args.outf, 'w', encoding="utf8") as outf, torch.no_grad():
    for i in range(args.words):
        output = model(input)
        # Temperature-scale the scores and exponentiate them into unnormalised
        # sampling weights (torch.multinomial does the normalisation).
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # Draw one token id and turn it into a plain int for list indexing.
        word_idx = torch.multinomial(word_weights, 1)[0].item()
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # Slide the context: drop the oldest token, append the new one.
        # Both operands already live on `device`, so no extra transfer is needed.
        word_tensor = torch.tensor([word_idx], dtype=torch.long, device=device)
        input = torch.cat((input[1:], word_tensor), 0)

        # Compare by value: `is` tests object identity and is not a reliable
        # way to match strings coming out of the dictionary.
        if word == "<eos>":
            outf.write('\n')  # end of sentence -> start a new line
        elif word != "<sos>":  # ignore start-of-sentence predictions
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

of
| Generated 0/100 words
the
archaeological
survey
(
designated
<unk>
)
was
released
representation
| Generated 10/100 words
of
almost
180
in
the
kickoff
,
and
extremely
historical
| Generated 20/100 words
l.
<unk>
is
a
very
well
.
better
@-@
barnes
| Generated 30/100 words
cheese
believed
that
same
a
<unk>
japanese
fashion
<unk>
.
| Generated 40/100 words
fey
<unk>
continued
in
rare
drama
book
atop
without
these
| Generated 50/100 words
specimens
,
just
along
the
end
of
any
usually
the
| Generated 60/100 words
right
had
died
differences
dining
julio
,
the
date
of
| Generated 70/100 words
phase
)
.
the
medical
sciences
's
system
became
expected
| Generated 80/100 words
.
<eos>
the
this
episode
continued
population
produced
by
16
| Generated 90/100 words
<unk>
.
the
company
began
the
population
was
named


In [None]:
# Copy the generated sample text from the runtime to Google Drive as
# 'generated-true.txt' (relative path: assumes the working directory is /content).
!cp "generated.txt" "gdrive/MyDrive/Colab_Files/generated-true.txt"

In [None]:
# Score the currently loaded model on the test split and print a framed
# one-line report with the loss and its exponential (reported as perplexity).
print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
rule = '=' * 89
report = '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss))
print(rule, report, rule, sep='\n')

 25%|██▍       | 117/471 [00:00<00:00, 1127.22it/s]

Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 297.79it/s]


| End of training | test loss  5.34 | test ppl   208.81


In [None]:
# Flush any pending writes to Google Drive and unmount it, so the files copied
# above are fully persisted before the runtime goes away.
drive.flush_and_unmount()