In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [3]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [4]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [10]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        out = hidden_output.tanh()
        out = self.dropout(out)
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [11]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [12]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [13]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    tie_weight=True
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
)


In [14]:
!nvidia-smi

Mon Nov 30 08:05:51 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W /  70W |   1035MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 23.5508 | lr 0.00100 | ngrams/sec 43370.8 | eta 0h0m42s
| epoch 1 | step 1000/4071 | loss 12.0775 | lr 0.00100 | ngrams/sec 45608.3 | eta 0h0m34s
| epoch 1 | step 1500/4071 | loss 10.8827 | lr 0.00100 | ngrams/sec 45458.0 | eta 0h0m28s
| epoch 1 | step 2000/4071 | loss 10.0806 | lr 0.00100 | ngrams/sec 45363.7 | eta 0h0m23s
| epoch 1 | step 2500/4071 | loss 9.5241 | lr 0.00100 | ngrams/sec 45244.1 | eta 0h0m17s
| epoch 1 | step 3000/4071 | loss 9.0843 | lr 0.00100 | ngrams/sec 45104.7 | eta 0h0m12s
| epoch 1 | step 3500/4071 | loss 8.7811 | lr 0.00100 | ngrams/sec 45026.7 | eta 0h0m6s
| epoch 1 | step 4000/4071 | loss 8.5269 | lr 0.00100 | ngrams/sec 44990.6 | eta 0h0m0s


 29%|██▉       | 122/417 [00:00<00:00, 1183.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 337.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 47.55s | valid loss  7.05 | valid ppl  1153.73
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 8.2578 | lr 0.00100 | ngrams/sec 31056.5 | eta 0h0m58s
| epoch 2 | step 1000/4071 | loss 8.0733 | lr 0.00100 | ngrams/sec 44701.8 | eta 0h0m35s
| epoch 2 | step 1500/4071 | loss 7.9560 | lr 0.00100 | ngrams/sec 44472.3 | eta 0h0m29s
| epoch 2 | step 2000/4071 | loss 7.8127 | lr 0.00100 | ngrams/sec 44440.9 | eta 0h0m23s
| epoch 2 | step 2500/4071 | loss 7.7047 | lr 0.00100 | ngrams/sec 44391.6 | eta 0h0m18s
| epoch 2 | step 3000/4071 | loss 7.5852 | lr 0.00100 | ngrams/sec 44328.5 | eta 0h0m12s
| epoch 2 | step 3500/4071 | loss 7.4877 | lr 0.00100 | ngrams/sec 44282.8 | eta 0h0m6s
| epoch 2 | step 4000/4071 | loss 7.4133 | lr 0.00100 | ngrams/sec 44176.3 | eta 0h0m0s


 29%|██▉       | 121/417 [00:00<00:00, 1181.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 323.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 48.24s | valid loss  6.52 | valid ppl   678.04
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 7.2799 | lr 0.00100 | ngrams/sec 30623.7 | eta 0h0m59s
| epoch 3 | step 1000/4071 | loss 7.2279 | lr 0.00100 | ngrams/sec 44016.0 | eta 0h0m35s
| epoch 3 | step 1500/4071 | loss 7.1561 | lr 0.00100 | ngrams/sec 44031.3 | eta 0h0m29s
| epoch 3 | step 2000/4071 | loss 7.0798 | lr 0.00100 | ngrams/sec 43912.0 | eta 0h0m24s
| epoch 3 | step 2500/4071 | loss 7.0433 | lr 0.00100 | ngrams/sec 43861.1 | eta 0h0m18s
| epoch 3 | step 3000/4071 | loss 6.9870 | lr 0.00100 | ngrams/sec 43764.5 | eta 0h0m12s
| epoch 3 | step 3500/4071 | loss 6.9323 | lr 0.00100 | ngrams/sec 43676.0 | eta 0h0m6s
| epoch 3 | step 4000/4071 | loss 6.9021 | lr 0.00100 | ngrams/sec 43536.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1172.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 310.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 48.88s | valid loss  6.21 | valid ppl   498.06
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 6.7873 | lr 0.00100 | ngrams/sec 29982.2 | eta 0h1m0s
| epoch 4 | step 1000/4071 | loss 6.7683 | lr 0.00100 | ngrams/sec 43205.7 | eta 0h0m36s
| epoch 4 | step 1500/4071 | loss 6.7482 | lr 0.00100 | ngrams/sec 43165.3 | eta 0h0m30s
| epoch 4 | step 2000/4071 | loss 6.6935 | lr 0.00100 | ngrams/sec 43086.3 | eta 0h0m24s
| epoch 4 | step 2500/4071 | loss 6.6756 | lr 0.00100 | ngrams/sec 43015.8 | eta 0h0m18s
| epoch 4 | step 3000/4071 | loss 6.6253 | lr 0.00100 | ngrams/sec 42925.9 | eta 0h0m12s
| epoch 4 | step 3500/4071 | loss 6.6061 | lr 0.00100 | ngrams/sec 42805.3 | eta 0h0m6s
| epoch 4 | step 4000/4071 | loss 6.5711 | lr 0.00100 | ngrams/sec 42675.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.09it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 301.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 49.84s | valid loss  5.99 | valid ppl   400.04
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 6.4765 | lr 0.00100 | ngrams/sec 29637.3 | eta 0h1m1s
| epoch 5 | step 1000/4071 | loss 6.4787 | lr 0.00100 | ngrams/sec 43160.4 | eta 0h0m36s
| epoch 5 | step 1500/4071 | loss 6.4580 | lr 0.00100 | ngrams/sec 43017.5 | eta 0h0m30s
| epoch 5 | step 2000/4071 | loss 6.4345 | lr 0.00100 | ngrams/sec 43009.2 | eta 0h0m24s
| epoch 5 | step 2500/4071 | loss 6.4069 | lr 0.00100 | ngrams/sec 42948.2 | eta 0h0m18s
| epoch 5 | step 3000/4071 | loss 6.3799 | lr 0.00100 | ngrams/sec 42888.6 | eta 0h0m12s
| epoch 5 | step 3500/4071 | loss 6.3837 | lr 0.00100 | ngrams/sec 42830.4 | eta 0h0m6s
| epoch 5 | step 4000/4071 | loss 6.3652 | lr 0.00100 | ngrams/sec 42831.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1165.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 302.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 49.89s | valid loss  5.91 | valid ppl   367.67
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 6.2465 | lr 0.00100 | ngrams/sec 29536.4 | eta 0h1m1s
| epoch 6 | step 1000/4071 | loss 6.2461 | lr 0.00100 | ngrams/sec 42813.3 | eta 0h0m36s
| epoch 6 | step 1500/4071 | loss 6.2443 | lr 0.00100 | ngrams/sec 42769.6 | eta 0h0m30s
| epoch 6 | step 2000/4071 | loss 6.2304 | lr 0.00100 | ngrams/sec 42779.5 | eta 0h0m24s
| epoch 6 | step 2500/4071 | loss 6.2135 | lr 0.00100 | ngrams/sec 42644.3 | eta 0h0m18s
| epoch 6 | step 3000/4071 | loss 6.2100 | lr 0.00100 | ngrams/sec 42671.8 | eta 0h0m12s
| epoch 6 | step 3500/4071 | loss 6.1946 | lr 0.00100 | ngrams/sec 42563.0 | eta 0h0m6s
| epoch 6 | step 4000/4071 | loss 6.1572 | lr 0.00100 | ngrams/sec 42480.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1181.83it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 50.22s | valid loss  5.73 | valid ppl   308.96
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/4071 | loss 6.0579 | lr 0.00100 | ngrams/sec 29323.3 | eta 0h1m2s
| epoch 7 | step 1000/4071 | loss 6.0656 | lr 0.00100 | ngrams/sec 42420.8 | eta 0h0m37s
| epoch 7 | step 1500/4071 | loss 6.0609 | lr 0.00100 | ngrams/sec 42354.9 | eta 0h0m31s
| epoch 7 | step 2000/4071 | loss 6.0341 | lr 0.00100 | ngrams/sec 42351.2 | eta 0h0m25s
| epoch 7 | step 2500/4071 | loss 6.0492 | lr 0.00100 | ngrams/sec 42449.5 | eta 0h0m18s
| epoch 7 | step 3000/4071 | loss 6.0218 | lr 0.00100 | ngrams/sec 42377.4 | eta 0h0m12s
| epoch 7 | step 3500/4071 | loss 6.0269 | lr 0.00100 | ngrams/sec 42360.4 | eta 0h0m6s
| epoch 7 | step 4000/4071 | loss 5.9938 | lr 0.00100 | ngrams/sec 42346.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 50.57s | valid loss  5.62 | valid ppl   276.89
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 5.8742 | lr 0.00100 | ngrams/sec 29222.6 | eta 0h1m2s
| epoch 8 | step 1000/4071 | loss 5.8856 | lr 0.00100 | ngrams/sec 42498.2 | eta 0h0m36s
| epoch 8 | step 1500/4071 | loss 5.8924 | lr 0.00100 | ngrams/sec 42529.1 | eta 0h0m30s
| epoch 8 | step 2000/4071 | loss 5.8918 | lr 0.00100 | ngrams/sec 42667.2 | eta 0h0m24s
| epoch 8 | step 2500/4071 | loss 5.8773 | lr 0.00100 | ngrams/sec 42633.3 | eta 0h0m18s
| epoch 8 | step 3000/4071 | loss 5.8733 | lr 0.00100 | ngrams/sec 42773.1 | eta 0h0m12s
| epoch 8 | step 3500/4071 | loss 5.8570 | lr 0.00100 | ngrams/sec 42894.1 | eta 0h0m6s
| epoch 8 | step 4000/4071 | loss 5.8484 | lr 0.00100 | ngrams/sec 42890.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1142.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.57it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 50.26s | valid loss  5.57 | valid ppl   263.12
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 5.7156 | lr 0.00100 | ngrams/sec 29453.2 | eta 0h1m2s
| epoch 9 | step 1000/4071 | loss 5.7332 | lr 0.00100 | ngrams/sec 42654.9 | eta 0h0m36s
| epoch 9 | step 1500/4071 | loss 5.7411 | lr 0.00100 | ngrams/sec 42610.0 | eta 0h0m30s
| epoch 9 | step 2000/4071 | loss 5.7398 | lr 0.00100 | ngrams/sec 42527.8 | eta 0h0m24s
| epoch 9 | step 2500/4071 | loss 5.7440 | lr 0.00100 | ngrams/sec 42592.3 | eta 0h0m18s
| epoch 9 | step 3000/4071 | loss 5.7347 | lr 0.00100 | ngrams/sec 42612.5 | eta 0h0m12s
| epoch 9 | step 3500/4071 | loss 5.7298 | lr 0.00100 | ngrams/sec 42608.5 | eta 0h0m6s
| epoch 9 | step 4000/4071 | loss 5.7299 | lr 0.00100 | ngrams/sec 42811.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 50.27s | valid loss  5.51 | valid ppl   247.77
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 10 | step 500/4071 | loss 5.5845 | lr 0.00100 | ngrams/sec 29461.8 | eta 0h1m2s
| epoch 10 | step 1000/4071 | loss 5.5949 | lr 0.00100 | ngrams/sec 42842.4 | eta 0h0m36s
| epoch 10 | step 1500/4071 | loss 5.6116 | lr 0.00100 | ngrams/sec 42928.5 | eta 0h0m30s
| epoch 10 | step 2000/4071 | loss 5.6193 | lr 0.00100 | ngrams/sec 42984.1 | eta 0h0m24s
| epoch 10 | step 2500/4071 | loss 5.6235 | lr 0.00100 | ngrams/sec 42873.1 | eta 0h0m18s
| epoch 10 | step 3000/4071 | loss 5.6306 | lr 0.00100 | ngrams/sec 42904.4 | eta 0h0m12s
| epoch 10 | step 3500/4071 | loss 5.6176 | lr 0.00100 | ngrams/sec 42690.5 | eta 0h0m6s
| epoch 10 | step 4000/4071 | loss 5.6334 | lr 0.00100 | ngrams/sec 42526.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1154.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 50.10s | valid loss  5.48 | valid ppl   240.54
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 5.4629 | lr 0.00100 | ngrams/sec 29287.4 | eta 0h1m2s
| epoch 11 | step 1000/4071 | loss 5.4955 | lr 0.00100 | ngrams/sec 42582.2 | eta 0h0m36s
| epoch 11 | step 1500/4071 | loss 5.5128 | lr 0.00100 | ngrams/sec 42337.2 | eta 0h0m31s
| epoch 11 | step 2000/4071 | loss 5.5210 | lr 0.00100 | ngrams/sec 42386.0 | eta 0h0m25s
| epoch 11 | step 2500/4071 | loss 5.5226 | lr 0.00100 | ngrams/sec 42445.4 | eta 0h0m18s
| epoch 11 | step 3000/4071 | loss 5.5171 | lr 0.00100 | ngrams/sec 42470.5 | eta 0h0m12s
| epoch 11 | step 3500/4071 | loss 5.5294 | lr 0.00100 | ngrams/sec 42410.2 | eta 0h0m6s
| epoch 11 | step 4000/4071 | loss 5.5465 | lr 0.00100 | ngrams/sec 42505.1 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1172.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 50.49s | valid loss  5.43 | valid ppl   227.97
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 5.3673 | lr 0.00100 | ngrams/sec 29311.9 | eta 0h1m2s
| epoch 12 | step 1000/4071 | loss 5.4116 | lr 0.00100 | ngrams/sec 42478.1 | eta 0h0m37s
| epoch 12 | step 1500/4071 | loss 5.4116 | lr 0.00100 | ngrams/sec 42450.4 | eta 0h0m31s
| epoch 12 | step 2000/4071 | loss 5.4336 | lr 0.00100 | ngrams/sec 42492.8 | eta 0h0m24s
| epoch 12 | step 2500/4071 | loss 5.4460 | lr 0.00100 | ngrams/sec 42551.0 | eta 0h0m18s
| epoch 12 | step 3000/4071 | loss 5.4575 | lr 0.00100 | ngrams/sec 42417.3 | eta 0h0m12s
| epoch 12 | step 3500/4071 | loss 5.4639 | lr 0.00100 | ngrams/sec 42508.7 | eta 0h0m6s
| epoch 12 | step 4000/4071 | loss 5.4676 | lr 0.00100 | ngrams/sec 42350.0 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1155.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 50.48s | valid loss  5.42 | valid ppl   224.91
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 5.2914 | lr 0.00100 | ngrams/sec 29189.4 | eta 0h1m2s
| epoch 13 | step 1000/4071 | loss 5.3229 | lr 0.00100 | ngrams/sec 42451.6 | eta 0h0m37s
| epoch 13 | step 1500/4071 | loss 5.3498 | lr 0.00100 | ngrams/sec 42402.1 | eta 0h0m31s
| epoch 13 | step 2000/4071 | loss 5.3688 | lr 0.00100 | ngrams/sec 42428.4 | eta 0h0m24s
| epoch 13 | step 2500/4071 | loss 5.3739 | lr 0.00100 | ngrams/sec 42423.7 | eta 0h0m18s
| epoch 13 | step 3000/4071 | loss 5.3937 | lr 0.00100 | ngrams/sec 42370.3 | eta 0h0m12s
| epoch 13 | step 3500/4071 | loss 5.3918 | lr 0.00100 | ngrams/sec 42421.4 | eta 0h0m6s
| epoch 13 | step 4000/4071 | loss 5.4074 | lr 0.00100 | ngrams/sec 42442.7 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1160.55it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 50.54s | valid loss  5.41 | valid ppl   223.91
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 14 | step 500/4071 | loss 5.2290 | lr 0.00100 | ngrams/sec 29280.8 | eta 0h1m2s
| epoch 14 | step 1000/4071 | loss 5.2583 | lr 0.00100 | ngrams/sec 42644.6 | eta 0h0m36s
| epoch 14 | step 1500/4071 | loss 5.2863 | lr 0.00100 | ngrams/sec 42726.5 | eta 0h0m30s
| epoch 14 | step 2000/4071 | loss 5.3052 | lr 0.00100 | ngrams/sec 42867.7 | eta 0h0m24s
| epoch 14 | step 2500/4071 | loss 5.3247 | lr 0.00100 | ngrams/sec 42955.8 | eta 0h0m18s
| epoch 14 | step 3000/4071 | loss 5.3358 | lr 0.00100 | ngrams/sec 42848.5 | eta 0h0m12s
| epoch 14 | step 3500/4071 | loss 5.3335 | lr 0.00100 | ngrams/sec 42780.7 | eta 0h0m6s
| epoch 14 | step 4000/4071 | loss 5.3712 | lr 0.00100 | ngrams/sec 42695.3 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1140.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 50.16s | valid loss  5.41 | valid ppl   224.66
-----------------------------------------------------------------------------------------
| epoch 15 | step 500/4071 | loss 5.1669 | lr 0.00100 | ngrams/sec 29456.6 | eta 0h1m2s
| epoch 15 | step 1000/4071 | loss 5.1999 | lr 0.00100 | ngrams/sec 42516.8 | eta 0h0m36s
| epoch 15 | step 1500/4071 | loss 5.2349 | lr 0.00100 | ngrams/sec 42473.6 | eta 0h0m30s
| epoch 15 | step 2000/4071 | loss 5.2436 | lr 0.00100 | ngrams/sec 42434.5 | eta 0h0m24s
| epoch 15 | step 2500/4071 | loss 5.2738 | lr 0.00100 | ngrams/sec 42284.5 | eta 0h0m19s
| epoch 15 | step 3000/4071 | loss 5.2986 | lr 0.00100 | ngrams/sec 42211.2 | eta 0h0m12s
| epoch 15 | step 3500/4071 | loss 5.3060 | lr 0.00100 | ngrams/sec 42166.3 | eta 0h0m6s
| epoch 15 | step 4000/4071 | loss 5.3170 | lr 0.00100 | ngrams/sec 42189.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1161.78it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 50.64s | valid loss  5.40 | valid ppl   220.47
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 5.1116 | lr 0.00100 | ngrams/sec 29086.6 | eta 0h1m2s
| epoch 16 | step 1000/4071 | loss 5.1689 | lr 0.00100 | ngrams/sec 42365.8 | eta 0h0m37s
| epoch 16 | step 1500/4071 | loss 5.2002 | lr 0.00100 | ngrams/sec 42354.1 | eta 0h0m31s
| epoch 16 | step 2000/4071 | loss 5.2070 | lr 0.00100 | ngrams/sec 42415.6 | eta 0h0m24s
| epoch 16 | step 2500/4071 | loss 5.2319 | lr 0.00100 | ngrams/sec 42505.7 | eta 0h0m18s
| epoch 16 | step 3000/4071 | loss 5.2529 | lr 0.00100 | ngrams/sec 42491.0 | eta 0h0m12s
| epoch 16 | step 3500/4071 | loss 5.2653 | lr 0.00100 | ngrams/sec 42568.3 | eta 0h0m6s
| epoch 16 | step 4000/4071 | loss 5.2688 | lr 0.00100 | ngrams/sec 42551.6 | eta 0h

 29%|██▊       | 119/417 [00:00<00:00, 1149.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 50.51s | valid loss  5.40 | valid ppl   221.58
-----------------------------------------------------------------------------------------
| epoch 17 | step 500/4071 | loss 5.0818 | lr 0.00100 | ngrams/sec 29516.0 | eta 0h1m1s
| epoch 17 | step 1000/4071 | loss 5.1205 | lr 0.00100 | ngrams/sec 42558.7 | eta 0h0m36s
| epoch 17 | step 1500/4071 | loss 5.1562 | lr 0.00100 | ngrams/sec 42655.4 | eta 0h0m30s
| epoch 17 | step 2000/4071 | loss 5.1850 | lr 0.00100 | ngrams/sec 42684.6 | eta 0h0m24s
| epoch 17 | step 2500/4071 | loss 5.2007 | lr 0.00100 | ngrams/sec 42712.8 | eta 0h0m18s
| epoch 17 | step 3000/4071 | loss 5.2389 | lr 0.00100 | ngrams/sec 42756.1 | eta 0h0m12s
| epoch 17 | step 3500/4071 | loss 5.2261 | lr 0.00100 | ngrams/sec 42631.5 | eta 0h0m6s
| epoch 17 | step 4000/4071 | loss 5.2428 | lr 0.00100 | ngrams/sec 42674.3 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1176.68it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 50.27s | valid loss  5.41 | valid ppl   223.08
-----------------------------------------------------------------------------------------
| epoch 18 | step 500/4071 | loss 5.0525 | lr 0.00100 | ngrams/sec 29449.7 | eta 0h1m2s
| epoch 18 | step 1000/4071 | loss 5.0905 | lr 0.00100 | ngrams/sec 42503.5 | eta 0h0m36s
| epoch 18 | step 1500/4071 | loss 5.1356 | lr 0.00100 | ngrams/sec 42593.2 | eta 0h0m30s
| epoch 18 | step 2000/4071 | loss 5.1487 | lr 0.00100 | ngrams/sec 42530.8 | eta 0h0m24s
| epoch 18 | step 2500/4071 | loss 5.1690 | lr 0.00100 | ngrams/sec 42538.0 | eta 0h0m18s
| epoch 18 | step 3000/4071 | loss 5.1956 | lr 0.00100 | ngrams/sec 42656.8 | eta 0h0m12s
| epoch 18 | step 3500/4071 | loss 5.1952 | lr 0.00100 | ngrams/sec 42602.7 | eta 0h0m6s
| epoch 18 | step 4000/4071 | loss 5.2326 | lr 0.00100 | ngrams/sec 42509.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1143.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 50.39s | valid loss  5.40 | valid ppl   221.97
-----------------------------------------------------------------------------------------
| epoch 19 | step 500/4071 | loss 5.0332 | lr 0.00100 | ngrams/sec 29460.3 | eta 0h1m2s
| epoch 19 | step 1000/4071 | loss 5.0805 | lr 0.00100 | ngrams/sec 42558.3 | eta 0h0m36s
| epoch 19 | step 1500/4071 | loss 5.1121 | lr 0.00100 | ngrams/sec 42671.6 | eta 0h0m30s
| epoch 19 | step 2000/4071 | loss 5.1415 | lr 0.00100 | ngrams/sec 42578.1 | eta 0h0m24s
| epoch 19 | step 2500/4071 | loss 5.1430 | lr 0.00100 | ngrams/sec 42580.5 | eta 0h0m18s
| epoch 19 | step 3000/4071 | loss 5.1651 | lr 0.00100 | ngrams/sec 42522.9 | eta 0h0m12s
| epoch 19 | step 3500/4071 | loss 5.1919 | lr 0.00100 | ngrams/sec 42578.4 | eta 0h0m6s
| epoch 19 | step 4000/4071 | loss 5.1986 | lr 0.00100 | ngrams/sec 42681.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 50.35s | valid loss  5.41 | valid ppl   222.79
-----------------------------------------------------------------------------------------
| epoch 20 | step 500/4071 | loss 5.0046 | lr 0.00100 | ngrams/sec 29437.2 | eta 0h1m2s
| epoch 20 | step 1000/4071 | loss 5.0569 | lr 0.00100 | ngrams/sec 42572.3 | eta 0h0m36s
| epoch 20 | step 1500/4071 | loss 5.0855 | lr 0.00100 | ngrams/sec 42655.1 | eta 0h0m30s
| epoch 20 | step 2000/4071 | loss 5.1126 | lr 0.00100 | ngrams/sec 42607.8 | eta 0h0m24s
| epoch 20 | step 2500/4071 | loss 5.1493 | lr 0.00100 | ngrams/sec 42683.1 | eta 0h0m18s
| epoch 20 | step 3000/4071 | loss 5.1780 | lr 0.00100 | ngrams/sec 42706.6 | eta 0h0m12s
| epoch 20 | step 3500/4071 | loss 5.1791 | lr 0.00100 | ngrams/sec 42814.3 | eta 0h0m6s
| epoch 20 | step 4000/4071 | loss 5.1940 | lr 0.00100 | ngrams/sec 42845.6 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 50.25s | valid loss  5.39 | valid ppl   218.54
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 4.9997 | lr 0.00100 | ngrams/sec 29453.5 | eta 0h1m2s
| epoch 21 | step 1000/4071 | loss 5.0539 | lr 0.00100 | ngrams/sec 42769.1 | eta 0h0m36s
| epoch 21 | step 1500/4071 | loss 5.0787 | lr 0.00100 | ngrams/sec 42600.1 | eta 0h0m30s
| epoch 21 | step 2000/4071 | loss 5.1002 | lr 0.00100 | ngrams/sec 42731.6 | eta 0h0m24s
| epoch 21 | step 2500/4071 | loss 5.1324 | lr 0.00100 | ngrams/sec 42789.0 | eta 0h0m18s
| epoch 21 | step 3000/4071 | loss 5.1584 | lr 0.00100 | ngrams/sec 42672.2 | eta 0h0m12s
| epoch 21 | step 3500/4071 | loss 5.1685 | lr 0.00100 | ngrams/sec 42719.0 | eta 0h0m6s
| epoch 21 | step 4000/4071 | loss 5.1880 | lr 0.00100 | ngrams/sec 42674.5 | eta 0h

 28%|██▊       | 118/417 [00:00<00:00, 1161.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 50.20s | valid loss  5.42 | valid ppl   226.82
-----------------------------------------------------------------------------------------
| epoch 22 | step 500/4071 | loss 5.0050 | lr 0.00100 | ngrams/sec 29538.8 | eta 0h1m1s
| epoch 22 | step 1000/4071 | loss 5.0303 | lr 0.00100 | ngrams/sec 42732.8 | eta 0h0m36s
| epoch 22 | step 1500/4071 | loss 5.0736 | lr 0.00100 | ngrams/sec 42619.8 | eta 0h0m30s
| epoch 22 | step 2000/4071 | loss 5.0929 | lr 0.00100 | ngrams/sec 42605.3 | eta 0h0m24s
| epoch 22 | step 2500/4071 | loss 5.1279 | lr 0.00100 | ngrams/sec 42581.7 | eta 0h0m18s
| epoch 22 | step 3000/4071 | loss 5.1617 | lr 0.00100 | ngrams/sec 42510.8 | eta 0h0m12s
| epoch 22 | step 3500/4071 | loss 5.1560 | lr 0.00100 | ngrams/sec 42597.9 | eta 0h0m6s
| epoch 22 | step 4000/4071 | loss 5.1927 | lr 0.00100 | ngrams/sec 42629.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1162.54it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 50.32s | valid loss  5.41 | valid ppl   223.64
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 5.0108 | lr 0.00100 | ngrams/sec 29536.6 | eta 0h1m1s
| epoch 23 | step 1000/4071 | loss 5.0287 | lr 0.00100 | ngrams/sec 42732.9 | eta 0h0m36s
| epoch 23 | step 1500/4071 | loss 5.0876 | lr 0.00100 | ngrams/sec 42635.1 | eta 0h0m30s
| epoch 23 | step 2000/4071 | loss 5.0982 | lr 0.00100 | ngrams/sec 42647.3 | eta 0h0m24s
| epoch 23 | step 2500/4071 | loss 5.1075 | lr 0.00100 | ngrams/sec 42750.6 | eta 0h0m18s
| epoch 23 | step 3000/4071 | loss 5.1550 | lr 0.00100 | ngrams/sec 42830.6 | eta 0h0m12s
| epoch 23 | step 3500/4071 | loss 5.1742 | lr 0.00100 | ngrams/sec 42806.5 | eta 0h0m6s
| epoch 23 | step 4000/4071 | loss 5.1789 | lr 0.00100 | ngrams/sec 42736.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 50.19s | valid loss  5.48 | valid ppl   239.80
-----------------------------------------------------------------------------------------
| epoch 24 | step 500/4071 | loss 5.0126 | lr 0.00100 | ngrams/sec 29614.5 | eta 0h1m1s
| epoch 24 | step 1000/4071 | loss 5.0448 | lr 0.00100 | ngrams/sec 42659.7 | eta 0h0m36s
| epoch 24 | step 1500/4071 | loss 5.0737 | lr 0.00100 | ngrams/sec 42312.2 | eta 0h0m31s
| epoch 24 | step 2000/4071 | loss 5.0959 | lr 0.00100 | ngrams/sec 42742.2 | eta 0h0m24s
| epoch 24 | step 2500/4071 | loss 5.1180 | lr 0.00100 | ngrams/sec 42666.5 | eta 0h0m18s
| epoch 24 | step 3000/4071 | loss 5.1450 | lr 0.00100 | ngrams/sec 42677.9 | eta 0h0m12s
| epoch 24 | step 3500/4071 | loss 5.1692 | lr 0.00100 | ngrams/sec 42798.8 | eta 0h0m6s
| epoch 24 | step 4000/4071 | loss 5.1976 | lr 0.00100 | ngrams/sec 42812.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1155.69it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 50.24s | valid loss  5.46 | valid ppl   234.29
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/4071 | loss 4.9919 | lr 0.00100 | ngrams/sec 29634.6 | eta 0h1m1s
| epoch 25 | step 1000/4071 | loss 5.0479 | lr 0.00100 | ngrams/sec 42839.2 | eta 0h0m36s
| epoch 25 | step 1500/4071 | loss 5.0899 | lr 0.00100 | ngrams/sec 42873.5 | eta 0h0m30s
| epoch 25 | step 2000/4071 | loss 5.1040 | lr 0.00100 | ngrams/sec 42785.9 | eta 0h0m24s
| epoch 25 | step 2500/4071 | loss 5.1277 | lr 0.00100 | ngrams/sec 42912.3 | eta 0h0m18s
| epoch 25 | step 3000/4071 | loss 5.1577 | lr 0.00100 | ngrams/sec 42821.1 | eta 0h0m12s
| epoch 25 | step 3500/4071 | loss 5.1705 | lr 0.00100 | ngrams/sec 42800.6 | eta 0h0m6s
| epoch 25 | step 4000/4071 | loss 5.1983 | lr 0.00100 | ngrams/sec 42791.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.13it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.40it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 50.08s | valid loss  5.46 | valid ppl   234.24
-----------------------------------------------------------------------------------------
| epoch 26 | step 500/4071 | loss 5.0105 | lr 0.00100 | ngrams/sec 29534.2 | eta 0h1m1s
| epoch 26 | step 1000/4071 | loss 5.0576 | lr 0.00100 | ngrams/sec 42718.2 | eta 0h0m36s
| epoch 26 | step 1500/4071 | loss 5.0906 | lr 0.00100 | ngrams/sec 42741.4 | eta 0h0m30s
| epoch 26 | step 2000/4071 | loss 5.1246 | lr 0.00100 | ngrams/sec 42699.9 | eta 0h0m24s
| epoch 26 | step 2500/4071 | loss 5.1395 | lr 0.00100 | ngrams/sec 42668.7 | eta 0h0m18s
| epoch 26 | step 3000/4071 | loss 5.1550 | lr 0.00100 | ngrams/sec 42681.9 | eta 0h0m12s
| epoch 26 | step 3500/4071 | loss 5.1780 | lr 0.00100 | ngrams/sec 42722.4 | eta 0h0m6s
| epoch 26 | step 4000/4071 | loss 5.1920 | lr 0.00100 | ngrams/sec 42695.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 50.22s | valid loss  5.46 | valid ppl   234.26
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/4071 | loss 5.0211 | lr 0.00100 | ngrams/sec 29563.2 | eta 0h1m1s
| epoch 27 | step 1000/4071 | loss 5.0747 | lr 0.00100 | ngrams/sec 42672.9 | eta 0h0m36s
| epoch 27 | step 1500/4071 | loss 5.0817 | lr 0.00100 | ngrams/sec 42563.0 | eta 0h0m30s
| epoch 27 | step 2000/4071 | loss 5.1252 | lr 0.00100 | ngrams/sec 42703.3 | eta 0h0m24s
| epoch 27 | step 2500/4071 | loss 5.1437 | lr 0.00100 | ngrams/sec 42803.4 | eta 0h0m18s
| epoch 27 | step 3000/4071 | loss 5.1686 | lr 0.00100 | ngrams/sec 42810.1 | eta 0h0m12s
| epoch 27 | step 3500/4071 | loss 5.1955 | lr 0.00100 | ngrams/sec 42855.6 | eta 0h0m6s
| epoch 27 | step 4000/4071 | loss 5.2165 | lr 0.00100 | ngrams/sec 42805.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1181.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.57it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 50.18s | valid loss  5.49 | valid ppl   241.76
-----------------------------------------------------------------------------------------
| epoch 28 | step 500/4071 | loss 5.0342 | lr 0.00100 | ngrams/sec 29747.2 | eta 0h1m1s
| epoch 28 | step 1000/4071 | loss 5.0747 | lr 0.00100 | ngrams/sec 42809.9 | eta 0h0m36s
| epoch 28 | step 1500/4071 | loss 5.0949 | lr 0.00100 | ngrams/sec 42941.8 | eta 0h0m30s
| epoch 28 | step 2000/4071 | loss 5.1455 | lr 0.00100 | ngrams/sec 42879.2 | eta 0h0m24s
| epoch 28 | step 2500/4071 | loss 5.1586 | lr 0.00100 | ngrams/sec 42819.8 | eta 0h0m18s
| epoch 28 | step 3000/4071 | loss 5.1843 | lr 0.00100 | ngrams/sec 42972.4 | eta 0h0m12s
| epoch 28 | step 3500/4071 | loss 5.2048 | lr 0.00100 | ngrams/sec 42842.7 | eta 0h0m6s
| epoch 28 | step 4000/4071 | loss 5.2236 | lr 0.00100 | ngrams/sec 42850.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1141.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 50.01s | valid loss  5.53 | valid ppl   252.97
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/4071 | loss 5.0384 | lr 0.00100 | ngrams/sec 29571.8 | eta 0h1m1s
| epoch 29 | step 1000/4071 | loss 5.0773 | lr 0.00100 | ngrams/sec 42669.8 | eta 0h0m36s
| epoch 29 | step 1500/4071 | loss 5.1213 | lr 0.00100 | ngrams/sec 42715.8 | eta 0h0m30s
| epoch 29 | step 2000/4071 | loss 5.1418 | lr 0.00100 | ngrams/sec 42566.5 | eta 0h0m24s
| epoch 29 | step 2500/4071 | loss 5.1706 | lr 0.00100 | ngrams/sec 42535.2 | eta 0h0m18s
| epoch 29 | step 3000/4071 | loss 5.2014 | lr 0.00100 | ngrams/sec 42518.5 | eta 0h0m12s
| epoch 29 | step 3500/4071 | loss 5.2263 | lr 0.00100 | ngrams/sec 42536.4 | eta 0h0m6s
| epoch 29 | step 4000/4071 | loss 5.2323 | lr 0.00100 | ngrams/sec 42507.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1149.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 50.36s | valid loss  5.62 | valid ppl   275.21
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 5.0389 | lr 0.00100 | ngrams/sec 29375.2 | eta 0h1m2s
| epoch 30 | step 1000/4071 | loss 5.0830 | lr 0.00100 | ngrams/sec 42332.2 | eta 0h0m37s
| epoch 30 | step 1500/4071 | loss 5.1472 | lr 0.00100 | ngrams/sec 42403.3 | eta 0h0m31s
| epoch 30 | step 2000/4071 | loss 5.1620 | lr 0.00100 | ngrams/sec 42335.7 | eta 0h0m25s
| epoch 30 | step 2500/4071 | loss 5.2004 | lr 0.00100 | ngrams/sec 42250.1 | eta 0h0m19s
| epoch 30 | step 3000/4071 | loss 5.2051 | lr 0.00100 | ngrams/sec 42351.0 | eta 0h0m12s
| epoch 30 | step 3500/4071 | loss 5.2279 | lr 0.00100 | ngrams/sec 42306.6 | eta 0h0m6s
| epoch 30 | step 4000/4071 | loss 5.2529 | lr 0.00100 | ngrams/sec 42283.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1163.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 50.65s | valid loss  5.49 | valid ppl   243.24
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 5.0625 | lr 0.00100 | ngrams/sec 29282.8 | eta 0h1m2s
| epoch 31 | step 1000/4071 | loss 5.1041 | lr 0.00100 | ngrams/sec 42277.5 | eta 0h0m37s
| epoch 31 | step 1500/4071 | loss 5.1414 | lr 0.00100 | ngrams/sec 42253.3 | eta 0h0m31s
| epoch 31 | step 2000/4071 | loss 5.1905 | lr 0.00100 | ngrams/sec 42249.0 | eta 0h0m25s
| epoch 31 | step 2500/4071 | loss 5.1882 | lr 0.00100 | ngrams/sec 42242.6 | eta 0h0m19s
| epoch 31 | step 3000/4071 | loss 5.2323 | lr 0.00100 | ngrams/sec 42210.0 | eta 0h0m12s
| epoch 31 | step 3500/4071 | loss 5.2439 | lr 0.00100 | ngrams/sec 42251.9 | eta 0h0m6s
| epoch 31 | step 4000/4071 | loss 5.2541 | lr 0.00100 | ngrams/sec 42352.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1181.47it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 50.73s | valid loss  5.46 | valid ppl   236.18
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 5.0645 | lr 0.00100 | ngrams/sec 29367.5 | eta 0h1m2s
| epoch 32 | step 1000/4071 | loss 5.1266 | lr 0.00100 | ngrams/sec 42313.8 | eta 0h0m37s
| epoch 32 | step 1500/4071 | loss 5.1555 | lr 0.00100 | ngrams/sec 42391.8 | eta 0h0m31s
| epoch 32 | step 2000/4071 | loss 5.1939 | lr 0.00100 | ngrams/sec 42361.0 | eta 0h0m25s
| epoch 32 | step 2500/4071 | loss 5.2162 | lr 0.00100 | ngrams/sec 42408.0 | eta 0h0m18s
| epoch 32 | step 3000/4071 | loss 5.2442 | lr 0.00100 | ngrams/sec 42316.6 | eta 0h0m12s
| epoch 32 | step 3500/4071 | loss 5.2464 | lr 0.00100 | ngrams/sec 42354.6 | eta 0h0m6s
| epoch 32 | step 4000/4071 | loss 5.2886 | lr 0.00100 | ngrams/sec 42306.9 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1173.59it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 50.63s | valid loss  5.48 | valid ppl   239.48
-----------------------------------------------------------------------------------------
| epoch 33 | step 500/4071 | loss 5.0950 | lr 0.00100 | ngrams/sec 29353.9 | eta 0h1m2s
| epoch 33 | step 1000/4071 | loss 5.1432 | lr 0.00100 | ngrams/sec 42218.0 | eta 0h0m37s
| epoch 33 | step 1500/4071 | loss 5.1698 | lr 0.00100 | ngrams/sec 42165.6 | eta 0h0m31s
| epoch 33 | step 2000/4071 | loss 5.2065 | lr 0.00100 | ngrams/sec 42096.1 | eta 0h0m25s
| epoch 33 | step 2500/4071 | loss 5.2248 | lr 0.00100 | ngrams/sec 42097.0 | eta 0h0m19s
| epoch 33 | step 3000/4071 | loss 5.2732 | lr 0.00100 | ngrams/sec 42105.0 | eta 0h0m13s
| epoch 33 | step 3500/4071 | loss 5.2736 | lr 0.00100 | ngrams/sec 42038.5 | eta 0h0m6s
| epoch 33 | step 4000/4071 | loss 5.2819 | lr 0.00100 | ngrams/sec 42082.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1137.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 50.89s | valid loss  5.54 | valid ppl   254.86
-----------------------------------------------------------------------------------------
| epoch 34 | step 500/4071 | loss 5.1209 | lr 0.00100 | ngrams/sec 29179.4 | eta 0h1m2s
| epoch 34 | step 1000/4071 | loss 5.1506 | lr 0.00100 | ngrams/sec 42097.3 | eta 0h0m37s
| epoch 34 | step 1500/4071 | loss 5.1885 | lr 0.00100 | ngrams/sec 42084.9 | eta 0h0m31s
| epoch 34 | step 2000/4071 | loss 5.2283 | lr 0.00100 | ngrams/sec 42122.2 | eta 0h0m25s
| epoch 34 | step 2500/4071 | loss 5.2480 | lr 0.00100 | ngrams/sec 42068.7 | eta 0h0m19s
| epoch 34 | step 3000/4071 | loss 5.2788 | lr 0.00100 | ngrams/sec 42049.3 | eta 0h0m13s
| epoch 34 | step 3500/4071 | loss 5.2918 | lr 0.00100 | ngrams/sec 42011.2 | eta 0h0m6s
| epoch 34 | step 4000/4071 | loss 5.3012 | lr 0.00100 | ngrams/sec 42087.8 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1171.93it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 50.95s | valid loss  5.50 | valid ppl   243.54
-----------------------------------------------------------------------------------------
| epoch 35 | step 500/4071 | loss 5.1352 | lr 0.00100 | ngrams/sec 29181.7 | eta 0h1m2s
| epoch 35 | step 1000/4071 | loss 5.1793 | lr 0.00100 | ngrams/sec 41989.1 | eta 0h0m37s
| epoch 35 | step 1500/4071 | loss 5.2115 | lr 0.00100 | ngrams/sec 42013.0 | eta 0h0m31s
| epoch 35 | step 2000/4071 | loss 5.2486 | lr 0.00100 | ngrams/sec 42012.1 | eta 0h0m25s
| epoch 35 | step 2500/4071 | loss 5.2722 | lr 0.00100 | ngrams/sec 41958.8 | eta 0h0m19s
| epoch 35 | step 3000/4071 | loss 5.2952 | lr 0.00100 | ngrams/sec 41915.9 | eta 0h0m13s
| epoch 35 | step 3500/4071 | loss 5.3083 | lr 0.00100 | ngrams/sec 42071.3 | eta 0h0m6s
| epoch 35 | step 4000/4071 | loss 5.3182 | lr 0.00100 | ngrams/sec 42101.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1175.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 51.03s | valid loss  5.53 | valid ppl   251.85
-----------------------------------------------------------------------------------------
| epoch 36 | step 500/4071 | loss 5.1452 | lr 0.00100 | ngrams/sec 29235.5 | eta 0h1m2s
| epoch 36 | step 1000/4071 | loss 5.1908 | lr 0.00100 | ngrams/sec 42065.1 | eta 0h0m37s
| epoch 36 | step 1500/4071 | loss 5.2303 | lr 0.00100 | ngrams/sec 42066.0 | eta 0h0m31s
| epoch 36 | step 2000/4071 | loss 5.2614 | lr 0.00100 | ngrams/sec 42188.0 | eta 0h0m25s
| epoch 36 | step 2500/4071 | loss 5.2850 | lr 0.00100 | ngrams/sec 42155.3 | eta 0h0m19s
| epoch 36 | step 3000/4071 | loss 5.3124 | lr 0.00100 | ngrams/sec 42141.3 | eta 0h0m13s
| epoch 36 | step 3500/4071 | loss 5.3342 | lr 0.00100 | ngrams/sec 42129.3 | eta 0h0m6s
| epoch 36 | step 4000/4071 | loss 5.3447 | lr 0.00100 | ngrams/sec 42104.1 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1169.44it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 50.90s | valid loss  5.51 | valid ppl   246.08
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 5.1667 | lr 0.00100 | ngrams/sec 29236.5 | eta 0h1m2s
| epoch 37 | step 1000/4071 | loss 5.2078 | lr 0.00100 | ngrams/sec 42139.1 | eta 0h0m37s
| epoch 37 | step 1500/4071 | loss 5.2635 | lr 0.00100 | ngrams/sec 42054.6 | eta 0h0m31s
| epoch 37 | step 2000/4071 | loss 5.2728 | lr 0.00100 | ngrams/sec 42051.5 | eta 0h0m25s
| epoch 37 | step 2500/4071 | loss 5.2986 | lr 0.00100 | ngrams/sec 41952.7 | eta 0h0m19s
| epoch 37 | step 3000/4071 | loss 5.3143 | lr 0.00100 | ngrams/sec 41917.2 | eta 0h0m13s
| epoch 37 | step 3500/4071 | loss 5.3514 | lr 0.00100 | ngrams/sec 42005.4 | eta 0h0m6s
| epoch 37 | step 4000/4071 | loss 5.3524 | lr 0.00100 | ngrams/sec 42022.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1176.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 51.02s | valid loss  5.62 | valid ppl   276.27
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 5.1909 | lr 0.00100 | ngrams/sec 29093.1 | eta 0h1m2s
| epoch 38 | step 1000/4071 | loss 5.2253 | lr 0.00100 | ngrams/sec 41860.2 | eta 0h0m37s
| epoch 38 | step 1500/4071 | loss 5.2737 | lr 0.00100 | ngrams/sec 41880.1 | eta 0h0m31s
| epoch 38 | step 2000/4071 | loss 5.2912 | lr 0.00100 | ngrams/sec 41797.3 | eta 0h0m25s
| epoch 38 | step 2500/4071 | loss 5.3187 | lr 0.00100 | ngrams/sec 41889.2 | eta 0h0m19s
| epoch 38 | step 3000/4071 | loss 5.3422 | lr 0.00100 | ngrams/sec 41765.7 | eta 0h0m13s
| epoch 38 | step 3500/4071 | loss 5.3710 | lr 0.00100 | ngrams/sec 41909.4 | eta 0h0m6s
| epoch 38 | step 4000/4071 | loss 5.3742 | lr 0.00100 | ngrams/sec 41972.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 51.19s | valid loss  5.53 | valid ppl   251.71
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/4071 | loss 5.2002 | lr 0.00100 | ngrams/sec 29019.1 | eta 0h1m3s
| epoch 39 | step 1000/4071 | loss 5.2443 | lr 0.00100 | ngrams/sec 41775.8 | eta 0h0m37s
| epoch 39 | step 1500/4071 | loss 5.2802 | lr 0.00100 | ngrams/sec 41834.9 | eta 0h0m31s
| epoch 39 | step 2000/4071 | loss 5.3229 | lr 0.00100 | ngrams/sec 41909.7 | eta 0h0m25s
| epoch 39 | step 2500/4071 | loss 5.3376 | lr 0.00100 | ngrams/sec 41965.9 | eta 0h0m19s
| epoch 39 | step 3000/4071 | loss 5.3689 | lr 0.00100 | ngrams/sec 42012.3 | eta 0h0m13s
| epoch 39 | step 3500/4071 | loss 5.3921 | lr 0.00100 | ngrams/sec 41980.8 | eta 0h0m6s
| epoch 39 | step 4000/4071 | loss 5.4110 | lr 0.00100 | ngrams/sec 42016.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.99it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 51.15s | valid loss  5.56 | valid ppl   260.07
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/4071 | loss 5.2388 | lr 0.00100 | ngrams/sec 29257.5 | eta 0h1m2s
| epoch 40 | step 1000/4071 | loss 5.2508 | lr 0.00100 | ngrams/sec 42092.8 | eta 0h0m37s
| epoch 40 | step 1500/4071 | loss 5.3184 | lr 0.00100 | ngrams/sec 42086.5 | eta 0h0m31s
| epoch 40 | step 2000/4071 | loss 5.3349 | lr 0.00100 | ngrams/sec 42199.9 | eta 0h0m25s
| epoch 40 | step 2500/4071 | loss 5.3514 | lr 0.00100 | ngrams/sec 42037.6 | eta 0h0m19s
| epoch 40 | step 3000/4071 | loss 5.3838 | lr 0.00100 | ngrams/sec 41956.9 | eta 0h0m13s
| epoch 40 | step 3500/4071 | loss 5.3913 | lr 0.00100 | ngrams/sec 41935.2 | eta 0h0m6s
| epoch 40 | step 4000/4071 | loss 5.4353 | lr 0.00100 | ngrams/sec 41991.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1138.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 50.97s | valid loss  5.64 | valid ppl   281.02
-----------------------------------------------------------------------------------------
| epoch 41 | step 500/4071 | loss 5.2602 | lr 0.00100 | ngrams/sec 29177.2 | eta 0h1m2s
| epoch 41 | step 1000/4071 | loss 5.2839 | lr 0.00100 | ngrams/sec 41949.6 | eta 0h0m37s
| epoch 41 | step 1500/4071 | loss 5.3313 | lr 0.00100 | ngrams/sec 41998.5 | eta 0h0m31s
| epoch 41 | step 2000/4071 | loss 5.3559 | lr 0.00100 | ngrams/sec 41915.5 | eta 0h0m25s
| epoch 41 | step 2500/4071 | loss 5.3800 | lr 0.00100 | ngrams/sec 41971.1 | eta 0h0m19s
| epoch 41 | step 3000/4071 | loss 5.3994 | lr 0.00100 | ngrams/sec 41851.1 | eta 0h0m13s
| epoch 41 | step 3500/4071 | loss 5.4286 | lr 0.00100 | ngrams/sec 41890.1 | eta 0h0m6s
| epoch 41 | step 4000/4071 | loss 5.4441 | lr 0.00100 | ngrams/sec 41904.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1166.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 51.12s | valid loss  5.57 | valid ppl   262.68
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 5.2894 | lr 0.00100 | ngrams/sec 29082.5 | eta 0h1m2s
| epoch 42 | step 1000/4071 | loss 5.3083 | lr 0.00100 | ngrams/sec 41877.8 | eta 0h0m37s
| epoch 42 | step 1500/4071 | loss 5.3525 | lr 0.00100 | ngrams/sec 41894.7 | eta 0h0m31s
| epoch 42 | step 2000/4071 | loss 5.3778 | lr 0.00100 | ngrams/sec 41908.9 | eta 0h0m25s
| epoch 42 | step 2500/4071 | loss 5.3897 | lr 0.00100 | ngrams/sec 41960.4 | eta 0h0m19s
| epoch 42 | step 3000/4071 | loss 5.4225 | lr 0.00100 | ngrams/sec 41947.8 | eta 0h0m13s
| epoch 42 | step 3500/4071 | loss 5.4528 | lr 0.00100 | ngrams/sec 42100.7 | eta 0h0m6s
| epoch 42 | step 4000/4071 | loss 5.4581 | lr 0.00100 | ngrams/sec 42028.1 | eta 0h0m0s


 28%|██▊       | 118/417 [00:00<00:00, 1170.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 51.09s | valid loss  5.54 | valid ppl   255.87
-----------------------------------------------------------------------------------------
| epoch 43 | step 500/4071 | loss 5.2863 | lr 0.00100 | ngrams/sec 29313.7 | eta 0h1m2s
| epoch 43 | step 1000/4071 | loss 5.3401 | lr 0.00100 | ngrams/sec 42122.5 | eta 0h0m37s
| epoch 43 | step 1500/4071 | loss 5.3694 | lr 0.00100 | ngrams/sec 42143.5 | eta 0h0m31s
| epoch 43 | step 2000/4071 | loss 5.4023 | lr 0.00100 | ngrams/sec 42203.0 | eta 0h0m25s
| epoch 43 | step 2500/4071 | loss 5.4162 | lr 0.00100 | ngrams/sec 42091.5 | eta 0h0m19s
| epoch 43 | step 3000/4071 | loss 5.4502 | lr 0.00100 | ngrams/sec 42147.2 | eta 0h0m13s
| epoch 43 | step 3500/4071 | loss 5.4733 | lr 0.00100 | ngrams/sec 42139.3 | eta 0h0m6s
| epoch 43 | step 4000/4071 | loss 5.4822 | lr 0.00100 | ngrams/sec 42020.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 50.88s | valid loss  5.51 | valid ppl   247.75
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 5.3206 | lr 0.00100 | ngrams/sec 29269.9 | eta 0h1m2s
| epoch 44 | step 1000/4071 | loss 5.3555 | lr 0.00100 | ngrams/sec 42001.5 | eta 0h0m37s
| epoch 44 | step 1500/4071 | loss 5.4287 | lr 0.00100 | ngrams/sec 41983.9 | eta 0h0m31s
| epoch 44 | step 2000/4071 | loss 5.4390 | lr 0.00100 | ngrams/sec 42025.5 | eta 0h0m25s
| epoch 44 | step 2500/4071 | loss 5.4492 | lr 0.00100 | ngrams/sec 41989.9 | eta 0h0m19s
| epoch 44 | step 3000/4071 | loss 5.4645 | lr 0.00100 | ngrams/sec 41971.2 | eta 0h0m13s
| epoch 44 | step 3500/4071 | loss 5.4981 | lr 0.00100 | ngrams/sec 41972.9 | eta 0h0m6s
| epoch 44 | step 4000/4071 | loss 5.5152 | lr 0.00100 | ngrams/sec 41928.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1157.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 51.04s | valid loss  5.55 | valid ppl   256.56
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 5.3508 | lr 0.00100 | ngrams/sec 29159.4 | eta 0h1m2s
| epoch 45 | step 1000/4071 | loss 5.3972 | lr 0.00100 | ngrams/sec 41963.2 | eta 0h0m37s
| epoch 45 | step 1500/4071 | loss 5.4195 | lr 0.00100 | ngrams/sec 42004.4 | eta 0h0m31s
| epoch 45 | step 2000/4071 | loss 5.4268 | lr 0.00100 | ngrams/sec 42000.7 | eta 0h0m25s
| epoch 45 | step 2500/4071 | loss 5.4565 | lr 0.00100 | ngrams/sec 42033.8 | eta 0h0m19s
| epoch 45 | step 3000/4071 | loss 5.4931 | lr 0.00100 | ngrams/sec 42132.5 | eta 0h0m13s
| epoch 45 | step 3500/4071 | loss 5.5019 | lr 0.00100 | ngrams/sec 42066.1 | eta 0h0m6s
| epoch 45 | step 4000/4071 | loss 5.5234 | lr 0.00100 | ngrams/sec 42061.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1155.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 51.00s | valid loss  5.66 | valid ppl   287.20
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 5.3873 | lr 0.00100 | ngrams/sec 29261.0 | eta 0h1m2s
| epoch 46 | step 1000/4071 | loss 5.3970 | lr 0.00100 | ngrams/sec 42043.5 | eta 0h0m37s
| epoch 46 | step 1500/4071 | loss 5.4372 | lr 0.00100 | ngrams/sec 42093.8 | eta 0h0m31s
| epoch 46 | step 2000/4071 | loss 5.4624 | lr 0.00100 | ngrams/sec 42078.5 | eta 0h0m25s
| epoch 46 | step 2500/4071 | loss 5.4929 | lr 0.00100 | ngrams/sec 41655.8 | eta 0h0m19s
| epoch 46 | step 3000/4071 | loss 5.5114 | lr 0.00100 | ngrams/sec 42256.9 | eta 0h0m12s
| epoch 46 | step 3500/4071 | loss 5.5266 | lr 0.00100 | ngrams/sec 42163.5 | eta 0h0m6s
| epoch 46 | step 4000/4071 | loss 5.5515 | lr 0.00100 | ngrams/sec 42216.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1189.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 301.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 50.94s | valid loss  5.51 | valid ppl   246.33
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 5.4130 | lr 0.00100 | ngrams/sec 29290.7 | eta 0h1m2s
| epoch 47 | step 1000/4071 | loss 5.4197 | lr 0.00100 | ngrams/sec 42140.8 | eta 0h0m37s
| epoch 47 | step 1500/4071 | loss 5.4500 | lr 0.00100 | ngrams/sec 42177.8 | eta 0h0m31s
| epoch 47 | step 2000/4071 | loss 5.4935 | lr 0.00100 | ngrams/sec 42069.9 | eta 0h0m25s
| epoch 47 | step 2500/4071 | loss 5.5040 | lr 0.00100 | ngrams/sec 42054.9 | eta 0h0m19s
| epoch 47 | step 3000/4071 | loss 5.5415 | lr 0.00100 | ngrams/sec 42069.2 | eta 0h0m13s
| epoch 47 | step 3500/4071 | loss 5.5588 | lr 0.00100 | ngrams/sec 42080.8 | eta 0h0m6s
| epoch 47 | step 4000/4071 | loss 5.5714 | lr 0.00100 | ngrams/sec 42099.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1163.82it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.31it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 50.92s | valid loss  5.80 | valid ppl   331.93
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 5.4134 | lr 0.00100 | ngrams/sec 29224.8 | eta 0h1m2s
| epoch 48 | step 1000/4071 | loss 5.4534 | lr 0.00100 | ngrams/sec 42050.7 | eta 0h0m37s
| epoch 48 | step 1500/4071 | loss 5.4893 | lr 0.00100 | ngrams/sec 42078.1 | eta 0h0m31s
| epoch 48 | step 2000/4071 | loss 5.5133 | lr 0.00100 | ngrams/sec 42154.3 | eta 0h0m25s
| epoch 48 | step 2500/4071 | loss 5.5234 | lr 0.00100 | ngrams/sec 42039.6 | eta 0h0m19s
| epoch 48 | step 3000/4071 | loss 5.5567 | lr 0.00100 | ngrams/sec 42064.2 | eta 0h0m13s
| epoch 48 | step 3500/4071 | loss 5.5596 | lr 0.00100 | ngrams/sec 42061.1 | eta 0h0m6s
| epoch 48 | step 4000/4071 | loss 5.5916 | lr 0.00100 | ngrams/sec 42040.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1147.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 299.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 50.95s | valid loss  5.70 | valid ppl   299.06
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 5.4333 | lr 0.00100 | ngrams/sec 29273.5 | eta 0h1m2s
| epoch 49 | step 1000/4071 | loss 5.4744 | lr 0.00100 | ngrams/sec 42070.4 | eta 0h0m37s
| epoch 49 | step 1500/4071 | loss 5.5116 | lr 0.00100 | ngrams/sec 42146.4 | eta 0h0m31s
| epoch 49 | step 2000/4071 | loss 5.5305 | lr 0.00100 | ngrams/sec 42181.1 | eta 0h0m25s
| epoch 49 | step 2500/4071 | loss 5.5665 | lr 0.00100 | ngrams/sec 42180.4 | eta 0h0m19s
| epoch 49 | step 3000/4071 | loss 5.5806 | lr 0.00100 | ngrams/sec 42165.7 | eta 0h0m13s
| epoch 49 | step 3500/4071 | loss 5.5899 | lr 0.00100 | ngrams/sec 42154.2 | eta 0h0m6s
| epoch 49 | step 4000/4071 | loss 5.6061 | lr 0.00100 | ngrams/sec 42249.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.83it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 50.84s | valid loss  5.55 | valid ppl   257.95
-----------------------------------------------------------------------------------------
| epoch 50 | step 500/4071 | loss 5.4892 | lr 0.00100 | ngrams/sec 29371.4 | eta 0h1m2s
| epoch 50 | step 1000/4071 | loss 5.4879 | lr 0.00100 | ngrams/sec 42352.5 | eta 0h0m37s
| epoch 50 | step 1500/4071 | loss 5.5355 | lr 0.00100 | ngrams/sec 42248.8 | eta 0h0m31s
| epoch 50 | step 2000/4071 | loss 5.5548 | lr 0.00100 | ngrams/sec 42315.6 | eta 0h0m25s
| epoch 50 | step 2500/4071 | loss 5.5752 | lr 0.00100 | ngrams/sec 42192.5 | eta 0h0m19s
| epoch 50 | step 3000/4071 | loss 5.5938 | lr 0.00100 | ngrams/sec 42135.1 | eta 0h0m13s
| epoch 50 | step 3500/4071 | loss 5.6140 | lr 0.00100 | ngrams/sec 42057.0 | eta 0h0m6s
| epoch 50 | step 4000/4071 | loss 5.6285 | lr 0.00100 | ngrams/sec 42137.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1150.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 50.79s | valid loss  5.56 | valid ppl   259.66
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 5.4934 | lr 0.00100 | ngrams/sec 29237.4 | eta 0h1m2s
| epoch 51 | step 1000/4071 | loss 5.5172 | lr 0.00100 | ngrams/sec 42091.7 | eta 0h0m37s
| epoch 51 | step 1500/4071 | loss 5.5481 | lr 0.00100 | ngrams/sec 42171.8 | eta 0h0m31s
| epoch 51 | step 2000/4071 | loss 5.5908 | lr 0.00100 | ngrams/sec 42101.1 | eta 0h0m25s
| epoch 51 | step 2500/4071 | loss 5.6240 | lr 0.00100 | ngrams/sec 42154.0 | eta 0h0m19s
| epoch 51 | step 3000/4071 | loss 5.6219 | lr 0.00100 | ngrams/sec 42213.8 | eta 0h0m12s
| epoch 51 | step 3500/4071 | loss 5.6414 | lr 0.00100 | ngrams/sec 42155.5 | eta 0h0m6s
| epoch 51 | step 4000/4071 | loss 5.6518 | lr 0.00100 | ngrams/sec 42281.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1169.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 50.84s | valid loss  5.67 | valid ppl   289.53
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 5.5251 | lr 0.00100 | ngrams/sec 29389.6 | eta 0h1m2s
| epoch 52 | step 1000/4071 | loss 5.5590 | lr 0.00100 | ngrams/sec 42229.8 | eta 0h0m37s
| epoch 52 | step 1500/4071 | loss 5.5851 | lr 0.00100 | ngrams/sec 42212.3 | eta 0h0m31s
| epoch 52 | step 2000/4071 | loss 5.6032 | lr 0.00100 | ngrams/sec 42275.3 | eta 0h0m25s
| epoch 52 | step 2500/4071 | loss 5.6251 | lr 0.00100 | ngrams/sec 42200.4 | eta 0h0m19s
| epoch 52 | step 3000/4071 | loss 5.6338 | lr 0.00100 | ngrams/sec 42222.8 | eta 0h0m12s
| epoch 52 | step 3500/4071 | loss 5.6735 | lr 0.00100 | ngrams/sec 42166.3 | eta 0h0m6s
| epoch 52 | step 4000/4071 | loss 5.6760 | lr 0.00100 | ngrams/sec 42260.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.86it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 50.75s | valid loss  5.60 | valid ppl   270.77
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 5.5686 | lr 0.00100 | ngrams/sec 29355.2 | eta 0h1m2s
| epoch 53 | step 1000/4071 | loss 5.5826 | lr 0.00100 | ngrams/sec 42148.4 | eta 0h0m37s
| epoch 53 | step 1500/4071 | loss 5.6189 | lr 0.00100 | ngrams/sec 42175.4 | eta 0h0m31s
| epoch 53 | step 2000/4071 | loss 5.6502 | lr 0.00100 | ngrams/sec 42190.0 | eta 0h0m25s
| epoch 53 | step 2500/4071 | loss 5.6707 | lr 0.00100 | ngrams/sec 42175.7 | eta 0h0m19s
| epoch 53 | step 3000/4071 | loss 5.6913 | lr 0.00100 | ngrams/sec 42125.3 | eta 0h0m13s
| epoch 53 | step 3500/4071 | loss 5.6895 | lr 0.00100 | ngrams/sec 42180.3 | eta 0h0m6s
| epoch 53 | step 4000/4071 | loss 5.7167 | lr 0.00100 | ngrams/sec 42204.0 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1160.04it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 301.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 50.82s | valid loss  5.60 | valid ppl   269.77
-----------------------------------------------------------------------------------------
| epoch 54 | step 500/4071 | loss 5.5896 | lr 0.00100 | ngrams/sec 29338.2 | eta 0h1m2s
| epoch 54 | step 1000/4071 | loss 5.6025 | lr 0.00100 | ngrams/sec 42208.1 | eta 0h0m37s
| epoch 54 | step 1500/4071 | loss 5.6431 | lr 0.00100 | ngrams/sec 42187.2 | eta 0h0m31s
| epoch 54 | step 2000/4071 | loss 5.6603 | lr 0.00100 | ngrams/sec 42178.6 | eta 0h0m25s
| epoch 54 | step 2500/4071 | loss 5.6867 | lr 0.00100 | ngrams/sec 42179.1 | eta 0h0m19s
| epoch 54 | step 3000/4071 | loss 5.6956 | lr 0.00100 | ngrams/sec 42252.0 | eta 0h0m12s
| epoch 54 | step 3500/4071 | loss 5.7096 | lr 0.00100 | ngrams/sec 42261.3 | eta 0h0m6s
| epoch 54 | step 4000/4071 | loss 5.7216 | lr 0.00100 | ngrams/sec 42208.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1180.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 302.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 50.78s | valid loss  5.64 | valid ppl   282.37
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 5.6033 | lr 0.00100 | ngrams/sec 29413.2 | eta 0h1m2s
| epoch 55 | step 1000/4071 | loss 5.6307 | lr 0.00100 | ngrams/sec 42267.8 | eta 0h0m37s
| epoch 55 | step 1500/4071 | loss 5.6437 | lr 0.00100 | ngrams/sec 42254.2 | eta 0h0m31s
| epoch 55 | step 2000/4071 | loss 5.6623 | lr 0.00100 | ngrams/sec 42349.6 | eta 0h0m25s
| epoch 55 | step 2500/4071 | loss 5.6989 | lr 0.00100 | ngrams/sec 42438.5 | eta 0h0m18s
| epoch 55 | step 3000/4071 | loss 5.7289 | lr 0.00100 | ngrams/sec 42365.7 | eta 0h0m12s
| epoch 55 | step 3500/4071 | loss 5.7409 | lr 0.00100 | ngrams/sec 42388.6 | eta 0h0m6s
| epoch 55 | step 4000/4071 | loss 5.7657 | lr 0.00100 | ngrams/sec 42465.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1183.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 303.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 50.60s | valid loss  5.67 | valid ppl   291.03
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 5.6239 | lr 0.00100 | ngrams/sec 29561.5 | eta 0h1m1s
| epoch 56 | step 1000/4071 | loss 5.6665 | lr 0.00100 | ngrams/sec 42449.1 | eta 0h0m37s
| epoch 56 | step 1500/4071 | loss 5.6991 | lr 0.00100 | ngrams/sec 42399.9 | eta 0h0m31s
| epoch 56 | step 2000/4071 | loss 5.7155 | lr 0.00100 | ngrams/sec 42399.5 | eta 0h0m25s
| epoch 56 | step 2500/4071 | loss 5.7304 | lr 0.00100 | ngrams/sec 42371.8 | eta 0h0m18s
| epoch 56 | step 3000/4071 | loss 5.7383 | lr 0.00100 | ngrams/sec 42298.7 | eta 0h0m12s
| epoch 56 | step 3500/4071 | loss 5.7609 | lr 0.00100 | ngrams/sec 42158.0 | eta 0h0m6s
| epoch 56 | step 4000/4071 | loss 5.7859 | lr 0.00100 | ngrams/sec 42198.0 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 50.63s | valid loss  5.69 | valid ppl   296.05
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 5.6519 | lr 0.00100 | ngrams/sec 29315.1 | eta 0h1m2s
| epoch 57 | step 1000/4071 | loss 5.6904 | lr 0.00100 | ngrams/sec 42116.7 | eta 0h0m37s
| epoch 57 | step 1500/4071 | loss 5.7122 | lr 0.00100 | ngrams/sec 42034.7 | eta 0h0m31s
| epoch 57 | step 2000/4071 | loss 5.7544 | lr 0.00100 | ngrams/sec 42020.7 | eta 0h0m25s
| epoch 57 | step 2500/4071 | loss 5.7574 | lr 0.00100 | ngrams/sec 42063.9 | eta 0h0m19s
| epoch 57 | step 3000/4071 | loss 5.7580 | lr 0.00100 | ngrams/sec 42002.7 | eta 0h0m13s
| epoch 57 | step 3500/4071 | loss 5.7814 | lr 0.00100 | ngrams/sec 42036.4 | eta 0h0m6s
| epoch 57 | step 4000/4071 | loss 5.7905 | lr 0.00100 | ngrams/sec 42094.5 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1185.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 301.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 50.95s | valid loss  5.68 | valid ppl   292.34
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/4071 | loss 5.6900 | lr 0.00100 | ngrams/sec 29271.6 | eta 0h1m2s
| epoch 58 | step 1000/4071 | loss 5.7165 | lr 0.00100 | ngrams/sec 42190.0 | eta 0h0m37s
| epoch 58 | step 1500/4071 | loss 5.7348 | lr 0.00100 | ngrams/sec 42160.3 | eta 0h0m31s
| epoch 58 | step 2000/4071 | loss 5.7501 | lr 0.00100 | ngrams/sec 42131.8 | eta 0h0m25s
| epoch 58 | step 2500/4071 | loss 5.7931 | lr 0.00100 | ngrams/sec 42213.5 | eta 0h0m19s
| epoch 58 | step 3000/4071 | loss 5.7696 | lr 0.00100 | ngrams/sec 42156.3 | eta 0h0m13s
| epoch 58 | step 3500/4071 | loss 5.8174 | lr 0.00100 | ngrams/sec 42104.7 | eta 0h0m6s
| epoch 58 | step 4000/4071 | loss 5.8162 | lr 0.00100 | ngrams/sec 42148.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1154.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 50.85s | valid loss  5.63 | valid ppl   278.47
-----------------------------------------------------------------------------------------
| epoch 59 | step 500/4071 | loss 5.7157 | lr 0.00100 | ngrams/sec 29328.7 | eta 0h1m2s
| epoch 59 | step 1000/4071 | loss 5.7343 | lr 0.00100 | ngrams/sec 42150.3 | eta 0h0m37s
| epoch 59 | step 1500/4071 | loss 5.7622 | lr 0.00100 | ngrams/sec 42187.5 | eta 0h0m31s
| epoch 59 | step 2000/4071 | loss 5.7891 | lr 0.00100 | ngrams/sec 42203.6 | eta 0h0m25s
| epoch 59 | step 2500/4071 | loss 5.7916 | lr 0.00100 | ngrams/sec 42306.2 | eta 0h0m19s
| epoch 59 | step 3000/4071 | loss 5.8113 | lr 0.00100 | ngrams/sec 42316.5 | eta 0h0m12s
| epoch 59 | step 3500/4071 | loss 5.8172 | lr 0.00100 | ngrams/sec 42316.9 | eta 0h0m6s
| epoch 59 | step 4000/4071 | loss 5.8457 | lr 0.00100 | ngrams/sec 42274.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 301.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 50.74s | valid loss  5.82 | valid ppl   337.86
-----------------------------------------------------------------------------------------
| epoch 60 | step 500/4071 | loss 5.7344 | lr 0.00100 | ngrams/sec 29406.9 | eta 0h1m2s
| epoch 60 | step 1000/4071 | loss 5.7481 | lr 0.00100 | ngrams/sec 42211.3 | eta 0h0m37s
| epoch 60 | step 1500/4071 | loss 5.7916 | lr 0.00100 | ngrams/sec 42119.5 | eta 0h0m31s
| epoch 60 | step 2000/4071 | loss 5.8036 | lr 0.00100 | ngrams/sec 42173.8 | eta 0h0m25s
| epoch 60 | step 2500/4071 | loss 5.8140 | lr 0.00100 | ngrams/sec 42162.0 | eta 0h0m19s
| epoch 60 | step 3000/4071 | loss 5.8230 | lr 0.00100 | ngrams/sec 42126.3 | eta 0h0m13s
| epoch 60 | step 3500/4071 | loss 5.8378 | lr 0.00100 | ngrams/sec 42102.0 | eta 0h0m6s
| epoch 60 | step 4000/4071 | loss 5.8455 | lr 0.00100 | ngrams/sec 42025.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1162.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 300.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 50.85s | valid loss  5.64 | valid ppl   281.29
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 5.7362 | lr 0.00100 | ngrams/sec 29180.5 | eta 0h1m2s
| epoch 61 | step 1000/4071 | loss 5.7704 | lr 0.00100 | ngrams/sec 41980.9 | eta 0h0m37s
| epoch 61 | step 1500/4071 | loss 5.7913 | lr 0.00100 | ngrams/sec 42025.0 | eta 0h0m31s
| epoch 61 | step 2000/4071 | loss 5.8099 | lr 0.00100 | ngrams/sec 42143.0 | eta 0h0m25s
| epoch 61 | step 2500/4071 | loss 5.8260 | lr 0.00100 | ngrams/sec 42222.5 | eta 0h0m19s
| epoch 61 | step 3000/4071 | loss 5.8546 | lr 0.00100 | ngrams/sec 42230.4 | eta 0h0m12s
| epoch 61 | step 3500/4071 | loss 5.8569 | lr 0.00100 | ngrams/sec 42220.5 | eta 0h0m6s
| epoch 61 | step 4000/4071 | loss 5.8695 | lr 0.00100 | ngrams/sec 42179.3 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1173.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 302.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 50.88s | valid loss  5.70 | valid ppl   299.67
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 5.7489 | lr 0.00100 | ngrams/sec 29480.8 | eta 0h1m2s
| epoch 62 | step 1000/4071 | loss 5.7947 | lr 0.00100 | ngrams/sec 42332.5 | eta 0h0m37s
| epoch 62 | step 1500/4071 | loss 5.8171 | lr 0.00100 | ngrams/sec 42372.7 | eta 0h0m31s
| epoch 62 | step 2000/4071 | loss 5.8246 | lr 0.00100 | ngrams/sec 42290.6 | eta 0h0m25s
| epoch 62 | step 2500/4071 | loss 5.8643 | lr 0.00100 | ngrams/sec 42303.7 | eta 0h0m19s
| epoch 62 | step 3000/4071 | loss 5.8714 | lr 0.00100 | ngrams/sec 42261.9 | eta 0h0m12s
| epoch 62 | step 3500/4071 | loss 5.8936 | lr 0.00100 | ngrams/sec 42303.2 | eta 0h0m6s
| epoch 62 | step 4000/4071 | loss 5.8788 | lr 0.00100 | ngrams/sec 42270.7 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1182.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 304.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 50.65s | valid loss  5.72 | valid ppl   305.32
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/4071 | loss 5.7917 | lr 0.00100 | ngrams/sec 29478.5 | eta 0h1m2s
| epoch 63 | step 1000/4071 | loss 5.8477 | lr 0.00100 | ngrams/sec 42307.9 | eta 0h0m37s
| epoch 63 | step 1500/4071 | loss 5.8530 | lr 0.00100 | ngrams/sec 42322.3 | eta 0h0m31s
| epoch 63 | step 2000/4071 | loss 5.8745 | lr 0.00100 | ngrams/sec 42283.4 | eta 0h0m25s
| epoch 63 | step 2500/4071 | loss 5.8719 | lr 0.00100 | ngrams/sec 42347.0 | eta 0h0m18s
| epoch 63 | step 3000/4071 | loss 5.8846 | lr 0.00100 | ngrams/sec 42231.2 | eta 0h0m12s
| epoch 63 | step 3500/4071 | loss 5.9142 | lr 0.00100 | ngrams/sec 42265.6 | eta 0h0m6s
| epoch 63 | step 4000/4071 | loss 5.9139 | lr 0.00100 | ngrams/sec 42200.1 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1182.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 303.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 50.67s | valid loss  5.66 | valid ppl   286.00
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 5.8138 | lr 0.00100 | ngrams/sec 29466.2 | eta 0h1m2s
| epoch 64 | step 1000/4071 | loss 5.8611 | lr 0.00100 | ngrams/sec 42275.4 | eta 0h0m37s
| epoch 64 | step 1500/4071 | loss 5.8832 | lr 0.00100 | ngrams/sec 42252.4 | eta 0h0m31s
| epoch 64 | step 2000/4071 | loss 5.8848 | lr 0.00100 | ngrams/sec 42204.2 | eta 0h0m25s
| epoch 64 | step 2500/4071 | loss 5.9119 | lr 0.00100 | ngrams/sec 42129.6 | eta 0h0m19s
| epoch 64 | step 3000/4071 | loss 5.9383 | lr 0.00100 | ngrams/sec 42194.2 | eta 0h0m12s
| epoch 64 | step 3500/4071 | loss 5.9515 | lr 0.00100 | ngrams/sec 42220.9 | eta 0h0m6s
| epoch 64 | step 4000/4071 | loss 5.9709 | lr 0.00100 | ngrams/sec 42154.6 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1156.40it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 302.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 50.76s | valid loss  5.69 | valid ppl   297.23
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/4071 | loss 5.8647 | lr 0.00100 | ngrams/sec 29376.9 | eta 0h1m2s
| epoch 65 | step 1000/4071 | loss 5.8918 | lr 0.00100 | ngrams/sec 42229.1 | eta 0h0m37s
| epoch 65 | step 1500/4071 | loss 5.9037 | lr 0.00100 | ngrams/sec 42161.8 | eta 0h0m31s
| epoch 65 | step 2000/4071 | loss 5.9299 | lr 0.00100 | ngrams/sec 42199.2 | eta 0h0m25s
| epoch 65 | step 2500/4071 | loss 5.9199 | lr 0.00100 | ngrams/sec 42222.7 | eta 0h0m19s
| epoch 65 | step 3000/4071 | loss 5.9417 | lr 0.00100 | ngrams/sec 42336.1 | eta 0h0m12s
| epoch 65 | step 3500/4071 | loss 5.9344 | lr 0.00100 | ngrams/sec 42334.9 | eta 0h0m6s
| epoch 65 | step 4000/4071 | loss 5.9683 | lr 0.00100 | ngrams/sec 42307.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1164.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 305.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 50.72s | valid loss  5.72 | valid ppl   305.95
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 5.8822 | lr 0.00100 | ngrams/sec 29531.6 | eta 0h1m1s
| epoch 66 | step 1000/4071 | loss 5.8966 | lr 0.00100 | ngrams/sec 42353.3 | eta 0h0m37s
| epoch 66 | step 1500/4071 | loss 5.9121 | lr 0.00100 | ngrams/sec 42387.3 | eta 0h0m31s
| epoch 66 | step 2000/4071 | loss 5.9446 | lr 0.00100 | ngrams/sec 42393.2 | eta 0h0m25s
| epoch 66 | step 2500/4071 | loss 5.9584 | lr 0.00100 | ngrams/sec 42307.9 | eta 0h0m19s
| epoch 66 | step 3000/4071 | loss 5.9607 | lr 0.00100 | ngrams/sec 42320.8 | eta 0h0m12s
| epoch 66 | step 3500/4071 | loss 5.9765 | lr 0.00100 | ngrams/sec 42347.0 | eta 0h0m6s
| epoch 66 | step 4000/4071 | loss 5.9976 | lr 0.00100 | ngrams/sec 42387.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1171.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 303.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 50.59s | valid loss  5.77 | valid ppl   319.82
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 5.9046 | lr 0.00100 | ngrams/sec 29474.5 | eta 0h1m2s
| epoch 67 | step 1000/4071 | loss 5.9015 | lr 0.00100 | ngrams/sec 42254.0 | eta 0h0m37s
| epoch 67 | step 1500/4071 | loss 5.9300 | lr 0.00100 | ngrams/sec 42239.2 | eta 0h0m31s
| epoch 67 | step 2000/4071 | loss 5.9500 | lr 0.00100 | ngrams/sec 42286.0 | eta 0h0m25s
| epoch 67 | step 2500/4071 | loss 5.9826 | lr 0.00100 | ngrams/sec 42237.6 | eta 0h0m19s
| epoch 67 | step 3000/4071 | loss 6.0037 | lr 0.00100 | ngrams/sec 42302.8 | eta 0h0m12s
| epoch 67 | step 3500/4071 | loss 6.0056 | lr 0.00100 | ngrams/sec 42237.5 | eta 0h0m6s
| epoch 67 | step 4000/4071 | loss 6.0072 | lr 0.00100 | ngrams/sec 42192.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1177.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 305.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 50.70s | valid loss  5.72 | valid ppl   305.65
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 5.9267 | lr 0.00100 | ngrams/sec 29505.5 | eta 0h1m1s
| epoch 68 | step 1000/4071 | loss 5.9466 | lr 0.00100 | ngrams/sec 42202.5 | eta 0h0m37s
| epoch 68 | step 1500/4071 | loss 5.9745 | lr 0.00100 | ngrams/sec 42143.0 | eta 0h0m31s
| epoch 68 | step 2000/4071 | loss 6.0007 | lr 0.00100 | ngrams/sec 42209.6 | eta 0h0m25s
| epoch 68 | step 2500/4071 | loss 6.0000 | lr 0.00100 | ngrams/sec 42249.4 | eta 0h0m19s
| epoch 68 | step 3000/4071 | loss 6.0071 | lr 0.00100 | ngrams/sec 41450.4 | eta 0h0m13s
| epoch 68 | step 3500/4071 | loss 6.0213 | lr 0.00100 | ngrams/sec 42214.9 | eta 0h0m6s
| epoch 68 | step 4000/4071 | loss 6.0300 | lr 0.00100 | ngrams/sec 42216.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1148.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 303.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 50.86s | valid loss  5.66 | valid ppl   286.35
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 5.9358 | lr 0.00100 | ngrams/sec 29440.8 | eta 0h1m2s
| epoch 69 | step 1000/4071 | loss 5.9583 | lr 0.00100 | ngrams/sec 42168.0 | eta 0h0m37s
| epoch 69 | step 1500/4071 | loss 5.9901 | lr 0.00100 | ngrams/sec 42240.6 | eta 0h0m31s
| epoch 69 | step 2000/4071 | loss 6.0020 | lr 0.00100 | ngrams/sec 42205.3 | eta 0h0m25s
| epoch 69 | step 2500/4071 | loss 6.0044 | lr 0.00100 | ngrams/sec 42257.0 | eta 0h0m19s
| epoch 69 | step 3000/4071 | loss 6.0267 | lr 0.00100 | ngrams/sec 42337.2 | eta 0h0m12s
| epoch 69 | step 3500/4071 | loss 6.0430 | lr 0.00100 | ngrams/sec 42352.8 | eta 0h0m6s
| epoch 69 | step 4000/4071 | loss 6.0428 | lr 0.00100 | ngrams/sec 42389.2 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1186.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 305.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 50.69s | valid loss  5.78 | valid ppl   323.72
-----------------------------------------------------------------------------------------
| epoch 70 | step 500/4071 | loss 5.9514 | lr 0.00100 | ngrams/sec 29550.2 | eta 0h1m1s
| epoch 70 | step 1000/4071 | loss 5.9855 | lr 0.00100 | ngrams/sec 42449.6 | eta 0h0m37s
| epoch 70 | step 1500/4071 | loss 6.0006 | lr 0.00100 | ngrams/sec 42416.9 | eta 0h0m31s
| epoch 70 | step 2000/4071 | loss 6.0156 | lr 0.00100 | ngrams/sec 42414.7 | eta 0h0m24s
| epoch 70 | step 2500/4071 | loss 6.0163 | lr 0.00100 | ngrams/sec 42372.0 | eta 0h0m18s
| epoch 70 | step 3000/4071 | loss 6.0357 | lr 0.00100 | ngrams/sec 42352.4 | eta 0h0m12s
| epoch 70 | step 3500/4071 | loss 6.0340 | lr 0.00100 | ngrams/sec 42348.1 | eta 0h0m6s
| epoch 70 | step 4000/4071 | loss 6.0562 | lr 0.00100 | ngrams/sec 42386.8 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.65it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 305.95it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 50.54s | valid loss  5.68 | valid ppl   292.87
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 5.9667 | lr 0.00100 | ngrams/sec 29564.8 | eta 0h1m1s
| epoch 71 | step 1000/4071 | loss 6.0126 | lr 0.00100 | ngrams/sec 42367.9 | eta 0h0m37s
| epoch 71 | step 1500/4071 | loss 6.0319 | lr 0.00100 | ngrams/sec 42384.4 | eta 0h0m31s
| epoch 71 | step 2000/4071 | loss 6.0490 | lr 0.00100 | ngrams/sec 42356.3 | eta 0h0m25s
| epoch 71 | step 2500/4071 | loss 6.0349 | lr 0.00100 | ngrams/sec 42355.7 | eta 0h0m18s
| epoch 71 | step 3000/4071 | loss 6.0719 | lr 0.00100 | ngrams/sec 42355.8 | eta 0h0m12s
| epoch 71 | step 3500/4071 | loss 6.0577 | lr 0.00100 | ngrams/sec 42349.8 | eta 0h0m6s
| epoch 71 | step 4000/4071 | loss 6.0834 | lr 0.00100 | ngrams/sec 42270.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 50.58s | valid loss  5.79 | valid ppl   325.49
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 6.0130 | lr 0.00100 | ngrams/sec 29596.0 | eta 0h1m1s
| epoch 72 | step 1000/4071 | loss 6.0287 | lr 0.00100 | ngrams/sec 42299.5 | eta 0h0m37s
| epoch 72 | step 1500/4071 | loss 6.0279 | lr 0.00100 | ngrams/sec 42400.3 | eta 0h0m31s
| epoch 72 | step 2000/4071 | loss 6.0394 | lr 0.00100 | ngrams/sec 42461.3 | eta 0h0m24s
| epoch 72 | step 2500/4071 | loss 6.0557 | lr 0.00100 | ngrams/sec 42453.6 | eta 0h0m18s
| epoch 72 | step 3000/4071 | loss 6.1044 | lr 0.00100 | ngrams/sec 42407.3 | eta 0h0m12s
| epoch 72 | step 3500/4071 | loss 6.0955 | lr 0.00100 | ngrams/sec 42454.2 | eta 0h0m6s
| epoch 72 | step 4000/4071 | loss 6.1034 | lr 0.00100 | ngrams/sec 42493.0 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1158.67it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 50.49s | valid loss  5.74 | valid ppl   310.81
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 6.0163 | lr 0.00100 | ngrams/sec 29676.3 | eta 0h1m1s
| epoch 73 | step 1000/4071 | loss 6.0462 | lr 0.00100 | ngrams/sec 42597.2 | eta 0h0m36s
| epoch 73 | step 1500/4071 | loss 6.0583 | lr 0.00100 | ngrams/sec 42592.0 | eta 0h0m30s
| epoch 73 | step 2000/4071 | loss 6.0850 | lr 0.00100 | ngrams/sec 42634.4 | eta 0h0m24s
| epoch 73 | step 2500/4071 | loss 6.0755 | lr 0.00100 | ngrams/sec 42670.1 | eta 0h0m18s
| epoch 73 | step 3000/4071 | loss 6.0975 | lr 0.00100 | ngrams/sec 42714.8 | eta 0h0m12s
| epoch 73 | step 3500/4071 | loss 6.1138 | lr 0.00100 | ngrams/sec 42642.3 | eta 0h0m6s
| epoch 73 | step 4000/4071 | loss 6.1006 | lr 0.00100 | ngrams/sec 42621.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1157.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 50.26s | valid loss  5.77 | valid ppl   321.08
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 6.0264 | lr 0.00100 | ngrams/sec 29688.1 | eta 0h1m1s
| epoch 74 | step 1000/4071 | loss 6.0513 | lr 0.00100 | ngrams/sec 42541.1 | eta 0h0m36s
| epoch 74 | step 1500/4071 | loss 6.0605 | lr 0.00100 | ngrams/sec 42516.0 | eta 0h0m30s
| epoch 74 | step 2000/4071 | loss 6.0814 | lr 0.00100 | ngrams/sec 42474.1 | eta 0h0m24s
| epoch 74 | step 2500/4071 | loss 6.0908 | lr 0.00100 | ngrams/sec 42451.2 | eta 0h0m18s
| epoch 74 | step 3000/4071 | loss 6.1077 | lr 0.00100 | ngrams/sec 42515.7 | eta 0h0m12s
| epoch 74 | step 3500/4071 | loss 6.1055 | lr 0.00100 | ngrams/sec 42574.6 | eta 0h0m6s
| epoch 74 | step 4000/4071 | loss 6.1151 | lr 0.00100 | ngrams/sec 42484.1 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 50.39s | valid loss  5.75 | valid ppl   315.10
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 6.0476 | lr 0.00100 | ngrams/sec 29653.0 | eta 0h1m1s
| epoch 75 | step 1000/4071 | loss 6.0811 | lr 0.00100 | ngrams/sec 42390.9 | eta 0h0m37s
| epoch 75 | step 1500/4071 | loss 6.0766 | lr 0.00100 | ngrams/sec 42397.9 | eta 0h0m31s
| epoch 75 | step 2000/4071 | loss 6.0986 | lr 0.00100 | ngrams/sec 42354.4 | eta 0h0m25s
| epoch 75 | step 2500/4071 | loss 6.1135 | lr 0.00100 | ngrams/sec 42366.4 | eta 0h0m18s
| epoch 75 | step 3000/4071 | loss 6.1257 | lr 0.00100 | ngrams/sec 42441.8 | eta 0h0m12s
| epoch 75 | step 3500/4071 | loss 6.1518 | lr 0.00100 | ngrams/sec 42435.4 | eta 0h0m6s
| epoch 75 | step 4000/4071 | loss 6.1493 | lr 0.00100 | ngrams/sec 42517.4 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1189.05it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.10it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 50.49s | valid loss  5.80 | valid ppl   331.56
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/4071 | loss 6.0581 | lr 0.00100 | ngrams/sec 29740.4 | eta 0h1m1s
| epoch 76 | step 1000/4071 | loss 6.0978 | lr 0.00100 | ngrams/sec 42699.4 | eta 0h0m36s
| epoch 76 | step 1500/4071 | loss 6.0989 | lr 0.00100 | ngrams/sec 42767.7 | eta 0h0m30s
| epoch 76 | step 2000/4071 | loss 6.1065 | lr 0.00100 | ngrams/sec 42833.6 | eta 0h0m24s
| epoch 76 | step 2500/4071 | loss 6.1236 | lr 0.00100 | ngrams/sec 42809.2 | eta 0h0m18s
| epoch 76 | step 3000/4071 | loss 6.1468 | lr 0.00100 | ngrams/sec 42812.6 | eta 0h0m12s
| epoch 76 | step 3500/4071 | loss 6.1618 | lr 0.00100 | ngrams/sec 42843.2 | eta 0h0m6s
| epoch 76 | step 4000/4071 | loss 6.1651 | lr 0.00100 | ngrams/sec 42807.0 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1186.50it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 50.09s | valid loss  5.71 | valid ppl   302.15
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 6.0876 | lr 0.00100 | ngrams/sec 29783.8 | eta 0h1m1s
| epoch 77 | step 1000/4071 | loss 6.1130 | lr 0.00100 | ngrams/sec 42739.6 | eta 0h0m36s
| epoch 77 | step 1500/4071 | loss 6.1256 | lr 0.00100 | ngrams/sec 42779.6 | eta 0h0m30s
| epoch 77 | step 2000/4071 | loss 6.1302 | lr 0.00100 | ngrams/sec 42710.3 | eta 0h0m24s
| epoch 77 | step 2500/4071 | loss 6.1334 | lr 0.00100 | ngrams/sec 42668.7 | eta 0h0m18s
| epoch 77 | step 3000/4071 | loss 6.1624 | lr 0.00100 | ngrams/sec 42632.0 | eta 0h0m12s
| epoch 77 | step 3500/4071 | loss 6.1695 | lr 0.00100 | ngrams/sec 42617.7 | eta 0h0m6s
| epoch 77 | step 4000/4071 | loss 6.1546 | lr 0.00100 | ngrams/sec 42546.9 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1178.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 50.21s | valid loss  5.77 | valid ppl   321.87
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/4071 | loss 6.1135 | lr 0.00100 | ngrams/sec 29664.1 | eta 0h1m1s
| epoch 78 | step 1000/4071 | loss 6.1049 | lr 0.00100 | ngrams/sec 42521.0 | eta 0h0m36s
| epoch 78 | step 1500/4071 | loss 6.1556 | lr 0.00100 | ngrams/sec 42461.1 | eta 0h0m31s
| epoch 78 | step 2000/4071 | loss 6.1485 | lr 0.00100 | ngrams/sec 42437.6 | eta 0h0m24s
| epoch 78 | step 2500/4071 | loss 6.1706 | lr 0.00100 | ngrams/sec 42499.0 | eta 0h0m18s
| epoch 78 | step 3000/4071 | loss 6.1768 | lr 0.00100 | ngrams/sec 42372.6 | eta 0h0m12s
| epoch 78 | step 3500/4071 | loss 6.1916 | lr 0.00100 | ngrams/sec 42352.1 | eta 0h0m6s
| epoch 78 | step 4000/4071 | loss 6.1852 | lr 0.00100 | ngrams/sec 42356.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1170.77it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 50.48s | valid loss  5.78 | valid ppl   325.10
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 6.1242 | lr 0.00100 | ngrams/sec 29539.4 | eta 0h1m1s
| epoch 79 | step 1000/4071 | loss 6.1394 | lr 0.00100 | ngrams/sec 42311.4 | eta 0h0m37s
| epoch 79 | step 1500/4071 | loss 6.1498 | lr 0.00100 | ngrams/sec 42463.7 | eta 0h0m30s
| epoch 79 | step 2000/4071 | loss 6.1529 | lr 0.00100 | ngrams/sec 42514.6 | eta 0h0m24s
| epoch 79 | step 2500/4071 | loss 6.1832 | lr 0.00100 | ngrams/sec 42570.1 | eta 0h0m18s
| epoch 79 | step 3000/4071 | loss 6.1948 | lr 0.00100 | ngrams/sec 42696.6 | eta 0h0m12s
| epoch 79 | step 3500/4071 | loss 6.1766 | lr 0.00100 | ngrams/sec 42669.6 | eta 0h0m6s
| epoch 79 | step 4000/4071 | loss 6.2155 | lr 0.00100 | ngrams/sec 42786.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1175.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 309.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 50.35s | valid loss  5.79 | valid ppl   327.27
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 6.1437 | lr 0.00100 | ngrams/sec 29807.5 | eta 0h1m1s
| epoch 80 | step 1000/4071 | loss 6.1268 | lr 0.00100 | ngrams/sec 42844.1 | eta 0h0m36s
| epoch 80 | step 1500/4071 | loss 6.1633 | lr 0.00100 | ngrams/sec 42760.9 | eta 0h0m30s
| epoch 80 | step 2000/4071 | loss 6.1655 | lr 0.00100 | ngrams/sec 42780.4 | eta 0h0m24s
| epoch 80 | step 2500/4071 | loss 6.1926 | lr 0.00100 | ngrams/sec 42817.6 | eta 0h0m18s
| epoch 80 | step 3000/4071 | loss 6.1719 | lr 0.00100 | ngrams/sec 42772.9 | eta 0h0m12s
| epoch 80 | step 3500/4071 | loss 6.2063 | lr 0.00100 | ngrams/sec 42811.6 | eta 0h0m6s
| epoch 80 | step 4000/4071 | loss 6.1983 | lr 0.00100 | ngrams/sec 42730.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1188.76it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.71it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 50.09s | valid loss  5.84 | valid ppl   344.45
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 6.1435 | lr 0.00100 | ngrams/sec 29749.2 | eta 0h1m1s
| epoch 81 | step 1000/4071 | loss 6.1463 | lr 0.00100 | ngrams/sec 42589.7 | eta 0h0m36s
| epoch 81 | step 1500/4071 | loss 6.1711 | lr 0.00100 | ngrams/sec 42502.7 | eta 0h0m30s
| epoch 81 | step 2000/4071 | loss 6.1665 | lr 0.00100 | ngrams/sec 42505.6 | eta 0h0m24s
| epoch 81 | step 2500/4071 | loss 6.1855 | lr 0.00100 | ngrams/sec 42428.9 | eta 0h0m18s
| epoch 81 | step 3000/4071 | loss 6.1969 | lr 0.00100 | ngrams/sec 42372.4 | eta 0h0m12s
| epoch 81 | step 3500/4071 | loss 6.2074 | lr 0.00100 | ngrams/sec 42281.2 | eta 0h0m6s
| epoch 81 | step 4000/4071 | loss 6.1870 | lr 0.00100 | ngrams/sec 42299.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1166.63it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 50.47s | valid loss  5.79 | valid ppl   326.42
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 6.1352 | lr 0.00100 | ngrams/sec 29576.8 | eta 0h1m1s
| epoch 82 | step 1000/4071 | loss 6.1701 | lr 0.00100 | ngrams/sec 42335.6 | eta 0h0m37s
| epoch 82 | step 1500/4071 | loss 6.1919 | lr 0.00100 | ngrams/sec 42298.7 | eta 0h0m31s
| epoch 82 | step 2000/4071 | loss 6.1995 | lr 0.00100 | ngrams/sec 42255.3 | eta 0h0m25s
| epoch 82 | step 2500/4071 | loss 6.1994 | lr 0.00100 | ngrams/sec 42292.9 | eta 0h0m19s
| epoch 82 | step 3000/4071 | loss 6.2035 | lr 0.00100 | ngrams/sec 42303.0 | eta 0h0m12s
| epoch 82 | step 3500/4071 | loss 6.2183 | lr 0.00100 | ngrams/sec 42373.5 | eta 0h0m6s
| epoch 82 | step 4000/4071 | loss 6.2051 | lr 0.00100 | ngrams/sec 42314.8 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1169.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 50.62s | valid loss  5.82 | valid ppl   336.63
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 6.1609 | lr 0.00100 | ngrams/sec 29544.6 | eta 0h1m1s
| epoch 83 | step 1000/4071 | loss 6.1680 | lr 0.00100 | ngrams/sec 42450.3 | eta 0h0m37s
| epoch 83 | step 1500/4071 | loss 6.2017 | lr 0.00100 | ngrams/sec 42419.9 | eta 0h0m31s
| epoch 83 | step 2000/4071 | loss 6.2071 | lr 0.00100 | ngrams/sec 42558.5 | eta 0h0m24s
| epoch 83 | step 2500/4071 | loss 6.2045 | lr 0.00100 | ngrams/sec 42580.7 | eta 0h0m18s
| epoch 83 | step 3000/4071 | loss 6.1966 | lr 0.00100 | ngrams/sec 42527.3 | eta 0h0m12s
| epoch 83 | step 3500/4071 | loss 6.2352 | lr 0.00100 | ngrams/sec 42578.4 | eta 0h0m6s
| epoch 83 | step 4000/4071 | loss 6.2284 | lr 0.00100 | ngrams/sec 42621.5 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1167.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 50.39s | valid loss  5.83 | valid ppl   340.91
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/4071 | loss 6.1719 | lr 0.00100 | ngrams/sec 29753.2 | eta 0h1m1s
| epoch 84 | step 1000/4071 | loss 6.1673 | lr 0.00100 | ngrams/sec 42574.4 | eta 0h0m36s
| epoch 84 | step 1500/4071 | loss 6.2154 | lr 0.00100 | ngrams/sec 42521.1 | eta 0h0m30s
| epoch 84 | step 2000/4071 | loss 6.2175 | lr 0.00100 | ngrams/sec 42512.9 | eta 0h0m24s
| epoch 84 | step 2500/4071 | loss 6.2047 | lr 0.00100 | ngrams/sec 42444.1 | eta 0h0m18s
| epoch 84 | step 3000/4071 | loss 6.2134 | lr 0.00100 | ngrams/sec 42409.9 | eta 0h0m12s
| epoch 84 | step 3500/4071 | loss 6.2469 | lr 0.00100 | ngrams/sec 42366.5 | eta 0h0m6s
| epoch 84 | step 4000/4071 | loss 6.2426 | lr 0.00100 | ngrams/sec 42317.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1184.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.90it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 50.44s | valid loss  5.80 | valid ppl   331.84
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 6.1775 | lr 0.00100 | ngrams/sec 29565.8 | eta 0h1m1s
| epoch 85 | step 1000/4071 | loss 6.2015 | lr 0.00100 | ngrams/sec 42363.3 | eta 0h0m37s
| epoch 85 | step 1500/4071 | loss 6.2149 | lr 0.00100 | ngrams/sec 42204.5 | eta 0h0m31s
| epoch 85 | step 2000/4071 | loss 6.2268 | lr 0.00100 | ngrams/sec 42260.9 | eta 0h0m25s
| epoch 85 | step 2500/4071 | loss 6.2318 | lr 0.00100 | ngrams/sec 42287.5 | eta 0h0m19s
| epoch 85 | step 3000/4071 | loss 6.2257 | lr 0.00100 | ngrams/sec 42248.9 | eta 0h0m12s
| epoch 85 | step 3500/4071 | loss 6.2524 | lr 0.00100 | ngrams/sec 42328.7 | eta 0h0m6s
| epoch 85 | step 4000/4071 | loss 6.2570 | lr 0.00100 | ngrams/sec 42311.8 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1172.11it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 306.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 50.65s | valid loss  5.85 | valid ppl   347.05
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 6.1726 | lr 0.00100 | ngrams/sec 29549.2 | eta 0h1m1s
| epoch 86 | step 1000/4071 | loss 6.2076 | lr 0.00100 | ngrams/sec 42336.9 | eta 0h0m37s
| epoch 86 | step 1500/4071 | loss 6.2330 | lr 0.00100 | ngrams/sec 42435.1 | eta 0h0m31s
| epoch 86 | step 2000/4071 | loss 6.2311 | lr 0.00100 | ngrams/sec 42461.6 | eta 0h0m24s
| epoch 86 | step 2500/4071 | loss 6.2439 | lr 0.00100 | ngrams/sec 42411.6 | eta 0h0m18s
| epoch 86 | step 3000/4071 | loss 6.2503 | lr 0.00100 | ngrams/sec 42469.3 | eta 0h0m12s
| epoch 86 | step 3500/4071 | loss 6.2772 | lr 0.00100 | ngrams/sec 42476.3 | eta 0h0m6s
| epoch 86 | step 4000/4071 | loss 6.2733 | lr 0.00100 | ngrams/sec 42597.8 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1153.20it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 309.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 50.46s | valid loss  5.83 | valid ppl   341.14
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 6.2248 | lr 0.00100 | ngrams/sec 29741.8 | eta 0h1m1s
| epoch 87 | step 1000/4071 | loss 6.2217 | lr 0.00100 | ngrams/sec 42537.2 | eta 0h0m36s
| epoch 87 | step 1500/4071 | loss 6.2374 | lr 0.00100 | ngrams/sec 42664.5 | eta 0h0m30s
| epoch 87 | step 2000/4071 | loss 6.2633 | lr 0.00100 | ngrams/sec 42647.2 | eta 0h0m24s
| epoch 87 | step 2500/4071 | loss 6.2380 | lr 0.00100 | ngrams/sec 42537.2 | eta 0h0m18s
| epoch 87 | step 3000/4071 | loss 6.2514 | lr 0.00100 | ngrams/sec 42569.4 | eta 0h0m12s
| epoch 87 | step 3500/4071 | loss 6.2794 | lr 0.00100 | ngrams/sec 42522.3 | eta 0h0m6s
| epoch 87 | step 4000/4071 | loss 6.2812 | lr 0.00100 | ngrams/sec 42562.9 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1162.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 309.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 50.31s | valid loss  5.83 | valid ppl   339.50
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 6.2202 | lr 0.00100 | ngrams/sec 29755.6 | eta 0h1m1s
| epoch 88 | step 1000/4071 | loss 6.2249 | lr 0.00100 | ngrams/sec 42549.4 | eta 0h0m36s
| epoch 88 | step 1500/4071 | loss 6.2411 | lr 0.00100 | ngrams/sec 42538.1 | eta 0h0m30s
| epoch 88 | step 2000/4071 | loss 6.2667 | lr 0.00100 | ngrams/sec 42540.9 | eta 0h0m24s
| epoch 88 | step 2500/4071 | loss 6.2679 | lr 0.00100 | ngrams/sec 42576.2 | eta 0h0m18s
| epoch 88 | step 3000/4071 | loss 6.2761 | lr 0.00100 | ngrams/sec 42531.5 | eta 0h0m12s
| epoch 88 | step 3500/4071 | loss 6.2916 | lr 0.00100 | ngrams/sec 42519.9 | eta 0h0m6s
| epoch 88 | step 4000/4071 | loss 6.3146 | lr 0.00100 | ngrams/sec 42574.4 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1161.29it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 50.34s | valid loss  5.80 | valid ppl   330.23
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 6.2266 | lr 0.00100 | ngrams/sec 29680.4 | eta 0h1m1s
| epoch 89 | step 1000/4071 | loss 6.2492 | lr 0.00100 | ngrams/sec 42483.4 | eta 0h0m37s
| epoch 89 | step 1500/4071 | loss 6.2885 | lr 0.00100 | ngrams/sec 42541.6 | eta 0h0m30s
| epoch 89 | step 2000/4071 | loss 6.2701 | lr 0.00100 | ngrams/sec 42523.7 | eta 0h0m24s
| epoch 89 | step 2500/4071 | loss 6.2828 | lr 0.00100 | ngrams/sec 42563.4 | eta 0h0m18s
| epoch 89 | step 3000/4071 | loss 6.2959 | lr 0.00100 | ngrams/sec 42498.0 | eta 0h0m12s
| epoch 89 | step 3500/4071 | loss 6.3015 | lr 0.00100 | ngrams/sec 42456.5 | eta 0h0m6s
| epoch 89 | step 4000/4071 | loss 6.3143 | lr 0.00100 | ngrams/sec 42508.6 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1152.54it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 50.39s | valid loss  5.92 | valid ppl   371.79
-----------------------------------------------------------------------------------------
| epoch 90 | step 500/4071 | loss 6.2483 | lr 0.00100 | ngrams/sec 29689.6 | eta 0h1m1s
| epoch 90 | step 1000/4071 | loss 6.2646 | lr 0.00100 | ngrams/sec 42590.2 | eta 0h0m36s
| epoch 90 | step 1500/4071 | loss 6.2816 | lr 0.00100 | ngrams/sec 42497.8 | eta 0h0m30s
| epoch 90 | step 2000/4071 | loss 6.2980 | lr 0.00100 | ngrams/sec 42530.3 | eta 0h0m24s
| epoch 90 | step 2500/4071 | loss 6.3017 | lr 0.00100 | ngrams/sec 42446.0 | eta 0h0m18s
| epoch 90 | step 3000/4071 | loss 6.3133 | lr 0.00100 | ngrams/sec 42530.3 | eta 0h0m12s
| epoch 90 | step 3500/4071 | loss 6.3122 | lr 0.00100 | ngrams/sec 42552.5 | eta 0h0m6s
| epoch 90 | step 4000/4071 | loss 6.3187 | lr 0.00100 | ngrams/sec 42535.6 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1152.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 50.38s | valid loss  5.96 | valid ppl   387.24
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 6.2636 | lr 0.00100 | ngrams/sec 29718.1 | eta 0h1m1s
| epoch 91 | step 1000/4071 | loss 6.2603 | lr 0.00100 | ngrams/sec 42533.3 | eta 0h0m36s
| epoch 91 | step 1500/4071 | loss 6.3164 | lr 0.00100 | ngrams/sec 42465.4 | eta 0h0m30s
| epoch 91 | step 2000/4071 | loss 6.2960 | lr 0.00100 | ngrams/sec 42618.7 | eta 0h0m24s
| epoch 91 | step 2500/4071 | loss 6.3068 | lr 0.00100 | ngrams/sec 42602.3 | eta 0h0m18s
| epoch 91 | step 3000/4071 | loss 6.3280 | lr 0.00100 | ngrams/sec 42539.2 | eta 0h0m12s
| epoch 91 | step 3500/4071 | loss 6.3304 | lr 0.00100 | ngrams/sec 42595.4 | eta 0h0m6s
| epoch 91 | step 4000/4071 | loss 6.3491 | lr 0.00100 | ngrams/sec 42587.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1180.74it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 310.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 50.32s | valid loss  5.84 | valid ppl   345.18
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 6.2751 | lr 0.00100 | ngrams/sec 29797.0 | eta 0h1m1s
| epoch 92 | step 1000/4071 | loss 6.3045 | lr 0.00100 | ngrams/sec 42597.8 | eta 0h0m36s
| epoch 92 | step 1500/4071 | loss 6.3261 | lr 0.00100 | ngrams/sec 42659.9 | eta 0h0m30s
| epoch 92 | step 2000/4071 | loss 6.3234 | lr 0.00100 | ngrams/sec 42606.9 | eta 0h0m24s
| epoch 92 | step 2500/4071 | loss 6.3486 | lr 0.00100 | ngrams/sec 42623.0 | eta 0h0m18s
| epoch 92 | step 3000/4071 | loss 6.3351 | lr 0.00100 | ngrams/sec 42686.2 | eta 0h0m12s
| epoch 92 | step 3500/4071 | loss 6.3427 | lr 0.00100 | ngrams/sec 42704.8 | eta 0h0m6s
| epoch 92 | step 4000/4071 | loss 6.3630 | lr 0.00100 | ngrams/sec 42645.6 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1163.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 310.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 50.23s | valid loss  5.89 | valid ppl   360.28
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 6.3066 | lr 0.00100 | ngrams/sec 29849.1 | eta 0h1m1s
| epoch 93 | step 1000/4071 | loss 6.3027 | lr 0.00100 | ngrams/sec 42631.7 | eta 0h0m36s
| epoch 93 | step 1500/4071 | loss 6.3368 | lr 0.00100 | ngrams/sec 42709.7 | eta 0h0m30s
| epoch 93 | step 2000/4071 | loss 6.3278 | lr 0.00100 | ngrams/sec 42753.5 | eta 0h0m24s
| epoch 93 | step 2500/4071 | loss 6.3510 | lr 0.00100 | ngrams/sec 42633.4 | eta 0h0m18s
| epoch 93 | step 3000/4071 | loss 6.3453 | lr 0.00100 | ngrams/sec 42604.4 | eta 0h0m12s
| epoch 93 | step 3500/4071 | loss 6.3713 | lr 0.00100 | ngrams/sec 42492.0 | eta 0h0m6s
| epoch 93 | step 4000/4071 | loss 6.3765 | lr 0.00100 | ngrams/sec 42448.7 | eta 0h0m0s


 29%|██▊       | 119/417 [00:00<00:00, 1162.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 307.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 50.27s | valid loss  5.88 | valid ppl   356.12
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 6.3136 | lr 0.00100 | ngrams/sec 29585.9 | eta 0h1m1s
| epoch 94 | step 1000/4071 | loss 6.3382 | lr 0.00100 | ngrams/sec 42227.8 | eta 0h0m37s
| epoch 94 | step 1500/4071 | loss 6.3327 | lr 0.00100 | ngrams/sec 42241.5 | eta 0h0m31s
| epoch 94 | step 2000/4071 | loss 6.3480 | lr 0.00100 | ngrams/sec 42332.1 | eta 0h0m25s
| epoch 94 | step 2500/4071 | loss 6.3580 | lr 0.00100 | ngrams/sec 42384.3 | eta 0h0m18s
| epoch 94 | step 3000/4071 | loss 6.3704 | lr 0.00100 | ngrams/sec 42469.5 | eta 0h0m12s
| epoch 94 | step 3500/4071 | loss 6.3718 | lr 0.00100 | ngrams/sec 41617.4 | eta 0h0m7s
| epoch 94 | step 4000/4071 | loss 6.3976 | lr 0.00100 | ngrams/sec 42654.9 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1176.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 311.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 50.63s | valid loss  5.86 | valid ppl   349.16
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 6.3247 | lr 0.00100 | ngrams/sec 29843.7 | eta 0h1m1s
| epoch 95 | step 1000/4071 | loss 6.3341 | lr 0.00100 | ngrams/sec 42682.2 | eta 0h0m36s
| epoch 95 | step 1500/4071 | loss 6.3680 | lr 0.00100 | ngrams/sec 42553.2 | eta 0h0m30s
| epoch 95 | step 2000/4071 | loss 6.3685 | lr 0.00100 | ngrams/sec 42544.9 | eta 0h0m24s
| epoch 95 | step 2500/4071 | loss 6.3799 | lr 0.00100 | ngrams/sec 42495.7 | eta 0h0m18s
| epoch 95 | step 3000/4071 | loss 6.3578 | lr 0.00100 | ngrams/sec 42525.5 | eta 0h0m12s
| epoch 95 | step 3500/4071 | loss 6.3849 | lr 0.00100 | ngrams/sec 42468.8 | eta 0h0m6s
| epoch 95 | step 4000/4071 | loss 6.4079 | lr 0.00100 | ngrams/sec 42437.9 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1168.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 308.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 50.35s | valid loss  5.91 | valid ppl   370.23
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 6.3301 | lr 0.00100 | ngrams/sec 29601.9 | eta 0h1m1s
| epoch 96 | step 1000/4071 | loss 6.3554 | lr 0.00100 | ngrams/sec 42423.8 | eta 0h0m37s
| epoch 96 | step 1500/4071 | loss 6.3950 | lr 0.00100 | ngrams/sec 42423.8 | eta 0h0m31s
| epoch 96 | step 2000/4071 | loss 6.3832 | lr 0.00100 | ngrams/sec 42368.8 | eta 0h0m25s
| epoch 96 | step 2500/4071 | loss 6.3844 | lr 0.00100 | ngrams/sec 42414.8 | eta 0h0m18s
| epoch 96 | step 3000/4071 | loss 6.3984 | lr 0.00100 | ngrams/sec 42470.3 | eta 0h0m12s
| epoch 96 | step 3500/4071 | loss 6.4255 | lr 0.00100 | ngrams/sec 42594.4 | eta 0h0m6s
| epoch 96 | step 4000/4071 | loss 6.4269 | lr 0.00100 | ngrams/sec 42607.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1157.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 309.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 50.44s | valid loss  5.91 | valid ppl   370.12
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 6.3603 | lr 0.00100 | ngrams/sec 29780.4 | eta 0h1m1s
| epoch 97 | step 1000/4071 | loss 6.3808 | lr 0.00100 | ngrams/sec 42740.1 | eta 0h0m36s
| epoch 97 | step 1500/4071 | loss 6.3846 | lr 0.00100 | ngrams/sec 42727.3 | eta 0h0m30s
| epoch 97 | step 2000/4071 | loss 6.4085 | lr 0.00100 | ngrams/sec 42721.6 | eta 0h0m24s
| epoch 97 | step 2500/4071 | loss 6.4141 | lr 0.00100 | ngrams/sec 42726.3 | eta 0h0m18s
| epoch 97 | step 3000/4071 | loss 6.4176 | lr 0.00100 | ngrams/sec 42751.1 | eta 0h0m12s
| epoch 97 | step 3500/4071 | loss 6.4129 | lr 0.00100 | ngrams/sec 42724.9 | eta 0h0m6s
| epoch 97 | step 4000/4071 | loss 6.4241 | lr 0.00100 | ngrams/sec 42661.8 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1172.47it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 310.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 50.15s | valid loss  5.93 | valid ppl   376.76
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 6.3766 | lr 0.00100 | ngrams/sec 29761.8 | eta 0h1m1s
| epoch 98 | step 1000/4071 | loss 6.3993 | lr 0.00100 | ngrams/sec 42527.4 | eta 0h0m36s
| epoch 98 | step 1500/4071 | loss 6.4109 | lr 0.00100 | ngrams/sec 42607.1 | eta 0h0m30s
| epoch 98 | step 2000/4071 | loss 6.4070 | lr 0.00100 | ngrams/sec 42526.1 | eta 0h0m24s
| epoch 98 | step 2500/4071 | loss 6.4331 | lr 0.00100 | ngrams/sec 42660.3 | eta 0h0m18s
| epoch 98 | step 3000/4071 | loss 6.4184 | lr 0.00100 | ngrams/sec 42689.4 | eta 0h0m12s
| epoch 98 | step 3500/4071 | loss 6.4218 | lr 0.00100 | ngrams/sec 42723.7 | eta 0h0m6s
| epoch 98 | step 4000/4071 | loss 6.4758 | lr 0.00100 | ngrams/sec 42665.1 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1155.37it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 313.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 50.24s | valid loss  5.99 | valid ppl   399.45
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 6.4057 | lr 0.00100 | ngrams/sec 29884.4 | eta 0h1m1s
| epoch 99 | step 1000/4071 | loss 6.4107 | lr 0.00100 | ngrams/sec 42845.1 | eta 0h0m36s
| epoch 99 | step 1500/4071 | loss 6.4157 | lr 0.00100 | ngrams/sec 42764.3 | eta 0h0m30s
| epoch 99 | step 2000/4071 | loss 6.4280 | lr 0.00100 | ngrams/sec 42773.5 | eta 0h0m24s
| epoch 99 | step 2500/4071 | loss 6.4348 | lr 0.00100 | ngrams/sec 42824.2 | eta 0h0m18s
| epoch 99 | step 3000/4071 | loss 6.4371 | lr 0.00100 | ngrams/sec 42784.5 | eta 0h0m12s
| epoch 99 | step 3500/4071 | loss 6.4454 | lr 0.00100 | ngrams/sec 42790.9 | eta 0h0m6s
| epoch 99 | step 4000/4071 | loss 6.4562 | lr 0.00100 | ngrams/sec 42656.2 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1166.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 310.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 50.09s | valid loss  5.96 | valid ppl   386.01
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 6.4197 | lr 0.00100 | ngrams/sec 29857.6 | eta 0h1m1s
| epoch 100 | step 1000/4071 | loss 6.4241 | lr 0.00100 | ngrams/sec 42696.9 | eta 0h0m36s
| epoch 100 | step 1500/4071 | loss 6.4149 | lr 0.00100 | ngrams/sec 42664.0 | eta 0h0m30s
| epoch 100 | step 2000/4071 | loss 6.4277 | lr 0.00100 | ngrams/sec 42630.1 | eta 0h0m24s
| epoch 100 | step 2500/4071 | loss 6.4397 | lr 0.00100 | ngrams/sec 42651.8 | eta 0h0m18s
| epoch 100 | step 3000/4071 | loss 6.4918 | lr 0.00100 | ngrams/sec 42763.0 | eta 0h0m12s
| epoch 100 | step 3500/4071 | loss 6.4515 | lr 0.00100 | ngrams/sec 42664.5 | eta 0h0m6s
| epoch 100 | step 4000/4071 | loss 6.4549 | lr 0.00100 | ngrams/sec 42714.3 | eta 0h0m0s


 29%|██▉       | 120/417 [00:00<00:00, 1154.85it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 312.00it/s]


-----------------------------------------------------------------------------------------


 25%|██▌       | 118/471 [00:00<00:00, 1172.50it/s]

| end of epoch 100 | time 50.18s | valid loss  5.95 | valid ppl   384.61
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 302.80it/s]


| End of training | test loss  5.87 | test ppl   353.46


In [None]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [18]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint-tied-512.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))

print(model)
model.eval()

ntokens = n_class
input_idx = 104#torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
# input_idx = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
input_words = [corpus.dictionary.idx2word[i] for i in train_data[input_idx:order+input_idx, 0]]
input = torch.tensor([i for i in train_data[input_idx:order+input_idx, 0]]).to(device)
print(input)
print(input_words)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
)
tensor([27, 63, 64, 65, 66, 17, 67], device='cuda:0')
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation']


In [17]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]

        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        word_tensor = torch.tensor([word_idx]).to(device)
        input = torch.cat((input[1:], word_tensor), 0)
        # input.fill_(word_idx)
        input = input.cuda() if cuda else input
        # print(input)
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

of
| Generated 0/100 words
the
korean
most
popular
in
<unk>
.
the
british
representation
| Generated 10/100 words
of
almost
the
first
publication
,
north
and
extremely
historical
| Generated 20/100 words
degree
in
the
south
.
the
latter
way
to
the
| Generated 30/100 words
cheese
era
.
same
<unk>
<unk>
japanese
the
<unk>
<unk>
| Generated 40/100 words
<unk>
<unk>
continued
in
rare
warfare
and
rows
without
these
| Generated 50/100 words
due
to
an
opening
the
young
band
,
usually
<unk>
| Generated 60/100 words
,
<unk>
gill
differences
.
while
the
hero
date
guitar
| Generated 70/100 words
,
leaving
the
world
's
tone
's
work
on
may
| Generated 80/100 words
<unk>
,
the
this
level
continued
to
be
the
altar
| Generated 90/100 words
of
any
epic
puzzles
,
the
population
gave
interest


In [19]:
print('Evaluating on best test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | best test loss {:5.2f} | best test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

 25%|██▌       | 118/471 [00:00<00:00, 1141.55it/s]

Evaluating on best test set...


100%|██████████| 471/471 [00:01<00:00, 338.57it/s]


| End of training | best test loss  5.32 | best test ppl   204.33
