In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [3]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [4]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 512
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [5]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [6]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [7]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [8]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    tie_weight=False
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,448 tokens
Size of validation set: 217,600 tokens
Size of test set: 245,248 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
)


In [None]:
!nvidia-smi

Sun Nov 29 15:59:27 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    28W /  70W |   1035MiB / 15079MiB |      3%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/4071 | loss 8.6635 | lr 0.00100 | ngrams/sec 36670.4 | eta 0h0m49s
| epoch 1 | step 1000/4071 | loss 8.4140 | lr 0.00100 | ngrams/sec 37981.9 | eta 0h0m41s
| epoch 1 | step 1500/4071 | loss 8.3610 | lr 0.00100 | ngrams/sec 37937.8 | eta 0h0m34s
| epoch 1 | step 2000/4071 | loss 8.3168 | lr 0.00100 | ngrams/sec 37899.4 | eta 0h0m27s
| epoch 1 | step 2500/4071 | loss 8.2750 | lr 0.00100 | ngrams/sec 37902.4 | eta 0h0m21s
| epoch 1 | step 3000/4071 | loss 8.2696 | lr 0.00100 | ngrams/sec 37891.7 | eta 0h0m14s
| epoch 1 | step 3500/4071 | loss 8.2534 | lr 0.00100 | ngrams/sec 37609.9 | eta 0h0m7s
| epoch 1 | step 4000/4071 | loss 8.2288 | lr 0.00100 | ngrams/sec 37512.9 | eta 0h0m0s


 29%|██▉       | 121/417 [00:00<00:00, 1189.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 337.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 56.58s | valid loss  6.95 | valid ppl  1041.78
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/4071 | loss 8.0892 | lr 0.00100 | ngrams/sec 26756.6 | eta 0h1m8s
| epoch 2 | step 1000/4071 | loss 8.1110 | lr 0.00100 | ngrams/sec 37127.0 | eta 0h0m42s
| epoch 2 | step 1500/4071 | loss 8.1406 | lr 0.00100 | ngrams/sec 36913.9 | eta 0h0m35s
| epoch 2 | step 2000/4071 | loss 8.1488 | lr 0.00100 | ngrams/sec 36762.2 | eta 0h0m28s
| epoch 2 | step 2500/4071 | loss 8.1647 | lr 0.00100 | ngrams/sec 36355.3 | eta 0h0m22s
| epoch 2 | step 3000/4071 | loss 8.1604 | lr 0.00100 | ngrams/sec 36443.0 | eta 0h0m15s
| epoch 2 | step 3500/4071 | loss 8.1690 | lr 0.00100 | ngrams/sec 36268.9 | eta 0h0m8s
| epoch 2 | step 4000/4071 | loss 8.1600 | lr 0.00100 | ngrams/sec 36129.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1167.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 58.31s | valid loss  6.63 | valid ppl   757.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/4071 | loss 7.9921 | lr 0.00100 | ngrams/sec 25300.6 | eta 0h1m12s
| epoch 3 | step 1000/4071 | loss 7.9827 | lr 0.00100 | ngrams/sec 35818.5 | eta 0h0m43s
| epoch 3 | step 1500/4071 | loss 8.0219 | lr 0.00100 | ngrams/sec 35740.4 | eta 0h0m36s
| epoch 3 | step 2000/4071 | loss 8.0305 | lr 0.00100 | ngrams/sec 35887.4 | eta 0h0m29s
| epoch 3 | step 2500/4071 | loss 8.0435 | lr 0.00100 | ngrams/sec 36014.3 | eta 0h0m22s
| epoch 3 | step 3000/4071 | loss 8.0545 | lr 0.00100 | ngrams/sec 36042.6 | eta 0h0m15s
| epoch 3 | step 3500/4071 | loss 8.0673 | lr 0.00100 | ngrams/sec 36102.7 | eta 0h0m8s
| epoch 3 | step 4000/4071 | loss 8.0877 | lr 0.00100 | ngrams/sec 36138.8 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1170.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 59.37s | valid loss  6.47 | valid ppl   645.11
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/4071 | loss 7.9271 | lr 0.00100 | ngrams/sec 25550.2 | eta 0h1m11s
| epoch 4 | step 1000/4071 | loss 7.9538 | lr 0.00100 | ngrams/sec 36073.9 | eta 0h0m43s
| epoch 4 | step 1500/4071 | loss 7.9868 | lr 0.00100 | ngrams/sec 35971.8 | eta 0h0m36s
| epoch 4 | step 2000/4071 | loss 7.9987 | lr 0.00100 | ngrams/sec 35941.9 | eta 0h0m29s
| epoch 4 | step 2500/4071 | loss 8.0162 | lr 0.00100 | ngrams/sec 35922.0 | eta 0h0m22s
| epoch 4 | step 3000/4071 | loss 8.0091 | lr 0.00100 | ngrams/sec 35876.5 | eta 0h0m15s
| epoch 4 | step 3500/4071 | loss 8.0121 | lr 0.00100 | ngrams/sec 35843.6 | eta 0h0m8s
| epoch 4 | step 4000/4071 | loss 8.0340 | lr 0.00100 | ngrams/sec 35883.4 | eta 0h0m1s


 28%|██▊       | 118/417 [00:00<00:00, 1169.18it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 288.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 59.43s | valid loss  6.37 | valid ppl   585.75
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/4071 | loss 7.8497 | lr 0.00100 | ngrams/sec 25274.5 | eta 0h1m12s
| epoch 5 | step 1000/4071 | loss 7.8959 | lr 0.00100 | ngrams/sec 35956.0 | eta 0h0m43s
| epoch 5 | step 1500/4071 | loss 7.9116 | lr 0.00100 | ngrams/sec 35987.0 | eta 0h0m36s
| epoch 5 | step 2000/4071 | loss 7.9182 | lr 0.00100 | ngrams/sec 35991.8 | eta 0h0m29s
| epoch 5 | step 2500/4071 | loss 7.9296 | lr 0.00100 | ngrams/sec 35994.9 | eta 0h0m22s
| epoch 5 | step 3000/4071 | loss 7.9567 | lr 0.00100 | ngrams/sec 36022.5 | eta 0h0m15s
| epoch 5 | step 3500/4071 | loss 7.9735 | lr 0.00100 | ngrams/sec 36010.6 | eta 0h0m8s
| epoch 5 | step 4000/4071 | loss 7.9765 | lr 0.00100 | ngrams/sec 35973.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1176.09it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 59.37s | valid loss  6.33 | valid ppl   560.88
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/4071 | loss 7.8247 | lr 0.00100 | ngrams/sec 25268.8 | eta 0h1m12s
| epoch 6 | step 1000/4071 | loss 7.8581 | lr 0.00100 | ngrams/sec 35829.5 | eta 0h0m43s
| epoch 6 | step 1500/4071 | loss 7.8697 | lr 0.00100 | ngrams/sec 35864.3 | eta 0h0m36s
| epoch 6 | step 2000/4071 | loss 7.8661 | lr 0.00100 | ngrams/sec 35875.1 | eta 0h0m29s
| epoch 6 | step 2500/4071 | loss 7.8479 | lr 0.00100 | ngrams/sec 35864.0 | eta 0h0m22s
| epoch 6 | step 3000/4071 | loss 7.8767 | lr 0.00100 | ngrams/sec 35921.9 | eta 0h0m15s
| epoch 6 | step 3500/4071 | loss 7.8797 | lr 0.00100 | ngrams/sec 35975.4 | eta 0h0m8s
| epoch 6 | step 4000/4071 | loss 7.8815 | lr 0.00100 | ngrams/sec 35942.2 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1148.97it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 59.51s | valid loss  6.51 | valid ppl   671.39
-----------------------------------------------------------------------------------------
| epoch 7 | step 500/4071 | loss 7.7543 | lr 0.00100 | ngrams/sec 25598.6 | eta 0h1m11s
| epoch 7 | step 1000/4071 | loss 7.7932 | lr 0.00100 | ngrams/sec 35959.5 | eta 0h0m43s
| epoch 7 | step 1500/4071 | loss 7.8281 | lr 0.00100 | ngrams/sec 35959.4 | eta 0h0m36s
| epoch 7 | step 2000/4071 | loss 7.8349 | lr 0.00100 | ngrams/sec 35923.1 | eta 0h0m29s
| epoch 7 | step 2500/4071 | loss 7.8751 | lr 0.00100 | ngrams/sec 35899.6 | eta 0h0m22s
| epoch 7 | step 3000/4071 | loss 7.8809 | lr 0.00100 | ngrams/sec 35905.6 | eta 0h0m15s
| epoch 7 | step 3500/4071 | loss 7.9038 | lr 0.00100 | ngrams/sec 35906.7 | eta 0h0m8s
| epoch 7 | step 4000/4071 | loss 7.8988 | lr 0.00100 | ngrams/sec 35923.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1154.27it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 59.48s | valid loss  6.30 | valid ppl   547.06
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 8 | step 500/4071 | loss 7.7423 | lr 0.00100 | ngrams/sec 25297.8 | eta 0h1m12s
| epoch 8 | step 1000/4071 | loss 7.7671 | lr 0.00100 | ngrams/sec 35950.7 | eta 0h0m43s
| epoch 8 | step 1500/4071 | loss 7.7928 | lr 0.00100 | ngrams/sec 35920.1 | eta 0h0m36s
| epoch 8 | step 2000/4071 | loss 7.8129 | lr 0.00100 | ngrams/sec 35965.3 | eta 0h0m29s
| epoch 8 | step 2500/4071 | loss 7.8258 | lr 0.00100 | ngrams/sec 35910.0 | eta 0h0m22s
| epoch 8 | step 3000/4071 | loss 7.8452 | lr 0.00100 | ngrams/sec 35963.5 | eta 0h0m15s
| epoch 8 | step 3500/4071 | loss 7.8464 | lr 0.00100 | ngrams/sec 35971.1 | eta 0h0m8s
| epoch 8 | step 4000/4071 | loss 7.8619 | lr 0.00100 | ngrams/sec 35981.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1169.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 59.42s | valid loss  6.26 | valid ppl   523.57
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/4071 | loss 7.6894 | lr 0.00100 | ngrams/sec 25316.8 | eta 0h1m12s
| epoch 9 | step 1000/4071 | loss 7.7102 | lr 0.00100 | ngrams/sec 35974.7 | eta 0h0m43s
| epoch 9 | step 1500/4071 | loss 7.7028 | lr 0.00100 | ngrams/sec 35986.5 | eta 0h0m36s
| epoch 9 | step 2000/4071 | loss 7.7170 | lr 0.00100 | ngrams/sec 35944.4 | eta 0h0m29s
| epoch 9 | step 2500/4071 | loss 7.7430 | lr 0.00100 | ngrams/sec 35949.9 | eta 0h0m22s
| epoch 9 | step 3000/4071 | loss 7.7756 | lr 0.00100 | ngrams/sec 35961.2 | eta 0h0m15s
| epoch 9 | step 3500/4071 | loss 7.7853 | lr 0.00100 | ngrams/sec 35990.7 | eta 0h0m8s
| epoch 9 | step 4000/4071 | loss 7.8104 | lr 0.00100 | ngrams/sec 35991.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1165.07it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 59.39s | valid loss  6.28 | valid ppl   532.29
-----------------------------------------------------------------------------------------
| epoch 10 | step 500/4071 | loss 7.6345 | lr 0.00100 | ngrams/sec 25621.1 | eta 0h1m11s
| epoch 10 | step 1000/4071 | loss 7.6660 | lr 0.00100 | ngrams/sec 35992.0 | eta 0h0m43s
| epoch 10 | step 1500/4071 | loss 7.6856 | lr 0.00100 | ngrams/sec 35996.3 | eta 0h0m36s
| epoch 10 | step 2000/4071 | loss 7.7036 | lr 0.00100 | ngrams/sec 36002.3 | eta 0h0m29s
| epoch 10 | step 2500/4071 | loss 7.7426 | lr 0.00100 | ngrams/sec 35987.3 | eta 0h0m22s
| epoch 10 | step 3000/4071 | loss 7.7518 | lr 0.00100 | ngrams/sec 36014.4 | eta 0h0m15s
| epoch 10 | step 3500/4071 | loss 7.7669 | lr 0.00100 | ngrams/sec 35989.3 | eta 0h0m8s
| epoch 10 | step 4000/4071 | loss 7.7936 | lr 0.00100 | ngrams/sec 35979.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1161.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.72it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 59.36s | valid loss  6.20 | valid ppl   493.86
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 11 | step 500/4071 | loss 7.6014 | lr 0.00100 | ngrams/sec 25379.0 | eta 0h1m12s
| epoch 11 | step 1000/4071 | loss 7.6444 | lr 0.00100 | ngrams/sec 36001.6 | eta 0h0m43s
| epoch 11 | step 1500/4071 | loss 7.6701 | lr 0.00100 | ngrams/sec 35974.6 | eta 0h0m36s
| epoch 11 | step 2000/4071 | loss 7.6805 | lr 0.00100 | ngrams/sec 36009.4 | eta 0h0m29s
| epoch 11 | step 2500/4071 | loss 7.7000 | lr 0.00100 | ngrams/sec 35954.3 | eta 0h0m22s
| epoch 11 | step 3000/4071 | loss 7.6987 | lr 0.00100 | ngrams/sec 35954.1 | eta 0h0m15s
| epoch 11 | step 3500/4071 | loss 7.7366 | lr 0.00100 | ngrams/sec 35991.7 | eta 0h0m8s
| epoch 11 | step 4000/4071 | loss 7.7490 | lr 0.00100 | ngrams/sec 35939.5 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1163.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 59.38s | valid loss  6.16 | valid ppl   474.88
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 12 | step 500/4071 | loss 7.5557 | lr 0.00100 | ngrams/sec 25312.9 | eta 0h1m12s
| epoch 12 | step 1000/4071 | loss 7.6026 | lr 0.00100 | ngrams/sec 35855.4 | eta 0h0m43s
| epoch 12 | step 1500/4071 | loss 7.6231 | lr 0.00100 | ngrams/sec 35824.3 | eta 0h0m36s
| epoch 12 | step 2000/4071 | loss 7.6284 | lr 0.00100 | ngrams/sec 35922.4 | eta 0h0m29s
| epoch 12 | step 2500/4071 | loss 7.6717 | lr 0.00100 | ngrams/sec 35869.7 | eta 0h0m22s
| epoch 12 | step 3000/4071 | loss 7.6866 | lr 0.00100 | ngrams/sec 35912.2 | eta 0h0m15s
| epoch 12 | step 3500/4071 | loss 7.6955 | lr 0.00100 | ngrams/sec 35896.4 | eta 0h0m8s
| epoch 12 | step 4000/4071 | loss 7.7263 | lr 0.00100 | ngrams/sec 35928.7 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1143.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 59.52s | valid loss  6.14 | valid ppl   463.41
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 13 | step 500/4071 | loss 7.5177 | lr 0.00100 | ngrams/sec 25303.8 | eta 0h1m12s
| epoch 13 | step 1000/4071 | loss 7.5580 | lr 0.00100 | ngrams/sec 35935.1 | eta 0h0m43s
| epoch 13 | step 1500/4071 | loss 7.5789 | lr 0.00100 | ngrams/sec 35915.5 | eta 0h0m36s
| epoch 13 | step 2000/4071 | loss 7.6173 | lr 0.00100 | ngrams/sec 35913.6 | eta 0h0m29s
| epoch 13 | step 2500/4071 | loss 7.6271 | lr 0.00100 | ngrams/sec 35927.2 | eta 0h0m22s
| epoch 13 | step 3000/4071 | loss 7.6276 | lr 0.00100 | ngrams/sec 35919.4 | eta 0h0m15s
| epoch 13 | step 3500/4071 | loss 7.6522 | lr 0.00100 | ngrams/sec 35951.1 | eta 0h0m8s
| epoch 13 | step 4000/4071 | loss 7.6736 | lr 0.00100 | ngrams/sec 35902.5 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1178.44it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.39it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 59.47s | valid loss  6.12 | valid ppl   456.68
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 14 | step 500/4071 | loss 7.4743 | lr 0.00100 | ngrams/sec 25269.8 | eta 0h1m12s
| epoch 14 | step 1000/4071 | loss 7.5111 | lr 0.00100 | ngrams/sec 35934.9 | eta 0h0m43s
| epoch 14 | step 1500/4071 | loss 7.5456 | lr 0.00100 | ngrams/sec 35866.6 | eta 0h0m36s
| epoch 14 | step 2000/4071 | loss 7.5638 | lr 0.00100 | ngrams/sec 35952.7 | eta 0h0m29s
| epoch 14 | step 2500/4071 | loss 7.5826 | lr 0.00100 | ngrams/sec 35929.3 | eta 0h0m22s
| epoch 14 | step 3000/4071 | loss 7.6097 | lr 0.00100 | ngrams/sec 35969.7 | eta 0h0m15s
| epoch 14 | step 3500/4071 | loss 7.6045 | lr 0.00100 | ngrams/sec 35987.3 | eta 0h0m8s
| epoch 14 | step 4000/4071 | loss 7.6257 | lr 0.00100 | ngrams/sec 35994.4 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1171.62it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 59.43s | valid loss  6.10 | valid ppl   444.60
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/4071 | loss 7.4304 | lr 0.00100 | ngrams/sec 25393.0 | eta 0h1m12s
| epoch 15 | step 1000/4071 | loss 7.4680 | lr 0.00100 | ngrams/sec 36026.1 | eta 0h0m43s
| epoch 15 | step 1500/4071 | loss 7.5066 | lr 0.00100 | ngrams/sec 35994.1 | eta 0h0m36s
| epoch 15 | step 2000/4071 | loss 7.5222 | lr 0.00100 | ngrams/sec 36030.7 | eta 0h0m29s
| epoch 15 | step 2500/4071 | loss 7.5427 | lr 0.00100 | ngrams/sec 35991.8 | eta 0h0m22s
| epoch 15 | step 3000/4071 | loss 7.5604 | lr 0.00100 | ngrams/sec 35981.4 | eta 0h0m15s
| epoch 15 | step 3500/4071 | loss 7.5899 | lr 0.00100 | ngrams/sec 35975.2 | eta 0h0m8s
| epoch 15 | step 4000/4071 | loss 7.5984 | lr 0.00100 | ngrams/sec 35941.3 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1152.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 290.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 59.35s | valid loss  6.09 | valid ppl   439.54
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/4071 | loss 7.3993 | lr 0.00100 | ngrams/sec 25319.4 | eta 0h1m12s
| epoch 16 | step 1000/4071 | loss 7.4505 | lr 0.00100 | ngrams/sec 35949.5 | eta 0h0m43s
| epoch 16 | step 1500/4071 | loss 7.4641 | lr 0.00100 | ngrams/sec 35949.1 | eta 0h0m36s
| epoch 16 | step 2000/4071 | loss 7.4775 | lr 0.00100 | ngrams/sec 35959.1 | eta 0h0m29s
| epoch 16 | step 2500/4071 | loss 7.5081 | lr 0.00100 | ngrams/sec 35964.4 | eta 0h0m22s
| epoch 16 | step 3000/4071 | loss 7.5269 | lr 0.00100 | ngrams/sec 35946.3 | eta 0h0m15s
| epoch 16 | step 3500/4071 | loss 7.5506 | lr 0.00100 | ngrams/sec 35937.0 | eta 0h0m8s
| epoch 16 | step 4000/4071 | loss 7.5526 | lr 0.00100 | ngrams/sec 35974.6 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1144.26it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 289.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 59.42s | valid loss  6.08 | valid ppl   437.20
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 17 | step 500/4071 | loss 7.3660 | lr 0.00100 | ngrams/sec 25305.1 | eta 0h1m12s
| epoch 17 | step 1000/4071 | loss 7.3984 | lr 0.00100 | ngrams/sec 35952.6 | eta 0h0m43s
| epoch 17 | step 1500/4071 | loss 7.4377 | lr 0.00100 | ngrams/sec 35920.7 | eta 0h0m36s
| epoch 17 | step 2000/4071 | loss 7.4584 | lr 0.00100 | ngrams/sec 35830.1 | eta 0h0m29s
| epoch 17 | step 2500/4071 | loss 7.4751 | lr 0.00100 | ngrams/sec 35917.4 | eta 0h0m22s
| epoch 17 | step 3000/4071 | loss 7.5021 | lr 0.00100 | ngrams/sec 35969.8 | eta 0h0m15s
| epoch 17 | step 3500/4071 | loss 7.5117 | lr 0.00100 | ngrams/sec 36002.5 | eta 0h0m8s
| epoch 17 | step 4000/4071 | loss 7.5132 | lr 0.00100 | ngrams/sec 35985.7 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1174.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 59.43s | valid loss  6.08 | valid ppl   436.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/4071 | loss 7.3295 | lr 0.00100 | ngrams/sec 25384.1 | eta 0h1m12s
| epoch 18 | step 1000/4071 | loss 7.3595 | lr 0.00100 | ngrams/sec 36011.3 | eta 0h0m43s
| epoch 18 | step 1500/4071 | loss 7.4077 | lr 0.00100 | ngrams/sec 36014.8 | eta 0h0m36s
| epoch 18 | step 2000/4071 | loss 7.4250 | lr 0.00100 | ngrams/sec 36042.3 | eta 0h0m29s
| epoch 18 | step 2500/4071 | loss 7.4353 | lr 0.00100 | ngrams/sec 36057.6 | eta 0h0m22s
| epoch 18 | step 3000/4071 | loss 7.4591 | lr 0.00100 | ngrams/sec 36016.9 | eta 0h0m15s
| epoch 18 | step 3500/4071 | loss 7.4841 | lr 0.00100 | ngrams/sec 36017.2 | eta 0h0m8s
| epoch 18 | step 4000/4071 | loss 7.4923 | lr 0.00100 | ngrams/sec 36000.8 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1171.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.23it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 59.30s | valid loss  6.06 | valid ppl   428.89
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/4071 | loss 7.3047 | lr 0.00100 | ngrams/sec 25364.9 | eta 0h1m12s
| epoch 19 | step 1000/4071 | loss 7.3405 | lr 0.00100 | ngrams/sec 35816.4 | eta 0h0m43s
| epoch 19 | step 1500/4071 | loss 7.3789 | lr 0.00100 | ngrams/sec 35994.6 | eta 0h0m36s
| epoch 19 | step 2000/4071 | loss 7.3790 | lr 0.00100 | ngrams/sec 36021.2 | eta 0h0m29s
| epoch 19 | step 2500/4071 | loss 7.4103 | lr 0.00100 | ngrams/sec 36000.6 | eta 0h0m22s
| epoch 19 | step 3000/4071 | loss 7.4296 | lr 0.00100 | ngrams/sec 36017.9 | eta 0h0m15s
| epoch 19 | step 3500/4071 | loss 7.4369 | lr 0.00100 | ngrams/sec 36027.6 | eta 0h0m8s
| epoch 19 | step 4000/4071 | loss 7.4647 | lr 0.00100 | ngrams/sec 36011.3 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1185.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.86it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 59.36s | valid loss  6.06 | valid ppl   428.46
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 20 | step 500/4071 | loss 7.2643 | lr 0.00100 | ngrams/sec 25370.1 | eta 0h1m12s
| epoch 20 | step 1000/4071 | loss 7.3065 | lr 0.00100 | ngrams/sec 35964.3 | eta 0h0m43s
| epoch 20 | step 1500/4071 | loss 7.3336 | lr 0.00100 | ngrams/sec 35999.7 | eta 0h0m36s
| epoch 20 | step 2000/4071 | loss 7.3605 | lr 0.00100 | ngrams/sec 35996.9 | eta 0h0m29s
| epoch 20 | step 2500/4071 | loss 7.3943 | lr 0.00100 | ngrams/sec 36010.0 | eta 0h0m22s
| epoch 20 | step 3000/4071 | loss 7.4177 | lr 0.00100 | ngrams/sec 36027.2 | eta 0h0m15s
| epoch 20 | step 3500/4071 | loss 7.4155 | lr 0.00100 | ngrams/sec 36018.1 | eta 0h0m8s
| epoch 20 | step 4000/4071 | loss 7.4439 | lr 0.00100 | ngrams/sec 36031.5 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1167.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 59.33s | valid loss  6.05 | valid ppl   423.68
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/4071 | loss 7.2456 | lr 0.00100 | ngrams/sec 25386.4 | eta 0h1m12s
| epoch 21 | step 1000/4071 | loss 7.2787 | lr 0.00100 | ngrams/sec 36001.8 | eta 0h0m43s
| epoch 21 | step 1500/4071 | loss 7.3114 | lr 0.00100 | ngrams/sec 36037.1 | eta 0h0m36s
| epoch 21 | step 2000/4071 | loss 7.3214 | lr 0.00100 | ngrams/sec 36014.2 | eta 0h0m29s
| epoch 21 | step 2500/4071 | loss 7.3638 | lr 0.00100 | ngrams/sec 35995.3 | eta 0h0m22s
| epoch 21 | step 3000/4071 | loss 7.3742 | lr 0.00100 | ngrams/sec 36015.5 | eta 0h0m15s
| epoch 21 | step 3500/4071 | loss 7.3955 | lr 0.00100 | ngrams/sec 36015.0 | eta 0h0m8s
| epoch 21 | step 4000/4071 | loss 7.4090 | lr 0.00100 | ngrams/sec 36007.8 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1162.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 59.31s | valid loss  6.05 | valid ppl   422.61
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 22 | step 500/4071 | loss 7.2232 | lr 0.00100 | ngrams/sec 25391.8 | eta 0h1m12s
| epoch 22 | step 1000/4071 | loss 7.2561 | lr 0.00100 | ngrams/sec 36042.3 | eta 0h0m43s
| epoch 22 | step 1500/4071 | loss 7.2838 | lr 0.00100 | ngrams/sec 35993.0 | eta 0h0m36s
| epoch 22 | step 2000/4071 | loss 7.3027 | lr 0.00100 | ngrams/sec 35993.3 | eta 0h0m29s
| epoch 22 | step 2500/4071 | loss 7.3423 | lr 0.00100 | ngrams/sec 35930.7 | eta 0h0m22s
| epoch 22 | step 3000/4071 | loss 7.3565 | lr 0.00100 | ngrams/sec 35928.3 | eta 0h0m15s
| epoch 22 | step 3500/4071 | loss 7.3484 | lr 0.00100 | ngrams/sec 36007.3 | eta 0h0m8s
| epoch 22 | step 4000/4071 | loss 7.3926 | lr 0.00100 | ngrams/sec 36002.4 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1180.69it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 59.35s | valid loss  6.05 | valid ppl   425.28
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/4071 | loss 7.1919 | lr 0.00100 | ngrams/sec 25677.0 | eta 0h1m11s
| epoch 23 | step 1000/4071 | loss 7.2466 | lr 0.00100 | ngrams/sec 36045.2 | eta 0h0m43s
| epoch 23 | step 1500/4071 | loss 7.2711 | lr 0.00100 | ngrams/sec 36052.6 | eta 0h0m36s
| epoch 23 | step 2000/4071 | loss 7.2843 | lr 0.00100 | ngrams/sec 36042.5 | eta 0h0m29s
| epoch 23 | step 2500/4071 | loss 7.2977 | lr 0.00100 | ngrams/sec 36039.3 | eta 0h0m22s
| epoch 23 | step 3000/4071 | loss 7.3342 | lr 0.00100 | ngrams/sec 36011.4 | eta 0h0m15s
| epoch 23 | step 3500/4071 | loss 7.3417 | lr 0.00100 | ngrams/sec 36048.2 | eta 0h0m8s
| epoch 23 | step 4000/4071 | loss 7.3634 | lr 0.00100 | ngrams/sec 36083.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1145.71it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 59.27s | valid loss  6.04 | valid ppl   418.51
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 24 | step 500/4071 | loss 7.1665 | lr 0.00100 | ngrams/sec 25414.3 | eta 0h1m11s
| epoch 24 | step 1000/4071 | loss 7.1971 | lr 0.00100 | ngrams/sec 36011.2 | eta 0h0m43s
| epoch 24 | step 1500/4071 | loss 7.2354 | lr 0.00100 | ngrams/sec 35984.1 | eta 0h0m36s
| epoch 24 | step 2000/4071 | loss 7.2544 | lr 0.00100 | ngrams/sec 36019.1 | eta 0h0m29s
| epoch 24 | step 2500/4071 | loss 7.2792 | lr 0.00100 | ngrams/sec 36005.2 | eta 0h0m22s
| epoch 24 | step 3000/4071 | loss 7.3023 | lr 0.00100 | ngrams/sec 36046.1 | eta 0h0m15s
| epoch 24 | step 3500/4071 | loss 7.3194 | lr 0.00100 | ngrams/sec 35694.6 | eta 0h0m8s
| epoch 24 | step 4000/4071 | loss 7.3300 | lr 0.00100 | ngrams/sec 36048.8 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1168.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.86it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 59.36s | valid loss  6.03 | valid ppl   417.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 25 | step 500/4071 | loss 7.1409 | lr 0.00100 | ngrams/sec 25417.9 | eta 0h1m11s
| epoch 25 | step 1000/4071 | loss 7.1870 | lr 0.00100 | ngrams/sec 36085.4 | eta 0h0m43s
| epoch 25 | step 1500/4071 | loss 7.2028 | lr 0.00100 | ngrams/sec 36014.8 | eta 0h0m36s
| epoch 25 | step 2000/4071 | loss 7.2307 | lr 0.00100 | ngrams/sec 36055.0 | eta 0h0m29s
| epoch 25 | step 2500/4071 | loss 7.2538 | lr 0.00100 | ngrams/sec 36049.0 | eta 0h0m22s
| epoch 25 | step 3000/4071 | loss 7.2877 | lr 0.00100 | ngrams/sec 36055.1 | eta 0h0m15s
| epoch 25 | step 3500/4071 | loss 7.2948 | lr 0.00100 | ngrams/sec 36026.9 | eta 0h0m8s
| epoch 25 | step 4000/4071 | loss 7.3071 | lr 0.00100 | ngrams/sec 36064.3 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1154.42it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 59.25s | valid loss  6.03 | valid ppl   417.05
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/4071 | loss 7.1196 | lr 0.00100 | ngrams/sec 25429.4 | eta 0h1m11s
| epoch 26 | step 1000/4071 | loss 7.1619 | lr 0.00100 | ngrams/sec 36072.4 | eta 0h0m43s
| epoch 26 | step 1500/4071 | loss 7.1895 | lr 0.00100 | ngrams/sec 36076.3 | eta 0h0m36s
| epoch 26 | step 2000/4071 | loss 7.2036 | lr 0.00100 | ngrams/sec 36075.5 | eta 0h0m29s
| epoch 26 | step 2500/4071 | loss 7.2175 | lr 0.00100 | ngrams/sec 36062.9 | eta 0h0m22s
| epoch 26 | step 3000/4071 | loss 7.2386 | lr 0.00100 | ngrams/sec 36052.0 | eta 0h0m15s
| epoch 26 | step 3500/4071 | loss 7.2598 | lr 0.00100 | ngrams/sec 36075.9 | eta 0h0m8s
| epoch 26 | step 4000/4071 | loss 7.2827 | lr 0.00100 | ngrams/sec 36094.2 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1152.08it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 59.22s | valid loss  6.04 | valid ppl   418.82
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/4071 | loss 7.1000 | lr 0.00100 | ngrams/sec 25680.8 | eta 0h1m11s
| epoch 27 | step 1000/4071 | loss 7.1480 | lr 0.00100 | ngrams/sec 36032.5 | eta 0h0m43s
| epoch 27 | step 1500/4071 | loss 7.1569 | lr 0.00100 | ngrams/sec 36065.9 | eta 0h0m36s
| epoch 27 | step 2000/4071 | loss 7.1879 | lr 0.00100 | ngrams/sec 36084.5 | eta 0h0m29s
| epoch 27 | step 2500/4071 | loss 7.1949 | lr 0.00100 | ngrams/sec 36031.8 | eta 0h0m22s
| epoch 27 | step 3000/4071 | loss 7.2296 | lr 0.00100 | ngrams/sec 35994.5 | eta 0h0m15s
| epoch 27 | step 3500/4071 | loss 7.2445 | lr 0.00100 | ngrams/sec 35995.7 | eta 0h0m8s
| epoch 27 | step 4000/4071 | loss 7.2679 | lr 0.00100 | ngrams/sec 36041.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1160.94it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 59.28s | valid loss  6.03 | valid ppl   413.75
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/4071 | loss 7.0660 | lr 0.00100 | ngrams/sec 25412.7 | eta 0h1m11s
| epoch 28 | step 1000/4071 | loss 7.1178 | lr 0.00100 | ngrams/sec 36047.7 | eta 0h0m43s
| epoch 28 | step 1500/4071 | loss 7.1291 | lr 0.00100 | ngrams/sec 36041.3 | eta 0h0m36s
| epoch 28 | step 2000/4071 | loss 7.1778 | lr 0.00100 | ngrams/sec 36036.6 | eta 0h0m29s
| epoch 28 | step 2500/4071 | loss 7.1870 | lr 0.00100 | ngrams/sec 36052.0 | eta 0h0m22s
| epoch 28 | step 3000/4071 | loss 7.2137 | lr 0.00100 | ngrams/sec 36059.1 | eta 0h0m15s
| epoch 28 | step 3500/4071 | loss 7.2509 | lr 0.00100 | ngrams/sec 36063.3 | eta 0h0m8s
| epoch 28 | step 4000/4071 | loss 7.2426 | lr 0.00100 | ngrams/sec 36082.4 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1170.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 59.25s | valid loss  6.02 | valid ppl   413.16
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 29 | step 500/4071 | loss 7.0571 | lr 0.00100 | ngrams/sec 25422.2 | eta 0h1m11s
| epoch 29 | step 1000/4071 | loss 7.1036 | lr 0.00100 | ngrams/sec 36042.6 | eta 0h0m43s
| epoch 29 | step 1500/4071 | loss 7.1304 | lr 0.00100 | ngrams/sec 36046.6 | eta 0h0m36s
| epoch 29 | step 2000/4071 | loss 7.1470 | lr 0.00100 | ngrams/sec 36063.6 | eta 0h0m29s
| epoch 29 | step 2500/4071 | loss 7.1708 | lr 0.00100 | ngrams/sec 36068.8 | eta 0h0m22s
| epoch 29 | step 3000/4071 | loss 7.1981 | lr 0.00100 | ngrams/sec 36029.8 | eta 0h0m15s
| epoch 29 | step 3500/4071 | loss 7.2207 | lr 0.00100 | ngrams/sec 36062.2 | eta 0h0m8s
| epoch 29 | step 4000/4071 | loss 7.2157 | lr 0.00100 | ngrams/sec 36068.6 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1181.21it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 59.24s | valid loss  6.03 | valid ppl   413.97
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/4071 | loss 7.0256 | lr 0.00100 | ngrams/sec 25724.8 | eta 0h1m11s
| epoch 30 | step 1000/4071 | loss 7.0736 | lr 0.00100 | ngrams/sec 36091.1 | eta 0h0m43s
| epoch 30 | step 1500/4071 | loss 7.1148 | lr 0.00100 | ngrams/sec 36063.8 | eta 0h0m36s
| epoch 30 | step 2000/4071 | loss 7.1254 | lr 0.00100 | ngrams/sec 36043.0 | eta 0h0m29s
| epoch 30 | step 2500/4071 | loss 7.1616 | lr 0.00100 | ngrams/sec 36066.4 | eta 0h0m22s
| epoch 30 | step 3000/4071 | loss 7.1714 | lr 0.00100 | ngrams/sec 36054.2 | eta 0h0m15s
| epoch 30 | step 3500/4071 | loss 7.1860 | lr 0.00100 | ngrams/sec 36049.7 | eta 0h0m8s
| epoch 30 | step 4000/4071 | loss 7.1999 | lr 0.00100 | ngrams/sec 36054.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1168.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 59.23s | valid loss  6.03 | valid ppl   415.43
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/4071 | loss 7.0055 | lr 0.00100 | ngrams/sec 25709.3 | eta 0h1m11s
| epoch 31 | step 1000/4071 | loss 7.0488 | lr 0.00100 | ngrams/sec 36060.0 | eta 0h0m43s
| epoch 31 | step 1500/4071 | loss 7.0804 | lr 0.00100 | ngrams/sec 36080.4 | eta 0h0m36s
| epoch 31 | step 2000/4071 | loss 7.1036 | lr 0.00100 | ngrams/sec 36071.5 | eta 0h0m29s
| epoch 31 | step 2500/4071 | loss 7.1355 | lr 0.00100 | ngrams/sec 36078.6 | eta 0h0m22s
| epoch 31 | step 3000/4071 | loss 7.1659 | lr 0.00100 | ngrams/sec 36015.7 | eta 0h0m15s
| epoch 31 | step 3500/4071 | loss 7.1694 | lr 0.00100 | ngrams/sec 36052.4 | eta 0h0m8s
| epoch 31 | step 4000/4071 | loss 7.1796 | lr 0.00100 | ngrams/sec 36015.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1166.12it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 59.25s | valid loss  6.03 | valid ppl   415.65
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/4071 | loss 6.9939 | lr 0.00100 | ngrams/sec 25710.6 | eta 0h1m11s
| epoch 32 | step 1000/4071 | loss 7.0357 | lr 0.00100 | ngrams/sec 36069.0 | eta 0h0m43s
| epoch 32 | step 1500/4071 | loss 7.0714 | lr 0.00100 | ngrams/sec 36035.7 | eta 0h0m36s
| epoch 32 | step 2000/4071 | loss 7.0834 | lr 0.00100 | ngrams/sec 36033.2 | eta 0h0m29s
| epoch 32 | step 2500/4071 | loss 7.1168 | lr 0.00100 | ngrams/sec 36038.4 | eta 0h0m22s
| epoch 32 | step 3000/4071 | loss 7.1440 | lr 0.00100 | ngrams/sec 36016.8 | eta 0h0m15s
| epoch 32 | step 3500/4071 | loss 7.1414 | lr 0.00100 | ngrams/sec 36033.9 | eta 0h0m8s
| epoch 32 | step 4000/4071 | loss 7.1827 | lr 0.00100 | ngrams/sec 36036.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1161.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 59.27s | valid loss  6.03 | valid ppl   414.29
-----------------------------------------------------------------------------------------
| epoch 33 | step 500/4071 | loss 6.9876 | lr 0.00100 | ngrams/sec 25702.5 | eta 0h1m11s
| epoch 33 | step 1000/4071 | loss 7.0165 | lr 0.00100 | ngrams/sec 36037.7 | eta 0h0m43s
| epoch 33 | step 1500/4071 | loss 7.0439 | lr 0.00100 | ngrams/sec 36074.3 | eta 0h0m36s
| epoch 33 | step 2000/4071 | loss 7.0836 | lr 0.00100 | ngrams/sec 36061.3 | eta 0h0m29s
| epoch 33 | step 2500/4071 | loss 7.0840 | lr 0.00100 | ngrams/sec 36082.8 | eta 0h0m22s
| epoch 33 | step 3000/4071 | loss 7.1225 | lr 0.00100 | ngrams/sec 36011.3 | eta 0h0m15s
| epoch 33 | step 3500/4071 | loss 7.1313 | lr 0.00100 | ngrams/sec 35976.2 | eta 0h0m8s
| epoch 33 | step 4000/4071 | loss 7.1511 | lr 0.00100 | ngrams/sec 36029.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1154.98it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.56it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 59.28s | valid loss  6.03 | valid ppl   413.80
-----------------------------------------------------------------------------------------
| epoch 34 | step 500/4071 | loss 6.9762 | lr 0.00100 | ngrams/sec 25652.0 | eta 0h1m11s
| epoch 34 | step 1000/4071 | loss 6.9977 | lr 0.00100 | ngrams/sec 36023.3 | eta 0h0m43s
| epoch 34 | step 1500/4071 | loss 7.0342 | lr 0.00100 | ngrams/sec 36026.7 | eta 0h0m36s
| epoch 34 | step 2000/4071 | loss 7.0598 | lr 0.00100 | ngrams/sec 35976.9 | eta 0h0m29s
| epoch 34 | step 2500/4071 | loss 7.0976 | lr 0.00100 | ngrams/sec 36033.4 | eta 0h0m22s
| epoch 34 | step 3000/4071 | loss 7.1237 | lr 0.00100 | ngrams/sec 36006.0 | eta 0h0m15s
| epoch 34 | step 3500/4071 | loss 7.1229 | lr 0.00100 | ngrams/sec 35996.4 | eta 0h0m8s
| epoch 34 | step 4000/4071 | loss 7.1430 | lr 0.00100 | ngrams/sec 36027.3 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1151.95it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 59.32s | valid loss  6.03 | valid ppl   413.78
-----------------------------------------------------------------------------------------
| epoch 35 | step 500/4071 | loss 6.9297 | lr 0.00100 | ngrams/sec 25673.4 | eta 0h1m11s
| epoch 35 | step 1000/4071 | loss 6.9917 | lr 0.00100 | ngrams/sec 36039.2 | eta 0h0m43s
| epoch 35 | step 1500/4071 | loss 7.0171 | lr 0.00100 | ngrams/sec 36030.7 | eta 0h0m36s
| epoch 35 | step 2000/4071 | loss 7.0481 | lr 0.00100 | ngrams/sec 35992.7 | eta 0h0m29s
| epoch 35 | step 2500/4071 | loss 7.0556 | lr 0.00100 | ngrams/sec 36023.3 | eta 0h0m22s
| epoch 35 | step 3000/4071 | loss 7.0876 | lr 0.00100 | ngrams/sec 36042.4 | eta 0h0m15s
| epoch 35 | step 3500/4071 | loss 7.1119 | lr 0.00100 | ngrams/sec 36026.5 | eta 0h0m8s
| epoch 35 | step 4000/4071 | loss 7.1092 | lr 0.00100 | ngrams/sec 36016.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1171.05it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 59.30s | valid loss  6.02 | valid ppl   412.31
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/4071 | loss 6.9341 | lr 0.00100 | ngrams/sec 25380.3 | eta 0h1m12s
| epoch 36 | step 1000/4071 | loss 6.9764 | lr 0.00100 | ngrams/sec 35943.1 | eta 0h0m43s
| epoch 36 | step 1500/4071 | loss 7.0045 | lr 0.00100 | ngrams/sec 35952.2 | eta 0h0m36s
| epoch 36 | step 2000/4071 | loss 7.0137 | lr 0.00100 | ngrams/sec 35943.7 | eta 0h0m29s
| epoch 36 | step 2500/4071 | loss 7.0578 | lr 0.00100 | ngrams/sec 35918.2 | eta 0h0m22s
| epoch 36 | step 3000/4071 | loss 7.0791 | lr 0.00100 | ngrams/sec 35941.0 | eta 0h0m15s
| epoch 36 | step 3500/4071 | loss 7.0862 | lr 0.00100 | ngrams/sec 35896.7 | eta 0h0m8s
| epoch 36 | step 4000/4071 | loss 7.1062 | lr 0.00100 | ngrams/sec 35957.3 | eta 0

 29%|██▊       | 119/417 [00:00<00:00, 1186.01it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 59.42s | valid loss  6.03 | valid ppl   414.47
-----------------------------------------------------------------------------------------
| epoch 37 | step 500/4071 | loss 6.9150 | lr 0.00100 | ngrams/sec 25687.6 | eta 0h1m11s
| epoch 37 | step 1000/4071 | loss 6.9693 | lr 0.00100 | ngrams/sec 35989.2 | eta 0h0m43s
| epoch 37 | step 1500/4071 | loss 6.9872 | lr 0.00100 | ngrams/sec 36017.7 | eta 0h0m36s
| epoch 37 | step 2000/4071 | loss 7.0095 | lr 0.00100 | ngrams/sec 36013.1 | eta 0h0m29s
| epoch 37 | step 2500/4071 | loss 7.0476 | lr 0.00100 | ngrams/sec 35989.6 | eta 0h0m22s
| epoch 37 | step 3000/4071 | loss 7.0566 | lr 0.00100 | ngrams/sec 35948.2 | eta 0h0m15s
| epoch 37 | step 3500/4071 | loss 7.0725 | lr 0.00100 | ngrams/sec 35950.5 | eta 0h0m8s
| epoch 37 | step 4000/4071 | loss 7.0842 | lr 0.00100 | ngrams/sec 35957.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1182.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 291.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 59.36s | valid loss  6.03 | valid ppl   417.17
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/4071 | loss 6.8995 | lr 0.00100 | ngrams/sec 25606.7 | eta 0h1m11s
| epoch 38 | step 1000/4071 | loss 6.9325 | lr 0.00100 | ngrams/sec 35811.8 | eta 0h0m43s
| epoch 38 | step 1500/4071 | loss 6.9794 | lr 0.00100 | ngrams/sec 35889.8 | eta 0h0m36s
| epoch 38 | step 2000/4071 | loss 7.0014 | lr 0.00100 | ngrams/sec 35713.7 | eta 0h0m29s
| epoch 38 | step 2500/4071 | loss 7.0309 | lr 0.00100 | ngrams/sec 35888.7 | eta 0h0m22s
| epoch 38 | step 3000/4071 | loss 7.0435 | lr 0.00100 | ngrams/sec 35854.4 | eta 0h0m15s
| epoch 38 | step 3500/4071 | loss 7.0513 | lr 0.00100 | ngrams/sec 35866.4 | eta 0h0m8s
| epoch 38 | step 4000/4071 | loss 7.0737 | lr 0.00100 | ngrams/sec 35943.4 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1163.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 59.56s | valid loss  6.03 | valid ppl   417.02
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/4071 | loss 6.8857 | lr 0.00100 | ngrams/sec 25638.6 | eta 0h1m11s
| epoch 39 | step 1000/4071 | loss 6.9277 | lr 0.00100 | ngrams/sec 35967.9 | eta 0h0m43s
| epoch 39 | step 1500/4071 | loss 6.9630 | lr 0.00100 | ngrams/sec 35976.9 | eta 0h0m36s
| epoch 39 | step 2000/4071 | loss 6.9931 | lr 0.00100 | ngrams/sec 36006.2 | eta 0h0m29s
| epoch 39 | step 2500/4071 | loss 7.0152 | lr 0.00100 | ngrams/sec 35983.4 | eta 0h0m22s
| epoch 39 | step 3000/4071 | loss 7.0247 | lr 0.00100 | ngrams/sec 36000.1 | eta 0h0m15s
| epoch 39 | step 3500/4071 | loss 7.0535 | lr 0.00100 | ngrams/sec 36035.9 | eta 0h0m8s
| epoch 39 | step 4000/4071 | loss 7.0568 | lr 0.00100 | ngrams/sec 36048.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1148.60it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 59.34s | valid loss  6.03 | valid ppl   417.35
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/4071 | loss 6.8699 | lr 0.00100 | ngrams/sec 25681.5 | eta 0h1m11s
| epoch 40 | step 1000/4071 | loss 6.9160 | lr 0.00100 | ngrams/sec 36020.1 | eta 0h0m43s
| epoch 40 | step 1500/4071 | loss 6.9510 | lr 0.00100 | ngrams/sec 36077.9 | eta 0h0m36s
| epoch 40 | step 2000/4071 | loss 6.9865 | lr 0.00100 | ngrams/sec 36029.6 | eta 0h0m29s
| epoch 40 | step 2500/4071 | loss 6.9838 | lr 0.00100 | ngrams/sec 36019.0 | eta 0h0m22s
| epoch 40 | step 3000/4071 | loss 7.0114 | lr 0.00100 | ngrams/sec 36034.5 | eta 0h0m15s
| epoch 40 | step 3500/4071 | loss 7.0185 | lr 0.00100 | ngrams/sec 36053.0 | eta 0h0m8s
| epoch 40 | step 4000/4071 | loss 7.0623 | lr 0.00100 | ngrams/sec 36058.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1183.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 59.27s | valid loss  6.03 | valid ppl   416.75
-----------------------------------------------------------------------------------------
| epoch 41 | step 500/4071 | loss 6.8582 | lr 0.00100 | ngrams/sec 25715.8 | eta 0h1m11s
| epoch 41 | step 1000/4071 | loss 6.9044 | lr 0.00100 | ngrams/sec 36066.4 | eta 0h0m43s
| epoch 41 | step 1500/4071 | loss 6.9299 | lr 0.00100 | ngrams/sec 36008.2 | eta 0h0m36s
| epoch 41 | step 2000/4071 | loss 6.9505 | lr 0.00100 | ngrams/sec 36045.7 | eta 0h0m29s
| epoch 41 | step 2500/4071 | loss 6.9736 | lr 0.00100 | ngrams/sec 36032.5 | eta 0h0m22s
| epoch 41 | step 3000/4071 | loss 6.9970 | lr 0.00100 | ngrams/sec 35981.3 | eta 0h0m15s
| epoch 41 | step 3500/4071 | loss 7.0079 | lr 0.00100 | ngrams/sec 35995.7 | eta 0h0m8s
| epoch 41 | step 4000/4071 | loss 7.0440 | lr 0.00100 | ngrams/sec 36017.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1181.19it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 59.29s | valid loss  6.03 | valid ppl   416.39
-----------------------------------------------------------------------------------------
| epoch 42 | step 500/4071 | loss 6.8550 | lr 0.00100 | ngrams/sec 25704.4 | eta 0h1m11s
| epoch 42 | step 1000/4071 | loss 6.8916 | lr 0.00100 | ngrams/sec 36049.0 | eta 0h0m43s
| epoch 42 | step 1500/4071 | loss 6.9267 | lr 0.00100 | ngrams/sec 36009.6 | eta 0h0m36s
| epoch 42 | step 2000/4071 | loss 6.9370 | lr 0.00100 | ngrams/sec 36027.4 | eta 0h0m29s
| epoch 42 | step 2500/4071 | loss 6.9627 | lr 0.00100 | ngrams/sec 36055.1 | eta 0h0m22s
| epoch 42 | step 3000/4071 | loss 6.9800 | lr 0.00100 | ngrams/sec 36022.2 | eta 0h0m15s
| epoch 42 | step 3500/4071 | loss 7.0040 | lr 0.00100 | ngrams/sec 36022.5 | eta 0h0m8s
| epoch 42 | step 4000/4071 | loss 7.0262 | lr 0.00100 | ngrams/sec 36008.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1184.24it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.67it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 59.29s | valid loss  6.04 | valid ppl   420.89
-----------------------------------------------------------------------------------------
| epoch 43 | step 500/4071 | loss 6.8311 | lr 0.00100 | ngrams/sec 25706.4 | eta 0h1m11s
| epoch 43 | step 1000/4071 | loss 6.8803 | lr 0.00100 | ngrams/sec 35966.2 | eta 0h0m43s
| epoch 43 | step 1500/4071 | loss 6.9077 | lr 0.00100 | ngrams/sec 35928.6 | eta 0h0m36s
| epoch 43 | step 2000/4071 | loss 6.9388 | lr 0.00100 | ngrams/sec 35950.6 | eta 0h0m29s
| epoch 43 | step 2500/4071 | loss 6.9607 | lr 0.00100 | ngrams/sec 36049.7 | eta 0h0m22s
| epoch 43 | step 3000/4071 | loss 6.9664 | lr 0.00100 | ngrams/sec 36031.0 | eta 0h0m15s
| epoch 43 | step 3500/4071 | loss 6.9923 | lr 0.00100 | ngrams/sec 36039.9 | eta 0h0m8s
| epoch 43 | step 4000/4071 | loss 7.0119 | lr 0.00100 | ngrams/sec 36030.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1157.38it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 59.33s | valid loss  6.04 | valid ppl   420.39
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/4071 | loss 6.8269 | lr 0.00100 | ngrams/sec 25714.4 | eta 0h1m11s
| epoch 44 | step 1000/4071 | loss 6.8593 | lr 0.00100 | ngrams/sec 36043.7 | eta 0h0m43s
| epoch 44 | step 1500/4071 | loss 6.8941 | lr 0.00100 | ngrams/sec 36059.2 | eta 0h0m36s
| epoch 44 | step 2000/4071 | loss 6.9198 | lr 0.00100 | ngrams/sec 36023.9 | eta 0h0m29s
| epoch 44 | step 2500/4071 | loss 6.9433 | lr 0.00100 | ngrams/sec 36033.0 | eta 0h0m22s
| epoch 44 | step 3000/4071 | loss 6.9596 | lr 0.00100 | ngrams/sec 36017.1 | eta 0h0m15s
| epoch 44 | step 3500/4071 | loss 6.9830 | lr 0.00100 | ngrams/sec 36069.2 | eta 0h0m8s
| epoch 44 | step 4000/4071 | loss 7.0002 | lr 0.00100 | ngrams/sec 36075.3 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1155.31it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 59.26s | valid loss  6.04 | valid ppl   421.92
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/4071 | loss 6.8229 | lr 0.00100 | ngrams/sec 25744.4 | eta 0h1m11s
| epoch 45 | step 1000/4071 | loss 6.8637 | lr 0.00100 | ngrams/sec 36075.5 | eta 0h0m43s
| epoch 45 | step 1500/4071 | loss 6.8905 | lr 0.00100 | ngrams/sec 36095.1 | eta 0h0m36s
| epoch 45 | step 2000/4071 | loss 6.9094 | lr 0.00100 | ngrams/sec 36083.8 | eta 0h0m29s
| epoch 45 | step 2500/4071 | loss 6.9319 | lr 0.00100 | ngrams/sec 36108.6 | eta 0h0m22s
| epoch 45 | step 3000/4071 | loss 6.9533 | lr 0.00100 | ngrams/sec 36104.4 | eta 0h0m15s
| epoch 45 | step 3500/4071 | loss 6.9636 | lr 0.00100 | ngrams/sec 36094.6 | eta 0h0m8s
| epoch 45 | step 4000/4071 | loss 6.9711 | lr 0.00100 | ngrams/sec 36102.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1176.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.82it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 59.18s | valid loss  6.04 | valid ppl   421.65
-----------------------------------------------------------------------------------------
| epoch 46 | step 500/4071 | loss 6.8023 | lr 0.00100 | ngrams/sec 25757.0 | eta 0h1m10s
| epoch 46 | step 1000/4071 | loss 6.8446 | lr 0.00100 | ngrams/sec 36072.7 | eta 0h0m43s
| epoch 46 | step 1500/4071 | loss 6.8630 | lr 0.00100 | ngrams/sec 36087.6 | eta 0h0m36s
| epoch 46 | step 2000/4071 | loss 6.9016 | lr 0.00100 | ngrams/sec 36038.9 | eta 0h0m29s
| epoch 46 | step 2500/4071 | loss 6.9116 | lr 0.00100 | ngrams/sec 36118.4 | eta 0h0m22s
| epoch 46 | step 3000/4071 | loss 6.9437 | lr 0.00100 | ngrams/sec 36054.4 | eta 0h0m15s
| epoch 46 | step 3500/4071 | loss 6.9582 | lr 0.00100 | ngrams/sec 36092.5 | eta 0h0m8s
| epoch 46 | step 4000/4071 | loss 6.9708 | lr 0.00100 | ngrams/sec 35667.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1153.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 59.28s | valid loss  6.05 | valid ppl   424.51
-----------------------------------------------------------------------------------------
| epoch 47 | step 500/4071 | loss 6.8166 | lr 0.00100 | ngrams/sec 25790.8 | eta 0h1m10s
| epoch 47 | step 1000/4071 | loss 6.8255 | lr 0.00100 | ngrams/sec 36085.6 | eta 0h0m43s
| epoch 47 | step 1500/4071 | loss 6.8643 | lr 0.00100 | ngrams/sec 36103.7 | eta 0h0m36s
| epoch 47 | step 2000/4071 | loss 6.8962 | lr 0.00100 | ngrams/sec 36089.5 | eta 0h0m29s
| epoch 47 | step 2500/4071 | loss 6.9173 | lr 0.00100 | ngrams/sec 36106.3 | eta 0h0m22s
| epoch 47 | step 3000/4071 | loss 6.9259 | lr 0.00100 | ngrams/sec 36099.9 | eta 0h0m15s
| epoch 47 | step 3500/4071 | loss 6.9360 | lr 0.00100 | ngrams/sec 36095.8 | eta 0h0m8s
| epoch 47 | step 4000/4071 | loss 6.9648 | lr 0.00100 | ngrams/sec 36096.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1167.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 59.17s | valid loss  6.05 | valid ppl   424.73
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/4071 | loss 6.7847 | lr 0.00100 | ngrams/sec 25741.6 | eta 0h1m11s
| epoch 48 | step 1000/4071 | loss 6.8346 | lr 0.00100 | ngrams/sec 36098.6 | eta 0h0m43s
| epoch 48 | step 1500/4071 | loss 6.8424 | lr 0.00100 | ngrams/sec 36082.5 | eta 0h0m36s
| epoch 48 | step 2000/4071 | loss 6.8797 | lr 0.00100 | ngrams/sec 36089.4 | eta 0h0m29s
| epoch 48 | step 2500/4071 | loss 6.8938 | lr 0.00100 | ngrams/sec 36044.4 | eta 0h0m22s
| epoch 48 | step 3000/4071 | loss 6.9232 | lr 0.00100 | ngrams/sec 36028.2 | eta 0h0m15s
| epoch 48 | step 3500/4071 | loss 6.9356 | lr 0.00100 | ngrams/sec 36088.1 | eta 0h0m8s
| epoch 48 | step 4000/4071 | loss 6.9425 | lr 0.00100 | ngrams/sec 36091.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1170.56it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.14it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 59.21s | valid loss  6.05 | valid ppl   426.16
-----------------------------------------------------------------------------------------
| epoch 49 | step 500/4071 | loss 6.7607 | lr 0.00100 | ngrams/sec 25760.3 | eta 0h1m10s
| epoch 49 | step 1000/4071 | loss 6.8105 | lr 0.00100 | ngrams/sec 36056.0 | eta 0h0m43s
| epoch 49 | step 1500/4071 | loss 6.8508 | lr 0.00100 | ngrams/sec 36083.2 | eta 0h0m36s
| epoch 49 | step 2000/4071 | loss 6.8622 | lr 0.00100 | ngrams/sec 36119.6 | eta 0h0m29s
| epoch 49 | step 2500/4071 | loss 6.8896 | lr 0.00100 | ngrams/sec 36110.3 | eta 0h0m22s
| epoch 49 | step 3000/4071 | loss 6.9057 | lr 0.00100 | ngrams/sec 36102.3 | eta 0h0m15s
| epoch 49 | step 3500/4071 | loss 6.9190 | lr 0.00100 | ngrams/sec 36078.5 | eta 0h0m8s
| epoch 49 | step 4000/4071 | loss 6.9252 | lr 0.00100 | ngrams/sec 36106.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1188.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 59.18s | valid loss  6.06 | valid ppl   426.63
-----------------------------------------------------------------------------------------
| epoch 50 | step 500/4071 | loss 6.7726 | lr 0.00100 | ngrams/sec 25758.4 | eta 0h1m10s
| epoch 50 | step 1000/4071 | loss 6.7897 | lr 0.00100 | ngrams/sec 36082.0 | eta 0h0m43s
| epoch 50 | step 1500/4071 | loss 6.8385 | lr 0.00100 | ngrams/sec 36101.7 | eta 0h0m36s
| epoch 50 | step 2000/4071 | loss 6.8599 | lr 0.00100 | ngrams/sec 36104.5 | eta 0h0m29s
| epoch 50 | step 2500/4071 | loss 6.8887 | lr 0.00100 | ngrams/sec 36108.3 | eta 0h0m22s
| epoch 50 | step 3000/4071 | loss 6.8876 | lr 0.00100 | ngrams/sec 36122.9 | eta 0h0m15s
| epoch 50 | step 3500/4071 | loss 6.9075 | lr 0.00100 | ngrams/sec 36079.9 | eta 0h0m8s
| epoch 50 | step 4000/4071 | loss 6.9308 | lr 0.00100 | ngrams/sec 36091.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1173.00it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 59.17s | valid loss  6.07 | valid ppl   430.70
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/4071 | loss 6.7652 | lr 0.00100 | ngrams/sec 25743.7 | eta 0h1m11s
| epoch 51 | step 1000/4071 | loss 6.8045 | lr 0.00100 | ngrams/sec 36077.8 | eta 0h0m43s
| epoch 51 | step 1500/4071 | loss 6.8110 | lr 0.00100 | ngrams/sec 36063.7 | eta 0h0m36s
| epoch 51 | step 2000/4071 | loss 6.8558 | lr 0.00100 | ngrams/sec 36102.5 | eta 0h0m29s
| epoch 51 | step 2500/4071 | loss 6.8716 | lr 0.00100 | ngrams/sec 36045.9 | eta 0h0m22s
| epoch 51 | step 3000/4071 | loss 6.8702 | lr 0.00100 | ngrams/sec 36091.0 | eta 0h0m15s
| epoch 51 | step 3500/4071 | loss 6.8985 | lr 0.00100 | ngrams/sec 36105.5 | eta 0h0m8s
| epoch 51 | step 4000/4071 | loss 6.9177 | lr 0.00100 | ngrams/sec 36096.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1179.39it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.16it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 59.19s | valid loss  6.06 | valid ppl   429.31
-----------------------------------------------------------------------------------------
| epoch 52 | step 500/4071 | loss 6.7447 | lr 0.00100 | ngrams/sec 25748.1 | eta 0h1m11s
| epoch 52 | step 1000/4071 | loss 6.7781 | lr 0.00100 | ngrams/sec 36079.5 | eta 0h0m43s
| epoch 52 | step 1500/4071 | loss 6.8040 | lr 0.00100 | ngrams/sec 36064.7 | eta 0h0m36s
| epoch 52 | step 2000/4071 | loss 6.8251 | lr 0.00100 | ngrams/sec 36089.0 | eta 0h0m29s
| epoch 52 | step 2500/4071 | loss 6.8452 | lr 0.00100 | ngrams/sec 36055.8 | eta 0h0m22s
| epoch 52 | step 3000/4071 | loss 6.8814 | lr 0.00100 | ngrams/sec 36103.4 | eta 0h0m15s
| epoch 52 | step 3500/4071 | loss 6.9097 | lr 0.00100 | ngrams/sec 36065.7 | eta 0h0m8s
| epoch 52 | step 4000/4071 | loss 6.9177 | lr 0.00100 | ngrams/sec 36114.0 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1162.79it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 59.19s | valid loss  6.07 | valid ppl   434.52
-----------------------------------------------------------------------------------------
| epoch 53 | step 500/4071 | loss 6.7190 | lr 0.00100 | ngrams/sec 25752.1 | eta 0h1m10s
| epoch 53 | step 1000/4071 | loss 6.7634 | lr 0.00100 | ngrams/sec 36083.8 | eta 0h0m43s
| epoch 53 | step 1500/4071 | loss 6.8047 | lr 0.00100 | ngrams/sec 36088.4 | eta 0h0m36s
| epoch 53 | step 2000/4071 | loss 6.8348 | lr 0.00100 | ngrams/sec 36091.0 | eta 0h0m29s
| epoch 53 | step 2500/4071 | loss 6.8613 | lr 0.00100 | ngrams/sec 36112.7 | eta 0h0m22s
| epoch 53 | step 3000/4071 | loss 6.8740 | lr 0.00100 | ngrams/sec 36093.7 | eta 0h0m15s
| epoch 53 | step 3500/4071 | loss 6.8832 | lr 0.00100 | ngrams/sec 36021.8 | eta 0h0m8s
| epoch 53 | step 4000/4071 | loss 6.9045 | lr 0.00100 | ngrams/sec 36058.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1174.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 59.21s | valid loss  6.07 | valid ppl   431.61
-----------------------------------------------------------------------------------------
| epoch 54 | step 500/4071 | loss 6.7291 | lr 0.00100 | ngrams/sec 25745.0 | eta 0h1m11s
| epoch 54 | step 1000/4071 | loss 6.7527 | lr 0.00100 | ngrams/sec 36055.9 | eta 0h0m43s
| epoch 54 | step 1500/4071 | loss 6.7784 | lr 0.00100 | ngrams/sec 36066.0 | eta 0h0m36s
| epoch 54 | step 2000/4071 | loss 6.8297 | lr 0.00100 | ngrams/sec 36101.7 | eta 0h0m29s
| epoch 54 | step 2500/4071 | loss 6.8363 | lr 0.00100 | ngrams/sec 36082.6 | eta 0h0m22s
| epoch 54 | step 3000/4071 | loss 6.8611 | lr 0.00100 | ngrams/sec 36076.4 | eta 0h0m15s
| epoch 54 | step 3500/4071 | loss 6.8609 | lr 0.00100 | ngrams/sec 36099.9 | eta 0h0m8s
| epoch 54 | step 4000/4071 | loss 6.9018 | lr 0.00100 | ngrams/sec 36103.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1180.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 59.19s | valid loss  6.08 | valid ppl   436.39
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/4071 | loss 6.7211 | lr 0.00100 | ngrams/sec 25740.3 | eta 0h1m11s
| epoch 55 | step 1000/4071 | loss 6.7624 | lr 0.00100 | ngrams/sec 36106.2 | eta 0h0m43s
| epoch 55 | step 1500/4071 | loss 6.7883 | lr 0.00100 | ngrams/sec 36090.0 | eta 0h0m36s
| epoch 55 | step 2000/4071 | loss 6.8046 | lr 0.00100 | ngrams/sec 36089.6 | eta 0h0m29s
| epoch 55 | step 2500/4071 | loss 6.8367 | lr 0.00100 | ngrams/sec 36076.3 | eta 0h0m22s
| epoch 55 | step 3000/4071 | loss 6.8412 | lr 0.00100 | ngrams/sec 36109.7 | eta 0h0m15s
| epoch 55 | step 3500/4071 | loss 6.8690 | lr 0.00100 | ngrams/sec 36114.1 | eta 0h0m8s
| epoch 55 | step 4000/4071 | loss 6.8777 | lr 0.00100 | ngrams/sec 36048.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1182.84it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.92it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 59.19s | valid loss  6.08 | valid ppl   435.52
-----------------------------------------------------------------------------------------
| epoch 56 | step 500/4071 | loss 6.7127 | lr 0.00100 | ngrams/sec 25761.6 | eta 0h1m10s
| epoch 56 | step 1000/4071 | loss 6.7514 | lr 0.00100 | ngrams/sec 36145.9 | eta 0h0m43s
| epoch 56 | step 1500/4071 | loss 6.7878 | lr 0.00100 | ngrams/sec 36101.9 | eta 0h0m36s
| epoch 56 | step 2000/4071 | loss 6.8066 | lr 0.00100 | ngrams/sec 36102.8 | eta 0h0m29s
| epoch 56 | step 2500/4071 | loss 6.8210 | lr 0.00100 | ngrams/sec 36089.7 | eta 0h0m22s
| epoch 56 | step 3000/4071 | loss 6.8357 | lr 0.00100 | ngrams/sec 36120.5 | eta 0h0m15s
| epoch 56 | step 3500/4071 | loss 6.8689 | lr 0.00100 | ngrams/sec 36043.2 | eta 0h0m8s
| epoch 56 | step 4000/4071 | loss 6.8567 | lr 0.00100 | ngrams/sec 36106.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1175.46it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 59.16s | valid loss  6.08 | valid ppl   437.73
-----------------------------------------------------------------------------------------
| epoch 57 | step 500/4071 | loss 6.6988 | lr 0.00100 | ngrams/sec 25747.0 | eta 0h1m11s
| epoch 57 | step 1000/4071 | loss 6.7429 | lr 0.00100 | ngrams/sec 36098.7 | eta 0h0m43s
| epoch 57 | step 1500/4071 | loss 6.7709 | lr 0.00100 | ngrams/sec 36076.3 | eta 0h0m36s
| epoch 57 | step 2000/4071 | loss 6.7966 | lr 0.00100 | ngrams/sec 36079.5 | eta 0h0m29s
| epoch 57 | step 2500/4071 | loss 6.8183 | lr 0.00100 | ngrams/sec 36070.3 | eta 0h0m22s
| epoch 57 | step 3000/4071 | loss 6.8308 | lr 0.00100 | ngrams/sec 35906.9 | eta 0h0m15s
| epoch 57 | step 3500/4071 | loss 6.8492 | lr 0.00100 | ngrams/sec 36074.9 | eta 0h0m8s
| epoch 57 | step 4000/4071 | loss 6.8571 | lr 0.00100 | ngrams/sec 36100.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1178.92it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 59.23s | valid loss  6.09 | valid ppl   440.35
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/4071 | loss 6.6984 | lr 0.00100 | ngrams/sec 25733.1 | eta 0h1m11s
| epoch 58 | step 1000/4071 | loss 6.7351 | lr 0.00100 | ngrams/sec 36101.5 | eta 0h0m43s
| epoch 58 | step 1500/4071 | loss 6.7591 | lr 0.00100 | ngrams/sec 36063.5 | eta 0h0m36s
| epoch 58 | step 2000/4071 | loss 6.7845 | lr 0.00100 | ngrams/sec 36105.1 | eta 0h0m29s
| epoch 58 | step 2500/4071 | loss 6.8220 | lr 0.00100 | ngrams/sec 36077.8 | eta 0h0m22s
| epoch 58 | step 3000/4071 | loss 6.8156 | lr 0.00100 | ngrams/sec 36095.1 | eta 0h0m15s
| epoch 58 | step 3500/4071 | loss 6.8377 | lr 0.00100 | ngrams/sec 36054.1 | eta 0h0m8s
| epoch 58 | step 4000/4071 | loss 6.8642 | lr 0.00100 | ngrams/sec 36056.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1163.10it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 59.21s | valid loss  6.09 | valid ppl   439.72
-----------------------------------------------------------------------------------------
| epoch 59 | step 500/4071 | loss 6.6903 | lr 0.00100 | ngrams/sec 25696.9 | eta 0h1m11s
| epoch 59 | step 1000/4071 | loss 6.7206 | lr 0.00100 | ngrams/sec 36067.3 | eta 0h0m43s
| epoch 59 | step 1500/4071 | loss 6.7541 | lr 0.00100 | ngrams/sec 36045.1 | eta 0h0m36s
| epoch 59 | step 2000/4071 | loss 6.7656 | lr 0.00100 | ngrams/sec 35976.5 | eta 0h0m29s
| epoch 59 | step 2500/4071 | loss 6.8030 | lr 0.00100 | ngrams/sec 36088.2 | eta 0h0m22s
| epoch 59 | step 3000/4071 | loss 6.8201 | lr 0.00100 | ngrams/sec 36083.5 | eta 0h0m15s
| epoch 59 | step 3500/4071 | loss 6.8297 | lr 0.00100 | ngrams/sec 36079.6 | eta 0h0m8s
| epoch 59 | step 4000/4071 | loss 6.8544 | lr 0.00100 | ngrams/sec 36082.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1181.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.45it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 59.24s | valid loss  6.08 | valid ppl   436.78
-----------------------------------------------------------------------------------------
| epoch 60 | step 500/4071 | loss 6.6840 | lr 0.00100 | ngrams/sec 25746.1 | eta 0h1m11s
| epoch 60 | step 1000/4071 | loss 6.7183 | lr 0.00100 | ngrams/sec 36106.9 | eta 0h0m43s
| epoch 60 | step 1500/4071 | loss 6.7527 | lr 0.00100 | ngrams/sec 36064.5 | eta 0h0m36s
| epoch 60 | step 2000/4071 | loss 6.7751 | lr 0.00100 | ngrams/sec 36105.0 | eta 0h0m29s
| epoch 60 | step 2500/4071 | loss 6.7946 | lr 0.00100 | ngrams/sec 36086.5 | eta 0h0m22s
| epoch 60 | step 3000/4071 | loss 6.8137 | lr 0.00100 | ngrams/sec 36055.2 | eta 0h0m15s
| epoch 60 | step 3500/4071 | loss 6.8315 | lr 0.00100 | ngrams/sec 36087.0 | eta 0h0m8s
| epoch 60 | step 4000/4071 | loss 6.8306 | lr 0.00100 | ngrams/sec 36069.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1164.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.55it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 59.19s | valid loss  6.09 | valid ppl   443.18
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/4071 | loss 6.6664 | lr 0.00100 | ngrams/sec 25744.7 | eta 0h1m11s
| epoch 61 | step 1000/4071 | loss 6.7048 | lr 0.00100 | ngrams/sec 36076.2 | eta 0h0m43s
| epoch 61 | step 1500/4071 | loss 6.7426 | lr 0.00100 | ngrams/sec 36089.5 | eta 0h0m36s
| epoch 61 | step 2000/4071 | loss 6.7470 | lr 0.00100 | ngrams/sec 36111.1 | eta 0h0m29s
| epoch 61 | step 2500/4071 | loss 6.7828 | lr 0.00100 | ngrams/sec 36091.5 | eta 0h0m22s
| epoch 61 | step 3000/4071 | loss 6.7966 | lr 0.00100 | ngrams/sec 36083.5 | eta 0h0m15s
| epoch 61 | step 3500/4071 | loss 6.8263 | lr 0.00100 | ngrams/sec 36108.2 | eta 0h0m8s
| epoch 61 | step 4000/4071 | loss 6.8210 | lr 0.00100 | ngrams/sec 36078.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1178.22it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 59.19s | valid loss  6.10 | valid ppl   444.33
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/4071 | loss 6.6543 | lr 0.00100 | ngrams/sec 25753.0 | eta 0h1m10s
| epoch 62 | step 1000/4071 | loss 6.6959 | lr 0.00100 | ngrams/sec 36078.2 | eta 0h0m43s
| epoch 62 | step 1500/4071 | loss 6.7266 | lr 0.00100 | ngrams/sec 36094.5 | eta 0h0m36s
| epoch 62 | step 2000/4071 | loss 6.7454 | lr 0.00100 | ngrams/sec 36074.3 | eta 0h0m29s
| epoch 62 | step 2500/4071 | loss 6.7760 | lr 0.00100 | ngrams/sec 36107.8 | eta 0h0m22s
| epoch 62 | step 3000/4071 | loss 6.8188 | lr 0.00100 | ngrams/sec 36111.4 | eta 0h0m15s
| epoch 62 | step 3500/4071 | loss 6.8104 | lr 0.00100 | ngrams/sec 36086.7 | eta 0h0m8s
| epoch 62 | step 4000/4071 | loss 6.8233 | lr 0.00100 | ngrams/sec 36041.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1171.15it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.50it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 59.19s | valid loss  6.10 | valid ppl   446.27
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/4071 | loss 6.6521 | lr 0.00100 | ngrams/sec 25745.1 | eta 0h1m11s
| epoch 63 | step 1000/4071 | loss 6.7048 | lr 0.00100 | ngrams/sec 36115.4 | eta 0h0m43s
| epoch 63 | step 1500/4071 | loss 6.7204 | lr 0.00100 | ngrams/sec 36110.6 | eta 0h0m36s
| epoch 63 | step 2000/4071 | loss 6.7443 | lr 0.00100 | ngrams/sec 36042.4 | eta 0h0m29s
| epoch 63 | step 2500/4071 | loss 6.7501 | lr 0.00100 | ngrams/sec 36094.9 | eta 0h0m22s
| epoch 63 | step 3000/4071 | loss 6.7857 | lr 0.00100 | ngrams/sec 36099.6 | eta 0h0m15s
| epoch 63 | step 3500/4071 | loss 6.7924 | lr 0.00100 | ngrams/sec 36085.4 | eta 0h0m8s
| epoch 63 | step 4000/4071 | loss 6.8263 | lr 0.00100 | ngrams/sec 36061.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1165.41it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 59.19s | valid loss  6.10 | valid ppl   445.21
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/4071 | loss 6.6510 | lr 0.00100 | ngrams/sec 25741.1 | eta 0h1m11s
| epoch 64 | step 1000/4071 | loss 6.6954 | lr 0.00100 | ngrams/sec 36066.8 | eta 0h0m43s
| epoch 64 | step 1500/4071 | loss 6.7141 | lr 0.00100 | ngrams/sec 36090.6 | eta 0h0m36s
| epoch 64 | step 2000/4071 | loss 6.7250 | lr 0.00100 | ngrams/sec 36112.0 | eta 0h0m29s
| epoch 64 | step 2500/4071 | loss 6.7672 | lr 0.00100 | ngrams/sec 36132.2 | eta 0h0m22s
| epoch 64 | step 3000/4071 | loss 6.7701 | lr 0.00100 | ngrams/sec 36011.8 | eta 0h0m15s
| epoch 64 | step 3500/4071 | loss 6.7943 | lr 0.00100 | ngrams/sec 36105.4 | eta 0h0m8s
| epoch 64 | step 4000/4071 | loss 6.8227 | lr 0.00100 | ngrams/sec 36042.1 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1180.45it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 59.20s | valid loss  6.11 | valid ppl   449.81
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/4071 | loss 6.6559 | lr 0.00100 | ngrams/sec 25718.9 | eta 0h1m11s
| epoch 65 | step 1000/4071 | loss 6.6958 | lr 0.00100 | ngrams/sec 35967.5 | eta 0h0m43s
| epoch 65 | step 1500/4071 | loss 6.7156 | lr 0.00100 | ngrams/sec 36050.7 | eta 0h0m36s
| epoch 65 | step 2000/4071 | loss 6.7420 | lr 0.00100 | ngrams/sec 35977.5 | eta 0h0m29s
| epoch 65 | step 2500/4071 | loss 6.7308 | lr 0.00100 | ngrams/sec 35944.6 | eta 0h0m22s
| epoch 65 | step 3000/4071 | loss 6.7735 | lr 0.00100 | ngrams/sec 35962.4 | eta 0h0m15s
| epoch 65 | step 3500/4071 | loss 6.7749 | lr 0.00100 | ngrams/sec 35983.8 | eta 0h0m8s
| epoch 65 | step 4000/4071 | loss 6.7958 | lr 0.00100 | ngrams/sec 36019.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1179.29it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 59.34s | valid loss  6.11 | valid ppl   451.06
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/4071 | loss 6.6373 | lr 0.00100 | ngrams/sec 25737.0 | eta 0h1m11s
| epoch 66 | step 1000/4071 | loss 6.6780 | lr 0.00100 | ngrams/sec 36085.1 | eta 0h0m43s
| epoch 66 | step 1500/4071 | loss 6.7092 | lr 0.00100 | ngrams/sec 36057.0 | eta 0h0m36s
| epoch 66 | step 2000/4071 | loss 6.7282 | lr 0.00100 | ngrams/sec 36058.0 | eta 0h0m29s
| epoch 66 | step 2500/4071 | loss 6.7521 | lr 0.00100 | ngrams/sec 36044.3 | eta 0h0m22s
| epoch 66 | step 3000/4071 | loss 6.7671 | lr 0.00100 | ngrams/sec 36064.9 | eta 0h0m15s
| epoch 66 | step 3500/4071 | loss 6.7932 | lr 0.00100 | ngrams/sec 36044.6 | eta 0h0m8s
| epoch 66 | step 4000/4071 | loss 6.8053 | lr 0.00100 | ngrams/sec 36047.4 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1178.17it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 59.24s | valid loss  6.11 | valid ppl   451.00
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/4071 | loss 6.6441 | lr 0.00100 | ngrams/sec 25715.1 | eta 0h1m11s
| epoch 67 | step 1000/4071 | loss 6.6652 | lr 0.00100 | ngrams/sec 36017.4 | eta 0h0m43s
| epoch 67 | step 1500/4071 | loss 6.6866 | lr 0.00100 | ngrams/sec 35992.6 | eta 0h0m36s
| epoch 67 | step 2000/4071 | loss 6.7188 | lr 0.00100 | ngrams/sec 36057.2 | eta 0h0m29s
| epoch 67 | step 2500/4071 | loss 6.7392 | lr 0.00100 | ngrams/sec 36065.0 | eta 0h0m22s
| epoch 67 | step 3000/4071 | loss 6.7507 | lr 0.00100 | ngrams/sec 36104.6 | eta 0h0m15s
| epoch 67 | step 3500/4071 | loss 6.7753 | lr 0.00100 | ngrams/sec 36077.7 | eta 0h0m8s
| epoch 67 | step 4000/4071 | loss 6.7914 | lr 0.00100 | ngrams/sec 36025.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1152.87it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 59.26s | valid loss  6.11 | valid ppl   450.87
-----------------------------------------------------------------------------------------
| epoch 68 | step 500/4071 | loss 6.6391 | lr 0.00100 | ngrams/sec 25700.4 | eta 0h1m11s
| epoch 68 | step 1000/4071 | loss 6.6542 | lr 0.00100 | ngrams/sec 36039.1 | eta 0h0m43s
| epoch 68 | step 1500/4071 | loss 6.7013 | lr 0.00100 | ngrams/sec 35889.3 | eta 0h0m36s
| epoch 68 | step 2000/4071 | loss 6.7297 | lr 0.00100 | ngrams/sec 35853.6 | eta 0h0m29s
| epoch 68 | step 2500/4071 | loss 6.7398 | lr 0.00100 | ngrams/sec 35867.1 | eta 0h0m22s
| epoch 68 | step 3000/4071 | loss 6.7549 | lr 0.00100 | ngrams/sec 35946.6 | eta 0h0m15s
| epoch 68 | step 3500/4071 | loss 6.7771 | lr 0.00100 | ngrams/sec 35973.9 | eta 0h0m8s
| epoch 68 | step 4000/4071 | loss 6.7665 | lr 0.00100 | ngrams/sec 35956.4 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1151.70it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 59.42s | valid loss  6.12 | valid ppl   453.22
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/4071 | loss 6.6111 | lr 0.00100 | ngrams/sec 25663.2 | eta 0h1m11s
| epoch 69 | step 1000/4071 | loss 6.6522 | lr 0.00100 | ngrams/sec 35367.4 | eta 0h0m44s
| epoch 69 | step 1500/4071 | loss 6.6938 | lr 0.00100 | ngrams/sec 35971.1 | eta 0h0m36s
| epoch 69 | step 2000/4071 | loss 6.7092 | lr 0.00100 | ngrams/sec 35879.1 | eta 0h0m29s
| epoch 69 | step 2500/4071 | loss 6.7209 | lr 0.00100 | ngrams/sec 36024.1 | eta 0h0m22s
| epoch 69 | step 3000/4071 | loss 6.7594 | lr 0.00100 | ngrams/sec 36064.1 | eta 0h0m15s
| epoch 69 | step 3500/4071 | loss 6.7558 | lr 0.00100 | ngrams/sec 36019.9 | eta 0h0m8s
| epoch 69 | step 4000/4071 | loss 6.7716 | lr 0.00100 | ngrams/sec 36034.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1178.30it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 59.47s | valid loss  6.12 | valid ppl   456.86
-----------------------------------------------------------------------------------------
| epoch 70 | step 500/4071 | loss 6.5960 | lr 0.00100 | ngrams/sec 25636.8 | eta 0h1m11s
| epoch 70 | step 1000/4071 | loss 6.6597 | lr 0.00100 | ngrams/sec 35916.8 | eta 0h0m43s
| epoch 70 | step 1500/4071 | loss 6.6852 | lr 0.00100 | ngrams/sec 35854.5 | eta 0h0m36s
| epoch 70 | step 2000/4071 | loss 6.7096 | lr 0.00100 | ngrams/sec 35915.1 | eta 0h0m29s
| epoch 70 | step 2500/4071 | loss 6.7191 | lr 0.00100 | ngrams/sec 35842.9 | eta 0h0m22s
| epoch 70 | step 3000/4071 | loss 6.7308 | lr 0.00100 | ngrams/sec 35910.4 | eta 0h0m15s
| epoch 70 | step 3500/4071 | loss 6.7685 | lr 0.00100 | ngrams/sec 35884.4 | eta 0h0m8s
| epoch 70 | step 4000/4071 | loss 6.7767 | lr 0.00100 | ngrams/sec 35845.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1153.35it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 59.52s | valid loss  6.13 | valid ppl   460.39
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/4071 | loss 6.5961 | lr 0.00100 | ngrams/sec 25699.6 | eta 0h1m11s
| epoch 71 | step 1000/4071 | loss 6.6538 | lr 0.00100 | ngrams/sec 36145.5 | eta 0h0m43s
| epoch 71 | step 1500/4071 | loss 6.6708 | lr 0.00100 | ngrams/sec 36131.1 | eta 0h0m36s
| epoch 71 | step 2000/4071 | loss 6.6871 | lr 0.00100 | ngrams/sec 36117.1 | eta 0h0m29s
| epoch 71 | step 2500/4071 | loss 6.7147 | lr 0.00100 | ngrams/sec 36118.7 | eta 0h0m22s
| epoch 71 | step 3000/4071 | loss 6.7431 | lr 0.00100 | ngrams/sec 36135.7 | eta 0h0m15s
| epoch 71 | step 3500/4071 | loss 6.7416 | lr 0.00100 | ngrams/sec 36150.7 | eta 0h0m8s
| epoch 71 | step 4000/4071 | loss 6.7607 | lr 0.00100 | ngrams/sec 36159.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1154.61it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.04it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 59.13s | valid loss  6.13 | valid ppl   458.78
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/4071 | loss 6.5988 | lr 0.00100 | ngrams/sec 25793.5 | eta 0h1m10s
| epoch 72 | step 1000/4071 | loss 6.6282 | lr 0.00100 | ngrams/sec 36168.5 | eta 0h0m43s
| epoch 72 | step 1500/4071 | loss 6.6712 | lr 0.00100 | ngrams/sec 36160.2 | eta 0h0m36s
| epoch 72 | step 2000/4071 | loss 6.6935 | lr 0.00100 | ngrams/sec 36151.4 | eta 0h0m29s
| epoch 72 | step 2500/4071 | loss 6.6923 | lr 0.00100 | ngrams/sec 36154.4 | eta 0h0m22s
| epoch 72 | step 3000/4071 | loss 6.7379 | lr 0.00100 | ngrams/sec 36126.0 | eta 0h0m15s
| epoch 72 | step 3500/4071 | loss 6.7568 | lr 0.00100 | ngrams/sec 36120.1 | eta 0h0m8s
| epoch 72 | step 4000/4071 | loss 6.7568 | lr 0.00100 | ngrams/sec 36130.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1188.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 59.10s | valid loss  6.13 | valid ppl   461.51
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/4071 | loss 6.6106 | lr 0.00100 | ngrams/sec 25821.5 | eta 0h1m10s
| epoch 73 | step 1000/4071 | loss 6.6355 | lr 0.00100 | ngrams/sec 36185.0 | eta 0h0m43s
| epoch 73 | step 1500/4071 | loss 6.6488 | lr 0.00100 | ngrams/sec 36164.8 | eta 0h0m36s
| epoch 73 | step 2000/4071 | loss 6.6755 | lr 0.00100 | ngrams/sec 36163.1 | eta 0h0m29s
| epoch 73 | step 2500/4071 | loss 6.6959 | lr 0.00100 | ngrams/sec 36163.2 | eta 0h0m22s
| epoch 73 | step 3000/4071 | loss 6.7256 | lr 0.00100 | ngrams/sec 36118.0 | eta 0h0m15s
| epoch 73 | step 3500/4071 | loss 6.7455 | lr 0.00100 | ngrams/sec 36137.6 | eta 0h0m8s
| epoch 73 | step 4000/4071 | loss 6.7436 | lr 0.00100 | ngrams/sec 36121.3 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1159.59it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 59.08s | valid loss  6.14 | valid ppl   464.35
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/4071 | loss 6.5852 | lr 0.00100 | ngrams/sec 25797.4 | eta 0h1m10s
| epoch 74 | step 1000/4071 | loss 6.6244 | lr 0.00100 | ngrams/sec 36052.8 | eta 0h0m43s
| epoch 74 | step 1500/4071 | loss 6.6336 | lr 0.00100 | ngrams/sec 36033.0 | eta 0h0m36s
| epoch 74 | step 2000/4071 | loss 6.6895 | lr 0.00100 | ngrams/sec 36025.3 | eta 0h0m29s
| epoch 74 | step 2500/4071 | loss 6.6975 | lr 0.00100 | ngrams/sec 35922.2 | eta 0h0m22s
| epoch 74 | step 3000/4071 | loss 6.7087 | lr 0.00100 | ngrams/sec 35821.5 | eta 0h0m15s
| epoch 74 | step 3500/4071 | loss 6.7317 | lr 0.00100 | ngrams/sec 35934.7 | eta 0h0m8s
| epoch 74 | step 4000/4071 | loss 6.7400 | lr 0.00100 | ngrams/sec 35959.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1175.49it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 59.36s | valid loss  6.15 | valid ppl   467.23
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/4071 | loss 6.5841 | lr 0.00100 | ngrams/sec 25671.2 | eta 0h1m11s
| epoch 75 | step 1000/4071 | loss 6.6165 | lr 0.00100 | ngrams/sec 35930.8 | eta 0h0m43s
| epoch 75 | step 1500/4071 | loss 6.6543 | lr 0.00100 | ngrams/sec 35946.0 | eta 0h0m36s
| epoch 75 | step 2000/4071 | loss 6.6694 | lr 0.00100 | ngrams/sec 35964.4 | eta 0h0m29s
| epoch 75 | step 2500/4071 | loss 6.7039 | lr 0.00100 | ngrams/sec 36070.3 | eta 0h0m22s
| epoch 75 | step 3000/4071 | loss 6.7193 | lr 0.00100 | ngrams/sec 36073.1 | eta 0h0m15s
| epoch 75 | step 3500/4071 | loss 6.7378 | lr 0.00100 | ngrams/sec 36036.8 | eta 0h0m8s
| epoch 75 | step 4000/4071 | loss 6.7371 | lr 0.00100 | ngrams/sec 36058.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1156.80it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 59.33s | valid loss  6.15 | valid ppl   468.77
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/4071 | loss 6.5829 | lr 0.00100 | ngrams/sec 25655.2 | eta 0h1m11s
| epoch 76 | step 1000/4071 | loss 6.6301 | lr 0.00100 | ngrams/sec 36107.7 | eta 0h0m43s
| epoch 76 | step 1500/4071 | loss 6.6443 | lr 0.00100 | ngrams/sec 36088.2 | eta 0h0m36s
| epoch 76 | step 2000/4071 | loss 6.6650 | lr 0.00100 | ngrams/sec 36058.8 | eta 0h0m29s
| epoch 76 | step 2500/4071 | loss 6.6938 | lr 0.00100 | ngrams/sec 36025.3 | eta 0h0m22s
| epoch 76 | step 3000/4071 | loss 6.7183 | lr 0.00100 | ngrams/sec 36001.2 | eta 0h0m15s
| epoch 76 | step 3500/4071 | loss 6.7193 | lr 0.00100 | ngrams/sec 36069.9 | eta 0h0m8s
| epoch 76 | step 4000/4071 | loss 6.7197 | lr 0.00100 | ngrams/sec 35819.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1154.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.28it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 59.31s | valid loss  6.16 | valid ppl   471.44
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/4071 | loss 6.5819 | lr 0.00100 | ngrams/sec 25653.6 | eta 0h1m11s
| epoch 77 | step 1000/4071 | loss 6.6038 | lr 0.00100 | ngrams/sec 35895.1 | eta 0h0m43s
| epoch 77 | step 1500/4071 | loss 6.6309 | lr 0.00100 | ngrams/sec 35932.0 | eta 0h0m36s
| epoch 77 | step 2000/4071 | loss 6.6627 | lr 0.00100 | ngrams/sec 35957.9 | eta 0h0m29s
| epoch 77 | step 2500/4071 | loss 6.6856 | lr 0.00100 | ngrams/sec 36016.9 | eta 0h0m22s
| epoch 77 | step 3000/4071 | loss 6.7156 | lr 0.00100 | ngrams/sec 36072.0 | eta 0h0m15s
| epoch 77 | step 3500/4071 | loss 6.7143 | lr 0.00100 | ngrams/sec 36129.7 | eta 0h0m8s
| epoch 77 | step 4000/4071 | loss 6.7249 | lr 0.00100 | ngrams/sec 36107.8 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1159.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 59.30s | valid loss  6.15 | valid ppl   467.95
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/4071 | loss 6.5755 | lr 0.00100 | ngrams/sec 25824.2 | eta 0h1m10s
| epoch 78 | step 1000/4071 | loss 6.5958 | lr 0.00100 | ngrams/sec 36139.6 | eta 0h0m43s
| epoch 78 | step 1500/4071 | loss 6.6257 | lr 0.00100 | ngrams/sec 36135.9 | eta 0h0m36s
| epoch 78 | step 2000/4071 | loss 6.6676 | lr 0.00100 | ngrams/sec 36119.7 | eta 0h0m29s
| epoch 78 | step 2500/4071 | loss 6.6870 | lr 0.00100 | ngrams/sec 36095.3 | eta 0h0m22s
| epoch 78 | step 3000/4071 | loss 6.6903 | lr 0.00100 | ngrams/sec 36110.9 | eta 0h0m15s
| epoch 78 | step 3500/4071 | loss 6.7154 | lr 0.00100 | ngrams/sec 36075.1 | eta 0h0m8s
| epoch 78 | step 4000/4071 | loss 6.7183 | lr 0.00100 | ngrams/sec 36071.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1188.81it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 59.15s | valid loss  6.16 | valid ppl   471.49
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/4071 | loss 6.5636 | lr 0.00100 | ngrams/sec 25733.1 | eta 0h1m11s
| epoch 79 | step 1000/4071 | loss 6.6050 | lr 0.00100 | ngrams/sec 36055.5 | eta 0h0m43s
| epoch 79 | step 1500/4071 | loss 6.6478 | lr 0.00100 | ngrams/sec 36065.8 | eta 0h0m36s
| epoch 79 | step 2000/4071 | loss 6.6472 | lr 0.00100 | ngrams/sec 36070.3 | eta 0h0m29s
| epoch 79 | step 2500/4071 | loss 6.6732 | lr 0.00100 | ngrams/sec 36084.1 | eta 0h0m22s
| epoch 79 | step 3000/4071 | loss 6.6847 | lr 0.00100 | ngrams/sec 36062.1 | eta 0h0m15s
| epoch 79 | step 3500/4071 | loss 6.7012 | lr 0.00100 | ngrams/sec 36035.4 | eta 0h0m8s
| epoch 79 | step 4000/4071 | loss 6.7180 | lr 0.00100 | ngrams/sec 36025.7 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1156.34it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 59.24s | valid loss  6.16 | valid ppl   474.09
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/4071 | loss 6.5709 | lr 0.00100 | ngrams/sec 25738.5 | eta 0h1m11s
| epoch 80 | step 1000/4071 | loss 6.5929 | lr 0.00100 | ngrams/sec 36077.7 | eta 0h0m43s
| epoch 80 | step 1500/4071 | loss 6.6156 | lr 0.00100 | ngrams/sec 36063.4 | eta 0h0m36s
| epoch 80 | step 2000/4071 | loss 6.6442 | lr 0.00100 | ngrams/sec 36073.9 | eta 0h0m29s
| epoch 80 | step 2500/4071 | loss 6.6701 | lr 0.00100 | ngrams/sec 36066.0 | eta 0h0m22s
| epoch 80 | step 3000/4071 | loss 6.6908 | lr 0.00100 | ngrams/sec 36071.6 | eta 0h0m15s
| epoch 80 | step 3500/4071 | loss 6.7037 | lr 0.00100 | ngrams/sec 36034.8 | eta 0h0m8s
| epoch 80 | step 4000/4071 | loss 6.6906 | lr 0.00100 | ngrams/sec 36099.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1156.64it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.36it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 59.22s | valid loss  6.17 | valid ppl   477.59
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/4071 | loss 6.5621 | lr 0.00100 | ngrams/sec 25739.9 | eta 0h1m11s
| epoch 81 | step 1000/4071 | loss 6.5982 | lr 0.00100 | ngrams/sec 36086.1 | eta 0h0m43s
| epoch 81 | step 1500/4071 | loss 6.6238 | lr 0.00100 | ngrams/sec 36066.7 | eta 0h0m36s
| epoch 81 | step 2000/4071 | loss 6.6309 | lr 0.00100 | ngrams/sec 36126.6 | eta 0h0m29s
| epoch 81 | step 2500/4071 | loss 6.6694 | lr 0.00100 | ngrams/sec 36102.7 | eta 0h0m22s
| epoch 81 | step 3000/4071 | loss 6.6841 | lr 0.00100 | ngrams/sec 36088.2 | eta 0h0m15s
| epoch 81 | step 3500/4071 | loss 6.6971 | lr 0.00100 | ngrams/sec 36128.7 | eta 0h0m8s
| epoch 81 | step 4000/4071 | loss 6.6956 | lr 0.00100 | ngrams/sec 36114.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1181.58it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 59.17s | valid loss  6.17 | valid ppl   476.05
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/4071 | loss 6.5638 | lr 0.00100 | ngrams/sec 25750.8 | eta 0h1m11s
| epoch 82 | step 1000/4071 | loss 6.5790 | lr 0.00100 | ngrams/sec 36109.2 | eta 0h0m43s
| epoch 82 | step 1500/4071 | loss 6.6265 | lr 0.00100 | ngrams/sec 36101.1 | eta 0h0m36s
| epoch 82 | step 2000/4071 | loss 6.6400 | lr 0.00100 | ngrams/sec 36091.8 | eta 0h0m29s
| epoch 82 | step 2500/4071 | loss 6.6482 | lr 0.00100 | ngrams/sec 36102.1 | eta 0h0m22s
| epoch 82 | step 3000/4071 | loss 6.6579 | lr 0.00100 | ngrams/sec 36119.7 | eta 0h0m15s
| epoch 82 | step 3500/4071 | loss 6.6919 | lr 0.00100 | ngrams/sec 36116.7 | eta 0h0m8s
| epoch 82 | step 4000/4071 | loss 6.7015 | lr 0.00100 | ngrams/sec 36107.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1152.48it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 59.16s | valid loss  6.18 | valid ppl   483.19
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/4071 | loss 6.5573 | lr 0.00100 | ngrams/sec 25765.4 | eta 0h1m10s
| epoch 83 | step 1000/4071 | loss 6.5619 | lr 0.00100 | ngrams/sec 36125.6 | eta 0h0m43s
| epoch 83 | step 1500/4071 | loss 6.6062 | lr 0.00100 | ngrams/sec 36148.7 | eta 0h0m36s
| epoch 83 | step 2000/4071 | loss 6.6410 | lr 0.00100 | ngrams/sec 36116.4 | eta 0h0m29s
| epoch 83 | step 2500/4071 | loss 6.6546 | lr 0.00100 | ngrams/sec 36093.3 | eta 0h0m22s
| epoch 83 | step 3000/4071 | loss 6.6687 | lr 0.00100 | ngrams/sec 36115.4 | eta 0h0m15s
| epoch 83 | step 3500/4071 | loss 6.6945 | lr 0.00100 | ngrams/sec 36102.5 | eta 0h0m8s
| epoch 83 | step 4000/4071 | loss 6.7167 | lr 0.00100 | ngrams/sec 36083.1 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1154.96it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 59.14s | valid loss  6.18 | valid ppl   483.20
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/4071 | loss 6.5502 | lr 0.00100 | ngrams/sec 25788.1 | eta 0h1m10s
| epoch 84 | step 1000/4071 | loss 6.5784 | lr 0.00100 | ngrams/sec 36119.2 | eta 0h0m43s
| epoch 84 | step 1500/4071 | loss 6.6083 | lr 0.00100 | ngrams/sec 36132.4 | eta 0h0m36s
| epoch 84 | step 2000/4071 | loss 6.6224 | lr 0.00100 | ngrams/sec 36134.8 | eta 0h0m29s
| epoch 84 | step 2500/4071 | loss 6.6474 | lr 0.00100 | ngrams/sec 36151.1 | eta 0h0m22s
| epoch 84 | step 3000/4071 | loss 6.6627 | lr 0.00100 | ngrams/sec 36099.2 | eta 0h0m15s
| epoch 84 | step 3500/4071 | loss 6.6835 | lr 0.00100 | ngrams/sec 36116.4 | eta 0h0m8s
| epoch 84 | step 4000/4071 | loss 6.6968 | lr 0.00100 | ngrams/sec 36108.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1181.90it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.30it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 59.12s | valid loss  6.18 | valid ppl   482.56
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/4071 | loss 6.5403 | lr 0.00100 | ngrams/sec 25745.0 | eta 0h1m11s
| epoch 85 | step 1000/4071 | loss 6.5854 | lr 0.00100 | ngrams/sec 36088.9 | eta 0h0m43s
| epoch 85 | step 1500/4071 | loss 6.5982 | lr 0.00100 | ngrams/sec 36151.9 | eta 0h0m36s
| epoch 85 | step 2000/4071 | loss 6.6336 | lr 0.00100 | ngrams/sec 36156.3 | eta 0h0m29s
| epoch 85 | step 2500/4071 | loss 6.6430 | lr 0.00100 | ngrams/sec 36155.6 | eta 0h0m22s
| epoch 85 | step 3000/4071 | loss 6.6638 | lr 0.00100 | ngrams/sec 36160.8 | eta 0h0m15s
| epoch 85 | step 3500/4071 | loss 6.6722 | lr 0.00100 | ngrams/sec 36140.1 | eta 0h0m8s
| epoch 85 | step 4000/4071 | loss 6.6930 | lr 0.00100 | ngrams/sec 36118.2 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1171.03it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.62it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 59.13s | valid loss  6.18 | valid ppl   483.75
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/4071 | loss 6.5151 | lr 0.00100 | ngrams/sec 25777.6 | eta 0h1m10s
| epoch 86 | step 1000/4071 | loss 6.5887 | lr 0.00100 | ngrams/sec 36122.3 | eta 0h0m43s
| epoch 86 | step 1500/4071 | loss 6.6037 | lr 0.00100 | ngrams/sec 36117.4 | eta 0h0m36s
| epoch 86 | step 2000/4071 | loss 6.6217 | lr 0.00100 | ngrams/sec 36131.9 | eta 0h0m29s
| epoch 86 | step 2500/4071 | loss 6.6504 | lr 0.00100 | ngrams/sec 36051.9 | eta 0h0m22s
| epoch 86 | step 3000/4071 | loss 6.6650 | lr 0.00100 | ngrams/sec 36119.7 | eta 0h0m15s
| epoch 86 | step 3500/4071 | loss 6.6744 | lr 0.00100 | ngrams/sec 36107.4 | eta 0h0m8s
| epoch 86 | step 4000/4071 | loss 6.6786 | lr 0.00100 | ngrams/sec 36140.2 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1157.59it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 59.14s | valid loss  6.18 | valid ppl   485.20
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/4071 | loss 6.5361 | lr 0.00100 | ngrams/sec 25802.0 | eta 0h1m10s
| epoch 87 | step 1000/4071 | loss 6.5477 | lr 0.00100 | ngrams/sec 36141.4 | eta 0h0m43s
| epoch 87 | step 1500/4071 | loss 6.5968 | lr 0.00100 | ngrams/sec 36133.0 | eta 0h0m36s
| epoch 87 | step 2000/4071 | loss 6.6130 | lr 0.00100 | ngrams/sec 36170.0 | eta 0h0m29s
| epoch 87 | step 2500/4071 | loss 6.6383 | lr 0.00100 | ngrams/sec 36136.6 | eta 0h0m22s
| epoch 87 | step 3000/4071 | loss 6.6551 | lr 0.00100 | ngrams/sec 36172.1 | eta 0h0m15s
| epoch 87 | step 3500/4071 | loss 6.6695 | lr 0.00100 | ngrams/sec 36178.8 | eta 0h0m8s
| epoch 87 | step 4000/4071 | loss 6.6748 | lr 0.00100 | ngrams/sec 36178.0 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1186.23it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 59.08s | valid loss  6.20 | valid ppl   491.38
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/4071 | loss 6.5254 | lr 0.00100 | ngrams/sec 25779.6 | eta 0h1m10s
| epoch 88 | step 1000/4071 | loss 6.5490 | lr 0.00100 | ngrams/sec 36105.5 | eta 0h0m43s
| epoch 88 | step 1500/4071 | loss 6.6004 | lr 0.00100 | ngrams/sec 36062.1 | eta 0h0m36s
| epoch 88 | step 2000/4071 | loss 6.6045 | lr 0.00100 | ngrams/sec 36186.5 | eta 0h0m29s
| epoch 88 | step 2500/4071 | loss 6.6205 | lr 0.00100 | ngrams/sec 36144.3 | eta 0h0m22s
| epoch 88 | step 3000/4071 | loss 6.6536 | lr 0.00100 | ngrams/sec 36175.4 | eta 0h0m15s
| epoch 88 | step 3500/4071 | loss 6.6699 | lr 0.00100 | ngrams/sec 36163.8 | eta 0h0m8s
| epoch 88 | step 4000/4071 | loss 6.6780 | lr 0.00100 | ngrams/sec 36128.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1185.75it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 59.10s | valid loss  6.19 | valid ppl   489.77
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/4071 | loss 6.5298 | lr 0.00100 | ngrams/sec 25791.8 | eta 0h1m10s
| epoch 89 | step 1000/4071 | loss 6.5538 | lr 0.00100 | ngrams/sec 36170.1 | eta 0h0m43s
| epoch 89 | step 1500/4071 | loss 6.5782 | lr 0.00100 | ngrams/sec 36154.5 | eta 0h0m36s
| epoch 89 | step 2000/4071 | loss 6.6020 | lr 0.00100 | ngrams/sec 36160.1 | eta 0h0m29s
| epoch 89 | step 2500/4071 | loss 6.6272 | lr 0.00100 | ngrams/sec 36199.6 | eta 0h0m22s
| epoch 89 | step 3000/4071 | loss 6.6425 | lr 0.00100 | ngrams/sec 36221.3 | eta 0h0m15s
| epoch 89 | step 3500/4071 | loss 6.6498 | lr 0.00100 | ngrams/sec 36207.5 | eta 0h0m8s
| epoch 89 | step 4000/4071 | loss 6.6744 | lr 0.00100 | ngrams/sec 36188.7 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1184.63it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 298.11it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 59.03s | valid loss  6.20 | valid ppl   492.06
-----------------------------------------------------------------------------------------
| epoch 90 | step 500/4071 | loss 6.5180 | lr 0.00100 | ngrams/sec 25893.1 | eta 0h1m10s
| epoch 90 | step 1000/4071 | loss 6.5567 | lr 0.00100 | ngrams/sec 36114.6 | eta 0h0m43s
| epoch 90 | step 1500/4071 | loss 6.5883 | lr 0.00100 | ngrams/sec 35943.1 | eta 0h0m36s
| epoch 90 | step 2000/4071 | loss 6.5865 | lr 0.00100 | ngrams/sec 35964.9 | eta 0h0m29s
| epoch 90 | step 2500/4071 | loss 6.6329 | lr 0.00100 | ngrams/sec 36000.3 | eta 0h0m22s
| epoch 90 | step 3000/4071 | loss 6.6364 | lr 0.00100 | ngrams/sec 35961.2 | eta 0h0m15s
| epoch 90 | step 3500/4071 | loss 6.6572 | lr 0.00100 | ngrams/sec 35974.8 | eta 0h0m8s
| epoch 90 | step 4000/4071 | loss 6.6667 | lr 0.00100 | ngrams/sec 36050.3 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1153.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 59.27s | valid loss  6.21 | valid ppl   496.33
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/4071 | loss 6.5122 | lr 0.00100 | ngrams/sec 25763.7 | eta 0h1m10s
| epoch 91 | step 1000/4071 | loss 6.5596 | lr 0.00100 | ngrams/sec 36174.0 | eta 0h0m43s
| epoch 91 | step 1500/4071 | loss 6.5957 | lr 0.00100 | ngrams/sec 36172.3 | eta 0h0m36s
| epoch 91 | step 2000/4071 | loss 6.5940 | lr 0.00100 | ngrams/sec 36155.4 | eta 0h0m29s
| epoch 91 | step 2500/4071 | loss 6.6112 | lr 0.00100 | ngrams/sec 36161.0 | eta 0h0m22s
| epoch 91 | step 3000/4071 | loss 6.6207 | lr 0.00100 | ngrams/sec 36114.5 | eta 0h0m15s
| epoch 91 | step 3500/4071 | loss 6.6526 | lr 0.00100 | ngrams/sec 36086.2 | eta 0h0m8s
| epoch 91 | step 4000/4071 | loss 6.6732 | lr 0.00100 | ngrams/sec 36113.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1157.91it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 59.13s | valid loss  6.21 | valid ppl   500.15
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/4071 | loss 6.5100 | lr 0.00100 | ngrams/sec 25727.2 | eta 0h1m11s
| epoch 92 | step 1000/4071 | loss 6.5512 | lr 0.00100 | ngrams/sec 36069.6 | eta 0h0m43s
| epoch 92 | step 1500/4071 | loss 6.5644 | lr 0.00100 | ngrams/sec 36080.1 | eta 0h0m36s
| epoch 92 | step 2000/4071 | loss 6.5830 | lr 0.00100 | ngrams/sec 36115.2 | eta 0h0m29s
| epoch 92 | step 2500/4071 | loss 6.6122 | lr 0.00100 | ngrams/sec 36137.9 | eta 0h0m22s
| epoch 92 | step 3000/4071 | loss 6.6198 | lr 0.00100 | ngrams/sec 36167.9 | eta 0h0m15s
| epoch 92 | step 3500/4071 | loss 6.6345 | lr 0.00100 | ngrams/sec 36152.1 | eta 0h0m8s
| epoch 92 | step 4000/4071 | loss 6.6559 | lr 0.00100 | ngrams/sec 36156.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1157.52it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 293.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 59.14s | valid loss  6.21 | valid ppl   498.16
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/4071 | loss 6.5076 | lr 0.00100 | ngrams/sec 25769.3 | eta 0h1m10s
| epoch 93 | step 1000/4071 | loss 6.5451 | lr 0.00100 | ngrams/sec 36099.9 | eta 0h0m43s
| epoch 93 | step 1500/4071 | loss 6.5638 | lr 0.00100 | ngrams/sec 36154.7 | eta 0h0m36s
| epoch 93 | step 2000/4071 | loss 6.5900 | lr 0.00100 | ngrams/sec 36166.3 | eta 0h0m29s
| epoch 93 | step 2500/4071 | loss 6.6151 | lr 0.00100 | ngrams/sec 36141.6 | eta 0h0m22s
| epoch 93 | step 3000/4071 | loss 6.6192 | lr 0.00100 | ngrams/sec 36177.7 | eta 0h0m15s
| epoch 93 | step 3500/4071 | loss 6.6383 | lr 0.00100 | ngrams/sec 36153.2 | eta 0h0m8s
| epoch 93 | step 4000/4071 | loss 6.6488 | lr 0.00100 | ngrams/sec 36143.8 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1166.14it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 59.09s | valid loss  6.22 | valid ppl   502.08
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/4071 | loss 6.4950 | lr 0.00100 | ngrams/sec 25804.0 | eta 0h1m10s
| epoch 94 | step 1000/4071 | loss 6.5324 | lr 0.00100 | ngrams/sec 36153.4 | eta 0h0m43s
| epoch 94 | step 1500/4071 | loss 6.5715 | lr 0.00100 | ngrams/sec 36186.0 | eta 0h0m36s
| epoch 94 | step 2000/4071 | loss 6.5947 | lr 0.00100 | ngrams/sec 36212.4 | eta 0h0m29s
| epoch 94 | step 2500/4071 | loss 6.5942 | lr 0.00100 | ngrams/sec 36239.3 | eta 0h0m22s
| epoch 94 | step 3000/4071 | loss 6.6198 | lr 0.00100 | ngrams/sec 36221.0 | eta 0h0m15s
| epoch 94 | step 3500/4071 | loss 6.6384 | lr 0.00100 | ngrams/sec 36187.9 | eta 0h0m8s
| epoch 94 | step 4000/4071 | loss 6.6639 | lr 0.00100 | ngrams/sec 36113.5 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1168.32it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 292.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 59.05s | valid loss  6.22 | valid ppl   501.07
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/4071 | loss 6.5027 | lr 0.00100 | ngrams/sec 25709.2 | eta 0h1m11s
| epoch 95 | step 1000/4071 | loss 6.5371 | lr 0.00100 | ngrams/sec 35923.2 | eta 0h0m43s
| epoch 95 | step 1500/4071 | loss 6.5606 | lr 0.00100 | ngrams/sec 35442.2 | eta 0h0m37s
| epoch 95 | step 2000/4071 | loss 6.5959 | lr 0.00100 | ngrams/sec 36123.0 | eta 0h0m29s
| epoch 95 | step 2500/4071 | loss 6.6036 | lr 0.00100 | ngrams/sec 36134.6 | eta 0h0m22s
| epoch 95 | step 3000/4071 | loss 6.6245 | lr 0.00100 | ngrams/sec 36204.3 | eta 0h0m15s
| epoch 95 | step 3500/4071 | loss 6.6298 | lr 0.00100 | ngrams/sec 36241.0 | eta 0h0m8s
| epoch 95 | step 4000/4071 | loss 6.6631 | lr 0.00100 | ngrams/sec 36209.7 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1156.16it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.73it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 59.25s | valid loss  6.22 | valid ppl   503.20
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/4071 | loss 6.5133 | lr 0.00100 | ngrams/sec 25852.1 | eta 0h1m10s
| epoch 96 | step 1000/4071 | loss 6.5227 | lr 0.00100 | ngrams/sec 36170.0 | eta 0h0m43s
| epoch 96 | step 1500/4071 | loss 6.5577 | lr 0.00100 | ngrams/sec 36174.5 | eta 0h0m36s
| epoch 96 | step 2000/4071 | loss 6.5890 | lr 0.00100 | ngrams/sec 36112.9 | eta 0h0m29s
| epoch 96 | step 2500/4071 | loss 6.5999 | lr 0.00100 | ngrams/sec 36128.7 | eta 0h0m22s
| epoch 96 | step 3000/4071 | loss 6.6138 | lr 0.00100 | ngrams/sec 36136.5 | eta 0h0m15s
| epoch 96 | step 3500/4071 | loss 6.6428 | lr 0.00100 | ngrams/sec 36161.5 | eta 0h0m8s
| epoch 96 | step 4000/4071 | loss 6.6484 | lr 0.00100 | ngrams/sec 36121.3 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1177.66it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 294.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 59.09s | valid loss  6.22 | valid ppl   504.82
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/4071 | loss 6.4985 | lr 0.00100 | ngrams/sec 25785.2 | eta 0h1m10s
| epoch 97 | step 1000/4071 | loss 6.5275 | lr 0.00100 | ngrams/sec 36117.8 | eta 0h0m43s
| epoch 97 | step 1500/4071 | loss 6.5473 | lr 0.00100 | ngrams/sec 36117.3 | eta 0h0m36s
| epoch 97 | step 2000/4071 | loss 6.5654 | lr 0.00100 | ngrams/sec 36167.2 | eta 0h0m29s
| epoch 97 | step 2500/4071 | loss 6.6022 | lr 0.00100 | ngrams/sec 36180.4 | eta 0h0m22s
| epoch 97 | step 3000/4071 | loss 6.6058 | lr 0.00100 | ngrams/sec 36190.9 | eta 0h0m15s
| epoch 97 | step 3500/4071 | loss 6.6212 | lr 0.00100 | ngrams/sec 36175.7 | eta 0h0m8s
| epoch 97 | step 4000/4071 | loss 6.6430 | lr 0.00100 | ngrams/sec 36207.9 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1167.06it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 295.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 59.06s | valid loss  6.24 | valid ppl   511.98
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/4071 | loss 6.4912 | lr 0.00100 | ngrams/sec 25852.0 | eta 0h1m10s
| epoch 98 | step 1000/4071 | loss 6.5208 | lr 0.00100 | ngrams/sec 36199.1 | eta 0h0m43s
| epoch 98 | step 1500/4071 | loss 6.5542 | lr 0.00100 | ngrams/sec 36201.5 | eta 0h0m36s
| epoch 98 | step 2000/4071 | loss 6.5731 | lr 0.00100 | ngrams/sec 36184.5 | eta 0h0m29s
| epoch 98 | step 2500/4071 | loss 6.5977 | lr 0.00100 | ngrams/sec 36165.0 | eta 0h0m22s
| epoch 98 | step 3000/4071 | loss 6.6034 | lr 0.00100 | ngrams/sec 36168.2 | eta 0h0m15s
| epoch 98 | step 3500/4071 | loss 6.6179 | lr 0.00100 | ngrams/sec 36192.0 | eta 0h0m8s
| epoch 98 | step 4000/4071 | loss 6.6475 | lr 0.00100 | ngrams/sec 36184.6 | eta 0h0m1s


 29%|██▊       | 119/417 [00:00<00:00, 1181.05it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.24it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 59.01s | valid loss  6.24 | valid ppl   513.47
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/4071 | loss 6.4856 | lr 0.00100 | ngrams/sec 25894.9 | eta 0h1m10s
| epoch 99 | step 1000/4071 | loss 6.5169 | lr 0.00100 | ngrams/sec 36174.1 | eta 0h0m43s
| epoch 99 | step 1500/4071 | loss 6.5475 | lr 0.00100 | ngrams/sec 36140.6 | eta 0h0m36s
| epoch 99 | step 2000/4071 | loss 6.5645 | lr 0.00100 | ngrams/sec 36199.3 | eta 0h0m29s
| epoch 99 | step 2500/4071 | loss 6.5796 | lr 0.00100 | ngrams/sec 36190.4 | eta 0h0m22s
| epoch 99 | step 3000/4071 | loss 6.5870 | lr 0.00100 | ngrams/sec 36183.4 | eta 0h0m15s
| epoch 99 | step 3500/4071 | loss 6.6070 | lr 0.00100 | ngrams/sec 36181.6 | eta 0h0m8s
| epoch 99 | step 4000/4071 | loss 6.6351 | lr 0.00100 | ngrams/sec 36178.9 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1161.57it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 297.72it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 59.02s | valid loss  6.24 | valid ppl   512.86
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/4071 | loss 6.4756 | lr 0.00100 | ngrams/sec 25844.1 | eta 0h1m10s
| epoch 100 | step 1000/4071 | loss 6.5148 | lr 0.00100 | ngrams/sec 36187.7 | eta 0h0m43s
| epoch 100 | step 1500/4071 | loss 6.5470 | lr 0.00100 | ngrams/sec 36204.0 | eta 0h0m36s
| epoch 100 | step 2000/4071 | loss 6.5746 | lr 0.00100 | ngrams/sec 36211.5 | eta 0h0m29s
| epoch 100 | step 2500/4071 | loss 6.5970 | lr 0.00100 | ngrams/sec 36190.5 | eta 0h0m22s
| epoch 100 | step 3000/4071 | loss 6.6017 | lr 0.00100 | ngrams/sec 36189.8 | eta 0h0m15s
| epoch 100 | step 3500/4071 | loss 6.6156 | lr 0.00100 | ngrams/sec 36114.8 | eta 0h0m8s
| epoch 100 | step 4000/4071 | loss 6.6382 | lr 0.00100 | ngrams/sec 36144.6 | eta 0h0m1s


 29%|██▉       | 120/417 [00:00<00:00, 1165.33it/s]

Evaluating on validation set...


100%|██████████| 417/417 [00:01<00:00, 296.13it/s]


-----------------------------------------------------------------------------------------


 25%|██▌       | 118/471 [00:00<00:00, 1141.23it/s]

| end of epoch 100 | time 59.04s | valid loss  6.25 | valid ppl   516.29
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 471/471 [00:01<00:00, 287.09it/s]


| End of training | test loss  6.19 | test ppl   488.31


In [None]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [None]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [None]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint.pth'
    outf = 'generated.txt'
    #words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 10 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

ntokens = n_class
input_idx = 104#torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
# input_idx = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 7), dtype=torch.long))).to(device)
input_words = [corpus.dictionary.idx2word[i] for i in train_data[input_idx:order+input_idx, 0]]
input = torch.tensor([i for i in train_data[input_idx:order+input_idx, 0]], dtype=torch.long).to(device)
print(input)
print(input_words)

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([27, 63, 64, 65, 66, 17, 67], device='cuda:0')
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation']


In [None]:
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        word = corpus.dictionary.idx2word[word_idx]
        print(word)

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        word_tensor = torch.tensor([word_idx]).to(device)
        input = torch.cat((input[1:], word_tensor), 0)
        # input.fill_(word_idx)
        input = input.cuda() if cuda else input
        # print(input)
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % args.log_interval == 0:
            print('| Generated {}/{} words'.format(i, args.words))

engine
| Generated 0/100 words
during
countries
1925
by
foreign
<unk>
special
experience
duke
representation
| Generated 10/100 words
who
participated
high
races
between
1908
detroit
and
jupiter
historical
| Generated 20/100 words
border
large
rows
into
very
well
latter
way
fish
exploring
| Generated 30/100 words
ceres
believed
donated
conjunction
road
parallel
blue
plasma
play
grossed
| Generated 40/100 words
capital
12
continued
independence
;
drama
measure
£
1
@,@
| Generated 50/100 words
due
to
southern
opening
9
enemy
21st
ran
cougar
ranges
| Generated 60/100 words
.
many
delayed
low
dining
julio
decade
to
begin
on
| Generated 70/100 words
september
leaving
much
invitation
.
<eos>
weather
planned
keyboards
1902
| Generated 80/100 words
<unk>
=
formed
this
personality
continued
population
semi
performing
16
| Generated 90/100 words
million
78
epic
becoming
generally
open
,
which
drew


In [None]:
!cp "generated.txt" "gdrive/MyDrive/generated.txt"

In [None]:
drive.flush_and_unmount()