In [34]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [35]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [36]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [37]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 1000
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [59]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5, tie_weight=True):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)
        if tie_weight:
            self.linear2.weight = self.embeddings.weight

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [60]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [61]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [62]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden,
    )
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,000 tokens
Size of validation set: 217,000 tokens
Size of test set: 245,000 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [63]:
!nvidia-smi

Thu Nov 26 15:33:33 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    32W /  70W |   1739MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [64]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            print('| saving current state of model ...')
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        elif val_loss < best_val_loss and val_loss < losses['val'][-2] and val_loss < torch.mean(torch.stack(losses['val'])): # curr loss less than best loss and previous loss
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/2080 | loss 18.8537 | lr 0.00100 | ngrams/sec 48567.3 | eta 0h0m32s
| epoch 1 | step 1000/2080 | loss 10.6281 | lr 0.00100 | ngrams/sec 47753.4 | eta 0h0m22s
| epoch 1 | step 1500/2080 | loss 9.9292 | lr 0.00100 | ngrams/sec 46505.4 | eta 0h0m12s
| epoch 1 | step 2000/2080 | loss 10.2321 | lr 0.00100 | ngrams/sec 45139.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1019.89it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 261.61it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 45.20s | valid loss  7.57 | valid ppl  1945.16
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 2 | step 500/2080 | loss 10.1349 | lr 0.00100 | ngrams/sec 34713.5 | eta 0h0m45s
| epoch 2 | step 1000/2080 | loss 10.0127 | lr 0.00100 | ngrams/sec 46215.2 | eta 0h0m23s
| epoch 2 | step 1500/2080 | loss 9.8654 | lr 0.00100 | ngrams/sec 46871.9 | eta 0h0m12s
| epoch 2 | step 2000/2080 | loss 9.7366 | lr 0.00100 | ngrams/sec 47140.7 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1052.07it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 278.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 45.64s | valid loss  7.28 | valid ppl  1453.92
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 3 | step 500/2080 | loss 9.5826 | lr 0.00100 | ngrams/sec 36361.1 | eta 0h0m43s
| epoch 3 | step 1000/2080 | loss 9.4951 | lr 0.00100 | ngrams/sec 46588.0 | eta 0h0m23s
| epoch 3 | step 1500/2080 | loss 9.3884 | lr 0.00100 | ngrams/sec 46107.5 | eta 0h0m12s
| epoch 3 | step 2000/2080 | loss 9.3081 | lr 0.00100 | ngrams/sec 46083.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1046.93it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 45.57s | valid loss  7.11 | valid ppl  1220.15
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 4 | step 500/2080 | loss 9.1959 | lr 0.00100 | ngrams/sec 35782.2 | eta 0h0m44s
| epoch 4 | step 1000/2080 | loss 9.1385 | lr 0.00100 | ngrams/sec 46409.7 | eta 0h0m23s
| epoch 4 | step 1500/2080 | loss 9.0745 | lr 0.00100 | ngrams/sec 46447.7 | eta 0h0m12s
| epoch 4 | step 2000/2080 | loss 9.0104 | lr 0.00100 | ngrams/sec 46413.6 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1032.34it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 45.60s | valid loss  6.98 | valid ppl  1071.92
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 5 | step 500/2080 | loss 8.9277 | lr 0.00100 | ngrams/sec 35883.8 | eta 0h0m44s
| epoch 5 | step 1000/2080 | loss 8.8851 | lr 0.00100 | ngrams/sec 46422.1 | eta 0h0m23s
| epoch 5 | step 1500/2080 | loss 8.8418 | lr 0.00100 | ngrams/sec 46436.7 | eta 0h0m12s
| epoch 5 | step 2000/2080 | loss 8.7911 | lr 0.00100 | ngrams/sec 46439.6 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1043.63it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 45.56s | valid loss  6.87 | valid ppl   962.14
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 6 | step 500/2080 | loss 8.7162 | lr 0.00100 | ngrams/sec 35871.9 | eta 0h0m44s
| epoch 6 | step 1000/2080 | loss 8.6892 | lr 0.00100 | ngrams/sec 46472.6 | eta 0h0m23s
| epoch 6 | step 1500/2080 | loss 8.6694 | lr 0.00100 | ngrams/sec 46506.8 | eta 0h0m12s
| epoch 6 | step 2000/2080 | loss 8.6436 | lr 0.00100 | ngrams/sec 46443.0 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1051.16it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 45.54s | valid loss  6.80 | valid ppl   895.56
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 7 | step 500/2080 | loss 8.5616 | lr 0.00100 | ngrams/sec 35840.1 | eta 0h0m44s
| epoch 7 | step 1000/2080 | loss 8.5520 | lr 0.00100 | ngrams/sec 46354.4 | eta 0h0m23s
| epoch 7 | step 1500/2080 | loss 8.5465 | lr 0.00100 | ngrams/sec 46370.4 | eta 0h0m12s
| epoch 7 | step 2000/2080 | loss 8.4996 | lr 0.00100 | ngrams/sec 46296.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1055.07it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 269.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 45.66s | valid loss  6.81 | valid ppl   907.51
-----------------------------------------------------------------------------------------
| epoch 8 | step 500/2080 | loss 8.4333 | lr 0.00100 | ngrams/sec 35905.2 | eta 0h0m44s
| epoch 8 | step 1000/2080 | loss 8.4351 | lr 0.00100 | ngrams/sec 46291.7 | eta 0h0m23s
| epoch 8 | step 1500/2080 | loss 8.4015 | lr 0.00100 | ngrams/sec 46284.3 | eta 0h0m12s
| epoch 8 | step 2000/2080 | loss 8.3687 | lr 0.00100 | ngrams/sec 46255.1 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1032.95it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 45.73s | valid loss  6.73 | valid ppl   837.92
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 9 | step 500/2080 | loss 8.3072 | lr 0.00100 | ngrams/sec 35746.2 | eta 0h0m44s
| epoch 9 | step 1000/2080 | loss 8.2971 | lr 0.00100 | ngrams/sec 46233.7 | eta 0h0m23s
| epoch 9 | step 1500/2080 | loss 8.2827 | lr 0.00100 | ngrams/sec 46636.5 | eta 0h0m12s
| epoch 9 | step 2000/2080 | loss 8.2498 | lr 0.00100 | ngrams/sec 46760.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1059.18it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 45.52s | valid loss  6.83 | valid ppl   927.40
-----------------------------------------------------------------------------------------
| epoch 10 | step 500/2080 | loss 8.1958 | lr 0.00100 | ngrams/sec 36260.6 | eta 0h0m43s
| epoch 10 | step 1000/2080 | loss 8.2052 | lr 0.00100 | ngrams/sec 46804.5 | eta 0h0m23s
| epoch 10 | step 1500/2080 | loss 8.2014 | lr 0.00100 | ngrams/sec 46839.5 | eta 0h0m12s
| epoch 10 | step 2000/2080 | loss 8.1978 | lr 0.00100 | ngrams/sec 46909.7 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1033.88it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 45.18s | valid loss  6.82 | valid ppl   913.43
-----------------------------------------------------------------------------------------
| epoch 11 | step 500/2080 | loss 8.1543 | lr 0.00100 | ngrams/sec 36237.4 | eta 0h0m43s
| epoch 11 | step 1000/2080 | loss 8.1527 | lr 0.00100 | ngrams/sec 46896.3 | eta 0h0m23s
| epoch 11 | step 1500/2080 | loss 8.1631 | lr 0.00100 | ngrams/sec 46881.5 | eta 0h0m12s
| epoch 11 | step 2000/2080 | loss 8.1651 | lr 0.00100 | ngrams/sec 46916.4 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1031.77it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 45.17s | valid loss  6.80 | valid ppl   894.89
-----------------------------------------------------------------------------------------
| epoch 12 | step 500/2080 | loss 8.1242 | lr 0.00100 | ngrams/sec 36426.6 | eta 0h0m43s
| epoch 12 | step 1000/2080 | loss 8.1359 | lr 0.00100 | ngrams/sec 46968.2 | eta 0h0m22s
| epoch 12 | step 1500/2080 | loss 8.1462 | lr 0.00100 | ngrams/sec 46881.1 | eta 0h0m12s
| epoch 12 | step 2000/2080 | loss 8.1462 | lr 0.00100 | ngrams/sec 46993.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1083.48it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 45.06s | valid loss  6.76 | valid ppl   859.66
-----------------------------------------------------------------------------------------
| epoch 13 | step 500/2080 | loss 8.0996 | lr 0.00100 | ngrams/sec 36449.6 | eta 0h0m43s
| epoch 13 | step 1000/2080 | loss 8.1292 | lr 0.00100 | ngrams/sec 47019.9 | eta 0h0m22s
| epoch 13 | step 1500/2080 | loss 8.1165 | lr 0.00100 | ngrams/sec 46922.2 | eta 0h0m12s
| epoch 13 | step 2000/2080 | loss 8.1150 | lr 0.00100 | ngrams/sec 47036.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1083.16it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.18it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 45.03s | valid loss  6.72 | valid ppl   828.74
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 14 | step 500/2080 | loss 8.0854 | lr 0.00100 | ngrams/sec 36325.8 | eta 0h0m43s
| epoch 14 | step 1000/2080 | loss 8.0975 | lr 0.00100 | ngrams/sec 46901.7 | eta 0h0m23s
| epoch 14 | step 1500/2080 | loss 8.1025 | lr 0.00100 | ngrams/sec 46901.2 | eta 0h0m12s
| epoch 14 | step 2000/2080 | loss 8.1132 | lr 0.00100 | ngrams/sec 46947.9 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1051.85it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 275.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 45.07s | valid loss  6.69 | valid ppl   807.10
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 15 | step 500/2080 | loss 8.0624 | lr 0.00100 | ngrams/sec 36298.3 | eta 0h0m43s
| epoch 15 | step 1000/2080 | loss 8.0798 | lr 0.00100 | ngrams/sec 46964.2 | eta 0h0m22s
| epoch 15 | step 1500/2080 | loss 8.0823 | lr 0.00100 | ngrams/sec 46932.6 | eta 0h0m12s
| epoch 15 | step 2000/2080 | loss 8.0847 | lr 0.00100 | ngrams/sec 46949.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1055.02it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 45.07s | valid loss  6.66 | valid ppl   778.48
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 16 | step 500/2080 | loss 8.0424 | lr 0.00100 | ngrams/sec 36242.2 | eta 0h0m43s
| epoch 16 | step 1000/2080 | loss 8.0682 | lr 0.00100 | ngrams/sec 46872.6 | eta 0h0m23s
| epoch 16 | step 1500/2080 | loss 8.0654 | lr 0.00100 | ngrams/sec 46897.8 | eta 0h0m12s
| epoch 16 | step 2000/2080 | loss 8.0724 | lr 0.00100 | ngrams/sec 46913.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1086.40it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.43it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 45.10s | valid loss  6.63 | valid ppl   756.34
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 17 | step 500/2080 | loss 8.0247 | lr 0.00100 | ngrams/sec 36276.5 | eta 0h0m43s
| epoch 17 | step 1000/2080 | loss 8.0554 | lr 0.00100 | ngrams/sec 46907.4 | eta 0h0m23s
| epoch 17 | step 1500/2080 | loss 8.0546 | lr 0.00100 | ngrams/sec 46860.8 | eta 0h0m12s
| epoch 17 | step 2000/2080 | loss 8.0444 | lr 0.00100 | ngrams/sec 46885.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1064.48it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.63it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 45.12s | valid loss  6.60 | valid ppl   738.34
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 18 | step 500/2080 | loss 8.0063 | lr 0.00100 | ngrams/sec 36082.8 | eta 0h0m43s
| epoch 18 | step 1000/2080 | loss 8.0240 | lr 0.00100 | ngrams/sec 46851.6 | eta 0h0m23s
| epoch 18 | step 1500/2080 | loss 8.0385 | lr 0.00100 | ngrams/sec 46820.4 | eta 0h0m12s
| epoch 18 | step 2000/2080 | loss 8.0421 | lr 0.00100 | ngrams/sec 46814.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1067.29it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 45.20s | valid loss  6.58 | valid ppl   718.26
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 19 | step 500/2080 | loss 7.9874 | lr 0.00100 | ngrams/sec 36145.4 | eta 0h0m43s
| epoch 19 | step 1000/2080 | loss 8.0015 | lr 0.00100 | ngrams/sec 46794.3 | eta 0h0m23s
| epoch 19 | step 1500/2080 | loss 8.0171 | lr 0.00100 | ngrams/sec 46885.8 | eta 0h0m12s
| epoch 19 | step 2000/2080 | loss 8.0227 | lr 0.00100 | ngrams/sec 46846.9 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1037.43it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 45.17s | valid loss  6.60 | valid ppl   732.56
-----------------------------------------------------------------------------------------
| epoch 20 | step 500/2080 | loss 7.9654 | lr 0.00100 | ngrams/sec 36319.6 | eta 0h0m43s
| epoch 20 | step 1000/2080 | loss 7.9816 | lr 0.00100 | ngrams/sec 46790.3 | eta 0h0m23s
| epoch 20 | step 1500/2080 | loss 8.0059 | lr 0.00100 | ngrams/sec 46789.7 | eta 0h0m12s
| epoch 20 | step 2000/2080 | loss 8.0107 | lr 0.00100 | ngrams/sec 46762.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1076.63it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 45.23s | valid loss  6.57 | valid ppl   714.17
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 21 | step 500/2080 | loss 7.9597 | lr 0.00100 | ngrams/sec 36174.3 | eta 0h0m43s
| epoch 21 | step 1000/2080 | loss 7.9814 | lr 0.00100 | ngrams/sec 46792.4 | eta 0h0m23s
| epoch 21 | step 1500/2080 | loss 7.9876 | lr 0.00100 | ngrams/sec 46770.7 | eta 0h0m12s
| epoch 21 | step 2000/2080 | loss 8.0005 | lr 0.00100 | ngrams/sec 46833.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1077.63it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 45.19s | valid loss  6.49 | valid ppl   660.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 22 | step 500/2080 | loss 7.9365 | lr 0.00100 | ngrams/sec 36200.2 | eta 0h0m43s
| epoch 22 | step 1000/2080 | loss 7.9691 | lr 0.00100 | ngrams/sec 46873.7 | eta 0h0m23s
| epoch 22 | step 1500/2080 | loss 7.9823 | lr 0.00100 | ngrams/sec 46840.2 | eta 0h0m12s
| epoch 22 | step 2000/2080 | loss 7.9772 | lr 0.00100 | ngrams/sec 46885.6 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1033.36it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.60it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 45.14s | valid loss  6.54 | valid ppl   693.86
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/2080 | loss 7.9386 | lr 0.00100 | ngrams/sec 36383.5 | eta 0h0m43s
| epoch 23 | step 1000/2080 | loss 7.9457 | lr 0.00100 | ngrams/sec 46891.1 | eta 0h0m23s
| epoch 23 | step 1500/2080 | loss 7.9564 | lr 0.00100 | ngrams/sec 46932.9 | eta 0h0m12s
| epoch 23 | step 2000/2080 | loss 7.9754 | lr 0.00100 | ngrams/sec 46918.7 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1033.23it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.00it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 45.11s | valid loss  6.47 | valid ppl   643.75
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 24 | step 500/2080 | loss 7.9097 | lr 0.00100 | ngrams/sec 36286.5 | eta 0h0m43s
| epoch 24 | step 1000/2080 | loss 7.9282 | lr 0.00100 | ngrams/sec 46903.9 | eta 0h0m23s
| epoch 24 | step 1500/2080 | loss 7.9475 | lr 0.00100 | ngrams/sec 46893.4 | eta 0h0m12s
| epoch 24 | step 2000/2080 | loss 7.9513 | lr 0.00100 | ngrams/sec 46919.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1078.44it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.01it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 45.08s | valid loss  6.50 | valid ppl   668.06
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/2080 | loss 7.8938 | lr 0.00100 | ngrams/sec 36464.3 | eta 0h0m43s
| epoch 25 | step 1000/2080 | loss 7.9101 | lr 0.00100 | ngrams/sec 46921.5 | eta 0h0m23s
| epoch 25 | step 1500/2080 | loss 7.9284 | lr 0.00100 | ngrams/sec 47032.6 | eta 0h0m12s
| epoch 25 | step 2000/2080 | loss 7.9448 | lr 0.00100 | ngrams/sec 47102.8 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1048.75it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.89it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 45.00s | valid loss  6.47 | valid ppl   643.60
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 26 | step 500/2080 | loss 7.8814 | lr 0.00100 | ngrams/sec 36369.3 | eta 0h0m43s
| epoch 26 | step 1000/2080 | loss 7.9111 | lr 0.00100 | ngrams/sec 47096.9 | eta 0h0m22s
| epoch 26 | step 1500/2080 | loss 7.9122 | lr 0.00100 | ngrams/sec 47070.9 | eta 0h0m12s
| epoch 26 | step 2000/2080 | loss 7.9162 | lr 0.00100 | ngrams/sec 47125.9 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1034.20it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 275.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 44.93s | valid loss  6.44 | valid ppl   628.97
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 27 | step 500/2080 | loss 7.8632 | lr 0.00100 | ngrams/sec 36365.4 | eta 0h0m43s
| epoch 27 | step 1000/2080 | loss 7.8869 | lr 0.00100 | ngrams/sec 47135.3 | eta 0h0m22s
| epoch 27 | step 1500/2080 | loss 7.8952 | lr 0.00100 | ngrams/sec 47153.1 | eta 0h0m12s
| epoch 27 | step 2000/2080 | loss 7.8990 | lr 0.00100 | ngrams/sec 47124.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1069.68it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 44.89s | valid loss  6.44 | valid ppl   623.49
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 28 | step 500/2080 | loss 7.8494 | lr 0.00100 | ngrams/sec 36494.4 | eta 0h0m43s
| epoch 28 | step 1000/2080 | loss 7.8735 | lr 0.00100 | ngrams/sec 47083.4 | eta 0h0m22s
| epoch 28 | step 1500/2080 | loss 7.8794 | lr 0.00100 | ngrams/sec 47150.2 | eta 0h0m12s
| epoch 28 | step 2000/2080 | loss 7.8882 | lr 0.00100 | ngrams/sec 47053.2 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1048.59it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 275.03it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 44.90s | valid loss  6.45 | valid ppl   633.96
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/2080 | loss 7.8396 | lr 0.00100 | ngrams/sec 36391.7 | eta 0h0m43s
| epoch 29 | step 1000/2080 | loss 7.8454 | lr 0.00100 | ngrams/sec 46943.8 | eta 0h0m23s
| epoch 29 | step 1500/2080 | loss 7.8669 | lr 0.00100 | ngrams/sec 46927.2 | eta 0h0m12s
| epoch 29 | step 2000/2080 | loss 7.8809 | lr 0.00100 | ngrams/sec 46903.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1048.42it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 45.11s | valid loss  6.41 | valid ppl   606.94
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 30 | step 500/2080 | loss 7.8214 | lr 0.00100 | ngrams/sec 36157.0 | eta 0h0m43s
| epoch 30 | step 1000/2080 | loss 7.8427 | lr 0.00100 | ngrams/sec 46832.3 | eta 0h0m23s
| epoch 30 | step 1500/2080 | loss 7.8570 | lr 0.00100 | ngrams/sec 46811.3 | eta 0h0m12s
| epoch 30 | step 2000/2080 | loss 7.8641 | lr 0.00100 | ngrams/sec 46778.9 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1035.87it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.05it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 45.20s | valid loss  6.39 | valid ppl   598.04
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 31 | step 500/2080 | loss 7.8121 | lr 0.00100 | ngrams/sec 36168.3 | eta 0h0m43s
| epoch 31 | step 1000/2080 | loss 7.8344 | lr 0.00100 | ngrams/sec 46756.5 | eta 0h0m23s
| epoch 31 | step 1500/2080 | loss 7.8523 | lr 0.00100 | ngrams/sec 46736.1 | eta 0h0m12s
| epoch 31 | step 2000/2080 | loss 7.8551 | lr 0.00100 | ngrams/sec 46702.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.12it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.68it/s]


-----------------------------------------------------------------------------------------
| end of epoch  31 | time 45.25s | valid loss  6.40 | valid ppl   601.90
-----------------------------------------------------------------------------------------
| epoch 32 | step 500/2080 | loss 7.7898 | lr 0.00100 | ngrams/sec 36273.6 | eta 0h0m43s
| epoch 32 | step 1000/2080 | loss 7.8282 | lr 0.00100 | ngrams/sec 46691.7 | eta 0h0m23s
| epoch 32 | step 1500/2080 | loss 7.8373 | lr 0.00100 | ngrams/sec 46732.1 | eta 0h0m12s
| epoch 32 | step 2000/2080 | loss 7.8461 | lr 0.00100 | ngrams/sec 46764.4 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1041.56it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  32 | time 45.27s | valid loss  6.38 | valid ppl   590.45
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 33 | step 500/2080 | loss 7.7773 | lr 0.00100 | ngrams/sec 36055.8 | eta 0h0m43s
| epoch 33 | step 1000/2080 | loss 7.8107 | lr 0.00100 | ngrams/sec 46707.9 | eta 0h0m23s
| epoch 33 | step 1500/2080 | loss 7.8185 | lr 0.00100 | ngrams/sec 46760.8 | eta 0h0m12s
| epoch 33 | step 2000/2080 | loss 7.8310 | lr 0.00100 | ngrams/sec 46791.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1062.30it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  33 | time 45.27s | valid loss  6.38 | valid ppl   592.15
-----------------------------------------------------------------------------------------
| epoch 34 | step 500/2080 | loss 7.7723 | lr 0.00100 | ngrams/sec 36316.5 | eta 0h0m43s
| epoch 34 | step 1000/2080 | loss 7.7947 | lr 0.00100 | ngrams/sec 46862.8 | eta 0h0m23s
| epoch 34 | step 1500/2080 | loss 7.8068 | lr 0.00100 | ngrams/sec 46915.5 | eta 0h0m12s
| epoch 34 | step 2000/2080 | loss 7.8151 | lr 0.00100 | ngrams/sec 46910.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1070.08it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch  34 | time 45.13s | valid loss  6.37 | valid ppl   584.07
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 35 | step 500/2080 | loss 7.7489 | lr 0.00100 | ngrams/sec 36268.9 | eta 0h0m43s
| epoch 35 | step 1000/2080 | loss 7.7802 | lr 0.00100 | ngrams/sec 46973.5 | eta 0h0m22s
| epoch 35 | step 1500/2080 | loss 7.7929 | lr 0.00100 | ngrams/sec 46992.9 | eta 0h0m12s
| epoch 35 | step 2000/2080 | loss 7.8137 | lr 0.00100 | ngrams/sec 46997.0 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1044.97it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 275.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  35 | time 45.03s | valid loss  6.34 | valid ppl   567.33
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 36 | step 500/2080 | loss 7.7538 | lr 0.00100 | ngrams/sec 36323.0 | eta 0h0m43s
| epoch 36 | step 1000/2080 | loss 7.7739 | lr 0.00100 | ngrams/sec 47072.8 | eta 0h0m22s
| epoch 36 | step 1500/2080 | loss 7.7844 | lr 0.00100 | ngrams/sec 47089.6 | eta 0h0m12s
| epoch 36 | step 2000/2080 | loss 7.7951 | lr 0.00100 | ngrams/sec 46988.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1064.57it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.11it/s]


-----------------------------------------------------------------------------------------
| end of epoch  36 | time 44.97s | valid loss  6.32 | valid ppl   553.03
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 37 | step 500/2080 | loss 7.7280 | lr 0.00100 | ngrams/sec 36329.9 | eta 0h0m43s
| epoch 37 | step 1000/2080 | loss 7.7687 | lr 0.00100 | ngrams/sec 47025.0 | eta 0h0m22s
| epoch 37 | step 1500/2080 | loss 7.7736 | lr 0.00100 | ngrams/sec 47080.0 | eta 0h0m12s
| epoch 37 | step 2000/2080 | loss 7.7838 | lr 0.00100 | ngrams/sec 47060.6 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1032.33it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  37 | time 44.96s | valid loss  6.34 | valid ppl   564.24
-----------------------------------------------------------------------------------------
| epoch 38 | step 500/2080 | loss 7.7182 | lr 0.00100 | ngrams/sec 36554.7 | eta 0h0m43s
| epoch 38 | step 1000/2080 | loss 7.7454 | lr 0.00100 | ngrams/sec 47122.5 | eta 0h0m22s
| epoch 38 | step 1500/2080 | loss 7.7627 | lr 0.00100 | ngrams/sec 47153.9 | eta 0h0m12s
| epoch 38 | step 2000/2080 | loss 7.7606 | lr 0.00100 | ngrams/sec 47125.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1086.23it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 275.64it/s]


-----------------------------------------------------------------------------------------
| end of epoch  38 | time 44.90s | valid loss  6.32 | valid ppl   557.48
-----------------------------------------------------------------------------------------
| epoch 39 | step 500/2080 | loss 7.7005 | lr 0.00100 | ngrams/sec 36551.4 | eta 0h0m43s
| epoch 39 | step 1000/2080 | loss 7.7430 | lr 0.00100 | ngrams/sec 47138.3 | eta 0h0m22s
| epoch 39 | step 1500/2080 | loss 7.7524 | lr 0.00100 | ngrams/sec 47093.9 | eta 0h0m12s
| epoch 39 | step 2000/2080 | loss 7.7636 | lr 0.00100 | ngrams/sec 47120.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1060.57it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch  39 | time 44.90s | valid loss  6.32 | valid ppl   557.77
-----------------------------------------------------------------------------------------
| epoch 40 | step 500/2080 | loss 7.6939 | lr 0.00100 | ngrams/sec 36501.0 | eta 0h0m43s
| epoch 40 | step 1000/2080 | loss 7.7088 | lr 0.00100 | ngrams/sec 47174.5 | eta 0h0m22s
| epoch 40 | step 1500/2080 | loss 7.7326 | lr 0.00100 | ngrams/sec 47136.4 | eta 0h0m12s
| epoch 40 | step 2000/2080 | loss 7.7346 | lr 0.00100 | ngrams/sec 47058.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.85it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  40 | time 44.93s | valid loss  6.30 | valid ppl   544.68
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 41 | step 500/2080 | loss 7.6740 | lr 0.00100 | ngrams/sec 36314.1 | eta 0h0m43s
| epoch 41 | step 1000/2080 | loss 7.6987 | lr 0.00100 | ngrams/sec 46917.0 | eta 0h0m23s
| epoch 41 | step 1500/2080 | loss 7.7230 | lr 0.00100 | ngrams/sec 46943.0 | eta 0h0m12s
| epoch 41 | step 2000/2080 | loss 7.7456 | lr 0.00100 | ngrams/sec 46881.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1085.08it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.97it/s]


-----------------------------------------------------------------------------------------
| end of epoch  41 | time 45.07s | valid loss  6.29 | valid ppl   541.81
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 42 | step 500/2080 | loss 7.6780 | lr 0.00100 | ngrams/sec 36255.4 | eta 0h0m43s
| epoch 42 | step 1000/2080 | loss 7.7039 | lr 0.00100 | ngrams/sec 46849.2 | eta 0h0m23s
| epoch 42 | step 1500/2080 | loss 7.7065 | lr 0.00100 | ngrams/sec 46657.3 | eta 0h0m12s
| epoch 42 | step 2000/2080 | loss 7.7240 | lr 0.00100 | ngrams/sec 46808.5 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1030.76it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.44it/s]


-----------------------------------------------------------------------------------------
| end of epoch  42 | time 45.21s | valid loss  6.29 | valid ppl   536.65
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 43 | step 500/2080 | loss 7.6721 | lr 0.00100 | ngrams/sec 36027.3 | eta 0h0m43s
| epoch 43 | step 1000/2080 | loss 7.6938 | lr 0.00100 | ngrams/sec 46673.2 | eta 0h0m23s
| epoch 43 | step 1500/2080 | loss 7.6947 | lr 0.00100 | ngrams/sec 46618.8 | eta 0h0m12s
| epoch 43 | step 2000/2080 | loss 7.7042 | lr 0.00100 | ngrams/sec 46631.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1067.66it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  43 | time 45.36s | valid loss  6.31 | valid ppl   550.67
-----------------------------------------------------------------------------------------
| epoch 44 | step 500/2080 | loss 7.6304 | lr 0.00100 | ngrams/sec 36218.6 | eta 0h0m43s
| epoch 44 | step 1000/2080 | loss 7.6793 | lr 0.00100 | ngrams/sec 46860.4 | eta 0h0m23s
| epoch 44 | step 1500/2080 | loss 7.6764 | lr 0.00100 | ngrams/sec 46891.2 | eta 0h0m12s
| epoch 44 | step 2000/2080 | loss 7.7066 | lr 0.00100 | ngrams/sec 46897.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1082.51it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  44 | time 45.17s | valid loss  6.30 | valid ppl   543.02
-----------------------------------------------------------------------------------------
| epoch 45 | step 500/2080 | loss 7.6246 | lr 0.00100 | ngrams/sec 36406.7 | eta 0h0m43s
| epoch 45 | step 1000/2080 | loss 7.6606 | lr 0.00100 | ngrams/sec 47039.1 | eta 0h0m22s
| epoch 45 | step 1500/2080 | loss 7.6838 | lr 0.00100 | ngrams/sec 46987.4 | eta 0h0m12s
| epoch 45 | step 2000/2080 | loss 7.6871 | lr 0.00100 | ngrams/sec 47028.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1074.29it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.53it/s]


-----------------------------------------------------------------------------------------
| end of epoch  45 | time 45.02s | valid loss  6.27 | valid ppl   527.32
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 46 | step 500/2080 | loss 7.6168 | lr 0.00100 | ngrams/sec 36376.8 | eta 0h0m43s
| epoch 46 | step 1000/2080 | loss 7.6653 | lr 0.00100 | ngrams/sec 47052.9 | eta 0h0m22s
| epoch 46 | step 1500/2080 | loss 7.6641 | lr 0.00100 | ngrams/sec 47071.9 | eta 0h0m12s
| epoch 46 | step 2000/2080 | loss 7.6795 | lr 0.00100 | ngrams/sec 47066.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.70it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.74it/s]


-----------------------------------------------------------------------------------------
| end of epoch  46 | time 44.94s | valid loss  6.25 | valid ppl   520.49
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 47 | step 500/2080 | loss 7.6057 | lr 0.00100 | ngrams/sec 36331.4 | eta 0h0m43s
| epoch 47 | step 1000/2080 | loss 7.6389 | lr 0.00100 | ngrams/sec 47113.7 | eta 0h0m22s
| epoch 47 | step 1500/2080 | loss 7.6596 | lr 0.00100 | ngrams/sec 47123.1 | eta 0h0m12s
| epoch 47 | step 2000/2080 | loss 7.6631 | lr 0.00100 | ngrams/sec 47141.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1063.16it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.27it/s]


-----------------------------------------------------------------------------------------
| end of epoch  47 | time 44.92s | valid loss  6.29 | valid ppl   541.10
-----------------------------------------------------------------------------------------
| epoch 48 | step 500/2080 | loss 7.5904 | lr 0.00100 | ngrams/sec 36566.7 | eta 0h0m43s
| epoch 48 | step 1000/2080 | loss 7.6187 | lr 0.00100 | ngrams/sec 47192.8 | eta 0h0m22s
| epoch 48 | step 1500/2080 | loss 7.6390 | lr 0.00100 | ngrams/sec 47105.3 | eta 0h0m12s
| epoch 48 | step 2000/2080 | loss 7.6531 | lr 0.00100 | ngrams/sec 47070.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1082.81it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch  48 | time 44.90s | valid loss  6.24 | valid ppl   510.30
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 49 | step 500/2080 | loss 7.5817 | lr 0.00100 | ngrams/sec 36380.7 | eta 0h0m43s
| epoch 49 | step 1000/2080 | loss 7.6150 | lr 0.00100 | ngrams/sec 47019.1 | eta 0h0m22s
| epoch 49 | step 1500/2080 | loss 7.6186 | lr 0.00100 | ngrams/sec 46991.4 | eta 0h0m12s
| epoch 49 | step 2000/2080 | loss 7.6390 | lr 0.00100 | ngrams/sec 46908.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1069.75it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  49 | time 45.01s | valid loss  6.23 | valid ppl   510.21
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 50 | step 500/2080 | loss 7.5731 | lr 0.00100 | ngrams/sec 36222.3 | eta 0h0m43s
| epoch 50 | step 1000/2080 | loss 7.5959 | lr 0.00100 | ngrams/sec 46913.2 | eta 0h0m23s
| epoch 50 | step 1500/2080 | loss 7.6211 | lr 0.00100 | ngrams/sec 46839.8 | eta 0h0m12s
| epoch 50 | step 2000/2080 | loss 7.6263 | lr 0.00100 | ngrams/sec 46843.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1085.97it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.54it/s]


-----------------------------------------------------------------------------------------
| end of epoch  50 | time 45.15s | valid loss  6.25 | valid ppl   519.01
-----------------------------------------------------------------------------------------
| epoch 51 | step 500/2080 | loss 7.5699 | lr 0.00100 | ngrams/sec 36320.6 | eta 0h0m43s
| epoch 51 | step 1000/2080 | loss 7.5967 | lr 0.00100 | ngrams/sec 46786.0 | eta 0h0m23s
| epoch 51 | step 1500/2080 | loss 7.6105 | lr 0.00100 | ngrams/sec 46763.7 | eta 0h0m12s
| epoch 51 | step 2000/2080 | loss 7.6198 | lr 0.00100 | ngrams/sec 46780.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1088.69it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  51 | time 45.22s | valid loss  6.22 | valid ppl   500.44
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 52 | step 500/2080 | loss 7.5518 | lr 0.00100 | ngrams/sec 36229.6 | eta 0h0m43s
| epoch 52 | step 1000/2080 | loss 7.5912 | lr 0.00100 | ngrams/sec 46779.2 | eta 0h0m23s
| epoch 52 | step 1500/2080 | loss 7.6052 | lr 0.00100 | ngrams/sec 46719.9 | eta 0h0m12s
| epoch 52 | step 2000/2080 | loss 7.6115 | lr 0.00100 | ngrams/sec 46774.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.51it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  52 | time 45.20s | valid loss  6.22 | valid ppl   500.30
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 53 | step 500/2080 | loss 7.5392 | lr 0.00100 | ngrams/sec 36275.2 | eta 0h0m43s
| epoch 53 | step 1000/2080 | loss 7.5780 | lr 0.00100 | ngrams/sec 47031.3 | eta 0h0m22s
| epoch 53 | step 1500/2080 | loss 7.6005 | lr 0.00100 | ngrams/sec 46995.6 | eta 0h0m12s
| epoch 53 | step 2000/2080 | loss 7.6016 | lr 0.00100 | ngrams/sec 47081.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1061.68it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.49it/s]


-----------------------------------------------------------------------------------------
| end of epoch  53 | time 44.99s | valid loss  6.21 | valid ppl   498.95
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 54 | step 500/2080 | loss 7.5386 | lr 0.00100 | ngrams/sec 36450.8 | eta 0h0m43s
| epoch 54 | step 1000/2080 | loss 7.5684 | lr 0.00100 | ngrams/sec 46784.4 | eta 0h0m23s
| epoch 54 | step 1500/2080 | loss 7.5742 | lr 0.00100 | ngrams/sec 47120.4 | eta 0h0m12s
| epoch 54 | step 2000/2080 | loss 7.5950 | lr 0.00100 | ngrams/sec 47181.8 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1036.58it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 274.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  54 | time 44.95s | valid loss  6.22 | valid ppl   502.78
-----------------------------------------------------------------------------------------
| epoch 55 | step 500/2080 | loss 7.5223 | lr 0.00100 | ngrams/sec 36530.7 | eta 0h0m43s
| epoch 55 | step 1000/2080 | loss 7.5486 | lr 0.00100 | ngrams/sec 47121.7 | eta 0h0m22s
| epoch 55 | step 1500/2080 | loss 7.5710 | lr 0.00100 | ngrams/sec 47131.3 | eta 0h0m12s
| epoch 55 | step 2000/2080 | loss 7.5942 | lr 0.00100 | ngrams/sec 47091.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1078.18it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  55 | time 44.91s | valid loss  6.21 | valid ppl   496.98
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 56 | step 500/2080 | loss 7.5198 | lr 0.00100 | ngrams/sec 36341.5 | eta 0h0m43s
| epoch 56 | step 1000/2080 | loss 7.5433 | lr 0.00100 | ngrams/sec 47077.6 | eta 0h0m22s
| epoch 56 | step 1500/2080 | loss 7.5610 | lr 0.00100 | ngrams/sec 47008.7 | eta 0h0m12s
| epoch 56 | step 2000/2080 | loss 7.5787 | lr 0.00100 | ngrams/sec 47011.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1068.91it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.48it/s]


-----------------------------------------------------------------------------------------
| end of epoch  56 | time 44.99s | valid loss  6.19 | valid ppl   486.43
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 57 | step 500/2080 | loss 7.5058 | lr 0.00100 | ngrams/sec 36230.8 | eta 0h0m43s
| epoch 57 | step 1000/2080 | loss 7.5283 | lr 0.00100 | ngrams/sec 46963.9 | eta 0h0m22s
| epoch 57 | step 1500/2080 | loss 7.5512 | lr 0.00100 | ngrams/sec 46869.2 | eta 0h0m12s
| epoch 57 | step 2000/2080 | loss 7.5650 | lr 0.00100 | ngrams/sec 46863.0 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1078.78it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  57 | time 45.11s | valid loss  6.20 | valid ppl   493.90
-----------------------------------------------------------------------------------------
| epoch 58 | step 500/2080 | loss 7.4856 | lr 0.00100 | ngrams/sec 36327.5 | eta 0h0m43s
| epoch 58 | step 1000/2080 | loss 7.5274 | lr 0.00100 | ngrams/sec 46851.6 | eta 0h0m23s
| epoch 58 | step 1500/2080 | loss 7.5388 | lr 0.00100 | ngrams/sec 46882.0 | eta 0h0m12s
| epoch 58 | step 2000/2080 | loss 7.5519 | lr 0.00100 | ngrams/sec 46825.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1071.92it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.88it/s]


-----------------------------------------------------------------------------------------
| end of epoch  58 | time 45.17s | valid loss  6.20 | valid ppl   492.99
-----------------------------------------------------------------------------------------
| epoch 59 | step 500/2080 | loss 7.4687 | lr 0.00100 | ngrams/sec 36312.1 | eta 0h0m43s
| epoch 59 | step 1000/2080 | loss 7.5110 | lr 0.00100 | ngrams/sec 46814.9 | eta 0h0m23s
| epoch 59 | step 1500/2080 | loss 7.5335 | lr 0.00100 | ngrams/sec 46786.8 | eta 0h0m12s
| epoch 59 | step 2000/2080 | loss 7.5431 | lr 0.00100 | ngrams/sec 46850.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1040.98it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  59 | time 45.20s | valid loss  6.21 | valid ppl   495.96
-----------------------------------------------------------------------------------------
| epoch 60 | step 500/2080 | loss 7.4722 | lr 0.00100 | ngrams/sec 36293.0 | eta 0h0m43s
| epoch 60 | step 1000/2080 | loss 7.5051 | lr 0.00100 | ngrams/sec 46796.0 | eta 0h0m23s
| epoch 60 | step 1500/2080 | loss 7.5088 | lr 0.00100 | ngrams/sec 46739.5 | eta 0h0m12s
| epoch 60 | step 2000/2080 | loss 7.5355 | lr 0.00100 | ngrams/sec 46791.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1065.50it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.09it/s]


-----------------------------------------------------------------------------------------
| end of epoch  60 | time 45.23s | valid loss  6.20 | valid ppl   492.77
-----------------------------------------------------------------------------------------
| epoch 61 | step 500/2080 | loss 7.4585 | lr 0.00100 | ngrams/sec 36262.7 | eta 0h0m43s
| epoch 61 | step 1000/2080 | loss 7.4920 | lr 0.00100 | ngrams/sec 46784.8 | eta 0h0m23s
| epoch 61 | step 1500/2080 | loss 7.5136 | lr 0.00100 | ngrams/sec 46769.5 | eta 0h0m12s
| epoch 61 | step 2000/2080 | loss 7.5337 | lr 0.00100 | ngrams/sec 46778.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1072.21it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  61 | time 45.24s | valid loss  6.20 | valid ppl   493.95
-----------------------------------------------------------------------------------------
| epoch 62 | step 500/2080 | loss 7.4533 | lr 0.00100 | ngrams/sec 36230.7 | eta 0h0m43s
| epoch 62 | step 1000/2080 | loss 7.4793 | lr 0.00100 | ngrams/sec 46794.3 | eta 0h0m23s
| epoch 62 | step 1500/2080 | loss 7.5033 | lr 0.00100 | ngrams/sec 46822.7 | eta 0h0m12s
| epoch 62 | step 2000/2080 | loss 7.5200 | lr 0.00100 | ngrams/sec 46837.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1065.58it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.17it/s]


-----------------------------------------------------------------------------------------
| end of epoch  62 | time 45.21s | valid loss  6.20 | valid ppl   495.20
-----------------------------------------------------------------------------------------
| epoch 63 | step 500/2080 | loss 7.4449 | lr 0.00100 | ngrams/sec 36252.2 | eta 0h0m43s
| epoch 63 | step 1000/2080 | loss 7.4710 | lr 0.00100 | ngrams/sec 46769.9 | eta 0h0m23s
| epoch 63 | step 1500/2080 | loss 7.4891 | lr 0.00100 | ngrams/sec 46798.9 | eta 0h0m12s
| epoch 63 | step 2000/2080 | loss 7.5015 | lr 0.00100 | ngrams/sec 46811.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1042.26it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  63 | time 45.23s | valid loss  6.19 | valid ppl   489.75
-----------------------------------------------------------------------------------------
| epoch 64 | step 500/2080 | loss 7.4217 | lr 0.00100 | ngrams/sec 36291.1 | eta 0h0m43s
| epoch 64 | step 1000/2080 | loss 7.4545 | lr 0.00100 | ngrams/sec 46810.5 | eta 0h0m23s
| epoch 64 | step 1500/2080 | loss 7.4770 | lr 0.00100 | ngrams/sec 46898.2 | eta 0h0m12s
| epoch 64 | step 2000/2080 | loss 7.4932 | lr 0.00100 | ngrams/sec 46819.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1061.35it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch  64 | time 45.18s | valid loss  6.19 | valid ppl   486.63
-----------------------------------------------------------------------------------------
| epoch 65 | step 500/2080 | loss 7.4124 | lr 0.00100 | ngrams/sec 36321.2 | eta 0h0m43s
| epoch 65 | step 1000/2080 | loss 7.4515 | lr 0.00100 | ngrams/sec 46810.7 | eta 0h0m23s
| epoch 65 | step 1500/2080 | loss 7.4677 | lr 0.00100 | ngrams/sec 46810.0 | eta 0h0m12s
| epoch 65 | step 2000/2080 | loss 7.4880 | lr 0.00100 | ngrams/sec 46835.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.47it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.80it/s]


-----------------------------------------------------------------------------------------
| end of epoch  65 | time 45.20s | valid loss  6.19 | valid ppl   488.00
-----------------------------------------------------------------------------------------
| epoch 66 | step 500/2080 | loss 7.4050 | lr 0.00100 | ngrams/sec 36283.5 | eta 0h0m43s
| epoch 66 | step 1000/2080 | loss 7.4445 | lr 0.00100 | ngrams/sec 46829.3 | eta 0h0m23s
| epoch 66 | step 1500/2080 | loss 7.4644 | lr 0.00100 | ngrams/sec 46785.3 | eta 0h0m12s
| epoch 66 | step 2000/2080 | loss 7.4905 | lr 0.00100 | ngrams/sec 46878.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1071.93it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  66 | time 45.19s | valid loss  6.19 | valid ppl   487.08
-----------------------------------------------------------------------------------------
| epoch 67 | step 500/2080 | loss 7.3974 | lr 0.00100 | ngrams/sec 36312.6 | eta 0h0m43s
| epoch 67 | step 1000/2080 | loss 7.4337 | lr 0.00100 | ngrams/sec 46764.3 | eta 0h0m23s
| epoch 67 | step 1500/2080 | loss 7.4619 | lr 0.00100 | ngrams/sec 46827.0 | eta 0h0m12s
| epoch 67 | step 2000/2080 | loss 7.4762 | lr 0.00100 | ngrams/sec 46798.6 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1032.94it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  67 | time 45.20s | valid loss  6.17 | valid ppl   476.59
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 68 | step 500/2080 | loss 7.3870 | lr 0.00100 | ngrams/sec 36140.9 | eta 0h0m43s
| epoch 68 | step 1000/2080 | loss 7.4208 | lr 0.00100 | ngrams/sec 46805.7 | eta 0h0m23s
| epoch 68 | step 1500/2080 | loss 7.4414 | lr 0.00100 | ngrams/sec 46822.9 | eta 0h0m12s
| epoch 68 | step 2000/2080 | loss 7.4659 | lr 0.00100 | ngrams/sec 46823.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1065.36it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.38it/s]


-----------------------------------------------------------------------------------------
| end of epoch  68 | time 45.20s | valid loss  6.18 | valid ppl   483.27
-----------------------------------------------------------------------------------------
| epoch 69 | step 500/2080 | loss 7.3836 | lr 0.00100 | ngrams/sec 36337.7 | eta 0h0m43s
| epoch 69 | step 1000/2080 | loss 7.4129 | lr 0.00100 | ngrams/sec 46776.9 | eta 0h0m23s
| epoch 69 | step 1500/2080 | loss 7.4351 | lr 0.00100 | ngrams/sec 46818.2 | eta 0h0m12s
| epoch 69 | step 2000/2080 | loss 7.4512 | lr 0.00100 | ngrams/sec 46758.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1072.84it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  69 | time 45.21s | valid loss  6.12 | valid ppl   457.07
-----------------------------------------------------------------------------------------
| saving current state of model ...
| epoch 70 | step 500/2080 | loss 7.3659 | lr 0.00100 | ngrams/sec 36148.1 | eta 0h0m43s
| epoch 70 | step 1000/2080 | loss 7.4062 | lr 0.00100 | ngrams/sec 46793.9 | eta 0h0m23s
| epoch 70 | step 1500/2080 | loss 7.4274 | lr 0.00100 | ngrams/sec 46768.3 | eta 0h0m12s
| epoch 70 | step 2000/2080 | loss 7.4401 | lr 0.00100 | ngrams/sec 46740.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1058.64it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.13it/s]


-----------------------------------------------------------------------------------------
| end of epoch  70 | time 45.25s | valid loss  6.17 | valid ppl   479.96
-----------------------------------------------------------------------------------------
| epoch 71 | step 500/2080 | loss 7.3590 | lr 0.00100 | ngrams/sec 36210.6 | eta 0h0m43s
| epoch 71 | step 1000/2080 | loss 7.3962 | lr 0.00100 | ngrams/sec 46807.2 | eta 0h0m23s
| epoch 71 | step 1500/2080 | loss 7.4139 | lr 0.00100 | ngrams/sec 46846.6 | eta 0h0m12s
| epoch 71 | step 2000/2080 | loss 7.4212 | lr 0.00100 | ngrams/sec 46749.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1066.01it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.91it/s]


-----------------------------------------------------------------------------------------
| end of epoch  71 | time 45.23s | valid loss  6.15 | valid ppl   466.54
-----------------------------------------------------------------------------------------
| epoch 72 | step 500/2080 | loss 7.3334 | lr 0.00100 | ngrams/sec 36289.2 | eta 0h0m43s
| epoch 72 | step 1000/2080 | loss 7.3831 | lr 0.00100 | ngrams/sec 46852.3 | eta 0h0m23s
| epoch 72 | step 1500/2080 | loss 7.4051 | lr 0.00100 | ngrams/sec 46782.3 | eta 0h0m12s
| epoch 72 | step 2000/2080 | loss 7.4183 | lr 0.00100 | ngrams/sec 46771.9 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1031.03it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.08it/s]


-----------------------------------------------------------------------------------------
| end of epoch  72 | time 45.21s | valid loss  6.15 | valid ppl   469.96
-----------------------------------------------------------------------------------------
| epoch 73 | step 500/2080 | loss 7.3337 | lr 0.00100 | ngrams/sec 36307.7 | eta 0h0m43s
| epoch 73 | step 1000/2080 | loss 7.3747 | lr 0.00100 | ngrams/sec 46872.2 | eta 0h0m23s
| epoch 73 | step 1500/2080 | loss 7.4041 | lr 0.00100 | ngrams/sec 46847.7 | eta 0h0m12s
| epoch 73 | step 2000/2080 | loss 7.4150 | lr 0.00100 | ngrams/sec 46911.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1080.35it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  73 | time 45.16s | valid loss  6.17 | valid ppl   480.52
-----------------------------------------------------------------------------------------
| epoch 74 | step 500/2080 | loss 7.3344 | lr 0.00100 | ngrams/sec 36321.2 | eta 0h0m43s
| epoch 74 | step 1000/2080 | loss 7.3624 | lr 0.00100 | ngrams/sec 46880.0 | eta 0h0m23s
| epoch 74 | step 1500/2080 | loss 7.3879 | lr 0.00100 | ngrams/sec 46878.5 | eta 0h0m12s
| epoch 74 | step 2000/2080 | loss 7.4040 | lr 0.00100 | ngrams/sec 46830.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1051.23it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  74 | time 45.16s | valid loss  6.16 | valid ppl   473.39
-----------------------------------------------------------------------------------------
| epoch 75 | step 500/2080 | loss 7.3137 | lr 0.00100 | ngrams/sec 36327.7 | eta 0h0m43s
| epoch 75 | step 1000/2080 | loss 7.3527 | lr 0.00100 | ngrams/sec 46800.9 | eta 0h0m23s
| epoch 75 | step 1500/2080 | loss 7.3851 | lr 0.00100 | ngrams/sec 46788.1 | eta 0h0m12s
| epoch 75 | step 2000/2080 | loss 7.4004 | lr 0.00100 | ngrams/sec 46861.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1073.60it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.69it/s]


-----------------------------------------------------------------------------------------
| end of epoch  75 | time 45.20s | valid loss  6.15 | valid ppl   468.94
-----------------------------------------------------------------------------------------
| epoch 76 | step 500/2080 | loss 7.3081 | lr 0.00100 | ngrams/sec 36279.7 | eta 0h0m43s
| epoch 76 | step 1000/2080 | loss 7.3424 | lr 0.00100 | ngrams/sec 46778.5 | eta 0h0m23s
| epoch 76 | step 1500/2080 | loss 7.3783 | lr 0.00100 | ngrams/sec 46835.3 | eta 0h0m12s
| epoch 76 | step 2000/2080 | loss 7.3972 | lr 0.00100 | ngrams/sec 46825.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1075.64it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  76 | time 45.21s | valid loss  6.13 | valid ppl   459.83
-----------------------------------------------------------------------------------------
| epoch 77 | step 500/2080 | loss 7.3089 | lr 0.00100 | ngrams/sec 36316.7 | eta 0h0m43s
| epoch 77 | step 1000/2080 | loss 7.3378 | lr 0.00100 | ngrams/sec 46862.7 | eta 0h0m23s
| epoch 77 | step 1500/2080 | loss 7.3738 | lr 0.00100 | ngrams/sec 46858.9 | eta 0h0m12s
| epoch 77 | step 2000/2080 | loss 7.3837 | lr 0.00100 | ngrams/sec 46949.1 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1036.37it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.42it/s]


-----------------------------------------------------------------------------------------
| end of epoch  77 | time 45.13s | valid loss  6.13 | valid ppl   460.58
-----------------------------------------------------------------------------------------
| epoch 78 | step 500/2080 | loss 7.2924 | lr 0.00100 | ngrams/sec 36307.1 | eta 0h0m43s
| epoch 78 | step 1000/2080 | loss 7.3241 | lr 0.00100 | ngrams/sec 46798.7 | eta 0h0m23s
| epoch 78 | step 1500/2080 | loss 7.3486 | lr 0.00100 | ngrams/sec 46833.5 | eta 0h0m12s
| epoch 78 | step 2000/2080 | loss 7.3741 | lr 0.00100 | ngrams/sec 46804.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1079.40it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 270.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch  78 | time 45.21s | valid loss  6.16 | valid ppl   472.78
-----------------------------------------------------------------------------------------
| epoch 79 | step 500/2080 | loss 7.2877 | lr 0.00100 | ngrams/sec 36319.9 | eta 0h0m43s
| epoch 79 | step 1000/2080 | loss 7.3304 | lr 0.00100 | ngrams/sec 46824.8 | eta 0h0m23s
| epoch 79 | step 1500/2080 | loss 7.3472 | lr 0.00100 | ngrams/sec 46790.6 | eta 0h0m12s
| epoch 79 | step 2000/2080 | loss 7.3722 | lr 0.00100 | ngrams/sec 46798.0 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1037.73it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch  79 | time 45.20s | valid loss  6.14 | valid ppl   463.89
-----------------------------------------------------------------------------------------
| epoch 80 | step 500/2080 | loss 7.2879 | lr 0.00100 | ngrams/sec 36293.2 | eta 0h0m43s
| epoch 80 | step 1000/2080 | loss 7.3124 | lr 0.00100 | ngrams/sec 46795.4 | eta 0h0m23s
| epoch 80 | step 1500/2080 | loss 7.3443 | lr 0.00100 | ngrams/sec 46870.8 | eta 0h0m12s
| epoch 80 | step 2000/2080 | loss 7.3451 | lr 0.00100 | ngrams/sec 46780.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1044.64it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.85it/s]


-----------------------------------------------------------------------------------------
| end of epoch  80 | time 45.20s | valid loss  6.15 | valid ppl   468.12
-----------------------------------------------------------------------------------------
| epoch 81 | step 500/2080 | loss 7.2742 | lr 0.00100 | ngrams/sec 36306.4 | eta 0h0m43s
| epoch 81 | step 1000/2080 | loss 7.3109 | lr 0.00100 | ngrams/sec 46827.6 | eta 0h0m23s
| epoch 81 | step 1500/2080 | loss 7.3410 | lr 0.00100 | ngrams/sec 46820.3 | eta 0h0m12s
| epoch 81 | step 2000/2080 | loss 7.3614 | lr 0.00100 | ngrams/sec 46823.1 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1069.04it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  81 | time 45.19s | valid loss  6.16 | valid ppl   471.39
-----------------------------------------------------------------------------------------
| epoch 82 | step 500/2080 | loss 7.2603 | lr 0.00100 | ngrams/sec 36282.7 | eta 0h0m43s
| epoch 82 | step 1000/2080 | loss 7.2917 | lr 0.00100 | ngrams/sec 46870.1 | eta 0h0m23s
| epoch 82 | step 1500/2080 | loss 7.3179 | lr 0.00100 | ngrams/sec 46756.8 | eta 0h0m12s
| epoch 82 | step 2000/2080 | loss 7.3355 | lr 0.00100 | ngrams/sec 46814.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1070.25it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch  82 | time 45.20s | valid loss  6.15 | valid ppl   466.95
-----------------------------------------------------------------------------------------
| epoch 83 | step 500/2080 | loss 7.2394 | lr 0.00100 | ngrams/sec 36322.8 | eta 0h0m43s
| epoch 83 | step 1000/2080 | loss 7.2804 | lr 0.00100 | ngrams/sec 46905.8 | eta 0h0m23s
| epoch 83 | step 1500/2080 | loss 7.3226 | lr 0.00100 | ngrams/sec 46811.0 | eta 0h0m12s
| epoch 83 | step 2000/2080 | loss 7.3398 | lr 0.00100 | ngrams/sec 46806.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1081.39it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.32it/s]


-----------------------------------------------------------------------------------------
| end of epoch  83 | time 45.19s | valid loss  6.16 | valid ppl   473.71
-----------------------------------------------------------------------------------------
| epoch 84 | step 500/2080 | loss 7.2363 | lr 0.00100 | ngrams/sec 36301.9 | eta 0h0m43s
| epoch 84 | step 1000/2080 | loss 7.2787 | lr 0.00100 | ngrams/sec 46856.8 | eta 0h0m23s
| epoch 84 | step 1500/2080 | loss 7.2934 | lr 0.00100 | ngrams/sec 46909.7 | eta 0h0m12s
| epoch 84 | step 2000/2080 | loss 7.3231 | lr 0.00100 | ngrams/sec 46885.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1074.12it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  84 | time 45.14s | valid loss  6.16 | valid ppl   471.80
-----------------------------------------------------------------------------------------
| epoch 85 | step 500/2080 | loss 7.2286 | lr 0.00100 | ngrams/sec 36373.2 | eta 0h0m43s
| epoch 85 | step 1000/2080 | loss 7.2770 | lr 0.00100 | ngrams/sec 46894.5 | eta 0h0m23s
| epoch 85 | step 1500/2080 | loss 7.2945 | lr 0.00100 | ngrams/sec 46947.4 | eta 0h0m12s
| epoch 85 | step 2000/2080 | loss 7.3204 | lr 0.00100 | ngrams/sec 46873.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1085.80it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  85 | time 45.12s | valid loss  6.14 | valid ppl   464.95
-----------------------------------------------------------------------------------------
| epoch 86 | step 500/2080 | loss 7.2226 | lr 0.00100 | ngrams/sec 36353.8 | eta 0h0m43s
| epoch 86 | step 1000/2080 | loss 7.2570 | lr 0.00100 | ngrams/sec 46905.0 | eta 0h0m23s
| epoch 86 | step 1500/2080 | loss 7.2831 | lr 0.00100 | ngrams/sec 46967.4 | eta 0h0m12s
| epoch 86 | step 2000/2080 | loss 7.3028 | lr 0.00100 | ngrams/sec 46934.8 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1069.87it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.20it/s]


-----------------------------------------------------------------------------------------
| end of epoch  86 | time 45.10s | valid loss  6.15 | valid ppl   467.23
-----------------------------------------------------------------------------------------
| epoch 87 | step 500/2080 | loss 7.2015 | lr 0.00100 | ngrams/sec 36375.8 | eta 0h0m43s
| epoch 87 | step 1000/2080 | loss 7.2449 | lr 0.00100 | ngrams/sec 46997.7 | eta 0h0m22s
| epoch 87 | step 1500/2080 | loss 7.2758 | lr 0.00100 | ngrams/sec 46919.8 | eta 0h0m12s
| epoch 87 | step 2000/2080 | loss 7.3002 | lr 0.00100 | ngrams/sec 46884.2 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1038.19it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.77it/s]


-----------------------------------------------------------------------------------------
| end of epoch  87 | time 45.08s | valid loss  6.17 | valid ppl   476.39
-----------------------------------------------------------------------------------------
| epoch 88 | step 500/2080 | loss 7.1946 | lr 0.00100 | ngrams/sec 36391.2 | eta 0h0m43s
| epoch 88 | step 1000/2080 | loss 7.2343 | lr 0.00100 | ngrams/sec 46876.0 | eta 0h0m23s
| epoch 88 | step 1500/2080 | loss 7.2641 | lr 0.00100 | ngrams/sec 46864.0 | eta 0h0m12s
| epoch 88 | step 2000/2080 | loss 7.2860 | lr 0.00100 | ngrams/sec 46840.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1080.74it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  88 | time 45.16s | valid loss  6.16 | valid ppl   472.42
-----------------------------------------------------------------------------------------
| epoch 89 | step 500/2080 | loss 7.1906 | lr 0.00100 | ngrams/sec 36329.3 | eta 0h0m43s
| epoch 89 | step 1000/2080 | loss 7.2299 | lr 0.00100 | ngrams/sec 46886.8 | eta 0h0m23s
| epoch 89 | step 1500/2080 | loss 7.2570 | lr 0.00100 | ngrams/sec 46874.2 | eta 0h0m12s
| epoch 89 | step 2000/2080 | loss 7.2699 | lr 0.00100 | ngrams/sec 46900.5 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1062.07it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.06it/s]


-----------------------------------------------------------------------------------------
| end of epoch  89 | time 45.12s | valid loss  6.17 | valid ppl   477.26
-----------------------------------------------------------------------------------------
| epoch 90 | step 500/2080 | loss 7.1765 | lr 0.00100 | ngrams/sec 36368.4 | eta 0h0m43s
| epoch 90 | step 1000/2080 | loss 7.2196 | lr 0.00100 | ngrams/sec 46860.9 | eta 0h0m23s
| epoch 90 | step 1500/2080 | loss 7.2397 | lr 0.00100 | ngrams/sec 46894.3 | eta 0h0m12s
| epoch 90 | step 2000/2080 | loss 7.2707 | lr 0.00100 | ngrams/sec 46864.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1061.86it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  90 | time 45.14s | valid loss  6.16 | valid ppl   473.11
-----------------------------------------------------------------------------------------
| epoch 91 | step 500/2080 | loss 7.1799 | lr 0.00100 | ngrams/sec 36310.7 | eta 0h0m43s
| epoch 91 | step 1000/2080 | loss 7.2144 | lr 0.00100 | ngrams/sec 46828.2 | eta 0h0m23s
| epoch 91 | step 1500/2080 | loss 7.2305 | lr 0.00100 | ngrams/sec 46878.8 | eta 0h0m12s
| epoch 91 | step 2000/2080 | loss 7.2522 | lr 0.00100 | ngrams/sec 46927.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1074.82it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.37it/s]


-----------------------------------------------------------------------------------------
| end of epoch  91 | time 45.14s | valid loss  6.16 | valid ppl   472.91
-----------------------------------------------------------------------------------------
| epoch 92 | step 500/2080 | loss 7.1477 | lr 0.00100 | ngrams/sec 36361.2 | eta 0h0m43s
| epoch 92 | step 1000/2080 | loss 7.1944 | lr 0.00100 | ngrams/sec 46902.2 | eta 0h0m23s
| epoch 92 | step 1500/2080 | loss 7.2244 | lr 0.00100 | ngrams/sec 46847.1 | eta 0h0m12s
| epoch 92 | step 2000/2080 | loss 7.2425 | lr 0.00100 | ngrams/sec 46850.7 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1086.16it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.76it/s]


-----------------------------------------------------------------------------------------
| end of epoch  92 | time 45.14s | valid loss  6.17 | valid ppl   476.85
-----------------------------------------------------------------------------------------
| epoch 93 | step 500/2080 | loss 7.1421 | lr 0.00100 | ngrams/sec 36329.7 | eta 0h0m43s
| epoch 93 | step 1000/2080 | loss 7.1886 | lr 0.00100 | ngrams/sec 46779.7 | eta 0h0m23s
| epoch 93 | step 1500/2080 | loss 7.2211 | lr 0.00100 | ngrams/sec 46836.9 | eta 0h0m12s
| epoch 93 | step 2000/2080 | loss 7.2396 | lr 0.00100 | ngrams/sec 46871.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1074.38it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch  93 | time 45.19s | valid loss  6.17 | valid ppl   476.35
-----------------------------------------------------------------------------------------
| epoch 94 | step 500/2080 | loss 7.1461 | lr 0.00100 | ngrams/sec 36369.2 | eta 0h0m43s
| epoch 94 | step 1000/2080 | loss 7.1857 | lr 0.00100 | ngrams/sec 46912.4 | eta 0h0m23s
| epoch 94 | step 1500/2080 | loss 7.2109 | lr 0.00100 | ngrams/sec 46926.4 | eta 0h0m12s
| epoch 94 | step 2000/2080 | loss 7.2293 | lr 0.00100 | ngrams/sec 46873.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1063.53it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch  94 | time 45.11s | valid loss  6.17 | valid ppl   477.75
-----------------------------------------------------------------------------------------
| epoch 95 | step 500/2080 | loss 7.1421 | lr 0.00100 | ngrams/sec 36352.9 | eta 0h0m43s
| epoch 95 | step 1000/2080 | loss 7.1791 | lr 0.00100 | ngrams/sec 46909.1 | eta 0h0m23s
| epoch 95 | step 1500/2080 | loss 7.2029 | lr 0.00100 | ngrams/sec 46890.2 | eta 0h0m12s
| epoch 95 | step 2000/2080 | loss 7.2262 | lr 0.00100 | ngrams/sec 46860.6 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1058.93it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch  95 | time 45.13s | valid loss  6.16 | valid ppl   473.16
-----------------------------------------------------------------------------------------
| epoch 96 | step 500/2080 | loss 7.1176 | lr 0.00100 | ngrams/sec 36379.2 | eta 0h0m43s
| epoch 96 | step 1000/2080 | loss 7.1608 | lr 0.00100 | ngrams/sec 46853.1 | eta 0h0m23s
| epoch 96 | step 1500/2080 | loss 7.1923 | lr 0.00100 | ngrams/sec 46860.1 | eta 0h0m12s
| epoch 96 | step 2000/2080 | loss 7.2245 | lr 0.00100 | ngrams/sec 46899.4 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1036.38it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.41it/s]


-----------------------------------------------------------------------------------------
| end of epoch  96 | time 45.13s | valid loss  6.17 | valid ppl   479.36
-----------------------------------------------------------------------------------------
| epoch 97 | step 500/2080 | loss 7.1146 | lr 0.00100 | ngrams/sec 36321.8 | eta 0h0m43s
| epoch 97 | step 1000/2080 | loss 7.1592 | lr 0.00100 | ngrams/sec 46849.0 | eta 0h0m23s
| epoch 97 | step 1500/2080 | loss 7.1866 | lr 0.00100 | ngrams/sec 46461.2 | eta 0h0m12s
| epoch 97 | step 2000/2080 | loss 7.2068 | lr 0.00100 | ngrams/sec 46901.7 | eta 0h0m1s


 53%|█████▎    | 110/209 [00:00<00:00, 1043.45it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 273.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch  97 | time 45.25s | valid loss  6.16 | valid ppl   471.30
-----------------------------------------------------------------------------------------
| epoch 98 | step 500/2080 | loss 7.1126 | lr 0.00100 | ngrams/sec 36361.4 | eta 0h0m43s
| epoch 98 | step 1000/2080 | loss 7.1547 | lr 0.00100 | ngrams/sec 46895.4 | eta 0h0m23s
| epoch 98 | step 1500/2080 | loss 7.1892 | lr 0.00100 | ngrams/sec 46870.3 | eta 0h0m12s
| epoch 98 | step 2000/2080 | loss 7.2035 | lr 0.00100 | ngrams/sec 46884.9 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1067.46it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.34it/s]


-----------------------------------------------------------------------------------------
| end of epoch  98 | time 45.13s | valid loss  6.17 | valid ppl   476.15
-----------------------------------------------------------------------------------------
| epoch 99 | step 500/2080 | loss 7.0962 | lr 0.00100 | ngrams/sec 36380.0 | eta 0h0m43s
| epoch 99 | step 1000/2080 | loss 7.1496 | lr 0.00100 | ngrams/sec 46875.9 | eta 0h0m23s
| epoch 99 | step 1500/2080 | loss 7.1793 | lr 0.00100 | ngrams/sec 46894.6 | eta 0h0m12s
| epoch 99 | step 2000/2080 | loss 7.2061 | lr 0.00100 | ngrams/sec 46899.2 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1057.90it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 271.98it/s]


-----------------------------------------------------------------------------------------
| end of epoch  99 | time 45.13s | valid loss  6.17 | valid ppl   476.91
-----------------------------------------------------------------------------------------
| epoch 100 | step 500/2080 | loss 7.0891 | lr 0.00100 | ngrams/sec 36358.4 | eta 0h0m43s
| epoch 100 | step 1000/2080 | loss 7.1391 | lr 0.00100 | ngrams/sec 46917.3 | eta 0h0m23s
| epoch 100 | step 1500/2080 | loss 7.1669 | lr 0.00100 | ngrams/sec 46945.7 | eta 0h0m12s
| epoch 100 | step 2000/2080 | loss 7.1941 | lr 0.00100 | ngrams/sec 46857.3 | eta 0h0m1s


 52%|█████▏    | 109/209 [00:00<00:00, 1083.22it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 272.10it/s]


-----------------------------------------------------------------------------------------


 46%|████▌     | 109/237 [00:00<00:00, 1072.56it/s]

| end of epoch 100 | time 45.11s | valid loss  6.17 | valid ppl   476.12
-----------------------------------------------------------------------------------------
Evaluating on test set...


100%|██████████| 237/237 [00:00<00:00, 249.93it/s]


| End of training | test loss  6.11 | test ppl   451.94


In [11]:
from google.colab import files
files.download('checkpoint.pth')
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate

In [10]:
!cp "gdrive/MyDrive/checkpoint.pth" "checkpoint.pth" 

In [32]:
import torch 
# Model parameters.
class Args:
    data = 'gdrive/MyDrive/wikitext-2'
    checkpoint = 'checkpoint.pth'
    outf = 'generated.txt'
    words = 1000
    seed = 42
    cuda = True
    temperature = 1.0 # temperature - higher will increase diversity
    log_interval = 100 # reporting interval
    words = 100
args = Args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3")

model.load_state_dict(torch.load(args.checkpoint))
print(model)
model.eval()

# input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
input = torch.autograd.Variable(torch.t(torch.randint(ntokens, (1, 1), dtype=torch.long))).to(device)
print(input)
glue = ' '
start = None
with open(args.outf, 'w') as outf:
    for i in range(args.words):
        output = model(input)
        word_weights = output.squeeze().div(args.temperature).exp().cpu()
        # if args.no_unk:
        #     word_weights[corpus.dictionary.w2i[unk]] = 0
        # word_idx = torch.multinomial(word_weights, 1)[0]
        # word_idx = word_idx.data[0]
        # word = corpus.dictionary.i2w[word_idx]

        # ids.append(word_idx)
        # input = Variable(torch.LongTensor(ids[-model.order:]).unsqueeze(0))
        input = input.cuda() if cuda else input
        if word is "<sos>": # ignore start of sentence predictns
            continue
        elif word is "<eos>":
            outf.write('\n')
        else:
            outf.write(word + glue)

        if i % 100 == 0:
            print('| Generated {}/{} words'.format(i, args.num_samples))

FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
tensor([[2774]], device='cuda:0')


RuntimeError: ignored