In [146]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [147]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [148]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [149]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 7 # n-1 in paper
n_hidden = 200 # h in paper
embed_size = 200       # m in paper
batch_size = 1000
order = n_step # order (int): the order of the language model, i.e. length of the history
epochs = 100
learning_rate = 0.0001
cuda = torch.cuda.is_available()
seed = 42
clip = 2.0
#===

# Model

In [150]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, context_size, no_hidden, dropout=0.5):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(context_size * embed_size, no_hidden)
        self.linear2 = nn.Linear(no_hidden, vocab_size)
        self.context_size = context_size
        self.embed_size = embed_size
        self.dropout = nn.Dropout(p=dropout)

    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1, self.context_size * self.embed_size))
        hidden_output = self.linear1(embeds)
        # hidden_output = self.dropout(hidden_output)
        out = hidden_output.tanh()
        # out = self.dropout(out)
        out = self.linear2(out)
        out = self.dropout(out)
        log_probs = F.log_softmax(out, dim=1) # [1000, 28912]: softmax on 28912's dim
        return log_probs

# Data Loading

In [151]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = torch.autograd.Variable(torch.t(data[i:i+order]))
    y = torch.autograd.Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [152]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [153]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    embed_size, 
    n_step, 
    n_hidden)
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,000 tokens
Size of validation set: 217,000 tokens
Size of test set: 245,000 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless', '"', ',', 'a', 'penal', 'military'] ['unit']
['nameless', '"', ',', 'a', 'penal', 'military', 'unit'] ['serving']
['"', ',', 'a', 'penal', 'military', 'unit', 'serving'] ['the']
[',', 'a', 'penal', 'military', 'unit', 'serving', 'the'] ['nation']
['a', 'penal', 'military', 'unit', 'serving', 'the', 'nation'] ['of']
['penal', 'military', 'unit', 'serving', 'the', 'nation', 'of'] ['gallia']
['military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia'] ['during']
Model: 
 FNNModel(
  (embeddings): Embedding(28912, 200)
  (linear1): Linear(in_features=1400, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=28912, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [154]:
!nvidia-smi

Wed Nov 25 18:06:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    35W / 250W |   2381MiB / 16280MiB |      3%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [155]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

# initialize the early_stopping counter
stop_counter = 0

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            model.zero_grad()
            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # reduce exploding grad
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 500
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.5f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            stop_counter = 0 # reset counter
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        else:
            stop_counter += 1
            if stop_counter >= 10:
                print("Early stopping")
                break
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 500/2080 | loss 8.8873 | lr 0.000 | ngrams/sec 74135.5 | eta 0h0m21s
| epoch 1 | step 1000/2080 | loss 8.5211 | lr 0.000 | ngrams/sec 74244.9 | eta 0h0m14s
| epoch 1 | step 1500/2080 | loss 8.4219 | lr 0.000 | ngrams/sec 74301.0 | eta 0h0m7s
| epoch 1 | step 2000/2080 | loss 8.3659 | lr 0.000 | ngrams/sec 74526.5 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1273.05it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.46it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 28.37s | valid loss  7.31 | valid ppl  1501.66
-----------------------------------------------------------------------------------------
| epoch 2 | step 500/2080 | loss 8.2977 | lr 0.000 | ngrams/sec 57526.7 | eta 0h0m27s
| epoch 2 | step 1000/2080 | loss 8.2881 | lr 0.000 | ngrams/sec 74533.9 | eta 0h0m14s
| epoch 2 | step 1500/2080 | loss 8.2685 | lr 0.000 | ngrams/sec 74301.8 | eta 0h0m7s
| epoch 2 | step 2000/2080 | loss 8.2535 | lr 0.000 | ngrams/sec 73994.2 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1267.31it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 552.66it/s]


-----------------------------------------------------------------------------------------
| end of epoch   2 | time 28.40s | valid loss  7.16 | valid ppl  1287.29
-----------------------------------------------------------------------------------------
| epoch 3 | step 500/2080 | loss 8.2090 | lr 0.000 | ngrams/sec 57683.8 | eta 0h0m27s
| epoch 3 | step 1000/2080 | loss 8.1972 | lr 0.000 | ngrams/sec 74376.4 | eta 0h0m14s
| epoch 3 | step 1500/2080 | loss 8.1904 | lr 0.000 | ngrams/sec 74547.0 | eta 0h0m7s
| epoch 3 | step 2000/2080 | loss 8.1879 | lr 0.000 | ngrams/sec 74568.8 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1276.59it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 554.12it/s]


-----------------------------------------------------------------------------------------
| end of epoch   3 | time 28.31s | valid loss  7.08 | valid ppl  1192.35
-----------------------------------------------------------------------------------------
| epoch 4 | step 500/2080 | loss 8.1505 | lr 0.000 | ngrams/sec 57741.1 | eta 0h0m27s
| epoch 4 | step 1000/2080 | loss 8.1440 | lr 0.000 | ngrams/sec 74369.3 | eta 0h0m14s
| epoch 4 | step 1500/2080 | loss 8.1423 | lr 0.000 | ngrams/sec 74532.1 | eta 0h0m7s
| epoch 4 | step 2000/2080 | loss 8.1378 | lr 0.000 | ngrams/sec 74475.4 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1274.34it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 552.84it/s]


-----------------------------------------------------------------------------------------
| end of epoch   4 | time 28.33s | valid loss  7.04 | valid ppl  1138.17
-----------------------------------------------------------------------------------------
| epoch 5 | step 500/2080 | loss 8.1082 | lr 0.000 | ngrams/sec 57628.2 | eta 0h0m27s
| epoch 5 | step 1000/2080 | loss 8.0978 | lr 0.000 | ngrams/sec 74384.0 | eta 0h0m14s
| epoch 5 | step 1500/2080 | loss 8.0982 | lr 0.000 | ngrams/sec 74551.1 | eta 0h0m7s
| epoch 5 | step 2000/2080 | loss 8.0907 | lr 0.000 | ngrams/sec 74476.7 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1283.81it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.70it/s]


-----------------------------------------------------------------------------------------
| end of epoch   5 | time 28.34s | valid loss  6.98 | valid ppl  1078.82
-----------------------------------------------------------------------------------------
| epoch 6 | step 500/2080 | loss 8.0533 | lr 0.000 | ngrams/sec 57637.4 | eta 0h0m27s
| epoch 6 | step 1000/2080 | loss 8.0649 | lr 0.000 | ngrams/sec 74431.5 | eta 0h0m14s
| epoch 6 | step 1500/2080 | loss 8.0653 | lr 0.000 | ngrams/sec 74462.9 | eta 0h0m7s
| epoch 6 | step 2000/2080 | loss 8.0742 | lr 0.000 | ngrams/sec 74444.2 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1274.35it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 549.52it/s]


-----------------------------------------------------------------------------------------
| end of epoch   6 | time 28.34s | valid loss  6.96 | valid ppl  1054.40
-----------------------------------------------------------------------------------------
| epoch 7 | step 500/2080 | loss 8.0260 | lr 0.000 | ngrams/sec 57656.0 | eta 0h0m27s
| epoch 7 | step 1000/2080 | loss 8.0264 | lr 0.000 | ngrams/sec 74539.9 | eta 0h0m14s
| epoch 7 | step 1500/2080 | loss 8.0283 | lr 0.000 | ngrams/sec 74508.7 | eta 0h0m7s
| epoch 7 | step 2000/2080 | loss 8.0261 | lr 0.000 | ngrams/sec 74425.1 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1276.27it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch   7 | time 28.32s | valid loss  6.93 | valid ppl  1024.77
-----------------------------------------------------------------------------------------
| epoch 8 | step 500/2080 | loss 8.0036 | lr 0.000 | ngrams/sec 57696.9 | eta 0h0m27s
| epoch 8 | step 1000/2080 | loss 8.0019 | lr 0.000 | ngrams/sec 74550.1 | eta 0h0m14s
| epoch 8 | step 1500/2080 | loss 8.0026 | lr 0.000 | ngrams/sec 74567.4 | eta 0h0m7s
| epoch 8 | step 2000/2080 | loss 7.9993 | lr 0.000 | ngrams/sec 74453.1 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1284.26it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.93it/s]


-----------------------------------------------------------------------------------------
| end of epoch   8 | time 28.31s | valid loss  6.95 | valid ppl  1041.03
-----------------------------------------------------------------------------------------
| epoch 9 | step 500/2080 | loss 7.9669 | lr 0.000 | ngrams/sec 58541.1 | eta 0h0m26s
| epoch 9 | step 1000/2080 | loss 7.9727 | lr 0.000 | ngrams/sec 74354.7 | eta 0h0m14s
| epoch 9 | step 1500/2080 | loss 7.9718 | lr 0.000 | ngrams/sec 74613.8 | eta 0h0m7s
| epoch 9 | step 2000/2080 | loss 7.9775 | lr 0.000 | ngrams/sec 74409.5 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1279.31it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.65it/s]


-----------------------------------------------------------------------------------------
| end of epoch   9 | time 28.32s | valid loss  6.87 | valid ppl   962.81
-----------------------------------------------------------------------------------------
| epoch 10 | step 500/2080 | loss 7.9508 | lr 0.000 | ngrams/sec 57698.7 | eta 0h0m27s
| epoch 10 | step 1000/2080 | loss 7.9525 | lr 0.000 | ngrams/sec 74481.2 | eta 0h0m14s
| epoch 10 | step 1500/2080 | loss 7.9545 | lr 0.000 | ngrams/sec 74452.5 | eta 0h0m7s
| epoch 10 | step 2000/2080 | loss 7.9613 | lr 0.000 | ngrams/sec 74359.9 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1279.46it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.29it/s]


-----------------------------------------------------------------------------------------
| end of epoch  10 | time 28.34s | valid loss  6.88 | valid ppl   974.75
-----------------------------------------------------------------------------------------
| epoch 11 | step 500/2080 | loss 7.9298 | lr 0.000 | ngrams/sec 58515.8 | eta 0h0m27s
| epoch 11 | step 1000/2080 | loss 7.9255 | lr 0.000 | ngrams/sec 74535.5 | eta 0h0m14s
| epoch 11 | step 1500/2080 | loss 7.9402 | lr 0.000 | ngrams/sec 74362.5 | eta 0h0m7s
| epoch 11 | step 2000/2080 | loss 7.9327 | lr 0.000 | ngrams/sec 74537.1 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1277.26it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.81it/s]


-----------------------------------------------------------------------------------------
| end of epoch  11 | time 28.33s | valid loss  6.90 | valid ppl   994.12
-----------------------------------------------------------------------------------------
| epoch 12 | step 500/2080 | loss 7.9048 | lr 0.000 | ngrams/sec 58305.2 | eta 0h0m27s
| epoch 12 | step 1000/2080 | loss 7.9017 | lr 0.000 | ngrams/sec 74224.9 | eta 0h0m14s
| epoch 12 | step 1500/2080 | loss 7.9077 | lr 0.000 | ngrams/sec 74344.6 | eta 0h0m7s
| epoch 12 | step 2000/2080 | loss 7.9093 | lr 0.000 | ngrams/sec 74551.2 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1287.44it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.78it/s]


-----------------------------------------------------------------------------------------
| end of epoch  12 | time 28.39s | valid loss  6.91 | valid ppl  1006.46
-----------------------------------------------------------------------------------------
| epoch 13 | step 500/2080 | loss 7.8883 | lr 0.000 | ngrams/sec 58516.0 | eta 0h0m27s
| epoch 13 | step 1000/2080 | loss 7.8945 | lr 0.000 | ngrams/sec 74533.7 | eta 0h0m14s
| epoch 13 | step 1500/2080 | loss 7.8867 | lr 0.000 | ngrams/sec 74500.7 | eta 0h0m7s
| epoch 13 | step 2000/2080 | loss 7.8949 | lr 0.000 | ngrams/sec 74416.1 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1290.92it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 551.22it/s]


-----------------------------------------------------------------------------------------
| end of epoch  13 | time 28.33s | valid loss  6.89 | valid ppl   983.22
-----------------------------------------------------------------------------------------
| epoch 14 | step 500/2080 | loss 7.8615 | lr 0.000 | ngrams/sec 58439.5 | eta 0h0m27s
| epoch 14 | step 1000/2080 | loss 7.8802 | lr 0.000 | ngrams/sec 74427.8 | eta 0h0m14s
| epoch 14 | step 1500/2080 | loss 7.8815 | lr 0.000 | ngrams/sec 74475.2 | eta 0h0m7s
| epoch 14 | step 2000/2080 | loss 7.8850 | lr 0.000 | ngrams/sec 74386.0 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1284.82it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.57it/s]


-----------------------------------------------------------------------------------------
| end of epoch  14 | time 28.35s | valid loss  6.85 | valid ppl   947.88
-----------------------------------------------------------------------------------------
| epoch 15 | step 500/2080 | loss 7.8629 | lr 0.000 | ngrams/sec 57720.1 | eta 0h0m27s
| epoch 15 | step 1000/2080 | loss 7.8552 | lr 0.000 | ngrams/sec 74398.2 | eta 0h0m14s
| epoch 15 | step 1500/2080 | loss 7.8593 | lr 0.000 | ngrams/sec 74335.7 | eta 0h0m7s
| epoch 15 | step 2000/2080 | loss 7.8522 | lr 0.000 | ngrams/sec 74430.0 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1297.46it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 552.25it/s]


-----------------------------------------------------------------------------------------
| end of epoch  15 | time 28.35s | valid loss  6.92 | valid ppl  1010.61
-----------------------------------------------------------------------------------------
| epoch 16 | step 500/2080 | loss 7.8258 | lr 0.000 | ngrams/sec 58634.3 | eta 0h0m26s
| epoch 16 | step 1000/2080 | loss 7.8339 | lr 0.000 | ngrams/sec 74490.8 | eta 0h0m14s
| epoch 16 | step 1500/2080 | loss 7.8432 | lr 0.000 | ngrams/sec 74422.5 | eta 0h0m7s
| epoch 16 | step 2000/2080 | loss 7.8420 | lr 0.000 | ngrams/sec 74589.1 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1278.51it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.58it/s]


-----------------------------------------------------------------------------------------
| end of epoch  16 | time 28.30s | valid loss  6.89 | valid ppl   986.96
-----------------------------------------------------------------------------------------
| epoch 17 | step 500/2080 | loss 7.8254 | lr 0.000 | ngrams/sec 58641.3 | eta 0h0m26s
| epoch 17 | step 1000/2080 | loss 7.8262 | lr 0.000 | ngrams/sec 74394.6 | eta 0h0m14s
| epoch 17 | step 1500/2080 | loss 7.8261 | lr 0.000 | ngrams/sec 74572.9 | eta 0h0m7s
| epoch 17 | step 2000/2080 | loss 7.8302 | lr 0.000 | ngrams/sec 74329.3 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1290.98it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 551.39it/s]


-----------------------------------------------------------------------------------------
| end of epoch  17 | time 28.32s | valid loss  6.89 | valid ppl   981.76
-----------------------------------------------------------------------------------------
| epoch 18 | step 500/2080 | loss 7.8030 | lr 0.000 | ngrams/sec 58601.9 | eta 0h0m26s
| epoch 18 | step 1000/2080 | loss 7.8108 | lr 0.000 | ngrams/sec 74474.6 | eta 0h0m14s
| epoch 18 | step 1500/2080 | loss 7.8220 | lr 0.000 | ngrams/sec 74453.1 | eta 0h0m7s
| epoch 18 | step 2000/2080 | loss 7.8168 | lr 0.000 | ngrams/sec 74425.9 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1299.89it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 551.94it/s]


-----------------------------------------------------------------------------------------
| end of epoch  18 | time 28.32s | valid loss  6.88 | valid ppl   970.86
-----------------------------------------------------------------------------------------
| epoch 19 | step 500/2080 | loss 7.7932 | lr 0.000 | ngrams/sec 58597.6 | eta 0h0m26s
| epoch 19 | step 1000/2080 | loss 7.7998 | lr 0.000 | ngrams/sec 74372.8 | eta 0h0m14s
| epoch 19 | step 1500/2080 | loss 7.8108 | lr 0.000 | ngrams/sec 74439.3 | eta 0h0m7s
| epoch 19 | step 2000/2080 | loss 7.8144 | lr 0.000 | ngrams/sec 74390.8 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1279.07it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 554.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  19 | time 28.33s | valid loss  6.81 | valid ppl   907.75
-----------------------------------------------------------------------------------------
| epoch 20 | step 500/2080 | loss 7.7825 | lr 0.000 | ngrams/sec 57713.2 | eta 0h0m27s
| epoch 20 | step 1000/2080 | loss 7.7851 | lr 0.000 | ngrams/sec 74506.4 | eta 0h0m14s
| epoch 20 | step 1500/2080 | loss 7.8139 | lr 0.000 | ngrams/sec 74435.5 | eta 0h0m7s
| epoch 20 | step 2000/2080 | loss 7.8182 | lr 0.000 | ngrams/sec 74480.4 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1289.72it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 551.33it/s]


-----------------------------------------------------------------------------------------
| end of epoch  20 | time 28.32s | valid loss  6.75 | valid ppl   855.27
-----------------------------------------------------------------------------------------
| epoch 21 | step 500/2080 | loss 7.7828 | lr 0.000 | ngrams/sec 57617.5 | eta 0h0m27s
| epoch 21 | step 1000/2080 | loss 7.7923 | lr 0.000 | ngrams/sec 74572.6 | eta 0h0m14s
| epoch 21 | step 1500/2080 | loss 7.8007 | lr 0.000 | ngrams/sec 74325.4 | eta 0h0m7s
| epoch 21 | step 2000/2080 | loss 7.8016 | lr 0.000 | ngrams/sec 74627.0 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1271.82it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 552.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  21 | time 28.33s | valid loss  6.71 | valid ppl   821.80
-----------------------------------------------------------------------------------------
| epoch 22 | step 500/2080 | loss 7.7769 | lr 0.000 | ngrams/sec 57683.7 | eta 0h0m27s
| epoch 22 | step 1000/2080 | loss 7.7876 | lr 0.000 | ngrams/sec 74285.7 | eta 0h0m14s
| epoch 22 | step 1500/2080 | loss 7.7850 | lr 0.000 | ngrams/sec 74256.5 | eta 0h0m7s
| epoch 22 | step 2000/2080 | loss 7.7798 | lr 0.000 | ngrams/sec 74407.8 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1272.10it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 549.07it/s]


-----------------------------------------------------------------------------------------
| end of epoch  22 | time 28.37s | valid loss  6.84 | valid ppl   933.23
-----------------------------------------------------------------------------------------
| epoch 23 | step 500/2080 | loss 7.7520 | lr 0.000 | ngrams/sec 58376.9 | eta 0h0m27s
| epoch 23 | step 1000/2080 | loss 7.7509 | lr 0.000 | ngrams/sec 74447.4 | eta 0h0m14s
| epoch 23 | step 1500/2080 | loss 7.7491 | lr 0.000 | ngrams/sec 74490.4 | eta 0h0m7s
| epoch 23 | step 2000/2080 | loss 7.7653 | lr 0.000 | ngrams/sec 74471.3 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1277.03it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.75it/s]


-----------------------------------------------------------------------------------------
| end of epoch  23 | time 28.33s | valid loss  6.84 | valid ppl   929.92
-----------------------------------------------------------------------------------------
| epoch 24 | step 500/2080 | loss 7.7286 | lr 0.000 | ngrams/sec 58624.6 | eta 0h0m26s
| epoch 24 | step 1000/2080 | loss 7.7396 | lr 0.000 | ngrams/sec 74614.4 | eta 0h0m14s
| epoch 24 | step 1500/2080 | loss 7.7500 | lr 0.000 | ngrams/sec 74481.5 | eta 0h0m7s
| epoch 24 | step 2000/2080 | loss 7.7510 | lr 0.000 | ngrams/sec 74474.4 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1270.50it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.26it/s]


-----------------------------------------------------------------------------------------
| end of epoch  24 | time 28.30s | valid loss  6.84 | valid ppl   933.97
-----------------------------------------------------------------------------------------
| epoch 25 | step 500/2080 | loss 7.7191 | lr 0.000 | ngrams/sec 58530.0 | eta 0h0m26s
| epoch 25 | step 1000/2080 | loss 7.7186 | lr 0.000 | ngrams/sec 74519.4 | eta 0h0m14s
| epoch 25 | step 1500/2080 | loss 7.7289 | lr 0.000 | ngrams/sec 74539.0 | eta 0h0m7s
| epoch 25 | step 2000/2080 | loss 7.7491 | lr 0.000 | ngrams/sec 74398.6 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1276.89it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.96it/s]


-----------------------------------------------------------------------------------------
| end of epoch  25 | time 28.33s | valid loss  6.89 | valid ppl   980.10
-----------------------------------------------------------------------------------------
| epoch 26 | step 500/2080 | loss 7.7156 | lr 0.000 | ngrams/sec 58587.3 | eta 0h0m26s
| epoch 26 | step 1000/2080 | loss 7.7251 | lr 0.000 | ngrams/sec 74322.4 | eta 0h0m14s
| epoch 26 | step 1500/2080 | loss 7.7170 | lr 0.000 | ngrams/sec 74318.4 | eta 0h0m7s
| epoch 26 | step 2000/2080 | loss 7.7317 | lr 0.000 | ngrams/sec 74438.6 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1275.90it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.35it/s]


-----------------------------------------------------------------------------------------
| end of epoch  26 | time 28.35s | valid loss  6.87 | valid ppl   960.28
-----------------------------------------------------------------------------------------
| epoch 27 | step 500/2080 | loss 7.7111 | lr 0.000 | ngrams/sec 58631.4 | eta 0h0m26s
| epoch 27 | step 1000/2080 | loss 7.7055 | lr 0.000 | ngrams/sec 74541.5 | eta 0h0m14s
| epoch 27 | step 1500/2080 | loss 7.7180 | lr 0.000 | ngrams/sec 74445.6 | eta 0h0m7s
| epoch 27 | step 2000/2080 | loss 7.7168 | lr 0.000 | ngrams/sec 74352.0 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1282.25it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.47it/s]


-----------------------------------------------------------------------------------------
| end of epoch  27 | time 28.32s | valid loss  6.83 | valid ppl   928.08
-----------------------------------------------------------------------------------------
| epoch 28 | step 500/2080 | loss 7.6919 | lr 0.000 | ngrams/sec 58586.9 | eta 0h0m26s
| epoch 28 | step 1000/2080 | loss 7.7026 | lr 0.000 | ngrams/sec 74438.2 | eta 0h0m14s
| epoch 28 | step 1500/2080 | loss 7.7073 | lr 0.000 | ngrams/sec 74306.4 | eta 0h0m7s
| epoch 28 | step 2000/2080 | loss 7.7240 | lr 0.000 | ngrams/sec 74374.3 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1277.80it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 553.79it/s]


-----------------------------------------------------------------------------------------
| end of epoch  28 | time 28.34s | valid loss  6.80 | valid ppl   898.78
-----------------------------------------------------------------------------------------
| epoch 29 | step 500/2080 | loss 7.7021 | lr 0.000 | ngrams/sec 58635.6 | eta 0h0m26s
| epoch 29 | step 1000/2080 | loss 7.6901 | lr 0.000 | ngrams/sec 74571.4 | eta 0h0m14s
| epoch 29 | step 1500/2080 | loss 7.7067 | lr 0.000 | ngrams/sec 74374.2 | eta 0h0m7s
| epoch 29 | step 2000/2080 | loss 7.7137 | lr 0.000 | ngrams/sec 74513.7 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1269.74it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 548.39it/s]


-----------------------------------------------------------------------------------------
| end of epoch  29 | time 28.31s | valid loss  6.74 | valid ppl   848.41
-----------------------------------------------------------------------------------------
| epoch 30 | step 500/2080 | loss 7.6920 | lr 0.000 | ngrams/sec 58581.8 | eta 0h0m26s
| epoch 30 | step 1000/2080 | loss 7.6942 | lr 0.000 | ngrams/sec 74451.5 | eta 0h0m14s
| epoch 30 | step 1500/2080 | loss 7.6909 | lr 0.000 | ngrams/sec 74495.8 | eta 0h0m7s
| epoch 30 | step 2000/2080 | loss 7.7143 | lr 0.000 | ngrams/sec 74486.9 | eta 0h0m1s


 63%|██████▎   | 131/209 [00:00<00:00, 1271.62it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 552.87it/s]


-----------------------------------------------------------------------------------------
| end of epoch  30 | time 28.32s | valid loss  6.72 | valid ppl   831.57
-----------------------------------------------------------------------------------------
| epoch 31 | step 500/2080 | loss 7.6841 | lr 0.000 | ngrams/sec 58624.5 | eta 0h0m26s
| epoch 31 | step 1000/2080 | loss 7.7016 | lr 0.000 | ngrams/sec 74429.3 | eta 0h0m14s
| epoch 31 | step 1500/2080 | loss 7.6931 | lr 0.000 | ngrams/sec 74475.3 | eta 0h0m7s
| epoch 31 | step 2000/2080 | loss 7.6897 | lr 0.000 | ngrams/sec 74500.7 | eta 0h0m1s


 62%|██████▏   | 130/209 [00:00<00:00, 1280.43it/s]

Evaluating on validation set...


100%|██████████| 209/209 [00:00<00:00, 550.45it/s]


-----------------------------------------------------------------------------------------


 55%|█████▌    | 131/237 [00:00<00:00, 1268.19it/s]

| end of epoch  31 | time 28.32s | valid loss  6.78 | valid ppl   881.33
-----------------------------------------------------------------------------------------
Early stopping
Evaluating on test set...


100%|██████████| 237/237 [00:00<00:00, 496.71it/s]


| End of training | test loss  6.74 | test ppl   844.14


In [156]:
from google.colab import files


files.download('checkpoint.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [160]:
!cp "checkpoint.pth" "gdrive/MyDrive/checkpoint.pth"


cp: cannot create regular file 'gdrive/MyDrive/nlpModels/checkpoint.pth': No such file or directory
