In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls "gdrive/MyDrive/wikitext-2" # check that it has successfully connected
# files should be at ur GDrive inside folder wikitext-2
!cp "gdrive/MyDrive/wikitext-2/wiki.test.tokens.txt" "test.txt" # copy the files to colab runtime
!cp "gdrive/MyDrive/wikitext-2/wiki.train.tokens.txt" "train.txt"
!cp "gdrive/MyDrive/wikitext-2/wiki.valid.tokens.txt" "valid.txt"

wiki.test.tokens      wiki.train.tokens      wiki.valid.tokens
wiki.test.tokens.txt  wiki.train.tokens.txt  wiki.valid.tokens.txt


# Preprocessing

In [3]:
import os
from io import open
import torch

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # remove the headers e.g.  = = Description = = 
                if line.startswith('='): 
                    continue
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word.lower()) # make to lower

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word.lower()]) # make to lower
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
    
    @property
    def vocab_size(self):
        return len(self.dictionary.idx2word)

# Params

In [13]:
#=== params
corpus = Corpus('/content')
n_class = corpus.vocab_size
n_step = 2 # n-1 in paper
n_hidden = 2 # h in paper
m = 2       # m in paper
epochs = 5000
batch_size = 20
order = 2 # order (int): the order of the language model, i.e. length of the history
epochs = 500
learning_rate = 0.5 #0.001
cuda = torch.cuda.is_available()
seed = 42
#===

# Model

In [5]:
#== MODEL ==#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

'''
    the neural model learns the distributed representation of each word 
    (embedding matrix C) and 
    the probability function of a word sequence as a function of their distributed representations. 
    It has a hidden layer with 
    tanh activation and the output layer is a 
    Softmax layer. 
    The output of the model for each 
    input of (n - 1) previous words are the 
    probabilities over the |V | words in the vocabulary for the next word.
'''
class FNNModel(nn.Module):
    def __init__(self, n_class, m, n_step, n_hidden):
        super(FNNModel,self).__init__()
        """
        Args:
            n_class (int): no. of vocabulary
            m (int): size of each embedding vector
#n-gram models construct tables of conditional probabilities for the next word, 
#for each one of a large number of contexts, i.e. combinations of the last n − 1 words
            n_step (int): n-1 in paper. #n_step + 1 = n-gram. if n_step = 1, bigram
            n_hidden (int): no. of hidden units associated with each word
        """
        """
        Vars:
            C: encoder (|V| x m)
            H: hiden layer weight (n x (n-1)m)
            W: word feature to output weights (|V| x (n-1)m)
            d: hidden layer bias (has h no. of elements)
            U: hidden-to-output weights (|V| × h matrix)
            b: output bias (has |V| no. of elements)
        """
        self.C = nn.Embedding(n_class,m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(torch.Tensor))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(torch.Tensor))
        self.d = nn.Parameter(torch.randn(n_hidden).type(torch.Tensor))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(torch.Tensor))
        self.b = nn.Parameter(torch.randn(n_class).type(torch.Tensor))

    def forward(self,x):
        x = self.C(x)
        x = x.view(-1, n_step * m) # batch_size,n_step * n_class
        tanh = torch.tanh(self.d + torch.mm(x, self.H))
        output = self.b + torch.mm(x,self.W)+torch.mm(tanh,self.U)
        return output

# Data Loading

In [39]:
def batchify(data, batch_size):
    # Work out how cleanly we can divide the dataset into args.batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data
def get_batch(data, i, order):
    x = Variable(torch.t(data[i:i+order]))
    y = Variable(data[i+order].view(-1))
    return x, y
def evaluate(data, model, criterion):
	model.eval()
	total_loss = 0
	n_steps = data.size(0) - order - 1
	for i in tqdm(range(n_steps)):
		x, y = get_batch(data, i, order)
		out = model(x)
		loss = criterion(out, y)
		total_loss += loss.data.data
	return total_loss / n_steps

In [15]:
def clock_time(s):
    h, s = divmod(s, 3600)
    m, s = divmod(s, 60)
    return int(h), int(m), int(s)

In [40]:
import numpy as np
import torch.optim as optim
from tqdm import tqdm
import time


train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)
if cuda:
	train_data, val_data, test_data = train_data.cuda(), val_data.cuda(), test_data.cuda()
print('Using cuda: {}'.format(cuda))
print('Size of training set: {:,} tokens'.format(np.prod(train_data.size())))
print('Size of validation set: {:,} tokens'.format(np.prod(val_data.size())))
print('Size of test set: {:,} tokens'.format(np.prod(test_data.size())))
print('Vocabulary size: {:,}'.format(corpus.vocab_size))
print('Example data:')
for k in range(100, 107):
    x = [corpus.dictionary.idx2word[i] for i in train_data[k:order+k, 0]]
    y = [corpus.dictionary.idx2word[train_data[k+order, 0]]]
    print(x, y)
#=== initialise model
model = FNNModel(
    n_class, 
    m, 
    n_step, 
    n_hidden)
if cuda:
  model.cuda()
# Display the model's architecture
print('Model: \n', model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=learning_rate)

Using cuda: True
Size of training set: 2,088,620 tokens
Size of validation set: 217,640 tokens
Size of test set: 245,560 tokens
Vocabulary size: 28,912
Example data:
['"', 'nameless'] ['"']
['nameless', '"'] [',']
['"', ','] ['a']
[',', 'a'] ['penal']
['a', 'penal'] ['military']
['penal', 'military'] ['unit']
['military', 'unit'] ['serving']
Model: 
 FNNModel(
  (C): Embedding(28912, 2)
)


In [41]:
# Set seed for reproducibility.
torch.manual_seed(seed)
np.random.seed(seed)
parameters = [param for param in model.parameters() if param.requires_grad]
# Training
print('Training...')
losses = dict(train=[], val=[])

lr = learning_rate # so that can alter later if SGD not descending 
best_val_loss = None

num_steps = train_data.size(0) - order - 1
batch_order = np.arange(num_steps)

t0 = time.time()
try:
    for epoch in range(1, epochs+1):
        model.train()
        epoch_start_time = time.time()
        np.random.shuffle(batch_order)

        for step in range(1, num_steps+1):
            idx = batch_order[step-1]
            x, y = get_batch(train_data, idx, order)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, y)
            debug = False
            #   if debug:
            #     # Debugging softmax approximation.
            #     xe = nn.CrossEntropyLoss()
            #     true_loss = xe(logits, y)
            #     print('approx {:>3.2f}, true {:>3.2f}, diff {:>3.4f}'.format(
            #       loss.data, true_loss.data, true_loss.data - loss.data))

            # Update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save loss.
            losses['train'].append(loss.cpu().data)
            print_every = 1000
            if step % print_every == 0:
                avg_loss = sum(losses['train'][-print_every:]) / print_every
                t1 = time.time()
                steps_per_second = print_every / (t1 - t0)
                print('| epoch {} | step {}/{} | loss {:.4f} | lr {:.3f} | '
                    'ngrams/sec {:.1f} | eta {}h{}m{}s'.format(
                    epoch, step, num_steps, avg_loss, lr,
                    steps_per_second * batch_size,
                    *clock_time((num_steps - step) / steps_per_second)))
                t0 = time.time()
            
        print('Evaluating on validation set...')
        val_loss = evaluate(val_data, model, criterion)
        losses['val'].append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
            epoch, (time.time() - epoch_start_time), val_loss, torch.exp(val_loss)))
        print('-' * 89)

        if not best_val_loss or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'checkpoint.pth')
            #=== download checkpoint file
            # files.download('checkpoint.pth')
        else:
            # reduce the learning rate if no improvement has been seen in the validation dataset.
            print("reducing learning rate")
            lr /= 4.0
            optimizer = torch.optim.SGD(parameters, lr=lr)
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')
    
# write_losses(losses['train'], args.log_dir, name='train-losses')
# write_losses(losses['val'], args.log_dir, name='val-losses')

print('Evaluating on test set...')
test_loss = evaluate(test_data, model, criterion)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, torch.exp(test_loss)))
print('=' * 89)

Training...
| epoch 1 | step 1000/104428 | loss 42.6267 | lr 0.500 | ngrams/sec 7157.6 | eta 0h4m49s
| epoch 1 | step 2000/104428 | loss 79.2601 | lr 0.500 | ngrams/sec 7583.9 | eta 0h4m30s
| epoch 1 | step 3000/104428 | loss 89.8685 | lr 0.500 | ngrams/sec 6920.6 | eta 0h4m53s
| epoch 1 | step 4000/104428 | loss 89.7026 | lr 0.500 | ngrams/sec 6561.5 | eta 0h5m6s
| epoch 1 | step 5000/104428 | loss 95.4461 | lr 0.500 | ngrams/sec 7506.5 | eta 0h4m24s
| epoch 1 | step 6000/104428 | loss 101.9821 | lr 0.500 | ngrams/sec 7653.8 | eta 0h4m17s
| epoch 1 | step 7000/104428 | loss 99.1816 | lr 0.500 | ngrams/sec 7609.6 | eta 0h4m16s
| epoch 1 | step 8000/104428 | loss 100.9478 | lr 0.500 | ngrams/sec 7137.3 | eta 0h4m30s
| epoch 1 | step 9000/104428 | loss 95.2506 | lr 0.500 | ngrams/sec 7362.4 | eta 0h4m19s
| epoch 1 | step 10000/104428 | loss 94.7713 | lr 0.500 | ngrams/sec 7556.9 | eta 0h4m9s
| epoch 1 | step 11000/104428 | loss 101.5965 | lr 0.500 | ngrams/sec 7510.7 | eta 0h4m8s
| epoch

  2%|▏         | 209/10879 [00:00<00:05, 2084.39it/s]

Evaluating on validation set...


100%|██████████| 10879/10879 [00:05<00:00, 2077.02it/s]


-----------------------------------------------------------------------------------------
| end of epoch   1 | time 287.30s | valid loss 116.62 | valid ppl      inf
-----------------------------------------------------------------------------------------
| epoch 2 | step 1000/104428 | loss 116.6240 | lr 0.500 | ngrams/sec 2213.3 | eta 0h15m34s
| epoch 2 | step 2000/104428 | loss 119.9842 | lr 0.500 | ngrams/sec 6895.8 | eta 0h4m57s
| epoch 2 | step 3000/104428 | loss 116.8125 | lr 0.500 | ngrams/sec 7081.4 | eta 0h4m46s


  2%|▏         | 204/12275 [00:00<00:05, 2039.17it/s]

-----------------------------------------------------------------------------------------
Exiting from training early
Evaluating on test set...


100%|██████████| 12275/12275 [00:06<00:00, 1970.15it/s]

| End of training | test loss 126.30 | test ppl      inf





In [42]:
from google.colab import files
files.download('checkpoint.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>