In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from fastai import *        # Quick accesss to most common functionality
from fastai.text import *   # Quick accesss to NLP functionality
import html

# Wikitext 103 (Optional)
This notebook is for training the language model on most of Wikipedia.  
The idea is to create a generalized language model before we fine tune it to predict snippets.

In [3]:
EOS = '<eos>'

In [7]:
PATH=Path('data/wikitext-103')

## Data

Download the dataset [here](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) and unzip it so it's in the folder wikitext.

Blog:
https://einstein.ai/research/blog/the-wikitext-long-term-dependency-language-modeling-dataset

Original notebook:
https://github.com/fastai/fastai_docs/blob/master/dev_nb/007_wikitext_2.ipynb

Small helper function to read the tokens.

In [None]:
download_url('https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', PATH)

In [8]:
def read_file(filename):
    tokens = []
    with open(PATH/filename, encoding='utf8') as f:
        for line in f:
            tokens.append(line.split() + [EOS])
    return np.array(tokens)

In [9]:
train_tok = read_file('wiki.train.tokens')
valid_tok = read_file('wiki.valid.tokens')
test_tok = read_file('wiki.test.tokens')

In [10]:
len(train_tok), len(valid_tok), len(test_tok)

(1801350, 3760, 4358)

In [11]:
' '.join(train_tok[4][:20])

'The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II'

In [13]:
vocab = Vocab(str(PATH))

In [None]:
# vocab = Vocab.create(PATH, train_tok, max_vocab=60000, min_freq=2)

In [None]:
train_ids = np.array([vocab.numericalize(p) for p in train_tok])
valid_ids = np.array([vocab.numericalize(p) for p in valid_tok])
np.save(PATH / 'train_ids.npy', train_ids)
np.save(PATH / 'valid_ids.npy', valid_ids)

## Loading data

In [6]:
text_data = TextLMDataBunch.from_id_files(PATH)

In [7]:
learn = RNNLearner.language_model(text_data)

In [8]:
learn.opt_fn = partial(optim.Adam, betas=(0.8,0.99))
learn.callback_fns.extend([partial(GradientClipping, clip=0.12)])

In [9]:
epoch = 12
lr = 1e-3
momentum = (0.8,0.7)
weight_decay = 1.2e-6

In [19]:
learn.fit_one_cycle(epoch, lr, moms=momentum, wd=weight_decay)

VBox(children=(HBox(children=(IntProgress(value=0, max=4), HTML(value='0.00% [0/4 00:00<00:00]'))), HTML(value…

Total time: 3:23:54
epoch  train loss  valid loss  accuracy
0      3.987870    3.370073    0.382919  (50:58)
1      3.966082    3.356367    0.385371  (50:57)
2      3.893526    3.352040    0.385390  (51:02)
3      3.897372    3.351799    0.385498  (50:56)



In [22]:
learn.save_encoder('lstm_wt103_v2')

In [23]:
learn.save('lstm_wt103_full_v2')

### Testing (for fun)

In [175]:
learn.load('lstm_wt103_full_v2'); learn.model.eval();

In [12]:
lm_vocab = text_data.train_ds.vocab
tokenizer = text_data.train_ds.tokenizer

In [177]:
x_str = "The online encyclopedia project Wikipedia is the most popular wiki-based website, and is"

In [184]:
tokens = tokenizer.process_all([x_str])
x_num = lm_vocab.numericalize(tokens[0])
x_t = torch.tensor(x_num).cuda()

In [185]:
lm_vocab.textify(x_num) # sanity check

'the online encyclopedia project wikipedia is the most popular xxunk - based website , and is'

In [183]:
num_preds = 50
for i in range(num_preds):
    res,*_ = learn.model(x_t.unsqueeze(-1).cuda())
    p1, p2 = res[-1].topk(2)[-1].detach()
    best = p2 if p1.data == 0 else p1 # force it to not predict unknowns
    x_t = torch.cat((x_t, best.unsqueeze(0)))
print('Stem:', x_str)
preds = lm_vocab.textify(x_t[-num_preds:])
print('\nPredicted:', preds)

Stem: The online encyclopedia project Wikipedia is the most popular wiki-based website, and is

Predicted: the online encyclopedia project wikipedia is the most popular xxunk - based website , and is the best selling of all the year 's best selling . " the magazine has been the subject of several publications , including the " the best of the world " , the " the best of the world " , the " the best of the world " , the " best of the world " , the " best of the year " list , and the " best of the year " list . the magazine was also ranked the best magazine in the world by the magazine 's editor , and the magazine 's " best of the year " list . the magazine was ranked the best magazine in the world in the first half of the 20th century , and was ranked the best magazine in the world in the late 1980s . the magazine was ranked the best magazine in the world in the 1990s , and was ranked the best magazine in the world in the 2000s . the magazine was ranked the best magazine in the world in