In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [2]:
PATH='data/nietzsche/'

In [3]:
# Download the corpus into PATH, then load it fully into memory.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read through a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600901


In [4]:
text[:100]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all ph'

In [7]:
text[-100:]

'al to sanctity, just as they were denounced by the\nchristian world as the indications of sinfulness.'

# Find the unique characters in the text to build the vocabulary

In [9]:
# Sorted list of every distinct character that occurs in the corpus.
chars = sorted(set(text))
# One more than the number of distinct characters
# (presumably reserving a slot for the "\0" padding character added later - confirm).
vocab_size = len(chars) + 1
# `vocab` is kept because a later cell prints it, but note that the name is
# rebound further down by `from torchtext import vocab, data`; prefer
# `vocab_size` when the integer is needed.
vocab = vocab_size

In [13]:
print(f"total chars(vocabulary): {vocab}")

total chars(vocabulary): 86


Sometimes it is useful to reserve a "\0" character, either for padding or to mark where a text starts.

In [17]:
# Put the "\0" padding character at index 0. The guard keeps the cell
# idempotent: re-running it must not add a second "\0" entry.
if "\0" not in chars:
    chars.insert(0, "\0")

In [26]:
chars = sorted(set(chars))

In [27]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

# Build mappings from each unique character to its index and back

In [28]:
# Forward lookup: character -> position in `chars`.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
# Reverse lookup: position in `chars` -> character.
indices_char = dict(enumerate(chars))

In [29]:
char_indices['M']

37

In [30]:
indices_char[37]

'M'

Convert every character of the text into its index and store the result in `idx`.

In [31]:
# Encode the whole corpus as a list of integer character indices.
idx = list(map(char_indices.__getitem__, text))

In [33]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [35]:
''.join(indices_char[i] for i in idx[:40])

'PREFACE\n\n\nSUPPOSING that Truth is a woma'

In [36]:
n_hidden = 256

In [37]:
n_fac = 42

In [38]:
# NOTE: this import rebinds the notebook-level name `vocab` (assigned an
# integer in an earlier cell) to the `torchtext.vocab` module.
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

# Same value as the PATH assigned at the top of the notebook.
PATH='data/nietzsche/'

# Sub-folders (relative to PATH) holding the training and validation splits.
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
# Full paths of the two split folders.
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

In [39]:
file_to_split = f'{PATH}nietzsche.txt'

In [44]:
#total_file = len(open(file_to_split, 'r').read().split('\n'))

In [45]:
len(text)

600901

In [50]:
# 75% of the characters go to training (rounded down) ...
trn_size = int(len(text) * 0.75)
# ... and whatever is left forms the validation split.
val_size = len(text) - trn_size

In [51]:
trn_size, val_size

(450675, 150226)

In [53]:
len(text[-val_size:])

150226

In [56]:
# The training folder is not created anywhere else in the notebook, so make
# sure it exists before writing (no-op when it is already there).
os.makedirs(TRN, exist_ok=True)
# Write the first `trn_size` characters as the training split.
with open(f"{TRN}nietzsche.txt", "w") as text_file:
    text_file.write(text[:trn_size])

In [57]:
# The validation folder is not created anywhere else in the notebook, so make
# sure it exists before writing (no-op when it is already there).
os.makedirs(VAL, exist_ok=True)
# Write the last `val_size` characters as the validation split.
with open(f"{VAL}nietzsche.txt", "w") as text_file:
    text_file.write(text[-val_size:])

In [58]:
# Character-level field: `tokenize=list` splits a string into single
# characters and `lower=True` lower-cases the text.
TEXT = data.Field(lower=True, tokenize=list)

# Hyperparameters: batch size, back-prop-through-time length,
# embedding width and hidden-state width.
bs = 64
bptt = 8
n_fac = 42
n_hidden = 256

# The validation folder doubles as the test set.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}
md = LanguageModelData.from_text_files(
    PATH, TEXT, bs=bs, bptt=bptt, min_freq=3, **FILES
)

In [61]:
dl = iter(md.trn_dl)
xs, ys = next(dl)
xs,ys;

In [63]:
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(865, 54, 1, 443419)

In [68]:
n_fac, vocab

(42,
 <module 'torchtext.vocab' from 'C:\\Users\\Shota\\Anaconda3\\envs\\fastai\\lib\\site-packages\\torchtext\\vocab.py'>)

# RNN

In [74]:
class CharSeqStatefulRnn(nn.Module):
    """Stateful character-level language model built on a single ``nn.RNN``.

    The hidden state is kept on the instance (``self.h``) and carried from one
    ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; size of the output distribution.
        n_fac: width of the character embedding.
        bs: batch size used to allocate the initial hidden state.
        nh: hidden-state width. Defaults to the notebook-level ``n_hidden``
            as it is at construction time.
    """

    def __init__(self, vocab_size, n_fac, bs, nh=None):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the hidden width on the instance. Reading the global again
        # in init_hidden would produce a wrong-sized state if `n_hidden` is
        # rebound after this model has been built.
        self.n_hidden = n_hidden if nh is None else nh
        # rnn: h_t = tanh(w_ih x_t + b_ih + w_hh h_(t-1) + b_hh)
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, self.n_hidden)
        self.out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, data):
        """Return log-probabilities flattened to ``(-1, vocab_size)``.

        The batch size is read from ``data[0].size(0)``, i.e. the second
        dimension of ``data``.
        """
        bs = data[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the
        # one the stored state was created for.
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        output, h = self.rnn(self.e(data), self.h)
        # Keep the new state for the next call; repackage_var presumably
        # detaches it from the autograd history - confirm against fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.out(output), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape ``(1, bs, n_hidden)``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [75]:
m = CharSeqStatefulRnn(md.nt, n_fac, bs)

In [76]:
opt = optim.Adam(m.parameters(), lr = 1e-3)

In [78]:
fit(m, md, 4,opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.619157   1.652531  
    1      1.566626   1.622346                                                                                         
    2      1.534958   1.595991                                                                                         
    3      1.509614   1.582886                                                                                         



[array([1.58289])]

# GRU

In [79]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character-level language model built on a single ``nn.GRU``.

    The hidden state is kept on the instance (``self.h``) and carried from one
    ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; size of the output distribution.
        n_fac: width of the character embedding.
        bs: batch size used to allocate the initial hidden state.
        nh: hidden-state width. Defaults to the notebook-level ``n_hidden``
            as it is at construction time.
    """

    def __init__(self, vocab_size, n_fac, bs, nh=None):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the hidden width on the instance. Reading the global again
        # in init_hidden would produce a wrong-sized state if `n_hidden` is
        # rebound after this model has been built.
        self.n_hidden = n_hidden if nh is None else nh
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to ``(-1, vocab_size)``.

        The batch size is read from ``cs[0].size(0)``, i.e. the second
        dimension of ``cs``.
        """
        bs = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the
        # one the stored state was created for.
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # Keep the new state for the next call; repackage_var presumably
        # detaches it from the autograd history - confirm against fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape ``(1, bs, n_hidden)``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [82]:
m = CharSeqStatefulGRU(md.nt, n_fac, bs)
opt = torch.optim.Adam(m.parameters(), lr = 1e-3)

In [83]:
fit(m, md, 6, opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.773303   1.763802  
    1      1.588406   1.615119                                                                                         
    2      1.493089   1.547361                                                                                         
    3      1.441945   1.515519                                                                                         
    4      1.403122   1.498071                                                                                         
    5      1.375237   1.485652                                                                                         



[array([1.48565])]

In [84]:
set_lrs(opt, 1e-4)

In [85]:
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.286972   1.448796  
    1      1.28715    1.444437                                                                                         
    2      1.287711   1.442686                                                                                         



[array([1.44269])]

Increasing the batch size (`bs`) can give better results.

# LSTM

In [86]:
from fastai import sgdr

n_hidden=512

In [87]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character-level language model built on a stacked ``nn.LSTM``.

    The recurrent state is kept on the instance as a two-tensor tuple
    (``self.h``) and carried from one ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; size of the output distribution.
        n_fac: width of the character embedding.
        bs: batch size used to allocate the initial state.
        nl: number of stacked LSTM layers.
        nh: hidden-state width. Defaults to the notebook-level ``n_hidden``
            as it is at construction time.
    """

    def __init__(self, vocab_size, n_fac, bs, nl, nh=None):
        super().__init__()
        self.vocab_size, self.nl = vocab_size, nl
        # Snapshot the hidden width on the instance. Reading the global again
        # in init_hidden would produce a wrong-sized state if `n_hidden` is
        # rebound after this model has been built.
        self.n_hidden = n_hidden if nh is None else nh
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, self.n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to ``(-1, vocab_size)``.

        The batch size is read from ``cs[0].size(0)``, i.e. the second
        dimension of ``cs``.
        """
        bs = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the
        # one the stored state was created for.
        if self.h[0].size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # Keep the new state for the next call; repackage_var presumably
        # detaches it from the autograd history - confirm against fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset both state tensors to zeros of shape ``(nl, bs, n_hidden)``."""
        self.h = (V(torch.zeros(self.nl, bs, self.n_hidden)),
                  V(torch.zeros(self.nl, bs, self.n_hidden)))

In [91]:
# Two-layer LSTM. Use the notebook-wide batch size `bs` rather than a
# hard-coded 64 so the initial state matches the data loader when `bs` changes.
m = CharSeqStatefulLSTM(md.nt, n_fac, bs, 2)
# Adam wrapped in fastai's LayerOptimizer; the two numbers are presumably the
# learning rate and the weight decay - confirm against the fastai signature.
lo = LayerOptimizer(torch.optim.Adam, m, 1e-2, 1e-5)

In [92]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.837188   1.770575  
    1      1.723521   1.659877                                                                                         


[array([1.65988])]

# Checkpoints are written under PATH/models, which nothing else in the notebook
# creates; make sure it exists so the first save does not fail.
os.makedirs(f'{PATH}models', exist_ok=True)


def on_end(sched, cycle):
    """Save a checkpoint of `m`, named after the cycle number passed in."""
    save_model(m, f'{PATH}models/cyc_{cycle}')


# Cosine annealing starting with a cycle of one pass over the training loader;
# with cycle_mult=2 the 2**4-1 = 15 epochs presumably split into cycles of
# 1, 2, 4 and 8 epochs - confirm against fastai's CosAnneal.
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

# Test

In [123]:
def get_next(inp):
    """Sample one next character for the seed string `inp` from model `m`.

    The seed is numericalized with `TEXT`, run through `m`, and one index is
    drawn at random from the distribution in the last output row.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0, 1)))
    # The model emits log-probabilities; exponentiate the last row to get the
    # weights that multinomial sampling expects.
    next_weights = log_probs[-1].exp()
    sampled = torch.multinomial(next_weights, 1)
    next_id = to_np(sampled)[0]
    return TEXT.vocab.itos[next_id]

In [124]:
def get_next_n(inp, n):
    """Extend the seed string `inp` by `n` sampled characters.

    Each step feeds the current window to `get_next`, appends the sampled
    character to the result, and slides the window forward by one character
    so its length stays constant.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        nxt = get_next(window)
        pieces.append(nxt)
        # Drop the oldest character and append the new one.
        window = window[1:] + nxt
    return ''.join(pieces)

In [None]:
print(get_next_n('for thos', 400))