Implemention and exploration of RNN, GRU, and LSTM using a Tale of Two Cities by Charles Dickens, taken from the Project Gutenburg corpus dataset. 

The entire dataset can be found here:<https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html>
This is a collection of 3,036 English books written by 142 authors

## Import Libraries 

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2


from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## Preparing Data

In [2]:
PATH = "data/dickens/"

In [3]:
works_list = [i for i in os.listdir(f"./{PATH}")]


In [5]:
works_list[4]

'Charles Dickens___A Tale of Two Cities.txt'

In [7]:
text = open(f'{PATH}/{works_list[4]}').read()
print (len(text))

757228


In [8]:
text = text[1900:] #remove table of contents 

In [9]:
#create a list from the set (which is an unordered collection of unique elements)
#sort the list in ascending order
chars = sorted(list(set(text)))
vocab_size = len(chars)
print ("The total number of chars: ", vocab_size)
print ('\n',chars)

The total number of chars:  74

 ['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '1', '2', '5', '6', '7', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '©', 'Ã']


In [10]:
#inserts 0 character 
chars.insert(0, "\0")

In [11]:
#creates mapping for each character to an index and vice versa 
char_indices = dict((c,i) for i,c in enumerate(chars))
indices_char = dict((i,c) for i,c in enumerate(chars))

In [12]:
#find indexes for all characters in text 
idxs = [char_indices[c] for c in text]
idxs[:10]

[9, 1, 55, 66, 2, 69, 47, 65, 2, 66]

In [13]:
#creating data with 3 characters being skipped in each dataset
#so c1_dat would have first letter, then skip to 4th,..
#second ataset would have second letter, then skip to 5th 
#later we will create sets that do not have overlapping info. 
char_skip = 3
c1_dat = [idxs[i] for i in range(0,len(idxs)-char_skip, char_skip)]
c2_dat = [idxs[i+1] for i in range(0,len(idxs)-char_skip, char_skip)]
c3_dat = [idxs[i+2] for i in range(0,len(idxs)-char_skip, char_skip)]
c4_dat = [idxs[i+3] for i in range(0,len(idxs)-char_skip, char_skip)]


In [14]:
#create 1d numpy arrays from the above lists 
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)
y = np.stack(c4_dat)

In [15]:
#as can be seen, the 2nd element in x1 (character 4) is the 1st element in y 
#which starts at character 4.... 
x1[:4],x2[:4],x3[:4],y[:4]

(array([ 9, 66, 47, 66]),
 array([ 1,  2, 65, 54]),
 array([55, 69,  2, 51]),
 array([66, 47, 66,  2]))

In [16]:
#recreate and print out as a sanity check
dat_text_1 = [indices_char[c] for c in x1]
dat_text_2 = [indices_char[c] for c in x2]
dat_text_3 = [indices_char[c] for c in x3]
dat_text_4 = [indices_char[c] for c in y]

recreate = []
for i in range(0,len(x1)):
    for element in [dat_text_1,dat_text_2,dat_text_3]:
        recreate.append(element[i])

"".join(recreate)[:500]

',\nit was the worst of times,\nit was the age of wisdom,\nit was the age of foolishness,\nit was the epoch of belief,\nit was the epoch of incredulity,\nit was the season of Light,\nit was the season of Darkness,\nit was the spring of hope,\nit was the winter of despair,\nwe had everything before us,\nwe had nothing before us,\nwe were all going direct to Heaven,\nwe were all going direct the other way--\nin short, the period was so far like the present period, that some of\nits noisiest authorities insisted o'

In [17]:
x1.shape, y.shape

((251775,), (251775,))

## Model Training: Simple FC RNN

A simple fully connected RNN, it takes three input characters at time and outputs a prediction for the next character. 

In [18]:
n_hidden = 256 #number of activations in hidden layers 
n_facs = 42 #number of embedding factors 

In [19]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_facs):
        super().__init__() 
        #embedding matrix for each character 
        self.embed = nn.Embedding(vocab_size,n_facs)
        #weight matrix for incoming letters, includes bias 
        self.lin_in = nn.Linear(n_facs,n_hidden)
        #weight matrix for hidden to hidden transitions 
        self.lin_hidden = nn.Linear(n_hidden,n_hidden)
        #weight matrix for FC layer
        self.lin_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, c1, c2, c3):
        input_char1 = F.relu(self.lin_in(self.embed(c1)))
        input_char2 = F.relu(self.lin_in(self.embed(c2)))
        input_char3 = F.relu(self.lin_in(self.embed(c3)))
        
        h = V(torch.zeros(input_char1.size())).cuda()
        h = F.tanh(self.lin_hidden(h+input_char1))
        h = F.tanh(self.lin_hidden(h+input_char2))
        h = F.tanh(self.lin_hidden(h+input_char3))
        
        return F.log_softmax(self.lin_out(h))

In [20]:
md=ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3],axis=1),
                                 y, bs=512)

In [21]:
m = Char3Model(vocab_size, n_facs).cuda()

In [22]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [23]:
opt = optim.Adam(m.parameters(),lr=1e-2)

In [24]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      2.029694   3.602037  



[array([3.60204])]

In [25]:
set_lrs(opt, 0.001)

In [26]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      1.806318   1.042397  



[array([1.0424])]

##  Model Testing: Simple FC RNN

In [27]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    probs = m(*VV(idxs))
    i = np.argmax(to_np(probs))
    return chars[i]


In [28]:
get_next("y. ")

'T'

In [29]:
get_next("you")

' '

In [30]:
get_next(" an")

'd'

## Model Training: 8 Char RNN

This RNN takes 8 input characters to predict the 9th. 

Below we create datasets to represent this

In [31]:
cs = 8 
#each element in in_data would be lists of 8 characters 
in_data = [[idxs[i+j] for i in range(cs)] for j in range(len(idxs)-cs)] 
out_data = [idxs[cs+j] for j in range(len(idxs)-cs)] #every ninth character

In [32]:
xs = np.stack(in_data)
y = np.stack(out_data)

val_idx = get_cv_idxs(len(idxs)-cs-1)
md = ColumnarModelData.from_arrays(".", val_idx, xs, y, bs=512)

In [33]:
class CharLoopModel(nn.Module):
    def __init__(self, vocab_size, n_facs):
        super().__init__()
        self.embed = nn.Embedding(vocab_size,n_facs)
        self.lin_in = nn.Linear(n_facs+n_hidden,n_hidden)
        self.lin_hidden = nn.Linear(n_hidden,n_hidden)
        self.lin_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h,self.embed(c)),1)
            inp = F.relu(self.lin_in(inp))   
            h = F.tanh(self.lin_hidden(inp))
        
        return F.log_softmax(self.lin_out(h),dim=-1)

In [34]:
m = CharLoopModel(vocab_size, n_facs).cuda()
opt = optim.Adam(m.parameters(), 0.001)


In [35]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.692284   1.68296   



[array([1.68296])]

## Model Testing: 8 Char RNN

In [36]:
set_lrs(opt,1e-4)
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.599182   1.598317  



[array([1.59832])]

In [37]:
get_next("also in ")

't'

In [38]:
get_next("good da")

'y'

In [39]:
get_next("very go")

'o'

## Model Training: RNN with Pytorch 

This model uses the Pytorch RNN implementation

In [40]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, n_facs):
        super().__init__()
        self.embed = nn.Embedding(vocab_size,n_facs)
        self.rnn = nn.RNN(n_facs, n_hidden)
        self.lin_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1,bs, n_hidden).cuda())
        inp = self.embed(torch.stack(cs))
        outp, h = self.rnn(inp,h)
        
        return F.log_softmax(self.lin_out(outp[-1]),dim=-1)

In [41]:
m = CharRNN(vocab_size, n_facs).cuda()
opt = optim.Adam(m.parameters(), 0.001)

In [42]:
it = iter(md.trn_dl)
*batch_x, batch_y = next(it)

In [43]:
inp = m.embed(V(torch.stack(batch_x)))

In [44]:
hidden_temp = V(torch.zeros(1,512,n_hidden))
outp, h = m.rnn(inp,hidden_temp)

In [45]:
outp.size(), h.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [46]:
fit(m,md,4,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.742012   1.739475  
    1      1.600827   1.602397                                
    2      1.530463   1.539789                                
    3      1.469388   1.503588                                



[array([1.50359])]

In [47]:
set_lrs(opt,1e-4)
fit(m,md,2,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.403896   1.458415  
    1      1.412412   1.45423                                 



[array([1.45423])]

## Model Testing: RNN with Pytorch

It becomes very clear this simple RNN model has no understanding of sentence structure, nor can it recall previous predictions, thus below we see it repeat the same prediciton

In [48]:
print(get_next("for thos"))
print(get_next("good da"))

e
y


In [49]:
def get_next_n(inp,n):
    res=inp
    for i in range(n):
        c = get_next(inp)
        inp = inp[1:] + c
        res += c 
    return res

In [50]:
get_next_n("for thos", 40)

'for those that he had been the streets, and the '

## Model Training: Multi-Output RNN


Below we stop having overlapping information as was seen in character lists of the previous implementations.

In [51]:
#non overlapping sets
in_data = [[idxs[i+j] for i in range(cs)] for j in range(0,len(idxs)-cs-1,cs)]

#offset above data by one character for the labels (i.e. what we shouldve guessed)
out_data = [[idxs[i+j] for i in range(cs)] for j in range(1,len(idxs)-cs, cs)]

In [52]:
xs = np.stack(in_data)
ys = np.stack(out_data)

In [53]:
xs.shape, ys.shape

((94415, 8), (94415, 8))

In [54]:
xs[:cs,:], ys[:cs,:]

(array([[ 9,  1, 55, 66,  2, 69, 47, 65],
        [ 2, 66, 54, 51,  2, 69, 61, 64],
        [65, 66,  2, 61, 52,  2, 66, 55],
        [59, 51, 65,  9,  1, 55, 66,  2],
        [69, 47, 65,  2, 66, 54, 51,  2],
        [47, 53, 51,  2, 61, 52,  2, 69],
        [55, 65, 50, 61, 59,  9,  1, 55],
        [66,  2, 69, 47, 65,  2, 66, 54]]),
 array([[ 1, 55, 66,  2, 69, 47, 65,  2],
        [66, 54, 51,  2, 69, 61, 64, 65],
        [66,  2, 61, 52,  2, 66, 55, 59],
        [51, 65,  9,  1, 55, 66,  2, 69],
        [47, 65,  2, 66, 54, 51,  2, 47],
        [53, 51,  2, 61, 52,  2, 69, 55],
        [65, 50, 61, 59,  9,  1, 55, 66],
        [ 2, 69, 47, 65,  2, 66, 54, 51]]))

In [55]:
#-cs to ensure all samples same size, minus 1 for label set
val_idx = get_cv_idxs(len(xs)-cs-1) 

In [56]:
md = ColumnarModelData.from_arrays(".", val_idx, xs, ys, bs=512)

In [57]:
class CharSeqRNN(nn.Module):
    def __init__(self, vocab_size, n_facs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,n_facs)
        self.rnn = nn.RNN(n_facs, n_hidden)
        self.linear_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self,*cs):
        bs = cs[0].size(0) #size of first sample, would be 8 
        h = V(torch.zeros(1,bs,n_hidden)) #rank 3 because of nn.RNN 
        inp = self.embedding(torch.stack(cs)) #convert to embedding matrices 
        outp, h = self.rnn(inp,h)
        return F.log_softmax(self.linear_out(outp),dim=-1)
    

In [58]:
m = CharSeqRNN(vocab_size, n_facs).cuda()
opt = optim.Adam(m.parameters(), 0.001)

In [59]:
#define because F.nll_loss expects rank 2 tensor, outp is rank 3 tensor 
def nll_loss_seq(inp, targ):
    _,_,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [60]:
fit (m,md,4,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      2.403913   2.263301  
    1      2.122646   2.069694                              
    2      1.996011   1.968641                              
    3      1.920056   1.908909                              



[array([1.90891])]

In [61]:
set_lrs(opt, 1e-4)

In [62]:
fit (m,md,1,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      1.878308   1.891948  



[array([1.89195])]

## Stateful RNN

Up utill now the hidden layer has not been saved,it is thrown away after each segment is processed.
For example to predict the second letter based on the first one for the second segment, minus cs to ensure all samples same size, minus 1 for label set we'd be going off the default activation of H. 
So now we will not reset hidden layer in every forward pass. 

### Loading Data

In [63]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

In [64]:
PATH='./data/dickens/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

In [65]:
TEXT = data.Field(lower=True, tokenize=list) 
bs =64; bptt=8; n_fac =42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, 
                                       min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(1164, 42, 1, 596754)

### Creating RNN

The new variable self.h below, has the same value but no history of operations, as we repackaged out of the tensor extracted. Therefore when it tries to back-propagate, it will stop there, saving lots of memory. Keep hidden state but not the history as to how we got to it.  

In [66]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self,vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        #this line in the constructor will set self.h to be a bunch of zeros 
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        #if minibatch size has changed, (happens unless dataset divisible by bptt*bs)
        if self.h.size(1) != bs:
            self.init_hidden(bs) 
        outp, h = self.rnn(self.e(cs), self.h)
        #key for memory, for very large bptt memory complexitiy would go overboard
        #remembering all the operations and stuff, so repackage_var wraps h 
        #in a new variable forgetting all the history
        self.h = repackage_var(h) 
        #returns output of logmax across last axis, after reshaping to rank 2
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        self.h = V(torch.zeros(1, bs, n_hidden))
        
        
        
    

In [67]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

fit(m,md,4,opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                 
    0      1.810391   1.798723  
    1      1.655623   1.669346                                 
    2      1.572923   1.60699                                  
    3      1.53718    1.57631                                  



[array([1.57631])]

### Understanding the RNN call

In [68]:
def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    #perform linear transform on incoming data and internal state data
    #add and apply nonlinearity to them 
    return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [69]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0) 
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp = [] #list to append results to
        o = self.h 
        for c in cs: 
            o = self.rnn(self.e(c),o)
            outp.append(o) #append the result of each layer to outp
        outp = self.out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
        
    def init_hidden(self, bs):
        self.h = V(torch.zeros(1,bs, n_hidden))

In [70]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

fit(m,md,4,opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.804233   1.809388  
    1      1.656872   1.678395                                
    2      1.586062   1.615492                                
    3      1.539131   1.583782                                



[array([1.58378])]

## Model Testing: Sateful RNN


In [71]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]


def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

print(get_next_n('for those who ', 400))

for those who by have heard simes of these, rnaveinedin, of his d stones. "to it. i see was no fore the biresponencillindoned up, to the bar, that hole," said they had bound ahagring, if in monsiding himself seft; somehill roor, the coach another, in a youngerent delistening himself see miss preasation."as onay that the echoes, a faitimpered fromtonted, and, exclose,that regood to the infamed the fellownding in


## Model Training: GRU

Below we define a GRU cell and then use it inside the GRU network

In [72]:
def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [73]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size=vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)#use gru instead of rnn now 
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def init_hidden(self, bs):
        self.h = V(torch.zeros(1,bs, n_hidden))
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if bs != self.h.size(1):
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    

In [74]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

fit(m,md,6,opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.808199   1.8019    
    1      1.653945   1.67905                                 
    2      1.582657   1.617869                                
    3      1.546906   1.584365                                
    4      1.507429   1.564108                                
    5      1.485383   1.551869                                



[array([1.55187])]

In [75]:
set_lrs(opt, 1e-4)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.410242   1.510801  
    1      1.410462   1.50718                                 
    2      1.411977   1.504186                                



[array([1.50419])]

## Model Testing: GRU

In [76]:
print(get_next_n('for those who ', 400))

for those who palto and conveyoration. but he master at postise my hunbing, at the daughter in this--with a string of great whiles, is on the great, "it out. he disostripance, as they it has hold, and dewimdencount, the came over tw toofallow: to speak for the shoest, jacques outlowing on able, and folding ofgain, always feeling tolone. i and, which an now in his streetin so, perask, she did so consider of dest


## Model Training: LSTM
The main difference now is that the hidden state and cell state are not seperate 

In [77]:
from fastai import sgdr
n_hidden = 512

In [78]:
#big changes include inclusion of dropout and modification to fit cell state and 
#hidden layer 

class CharSeqStatefulLSTM (nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size, self.nl =vocab_size, nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout= 0.5) 
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl,bs, n_hidden)),
                  V(torch.zeros(self.nl,bs,n_hidden)))
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if bs != self.h[0].size(1):
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

In [79]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()

 #fastai layer optimizer for differentiable learning rates and callbacks
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)


In [80]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                
    0      1.892915   1.823314  
    1      1.751633   1.681082                                



[array([1.68108])]

In [81]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [82]:
#updates lr in lo using cb callback which is called every cycle end, 
#saves models automatically at the end of the cycle 
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15, style=ProgressStyle(description_width='initia…

epoch      trn_loss   val_loss                                
    0      1.594755   1.555376  
    1      1.619378   1.571361                                
    2      1.518302   1.49042                                 
    3      1.625476   1.577819                                
    4      1.558453   1.520086                                
    5      1.490367   1.459466                                
    6      1.434651   1.426745                                
    7      1.591122   1.5514                                  
    8      1.564776   1.524005                                
    9      1.541792   1.499146                                
    10     1.496917   1.471543                                
    11     1.461271   1.441122                                
    12     1.417949   1.408002                                
    13     1.382665   1.381342                                
    14     1.355372   1.366861                                



[array([1.36686])]

In [83]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63, style=ProgressStyle(description_width='initia…

epoch      trn_loss   val_loss                                
    0      1.352313   1.364385  
    1      1.348974   1.361212                                
    2      1.341185   1.358837                                
    3      1.345829   1.356446                                
    4      1.339915   1.352294                                
    5      1.322869   1.348898                                
    6      1.327268   1.347601                                
    7      1.321966   1.347394                                
    8      1.321292   1.343569                                
    9      1.309917   1.340022                                
    10     1.302994   1.337294                                
    11     1.296835   1.334542                                
    12     1.291377   1.332001                                
    13     1.287559   1.330803                                
    14     1.280745   1.330033                                
    15     1.289892   

[array([1.35868])]

## Model Testing: LSTM

There is a great improvement from our previous models, but this can be further improved by using a bi-directional model. The model also seems to be overfitting the data too. 

In [84]:
print(get_next_n('for those who ', 400))

for those who knew her,settleness, because the occasionany steepes ofcorner's affectionments of that corning on the vollence."as! the young everything to be hurried."i am going to-night?""i think new, wear, in all on the road. charles sat by the meaning of rehusit, it had a red of degandless life, excepted to the sbort of two attentively, to gracioussecoars by, englandi ask,! saidam about!" said mr. lorry; agai
