# Lesson 6 RNN ([Code Along](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson6-rnn.ipynb))
----------------------------------

2018/7/22 –– Wayne H Nixalo

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

## 1. Setup

> We're going to download the collected works of Nietzsche to use as our data for this class.

In [3]:
PATH = Path('data/nietzsche')

get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars', vocab_size)

total chars 85


> Sometimes it's useful to have a zero value in the dataset, eg: for padding.

In [6]:
chars.insert(0, '\0')
''.join(chars[1:-5])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

>Map from chars to indices and back again:

In [7]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

>idx will be the data we use form now on – it simply converts all characters to their index (based on the mapping above).

In [8]:
idx = [char_indices[c] for c in text]
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## 2. Three char model

### 2.1 Create inputs

>Create a list of every 4th character, starting at the 0th, 1st, 2nd, then 3rd characters.

In [10]:
cs = 3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)] # every 1st char
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)] # every 2nd
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)] # every 3rd
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)] # every 4th

>Our inputs:

In [11]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

>Our outputs:

In [12]:
y = np.stack(c4_dat)

>The first 4 inputs and outputs:

In [13]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [14]:
y[:4]

array([30, 29,  1, 40])

In [15]:
x1.shape, y.shape

((200297,), (200297,))

### 2.2 Create and train model

>Pick a size for our hidden state

In [16]:
n_hidden = 256

>The number of latent factors to create (ie: size of the embedding matrix):

In [17]:
n_fac = 42 # about half the number of our characters

In [18]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac) # embedding
        
        # the 'green arrow' from our diagram – the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)
        
        # the 'orange arrow' from our diagram – the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # the 'blue arrow' from our diagram – the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = torch.zeros(in1.size()).cuda() # I dont think I have to wrap as Variable since this is pytorch 0.4, no?
        h = torch.tanh(self.l_hidden(h + in1))
        h = torch.tanh(self.l_hidden(h + in2))
        h = torch.tanh(self.l_hidden(h + in3))
        
        return F.log_softmax(self.l_out(h))

In [19]:
mdata = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)
model = Char3Model(vocab_size, n_fac).cuda()

In [20]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)
tensor = model(*xs)

In [21]:
optimizer = optim.Adam(model.parameters(), 1e-2)

In [22]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.109939   1.221681  



[1.2216806411743164]

In [23]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.916162   0.859782  



[0.8597822189331055]

### 2.3 Test model

In [24]:
def get_next(inp):
    """
    Takes a 3-char string. 
    Turns it into a Tensor of an array of the char index of the string.
    Passes that tensor to the model.
    Does an argmax to get the predicted char-number; then coverts to char.
    """
    idxs = T(np.array([char_indices[c] for c in inp]))
    pred = model(*idxs)
    i = np.argmax(to_np(pred))
    return chars[i]

In [25]:
get_next('y. '), get_next('ppl'), get_next(' th'), get_next('and')

('T', 'e', 'e', ' ')

## 3. Our first RNN

[Lecture 6](https://youtu.be/sHcLkfRrgoQ?t=5923)

### 3.1 Create inputs

>This is the size of our unrolled RNN:

In [26]:
cs = 8

>For each of 0 thru 8, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [27]:
c_in_dat = [[idx[i + j] for i in range(cs)] for j in range(len(idx) - cs)]

In [28]:
c_out_dat = [idx[j + cs] for j in range(len(idx) - cs)]

In [29]:
xs = np.stack(c_in_dat, axis=0); xs.shape

(600885, 8)

In [30]:
y = np.stack(c_out_dat); y.shape

(600885,)

>So each column below is one series of 8 characters from the text.

In [31]:
xs[:cs, :cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

they're overlapping. So after '`[42, 29, 30, 25, 27, 29,  1,  1]`' comes '`1`', and after '`[29, 30, 25, 27, 29,  1,  1,  1]`' comes '`43`', and so on. The `n`th row is the same as the `n`th column.

>...and this is the next character after each sequence

In [32]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### 3.2 Create and train model

In [33]:
val_idx = get_cv_idxs(len(idx) - cs - 1)

In [34]:
mdata = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [35]:
class CharLoopModel(nn.Module):
    """This is an RNN."""
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h  = torch.zeros(bs, n_hidden).cuda()
        for c in cs:
            inp = torch.tanh(self.l_in(self.e(c)))   # the torch.tanh vs F.tanh warning didnt pop
            h   = torch.tanh(self.l_hidden(h + inp)) # up on Mac, but did on Linux-gpu. Odd.
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [36]:
model = CharLoopModel(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-2)

In [37]:
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.940634   1.938047  



[1.9380468817814864]

In [38]:
set_lrs(optimizer, 1e-3)
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.676599   1.673927  



[1.6739268474962308]

The input and hidden states represent qualitatively different types of information, so adding them together can potentially lose information. Instead we can concatenate them together.

In [37]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h  = torch.zeros(bs, n_hidden).cuda()
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h   = torch.tanh(self.l_hidden(inp))
            
        return F.log_softmax(self.l_out(inp), dim=-1)

In [38]:
model = CharLoopConcatModel(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-3)

In [41]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)
t = model(*xs)

In [42]:
xs[0].size(0)

512

In [43]:
t

tensor([[-4.4995, -4.5184, -4.4380,  ..., -4.3623, -4.5934, -4.3042],
        [-4.5358, -4.5572, -4.4473,  ..., -4.3485, -4.4524, -4.4279],
        [-4.5170, -4.5369, -4.5225,  ..., -4.4864, -4.4857, -4.4198],
        ...,
        [-4.5922, -4.4909, -4.5027,  ..., -4.4887, -4.5508, -4.4037],
        [-4.5853, -4.5673, -4.4204,  ..., -4.3825, -4.3977, -4.4349],
        [-4.3585, -4.3449, -4.2625,  ..., -4.4409, -4.5255, -4.2546]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)

In [44]:
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.833206   1.813848  



[1.813847915324116]

In [45]:
set_lrs(optimizer, 1e-4)
fit(model, mdata, 1, optimizer, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.710183   1.717879  



[1.7178790965611632]

### 3.3 Test Model

In [44]:
def get_next(inp):
#     idxs = [T(np.array([char_indices[c] for c in inp]))]
    idxs = [T(np.array([char_indices[c]])) for c in inp]
    p = model(*idxs)
    i = np.argmax(to_np(p))
#     pdb.set_trace()
    return chars[i]

In [47]:
get_next('for thos')

'e'

In [48]:
get_next('part of ')

't'

In [49]:
get_next('queens a')

'n'

## 4. RNN with PyTorch

[Lecture 6, 1:48:52](https://youtu.be/sHcLkfRrgoQ?t=6532)

In [45]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h  = torch.zeros(1, bs, n_hidden)
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        
#         return F.log_softmax(self.l_out(outp[-1]))
        return F.log_softmax(self.l_out(outp[-1]), dim=-1) # outp[-1] to get last hidden state

In [46]:
model = CharRNN(vocab_size, n_fac).cuda()
optimizer = optim.Adam(model.parameters(), 1e-3)

In [47]:
it = iter(mdata.trn_dl)
*xs,yt = next(it)

In [48]:
# tensor = model.e(V(torch.stack(xs))) # works w/o V(.). but takes longer when switching btwn w/wo V(.)?
# tensor = model.e(torch.stack(xs)) # these are ints so cannot require gradients
tensor = model.e(T(torch.stack(xs)))

In [49]:
# htensor = V(torch.zeros(1, 512, n_hidden)) # V(.) required here, else: RuntimeError: CuDNN error: CUDNN_STATUS_EXECUTION_FAILED
# NOTE: does not work: htensor = torch.zeros(1, 512, n_hidden, requires_grad=True) # requires_grad=True accomplishes what V(.) did in 0.3.1 for 0.4.
htensor = T(torch.zeros(1, 512, n_hidden))

In [52]:
outp, hn = model.rnn(tensor, htensor)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

I'm able to get this far in pytorch 0.4, using `T` instead of `V`. The problem is the next line keeps giving me a:
<div class="alert alert-info" style="border: 3px solid #900C3F; color: #900C3F; background-color: #F08080">
RuntimeError: CuDNN error: CUDNN_STATUS_EXECUTION_FAILED
</div>

As per [here](http://forums.fast.ai/t/wiki-lesson-6/9404/28?u=borz), I'm going to use pytorch 0.3 from here to the end.

In [57]:
tensor = model(*V(xs)); tensor.size()

RuntimeError: CuDNN error: CUDNN_STATUS_EXECUTION_FAILED