# Chapter 6b: Predicting English word version of numbers using RNN

In [1]:
from fastai.text.all import *
bs = 64

path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/home/fastai2/.fastai/data/human_numbers/valid.txt'),Path('/home/fastai2/.fastai/data/human_numbers/train.txt')]

In [2]:
sl = 40  # better than bptt = sl = 20

In [3]:
lines = L()
with open(path/"train.txt") as f: lines += L(*f.readlines())
with open(path/"valid.txt") as f: lines += L(*f.readlines())

text = " . ".join([l.strip() for l in lines]) # separator
tokens = text.split(" ")
vocab = L(*tokens).unique()
word2idx = {w:i for i, w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)


def group_chunks(ds, bs): 
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m * j] for j in range(bs))
    return new_ds


seqs = L((tensor(nums[i : i + sl]), tensor(nums[i + 1: i + sl + 1]))
        for i in range(0, len(nums) - sl - 1, sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [4]:
nv = len(vocab)
nh = 56
nv, nh

(30, 56)

In [5]:
def loss_func(inp, targ): 
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

## Adding a GRU. 

In [45]:
class Model5(Module): 
    """One layer GRU"""
    def __init__(self, nv, nh):
        self.i_h = nn.Embedding(nv, nh)
        self.rnn = nn.GRU(nh, nh, 1, batch_first=True)
        self.h_o = nn.Linear(nh, nv)
        self.bn = BatchNorm1dFlat(nh)
        self.h = torch.zeros(1, bs, nh).cuda()

    def forward(self, x, targ):
        res, h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(self.bn(res))

In [54]:
class TeacherForcing(Callback):
    def __init__(self, end_epoch): self.end_epoch = end_epoch
    
    # def before_batch(self): 
    #     self.learn.xb = (self.x, self.y)
    def before_batch(self): 
        # print(self.x.shape)
        print(self.x + self.y)
        # self.learn.xb = (self.x, self.y)
        self.learn.xb = self.x

    def before_epoch(self):
        self.learn.model.pr_force = 1 - self.learn.epoch / self.end_epoch

In [55]:
learn = Learner(dls, Model5(nv, nh).cuda(), metrics=accuracy, loss_func=loss_func,
cbs=TeacherForcing(5))
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time


tensor([[ 1,  3,  3,  ..., 21, 21, 21],
        [30, 33,  6,  ..., 42, 15,  3],
        [ 4, 31, 52,  ..., 29,  5,  4],
        ...,
        [32, 50, 29,  ...,  3,  8, 36],
        [34, 33, 52,  ..., 29,  6,  8],
        [35, 34, 53,  ..., 53, 33,  9]], device='cuda:0')


TypeError: forward() takes 3 positional arguments but 65 were given

In [8]:
learn = Learner(dls, Model5(nv, nh).cuda(), metrics=accuracy, loss_func=loss_func)
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.115588,3.012713,0.284961,00:00
1,2.30318,2.21823,0.325391,00:00
2,1.850387,1.987093,0.378418,00:00
3,1.482051,1.734097,0.475,00:00
4,1.126451,1.468181,0.631348,00:00
5,0.832557,1.400972,0.686523,00:00
6,0.612401,1.442162,0.70166,00:00
7,0.454393,1.522337,0.697949,00:00
8,0.344277,1.536823,0.702441,00:00
9,0.2689,1.524603,0.701758,00:00


## Let's make our own GRU
Using PyTorch's GRUCell. 
Axis 1 is the time dimension. We want to loop through axis 1. 

In [9]:
def rnn_loop(cell, h, x): 
    res = []
    for x_ in x.transpose(0, 1):
        h = cell(x_, h)
        res.append(h)
    return torch.stack(res, dim=1)

In [17]:
class Model6(Model5):
    def __init__(self, nv, nh):
        super().__init__(nv, nh)
        self.rnnc = nn.GRUCell(nh, nh)
        self.h = torch.zeros(bs, nh).cuda()

    def forward(self, x):
        res = rnn_loop(self.rnnc, self.h, self.i_h(x))
        self.h = res[:, -1].detach()
        return self.h_o(self.bn(res))

In [18]:
learn = Learner(dls, Model6(nv, nh).cuda(), metrics=accuracy, loss_func=loss_func)
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.108515,3.051471,0.246191,00:00
1,2.309349,2.266989,0.326562,00:00
2,1.829143,1.824468,0.413867,00:00
3,1.433972,1.514582,0.527051,00:00
4,1.078152,1.371456,0.610352,00:00
5,0.789716,1.443892,0.622949,00:00
6,0.576026,1.324231,0.656152,00:00
7,0.425238,1.357815,0.655957,00:00
8,0.320548,1.473028,0.638379,00:00
9,0.249287,1.484925,0.635938,00:00


## Writing Custom GRUCell

In [22]:
class custom_GRUCell(Module):
    def __init__(self, ni, nh): 
        super(custom_GRUCell, self).__init__()
        self.ni, self.nh = ni, nh
        self.i2h = nn.Linear(ni, 3 * nh)
        self.h2h = nn.Linear(nh, 3 * nh)

    def forward(self, x, h):
        gate_x = self.i2h(x).squeeze()
        gate_h = self.h2h(x).squeeze()
        i_r, i_u, i_n = gate_x.chunk(3, 1)
        h_r, h_u, h_n = gate_h.chunk(3, 1)

        reset_gate = torch.sigmoid(i_r + h_r)
        update_gate = torch.sigmoid(i_u + h_u)
        new_gate = torch.tanh(i_n + (reset_gate * h_n))
        return update_gate * h + (1 - update_gate) * new_gate

In [23]:
class Model7(Model6):
    def __init__(self, nv, nh):
        super(Model7, self).__init__(nv, nh)
        self.rnnc = custom_GRUCell(nh, nh)

In [25]:
learn = Learner(dls, Model7(nv, nh).cuda(), metrics=accuracy, loss_func=loss_func)
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.048735,2.98978,0.22373,00:01
1,2.291927,2.418529,0.323438,00:01
2,1.893169,2.160614,0.348145,00:01
3,1.684633,2.190918,0.352246,00:01
4,1.556448,2.236534,0.356055,00:01
5,1.470578,2.251192,0.369434,00:01
6,1.410134,2.25578,0.378906,00:01
7,1.366845,2.270138,0.376465,00:01
8,1.335763,2.246675,0.384766,00:01
9,1.314332,2.243241,0.384766,00:01


One doesn't know why accuracy decreases so rapidly. Try check if there's error in the code. 