In [1]:
from fastbook import *
from fastai.text.all import *

In [2]:
# Fetch the HUMAN_NUMBERS dataset through fastai's untar_data; `path` is the
# local dataset directory (listed in the next cell).
path = untar_data(URLs.HUMAN_NUMBERS)

In [3]:
# Show the files that make up the dataset.
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/valid.txt'),Path('/root/.fastai/data/human_numbers/train.txt')]

In [4]:
# Collect the raw lines of the training file followed by those of the
# validation file into one list (trailing whitespace is still attached).
lines = L()
for split_file in ('train.txt', 'valid.txt'):
    with open(path/split_file) as fh:
        lines += L(*fh.readlines())

lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [12]:
# Flatten the corpus into a single string: every line is stripped and
# consecutive lines are separated by ' . '.
text = ' . '.join(ln.strip() for ln in lines)
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [13]:
# Tokenize by splitting on single spaces; the '.' separators become tokens too.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [14]:
# Vocabulary: the distinct tokens of the corpus (30 entries per the output below).
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [16]:
# Map every token to its position in `vocab`, then numericalize the token stream.
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[w] for w in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [29]:
# Build (input, target) samples: a tensor of three consecutive token ids and,
# as the label, the id that follows them. The window advances by 3 ids at a
# time, so the inputs of different samples do not overlap.
seqs = L(
    (tensor(nums[i:i+3]), nums[i+3]) for i in range(0, len(nums) - 4, 3)
    )

In [30]:
# Inspect the first few samples.
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [31]:
# Batch size used for the DataLoaders.
bs = 64
# The first 80% of the samples (in their original order) are used for
# training, the remainder for validation.
cut = int(len(seqs)*0.8)
# Pass `bs` rather than repeating the literal so the batch size is defined in
# exactly one place. shuffle=False keeps the samples in their original order.
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [32]:
class LMModel1(Module):
    """Predict the next token from three input tokens.

    The same embedding (`i_h`) and hidden-to-hidden layer (`h_h`) are applied
    at each of the three input positions; `h_o` turns the final activation
    into one score per vocabulary entry.
    """
    def __init__(self, vocab_size, n_hidden):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_size)

    def forward(self, x):
        # x holds token ids, one column per input position: (batch, 3).
        first, second, third = x[:,0], x[:,1], x[:,2]

        # Position 0: embed the token, then hidden layer + ReLU.
        hidden = F.relu(self.h_h(self.i_h(first)))

        # Position 1: add its embedding to the running activation and repeat.
        hidden = F.relu(self.h_h(hidden + self.i_h(second)))

        # Position 2: the same update once more.
        hidden = F.relu(self.h_h(hidden + self.i_h(third)))

        # One score per vocabulary entry.
        return self.h_o(hidden)
        

In [33]:
# Standalone instance with 60 hidden units, used for the shape check further down.
model1 = LMModel1(len(vocab), 60)

In [35]:
# Grab one batch to look at the input and target shapes.
x,y = dls.one_batch()
x.shape, y.shape

(torch.Size([64, 3]), torch.Size([64]))

In [38]:
# Sanity check: the forward pass gives one score per vocabulary entry for each sample.
model1(x).shape

torch.Size([64, 30])

In [40]:
# Train a fresh LMModel1 (70 hidden units) with cross-entropy loss, tracking
# accuracy: 4 epochs of one-cycle training with learning rate argument 1e-3.
learn = Learner(dls, LMModel1(len(vocab), 70), loss_func = F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.825336,1.859318,0.466366,00:03
1,1.405354,1.690724,0.470406,00:02
2,1.404521,1.597311,0.493939,00:02
3,1.3687,1.601832,0.495603,00:02


In [42]:
# Number of samples in the first validation batch.
x,y = first(dls.valid)
y.shape[0]

64

In [52]:
# Baseline: the accuracy obtained by always predicting the most frequent
# target token of the validation set.
n, counts = 0, torch.zeros(len(vocab))
# Distinct loop names keep the notebook-level batch variables `x`/`y` intact.
for xb, yb in dls.valid:
    n += yb.shape[0]
    # Tally all classes of the batch in one call instead of looping over the
    # vocabulary in Python; minlength keeps the result aligned with `counts`.
    counts += torch.bincount(yb.cpu(), minlength=len(vocab))
idx = torch.argmax(counts)
# (index of the most frequent token, the token itself, its share of all targets)
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(29), 'thousand', 0.15165200855716662)

In [53]:
class LMModel2(Module):
    """Three-token next-word model whose per-position update is written as a loop."""
    def __init__(self, vocab_size, n_hidden):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_size)

    def forward(self, x):
        # Starting from 0 makes the first addition yield just the first embedding.
        hidden = 0
        for pos in range(3):
            token_emb = self.i_h(x[:,pos])
            hidden = F.relu(self.h_h(hidden + token_emb))
        # One score per vocabulary entry.
        return self.h_o(hidden)

In [56]:
# Same training setup as before, now with the loop-based LMModel2.
learn = Learner(dls, LMModel2(len(vocab), 70), loss_func = F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.743556,2.062414,0.462562,00:02
1,1.380203,1.803409,0.467554,00:03
2,1.413023,1.64471,0.490373,00:03
3,1.380128,1.656836,0.490849,00:03


In [60]:
class LMModel3(Module):
    """Three-token model that carries its hidden activation from one call to the next."""
    def __init__(self, vocab_size, n_hidden):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_size)
        # Carried-over activation; 0 means "no history yet".
        self.h = 0

    def forward(self, x):
        hidden = self.h
        for pos in range(3):
            # Add the token embedding to the running activation, then hidden layer + ReLU.
            hidden = F.relu(self.h_h(hidden + self.i_h(x[:,pos])))
        logits = self.h_o(hidden)
        # Keep the values for the next call but drop their gradient history.
        self.h = hidden.detach()
        return logits

    def reset(self):
        """Discard the carried-over hidden activation."""
        self.h = 0

In [66]:
# How many whole batches of size `bs` fit into `seqs`.
m = len(seqs) // bs
m, bs, len(seqs)

(328, 64, 21031)

In [67]:
def group_chunks(ds, bs):
    """Reorder `ds` so that consecutive, unshuffled batches continue each other.

    `ds` is treated as `bs` contiguous streams of `m = len(ds) // bs` items
    each. The result lists item 0 of every stream, then item 1 of every
    stream, and so on; with batches of size `bs`, sample `k` of one batch is
    therefore directly followed (in the original order) by sample `k` of the
    next batch. Items beyond the first `m * bs` are left out.
    """
    m = len(ds) // bs
    return L(ds[step + m*stream] for step in range(m) for stream in range(bs))

In [75]:
# Rebuild the DataLoaders from the regrouped samples. shuffle=False and
# drop_last=True keep the batch layout produced by group_chunks intact.
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:],bs),
    bs=bs, drop_last=True, shuffle=False
)

In [107]:
# Train the stateful LMModel3. ModelResetter is passed as a callback;
# presumably it invokes the model's `reset` method at epoch boundaries — see the fastai docs.
learn = Learner(dls, LMModel3(len(vocab), 70), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.68175,1.876675,0.444712,00:03
1,1.221282,1.711194,0.47524,00:03
2,1.095107,1.771184,0.491106,00:03
3,1.044861,1.508587,0.52476,00:03
4,0.991423,1.580314,0.544712,00:03
5,0.961924,1.534011,0.571635,00:03
6,0.91371,1.635032,0.563702,00:03
7,0.865319,1.711483,0.599279,00:03
8,0.813645,1.802173,0.587019,00:04
9,0.793731,1.798924,0.592548,00:03


In [114]:
# NOTE(review): `sequence_lenth` is a misspelling of "sequence_length"; the name is
# kept as-is because it is a notebook-level variable.
sequence_lenth = 16
# Each sample pairs 16 consecutive token ids with the same window shifted one
# position to the right, i.e. a next-token target for every input position.
seqs = L(
    (tensor(nums[i:i+sequence_lenth]), tensor(nums[i+1:i+sequence_lenth+1]))
    for i in range(0, len(nums)-sequence_lenth-1, sequence_lenth)
)
# 80/20 split in original order, regrouped with group_chunks as before.
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                            group_chunks(seqs[cut:],bs),
                             drop_last=True,
                             bs=bs,
                             shuffle=False
                            )

In [121]:
class LMModel4(Module):
    """Stateful model that emits a prediction after every input token."""
    def __init__(self, vocab_size, n_hidden):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_size)
        # Carried-over activation; 0 means "no history yet".
        self.h = 0

    def forward(self, x):
        hidden, step_logits = self.h, []
        # One update and one set of vocabulary scores per column of x.
        for pos in range(x.shape[1]):
            hidden = F.relu(self.h_h(hidden + self.i_h(x[:,pos])))
            step_logits.append(self.h_o(hidden))
        # Keep the values for the next call but drop their gradient history.
        self.h = hidden.detach()
        # Stack along a new dim 1 so the result is (batch, positions, vocab).
        return torch.stack(step_logits, dim=1)

    def reset(self):
        """Discard the carried-over hidden activation."""
        self.h = 0

In [130]:
def loss_func(inp, targ):
    """Cross-entropy over every position of every sequence in the batch.

    Args:
        inp: scores whose last dimension indexes the classes,
            e.g. (batch, seq_len, n_classes).
        targ: integer class ids, one per leading position of `inp`,
            e.g. (batch, seq_len).

    Returns:
        The scalar loss computed by `F.cross_entropy` on the flattened inputs.
    """
    # Read the class count from the scores themselves instead of the
    # notebook-level `vocab`, so the function does not depend on global state.
    n_classes = inp.shape[-1]
    # reshape (unlike view) also accepts non-contiguous tensors.
    return F.cross_entropy(inp.reshape(-1, n_classes), targ.reshape(-1))

In [124]:
# Shape check: the model output has one score vector per input position,
# and the targets have one id per input position.
x,y = first(dls.train)
model4 = LMModel4(len(vocab), 70)
res = model4(x)
res.shape, y.shape

(torch.Size([64, 16, 30]), torch.Size([64, 16]))

In [126]:
# The flattening used by the loss: all (sample, position) pairs become rows.
res.view(-1, len(vocab)).shape, y.view(-1).shape

(torch.Size([1024, 30]), torch.Size([1024]))

In [131]:
# Train LMModel4 with the flattening loss defined above and the ModelResetter callback.
learn = Learner(dls, LMModel4(len(vocab), 70), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.203058,3.002908,0.239014,00:01
1,2.296747,1.958359,0.399333,00:01
2,1.713271,1.912141,0.451172,00:01
3,1.432879,1.875142,0.512451,00:01
4,1.252114,2.085871,0.528483,00:01
5,1.131799,1.90947,0.553711,00:01
6,1.008021,2.163505,0.589844,00:01
7,0.911827,2.217276,0.593262,00:01
8,0.830509,2.370112,0.616943,00:01
9,0.812769,2.245617,0.62972,00:01


In [163]:
class LMModel5(Module):
    """Language model built on a multi-layer `nn.RNN`, with state kept across calls."""
    def __init__(self, vocab_size, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_size)
        # Initial state of shape (n_layers, bs, n_hidden); `bs` is the
        # notebook-level batch size.
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        embedded = self.i_h(x)
        activations, last_state = self.rnn(embedded, self.h)
        # Carry the values to the next call without their gradient history.
        self.h = last_state.detach()
        return self.h_o(activations)

    def reset(self):
        """Zero the carried state in place."""
        self.h.zero_()

In [160]:
# Standalone two-layer instance with 50 hidden units for inspection.
model5 = LMModel5(len(vocab), 50, 2)

In [161]:
# First training batch, used as input for the shape check.
x,y = first(dls.train)

In [162]:
# Forward pass on one batch; the result has one score vector per input position.
res = model5(x)
res.shape

torch.Size([64, 16, 50]) torch.Size([2, 64, 50])


torch.Size([64, 16, 30])

In [154]:
# Show the layers registered on the model.
model5

LMModel5(
  (i_h): Embedding(30, 50)
  (rnn): RNN(50, 50, num_layers=2, batch_first=True)
  (h_o): Linear(in_features=50, out_features=30, bias=True)
)

In [158]:
# Print the shape of every learnable parameter. The loop variable is
# deliberately not called `x`, so the input batch stored under that name in
# the notebook namespace is not overwritten.
for param in model5.parameters():
    print(param.shape)

torch.Size([30, 50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50])
torch.Size([30, 50])
torch.Size([30])


In [164]:
# Train a two-layer LMModel5 (70 hidden units). fastai's CrossEntropyLossFlat is used
# as the loss (presumably it flattens predictions and targets itself — see fastai docs).
learn = Learner(dls, LMModel5(len(vocab), 70, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.950081,2.470674,0.399984,00:01
1,2.081548,1.738917,0.472982,00:01
2,1.657391,1.773795,0.451742,00:01
3,1.419227,1.931949,0.493815,00:01
4,1.229338,2.038371,0.507894,00:01
5,1.087298,2.055904,0.51888,00:01
6,0.973927,2.112855,0.524902,00:01
7,0.887343,2.167323,0.539144,00:01
8,0.804076,2.234354,0.539551,00:01
9,0.736658,2.280966,0.539062,00:01


In [171]:
class LSTMCell(Module):
    """A single LSTM step written with four separate gate layers.

    Every gate is a linear layer over the concatenation of the previous hidden
    state (width `nh`) and the current input (width `ni`).
    """
    def __init__(self, ni, nh):
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate = nn.Linear(ni + nh, nh)
        self.cell_gate = nn.Linear(ni + nh, nh)
        # Spelled `output_gate` so that it matches the attribute forward() reads;
        # the misspelled `ouput_gate` made forward() fail with AttributeError.
        self.output_gate = nn.Linear(ni + nh, nh)

    def forward(self, input, state):
        """Advance one step.

        Args:
            input: current input; concatenated with `h` along dim 1.
            state: tuple `(h, c)` of previous hidden state and cell state.

        Returns:
            `(h, (h, c))`: the new hidden state as output, plus the new state tuple.
        """
        h,c = state
        # All four gates read the same [h, input] vector.
        h = torch.cat([h, input], dim=1)
        # Forget gate: scale down what is kept of the old cell state.
        forget = torch.sigmoid(self.forget_gate(h))
        c = c*forget
        # Input gate decides how much of the candidate values is written to the cell.
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp*cell
        # Output gate decides how much of the squashed cell state is exposed as h.
        out = torch.sigmoid(self.output_gate(h))
        h = out * torch.tanh(c)
        return h, (h,c)

In [165]:
class LSTMCell(Module):
    """A single LSTM step with the four gates fused into two linear layers.

    `ih` and `hh` each produce `4*nh` values; their sum is split into four
    `nh`-wide chunks that serve as the gate pre-activations.
    """
    def __init__(self, ni,nh):
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, input, state):
        """Advance one step.

        Args:
            input: current input of width `ni`.
            state: tuple `(h, c)` of previous hidden state and cell state.

        Returns:
            `(h, (h, c))`: the new hidden state as output, plus the new state tuple.
        """
        h,c = state
        # Split along dim 1 into four equal parts (the original called the
        # non-existent `.chuck`, which raised AttributeError).
        gates = (self.ih(input) + self.hh(h)).chunk(4,1)
        # The first three chunks are sigmoid gates, the fourth is the tanh candidate.
        ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()

        c = (forgetgate*c) + (ingate*cellgate)
        h = outgate * c.tanh()
        return h, (h,c)

In [170]:
# NOTE(review): `t1` and `t2` are not defined in any visible cell, so this fails on a
# fresh kernel. The recorded output (16, 3, 14) only shows that they are 3-D tensors
# whose last dimensions add up to 14 — define them before this line.
torch.cat([t1,t2],dim=2).shape

torch.Size([16, 3, 14])

In [180]:
class LMModel6(Module):
    """Language model built on a multi-layer `nn.LSTM`, with state kept across calls."""
    def __init__(self, vocab_size, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_size)
        # Two state tensors, each of shape (n_layers, bs, n_hidden); `bs` is
        # the notebook-level batch size.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        activations, state = self.rnn(self.i_h(x), self.h)
        # Carry both state tensors to the next call without gradient history.
        self.h = [s.detach() for s in state]
        return self.h_o(activations)

    def reset(self):
        """Zero both carried state tensors in place."""
        for state_tensor in self.h:
            state_tensor.zero_()

In [178]:
# Standalone two-layer LSTM model with 50 hidden units for inspection.
model6 = LMModel6(len(vocab), 50, 2)

In [195]:
# The embedding matrix and the output layer's weight have the same shape.
model6.i_h.weight.shape, model6.h_o.weight.shape

(torch.Size([30, 50]), torch.Size([30, 50]))

In [181]:
# Train the two-layer LSTM model (70 hidden units) with learning rate argument 1e-2.
learn = Learner(dls, LMModel6(len(vocab), 70, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15,1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.991256,2.698031,0.260986,00:02
1,2.116167,1.977941,0.309082,00:02
2,1.589849,1.718649,0.49056,00:02
3,1.306189,1.924715,0.534342,00:03
4,1.115278,1.859783,0.534261,00:03
5,0.909204,1.787189,0.579102,00:03
6,0.666926,1.577857,0.674723,00:03
7,0.435135,1.381169,0.725586,00:02
8,0.285357,1.398232,0.742269,00:02
9,0.193636,1.331967,0.778646,00:02


In [200]:
class LMModel7(Module):
    """LSTM language model with dropout on the LSTM output and tied embedding/output weights."""
    def __init__(self, vocab_size, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_size, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_size)
        # Weight tying: the output layer uses the very same parameter as the embedding.
        self.h_o.weight = self.i_h.weight
        # Two state tensors, each of shape (n_layers, bs, n_hidden); `bs` is
        # the notebook-level batch size.
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw, state = self.rnn(self.i_h(x), self.h)
        dropped = self.drop(raw)
        # Carry both state tensors to the next call without gradient history.
        self.h = [s.detach() for s in state]
        # Scores first, then the LSTM activations before and after dropout.
        return self.h_o(dropped), raw, dropped

    def reset(self):
        """Zero both carried state tensors in place."""
        for state_tensor in self.h:
            state_tensor.zero_()

In [201]:
# Learner with the callbacks spelled out explicitly: ModelResetter plus RNNRegularizer.
# NOTE(review): `learn` is reassigned in the following cell before any training is run,
# so this object is never fitted.
learn = Learner(dls, LMModel7(len(vocab), 70, 2, 0.5),
               loss_func=CrossEntropyLossFlat(),
                metrics=accuracy,
                cbs=[ModelResetter,RNNRegularizer(alpha=2, beta=1)]
               )

In [202]:
# Same model and loss through fastai's TextLearner, with no callbacks passed explicitly
# (presumably TextLearner adds the RNN callbacks itself — confirm in the fastai docs).
learn = TextLearner(dls, LMModel7(len(vocab), 70,2,0.5),
                    loss_func=CrossEntropyLossFlat(),
                    metrics=accuracy
                   )

In [203]:
# 10 epochs of one-cycle training with learning rate argument 1e-2 and weight decay 0.1.
learn.fit_one_cycle(10,1e-2,wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,2.361794,1.665256,0.49764,00:02
1,1.483413,1.182473,0.617513,00:02
2,0.819536,0.666981,0.807454,00:03
3,0.4669,0.490863,0.847087,00:02
4,0.298775,0.434633,0.853678,00:03
5,0.216145,0.42916,0.854411,00:03
6,0.173862,0.365146,0.885579,00:03
7,0.151468,0.354647,0.893066,00:03
8,0.138553,0.347897,0.891764,00:03
9,0.131962,0.347198,0.890462,00:03
