## Descargar dataset
HUMAN NUMBERS: Cuenta del 1 al 10.000

In [1]:
from fastai.text.all import *

In [2]:
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/home/will/.fastai/data/human_numbers/train.txt'),Path('/home/will/.fastai/data/human_numbers/valid.txt')]

In [3]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n','eleven \n','twelve \n','thirteen \n','fourteen \n','fifteen \n','sixteen \n','seventeen \n','eighteen \n','nineteen \n','twenty \n'...]

## Tokenizar

### Tomar todo y concatenarlo en un único string

In [4]:
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

### Los tokens serán todos los puntos y palabras

In [5]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

### *Numericalizar*

In [6]:
# To numericalize we create a list of all the unique tokens (our vocab)
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine','ten','eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen','nineteen'...]

In [7]:
# Asignar un número (id) a cada token
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1,6,1,7,1,8,1,9,1,10,1...]

## Primer Modelo de Lenguaje desde 0
Este primer caso va a predecir la siguiente palabra basándose en las 3 palabras anteriores. Con Python `determinista` sería:

In [8]:
L((tokens[i:i+3], tokens[i+3]) for i in range(0, len(tokens) -4, 3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen'),(['sixteen', '.', 'seventeen'], '.'),(['.', 'eighteen', '.'], 'nineteen'),(['nineteen', '.', 'twenty'], '.'),(['.', 'twenty', 'one'], '.'),(['.', 'twenty', 'two'], '.'),(['.', 'twenty', 'three'], '.'),(['.', 'twenty', 'four'], '.'),(['.', 'twenty', 'five'], '.'),(['.', 'twenty', 'six'], '.'),(['.', 'twenty', 'seven'], '.')...]

In [9]:
seqs = L((tensor(nums[i:i+3]), nums [i+3]) for i in range(0, len(nums) - 4, 3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16),(tensor([16,  1, 17]), 1),(tensor([ 1, 18,  1]), 19),(tensor([19,  1, 20]), 1),(tensor([ 1, 20,  0]), 1),(tensor([ 1, 20,  2]), 1),(tensor([ 1, 20,  3]), 1),(tensor([ 1, 20,  4]), 1),(tensor([ 1, 20,  5]), 1),(tensor([ 1, 20,  6]), 1),(tensor([ 1, 20,  7]), 1)...]

### Crear batches usando la clase "Dataloader"

In [10]:
bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

### Crear el módulo

In [11]:
class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:, 0])))
        h = h + self.i_h(x[:, 1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:, 2])
        return self.h_o(h)

### Intentar un entreno

In [12]:
learn = Learner(dls, LMModel1(len(vocab), bs), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.87308,1.879638,0.463921,00:04
1,1.487982,1.625845,0.476938,00:04
2,1.459782,1.500129,0.489717,00:04
3,1.404593,1.465446,0.490193,00:04


#### Revisar eficiencia entrenamiento
Para ello, lo comparamos con un modelo totalmente simple. Siempre predecirá el token más común.

In [13]:
n, counts = 0, torch.zeros(len(vocab))
for x,y in dls.valid:
    n +=y.shape[0]
    for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(1), '.', 0.16006894912030434)

Se puede leer que se trata del token '.' que representa un 16% del dataset de entrenamiento. Usándolo tendríamos una precisión del 16%. Si lo comparamos con la Red Neuronal que tenemos, es bastante mejor la nuestra con un 49.57%

> Nota: No sé por qué en el libro ponían que el token 'thousand' era el más representativo. En nuestro código resulta el '.'

## Primera RNN (Recurrent Neural Network)

In [14]:
class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
    
    def forward(self, x):
        h = 0
        for i in range(3):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)

In [15]:
learn = Learner(dls, LMModel2(len(vocab), bs), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3) 

epoch,train_loss,valid_loss,accuracy,time
0,1.772785,1.936081,0.4597,00:04
1,1.39591,1.670687,0.466536,00:05
2,1.442036,1.473286,0.485378,00:07
3,1.39937,1.436685,0.486745,00:04


### Mantener el estado de la red neuronal.
Actualmente, la red no sabe en qué parte de la cuenta nos encontramos. Si quisiéramos mantener el estado siguiendo el esquema actual tendríamos que introducir 10.000 capas de neuronas. Esto es inviable porque el proceso de backprop sería increíblemente costoso.

> Solución: Decir a Pytorch que no queremos hacer backprop sobre toda la red neuronal. En lugar de eso, usar las últimas 3 capas de gradientes. Para ello, se usa el método *detach*

In [16]:
class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        self.h = self.h.detach()
        return out
    
    def reset(self): self.h = 0

Este modelo tendrá el mismo número de activaciones independientemente de la longitud de secuencia que usemos. El estado escondido recordará la última activación del batch previo. **La única cosa que será diferente son los gradientes calculados en cada paso debido a la influencia de ese estado escondido**. ***Se calcularán basados únicamente en la longitud de secuencia pasada en lugar de todo el stream***.

> Nota: Esta aproximación se llama ***backpropagation through time (BTT)***

Hay que hacer la reordenación del dataset a mano

In [17]:
m = len(seqs) // bs
m, bs, len(seqs)

(328, 64, 21031)

First batch: (0, m, 2*m, ..., (bs-1)*m)  
Second batch: (1, m+1, 2*m + 1, ..., (bs-1)*m + 1)

In [18]:
def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i +m*j] for j in range(bs))
    return new_ds

In [19]:
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=true, shuffle=False
)

In [20]:
learn = Learner(dls, LMModel3(len(vocab), bs), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.708199,1.798186,0.482933,00:03
1,1.241837,1.606235,0.462019,00:03
2,1.06015,1.423892,0.556971,00:03
3,1.000513,1.468853,0.560337,00:03
4,0.974653,1.547597,0.521154,00:03
5,0.911223,1.518217,0.58125,00:03
6,0.88603,1.510052,0.58101,00:06
7,0.822814,1.531217,0.611058,00:03
8,0.787646,1.556508,0.60625,00:03
9,0.777405,1.554249,0.613702,00:03


### Crear más señal
Cambiamos el tamaño de las entradas al atributo *sl* (sequence length) y lo hacemos bastante más grande (16). También se cambia el modelo para que haga predicciones tras cada palabra en lugar de tras cada 3.

In [21]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0, len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False
)

[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.','six','.','seven','.','eight','.'],
 (#16) ['.','two','.','three','.','four','.','five','.','six','.','seven','.','eight','.','nine']]

In [22]:
class LMModel4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

In [23]:
def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [24]:
learn = Learner(dls, LMModel4(len(vocab), bs), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.231908,3.099921,0.231038,00:02
1,2.34225,2.028744,0.461507,00:02
2,1.731806,1.787023,0.45988,00:02
3,1.451861,1.778834,0.489909,00:02
4,1.280449,1.669527,0.547038,00:02
5,1.153848,1.64974,0.572347,00:02
6,1.038531,1.708601,0.600993,00:02
7,0.938328,1.775338,0.611491,00:02
8,0.86124,1.79831,0.633219,00:05
9,0.799868,1.822167,0.638428,00:02


## RNN Multicapa
Se envían las salidas de una primera capa RNN a una segunda capa RNN como inputs

In [25]:
class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = torch.zeros(n_layers, bs, n_hidden)
    
    def forward(self, x):
        res, h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(res)

    def reset(self): self.h.zero_()

In [26]:
learn = Learner(
    dls, LMModel5(len(vocab), 64, 2),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy, cbs=ModelResetter
)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.030372,2.623901,0.412435,00:03
1,2.160344,1.749531,0.470947,00:03
2,1.707772,1.886378,0.352539,00:03
3,1.497916,1.74002,0.427246,00:07
4,1.328462,1.70779,0.49292,00:04
5,1.187369,1.765787,0.495117,00:03
6,1.054687,1.689607,0.510579,00:03
7,0.93339,1.651121,0.520915,00:03
8,0.836444,1.691936,0.524821,00:04
9,0.76282,1.742215,0.515951,00:03


## LSTM LM

### Poco optimizado
Es mejor hacer una única gran multiplicación de matrices que hacer varias dividiéndola en otras matrices más pequeñas.

In [27]:
class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.forget_gate    = nn.Linear(ni + nh, nh)
        self.input_gate     = nn.Linear(ni + nh, nh)
        self.cell_gate      = nn.Linear(ni + nh, nh)
        self.output_gate    = nn.Linear(ni + nh, nh)

    def forward(self, input, state):
        h, c = state
        h = torch.cat([h, input], dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp * cell
        out = torch.sigmoid(self.output_gate(h))
        h = out * torch.tanh(c)
        return h, (h, c)

### Optimizado

In [28]:
class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, input, state):
        h, c = state
        # One big multiplication for all the gates is better than 4 smaller ones
        gates = (self.ih(input) + self.hh(h)).chunk(4, 1)
        ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()

        c = (forgetgate*c) + (ingate*cellgate)
        h = outgate * c.tanh()
        return h, (h, c)

In [29]:
class LMModel6(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h   = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        res,h  = self.rnn(self.i_h(x), self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        for h in self.h: h.zero_()

In [30]:
learn = Learner(
    dls, LMModel6(len(vocab), bs, 2),
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy, cbs=ModelResetter
)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.023236,2.766167,0.280599,00:04
1,2.207154,2.274855,0.256429,00:05
2,1.600317,1.822498,0.487223,00:04
3,1.301125,1.919242,0.520426,00:04
4,1.073612,2.157125,0.513021,00:08
5,0.86662,1.947124,0.611409,00:04
6,0.650388,1.889281,0.615397,00:07
7,0.445092,1.731433,0.625407,00:06
8,0.29233,1.662285,0.685059,00:04
9,0.179652,1.67085,0.722738,00:10


## Weight-Tied Regularized LSTM

In [31]:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h    = nn.Embedding(vocab_sz, n_hidden)
        self.rnn    = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop   = nn.Dropout(p)
        self.h_o    = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h      = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw, h      = self.rnn(self.i_h(x), self.h)
        out         = self.drop(raw)
        self.h      = [h_.detach() for h_ in h]
        return self.h_o(out), raw, out
    
    def reset(self):
        for h in self.h: h.zero_()

In [32]:
# learn = Learner(
#     dls, LMModel7(len(vocab), bs, 2, 0.5),
#     loss_func=CrossEntropyLossFlat(),
#     metrics=accuracy,
#     cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=2)]
# )
# A textLearner automatically adds those 2 callbacks for us
learn = TextLearner(dls, LMModel7(len(vocab), bs, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,2.624278,2.154914,0.460856,00:06
1,1.610829,1.542183,0.682292,00:06
2,0.845732,1.186196,0.759277,00:05
3,0.414766,1.001542,0.786214,00:07
4,0.211987,1.162922,0.788086,00:09
5,0.117473,1.164125,0.791097,00:06
6,0.071496,1.104581,0.797526,00:07
7,0.050251,0.940376,0.826091,00:06
8,0.034253,0.977097,0.820312,00:06
9,0.025976,1.035428,0.807373,00:09
