In [1]:
#export
from exp.nb_12 import *

# Data

In [2]:
path = untar_data(URLs.IMDB)

In [3]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [4]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [5]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [6]:
# Label every item with the constant 0 (the lambda ignores its argument);
# proc_x passes the tokenize and numericalize processors for the inputs.
ll = label_by_func(sd, lambda x: 0, proc_x=[proc_tok, proc_num])

In [7]:
# Cache the processed data and the vocab; `with` makes sure each file handle
# is flushed and closed as soon as the dump finishes.
with open(path/'ll_lm.pkl', 'wb') as f: pickle.dump(ll, f)
with open(path/'vocab_lm.pkl', 'wb') as f: pickle.dump(proc_num.vocab, f)

In [8]:
# Reload the cached data and vocab written by the previous cell.
# NOTE: pickle.load executes arbitrary code - only load files you created yourself.
with open(path/'ll_lm.pkl', 'rb') as f: ll = pickle.load(f)
with open(path/'vocab_lm.pkl', 'rb') as f: vocab = pickle.load(f)

In [9]:
# Batch size and bptt value handed to lm_databunchify below.
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

# AWD-LSTM

In [10]:
class LSTMCell(nn.Module):
    """One LSTM time step built from two linear projections.

    Args:
        ni: number of features of the input at each step.
        nh: number of features of the hidden and cell states.
    """
    def __init__(self, ni, nh):
        super().__init__()
        # Each projection produces the four gates stacked along dim 1.
        self.ih = nn.Linear(ni,nh*4)
        # The recurrent projection is applied to the hidden state, so its
        # input width is nh (not ni); the two only coincide when ni == nh.
        self.hh = nn.Linear(nh,nh*4)

    def forward(self, input, state):
        """Advance the cell by one step.

        `state` is the pair `(h, c)`. Returns `(h, (h, c))` with the updated
        hidden and cell states.
        """
        h,c = state
        gates = (self.ih(input) + self.hh(h)).chunk(4,1)
        # First three chunks are squashed with a sigmoid, the fourth with tanh.
        ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()

        c = (forgetgate * c) + (ingate * cellgate)
        h = outgate * c.tanh()
        return h, (h,c)

In [11]:
class LSTMLayer(nn.Module):
    "Run a recurrent cell, built as `cell(*cell_args)`, over dimension 1 of the input."
    def __init__(self, cell, *cell_args):
        super().__init__()
        self.cell = cell(*cell_args)

    def forward(self, input, state):
        "Feed each slice along dim 1 to the cell; return the stacked outputs and the final state."
        steps = []
        for x_t in input.unbind(1):
            y_t, state = self.cell(x_t, state)
            steps.append(y_t)
        return torch.stack(steps, dim=1), state

In [12]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [13]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(64, 300), torch.zeros(64, 300))

In [14]:
%timeit -n 10 y, h1 = lstm(x,h)

147 ms ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [16]:
def time_fn(f):
    """Call `f()` and then wait for pending CUDA work, for use under `%timeit`.

    The synchronize call is skipped when CUDA is not available, so the helper
    can also be used on a CPU-only machine instead of raising.
    """
    f()
    if torch.cuda.is_available(): torch.cuda.synchronize()

In [17]:
f = partial(lstm, x, h)
time_fn(f)

In [18]:
%timeit -n 10 time_fn(f)

19.9 ms ± 5.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Builtin Version

In [19]:
lstm = nn.LSTM(300, 300, 1, batch_first=True)

In [20]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(1, 64, 300), torch.zeros(1, 64, 300))

In [21]:
%timeit -n 10 y,h1 = lstm(x,h)

100 ms ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [23]:
f = partial(lstm, x, h)
time_fn(f)

In [24]:
%timeit -n 10 time_fn(f)

6.28 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Jit version

# Dropout

In [25]:
#export
def dropout_mask(x, sz, p):
    "Tensor of size `sz`, created from `x`, whose entries are 0 with probability `p` and `1/(1-p)` otherwise."
    keep = 1-p
    mask = x.new(*sz)
    mask.bernoulli_(keep)
    mask.div_(keep)
    return mask

In [26]:
x = torch.randn(10,10)
mask = dropout_mask(x, (10,10), 0.5); mask

tensor([[0., 0., 0., 0., 0., 0., 2., 2., 0., 0.],
        [0., 2., 2., 0., 2., 2., 2., 0., 2., 2.],
        [2., 0., 2., 0., 2., 2., 0., 2., 0., 0.],
        [0., 2., 0., 2., 2., 2., 2., 0., 2., 2.],
        [2., 0., 2., 0., 0., 2., 0., 2., 2., 2.],
        [2., 0., 0., 0., 0., 0., 0., 2., 0., 2.],
        [2., 0., 2., 2., 0., 2., 2., 2., 2., 2.],
        [2., 0., 0., 0., 0., 0., 2., 2., 0., 2.],
        [2., 2., 0., 2., 2., 0., 2., 0., 2., 2.],
        [0., 0., 2., 2., 2., 2., 0., 2., 0., 2.]])

In [27]:
mask = dropout_mask(x, (10,1,10), 0.5); mask

tensor([[[2., 2., 2., 0., 2., 2., 0., 2., 2., 2.]],

        [[2., 2., 2., 2., 2., 0., 0., 0., 2., 2.]],

        [[2., 0., 2., 0., 0., 2., 0., 2., 0., 2.]],

        [[2., 2., 2., 2., 2., 0., 0., 2., 2., 0.]],

        [[2., 0., 0., 2., 2., 0., 2., 0., 2., 2.]],

        [[0., 2., 2., 0., 0., 0., 0., 2., 2., 0.]],

        [[2., 0., 2., 0., 0., 2., 0., 0., 2., 0.]],

        [[2., 0., 2., 2., 2., 0., 0., 2., 2., 2.]],

        [[0., 2., 2., 2., 0., 2., 2., 0., 0., 0.]],

        [[2., 0., 0., 2., 2., 0., 0., 2., 2., 0.]]])

In [28]:
(x*mask).std(), x.std()

(tensor(1.6299), tensor(1.0464))

In [29]:
#export
class RNNDropout(nn.Module):
    "Dropout with probability `p` whose mask has size 1 on dim 1, so it is broadcast along that dimension."
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        # Identity outside of training or when nothing would be dropped.
        active = self.training and self.p != 0
        if not active: return x
        n_batch, n_feat = x.size(0), x.size(2)
        mask = dropout_mask(x.data, (n_batch, 1, n_feat), self.p)
        return x * mask

In [30]:
dp = RNNDropout(0.3)
tst_input = torch.randn(3,3,7)
tst_input, dp(tst_input)

(tensor([[[-0.6063,  0.6775, -1.9772,  0.1402, -0.3584, -0.2261, -0.2862],
          [ 0.2561,  0.9484, -1.2634, -1.1033,  0.7754,  1.2954,  2.1382],
          [-1.2494, -0.2679, -0.3921,  0.4358,  0.0893, -1.3988,  0.5875]],
 
         [[ 0.4595, -0.6507, -0.7962, -1.0126,  1.4215, -0.5194,  1.2283],
          [ 0.2415, -0.7655, -0.0630, -0.5322,  0.1524,  0.2143, -0.3106],
          [ 0.9479, -1.1344,  0.2207,  0.7560, -1.2366,  1.1719, -0.4460]],
 
         [[-1.8169, -1.6285, -2.6410,  1.4514,  0.9844, -1.4719, -1.0307],
          [-1.1995,  0.8590,  0.1494,  1.4166, -0.9586, -0.6480, -0.7661],
          [-0.2177,  2.8074, -0.6074, -0.5804,  0.2179, -0.1285,  0.6921]]]),
 tensor([[[-0.8661,  0.9679, -2.8246,  0.0000, -0.5120, -0.0000, -0.4089],
          [ 0.3658,  1.3548, -1.8049, -0.0000,  1.1078,  0.0000,  3.0546],
          [-1.7848, -0.3828, -0.5601,  0.0000,  0.1275, -0.0000,  0.8393]],
 
         [[ 0.6564, -0.0000, -1.1375, -0.0000,  2.0308, -0.0000,  1.7547],
          [ 0

In [31]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    """Wrap `module` and apply dropout to some of its weights before each forward.

    Args:
        module: the wrapped module (e.g. an `nn.LSTM`).
        weight_p: dropout probability applied to the selected weights.
        layer_names: names of the parameters of `module` to apply dropout to.

    For each name, the original weight is kept on this wrapper as the
    parameter `{name}_raw`; the entry in `module._parameters` is replaced by
    the dropped-out version on every call.
    """
    def __init__(self, module, weight_p=0., layer_names=[WEIGHT_HH]):
        super().__init__()
        # weight_p must be a number: F.dropout compares it against 0 and 1.
        # layer_names is copied so the shared default list is never aliased.
        self.module,self.weight_p,self.layer_names = module, weight_p, list(layer_names)
        for layer in self.layer_names:
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Recompute every dropped-out weight from its raw copy (a no-op mask when not training)."
        # NOTE(review): recent PyTorch RNN modules may cache their weights
        # separately from `_parameters` - confirm the replaced tensor is the
        # one actually used by `module.forward` on your version.
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, self.weight_p, self.training)

    def forward(self, *args):
        "Refresh the dropped-out weights, then run the wrapped module with warnings silenced."
        self._setweights()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            return self.module.forward(*args)

In [32]:
module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4)
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[ 0.4979,  0.4344],
        [ 0.4838, -0.4228],
        [-0.6440, -0.0390],
        [-0.2952, -0.3687],
        [ 0.4696,  0.5331],
        [ 0.6828, -0.0539],
        [-0.4510, -0.3763],
        [ 0.4732, -0.6334]], requires_grad=True)

In [33]:
tst_input = torch.randn(4,20,5)
h = (torch.zeros(1,20,2), torch.zeros(1,20,2))
x,h = dp_module(tst_input, h)
getattr(dp_module.module, WEIGHT_HH)

tensor([[ 0.8299,  0.0000],
        [ 0.0000, -0.7047],
        [-0.0000, -0.0650],
        [-0.4920, -0.0000],
        [ 0.0000,  0.0000],
        [ 1.1380, -0.0898],
        [-0.7517, -0.6271],
        [ 0.0000, -0.0000]], grad_fn=<MulBackward0>)

In [34]:
#export
class EmbeddingDropout(nn.Module):
    """Apply dropout to whole rows of the embedding matrix of `emb`.

    Args:
        emb: the `nn.Embedding` whose weight is used for the lookup.
        embed_p: probability of zeroing a row of the embedding matrix.
    """
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        # Keep None when the embedding has no padding row: F.embedding reads a
        # negative padding_idx as counting from the end, so -1 would mark the
        # last row as padding and stop it from receiving gradients.
        self.pad_idx = self.emb.padding_idx

    def forward(self, words, scale=None):
        "Look up `words` in the (possibly row-masked) weights, optionally multiplied by `scale`."
        if self.training and self.embed_p != 0:
            # One mask value per row, broadcast over the embedding dimension.
            size = (self.emb.weight.size(0), 1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        # Out of place on purpose: without dropout `masked_embed` is the
        # embedding parameter itself and must not be modified.
        if scale: masked_embed = masked_embed * scale

        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm, self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [35]:
enc = nn.Embedding(100, 7, padding_idx=1)
enc_dp = EmbeddingDropout(enc, 0.5)
tst_input = torch.randint(0, 100, (5,))
enc_dp(tst_input)

tensor([[-0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 3.1045, -1.1391, -2.0346,  1.6329,  0.0320,  2.5144,  0.9208],
        [-0.0000, -0.0000, -0.0000, -0.0000,  0.0000,  0.0000,  0.0000],
        [-0.7919, -0.6989, -0.9279,  0.7615, -1.1240, -2.0632,  4.0006]],
       grad_fn=<EmbeddingBackward>)

# Main model

In [36]:
#export
def to_detach(h):
    """Detach `h` from its computation graph.

    A tensor (including subclasses such as `nn.Parameter`) is detached
    directly; any other iterable is processed recursively and returned as a
    tuple.
    """
    if isinstance(h, torch.Tensor): return h.detach()
    return tuple(to_detach(v) for v in h)

In [37]:
#export
class AWD_LSTM(nn.Module):
    """AWD-LSTM inspired by https://arxiv.org/abs/1708.02182.

    Args:
        vocab_sz: number of rows of the embedding matrix.
        emb_sz: embedding size; also the output size of the last LSTM layer.
        n_hid: hidden size of every LSTM layer except the last.
        n_layers: number of stacked single-layer LSTMs.
        pad_token: index passed as `padding_idx` to the embedding.
        hidden_p: dropout between LSTM layers (not applied after the last one).
        input_p: dropout applied to the embedded input.
        embed_p: dropout on rows of the embedding matrix.
        weight_p: dropout on the weights selected by `WeightDropout`.
    """
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        # bs is a placeholder until the first batch is seen in forward.
        self.bs,self.emb_sz,self.n_hid,self.n_layers = 1,emb_sz,n_hid,n_layers
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        # First layer reads emb_sz features, last layer writes emb_sz features.
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1, batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for _ in range(n_layers)])

    def forward(self, input):
        """Run the stacked LSTMs on a 2-D `input` of token indices.

        Returns `(raw_outputs, outputs)`: per-layer outputs before and after
        the hidden dropout. The new hidden state is detached and stored for
        the next call.
        """
        bs,sl = input.size()
        # Rebuild the hidden state when the batch size changes, and also when
        # none exists yet - otherwise a first batch of size 1 (the placeholder
        # value of self.bs) would reach the loop below without `self.hidden`.
        if bs != self.bs or not hasattr(self, 'hidden'):
            self.bs = bs
            self.reset()

        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn, hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs

    def reset(self):
        "Replace the hidden state with zeros: one `(h, c)` pair per layer."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

    def _one_hidden(self, l):
        "Zero tensor of size `(1, bs, nh)` for layer `l`, created from the first parameter of the model."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

In [38]:
#export
class LinearDecoder(nn.Module):
    "Dropout followed by a linear layer on the last element of `outputs`; the weight can be shared with `tie_encoder`."
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias:
            self.decoder.bias.data.zero_()
        if not tie_encoder:
            init.kaiming_uniform_(self.decoder.weight)
        else:
            # Share the weight tensor of the given module.
            self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        "Take `(raw_outputs, outputs)` and return `(decoded, raw_outputs, outputs)`."
        raw_outputs, outputs = input
        dropped = self.output_dp(outputs[-1]).contiguous()
        n_batch, seq_len, width = dropped.size(0), dropped.size(1), dropped.size(2)
        # Merge the first two dimensions so the linear layer sees one row per position.
        decoded = self.decoder(dropped.view(n_batch * seq_len, width))
        return decoded, raw_outputs, outputs

In [39]:
#export
class SequentialRNN(nn.Sequential):
    "A sequential module that forwards `reset` to every child that defines it."
    def reset(self):
        for c in self.children():
            # Test the child, not self: self always has `reset`, so checking it
            # would call `reset` on children that do not define it.
            if hasattr(c, 'reset'): c.reset()

In [40]:
#export
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    "Build a `SequentialRNN` made of an `AWD_LSTM` followed by a `LinearDecoder`, sharing the embedding weight when `tie_weights`."
    encoder = AWD_LSTM(vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                       hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    tied = encoder.emb if tie_weights else None
    decoder = LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=tied, bias=bias)
    return SequentialRNN(encoder, decoder)

In [41]:
tok_pad = vocab.index(PAD)

In [42]:
tst_model = get_language_model(len(vocab), 300, 300, 2, tok_pad)
tst_model = tst_model.cuda()

In [43]:
x,y = next(iter(data.train_dl))

In [44]:
z = tst_model(x.cuda())

In [45]:
len(z)

3

In [46]:
decoded, raw_outputs, outputs = z

In [47]:
decoded.size()

torch.Size([4480, 60003])

In [48]:
64*70

4480

In [49]:
[o.size() for o in raw_outputs], [o.size() for o in outputs]

([torch.Size([64, 70, 300]), torch.Size([64, 70, 300])],
 [torch.Size([64, 70, 300]), torch.Size([64, 70, 300])])

## Callbacks to train the model

In [50]:
#export
class GradientClipping(Callback):
    "After the backward pass, clip the gradient norm of the model parameters to `clip` (skipped when `clip` is falsy)."
    def __init__(self, clip=None):
        self.clip = clip

    def after_backward(self):
        if not self.clip: return
        nn.utils.clip_grad_norm_(self.run.model.parameters(), self.clip)

In [51]:
#export
class RNNTrainer(Callback):
    "Unpack the three-part model output and add two penalties, weighted by `α` and `β`, to the loss."
    def __init__(self, α, β):
        self.α = α
        self.β = β

    def after_pred(self):
        # Keep the per-layer outputs for the penalties and leave only the
        # first element as the prediction on the runner.
        preds = self.pred
        self.raw_out, self.out = preds[1], preds[2]
        self.run.pred = preds[0]

    def after_loss(self):
        # AR: mean square of the last element of `out`.
        if self.α != 0.:
            ar = self.out[-1].float().pow(2).mean()
            self.run.loss += self.α * ar
        # TAR: mean squared difference between consecutive positions along
        # dim 1 of the last element of `raw_out`.
        if self.β != 0.:
            last_raw = self.raw_out[-1]
            if last_raw.size(1) > 1:
                tar = (last_raw[:,1:] - last_raw[:, :-1]).float().pow(2).mean()
                self.run.loss += self.β * tar

    def begin_epoch(self):
        # Call `batchify` on the dataset at the start of each epoch when it exists.
        ds = self.dl.dataset
        if hasattr(ds, 'batchify'): ds.batchify()

In [52]:
#export
def cross_entropy_flat(input, target):
    "Cross-entropy for a 2-D `target`: `input` is viewed as one row per target element before calling `F.cross_entropy`."
    bs, sl = target.size()
    n_items = bs * sl
    flat_input = input.view(n_items, -1)
    flat_target = target.view(n_items)
    return F.cross_entropy(flat_input, flat_target)

def accuracy_flat(input, target):
    "`accuracy` for a 2-D `target`: `input` is viewed as one row per target element first."
    bs, sl = target.size()
    n_items = bs * sl
    flat_input = input.view(n_items, -1)
    flat_target = target.view(n_items)
    return accuracy(flat_input, flat_target)

In [53]:
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad)

In [54]:
# Callback factories passed to the Learner as `cb_funcs`: stats with
# accuracy_flat as metric, CUDA, recorder, progress bar, gradient clipping
# with clip=0.1, and RNNTrainer with α=2 and β=1.
cbs = [partial(AvgStatsCallback, accuracy_flat),
      CudaCallback, Recorder, ProgressCallback,
      partial(GradientClipping, 0.1),
      partial(RNNTrainer, α=2., β=1.)]

In [56]:
learn = Learner(model, data, cross_entropy_flat, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())

In [57]:
learn.fit(1)

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time


RuntimeError: CUDA out of memory. Tried to allocate 1.00 GiB (GPU 0; 4.00 GiB total capacity; 2.32 GiB already allocated; 6.87 MiB free; 2.35 GiB reserved in total by PyTorch)

In [None]:
!python notebook2script.py 12a_awd_lstm.ipynb