In [1]:
import torch
import torch.nn as nn
import torchviz
import sys; sys.path.insert(0, '../')
from exp import nb_d2l_utils

In [2]:
rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)

In [3]:
### RNN Language Model

In [9]:
import collections
class Vocab(object):  # This class is saved in d2l.
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # sort by frequency and token
        counter = collections.Counter(tokens)
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            tokens = ['<unk>']
        tokens +=  [token for token, freq in token_freqs if freq >= min_freq]
        self.idx_to_token = []
        self.token_to_idx = dict()
        for token in tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        else:
            return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        else:
            return [self.idx_to_token[index] for index in indices]
        
def cn_pre_process(raw_text):
    text = raw_text.replace('\n', ' ')
    vocab = Vocab(text, min_freq=5)
    return [vocab[i] for i in text], vocab

with open('./data/jaychou_lyrics.txt', 'r') as f:
    raw_text = f.read()
    
# 连续采样
import random
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    # Offset for the iterator over the data for uniform starts
    offset = int(random.uniform(0,num_steps))
    # Slice out data - ignore num_steps and just wrap around
    num_indices = ((len(corpus_indices) - offset) // batch_size) * batch_size
    indices = torch.Tensor(corpus_indices[offset:(offset + num_indices)], device=device)
    indices = indices.reshape((batch_size,-1))
    # Need to leave one last token since targets are shifted by 1
    num_epochs = (num_indices // batch_size - 1) // num_steps

    for i in range(0, num_epochs * num_steps, num_steps):
        X = indices[:,i:(i+num_steps)]
        Y = indices[:,(i+1):(i+1+num_steps)]
        yield X, Y    

In [11]:
corpus_indices, vocab = cn_pre_process(raw_text)
dataloader = data_iter_consecutive(corpus_indices, 3, 10)

In [13]:
for X, Y in dataloader:
    print(X)
    print(Y)
    for i in X:
        print(vocab.to_tokens([int(i) for i in i]))
    for i in Y:
        print(vocab.to_tokens([int(i) for i in i]))
    break

tensor([[1.9000e+01, 2.3400e+02, 4.0000e+00, 8.9000e+02, 2.6700e+02, 7.0000e+00,
         6.0000e+00, 6.8000e+01, 1.0000e+00, 8.9000e+02],
        [3.5200e+02, 1.1000e+01, 6.5000e+01, 1.0290e+03, 0.0000e+00, 6.7000e+01,
         2.8700e+02, 1.0000e+00, 5.0000e+01, 1.2000e+01],
        [1.3600e+02, 1.0000e+00, 3.0000e+00, 1.9000e+01, 2.4200e+02, 2.4600e+02,
         1.8900e+02, 2.8000e+01, 2.0000e+00, 0.0000e+00]])
tensor([[2.3400e+02, 4.0000e+00, 8.9000e+02, 2.6700e+02, 7.0000e+00, 6.0000e+00,
         6.8000e+01, 1.0000e+00, 8.9000e+02, 2.6700e+02],
        [1.1000e+01, 6.5000e+01, 1.0290e+03, 0.0000e+00, 6.7000e+01, 2.8700e+02,
         1.0000e+00, 5.0000e+01, 1.2000e+01, 1.7000e+01],
        [1.0000e+00, 3.0000e+00, 1.9000e+01, 2.4200e+02, 2.4600e+02, 1.8900e+02,
         2.8000e+01, 2.0000e+00, 0.0000e+00, 0.0000e+00]])
['要', '和', '你', '融', '化', '在', '一', '起', ' ', '融']
['堡', '\u3000', '像', '欧', '<unk>', '情', '调', ' ', '对', '着']
['力', ' ', '我', '要', '做', '音', '乐', '上', '的', '<unk>'

In [16]:
def predict_ch8(prefix, num_predicts, model, vocab, device):  #@save
    state = model.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor(
        [outputs[-1]], device=device).reshape(1, 1)
    for y in prefix[1:]:  # Warmup state with prefix
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_predicts):  # Predict num_predicts steps
        Y, state = model(get_input(), state)
        outputs.append(int(Y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])


In [19]:
1.7**5

14.198569999999998

In [20]:
72/7

10.285714285714286