In [107]:
import sys
sys.path.insert(0, '..')
import mxnet as mx
import gluonbook as gb
from mxnet import autograd, nd
from mxnet import gluon
import random
import zipfile
import time, math

In [2]:
(corpus_indices, char_to_idx, idx_to_char,
 vocab_size) = gb.load_data_jay_lyrics()

In [66]:
def to_onehot(X, size):
    return [nd.one_hot(x, size) for x in X.T]
X = nd.arange(10).reshape(2, 5)
inputs = to_onehot(X, vocab_size)
inputs

[
 [[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]
 <NDArray 2x1038 @cpu(0)>, 
 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]
 <NDArray 2x1038 @cpu(0)>, 
 [[0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]
 <NDArray 2x1038 @cpu(0)>, 
 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]
 <NDArray 2x1038 @cpu(0)>, 
 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]
 <NDArray 2x1038 @cpu(0)>]

In [48]:
num_inputs = vocab_size 
num_hidden = 256
num_output = vocab_size
ctx = gb.try_gpu()
def get_params():
    def get_initialized(shape):
        return nd.random_normal(scale=0.01, shape=shape, ctx=ctx)
    W_xh = get_initialized(shape=(num_inputs, num_hidden))
    W_hh = get_initialized(shape=(num_hidden, num_hidden))
    b_h = nd.zeros(shape=(num_hidden,))
    
    W_hy = get_initialized(shape=(num_hidden, num_output))
    b_y = nd.zeros(shape=(num_output,))
    
    params = [W_xh, W_hh, b_h, W_hy, b_y]
    for param in params:
        param.attach_grad()
    return params


In [49]:
list(map(lambda param: print(param.shape),
         get_params()))

(1038, 256)
(256, 256)
(256,)
(256, 1038)
(1038,)


[None, None, None, None, None]

In [50]:
def init_rnn_state(batch_size, num_hiddens, ctx=None):
    return nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx),

In [51]:
init_rnn_state(batch_size=2,num_hiddens=10)[0].shape

(2, 10)

In [144]:
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hy, b_y = params
    H, = state
    output = []
    for X in inputs:
#         print("X * W_xh shape:", nd.dot(X, W_xh).shape)
#         print('X * W_hh shape:', nd.dot(H, W_hh).shape)
#         print('b_h shape:', b_h.shape)
        H = nd.relu(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
#         H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hy) + b_y
        output.append(Y)
    return output, (H,)

In [56]:
state = init_rnn_state(X.shape[0], num_hidden, ctx)
inputs = to_onehot(X.as_in_context(ctx), vocab_size)
print('num of inputs:', len(inputs), 'input shape:', inputs[0].shape)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
len(outputs), outputs[0].shape, state_new[0].shape

num of inputs: 5 input shape: (2, 1038)
X * W_xh shape: (2, 256)
X * W_hh shape: (2, 256)
b_h shape: (256,)
X * W_xh shape: (2, 256)
X * W_hh shape: (2, 256)
b_h shape: (256,)
X * W_xh shape: (2, 256)
X * W_hh shape: (2, 256)
b_h shape: (256,)
X * W_xh shape: (2, 256)
X * W_hh shape: (2, 256)
b_h shape: (256,)
X * W_xh shape: (2, 256)
X * W_hh shape: (2, 256)
b_h shape: (256,)


(5, (2, 1038), (2, 256))

In [145]:
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens=num_hidden, ctx=ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(len(prefix) + num_chars):
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
#         print('haha')
        
        Y, state = rnn(X, state, params)
        
        if t < len(prefix) -1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
        
    return ''.join([idx_to_char[i] for i in output])

In [139]:
predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hidden, vocab_size,
            ctx, idx_to_char, char_to_idx)

'分开堂舍uuuuuuuuu'

In [146]:
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = gb.data_iter_random
    else:
        data_iter_fn = gb.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # 如使用相邻采样，在 epoch 开始时初始化隐藏变量。
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        loss_sum, start = 0.0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for t, (X, Y) in enumerate(data_iter):
            if is_random_iter: # 如使用随机采样，在每个小批量更新前初始化隐藏变量。
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # 否则需要使用 detach 函数从计算图分离隐藏状态变量。
#                 pass
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs 有 num_steps 个形状为 (batch_size, vocab_size) 的矩阵。
                (outputs, state) = rnn(inputs, state, params)
                # 拼接之后形状为 (num_steps * batch_size, vocab_size)。
                outputs = nd.concat(*outputs, dim=0)
                # Y 的形状是 (batch_size, num_steps)，转置后再变成长
                # batch * num_steps 的向量，这样跟输出的行一一对应。
                y = Y.T.reshape((-1,))
                # 使用交叉熵损失计算平均分类误差。
                l = loss(outputs, y).mean()
            l.backward()
            # 裁剪梯度后使用 SGD 更新权重。
            grad_clipping(params, clipping_theta, ctx)
            gb.sgd(params, lr, 1)  # 因为已经误差取过均值，梯度不用再做平均。
            loss_sum += l.asscalar()

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec'  % (
                epoch + 1, math.exp(loss_sum / (t + 1)),
                     time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))

In [147]:
num_epochs = 500
num_steps = 35
batch_size = 32
lr = 1e2
clipping_theta = 1e-2
prefixes = ['分开', '不分开']
pred_period = 50
pred_len = 50

train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hidden,
                      vocab_size, ctx, corpus_indices, idx_to_char,
                      char_to_idx, True, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

epoch 50, perplexity 54153.847003, time 0.92 sec
 - 分开                                                   
 - 不分开                                                   


KeyboardInterrupt: 

In [142]:

train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hidden,
                      vocab_size, ctx, corpus_indices, idx_to_char,
                      char_to_idx, False, num_epochs, num_steps, lr,
                      clipping_theta, batch_size, pred_period, pred_len,
                      prefixes)

epoch 50, perplexity 72.851587, time 0.97 sec
 - 分开 我想能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不
 - 不分开 我想你你想 我不能再不 我不能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不能再想 我不
epoch 100, perplexity 12.260469, time 0.95 sec
 - 分开我 说散依旧每日 一直走依 快使用碗 恨自不同 你不没纵 我想著努我 你思寄红不投 景色入秋 我该好这
 - 不分开 连是我 别怪我 印你 是因为很 一壶都酒 在来用碗的溪边 情晶莹的话滴 我爱你 你爱我 不地安的传有
epoch 150, perplexity 4.000830, time 0.94 sec
 - 分开我遇见 我的伤口被狂拆暴 誓言太沉重泪被纵容 脸上 你的黑色幽默 想通 却又再的玩笑 想通 却又再考倒
 - 不分开 我已到能 你颗心悬 说是再这 你在我妈别重 然 后壁我 别怪我 三你 是因为闷了很 就伤 这念再血倒
epoch 200, perplexity 2.281921, time 0.92 sec
 - 分开我遇攻 我的伤口被你拆封 誓言太沉重泪被 但那伦人已经不 我不耍的远模有样 什么兵器最喜欢 双截棍 -
 - 不分开 我 想带你在单车 我 想和你看棒球 想这样没担忧 唱着歌 一直走 我想就这样牵着你的手不放开 爱能不
epoch 250, perplexity 1.770085, time 0.92 sec
 - 分开 停格内容不忠 所有回忆对着我进攻...... 古有云层 我不多努力向你奔跑 爱才送到 你却已在别人怀
 - 不分开 那在黑回 你却经离 我听再受我 走没有你在我有多烦恼多难熬) 穿过云层 我试著努力向你奔跑 爱才送到
epoch 300, perplexity 1.474915, time 1.06 sec
 - 分开 停格内容不忠 所有回忆对着我进攻...... 所有回忆对着我进攻...... 古巴比伦王颁布了汉摩拉
 - 不分开觉 在格内 广场箱都我一 有谁啊 是在是你不想活 说你怎我面绕的怒火 我想揍你已经很久 别想躲 说你眼
epoch 350, perplexity 1.497948, t

KeyboardInterrupt: 

In [34]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size*batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

In [88]:
def grad_clipping(params, theta, ctx):
    norm = nd.array([0.0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [93]:
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = gb.data_iter_random
    else:
        data_iter_fn = gb.data_iter_consecutive
    params = get_params()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    
    for epoch in range(num_epochs):
        

SyntaxError: unexpected EOF while parsing (<ipython-input-93-e55ac7c20252>, line 13)

In [45]:
corpus_indices = nd.array(my_seq)
indices = corpus_indices[0 : 50].reshape((5, 10))
indices


[[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14. 15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24. 25. 26. 27. 28. 29.]
 [30. 31. 32. 33. 34. 35. 36. 37. 38. 39.]
 [40. 41. 42. 43. 44. 45. 46. 47. 48. 49.]]
<NDArray 5x10 @cpu(0)>

0.001