In [None]:
# 数据：https://github.com/chinese-poetry/chinese-poetry
import tensorflow as tf
import numpy as np
import glob
import json
from collections import Counter
from tqdm import tqdm
from snownlp import SnowNLP  # 简繁体转化功能

### 加载数据

In [None]:
# Only part of the corpus is used: the poet.*.json files, filtered by length below.
poets = []
# glob returns paths in arbitrary order; sorting keeps the poem order
# (and everything derived from it) reproducible across machines.
paths = sorted(glob.glob('chinese-poetry/json/poet.*.json'))

for path in paths:
    # Context manager so each file handle is closed as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for item in data:
        content = ' '.join(item['paragraphs'])
        # Keep only poems whose joined text is 24 to 32 characters long.
        if 24 <= len(content) <= 32:
            # SnowNLP's .han is used for the simplified/traditional conversion;
            # '[' and ']' are added as start / end markers of a poem.
            poets.append('[' + SnowNLP(content).han + ']')

# Shortest first, so poems of similar length end up next to each other.
poets.sort(key=len)
if poets:
    print('共%d首诗' % len(poets), poets[0], poets[-1])
else:
    # Nothing matched the glob or the filter: report the count instead of
    # failing with an IndexError on poets[0].
    print('共%d首诗' % len(poets))

#### id to word

In [None]:
# Tally how often every character occurs across all poems.
char_counter = Counter()
for poem in poets:
    char_counter.update(poem)

print('共%d个字' % sum(char_counter.values()))

# (character, count) pairs, most frequent first.
chars = sorted(char_counter.items(), key=lambda pair: pair[1], reverse=True)

print('共%d个不同的字' % len(chars))
print(chars[:10])

In [None]:
# Drop the counts and keep only the characters (still most frequent first).
chars = [entry[0] for entry in chars]

# Ids start at 1; id 0 is left free for padding.
ids = range(1, len(chars) + 1)
char2id = dict(zip(chars, ids))
id2char = dict(zip(ids, chars))

#### training data

In [None]:
batch_size = 64
X_data = []
Y_data = []

# Only full batches are built; a trailing partial batch is left out.
num_batches = len(poets) // batch_size
for batch_idx in range(num_batches):
    batch_poems = poets[batch_idx * batch_size:(batch_idx + 1) * batch_size]
    batch = [[char2id[ch] for ch in poem] for poem in batch_poems]
    maxlen = max(len(seq) for seq in batch)

    # Zero-filled, so positions past the end of a shorter poem keep the padding id 0.
    X_batch = np.zeros((batch_size, maxlen - 1), dtype=np.int32)
    Y_batch = np.zeros((batch_size, maxlen - 1), dtype=np.int32)

    # Y is X shifted by one position: the target at each step is the next character.
    for row, seq in enumerate(batch):
        X_batch[row, :len(seq) - 1] = seq[:-1]
        Y_batch[row, :len(seq) - 1] = seq[1:]

    X_data.append(X_batch)
    Y_data.append(Y_batch)

print(len(X_data), len(Y_data))

### Model

In [None]:
# Model hyper-parameters.
hidden_size = 256
num_layer = 2
embedding_size = 256

# Input ids and target ids; the time dimension is left dynamic.
X = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
Y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
# Not trainable; the real value is assigned later.
learning_rate = tf.Variable(0.0, trainable=False)

# One embedding row per character id, plus one extra row for the padding id.
embedding_shape = [len(char2id) + 1, embedding_size]
embeddings = tf.Variable(tf.random_uniform(embedding_shape, minval=-1.0, maxval=1.0))
embedded = tf.nn.embedding_lookup(params=embeddings, ids=X)

In [None]:
# Stack num_layer LSTM layers of hidden_size units each.
lstm_layers = [
    tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True)
    for _ in range(num_layer)
]
cell = tf.nn.rnn_cell.MultiRNNCell(lstm_layers, state_is_tuple=True)
# All-zero state used at the start of every sequence.
initial_state = cell.zero_state(batch_size, dtype=tf.float32)

#### output

In [None]:
# dynamic_rnn returns
#   outputs:     [batch_size, max_time, hidden_size]
#   last_states: one (C, H) pair per LSTM layer (two layers here),
#                each tensor of shape [batch_size, hidden_size]
outputs, last_states = tf.nn.dynamic_rnn(cell=cell, inputs=embedded, initial_state=initial_state)

# Number of output classes: every character id plus the padding id.
num_classes = len(char2id) + 1

# Flatten the time steps so one dense layer scores every position.
outputs = tf.reshape(outputs, shape=[-1, hidden_size])
flat_logits = tf.layers.dense(inputs=outputs, units=num_classes)
# Back to one score vector per time step: [batch_size, max_time, num_classes]
logits = tf.reshape(flat_logits, shape=[batch_size, -1, num_classes])
probs = tf.nn.softmax(logits, name="prob")

In [None]:
# All-ones weights: every position contributes equally to the loss
# (padding positions included).
loss_weights = tf.ones_like(Y, dtype=tf.float32)
loss = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(logits, Y, loss_weights))

# Gradient clipping: rescale so the global norm of all gradients is at most 5.
params = tf.trainable_variables()
raw_grads = tf.gradients(loss, params)
grads, _ = tf.clip_by_global_norm(raw_grads, clip_norm=5)
adadelta = tf.train.AdadeltaOptimizer(learning_rate)
optimizer = adadelta.apply_gradients(zip(grads, params))

### training

In [None]:
import pickle

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Build the learning-rate update once, outside the loop. Calling tf.assign
# inside the epoch loop would add a new op to the graph on every epoch.
lr_value = tf.placeholder(tf.float32, shape=[])
lr_update = tf.assign(learning_rate, lr_value)

for epoch in range(60):
    # Exponential decay: start at 0.002 and multiply by 0.97 every epoch.
    sess.run(lr_update, feed_dict={lr_value: 0.002 * (0.97 ** epoch)})

    losses = []
    # Visit the batches in a fresh random order each epoch without
    # rebuilding X_data / Y_data.
    for i in tqdm(np.random.permutation(len(X_data))):
        ls_, _ = sess.run([loss, optimizer], feed_dict={X: X_data[i], Y: Y_data[i]})
        losses.append(ls_)

    print('Epoch %d Loss %.5f' % (epoch, np.mean(losses)))

# Save the model weights and the vocabulary mappings.
saver = tf.train.Saver()
saver.save(sess, './poet_generation_tensorflow')

with open('dictionary.pkl', 'wb') as fw:
    pickle.dump([char2id, id2char], fw)

In [None]:
# print(sess.graph_def)

### test

In [1]:
# 同一个文件中，请restart
import tensorflow as tf
import numpy as np
import pickle

In [2]:
# Read back the two vocabulary mappings stored in dictionary.pkl.
# NOTE: unpickling can execute arbitrary code; only load a dictionary.pkl
# that you produced yourself.
with open('dictionary.pkl', 'rb') as vocab_file:
    char2id, id2char = pickle.load(vocab_file)

#### 加载

In [3]:
# Inference graph: one sequence at a time.
batch_size = 1
# Same sizes as in the training section.
hidden_size = 256
num_layer = 2
embedding_size = 256

# Define the graph structure first and load the parameters afterwards: prediction
# needs last_states, which cannot be fetched directly with get_tensor_by_name.
X = tf.placeholder(tf.int32, [batch_size, None])
Y = tf.placeholder(tf.int32, [batch_size, None])
# NOTE(review): learning_rate is not read during generation; presumably it is kept so the
# auto-generated names of the unnamed variables match the training graph on restore -
# confirm before removing or reordering.
learning_rate = tf.Variable(0.0, trainable=False)

# Stack of num_layer LSTM layers and its all-zero starting state.
cell = tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.BasicLSTMCell(hidden_size, state_is_tuple=True) for i in range(num_layer)], 
    state_is_tuple=True)
initial_state = cell.zero_state(batch_size, tf.float32)

# len(char2id) + 1 rows: one per character id plus the padding id.
embeddings = tf.Variable(tf.random_uniform([len(char2id) + 1, embedding_size], -1.0, 1.0))
embedded = tf.nn.embedding_lookup(embeddings, X)

# last_states is fed back in as initial_state on the next generation step.
outputs, last_states = tf.nn.dynamic_rnn(cell, embedded, initial_state=initial_state)

# Flatten the time steps so the dense layer scores every position.
outputs = tf.reshape(outputs, [-1, hidden_size])
logits = tf.layers.dense(outputs, units=len(char2id) + 1)
# Reshape back to one output per time step.
logits = tf.reshape(logits, [batch_size, -1, len(char2id) + 1])
probs = tf.nn.softmax(logits, name="prob")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use keras.layers.dense instead.


In [4]:
# Open a session and initialise every variable before loading the checkpoint.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Restore from the most recent checkpoint found in the current directory.
saver = tf.train.Saver()
latest_ckpt = tf.train.latest_checkpoint('./')
saver.restore(sess, latest_ckpt)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./poet_generation_tensorflow


In [None]:
# get_tensor_by_name
# inputs = tf.get_default_graph().get_tensor_by_name('Placeholder:0')
# prob_ = tf.get_default_graph().get_tensor_by_name('prob:0')
# last_states取不到

#### 自动生成

In [None]:
def generate(max_len=256):
    """Sample one poem from the restored model, one character at a time.

    Generation starts from the '[' start marker. At every step the last
    character and the LSTM state carried over from the previous step are fed
    to the network, and the next character is drawn at random according to
    the predicted probabilities. It stops when the ']' end marker is drawn.

    Args:
        max_len: Upper bound on the number of sampling steps, so that a model
            which never produces ']' cannot loop forever.

    Returns:
        The generated text as a string, without the '[' and ']' markers.
    """
    states_ = sess.run(initial_state)

    gen = ''
    c = '['
    steps = 0
    while c != ']' and steps < max_len:
        gen += c
        # int32 to match the dtype of the X placeholder.
        x = np.full((batch_size, 1), char2id[c], dtype=np.int32)
        probs_, states_ = sess.run([probs, last_states],
                                   feed_dict={
                                       X: x,
                                       initial_state: states_
                                   })
        probs_ = np.squeeze(probs_)
        steps += 1

        # Id 0 is the padding id and has no entry in id2char, so it is left
        # out of the candidates.
        candidates = probs_[1:]
        # Lay the probabilities end to end on a line, draw a random point on
        # that line and see which segment it falls into.
        cdf = np.cumsum(candidates)
        offset = int(np.searchsorted(cdf, np.random.rand() * cdf[-1]))
        # Clamp as a safeguard so the id always stays inside the vocabulary.
        pos = 1 + min(offset, len(candidates) - 1)

        c = id2char[pos]

    return gen[1:]

In [7]:
# Sample one poem and print it.
print(generate())

西北空教已十秋，东皇犹未识新秋。黄头可得归斯臭，更跃齐眉疾不忧。


#### 藏头

In [8]:
def generate_with_head(head, max_len=256):
    """Sample an acrostic poem whose sentences open with the characters of `head`.

    Works like free generation, except that right after the start marker '['
    and after every '。' or '，' the next character is taken from `head`
    (in order, until `head` is used up) instead of being sampled.

    Args:
        head: String whose characters open the successive sentences.
        max_len: Upper bound on the number of generation steps, so that a
            model which never produces ']' cannot loop forever.

    Returns:
        The generated text as a string, without the '[' and ']' markers.

    Raises:
        KeyError: If `head` contains a character that is not in char2id.
    """
    # Check up front so the failure names the offending characters instead of
    # surfacing as a bare KeyError in the middle of generation.
    unknown = [ch for ch in head if ch not in char2id]
    if unknown:
        raise KeyError('head contains characters missing from the vocabulary: %s' % ''.join(unknown))

    states_ = sess.run(initial_state)

    gen = ''
    c = '['
    i = 0
    steps = 0
    while c != ']' and steps < max_len:
        gen += c
        # int32 to match the dtype of the X placeholder.
        x = np.full((batch_size, 1), char2id[c], dtype=np.int32)
        probs_, states_ = sess.run([probs, last_states], feed_dict={X: x, initial_state: states_})
        probs_ = np.squeeze(probs_)
        steps += 1

        # At the start of each sentence, use the next head character instead
        # of the predicted one.
        if c in ('[', '。', '，') and i < len(head):
            c = head[i]
            i += 1
        else:
            # Id 0 is the padding id and has no entry in id2char, so it is
            # left out of the candidates.
            candidates = probs_[1:]
            cdf = np.cumsum(candidates)
            offset = int(np.searchsorted(cdf, np.random.rand() * cdf[-1]))
            # Clamp as a safeguard so the id always stays inside the vocabulary.
            pos = 1 + min(offset, len(candidates) - 1)
            c = id2char[pos]

    return gen[1:]

In [11]:
# Acrostic demo: the characters of the argument open the successive sentences.
generate_with_head('罗小黑请早安')

'罗浮翠幄古今红，小槛孤根两字红。黑白芙蓉素花入，请君不解继高风。早迁高士佩骊龙，安得天为唱道公。'