In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import numpy as np
import os
import time

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# 下载莎士比亚数据集
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [3]:
# Read the corpus. The context manager closes the file handle as soon as the
# bytes have been read instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Length of the corpus in characters
print(len(text))

1115394


In [4]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# unique characters in the text (the vocabulary)
vocab = sorted(set(text))

len(vocab)

65

In [6]:
print(vocab)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


# Process the text

In [7]:
# Vectorize the text with two lookup tables:
# character -> index (char2idx) and index -> character (idx2char)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of vocabulary indices
text_as_int = np.array([char2idx[ch] for ch in text])

In [8]:
len(text_as_int)

1115394

In [9]:
text[0]

'F'

# Prediction task
Given a character, or a sequence of characters, predict the most probable next character.

将文本拆分为长度为 seq_length+1 的文本块。例如，假设 seq_length 为 4 而且文本为 “Hello”， 那么输入序列将为 “Hell”，目标序列将为 “ello”。

为此，首先使用 tf.data.Dataset.from_tensor_slices 函数把文本向量转换为字符索引流。

In [10]:
# Maximum length (in characters) of a single input sequence
seq_length = 100
# Every training example consumes seq_length + 1 characters (the input plus the
# one-step-shifted target), so divide by the chunk size, not by seq_length.
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character indices built from the encoded corpus
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
    print(idx2char[i.numpy()])  # tensor -> numpy index -> character

F
i
r
s
t


In [11]:
for i in char_dataset.take(5):
    print(i)

tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(47, shape=(), dtype=int32)
tf.Tensor(56, shape=(), dtype=int32)
tf.Tensor(57, shape=(), dtype=int32)
tf.Tensor(58, shape=(), dtype=int32)


In [12]:
idx2char[18]

'F'

batch 方法使我们能轻松把单个字符转换为所需长度的序列。

In [13]:
# Group the character stream into chunks of seq_length + 1 characters;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):  # show the first 5 chunks as text
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


对于每个序列，使用 map 方法先复制再顺移，以创建输入文本和目标文本。map 方法可以将一个简单的函数应用到每一个批次 （batch）。

In [14]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [15]:
# print 1st batch and target
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()]))) 

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


这些向量的每个索引均作为一个时间步来处理。作为时间步 0 的输入，模型接收到 “F” 的索引，并尝试预测 “i” 的索引为下一个字符。在下一个时间步，模型执行相同的操作，但是 RNN 不仅考虑当前的输入字符，还会考虑上一步的信息。

In [16]:
# Walk through the first five time steps, showing each input index/character
# next to the index/character expected as output at that step
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')


前面我们使用 tf.data 将文本拆分为可管理的序列。但是在把这些数据输送至模型之前，我们需要将数据重新排列 （shuffle） 并打包为批次。

In [17]:
# Batch size
BATCH_SIZE = 64

# Buffer size used to shuffle the dataset
# (tf.data is designed to work with possibly infinite sequences,
# so it does not try to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements.)
BUFFER_SIZE = 10000

# NOTE: this rebinds `dataset`, so re-running the cell would shuffle and batch
# the already-batched dataset again; re-run the cell that creates `dataset` first.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

# Build the model
使用 tf.keras.Sequential 定义模型。在这个简单的例子中，我们使用了三个层来定义模型：

tf.keras.layers.Embedding：输入层。一个可训练的对照表，它会将每个字符的数字映射到一个 embedding_dim 维度的向量。

tf.keras.layers.GRU：一种 RNN 类型，其大小由 units=rnn_units 指定（这里你也可以使用一个 LSTM 层）。

tf.keras.layers.Dense：输出层，带有 vocab_size 个输出。

In [18]:
# Vocabulary size: number of entries in vocab
vocab_size = len(vocab)

# Embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [20]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: size of the embedding table and width of the output layer.
    embedding_dim: width of each embedding vector.
    rnn_units: number of units in the GRU layer.
    batch_size: fixed batch size baked into the model's input shape.

  Returns:
    An uncompiled tf.keras.Sequential model that emits vocab_size values
    for every time step of the input.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [21]:
# Build the training model with the training batch size
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

# Print the layer-by-layer summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


![a](https://github.com/littlebeanbean7/docs/blob/master/site/en/tutorials/text/images/text_generation_training.png?raw=1)

In [22]:
# train
def loss(labels, logits):
  """Sparse categorical cross-entropy between labels and raw logits.

  Args:
    labels: target class indices.
    logits: model outputs; from_logits=True means no softmax was applied.
  """
  cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
  return cross_entropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

In [23]:
# Directory where the checkpoints are saved
checkpoint_dir = './training_checkpoints'

# Name of the checkpoint files; the {epoch} placeholder is filled in per save
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to checkpoint_prefix
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [24]:
# Number of training epochs
EPOCHS=10
# Train the model, saving weights through checkpoint_callback
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# generate text

恢复最新的检查点
为保持此次预测步骤简单，将批大小设定为 1。

由于 RNN 状态从时间步传递到时间步的方式，模型建立好之后只接受固定的批大小。

若要使用不同的 batch_size 来运行模型，我们需要重建模型并从检查点中恢复权重。

In [25]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_10'

In [26]:
# Rebuild the same architecture with a batch size of 1 for text generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint in checkpoint_dir
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Input shape: batch of 1, variable sequence length
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


![1](https://github.com/littlebeanbean7/docs/blob/master/site/en/tutorials/text/images/text_generation_sampling.png?raw=1)
预测循环
下面的代码块生成文本：

首先设置起始字符串，初始化 RNN 状态并设置要生成的字符个数。

用起始字符串和 RNN 状态，获取下一个字符的预测分布。

然后，用分类分布计算预测字符的索引。把这个预测字符当作模型的下一个输入。

模型返回的 RNN 状态被输送回模型。现在，模型有更多上下文可以学习，而非只有一个字符。在预测出下一个字符后，更改过的 RNN 状态被再次输送回模型。模型就是这样，通过不断从前面预测的字符获得更多上下文，进行学习。

In [27]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by repeatedly sampling the next character from the model.

  Args:
    model: stateful model built with a batch size of 1.
    start_string: non-empty seed text; every character must be a key of
      the module-level char2idx table.
    num_generate: number of characters to generate after the seed.
    temperature: divisor applied to the logits before sampling. Low values
      give more predictable text, high values give more surprising text.

  Returns:
    start_string followed by the num_generate sampled characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Convert the start string to numbers (vectorize) and add the batch dimension
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # The batch size is 1 here; start from a clean RNN state
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from the categorical distribution over the scaled logits and
    # keep the sample drawn for the last time step
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Feed the sampled character back in as the next input; the stateful
    # model carries the hidden state forward between calls
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [28]:
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: have there's nothin me,'s nose,
Who against him like a blood, that I mean to bid
Will thou there to do himself as leave us?

ROMEO:
Thou see'st thou go'st not mine Sinful Henry means to
ever fooling my body's not scarcely well eonle goes thie;
For in your clos what flatter the inform
And that his craves with night, and what not your high-seliverace great merrow
Or stir on my how here stoppices, who nature had some cto
consideth this him
To entre thee away: yew to do some time.

PETRUCHIO:
Came Kent? which should thou we pass'd his thrifts it.

DUKE VINCENTIO:
how let him speak foolmaster;
His inscusempter man, thou now, thou met agree.
Now for your swoting Boning by their trual drop.

RIVERS:
When ill, you need not do?
If not, my comfort, as sear of death of imprisonment:
O bl she smoked; who matter words,
'Gond hirdwells needs must up for as there never grown
To beauta's witch as if he had but a joyful lust, God is not taked together;
You as far as the
lamentations, a fair upon

若想改进结果，最简单的方式是延长训练时间 （试试 EPOCHS=30）。

你还可以试验使用不同的起始字符串，或者尝试增加另一个 RNN 层以提高模型的准确率，亦或调整温度参数以生成更多或者更少的随机预测。

In [29]:
s = generate_text(model, start_string=u"I say: ")

In [30]:
s

"I say: now much is the weart-lame to\nA scudy, our mindness, be to do it.\nHoe! tell him his heavy days to ere I amone them gail for honest\nCliff frights me in our business as his son\nSecrive no great all-men'd you, to say the tricker's wife.\n\nGLOUCESTER:\nThat Even re-st, manquels to't in boy;\nWhich is a hely instruction for your friends: thou hast chatest winder child Bupanur.\n\nHASTINGS:\nI know not, when I would they call you do.\n\nSICANDIA:\nAt thy palm thy tribunes are you ours.\n\nLUCIO:\nAy, my lord, the law's not, question for, our fatherly\nPast her fortune be not to be cathing stain'd topentise\nWith old han cheap his worthip meetual, if you are not do I ended them to die to-night.\n\nJOHN SOLINGBROKE:\nCome, honest thou not, away;\nThe mother would have braved death is read it.\n\nHASTINGS:\nAs I with aloof, will I bear me off, and to leave his body,\nYour mother, and ingrave thee in my true,\nA leaver am I rumber what thou should ask, to give me to yourself\nFor tu

# 自定义训练

上面的训练步骤简单，但是能控制的地方不多。

至此，你已经知道如何手动运行模型。现在，让我们打开训练循环，并自己实现它。这是一些任务的起点，例如实现 课程学习 以帮助稳定模型的开环输出。

你将使用 tf.GradientTape 跟踪梯度。关于此方法的更多信息请参阅 eager execution 指南。

步骤如下：

首先，初始化 RNN 状态，使用 tf.keras.Model.reset_states 方法。

然后，迭代数据集（逐批次）并计算每次迭代对应的 预测。

打开一个 tf.GradientTape 并计算该上下文时的预测和损失。

使用 tf.GradientTape.gradient 方法，计算当前模型变量情况下的损失梯度。

最后，使用优化器的 tf.keras.optimizers.Optimizer.apply_gradients 方法向下迈出一步。

In [31]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)



In [32]:
optimizer = tf.keras.optimizers.Adam()

In [33]:
@tf.function
def train_step(inp, target):
  """Run one optimization step on a batch and return its mean loss.

  Uses the module-level `model` and `optimizer`: the forward pass and loss
  are recorded on a gradient tape, then the gradients with respect to the
  model's trainable variables are applied by the optimizer.
  """
  with tf.GradientTape() as tape:
    logits = model(inp)
    per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
        target, logits, from_logits=True)
    batch_loss = tf.reduce_mean(per_step_loss)
  variables = model.trainable_variables
  grads = tape.gradient(batch_loss, variables)
  optimizer.apply_gradients(zip(grads, variables))

  return batch_loss

In [None]:
# Training loop
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Reset the RNN hidden state at the start of every epoch.
  # reset_states() returns None, so its result is not stored.
  model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    # Named batch_loss so the module-level `loss` function is not overwritten
    batch_loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, batch_loss))

  # Save a checkpoint every 5 epochs, named with the same 1-based epoch
  # number that the progress messages print
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, batch_loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Save the final weights; EPOCHS is used instead of the loop variable so the
# name is 1-based and the line also works when the loop body never ran
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))



Epoch 1 Batch 0 Loss 4.173825740814209
Epoch 1 Batch 100 Loss 2.3167788982391357
Epoch 1 Loss 2.1459
Time taken for 1 epoch 18.6064190864563 sec

Epoch 2 Batch 0 Loss 2.1589548587799072
Epoch 2 Batch 100 Loss 1.8869649171829224
Epoch 2 Loss 1.7990
Time taken for 1 epoch 16.880786895751953 sec

Epoch 3 Batch 0 Loss 1.7869982719421387
Epoch 3 Batch 100 Loss 1.644071340560913
Epoch 3 Loss 1.6069
Time taken for 1 epoch 17.476303339004517 sec

Epoch 4 Batch 0 Loss 1.5778863430023193
Epoch 4 Batch 100 Loss 1.5097230672836304
Epoch 4 Loss 1.4975
Time taken for 1 epoch 17.099212169647217 sec

Epoch 5 Batch 0 Loss 1.462996006011963
Epoch 5 Batch 100 Loss 1.428195834159851
Epoch 5 Loss 1.4222
Time taken for 1 epoch 17.050785779953003 sec

Epoch 6 Batch 0 Loss 1.3880597352981567
Epoch 6 Batch 100 Loss 1.3704915046691895
Epoch 6 Loss 1.3626
Time taken for 1 epoch 17.047444820404053 sec

Epoch 7 Batch 0 Loss 1.3303874731063843


# LSTM 生成文本

In [18]:
# Vocabulary size: number of entries in vocab
vocab_size = len(vocab)

# Embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: size of the embedding table and width of the output layer.
    embedding_dim: width of each embedding vector.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch size baked into the model's input shape.

  Returns:
    An uncompiled tf.keras.Sequential model that emits vocab_size values
    for every time step of the input.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [19]:
# Build the LSTM training model with the training batch size
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

# Print the layer-by-layer summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [20]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between labels and raw logits.

  Args:
    labels: target class indices.
    logits: model outputs; from_logits=True means no softmax was applied.
  """
  cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
  return cross_entropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

In [21]:
# Directory where the LSTM checkpoints are saved
checkpoint_dir = './training_checkpoints_lstm'

# Name of the checkpoint files; the {epoch} placeholder is filled in per save
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to checkpoint_prefix
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [22]:
# Number of training epochs
EPOCHS=20
# Train the model, saving weights through checkpoint_callback
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
# Look up the path of the most recent checkpoint in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints_lstm\\ckpt_20'

In [24]:
# Rebuild the same architecture with a batch size of 1 for text generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint in checkpoint_dir
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Input shape: batch of 1, variable sequence length
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [29]:
# 要生成的字符个数
num_generate = 1000

def generate_text(model, start_string, num_generate, temperature=1.0):
  """Generate text by repeatedly sampling the next character from the model.

  Args:
    model: stateful model built with a batch size of 1.
    start_string: non-empty seed text; every character must be a key of
      the module-level char2idx table.
    num_generate: number of characters to generate after the seed.
    temperature: divisor applied to the logits before sampling. Low values
      give more predictable text, high values give more surprising text.

  Returns:
    start_string followed by the num_generate sampled characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Convert the start string to numbers (vectorize) and add the batch dimension
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # The batch size is 1 here; start from a clean RNN state
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from the categorical distribution over the scaled logits and
    # keep the sample drawn for the last time step
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Feed the sampled character back in as the next input; the stateful
    # model carries the hidden state forward between calls
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [30]:
print(generate_text(model, start_string=u"ROMEO: ", num_generate=5000))

ROMEO: Dork more noble Duke of Nurce,
Shall have good heart, which for some goier eye owr request,
Could I, resign to see your house: the jule tears was king,
I seem not what you cram, let me too; he tender yeath
your love must lie unbaster'd here
From what shoutings is so that twou foregother. What torture use is this yone,
His rude cliffory rejoice and fair tongue must be,
But left thy brother than she ends at it.
But fam she hath; for being their courther ridagators!
To-morrow Come, he comes have Romeo by such fen
Me his tongue but passing: let's to be lost;
And with his drafes
sitting his chair; the kind goods cleans may be doubted him,
Both him and queen and steeling on my life, if my nature
We here for give can fall ram the death of life,
Canisters from the harvestom, bloody custom, but upon him: it is her
The case of honour than banishment,
Refenting twos, though it caupper you to the
uggery,
For make him the bestore unto the entreaties i' the other, and
proclaim her horse; uto 

# 结论
1. 训练速度：LSTM 比 GRU 慢。从原理上看，GRU 是 LSTM 的简化版本，省略了一些参数，计算过程自然更快
2. EPOCHS 训练次数增加之后，生成的文字更像原文风格
3. 设想中文的训练，生成歌词，古诗文，等等