In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib as plm
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
# Print the installed TensorFlow version so the run environment is recorded in the output.
print(tf.__version__)

2.1.0


In [None]:
# Source: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
# NOTE(review): the local filename below is spelled "shakespeaere" while the
# download URL says "shakespeare" — confirm it matches the file on disk.
input_filepath = "./shakespeaere.txt"
# Read the whole corpus into memory; the context manager closes the file handle.
with open(input_filepath, 'r') as corpus_file:
    text = corpus_file.read()
print(len(text))
print(text[0:100])

In [None]:
# Preprocessing plan:
#   1. generate the vocabulary
#   2. build the mapping  char -> id
#   3. convert the data to ids
#   4. abcd --> a->b->c->d-><eos>

# Vocabulary: the distinct characters of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)

In [None]:
# Character -> id lookup: every vocabulary character maps to its position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

In [None]:
# Id -> character lookup: the vocabulary list as a numpy array, so position i
# holds the character whose id is i.
idx2char = np.asarray(vocab)
print(idx2char)

In [None]:
# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int[:10])
print(text[:10])

In [None]:
# Build the (input, target) pair for one sequence
def split_input_target(id_text):
    """Split a sequence into an input and a target shifted by one position.

    Example: ``abcde`` gives the input ``abcd`` and the target ``bcde``, so
    every input element is paired with the element that follows it.

    Args:
        id_text: Any sliceable sequence.

    Returns:
        The tuple ``(id_text[:-1], id_text[1:])``.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets
# Turn the id array into a dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # character-level dataset

# Group the characters into sequence-level examples
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, # one extra id, because split_input_target shortens each side by one
                          drop_remainder = True) # discard the final chunk when it is shorter than a full batch

# Preview the first two character ids together with the characters they map to
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

# Preview the first two sequences, as ids and as decoded text
for seq_id in seq_dataset.take(2):
    print(seq_id)  # tensor of integer ids
    print(repr(''.join(idx2char[seq_id.numpy()]))) # ids -> characters, then joined into one string

In [None]:
# Turn every chunk of ids into an (input, target) pair.
# NOTE: seq_dataset is rebound from itself, so re-run the cell that creates
# seq_dataset before re-running this one.
seq_dataset = seq_dataset.map(split_input_target)

# Preview two pairs; the loop targets must match the names printed in the body.
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())

In [None]:
# Number of sequences per training batch.
batch_size = 64
# Buffer size handed to Dataset.shuffle.
buffer_size = 10000

# Shuffle the (input, target) pairs, then batch them; drop_remainder discards a
# final partial batch. NOTE: seq_dataset is rebound from itself here, so
# re-running only this cell would shuffle and batch a second time.
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)

In [None]:
# Model hyper-parameters passed to build_model.
vocab_size = len(vocab)  # number of distinct characters
embedding_dim = 256  # second argument of the Embedding layer
rnn_uints = 1024  # passed to build_model as the LSTM size argument

def build_model(vocab_size, embedding_dim, ruu_uints, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the Embedding input
            size and as the width of the Dense output.
        embedding_dim: Size of each character embedding vector.
        ruu_uints: Number of LSTM units. The parameter name is kept unchanged
            so existing callers keep working.
        batch_size: Fixed batch size placed in the input shape.

    Returns:
        An uncompiled ``keras.models.Sequential``. The Dense layer has no
        activation, so the model outputs logits of shape
        (batch_size, sequence_length, vocab_size).
    """
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim,
                               batch_input_shape = [batch_size, None]),
        # The LSTM size keyword is `units`; it is taken from this function's
        # argument rather than from a notebook-level global.
        keras.layers.LSTM(units = ruu_uints,
                          # Important change 1: hyper-parameter tuning
                          stateful = True,
                          recurrent_initializer = 'glorot_uniform',
                          # Emit an output per position, because
                          # split_input_target yields a target per input position.
                          return_sequences = True),
        keras.layers.Dense(vocab_size)
    ])
    return model

# Build the training model with the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_uints, batch_size)

# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Trial prediction on one batch
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch) # the model is called like a function
    print(example_batch_predictions.shape)
# (batch_size, len(sentence), vocab_size)
# vocab_size-way class prediction: one value per class at every position

In [None]:
# random sampling -- stochastic, can produce many different sequences
# max sampling    -- greedy, can only produce one sequence
# logits: the values before softmax is applied
sample_indices = tf.random.categorical(logits = example_batch_predictions[0],
                                       num_samples = 1)
# Shape (seq_length, 1): one id sampled from the vocab_size logits at each position.
print(sample_indices)

# Drop the trailing dimension of size 1
sample_indices = tf.squeeze(sample_indices, axis = -1)
print(sample_indices)  # shape (seq_length,)

In [None]:
# Decode the first example of the batch back to characters for inspection.
print("Input:", repr("".join(idx2char[input_example_batch[0]])))
# "Output" is the target sequence of the example, not a model prediction.
print("Output:", repr("".join(idx2char[target_example_batch[0]])))
print()
# Characters for the ids held in sample_indices.
print("Predictions:", repr("".join(idx2char[sample_indices])))

In [None]:
# Loss function of the model (author's note: listen to 7-7 at 9:38 again)
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: Target class ids.
        logits: Unnormalised model outputs; ``from_logits=True`` tells Keras
            that softmax has not been applied yet.

    Returns:
        Whatever ``keras.losses.sparse_categorical_crossentropy`` returns for
        these arguments.
    """
    crossentropy = keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits = True)
    return crossentropy

# Configure training with the custom loss and the Adam optimizer.
model.compile(loss = loss, optimizer = 'adam')
# Evaluate the loss on the example batch as a sanity check.
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

In [None]:
# Training does not save any text, so the model itself has to be saved.
output_dir = './text_generation_checkpoints'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(output_dir, exist_ok = True)
# Checkpoint path template containing an {epoch} placeholder.
checkpoints_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# Save only the weights, not the full model.
checkpoint_callbacks = keras.callbacks.ModelCheckpoint(
    filepath = checkpoints_prefix,
    save_weights_only = True)

epochs = 10
history = model.fit(seq_dataset, epochs = epochs,
                   callbacks = [checkpoint_callbacks])


In [None]:
# Show the most recently saved checkpoint in output_dir
tf.train.latest_checkpoint(output_dir)

文本生成

In [None]:
# Load the model from the checkpoint
model2 = build_model(vocab_size, embedding_dim, 
                     rnn_uints,
                     batch_size = 1) # text generation produces one sentence at a time
model2.load_weights(tf.train.latest_checkpoint(output_dir))

# Set the input size
# batch_size = 1: one sample (one sentence); None means the sequence length may vary
model2.build(tf.TensorShape([1, None]))

# Generation flow
# start character sequence A,
# A -> model -> b   (b is obtained by sampling)
# A.append(b) -->B
# B(Ab) -> model -> c
# B.append(c) --> C
# C(Abc) -> model ->....

model2.summary()

def generate_text(model, strat_string, num_generate = 1000, temperature = 0.5):
    """Sample text from the model one character at a time, seeded by a string.

    Args:
        model: Model called on id tensors of shape [1, sequence_length]; its
            recurrent state is reset before generation starts.
        strat_string: Seed text. The parameter name is kept unchanged for
            caller compatibility. Every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to sample.
        temperature: Divisor applied to the logits before sampling
            (important change 2). Values above 1 flatten the distribution, so
            the text is more random; values below 1 sharpen it, so sampling
            leans towards the most likely character (greedy).

    Returns:
        ``strat_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``strat_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``strat_string`` contains a character missing from
            ``char2idx``.
    """
    if not strat_string:
        raise ValueError("strat_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Seed text -> ids
    input_eval = [char2idx[ch] for ch in strat_string]
    # input_eval is one-dimensional; add a leading batch axis -> [1, len(strat_string)]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    # Clear recurrent state left over from earlier calls.
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> character -> text_generated
        # 3. update input_eval

        # predictions: [batch_size, len(input_eval), vocab_size]
        predictions = model(input_eval)

        # logits -> softmax -> probabilities, e.g.
        #   logits = 2, 1  ->  0.73 / 0.27  (flatter)
        #   logits = 4, 2  ->  0.88 / 0.12  (steeper)
        # so dividing by a temperature below 1 sharpens the distribution.
        predictions = predictions / temperature

        # Drop the batch axis: [len(input_eval), vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # One sampled id per input position (a b c --> b c d); only the id
        # sampled at the last position is the newly generated character.
        predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()

        text_generated.append(idx2char[predicted_id])

        # Feed back only the newest character instead of appending it to
        # input_eval: only the latest step's output is needed, and re-feeding
        # the accumulated sequence would be inefficient.
        input_eval = tf.expand_dims([predicted_id], 0)
    return strat_string + ''.join(text_generated)

# Generate text with the restored model, seeded with "All:".
new_text = generate_text(model2, "All:")
print(new_text)