In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os, sys, time

import tensorflow as tf
from tensorflow import keras

%matplotlib inline

# Record the TensorFlow and Python versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)

2.0.0
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)


In [2]:
# Load the whole training corpus into memory as a single string.
input_filepath = 'shakespeare.txt'
# Pin the encoding: without it, open() falls back to the platform's locale
# encoding, so the same file could decode differently (or fail) across machines.
with open(input_filepath, 'r', encoding='utf-8') as f:
    text = f.read()

# Sanity check: corpus size in characters and a short preview.
print(len(text))
print(text[: 100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [None]:
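# Preprocessing plan:
# 1. Build the vocabulary (the sorted set of unique characters in the text).
# 2. Map every character to an integer id (char -> id).
# 3. Encode the whole text as a sequence of ids (data -> ids).
# 4. Build (input, output) pairs, where the output is the input shifted by one character.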

In [15]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))

# Lookup tables in both directions:
#   char2idx: character -> integer id (position in vocab)
#   idx2char: integer id -> character (array, so it can be indexed by id arrays)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the full corpus as an array of integer ids and preview the start.
text_ids = np.array([char2idx[ch] for ch in text])
print(text_ids[: 100])

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]


In [17]:
def split_input_target(id_text):
    """Split one chunk of ids into an (input, target) pair offset by one step.

    Args:
        id_text: a sliceable sequence of ids.

    Returns:
        A tuple ``(inputs, targets)``: ``inputs`` is everything except the
        last element and ``targets`` is everything except the first, so
        ``targets[i]`` is the element that follows ``inputs[i]``.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

seq_len = 100
# One dataset element per character id.
char_data = tf.data.Dataset.from_tensor_slices(text_ids)
# Group the ids into chunks ("sentences") of seq_len + 1: split_input_target
# produces an input and a target that are offset by one position, so each
# chunk needs one id more than the sequence length.
seq_data = char_data.batch(batch_size=seq_len + 1, drop_remainder=True)

# Preview the first two single-character elements ...
for ch_id in char_data.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
print('#' * 30)
# ... and the first two chunks, both as ids and decoded back to text.
for seq_id in seq_data.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
##############################
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirs

In [18]:
# Build the (input, target) pairs from char_data instead of re-mapping
# seq_data onto itself: the self-referential form applied split_input_target
# to already-split pairs whenever this cell was executed a second time.
seq_data = char_data.batch(seq_len + 1, drop_remainder=True).map(
    split_input_target)
# Preview two pairs; each target row is its input row shifted by one id.
for ins, outs in seq_data.take(2):
    print(ins.numpy())
    print(outs.numpy())

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [19]:
batch_size = 64
buffer_size = 10000

# Assemble the training pipeline from char_data in one go so this cell is
# idempotent: chunk the ids into seq_len + 1 pieces, split each chunk into an
# (input, target) pair, shuffle, then group into fixed-size batches.
# The previous `seq_data = seq_data.shuffle(...).batch(...)` form batched
# already-batched data if the cell was run twice.
seq_data = (
    char_data
    .batch(seq_len + 1, drop_remainder=True)
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [20]:
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_size = 256  # passed to buid_model as the Embedding output dimension
rnn_units = 1024  # passed to buid_model as the LSTM's number of units

def buid_model(vocab_size, embedding_size, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: size of the character vocabulary; used as the Embedding
            input dimension and as the number of units of the final Dense layer.
        embedding_size: output dimension of the Embedding layer.
        rnn_units: number of units of the LSTM layer.
        batch_size: fixed batch dimension of the model input (the sequence
            dimension is left as None).

    Returns:
        An uncompiled ``keras.models.Sequential`` model. The LSTM returns the
        full output sequence and is stateful.
    """
    embedding = keras.layers.Embedding(
        vocab_size, embedding_size,
        batch_input_shape=[batch_size, None])
    recurrent = keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Instantiate the training model with the training batch size and print its layer summary.
model = buid_model(vocab_size, embedding_size, rnn_units, batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [23]:
# 随机sample
for in_seq, out_seq in seq_data.take(1):
    predictions = model.predict(in_seq)
    print(predictions.shape)

sample_idxs = tf.random.categorical(logits=predictions[0],
                                    num_samples=1)
print(tf.squeeze(sample_idxs, axis=-1))

(64, 100, 65)
tf.Tensor(
[50  2 19 35 16 14 41 13 18 26 26 24 58 63 18  1 16 42 27 17 30 57 53 57
 16 46 54 38 60 25 54 57  1  3  8 17 14 45 48  8 21 59  9 55 60 19 20 58
 57  5  2 52 57 58 43 60 55 45 32 61 56 22  0  9 57 39 63 25  2 26 53 13
  4 20 39 60 45 35 58 39 51 10 11 60 50 47 61 40 54 19 53 57  2 11 23  8
 45 34 28 18], shape=(100,), dtype=int64)


In [24]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: the integer target ids.
        logits: the model outputs, treated as unnormalized scores
            (``from_logits=True``).

    Returns:
        The result of ``keras.losses.sparse_categorical_crossentropy``.
    """
    sparse_xent = keras.losses.sparse_categorical_crossentropy
    return sparse_xent(labels, logits, from_logits=True)

# Configure training: Adam optimizer with the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [26]:
output_dir = "./text_gen_checkpoints"
# exist_ok=True creates the directory only when it is missing, replacing the
# separate isdir() check-then-create sequence.
os.makedirs(output_dir, exist_ok=True)

# Save the weights (not the full model) after every epoch; the epoch number is
# substituted into the file name.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

epochs = 100
history = model.fit(seq_data, epochs=epochs,
                    callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


#### load and generate

In [None]:
# Rebuild the same architecture with batch_size=1 so that a single sequence
# can be fed at generation time.
model2 = buid_model(vocab_size, embedding_size, rnn_units, batch_size=1)

# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of passing None to load_weights().
latest_ckpt = tf.train.latest_checkpoint(output_dir)
if latest_ckpt is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(
            output_dir))
model2.load_weights(latest_ckpt)
# Set the input size: one sequence of arbitrary length.
model2.build(tf.TensorShape([1, None]))

model2.summary()

# temperature > 1 makes the predicted distribution more uniform (more random text);
# temperature < 1 makes it more concentrated (more predictable text).

def generate_text(model, start_str, num_generate=1000, temperature=1):
    """Generate text character by character, starting from ``start_str``.

    Args:
        model: the model to sample from. It is called on a batch containing a
            single id sequence and its recurrent state is reset before
            generation starts.
        start_str: non-empty seed string; every character must be a key of
            ``char2idx``.
        num_generate: number of characters to generate.
        temperature: positive scaling factor; the logits are divided by it
            before sampling. Values above 1 flatten the distribution, values
            below 1 sharpen it.

    Returns:
        ``start_str`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: if ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: if ``start_str`` contains a character missing from ``char2idx``.
    """
    # Validate up front: an empty seed gives the model nothing to predict
    # from, and a zero or negative temperature would divide by zero or invert
    # the ranking of the logits.
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError(
            "temperature must be positive, got {!r}".format(temperature))

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[ch] for ch in start_str]
    input_eval = tf.expand_dims(input_eval, 0)

    generated = []
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = predictions / temperature
        # Drop the batch dimension: one row of logits per input position.
        predictions = tf.squeeze(predictions, 0)
        # Sample an id for every position and keep the one for the last position.
        prediction_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        generated.append(idx2char[prediction_id])
        # Only the newly sampled character is fed back in on the next step.
        input_eval = tf.expand_dims([prediction_id], 0)

    return start_str + ''.join(generated)

# Generate text from the restored model, seeded with "You ", and print it.
res = generate_text(model2, "You ")
print(res)