In [1]:
from pathlib import Path

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


# 数据读取

In [2]:
data_path = Path('/media/bnu/data/nlp-practice/text-generation/Winston_Churchil.txt')
with open(data_path) as f:
    raw_text = f.read()
raw_text = raw_text.lower()
print('Raw Text Sample:', raw_text[:50])
print('Raw Text Length:', len(raw_text))

Raw Text Sample: ﻿project gutenberg’s real soldiers of fortune, by 
Raw Text Length: 276830


In [3]:
# 对数据进行编码
char_list = sorted(list(set(raw_text)))
char_to_idx = {c: i for i, c in enumerate(char_list)}
idx_to_char = {i: c for i, c in enumerate(char_list)}

print('Number of Char:', len(char_list))
print('Char to Index:')
print(char_to_idx)

Number of Char: 61
Char to Index:
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, '@': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55, '‘': 56, '’': 57, '“': 58, '”': 59, '\ufeff': 60}


# 构造训练集

In [4]:
sequence_length = 100  # 训练集中的序列长度

x_train, y_train = [], []
for i in range(len(raw_text) - sequence_length):
    x_temp = [char_to_idx[c] for c in raw_text[i: i+sequence_length]]
    y_temp = char_to_idx[raw_text[i+sequence_length]]
    x_train.append(x_temp)
    y_train.append(y_temp)

print('Train Data Sample:')
print(x_train[10])
print(y_train[10])

Train Data Sample:
[50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44, 35, 1, 30, 43, 54, 44, 43, 34, 1, 30]
43


In [5]:
num_samples = len(x_train)
num_vocabs = len(char_list)

# 将x_train的形状修改为LSTM所需的
x_train = np.reshape(x_train, (num_samples, sequence_length, 1))
# 对x_train的数据进行简单归一化
x_train = x_train / float(num_vocabs)
# 对y进行one-hot编码
y_train = np_utils.to_categorical(y_train)

print('X Train Shape:', x_train.shape)
print('Y Train Shape:', y_train.shape)

X Train Shape: (276730, 100, 1)
Y Train Shape: (276730, 60)


# 构建和训练模型

In [6]:
model = Sequential()
model.add(LSTM(256, input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [8]:
model.fit(x_train, y_train, epochs=50, batch_size=4096, validation_split=0.1)

Train on 249057 samples, validate on 27673 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f4c3d1e66d0>

# 模型预测

In [36]:
def predict_next(input_array):
    """根据输入的文本特征预测下一个字符"""
    test_data = np.reshape(input_array, (1, sequence_length, 1))
    test_data = test_data / float(num_vocabs)
    return model.predict(test_data)

def string_to_index(input_string):
    """将文本中末尾的字符转换为特征"""
    return [char_to_idx[c] for c in input_string[-sequence_length:]]

def pred_to_char(pred):
    """根据预测值获取字符"""
    return idx_to_char[pred.argmax()]

def generate_text(init_string, steps=200):
    """根据初始字符串生成文本"""
    result = init_string.lower()
    for i in range(steps):
        c = pred_to_char(predict_next(string_to_index(result)))
        result += c
    return result

In [37]:
init_string = 'His object in coming to New York was to engage officers for that service. He came at an opportune moment'
result = generate_text(init_string)
print(result)

his object in coming to new york was to engage officers for that service. he came at an opportune moment of the cornin of the carine tf the cornin of the carine tf the cornin of the carine and the torer of the carine and the torer of the carine and the torer of the carine and the torer of the carine and
