### preprocess original data

In [4]:
import re
import os

data_path = "../datas/wagahaiwa_nekodearu.txt"

# open files as binary data
bin_data = open(data_path, "rb")

lines = bin_data.readlines()
text = ""
for line in lines:
    tmp_text = line.decode("Shift_JIS")
    tmp_text = re.split(r'\r', tmp_text)[0]
    tmp_text = re.split(r'底本', tmp_text)[0]
    tmp_text = tmp_text.replace('|', '')
    tmp_text = re.sub(r'《.+?》','', tmp_text)
    tmp_text = re.sub(r'［＃.+?］','', tmp_text)
    text += tmp_text

os.makedirs('../processed_data/', exist_ok=True)
file = open('../processed_data/wagahai.txt', 'w', encoding='utf-8').write(text)

### prepare data for LSTM

In [22]:
import numpy as np

processed_data_path = '../processed_data/wagahai.txt'
bin_data = open(processed_data_path, "rb").read()
text = bin_data.decode("utf-8")
chars = sorted(list(set(text)))

char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

maxlen = 40
stride = 3

sentences = [] # training data
next_chars = [] # answer data
# for i in range(0, len(text)-maxlen, stride):
for i in range(0, 10000, stride):
    sentences.append(text[i:i+maxlen])
    next_chars.append(text[i+maxlen])

X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for j, char_ in enumerate(sentence):
        X[i, j, char_indices[char_]] = 1
    Y[i, char_indices[next_chars[i]]] = 1

### build network

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop

def simple_LSTM(input_shape, chars_num):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape))
    model.add(Dense(chars_num))
    model.add(Activation("softmax"))
    optimizer = RMSprop()
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    return model

### train network and generate sentence

In [36]:
import random
model = simple_LSTM(input_shape=X.shape[1:], chars_num=len(chars))
model.fit(X, Y, batch_size=128, verbose=1, epochs=10)
random_index = random.randint(0, len(sentences)-1)
sentence = sentences[random_index]
print("original sentence: ", sentence)
generated_sentence = sentence
for i in range(40):
    x = np.zeros((1,maxlen,len(chars)))
    for j, char_ in enumerate(sentence):
        x[0, j, char_indices[char_]] = 1
    preds = model.predict(x)[0]

    next_index = np.argmax(preds)
    next_char = indices_char[next_index]
    generated_sentence += next_char

    sentence = sentence[1:] + next_char
print("generated sentence: ", generated_sentence)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
original sentence:  した話をしよう。元来この主人は何といって人に勝れて出来る事もないが、何にでもよく
generated sentence:  した話をしよう。元来この主人は何といって人に勝れて出来る事もないが、何にでもよくのののののののののののののののののののののののののののののののののののののののの


In [28]:
%debug

> [0;32m/home/yaz/.local/lib/python3.5/site-packages/keras/engine/training_utils.py[0m(128)[0;36mstandardize_input_data[0;34m()[0m
[0;32m    126 [0;31m                        [0;34m': expected '[0m [0;34m+[0m [0mnames[0m[0;34m[[0m[0mi[0m[0;34m][0m [0;34m+[0m [0;34m' to have '[0m [0;34m+[0m[0;34m[0m[0m
[0m[0;32m    127 [0;31m                        [0mstr[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mshape[0m[0;34m)[0m[0;34m)[0m [0;34m+[0m [0;34m' dimensions, but got array '[0m[0;34m[0m[0m
[0m[0;32m--> 128 [0;31m                        'with shape ' + str(data_shape))
[0m[0;32m    129 [0;31m                [0;32mif[0m [0;32mnot[0m [0mcheck_batch_axis[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    130 [0;31m                    [0mdata_shape[0m [0;34m=[0m [0mdata_shape[0m[0;34m[[0m[0;36m1[0m[0;34m:[0m[0;34m][0m[0;34m[0m[0m
[0m
ipdb> quit
