In [7]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.optimizers import *
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
from konlpy.corpus import kolaw
from konlpy.tag import Okt

# Read the constitution text from KoNLPy's bundled law corpus. The context
# manager closes the file handle, which the original one-liner left open.
with kolaw.open('constitution.txt') as corpus_file:
    c = corpus_file.read()

# Split the raw text into sentences (sent_tokenize already returns a list).
# NOTE(review): `senstents` is a typo for "sentences"; the name is kept so any
# other code that refers to it keeps working.
senstents = sent_tokenize(c)

print(senstents[3])  # expected: 제2조① 대한민국의 국민이 되는 요건은 법률로 정한다.
print(len(senstents))  # 357


########## Preprocessing ###################
# POS-tag every sentence with Okt and rebuild it as a space-separated string
# of morphemes, dropping tokens tagged Number/Foreign and the morphemes
# "제" / "조" (the article markers seen in the sample sentence above).
twitter = Okt()
excluded_tags = ('Number', "Foreign")
excluded_words = ("제", "조")
# Reuse the sentence list computed earlier instead of re-tokenizing the corpus.
doc0 = [" ".join(w for w, t in twitter.pos(s)
                 if t not in excluded_tags and w not in excluded_words)
        for s in senstents]
print(len(doc0))  # 357
print(doc0[3])  # expected: 대한민국 의 국민 이 되는 요건 은 법률 로 정 한다 .

# Fit the word -> integer-index vocabulary on the morpheme-separated sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc0)

# Encode each sentence as a list of word indices and keep only the sentences
# that have at least two tokens.
encoded_sentences = tokenizer.texts_to_sequences(doc0)
doc = [seq for seq in encoded_sentences if len(seq) >= 2]
print(len(doc))  # 354
print(doc[3])  # [102, 1, 22, 5, 111, 653, 4, 9, 24, 13, 6]

# Longest input prefix is one token shorter than the longest sentence.
longest_sentence = max(len(seq) for seq in doc)
maxlen = longest_sentence - 1                # 187
# One extra slot on top of the known words (indices in word_index start at 1
# in the sample output above).
vocab_size = 1 + len(tokenizer.word_index)   # 1165

print(maxlen, vocab_size)  # 187 1165

############ Data Generation ##################
import numpy as np

def generate_data(X, maxlen, vocab_size):
    """Yield next-word training arrays, one pair per sentence.

    For a sentence of k tokens this produces k - 1 samples: sample i uses the
    first i tokens as input and token i as the target.

    Args:
        X: iterable of sentences, each an indexable sequence of integer word
            indices.
        maxlen: length every input prefix is padded to by ``pad_sequences``.
        vocab_size: number of classes passed to ``to_categorical``.

    Yields:
        ``(inputs, targets)`` where ``inputs`` is the padded prefix array and
        ``targets`` is the one-hot encoding of the token following each prefix.
    """
    for tokens in X:
        positions = range(1, len(tokens))
        prefixes = [tokens[0:pos] for pos in positions]
        next_tokens = [tokens[pos] for pos in positions]

        # One-hot encode the target word of every prefix.
        one_hot_targets = np_utils.to_categorical(next_tokens, vocab_size)
        # Bring every prefix to the common length `maxlen`.
        padded_prefixes = sequence.pad_sequences(prefixes, maxlen=maxlen)
        yield (padded_prefixes, one_hot_targets)

# Preview the arrays generated for the first three sentences.
for i, (x, y) in zip(range(3), generate_data(doc, maxlen, vocab_size)):
    print("i", i)
    print("x", x.shape, "\n", x)  # e.g. (12, 187)   [[  0   0   0 ...   0   0 102] [  0   0   0 ...   0 102   1]
    print("y", y.shape, "\n", y)  # e.g. (12, 1165) [[0. 1. 0. ... 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.]

# Collect the per-sentence arrays, then stack them into one training set.
X, Y = [], []
for x, y in generate_data(doc, maxlen, vocab_size):
    X.append(x)
    Y.append(y)

print(type(X))  # <class 'list'>
X = np.concatenate(X)
print(type(X))  # <class 'numpy.ndarray'>
print(X.shape)  # (6917, 187)

Y = np.concatenate(Y)
print(X.shape, Y.shape)


######## Model ###########
# Next-word classifier: 100-dimensional embeddings of the padded prefix, a
# single LSTM that returns only its final output, dropout, and a softmax over
# all `vocab_size` indices.
model = Sequential([
    Embedding(vocab_size, 100, input_length=maxlen),
    LSTM(100, return_sequences=False),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax'),
])
model.summary()

###### Training #########
# Categorical cross-entropy matches the one-hot targets in Y; RMSprop is used
# with its default settings and accuracy is tracked per epoch.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=["accuracy"])
# NOTE(review): the captured output below ends with a KeyboardInterrupt around
# epoch 130 of 500 (~26s per epoch), so this run never finished and `hist` was
# not assigned in that session.
hist = model.fit(X, Y, epochs=500, batch_size=800, verbose=2)

import matplotlib.pyplot as plt
# IPython magic: only valid inside a notebook/IPython session, not plain Python.
%matplotlib inline
# Plot the per-epoch training accuracy recorded by fit().
plt.plot(hist.history['accuracy'])
plt.show()

## Save the model ##
model_path = "rnn_text_gen.hdf5"
model.save(model_path)

## Load the model ##
# Rebind `model` to the copy restored from disk.
from keras.models import load_model
model = load_model(model_path)

###### predict_word #########
# Morpheme-separated probe sentence; prefixes of it are fed to the model.
word_list = '대한민국 의 국민 이 되는 요건 은 법률 로 정한 다 .'.split(" ")

# Inverse vocabulary: integer index -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}
# Encode the first two words and pad them to the model's input length.
x = sequence.pad_sequences([[tokenizer.word_index[w] for w in word_list[:2]]], maxlen=maxlen)
p = model.predict(x)[0]
# Vocabulary indices ordered from most to least probable.
idx = np.flip(np.argsort(p), 0)

for i in idx[:5]:
    # The softmax has vocab_size = len(word_index) + 1 outputs, so one index
    # (the padding slot 0) has no entry in reverse_word_map; fall back to a
    # placeholder instead of raising KeyError if it ranks in the top 5.
    print(reverse_word_map.get(i, "<PAD>"))

def predict_word(i, n=1):
    """Print the ``n`` most probable next words after the first ``i`` words of ``word_list``.

    Uses the module-level ``word_list``, ``tokenizer``, ``model``, ``maxlen``
    and ``reverse_word_map``. Besides the ranked candidates it also prints the
    raw probability vector and the sorted index arrays.

    Args:
        i: number of leading words of ``word_list`` used as the prefix.
        n: how many top candidates to print.

    Returns:
        None; all results are printed.

    Raises:
        KeyError: if a prefix word is missing from ``tokenizer.word_index``.
    """
    prefix = word_list[:i]
    x = sequence.pad_sequences([[tokenizer.word_index[w] for w in prefix]], maxlen=maxlen)
    p = model.predict(x)[0]
    print("p  ", p)
    # Sort once and reuse the result for both the ranking and the debug prints.
    ascending = np.argsort(p)
    idx = np.flip(ascending, 0)
    print("np.argsort(p)  ", ascending)
    print("np.flip(np.argsort(p), 0)", idx)

    for j in idx[:n]:
        # The softmax has one more output than there are words in the
        # vocabulary (the padding slot), so that index has no entry in
        # reverse_word_map; use a placeholder instead of raising KeyError.
        word = reverse_word_map.get(j, "<PAD>")
        print('"', " ".join(prefix), '"', word, " (p={:4.2f}%)".format(100 * p[j]))

# predict_word prints its own report and returns None, so wrapping the call in
# print() only added a stray "None" line after each report.
predict_word(1, n=3)
predict_word(2, n=3)
predict_word(3, n=3)

제2조 ① 대한민국의 국민이 되는 요건은 법률로 정한다.
357
357
대한민국 의 국민 이 되는 요건 은 법률 로 정 한다 .
354
[102, 1, 22, 5, 111, 653, 4, 9, 24, 13, 6]
187 1165
i 0
x (187, 187) 
 [[  0   0   0 ...   0   0 102]
 [  0   0   0 ...   0 102  28]
 [  0   0   0 ... 102  28 602]
 ...
 [  0   0 102 ... 647 155   2]
 [  0 102  28 ... 155   2  20]
 [102  28 602 ...   2  20 180]]
y (187, 1165) 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
i 1
x (6, 187) 
 [[  0   0   0 ...   0   0  45]
 [  0   0   0 ...   0  45 439]
 [  0   0   0 ...  45 439 648]
 [  0   0   0 ... 439 648 102]
 [  0   0   0 ... 648 102   4]
 [  0   0   0 ... 102   4 649]]
y (6, 1165) 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
i 2
x (12, 187) 
 [[  0   0   0 ...   0   0 102]
 [  0   0   0 ...   0 102   1]
 [  0   0   0 ... 102   1 440]
 ...
 [  

Epoch 112/500
 - 26s - loss: 3.0298 - accuracy: 0.3626
Epoch 113/500
 - 26s - loss: 2.9959 - accuracy: 0.3675
Epoch 114/500
 - 26s - loss: 2.9850 - accuracy: 0.3701
Epoch 115/500
 - 26s - loss: 2.9766 - accuracy: 0.3705
Epoch 116/500
 - 26s - loss: 2.9581 - accuracy: 0.3731
Epoch 117/500
 - 26s - loss: 2.9474 - accuracy: 0.3749
Epoch 118/500
 - 26s - loss: 2.9253 - accuracy: 0.3812
Epoch 119/500
 - 26s - loss: 2.9089 - accuracy: 0.3781
Epoch 120/500
 - 26s - loss: 2.8927 - accuracy: 0.3763
Epoch 121/500
 - 26s - loss: 2.8983 - accuracy: 0.3805
Epoch 122/500
 - 26s - loss: 2.8666 - accuracy: 0.3889
Epoch 123/500
 - 26s - loss: 2.8553 - accuracy: 0.3895
Epoch 124/500
 - 26s - loss: 2.8451 - accuracy: 0.3892
Epoch 125/500
 - 26s - loss: 2.8315 - accuracy: 0.3903
Epoch 126/500
 - 26s - loss: 2.8206 - accuracy: 0.3919
Epoch 127/500
 - 26s - loss: 2.7969 - accuracy: 0.3971
Epoch 128/500
 - 26s - loss: 2.7871 - accuracy: 0.3979
Epoch 129/500
 - 26s - loss: 2.7539 - accuracy: 0.4015
Epoch 130/

KeyboardInterrupt: 