In [1]:
import os, sys, re
import random
import string
import numpy as np
from pickle import dump, load
import tensorflow as tf
from collections import Counter
from keras.callbacks import LambdaCallback
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Dropout
from keras.optimizers import RMSprop
from keras.utils import np_utils
import keras.utils as ku 
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
def data_preprocess(filename):
    """ preprocess raw data
    """
    poetry = []
    with open(filename, "r") as f:
    text = f.readlines()
    for line in text:
        try:
            data = line.strip().split(":")
            if 5<len(data[-1])<50:
                poetry.append([data[0], ''.join(data[-1].split(','))])
        except:
            pass
    print ("Total # of poems: ", len(poetry))
    print ("Example:", poetry[300][1].split(',') )
    poetry = sorted(poetry, key=lambda s: len(s[1]))
    dump(poetry, open('poetry.pkl','wb')) 
    return poetry


In [3]:
poetry = data_preprocess('poetry.txt')
print (len(poetry), poetry[5000])


Total # of poems:  24106
Example: ['神功不测兮运阴阳包藏万宇兮孕八荒天符既出兮帝业昌愿临明祀兮降祯祥']
24106 ['故洛城古墙', '粉落椒飞知几春风吹雨洒旋成尘莫言一片危基在犹过无穷来往人']


In [4]:
all_words = []
for poem in poetry:
    all_words += [word for word in poem[1]]
print ('all words:', len(all_words))
counter = Counter(all_words)
counter_pairs = sorted(counter.items(), key=lambda s : -s[1])
print (counter_pairs[0], counter_pairs[-1])
words = [w[0] for w in counter_pairs] + [' ']
total_words = len(words)
print ('total_words:', total_words)

all words: 802021
('不', 8224) ('窬', 1)
total_words: 5557


In [5]:
# word to number, and poem to vector
word_dict = dict(zip(words, range(len(words))))

to_num = lambda w: word_dict.get(w, total_words)
Lpoetry = [list(map(to_num, poem[1])) for poem in poetry if poem[1]]
print (len(Lpoetry), Lpoetry[0])

24106 [1020, 49, 423, 25, 432]


In [9]:

def generate_padded_sequences(sequences):
    max_sequence_len = 5
    predictors, labels = [], []
    for line in sequences:
        for i in range(0, len(line)-max_sequence_len):
            seq_in = line[i:i+max_sequence_len]
            seq_ou = line[i+max_sequence_len]
            predictors.append(seq_in)
            labels.append(seq_ou)
    labels = np_utils.to_categorical(labels, num_classes=total_words)
    return np.array(predictors), labels, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(Lpoetry)
print (predictors[0], label[0])

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[  0  48 288 288 407] [0. 0. 0. ... 0. 0. 0.]


In [61]:
def create_model(max_sequence_len, total_words):
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=max_sequence_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(50))
    model.add(Dropout(0.01))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5, 10)             55570     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 5557)              283407    
Total params: 351,177
Trainable params: 351,177
Non-trainable params: 0
_________________________________________________________________


In [62]:
checkpoint = ModelCheckpoint('model.hdf5', monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(predictors, label, epochs=40, callbacks=[checkpoint], verbose=1)

Epoch 1/40

Epoch 00001: loss improved from inf to 6.95115, saving model to model.hdf5
Epoch 2/40

Epoch 00002: loss improved from 6.95115 to 6.62218, saving model to model.hdf5
Epoch 3/40

Epoch 00003: loss improved from 6.62218 to 6.39850, saving model to model.hdf5
Epoch 4/40

Epoch 00004: loss improved from 6.39850 to 6.28389, saving model to model.hdf5
Epoch 5/40

Epoch 00005: loss improved from 6.28389 to 6.21006, saving model to model.hdf5
Epoch 6/40

Epoch 00006: loss improved from 6.21006 to 6.15829, saving model to model.hdf5
Epoch 7/40

Epoch 00007: loss improved from 6.15829 to 6.11921, saving model to model.hdf5
Epoch 8/40

Epoch 00008: loss improved from 6.11921 to 6.09007, saving model to model.hdf5
Epoch 9/40

Epoch 00009: loss improved from 6.09007 to 6.06586, saving model to model.hdf5
Epoch 10/40

Epoch 00010: loss improved from 6.06586 to 6.04638, saving model to model.hdf5
Epoch 11/40

Epoch 00011: loss improved from 6.04638 to 6.02971, saving model to model.hdf5
E

<keras.callbacks.History at 0xb34bc4588>

In [78]:
def generate_text(seed_text, model, max_sequence_len):
    text = seed_text
    n = len(seed_text)
    for _ in range(20-n):
        token_list = pad_sequences([seed_text], maxlen=max_sequence_len, padding='post')
        predict= model.predict(token_list, verbose=0)
        # greedy search without character duplicate
        while np.argmax(predict) in text:
            predict = np.delete(predict, np.argmax(predict))
        text += [np.argmax(predict)]
        seed_text = seed_text[1:] + [text[-1]]


    output_word = ""
    for s in text:
        for w in word_dict:
            if word_dict[w]== s:
                output_word += w
    return output_word, text

In [79]:
# load model
#model = load_model('weights-improvement-04-0.0016.hdf5')
from nltk.translate.bleu_score import sentence_bleu
for _ in range(100):
    #seed_text =  random.sample(range(1, 100),5)
    seed = random.randint(0,len(Lpoetry))
    try:
        seed_text = Lpoetry[seed][:3]
    except:
        pass
    p, t= generate_text(seed_text, model, max_sequence_len)
    score = sentence_bleu(Lpoetry, t)
    print ('BLEU-score:', score, '\n', p[:5],', ',p[5:10],'.\n', p[10:15],', ', p[15:],'.\n')

BLEU-score: 5.493657629723045e-78 
 息亡身知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.392813724759232e-78 
 昔佩兵知孤 ,  得如心可地 .
 是节相见早 ,  楼在小清下 .

BLEU-score: 5.392813724759232e-78 
 火急欢知孤 ,  得如心可地 .
 是节相见早 ,  楼在小清下 .

BLEU-score: 5.766891913503738e-78 
 绿杨新知可 ,  清早孤在心 .
 下见得往归 ,  如隋寒树堪 .

BLEU-score: 5.589235336460897e-78 
 艰难王知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪有小 .

BLEU-score: 5.285972420202032e-78 
 挂瓢余知可 ,  心早如得清 .
 在孤有对自 ,  寒难金意小 .

BLEU-score: 5.392813724759232e-78 
 本来银知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.392813724759232e-78 
 自入华知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.493657629723045e-78 
 汉家遗知可 ,  孤在是清如 .
 心意依归得 ,  金外相见早 .

BLEU-score: 5.392813724759232e-78 
 考摭妍知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.589235336460897e-78 
 海客乘归知 ,  可孤早得在 .
 是地如见鸟 ,  长心苍外相 .

BLEU-score: 5.285972420202032e-78 
 驾言寻知可 ,  心早如得清 .
 在孤有对自 ,  寒难金意小 .

BLEU-score: 5.050444248673018e-78 
 高情推知寐 ,  如孤见归小 .
 心在亭清可 ,  地寒浮屋下 .

BLEU-score: 5.589235336460897e-78 
 闲琴开见得 ,  是知在清外 .
 心难可归如 ,  隋

In [75]:
# load model
#model = load_model('weights-improvement-04-0.0016.hdf5')
from nltk.translate.bleu_score import sentence_bleu
for _ in range(100):
    #seed_text =  random.sample(range(1, 100),5)
    seed = random.randint(0,len(Lpoetry))
    try:
        seed_text = Lpoetry[seed][:3]
    except:
        pass
    p, t= generate_text(seed_text, model, max_sequence_len)
    score = sentence_bleu(Lpoetry, t)
    print ('BLEU-score:', score, '\n', p[:5],', ',p[5:10],'.\n', p[10:15],', ', p[15:],'.\n')

BLEU-score: 5.766891913503738e-78 
 南行春知可 ,  清早孤在心 .
 下见得往归 ,  如隋寒树堪 .

BLEU-score: 5.589235336460897e-78 
 无生深见得 ,  是知在清外 .
 心难可归如 ,  隋孤苔红和 .

BLEU-score: 5.493657629723045e-78 
 长乐遥知孤 ,  得如心可地 .
 是节相见早 ,  楼在小清下 .

BLEU-score: 5.392813724759232e-78 
 垂杨拂知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.589235336460897e-78 
 去去何归知 ,  可孤早得在 .
 是地如见鸟 ,  长心苍外相 .

BLEU-score: 5.680147023197547e-78 
 雁门山归见 ,  知心可地是 .
 如得在醉过 ,  老眠寒清孤 .

BLEU-score: 5.493657629723045e-78 
 泣葬一知如 ,  可孤在是清 .
 称心意得枝 ,  平苍行见外 .

BLEU-score: 5.392813724759232e-78 
 望幸纡知孤 ,  得如心可地 .
 是节相见早 ,  楼在小清下 .

BLEU-score: 5.392813724759232e-78 
 窗影摇知孤 ,  得如心可地 .
 是节相见早 ,  楼在小清下 .

BLEU-score: 5.589235336460897e-78 
 笑看沧归见 ,  知心可地是 .
 如得在醉过 ,  老眠寒清孤 .

BLEU-score: 5.392813724759232e-78 
 晓禁苍知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.493657629723045e-78 
 晚渡邗归知 ,  可孤早得在 .
 是地如见鸟 ,  长心苍外相 .

BLEU-score: 5.493657629723045e-78 
 年年人知可 ,  地是见相心 .
 如归得清早 ,  孤竹堪难去 .

BLEU-score: 5.392813724759232e-78 
 孤吟望知可 ,  地是见相心 .
 如归得清早 ,  依