In [3]:
import os, sys, re
import random
import string
import numpy as np
from pickle import dump, load
import tensorflow as tf
from collections import Counter
from keras.callbacks import LambdaCallback
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Dropout
from keras.optimizers import RMSprop
from keras.utils import np_utils
import keras.utils as ku 
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [4]:
def data_preprocess(filename):
    """ preprocess raw data
    """
    poetry = []
    with open(filename, "r") as f:
        text = f.readlines()
        for i, line in enumerate(text):
            poetry.append(''.join(line.strip().split(' ')))
            if i>10000: break
    dump(poetry, open('poetry.pkl','wb')) 
    return poetry


In [5]:
poetry = data_preprocess('chinese_poetry.txt')
print ("Total # of poems: ", len(poetry))
print ("Example: ", poetry[0])

Total # of poems:  10002
Example:  秦川雄帝宅函谷壮皇居绮殿千寻起离宫百雉馀连薨遥接汉飞观迥凌虚云日隐层阙风烟出绮疏岩廊罢机务崇文聊驻辇玉匣启龙图金绳披凤篆韦编断仍续缥帙舒还卷对此乃淹留欹案观坟典移步出词林停舆欣武宴雕弓写明月骏马疑流电惊雁落虚弦啼猿悲急箭阅赏诚多美于兹乃忘倦鸣笳临乐馆眺听欢芳节急管韵朱弦清歌凝白雪彩凤肃来仪玄鹤纷成列去兹郑卫声雅音方可悦芳辰追逸趣禁苑信多奇桥形通汉上峰势接云危烟霞交隐映花鸟自参差何如肆辙迹万里赏瑶池飞盖去芳园兰桡游翠渚萍间日彩乱荷处香风举桂楫满中川弦歌振长屿岂必汾河曲方为欢宴所落日双阙昏回舆九重暮长烟散初碧皎月澄轻素搴幌玩琴书开轩引云雾斜汉耿层阁清风摇玉树欢乐难再逢芳辰良可惜玉酒泛云罍兰殽陈绮席千钟合尧禹百兽谐金石得志重寸阴忘怀轻尺璧建章欢赏夕二八尽妖妍罗绮昭阳殿芬芳玳瑁筵佩移星正动扇掩月初圆无劳上悬圃即此对神仙以兹游观极悠然独长想披卷览前踪抚躬寻既往望古茅茨约瞻今兰殿广人道恶高危虚心戒盈荡奉天竭诚敬临民思惠养纳善察忠谏明科慎刑赏六五诚难继四三非易仰广待淳化敷方嗣云亭响


In [6]:
all_words = []
for poem in poetry:
    all_words += [word for word in poem]
counter = Counter(all_words)

counter_pairs = sorted(counter.items(), key=lambda s : -s[1])
print ("most frequent: ", counter_pairs[0], "least frequent: ", counter_pairs[-1])

words = [w[0] for w in counter_pairs] + [' ']
total_words = len(words)
print ('total_words:', total_words)

most frequent:  ('不', 6223) least frequent:  ('稂', 1)
total_words: 5664


In [7]:
# word to number, and poem to vector
word_dict = dict(zip(words, range(len(words))))
to_num = lambda w: word_dict.get(w, total_words)
Lpoetry = [list(map(to_num, poem)) for poem in poetry]
print (len(Lpoetry), Lpoetry[0])

10002 [258, 189, 475, 243, 917, 1674, 376, 603, 383, 260, 626, 356, 58, 378, 156, 172, 163, 145, 1858, 214, 170, 4037, 188, 455, 115, 45, 437, 929, 575, 323, 4, 3, 324, 1030, 370, 5, 124, 55, 626, 384, 420, 1675, 513, 778, 1546, 1022, 176, 809, 879, 918, 59, 1368, 1062, 114, 590, 43, 1799, 711, 216, 2719, 2387, 2495, 263, 641, 1463, 2720, 2537, 1092, 78, 525, 228, 36, 541, 1199, 198, 1899, 2097, 437, 1729, 1693, 497, 471, 55, 698, 88, 1101, 951, 1063, 447, 530, 810, 1047, 1205, 42, 14, 1632, 61, 459, 56, 1416, 287, 286, 48, 323, 487, 472, 440, 186, 632, 1187, 1929, 451, 849, 87, 387, 452, 441, 541, 523, 1322, 177, 1082, 142, 178, 712, 1355, 373, 283, 174, 305, 632, 668, 1655, 461, 487, 34, 85, 749, 22, 140, 507, 216, 863, 11, 1125, 515, 443, 401, 107, 558, 40, 441, 1562, 1253, 108, 1245, 539, 112, 80, 1656, 174, 868, 894, 880, 1093, 839, 619, 414, 87, 780, 612, 797, 313, 115, 13, 330, 860, 455, 4, 705, 124, 434, 372, 324, 402, 20, 123, 30, 730, 1003, 8, 37, 2201, 1826, 431, 50, 32, 451

In [8]:
def generate_padded_sequences(sequences):
    max_sequence_len = 5
    predictors, labels = [], []
    for line in sequences:
        for i in range(0, len(line)-max_sequence_len):
            seq_in = line[i:i+max_sequence_len]
            seq_ou = line[i+max_sequence_len]
            predictors.append(seq_in)
            labels.append(seq_ou)
    labels = np_utils.to_categorical(labels, num_classes=total_words)
    return np.array(predictors), labels, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(Lpoetry)
print (predictors[0], label[0])

[258 189 475 243 917] [0. 0. 0. ... 0. 0. 0.]


### RNN-LSTM architecture

In [9]:
def create_model(max_sequence_len, total_words):
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=max_sequence_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(50))
    model.add(Dropout(0.01))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 10)             56640     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5664)              288864    
Total params: 357,704
Trainable params: 357,704
Non-trainable params: 0
_________________________________________________________________


In [10]:
checkpoint = ModelCheckpoint('model.hdf5', monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(predictors, label, epochs=5, callbacks=[checkpoint], verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5

Epoch 00001: loss improved from inf to 7.15885, saving model to model.hdf5
Epoch 2/5

Epoch 00002: loss improved from 7.15885 to 6.87709, saving model to model.hdf5
Epoch 3/5

Epoch 00003: loss improved from 6.87709 to 6.67838, saving model to model.hdf5
Epoch 4/5

Epoch 00004: loss improved from 6.67838 to 6.55051, saving model to model.hdf5
Epoch 5/5

Epoch 00005: loss improved from 6.55051 to 6.46857, saving model to model.hdf5


<keras.callbacks.History at 0x10d7dbef0>

In [11]:
def generate_text(seed_text, model, max_sequence_len):
    text = seed_text
    n = len(seed_text)
    for _ in range(20-n):
        token_list = pad_sequences([seed_text], maxlen=max_sequence_len, padding='post')
        predict= model.predict(token_list, verbose=0)
        # greedy search without character duplicate
        while np.argmax(predict) in text:
            predict = np.delete(predict, np.argmax(predict))
        text += [np.argmax(predict)]
        seed_text = seed_text[1:] + [text[-1]]


    output_word = ""
    for s in text:
        for w in word_dict:
            if word_dict[w]== s:
                output_word += w
    return output_word, text

In [12]:
# load model
#model = load_model('weights-improvement-04-0.0016.hdf5')
from nltk.translate.bleu_score import sentence_bleu
for _ in range(100):
    #seed_text =  random.sample(range(1, 100),5)
    seed = random.randint(0,len(Lpoetry))
    try:
        seed_text = Lpoetry[seed][:3]
    except:
        pass
    p, t= generate_text(seed_text, model, max_sequence_len)
    score = sentence_bleu(Lpoetry, t)
    print ('BLEU-score:', score, '\n', p[:5],', ',p[5:10],'.\n', p[10:15],', ', p[15:],'.\n')

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-score: 5.285972420202032e-78 
 执宪随见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 九日重见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 九衢金见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.392813724759232e-78 
 神羊既见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.285972420202032e-78 
 何地早见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 山中燕见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 九九侍见去 ,  独还闻河草 .
 家相情谢发 ,  可别思宝青 .

BLEU-score: 5.172229091023959e-78 
 今朝腊见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 南都信见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.493657629723045e-78 
 薛公十见去 ,  可家闻独还 .
 相入发镜平 ,  思情谢知寄 .

BLEU-score: 5.285972420202032e-78 
 微穆敷见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.172229091023959e-78 
 东风香见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 何处送见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.172229091023959e-78 
 绝塞临见去 ,  还闻独安意 .
 相可情家青 ,  知

In [13]:
# load model
#model = load_model('weights-improvement-04-0.0016.hdf5')
from nltk.translate.bleu_score import sentence_bleu
for _ in range(100):
    #seed_text =  random.sample(range(1, 100),5)
    seed = random.randint(0,len(Lpoetry))
    try:
        seed_text = Lpoetry[seed][:3]
    except:
        pass
    p, t= generate_text(seed_text, model, max_sequence_len)
    score = sentence_bleu(Lpoetry, t)
    print ('BLEU-score:', score, '\n', p[:5],', ',p[5:10],'.\n', p[10:15],', ', p[15:],'.\n')

BLEU-score: 5.285972420202032e-78 
 儿扶犹见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 堂堂圣见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.285972420202032e-78 
 铿鸣钟见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.172229091023959e-78 
 我思仙见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 楚郭微见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.285972420202032e-78 
 苏武在见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 妇姑城见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 鸟雀知见去 ,  还闻独安意 .
 相可情家青 ,  向愁发鼓思 .

BLEU-score: 5.392813724759232e-78 
 君不闻见去 ,  还相家思独 .
 情安皆知河 ,  草可谢入暮 .

BLEU-score: 5.172229091023959e-78 
 日宫开见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.285972420202032e-78 
 初日明见去 ,  还闻独安意 .
 相可情家青 ,  知发谢白为 .

BLEU-score: 5.392813724759232e-78 
 青田白见去 ,  家还独闻相 .
 入情谢发知 ,  愁平宝可安 .

BLEU-score: 5.285972420202032e-78 
 得体纥见知 ,  去还闻独安 .
 意相可情家 ,  青向愁发鼓 .

BLEU-score: 5.285972420202032e-78 
 江村秋见去 ,  还闻独安意 .
 相可情家青 ,  知

### Transformer architecture

In [17]:
#! -*- coding: utf-8 -*-

from keras import backend as K
from keras.engine.topology import Layer

class Position_Embedding(Layer):
    
    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size #必须为偶数
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)
        
    def call(self, x):
        if (self.size == None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size,seq_len = K.shape(x)[0],K.shape(x)[1]
        position_j = 1. / K.pow(10000., \
                                 2 * K.arange(self.size / 2, dtype='float32' \
                               ) / self.size)
        position_j = K.expand_dims(position_j, 0)
        position_i = K.cumsum(K.ones_like(x[:,:,0]), 1)-1 #K.arange不支持变长，只好用这种方法生成
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)
        
    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.size)


class Attention(Layer):

    def __init__(self, nb_head, size_per_head, mask_right=False, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        self.mask_right = mask_right
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ', 
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK', 
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV', 
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)
        
    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len == None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:,0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape)-2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12
                
    def call(self, x):
        #如果只传入Q_seq,K_seq,V_seq，那么就不做Mask
        #如果同时传入Q_seq,K_seq,V_seq,Q_len,V_len，那么对多余部分做Mask
        if len(x) == 3:
            Q_seq,K_seq,V_seq = x
            Q_len,V_len = None,None
        elif len(x) == 5:
            Q_seq,K_seq,V_seq,Q_len,V_len = x
        #对Q、K、V做线性变换
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
        #计算内积，然后mask，然后softmax
        A = K.batch_dot(Q_seq, K_seq, axes=[3,3]) / self.size_per_head**0.5
        A = K.permute_dimensions(A, (0,3,2,1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0,3,2,1)) 
        if self.mask_right:
            ones = K.ones_like(A[:1, :1])
            mask = (ones - K.tf.matrix_band_part(ones, -1, 0)) * 1e12
            A = A - mask
        A = K.softmax(A)
        #输出并mask
        O_seq = K.batch_dot(A, V_seq, axes=[3,2])
        O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq
        
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)


In [21]:
from keras.models import Model
from keras.layers import *

def create_attention_model(max_sequence_len, total_words):

    S_inputs = Input(shape=(None,), dtype='int32')
    embeddings = Embedding(total_words, 20)(S_inputs)
    O_seq = Attention(8,16)([embeddings,embeddings,embeddings])
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.5)(O_seq)
    outputs = Dense(total_words, activation='sigmoid')(O_seq)
    model = Model(inputs=S_inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = create_attention_model(max_sequence_len, total_words)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 20)     113280      input_5[0][0]                    
__________________________________________________________________________________________________
attention_4 (Attention)         (None, None, 128)    7680        embedding_6[0][0]                
                                                                 embedding_6[0][0]                
                                                                 embedding_6[0][0]                
__________________________________________________________________________________________________
global_ave

In [23]:
checkpoint = ModelCheckpoint('model.hdf5', monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(predictors, label, epochs=1, callbacks=[checkpoint], verbose=1)

Epoch 1/1

Epoch 00001: loss improved from inf to 7.24415, saving model to model.hdf5


<keras.callbacks.History at 0x1d9dfbe128>

In [None]:
# load model
#model = load_model('weights-improvement-04-0.0016.hdf5')
from nltk.translate.bleu_score import sentence_bleu
for _ in range(100):
    #seed_text =  random.sample(range(1, 100),5)
    seed = random.randint(0,len(Lpoetry))
    try:
        seed_text = Lpoetry[seed][:3]
    except:
        pass
    p, t= generate_text(seed_text, model, max_sequence_len)
    score = sentence_bleu(Lpoetry, t)
    print ('BLEU-score:', score, '\n', p[:5],', ',p[5:10],'.\n', p[10:15],', ', p[15:],'.\n')

BLEU-score: 6.956730935411383e-78 
 水国叶不风 ,  日云山何人 .
 一来月天无 ,  中春长有相 .

BLEU-score: 6.956730935411383e-78 
 堂上不风山 ,  人天无云日 .
 何来一为有 ,  春水君花入 .

BLEU-score: 6.956730935411383e-78 
 镂碗传不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.956730935411383e-78 
 洛渚问不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.858028894990207e-78 
 北陆苍不风 ,  云日山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.956730935411383e-78 
 斗鸡初不风 ,  云日山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.956730935411383e-78 
 始入松不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.858028894990207e-78 
 桃花灼不人 ,  风日山天何 .
 云无来春一 ,  白月有时水 .

BLEU-score: 6.956730935411383e-78 
 甲乙遇不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.956730935411383e-78 
 歌谣数不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

BLEU-score: 6.956730935411383e-78 
 飞香走不风 ,  日云山何人 .
 一来月天无 ,  水春君中花 .

