In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd

## 读取数据


In [2]:
# Load the pre-segmented corpus: one utterance per line, with questions and
# answers alternating (question on even lines, answer on the following line).
with open('./Source_segment11.txt', 'r', encoding='utf-8') as text_Segment:
    # Strip the trailing newline (and any trailing whitespace) from every line.
    text_Segment_list = [n.rstrip() for n in text_Segment]

# An odd number of lines means at least one question has no answer. Building
# the DataFrame below would then fail with an unrelated "arrays must be of the
# same length" error, so stop here with the explicit message instead.
if len(text_Segment_list) % 2 != 0:
    raise ValueError("文本库数据有误 对话不对称 请检查！")
print('对话内容总数：', len(text_Segment_list))

X = text_Segment_list[0::2]  # input questions (even lines)
Y = text_Segment_list[1::2]  # output answers (odd lines)

lines = pd.DataFrame({"input": X, "output": Y})
lines.head()

对话内容总数： 4384


Unnamed: 0,input,output
0,早安,早安 昨天晚上 睡得 好 吗
1,睡得 很 好,真是 不错 那 赶快 去 享用 美味 的 早餐 吧
2,早安,早安 昨天晚上 睡得 好 吗
3,睡得 好 啊,真棒 需要 去 外面 走走 做 早晨 运动 吗
4,好 啊 感觉 不错,那 记得 不要 做 太 激烈 的 运动 唷


## 数据填充

In [3]:
# ---- encoder -------------------------------------------------------------
# Questions are already space-segmented, so the Keras tokenizer only has to
# split on whitespace and assign integer ids.
input_lines = list(lines.input)

input_tokenizer = preprocessing.text.Tokenizer()
input_tokenizer.fit_on_texts(input_lines)
tokenized_input_lines = input_tokenizer.texts_to_sequences(input_lines)

len_list = [len(token_line) for token_line in tokenized_input_lines]
max_len = np.array(len_list).max()
print( 'Input max length is {}'.format( max_len ))

# Right-pad every question with 0 up to the longest question.
padded_input_lines = preprocessing.sequence.pad_sequences(tokenized_input_lines, maxlen=max_len, padding='post')
encoder_input_data = np.array(padded_input_lines)
print( 'Encoder input data shape -> {}'.format( encoder_input_data.shape ))

input_word_dict = input_tokenizer.word_index
num_input_tokens = len(input_word_dict) + 1  # +1 because id 0 is reserved for padding
print( 'Number of Input tokens = {}'.format( num_input_tokens))


# ---- decoder -------------------------------------------------------------
# Answers are wrapped in start/end markers. The inference cell looks these
# markers up as 'start' and 'end' in output_word_dict.
output_lines = ['<START> ' + line + ' <END>' for line in lines.output]

# A separate tokenizer object is used so the encoder tokenizer is not lost.
output_tokenizer = preprocessing.text.Tokenizer()
output_tokenizer.fit_on_texts(output_lines)
tokenized_output_lines = output_tokenizer.texts_to_sequences(output_lines)
# Backward-compatible alias: the original cell left `tokenizer` bound to the
# decoder-side tokenizer.
tokenizer = output_tokenizer

length_list = [len(token_seq) for token_seq in tokenized_output_lines]
max_output_length = np.array( length_list ).max()
print( 'Output max length is {}'.format( max_output_length ))

padded_output_lines = preprocessing.sequence.pad_sequences(tokenized_output_lines, maxlen=max_output_length, padding='post')
decoder_input_data = np.array(padded_output_lines)
print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))

output_word_dict = output_tokenizer.word_index
num_output_tokens = len(output_word_dict) + 1  # +1 because id 0 is reserved for padding
# The original label said "Input tokens" here although this is the decoder vocabulary.
print( 'Number of Output tokens = {}'.format( num_output_tokens))


# ---- target --------------------------------------------------------------
# The target is the decoder input shifted left by one step (first token
# dropped), so at each position the model is trained to predict the next token.
shifted_output_lines = [token_seq[1:] for token_seq in tokenized_output_lines]

padded_target_lines = preprocessing.sequence.pad_sequences(shifted_output_lines, maxlen=max_output_length, padding='post')
# NOTE(review): the one-hot tensor has shape (samples, max_output_length,
# num_output_tokens); for the shape printed below this cell that is a
# multi-gigabyte array -- confirm it fits in host memory.
onehot_output_lines = utils.to_categorical(padded_target_lines, num_output_tokens)
decoder_target_data = np.array(onehot_output_lines)
print( 'Decoder target data shape -> {}'.format( decoder_target_data.shape ))


Input max length is 20
Encoder input data shape -> (2192, 20)
Number of Input tokens = 1616
Output max length is 132
Decoder input data shape -> (2192, 132)
Number of Input tokens = 3718
Decoder target data shape -> (2192, 132, 3718)


## 搭建模型

In [4]:
# ---- encoder -------------------------------------------------------------
# Variable-length sequence of token ids -> embeddings -> LSTM.
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( num_input_tokens, 256 , mask_zero=True ) (encoder_inputs)
# return_sequences=True is required so that `encoder_outputs` holds one vector
# per encoder time step. The Attention layer below documents its `value` input
# as (batch, Tv, dim); without return_sequences the encoder only returned its
# final output of shape (batch, 256), so there was no time axis to attend over.
encoder_lstm = tf.keras.layers.LSTM( 256 , return_state=True , return_sequences=True , recurrent_activation = 'sigmoid',dropout=0.2)
encoder_outputs , state_h , state_c = encoder_lstm( encoder_embedding )
# The final hidden/cell state seeds the decoder.
encoder_states = [ state_h , state_c ]

# ---- decoder -------------------------------------------------------------
decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_output_tokens, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 256 , return_state=True , recurrent_activation = 'sigmoid',return_sequences=True,dropout=0.2)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states)

# ---- attention -----------------------------------------------------------
# Dot-product attention: decoder steps are the queries, encoder steps the values.
attention = tf.keras.layers.Attention(name='attention_layer')
attention_output = attention([decoder_outputs,encoder_outputs])

# Concatenate each decoder output with its attention context before the
# per-step softmax over the output vocabulary.
decoder_concat = tf.keras.layers.Concatenate(axis=-1, name='concat_layer')
decoder_concat_input = decoder_concat([decoder_outputs, attention_output])

decoder_dense = tf.keras.layers.Dense( num_output_tokens , activation=tf.keras.activations.softmax )
output = decoder_dense ( decoder_concat_input )

# Training model: (question ids, shifted answer ids) -> next-token distribution.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy',metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    413696      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    951808      ['input_2[0][0]']                
                                                                                              

In [7]:
DEFAULT_BATCH_SIZE = 32
DEFAULT_EPOCH = 200

import random
def generate_batch_data_random(x1, x2, y, batch_size):
    """Yield randomly chosen mini-batches forever.

    Batches are handed out one at a time so that only a single batch has to
    be transferred to the device per step, lowering device memory usage.

    Args:
        x1: Sliceable sequence of first-input samples (encoder input).
        x2: Sliceable sequence of second-input samples (decoder input).
        y: Sliceable sequence of targets; its length defines the sample count.
        batch_size: Number of samples per batch; must be positive.

    Yields:
        A pair ``([x1_batch, x2_batch], y_batch)``. The data is cut into
        consecutive windows of ``batch_size`` samples and one window is picked
        uniformly at random on every iteration. The last window may be shorter
        when the sample count is not a multiple of ``batch_size``.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is not positive or
            ``y`` is empty.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    sample_count = len(y)
    if sample_count == 0:
        raise ValueError("cannot generate batches from empty data")

    # Ceiling division: a trailing partial window counts as a batch too.
    # The original used random.randint(0, sample_count // batch_size), whose
    # upper bound is inclusive and therefore produced an EMPTY batch whenever
    # sample_count was an exact multiple of batch_size.
    num_batches = -(-sample_count // batch_size)
    while True:
        i = random.randrange(num_batches)
        start = i * batch_size
        stop = start + batch_size
        yield [x1[start:stop], x2[start:stop]], y[start:stop]

In [8]:
# The generator is infinite, so Keras needs steps_per_epoch to know where an
# epoch ends: use the number of full batches that fit in the training set.
train_num_batches = len(encoder_input_data) // DEFAULT_BATCH_SIZE
train_batches = generate_batch_data_random(
    encoder_input_data, decoder_input_data, decoder_target_data, DEFAULT_BATCH_SIZE
)
# `batch_size` is deliberately NOT passed to fit(): the generator already
# yields ready-made batches, and the Keras documentation says not to specify
# batch_size for generator input.
model.fit(train_batches, steps_per_epoch=train_num_batches, epochs=DEFAULT_EPOCH)
# Persist the trained model; the inference cell reloads weights from this file.
model.save( 'model.h5' )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [9]:
#model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=64, epochs=250) 
#model.save( 'model.h5' ) 

In [10]:
def make_inference_model():
    """Build the encoder and decoder models used for step-by-step decoding.

    The function reuses the layers and tensors created by the training graph
    (``encoder_lstm``, ``encoder_embedding``, ``encoder_inputs``,
    ``encoder_states``, ``decoder_lstm``, ``decoder_embedding``,
    ``decoder_inputs``, ``attention``, ``decoder_concat``, ``decoder_dense``),
    so both returned models share the trained weights.

    Returns:
        A pair ``(encoder_model, decoder_model)``:

        * ``encoder_model`` maps question token ids to
          ``[encoder_outputs, [state_h, state_c]]``.
        * ``decoder_model`` maps ``[decoder token ids, encoder_outputs,
          state_h, state_c]`` to ``[token probabilities, state_h, state_c]``.
    """
    # Re-apply the shared encoder LSTM; only its output tensor is needed here,
    # the states exposed by the encoder model are the training-graph ones.
    enc_sequence, _, _ = encoder_lstm(encoder_embedding)
    encoder_model = tf.keras.models.Model(encoder_inputs, [enc_sequence, encoder_states])

    # Placeholders through which the caller feeds the previous LSTM state.
    state_h_in = tf.keras.layers.Input(shape=(256,))
    state_c_in = tf.keras.layers.Input(shape=(256,))
    decoder_state_inputs = [state_h_in, state_c_in]

    # One decoder pass seeded with the supplied state.
    dec_sequence, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_state_inputs
    )

    # Same attention -> concat -> softmax head as in the training graph.
    context = attention([dec_sequence, enc_sequence])
    token_probs = decoder_dense(decoder_concat([dec_sequence, context]))

    decoder_model = tf.keras.models.Model(
        [decoder_inputs, enc_sequence] + decoder_state_inputs,
        [token_probs] + [next_h, next_c],
    )
    return encoder_model, decoder_model
enc_model , dec_model = make_inference_model()

In [11]:
import jieba
def str_to_token (sentence: str):
    """Convert a raw user sentence into a padded encoder input.

    The sentence is lower-cased, stripped, segmented with jieba, and every
    segment is mapped to its id in ``input_word_dict``. The id list is then
    right-padded with zeros to ``max_len``.

    Args:
        sentence: Raw text typed by the user.

    Returns:
        The result of ``pad_sequences`` for a single sequence, i.e. one row of
        ``max_len`` token ids.

    Raises:
        KeyError: If a segment is not in the training vocabulary. The chat
            loop relies on this to answer that it did not understand.
    """
    words = jieba.cut(sentence.lower().strip())
    # jieba emits whitespace runs as their own segments. They can never be in
    # the vocabulary, so they used to make every sentence containing a space
    # fail with KeyError; skip them instead.
    token_list = [input_word_dict[word] for word in words if word.strip()]
    return preprocessing.sequence.pad_sequences([token_list], maxlen=max_len, padding='post')


In [12]:

model.load_weights("./model.h5")

# Reverse vocabulary (id -> word), built once instead of scanning
# output_word_dict on every decoding step.
output_index_to_word = {index: word for word, index in output_word_dict.items()}


def generate_reply(sentence):
    """Greedily decode the bot's answer to one user sentence.

    Args:
        sentence: Raw user text; it is tokenised with ``str_to_token``.

    Returns:
        The decoded words joined without separators (the 'end' marker is not
        included).

    Raises:
        KeyError: Propagated from ``str_to_token`` for out-of-vocabulary input.
    """
    # Local names are used on purpose so the global `encoder_outputs` tensor
    # of the training graph is not overwritten by a NumPy array.
    encoder_seq, states_values = enc_model.predict(str_to_token(sentence), verbose=0)
    states_values = list(states_values)

    # Decoding starts from the 'start' marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_word_dict['start']

    reply_words = []
    # Hard cap on the number of steps: the original loop only counted emitted
    # words and could spin forever if the padding id (0) kept being predicted.
    for _ in range(max_output_length):
        dec_outputs, h, c = dec_model.predict([target_seq, encoder_seq] + states_values, verbose=0)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        sampled_word = output_index_to_word.get(sampled_word_index)
        # Stop on the end marker, or when an id without a word (padding) wins.
        if sampled_word is None or sampled_word == 'end':
            break
        reply_words.append(sampled_word)

        # Feed the chosen token and the updated state back in.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]
    return ''.join(reply_words)


for _ in range( encoder_input_data.shape[0] ):
    try:
        decoded_translation = generate_reply( input( 'User: ' ) )
    except EOFError:
        # No more input available: leave the chat loop.
        break
    except Exception:
        # Best effort: typically a KeyError for an out-of-vocabulary word.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        decoded_translation = '对不起，我没有听懂。'
    print( "Bot:" + decoded_translation )
    print()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\taki\AppData\Local\Temp\jieba.cache
Loading model cost 0.723 seconds.
Prefix dict has been built successfully.


你好
Bot:非常感谢你

你
在
干嘛
Bot:对不起，我没有听懂。

我
呸
Bot:对不起，我没有听懂。

hi
Bot:对不起，我没有听懂。

hello
Bot:对不起，我没有听懂。

我
想
你
Bot:你当你穿越一个bug相对你会得到什么

ai
是
什么
Bot:历史：政治经济军事事件随着时间的推移和人的黎明ai时代的进程

天气
怎么样
Bot:对不起，我没有听懂。

天气
好
热
Bot:多补充水分和多休息吧

天气
不好
Bot:有favoritestoryis2001

我爱你
Bot:我也爱你feel机器人

我爱你
Bot:我也爱你feel机器人

Bot:对不起，我没有听懂。

