# 1、Bilstm中文分词实验
```
程序版本：第一版中文分词程序，双向lstm
数据集:
    训练集：msr_train_p3.txt（已打好标签）
    测试集：msr_test_gold_p3.txt（已打好标签）
运行环境：
    python3+Tensorflow 1.4+Keras 2.1.2
主要功能：
    LSTM神经网络、F1分数的计算、viterbi译码、early_stopping、打印模型图
```

## 0、初始化

#### 0.1 导入模块

In [1]:
# -*- coding:utf-8 -*-
# %matplotlib inline
import re
import copy
import pickle
import tensorflow as tf
import random
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.utils import np_utils
from keras.models import Model
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM,TimeDistributed
from keras.layers import Input, Bidirectional,Dropout,CuDNNLSTM,CuDNNGRU
from keras.optimizers import SGD,RMSprop,Adagrad,Adadelta,Adamax,Adam
from util_function import split_data_label,calculate_evaluation,calculate_evaluation_batch,data_label_to_word,cws_pre,cws_pre_batch
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
from keras.utils import plot_model

# Limit GPU memory usage: build a TensorFlow session with custom GPU options
# and install it as the session used by the Keras backend.
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # let the GPU allocation grow on demand
# Disabled alternative: cap the fraction of GPU memory this process may use.
#config.gpu_options.per_process_gpu_memory_fraction = 1.0
sess = tf.Session(config=config)
set_session(sess)  # make Keras use the session configured above

Using TensorFlow backend.


#### 0.2 参数设置

In [2]:
# Hyperparameters
embedding_feature = 128  # dimensionality of the embedding vectors
maxlen = 128     # maximum sentence length; longer sentences are dropped, shorter ones padded
epochs = 2     # number of training epochs
batch_size = 1024 # mini-batch size used for training and evaluation
# Base name for saved artifacts (used as the file name of the model diagram)
special = 'model_7'
# Other parameters
et_min_delta = 0.0001  # min_delta passed to the EarlyStopping callback
et_patience = 3  # patience passed to the EarlyStopping callback
TRAIN_PERC = 0.9 # fraction of the labelled data kept for training (rest is the dev set)

## 1、加载数据

In [3]:
def _read_lines(path):
    """Return every line of the text file at *path*, stripped of surrounding whitespace."""
    # Decode explicitly as UTF-8 so reading the Chinese corpus does not depend
    # on the platform's default locale encoding.
    # NOTE(review): assumes the pre-processed corpus files are UTF-8 encoded
    # (matching this notebook's coding declaration) -- confirm.
    with open(path, 'r', encoding='utf-8') as inputs:
        return [line.strip() for line in inputs]


# Raw labelled sentences, one per line.
s = _read_lines('./../data/pre_cws_data/msr_train_p3.txt')      # training set
t = _read_lines('./../data/pre_cws_data/msr_test_gold_p3.txt')  # test set

print("训练集句子数：",len(s))
print("测试集句子数：",len(t))

# Separate the characters from their labels for the training set ...
train_x,train_y = split_data_label(s)
# ... and for the test set.
test_x,test_y = split_data_label(t)

训练集句子数： 86918
测试集句子数： 3985


## 2、数据清洗

#### 2.1 超过maxlen长度的句子暂时滤掉

In [4]:
def _keep_short_sentences(sentences, labels, x_col, y_col):
    """Pair sentences with their labels and drop every pair whose sentence is longer than maxlen.

    Returns a DataFrame with columns *x_col* and *y_col*, re-indexed from 0.
    """
    frame = pd.DataFrame(index=range(len(sentences)))
    frame[x_col] = sentences
    frame[y_col] = labels
    within_limit = frame[x_col].apply(len) <= maxlen
    frame = frame[within_limit]
    frame.index = range(len(frame))  # renumber the surviving rows 0..n-1
    return frame


# Sentences longer than maxlen are filtered out for now.
train = _keep_short_sentences(train_x, train_y, 'train_x', 'train_y')
print('过滤后剩余的训练集句子数 = ',len(train))

test = _keep_short_sentences(test_x, test_y, 'test_x', 'test_y')
print('过滤后剩余的测试集句子数 = ',len(test))

# Back to plain Python lists for the following steps.
train_x = train['train_x'].tolist()
train_y = train['train_y'].tolist()

test_x = test['test_x'].tolist()
test_y = test['test_y'].tolist()

过滤后剩余的训练集句子数 =  85620
过滤后剩余的测试集句子数 =  3891


#### 2.2 重构标准的测试数据，与预测的做对比

In [5]:
# Rebuild the gold-standard segmentation of the test set so it can later be
# compared with the predicted segmentation.
# NOTE(review): the meaning of the third argument (0) is defined in
# util_function.data_label_to_word, which is not visible here -- confirm.
test_reorganization = data_label_to_word(test_x,test_y,0)
# Independent copy of the raw test sentences, taken before test_x is
# overwritten by the tokenization steps below.
test_x_origin = copy.deepcopy(test_x)   # original note: one-dimensional data
# Disabled alternative: flatten all sentences into a single list.
#for sen in test_x:
#    test_x_origin.extend(sen)

#### 2.3 数据Token化，汉字转为数字序号

In [6]:
def _space_separate(line):
    """Return the items of *line* concatenated, each one followed by a single space."""
    # A single join replaces the repeated `+=` concatenation, which copies the
    # growing string on every step.
    return ''.join(char + ' ' for char in line)


# Turn every sentence and every label sequence into a space-separated string
# (the form the Keras Tokenizer splits on). The lists are updated in place.
for dataset in (train_x, train_y, test_x, test_y):
    dataset[:] = [_space_separate(line) for line in dataset]

In [7]:
# Tokenization: map every character to an integer index.
# filters='' turns off the Tokenizer's default filter list (ASCII punctuation,
# tab, newline). With the default filters such characters are silently removed
# from the sentences but their labels are kept, which shifts every following
# character against its label.
# NOTE(review): if the corpus contains characters from the default filter list,
# the vocabulary size printed below grows accordingly.
tokenizer_x = Tokenizer(num_words=None, filters='')
tokenizer_x.fit_on_texts(np.concatenate((train_x,test_x),axis=0))
word_index = tokenizer_x.word_index  # dict: character -> index
index_word = {index: word for word, index in word_index.items()}  # indices start at 1

print("汉字个数：",len(word_index))

# The label vocabulary is fixed: s, b, m, e.
tokenizer_y = Tokenizer(num_words=None)
tokenizer_y.fit_on_texts(['s b m e'])
label_index = tokenizer_y.word_index

# Replace characters and labels by their integer indices.
train_x = tokenizer_x.texts_to_sequences(train_x)
train_y = tokenizer_y.texts_to_sequences(train_y)

test_x = tokenizer_x.texts_to_sequences(test_x)
test_y = tokenizer_y.texts_to_sequences(test_y)

print(label_index)

汉字个数： 5111
{'e': 4, 'm': 3, 's': 1, 'b': 2}


In [8]:
# The label indices produced above start at 1; shift them so they start at 0.
train_y = [[label - 1 for label in labels] for labels in train_y]
test_y = [[label - 1 for label in labels] for labels in test_y]

In [9]:
# Record the (unpadded) length of every test sentence.
len_test = [len(labels) for labels in test_y]

#### 2.4 为不够长的句子填充特征值，使得句子长度一致

In [10]:
def _pad_inputs_and_labels(inputs, labels):
    """Pad both sequences to maxlen (at the end) and one-hot encode the labels.

    Inputs are padded with the default value 0. Labels are padded with the
    extra class 4, giving 5 classes in total, and returned with shape
    (n_sentences, maxlen, 5).
    """
    padded_inputs = sequence.pad_sequences(inputs, maxlen=maxlen, padding='post')
    padded_labels = sequence.pad_sequences(labels, maxlen=maxlen, padding='post', value=4.)
    one_hot = to_categorical(padded_labels, num_classes=5)
    return padded_inputs, one_hot.reshape(-1, maxlen, 5)


# Training set: sentences shorter than maxlen are padded at the end.
train_x, train_y = _pad_inputs_and_labels(train_x, train_y)

# The test set is padded as well, otherwise to_categorical cannot be applied.
# If the effect on accuracy is in doubt, it can be verified later.
test_x, test_y = _pad_inputs_and_labels(test_x, test_y)

#### 2.5 按比例分配训练集与验证集，没有交叉验证时

In [11]:
# Keep a TRAIN_PERC share of the data for training and hold out the remaining
# (1 - TRAIN_PERC) share as the development (validation) set.
train_x, develop_x, train_y, develop_y = train_test_split(
    train_x, train_y, test_size=(1-TRAIN_PERC))
# Report the size of each resulting subset.
for subset in (train_x, train_y, develop_x, develop_y):
    print(len(subset))

77058
77058
8562
8562


## 3、模型设计

In [12]:
def create_network():
    """Build and compile the sequence-labelling network (Keras functional API).

    Architecture: embedding -> bidirectional LSTM (64 units) -> dropout (0.5)
    -> bidirectional LSTM (32 units) -> per-position softmax over 5 classes.
    Both bidirectional layers sum their forward and backward outputs.

    Returns:
        The compiled Keras Model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inputs = Input(shape=(maxlen,))
    # Index 0 is the padding value, so the embedding needs one slot more than
    # the vocabulary size and masks the zero entries.
    embedded = Embedding(
        len(word_index)+1,
        embedding_feature,
        input_length=maxlen,
        mask_zero=True,
    )(inputs)
    hidden = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(embedded)
    hidden = Dropout(0.5)(hidden)
    hidden = Bidirectional(LSTM(32, return_sequences=True), merge_mode='sum')(hidden)
    label_probs = Dense(5, activation='softmax')(hidden)

    model = Model(inputs=inputs, outputs=label_probs)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

network = create_network()
network.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 128)          654336    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 64)           98816     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 64)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128, 32)           24832     
_________________________________________________________________
dense_1 (Dense)              (None, 128, 5)            165       
Total params: 778,149
Trainable params: 778,149
Non-trainable params: 0
_________________________________________________________________


## 4、训练

In [13]:
# Early stopping: watch val_loss and stop once it has failed to improve by at
# least et_min_delta for et_patience epochs.
# NOTE(review): with the settings in this notebook (epochs=2, et_patience=3)
# the patience exceeds the number of epochs, so it cannot trigger -- confirm
# this is intended.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=et_min_delta,
    patience=et_patience,
    verbose=1,
    mode='auto',
)
# Train on the training split and validate on the development split.
history = network.fit(
    x=train_x,
    y=train_y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[early_stopping],
    validation_data=(develop_x, develop_y),
)

Train on 77058 samples, validate on 8562 samples
Epoch 1/2
Epoch 2/2


## 5、预测

#### 5.1 求出测试集的发射概率矩阵

In [14]:
# The test set used here is still padded; evaluating without the padding is
# something to try later.
score, acc = network.evaluate(x=test_x, y=test_y, batch_size=batch_size, verbose=2)

for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

Test score: 0.566791866562
Test accuracy: 0.805062028244


In [24]:
# Emission scores: log of the label probabilities predicted for the test set.
b = np.log(network.predict(test_x))
# Cut every sentence's score matrix back to the sentence's real length.
B = [sens[:len_test[i]] for i, sens in enumerate(b)]
# Simply taking the most probable label at each position is deliberately not
# done here: it can produce label sequences that are not valid, so the scores
# are decoded in the next step instead.

#### 5.2 维特比算法解码，得到合理化的标签序列 

In [16]:
# Decode the per-sentence emission scores into label sequences
# (decoder implemented in util_function.cws_pre_batch).
pre_y = cws_pre_batch(B) # obtain a valid label sequence with the Viterbi algorithm

#### 5.3 重构分词后的句子

In [17]:
# Rebuild the segmented sentences from the raw test sentences and the predicted
# labels, using the same helper call as for the gold-standard segmentation.
pre_reorganization = data_label_to_word(test_x_origin,pre_y,0)

#### 5.4 计算测试集的准确率、召回率、F1值

In [18]:
# 计算评估指标
F,P,R = calculate_evaluation_batch(test_reorganization,pre_reorganization)
print(F,P,R)

0.7872486953327857 0.789282024529209 0.785225815643254


#### 5.5 保存实验结果

In [19]:
# Disabled matplotlib snippet (plots y = x**2 and saves it to 'u.png'):
# x = np.arange(20)
# y = x**2
# plt.plot(x,y)
# plt.savefig('u.png')
# Write a diagram of the network architecture to '<special>.png'.
plot_model(network, to_file=special+'.png')