In [8]:
# -*- coding: utf-8 -*-
"""
Created on Fri May  4 21:21:56 2018

@author: shen1994
"""

import os
import argparse
import gensim
import pickle
import logging

logger = logging.getLogger(__name__)

from log import setUpLogger

from data_create import create_label_data

from data_preprocess import DataPreprocess

from data_generate import generate_batch

from bilstm_cnn_crf import bilstm_cnn_crf

from keras.callbacks import ModelCheckpoint

from paths import TrainPath

In [9]:
# Run configuration. The options are declared argparse-style but parsed from an
# explicit empty argument list, so the defaults below are the values in effect.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--corpus_path",
    type=str,
    default="/home/jovyan/shared/corpus/2014/",
    help="corpus path",
)
parser.add_argument(
    "--batch_size",
    type=int,
    default=256,
    help="batch size",
)
parser.add_argument(
    "--epochs",
    type=int,
    default=3,
    help="epochs",
)
parser.add_argument(
    "--train_dir",
    type=str,
    default="/home/jovyan/shared/",
    help="train directory",
)
args = parser.parse_args(args=[])

# Expose the parsed options under the short names used by the later cells.
corpus_path, batch_size, epochs = args.corpus_path, args.batch_size, args.epochs

# Project helpers built from the training directory: path registry, logging
# setup, then the preprocessing object (kept in this order).
trainPath = TrainPath(args.train_dir)
setUpLogger(trainPath)
dataPreprocess = DataPreprocess(trainPath)

In [10]:
# Step 1 (per the log text): load the word-vector model.
logger.info(
    "step-1--->" + u"加载词向量模型" + "--->START"
)
embedding_model = gensim.models.Word2Vec.load(
    trainPath.model_vector_path
)

# Dictionary derived from the loaded model by the preprocessing helper;
# it is consumed later by create_lexicon().
word_dict = dataPreprocess.create_useful_words(embedding_model)

# Dimensionality of the loaded vectors, reused when building the model.
embedding_size = embedding_model.vector_size

In [4]:
# Display the corpus directory chosen in the configuration cell.
corpus_path

'/home/jovyan/shared/corpus/2014/'

In [11]:
# Step 2 (translation of the log text): convert the corpus format and add
# annotations to produce the standard label file.
logger.info("step-2--->" + u"语料格式转换,加标注生成标准文件" + "--->START")

def path_flatten(path, includes=('.txt',)):
    """Recursively collect the files under ``path`` that match ``includes``.

    Args:
        path: Directory to walk.
        includes: Filename suffixes to keep (for example ``'.txt'``), given as
            an iterable of strings or a single string. The special entry
            ``'any'`` keeps every regular file regardless of its suffix.

    Returns:
        A list of file paths built by joining ``path`` with each matching
        entry. Entries appear in ``os.listdir`` order, sub-directories are
        expanded in place, and every file is listed at most once.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(includes, str):
        includes = (includes,)
    suffixes = tuple(includes)
    keep_all = 'any' in suffixes

    paths = []
    for subpath in os.listdir(path):
        relpath = os.path.join(path, subpath)
        if os.path.isdir(relpath):
            # Recurse exactly once per directory. Doing this inside a loop over
            # the suffixes would walk the subtree once per suffix and return
            # duplicated paths.
            paths += path_flatten(relpath, includes=suffixes)
        elif os.path.isfile(relpath) and (keep_all or relpath.endswith(suffixes)):
            # str.endswith accepts a tuple, so one test covers every suffix and
            # a file matching several of them is still appended only once.
            paths.append(relpath)
    return paths
# Gather the corpus files under corpus_path using path_flatten's default filter.
flated_paths = path_flatten(corpus_path)

In [12]:
# Peek at the first collected file as a sanity check on the directory walk.
flated_paths[0]
# NOTE(review): the call below is commented out, so running the notebook as-is
# never invokes create_label_data; presumably its output already exists on disk
# from an earlier run. Confirm before executing on a fresh training directory.
# create_label_data(trainPath, word_dict, flated_paths)

'/home/jovyan/shared/corpus/2014/0101/c1002-23995935.txt'

In [30]:
# Step 3 (per the log text): store the corpus split on punctuation or spaces.
logger.info(
    "step-3--->" + u"按标点符号或是空格存储文件" + "--->START"
)
# The returned value is later divided by batch_size to size steps_per_epoch.
documents_length = dataPreprocess.create_documents()

In [31]:
# Display the value returned by dataPreprocess.create_documents().
documents_length

2531574

In [12]:
# Step 4 (per the log text): count and sort the corpus words to build an index.
logger.info(
    "step-4--->" + u"对语料中的词统计排序生成索引" + "--->START"
)
# create_lexicon returns a pair; the second element feeds create_embedding()
# and the first feeds create_matrix() in the later steps.
(lexicon, lexicon_reverse) = dataPreprocess.create_lexicon(word_dict)

In [13]:
# Step 5 (per the log text): create word vectors for all words.
logger.info(
    "step-5--->" + u"对所有的词创建词向量" + "--->START"
)
# Returns the vocabulary size and the weight matrix later handed to the
# model's embedding layer.
(useful_word_length, embedding_weights) = dataPreprocess.create_embedding(
    embedding_model,
    embedding_size,
    lexicon_reverse,
)

In [14]:
# Step 6 (per the log text): generate the labels and their indices.
logger.info(
    "step-6--->" + u"生成标注以及索引" + "--->START"
)
label_2_index = dataPreprocess.create_label_index()
# Number of distinct labels; used as the class count of the model output.
label_2_index_length = len(label_2_index)

In [21]:
# Step 7 (per the log text): index-encode every corpus sentence and its labels.
logger.info(
    "step-7--->" + u"将语料中每一句和label进行索引编码" + "--->START"
)
# Called for its side effects only; the return value is not used.
dataPreprocess.create_matrix(
    lexicon,
    label_2_index,
)

In [16]:
# Step 8 (per the log text): bring every sentence and label sequence to the
# maximum length, zero-padding the shorter ones.
logger.info(
    "step-8--->" + u"将语料中每一句和label以最大长度统一长度,不足补零" + "--->START"
)

# NOTE(review): the length is hard-coded and both helper calls are disabled.
# 306 is presumably the value maxlen_2d_list() returned on an earlier run, and
# the padded files presumably already exist on disk; confirm before reuse.
# max_len = dataPreprocess.maxlen_2d_list()
max_len = 306

# dataPreprocess.padding_sentences(max_len)

In [23]:
# -*- coding: utf-8 -*-
"""
Created on Fri May  4 10:18:27 2018

@author: shen1994
"""

from keras.layers import Input
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers import ZeroPadding1D
from keras.layers import Conv1D
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Concatenate

from keras_contrib.layers import CRF

from keras.models import Model


def bilstm_cnn_crf(
    maxlen,
    useful_word_len,
    class_label_count,
    embedding_size,
    embedding_weights=None,
    is_train=True,
):
    """Build and compile the BiLSTM + CNN + CRF sequence-labelling model.

    The word indices are embedded once and fed to two parallel branches, a
    bidirectional LSTM and a 1-D convolution. Their per-timestep outputs are
    concatenated on the feature axis, projected to ``class_label_count`` scores
    and decoded by a CRF layer.

    Args:
        maxlen: Length of every input sequence (shape of the input is
            ``(maxlen,)`` of int32 indices).
        useful_word_len: Number of rows of the embedding table.
        class_label_count: Number of output labels.
        embedding_size: Dimensionality of the word embedding.
        embedding_weights: Optional initial embedding matrix. It is only used
            when ``is_train`` is true; when it is ``None`` the embedding keeps
            its default initialisation.
        is_train: When false the embedding is created without initial weights
            (presumably because trained weights are loaded afterwards;
            confirm against the inference code).

    Returns:
        The compiled Keras ``Model`` (CRF loss, Adam optimizer, CRF accuracy).
    """
    word_input = Input(shape=(maxlen,), dtype="int32", name="word_input")

    embedding_kwargs = {
        "output_dim": embedding_size,
        "input_length": maxlen,
        "name": "word_emb",
    }
    # Passing weights=[None] makes the layer fail when it is built, so only
    # forward the matrix when one was actually supplied.
    if is_train and embedding_weights is not None:
        embedding_kwargs["weights"] = [embedding_weights]
    word_emb = Embedding(useful_word_len, **embedding_kwargs)(word_input)

    # bilstm branch: 64 units per direction, then a per-timestep projection
    bilstm = Bidirectional(LSTM(64, return_sequences=True))(word_emb)
    bilstm_drop = Dropout(0.1)(bilstm)
    bilstm_dense = TimeDistributed(Dense(embedding_size))(bilstm_drop)

    # cnn branch: padding half_window_size zeros on both ends followed by a
    # "valid" convolution of width 2 * half_window_size + 1 keeps the sequence
    # length at maxlen, so both branches can be concatenated per timestep.
    half_window_size = 2
    filter_kernel_number = 64
    padding_layer = ZeroPadding1D(padding=half_window_size)(word_emb)
    conv = Conv1D(
        filters=filter_kernel_number,
        kernel_size=2 * half_window_size + 1,
        padding="valid",
    )(padding_layer)
    conv_drop = Dropout(0.1)(conv)
    conv_dense = TimeDistributed(Dense(filter_kernel_number))(conv_drop)

    # merge the two branches on the feature axis
    rnn_cnn_merge = Concatenate(axis=2)([bilstm_dense, conv_dense])
    dense = TimeDistributed(Dense(class_label_count))(rnn_cnn_merge)

    # crf
    crf = CRF(class_label_count, sparse_target=False)
    crf_output = crf(dense)

    # model
    model = Model(inputs=[word_input], outputs=crf_output)
    model.compile(loss=crf.loss_function, optimizer="adam", metrics=[crf.accuracy])

    return model


In [26]:
# Step 9 (per the log text): create the model.
logger.info("step-9--->" + u"模型创建" + "--->START")

# The embedding table is sized useful_word_length + 2; the two extra rows are
# presumably reserved indices (e.g. padding / unknown). TODO confirm against
# DataPreprocess.create_embedding.
model = bilstm_cnn_crf(
    max_len,
    useful_word_length + 2,
    label_2_index_length,
    embedding_size,
    embedding_weights,
)
# Fixed the "setp" typo in the step tag so the log lines stay greppable by "step-".
logger.info("step-9.1--->" + "加载模型" + "--->START")
# NOTE(review): the message above announces loading a model, but restoring the
# checkpoint is currently disabled, so training starts from fresh weights.
# model.load_weights(trainPath.checkpoints_path)
model.summary()



Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 306)          0                                            
__________________________________________________________________________________________________
word_emb (Embedding)            (None, 306, 128)     833280      word_input[0][0]                 
__________________________________________________________________________________________________
zero_padding1d_6 (ZeroPadding1D (None, 310, 128)     0           word_emb[0][0]                   
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 306, 128)     98816       word_emb[0][0]                   
____________________________________________________________________________________________

In [33]:
# -*- coding: utf-8 -*-
"""
Created on Mon May  7 13:55:07 2018

@author: shen1994
"""

import codecs

import numpy as np

from fake_keras import to_categorical


def generate_batch(trainPath, batch_size=None, label_class=None):
    """Endlessly yield ``(X, Y)`` batches read from the padded index files.

    Each pass reads ``trainPath.data_index_padding_path`` and
    ``trainPath.label_index_padding_path`` line by line in lockstep and parses
    every line as whitespace-separated integers. When either file is exhausted
    both are closed and reopened, so the generator never finishes on its own.
    Samples left over at the end of a pass are carried into the first batch of
    the next pass.

    Args:
        trainPath: Object exposing ``data_index_padding_path`` and
            ``label_index_padding_path``.
        batch_size: Number of samples per yielded batch; must be positive.
        label_class: Class count forwarded to ``to_categorical``.

    Yields:
        ``(X_ARRAY, Y_CLASS)``: ``np.array`` of the index rows, and the output
        of ``to_categorical`` reshaped to ``(batch, row_length, -1)``.

    Raises:
        ValueError: If ``batch_size`` is missing or not positive, or if a full
            pass over the files produces no lines (either would otherwise make
            the generator spin forever without yielding).
    """
    if batch_size is None or batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    X = []
    Y = []

    while True:

        lines_read = 0

        # Context managers close both files at the end of every pass, and also
        # when the suspended generator is closed or garbage-collected.
        with codecs.open(
            trainPath.data_index_padding_path, "r", "utf-8"
        ) as data_index_padding, codecs.open(
            trainPath.label_index_padding_path, "r", "utf-8"
        ) as label_index_padding:

            # zip stops as soon as either file runs out of lines.
            for data_line, label_line in zip(data_index_padding, label_index_padding):

                lines_read += 1

                X.append([int(data) for data in data_line.strip().split()])
                Y.append([int(label) for label in label_line.strip().split()])

                if len(X) == batch_size:

                    X_ARRAY = np.array(X)
                    Y_ARRAY = np.array(Y)

                    Y_CLASS = to_categorical(Y_ARRAY, label_class).reshape(
                        (len(Y_ARRAY), len(Y_ARRAY[0]), -1)
                    )

                    yield (X_ARRAY, Y_CLASS)

                    X = []
                    Y = []

        if lines_read == 0:
            raise ValueError(
                "no samples could be read from %r / %r"
                % (trainPath.data_index_padding_path, trainPath.label_index_padding_path)
            )


In [34]:
# Step 10 (per the log text): train the model.
logger.info("step-10--->" + u"模型训练" + "--->START")

# No validation data is passed to fit_generator below, so no "val_*" quantity
# is ever logged. With save_best_only=True a "val_accuracy" monitor would skip
# every save, so track the training loss instead: it is always available.
checkpoint = ModelCheckpoint(
    trainPath.checkpoints_path,
    monitor="loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)

_ = model.fit_generator(
    generator=generate_batch(
        trainPath=trainPath,
        batch_size=batch_size, label_class=label_2_index_length
    ),
    # Integer division avoids the float round-trip; at least one step per epoch
    # is required even when the corpus is smaller than a single batch.
    steps_per_epoch=max(1, documents_length // batch_size),
    epochs=epochs,
    verbose=1,
    workers=1,
    callbacks=[checkpoint],
)

Epoch 1/3




KeyboardInterrupt: 

In [35]:
    # Step 11 (per the log text): save the model and the dictionaries.
    logger.info("step-11--->" + u"模型和字典保存" + "--->START")

    model.save_weights(trainPath.weights_path)

    index_2_label = dataPreprocess.create_index_label()

    # Context managers guarantee both pickle files are flushed and closed;
    # passing a bare open(...) to pickle.dump leaves the handle dangling.
    with open(trainPath.lexicon_path, "wb") as lexicon_file:
        pickle.dump([lexicon, index_2_label], lexicon_file)

    # Parameters needed to rebuild the network: sequence length, embedding
    # size, embedding table rows and label count, in that order.
    with open(trainPath.model_params_path, "wb") as params_file:
        pickle.dump(
            [max_len, embedding_size, useful_word_length + 2, label_2_index_length],
            params_file,
        )

In [36]:
    # Step 12 (per the log text): print the parameters needed to restore the model.
    logger.info("step-12--->" + u"打印恢复模型的重要参数" + "--->START")

    # One log line per parameter, in the same order as before.
    for caption, value in (
        ("sequence_max_length: ", max_len),
        ("embedding size: ", embedding_size),
        ("useful_word_length: ", useful_word_length + 2),
        ("label_2_index_length: ", label_2_index_length),
    ):
        logger.info(caption + str(value))

    # Final marker (translation: "training finished").
    logger.info(u"训练完成" + "--->OK")