In [1]:
# 先进行本地/全局分类，再进行类目分类
# 两次分类，现将用户输入按照意愿进行分类（愿意还款，意图模糊，延期还款，其它），再根据不同意愿进行进一步分类
# 理论上，意愿分类的准确率会很高，限定意愿后的进一步分类准确率也会有所提高
# 仅使用多卷积核CNN进行特征的综合提取
# 对于池化尺寸的大小未定，需加大样本数量进行对比
import keras
import xlrd, xlwt
from xlutils import copy
import os
import sys
import random
import gensim
from gensim.models import Word2Vec
from keras.models import Model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras import regularizers
from keras.layers.embeddings import Embedding
import jieba
import numpy as np
import time
from keras.utils.np_utils import to_categorical
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, LSTM, SpatialDropout1D, LeakyReLU, Input, concatenate
import sklearn
from sklearn.model_selection import train_test_split
from keras.models import load_model
import datetime

Using TensorFlow backend.


In [2]:
'''
对用户意图进行多次分：全局/局部——粗分类——细分类
可以有效减少多分类情境下模型难收敛的问题
逐层逼近用户的真实意图，越底层容错率越高
'''
# jieba分词+去停用词
def tokenize(data):
    """Segment text with jieba and drop stop words.

    Args:
        data: Either a single sentence (``str``) or a ``list`` of sentences.

    Returns:
        A list of token lists, one per sentence. A ``str`` input yields a
        one-element list; any other input type yields an empty list.

    Reads the module-level ``stopwords`` collection to filter tokens.
    """
    # Normalise both accepted input shapes to "a list of sentences".
    if isinstance(data, list):
        sentences = data
    elif isinstance(data, str):
        sentences = [data]
    else:
        sentences = []
    return [
        [word for word in jieba.lcut(sentence) if word not in stopwords]
        for sentence in sentences
    ]

# 对神经网络模型、数据及标签进行预处理
def preprocessing(data_set, label, num_classes, seed, mode):
    """Tokenise, encode and split a data set, and build the network for it.

    Args:
        data_set: List of raw sentences.
        label: Class labels, one per sentence (one-hot encoded here via
            ``label_preprocessing``).
        num_classes: Number of classes of the classifier to build.
        seed: Accepted for interface compatibility.
            NOTE(review): the split is performed by ``split_data_set``, which
            reads the module-level ``seed`` rather than this argument.
        mode: ``0`` additionally carves a test set out of the training
            portion; any other value leaves ``test_data``/``test_label`` as
            ``None``. (The configuration cell documents ``devide_mode`` 0 as
            training mode and 1 as prediction mode.)

    Returns:
        ``(dictionary, model, train_data, validate_data, test_data,
        train_label, validate_label, test_label)`` where ``dictionary`` is a
        list of ``{'sentence', 'tokenized', 'id', 'label'}`` records, one per
        sample and in the same order as ``data_set``.
    """
    tokenized = tokenize(data_set)
    identify = wd_encode(wd2idx, tokenized)
    padded_data, max_len = padding_sts(identify)
    label = label_preprocessing(label, num_classes)
    # Keep every representation of a sample together so mis-classified inputs
    # can be traced back to the raw sentence. Index with ``i`` (not ``i-1``)
    # so record k really describes sample k.
    dictionary = []
    for i in range(len(tokenized)):
        dictionary.append({'sentence': data_set[i], 'tokenized': tokenized[i],
                           'id': padded_data[i], 'label': label[i]})
    # Hold out 10% of the samples for validation.
    train_data, validate_data, train_label, validate_label = \
        split_data_set(padded_data, label, 0.1, detal=True)
    if mode == 0:
        # 0.222 of the remaining 90% is roughly 20% of the whole data set.
        train_data, test_data, train_label, test_label = \
            split_data_set(train_data, train_label, 0.222, detal=True)
    else:
        # No test set is produced in the other mode.
        test_data = None
        test_label = None
    # Build the network sized for the longest sentence of this data set.
    model = build_model(max_len, num_classes)

    return dictionary, model, train_data, validate_data, test_data, train_label, validate_label, test_label

# 将标签转换为指定的形式
def label_preprocessing(label, num_classes):
    """One-hot encode class labels with Keras ``to_categorical``.

    Args:
        label: Sequence of class ids.
        num_classes: Width of the one-hot vectors.

    Returns:
        The array produced by ``to_categorical``.

    Per the original author's note, the intent-level ids are:
    0 = willing to repay, 1 = ambiguous intent, 2 = deferred repayment,
    3 = other (global).
    """
    return to_categorical(np.array(label), num_classes=num_classes)

# 在预测时，寻找包含新词的句子
def find_new_wd_of_sts(data, new_wd_of_sts_idx):
    """Append the sentences at the given indices to the new-word log file.

    Args:
        data: Indexable collection of sentences (strings).
        new_wd_of_sts_idx: Iterable of indices into ``data``.

    The log is ``<pathDir>包含新词的句子.txt`` (``pathDir`` is a module-level
    variable); each sentence is written on its own line in append mode.
    """
    # ``with`` guarantees the handle is closed even if a write fails.
    with open(pathDir+'包含新词的句子.txt','a') as fobj:
        for i in new_wd_of_sts_idx:
            fobj.write('\n'+data[i])
    print('save the new word of setense')

# 短句补零，使之与神经网络模型的embedding层的输入尺寸相同
def padding_sts(identify,max_len=None):
    """Right-pad (and, if needed, truncate) id sequences to a common length.

    Args:
        identify: List of id lists, one per sentence.
        max_len: Target length. When ``None`` the length of the longest
            sequence in ``identify`` is used (0 for an empty input).

    Returns:
        ``(padded_id, max_len)`` where ``padded_id`` is a NumPy array with one
        row per sentence, padded with ``0``.
    """
    if max_len is None:
        max_len = max((len(sts) for sts in identify), default=0)
    # Slicing first keeps over-long sentences from producing a ragged array
    # when the caller supplies an explicit ``max_len``.
    padded_id = np.array(
        [list(sts[:max_len]) + [0]*(max_len - len(sts)) for sts in identify]
    )
    return padded_id, max_len

# 数据集切分，默认不显示detail
def split_data_set(data, label, ratio, detal=False):
    """Split samples and labels into two parts with ``train_test_split``.

    Args:
        data: Sample features to split.
        label: Sample labels to split.
        ratio: Passed as ``test_size``: a float in 0-1 is a fraction of the
            samples, an integer is an absolute number of samples.
        detal: When ``True`` the sizes of the resulting parts are printed.

    Returns:
        ``(data_1, data_2, label_1, label_2)``.

    The random state comes from the module-level ``seed`` variable.
    """
    parts = train_test_split(data, label, test_size=ratio, random_state=seed)
    data_1, data_2, label_1, label_2 = parts
    if detal == True:
        first_line = ("data_1_len: ", len(data_1), "label_1_len: ", len(label_1))
        second_line = ("\ndata_2_len: ", len(data_2), "lebal_2_len: ", len(label_2))
        print(*first_line, *second_line)
    return data_1, data_2, label_1, label_2

# 预训练词向量模型
def build_word2vec(filename, w2v_model=None, update=True):
    """Load, update or train the Word2Vec model used for the embedding layer.

    Args:
        filename: Path of the Excel workbook whose first sheet holds the
            corpus (column 0, first row skipped as a header).
        w2v_model: File name (relative to the module-level ``pathDir``) of an
            existing model to load, or ``None`` to train a new one.
        update: When an existing model was loaded, ``True`` extends its
            vocabulary from the corpus via ``update_w2v``; ``False`` returns
            the loaded model untouched. Ignored when a new model is trained.

    Returns:
        The loaded, updated or newly trained ``Word2Vec`` model.

    The process exits through ``sys.exit()`` if the corpus workbook is needed
    but cannot be opened.
    """
    loaded_model = None
    if w2v_model is not None:
        print('Loading Word2Vec model...')
        try:
            loaded_model = Word2Vec.load(pathDir+w2v_model)
        except Exception as e:
            print('Warning! Fail to load: %s.\nBegin to build new w2v_model...' %e)

    # An existing model that must not be updated is returned as-is instead of
    # being discarded and retrained.
    if loaded_model is not None and not update:
        return loaded_model

    # Both remaining paths (update / train from scratch) need the corpus.
    try:
        total_set = xlrd.open_workbook(filename)
        total_set_sheet1 = total_set.sheet_by_index(0)
    except Exception as e:
        print('Error! Fail to load data_set: %s' %e)
        sys.exit()
    total_data = [total_set_sheet1.cell(row, 0).value
                  for row in range(1, total_set_sheet1.nrows)]
    tokenized = tokenize(total_data)

    if loaded_model is not None:
        try:
            print('Update Word2Vec model...')
            loaded_model = update_w2v(loaded_model, tokenized)
        except Exception as e:
            print('Warning! Fail to update: %s' %e)
        return loaded_model

    # Use the notebook-level ``w2v_dims`` when it is defined, else 200. It is
    # read through ``globals()`` because a local of the same name would hide
    # the global.
    dims = globals().get('w2v_dims', 200)
    new_model = Word2Vec(tokenized,sg=1,size=dims,window=5,min_count=1,negative=1,sample=0.001,hs=1)
    # NOTE(review): the constructor is already given the corpus; this is an
    # additional explicit training pass kept from the original code.
    new_model.train(tokenized, total_examples=len(tokenized), epochs=5)
    return new_model

# 更新词向量模型（仅会对出现新词了的文本进行更新）
def update_w2v(w2v_model, tokenized):
    """Extend the model vocabulary with sentences that contain unseen words.

    Args:
        w2v_model: Model exposing ``wv.vocab`` (a mapping keyed by word) and
            ``build_vocab(sentences, update=True)``.
        tokenized: List of token lists.

    Returns:
        The same ``w2v_model`` object.

    Only the vocabulary is extended; the follow-up training call is disabled
    in the original code and stays disabled here.
    """
    # Membership on the vocab mapping is O(1); the original copied the keys
    # into a list and scanned it for every word.
    known_words = w2v_model.wv.vocab
    new_sts = []
    new_wd = []
    for sts in tokenized:
        for wd in sts:
            if wd not in known_words:
                # Record only the first unseen word of a sentence.
                new_wd.append(wd)
                new_sts.append(sts)
                break
    if new_sts:
        print('发现新词：', new_wd, '\n已对词向量模型进行更新！')
        w2v_model.build_vocab(new_sts, update=True)
#         w2v_model.train(new_sts,total_examples=w2v_model.corpus_count,epochs=1)
    else:
        print('未发现新词，没有更新模型！')
    return w2v_model

# 对数据集中的每个词，按照词-向量的索引进行编码，若出现了生词，则填0
def wd_encode(wd2idx, tokenized):
    """Map every token to its vocabulary index.

    Args:
        wd2idx: Mapping from word to integer index.
        tokenized: List of token lists.

    Returns:
        A list of index lists with the same shape as ``tokenized``. Words
        missing from ``wd2idx`` are reported on stdout and encoded as the
        integer ``0``.
    """
    identify = []
    for sts in tokenized:
        id_sts = []
        for wd in sts:
            try:
                id_sts.append(wd2idx[wd])
            except KeyError:
                print('“'+wd+'”不在词向量模型中')
                # Integer 0, not the string '0': a single string would turn
                # the whole padded NumPy array into a string array.
                id_sts.append(0)
        identify.append(id_sts)
    return identify

# 构建词向量的单词索引和embedding层矩阵
def build_wd2idx_embedMatrix(w2vModel):
    """Derive the word->index table and the embedding matrix from a model.

    Args:
        w2vModel: Model exposing ``wv.vocab``, ``wv[word]`` and
            ``vector_size``.

    Returns:
        ``(word2idx, embedMatrix)``. Index 0 is reserved under the key
        ``"_stopWord"`` (stop words, unknown words and padding) and its matrix
        row is all zeros; vocabulary word number k is stored at row k+1.
    """
    words = [w for w, _ in w2vModel.wv.vocab.items()]
    vectors = [w2vModel.wv[w] for w in words]
    embedMatrix = np.zeros((len(words) + 1, w2vModel.vector_size))
    word2idx = {"_stopWord": 0}
    for row, (word, vector) in enumerate(zip(words, vectors), start=1):
        word2idx[word] = row
        embedMatrix[row] = vector
    return word2idx, embedMatrix

# 将深度学习过程中训练的词向量矩阵更新到词向量模型中
def embed2w2v(w2vModel, embedMatrix):
    """Write vectors from an embedding matrix back into the Word2Vec model.

    Args:
        w2vModel: Model exposing ``wv.vocab`` and item assignment on ``wv``.
        embedMatrix: Matrix laid out as ``build_wd2idx_embedMatrix`` produces
            it: row 0 is the padding/unknown row, vocabulary word number k
            (in ``wv.vocab`` iteration order) is at row k+1.

    Returns:
        The same ``w2vModel`` object.
    """
    print('正在根据学习过程中训练的词向量矩阵对词向量模型进行更新...')
    # Start at row 1: row 0 belongs to the reserved padding index, so starting
    # at 0 would give every word its predecessor's vector.
    for row, (w, _) in enumerate(w2vModel.wv.vocab.items(), start=1):
        w2vModel.wv[w] = embedMatrix[row]
    print('词向量模型更新完成')
    return w2vModel

# 设计网络模型：LSTM+六个卷积核(各10个特征)+LeakyReLU激活函数
def build_model(max_len, num_classes):
    """Assemble and compile the LSTM + multi-width CNN classifier.

    Args:
        max_len: Input sequence length (number of token ids per sample).
        num_classes: Size of the softmax output layer.

    Returns:
        The compiled Keras ``Model``.

    The embedding layer is initialised from the module-level ``embedMatrix``
    and is frozen. The model summary and the current time are printed.
    """
    vocab_rows = len(embedMatrix)
    vector_dims = len(embedMatrix[0])

    # Input -> frozen pre-trained embedding -> sequence-returning LSTM.
    comment_seq = Input(shape=[max_len], name='x_seq')
    emb_comment = Embedding(vocab_rows, vector_dims, weights=[embedMatrix],
                            input_length=max_len, trainable=False)(comment_seq)
    lstm_out = LSTM(units=32, dropout=0.2, recurrent_dropout=0.1,
                    return_sequences=True)(emb_comment)

    def conv_branch(width):
        """One Conv1D -> LeakyReLU -> global max pooling branch."""
        features = Conv1D(filters=10, kernel_size=width, strides=1, padding='valid',
                          use_bias=True, kernel_initializer='glorot_uniform',
                          bias_initializer='zeros',
                          kernel_regularizer=regularizers.l2(0.0001))(lstm_out)
        activated = LeakyReLU(alpha=0.01)(features)
        return GlobalMaxPooling1D()(activated)

    # Six parallel branches: kernel widths 1-5 plus one spanning the sentence.
    pooled = [conv_branch(width) for width in (1, 2, 3, 4, 5, max_len)]
    merged = concatenate(pooled, axis=1)
    regularised = Dropout(0.3)(merged)
    probabilities = Dense(num_classes, activation='softmax')(regularised)
    model = Model([comment_seq], probabilities)

    # summarize the model
    model.summary()
    optimizer = keras.optimizers.Adam(lr=0.01)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])

    ThisTime()
    return model

def train_model(train_data, validate_data, train_label, validate_label, model, name):
    """Train with validation-based checkpointing and step-wise LR halving.

    Args:
        train_data, train_label: Training samples and one-hot labels.
        validate_data, validate_label: Validation samples and labels,
            evaluated after every epoch.
        model: Compiled Keras model to train.
        name: File name of the checkpoint inside the module-level
            ``NN_model_path``.

    Returns:
        The model held when the learning rate falls below 1/128 of its
        initial value.

    Each time the validation loss improves the model is saved. After five
    consecutive epochs without improvement the best checkpoint is reloaded
    and recompiled with half the learning rate.
    """
    start_time = time.time()
    # Infinity (not a magic 10) so the first finite loss always produces a
    # checkpoint that can be reloaded later.
    best_loss = float('inf')
    checkpoint_saved = False
    lr_pre = 0.01
    lr = lr_pre
    count = 1
    threshold = 5
    while lr >= lr_pre/128:
        # verbose=1 shows the Keras progress bar for the single epoch.
        model.fit(train_data, train_label, epochs=1, verbose=1)
        # Validate after every epoch to guard against over-fitting.
        loss, _ = model.evaluate(validate_data, validate_label, verbose=0)
        print(loss)

        if loss < best_loss:
            count = 1
            best_loss = loss
            model.save(NN_model_path+name)
            checkpoint_saved = True
        elif count >= threshold:
            # Roll back to the best checkpoint, if there is one.
            if checkpoint_saved:
                model = load_model(NN_model_path+name)
            print("decay the learning rate")
            lr = lr/2
            optimizer = keras.optimizers.Adam(lr=lr)
            model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
            count = 1
        else:
            count += 1

    elapsed_time = time.time() - start_time
    print ("processing time:", elapsed_time, "seconds")
    return model

def test_model(model, dictionary, test_data, test_label, detail=False):
    # 显示识别错误的数据，并计算准确率
    count = 0
    sum_loss = 0
    for i in range(len(test_data)):
        loss, accuracy = model.evaluate(np.column_stack(test_data[i-1]), np.column_stack(test_label[i-1]), batch_size=1, verbose=0)
        if accuracy == 0:
            for j in dictionary:
                if (j['id'] == test_data[i-1]).all():
                    tmp = j['label'].tolist()
                    print(j['sentence'], j['tokenized'], 'label: ', tmp.index(max(tmp)))
        else:
            count += 1
        sum_loss += loss
        loss = sum_loss/len(test_data)
    accuracy = count/len(test_data)
    if detail == True:
        print('Accuracy: %f' % (accuracy*100))
        print('Loss: %f' % (loss))
        ThisTime()
        
# 通过迭代遍历树状结构的节点
def find_son(count_lavel, label_index=None):
    """Recursively walk the label tree and train one model per branching node.

    Args:
        count_lavel: Depth of the current node; 0 is a virtual root added to
            make the recursion uniform.
        label_index: Labels on the path from the virtual root to the current
            node (the last element is the current node). Ignored and reset to
            ``[0]`` at level 0.

    Reads the module-level ``sheet1`` (column 0 = text, columns 1.. = labels
    per level, first row skipped), ``num_label_lavel``, ``seed`` and
    ``devide_mode``.

    Raises:
        TypeError: if a selected row has a blank label at the next level.

    NOTE(review): the model is built with ``len(children)`` classes, so this
    presumably assumes the child labels under a node are 0..k-1 - confirm
    against the data.
    """
    data = []
    label = []
    if count_lavel == 0:
        # Level 0: virtual root, initialise the path.
        label_index = [0]
    else:
        # Otherwise the current node is the last element of the path.
        node = label_index[-1]
    for i in range(sheet1.nrows-1):
        row = i+1
        if count_lavel == 0:
            # At the root every row is used with its first-level label.
            data.append(sheet1.cell(row, 0).value)
            label.append(sheet1.cell(row, 1).value)
        elif node == sheet1.cell(row, count_lavel).value:
            # The row must match every ancestor on the path, not only the
            # current node.
            on_path = all(sheet1.cell(row, j+1).value == label_index[j+1]
                          for j in range(len(label_index)-1))
            if on_path:
                data.append(sheet1.cell(row, 0).value)
                label.append(sheet1.cell(row, count_lavel+1).value)
    # Distinct labels = children of this node.
    tmp = list(set(label))
    # Unlabelled data is an error. The parentheses matter: without them the
    # formatted string was added to an int and the message was lost.
    if '' in tmp:
        raise TypeError('The label in %d layer of some data is BLANK, which is illegal!(有数据未进行标记！)' %(count_lavel+1))
    if count_lavel == 0:
        print('根节点下的子节点:', tmp)# the root has no node id
    else:
        print('第%d层级%d节点下的子节点:' %(count_lavel, node), tmp)
    # A classifier is only needed where the node actually branches.
    if len(tmp) > 1:
        print('正在训练该节点模型...')
        # Build the model and the data splits.
        (dictionary, model, train_data,
         validate_data, test_data,
         train_label, validate_label, test_label) = preprocessing(data, label, len(tmp), seed=seed, mode=devide_mode)
        # Name the model after its position in the tree.
        if count_lavel == 0:# the root has no node id
            model_name = '第%d层级的模型' %count_lavel
        else:
            model_name = '第%d层级第%d节点的模型' %(count_lavel, node)
        # Train the model.
        model = train_model(train_data, validate_data,
                            train_label, validate_label, model, model_name)
        print('训练完成！模型名称: %s' %model_name)
    else:
        print('该节点无需训练模型')
    # Recurse down to the second-to-last level (the last has no children).
    if count_lavel < num_label_lavel-1:
        count_lavel += 1
        for node in tmp:
            print('遍历第%d层级%d节点下的子节点...' %(count_lavel, node))
            print('_'*100)
            label_index_tmp = label_index.copy()# keep the parent path intact
            label_index_tmp.append(node)
            find_son(count_lavel, label_index_tmp)
        count_lavel -= 1# back to this level after visiting all children
        print('返回至第%d层' %count_lavel)
        
def predict_data(count_lavel, data, threshold, last_label=None):
    """Classify text with the model of the given tree node.

    Args:
        count_lavel: Depth of the node; 0 is the root.
        data: Raw text (``str`` or list of ``str``) at level 0; the already
            tokenised sentences on deeper levels.
        threshold: Minimum softmax probability for a prediction to count.
        last_label: Label predicted at the parent level; selects the node
            model on levels > 0.

    Returns:
        The predicted class index of the first sentence, ``'nomatch'`` when
        the best probability is below ``threshold``, or ``'Wrong!'`` when no
        model exists on this or any deeper level along node 0.

    Reads the module-level ``model_name_lst``, ``num_label_lavel`` and
    ``wd2idx``.
    """
    # The root model has its own name, and only the root call tokenises.
    if count_lavel == 0:
        model_name =  '第0层级的模型'
        tokenized = tokenize(data)
    else:
        model_name = '第%d层级第%d节点的模型' %(count_lavel, last_label)
        tokenized = data
    # Look the model up by name among the loaded models. Starting from None
    # keeps ``model`` bound even when ``model_name_lst`` is empty.
    model = None
    for entry in model_name_lst:
        if entry['name'] == model_name:
            model = entry['model']
            break
    if model is None:
        # No classifier for this node: fall through to node 0 of the next
        # level if there is one.
        if count_lavel < num_label_lavel-1:
            return predict_data(count_lavel+1, tokenized, threshold, last_label=0)
        return 'Wrong!'

    max_len = model.layers[1].output_shape[1]# maximum text length of the model

    # Map tokens to vocabulary indices.
    identify = wd_encode(wd2idx, tokenized)
    # Inputs must not exceed the model's maximum length.
    if any(len(ids) > max_len for ids in identify):
        print('输入过长(超出%d)，仅截取前一部分' %max_len)
        identify = [ids[0:max_len] for ids in identify]
    padded_data, _ = padding_sts(identify, max_len=max_len)

    accuracy = model.predict(padded_data, batch_size=1, verbose=0, steps=None).tolist()
    # Below the threshold the input is treated as nomatch.
    if max(accuracy[0]) >= threshold:
        label = accuracy[0].index(max(accuracy[0]))
        output = label
        # Stop descending at the last level.
        if count_lavel < num_label_lavel-1:
            # NOTE(review): the result of the deeper prediction is discarded,
            # as in the original code, so only this level's label is
            # returned - confirm whether the leaf label should be propagated.
            predict_data(count_lavel+1, tokenized,
                         threshold=threshold, last_label=label)
    else:
        output = 'nomatch'
    return output

def mkdir(path):
    """Create a directory (with parents) unless the path already exists.

    Args:
        path: Directory path; surrounding whitespace and trailing
            backslashes are removed before use.

    Returns:
        ``True`` if the directory was created, ``False`` if the path already
        existed.
    """
    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        return False
    os.makedirs(target)
    return True

# 载入数据集和对应的多级标签
def load_data_set(fname, pathDir=sys.path[0]+'\\'):
    """Read sentences and their multi-level labels from an Excel workbook.

    Args:
        fname: Workbook file name.
        pathDir: Directory prefix prepended to ``fname``.

    Returns:
        ``(data, label)``: ``data`` holds column 0 of every row after the
        first; ``label`` holds, per row, the list of the remaining columns.

    Raises:
        TypeError: if the workbook cannot be opened.
    """
    try:
        readbook = xlrd.open_workbook(pathDir+fname)
    except Exception as e:
        raise TypeError('Erroe! Fail to load train_data: %s' %e)
    sheet = readbook.sheet_by_index(0)
    # One text column; every other column is one label level.
    level_count = sheet.ncols-1
    data, label = [], []
    for row in range(1, sheet.nrows):
        data.append(sheet.cell(row, 0).value)
        label.append([sheet.cell(row, col).value for col in range(1, level_count+1)])
    return data, label

# 显示每次运行片段的时间
def ThisTime():
    """Print the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('This time is: ', now)

In [None]:
def test_data(count_lavel, data, label, last_label=None):
    """Recursively evaluate one sample down the model tree (work in progress).

    Args:
        count_lavel: Depth of the current node; 0 is the root.
        data: Raw text at level 0; the encoded id lists on deeper levels.
        label: Indexed as ``label[count_lavel+1]`` / ``label[count_lavel]``
            below - presumably the per-level labels of the sample; TODO
            confirm the expected layout.
        last_label: Label of the parent level, used to pick the node model.

    Returns:
        ``'Wrong!'`` or the result of the deeper call when no model exists for
        this node, otherwise a ``(count, loss, status)`` tuple.

    NOTE(review): this cell was never executed and cannot run as written:
      * ``count`` is read (``count -= 1``, ``return count, ...``) but never
        initialised in this function;
      * ``test_data[i-1]`` indexes this function object itself;
      * ``dictionary`` is neither a parameter nor a local;
      * ``model`` stays unbound when ``model_name_lst`` is empty;
      * ``status`` is unbound if the mis-prediction branch finds no matching
        dictionary record.
    """
#     The root node uses its own model name.
#     Pre-processing (tokenising, stop-word removal, ...) only happens on the
#     first call, i.e. at the root.
    if count_lavel == 0:
        model_name =  '第0层级的模型'
        regu = True
    else:
        model_name = '第%d层级第%d节点的模型' %(count_lavel, last_label)
        regu = False
#     Normalise the data.
    if regu is True:
        tokenized = tokenize(data)# tokenise and drop stop words
        identify = wd_encode(wd2idx, tokenized)# map tokens to vocabulary indices
    else:
        identify = data
#     Look model_name up among the loaded models (models must follow the
#     naming convention; per the author, a wrong model makes predict fail).
    for i in range(len(model_name_lst)):
        if model_name == model_name_lst[i]['name']:
            model = model_name_lst[i]['model']
            break
        else:
            model = None
#     No model for this node: keep recursing unless this is already the
#     second-to-last level.
    if model is None:
#         print(count_lavel, model_name)
        if count_lavel < num_label_lavel-1:
            count_lavel += 1
            output = test_data(count_lavel, identify, label, last_label=0)
        else:
            output = 'Wrong!'
        return output
#     A model exists for this node: evaluate it.
    else:
        max_len = model.layers[1].output_shape[1]# maximum text length of the model
        num_classes = model.layers[23].output_shape[1]# number of classes of the model (unused below)

# #         Map tokens to vocabulary indices
#         identify = wd_encode(wd2idx, tokenized)
#         Limit the input length (must not exceed the model maximum).
        if len(identify[0]) > max_len:
            print('输入过长(超出%d)，仅截取前一部分' %max_len)
            identify[0] = identify[0][0:max_len]
        padded_data, _ = padding_sts(identify, max_len=max_len)

        loss, accuracy = model.evaluate(padded_data, np.column_stack(label[count_lavel+1]), batch_size=1, verbose=0)
#         On a wrong prediction, print the sentence and return False.
        if accuracy == 0:
            for j in dictionary:
                if (j['id'] == test_data[i-1]).all():
#                         tmp = j['label'].tolist()
                    print(j['sentence'], j['tokenized'], 'label: ', j['label'])
                    return count, loss, False
#         On a correct prediction, keep recursing.
        elif count_lavel < num_label_lavel-1:
            count_lavel += 1
            count, loss, status = test_data(count_lavel, identify, label, last_label=label[count_lavel])
        else:
            count -= 1
            return count, loss, True

        if count_lavel == 0:
            count += 1
        count_lavel -= 1
        return count, loss, status

#             loss = sum_loss/len(test_data)
#         accuracy = count/len(test_data)
#         if detail is True:
#             print('Accuracy: %f' % (accuracy*100))
#             print('Loss: %f' % (loss))
#             ThisTime()

In [4]:
# Notebook-wide configuration. Every name assigned here is a kernel global
# that the helper functions above read directly.
date = time.strftime("%Y%m%d")# used to save and load the models of the current day
seed = random.randint(1,10000)# random seed so every run splits the data differently
max_len = 50# initial maximum number of words per sentence
devide_mode = 0# 0: training mode; 1: prediction mode
predict_threshold = 0.8# probability below which a prediction counts as nomatch
# w2v_dims = # when left undefined the default is 200
# model_name = # name of an existing word-vector model to load, if any
# Preset folder paths.
pathDir = sys.path[0]+"\\"# directory of the running script
mkdir(pathDir+'NN_model\\')# folder that stores the neural-network models
NN_model_path = pathDir+'NN_model\\'
print('Loading self-defined dict...')# user dictionary for jieba
try:
    jieba.load_userdict(pathDir+"newdic.txt")
except Exception as e:
    print('Warning! Fail to load: %s.\nUse default dict and go on...' %e)

print('Loading stopwords...')# stop-word list
try:
    # ``with`` closes the file handle; the original left it open.
    with open(pathDir+"stopwords.txt",encoding='gb18030',errors='ignore') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file]
except Exception as e:
    print('Warning! fail to load: %s. With no stopwords and go on...' %e)
    # Really continue without stop words: tokenize() reads this global and
    # would otherwise fail with a NameError.
    stopwords = []

# Pass an existing model name to build_word2vec to start from that model.
w2v_model = build_word2vec(filename=pathDir+'语音转文本_全业务数据集.xlsx', w2v_model='w2v_model', update=False)
w2v_model.save(pathDir+date+'_w2v_model')# save under a date-prefixed name
print('词向量模型保存成功：w2v_model')

# Convert the word-vector model into the index table and embedding matrix.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)

Loading self-defined dict...
Loading stopwords...
Loading Word2Vec model...
词向量模型保存成功：w2v_model


In [None]:
# Load the labelled training workbook and train one model per branching node
# of the label tree. ``sheet1`` and ``num_label_lavel`` are read as globals by
# find_son().
print('Loading train_data...')
try:
    readbook = xlrd.open_workbook(pathDir+'催收-用户输入-样本及标签-本地全局-类目.xlsx')
except Exception as e:
    print('Erroe! Fail to load train_data: %s' %e)
    sys.exit()
sheet1 = readbook.sheet_by_index(0)# sheet1: text + labels
num_label_lavel = sheet1.ncols-1# one text column, all other columns are label levels
# (Disabled experiment: dynamically create one variable per label level.)
# createVar = locals()
# print('Loading label(s)...')
# for num, label in enumerate(num_label_lavel):
#     try:
# #         createVar['model_name_'+str(i)] = name
#         createVar['label_'+label] = load_model(NN_model_path+name)
#         model_name_lst.append({'name': name, 'model': createVar['model_'+str(i)]})
#     except Exception as e:
#         print('Warning! Fail to load model(%s): %s. Skip this model and go on to load next model...' %(name, e))
# print(model_name_lst)
# num_label_lavel = 2
print('模型的层级数为:', num_label_lavel, '\n构造模型树...')
print('_'*100)
count_lavel = 0# a virtual level 0 (root) is put in front of the hierarchy
find_son(count_lavel, label_index=None)# train every node that has more than one child
print('模型训练完毕！')

Loading train_data...
模型的层级数为: 3 
构造模型树...
____________________________________________________________________________________________________
根节点下的子节点: [0.0, 1.0]
正在训练该节点模型...
data_1_len:  26480 label_1_len:  26480 
data_2_len:  2943 lebal_2_len:  2943
data_1_len:  20601 label_1_len:  20601 
data_2_len:  5879 lebal_2_len:  5879
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_seq (InputLayer)              (None, 38)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 38, 200)      2635200     x_seq[0][0]         

0.1036868272080253
decay the learning rate
Epoch 1/1
0.08831290475140613
Epoch 1/1
0.08720892594163307
Epoch 1/1
0.09073454912370903
Epoch 1/1
0.08903157821281443
Epoch 1/1
0.1096180382074242
Epoch 1/1
0.10393313718266656
Epoch 1/1
0.09115192915002505
decay the learning rate
Epoch 1/1
0.08372992415688717
Epoch 1/1
0.08772979820678166
Epoch 1/1
0.09736876992772692
Epoch 1/1
0.09952506430844567
Epoch 1/1
0.10015836585663712
Epoch 1/1
0.10970266120469706
decay the learning rate
Epoch 1/1
0.08811341728534103
Epoch 1/1
0.09532116021552116
Epoch 1/1
0.0958112206172693
Epoch 1/1
0.09994528209426666
Epoch 1/1
0.1016347584660528
decay the learning rate
Epoch 1/1
0.08594752198754171
Epoch 1/1
0.08653244370075004
Epoch 1/1
0.08917584028151969
Epoch 1/1
0.09024814994720286
Epoch 1/1
0.09328546203423521
decay the learning rate
Epoch 1/1
0.0866728949965292
Epoch 1/1
0.08901976829858618
Epoch 1/1
0.08677752269605998
Epoch 1/1
0.08819633793127585
Epoch 1/1
0.0873549461828034
decay the learning rate
Ep

Epoch 1/1
0.2127054623445036
Epoch 1/1
0.19247560626280452
Epoch 1/1
0.20447899147807827
Epoch 1/1
0.18355830401664802
Epoch 1/1
0.19820988345363363
Epoch 1/1
0.19237883804056807
Epoch 1/1
0.21875025072862744
Epoch 1/1
0.2253907934253515
Epoch 1/1
0.20460759172072776
decay the learning rate
Epoch 1/1
0.19719235033309654
Epoch 1/1
0.18148255286067122
Epoch 1/1
0.18909399054854023
Epoch 1/1
0.18942836479619446
Epoch 1/1
0.18674692432528084
Epoch 1/1
0.180868671891781
Epoch 1/1
0.18835669434082652
Epoch 1/1
0.19671837613288207
Epoch 1/1
0.20275471767915887
Epoch 1/1
0.20652779086911485
Epoch 1/1
0.1987111487308977
decay the learning rate
Epoch 1/1
0.18686378911921853
Epoch 1/1
0.20361999539349243
Epoch 1/1
0.2232561855784312
Epoch 1/1
0.19730195586196325
Epoch 1/1
0.2094456519856144
decay the learning rate
Epoch 1/1
0.18130654275779298
Epoch 1/1
0.18265628837320486
Epoch 1/1
0.18506426463423953
Epoch 1/1
0.18135697634717232
Epoch 1/1
0.19049690866820243
decay the learning rate
Epoch 1/1
0

Epoch 1/1
0.13635082960015896
Epoch 1/1
0.06839465685753209
Epoch 1/1
0.0788105035482934
Epoch 1/1
0.07789736537432129
Epoch 1/1
0.10558404216373508
Epoch 1/1
0.09890195869022246
Epoch 1/1
0.0955616762466503
decay the learning rate
Epoch 1/1
0.06938882461664352
Epoch 1/1
0.06752524104420886
Epoch 1/1
0.06637089968179212
Epoch 1/1
0.06652920938689601
Epoch 1/1
0.07461307300536922
Epoch 1/1
0.07389026979605356
Epoch 1/1
0.07319820553741672
Epoch 1/1
0.07912806375234416
decay the learning rate
Epoch 1/1
0.0784832027317448
Epoch 1/1
0.06940951547726537
Epoch 1/1
0.08287422944876281
Epoch 1/1
0.0684457176195627
Epoch 1/1
0.07216225469372038
decay the learning rate
Epoch 1/1
0.07513512902413354
Epoch 1/1
0.08237642051764961
Epoch 1/1
0.0658920111219314
Epoch 1/1
0.07664315064409466
Epoch 1/1
0.08336249802148703
Epoch 1/1
0.07355509349568323
Epoch 1/1
0.07093122280343915
Epoch 1/1
0.07730351670108962
decay the learning rate
Epoch 1/1
0.0723368355552807
Epoch 1/1
0.07038868484228397
Epoch 1/1


In [21]:
# Load the word-vector model.
# NOTE(review): this loads '<pathDir>w2v_model', while the configuration cell
# saves to '<pathDir><date>_w2v_model' - confirm which file is intended.
print('Loading w2v_model...')
try:
    w2v_model = Word2Vec.load(pathDir+'w2v_model')
except Exception as e:
    print('Error! Fail to load model: %s' %e)
    sys.exit()
# Load every neural-network model found in the model folder.
NN_model_path = pathDir+'NN_model\\'
listTemp = os.listdir(NN_model_path)# file names double as model names
# num_model = len(listTemp)# number of models
# Create one global variable per model (model_0, model_1, ...) and an index
# list of {'name', 'model'} records used for lookup by name.
createVar = locals()
model_name_lst = []
print('Loading NN_model...')
for num, name in enumerate(listTemp):
    try:
#         createVar['model_name_'+str(i)] = name
        createVar['model_'+str(num)] = load_model(NN_model_path+name)
        model_name_lst.append({'name': name, 'model': createVar['model_'+str(num)]})
    except Exception as e:
        print('Warning! Fail to load model(%s): %s. Skip this model and go on to load next model...' %(name, e))
print(model_name_lst)

Loading w2v_model...
Loading NN_model...
[{'name': '第2层级第0节点的模型', 'model': <keras.engine.training.Model object at 0x000001D800AA0CF8>}]


In [23]:
# Predict new text.
# Known issues noted by the author:
#   bug1: input longer than the longest training sentence raises an error
#   bug2: words missing from the vocabulary raise an error
num_label_lavel = 3
count_lavel = 0# a virtual level 0 (root) is put in front of the hierarchy
# Load the user dictionary.
jieba.load_userdict("newdic.txt")
print('成功加载自定义分词词库')

# Load the stop words; ``with`` closes the file handle.
with open("stopwords.txt",encoding='gb18030',errors='ignore') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

# Convert the word-vector model into the index table and embedding matrix.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
readbook = xlrd.open_workbook(pathDir+'催收-延期还款.xlsx')
sheet2 = readbook.sheet_by_index(1)

predict_threshold = 0.99
# One prediction per line, appended to '<date>-predict.txt'. The context
# manager closes the file even if a prediction raises.
with open(date+'-predict.txt','a') as fobj_output:
    for i in range(sheet2.nrows-1):
        predict_set = sheet2.cell(i+1,0).value# first row is skipped as a header
        output = predict_data(count_lavel, predict_set, threshold=predict_threshold)
        fobj_output.write('\n'+str(output))

成功加载自定义分词词库


In [8]:
# Show dimension 1 of the output shape of layer 1 of model_1 (predict_data
# reads the same value as the model's maximum text length).
# NOTE(review): model_1 is not defined by any earlier cell in reading order.
print(model_1.layers[1].output_shape[1])

39


In [11]:
# Predict new text.
# Known issues noted by the author:
#   bug1: input longer than the longest training sentence raises an error
#   bug2: words missing from the vocabulary raise an error
# num_label_lavel = 3
count_lavel = 0# a virtual level 0 (root) is put in front of the hierarchy
# Load the user dictionary.
jieba.load_userdict("催收文本-newdic.txt")
print('成功加载自定义分词词库')

# Load the stop words; ``with`` closes the file handle.
with open("催收-停用词.txt",encoding='gb18030',errors='ignore') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

# Convert the word-vector model into the index table and embedding matrix.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
readbook = xlrd.open_workbook(pathDir+'催收-承诺还款.xlsx')
sheet2 = readbook.sheet_by_index(1)

predict_threshold = 0.98
# One prediction per line, appended to '<date>-predict.txt'. The context
# manager closes the file even if a prediction raises.
with open(date+'-predict.txt','a') as fobj_output:
    for i in range(sheet2.nrows-1):
        predict_set = sheet2.cell(i+1,0).value# first row is skipped as a header
        output = predict_data(count_lavel, predict_set, threshold=predict_threshold)
        fobj_output.write('\n'+str(output))

FileNotFoundError: [Errno 2] No such file or directory: '催收文本-newdic.txt'

In [111]:
# Fuzzy matching against a list: find the known model name closest to a name
# that has no exact match (at most 1 result, similarity cutoff 0.7).
import difflib

AA = ['20190617_第0层级的模型', '20190617_第1层级第0节点的模型', '20190617_第1层级第1节点的模型']
A = '20190617_第1层级第4节点的模型'
a = difflib.get_close_matches(A,AA,1, cutoff=0.7)

['20190617_第1层级第1节点的模型']

In [16]:
# Load the saved word-vector model and the nine saved Keras models from the
# current working directory.
w2v_model = Word2Vec.load('20190617_w2v_model')
model_1 = load_model('20190610_一级模型_1')
model_2 = load_model('20190610_一级模型_2')
model_3 = load_model('20190610_一级模型_3')
# Fixed typo: ``load_mo0del`` raised a NameError.
lmodel_1 = load_model('20190610_二级（本地）模型_1')
lmodel_2 = load_model('20190610_二级（本地）模型_2')
lmodel_3 = load_model('20190610_二级（本地）模型_3')
gmodel_1 = load_model('20190610_二级（全局）模型_1')
gmodel_2 = load_model('20190610_二级（全局）模型_2')
gmodel_3 = load_model('20190610_二级（全局）模型_3')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [15]:
# Test a model. Arguments: the model under test, the sample dictionary, the
# test data and the test labels.
# NOTE(review): g_dictionary, gtstd and gtstl are not defined in any visible
# cell - presumably left over from an earlier kernel session.
test_model(model_1, g_dictionary, gtstd, gtstl, detail=True)

微信 ['微信'] label:  7
这个我不懂了 ['这个', '我', '不', '懂了'] label:  6
为什么不提前跟我说啊我一下子哪有那么多钱 ['为什么', '不', '提前', '跟', '我', '说', '啊', '我', '一下子', '哪有', '那么', '多', '钱'] label:  15
你这是多少利息了 ['你', '这是', '多少', '利息', '了'] label:  17
处理中遇到问题 ['处理', '中', '遇到', '问题'] label:  7
我的到来人工聊妈吗他那卡里的吗我怎么登不了啊 ['我的', '到来', '人工', '聊妈', '吗', '他', '那', '卡里', '吗', '我', '怎么', '登', '不了', '啊'] label:  7
喂喂你听的见吧 ['喂', '喂', '你', '听的见', '吧'] label:  0
呃他妈的这什么意思哦 ['呃', '他妈的', '这', '什么', '意思', '哦'] label:  3
呃我想请问你一下你们可以听我说一下吗因为我的那个卡绑定的那个卡换掉了我的那个卡掉了现在我要重新绑定一张卡然后我把 ['呃', '我', '想', '请问', '你', '一下', '你们', '可以', '听', '我', '说', '一下', '吗', '因为', '我的', '那个', '卡', '绑定', '那个', '卡换', '掉', '了', '我的', '那个', '卡', '掉', '了', '现在', '我', '要', '重新', '绑定', '一张', '卡', '我', '把'] label:  5
喂你给我再说啊 ['喂', '你', '给我', '再说', '啊'] label:  6
我不知道为什么 ['我', '不知道', '为什么'] label:  7
唉你等一下我因为我刚好今天从工作的基础里好吧你你从讲好吧 ['唉', '你', '等一下', '我', '因为', '我', '刚好', '今天', '从', '工作', '基础', '里', '好吧', '你', '你', '从', '讲', '好吧'] label:  6
你他妈谁啊 ['你', '他', '妈', '谁', '啊'] label:  3
呃

In [20]:
# Read the 'pos_stopwords' sheet (column 0, every row) and the 'predict' sheet
# (column 0, first row skipped) from the labelled workbook.
readbook = xlrd.open_workbook('催收-用户输入-样本及标签-本地全局-类目.xlsx')
sheet2 = readbook.sheet_by_name('pos_stopwords')
sheet3 = readbook.sheet_by_name('predict')
pos_stopwords = []
for i in range(sheet2.nrows):
    pos_stopwords.append(sheet2.cell(i,0).value)
predict_set = []
for i in range(sheet3.nrows-1):
    predict_set.append(sheet3.cell(i+1,0).value)
# Rebuild the index table and embedding matrix from the word-vector model.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
# Load the user dictionary.
jieba.load_userdict("催收文本-newdic.txt")

In [17]:
# Read the 'pos_stopwords' sheet (column 0, every row) from the labelled
# workbook.
readbook = xlrd.open_workbook('催收-用户输入-样本及标签-本地全局-类目.xlsx')
sheet2 = readbook.sheet_by_name('pos_stopwords')
pos_stopwords = []
for i in range(sheet2.nrows):
    pos_stopwords.append(sheet2.cell(i,0).value)
# Rebuild the index table and embedding matrix from the word-vector model.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
# Load the user dictionary.
jieba.load_userdict("催收文本-newdic.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache


KeyboardInterrupt: 

In [16]:
# Nearest neighbours of '可能' in the embedding space. Queried through the
# KeyedVectors (``wv``): calling most_similar on the model itself is
# deprecated in the gensim version used here.
print(w2v_model.wv.most_similar(positive=['可能']))

[('估计', 0.7111688852310181), ('倒把', 0.6890275478363037), ('不照', 0.6791861057281494), ('活转', 0.6752203702926636), ('甚至', 0.6734603643417358), ('要明', 0.6718773245811462), ('微软', 0.6554404497146606), ('留款', 0.6554221510887146), ('越过', 0.6531094312667847), ('少扣', 0.6469331979751587)]


  """Entry point for launching an IPython kernel.


In [2]:
# Check how jieba segments a short sentence once the user dictionary is loaded.
jieba.load_userdict("催收文本-newdic.txt")
sts1 = '好嘞'
sts1_cut = jieba.lcut(sts1)
print(sts1,sts1_cut)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\biocloo\AppData\Local\Temp\jieba.cache
Loading model cost 0.945 seconds.
Prefix dict has been built succesfully.


好嘞 ['好', '嘞']


In [37]:
# Hand-written sample of three label triples, printed for inspection.
tst_label = [[0,0,2], [0,2,1], [1,17,0]]
print(tst_label)

[[0, 0, 2], [0, 2, 1], [1, 17, 0]]


In [41]:
def load_data_set(fname='催收-还款意图模糊.xlsx', pathDir=None):
    """Read sentences and their multi-level labels from an Excel workbook.

    This redefinition replaces the earlier ``load_data_set(fname, pathDir)``
    in the kernel, so it accepts the same arguments; called without arguments
    it still reads the workbook it was originally hard-coded for.

    Args:
        fname: Workbook file name.
        pathDir: Directory prefix prepended to ``fname``. ``None`` means the
            module-level ``pathDir`` if it is defined, else ``sys.path[0]``
            plus a backslash.

    Returns:
        ``(data, label)``: ``data`` holds column 0 of every row after the
        first; ``label`` holds, per row, the list of the remaining columns.

    Raises:
        TypeError: if the workbook cannot be opened.
    """
    if pathDir is None:
        # The parameter hides the global of the same name, so read it
        # explicitly.
        pathDir = globals().get('pathDir', sys.path[0]+'\\')
    try:
        readbook = xlrd.open_workbook(pathDir+fname)
    except Exception as e:
        raise TypeError('Erroe! Fail to load train_data: %s' %e) from e
    sheet1 = readbook.sheet_by_index(0)# sheet1: text + labels
    num_label_lavel = sheet1.ncols-1# one text column, all other columns are label levels
    data = []
    label = []
    for i in range(sheet1.nrows-1):
        data.append(sheet1.cell(i+1, 0).value)
        label.append([sheet1.cell(i+1, layer+1).value
                      for layer in range(num_label_lavel)])
    return data, label

[0.0, 1.0, 3.0]


In [42]:
# np.column_stack on a flat list of scalars gives a single row of shape (1, 3),
# the batch-of-one form that test_model passes to evaluate().
print(np.column_stack([0, 1, 0]))

[[0 1 0]]
