In [1]:
# 先进行本地/全局分类，再进行类目分类
# 两次分类，先将用户输入按照意愿进行分类（愿意还款，意图模糊，延期还款，其它），再根据不同意愿进行进一步分类
# 理论上，意愿分类的准确率会很高，限定意愿后的进一步分类准确率也会有所提高
# 仅使用多卷积核CNN进行特征的综合提取
# 对于池化尺寸的大小未定，需加大样本数量进行对比
import keras
import xlrd, xlwt
from xlutils import copy
import os
import random
import gensim
from gensim.models import Word2Vec
from keras.models import Model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras import regularizers
from keras.layers.embeddings import Embedding
import jieba
import numpy as np
import time
from keras.utils.np_utils import to_categorical
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, LSTM, SpatialDropout1D, LeakyReLU, Input, concatenate
import sklearn
from sklearn.model_selection import train_test_split
from keras.models import load_model
import datetime

Using TensorFlow backend.


In [2]:
'''
对用户意图进行多次分：全局/局部——粗分类——细分类
可以有效减少多分类情境下模型难收敛的问题
逐层逼近用户的真实意图，越底层容错率越高
'''
# 对神经网络模型、数据及标签进行预处理
def preprocessing(data_set, label, num_classes, seed, mode):
    """Turn raw sentences and labels into model-ready arrays and build the network.

    Args:
        data_set: sequence of raw sentences.
        label: sequence of integer-like class labels, one per sentence.
        num_classes: number of classes for the one-hot encoding and output layer.
        seed: kept for interface compatibility. NOTE(review): split_data_set
            reads the notebook-level ``seed`` itself, so this argument does not
            currently influence the split.
        mode: 0 additionally carves a test set out of the training split
            (the config cell calls this "training mode"); any other value
            leaves ``test_data``/``test_label`` as None.

    Returns:
        (dictionary, model, train_data, validate_data, test_data,
         train_label, validate_label, test_label)
        where ``dictionary`` is a list of per-sample lookup records with keys
        'sentence', 'tokenized', 'id' and 'label'.
    """
    tokenized = data_preprocessing(data_set)
    identify = wd_encode(wd2idx, tokenized)
    padded_data, max_len = padding_sts(identify)
    label = label_preprocessing(label, num_classes)

    # One record per sample, in input order. The previous `i-1` indexing
    # produced the same records rotated by one position.
    dictionary = [
        {'sentence': sentence, 'tokenized': tokens, 'id': ids, 'label': encoded}
        for sentence, tokens, ids, encoded in zip(data_set, tokenized, padded_data, label)
    ]

    # Hold out 10% of the samples for validation.
    train_data, validate_data, train_label, validate_label = \
        split_data_set(padded_data, label, 0.1, detal=True)

    if mode == 0:
        # 22.2% of the remaining 90% is roughly 20% of the full data set.
        train_data, test_data, train_label, test_label = \
            split_data_set(train_data, train_label, 0.222, detal=True)
    else:
        # No separate test set is needed in the other mode.
        test_data = None
        test_label = None

    # Build the neural network for this sequence length / class count.
    model = build_model(max_len, num_classes)

    return dictionary, model, train_data, validate_data, test_data, train_label, validate_label, test_label

# 将标签转换为指定的形式
def label_preprocessing(label, num_classes):
    """One-hot encode class labels.

    The author's label legend for the top-level intent task is
    0: willing to repay; 1: ambiguous intent; 2: deferred repayment;
    3: other (global).

    Args:
        label: sequence of integer-like class ids.
        num_classes: width of the one-hot encoding.

    Returns:
        The matrix produced by ``to_categorical`` for the given labels.
    """
    return to_categorical(np.array(label), num_classes=num_classes)

# 数据集预处理(jieba分词，去停用词)
def data_preprocessing(data):
    """Segment each sentence with jieba and drop stop words.

    Args:
        data: iterable of raw sentences.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    # An earlier idea was to strip meaningless single characters
    # (``pre_stopwords``) *before* segmentation; its effect was unclear, so it
    # is intentionally not applied here.
    return [
        [word for word in jieba.lcut(pattern) if word not in stopwords]
        for pattern in data
    ]

# 在预测时，寻找包含新词的句子
def find_new_wd_of_sts(data, new_wd_of_sts_idx):
    """Append the sentences that contain unseen words to a log file.

    Args:
        data: indexable collection of sentences.
        new_wd_of_sts_idx: iterable of indices into ``data`` to record.

    The file ``pathDir + '包含新词的句子.txt'`` is opened in append mode and one
    sentence is written per line (each preceded by a newline).
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open(pathDir+'包含新词的句子.txt','a') as fobj:
        for i in new_wd_of_sts_idx:
            fobj.write('\n'+data[i])
    print('save the new word of setense')

# 短句补零，使之与神经网络模型的embedding层的输入尺寸相同
def padding_sts(identify,max_len=None):
    """Right-pad (and, if necessary, truncate) id sequences to a common length.

    Args:
        identify: list of id sequences (one list of ids per sentence).
        max_len: target length. When None, the length of the longest sequence
            in ``identify`` is used (0 for an empty input).

    Returns:
        (padded_id, max_len): a numpy array with one row per sequence, padded
        with 0, and the length that was applied.
    """
    if max_len is None:
        max_len = max((len(sts) for sts in identify), default=0)
    # Sequences longer than max_len are cut to max_len so every row has the
    # same width; previously they were passed through unchanged, which
    # produced a ragged array.
    padded_id = np.array(
        [list(sts[:max_len]) + [0] * (max_len - len(sts)) for sts in identify]
    )
    return padded_id, max_len

# 数据集切分，默认不显示detail
def split_data_set(data, label, ratio, detal=False):
    """Split samples and labels into two aligned partitions.

    Args:
        data: sample features to split.
        label: sample labels to split.
        ratio: size of the second partition, forwarded to ``train_test_split``
            as ``test_size`` (a float in 0-1 is a fraction, an int is a count).
        detal: when True, print the sizes of the resulting partitions.

    The random state comes from the notebook-level ``seed`` variable.

    Returns:
        (data_1, data_2, label_1, label_2)
    """
    parts = train_test_split(data, label, test_size=ratio, random_state=seed)
    data_1, data_2, label_1, label_2 = parts
    if detal == True:
        print("data_1_len: ", len(data_1),"label_1_len: ", len(label_1),
              "\ndata_2_len: ", len(data_2),"lebal_2_len: ", len(label_2))
    return data_1, data_2, label_1, label_2

# 预训练词向量模型
def build_word2vec(filename, w2v_model=None, update=True):
    """Load, build, or update the word2vec model used for the embedding layer.

    Args:
        filename: path of the workbook whose first sheet holds the corpus
            (first column, header row skipped).
        w2v_model: when None a new model is trained; any other value means
            "use the saved model". NOTE(review): the saved model is always
            loaded from ``pathDir + model_name`` (notebook globals); the
            object passed in is not itself used.
        update: with a non-None ``w2v_model``, False loads the saved model
            unchanged and True extends its vocabulary via ``update_w2v``.

    Returns:
        The resulting Word2Vec model.
    """
    # Plain load: no corpus needed.
    if w2v_model is not None and update is False:
        print('正在载入词向量模型...')
        return Word2Vec.load(pathDir+model_name)

    # Building or updating requires the full corpus.
    sheet = xlrd.open_workbook(filename).sheet_by_index(0)
    total_data = [sheet.cell(row, 0).value for row in range(1, sheet.nrows)]
    # Segment and drop stop words.
    tokenized = [
        [word for word in jieba.lcut(pattern) if word not in stopwords]
        for pattern in total_data
    ]

    if w2v_model is None:
        print('正在构建新的词向量模型...')
        # Honour a notebook-level ``w2v_dims`` if one is defined, else use 200.
        # The previous try/except probe assigned to ``w2v_dims`` inside this
        # function, which made the name local and therefore always fell back
        # to 200.
        dims = globals().get('w2v_dims', 200)
        w2v_model = Word2Vec(tokenized,sg=1,size=dims,window=5,min_count=1,negative=1,sample=0.001,hs=1)
        # NOTE(review): the constructor is given the corpus and so presumably
        # trains already; this call adds five further epochs. Kept as-is.
        w2v_model.train(tokenized, total_examples=len(tokenized), epochs=5)
    else:
        print('正在载入词向量模型...')
        w2v_model = Word2Vec.load(pathDir+model_name)
        print('正在更新词向量模型...')
        w2v_model = update_w2v(w2v_model, tokenized)

    return w2v_model

# 更新词向量模型（仅会对出现新词了的文本进行更新）
def update_w2v(w2v_model, tokenized):
    """Extend the model's vocabulary with sentences that contain unseen words.

    Args:
        w2v_model: model exposing ``wv.vocab`` and ``build_vocab``.
        tokenized: list of token lists.

    Returns:
        The same model object. Its vocabulary is extended when unseen words
        were found; vectors are not retrained here (the ``train`` call was
        disabled by the author).
    """
    # Snapshot of the known words; a set makes each lookup O(1) instead of
    # scanning a list for every token.
    known_words = set(w2v_model.wv.vocab)

    new_sts = []
    new_wd = []
    for sts in tokenized:
        for wd in sts:
            if wd not in known_words:
                # Record only the first unseen word of a sentence, and the
                # sentence itself once.
                new_wd.append(wd)
                new_sts.append(sts)
                break

    if new_sts:
        print('发现新词：', new_wd, '\n已对词向量模型进行更新！')
        w2v_model.build_vocab(new_sts, update=True)
        # w2v_model.train(new_sts,total_examples=w2v_model.corpus_count,epochs=1)
    else:
        print('未发现新词，没有更新模型！')
    return w2v_model

# 对数据集中的每个词，按照词-向量的索引进行编码，若出现了生词，则填0
def wd_encode(wd2idx, tokenized):
    """Map every token to its integer index in the word-vector vocabulary.

    Args:
        wd2idx: dict from word to integer index (0 is reserved for
            padding / unknown words).
        tokenized: list of token lists.

    Returns:
        A list of integer id lists with the same shape as ``tokenized``.
        Words missing from ``wd2idx`` are reported on stdout and encoded as 0.
    """
    identify = []
    for sts in tokenized:
        id_sts = []
        for wd in sts:
            try:
                id_sts.append(wd2idx[wd])
            except KeyError:
                print('“'+wd+'”不在词向量模型中')
                # Integer 0, not the string '0': a string here turns the whole
                # padded batch into a string array.
                id_sts.append(0)
        identify.append(id_sts)
    return identify

# 构建词向量的单词索引和embedding层矩阵
def build_wd2idx_embedMatrix(w2vModel):
    """Derive the word->index map and the embedding matrix from a word2vec model.

    Index 0 is reserved (key ``"_stopWord"``) for stop words, unknown words and
    padding; its matrix row stays all zeros. Vocabulary words are numbered
    from 1 in the iteration order of ``w2vModel.wv.vocab``.

    Returns:
        (word2idx, embedMatrix) where ``embedMatrix`` has
        ``len(vocab) + 1`` rows and ``w2vModel.vector_size`` columns.
    """
    keyed_vectors = w2vModel.wv
    words = [w for w, _ in keyed_vectors.vocab.items()]
    vectors = [keyed_vectors[w] for w in words]

    embedMatrix = np.zeros((len(words) + 1, w2vModel.vector_size))
    word2idx = {"_stopWord": 0}
    for row, (word, vector) in enumerate(zip(words, vectors), start=1):
        word2idx[word] = row
        embedMatrix[row] = vector
    return word2idx, embedMatrix

# 将深度学习过程中训练的词向量矩阵更新到词向量模型中
def embed2w2v(w2vModel, embedMatrix):
    """Write rows of an embedding matrix back into the word2vec model.

    Args:
        w2vModel: model exposing ``wv.vocab`` and item assignment on ``wv``.
        embedMatrix: matrix of vectors. When it has exactly one more row than
            the vocabulary (the layout produced by build_wd2idx_embedMatrix,
            where row 0 is the padding/unknown row), word ``k`` is read from
            row ``k + 1``. Otherwise rows are used from index 0.

    Returns:
        The same model object, with its vectors overwritten.
    """
    print('正在根据学习过程中训练的词向量矩阵对词向量模型进行更新...')
    words = [w for w, _ in w2vModel.wv.vocab.items()]
    # Skip the reserved padding row; starting at 0 would give every word the
    # vector of its predecessor.
    first_row = 1 if len(embedMatrix) == len(words) + 1 else 0
    for row, w in enumerate(words, start=first_row):
        w2vModel.wv[w] = embedMatrix[row]
    print('词向量模型更新完成')
    return w2vModel

# 设计网络模型：LSTM+六个卷积核(各10个特征)+LeakyReLU激活函数
def build_model(max_len, num_classes):
    """Build and compile the text classifier.

    Architecture as constructed below: input of ``max_len`` token ids ->
    frozen Embedding initialised from the notebook-level ``embedMatrix`` ->
    LSTM returning the full sequence -> six parallel Conv1D branches
    (10 filters each, kernel sizes 1-5 and ``max_len``), each followed by
    LeakyReLU and global max pooling -> concatenation -> Dropout -> softmax
    Dense layer with ``num_classes`` outputs.

    Args:
        max_len: number of token ids per input sequence.
        num_classes: number of output classes.

    Returns:
        The compiled Keras ``Model`` (Adam, lr=0.01, categorical cross-entropy,
        accuracy metric). The summary and a timestamp are printed as a side
        effect.

    NOTE(review): kernel sizes 1-5 use 'valid' padding, so a ``max_len``
    smaller than 5 presumably cannot be convolved - confirm before training on
    very short corpora. predict_data reads layers by position, so changing the
    number or order of layers here affects it.
    """
    # Input
    comment_seq = Input(shape=[max_len], name='x_seq')
    # Embedding: pretrained word vectors, not updated during training
    emb_comment = Embedding(len(embedMatrix), len(embedMatrix[0]), weights=[embedMatrix],\
                            input_length=max_len, trainable=False)(comment_seq)
    # LSTM
    LSTM_1 = LSTM(units=32, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)(emb_comment)
    # LSTM_1 = LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(emb_comment)
    # model.add(LSTM(200, return_sequences=True))
    # LSTM_2 = LSTM(200, return_sequences=False)(LSTM_1)
    # conv: one branch per kernel size, all reading the LSTM output
    convs = []
    kernel_size = [1,2,3,4,5,max_len]
    for ksz in kernel_size:
        l_conv = Conv1D(filters=10, kernel_size=ksz, strides=1, padding='valid',\
                       use_bias=True,kernel_initializer='glorot_uniform',\
                       bias_initializer='zeros',kernel_regularizer=regularizers.l2(0.0001))(LSTM_1)
        l_conv = LeakyReLU(alpha=0.01)(l_conv)
        # Keep the strongest activation of each filter over the sequence
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
    # Join the pooled features of all branches into one vector
    merge = concatenate(convs, axis=1)
    dropout = Dropout(0.3)(merge)
    # output = Dense(120)(merge)
    # output = LeakyReLU(alpha=0.05)(output)
    output = Dense(num_classes,activation='softmax')(dropout)
    model = Model([comment_seq], output)

    # summarize the model
    model.summary()
    Adam = keras.optimizers.Adam(lr=0.01)
    model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['acc'])

    ThisTime()
    return model

def train_model(train_data, validate_data, train_label, validate_label, model, name):
    """Train one epoch at a time with checkpointing and step-wise LR decay.

    After every epoch the model is scored on the validation set. An improved
    validation loss saves a checkpoint to ``pathDir + name``. Once the
    patience counter reaches its threshold without improvement, the best
    checkpoint is reloaded and recompiled with half the learning rate.
    Training stops when the learning rate falls below 1/128 of its initial
    value.

    Args:
        train_data, train_label: training samples and one-hot labels.
        validate_data, validate_label: validation samples and one-hot labels.
        model: compiled Keras model to train.
        name: checkpoint file name, relative to ``pathDir``.

    Returns:
        The model held when the loop ends, i.e. the checkpoint reloaded by the
        final decay step.
    """
    start_time = time.time()
    # Start from +inf so the first epoch always writes a checkpoint. The old
    # sentinel of 10 left nothing to reload if the loss never dropped below 10.
    best_loss = float('inf')
    lr_pre = 0.01
    lr = lr_pre
    count = 1
    threshold = 5
    # Dynamic learning-rate decay
    while lr >= lr_pre/128:
        model.fit(train_data, train_label, epochs=1, verbose=1)
        # Validate after every epoch to detect over-fitting on the training set.
        loss, _ = model.evaluate(validate_data, validate_label, verbose=0)
        print(loss)

        if loss < best_loss:
            count = 1
            best_loss = loss
            model.save(pathDir+name)
        elif count >= threshold:
            # Roll back to the best checkpoint and continue with a smaller step.
            model = load_model(pathDir+name)
            print("decay the learning rate")
            lr = lr/2
            Adam = keras.optimizers.Adam(lr=lr)
            model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['acc'])
            count = 1
        else:
            count += 1

    elapsed_time = time.time() - start_time
    print ("processing time:", elapsed_time, "seconds")
    return model

def test_model(model, dictionary, test_data, test_label, detail=False):
    # 显示识别错误的数据，并计算准确率
    count = 0
    sum_loss = 0
    for i in range(len(test_data)):
        loss, accuracy = model.evaluate(np.column_stack(test_data[i-1]), np.column_stack(test_label[i-1]), batch_size=1, verbose=0)
        if accuracy == 0:
            for j in dictionary:
                if (j['id'] == test_data[i-1]).all():
                    tmp = j['label'].tolist()
                    print(j['sentence'], j['tokenized'], 'label: ', tmp.index(max(tmp)))
        else:
            count += 1
        sum_loss += loss
        loss = sum_loss/len(test_data)
    accuracy = count/len(test_data)
    if detail == True:
        print('Accuracy: %f' % (accuracy*100))
        print('Loss: %f' % (loss))
        ThisTime()
        
# 通过迭代遍历树状结构的节点
def find_son(count_lavel, label_index=None):
    """Recursively walk the label hierarchy and train one model per branching node.

    Reads the notebook-level worksheet ``sheet1`` (column 0: text, columns
    1..: one label column per hierarchy level) and the globals
    ``num_label_lavel``, ``seed``, ``devide_mode`` and ``date``.

    Args:
        count_lavel: depth of the current node; 0 is the artificial root.
        label_index: labels on the path from the root to the current node,
            starting with a placeholder 0 for the root. Ignored (re-created)
            when ``count_lavel`` is 0.

    For the current node the function gathers the rows that belong to it,
    takes the distinct labels found in column ``count_lavel + 1`` as its
    children, trains and saves a classifier when there is more than one child,
    and then recurses into every child until the last label level.

    Raises:
        TypeError: if any gathered row has an empty label cell.

    NOTE(review): the class count passed to ``preprocessing`` is the number of
    distinct labels, while the raw cell values are used as class ids; this
    presumably requires the labels under a node to be 0..k-1 - confirm.
    """
    tmp = []
    data = []
    label = []
    if count_lavel == 0:
        label_index = [0]# initialise the path with a placeholder for the root
    else:
        node = label_index[-1]
    for i in range(sheet1.nrows-1):
        # Row i+1 skips the header row. Collect text and next-level label.
        if count_lavel == 0:
            data.append(sheet1.cell(i+1, 0).value)
            label.append(sheet1.cell(i+1, 1).value)
        elif node == sheet1.cell(i+1, count_lavel).value:
            # The row matches this node's own label; also require every
            # ancestor label on the path to match.
            for j in range(len(label_index)-1):
                if sheet1.cell(i+1, j+1).value == label_index[j+1]:
                    flag = True
                else:
                    flag = False
                    break
            if flag is True:
                data.append(sheet1.cell(i+1, 0).value)
                label.append(sheet1.cell(i+1, count_lavel+1).value)
    tmp = list(set(label))# distinct child labels under this node
    if '' in tmp:
        raise TypeError('The label of some data is BLANK, which is illegal!(有数据未进行标记！)')
    # A model is only trained when the node has more than one child.
    if count_lavel == 0:
        print('根节点下的子节点:', tmp)# the root has no node label
    else:    
        print('第%d层级%d节点下的子节点:' %(count_lavel, node), tmp)
    if len(tmp) > 1:
        print('正在训练该节点模型...')
        # Build the data splits and the model
        (dictionary, model, train_data, \
         validate_data, test_data, \
         train_label, validate_label, testllabel) = preprocessing(data, label, len(tmp), seed=seed, mode=devide_mode)
        # Name the model after its level (and node, except for the root)
        if count_lavel == 0:# the root has no node label
            model_name = date+'_第%d层级的模型' %count_lavel
        else:
            model_name = date+'_第%d层级第%d节点的模型' %(count_lavel, node)
        # Train; train_model saves the best checkpoint under model_name
        model = train_model(train_data, validate_data, \
                            train_label, validate_label, model, model_name)
        print('训练完成！模型名称: %s' %model_name)
    else:
        print('该节点无需训练模型')
        
    if count_lavel < num_label_lavel-1:# recurse down to the second-to-last level (the last level has no children)
        count_lavel += 1
        for node in tmp:
            print('遍历第%d层级%d节点下的子节点...' %(count_lavel, node))
            print('_'*100)
            # Each child gets its own copy of the path, extended by its label
            label_index_tmp = label_index.copy()
            label_index_tmp.append(node)
            find_son(count_lavel, label_index_tmp)
        count_lavel -= 1# all children of this node visited; step back up one level
        print('返回至第%d层' %count_lavel)    
        
def predict_data(count_lavel, data, threshold, last_label=None, model_prefix='20190620'):
    """Classify one sentence by descending the tree of per-node models.

    Args:
        count_lavel: current depth; call with 0 for the root.
        data: at the root, the raw sentence (a string); on recursive calls,
            the already tokenised sentence wrapped in a one-element list.
        threshold: minimum softmax probability for a prediction to count;
            below it the result is 'nomatch'.
        last_label: label chosen at the parent level; selects the node model.
        model_prefix: date prefix of the model names to look up in the
            notebook-level ``model_name_lst``. Defaults to the previously
            hard-coded '20190620'.

    Returns:
        The predicted label index at the first level that has a loaded model,
        'nomatch' if its confidence is below ``threshold``, or 'Wrong!' if no
        model was found down to the last level.

    NOTE(review): when a model is found, the deeper levels are still
    evaluated but their result is discarded, as in the original code.
    Confirm whether the deepest label should be returned instead.
    """
    if count_lavel == 0:
        # Root: dedicated model name; segmentation and stop-word removal
        # happen only here, once per sentence.
        model_name = model_prefix + '_第0层级的模型'
        tokenized = [[word for word in jieba.lcut(data) if word not in stopwords]]
    else:
        model_name = model_prefix + '_第%d层级第%d节点的模型' %(count_lavel, last_label)
        tokenized = data

    # Look the model up by name among the loaded models. Models must follow
    # the naming convention used by find_son. An empty list yields None
    # instead of an unbound variable.
    model = next((entry['model'] for entry in model_name_lst
                  if entry['name'] == model_name), None)

    if model is None:
        # No model for this node (it had a single child during training):
        # fall through to the next level, assuming child label 0.
        if count_lavel < num_label_lavel-1:
            return predict_data(count_lavel + 1, tokenized, threshold,
                                last_label=0, model_prefix=model_prefix)
        return 'Wrong!'

    max_len = model.layers[1].output_shape[1]# sequence length the model was built for

    # Map tokens to vocabulary indices.
    identify = wd_encode(wd2idx, tokenized)
    # The input may not exceed the model's sequence length.
    if len(identify[0]) > max_len:
        print('输入过长(超出%d)，仅截取前一部分' %max_len)
        identify[0] = identify[0][0:max_len]
    padded_data, _ = padding_sts(identify, max_len=max_len)

    scores = model.predict(padded_data, batch_size=1, verbose=0, steps=None).tolist()
    best_score = max(scores[0])
    # A confidence below the threshold counts as nomatch.
    if best_score < threshold:
        return 'nomatch'

    label = scores[0].index(best_score)
    # Stop descending at the last level.
    if count_lavel < num_label_lavel-1:
        # Uses the caller's threshold rather than the global one.
        predict_data(count_lavel + 1, tokenized,
                     threshold=threshold, last_label=label, model_prefix=model_prefix)
    return label
        

# 显示每次运行片段的时间
def ThisTime():
    """Print the current local time, used to timestamp notebook output."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print('This time is: ', stamp)

In [3]:
# Notebook-wide settings. Names assigned at cell level are already module
# globals, so no `global` statement is needed here.
date = time.strftime("%Y%m%d")# used to save and load the models of the current day
seed = random.randint(1,10000)# fresh seed per run so every split differs
max_len = 50# initial maximum number of tokens per sentence
devide_mode = 0# 0: training mode; 1: prediction mode
predict_threshold = 0.8# confidence below which a prediction is 'nomatch'
# w2v_dims = # word-vector size; 200 is used when left undefined
# model_name = # name of an existing word2vec model to load, if any

# Load the custom jieba dictionary
pathDir = "C:/Users/admin/催收业务-意图识别/"
jieba.load_userdict(pathDir+"催收文本-newdic.txt")
print('成功加载自定义分词词库')

# Load the stop words; the context manager closes the file afterwards
with open(pathDir+"催收-停用词.txt",encoding='gb18030',errors='ignore') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# A new word2vec model is built by default. To continue from an existing
# model, pass it to build_word2vec; with update=False it is loaded unchanged.
print('构建词向量模型...')
w2v_model = build_word2vec(filename=pathDir+'语音转文本_全业务数据集.xlsx', w2v_model=None, update=False)
w2v_model.save(pathDir+date+'_w2v_model')# persist the model
print('词向量模型保存成功：%s_w2v_model' %date)

# Convert the word2vec model into an index map and an embedding matrix
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.956 seconds.
Prefix dict has been built succesfully.


成功加载自定义分词词库
构建词向量模型...
正在构建新的词向量模型...
词向量模型保存成功：20190620_w2v_model


In [4]:
# Train the whole model tree from the labelled workbook.
# find_son reads `sheet1` and `num_label_lavel` as notebook globals.
readbook = xlrd.open_workbook(pathDir+'催收-还款意图模糊.xlsx')
sheet1 = readbook.sheet_by_index(0)# first sheet holds the text and labels
num_label_lavel = sheet1.ncols-1# layout: one text column, every other column is one label level
# num_label_lavel = 2
print('模型的层级数为:', num_label_lavel, '\n构造模型树...')
print('_'*100)
count_lavel = 0# an artificial level 0 (the root) is placed above the label levels
find_son(count_lavel, label_index=None)# trains every node that has more than one child
print('模型训练完毕！')

模型的层级数为: 3 
构造模型树...
____________________________________________________________________________________________________
根节点下的子节点: [0.0]
该节点无需训练模型
遍历第1层级0节点下的子节点...
____________________________________________________________________________________________________
第1层级0节点下的子节点: [1.0]
该节点无需训练模型
遍历第2层级1节点下的子节点...
____________________________________________________________________________________________________
第2层级1节点下的子节点: [0.0, 1.0, 2.0, 3.0]
正在训练该节点模型...
data_1_len:  6777 label_1_len:  6777 
data_2_len:  754 lebal_2_len:  754
data_1_len:  5272 label_1_len:  5272 
data_2_len:  1505 lebal_2_len:  1505
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_s

0.3333304361971367
Epoch 1/1
0.319204714910105
Epoch 1/1
0.3378210580001143
Epoch 1/1
0.33146945780405
Epoch 1/1
0.3720468205229357
decay the learning rate
Epoch 1/1
0.27038205088924033
Epoch 1/1
0.2898473550691529
Epoch 1/1
0.3456499733880597
Epoch 1/1
0.3481209622770153
Epoch 1/1
0.3341311381571489
decay the learning rate
Epoch 1/1
0.2843172748620061
Epoch 1/1
0.29061816494407955
Epoch 1/1
0.3111748055848898
Epoch 1/1
0.3324548059022079
Epoch 1/1
0.3367140234623411
decay the learning rate
Epoch 1/1
0.27541022710205704
Epoch 1/1
0.2856347232029356
Epoch 1/1
0.2848228277831242
Epoch 1/1
0.2823940168483188
Epoch 1/1
0.3030237259852159
decay the learning rate
Epoch 1/1
0.26984338063143926
Epoch 1/1
0.27237812317018484
Epoch 1/1
0.28444120406472084
Epoch 1/1
0.2805946025038904
Epoch 1/1
0.27915735190996127
decay the learning rate
Epoch 1/1
0.2632755761279352
Epoch 1/1
0.2683127221283293
Epoch 1/1
0.271456012912391
Epoch 1/1
0.27641746235779174
Epoch 1/1
0.27743245088137114
decay the learn

In [5]:
# Load one trained node model and register it under its name so that
# predict_data can look it up in model_name_lst.
model_1 = load_model(pathDir+'20190620_第2层级第0节点的模型')
model_name_lst = [{'name': '20190620_第2层级第0节点的模型', 'model': model_1}]
num_model = len(model_name_lst)

In [13]:
# Load the word2vec model
w2v_model = Word2Vec.load(pathDir+'20190618_w2v_model')
# Load all neural network models
model_1 = load_model(pathDir+'20190618_第0层级的模型')
lmodel_1 = load_model(pathDir+'20190618_第1层级第0节点的模型')
gmodel_1 = load_model(pathDir+'20190618_第1层级第1节点的模型')

# Registry consulted by predict_data; each name must match the file naming
# convention exactly.
model_name_lst = [{'name': '20190618_第0层级的模型', 'model': model_1}, \
                  {'name': '20190618_第1层级第0节点的模型', 'model': lmodel_1}, \
                  {'name': '20190618_第1层级第1节点的模型', 'model': gmodel_1}]
num_model = len(model_name_lst)

In [8]:
# Second dimension of layer 1's output shape; predict_data uses this value as
# the model's maximum sentence length.
print(model_1.layers[1].output_shape[1])

39


In [8]:
# Predict labels for new text.
# Known issues noted by the author:
#   bug1: input longer than the longest training sentence raised an error
#         (predict_data truncates over-long input).
#   bug2: unseen words raise an error.
# num_label_lavel = 3
count_lavel = 0# an artificial level 0 (the root) is placed above the label levels
# Load the custom jieba dictionary
jieba.load_userdict("催收文本-newdic.txt")
print('成功加载自定义分词词库')

# Load the stop words; the context manager closes the file afterwards
with open("催收-停用词.txt",encoding='gb18030',errors='ignore') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Convert the word2vec model into an index map and an embedding matrix
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
readbook = xlrd.open_workbook(pathDir+'催收-还款意图模糊.xlsx')
sheet2 = readbook.sheet_by_index(0)

predict_threshold = 0.99

# Append one prediction per data row (row 0 is the header). Using `with`
# guarantees the output file is flushed and closed even if a prediction fails.
with open(date+'-predict.txt','a') as fobj_output:
    for i in range(sheet2.nrows-1):
        predict_set = sheet2.cell(i+1,0).value
        # predict_set = input('文本: ')
        output = predict_data(count_lavel, predict_set, threshold=predict_threshold)
        # print(predict_set, output, 'i:', i)
        fobj_output.write('\n'+str(output))

成功加载自定义分词词库


In [None]:
# Predict labels for new text (second sheet of the repayment-promise workbook).
# Known issues noted by the author:
#   bug1: input longer than the longest training sentence raised an error
#         (predict_data truncates over-long input).
#   bug2: unseen words raise an error.
# num_label_lavel = 3
count_lavel = 0# an artificial level 0 (the root) is placed above the label levels
# Load the custom jieba dictionary
jieba.load_userdict("催收文本-newdic.txt")
print('成功加载自定义分词词库')

# Load the stop words; the context manager closes the file afterwards
with open("催收-停用词.txt",encoding='gb18030',errors='ignore') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# Convert the word2vec model into an index map and an embedding matrix
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
readbook = xlrd.open_workbook(pathDir+'催收-承诺还款.xlsx')
sheet2 = readbook.sheet_by_index(1)

predict_threshold = 0.98

# Append one prediction per data row (row 0 is the header). Using `with`
# guarantees the output file is flushed and closed even if a prediction fails.
with open(date+'-predict.txt','a') as fobj_output:
    for i in range(sheet2.nrows-1):
        predict_set = sheet2.cell(i+1,0).value
        # predict_set = input('文本: ')
        output = predict_data(count_lavel, predict_set, threshold=predict_threshold)
        # print(predict_set, output, 'i:', i)
        fobj_output.write('\n'+str(output))

In [111]:
# Scratch experiment: fuzzy matching of a model name against a list of names.
import difflib

AA = ['20190617_第0层级的模型', '20190617_第1层级第0节点的模型', '20190617_第1层级第1节点的模型']
A = '20190617_第1层级第4节点的模型'
# Ask for at most one candidate whose similarity ratio is at least 0.7.
a = difflib.get_close_matches(A,AA,1, cutoff=0.7)

['20190617_第1层级第1节点的模型']

In [16]:
# Load the word2vec model and the legacy two-stage models:
# three first-level models plus three local and three global second-level models.
w2v_model = Word2Vec.load('20190617_w2v_model')
model_1 = load_model('20190610_一级模型_1')
model_2 = load_model('20190610_一级模型_2')
model_3 = load_model('20190610_一级模型_3')
# Fixed typo: this line previously called the undefined name `load_mo0del`.
lmodel_1 = load_model('20190610_二级（本地）模型_1')
lmodel_2 = load_model('20190610_二级（本地）模型_2')
lmodel_3 = load_model('20190610_二级（本地）模型_3')
gmodel_1 = load_model('20190610_二级（全局）模型_1')
gmodel_2 = load_model('20190610_二级（全局）模型_2')
gmodel_3 = load_model('20190610_二级（全局）模型_3')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [15]:
# Evaluate a model. Arguments: the model under test, the lookup dictionary,
# the test data and the test labels.
# NOTE(review): g_dictionary, gtstd and gtstl are not defined in any cell
# visible here; they presumably come from an earlier preprocessing run.
test_model(model_1, g_dictionary, gtstd, gtstl, detail=True)

微信 ['微信'] label:  7
这个我不懂了 ['这个', '我', '不', '懂了'] label:  6
为什么不提前跟我说啊我一下子哪有那么多钱 ['为什么', '不', '提前', '跟', '我', '说', '啊', '我', '一下子', '哪有', '那么', '多', '钱'] label:  15
你这是多少利息了 ['你', '这是', '多少', '利息', '了'] label:  17
处理中遇到问题 ['处理', '中', '遇到', '问题'] label:  7
我的到来人工聊妈吗他那卡里的吗我怎么登不了啊 ['我的', '到来', '人工', '聊妈', '吗', '他', '那', '卡里', '吗', '我', '怎么', '登', '不了', '啊'] label:  7
喂喂你听的见吧 ['喂', '喂', '你', '听的见', '吧'] label:  0
呃他妈的这什么意思哦 ['呃', '他妈的', '这', '什么', '意思', '哦'] label:  3
呃我想请问你一下你们可以听我说一下吗因为我的那个卡绑定的那个卡换掉了我的那个卡掉了现在我要重新绑定一张卡然后我把 ['呃', '我', '想', '请问', '你', '一下', '你们', '可以', '听', '我', '说', '一下', '吗', '因为', '我的', '那个', '卡', '绑定', '那个', '卡换', '掉', '了', '我的', '那个', '卡', '掉', '了', '现在', '我', '要', '重新', '绑定', '一张', '卡', '我', '把'] label:  5
喂你给我再说啊 ['喂', '你', '给我', '再说', '啊'] label:  6
我不知道为什么 ['我', '不知道', '为什么'] label:  7
唉你等一下我因为我刚好今天从工作的基础里好吧你你从讲好吧 ['唉', '你', '等一下', '我', '因为', '我', '刚好', '今天', '从', '工作', '基础', '里', '好吧', '你', '你', '从', '讲', '好吧'] label:  6
你他妈谁啊 ['你', '他', '妈', '谁', '啊'] label:  3
呃

In [20]:
# Read the post-segmentation stop words and the sentences to predict from the
# sample workbook.
readbook = xlrd.open_workbook('催收-用户输入-样本及标签-本地全局-类目.xlsx')
sheet2 = readbook.sheet_by_name('pos_stopwords')
sheet3 = readbook.sheet_by_name('predict')
# Every row of the stop-word sheet is used (no header row is skipped).
pos_stopwords = [sheet2.cell(row, 0).value for row in range(sheet2.nrows)]
# The first row of the predict sheet is skipped.
predict_set = [sheet3.cell(row, 0).value for row in range(1, sheet3.nrows)]
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
# Load the custom jieba dictionary
jieba.load_userdict("催收文本-newdic.txt")

In [17]:
# Read the post-segmentation stop words from the sample workbook.
readbook = xlrd.open_workbook('催收-用户输入-样本及标签-本地全局-类目.xlsx')
sheet2 = readbook.sheet_by_name('pos_stopwords')
# Every row of the sheet is used (no header row is skipped).
pos_stopwords = [sheet2.cell(row, 0).value for row in range(sheet2.nrows)]
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)
# Load the custom jieba dictionary
jieba.load_userdict("催收文本-newdic.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache


KeyboardInterrupt: 

In [16]:
# Nearest neighbours of a word in the embedding space. The query goes through
# the model's KeyedVectors (`wv`); calling most_similar on the model itself is
# deprecated in gensim.
print(w2v_model.wv.most_similar(positive=['可能']))

[('估计', 0.7111688852310181), ('倒把', 0.6890275478363037), ('不照', 0.6791861057281494), ('活转', 0.6752203702926636), ('甚至', 0.6734603643417358), ('要明', 0.6718773245811462), ('微软', 0.6554404497146606), ('留款', 0.6554221510887146), ('越过', 0.6531094312667847), ('少扣', 0.6469331979751587)]


  """Entry point for launching an IPython kernel.


In [2]:
# Scratch check: how jieba segments a short phrase once the custom dictionary
# is loaded.
jieba.load_userdict("催收文本-newdic.txt")
sts1 = '好嘞'
sts1_cut = jieba.lcut(sts1)
print(sts1,sts1_cut)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\biocloo\AppData\Local\Temp\jieba.cache
Loading model cost 0.945 seconds.
Prefix dict has been built succesfully.


好嘞 ['好', '嘞']
