In [1]:
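# Two-stage classification: first classify the user input by intent
# (willing to repay, ambiguous intent, deferred repayment, other), then classify
# further within each intent.
# In theory the intent classification is highly accurate, and fixing the intent
# should also improve the accuracy of the second-stage classification.
# Features are extracted with a multi-kernel CNN (note: the model cell in this
# notebook also runs the embeddings through an LSTM before the convolutions).
# The pooling size is still undecided; it needs a comparison on more samples.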
import keras
import xlrd, xlwt
from xlutils import copy
import os
import random
import gensim
from gensim.models import Word2Vec
from keras.models import Model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras import regularizers
from keras.layers.embeddings import Embedding
import jieba
import numpy as np
import time
from keras.utils.np_utils import to_categorical
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, LSTM, SpatialDropout1D, LeakyReLU, Input, concatenate
import sklearn
from sklearn.model_selection import train_test_split
from keras.models import load_model
import datetime

Using TensorFlow backend.


In [2]:
# 对于两次分类问题，需要对样本打两个标签（意愿标签和具体标签）
# 标签提取
def label_preprocessing(label, num_classes):
    """One-hot encode a sequence of integer class ids.

    Args:
        label: Sequence of class ids; it is converted with ``np.array`` first.
        num_classes: Width of the one-hot encoding handed to ``to_categorical``.

    Returns:
        The result of ``to_categorical(label, num_classes=num_classes)``.
    """
    # Class ids as documented by the author (global scheme):
    #   0: willing to repay; 1: ambiguous intent; 2: deferred repayment; 3: other
    return to_categorical(np.array(label), num_classes=num_classes)

# 数据集预处理(jieba分词，生成词袋，计算字典大小)
def data_preprocessing(data):
    """Tokenize raw sentences with jieba, filtering stop words before and after.

    Reads the module-level lists ``pre_stopwords`` and ``pos_stopwords``.

    Args:
        data: Iterable of sentences (strings).

    Returns:
        A list with one token list per input sentence.
    """
    tokenized = []
    for pattern in data:
        # Pre-segmentation filter. Iterating a str yields single characters, so
        # only one-character entries of pre_stopwords can ever match here.
        # NOTE(review): multi-character entries in pre_stopwords are therefore
        # never removed -- confirm whether substring removal was intended.
        kept = "".join(ch for ch in pattern if ch not in pre_stopwords)
        # Post-segmentation filter: drop whole tokens listed in pos_stopwords.
        tokenized.append([tok for tok in jieba.lcut(kept) if tok not in pos_stopwords])
    return tokenized

# 在预测时，寻找包含新词的句子
def find_new_wd_of_sts(data, new_wd_of_sts_idx):
    """Append selected sentences to the file '包含新词的句子.txt'.

    Args:
        data: Indexable collection of sentences (strings).
        new_wd_of_sts_idx: Iterable of indices into ``data``; each selected
            sentence is written on its own line (preceded by a newline).

    Raises:
        Whatever ``data[i]`` or the file operations raise; the file is closed
        in every case because it is managed by a ``with`` block.
    """
    # Append mode: earlier runs' sentences are kept.
    with open('包含新词的句子.txt', 'a') as fobj:
        for i in new_wd_of_sts_idx:
            fobj.write('\n' + data[i])
    print('save the new word of setense')

# 短句补零，使训练集文本长度相同
def padding_sts(identify, max_len=None):
    """Right-pad (and, if needed, truncate) id sequences to a common length.

    Args:
        identify: List of sequences of word ids.
        max_len: Target length. When ``None`` the length of the longest
            sequence in ``identify`` is used (0 for an empty input).

    Returns:
        ``(padded_id, max_len)`` where ``padded_id`` is a NumPy array with one
        row per sequence, padded with 0 on the right, and ``max_len`` is the
        length actually used.
    """
    if max_len is None:
        max_len = max((len(sts) for sts in identify), default=0)
    # Sequences longer than max_len are cut to max_len; without this, an
    # explicit max_len smaller than the longest sequence produced rows of
    # unequal length (a ragged array) instead of a rectangular matrix.
    padded_id = np.array(
        [list(sts[:max_len]) + [0] * (max_len - len(sts)) for sts in identify]
    )
    return padded_id, max_len

# 数据集切分
def split_data_set(data, label, ratio, seed=None, detal=False):
    """Split samples and labels into two parts with ``train_test_split``.

    Args:
        data: Sample features to split.
        label: Labels aligned with ``data``.
        ratio: Passed as ``test_size``: a float in (0, 1) is the share of
            samples placed in the second part, an int is an absolute count.
        seed: Passed as ``random_state``. ``None`` gives a different split on
            every call; any int (including 0) makes the split reproducible.
        detal: When equal to ``True``, print the sizes of the resulting parts.

    Returns:
        ``(data_1, data_2, label_1, label_2)`` in that order.
    """
    parts = train_test_split(data, label, test_size=ratio, random_state=seed)
    data_1, data_2, label_1, label_2 = parts

    # Kept as an equality test so non-bool arguments behave exactly as before.
    if detal == True:
        # The label text (including its spelling) is printed unchanged.
        print("data_1_len: ", len(data_1), "label_1_len: ", len(label_1),
              "\ndata_2_len: ", len(data_2), "lebal_2_len: ", len(label_2))
    return data_1, data_2, label_1, label_2

# 预训练词向量模型
def build_word2vec(data, w2v_dims, w2v_model=None, update=True):
    """Tokenize ``data`` and build (or update) a word2vec model from it.

    Args:
        data: Iterable of raw sentences; tokenized with ``data_preprocessing``
            (pre-stop-word filter, jieba segmentation, post-stop-word filter).
        w2v_dims: Vector size for a newly created model.
        w2v_model: Existing model to reuse. When ``None`` a new model is
            created and trained.
        update: Only used when ``w2v_model`` is given: if it is ``True`` the
            model's vocabulary is extended through ``update_w2v``; otherwise
            the model is returned untouched.

    Returns:
        ``(w2v_model, max_len)`` where ``max_len`` is the number of tokens in
        the longest tokenized sentence (0 when ``data`` is empty).
    """
    # Same tokenization as the training data goes through, so the vocabulary
    # and the encoded samples stay consistent.
    tokenized = data_preprocessing(data)
    max_len = max((len(tokens) for tokens in tokenized), default=0)

    if w2v_model is None:
        # No model supplied: create a new one.
        print('正在生成新的词向量模型...')
        w2v_model = Word2Vec(tokenized, sg=1, size=w2v_dims, window=5, min_count=1,
                             negative=1, sample=0.001, hs=1)
        # NOTE(review): gensim's constructor already trains when it is given
        # sentences, so this call appears to run 5 further epochs -- confirm
        # that the additional training is intended.
        w2v_model.train(tokenized, total_examples=len(tokenized), epochs=5)
    elif update is True:
        # A model was supplied: add any words it has not seen yet.
        print('正在更新词向量...')
        w2v_model = update_w2v(w2v_model, tokenized)
    return w2v_model, max_len

# 更新词向量模型
def update_w2v(w2v_model, tokenized):
    """Extend a word2vec model's vocabulary with words it does not know yet.

    Args:
        w2v_model: Model exposing ``wv.vocab`` and ``build_vocab``.
        tokenized: List of token lists.

    Returns:
        The same ``w2v_model`` object. ``build_vocab(..., update=True)`` is
        called with every sentence that contains at least one unknown word;
        nothing is called when no such sentence exists.
    """
    # A set gives constant-time membership tests (the vocabulary can be large).
    known_words = set(w2v_model.wv.vocab.keys())

    new_sts = []  # sentences containing at least one unknown word
    new_wd = []   # the first unknown word of each such sentence (for the log)
    for sts in tokenized:
        for wd in sts:
            if wd not in known_words:
                new_wd.append(wd)
                new_sts.append(sts)
                break  # one hit is enough to select the sentence

    if new_sts:
        print('发现新词：', new_wd, '\n已对词向量模型进行更新！')
        w2v_model.build_vocab(new_sts, update=True)
        # Training on the new sentences is left disabled, as in the original:
        # w2v_model.train(new_sts,total_examples=w2v_model.corpus_count,epochs=1)
    else:
        print('未发现新词')
    return w2v_model

# 对数据集中的每个词，按照词向量索引进行编码
def wd_encode(wd2idx, tokenized):
    """Replace every token by its index from ``wd2idx``.

    Args:
        wd2idx: Mapping from word to integer index.
        tokenized: List of token lists.

    Returns:
        A list of index lists with the same nesting as ``tokenized``.

    Raises:
        KeyError: If a token is missing from ``wd2idx``.
    """
    return [[wd2idx[wd] for wd in sts] for sts in tokenized]

# 构建词向量的单词索引和embedding层矩阵
def build_wd2idx_embedMatrix(w2vModel):
    """Build the word -> row index mapping and the embedding matrix.

    Args:
        w2vModel: Model exposing ``wv.vocab``, ``wv[word]`` and ``vector_size``.

    Returns:
        ``(word2idx, embedMatrix)``. Index/row 0 belongs to the placeholder key
        ``"_stopWord"`` and stays all zeros; the vocabulary words occupy rows
        1..N in the iteration order of ``wv.vocab``.
    """
    words = [w for w, _ in w2vModel.wv.vocab.items()]
    embedMatrix = np.zeros((len(words) + 1, w2vModel.vector_size))

    # Index 0 is reserved (used for filtered stop words / padding ids).
    word2idx = {"_stopWord": 0}
    for row, word in enumerate(words, start=1):
        word2idx[word] = row
        embedMatrix[row] = w2vModel.wv[word]
    return word2idx, embedMatrix

# 将深度学习过程中训练的词向量矩阵更新到词向量模型中
def embed2w2v(w2vModel, embedMatrix):
    """Copy rows of a (trained) embedding matrix back into the word2vec model.

    Args:
        w2vModel: Model exposing ``wv.vocab`` and item assignment on ``wv``.
        embedMatrix: Matrix of word vectors. In the layout produced by
            ``build_wd2idx_embedMatrix`` row 0 is the reserved ``"_stopWord"``
            row and word ``k`` (in ``wv.vocab`` iteration order) is row ``k+1``.

    Returns:
        The same ``w2vModel`` object with its vectors overwritten.
    """
    print('正在根据学习过程中训练的词向量矩阵对词向量模型进行更新...')
    vocab = w2vModel.wv.vocab
    # Skip the reserved row 0 when the matrix carries it (one row more than
    # the vocabulary). Reading from row 0 gave every word the vector of the
    # previous word. A matrix without the extra row is read from row 0 as before.
    offset = 1 if len(embedMatrix) == len(vocab) + 1 else 0
    for row, (w, _) in enumerate(vocab.items(), start=offset):
        w2vModel.wv[w] = embedMatrix[row]
    print('词向量模型更新完成')
    return w2vModel

# 显示每次运行片段的时间
def ThisTime():
    """Print the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print('This time is: ', stamp)

In [8]:
# Load the training samples, their labels and both stop-word lists.
date = time.strftime("%Y%m%d")
w2v_dims = 200

readbook = xlrd.open_workbook('催收-用户输入-样本及标签.xlsx')
sheet1 = readbook.sheet_by_name('data & labels')
sheet2 = readbook.sheet_by_name('pre_stopwords')
sheet3 = readbook.sheet_by_name('pos_stopwords')

# Row 0 of 'data & labels' is skipped; column 0 holds the sentence and
# column 1 the first-stage label.
sample_rows = range(1, sheet1.nrows)
data_set = [sheet1.cell(row, 0).value for row in sample_rows]
pre_label = [sheet1.cell(row, 1).value for row in sample_rows]
# Second-stage labels (column 2) are currently not loaded:
#     pos_label.append(sheet1.cell(i+1,2).value)
pos_label = []

# The stop-word sheets are read from their first row onwards.
pre_stopwords = [sheet2.cell(row, 0).value for row in range(sheet2.nrows)]
pos_stopwords = [sheet3.cell(row, 0).value for row in range(sheet3.nrows)]

# Load the custom jieba dictionary.
jieba.load_userdict("催收文本-newdic.txt")

# Corpus used to build the word2vec vocabulary (row 0 of the sheet is skipped).
total_set = xlrd.open_workbook('语音转文本_全业务数据集.xlsx')
total_set_sheet1 = total_set.sheet_by_name('Sheet1')
total_data = [total_set_sheet1.cell(row, 0).value
              for row in range(1, total_set_sheet1.nrows)]

# To continue from an existing word2vec model instead of training a new one,
# load it here and pass it to build_word2vec as w2v_model=...:
# w2v_model = Word2Vec.load(date+'_w2v_model')
# w2v_model = Word2Vec.load(date+'_w2v_model_pos_embed')
# w2v_model = Word2Vec.load('20190528_w2v_model')

# A new word2vec model is built by default. When an existing model is reused
# it may meet unseen words, so it has to be updated before use (update=True).
w2v_model, max_len = build_word2vec(total_data, w2v_dims, update=True)
w2v_model.save(date+'_w2v_model')  # persist the model
print('词向量模型保存成功：%s_w2v_model' %date)

# Turn the word2vec model into a word -> index mapping and an embedding matrix.
wd2idx, embedMatrix = build_wd2idx_embedMatrix(w2v_model)

num_pre_classes = 20
num_pos_classes = 5
# num_pos_classes_1 = 2
# num_pos_classes_2 = 3
# num_pos_classes_3 = 2
# num_pos_classes_4 = 5
tokenized = data_preprocessing(data_set)
identify = wd_encode(wd2idx, tokenized)
padded_data, _ = padding_sts(identify, max_len=max_len)
# print(pre_label)
pre_label = label_preprocessing(pre_label, num_pre_classes)
# pos_label = label_preprocessing(pos_label, num_pos_classes)

# One record per sample, used later to map an encoded row back to its text.
# Built by position: the previous `i-1` indexing started at index -1, which
# rotated the list so that record 0 described the last sample.
dictionary = [
    {'sentence': sentence, 'tokenized': tokens, 'id': padded_row, 'pre_label': onehot}
    for sentence, tokens, padded_row, onehot
    in zip(data_set, tokenized, padded_data, pre_label)
]
# print(dictionary)
# print(dictionary)
# One seed is drawn per run and shared by both splits below.
rand = random.randint(1, 10000)

# First split: hold out 10% of all samples as the validation set.
holdout_split = split_data_set(padded_data, pre_label, 0.1, seed=rand, detal=True)
train_data, validate_data, train_pre_label, validate_pre_label = holdout_split
# Disabled second-stage (pos_label) variant:
# train_data, test_data, train_pos_label, test_pos_label = \
# split_data_set(padded_data, pos_label, 0.1, seed=rand, detal=True)

# Second split: 22.2% of the remaining samples become the test set.
test_split = split_data_set(train_data, train_pre_label, 0.222, seed=rand, detal=True)
train_data, test_data, train_pre_label, test_pre_label = test_split
# Disabled second-stage (pos_label) variant:
# train_data_tmp, validate_data, train_pos_label, validate_pos_label = \
# split_data_set(train_data, train_pos_label, 0.222, seed=rand, detal=True)
# train_data = train_data_tmp

正在生成新的词向量模型...
词向量模型保存成功：20190606_w2v_model
data_1_len:  24939 label_1_len:  24939 
data_2_len:  2771 lebal_2_len:  2771
data_1_len:  19402 label_1_len:  19402 
data_2_len:  5537 lebal_2_len:  5537


In [9]:
# Network design: LSTM + six convolution kernel sizes (10 filters each) + LeakyReLU activation.
# Input: one padded sequence of max_len word ids per sample.
comment_seq = Input(shape=[max_len], name='x_seq')
# Embedding: initialised from embedMatrix and frozen (trainable=False).
emb_comment = Embedding(len(embedMatrix), w2v_dims, weights=[embedMatrix], input_length=max_len, trainable=False)(comment_seq)
# LSTM: return_sequences=True so the convolutions below see every time step.
LSTM_1 = LSTM(units=32, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)(emb_comment)
# Earlier LSTM variants, kept for reference:
# LSTM_1 = LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(emb_comment)
# model.add(LSTM(200, return_sequences=True))
# LSTM_2 = LSTM(200, return_sequences=False)(LSTM_1)
# Convolution branches: kernel widths 1-5 plus one as wide as the padded
# sequence (max_len); each branch is LeakyReLU-activated and global-max-pooled.
convs = []
kernel_size = [1,2,3,4,5,max_len]
for ksz in kernel_size:
    l_conv = Conv1D(filters=10, kernel_size=ksz, strides=1, padding='valid',\
                   use_bias=True,kernel_initializer='glorot_uniform',\
                   bias_initializer='zeros',kernel_regularizer=regularizers.l2(0.0001))(LSTM_1)
    l_conv = LeakyReLU(alpha=0.01)(l_conv)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)
# Concatenate the pooled branch outputs, then apply dropout.
merge = concatenate(convs, axis=1)
dropout = Dropout(0.3)(merge)
# Disabled intermediate dense layer:
# output = Dense(120)(merge)
# output = LeakyReLU(alpha=0.05)(output)
# Softmax over the num_pre_classes first-stage classes.
output = Dense(num_pre_classes,activation='softmax')(dropout)
model = Model([comment_seq], output)

# summarize the model
model.summary()
# Note: the name `Adam` here is bound to an optimizer instance (lr=0.01).
Adam = keras.optimizers.Adam(lr=0.01)
model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['acc'])


ThisTime()

# Accuracy: 88.709677
# Loss: 0.558412
# This time is:  2019-05-08 14:35:49

# Accuracy: 81.012658
# Loss: 0.544165
# This time is:  2019-05-08 15:53:01

# Accuracy: 88.607595
# Loss: 0.307871
# This time is:  2019-05-08 16:20:03

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_seq (InputLayer)              (None, 42)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 42, 200)      2482000     x_seq[0][0]                      
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 42, 32)       29824       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 42, 10)       330         lstm_2[0][0]                     
__________________________________________________________________________________________________
conv1d_8 (

In [10]:
# Train the network, one epoch at a time, with a validation-driven
# learning-rate schedule.
# verbose: 0 = no log output, 1 = progress bar, 2 = one line per epoch.
start_time = time.time()
# Earlier single-call variant, kept for reference:
# for i in range(500):
# hist = model.fit(padded_data, categorical_labels,\
#                  epochs=100, verbose=1, validation_split=0.2,\
#                  steps_per_epoch=1, validation_steps=10, shuffle=True)

# Best validation loss so far. Starting from infinity guarantees that the
# first epoch always writes a checkpoint; with a finite start value (it was
# 10) a run whose losses never drop below it would later try to reload a
# checkpoint that was never written in this run.
loss_tmp = float('inf')
lr_pre = 0.01        # initial learning rate
lr = lr_pre
count = 1            # epochs since the last improvement (starts at 1)
threshold = 5        # halve the learning rate once count reaches this value

# Dynamic learning-rate decay: stop once lr has dropped below lr_pre/64.
while lr >= lr_pre/64:
    hist = model.fit(train_data, train_pre_label, epochs=1, verbose=1)
    # Disabled idea: validate on a random part of the validation set each
    # time, to avoid over-fitting to the validation set.
    #     _, v_d, _, v_l = split_data_set(validate_data, validate_label,\
    #                                     1, seed=None, detal=False)
    #     loss, _ = model.evaluate(v_d, v_l, verbose=0)
    loss = model.evaluate(validate_data, validate_pre_label, verbose=0)[0]
    print(loss)

    if loss < loss_tmp:
        # New best: remember it and checkpoint the model.
        count = 1
        loss_tmp = loss
        model.save('Keras_cuishou_model')
    elif count >= threshold:
        # No improvement for `threshold` epochs: go back to the best
        # checkpoint and continue from there with half the learning rate.
        model = load_model('Keras_cuishou_model')
        print("decay the learning rate")
        lr = lr/2
        Adam = keras.optimizers.Adam(lr=lr)
        model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['acc'])
        count = 1
    else:
        count += 1

# Disabled: write the embedding layer's weights back into the word2vec model.
# new_embedMatirx = model.layers[1].get_weights()[0]
# w2v_model = embed2w2v(w2v_model, new_embedMatirx)

# w2v_model.save(date+'_w2v_model_pos_embed')
# print('词向量模型保存成功：%s_w2v_model_pos_embed' %date)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

Epoch 1/1
0.3400517814493231
Epoch 1/1
0.3084949768867272
Epoch 1/1
0.29346667416729855
Epoch 1/1
0.31170352967540005
Epoch 1/1
0.2881868067242875
Epoch 1/1
0.3123054650448497
Epoch 1/1
0.3340319973365876
Epoch 1/1
0.3320388200802736
Epoch 1/1
0.3363660401878095
Epoch 1/1
0.32055108143249944
decay the learning rate
Epoch 1/1
0.29165140813931106
Epoch 1/1
0.2638187049120095
Epoch 1/1
0.2821246832878054
Epoch 1/1
0.2685039545043373
Epoch 1/1
0.2823377594927165
Epoch 1/1
0.2920018430545157
Epoch 1/1
0.2925400561543846
decay the learning rate
Epoch 1/1
0.26127907093876496
Epoch 1/1
0.2583389167147223
Epoch 1/1
0.2587756382907095
Epoch 1/1
0.26617774982522946
Epoch 1/1
0.26427355898024585
Epoch 1/1
0.2686422257292619
Epoch 1/1
0.269149413890935
decay the learning rate
Epoch 1/1
0.25955233284386864
Epoch 1/1
0.2642374156571949
Epoch 1/1
0.26251037295696666
Epoch 1/1
0.2617088542738274
Epoch 1/1
0.2621652636447137
decay the learning rate
Epoch 1/1
0.25651270345805227
Epoch 1/1
0.2563588044480

In [18]:
# Reload the checkpoint file 'Keras_cuishou_model' (the file the training loop
# saves to whenever the validation loss improves).
model = load_model('Keras_cuishou_model')

In [11]:
# Print every misclassified test sample, then report accuracy and mean loss.
n_test = len(test_data)
assert n_test > 0, 'test_data is empty'

count = 0       # number of correctly classified samples
sum_loss = 0
for sample, target in zip(test_data, test_pre_label):
    # Evaluate one sample at a time: both arrays get a leading batch axis.
    sample_loss, sample_acc = model.evaluate(sample[np.newaxis, :], target[np.newaxis, :],
                                             batch_size=1, verbose=0)
    sum_loss += sample_loss
    if sample_acc == 0:
        # Misclassified: look the encoded row up in `dictionary` to show the
        # original sentence, its tokens and the label index stored there.
        for j in dictionary:
            if (j['id'] == sample).all():
                tmp = j['pre_label'].tolist()
                print(j['sentence'], j['tokenized'], 'label: ', tmp.index(max(tmp)))
    else:
        count += 1

# Aggregate once, after the loop (the mean used to be recomputed on every
# iteration under the same name as the per-sample loss).
loss = sum_loss/n_test
accuracy = count/n_test

print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % (loss))
ThisTime()

在吃饭呢 ['在', '吃饭', '呢'] label:  1
呃我我现在等那卡要手机去维修好我就上那个app啊 ['呃', '我', '我', '现在', '等', '那卡要', '手机', '去', '维修', '好', '我', '上', '那个', 'app', '啊'] label:  0
我上月处理啦那个时间的话你们那里不是要输入那个什么交易密码我忘记交易密码啦然后输了五次就 ['我', '上', '月', '处理', '啦', '那个', '时间', '你们', '那里', '不是', '要', '输入', '那个', '什么', '交易', '密码', '我', '忘记', '交易', '密码', '啦', '输', '了', '五次'] label:  1
呃最近我那钱还没到账要下个星期一才到账还想开你们打电话来都是那种 ['呃', '最近', '我', '那', '钱', '还', '没到账', '要', '下个星期', '一才', '到账', '还', '想开', '你们', '打电话', '来', '都', '是', '那种'] label:  2
我现在手上拿钱不出来了 ['我', '现在', '手上', '拿', '钱', '不', '出来', '了'] label:  2
到底是还一百多还是七百多 ['到底', '是', '还', '一百多', '还是', '七百多'] label:  13
呃我忘记了怎样处理了吧 ['呃', '我', '忘记', '了', '怎样处理', '了', '吧'] label:  7
卡冻结了今天没法存钱了 ['卡', '冻结', '了', '今天', '没法', '存钱', '了'] label:  1
帮我这我我我都忘记了这个 ['帮', '我', '这', '我', '我', '我', '都', '忘记', '了', '这个'] label:  1
那目前 ['那', '目前'] label:  2
万一处理的要处理 ['万一', '处理', '要', '处理'] label:  0
语音提示 ['语音', '提示'] label:  8
我不记得密码了 ['我', '不记得', '密码', '了'] label:  7
往哪里 ['往', '哪里'] label:  7
正在筹得多少钱稍等一下吧 [

嗯昨天晚上还的时候上面显示是不能手动还款 ['嗯', '昨天晚上', '还', '时候', '上面', '显示', '是', '不能', '手动', '还款'] label:  7
我就欠了吗我开展你的头嘛 ['我', '欠', '了', '吗', '我', '开展', '你', '头', '嘛'] label:  0
是的暂未处理 ['是的', '暂未处理'] label:  1
在忙给我留言 ['在忙', '给我', '留言'] label:  1
处理过没成功 ['处理', '过', '没成功'] label:  0
然后我我稍后我在atm机上面方法吧 ['我', '我', '稍后', '我', '在', 'atm', '机', '上面', '方法', '吧'] label:  1
我的钱在里面多少分泌你改正吗 ['我的', '钱', '在', '里面', '多少', '分泌', '你', '改正', '吗'] label:  0
嗯今天是没钱我要晚一点才有网点处理里面已经处理可以吗 ['嗯', '今天', '是', '没钱', '我', '要', '晚一点', '才', '有', '网点', '处理', '里面', '已经', '处理', '可以吗'] label:  0
没能力还了 ['没能力', '还了'] label:  1
没有我是真没有但是我会尽快想办法我已经联系朋友了 ['没有', '我', '是', '没有', '但是', '我', '会', '尽快', '想办法', '我', '已经', '联系', '朋友', '了'] label:  1
我说我未处理呢 ['我', '说', '我', '未处理', '呢'] label:  1
我现在不方便说给下班乘坐 ['我', '现在', '不方便', '说', '给', '下班', '乘坐'] label:  0
那没有没有没有我找不到那个它上面怎么不显示了我所以说我就没搞我直接直接充到余额里面就行是吧 ['那', '没有', '没有', '没有', '我', '找', '不到', '那个', '它', '上面', '怎么', '不', '显示', '了', '我', '所以', '说', '我', '没', '搞', '我', '直接', '直接', '充', '到', '余额', '里面', '

In [13]:
# Save the current model under a file name prefixed with `date`
# (the YYYYMMDD string set in the data-loading cell).
model.save(date+'_Keras_cuishou_model')

In [13]:
# Show the sequence length currently used for padding and the model input.
print(max_len)

67


In [9]:
# Show the pre-segmentation stop-word list read from the 'pre_stopwords' sheet.
print(pre_stopwords)

['', '', '', '', '', '', '', '', '', '稍微', '稍', '很', '的', '就是', '这么', '那么', '过去', '过来', '可能', '一直', '比较', '有点']


In [24]:
# Load a previously saved model file by name.
# NOTE(review): no cell in this notebook writes a file with this name --
# presumably a manually renamed checkpoint; confirm it exists before running.
model = load_model('Keras_cuishou_model_20190514_96_leakyrelu')

In [24]:
# Load two saved word2vec models from disk; this rebinds `w2v_model`.
w2v_model = Word2Vec.load('20190522_w2v_model')
w2v_model2 = Word2Vec.load('20190522_w2v_model_pos_embed')

In [2]:
# Load the sentences to classify from the 'predict' sheet and encode them the
# same way as the training data. Needs `readbook`, `wd2idx` and `max_len` from
# the data-loading cell, so that cell has to run first.
sheet4 = readbook.sheet_by_name('predict')
# Row 0 of the sheet is skipped, as for the training sheet.
predict_set = [sheet4.cell(row, 0).value for row in range(1, sheet4.nrows)]

# data_preprocessing takes only the sentences and returns the token lists; the
# ids come from wd_encode (the old 4-value unpacking and the undefined `BOW`
# argument no longer match these functions).
predict_tokenized = data_preprocessing(predict_set)
# wd_encode raises KeyError for a token that is not in wd2idx.
predict_id = wd_encode(wd2idx, predict_tokenized)
predict_padded_data, _ = padding_sts(predict_id, max_len)
print(len(predict_padded_data))

NameError: name 'readbook' is not defined

In [15]:
# Load a saved word2vec model for the similarity query below (rebinds `w2v_model`).
w2v_model = Word2Vec.load('20190521_w2v_model')

In [16]:
# Show the words most similar to '可能'. The query goes through the model's
# KeyedVectors (`wv`); calling most_similar on the model itself is deprecated
# in gensim.
print(w2v_model.wv.most_similar(positive=['可能']))

[('估计', 0.7111688852310181), ('倒把', 0.6890275478363037), ('不照', 0.6791861057281494), ('活转', 0.6752203702926636), ('甚至', 0.6734603643417358), ('要明', 0.6718773245811462), ('微软', 0.6554404497146606), ('留款', 0.6554221510887146), ('越过', 0.6531094312667847), ('少扣', 0.6469331979751587)]


  """Entry point for launching an IPython kernel.


In [10]:
# Scratch experiment: add one synthetic sentence to the corpus and to the
# model's vocabulary, then train for one epoch on the whole corpus.
# Note: this appends to `tokenized`, so re-running the cell adds the sentence again.
new_sentence = ['没问题','没问题','没问题','没问题','没问题','啦啦','没问题','没问题','没问题','没问题','没问题','没问题','没问题','放心']
tokenized.append(new_sentence)
new_sts = [new_sentence]
w2v_model.build_vocab(new_sts, update=True)
# total_examples has to describe the corpus passed to train(). After the
# build_vocab call above, corpus_count refers to the one-sentence corpus given
# to it, so the length of `tokenized` is used instead.
w2v_model.train(tokenized, total_examples=len(tokenized), epochs=1)

(110178, 174672)

In [16]:
# Quick check of how jieba segments a phrase once the custom dictionary is loaded.
jieba.load_userdict("催收文本-newdic.txt")
sts1 = '晚一点点'
sts1_cut = jieba.lcut(sts1)
# Print the phrase next to its segmentation.
print(sts1,sts1_cut)

晚一点点 ['晚一点', '点']


In [81]:
# Scratch check: nesting a list (C holds A twice) versus flattening it (B).
A = ['1', '2', '666']
C = [A, A]
print(A, C)
# Flattened alternative that was tried: B = [x for j in C for x in j]
B = A * 2  # a new list holding A's elements twice
print(B)

['1', '2', '666'] [['1', '2', '666'], ['1', '2', '666']]
['1', '2', '666', '1', '2', '666']
