In [10]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc

import pickle

# Dialogue nodes handled by this notebook; each name is also a column of
# strategy_mat and a sub-directory under the saved-model folder.
model_list = [
    'IDClassifier',
    'IfKnowDebtor',
    'ConfirmLoan',
    'WillingToPay',
    'CutDebt',
    'Installment',
]

# Folder holding both input CSVs (the cleaned training set is written back here too).
path = '../../data/others/'
strategy_mat = pd.read_csv(path + 'strategy_mat.csv', encoding='utf8')
data = pd.read_csv(path + 'irrelevant_response_training_set.csv', encoding='utf8')

# Sanity check: (rows, columns) of the training set, then of the strategy matrix.
print(data.shape, strategy_mat.shape, sep='\n')

(4695, 2)
(14, 9)


In [11]:
# Display the strategy matrix loaded from strategy_mat.csv for visual inspection.
strategy_mat

Unnamed: 0,category,label,IDClassifier,IfKnowDebtor,ConfirmLoan,WillingToPay,CutDebt,Installment,strategy
0,讨价还价,100,1.0,1.0,0.0,1.0,1.0,1.0,跳到cutdebt节点的施压话术，随机挑一条level1级别的
1,讨价还价,100,1.0,1.0,1.0,1.0,0.0,0.0,对应节点施压更高level2及以上
2,说出目的,101,0.0,0.0,1.0,1.0,1.0,1.0,跳到ConfirmLoan的问题，选一条新的话术
3,确认数额,102,1.0,1.0,1.0,1.0,0.0,1.0,1.核资内容（你逾期XX、欠款多少、利息多少，滞纳金多少等等） 2.减免之后需要多少钱
4,确认数额,102,1.0,1.0,0.0,1.0,1.0,0.0,1.核资内容（你逾期XX、欠款多少、利息多少，滞纳金多少等等）
5,请求重复,103,0.0,0.0,0.0,0.0,0.0,0.0,重复刚才说的催收话术，换一条新的话术
6,请求等下打来,104,0.0,0.0,0.0,0.0,0.0,0.0,约定下次时间：过一会打给你
7,其它通讯方式,105,1.0,1.0,1.0,0.0,1.0,1.0,（记录通讯方式，例如微信号等，告知马上会有一个主管联系他）
8,模糊确认,106,1.0,1.0,0.0,0.0,0.0,0.0,对应节点施压，level1
9,回问身份,107,1.0,1.0,0.0,0.0,0.0,0.0,施压


# data cleaning

In [12]:
# The raw CSV uses Chinese headers: '文本' (text) and '类别' (category).
# index=str also converts the row index labels to strings.
column_names = {'文本': 'text', '类别': 'label'}
data = data.rename(index=str, columns=column_names)

# Show which categories exist and how many samples each one has.
label_column = data['label']
print(label_column.unique())
print(label_column.value_counts())

# Series of numeric label codes indexed by category name; drop_duplicates keeps
# only the first row for each distinct label code.
category_labels = strategy_mat.set_index('category')['label']
mapping = category_labels.drop_duplicates()

def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    jieba is called with ``cut_all=False``.
    """
    return " ".join(jieba.cut(text, cut_all=False))

def clean(text):
    """Replace punctuation in ``text`` with spaces.

    Every ASCII punctuation character (``string.punctuation``) and each of the
    extra typographic / full-width symbols listed in the pattern below is
    replaced by a single space. Runs of spaces are not collapsed, so the
    result may contain consecutive spaces.

    Parameters
    ----------
    text : str
        The (already segmented) sentence to clean.

    Returns
    -------
    str
        ``text`` with every listed punctuation character replaced by ' '.
    """
    # re.escape is required here: when string.punctuation is inserted raw, its
    # `\]` is parsed as an escaped bracket, so a literal backslash was never
    # matched and survived the cleaning.
    # NOTE(review): full-width marks such as ？ are not part of this set and pass
    # through unchanged - confirm whether that is intended.
    punctuation_class = f'[{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，]'
    return re.sub(punctuation_class, ' ', text)



    
# cut words
data['text'] = data['text'].apply(cut_words)
print('finish cutting words')

# cleaning and save
data['text'] = data['text'].apply(clean)

# Convert category names to numeric label codes. Series.map returns NaN for any
# category that is absent from `mapping`; such rows would later be dropped
# silently by the per-node label filter, so fail loudly instead.
category_names = data['label']
label_codes = category_names.map(mapping)
unmapped = category_names[label_codes.isna()].unique()
if len(unmapped) > 0:
    raise ValueError('categories without a label in strategy_mat: {}'.format(list(unmapped)))
data['label'] = label_codes

# shuffle data
# A fixed seed keeps the row order (and therefore the saved CSV and the
# cross-validation folds built from it) identical between runs.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print('finish shuffling')
data.to_csv(path + 'cleaned_mock_up_data.csv', index = False, encoding = 'utf8')





['故意岔开话题' '回问身份' '确认数额' '说出目的' '其它通讯方式' '请求重复' '请求等下打来' '讨价还价' '模糊确认'
 '还款方式']
故意岔开话题    1375
请求等下打来     667
请求重复       605
讨价还价       484
回问身份       336
确认数额       333
说出目的       256
还款方式       215
模糊确认       213
其它通讯方式     211
Name: label, dtype: int64
finish cutting words
finish shuffling


In [13]:
# Preview the first 10 rows of `data` (columns: text, label).
data.head(10)

Unnamed: 0,text,label
0,等 会 再 给 我 打电话,104
1,我 也 不 知道 钱 还 没到 账,100
2,你 谁 啊 你 管 我 到底 想 干嘛,101
3,啊 ？ 俺 听 不到,103
4,我 朋友 说 今天 钱 还 我,100
5,正在 凑钱,100
6,现在 没空,104
7,满满的 月光,109
8,你 给 我 发红包 我 就 告诉 你,109
9,在 干活,104


# modeling

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

from others_py import *

# Per-node wrapper callables (presumably provided by `from others_py import *`).
# Listed in the same order as model_list so that position i in both lists
# refers to the same dialogue node.
func_list = [IDClassifier_other, IfKnowDebtor_other, ConfirmLoan_other, WillingToPay_other, CutDebt_other, Installment_other]

In [15]:
# Train the three text classifiers for every dialogue node and pickle the
# node-specific wrapper object that is built from them.

# Wrappers are looked up by node name rather than by list position, so each
# node always receives its own wrapper whatever order func_list is in.
wrapper_by_model = {
    'IDClassifier': IDClassifier_other,
    'IfKnowDebtor': IfKnowDebtor_other,
    'ConfirmLoan': ConfirmLoan_other,
    'WillingToPay': WillingToPay_other,
    'CutDebt': CutDebt_other,
    'Installment': Installment_other,
}

# '{}' is filled with the node name; the directory must already exist.
save_path = '../../savedModel/others/{}/'

for each_model in model_list:
    print(each_model)

    # Labels whose strategy_mat entry for this node is 0, de-duplicated and sorted.
    node_mask = strategy_mat[each_model] == 0
    possible_label = sorted(set(strategy_mat.loc[node_mask, 'label'].values))
    print(possible_label)

    # Keep only the samples carrying one of those labels.
    train_data = data[data['label'].isin(possible_label)]

    # get tfidf: word-level 1- to 3-grams over the space-separated tokens
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                        strip_accents='unicode',
                                        max_features=100000,
                                        analyzer='word',
                                        sublinear_tf=True,
                                        token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(train_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(train_data.text)

    # linear svc, wrapped in CalibratedClassifierCV
    lsvc = CalibratedClassifierCV(LinearSVC())
    lsvc.fit(phrase, train_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, train_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, train_data.label)

    print('finish training')

    # save model
    # The vectorizer and estimators are not pickled individually; they are
    # handed to the wrapper, and the wrapper object is what gets saved.
    # NOTE(review): jieba_path differs from the '../../code/WordCut/userdict.txt'
    # loaded at the top of this notebook - presumably it is resolved relative to
    # the serving code's working directory; confirm.
    func = wrapper_by_model[each_model]
    result = func(svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt',possible_label=possible_label)

    # `with` guarantees the pickle is flushed and the handle closed.
    with open(save_path.format(each_model) + each_model + '_other.pickle', "wb") as out_file:
        pickle.dump(result, out_file)
    
   

IDClassifier
[101, 103, 104, 107, 109]
fitting phrase
transform phrase
finish training
IfKnowDebtor
[101, 103, 104, 107, 109]
fitting phrase
transform phrase
finish training
ConfirmLoan
[100, 102, 103, 104, 106, 107, 108, 109]
fitting phrase
transform phrase
finish training
WillingToPay
[103, 104, 105, 106, 107, 108, 109]
fitting phrase
transform phrase
finish training
CutDebt
[100, 102, 103, 104, 106, 107, 108, 109]
fitting phrase
transform phrase
finish training
Installment
[100, 102, 103, 104, 106, 107, 108, 109]
fitting phrase
transform phrase
finish training


In [16]:
# coder = {'讨价还价':0, '说出目的':1, '确认数额':2, '请求重复':3, '请求等下打来':4, '其它通讯方式':5, '模糊确认':6, '回问身份':7, '还款方式':8, '故意岔开话题':9, '不愿配合':10}

def _load_other_model(model_name):
    """Load the pickled ``<model_name>_other`` wrapper from the saved-model folder.

    Parameters
    ----------
    model_name : str
        Node name, e.g. 'IDClassifier'; used for both the sub-directory and
        the file-name prefix.

    Returns
    -------
    object
        Whatever object was pickled to that file.

    Notes
    -----
    pickle can execute arbitrary code while loading; only use this on files
    produced by this project.
    """
    pickle_path = "../../savedModel/others/{0}/{0}_other.pickle".format(model_name)
    # `with` closes the file handle as soon as the object has been read.
    with open(pickle_path, 'rb') as in_file:
        return pickle.load(in_file)


idc = _load_other_model('IDClassifier')
cutd = _load_other_model('CutDebt')
ifk = _load_other_model('IfKnowDebtor')
will = _load_other_model('WillingToPay')
inst = _load_other_model('Installment')
conf = _load_other_model('ConfirmLoan')

In [18]:
# Smoke-test the reloaded IDClassifier wrapper on one sample utterance.
idc.classify('风太大了，听不清')

{'label': 103,
 'pred_prob': array([[0.0123779 , 0.93609719, 0.00507587, 0.00915109, 0.03729795],
        [0.03287193, 0.6248927 , 0.04713778, 0.02476825, 0.27032933],
        [0.01508828, 0.71018262, 0.03791627, 0.01590351, 0.22090932]]),
 'av_pred': array([0.0201127 , 0.75705751, 0.03004331, 0.01660762, 0.17617887])}

In [None]:
# save pickle for possible labels

In [54]:
# a = pickle.load(open('../../data/others/label_token.pkl','rb'))