In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc

import pickle


# Directory holding the raw CSV inputs; it is also reused later as the output
# directory for the cleaned CSVs.
path = '../../data/others/'

# Three inputs: the main training set, the "109" training set, and the
# strategy matrix that is later used to look up numeric labels and per-classifier flags.
data = pd.read_csv(path + 'irrelevant_response_training_set.csv', encoding='utf8')
data_109 = pd.read_csv(path + 'irrelevant_response_training_set109.csv', encoding='utf8')
strategy_mat = pd.read_csv(path + 'strategy_mat.csv', encoding='utf8')

# Quick sanity check: one (rows, columns) tuple per line, in load order.
print(data.shape, data_109.shape, strategy_mat.shape, sep='\n')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.788 seconds.
Prefix dict has been built succesfully.


(3950, 2)
(1408, 8)
(16, 9)


In [2]:
# Display the strategy matrix: each row pairs a category with its numeric label,
# one flag column per classifier, and a free-text strategy description.
strategy_mat

Unnamed: 0,category,label,IDClassifier,IfKnowDebtor,ConfirmLoan,WillingToPay,CutDebt,Installment,strategy
0,讨价还价,100,1.0,1.0,1.0,1.0,0.0,0.0,对应节点施压更高level2及以上
1,说出目的,101,0.0,0.0,1.0,1.0,1.0,1.0,跳到ConfirmLoan的问题，选一条新的话术
2,确认数额,102,1.0,1.0,1.0,0.0,1.0,1.0,告知欠款明细并施压
3,确认数额,102,1.0,1.0,1.0,1.0,0.0,1.0,1.核资内容（你逾期XX、欠款多少、利息多少，滞纳金多少等等） 2.减免之后需要多少钱
4,确认数额,102,1.0,1.0,1.0,1.0,1.0,0.0,1.核资内容（你逾期XX、欠款多少、利息多少，滞纳金多少等等）
5,请求重复,103,0.0,0.0,0.0,0.0,0.0,0.0,重复刚才说的催收话术，换一条新的话术
6,请求等下打来,104,0.0,0.0,0.0,0.0,0.0,0.0,约定下次时间：过一会打给你
7,其它通讯方式,105,1.0,1.0,1.0,0.0,1.0,1.0,（记录通讯方式，例如微信号等，告知马上会有一个主管联系他）
8,模糊确认,106,1.0,1.0,1.0,0.0,0.0,0.0,对应节点施压，level1
9,回问身份,107,1.0,1.0,0.0,0.0,0.0,0.0,施压


# data cleaning

In [3]:
# Translate the Chinese column headers of both training sets to the names the
# rest of the notebook uses. `index=str` also converts the row labels to strings.
zh_to_en = {'文本': 'text', '类别': 'label'}
data = data.rename(index=str, columns=zh_to_en)
data_109 = data_109.rename(index=str, columns=zh_to_en)

# Inspect which categories occur in the main training set and how often.
print(data['label'].unique())
print(data['label'].value_counts())

# Series indexed by category name whose values are the numeric labels;
# drop_duplicates() keeps only the first row for each distinct label value.
mapping = strategy_mat.set_index('category')['label'].drop_duplicates()

def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    Segmentation is done with ``cut_all=False`` (i.e. not jieba's full mode).
    """
    return " ".join(jieba.cut(text, cut_all=False))

# Characters replaced by a space: all of ASCII `string.punctuation` plus a few
# extra typographic symbols, the space itself and the full-width comma.
# `re.escape` is required here: interpolated raw, the `\]` pair inside
# string.punctuation is parsed as an escaped `]`, so a literal backslash
# would never be matched.
_CLEAN_PATTERN = re.compile(
    '[' + re.escape(string.punctuation) + '“”¨«»®´·º ½¾¿¡§£₤‘’，' + ']'
)

def clean(text):
    """Replace every punctuation character in ``text`` with a single space.

    Each matched character is substituted one-for-one, so runs of punctuation
    or spaces are NOT collapsed (the previous ``split(' ')`` / ``' '.join``
    pair was an identity operation and has been removed).

    Parameters
    ----------
    text : str
        The (already segmented) utterance.

    Returns
    -------
    str
        ``text`` with punctuation replaced by spaces; same length as the input.
    """
    return _CLEAN_PATTERN.sub(' ', text)



    
# cut words
# cut_words returns the jieba tokens of each utterance joined by spaces.
data['text'] = data['text'].apply(cut_words)
data_109['text'] = data_109['text'].apply(cut_words)
print('finish cutting words')

# cleaning and save
# Replace punctuation with spaces, then translate category names into numeric
# labels through `mapping`; a category missing from `mapping` becomes NaN.
data['text'] = data['text'].apply(clean)
data['label'] = data['label'].map(mapping)
data_109['text'] = data_109['text'].apply(clean)
data_109['label'] = data_109['label'].map(mapping)

# shuffle data
# A fixed seed makes the saved CSVs (and everything trained from them)
# reproducible across runs; 19 matches the seed used for the per-model shuffle
# in the training loop.
data = data.sample(frac=1, random_state=19).reset_index(drop=True)
data_109 = data_109.sample(frac=1, random_state=19).reset_index(drop=True)
print('finish shuffling')

# Persist the cleaned, shuffled frames next to the raw inputs.
data.to_csv(path + 'cleaned_mock_up_data_non109.csv', index = False, encoding = 'utf8')
data_109.to_csv(path + 'cleaned_mock_up_data_109.csv', index = False, encoding = 'utf8')





['其它通讯方式' '回问身份' '请求等下打来' '说出目的' '请求重复' '模糊确认' '还款方式' '确认数额' '讨价还价'
 '请求更新金额']
请求等下打来    928
请求重复      612
模糊确认      583
确认数额      357
讨价还价      315
说出目的      314
回问身份      301
还款方式      271
其它通讯方式    229
请求更新金额     40
Name: label, dtype: int64
finish cutting words
finish shuffling


# modeling

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

from others_py import *


In [5]:
# Classifier name -> wrapper class used to bundle the trained models for that
# classifier. The *_other names are presumably provided by
# `from others_py import *`. Insertion order determines the training order.
model_list = dict(
    CutDebt=CutDebt_other,
    IDClassifier=IDClassifier_other,
    IfKnowDebtor=IfKnowDebtor_other,
    Installment=Installment_other,
    ConfirmLoan=ConfirmLoan_other,
    WillingToPay=WillingToPay_other,
)

In [6]:
def get_other_data(df_non109,df_109,strategy_mat,classifier):
    """Assemble the training rows for one classifier's "other" model.

    Parameters
    ----------
    df_non109 : DataFrame with a ``label`` column.
    df_109 : DataFrame with a column named after ``classifier``.
    strategy_mat : DataFrame with a ``label`` column and a column named after
        ``classifier``.
    classifier : str
        Name of the flag column to filter on.

    Returns
    -------
    DataFrame
        Rows of ``df_non109`` whose label has a 0 flag for ``classifier`` in
        ``strategy_mat``, followed by rows of ``df_109`` whose own
        ``classifier`` column is 0. The index is renumbered and the columns
        are sorted by name.
    """
    # Labels whose strategy row carries a 0 in this classifier's column.
    zero_flag_labels = strategy_mat.loc[strategy_mat[classifier] == 0, 'label']
    possible_label = sorted(set(zero_flag_labels.values))

    keep_non109 = df_non109['label'].map(lambda lbl: lbl in possible_label)
    keep_109 = df_109[classifier] == 0

    return pd.concat(
        [df_non109[keep_non109], df_109[keep_109]],
        ignore_index=True,
        sort=True,
    )
    


In [7]:

# Directory template for the pickled bundles; it does not change between
# iterations, so it is defined once outside the loop.
save_path = '../../savedModel/others/{}/'

for each_model in model_list:
    print(each_model)
    # Select the rows relevant to this classifier and shuffle them with a fixed seed.
    train_data = get_other_data(data,data_109,strategy_mat,each_model)
    train_data = train_data.sample(frac=1,random_state=19)
    print(train_data.label.value_counts())

    # get tfidf
    # Word-level 1- to 3-grams; the token pattern accepts one or more word
    # characters, so single-character tokens are kept.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(train_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(train_data.text)

    # linear svc
    # Wrapped in CalibratedClassifierCV so the SVM can output probabilities.
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, train_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, train_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, train_data.label)

    print('finish training')

    # save model
    print(each_model)
    # Bundle the three fitted models with the vectorizer and the label set the
    # calibrated SVM was trained on.
    result = model_list[each_model](svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt',possible_label=lsvc.classes_)
    # Use a context manager so the file is flushed and closed deterministically
    # (the bare open(...) previously passed to pickle.dump was never closed).
    with open(save_path.format(each_model) + each_model + '_other.pickle', "wb") as model_file:
        pickle.dump(result, model_file)

    
   

CutDebt
109    1291
104     928
103     612
106     583
102     357
100     315
107     301
108     271
110      40
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
CutDebt
IDClassifier
109    1331
104     928
103     612
101     314
107     301
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
IDClassifier
IfKnowDebtor
109    1313
104     928
103     612
101     314
107     301
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
IfKnowDebtor
Installment
109    1296
104     928
103     612
106     583
102     357
100     315
107     301
108     271
110      40
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
Installment
ConfirmLoan
109    1233
104     928
103     612
107     301
108     271
Name: label, dtype: int64
fitting phrase
transform phrase
finish training
ConfirmLoan
WillingToPay
109    1295
104     928
103     612
106     583
102     357
107     301
108     271
105     229
Name: lab

In [9]:
# coder = {'讨价还价':0, '说出目的':1, '确认数额':2, '请求重复':3, '请求等下打来':4, '其它通讯方式':5, '模糊确认':6, '回问身份':7, '还款方式':8, '故意岔开话题':9, '不愿配合':10}

def _load_other_model(name):
    """Unpickle the saved bundle for classifier ``name``.

    Reads ``../../savedModel/others/<name>/<name>_other.pickle`` and returns
    the unpickled object.
    """
    model_file = "../../savedModel/others/{0}/{0}_other.pickle".format(name)
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load artifacts produced by this project.
    # The context manager closes the handle, which the bare open(...) calls did not.
    with open(model_file, 'rb') as handle:
        return pickle.load(handle)


idc = _load_other_model('IDClassifier')
cutd = _load_other_model('CutDebt')
ifk = _load_other_model('IfKnowDebtor')
will = _load_other_model('WillingToPay')
inst = _load_other_model('Installment')
conf = _load_other_model('ConfirmLoan')

In [10]:
# Sanity-check the reloaded IDClassifier bundle on one raw (unsegmented) utterance.
idc.classify('风太大了，听不清')

{'label': 103,
 'pred_prob': array([[0.00972847, 0.89080727, 0.00790702, 0.00593046, 0.08562679],
        [0.02842283, 0.69437368, 0.0387822 , 0.01643383, 0.22198746],
        [0.01234623, 0.77669322, 0.03280712, 0.00960237, 0.16855106]]),
 'av_pred': array([0.01683251, 0.78729139, 0.02649878, 0.01065555, 0.15872177])}

In [None]:
# save pickle for possible labels