In [10]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
# Load the project's custom dictionary so jieba segments text with those extra entries.
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc

import pickle

# get data
# Model names; in the visible cells they are only referenced from a commented-out loop.
model_list = [
    'IDClassifier',
    'IfKnowDebtor',
    'ConfirmLoan',
    'WillingToPay',
    'CutDebt',
    'Installment',
]

# Directory (relative to the notebook's working directory) holding both input CSVs.
path = '../../data/others/'

# Labelled utterances and the label/model strategy matrix.
data = pd.read_csv(path + 'irrelevant_response_training_set.csv', encoding='utf8')
strategy_mat = pd.read_csv(path + 'irrelevant_response_strategy_matrix.csv', encoding='utf8')

# Quick sanity check: one shape per line, training set first.
print(data.shape, strategy_mat.shape, sep='\n')

(2054, 2)
(13, 8)


# data cleaning

In [11]:
# Replace the Chinese column headers with English names; index=str also turns the
# row labels into strings.
data = data.rename(index=str, columns={'文本': 'text', '类别': 'label'})

# Strategy-matrix headers are renamed to the spellings used in `model_list`.
strategy_mat = strategy_mat.rename(
    index=str,
    columns={
        '类别': 'label',
        '对策': 'strategy',
        'Idclassfier': 'IDClassifier',
        'ifknowdebtor': 'IfKnowDebtor',
        'willingtopay': 'WillingToPay',
        'cutdebt': 'CutDebt',
        'installment': 'Installment',
    },
)

# Distinct categories first, then how many rows each one has.
print(data['label'].unique())
print(data['label'].value_counts())

# Category name -> integer class id. Kept at module level so it is built once
# instead of on every call.
_LABEL_CODER = {'讨价还价':0, '说出目的':1, '确认数额':2, '请求重复':3, '请求等下打来':4, '其它通讯方式':5, '模糊确认':6, '回问身份':7, '还款方式':8, '故意岔开话题':9, '不愿配合':10}

# Where the mapping is persisted.
_LABEL_TOKEN_PATH = '../../data/others/label_token.pkl'


def clean_label(label):
    """Translate a Chinese category name into its integer class id.

    Side effect: the first call after this function is (re)defined pickles the
    name -> id mapping to ``../../data/others/label_token.pkl``. Later calls
    skip the write, so using this function with ``Series.apply`` no longer
    rewrites the same file once per row.

    Parameters
    ----------
    label : str
        One of the category names in the mapping.

    Returns
    -------
    int
        The class id for ``label``.

    Raises
    ------
    KeyError
        If ``label`` is not a known category name (this includes labels that
        have already been converted to integers).
    OSError
        If the pickle file cannot be written.
    """
    if not getattr(clean_label, '_token_saved', False):
        # Context manager guarantees the handle is flushed and closed.
        with open(_LABEL_TOKEN_PATH, 'wb') as token_file:
            pickle.dump(_LABEL_CODER, token_file)
        clean_label._token_saved = True
    return _LABEL_CODER[label]

def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    ``cut_all=False`` is passed explicitly to ``jieba.cut``.
    """
    return " ".join(jieba.cut(text, cut_all=False))

# Compiled once. re.escape() is needed because string.punctuation contains regex
# metacharacters: left unescaped inside [...], its "\]" pair is parsed as an escaped
# "]" (so a literal backslash was never matched) and its bare "[" triggers a
# "possible nested set" FutureWarning.
_PUNCT_RE = re.compile(f'[{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，]')


def clean(text):
    """Replace every punctuation character in ``text`` with a single space.

    The stripped set is ASCII ``string.punctuation`` plus the extra symbols
    listed in ``_PUNCT_RE``; of the full-width marks only ``，`` is included.
    Each match is replaced one-for-one, so the string length is unchanged and
    runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Text to clean (in this notebook, already space-separated tokens).

    Returns
    -------
    str
        ``text`` with each matched character replaced by ``' '``.
    """
    # The former split(' ') / ' '.join(...) round trip returned the same string,
    # so it has been dropped.
    return _PUNCT_RE.sub(' ', text)



    
# cut words
# NOTE(review): this cell is not re-runnable on the same kernel state: after one run the
# labels are already integers, so clean_label raises KeyError on the second pass.
data['text'] = data['text'].map(cut_words)
print('finish cutting words')

# cleaning and save
# Punctuation stripping runs on the segmented text; labels become integer ids.
data['text'] = data['text'].map(clean)
data['label'] = data['label'].map(clean_label)

# shuffle data
# frac=1 draws every row in random order; the old index is discarded.
data = data.sample(frac=1)
data = data.reset_index(drop=True)
print('finish shuffling')
data.to_csv(path + 'cleaned_mock_up_data.csv', index=False, encoding='utf8')

# The strategy matrix uses the same category names, so encode them the same way.
strategy_mat['label'] = strategy_mat['label'].map(clean_label)



['讨价还价' '说出目的' '确认数额' '请求重复' '请求等下打来' '其它通讯方式' '模糊确认' '回问身份' '还款方式'
 '故意岔开话题' '不愿配合']
请求等下打来    612
讨价还价      489
请求重复      390
故意岔开话题    231
不愿配合      132
回问身份       69
说出目的       54
模糊确认       40
确认数额       24
还款方式       10
其它通讯方式      3
Name: label, dtype: int64
finish cutting words
finish shuffling


In [12]:
# Preview the first ten rows of the segmented, cleaned, label-encoded and shuffled data.
data.head(10)

Unnamed: 0,text,label
0,等 我 稳定 工作 处理,0
1,下班 联系 你,4
2,我 在 学习,4
3,我 今天 没借 到 钱,0
4,我过 60 分钟 给 你 回 电话,4
5,我 在 汽车站 都 是 喇叭,4
6,我 在 开会 等 下 再说,4
7,我 是 最帅 的,9
8,我要 和 家人 商量一下 ！,0
9,钱太多 了,0


In [13]:
# Display the strategy matrix, whose 'label' column now holds the integer class ids.
strategy_mat

Unnamed: 0,label,IDClassifier,IfKnowDebtor,ConfirmLoan,WillingToPay,CutDebt,Installment,strategy
0,0,1.0,1.0,0.0,1.0,0.0,0.0,cutdebt
1,0,1.0,1.0,1.0,1.0,0.0,0.0,施压
2,1,0.0,0.0,1.0,1.0,1.0,1.0,ConfirmLoan
3,2,1.0,1.0,0.0,1.0,1.0,1.0,willingtopay
4,3,0.0,0.0,0.0,0.0,0.0,0.0,重复
5,4,0.0,0.0,0.0,0.0,0.0,0.0,另选时间打来
6,5,1.0,1.0,1.0,0.0,1.0,1.0,重复
7,6,1.0,1.0,0.0,0.0,0.0,0.0,施压
8,7,1.0,1.0,0.0,0.0,0.0,0.0,施压
9,7,0.0,0.0,1.0,1.0,1.0,1.0,重复


# modeling

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [107]:
ind = 0  # NOTE(review): not read anywhere in this cell; kept in case a later cell uses it

# Seed for the train/test split so the partition is repeatable for a given row order of `data`.
SPLIT_SEED = 42

# Only ConfirmLoan is evaluated here; loop over `model_list` instead to cover every model.
for each_model in ['ConfirmLoan']:
    print(each_model)

    # Candidate labels: those whose strategy-matrix entry for this model is 0.
    # Sorted so the confusion-matrix axes have a deterministic order.
    possible_label = sorted(set(strategy_mat[strategy_mat[each_model]==0]['label'].tolist()))
    print(possible_label)
    model_data = data[data['label'].isin(possible_label)]

    [train_data,test_data] = train_test_split(model_data, test_size = 0.1, shuffle=True,
                                              random_state=SPLIT_SEED)

    # get tfidf
    # Word 1- to 3-grams over the space-separated jieba tokens.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode',
                                    max_features=100000,
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(train_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(train_data.text)

    # linear svc
    # LinearSVC has no predict_proba, hence the calibration wrapper.
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(phrase, train_data.label)

    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, train_data.label)

    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, train_data.label)

    print('finish training')

    # start evaluation
    text = phrase_vectorizer.transform(test_data['text'])
    M1 = lsvc.predict_proba(text)
    M2 = log_r.predict_proba(text)
    M3 = naive_b.predict_proba(text)

    # Averaging is only meaningful if all three models order their probability
    # columns identically.
    classes = lsvc.classes_
    assert np.array_equal(classes, log_r.classes_), 'class order differs between models'
    assert np.array_equal(classes, naive_b.classes_), 'class order differs between models'
    prob_mat = (M1 + M2 + M3)/3

    # predict_proba columns follow `classes_` (the labels seen in training), not
    # `possible_label`, so decode argmax through `classes_`. Indexing `possible_label`
    # directly mislabels predictions whenever a candidate label is missing from the
    # training split or the two orders differ.
    estimated = list(classes[np.argmax(prob_mat, axis=1)])
    real = test_data['label'].values

    # Confusion matrix over the candidate labels: rows = true label, columns = predicted.
    n = len(possible_label)
    result = np.zeros([n,n])
    label_position = {label: position for position, label in enumerate(possible_label)}

    for actual, predicted in zip(real, estimated):
        result[label_position[actual], label_position[predicted]] += 1

    print(result)

ConfirmLoan
[0, 2, 3, 4, 6, 7, 8, 9]
fitting phrase
transform phrase
finish training
[[34.  0.  2.  4.  0.  0.  0.  0.]
 [ 1.  0.  0.  1.  0.  0.  0.  0.]
 [ 1.  0. 42.  4.  0.  0.  0.  2.]
 [ 3.  0.  4. 52.  0.  0.  0.  0.]
 [ 2.  0.  0.  0.  3.  0.  0.  0.]
 [ 0.  0.  1.  2.  0.  5.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 3.  0.  1.  2.  0.  1.  0. 16.]]
