In [4]:
#encoding=utf-8
import pandas as pd
import numpy as np
import re
import string

import jieba
jieba.load_userdict("../../code/WordCut/userdict.txt")

import gc

import pickle

# get data
# Directory holding the raw CSV inputs (a previously used absolute location is kept for reference).
path = '../../data/others/'
# path = '/home/kai/data/jiangning/Chatbot_1/Chatbot1.0/data/'

# One entry per dialogue-stage model; the training loop later iterates over this list.
model_list = [
    'IDClassifier',
    'IfKnowDebtor',
    'ConfirmLoan',
    'WillingToPay',
    'CutDebt',
    'Installment',
]

# Load the labelled utterances and the label/strategy matrix.
training_csv = f'{path}irrelevant_response_training_set.csv'
strategy_csv = f'{path}irrelevant_response_strategy_matrix.csv'
data = pd.read_csv(training_csv, encoding='utf8')
strategy_mat = pd.read_csv(strategy_csv, encoding='utf8')

# Quick sanity check of both frames' dimensions.
for frame in (data, strategy_mat):
    print(frame.shape)

(2054, 2)
(13, 8)


# data cleaning

In [5]:
# Map the original column headers to the English names used by the rest of the notebook.
data_columns = {'文本': 'text', '类别': 'label'}
strategy_columns = {
    '类别': 'label',
    '对策': 'strategy',
    'Idclassfier': 'IDClassifier',
    'ifknowdebtor': 'IfKnowDebtor',
    'willingtopay': 'WillingToPay',
    'cutdebt': 'CutDebt',
    'installment': 'Installment',
}

# index=str also converts the row index labels to strings.
data = data.rename(index=str, columns=data_columns)
strategy_mat = strategy_mat.rename(index=str, columns=strategy_columns)

# Show which categories exist and how many samples each one has.
print(data['label'].unique())
print(data['label'].value_counts())

# Category name -> integer class id. Built once instead of on every call.
LABEL_CODER = {'讨价还价':0, '说出目的':1, '确认数额':2, '请求重复':3, '请求等下打来':4, '其它通讯方式':5, '模糊确认':6, '回问身份':7, '还款方式':8, '故意岔开话题':9, '不愿配合':10}
# Where the mapping is persisted.
LABEL_CODER_PATH = '../../data/others/label_token.pkl'


def clean_label(label):
    """Translate a category name into its integer class id.

    The first call after this function is (re)defined also pickles the full
    name -> id mapping to ``LABEL_CODER_PATH``. Later calls skip the write:
    the function is applied row by row, and rewriting an identical file for
    every row (through a handle that was never closed) was pure overhead.

    Args:
        label: One of the category names that are keys of ``LABEL_CODER``.

    Returns:
        The integer id assigned to ``label``.

    Raises:
        KeyError: If ``label`` is not a known category name.
        OSError: If the mapping file cannot be written on the first call.
    """
    if not getattr(clean_label, '_coder_saved', False):
        # The context manager guarantees the file is flushed and closed.
        with open(LABEL_CODER_PATH, 'wb') as coder_file:
            pickle.dump(LABEL_CODER, coder_file)
        clean_label._coder_saved = True
    return LABEL_CODER[label]

def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    ``cut_all=False`` is passed to ``jieba.cut`` explicitly.
    """
    return " ".join(jieba.cut(text, cut_all=False))

# Characters that get blanked out: ASCII punctuation plus a set of typographic
# symbols and the full-width comma. re.escape keeps every punctuation character
# literal inside the character class; without it the backslash in
# string.punctuation merely escaped the following ']' and was itself never matched.
_PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + '“”¨«»®´·º ½¾¿¡§£₤‘’，' + ']')


def clean(text):
    """Replace every punctuation character in ``text`` with a single space.

    Each matched character is substituted one-for-one, so runs of punctuation
    become runs of spaces; whitespace is not collapsed or trimmed. (The former
    split-on-space / join-with-space round-trip returned its input unchanged
    and has been removed.)

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each punctuation character replaced by ``' '``.
    """
    return _PUNCTUATION_RE.sub(' ', text)



    
# cut words
# Segment every utterance so that tokens are separated by spaces.
data['text'] = data['text'].apply(cut_words)
print('finish cutting words')

# cleaning and save
# Blank out punctuation in the segmented text and map category names to integer ids.
# NOTE: this overwrites data['label'] in place, so re-running it on already encoded
# labels raises KeyError; reload the data before running it again.
data['text'] = data['text'].apply(clean)
data['label'] = data['label'].apply(clean_label)

# shuffle data
# A fixed seed keeps the row order (and therefore the saved CSV) identical across runs.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print('finish shuffling')
data.to_csv(path + 'cleaned_mock_up_data.csv', index = False, encoding = 'utf8')

# Encode the strategy matrix labels with the same mapping as data['label'].
strategy_mat['label'] = strategy_mat['label'].apply(clean_label)



['讨价还价' '说出目的' '确认数额' '请求重复' '请求等下打来' '其它通讯方式' '模糊确认' '回问身份' '还款方式'
 '故意岔开话题' '不愿配合']
请求等下打来    612
讨价还价      489
请求重复      390
故意岔开话题    231
不愿配合      132
回问身份       69
说出目的       54
模糊确认       40
确认数额       24
还款方式       10
其它通讯方式      3
Name: label, dtype: int64
finish cutting words
finish shuffling


In [6]:
# Preview the first ten rows of the segmented, cleaned and shuffled frame.
data.head(10)

Unnamed: 0,text,label
0,我 在 开车 稍后 联系,4
1,答案 跟 问题 三 一样 的,9
2,是不是 管 你 什么 事,10
3,你 说 啥子,3
4,我 和 家人 商量 下,0
5,什么 是 跟 我 说,1
6,头太大 听不清,3
7,别 打 了 烦死 了,10
8,在 开车 等 会 再 打 过来,4
9,你 那边 信号 不好 啊,3


In [7]:
# Display the strategy matrix after its labels were mapped to integer ids.
strategy_mat

Unnamed: 0,label,IDClassifier,IfKnowDebtor,ConfirmLoan,WillingToPay,CutDebt,Installment,strategy
0,0,1.0,1.0,0.0,1.0,0.0,0.0,cutdebt
1,0,1.0,1.0,1.0,1.0,0.0,0.0,施压
2,1,0.0,0.0,1.0,1.0,1.0,1.0,ConfirmLoan
3,2,1.0,1.0,0.0,1.0,1.0,1.0,willingtopay
4,3,0.0,0.0,0.0,0.0,0.0,0.0,重复
5,4,0.0,0.0,0.0,0.0,0.0,0.0,另选时间打来
6,5,1.0,1.0,1.0,0.0,1.0,1.0,重复
7,6,1.0,1.0,0.0,0.0,0.0,0.0,施压
8,7,1.0,1.0,0.0,0.0,0.0,0.0,施压
9,7,0.0,0.0,1.0,1.0,1.0,1.0,重复


# modeling

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

# from others_py import *

# func_list = [CutDebt_other,IDClassifier_other,IfKnowDebtor_other,Installment_other,ConfirmLoan_other,WillingToPay_other]

In [None]:
# Train one TF-IDF vectorizer and three classifiers (calibrated LinearSVC,
# logistic regression, multinomial naive Bayes) per entry of model_list.
ind = 0  # only consumed by the commented-out export code at the end of the loop body

# Share of each model's samples that is held out, and the seed that makes the split repeatable.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 42

for each_model in model_list:
    print(each_model)
    # Labels whose entry in this model's strategy-matrix column equals 0.
    possible_label = list(set(strategy_mat[strategy_mat[each_model]==0]['label'].values))
    print(possible_label)
    model_data = data[data['label'].isin(possible_label)]

    # Random hold-out split. The two assignments below were previously left without a
    # right-hand side, which made the whole cell a SyntaxError.
    # Dropping by index assumes the index of `data` is unique.
    # TODO(review): the split is not stratified, so a rare label can end up entirely in
    # one half; confirm the intended ratio and whether stratification is wanted.
    # test_data is not used further in this cell yet.
    test_data = model_data.sample(frac=HOLDOUT_FRACTION, random_state=SPLIT_SEED)
    train_data = model_data.drop(test_data.index)

    # get tfidf
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

    print('fitting phrase')
    phrase_vectorizer.fit(train_data.text)

    print('transform phrase')
    phrase = phrase_vectorizer.transform(train_data.text)


    # linear svc
    # Wrapped in CalibratedClassifierCV so the fitted model exposes predict_proba.
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(phrase, train_data.label)


    # logistic
    log_r = LogisticRegression()
    log_r.fit(phrase, train_data.label)


    # Naive Bayes
    naive_b = MultinomialNB()
    naive_b.fit(phrase, train_data.label)

    print('finish training')


    # save model
#     save_path = '../../savedModel/others/{}/'
#     # save tfidf
#     pickle.dump(phrase_vectorizer, open(save_path.format(each_model) + "tfidf.pickle", "wb"))
#     # save linear svc
#     pickle.dump(lsvc, open(save_path.format(each_model) + "LinearSVC.pickle", "wb"))
#     # save logistic
#     pickle.dump(log_r, open(save_path.format(each_model) + "Logistic.pickle", "wb"))
#     # save naive bayes
#     pickle.dump(naive_b, open(save_path.format(each_model) + "nb.pickle", "wb"))


#     func = func_list[ind]
#     result = func(svc=lsvc, logistic=log_r, nb=naive_b, tfidf=phrase_vectorizer, jieba_path='../WordCut/userdict.txt')
#     pickle.dump(result, open(save_path.format(each_model) + each_model + '_other.pickle', "wb"))
#     ind = ind + 1
    
   