In [2]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

import pickle
import sys,os
sys.path.append('../../MLModel/code/OneClickTraining/')
from all_model_py import CutDebt, IDClassifier, IfKnowDebtor, Installment, WillingToPay, ConfirmLoan
import re
import jieba
jieba.load_userdict("../../MLModel/code/WordCut/userdict.txt")
import string

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.734 seconds.
Prefix dict has been built succesfully.


In [3]:
# Make the project-local TimePattern helper importable from its source folder.
sys.path.append('../../MLModel/code/TimePattern/')
from  time_pattern import TimePattern
# Shared TimePattern instance built from mapping.csv; cut_words() calls
# t.remove_time() on every text it segments.
t = TimePattern('../../MLModel/code/TimePattern/mapping.csv')
# 10-fold splitter with shuffling disabled.
# NOTE(review): the per-model loop further down builds its own KFold with the
# same arguments, so this module-level instance looks unused — confirm before removing.
kf = KFold(n_splits=10, shuffle=False, random_state=None)


def sub_df(df,sets,target='label'):
    """Return the rows of ``df`` whose ``target`` column equals a value in ``sets``.

    Rows are grouped in the iteration order of ``sets`` (all rows matching the
    first value, then all rows matching the second, ...). Inside each group the
    original row order and index labels of ``df`` are kept. A value that occurs
    more than once in ``sets`` contributes its rows more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from.
    sets : iterable
        Values to look for in ``df[target]``.
    target : str, default 'label'
        Name of the column that is compared against each value.

    Returns
    -------
    pandas.DataFrame
        The concatenated matches, or an empty ``DataFrame()`` when ``sets``
        is empty.
    """
    # Collect every slice first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies all accumulated rows on each pass.
    matches = [df[df[target] == each] for each in sets]
    if not matches:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(matches)


def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    Every literal space character is removed first. The text is then passed
    through ``t.remove_time`` (per the original author's note, this strips time
    patterns from the sentence) and finally cut by jieba with full mode
    disabled (``cut_all=False``).
    """
    compact = re.sub(r' ', '', text)
    without_time = t.remove_time(compact)
    return " ".join(jieba.cut(without_time, cut_all=False))

# Characters that clean() replaces with a space. string.punctuation is passed
# through re.escape so each of its characters is literal inside the class:
# unescaped, its backslash is consumed as an escape for the following ']'
# and a literal backslash in the text is never matched.
_CLEAN_PATTERN = re.compile(
    '[' + re.escape(string.punctuation) + '“”¨«»®´·º ½¾¿¡§£₤‘’，]')


def clean(text):
    """Replace every punctuation character in ``text`` with a single space.

    The replaced set is ``string.punctuation`` (including the backslash) plus
    the extra typographic and full-width symbols listed in ``_CLEAN_PATTERN``.
    Replacement is one-for-one, so the length of the string is unchanged and
    runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each matched character replaced by ``' '``.
    """
    # The former split(' ') / ' '.join(...) round trip returned the string
    # unchanged, so it has been dropped.
    return _CLEAN_PATTERN.sub(' ', text)

def clean_label(label):
    """Coerce a raw label value to a built-in ``int`` via ``int(label)``."""
    as_int = int(label)
    return as_int










# Load the "irrelevant response" texts and the strategy matrix that carries
# the `category` and `label` columns used below.
others = pd.read_csv('../../MLModel/data/others/irrelevant_response_training_set.csv')
other_matrix = pd.read_csv('../../MLModel/data/others/strategy_mat.csv')

# Rename the Chinese column headers to the names used in the rest of the notebook.
others = others.rename(columns={'文本':'original_text','类别':'from'})

# category -> label lookup. De-duplicate on the category key (keeping the first
# row) so the lookup index is unique. Calling drop_duplicates() on the label
# Series would instead drop every later category that shares a label with an
# earlier one, leaving those categories mapped to NaN.
mapping = other_matrix.drop_duplicates(subset='category').set_index('category')['label']
others['original_label'] = others['from'].map(mapping)

# Segment each text with jieba, then replace punctuation with spaces.
others['split_text'] = others['original_text'].apply(cut_words).apply(clean)




# Models handled by this notebook, keyed by name. The loops below iterate over
# the keys only: each key is used as the data sub-directory name and as the
# column name looked up in `other_matrix`.
model_list = dict(
    CutDebt=CutDebt,
    IDClassifier=IDClassifier,
    IfKnowDebtor=IfKnowDebtor,
    Installment=Installment,
    ConfirmLoan=ConfirmLoan,
    WillingToPay=WillingToPay,
)

# Name of the label column that is overwritten on the rows taken from `others`.
target = 'label'




# Tag every model's mock-up data with the model name (column `from`) and save
# it next to the input as combined_mock_up_data_eval.csv.
# `path` stays a module-level name because the statements after this loop read it.
path = '../../MLModel/data/{}/'
for each_model in model_list:
    model_dir = path.format(each_model)
    data = pd.read_csv(model_dir + 'mock_up_data1.csv', encoding='utf8')
    data['from'] = each_model
    data.to_csv(model_dir + 'combined_mock_up_data_eval.csv', index = False, encoding = 'utf8')
    
    

# CutDebt and Installment each receive the other's label-0 rows; the enlarged
# frames are written to combined_mock_up_data.csv in each model's folder.
# NOTE(review): the cross-validation loop below reads
# combined_mock_up_data_eval.csv, so these two files are not consumed in this
# part of the notebook — confirm where they are used.
cut_dir = path.format('CutDebt')
ins_dir = path.format('Installment')
data_cut = pd.read_csv(cut_dir + 'combined_mock_up_data_eval.csv', encoding = 'utf8')
data_ins = pd.read_csv(ins_dir + 'combined_mock_up_data_eval.csv', encoding = 'utf8')

# Take both label-0 slices before either frame is extended, so neither model
# gets its own rows handed back.
temp_cut = data_cut.loc[data_cut['label'] == 0]
temp_ins = data_ins.loc[data_ins['label'] == 0]

data_cut = pd.concat([data_cut, temp_ins], ignore_index=True)
data_ins = pd.concat([data_ins, temp_cut], ignore_index=True)

data_cut.to_csv(cut_dir + 'combined_mock_up_data.csv', index = False, encoding = 'utf8')
data_ins.to_csv(ins_dir + 'combined_mock_up_data.csv', index = False, encoding = 'utf8')



RANDOM_SEED = 42   # fixed so the shuffle and the classifier give the same flagged rows on every run
N_SPLITS = 10      # number of cross-validation folds


def _find_label_disagreements(data, n_splits=N_SPLITS, seed=RANDOM_SEED):
    """Return the rows of ``data`` whose out-of-fold prediction differs from ``label``.

    ``data`` is split into ``n_splits`` consecutive folds (no shuffling inside
    KFold). For each fold a word-level TF-IDF vectorizer (1- to 3-grams) and a
    LinearSVC wrapped in CalibratedClassifierCV are fitted on the remaining
    folds and used to predict the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``split_text`` (space-separated tokens) and
        ``label``.
    n_splits : int
        Number of folds passed to KFold.
    seed : int
        ``random_state`` given to LinearSVC.

    Returns
    -------
    pandas.DataFrame
        The held-out rows where ``label != pred_label``, with the extra
        ``pred_label`` column. All columns of ``data`` are present even when
        no row is flagged.
    """
    flagged = []
    splitter = KFold(n_splits=n_splits, shuffle=False, random_state=None)
    for train_index, val_index in splitter.split(data):
        train_df = data.iloc[train_index]
        # Copy so that adding pred_label does not write into a slice of `data`.
        val_df = data.iloc[val_index].copy()

        phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                            strip_accents='unicode',
                                            max_features=100000,
                                            analyzer='word',
                                            sublinear_tf=True,
                                            token_pattern=r'\w{1,}')
        # The vocabulary is learned on the training folds only.
        train_tfidf = phrase_vectorizer.fit_transform(train_df['split_text'].values)
        val_tfidf = phrase_vectorizer.transform(val_df['split_text'].values)

        # linear svc
        lsvc = CalibratedClassifierCV(LinearSVC(random_state=seed))
        lsvc.fit(train_tfidf, train_df['label'])
        val_df['pred_label'] = lsvc.predict(val_tfidf)

        flagged.append(val_df[val_df['label'] != val_df['pred_label']])
    # One concat over frames that all carry the full column set, so the caller
    # can drop columns even if nothing was flagged.
    return pd.concat(flagged)


for each_model in model_list:
    print(each_model)
    path = '../../MLModel/data/{}/'
    data = pd.read_csv(path.format(each_model) + 'combined_mock_up_data_eval.csv', encoding = 'utf8')
    data = data.dropna()

    # The CSV's `split_text` column holds the raw text: keep a copy of it, then
    # segment with jieba and replace punctuation with spaces.
    data['original_text'] = data['split_text']
    data['split_text'] = data['split_text'].apply(cut_words).apply(clean)
    data['label'] = data['label'].apply(clean_label)

    # The highest label is the "other" class: drop the file's own rows for it
    # and refill that class from the shared `others` set below.
    other_label = data['label'].max()
    data = data[data['label'] != other_label]

    # Labels that the strategy matrix marks with 0 for this model.
    other_set = set(other_matrix[other_matrix[each_model]==0].label.values)
    ava_others = sub_df(others,other_set,target='original_label')
    ava_others[target] = other_label
    data = pd.concat([data,ava_others],sort=True)

    # Single seeded shuffle of the final frame; the earlier extra shuffle was
    # redundant because the rows were reshuffled here anyway.
    data = data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
    print(data.label.value_counts())

    # prepare data done!
    ##################################################################################
    # K fold
    result = _find_label_disagreements(data)
    print('total length is {}!! Label needs to be checked length is {}'.format(len(data), len(result)))
    result = result.drop(columns=['pred_label','split_text'])
    # NOTE(review): output goes under ../../data/ while every input is read from
    # ../../MLModel/data/ — confirm this is the intended location.
    result.to_csv('../../data/{}/labelNeedsCorrections.csv'.format(each_model),index=False,encoding='utf8')
        
        
    

    


CutDebt
2    4228
1    2144
0    1984
Name: label, dtype: int64
total length is 8356!! Label needs to be checked length is 1379
IDClassifier
2    3239
0    1447
1    1296
Name: label, dtype: int64
total length is 5982!! Label needs to be checked length is 1109
IfKnowDebtor
2    3239
0    1483
1    1177
Name: label, dtype: int64
total length is 5899!! Label needs to be checked length is 728
Installment
2    4228
0    2137
1    2034
Name: label, dtype: int64
total length is 8399!! Label needs to be checked length is 1636
ConfirmLoan
2    3198
0     580
1     553
Name: label, dtype: int64
total length is 4331!! Label needs to be checked length is 375
WillingToPay
3    3955
0    2114
1    2095
2    2069
Name: label, dtype: int64
total length is 10233!! Label needs to be checked length is 2819


In [145]:
# Display the flagged rows held in `result`, i.e. those of the model processed
# by the most recently executed iteration of the per-model loop.
result

Unnamed: 0,from,label,original_label,original_text
0,其它通讯方式,3,105.0,联系我律师吧
5,WillingToPay,2,,谢谢你
10,WillingToPay,2,,我忙完了给你回电话
12,请求等下打来,3,104.0,我等会去看下
16,WillingToPay,2,,你说拨打的用户，他没啦
17,WillingToPay,2,,发工资处理
20,WillingToPay,0,,嗯嗯，好的
23,请求重复,3,103.0,你说话听不清啊
24,WillingToPay,1,,钱到就还
42,WillingToPay,1,,太麻烦


In [144]:
# Class balance of whatever frame `data` currently holds.
data.label.value_counts()

3    3109
0    2114
1    2095
2    2069
Name: label, dtype: int64