In [1]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

import pickle
import sys,os
import re
import jieba
jieba.load_userdict("../../MLModel/code/WordCut/userdict.txt")
import string

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.801 seconds.
Prefix dict has been built succesfully.


In [2]:
# The TimePattern helper lives outside the package path, so its directory has
# to be on sys.path before the import below can succeed.
sys.path.append('../../MLModel/code/TimePattern/')
from time_pattern import TimePattern

# Shared TimePattern instance; cut_words() calls t.remove_time() on every text.
t = TimePattern('../../MLModel/code/TimePattern/mapping.csv')

# 10-fold splitter without shuffling.
kf = KFold(
    n_splits=10,
    shuffle=False,
    random_state=None,
)

In [14]:
def sub_df(df,sets,target='label'):
    """Select the rows of ``df`` whose ``target`` column matches a value in ``sets``.

    Rows are grouped in the iteration order of ``sets`` (all rows equal to the
    first value, then all rows equal to the second, ...); within a group the
    original row order and index labels of ``df`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    sets : iterable
        Values of ``df[target]`` to keep. A value listed twice contributes its
        rows twice.
    target : str, default 'label'
        Name of the column compared against each value.

    Returns
    -------
    pandas.DataFrame
        The concatenated matches, or an empty ``DataFrame`` when ``sets`` is empty.
    """
    # Build every slice first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on each iteration (quadratic), and
    # seeding with an empty DataFrame triggers pandas' empty-entry concat
    # deprecation warning.
    pieces = [df[df[target] == each] for each in sets]
    if not pieces:
        # pd.concat([]) raises; keep the original "empty in, empty out" result.
        return pd.DataFrame()
    return pd.concat(pieces)


def cut_words(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces.

    Before segmentation every space character is removed and the result is
    passed through ``t.remove_time`` (per the original author's note, this is
    meant to strip time patterns from the sentence).
    """
    # Drop spaces so they do not end up as tokens of their own.
    compact = re.sub(r' ', '', text)
    # Time-pattern removal, delegated to the module-level TimePattern instance.
    without_time = t.remove_time(compact)
    # cut_all=False is passed through to jieba.cut unchanged.
    tokens = jieba.cut(without_time, cut_all=False)
    return " ".join(tokens)

# ASCII punctuation is passed through re.escape before being placed in the
# character class. Unescaped, the "\]" inside string.punctuation is read by the
# regex engine as an escaped bracket, so a literal backslash was never matched.
_CLEAN_PATTERN = re.compile(
    f'[{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，]'
)


def clean(text):
    """Replace every punctuation character in ``text`` with a single space.

    Covers all of ``string.punctuation`` (including the backslash) plus the
    extra quotation marks, symbols and the full-width comma listed in the
    pattern. Each matched character becomes exactly one space; runs of spaces
    are not collapsed.

    Parameters
    ----------
    text : str
        Text to clean (here: space-joined tokens).

    Returns
    -------
    str
        ``text`` with each punctuation character replaced by a space.
    """
    # The former split(' ') / ' '.join(...) round trip returned the string
    # unchanged, so it has been dropped.
    return _CLEAN_PATTERN.sub(' ', text)

def clean_label(label):
    """Coerce ``label`` to a built-in ``int`` and return it."""
    as_int = int(label)
    return as_int

# Load the training set and the category/label lookup table.
others = pd.read_csv('../../MLModel/data/others/irrelevant_response_training_set.csv')
other_matrix = pd.read_csv('../../MLModel/data/others/strategy_mat.csv')

# The raw file uses the headers 文本 / 类别; rename them to the column names
# the rest of this cell works with.
others = others.rename({'文本':'original_text','类别':'from'},axis=1)

# category -> label lookup.
# De-duplicate on the (category, label) PAIR. The previous
# `set_index('category').label.drop_duplicates()` de-duplicated the label
# VALUES, so a second category sharing an already-seen label was silently
# dropped and its rows mapped to NaN. A category listed with two different
# labels still yields a duplicated index and makes `.map` raise, as before.
mapping = (
    other_matrix[['category', 'label']]
    .drop_duplicates()
    .set_index('category')['label']
)
others['original_label'] = others['from'].map(mapping)

# Segment each text (cut_words), then replace punctuation with spaces (clean).
others['split_text'] = others['original_text'].apply(cut_words).apply(clean)

# Shuffle all rows with a fixed seed. `sample` already returns a new frame, so
# no explicit copy of `others` is needed first.
data = others.sample(frac=1,random_state=19)

# K fold
# shuffle=False makes the validation folds consecutive slices of `data`, so the
# per-fold predictions concatenated at the end line up row-for-row with
# data.original_label.
kf = KFold(n_splits=10, shuffle=False, random_state=None)
ss = kf.split(data)

# Per-fold pieces are collected in lists and concatenated once after the loop;
# concatenating inside the loop re-copies everything gathered so far each time.
misclassified_folds = []   # rows of each validation fold that were predicted wrongly
fold_preds = []            # raw prediction array of each fold, in fold order

for train_index,val_index in ss:
    train_df = data.iloc[train_index]
    # .copy() so that adding pred_label below does not write into a view of `data`.
    val_df = data.iloc[val_index].copy()
    train_data = train_df.split_text.values
    val_data = val_df.split_text.values

    # Word 1- to 3-gram TF-IDF over the space-separated tokens. The vocabulary
    # is fitted on the training fold only and then applied to the validation fold.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode',
                                max_features=100000,
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')
    train_tfidf = phrase_vectorizer.fit_transform(train_data)
    val_tfidf = phrase_vectorizer.transform(val_data)

    # linear svc wrapped in CalibratedClassifierCV
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(train_tfidf, train_df.original_label)
    preds = lsvc.predict(val_tfidf)

    val_df['pred_label'] = preds
    # Keep only the misclassified rows for later inspection.
    val_df = val_df[val_df.original_label != val_df.pred_label]
    misclassified_folds.append(val_df)
    fold_preds.append(preds)

# All misclassified validation rows across the folds.
result = pd.concat(misclassified_folds) if misclassified_folds else pd.DataFrame()

# The leading empty float array reproduces the dtype promotion of the original
# `all_preds = np.array([])` seed, so downstream code sees the same dtype.
all_preds = np.concatenate([np.array([])] + fold_preds)





In [20]:
# Make the project-local Lib directory importable, then load the evaluation helper.
sys.path.append('../../Lib/')
from model_matrix import eval_mat
# all_preds holds the per-fold predictions concatenated in fold order. Because the
# KFold above is unshuffled, the folds are consecutive slices of `data`, so the two
# arrays passed here are aligned row-for-row.
# NOTE(review): eval_mat is defined outside this notebook; judging by the output it
# renders a confusion matrix with a per-class recall column — confirm.
eval_mat(data.original_label.values,all_preds)

Unnamed: 0,pred_100,pred_101,pred_102,pred_103,pred_104,pred_105,pred_106,pred_107,pred_108,pred_109,recall
actual_100,380.0,0.0,20.0,3.0,26.0,1.0,21.0,1.0,9.0,23.0,0.785124
actual_101,1.0,179.0,3.0,14.0,7.0,1.0,3.0,15.0,3.0,30.0,0.699219
actual_102,13.0,1.0,274.0,3.0,8.0,4.0,4.0,1.0,5.0,20.0,0.822823
actual_103,4.0,6.0,2.0,550.0,10.0,1.0,0.0,6.0,0.0,26.0,0.909091
actual_104,13.0,8.0,3.0,9.0,575.0,6.0,3.0,1.0,0.0,49.0,0.862069
actual_105,4.0,0.0,3.0,1.0,15.0,177.0,0.0,1.0,1.0,9.0,0.838863
actual_106,37.0,4.0,7.0,2.0,9.0,0.0,116.0,0.0,4.0,34.0,0.544601
actual_107,2.0,8.0,5.0,6.0,5.0,1.0,1.0,277.0,0.0,31.0,0.824405
actual_108,8.0,2.0,1.0,2.0,1.0,7.0,0.0,2.0,178.0,14.0,0.827907
actual_109,43.0,28.0,19.0,42.0,63.0,9.0,22.0,50.0,16.0,1083.0,0.787636


In [22]:
# Re-read the raw training set from disk. This rebinds `others`, so columns that
# were derived on the previous frame (original_label, split_text) and the
# earlier column renames are not present on this one.
others = pd.read_csv('../../MLModel/data/others/irrelevant_response_training_set.csv')

In [25]:
# Order the rows by the 类别 (category) column so rows of one category are adjacent.
others = others.sort_values('类别')

In [28]:
# Display the number of rows per 类别 (category), most frequent first.
others['类别'].value_counts()

故意岔开话题    1375
请求等下打来     667
请求重复       605
讨价还价       484
回问身份       336
确认数额       333
说出目的       256
还款方式       215
模糊确认       213
其它通讯方式     211
Name: 类别, dtype: int64

In [31]:
# `others` was re-read from CSV in an earlier cell, which discards the derived
# `original_label` column. Rebuild it when it is missing so this cell works on
# a fresh Restart & Run All instead of relying on leftover kernel state.
if 'original_label' not in others.columns:
    category_to_label = (
        other_matrix[['category', 'label']]
        .drop_duplicates()
        .set_index('category')['label']
    )
    others['original_label'] = others['类别'].map(category_to_label)

# Three disjoint groups of labels; each group is exported to its own CSV in the
# next cell (jiangning / tanshu / wei).
jn = [104,103,109]
# NOTE(review): label 110 is not among the labels 100-109 seen in the
# evaluation output above — confirm it is intended.
ts = [110,100,102,107]
# Every label present in the data that is not already claimed by jn or ts.
wei = set(others.original_label.value_counts().index.values) - set(jn)-set(ts)

In [35]:
# One (label group, output file) pair per export. Each subset is selected and
# written immediately, in the same order as before.
_group_exports = (
    (jn, '../../data/others/jiangning_other.csv'),
    (ts, '../../data/others/tanshu_other.csv'),
    (wei, '../../data/others/wei_other.csv'),
)

_group_frames = []
for _labels, _csv_path in _group_exports:
    _subset = sub_df(others, _labels, 'original_label')
    _subset.to_csv(_csv_path, index=False, encoding='utf8')
    _group_frames.append(_subset)

# Keep the per-group frames available under their original names.
df_jn, df_ts, df_wei = _group_frames

In [68]:
# Load the combined, cleaned rows together with the label/category table.
df = pd.read_csv('../../data/others/combined_cleaned_others.csv')
df_mapping = pd.read_csv('../../data/others/strategy_mat.csv')

# label -> category lookup. Note that this rebinds `mapping`, which earlier in
# the notebook held the opposite (category -> label) direction.
# De-duplicate on the (label, category) PAIR: the previous
# `set_index('label').category.drop_duplicates()` de-duplicated category
# VALUES, so a label whose category had already appeared under another label
# was silently dropped and mapped to NaN.
mapping = (
    df_mapping[['label', 'category']]
    .drop_duplicates()
    .set_index('label')['category']
)
df['from'] = df.original_label.map(mapping)

In [75]:
# Restore the headers used by the training-set file.
df = df.rename(columns={'from':'类别','original_text':'文本'})

In [77]:
# Remove the derived columns before the frame is written out.
df = df.drop(columns=['original_label','split_text'])

In [79]:
# Write the frame out as the training-set CSV, without the index column.
# NOTE(review): this is the same file name that is read at the top of the notebook,
# but under '../../data/' rather than '../../MLModel/data/' — confirm the target path.
df.to_csv('../../data/others/irrelevant_response_training_set.csv',index=False,encoding='utf8')