In [18]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

import pickle
import sys,os
import re
import jieba
jieba.load_userdict("../WordCut/userdict.txt")
import string

In [19]:
# Put the ../TimePattern/ directory on the import path; this has to run before
# the `time_pattern` import on the next line can succeed.
sys.path.append('../TimePattern/')
from  time_pattern import TimePattern
# Module-level helper; cut_words() calls t.remove_time() on every sentence.
# NOTE(review): mapping.csv presumably holds the time-pattern definitions --
# confirm against time_pattern.py.
t = TimePattern('../TimePattern/mapping.csv')
# 10-fold splitter without shuffling.  NOTE: `kf` is assigned again, with the
# same arguments, in the K-fold cell further down in this notebook.
kf = KFold(n_splits=10, shuffle=False, random_state=None)

In [24]:
def sub_df(df,sets,target='label'):
    """Return the rows of ``df`` whose ``target`` column equals a value in ``sets``.

    Rows come back grouped in the order in which ``sets`` yields its values;
    inside each group the original row order and index labels of ``df`` are
    kept.  If a value occurs more than once in ``sets`` its rows are repeated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from.
    sets : iterable
        Values of ``df[target]`` to keep (list, set, generator, ...).
    target : str, default 'label'
        Name of the column that is compared against each value.

    Returns
    -------
    pandas.DataFrame
        A new frame.  When ``sets`` is empty the result is an empty frame that
        still carries the columns of ``df``.
    """
    # Build every slice first and concatenate once.  Growing a frame with
    # pd.concat inside the loop re-copies all previous rows on each pass, and
    # seeding it with an empty pd.DataFrame() relies on pandas ignoring empty
    # entries when it works out the result dtypes (deprecated behaviour).
    pieces = [df[df[target] == each] for each in sets]
    if not pieces:
        # pd.concat raises on an empty list; return an empty selection instead.
        return df.iloc[0:0].copy()
    return pd.concat(pieces)


def cut_words(text):
    """Segment a sentence into space-separated tokens.

    Steps, in order:
      1. delete every ASCII space from ``text``;
      2. pass the result through ``t.remove_time`` (added by wei to remove
         time patterns from the sentence);
      3. tokenise with ``jieba.cut(..., cut_all=False)``;
      4. join the tokens with single spaces.

    Relies on the module-level ``t`` (a ``TimePattern`` instance).
    """
    without_spaces = re.sub(r' ','',text)
    without_times = t.remove_time(without_spaces)
    tokens = jieba.cut(without_times, cut_all=False)
    return " ".join(tokens)

# Character class of everything clean() blanks out.  string.punctuation goes
# through re.escape so its backslash, brackets, caret and hyphen are matched
# literally instead of being read as character-class syntax (unescaped, the
# sequence `\]` is an escaped `]`, so a backslash was never matched).
_CLEAN_PATTERN = re.compile(
    f'[{re.escape(string.punctuation)}“”¨«»®´·º ½¾¿¡§£₤‘’，]'
)


def clean(text):
    """Replace each punctuation character in ``text`` with a single space.

    The characters replaced are all of ``string.punctuation`` plus the extra
    symbols listed in ``_CLEAN_PATTERN`` (curly quotes, the full-width comma,
    and others).  Every match becomes exactly one space; runs of spaces are
    not collapsed and the string is not stripped.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    return _CLEAN_PATTERN.sub(' ', text)

def clean_label(label):
    """Convert ``label`` to a built-in ``int`` using ``int()``.

    Whatever ``int()`` raises for an unconvertible value propagates unchanged.
    """
    as_int = int(label)
    return as_int

# Load the labelled utterances and the category -> label table.
others = pd.read_csv('../../data/others/irrelevant_response_training_set.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')

# '文本' (text) becomes original_text, '类别' (category) becomes from.
others = others.rename(columns={'文本':'original_text','类别':'from'})

# Build a category -> label lookup for Series.map, which needs a unique index.
# De-duplicate on the *category* key: dropping duplicate label values instead
# would silently discard every later category that shares a label with an
# earlier one, leaving those rows with a NaN original_label.
mapping = other_matrix.drop_duplicates(subset='category').set_index('category')['label']
others['original_label'] = others['from'].map(mapping)

# Segment each utterance, then blank out punctuation in the segmented text.
others['split_text'] = others['original_text'].apply(cut_words).apply(clean)

# Shuffle once with a fixed seed; .sample already returns a new frame, so
# `others` itself is left in its original order.
data = others.sample(frac=1,random_state=19)

# K fold: contiguous, unshuffled folds over the already-shuffled rows.
kf = KFold(n_splits=10, shuffle=False, random_state=None)
ss = kf.split(data)
misclassified_folds = []

for train_index,val_index in ss:
    train_df = data.iloc[train_index]
    val_df = data.iloc[val_index].copy()  # copy: a column is added below
    train_data = train_df.split_text.values
    val_data = val_df.split_text.values
    # The vectorizer is re-fitted on the training part of every fold so that
    # no vocabulary / IDF statistics leak in from the validation rows.
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode',
                                max_features=100000,
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')

    train_tfidf = phrase_vectorizer.fit_transform(train_data)
    val_tfidf = phrase_vectorizer.transform(val_data)

    # linear svc wrapped in a probability calibrator
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc)
    lsvc.fit(train_tfidf, train_df.original_label)
    preds = lsvc.predict(val_tfidf)
    val_df['pred_label'] = preds
    # Keep only the validation rows the model got wrong.
    val_df = val_df[val_df.original_label != val_df.pred_label]
    misclassified_folds.append(val_df)

# Concatenate once at the end instead of growing `result` inside the loop.
result = pd.concat(misclassified_folds)




In [25]:
# Validation rows from all folds where original_label != pred_label.
result

Unnamed: 0,original_text,from,original_label,split_text,pred_label
3663,你怎么不早说,讨价还价,100,你 怎么 不早 说,107
547,爱干啥干啥,不愿配合,110,爱干 啥 干 啥,101
767,把要还的明显发我下，我核对下。,确认数额,102,把 要 还 的 明显 发 我 下 我 核对 下 。,105
4343,闪电了，太危险了。挂了。,故意岔开话题,109,闪电 了 太 危险 了 。 挂 了 。,104
859,你们公司在哪点，我过来,其它通讯方式,105,你们 公司 在 哪点 我 过来,104
4090,以后再说,请求等下打来,104,以后 再说,100
1265,已经还了，提示正在交易中,还款方式,108,已经 还 了 提示 正在 交易 中,100
10,说,说出目的,101,说,100
4788,可不可以请我吃饭,不愿配合,110,可不可以 请 我 吃饭,100
3546,你过来就处理,不愿配合,110,你 过来 就 处理,100


In [26]:
# Number of rows per mapped label in `others`.
others.original_label.value_counts()

104    825
103    651
109    635
110    597
100    566
102    350
107    331
101    244
105    225
108    225
106    217
Name: original_label, dtype: int64

In [31]:
# Label ids split into three groups; each group is exported to its own CSV
# (jiangning_other / tanshu_other / wei_other) in the export cell below.
jn = [104, 103, 109]
ts = [110, 100, 102, 107]
# `wei` is every label present in `others` that is in neither list above.
observed_labels = set(others.original_label.value_counts().index.values)
wei = observed_labels - set(jn) - set(ts)

In [35]:
# Slice `others` into the three label groups ...
df_jn = sub_df(others, jn, target='original_label')
df_ts = sub_df(others, ts, target='original_label')
df_wei = sub_df(others, wei, target='original_label')

# ... and write each slice to its own UTF-8 CSV, without the index column.
for subset_df, csv_path in (
    (df_jn, '../../data/others/jiangning_other.csv'),
    (df_ts, '../../data/others/tanshu_other.csv'),
    (df_wei, '../../data/others/wei_other.csv'),
):
    subset_df.to_csv(csv_path, index=False, encoding='utf8')

In [36]:
# Rows of `others` whose original_label is one of the values in `jn`.
df_jn

Unnamed: 0,original_text,from,original_label,split_text
115,别打扰我,请求等下打来,104,别 打扰 我
116,车速太快,请求等下打来,104,车速 太快
117,大雨即将来临,请求等下打来,104,大雨 即将来临
118,地震得好厉害，好厉害，好晃动！,请求等下打来,104,地震 得 好 厉害 好 厉害 好 晃动 ！
119,对不起家里有客人来了，一会儿再聊,请求等下打来,104,对不起 家里 有 客人 来 了 一会儿 再聊
120,好吵哦，听不到,请求等下打来,104,好 吵 哦 听 不到
121,好了，等会儿再说，我要出去趟,请求等下打来,104,好 了 等会儿 再说 我要 出去 趟
122,好了，先聊到这，我这边有其他事情，晚点再说,请求等下打来,104,好 了 先聊 到 这 我 这边 有 其他 事情 晚点 再说
123,火车来了,请求等下打来,104,火车 来 了
124,马上进电梯了，没信号,请求等下打来,104,马上 进 电梯 了 没 信号
