In [37]:
#encoding=utf-8
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

import pickle
import sys,os
import re
import jieba
jieba.load_userdict("../WordCut/userdict.txt")
import string

In [19]:
sys.path.append('../TimePattern/')
from  time_pattern import TimePattern
t = TimePattern('../TimePattern/mapping.csv')
kf = KFold(n_splits=10, shuffle=False, random_state=None)

In [24]:
def sub_df(df,sets,target='label'):
    result = pd.DataFrame()
    for each in sets:
        result = pd.concat([result,df[df[target]==each]])
#     print(result[target].value_counts())
    return result


def cut_words(text):
    ##### more -- added by wei
    # this is used to remove time patterns from sentence
    text = re.sub(r' ','',text)
    text = t.remove_time(text)
    #########
    seg_list = jieba.cut(text, cut_all=False)
    return " ".join(seg_list)

def clean(text):
    text = re.sub(f'([{string.punctuation}“”¨«»®´·º ½¾¿¡§£₤‘’，])',' ', text)
    text = text.split(' ')
    text = ' '.join(text)
    return text

def clean_label(label):
    return int(label)

others = pd.read_csv('../../data/others/irrelevant_response_training_set.csv')
other_matrix = pd.read_csv('../../data/others/strategy_mat.csv')

others = others.rename({'文本':'original_text','类别':'from'},axis=1)
mapping = other_matrix.set_index('category').label.drop_duplicates()
others['original_label'] = others['from'].map(mapping)

others['split_text']=others['original_text'].apply(cut_words)
    
# # cleaning and save
others['split_text'] = others['split_text'].apply(clean)

data = others.copy()
data = data.sample(frac=1,random_state=19)

# K fold
kf = KFold(n_splits=10, shuffle=False, random_state=None)
ss = kf.split(data)
result = pd.DataFrame()

for train_index,val_index in ss:
    train_df = data.iloc[train_index]
    val_df = data.iloc[val_index].copy()
    train_data = train_df.split_text.values
    val_data = val_df.split_text.values
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                strip_accents='unicode', 
                                max_features=100000, 
                                analyzer='word',
                                sublinear_tf=True,
                                token_pattern=r'\w{1,}')


    phrase_vectorizer.fit(train_data)
    train_tfidf = phrase_vectorizer.transform(train_data)
    val_tfidf = phrase_vectorizer.transform(val_data)
    
    # linear svc
    l_svc = LinearSVC()
    lsvc = CalibratedClassifierCV(l_svc) 
    lsvc.fit(train_tfidf, train_df.original_label)
    preds = lsvc.predict(val_tfidf)
    val_df['pred_label'] = preds
    val_df = val_df[val_df.original_label != val_df.pred_label]
    result = pd.concat([result,val_df])




In [26]:
others.original_label.value_counts()

104    825
103    651
109    635
110    597
100    566
102    350
107    331
101    244
105    225
108    225
106    217
Name: original_label, dtype: int64

In [31]:
jn = [104,103,109]
ts = [110,100,102,107]
wei = set(others.original_label.value_counts().index.values) - set(jn)-set(ts)

In [35]:
df_jn = sub_df(others,jn,'original_label')
df_jn.to_csv('../../data/others/jiangning_other.csv',index=False,encoding='utf8')
df_ts = sub_df(others,ts,'original_label')
df_ts.to_csv('../../data/others/tanshu_other.csv',index=False,encoding='utf8')
df_wei = sub_df(others,wei,'original_label')
df_wei.to_csv('../../data/others/wei_other.csv',index=False,encoding='utf8')

In [68]:
df = pd.read_csv('../../data/others/combined_cleaned_others.csv')
df_mapping = pd.read_csv('../../data/others/strategy_mat.csv')
mapping = df_mapping.set_index('label').category.drop_duplicates()
df['from'] = df.original_label.map(mapping)

In [73]:
df.rename({'from':'类别','original_text':'文本'},inplace=True)

In [74]:
df

Unnamed: 0,original_text,from,original_label,split_text
0,哎呀烦不烦,故意岔开话题,109,哎呀 烦不烦
1,不关我的事,故意岔开话题,109,不关 我 的 事
2,不认识，不要再打过来了啊,故意岔开话题,109,不 认识 不要 再 打 过来 了 啊
3,不要给我说土味情话,故意岔开话题,109,不要 给 我 说 土味 情话
4,不要再骚扰我了，不然我报警了,故意岔开话题,109,不要 再 骚扰 我 了 不然 我 报警 了
5,打错了,故意岔开话题,109,打错 了
6,烦不烦啊，请你以后不要打这个电话了，在打我要屏蔽你了,故意岔开话题,109,烦不烦 啊 请 你 以后 不要 打 这个 电话 了 在 打 我 要 屏蔽 你 了
7,烦死了，天天催！催命啊,故意岔开话题,109,烦死 了 天天 催 ！ 催命 啊
8,风太大，没听清,故意岔开话题,109,风太大 没 听 清
9,滚,故意岔开话题,109,滚


In [63]:
df.original_label

0       109
1       109
2       109
3       109
4       109
5       109
6       109
7       109
8       109
9       109
10      109
11      109
12      109
13      109
14      109
15      109
16      109
17      109
18      109
19      109
20      109
21      109
22      109
23      109
24      109
25      107
26      109
27      109
28      109
29      109
       ... 
4665    101
4666    101
4667    101
4668    101
4669    101
4670    101
4671    101
4672    101
4673    101
4674    101
4675    101
4676    101
4677    101
4678    101
4679    101
4680    101
4681    101
4682    101
4683    101
4684    101
4685    101
4686    101
4687    101
4688    101
4689    101
4690    101
4691    101
4692    109
4693    109
4694    109
Name: original_label, Length: 4695, dtype: int64