In [1]:
import warnings

import jieba
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
# train_test_split is imported from sklearn.model_selection (already used above);
# the former sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Silence library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


6分类

In [2]:
# Load the labelled answers (first CSV column becomes the index) and show how
# many rows carry each review score.
df = pd.read_csv('data/27378.csv', index_col=0)
print('分数分布:', df['复核结果'].value_counts(), sep='\n')

# Stop-word list: every line of the file, stripped, collected into a set.
with open('ChineseStopWords/ChineseStopWords.txt', encoding='utf-8') as f:
    stop_words = set(map(str.strip, f))
    
def segmentWord(cont, stopwords=None, cut=None):
    """Tokenise every text and drop stop words.

    Parameters
    ----------
    cont : iterable
        Raw texts. ``None`` and float NaN entries (what pandas yields for an
        empty CSV cell) are treated as empty text instead of being handed to
        the tokenizer, which expects text.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    cut : callable, optional
        ``cut(text)`` returning an iterable of tokens. Defaults to
        ``jieba.cut(text, cut_all=False)``.

    Returns
    -------
    tuple(list of str, list of int)
        For each input text: the kept tokens, each followed by one space, and
        the number of tokens that were discarded (stop words and ``'\\r\\n'``).
    """
    if stopwords is None:
        stopwords = stop_words
    if cut is None:
        def cut(text):
            return jieba.cut(text, cut_all=False)

    texts = []
    dropped_counts = []
    for raw in cont:
        # raw != raw is only true for NaN.
        missing = raw is None or (isinstance(raw, float) and raw != raw)
        tokens = [] if missing else list(cut(raw))
        kept = [tok for tok in tokens if tok not in stopwords and tok != '\r\n']
        # Every kept token is followed by a space (trailing space included),
        # exactly as the downstream length feature expects.
        texts.append(''.join(tok + ' ' for tok in kept))
        dropped_counts.append(len(tokens) - len(kept))
    return texts, dropped_counts

# Replace each answer with its space-separated tokens and keep the number of
# discarded tokens per answer as an extra feature.
df['回答内容'], df['去停频数'] = segmentWord(df['回答内容'])
# Character length of the segmented text (separator spaces included).
df['回答长度'] = df['回答内容'].map(len)
con_data = df['回答内容'].values

# Text matrix from a Tokenizer capped at num_words=3300.
token = Tokenizer(num_words=3300)
token.fit_on_texts(con_data)
data_x_mat = token.texts_to_matrix(con_data)

# Append the two numeric columns to the right of the text matrix.
x_data2 = df[['回答长度', '去停频数']].values
x_data = np.concatenate((data_x_mat, x_data2), axis=1)
y_data = df['复核结果'].values

# Hold out 10% of the rows as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=2020
)

# Per-class sample counts of both splits.
for title, targets in (('训练集分布:', y_train), ('测试集分布:', y_test)):
    print(title)
    for label in range(6):
        print('{}:{}'.format(label, np.count_nonzero(targets == label)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache


分数分布:
3    740
4    503
2    191
1    167
5     67
0     14
Name: 复核结果, dtype: int64


Loading model cost 1.053 seconds.
Prefix dict has been built succesfully.


训练集分布:
0:12
1:146
2:171
3:670
4:453
5:61
测试集分布:
0:2
1:21
2:20
3:70
4:50
5:6


In [None]:
# cv_params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
# other_params = {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
#                 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
# model = XGBClassifier(**other_params)
# optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='accuracy', cv=2, verbose=3)
# optimized_GBM.fit(x_train, y_train)
# evalute_result = optimized_GBM.grid_scores_
# print('每轮迭代运行结果:{0}'.format(evalute_result))
# print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
# print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))


In [6]:
# Gradient-boosted multi-class classifier for the 6-level score.
# NOTE(review): scale_pos_weight is documented for binary objectives and is
# presumably ignored with multi:softmax -- confirm before relying on it.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=300,
    max_depth=3,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    objective='multi:softmax',
    nthread=-1,
    scale_pos_weight=1,
    seed=27
)
# NOTE(review): the last eval_set entry (the test split) drives early stopping,
# so the test metrics printed below are optimistic. A separate validation split
# would give an unbiased estimate.
eval_list = [(x_train, y_train), (x_test, y_test)]
model.fit(
    x_train, y_train,
    verbose=True,
    eval_set=eval_list,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
)
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred))

# One sorted label list shared by every report. Deriving it from set(y_test)
# depends on set iteration order and silently drops classes that happen to be
# absent from the test split, which misaligns the class names.
labels = np.unique(np.concatenate([y_train, y_test])).tolist()
print('标签顺序:')
print(labels)
print('测试集混淆矩阵:')
print(confusion_matrix(y_test, y_pred, labels=labels))

y_pred_train = model.predict(x_train)
print('训练集混淆矩阵:')
print(confusion_matrix(y_train, y_pred_train, labels=labels))

# Class names are generated from the labels so they can never get out of step.
target_names = ['class {}'.format(label) for label in labels]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

print("修正MSE:")
# Despite the printed name this is 1 - (mean absolute error) / 2.
mse = 1 - np.abs(y_test - y_pred).sum() / (2 * len(y_test))
print(mse)

[0]	validation_0-mlogloss:1.66501	validation_1-mlogloss:1.66851
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:1.55941	validation_1-mlogloss:1.57107
[2]	validation_0-mlogloss:1.47596	validation_1-mlogloss:1.4898
[3]	validation_0-mlogloss:1.40038	validation_1-mlogloss:1.41335
[4]	validation_0-mlogloss:1.33292	validation_1-mlogloss:1.34652
[5]	validation_0-mlogloss:1.27139	validation_1-mlogloss:1.28339
[6]	validation_0-mlogloss:1.22096	validation_1-mlogloss:1.23587
[7]	validation_0-mlogloss:1.17342	validation_1-mlogloss:1.18894
[8]	validation_0-mlogloss:1.13126	validation_1-mlogloss:1.1487
[9]	validation_0-mlogloss:1.09161	validation_1-mlogloss:1.11423
[10]	validation_0-mlogloss:1.05839	validation_1-mlogloss:1.08532
[11]	validation_0-mlogloss:1.02721	validation_1-mlogloss:1.05481
[12]	validation_0-mlogloss:0.998091	validation_1-mlogloss:1.02769

[121]	validation_0-mlogloss:0.432848	validation_1-mlogloss:0.660638
[122]	validation_0-mlogloss:0.431844	validation_1-mlogloss:0.660609
[123]	validation_0-mlogloss:0.430535	validation_1-mlogloss:0.661821
[124]	validation_0-mlogloss:0.429141	validation_1-mlogloss:0.663158
[125]	validation_0-mlogloss:0.427967	validation_1-mlogloss:0.663263
Stopping. Best iteration:
[115]	validation_0-mlogloss:0.44168	validation_1-mlogloss:0.657138

0.7159763313609467
标签顺序:
[0, 1, 2, 3, 4, 5]
测试集混淆矩阵:
[[ 1  1  0  0  0  0]
 [ 1 16  2  2  0  0]
 [ 0  4  5 10  0  1]
 [ 0  0  1 59 10  0]
 [ 0  0  0 12 38  0]
 [ 0  0  0  0  4  2]]
训练集混淆矩阵:
[[  5   7   0   0   0   0]
 [  0 132   9   5   0   0]
 [  0  12 107  50   2   0]
 [  0   0   8 618  44   0]
 [  0   0   0  78 375   0]
 [  0   0   0   0   5  56]]
             precision    recall  f1-score   support

    class 0       0.50      0.50      0.50         2
    class 1       0.76      0.76      0.76        21
    class 2       0.62      0.25      0.36        20
 

5分类

In [7]:
# Reload the raw data so the label merge below starts from the original scores.
df = pd.read_csv('data/27378.csv', index_col=0)

# Merge class 0 into class 1. The result is assigned back to the column:
# calling replace(..., inplace=True) on the df[...] selection is chained
# in-place mutation, which newer pandas warns about and which no longer
# updates df under Copy-on-Write.
df['复核结果'] = df['复核结果'].replace(0, 1)
print(df['复核结果'].value_counts())

# Stop-word list: every line of the file, stripped, collected into a set.
with open('ChineseStopWords/ChineseStopWords.txt', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}
    
def segmentWord(cont, stopwords=None, cut=None):
    """Tokenise every text and drop stop words.

    Parameters
    ----------
    cont : iterable
        Raw texts. ``None`` and float NaN entries (what pandas yields for an
        empty CSV cell) are treated as empty text instead of being handed to
        the tokenizer, which expects text.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    cut : callable, optional
        ``cut(text)`` returning an iterable of tokens. Defaults to
        ``jieba.cut(text, cut_all=False)``.

    Returns
    -------
    tuple(list of str, list of int)
        For each input text: the kept tokens, each followed by one space, and
        the number of tokens that were discarded (stop words and ``'\\r\\n'``).
    """
    if stopwords is None:
        stopwords = stop_words
    if cut is None:
        def cut(text):
            return jieba.cut(text, cut_all=False)

    texts = []
    dropped_counts = []
    for raw in cont:
        # raw != raw is only true for NaN.
        missing = raw is None or (isinstance(raw, float) and raw != raw)
        tokens = [] if missing else list(cut(raw))
        kept = [tok for tok in tokens if tok not in stopwords and tok != '\r\n']
        # Every kept token is followed by a space (trailing space included),
        # exactly as the downstream length feature expects.
        texts.append(''.join(tok + ' ' for tok in kept))
        dropped_counts.append(len(tokens) - len(kept))
    return texts, dropped_counts

# Replace each answer with its space-separated tokens and keep the number of
# discarded tokens per answer as an extra feature.
df['回答内容'], df['去停频数'] = segmentWord(df['回答内容'])
# Character length of the segmented text (separator spaces included).
df['回答长度'] = df['回答内容'].map(len)
con_data = df['回答内容'].values

# Text matrix from a Tokenizer capped at num_words=3300.
token = Tokenizer(num_words=3300)
token.fit_on_texts(con_data)
data_x_mat = token.texts_to_matrix(con_data)

# Append the two numeric columns to the right of the text matrix.
x_data2 = df[['回答长度', '去停频数']].values
x_data = np.concatenate((data_x_mat, x_data2), axis=1)
y_data = df['复核结果'].values

# Hold out 10% of the rows as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=2017
)

# Per-class sample counts of both splits (labels 1..5 after the merge).
for title, targets in (('训练集分布:', y_train), ('测试集分布:', y_test)):
    print(title)
    for label in range(1, 6):
        print('{}:{}'.format(label, np.count_nonzero(targets == label)))

3    740
4    503
2    191
1    181
5     67
Name: 复核结果, dtype: int64
训练集分布:
1:164
2:174
3:660
4:454
5:61
测试集分布:
1:17
2:17
3:80
4:49
5:6


In [8]:
# Gradient-boosted multi-class classifier for the 5-level score.
# NOTE(review): scale_pos_weight is documented for binary objectives and is
# presumably ignored with multi:softmax, so the value 10 here likely has no
# effect -- confirm before relying on it.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=3,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    objective='multi:softmax',
    nthread=-1,
    scale_pos_weight=10,
    seed=27
)
# NOTE(review): the last eval_set entry (the test split) drives early stopping,
# so the test metrics printed below are optimistic. A separate validation split
# would give an unbiased estimate.
eval_list = [(x_train, y_train), (x_test, y_test)]
model.fit(
    x_train, y_train,
    verbose=True,
    eval_set=eval_list,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
)
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred))

# One sorted label list shared by every report. Deriving it from set(y_test)
# depends on set iteration order and silently drops classes that happen to be
# absent from the test split, which misaligns the class names.
labels = np.unique(np.concatenate([y_train, y_test])).tolist()
print('标签顺序:')
print(labels)
print('测试集混淆矩阵:')
print(confusion_matrix(y_test, y_pred, labels=labels))

y_pred_train = model.predict(x_train)
print('训练集混淆矩阵:')
print(confusion_matrix(y_train, y_pred_train, labels=labels))

# Class names are generated from the labels so they can never get out of step.
target_names = ['class {}'.format(label) for label in labels]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

print("修正MSE:")
# Despite the printed name this is 1 - (mean absolute error) / 2.
mse = 1 - np.abs(y_test - y_pred).sum() / (2 * len(y_test))
print(mse)

[0]	validation_0-mlogloss:1.5065	validation_1-mlogloss:1.51247
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:1.41939	validation_1-mlogloss:1.43046
[2]	validation_0-mlogloss:1.3529	validation_1-mlogloss:1.36843
[3]	validation_0-mlogloss:1.28806	validation_1-mlogloss:1.30417
[4]	validation_0-mlogloss:1.2322	validation_1-mlogloss:1.24904
[5]	validation_0-mlogloss:1.17688	validation_1-mlogloss:1.20043
[6]	validation_0-mlogloss:1.131	validation_1-mlogloss:1.15867
[7]	validation_0-mlogloss:1.09132	validation_1-mlogloss:1.12176
[8]	validation_0-mlogloss:1.05286	validation_1-mlogloss:1.08729
[9]	validation_0-mlogloss:1.01924	validation_1-mlogloss:1.05605
[10]	validation_0-mlogloss:0.989032	validation_1-mlogloss:1.02766
[11]	validation_0-mlogloss:0.96002	validation_1-mlogloss:1.00122
[12]	validation_0-mlogloss:0.936533	validation_1-mlogloss:0.980837


[121]	validation_0-mlogloss:0.413314	validation_1-mlogloss:0.644444
[122]	validation_0-mlogloss:0.412131	validation_1-mlogloss:0.64327
[123]	validation_0-mlogloss:0.411011	validation_1-mlogloss:0.642937
[124]	validation_0-mlogloss:0.409697	validation_1-mlogloss:0.641893
[125]	validation_0-mlogloss:0.408118	validation_1-mlogloss:0.640898
[126]	validation_0-mlogloss:0.406869	validation_1-mlogloss:0.641609
[127]	validation_0-mlogloss:0.405571	validation_1-mlogloss:0.641036
[128]	validation_0-mlogloss:0.40418	validation_1-mlogloss:0.641124
[129]	validation_0-mlogloss:0.402841	validation_1-mlogloss:0.639874
[130]	validation_0-mlogloss:0.401443	validation_1-mlogloss:0.641482
[131]	validation_0-mlogloss:0.399823	validation_1-mlogloss:0.640178
[132]	validation_0-mlogloss:0.398639	validation_1-mlogloss:0.639757
[133]	validation_0-mlogloss:0.397216	validation_1-mlogloss:0.638632
[134]	validation_0-mlogloss:0.395949	validation_1-mlogloss:0.638589
[135]	validation_0-mlogloss:0.394736	validation_1-

In [9]:
# Write the current `model` to disk. The number and class count in the file
# name are hard-coded text (presumably the accuracy of the saved run), so
# update the name if the model is retrained.
model.save_model('q3_xgb0.7278(5分类).model')

4分类

In [10]:
# Reload the raw data so the label merge below starts from the original scores.
df = pd.read_csv('data/27378.csv', index_col=0)

# Merge classes 0, 1 and 2 into class 1, then renumber 3 -> 2, 4 -> 3, 5 -> 4.
# A single dict-based replace maps every original value in one step, so the
# outcome does not depend on the order of the rules. The result is assigned
# back because replace(..., inplace=True) on the df[...] selection is chained
# in-place mutation, which no longer updates df under pandas Copy-on-Write.
merge_map = {0: 1, 2: 1, 3: 2, 4: 3, 5: 4}
df['复核结果'] = df['复核结果'].replace(merge_map)

# Stop-word list: every line of the file, stripped, collected into a set.
with open('ChineseStopWords/ChineseStopWords.txt', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}
    
def segmentWord(cont, stopwords=None, cut=None):
    """Tokenise every text and drop stop words.

    Parameters
    ----------
    cont : iterable
        Raw texts. ``None`` and float NaN entries (what pandas yields for an
        empty CSV cell) are treated as empty text instead of being handed to
        the tokenizer, which expects text.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    cut : callable, optional
        ``cut(text)`` returning an iterable of tokens. Defaults to
        ``jieba.cut(text, cut_all=False)``.

    Returns
    -------
    tuple(list of str, list of int)
        For each input text: the kept tokens, each followed by one space, and
        the number of tokens that were discarded (stop words and ``'\\r\\n'``).
    """
    if stopwords is None:
        stopwords = stop_words
    if cut is None:
        def cut(text):
            return jieba.cut(text, cut_all=False)

    texts = []
    dropped_counts = []
    for raw in cont:
        # raw != raw is only true for NaN.
        missing = raw is None or (isinstance(raw, float) and raw != raw)
        tokens = [] if missing else list(cut(raw))
        kept = [tok for tok in tokens if tok not in stopwords and tok != '\r\n']
        # Every kept token is followed by a space (trailing space included),
        # exactly as the downstream length feature expects.
        texts.append(''.join(tok + ' ' for tok in kept))
        dropped_counts.append(len(tokens) - len(kept))
    return texts, dropped_counts

# Replace each answer with its space-separated tokens and keep the number of
# discarded tokens per answer as an extra feature.
df['回答内容'], df['去停频数'] = segmentWord(df['回答内容'])
# Character length of the segmented text (separator spaces included).
df['回答长度'] = df['回答内容'].map(len)
con_data = df['回答内容'].values

# Text matrix from a Tokenizer capped at num_words=3300.
token = Tokenizer(num_words=3300)
token.fit_on_texts(con_data)
data_x_mat = token.texts_to_matrix(con_data)

# Append the two numeric columns to the right of the text matrix.
x_data2 = df[['回答长度', '去停频数']].values
x_data = np.concatenate((data_x_mat, x_data2), axis=1)
y_data = df['复核结果'].values

# Hold out 10% of the rows as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=2019
)

# Per-class sample counts of both splits (labels 1..4 after the merge).
for title, targets in (('训练集分布:', y_train), ('测试集分布:', y_test)):
    print(title)
    for label in range(1, 5):
        print('{}:{}'.format(label, np.count_nonzero(targets == label)))

训练集分布:
1:342
2:667
3:448
4:56
测试集分布:
1:30
2:73
3:55
4:11


In [11]:
# Gradient-boosted multi-class classifier for the 4-level score.
# NOTE(review): scale_pos_weight is documented for binary objectives and is
# presumably ignored with multi:softmax -- confirm before relying on it.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    objective='multi:softmax',
    nthread=-1,
    scale_pos_weight=1,
    seed=27
)
# NOTE(review): the last eval_set entry (the test split) drives early stopping,
# so the test metrics printed below are optimistic. A separate validation split
# would give an unbiased estimate.
eval_list = [(x_train, y_train), (x_test, y_test)]
model.fit(
    x_train, y_train,
    verbose=True,
    eval_set=eval_list,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
)
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred))

# One sorted label list shared by every report. Deriving it from set(y_test)
# depends on set iteration order and silently drops classes that happen to be
# absent from the test split, which misaligns the class names.
labels = np.unique(np.concatenate([y_train, y_test])).tolist()
print('标签顺序:')
print(labels)
print('测试集混淆矩阵:')
print(confusion_matrix(y_test, y_pred, labels=labels))

y_pred_train = model.predict(x_train)
print('训练集混淆矩阵:')
print(confusion_matrix(y_train, y_pred_train, labels=labels))

# Class names are generated from the labels so they can never get out of step.
target_names = ['class {}'.format(label) for label in labels]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

print("修正MSE:")
# Despite the printed name this is 1 - (mean absolute error) / 2.
mse = 1 - np.abs(y_test - y_pred).sum() / (2 * len(y_test))
print(mse)

[0]	validation_0-mlogloss:1.29985	validation_1-mlogloss:1.30676
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:1.22911	validation_1-mlogloss:1.23678
[2]	validation_0-mlogloss:1.16478	validation_1-mlogloss:1.17612
[3]	validation_0-mlogloss:1.11229	validation_1-mlogloss:1.12748
[4]	validation_0-mlogloss:1.06177	validation_1-mlogloss:1.07961
[5]	validation_0-mlogloss:1.01696	validation_1-mlogloss:1.03815
[6]	validation_0-mlogloss:0.976704	validation_1-mlogloss:0.998326
[7]	validation_0-mlogloss:0.943761	validation_1-mlogloss:0.968433
[8]	validation_0-mlogloss:0.909007	validation_1-mlogloss:0.934203
[9]	validation_0-mlogloss:0.880412	validation_1-mlogloss:0.907396
[10]	validation_0-mlogloss:0.852196	validation_1-mlogloss:0.883544
[11]	validation_0-mlogloss:0.827666	validation_1-mlogloss:0.860295
[12]	validation_0-mlogloss:0.806408	validation_1-ml

In [12]:
# Write the current `model` to disk. The number and class count in the file
# name are hard-coded text (presumably the accuracy of the saved run), so
# update the name if the model is retrained.
model.save_model('q3_xgb0.7514(4分类).model')

3分类

In [13]:
# Reload the raw data so the label merge below starts from the original scores.
df = pd.read_csv('data/27378.csv', index_col=0)

# Merge classes 0, 1 and 2 into class 1, merge 3 and 4 into class 2, and
# renumber 5 -> 3. A single dict-based replace maps every original value in one
# step, so the outcome does not depend on the order of the rules. The result is
# assigned back because replace(..., inplace=True) on the df[...] selection is
# chained in-place mutation, which no longer updates df under pandas
# Copy-on-Write.
merge_map = {0: 1, 2: 1, 3: 2, 4: 2, 5: 3}
df['复核结果'] = df['复核结果'].replace(merge_map)

# Stop-word list: every line of the file, stripped, collected into a set.
with open('ChineseStopWords/ChineseStopWords.txt', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}
    
def segmentWord(cont, stopwords=None, cut=None):
    """Tokenise every text and drop stop words.

    Parameters
    ----------
    cont : iterable
        Raw texts. ``None`` and float NaN entries (what pandas yields for an
        empty CSV cell) are treated as empty text instead of being handed to
        the tokenizer, which expects text.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    cut : callable, optional
        ``cut(text)`` returning an iterable of tokens. Defaults to
        ``jieba.cut(text, cut_all=False)``.

    Returns
    -------
    tuple(list of str, list of int)
        For each input text: the kept tokens, each followed by one space, and
        the number of tokens that were discarded (stop words and ``'\\r\\n'``).
    """
    if stopwords is None:
        stopwords = stop_words
    if cut is None:
        def cut(text):
            return jieba.cut(text, cut_all=False)

    texts = []
    dropped_counts = []
    for raw in cont:
        # raw != raw is only true for NaN.
        missing = raw is None or (isinstance(raw, float) and raw != raw)
        tokens = [] if missing else list(cut(raw))
        kept = [tok for tok in tokens if tok not in stopwords and tok != '\r\n']
        # Every kept token is followed by a space (trailing space included),
        # exactly as the downstream length feature expects.
        texts.append(''.join(tok + ' ' for tok in kept))
        dropped_counts.append(len(tokens) - len(kept))
    return texts, dropped_counts

# Replace each answer with its space-separated tokens and keep the number of
# discarded tokens per answer as an extra feature.
df['回答内容'], df['去停频数'] = segmentWord(df['回答内容'])
# Character length of the segmented text (separator spaces included).
df['回答长度'] = df['回答内容'].map(len)
con_data = df['回答内容'].values

# Text matrix from a Tokenizer capped at num_words=3300.
token = Tokenizer(num_words=3300)
token.fit_on_texts(con_data)
data_x_mat = token.texts_to_matrix(con_data)

# Append the two numeric columns to the right of the text matrix.
x_data2 = df[['回答长度', '去停频数']].values
x_data = np.concatenate((data_x_mat, x_data2), axis=1)
y_data = df['复核结果'].values

# Hold out 10% of the rows as the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.1, random_state=2019
)

# Per-class sample counts of both splits (labels 1..3 after the merge).
for title, targets in (('训练集分布:', y_train), ('测试集分布:', y_test)):
    print(title)
    for label in range(1, 4):
        print('{}:{}'.format(label, np.count_nonzero(targets == label)))

训练集分布:
1:342
2:1115
3:56
测试集分布:
1:30
2:128
3:11


In [14]:
# Gradient-boosted multi-class classifier for the 3-level score.
# NOTE(review): scale_pos_weight is documented for binary objectives and is
# presumably ignored with multi:softmax -- confirm before relying on it.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1,
    objective='multi:softmax',
    nthread=-1,
    scale_pos_weight=1,
    seed=27
)
# NOTE(review): the last eval_set entry (the test split) drives early stopping,
# so the test metrics printed below are optimistic. A separate validation split
# would give an unbiased estimate.
eval_list = [(x_train, y_train), (x_test, y_test)]
model.fit(
    x_train, y_train,
    verbose=True,
    eval_set=eval_list,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
)
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred))

# One sorted label list shared by every report. Deriving it from set(y_test)
# depends on set iteration order and silently drops classes that happen to be
# absent from the test split, which misaligns the class names.
labels = np.unique(np.concatenate([y_train, y_test])).tolist()
print('标签顺序:')
print(labels)
print('测试集混淆矩阵:')
print(confusion_matrix(y_test, y_pred, labels=labels))

y_pred_train = model.predict(x_train)
print('训练集混淆矩阵:')
print(confusion_matrix(y_train, y_pred_train, labels=labels))

# Class names are generated from the labels so they can never get out of step.
target_names = ['class {}'.format(label) for label in labels]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

print("修正MSE:")
# Despite the printed name this is 1 - (mean absolute error) / 2.
mse = 1 - np.abs(y_test - y_pred).sum() / (2 * len(y_test))
print(mse)

[0]	validation_0-mlogloss:0.992456	validation_1-mlogloss:0.999144
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:0.906113	validation_1-mlogloss:0.917103
[2]	validation_0-mlogloss:0.830966	validation_1-mlogloss:0.846472
[3]	validation_0-mlogloss:0.767385	validation_1-mlogloss:0.785828
[4]	validation_0-mlogloss:0.714253	validation_1-mlogloss:0.737331
[5]	validation_0-mlogloss:0.666988	validation_1-mlogloss:0.692803
[6]	validation_0-mlogloss:0.622233	validation_1-mlogloss:0.650466
[7]	validation_0-mlogloss:0.587456	validation_1-mlogloss:0.615867
[8]	validation_0-mlogloss:0.552183	validation_1-mlogloss:0.58367
[9]	validation_0-mlogloss:0.522899	validation_1-mlogloss:0.559069
[10]	validation_0-mlogloss:0.495158	validation_1-mlogloss:0.532911
[11]	validation_0-mlogloss:0.472083	validation_1-mlogloss:0.51148
[12]	validation_0-mlogloss:0.451159	valid

In [29]:
# Write the current `model` to disk. The number and class count in the file
# name are hard-coded text (presumably the accuracy of the saved run).
# NOTE(review): this cell's execution count (29) is not sequential with the
# cells above, so which fitted model was saved depends on kernel state --
# re-run top to bottom before trusting the file.
model.save_model('q3_xgb0.899(3分类).model')