# 更多的分类方案尝试
## 一、数据加载

In [66]:
from sklearn.svm import SVC  # 支持向量机分类器
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier  # 最近邻分类器
import gensim
import sklearn
from sklearn import metrics  # 模型评价工具
from sklearn.externals import joblib  # 模型保存与加载
# sklearn自带的向量化工具
from sklearn.feature_extraction.text import CountVectorizer
# sklearn自带的TF-IDF构造器
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# 朴素贝叶斯分类器
from sklearn.naive_bayes import MultinomialNB

from entity.comm import Comm
from util.path import *
from util.vec import doc_vec
from util.xl import read_xl_by_line
from util import dataset

In [67]:
# Load the raw sheet, the default stop-word list and the segmented records.
sheet_2 = read_xl_by_line(sheet_2_input)
stop = dataset.fetch_default_stop_words()
comm_dict_2 = dataset.fetch_data(
    "full_dataset_sheet_2",
    stop_words=stop,
    cut_all=False,
    remove_duplicates=False,
)
# Only the labels and the label names are kept; the first return value is discarded.
# NOTE(review): assumes `targets` is aligned row-for-row with `sheet_2` -- confirm.
_, targets, target_names = dataset.fetch_issue1_dataset()

# For every sheet row, look up its record by the first column and concatenate
# the segmented topic with the segmented detail
# (presumably both are token lists -- TODO confirm).
seg_sheet_2 = [
    comm.seg_topic + comm.seg_detail
    for comm in (comm_dict_2[row[0]] for row in sheet_2)
]
# Re-join each token sequence into one space-separated string per document.
sents = [" ".join(tokens) for tokens in seg_sheet_2]

exec "read_xl_by_line" in 2.0828731060028076s.
exec "read_xl_by_line" in 1.7205724716186523s.
exec "fetch_data" in 51.96892046928406s.
exec "read_xl_by_line" in 1.3550927639007568s.


## 二、分类器
### 2.1 TF-IDF特征 + kNN分类器

In [68]:
# Hold out 30% of the samples as the test set.
# A fixed seed makes the split -- and therefore every score reported in the
# sections below -- reproducible across kernel restarts.
RANDOM_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    sents, targets, test_size=0.3, random_state=RANDOM_SEED)

In [69]:
# Bag-of-words counts; the vocabulary is learned from the training split only.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character words are dropped --
# confirm this is intended for the segmented text.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)

# Re-weight the raw counts into TF-IDF features; the IDF statistics are
# fitted on the training counts and reused later for the test split.
tf_transformer = TfidfTransformer()
x_train_tf = tf_transformer.fit_transform(x_train_counts)

In [70]:
# Fit a 13-nearest-neighbour classifier on the training TF-IDF matrix.
# NOTE(review): per the scikit-learn docs leaf_size is passed to
# BallTree/KDTree, so it appears to have no effect with algorithm="brute".
clf = KNeighborsClassifier(
    n_neighbors=13,
    algorithm="brute",
    leaf_size=11,
)
clf.fit(x_train_tf, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=11, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                     weights='uniform')

In [71]:
# Project the test sentences with the vocabulary and IDF weights fitted on
# the training split, then predict their labels with the kNN model.
x_test_counts = count_vect.transform(x_test)
test_tf = tf_transformer.transform(x_test_counts)
predicted = clf.predict(test_tf)

In [72]:
# Per-class precision / recall / F1 of the TF-IDF + kNN model on the test set.
tf_knn_report = metrics.classification_report(
    y_test, predicted, target_names=target_names)
print(tf_knn_report)

              precision    recall  f1-score   support

        城乡建设       0.77      0.91      0.83       563
        卫生计生       0.91      0.83      0.87       284
     劳动和社会保障       0.84      0.92      0.88       601
        交通运输       0.87      0.71      0.78       187
        商贸旅游       0.85      0.73      0.78       351
        环境保护       0.91      0.83      0.87       269
        教育文体       0.92      0.88      0.90       508

    accuracy                           0.85      2763
   macro avg       0.87      0.83      0.84      2763
weighted avg       0.86      0.85      0.85      2763



In [73]:
# Overall test-set accuracy of the TF-IDF + kNN model; the bare last
# expression makes the notebook display the value.
acc = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
acc

0.8516105682229461

In [74]:
# Training-set accuracy of the TF-IDF + kNN model, for comparison with the
# test-set accuracy.
tf_predicted_on_trainSet = clf.predict(x_train_tf)
tf_trainSet_acc = metrics.accuracy_score(
    y_true=y_train,
    y_pred=tf_predicted_on_trainSet,
)
print(f"训练集上精度：{tf_trainSet_acc}")

训练集上精度：0.8794788273615635


### 2.2 Word2Vec特征 + kNN分类方法

In [75]:
# Load the word-vector model from the project-configured path.
# binary=False: the file is read in the textual word2vec format.
load_w2v = gensim.models.KeyedVectors.load_word2vec_format
wv_model = load_w2v(word2vec_model_path, binary=False)

In [76]:
# Embed every training sentence: split it back into tokens on single spaces
# and pass the token list to doc_vec together with the word2vec model
# (presumably yielding one fixed-length vector per document -- TODO confirm).
train_tokens = (sent.split(" ") for sent in x_train)
x_train_vec = [doc_vec(tokens, wv_model) for tokens in train_tokens]

In [77]:
# Fit a 13-nearest-neighbour classifier on the Word2Vec document vectors.
knn = KNeighborsClassifier(n_neighbors=13, leaf_size=11)
knn.fit(x_train_vec, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=11, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                     weights='uniform')

In [78]:
# Embed the test sentences exactly as the training sentences were embedded
# (split on single spaces, then doc_vec with the same word2vec model).
test_tokens = (sent.split(" ") for sent in x_test)
x_test_vec = [doc_vec(tokens, wv_model) for tokens in test_tokens]
# Predict the test labels with the Word2Vec-feature kNN model.
wv_knn_predicted = knn.predict(x_test_vec)

In [79]:
# Per-class precision / recall / F1 of the Word2Vec + kNN model on the test set.
wv_knn_report = metrics.classification_report(
    y_test, wv_knn_predicted, target_names=target_names)
print(wv_knn_report)

              precision    recall  f1-score   support

        城乡建设       0.78      0.77      0.77       563
        卫生计生       0.83      0.63      0.72       284
     劳动和社会保障       0.76      0.91      0.83       601
        交通运输       0.65      0.52      0.58       187
        商贸旅游       0.72      0.73      0.72       351
        环境保护       0.80      0.88      0.84       269
        教育文体       0.90      0.83      0.86       508

    accuracy                           0.79      2763
   macro avg       0.78      0.75      0.76      2763
weighted avg       0.79      0.79      0.78      2763



In [80]:
# Overall test-set accuracy of the Word2Vec + kNN model; the bare last
# expression makes the notebook display the value.
wv_knn_acc = metrics.accuracy_score(y_true=y_test, y_pred=wv_knn_predicted)
wv_knn_acc

0.7861020629750272

In [81]:
# Training-set accuracy of the Word2Vec + kNN model, for comparison with the
# test-set accuracy.
predicted_on_trainSet = knn.predict(x_train_vec)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first,
# as every other evaluation cell in this notebook does. The value itself is
# unchanged because accuracy is symmetric in its two arguments.
wv_knn_acc_on_trainSet = metrics.accuracy_score(y_train, predicted_on_trainSet)
print(f"训练集上预测精度：{wv_knn_acc_on_trainSet}")

训练集上预测精度：0.8264308980921359


### 2.3 TF-IDF特征 + 支持向量机

In [96]:
# Train a linear-kernel support vector machine on the TF-IDF features.
# NOTE: the name `svm` is rebound to a Word2Vec-feature model in section 2.4,
# so the TF-IDF evaluation cells must run before that section.
svm = SVC(kernel="linear").fit(x_train_tf, y_train)
svm  # display the fitted estimator, as the original fit() call did

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [97]:
# Predict the test labels with the TF-IDF SVM and report its overall accuracy;
# the bare last expression makes the notebook display the value.
tf_svm_predicted = svm.predict(test_tf)
tf_svm_acc = metrics.accuracy_score(y_true=y_test, y_pred=tf_svm_predicted)
tf_svm_acc

0.9037278320665942

In [98]:
# Per-class precision / recall / F1 of the TF-IDF + SVM model on the test set.
tf_svm_report = metrics.classification_report(
    y_test, tf_svm_predicted, target_names=target_names)
print(tf_svm_report)

              precision    recall  f1-score   support

        城乡建设       0.81      0.94      0.87       563
        卫生计生       0.94      0.85      0.89       284
     劳动和社会保障       0.93      0.95      0.94       601
        交通运输       0.91      0.80      0.85       187
        商贸旅游       0.88      0.83      0.86       351
        环境保护       0.96      0.91      0.94       269
        教育文体       0.96      0.93      0.94       508

    accuracy                           0.90      2763
   macro avg       0.91      0.89      0.90      2763
weighted avg       0.91      0.90      0.90      2763



In [99]:
# Training-set accuracy of the TF-IDF + SVM model. What is printed is the
# accuracy, i.e. one minus the empirical 0-1 risk; compare it with the test
# accuracy to gauge overfitting.
# NOTE: relies on `svm` still being the TF-IDF model; section 2.4 rebinds the
# same name to a model trained on Word2Vec vectors.
tf_svm_predicted_on_trainSet = svm.predict(x_train_tf)
print(f"训练集上精度：{metrics.accuracy_score(y_train, tf_svm_predicted_on_trainSet)}")

训练集上精度：0.9899177912207228


### 2.4 Word2Vec特征 + 支持向量机

In [86]:
# Train a linear-kernel SVM on the Word2Vec document vectors.
# NOTE: this rebinds `svm`, replacing the TF-IDF model from section 2.3.
svm = SVC(kernel="linear").fit(x_train_vec, y_train)

# Predict the test labels and report the overall accuracy; the bare last
# expression makes the notebook display the value.
wv_svm_predicted = svm.predict(x_test_vec)
wv_svm_acc = metrics.accuracy_score(y_true=y_test, y_pred=wv_svm_predicted)
wv_svm_acc

0.8172276511038726

In [87]:
# Per-class precision / recall / F1 of the Word2Vec + SVM model on the test set.
wv_svm_report = metrics.classification_report(
    y_test, wv_svm_predicted, target_names=target_names)
print(wv_svm_report)

              precision    recall  f1-score   support

        城乡建设       0.77      0.82      0.79       563
        卫生计生       0.80      0.71      0.75       284
     劳动和社会保障       0.83      0.90      0.86       601
        交通运输       0.72      0.63      0.67       187
        商贸旅游       0.74      0.72      0.73       351
        环境保护       0.87      0.90      0.89       269
        教育文体       0.92      0.87      0.90       508

    accuracy                           0.82      2763
   macro avg       0.81      0.79      0.80      2763
weighted avg       0.82      0.82      0.82      2763



In [88]:
# Training-set accuracy of the Word2Vec + SVM model. What is printed is the
# accuracy, i.e. one minus the empirical 0-1 risk; compare it with the test
# accuracy to gauge overfitting.
wv_svm_predicted_on_trainSet = svm.predict(x_train_vec)
print(f"训练集上精度：{metrics.accuracy_score(y_train, wv_svm_predicted_on_trainSet)}")

训练集上精度：0.8455095393206142


### 2.5 TF-IDF特征 + 朴素贝叶斯

In [89]:
# Fit a multinomial naive Bayes model on the training TF-IDF features.
nb = MultinomialNB().fit(x_train_tf, y_train)

# Predict the test labels and report the overall accuracy; the bare last
# expression makes the notebook display the value.
tf_nb_predicted = nb.predict(test_tf)
tf_nb_acc = metrics.accuracy_score(y_true=y_test, y_pred=tf_nb_predicted)
tf_nb_acc

0.7423090843286283

In [90]:
# Per-class precision / recall / F1 of the TF-IDF + naive Bayes model on the
# test set.
tf_nb_report = metrics.classification_report(
    y_test, tf_nb_predicted, target_names=target_names)
print(tf_nb_report)

              precision    recall  f1-score   support

        城乡建设       0.59      0.97      0.73       563
        卫生计生       0.98      0.37      0.54       284
     劳动和社会保障       0.68      0.97      0.80       601
        交通运输       0.97      0.18      0.31       187
        商贸旅游       0.91      0.48      0.63       351
        环境保护       0.97      0.67      0.79       269
        教育文体       0.94      0.85      0.89       508

    accuracy                           0.74      2763
   macro avg       0.86      0.64      0.67      2763
weighted avg       0.82      0.74      0.72      2763



In [91]:
# Training-set accuracy of the TF-IDF + naive Bayes model. What is printed is
# the accuracy, i.e. one minus the empirical 0-1 risk; compare it with the
# test accuracy to gauge overfitting.
tf_nb_predicted_on_trainSet = nb.predict(x_train_tf)
print(f"训练集上精度：{metrics.accuracy_score(y_train, tf_nb_predicted_on_trainSet)}")

训练集上精度：0.8531099736311463
