In [48]:
# Load the pickled training and test datasets.
import pickle
from sklearn.metrics import classification_report


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files produced by a trusted
    pipeline.
    """
    with open(path, 'rb') as file_obj:
        return pickle.load(file_obj)


train_bunch = _load_pickle('../preprocess/train/train_bag.pickle')
test_bunch = _load_pickle('../preprocess/test/test_bag.pickle')

In [49]:
# Training inputs and labels, read from the attributes of the loaded training bunch.
X_train, y_train = train_bunch.tfidf_weight_matrices, train_bunch.label

In [50]:
# Baseline: a single linear SVM (no boosting).
from sklearn.svm import LinearSVC

# fit() returns the fitted estimator, so construction and training can be chained.
SVM = LinearSVC(C=1, tol=1e-5).fit(X_train, y_train)
svm_predicted = SVM.predict(test_bunch.tfidf_weight_matrices)

print("----------------Single SVM----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, svm_predicted))

----------------Single SVM----------------
              precision    recall  f1-score   support

          体育       1.00      1.00      1.00      1000
          娱乐       0.97      0.98      0.98      1000
          教育       0.92      0.87      0.90      1000
          时尚       0.94      0.96      0.95      1000
          时政       0.96      0.93      0.95      1000
          游戏       0.97      0.97      0.97      1000
          社会       0.93      0.90      0.91      1000
          科技       0.91      0.92      0.91      1000
          股票       0.95      0.97      0.96      1000
          财经       0.94      0.98      0.96      1000

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



In [51]:
# Baseline: a single multinomial Naive Bayes classifier (no boosting).
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training can be chained.
Bayes = MultinomialNB(alpha=0.001).fit(X_train, y_train)
bayes_predicted = Bayes.predict(test_bunch.tfidf_weight_matrices)

print("----------------Single Bayes----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, bayes_predicted))

----------------Single Bayes----------------
              precision    recall  f1-score   support

          体育       0.98      0.98      0.98      1000
          娱乐       0.91      0.98      0.94      1000
          教育       0.90      0.86      0.88      1000
          时尚       0.97      0.94      0.95      1000
          时政       0.97      0.90      0.93      1000
          游戏       0.84      0.96      0.89      1000
          社会       0.91      0.88      0.89      1000
          科技       0.90      0.82      0.86      1000
          股票       0.98      0.99      0.98      1000
          财经       0.95      0.98      0.97      1000

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



In [52]:
# Baseline: a single decision tree (no boosting).
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): no random_state is passed, so the fitted tree (and the scores
# below) may differ between runs — confirm whether that is intended.
Tree = DecisionTreeClassifier().fit(X_train, y_train)
tree_predicted = Tree.predict(test_bunch.tfidf_weight_matrices)

print("----------------Single Tree----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, tree_predicted))

----------------Single Tree----------------
              precision    recall  f1-score   support

          体育       0.98      0.96      0.97      1000
          娱乐       0.94      0.91      0.93      1000
          教育       0.78      0.72      0.75      1000
          时尚       0.78      0.88      0.83      1000
          时政       0.80      0.78      0.79      1000
          游戏       0.83      0.88      0.85      1000
          社会       0.78      0.76      0.77      1000
          科技       0.79      0.61      0.69      1000
          股票       0.83      0.83      0.83      1000
          财经       0.78      0.95      0.86      1000

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



In [53]:
# AdaBoost ensembles, one per base classifier (linear SVM, Naive Bayes, decision tree).
from sklearn.ensemble import AdaBoostClassifier
import time

# Number of weak learners in each AdaBoost model. The original comment said 50,
# but the code has always used 20; the value is kept and named once here.
ADABOOST_N_ESTIMATORS = 20

# The SVM ensemble uses the discrete 'SAMME' variant (LinearSVC exposes no
# predict_proba, which 'SAMME.R' relies on); the other two use 'SAMME.R'.
# NOTE(review): recent scikit-learn releases deprecate 'SAMME.R' — confirm
# against the installed version before upgrading.
SVC_adaboost = AdaBoostClassifier(
    LinearSVC(C=1, tol=1e-5),
    n_estimators=ADABOOST_N_ESTIMATORS,
    algorithm='SAMME',
)
Bayes_adaboost = AdaBoostClassifier(
    MultinomialNB(alpha=0.001),
    n_estimators=ADABOOST_N_ESTIMATORS,
    algorithm='SAMME.R',
)
Tree_adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=ADABOOST_N_ESTIMATORS,
    algorithm='SAMME.R',
)

In [54]:
# Fit the boosted SVM and record how long training takes.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start = time.perf_counter()
print("Start SVM Training")
SVC_adaboost.fit(X_train, y_train)
SVC_train_time = time.perf_counter() - start  # elapsed seconds
print(f"SVM Training Time: {SVC_train_time}")

SVC_adaboost_predicted = SVC_adaboost.predict(test_bunch.tfidf_weight_matrices)

print("----------------SVM----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, SVC_adaboost_predicted))

Start SVM Training
SVM Training Time: 32.09978199005127
----------------SVM----------------
              precision    recall  f1-score   support

          体育       0.99      0.99      0.99      1000
          娱乐       0.93      0.97      0.95      1000
          教育       0.84      0.79      0.82      1000
          时尚       0.78      0.96      0.86      1000
          时政       0.96      0.85      0.90      1000
          游戏       0.81      0.96      0.88      1000
          社会       0.88      0.84      0.86      1000
          科技       0.86      0.54      0.66      1000
          股票       0.94      0.98      0.96      1000
          财经       0.91      0.98      0.94      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.88     10000
weighted avg       0.89      0.89      0.88     10000



In [55]:
# Fit the boosted Naive Bayes model and record how long training takes.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start = time.perf_counter()
print("Start Bayes Training")
Bayes_adaboost.fit(X_train, y_train)
Bayes_train_time = time.perf_counter() - start  # elapsed seconds
print(f"Bayes Training Time: {Bayes_train_time}")

Bayes_adaboost_predicted = Bayes_adaboost.predict(test_bunch.tfidf_weight_matrices)

print("----------------Bayes----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, Bayes_adaboost_predicted))

Start Bayes Training
Bayes Training Time: 2.1556735038757324
----------------Bayes----------------
              precision    recall  f1-score   support

          体育       1.00      0.97      0.99      1000
          娱乐       0.98      0.96      0.97      1000
          教育       0.90      0.88      0.89      1000
          时尚       0.92      0.97      0.94      1000
          时政       0.96      0.90      0.93      1000
          游戏       0.95      0.97      0.96      1000
          社会       0.91      0.90      0.91      1000
          科技       0.91      0.88      0.90      1000
          股票       0.92      0.98      0.95      1000
          财经       0.93      0.97      0.95      1000

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



In [56]:
# Fit the boosted decision tree model and record how long training takes.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() differences can.
start = time.perf_counter()
print("Start Tree Training")
Tree_adaboost.fit(X_train, y_train)
Tree_train_time = time.perf_counter() - start  # elapsed seconds
print(f"Tree Training Time: {Tree_train_time}")

Tree_adaboost_predicted = Tree_adaboost.predict(test_bunch.tfidf_weight_matrices)

print("----------------Tree----------------")
# Print the per-class precision / recall / F1 report for the test bunch.
print(classification_report(test_bunch.label, Tree_adaboost_predicted))

Start Tree Training
Tree Training Time: 220.5449857711792
----------------Tree----------------
              precision    recall  f1-score   support

          体育       0.99      0.99      0.99      1000
          娱乐       0.94      0.93      0.94      1000
          教育       0.82      0.69      0.75      1000
          时尚       0.84      0.90      0.87      1000
          时政       0.72      0.92      0.80      1000
          游戏       0.86      0.91      0.88      1000
          社会       0.86      0.81      0.83      1000
          科技       0.86      0.68      0.76      1000
          股票       0.91      0.84      0.87      1000
          财经       0.86      0.96      0.91      1000

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000



In [57]:
# 混合型Adaboost
import matplotlib
import matplotlib.pyplot as plt
from numpy import *