In [50]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV
from pprint import pprint
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import numpy as np

In [51]:
### Load the dataset
# Fetch the complete 20 Newsgroups corpus (subset='all'), shuffled; the
# train/test split is made separately further down.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
)
# Print the list of newsgroup category names.
pprint(dataset.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [52]:
### Split into training and test sets
# Hold out 30% of the documents for testing. The fixed random_state makes the
# split -- and therefore every metric computed downstream -- reproducible
# across kernel restarts; without it each run evaluates on a different random
# test set.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.3,
    random_state=42,
)

# Sizes and label bookkeeping reused by later cells.
train_size = len(X_train)
test_size = len(X_test)
class_nums = len(dataset.target_names)
class_labels = list(range(class_nums))

In [53]:
### Term counts with English stop words removed
# max_df=0.9 additionally drops terms that occur in more than 90% of the
# training documents. The vocabulary is learned from the training split only
# and then applied unchanged to the test split.
# NOTE(review): X_train_counts / X_test_counts are assigned again in a later
# cell by a CountVectorizer created without max_df -- confirm which variant
# is the intended one.
count_vect = CountVectorizer(
    stop_words='english',
    max_df=0.9,
)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [54]:
### Turn the term counts into TF-IDF vectors
# The IDF weights are fitted on the training counts only and then reused for
# both splits, so no test-set statistics enter the weighting.
# NOTE(review): X_train_tfidf / X_test_tfidf are assigned again in a later
# cell by a TfidfVectorizer -- confirm which variant is the intended one.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [55]:
### Vectorize the raw text directly, with English stop words filtered out
# NOTE(review): this cell rebinds X_train_counts / X_test_counts and
# X_train_tfidf / X_test_tfidf, replacing the values produced by the two
# preceding cells. The TF-IDF matrices built here are the ones the model
# is trained and evaluated on.

# Raw term counts: vocabulary fitted on the training text, reused for the test text.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
X_train_counts = count_filter_vec.fit_transform(X_train)
X_test_counts = count_filter_vec.transform(X_test)

# TF-IDF features produced in a single step by TfidfVectorizer.
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')
X_train_tfidf = tfidf_filter_vec.fit_transform(X_train)
X_test_tfidf = tfidf_filter_vec.transform(X_test)

In [56]:
# Show the shapes of the training and test TF-IDF matrices side by side.
print(*(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf)))

(13192, 149054) (5654, 149054)


In [18]:
# Disabled experiment kept for reference: the snippet lives inside a string
# literal, so nothing in it runs.
'''
### LDA dimensionality reduction -- the results were not very good
from sklearn.decomposition import LatentDirichletAllocation
#n_topic = class_nums
lda = LatentDirichletAllocation(n_components = 20,
                                max_iter= 100,
                                learning_method='batch')
lda.fit(X_train_counts)
X_train_lda = lda.transform(X_train_counts)
X_test_lda = lda.transform(X_test_counts)
'''



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=30, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [57]:
### Model training

# Booster parameters handed to xgb.train.
params = {
    # Multi-class objective that yields a probability for every class.
    'objective': 'multi:softprob',
    # Learning rate (shrinkage applied at each boosting step).
    'eta': 0.1,
    # Maximum depth of each tree.
    'max_depth': 6,
    # 0 prints run-time messages; 1 runs silently without printing them.
    'silent': 1,
    # Number of threads used for training.
    'nthread': 8,
    # Number of target classes (one per newsgroup).
    'num_class': class_nums,
}

# Number of boosting iterations.
num_rounds = 100

# Wrap the training TF-IDF matrix and its labels in xgboost's DMatrix container.
xgb_train = xgb.DMatrix(X_train_tfidf, label=y_train)

clf = xgb.train(params, xgb_train, num_boost_round=num_rounds)

In [None]:
# Disabled hyper-parameter search kept for reference: the snippet lives inside
# a string literal, so nothing in it runs.
'''
### Parameter tuning

model = xgb.XGBClassifier(booster='gbtree',
                    objective= 'multi:softprob',
                    gamma = 0.1,
                    min_child_weight= 1.1,
                    max_depth= 5,
                    subsample= 0.8,
                    colsample_bytree= 0.8,
                    tree_method= 'exact',
                    learning_rate=0.1,
                    n_estimators=100,
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27)
param_test1 = {
    'max_depth':[3,5,7,9],
    'min_child_weight':[1,3,5]
}
# scoring='f1' is the binary-only F1 and is rejected for a multi-class target;
# 'f1_macro' averages the per-class F1 scores instead.
gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier(booster='gbtree',
                    objective= 'multi:softmax',
                    gamma = 0.1,
                    min_child_weight= 1.1,
                    max_depth= 5,
                    subsample= 0.8,
                    colsample_bytree= 0.8,
                    tree_method= 'exact',
                    learning_rate=0.1,
                    n_estimators=100,
                    scale_pos_weight=1,
                    seed=27),
                       param_grid = param_test1, scoring='f1_macro',iid=False, cv=5)
# Fit against the y_train labels from the train/test split (no `train` object
# is defined in the visible cells).
gsearch1.fit(X_train_tfidf,y_train)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
'''

In [58]:
### Predict on the test set
xgb_test = xgb.DMatrix(X_test_tfidf, label=y_test)
# Arrange the predicted probabilities as one row per test document and one
# column per class.
test_prob = clf.predict(xgb_test).reshape((test_size, class_nums))
# The predicted label is the index of the most probable class in each row.
test_label = test_prob.argmax(axis=1)

In [59]:
### Print precision, recall and F1-score for every class
# Generic row labels that could be used instead of the newsgroup names:
#names = ['class ' + str(i) for i in class_labels]
print(
    classification_report(
        y_test,
        test_label,
        target_names=dataset.target_names,
        digits=4,
    )
)

                          precision    recall  f1-score   support

             alt.atheism     0.8667    0.7992    0.8316       244
           comp.graphics     0.7735    0.7603    0.7668       292
 comp.os.ms-windows.misc     0.7710    0.8445    0.8061       283
comp.sys.ibm.pc.hardware     0.7185    0.7233    0.7209       300
   comp.sys.mac.hardware     0.8664    0.8421    0.8541       285
          comp.windows.x     0.8537    0.8333    0.8434       294
            misc.forsale     0.8378    0.8407    0.8393       295
               rec.autos     0.8462    0.8490    0.8476       298
         rec.motorcycles     0.9072    0.8919    0.8995       296
      rec.sport.baseball     0.9076    0.9016    0.9046       305
        rec.sport.hockey     0.9357    0.9193    0.9274       285
               sci.crypt     0.9603    0.9091    0.9340       319
         sci.electronics     0.5974    0.7801    0.6766       291
                 sci.med     0.8694    0.9101    0.8893       278
         