In [28]:
from sklearn.datasets import fetch_20newsgroups  # 从sklearn.datasets里导入新闻数据抓取器 fetch_20newsgroups
from sklearn.model_selection import  train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # 从sklearn.feature_extraction.text里导入文本特征向量化模块
from sklearn.naive_bayes import MultinomialNB     # 从sklean.naive_bayes里导入朴素贝叶斯模型
from sklearn.metrics import classification_report

In [29]:
# Step 1: data acquisition.
# Request the 'all' subset of the 20 Newsgroups corpus; the returned object
# exposes the raw posts as `news.data` and their integer labels as `news.target`.
news = fetch_20newsgroups(
    subset="all",
)
# Print the number of documents loaded (the recorded run printed 18846).
print(len(news.data))

18846


In [30]:
# Step 2: preprocessing -- split into training and test sets
# (the text is turned into feature vectors in the following cell).
# 25% of the samples are randomly held out for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    random_state=33,
    test_size=0.25,
)
print(X_train[0])     # inspect one raw training document
print(y_train[:100])  # inspect the first 100 training labels

From: scotts@math.orst.edu (Scott Settlemier)
Subject: FORSALE: MAG Innovision MX15F 1280x1024
Article-I.D.: gaia.1r7hir$9sk
Distribution: world
Organization: Oregon State University Math Department
Lines: 7
NNTP-Posting-Host: math.orst.edu

MAG Innovision MX15F
Fantastic 15" multiscan monitor that can display up to
1280x1024 noninterlaced (!) with .26 mm dot pitch.
If you are looking for a large crystal clear super vga
monitor then this is for you.
$430   call Scott at (503) 757-3483 or
email scotts@math.orst.edu

[ 6  1  5  6  5  3 19  8 19 19  3 16  3  1 13 14  4  8  0 13 13 11 19 16
 10 18  9 19  5  3 15  2  0  8 19 12  9  3 19  0  3  3  8 16 16  6 13  8
  8 11 11 11  6  0 13  3 11  9 15  8  8 14 18  2  3  5 14  8 12 15  0 18
  1  0 13 16 14  7 18  5 15  3  5 17 15  2  8  9 14 10  3 14 11 11  3 15
  6 10 14 19]


In [31]:
# Text feature vectorization.
vec = CountVectorizer()
# Fit the vectorizer on the training documents and encode them, then encode the
# test documents with that same fitted vectorizer (evaluated left to right).
# NOTE(review): X_train / X_test are rebound from raw text to the vectorizer's
# output here, so this cell presumably cannot be re-run on its own without
# re-running the split cell first -- confirm before relying on re-execution.
X_train, X_test = vec.fit_transform(X_train), vec.transform(X_test)
print(X_train)

  (0, 66520)	1
  (0, 122236)	2
  (0, 93010)	4
  (0, 105230)	3
  (0, 59188)	3
  (0, 122222)	2
  (0, 123300)	1
  (0, 128977)	1
  (0, 65921)	1
  (0, 91899)	2
  (0, 78632)	2
  (0, 99226)	2
  (0, 4447)	2
  (0, 35136)	1
  (0, 67665)	1
  (0, 9352)	1
  (0, 27541)	1
  (0, 56181)	1
  (0, 144786)	1
  (0, 105079)	1
  (0, 105052)	1
  (0, 127872)	1
  (0, 137926)	1
  (0, 54291)	1
  (0, 88624)	1
  :	:
  (14133, 124348)	1
  (14133, 122909)	1
  (14133, 75247)	1
  (14133, 119224)	1
  (14133, 55318)	1
  (14133, 116482)	1
  (14133, 123307)	1
  (14133, 94031)	1
  (14133, 48860)	1
  (14133, 138478)	1
  (14133, 48313)	3
  (14133, 40841)	1
  (14133, 37995)	1
  (14133, 96952)	3
  (14133, 138826)	1
  (14133, 40819)	1
  (14133, 17353)	1
  (14133, 114260)	1
  (14133, 103953)	1
  (14133, 32494)	1
  (14133, 118476)	2
  (14133, 93184)	1
  (14133, 133083)	1
  (14133, 139386)	1
  (14133, 128817)	2


In [32]:
# Step 3: train a Naive Bayes classifier.
# Build the model with its default configuration and estimate its parameters
# from the training data; fit() returns the estimator, so the calls are chained.
mnb = MultinomialNB().fit(X_train, y_train)
# Predict a class label for every test document.
y_predict = mnb.predict(X_test)

In [33]:
# Step 4: report the results.
# First the score of the fitted model on the held-out test set ...
print(
    'The Accuracy of Naive Bayes Classifier is:',
    mnb.score(X_test, y_test),
)
# ... then the per-class report, labelled with the newsgroup names.
print(
    classification_report(
        y_test,
        y_predict,
        target_names=news.target_names,
    )
)

The Accuracy of Naive Bayes Classifier is: 0.8397707979626485
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
             