In [1]:
import time

from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
"""
朴素贝叶斯进行文本分类
:return: None
"""
news = fetch_20newsgroups(subset='all', data_home='data')

print(len(news.data))  #样本数，包含的特征
print('-'*50)
print(news.data[0]) #第一个样本 特征
print('-'*50)
print(news.target[0:5]) #标签
print(np.unique(news.target)) #标签的类别
print(news.target_names) #标签的名字

18846
--------------------------------------------------
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


----------------------------------------

In [3]:
print('-'*50)
# 进行数据分割
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=1)

# 对数据集进行特征抽取
tf = TfidfVectorizer()

# 以训练集当中的词的列表进行每篇文章重要性统计['a','b','c','d']
x_train = tf.fit_transform(x_train)
#针对特征内容，可以自行打印，下面的打印可以得到特征数目，总计有15万特征
print(len(tf.get_feature_names_out()))

--------------------------------------------------
153196


In [4]:
print(tf.get_feature_names_out()[100000])

murky


In [5]:
print(tf.get_feature_names_out()[0:10])

['00' '000' '0000' '00000' '0000000004' '0000000005' '0000000667'
 '0000001200' '000003' '000005102000']


In [6]:
print(tf.get_feature_names_out()[100000:100000+10])

['murky' 'murmurs' 'murnane' 'murph' 'murphey' 'murphy' 'murr11' 'murray'
 'murray_craven' 'murrayfield']


In [7]:
import time
# 进行朴素贝叶斯算法的预测,alpha是拉普拉斯平滑系数，分子和分母加上一个系数，分母加alpha*特征词数目
mlt = MultinomialNB(alpha=1.0)

# print(x_train.toarray())
# 训练
start=time.time()
mlt.fit(x_train, y_train) #训练模型
end=time.time()
end-start #统计训练时间

0.08022046089172363

In [8]:
x_transform_test = tf.transform(x_test)  #特征数目不发生改变
print(len(tf.get_feature_names_out())) #查看特征数目

153196


In [9]:
start=time.time()
y_predict = mlt.predict(x_transform_test)

print("预测的前面10篇文章类别为：", y_predict[0:10])

# 得出准确率,这个是很难提高准确率，为什么呢？
print("准确率为：", mlt.score(x_transform_test, y_test))
end=time.time()
end-start #预测时间

预测的前面10篇文章类别为： [16 19 18  1  9 15  1  2 16 13]
准确率为： 0.8518675721561969


0.028542518615722656

In [10]:
len(y_predict)

4712

In [11]:
print(classification_report(y_test, y_predict,
      target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.91      0.77      0.83       199
           comp.graphics       0.83      0.79      0.81       242
 comp.os.ms-windows.misc       0.89      0.83      0.86       263
comp.sys.ibm.pc.hardware       0.80      0.83      0.81       262
   comp.sys.mac.hardware       0.90      0.88      0.89       234
          comp.windows.x       0.92      0.85      0.88       230
            misc.forsale       0.96      0.67      0.79       257
               rec.autos       0.90      0.87      0.88       265
         rec.motorcycles       0.90      0.95      0.92       251
      rec.sport.baseball       0.89      0.96      0.93       226
        rec.sport.hockey       0.95      0.98      0.96       262
               sci.crypt       0.76      0.97      0.85       257
         sci.electronics       0.84      0.80      0.82       229
                 sci.med       0.97      0.86      0.91       249
         

In [12]:
y_test1 = np.where(y_test == 0, 1, 0)
print(y_test1.sum()) #label为0的样本数

199


In [13]:
y_predict1 = np.where(y_predict == 0, 1, 0)
print(y_predict1.sum())

168


In [14]:
print(y_test1[0:20])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


In [15]:
y_predict1[0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [16]:
(y_test1*y_predict1).sum()

np.int64(153)

In [17]:
max(y_test),min(y_test)

(np.int64(19), np.int64(0))

In [18]:
# 把0-19总计20个分类，变为0和1
# 5是可以改为0到19的
y_test1 = np.where(y_test == 5, 1, 0)
print(y_test1.sum()) #label为5的样本数
y_predict1 = np.where(y_predict == 5, 1, 0)
print(y_predict1.sum())
# roc_auc_score的y_test只能是二分类,针对多分类如何计算AUC
print("AUC指标：", roc_auc_score(y_test1, y_predict1))

230
214
AUC指标： 0.924078924393225


In [19]:
y_test1,y_predict1

(array([0, 0, 0, ..., 0, 0, 0], shape=(4712,)),
 array([0, 0, 0, ..., 0, 0, 0], shape=(4712,)))

In [20]:
#算多分类的精确率，召回率，F1-score
FP=np.where((np.array(y_test1)-np.array(y_predict1))==-1,1,0).sum()   #FP是18
TP=y_predict1.sum()-FP #TP是196
print(TP)
FN=np.where((np.array(y_test1)-np.array(y_predict1))==1,1,0).sum() #FN是34
print(FN)#FN是1
TN=np.where(y_test1==0,1,0).sum()-FP  #4464
print(TN)

196
34
4464


In [21]:
TP/(TP+FP) #精确率

np.float64(0.9158878504672897)

In [22]:
TP/(TP+FN)  #召回率

np.float64(0.8521739130434782)

In [23]:
#F1-score
2*TP/(2*TP+FP+FN)

np.float64(0.8828828828828829)

In [24]:
del news
del x_train
del x_test
del y_test
del y_predict
del tf