In [47]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import brier_score_loss as BS
from sklearn.calibration import CalibratedClassifierCV

In [2]:
sample = ["Machine learning is fascinating, it is wonderful"
         ,"Machine learning is a sensational techonology"
         ,"Elsa is a popular character"]

In [3]:
vec = CountVectorizer()
X = vec.fit_transform(sample)

In [4]:
X

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [5]:
vec.get_feature_names()

['character',
 'elsa',
 'fascinating',
 'is',
 'it',
 'learning',
 'machine',
 'popular',
 'sensational',
 'techonology',
 'wonderful']

In [6]:
pd.DataFrame(X.toarray(),columns=vec.get_feature_names())

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0,0,1,2,1,1,1,0,0,0,1
1,0,0,0,1,0,1,1,0,1,1,0
2,1,1,0,1,0,0,0,1,0,0,0


In [9]:
tif = TfidfVectorizer()
X_ = tif.fit_transform(sample)

In [10]:
X_

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [11]:
pd.DataFrame(X_.toarray(),columns=vec.get_feature_names())

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0.0,0.0,0.424396,0.50131,0.424396,0.322764,0.322764,0.0,0.0,0.0,0.424396
1,0.0,0.0,0.0,0.315444,0.0,0.406192,0.406192,0.0,0.534093,0.534093,0.0
2,0.546454,0.546454,0.0,0.322745,0.0,0.0,0.0,0.546454,0.0,0.0,0.0


In [13]:
data = fetch_20newsgroups()

In [15]:
categories = ["sci.space" #科学技术 - 太空
             ,"rec.sport.hockey" #运动 - 曲棍球
             ,"talk.politics.guns" #政治 - 枪支问题
             ,"talk.politics.mideast"] #政治 - 中东问题

In [16]:
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

In [18]:
train.target_names

['rec.sport.hockey',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.mideast']

In [20]:
len(train.data)

2303

In [22]:
np.unique(train.target)

array([0, 1, 2, 3], dtype=int64)

In [23]:
Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target

In [25]:
tfidf = TfidfVectorizer().fit(Xtrain)
Xtrain_ = tfidf.transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)

In [31]:
tfidf.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000021',
 '000062david42',
 '000152',
 '000246',
 '000256',
 '00041032',
 '000413',
 '0004136',
 '0004246',
 '0004422',
 '00044513',
 '0004847546',
 '0005',
 '0005111312',
 '0005111312na3em',
 '00090711',
 '000s',
 '000th',
 '001',
 '0012',
 '001319',
 '001323',
 '001428',
 '001555',
 '001718',
 '001757',
 '0018',
 '002',
 '0020',
 '002118',
 '0022',
 '0028',
 '002811',
 '0029',
 '003',
 '00309',
 '003221',
 '0033',
 '0034',
 '003522',
 '003719',
 '004',
 '004532',
 '005',
 '005512',
 '0059',
 '006',
 '0062',
 '0065',
 '007',
 '0076',
 '0079',
 '008',
 '0086',
 '009',
 '0096b294',
 '0098',
 '00xkv',
 '01',
 '010',
 '01002',
 '010235',
 '010305',
 '010326',
 '010821',
 '010834',
 '0109',
 '010955',
 '011',
 '011112',
 '0112',
 '012',
 '013',
 '013037',
 '013653rap115',
 '013939',
 '014',
 '014237',
 '014305',
 '014506',
 '01463',
 '015',
 '015043',
 '015209',
 '015225',
 '015415',
 '015551',
 '015936',
 '016',
 '01609',
 '01776',
 '018',
 '

In [32]:
pd.DataFrame(Xtrain_.toarray(),columns=tfidf.get_feature_names())

Unnamed: 0,00,000,0000,00000,000000,000021,000062david42,000152,000246,000256,...,zwrm,zx,zx6wre,zxp,zxqi,zy,zyg,zz,zz_g9q3,zzzzzz
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.058046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2298,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2299,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2300,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2301,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
Ytest

array([2, 3, 0, ..., 3, 2, 0], dtype=int64)

In [49]:
name = ["Multinomial","Complement","Bournulli"]
#高斯朴素贝叶斯不接受稀疏矩阵
models = [MultinomialNB(),ComplementNB(),BernoulliNB()]

In [50]:
for name, clf in zip(name, models):
    clf.fit(Xtrain_,Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_,Ytest)
    print(name)
    
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS(Ytest,proba[:,i],pos_label=i)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i],bs))
        
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial


ValueError: Only binary classification is supported. The type of the target is multiclass.

In [52]:
name = ["Multinomial"
       ,"Multinomial + Isotonic"
       ,"Multinomial + Sigmoid"
       ,"Complement"
       ,"Complement + Isotonic"
       ,"Complement + Sigmoid"
       ,"Bernoulli"
       ,"Bernoulli + Isotonic"
       ,"Bernoulli + Sigmoid"]
models = [MultinomialNB()
         ,CalibratedClassifierCV(MultinomialNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(MultinomialNB(), cv=2, method='sigmoid')
         ,ComplementNB()
         ,CalibratedClassifierCV(ComplementNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(ComplementNB(), cv=2, method='sigmoid')
         ,BernoulliNB()
         ,CalibratedClassifierCV(BernoulliNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(BernoulliNB(), cv=2, method='sigmoid')
         ]
for name,clf in zip(name,models):
    clf.fit(Xtrain_,Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_,Ytest)
    print(name)
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS(Ytest,proba[:,i],pos_label=i)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i],bs))
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial


ValueError: Only binary classification is supported. The type of the target is multiclass.