In [73]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [74]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [75]:
# Unpack the feature matrix and the class labels from the loaded dataset.
X, y = iris.data, iris.target

In [76]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=33,
)

# Gaussian Naive Bayes 高斯貝氏分類器

主要用於連續變數，比方說特徵長度為幾公分、重量為幾公斤等等。

$$
P(X_j | Y_i) = \frac{1}{\sqrt{2\pi\sigma_{Y_i}^2}} \exp\left(-\frac{(X_j - \mu_{Y_i})^2}{2\sigma_{Y_i}^2}\right)
$$

In [77]:
# Fit a Gaussian Naive Bayes model on the training split (fit() returns the
# estimator itself), then display the predicted class for every test sample.
modelgnb = GaussianNB().fit(X_train, y_train)
modelgnb.predict(X_test)

array([1, 1, 0, 1, 1, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 0, 1, 2, 0, 0,
       2, 0, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 0, 1, 1, 1, 0,
       0])

In [78]:
# Probability of each test sample being assigned to each of the three iris classes.
modelgnb.predict_proba(X_test)

array([[1.29356119e-076, 9.99865779e-001, 1.34220523e-004],
       [3.69939572e-092, 9.86294621e-001, 1.37053793e-002],
       [1.00000000e+000, 1.71472943e-014, 2.64067764e-025],
       [4.46736151e-105, 9.67919741e-001, 3.20802590e-002],
       [9.17464910e-153, 7.34091430e-001, 2.65908570e-001],
       [8.06342637e-245, 4.06094860e-009, 9.99999996e-001],
       [1.00000000e+000, 2.29090110e-012, 7.90288739e-023],
       [1.00000000e+000, 4.85372771e-015, 2.88378833e-025],
       [3.29767065e-167, 4.86162719e-003, 9.95138373e-001],
       [3.76452169e-214, 1.53430994e-007, 9.99999847e-001],
       [1.99849076e-226, 6.72331516e-008, 9.99999933e-001],
       [1.00000000e+000, 1.64356578e-015, 1.46571856e-026],
       [2.54279672e-211, 1.71881917e-007, 9.99999828e-001],
       [7.36311948e-104, 9.92943678e-001, 7.05632232e-003],
       [7.68921862e-179, 4.62539733e-006, 9.99995375e-001],
       [3.82621763e-083, 9.99199778e-001, 8.00221655e-004],
       [1.10024561e-126, 3.29061821e-001

In [79]:
# Score the Gaussian model on both splits first, then report the two values.
gnb_train_score = modelgnb.score(X_train, y_train)
gnb_test_score = modelgnb.score(X_test, y_test)

print('Training Set Score : ', gnb_train_score)
print('Test Set Score : ', gnb_test_score)

Training Set Score :  0.9619047619047619
Test Set Score :  0.9555555555555556


# Multinomial Naive Bayes 多項式貝氏分類器

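主要用在離散變數，比方說次數、類別等等。

下列公式中，$\alpha$ 為平滑參數（即 `alpha`），$K$ 為類別數，$n$ 為特徵數。第一式的 $N_{Y_i}$ 為類別 $Y_i$ 的樣本數、$N$ 為總樣本數；第二式的 $N_{Y_i, X_j}$ 為特徵 $X_j$ 在類別 $Y_i$ 中的累計次數，$N_{Y_i}$ 則為類別 $Y_i$ 中所有特徵的累計次數。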

$$
P(Y_i) = \frac{N_{Y_i} + \alpha}{N + K\alpha}
$$

$$
P(X_j | Y_i) = \frac{N_{Y_i, X_j} + \alpha}{N_{Y_i} + n\alpha}
$$

In [80]:
# The features in this example are continuous, so Multinomial Naive Bayes is
# not a suitable model for them.
modelmnb = MultinomialNB(alpha=1).fit(X_train, y_train)  # alpha defaults to 1
modelmnb.predict(X_test)

array([1, 1, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 0, 1, 1, 1, 0,
       0])

In [81]:
# Per-class probabilities from the multinomial model for every test sample.
modelmnb.predict_proba(X_test)

array([[0.0706437 , 0.51647472, 0.41288159],
       [0.07309297, 0.52733404, 0.39957299],
       [0.70327368, 0.19514817, 0.10157814],
       [0.04510087, 0.52226344, 0.4326357 ],
       [0.01985886, 0.50770063, 0.47244051],
       [0.00538611, 0.47283062, 0.52178327],
       [0.68421034, 0.2080835 , 0.10770616],
       [0.75814979, 0.16228167, 0.07956855],
       [0.01818873, 0.50972911, 0.47208217],
       [0.0094497 , 0.4948794 , 0.4956709 ],
       [0.00793364, 0.48969285, 0.50237351],
       [0.7137075 , 0.18705037, 0.09924213],
       [0.00893051, 0.48802363, 0.50304585],
       [0.04669603, 0.52095252, 0.43235145],
       [0.01409898, 0.50224234, 0.48365868],
       [0.07499116, 0.52287496, 0.40213389],
       [0.02688754, 0.51028285, 0.46282961],
       [0.77149902, 0.15430953, 0.07419145],
       [0.04753511, 0.52519762, 0.42726727],
       [0.0155038 , 0.51686067, 0.46763553],
       [0.74074724, 0.17231689, 0.08693587],
       [0.77026886, 0.15674899, 0.07298215],
       [0.

In [82]:
# Score the multinomial model on both splits, then report the values together
# with the (Chinese) takeaway: continuous features do not suit MultinomialNB,
# so it scores lower than GaussianNB.
mnb_train_score = modelmnb.score(X_train, y_train)
mnb_test_score = modelmnb.score(X_test, y_test)

print('Training Set Score : ', mnb_train_score)
print('Test Set Score : ', mnb_test_score)
print('此案例中特徵為連續型，不適合使用多項式貝式分類器，\n因此多項式貝式分類效果比高斯貝式分類器差')

Training Set Score :  0.819047619047619
Test Set Score :  0.7777777777777778
此案例中特徵為連續型，不適合使用多項式貝式分類器，
因此多項式貝式分類效果比高斯貝式分類器差


# Bernoulli Naive Bayes 伯努利貝氏分類器

用於二元（0/1）特徵，比方說特徵是否出現，或是將特徵依大小、長短切成兩類的情況。

$$
P(X_j | Y_i) = P_{Y_i}^{X_j}(1 - P_{Y_i})^{1 - X_j}
$$

In [83]:
# Without an explicit `binarize` argument, every test sample ends up in the
# same class.
# NOTE(review): per the scikit-learn docs the default is binarize=0.0, not
# "no binarization" -- presumably every (positive) iris measurement is mapped
# to 1, leaving the model nothing to separate the classes with; confirm.
modelbnb = BernoulliNB().fit(X_train, y_train)
modelbnb.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

### 加入binarize
`binarize` 表示切分的基準點。例如，若設定 `binarize=1`，則特徵值大於 1 的資料會被分為一類，特徵值小於或等於 1 的資料則被分為另一類。

In [84]:
# Refit with a threshold of 1: feature values above 1 form one group and
# values less than or equal to 1 form the other.
modelbnb = BernoulliNB(binarize=1).fit(X_train, y_train)
modelbnb.predict(X_test)

array([2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0,
       0])

In [86]:
# Score the Bernoulli model on both splits, then report the values.
# Splitting the features at binarize=1 does not work very well either.
bnb_train_score = modelbnb.score(X_train, y_train)
bnb_test_score = modelbnb.score(X_test, y_test)

print('Training Set Score : ', bnb_train_score)
print('Test Set Score : ', bnb_test_score)

Training Set Score :  0.6666666666666666
Test Set Score :  0.6666666666666666
