## 朴素贝叶斯分类器就是一个对所有可能性求概率的模型，最后输出结果中哪种可能性高就输出哪种。

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

In [14]:
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data,columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    #data = np.array(df.iloc[:100,:])
    data = np.array(df.iloc[:,:])
    return data[:,:-1],data[:,-1]

In [15]:
X,y = create_data()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

In [16]:
X_test[0],y_test[0]

(array([5.5, 2.4, 3.8, 1.1]), 1.0)

##  GaussianNB 高斯朴素贝叶斯¶
特征的可能性被假设为高斯

概率密度函数：$$P(x_i | y_k)=\frac{1}{\sqrt{2\pi\sigma^2_{yk}}}exp(-\frac{(x_i-\mu_{yk})^2}{2\sigma^2_{yk}})$$

数学期望(mean)：$\mu$

方差：$\sigma^2=\frac{\sum(X-\mu)^2}{N}$

In [17]:
class NaiveBayes():
    def __init__(self):
        self.model = None
        
    # 数学期望
    @staticmethod
    #返回函数的静态方法
    #该方法不强制要求传递参数 """
    def mean(X):
        return sum(X) / float(len(X))
    
    # 标准差（方差）
    def stdev(self,X):
        avg = self.mean(X)
        return math.sqrt(sum([pow(x - avg, 2) for x in X])/ float(len(X)))
    
    # 概率密度函数
    #将 math.pow 换成了 np.power.将 math.exp 换成了 np.exp这个很关键，让下面的测试通过了
    def gaussian_probablity(self,x,mean,stdev):
        #exponent = np.exp(-(np.power(x - mean,2) /  (2 * np.power(stdev,2)))) 
        #return (1 / (np.sqrt(2 * math.pi) * stdev)) * exponent
        
        exponent = np.exp(-(np.power(x - mean,2) /  (2 * np.power(stdev,2)+1))) 
        return (1 / ((np.sqrt(2 * math.pi) * stdev)+1)) * exponent
    

    # 处理X_train
    def summarize(self,train_data):
        # zip函数将数据实例中每个属性的值分组到它们自己的列表中，以便我们可以计算属性的均值和标准差值。
        summaries = [(self.mean(i),self.stdev(i)) for i in zip(*train_data)]        
        return summaries
    
    # 分类别求出数学期望和标准差
    def fit(self,X,y):
        #set() 函数创建一个无序不重复元素集
        labels = list(set(y))
        data = {label: [] for label in labels}
        for f,label in zip(X, y):
            data[label].append(f)
        self.model = {
                label:self.summarize(value)
                for label, value in data.items()
            }
        return 'gussianNB train Done!'
    
    # 计算概率（计算属于某个类的属性的概率）
    def calculate_probabilities(self,input_data):
        probabilities = {}
        for label,value in self.model.items():
            probabilities[label] = 0
            #probabilities[label] = 1
            for i in range(len(value)):
                mean,stdev = value[i]
                #probabilities[label] *= self.gaussian_probablity(input_data[i], mean, stdev)
                probabilities[label] += np.log(self.gaussian_probablity(input_data[i], mean, stdev))
                
        return probabilities
    
    # 类别 ,注意下面的实例model测试的时候，X_test传入的是 一维的
    def predict(self, X_test):
        label = sorted(
            self.calculate_probabilities(X_test).items(),
            key=lambda x: x[-1])[-1][0]
        return label

    def score(self, X_test, y_test):
        right = 0
        for X, y in zip(X_test, y_test):
            label = self.predict(X)
            if label == y:
                right += 1

        return right / float(len(X_test))
    

## 测试

In [18]:
model = NaiveBayes()

In [19]:
model.fit(X_train, y_train)

'gussianNB train Done!'

In [20]:
model.calculate_probabilities(X_test)

{0.0: array([-71.8447576 , -14.49388257, -51.81357076, -24.22395186]),
 1.0: array([-38.11556974, -13.27685767, -27.02941183, -25.73505569]),
 2.0: array([-26.31210272, -18.36530046, -22.06040833, -35.21480036])}

In [21]:
print(model.predict([4.4,  3.2,  1.3,  0.2]))

0.0


In [22]:
model.score(X_test, y_test)

0.9111111111111111

In [12]:
liancheng = 0.8666666666666667

In [None]:
log_sum = 0.9111111111111111

# 自己测试

In [11]:
X_train.ndim

2

In [12]:
y_train.ndim

1

In [13]:
a = NaiveBayes()
a.fit(X_train, y_train)

'gussianNB train Done!'

In [14]:
a1 = a.summarize(X_train)
#四个特征的均值和方差
a1  

[(5.477142857142858, 0.654472836623149),
 (3.104285714285716, 0.4682599138713195),
 (2.8614285714285717, 1.4568354017373073),
 (0.7928571428571429, 0.5720443111137032)]

In [17]:
labels = list(set(y_train))
data1 = {label: [] for label in labels}
for f,label in zip(X_train,y_train):
    data1[label].append(f)

In [18]:
#data1

In [19]:
labels = list(set(y_train))
data = {label: [] for label in labels}
#创建只有键 而 值为空的字典
data       

{0.0: [], 1.0: []}

In [20]:
for f,label in zip(X_train, y_train):
    #注意这个label就是字典的 键，f就是字典的值
    data[label].append(f)

In [21]:
#字典，只有两个键 0.0 和 1.0
data

{0.0: [array([4.9, 3. , 1.4, 0.2]),
  array([5.4, 3.9, 1.7, 0.4]),
  array([4.5, 2.3, 1.3, 0.3]),
  array([5.4, 3.7, 1.5, 0.2]),
  array([4.8, 3.4, 1.9, 0.2]),
  array([5.2, 4.1, 1.5, 0.1]),
  array([5.7, 3.8, 1.7, 0.3]),
  array([4.8, 3. , 1.4, 0.3]),
  array([4.6, 3.2, 1.4, 0.2]),
  array([5. , 3.4, 1.5, 0.2]),
  array([5. , 3.2, 1.2, 0.2]),
  array([4.4, 2.9, 1.4, 0.2]),
  array([5.8, 4. , 1.2, 0.2]),
  array([5.4, 3.4, 1.5, 0.4]),
  array([4.3, 3. , 1.1, 0.1]),
  array([5. , 3.4, 1.6, 0.4]),
  array([5.1, 3.3, 1.7, 0.5]),
  array([5.2, 3.4, 1.4, 0.2]),
  array([5.4, 3.9, 1.3, 0.4]),
  array([5.5, 4.2, 1.4, 0.2]),
  array([5. , 3.5, 1.3, 0.3]),
  array([5.4, 3.4, 1.7, 0.2]),
  array([5.1, 3.8, 1.6, 0.2]),
  array([5.3, 3.7, 1.5, 0.2]),
  array([5.1, 3.5, 1.4, 0.2]),
  array([4.9, 3.1, 1.5, 0.1]),
  array([4.6, 3.4, 1.4, 0.3]),
  array([4.7, 3.2, 1.3, 0.2]),
  array([5.2, 3.5, 1.5, 0.2]),
  array([4.9, 3.6, 1.4, 0.1]),
  array([5. , 3.3, 1.4, 0.2]),
  array([4.6, 3.6, 1. , 0.2]),
  a

In [22]:
selfmodel = {
                label:a.summarize(value) for label, value in data.items()
            }

In [23]:
#计算每个target的特征的期望和标准差
selfmodel

{0.0: [(5.017142857142856, 0.3541474384536268),
  (3.4199999999999995, 0.3785687331440431),
  (1.4457142857142855, 0.17783275793823355),
  (0.24285714285714285, 0.11284810090360858)],
 1.0: [(5.937142857142856, 0.555021602815455),
  (2.7885714285714283, 0.3096278213518644),
  (4.277142857142857, 0.4523498736191548),
  (1.3428571428571425, 0.191662969499982)]}

In [24]:
len(data[0])

35

In [25]:
len(data[1.0])

35

In [26]:
data[1.0][0]

array([6.3, 3.3, 4.7, 1.6])

In [27]:
selfmodel

{0.0: [(5.017142857142856, 0.3541474384536268),
  (3.4199999999999995, 0.3785687331440431),
  (1.4457142857142855, 0.17783275793823355),
  (0.24285714285714285, 0.11284810090360858)],
 1.0: [(5.937142857142856, 0.555021602815455),
  (2.7885714285714283, 0.3096278213518644),
  (4.277142857142857, 0.4523498736191548),
  (1.3428571428571425, 0.191662969499982)]}

In [29]:
xtest = X_test[2]

In [30]:
probabilities = {}
for label,value in selfmodel.items():  
    #将字典的值初始化为1
    probabilities[label] = 1
    for i in range(len(value)):
        mean,stdev = value[i]
        probabilities[label] *= a.gaussian_probablity(xtest[i], mean, stdev)
                
probabilities

{0.0: 1.1586327296759225, 1.0: 1.134258892760089e-19}

In [31]:
#key=lambda 变量：变量[维数] 
l1 = sorted(probabilities.items(),key=lambda x: x[-1])[-1][0]

In [32]:
l1

0.0

## scikit-learn实例

In [46]:
from sklearn.naive_bayes import GaussianNB

In [47]:
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB()

In [48]:
clf.score(X_test, y_test)

1.0

In [49]:
clf.predict([[4.4,  3.2,  1.3,  0.2]])

array([0.])

In [50]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型

习题4.1
  用极大似然估计法推出朴素贝叶斯法中的概率估计公式(4.8)及公式 (4.9)。

### 解答：    

第1步：证明公式(4.8)：$\displaystyle P(Y=c_k) = \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k)}{N}$
由于朴素贝叶斯法假设$Y$是定义在输出空间$\mathcal{Y}$上的随机变量，因此可以定义$P(Y=c_k)$概率为$p$。
令$\displaystyle m=\sum_{i=1}^NI(y_i=c_k)$，得出似然函数：$$L(p)=f_D(y_1,y_2,\cdots,y_n|\theta)=\binom{N}{m}p^m(1-p)^{(N-m)}$$使用微分求极值，两边同时对$p$求微分：$$\begin{aligned}
0 &amp;= \binom{N}{m}\left[mp^{(m-1)}(1-p)^{(N-m)}-(N-m)p^m(1-p)^{(N-m-1)}\right] \\
&amp; = \binom{N}{m}\left[p^{(m-1)}(1-p)^{(N-m-1)}(m-Np)\right]
\end{aligned}$$可求解得到$\displaystyle p=0,p=1,p=\frac{m}{N}$
显然$\displaystyle P(Y=c_k)=p=\frac{m}{N}=\frac{\displaystyle \sum_{i=1}^N I(y_i=c_k)}{N}$，公式(4.8)得证。

第2步：证明公式(4.9)：$\displaystyle P(X^{(j)}=a_{jl}|Y=c_k) = \frac{\displaystyle \sum_{i=1}^N I(x_i^{(j)}=a_{jl},y_i=c_k)}{\displaystyle \sum_{i=1}^N I(y_i=c_k)}$
令$P(X^{(j)}=a_{jl}|Y=c_k)=p$，令$\displaystyle m=\sum_{i=1}^N I(y_i=c_k), q=\sum_{i=1}^N I(x_i^{(j)}=a_{jl},y_i=c_k)$，得出似然函数：$$L(p)=\binom{m}{q}p^q(i-p)^{m-q}$$使用微分求极值，两边同时对$p$求微分：$$\begin{aligned}
0 &amp;= \binom{m}{q}\left[qp^{(q-1)}(1-p)^{(m-q)}-(m-q)p^q(1-p)^{(m-q-1)}\right] \\
&amp; = \binom{m}{q}\left[p^{(q-1)}(1-p)^{(m-q-1)}(q-mp)\right]
\end{aligned}$$可求解得到$\displaystyle p=0,p=1,p=\frac{q}{m}$
显然$\displaystyle P(X^{(j)}=a_{jl}|Y=c_k)=p=\frac{q}{m}=\frac{\displaystyle \sum_{i=1}^N I(x_i^{(j)}=a_{jl},y_i=c_k)}{\displaystyle \sum_{i=1}^N I(y_i=c_k)}$，公式(4.9)得证

习题4.2
    用贝叶斯估计法推出朴素贝叶斯法中的慨率估计公式(4.10)及公式(4.11)

第1步：证明公式(4.11)：$\displaystyle P(Y=c_k) = \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k) + \lambda}{N+K \lambda}$
加入先验概率，在没有任何信息的情况下，可以假设先验概率为均匀概率（即每个事件的概率是相同的）。
可得$\displaystyle p=\frac{1}{K} \Leftrightarrow pK-1=0\quad(1)$
根据习题4.1得出先验概率的极大似然估计是$\displaystyle pN - \sum_{i=1}^N I(y_i=c_k) = 0\quad(2)$
存在参数$\lambda$使得$(1) \cdot \lambda + (2) = 0$
所以有$$\lambda(pK-1) + pN - \sum_{i=1}^N I(y_i=c_k) = 0$$可得$\displaystyle P(Y=c_k) = \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k) + \lambda}{N+K \lambda}$，公式(4.11)得证。



第2步：证明公式(4.10)：$\displaystyle P_{\lambda}(X^{(j)}=a_{jl} | Y = c_k) = \frac{\displaystyle \sum_{i=1}^N I(x_i^{(j)}=a_{jl},y_i=c_k) + \lambda}{\displaystyle \sum_{i=1}^N I(y_i=c_k) + S_j \lambda}$
根据第1步，可同理得到$$
P(Y=c_k, x^{(j)}=a_{j l})=\frac{\displaystyle \sum_{i=1}^N I(y_i=c_k, x_i^{(j)}=a_{jl})+\lambda}{N+K S_j \lambda}$$

$$\begin{aligned} 
P(x^{(j)}=a_{jl} | Y=c_k)
&amp;= \frac{P(Y=c_k, x^{(j)}=a_{j l})}{P(y_i=c_k)} \\
&amp;= \frac{\displaystyle \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k, x_i^{(j)}=a_{jl})+\lambda}{N+K S_j \lambda}}{\displaystyle \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k) + \lambda}{N+K \lambda}} \\
&amp;= (\lambda可以任意取值，于是取\lambda = S_j \lambda) \\
&amp;= \frac{\displaystyle \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k, x_i^{(j)}=a_{jl})+\lambda}{N+K S_j \lambda}}{\displaystyle \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k) + \lambda}{N+K S_j \lambda}} \\ 
&amp;= \frac{\displaystyle \sum_{i=1}^N I(y_i=c_k, x_i^{(j)}=a_{jl})+\lambda}{\displaystyle \sum_{i=1}^N I(y_i=c_k) + \lambda} (其中\lambda = S_j \lambda)\\
&amp;= \frac{\displaystyle \sum_{i=1}^N I(x_i^{(j)}=a_{jl},y_i=c_k) + \lambda}{\displaystyle \sum_{i=1}^N I(y_i=c_k) + S_j \lambda}
\end{aligned} $$