# 算法4.1 朴素贝叶斯算法
- 例4.1，朴素贝叶斯算法
    - 测试数据：表4.1 训练数据
    - 结果与书中一致
- 例4.2，贝叶斯估计
    - 测试数据：表4.1 训练数据
    - 结果与书中一致
- 可以完善的问题：没有处理字典key不存在的情况

In [1]:
# 表4.1 训练数据
import numpy as np
import pandas as pd
from IPython.display import display

# Table 4.1 training data: each row is [X1, X2, Y].
data = [
    [1, 'S', -1],
    [1, 'M', -1],
    [1, 'M', 1],
    [1, 'S', 1],
    [1, 'S', -1],
    [2, 'S', -1],
    [2, 'M', -1],
    [2, 'M', 1],
    [2, 'L', 1],
    [2, 'L', 1],
    [3, 'L', 1],
    [3, 'M', 1],
    [3, 'M', 1],
    [3, 'L', 1],
    [3, 'L', -1],
]

data_pd = pd.DataFrame(data, columns=['X1', 'X2', 'Y'])
# Optional ordinal encoding of X2; kept disabled because the examples below
# query the model with the raw category labels (see x_test).
# data_pd['X2'] = data_pd['X2'].map({'S': 0, 'M': 1, 'L': 2})
display(data_pd)

# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; both return the underlying NumPy array.
x_data = data_pd[['X1', 'X2']].values
y_data = data_pd['Y'].values

print(x_data.shape)
print(y_data.shape)

# Sample to classify in Examples 4.1 and 4.2.
x_test = [2, 'S']
print('x_test:', x_test)

Unnamed: 0,X1,X2,Y
0,1,S,-1
1,1,M,-1
2,1,M,1
3,1,S,1
4,1,S,-1
5,2,S,-1
6,2,M,-1
7,2,M,1
8,2,L,1
9,2,L,1
10,3,L,1
11,3,M,1
12,3,M,1
13,3,L,1
14,3,L,-1


(15, 2)
(15,)
x_test: [2, 'S']


## 例4.1，朴素贝叶斯算法

In [2]:
class NativeBayes(object):
    """Naive Bayes classifier using maximum-likelihood estimates (Algorithm 4.1).

    Every probability is stored as a ``(numerator, denominator)`` pair of
    counts rather than as a float, so the learned tables can be compared
    directly with the fractions in the textbook.
    """

    def __init__(self):
        # label -> (number of samples with that label, total number of samples)
        self._py = {}
        # (feature value, label) -> (count of value within label, label size)
        self._pxy = {}
        # Same estimates as ``_pxy`` but keyed by (feature index, value, label).
        # ``predict`` reads this table so that two features sharing a value
        # cannot overwrite each other's estimate.
        self._pxy_by_feature = {}
        self._label_set = set()

    @property
    def py(self):
        """Prior estimates: ``{label: (count, n_samples)}``."""
        return self._py

    @property
    def pxy(self):
        """Conditional estimates: ``{(value, label): (count, label_size)}``.

        The key carries no feature index, so when two features share a value
        the entry of the later feature wins. ``predict`` does not rely on
        this table.
        """
        return self._pxy

    def train(self, datas, labels):
        """Estimate the prior and conditional probabilities from training data.

        Args:
            datas: 2-D array-like of shape (n_samples, n_features). Anything
                that is not already a NumPy array is converted to an object
                array so mixed int/str rows keep their original types.
            labels: sequence of n_samples class labels.

        Raises:
            ValueError: if ``datas`` is not 2-D or its row count differs from
                the number of labels.
        """
        if not isinstance(datas, np.ndarray):
            datas = np.array(datas, dtype=object)
        if datas.ndim != 2:
            raise ValueError('datas must be 2-D (n_samples, n_features)')
        labels = list(labels)
        if len(labels) != datas.shape[0]:
            raise ValueError('datas and labels must have the same length')

        label_set = set(labels)
        n_samples = len(labels)

        # [p. 50, Algorithm 4.1 step (1)] prior probability P(Y=Ck)
        py = {}
        for c in label_set:
            py[c] = (labels.count(c), n_samples)

        # [p. 50, Algorithm 4.1 step (1)] conditional probability P(X=Ajl | Y=Ck)
        pxy = {}
        pxy_by_feature = {}
        for c in label_set:
            # Split the samples by label, then count values inside that split.
            c_indexes = [i for i, label in enumerate(labels) if label == c]
            for j in range(datas.shape[1]):
                column = list(datas[c_indexes, j])
                for value in set(column):
                    estimate = (column.count(value), len(column))
                    pxy[(value, c)] = estimate
                    pxy_by_feature[(j, value, c)] = estimate

        self._py = py
        self._pxy = pxy
        self._pxy_by_feature = pxy_by_feature
        self._label_set = label_set

    def predict(self, data):
        """Return the label with the largest unnormalised posterior.

        Args:
            data: one sample, a sequence of feature values in the same column
                order as the training data.

        Returns:
            The label maximising P(Y=Ck) * prod_j P(X_j=x_j | Y=Ck). The list
            of per-label scores is also printed.

        Raises:
            ValueError: if the model has not been trained.
        """
        if not self._label_set:
            raise ValueError('train() must be called before predict()')

        labels = list(self._label_set)
        # [p. 50, Algorithm 4.1 step (2)] score every label
        py = []
        for c in labels:
            # A value never observed together with label c has count 0, so
            # its maximum-likelihood probability is 0 (instead of a KeyError).
            unseen = (0, self._py[c][0])
            pxi = 1
            for j, xi in enumerate(data):
                count, size = self._pxy_by_feature.get((j, xi, c), unseen)
                pxi = pxi * count / size
            py.append(self._py[c][0] / self._py[c][1] * pxi)
        print(py)
        # [p. 50, Algorithm 4.1 step (3)] return the label with the highest score
        return labels[np.argsort(py)[-1]]

# Example 4.1: fit the maximum-likelihood model on the Table 4.1 data,
# dump the learned count tables, then classify x_test.
nb = NativeBayes()
print('先验概率P(Y=Ck)：')
nb.train(x_data, y_data)
for key, value in nb.py.items():
    print(key, value)

print('条件概率P(X=Ajl | Y=Ck)：')
for key, value in nb.pxy.items():
    print(key, value)

predicted = nb.predict(x_test)
print('predict结果：%s' % predicted)

先验概率P(Y=Ck)：
1 (9, 15)
-1 (6, 15)
条件概率P(X=Ajl | Y=Ck)：
(1, 1) (2, 9)
(2, 1) (3, 9)
(3, 1) (4, 9)
('L', 1) (4, 9)
('S', 1) (1, 9)
('M', 1) (4, 9)
(1, -1) (3, 6)
(2, -1) (2, 6)
(3, -1) (1, 6)
('L', -1) (1, 6)
('S', -1) (3, 6)
('M', -1) (2, 6)
[0.02222222222222222, 0.06666666666666667]
predict结果：-1


## 例4.2，贝叶斯估计

In [3]:
class BayesianEstimation(object):
    """Naive Bayes classifier with Bayesian (additive) smoothing, formulas 4.10/4.11.

    Every probability is stored as a ``(numerator, denominator)`` pair so the
    learned tables can be compared directly with the fractions in the textbook.
    """

    def __init__(self):
        # label -> (count + lambda, n_samples + K * lambda)
        self._py = {}
        # (feature value, label) -> (count + lambda, label size + S_j * lambda)
        self._pxy = {}
        # Same estimates as ``_pxy`` but keyed by (feature index, value, label).
        # ``predict`` reads this table so that two features sharing a value
        # cannot overwrite each other's estimate.
        self._pxy_by_feature = {}
        # (feature index, label) -> smoothed estimate for a zero-count value.
        self._unseen = {}
        self._label_set = set()

    @property
    def py(self):
        """Smoothed prior estimates: ``{label: (numerator, denominator)}``."""
        return self._py

    @property
    def pxy(self):
        """Smoothed conditional estimates keyed by ``(value, label)``.

        The key carries no feature index, so when two features share a value
        the entry of the later feature wins. ``predict`` does not rely on
        this table.
        """
        return self._pxy

    def train(self, datas, labels, lambdaa = 1):
        """Estimate the smoothed prior and conditional probabilities.

        Args:
            datas: 2-D array-like of shape (n_samples, n_features). Anything
                that is not already a NumPy array is converted to an object
                array so mixed int/str rows keep their original types.
            labels: sequence of n_samples class labels.
            lambdaa: smoothing constant (1 gives Laplace smoothing, 0 gives
                the maximum-likelihood estimate).

        Raises:
            ValueError: if ``datas`` is not 2-D or its row count differs from
                the number of labels.
        """
        if not isinstance(datas, np.ndarray):
            datas = np.array(datas, dtype=object)
        if datas.ndim != 2:
            raise ValueError('datas must be 2-D (n_samples, n_features)')
        labels = list(labels)
        if len(labels) != datas.shape[0]:
            raise ValueError('datas and labels must have the same length')

        label_set = set(labels)
        n_samples = len(labels)
        n_features = datas.shape[1]

        # [p. 51, formula 4.11] Bayesian estimate of the prior P_lambda(Y=Ck)
        py = {}
        for c in label_set:
            py[c] = (labels.count(c) + lambdaa, n_samples + len(label_set) * lambdaa)

        # S_j in formula 4.10 is the number of values feature j can take. It is
        # taken over the whole training set, not over one class, so a value
        # missing from a class still contributes to the denominator and
        # still gets a (smoothed) entry for that class.
        feature_values = [set(datas[:, j]) for j in range(n_features)]

        # [p. 51, formula 4.10] Bayesian estimate of P_lambda(X=Ajl | Y=Ck)
        pxy = {}
        pxy_by_feature = {}
        unseen = {}
        for c in label_set:
            # Split the samples by label, then count values inside that split.
            c_indexes = [i for i, label in enumerate(labels) if label == c]
            for j in range(n_features):
                column = list(datas[c_indexes, j])
                denominator = len(column) + len(feature_values[j]) * lambdaa
                unseen[(j, c)] = (lambdaa, denominator)
                for value in feature_values[j]:
                    estimate = (column.count(value) + lambdaa, denominator)
                    pxy[(value, c)] = estimate
                    pxy_by_feature[(j, value, c)] = estimate

        self._py = py
        self._pxy = pxy
        self._pxy_by_feature = pxy_by_feature
        self._unseen = unseen
        self._label_set = label_set

    def predict(self, data):
        """Return the label with the largest unnormalised posterior.

        Args:
            data: one sample, a sequence of feature values in the same column
                order as the training data.

        Returns:
            The label maximising P(Y=Ck) * prod_j P(X_j=x_j | Y=Ck). The list
            of per-label scores is also printed.

        Raises:
            ValueError: if the model has not been trained.
        """
        if not self._label_set:
            raise ValueError('train() must be called before predict()')

        labels = list(self._label_set)
        # [p. 51, Algorithm 4.1 step (2)] score every label
        py = []
        for c in labels:
            pxi = 1
            for j, xi in enumerate(data):
                # A value absent from the whole training set is scored as a
                # zero-count value with the S_j learned during training.
                numerator, denominator = self._pxy_by_feature.get(
                    (j, xi, c), self._unseen[(j, c)])
                pxi = pxi * numerator / denominator
            py.append(self._py[c][0] / self._py[c][1] * pxi)
        print(py)
        # [p. 51, Algorithm 4.1 step (3)] return the label with the highest score
        return labels[np.argsort(py)[-1]]

# Example 4.2: fit the smoothed model (default lambda) on the Table 4.1 data,
# dump the learned count tables, then classify x_test.
be = BayesianEstimation()
print('先验概率P(Y=Ck)：')
be.train(x_data, y_data)
for key, value in be.py.items():
    print(key, value)

print('条件概率P(X=Ajl | Y=Ck)：')
for key, value in be.pxy.items():
    print(key, value)

predicted = be.predict(x_test)
print('predict结果：%s' % predicted)

先验概率P(Y=Ck)：
1 (10, 17)
-1 (7, 17)
条件概率P(X=Ajl | Y=Ck)：
(1, 1) (3, 12)
(2, 1) (4, 12)
(3, 1) (5, 12)
('L', 1) (5, 12)
('S', 1) (2, 12)
('M', 1) (5, 12)
(1, -1) (4, 9)
(2, -1) (3, 9)
(3, -1) (2, 9)
('L', -1) (2, 9)
('S', -1) (4, 9)
('M', -1) (3, 9)
[0.032679738562091505, 0.061002178649237467]
predict结果：-1
