## 朴素贝叶斯

朴素贝叶斯算法如下：

$$\hat{B} = \arg\max_{B}\; P(B) \prod_{A_i \in A} P(A_i \mid B)$$

其中B表示标签， A表示特征

例如，若 B 的取值为 ['是', '否']，
且 A 的取值为 [1,1,2,2,3,3]，
那么算法即计算下面两个值，并取较大者所对应的标签作为预测结果：

$$\max\Big(P(B=\text{否}) \prod_{A_i \in A} P(A_i \mid B=\text{否}),\ P(B=\text{是}) \prod_{A_i \in A} P(A_i \mid B=\text{是})\Big)$$

In [10]:
import numpy as np
import pandas as pd

In [11]:
# Build the toy training set: two categorical features (x1, x2) and a
# binary label y, fifteen rows in total.
x1 = [1] * 5 + [2] * 5 + [3] * 5
x2 = [
    'S', 'M', 'M', 'S', 'S',
    'S', 'M', 'M', 'L', 'L',
    'L', 'M', 'M', 'L', 'L',
]
y = [
    -1, -1, 1, 1, -1,
    -1, -1, 1, 1, 1,
    1, 1, 1, 1, -1,
]

# Column order follows the keyword order: x1, x2, y.
df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))
df.head()

Unnamed: 0,x1,x2,y
0,1,S,-1
1,1,M,-1
2,1,M,1
3,1,S,1
4,1,S,-1


In [12]:
# Split the frame: every column except the last is a feature, and the
# label is kept as a one-column DataFrame (not a Series).
n_feature_cols = df.shape[1] - 1
x = df.iloc[:, 0:n_feature_cols]
y = df.loc[:, ['y']]

In [14]:
# Number of training rows per label, taken from the first (only) column of y.
class_count = y.iloc[:, 0].value_counts()
class_count

 1    9
-1    6
Name: y, dtype: int64

In [15]:
# Display the column labels of the feature frame x.
x.columns

Index(['x1', 'x2'], dtype='object')

In [16]:
def nb_fit(x, y):
    """Estimate naive Bayes parameters by frequency counting.

    Args:
        x: DataFrame of categorical features, one column per feature.
        y: DataFrame whose first column holds the label of each row of ``x``.

    Returns:
        A tuple ``(classes, class_prior, prior)``:

        * ``classes`` - array of the distinct labels found in ``y``.
        * ``class_prior`` - Series mapping each label to its relative
          frequency, i.e. P(label).
        * ``prior`` - dict mapping ``(feature, value, label)`` to
          P(value | label). Only combinations that occur in the training
          data get an entry.

    The per-label value counts of every feature are printed as a side effect.
    """
    labels = y[y.columns[0]]
    classes = labels.unique()
    class_count = labels.value_counts()  # Series: label -> number of rows
    class_prior = class_count / len(y)   # Series: label -> P(label)

    prior = {}
    for col in x.columns:
        for label in classes:
            # Rows carrying the current label, restricted to the current
            # feature column; value_counts gives Series(value -> count).
            rows_with_label = x[(y == label).values]
            value_counts = rows_with_label[col].value_counts()
            print('=============\np_x_y:\n  {}\n================='.format(value_counts))
            rows_in_class = class_count[label]
            for value, count in value_counts.items():
                # P(value | label) = count(value, label) / count(label)
                prior[(col, value, label)] = count / rows_in_class
    return classes, class_prior, prior

In [17]:
# Fit on the toy data; the returned (classes, class_prior, prior) tuple is
# not assigned here, so the notebook just displays it.
nb_fit(x, y)

p_x_y:
  1    3
2    2
3    1
Name: x1, dtype: int64
p_x_y:
  3    4
2    3
1    2
Name: x1, dtype: int64
p_x_y:
  S    3
M    2
L    1
Name: x2, dtype: int64
p_x_y:
  M    4
L    4
S    1
Name: x2, dtype: int64


(array([-1,  1], dtype=int64),
  1    0.6
 -1    0.4
 Name: y, dtype: float64,
 {('x1', 1, -1): 0.5,
  ('x1', 2, -1): 0.3333333333333333,
  ('x1', 3, -1): 0.16666666666666666,
  ('x1', 3, 1): 0.4444444444444444,
  ('x1', 2, 1): 0.3333333333333333,
  ('x1', 1, 1): 0.2222222222222222,
  ('x2', 'S', -1): 0.5,
  ('x2', 'M', -1): 0.3333333333333333,
  ('x2', 'L', -1): 0.16666666666666666,
  ('x2', 'M', 1): 0.4444444444444444,
  ('x2', 'L', 1): 0.4444444444444444,
  ('x2', 'S', 1): 0.1111111111111111})

In [18]:
def predict(x_test):
    """Predict the label of one instance with the fitted naive Bayes model.

    Reads the module-level ``classes``, ``class_prior`` and ``prior`` produced
    by ``nb_fit``.

    Args:
        x_test: dict mapping feature name to the observed value,
            e.g. ``{'x1': 2, 'x2': 'S'}``.

    Returns:
        The label whose score P(label) * prod P(value | label) is largest.
        ``np.argmax`` picks the first maximum, so ties go to the label that
        comes first in ``classes``.

    Raises:
        KeyError: if a (feature, value) pair of ``x_test`` never occurred in
            the training data under any label.

    The list of per-label scores is printed as a side effect.
    """
    # prior only stores combinations with a non-zero count. A pair that was
    # seen under some other label therefore has probability 0 for this label
    # (its count is 0); it must not be reported as a missing key.
    seen_pairs = {(feature, value) for feature, value, _ in prior}

    res = []
    for c in classes:
        p_y = class_prior[c]  # class_prior is a Series indexed by label
        p_x_y = 1
        # x_test.items() yields pairs such as ('x1', 2), ('x2', 'S')
        for feature, value in x_test.items():
            key = (feature, value, c)
            if key in prior:
                p_x_y *= prior[key]
            elif (feature, value) in seen_pairs:
                p_x_y *= 0.0
            else:
                raise KeyError(key)
        res.append(p_y * p_x_y)
    print(res)
    return classes[np.argmax(res)]

In [19]:
# Instance to classify: feature name -> observed value.
X_test = dict(x1=2, x2='S')

# nb_fit returns the labels, class_prior = P(label), and
# prior = the table of P(feature value | label).
# predict() reads these three names as globals.
fitted = nb_fit(x, y)
classes, class_prior, prior = fitted
print('测试数据预测类别为：', predict(X_test))

p_x_y:
  1    3
2    2
3    1
Name: x1, dtype: int64
p_x_y:
  3    4
2    3
1    2
Name: x1, dtype: int64
p_x_y:
  S    3
M    2
L    1
Name: x2, dtype: int64
p_x_y:
  M    4
L    4
S    1
Name: x2, dtype: int64
[0.06666666666666667, 0.02222222222222222]
测试数据预测类别为： -1


In [20]:
# Show the (feature, value) pairs of the test instance.
X_test.items()

dict_items([('x1', 2), ('x2', 'S')])

## 类式封装

In [21]:
class Naive_Bayes:
    """Categorical naive Bayes classifier fitted by frequency counting.

    ``nb_fit`` estimates the label priors and the conditional probabilities
    of every feature value and stores them on the instance; ``predict`` then
    scores a single instance against every label using that stored state.
    """

    def __init__(self):
        # Fitted state, filled in by nb_fit().
        self.classes = None      # array of distinct labels
        self.class_prior = None  # Series: label -> P(label)
        self.prior = None        # dict: (feature, value, label) -> P(value | label)

    # Training
    def nb_fit(self, X, y):
        """Estimate the model parameters from a training set.

        Args:
            X: DataFrame of categorical features, one column per feature.
            y: DataFrame whose first column holds the label of each row.

        Returns:
            The tuple ``(classes, class_prior, prior)``. The same three
            objects are also stored on the instance for ``predict``.
        """
        labels = y[y.columns[0]]
        classes = labels.unique()
        class_count = labels.value_counts()
        # Class prior P(label)
        class_prior = class_count / len(y)
        # Conditional probabilities P(value | label); only combinations that
        # occur in the training data get an entry.
        prior = dict()
        for col in X.columns:
            for j in classes:
                p_x_y = X[(labels == j).values][col].value_counts()
                for value, count in p_x_y.items():
                    prior[(col, value, j)] = count / class_count[j]

        # Keep the fitted parameters on the instance so predict() does not
        # depend on the caller's global variables.
        self.classes = classes
        self.class_prior = class_prior
        self.prior = prior
        return classes, class_prior, prior

    # Prediction for a new instance
    def predict(self, X_test):
        """Return the most probable label for one instance.

        Args:
            X_test: dict mapping feature name to the observed value.

        Returns:
            The label maximizing P(label) * prod P(value | label).
            ``np.argmax`` picks the first maximum, so ties go to the label
            that comes first in ``self.classes``.

        Raises:
            RuntimeError: if ``nb_fit`` has not been called on this instance.
            KeyError: if a (feature, value) pair never occurred in the
                training data under any label.
        """
        if self.prior is None:
            raise RuntimeError('nb_fit() must be called before predict()')

        # A pair seen only under other labels has count 0 for this label,
        # i.e. probability 0; it is not a missing key.
        seen_pairs = {(feature, value) for feature, value, _ in self.prior}

        res = []
        for c in self.classes:
            p_y = self.class_prior[c]
            p_x_y = 1
            for feature, value in X_test.items():
                key = (feature, value, c)
                if key in self.prior:
                    p_x_y *= self.prior[key]
                elif (feature, value) in seen_pairs:
                    p_x_y *= 0.0
                else:
                    raise KeyError(key)
            res.append(p_y * p_x_y)
        return self.classes[np.argmax(res)]


if __name__ == "__main__":
    # Toy training set: two categorical features and a binary label.
    x1 = [1] * 5 + [2] * 5 + [3] * 5
    x2 = [
        'S', 'M', 'M', 'S', 'S',
        'S', 'M', 'M', 'L', 'L',
        'L', 'M', 'M', 'L', 'L',
    ]
    y = [
        -1, -1, 1, 1, -1,
        -1, -1, 1, 1, 1,
        1, 1, 1, 1, -1,
    ]
    df = pd.DataFrame(dict(x1=x1, x2=x2, y=y))

    # Features and label as DataFrames; y is rebound to a one-column frame.
    X = df.loc[:, ['x1', 'x2']]
    y = df.loc[:, ['y']]

    # Instance to classify: feature name -> observed value.
    X_test = dict(x1=2, x2='S')

    nb = Naive_Bayes()
    # The fitted parameters are kept under these module-level names.
    fitted = nb.nb_fit(X, y)
    classes, class_prior, prior = fitted
    print('测试数据预测类别为：', nb.predict(X_test))

测试数据预测类别为： -1
