创建离散演示数据：

In [1]:
import numpy as np

# Discrete demo data set: 10 samples (rows) x 4 categorical features (columns).
# Every feature is encoded with small integer category codes.
X = np.array([
    [1,1,2,2],
    [2,2,3,1],
    [1,1,2,2],
    [1,1,3,1],
    [2,2,2,1],
    [1,2,2,2],
    [2,2,3,1],
    [1,2,1,1],
    [1,1,2,1],
    [2,2,2,2],
])

# Class label of each row of X; the labels that occur are 1, 2 and 3.
y = np.array([1,2,2,2,3,1,2,3,1,3])

多项式朴素贝叶斯分类：

In [2]:
import numpy as np

def mnb(X, y, alpha = 1):
    """
    Fit a naive Bayes classifier for discrete (categorical) features.

    Each feature column is treated as a categorical variable. For every
    feature, label and feature value the additively smoothed conditional
    probability

        (count(value, label) + alpha) / (count(label) + n_values * alpha)

    is estimated, where n_values is the number of distinct values the feature
    takes in the training data. Class priors are smoothed the same way, by
    adding alpha to each class count before normalising.

    args:
        X - training data, 2-D array-like of shape (n_samples, n_features)
        y - target labels, 1-D array-like of length n_samples
        alpha - additive smoothing parameter; may be an int or a float.
            With alpha = 0, combinations that never occur get a log
            probability of -inf.
    return:
        priors - log prior probability of each label, shape (n_labels,)
        pss - list with one array per feature, each of shape
            (n_labels, n_values), holding the log conditional probabilities
        x_classes - list with one sorted array of distinct values per feature
        y_classes - sorted array of distinct labels
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Distinct labels and how often each occurs.
    y_classes, y_counts = np.unique(y, return_counts=True)
    # Smooth out of place: an in-place `+=` on the integer count array raises
    # a casting error as soon as alpha is fractional.
    smoothed_counts = y_counts + alpha
    priors = np.log(smoothed_counts / np.sum(smoothed_counts))

    pss = []
    x_classes = []
    for idx in range(X.shape[1]):
        x_idx = X[:, idx]
        # Distinct values of feature idx over the whole training set.
        x_idx_classes = np.unique(x_idx)
        x_classes.append(x_idx_classes)

        ps = []
        for y_class in y_classes:
            # Values of feature idx among the samples carrying this label.
            x_in_class = x_idx[y == y_class]
            # Occurrences of every distinct value (0 for values absent in this class).
            counts = (x_in_class[:, None] == x_idx_classes[None, :]).sum(axis=0)
            # The smoothed denominator is shared by all values of the feature.
            denominator = x_in_class.size + len(x_idx_classes) * alpha
            ps.append(np.log((counts + alpha) / denominator))
        pss.append(np.array(ps))
    return priors, pss, x_classes, y_classes

def predict(X, priors, pss, x_classes, y_classes):
    """
    预测
    args:
        X - 数据集
        priors - 先验概率的对数
        pss - 每种特征对应每种标签分类的条件概率的对数
        x_classes - 特征分类
        y_classes - 标签分类
    return:
        预测结果
    """
    ys = []
    for idx in range(X.shape[0]):
        y = np.array(priors)
        for jdx in range(X.shape[1]):
            for kdx in range(len(y_classes)):
                y[kdx] = y[kdx] + pss[jdx][kdx][x_classes[jdx] == X[idx][jdx]]
        ys.append(y)
    return y_classes[np.argmax(ys, axis=1)]

# Fit the categorical naive Bayes model on the discrete demo data (alpha keeps its default of 1).
priors, pss, x_classes, y_classes = mnb(X, y)
# Predict the labels of the training samples themselves; being the last
# expression of the notebook cell, the result is shown as the cell output.
predict(X, priors, pss, x_classes, y_classes)

array([1, 2, 1, 2, 3, 1, 2, 3, 1, 3])

创建连续演示数据：

In [3]:
import numpy as np
from sklearn.datasets import make_classification

# Generate a synthetic data set with continuous features: 1000 samples, 2 features
# (both informative, none redundant) and 2 classes. random_state is fixed to 0.
# NOTE: this rebinds X and y, replacing the discrete demo data defined earlier in the file.
X, y = make_classification(n_features = 2, n_informative = 2, n_redundant = 0, n_samples = 1000, n_classes = 2, random_state = 0)

演示数据：

In [4]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib notebook

plt.rcParams['font.sans-serif'] = ['PingFang HK']  # pick a locally installed font that can render Chinese text
fig, ax = plt.subplots()
ax.set_facecolor('#f8f9fa')

# Split the two feature columns by class label (0 and 1) for plotting.
x1 = X[y==0][:, 0]
y1 = X[y==0][:, 1]
x2 = X[y==1][:, 0]
y2 = X[y==1][:, 1]
# Class 0 is drawn as red circles, class 1 as blue crosses.
p1 = plt.scatter(x1, y1, c='#e63946', marker='o', s=20)
p2 = plt.scatter(x2, y2, c='#457b9d', marker='x', s=20)

# Title, axis labels and tick labels all share the same dark colour.
ax.set_title('朴素贝叶斯分类', color='#264653')
ax.set_xlabel('X1', color='#264653')
ax.set_ylabel('X2', color='#264653')
ax.tick_params(labelcolor='#264653')
plt.legend([p1, p2], ["0", "1"], loc="upper left")
plt.show()

<IPython.core.display.Javascript object>

高斯朴素贝叶斯分类：

In [5]:
import numpy as np

def gnb(X, y):
    """
    Fit a Gaussian naive Bayes classifier.

    Estimates, for every label, the log prior probability together with the
    per-feature mean and standard deviation of the samples carrying that
    label.

    args:
        X - training data, array of shape (n_samples, n_features)
        y - target labels, array of length n_samples
    return:
        priors - log prior probability of each label
        means - list with one per-feature mean vector per label
        stds - list with one per-feature standard deviation vector per label
        y_classes - sorted array of distinct labels
    """
    # Distinct labels and the number of samples for each of them.
    y_classes, y_counts = np.unique(y, return_counts=True)
    # Log of the relative class frequencies.
    priors = np.log(y_counts / np.sum(y_counts))

    # Rows of X grouped by label, in the order of y_classes.
    rows_per_class = [X[y == label] for label in y_classes]
    means = [np.mean(rows, axis=0) for rows in rows_per_class]
    stds = [np.std(rows, axis=0) for rows in rows_per_class]
    return priors, means, stds, y_classes

def predict(X, priors, means, stds, y_classes):
    """
    Predict labels with a fitted Gaussian naive Bayes model.

    For every label the per-feature Gaussian log densities of each sample are
    summed and added to the label's log prior; the label with the highest
    total is returned for each sample.

    The log density is evaluated directly,

        -0.5 * log(2 * pi) - log(std) - (x - mean)^2 / (2 * std^2),

    instead of taking the logarithm of the density. For samples far from every
    class mean the density underflows to 0, which would give every label a
    score of -inf and make argmax fall back to the first label.

    NOTE: this definition replaces the same-named prediction function for the
    categorical model that appears earlier in the file.

    args:
        X - data to classify, 2-D array-like of shape (n_samples, n_features)
        priors - log prior probability of each label
        means - per-label mean vectors
        stds - per-label standard deviation vectors; entries are assumed to
            be strictly positive
        y_classes - array of labels, in the same order as priors
    return:
        array of predicted labels, one per row of X
    """
    X = np.asarray(X, dtype=float)
    y_classes = np.asarray(y_classes)
    # Constant part of the Gaussian log density.
    log_norm_const = 0.5 * np.log(2 * np.pi)

    ys = []
    for kdx in range(len(y_classes)):
        mean = np.asarray(means[kdx], dtype=float)
        std = np.asarray(stds[kdx], dtype=float)
        log_density = (
            -log_norm_const
            - np.log(std)
            - np.power(X - mean, 2) / (2 * np.power(std, 2))
        )
        # Naive Bayes assumption: features are independent, so log densities add up.
        ys.append(np.sum(log_density, axis=1) + priors[kdx])
    # ys is indexed (label, sample); pick the best label for every sample.
    return y_classes[np.argmax(ys, axis=0)]

def normal(x, mean, std):
    """
    Probability density function of the normal distribution.

    args:
        x - value(s) at which the density is evaluated
        mean - mean of the distribution
        std - standard deviation of the distribution
    return:
        density of N(mean, std^2) at x
    """
    squared_deviation = np.power(x - mean, 2)
    variance = np.power(std, 2)
    # Normalising factor 1 / (sqrt(2 * pi) * std).
    coefficient = 1 / (np.sqrt(2 * np.pi) * std)
    return coefficient * np.exp(-(squared_deviation / (2 * variance)))

拟合数据：

In [6]:
# Fit the hand-written Gaussian naive Bayes model on the continuous demo data.
priors, means, stds, y_classes = gnb(X, y)
# Print the true labels of the first five samples, then the model's predictions for them.
print(y[0:5])
print(predict(X[0:5,:], priors, means, stds, y_classes))

[0 1 1 1 1]
[0 1 1 1 1]


可视化：

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

%matplotlib notebook

plt.rcParams['font.sans-serif'] = ['PingFang HK']  # pick a locally installed font that can render Chinese text
fig, ax = plt.subplots()
ax.set_facecolor('#f8f9fa')

# Build a grid with step 0.01 that covers the data range plus a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .01), np.arange(y_min, y_max, .01))
# Classify every grid point with the hand-written model, then restore the grid shape.
Z = predict(np.c_[xx.ravel(), yy.ravel()], priors, means, stds, y_classes)
Z = Z.reshape(xx.shape)
# Two-colour map used to paint the predicted label of each grid point.
clist=['#ffadad', '#8ecae6']
newcmp = LinearSegmentedColormap.from_list('point_color', clist)
plt.pcolormesh(xx, yy, Z, cmap = newcmp)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Overlay the training samples, split by class label.
x1 = X[y==0][:, 0]
y1 = X[y==0][:, 1]
x2 = X[y==1][:, 0]
y2 = X[y==1][:, 1]
p1 = plt.scatter(x1, y1, c='#e63946', marker='o', s=20)
p2 = plt.scatter(x2, y2, c='#457b9d', marker='x', s=20)

ax.set_title('高斯朴素贝叶斯分类', color='#264653')
ax.set_xlabel('X1', color='#264653')
ax.set_ylabel('X2', color='#264653')
ax.tick_params(labelcolor='#264653')
plt.legend([p1, p2], ["0", "1"], loc="upper left")
plt.show()

<IPython.core.display.Javascript object>



In [8]:
from sklearn.naive_bayes import GaussianNB

# Create scikit-learn's Gaussian naive Bayes classifier.
# NOTE(review): this rebinds the name `gnb`, shadowing the gnb() function
# defined earlier in this file; re-running the earlier fitting cell afterwards
# would call the classifier object instead of that function.
gnb = GaussianNB()
# Fit the classifier to the data
gnb.fit(X, y)
# Predict the labels of the first five samples
gnb.predict(X[0:5,:])

array([0, 1, 1, 1, 1])

可视化：

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

%matplotlib notebook

plt.rcParams['font.sans-serif'] = ['PingFang HK']  # pick a locally installed font that can render Chinese text
fig, ax = plt.subplots()
ax.set_facecolor('#f8f9fa')

# Build a grid with step 0.01 that covers the data range plus a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .01), np.arange(y_min, y_max, .01))
# Classify every grid point with the fitted scikit-learn classifier bound to `gnb`,
# then restore the grid shape.
Z = gnb.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Two-colour map used to paint the predicted label of each grid point.
clist=['#ffadad', '#8ecae6']
newcmp = LinearSegmentedColormap.from_list('point_color', clist)
plt.pcolormesh(xx, yy, Z, cmap = newcmp)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Overlay the training samples, split by class label.
x1 = X[y==0][:, 0]
y1 = X[y==0][:, 1]
x2 = X[y==1][:, 0]
y2 = X[y==1][:, 1]
p1 = plt.scatter(x1, y1, c='#e63946', marker='o', s=20)
p2 = plt.scatter(x2, y2, c='#457b9d', marker='x', s=20)

ax.set_title('高斯朴素贝叶斯分类', color='#264653')
ax.set_xlabel('X1', color='#264653')
ax.set_ylabel('X2', color='#264653')
ax.tick_params(labelcolor='#264653')
plt.legend([p1, p2], ["0", "1"], loc="upper left")
plt.show()

<IPython.core.display.Javascript object>

