# NaiveBayes 朴素贝叶斯

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
# Load the Iris dataset: `x` is the feature matrix, `y` the integer class labels.
iris = load_iris()
x = iris.data
y = iris.target
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=666666)

In [3]:
# Preview the first ten feature rows.
x[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [4]:
# Show the full label vector.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### 计算类别比例

In [5]:
from collections import Counter
import numpy as np
def calculate_prior(y_label):
    """Estimate the prior probability of every class from a label sequence.

    Args:
        y_label: Sequence or array of class labels. Labels may be any
            sortable values (ints, strings, ...); they do not have to be
            the contiguous integers ``0..k-1``.

    Returns:
        A 1-D float array with one entry per distinct label, ordered by
        sorted label value, holding that label's share of ``y_label``.
        An empty input yields an empty array.
    """
    labels = np.asarray(y_label)
    if labels.size == 0:
        return np.array([])
    # np.unique yields the distinct labels in sorted order together with
    # their counts, so no assumption about the label values is needed and
    # the result always sums to 1.
    _, counts = np.unique(labels, return_counts=True)
    return counts / labels.size

In [6]:
# Class priors estimated from the training labels.
calculate_prior(y_train)

array([0.36666667, 0.33333333, 0.3       ])

In [7]:
# Raw per-class sample counts in the training labels.
Counter(y_train)

Counter({0: 44, 2: 36, 1: 40})

In [8]:
# Number of training samples.
len(y_train)

120

In [9]:
# Number of distinct classes present in the training labels.
class_nums = len(set(y_train))
class_nums

3

### 计算训练集均值

In [10]:
# Preview the first three training rows.
x_train[:3]

array([[4.7, 3.2, 1.3, 0.2],
       [6.1, 2.6, 5.6, 1.4],
       [6. , 3. , 4.8, 1.8]])

In [11]:
# Per-class feature means: entry i is the column-wise mean of the training rows labelled i.
mean = []
for i in range(class_nums):
    # The boolean mask selects the rows of class i; this relies on the labels being 0..class_nums-1.
    mean.append(x_train[y_train == i].mean(axis=0))
mean

[array([5.        , 3.42272727, 1.45227273, 0.24318182]),
 array([5.925 , 2.7575, 4.22  , 1.31  ]),
 array([6.51666667, 2.93888889, 5.51944444, 2.04166667])]

### 计算训练集方差

In [12]:
# Per-class feature variances: entry i is the column-wise variance of the training rows labelled i.
var = []
for i in range(class_nums):
    # ndarray.var is called with its default ddof, i.e. the population (biased) variance.
    var.append(x_train[y_train == i].var(axis=0))
var

[array([0.11772727, 0.13766529, 0.03158574, 0.00881715]),
 array([0.281875  , 0.10244375, 0.2301    , 0.0339    ]),
 array([0.37583333, 0.10126543, 0.2960108 , 0.07576389])]

In [13]:
# Stack the per-class lists into 2-D arrays (one row per class) so they broadcast against a sample.
mean = np.array(mean)
var = np.array(var)

### 计算似然度

In [14]:
def gaussian_func(x_data):
    """Return the Gaussian likelihood of one or more samples under each class.

    Features are treated as independent, so the likelihood for a class is the
    product of the per-feature normal densities. The distribution parameters
    are read from the module-level ``mean`` and ``var`` arrays, each holding
    one row per class and one column per feature.

    Args:
        x_data: A single sample of shape ``(n_features,)`` or a batch of
            shape ``(n_samples, n_features)``.

    Returns:
        Array of shape ``(n_classes,)`` for a single sample, or
        ``(n_samples, n_classes)`` for a batch.
    """
    # Insert a class axis so every sample broadcasts against every class row
    # instead of being paired row-by-row with the parameter arrays.
    samples = np.atleast_1d(x_data)[..., np.newaxis, :]
    norm_const = 1 / np.sqrt(2 * np.pi * var)
    exponent = -(samples - mean) ** 2 / (2 * var)
    # Multiply the per-feature densities along the feature (last) axis.
    # NOTE: the densities are multiplied directly (no log-space
    # stabilisation), so very unlikely samples produce extremely small values.
    return (norm_const * np.exp(exponent)).prod(axis=-1)

In [15]:
# Likelihood of the first test sample under each class.
gaussian_func(x_test[0])

array([1.57146418e-210, 1.07628378e-006, 1.53194370e-001])

In [16]:
# Likelihood of the first test sample under each class.
likelihood = gaussian_func(x_test[0])
# Reshape to a single row so that axis=1 reductions run across the classes.
likelihood = likelihood.reshape(1, -1)
# Prior times likelihood gives the unnormalized posterior of each class.
probs = calculate_prior(y_train) * likelihood
probs

array([[5.76203531e-211, 3.58761258e-007, 4.59583111e-002]])

In [17]:
# Normalizing constant: the total unnormalized probability across the classes.
probs_sum = probs.sum(axis=1)
probs_sum

array([0.04595867])

In [18]:
# Divide by the total so the class posteriors sum to 1.
probs / probs_sum

array([[1.25374284e-209, 7.80617148e-006, 9.99992194e-001]])

### 预测

In [19]:
# Normalized posterior; the predicted class is the column with the largest value.
p = probs / probs_sum
p.argmax(axis=1)

array([2], dtype=int64)

In [20]:
# True label of the first test sample, for comparison with the prediction.
y_test[0]

2