In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from collections import Counter
import math

In [2]:
# import data
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['labels'] = iris.target
df.columns = [
    'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
]
data = np.array(df)
X, Y = data[:, :-1], data[:, -1]

def _shuffle(X, Y):
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return X[randomize], Y[randomize]

X, Y = _shuffle(X,Y)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3)
Y_train

array([0., 0., 2., 1., 0., 2., 2., 0., 0., 0., 0., 0., 1., 0., 2., 1., 0.,
       1., 1., 1., 2., 0., 2., 0., 1., 1., 0., 1., 1., 0., 2., 1., 0., 2.,
       2., 2., 0., 2., 0., 1., 2., 0., 2., 1., 2., 1., 2., 2., 2., 2., 2.,
       0., 1., 2., 0., 1., 0., 1., 2., 2., 0., 1., 0., 2., 1., 0., 1., 0.,
       0., 2., 1., 2., 2., 0., 2., 2., 1., 1., 2., 1., 1., 1., 1., 0., 0.,
       2., 1., 2., 2., 1., 2., 1., 2., 1., 0., 2., 0., 2., 1., 2., 1., 0.,
       1., 2., 1.])

In [28]:
class NaiveBayes:
    def __init__(self):
        self.model = None
        self.Y_mean = None
        
    
    # 计算均值
    @staticmethod   # 这个装饰器的作用就是，可以不用先实例化再调用函数，直接NaiveBayes.mean(X)
    def mean(X):
        return sum(X) / float(len(X))
    
    # 计算标准差
    def std(self, X):
        avg = self.mean(X)
        return math.sqrt(sum([pow(x - avg, 2) for x in X]) / float(len(X)))
    
    # 概率密度函数
    def gaussian_probability(self, x, mean, std):
        exp = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(std, 2))))
        return (1 / (math.sqrt(2 * math.pi)) * std) * exp
    
    # 计算X_train的mean 和std
    def summarize(self, X_train): #  *X_train 星号作用是解包然后逐个传入 
        summaries = [(self.mean(i), self.std(i)) for i in zip(*X_train)] # 所以zip(*X_train) == zip([1,2,3],[2,3,4],...)
        return summaries
    
    def fit(self, X, Y):
        labels = list(set(Y))
        data = {label: [] for label in labels} # 初始化
        self.Y_mean = np.zeros(len(labels))
        for x, label in zip(X, Y):
            data[label].append(x)
            self.Y_mean[int(label)] += 1.
        self.Y_mean /= len(Y)
        # 计算P(x_i|y_k) for i in range...
        self.model = {
            label:self.summarize(value)
            for label, value in data.items()
        }
        return 'gaussianNB train done'
    
    def calculate_probabilities(self, input_data):
        # summaries: {0: [(mean1,std1),(mean2,std2),(mean3,std3),(mean4,std4)], 1:...}
        probabilities = {}
        for label, value in self.model.items(): # value --> summaries
            probabilities[label] = self.Y_mean[int(label)] # P(C_i)
            for i in range(len(value)):
                mean, std = value[i]
                probabilities[label] *= self.gaussian_probability(input_data[i], mean, std)
                # 计算P(x|c_i) * P(C_i)的概率
        return probabilities
    
    def predict(self, X_test): 
        # 这里X_test为单个实例
        result = []
        for i in range(len(X_test)):
            label = sorted(self.calculate_probabilities(X_test[i]).items(), key=lambda x:x[-1])[-1][0]  # sorted 默认从小到大 返回一个list
            # 如[(1, 75), (0, 85), (2, 95)]
            result.append(label)
        
        return result
    
    def score(self, X_test, Y_test):
        right = 0
        predictions = self.predict(X_test)
        for i in range(len(Y_test)):
            if predictions[i] == Y_test[i]:
                right += 1
        return right / float(len(Y_test))
                
                

In [29]:
model = NaiveBayes()

In [30]:
model.fit(X_train, Y_train)

'gaussianNB train done'

In [31]:
result1 = model.predict(X_test)

In [32]:
model.score(X_test, Y_test)

0.9555555555555556

## scikit-learn 实例

In [33]:
from sklearn.naive_bayes import GaussianNB

In [34]:
clf = GaussianNB()
#X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
#Y = np.array([1, 1, 1, 2, 2, 2])
clf.fit(X_train, Y_train)

GaussianNB()

In [35]:
clf.score(X_test, Y_test)

0.9777777777777777

In [36]:
result2 = clf.predict(X_test)

In [37]:
count = 0
for i in range(len(result1)):
    if result1[i] != result2[i]:
        count += 1
count

1