In [1]:
import numpy as np
from sklearn.datasets import load_iris

In [2]:
def load_data_1():
    feature_name = ['年龄','有工作','有房','信贷情况']
    dataset = [['青年','否','否','一般','否']]
    dataset.append(['青年','否','否','好','否'])
    dataset.append(['青年','是','否','好','是'])
    dataset.append(['青年','是','是','一般','是'])
    dataset.append(['青年','否','否','一般','否'])
    dataset.append(['中年','否','否','一般','否'])
    dataset.append(['中年','否','否','好','否'])
    dataset.append(['中年','是','是','好','是'])
    dataset.append(['中年','否','是','非常好','是'])
    dataset.append(['中年','否','是','非常好','是'])
    dataset.append(['老年','否','是','非常好','是'])
    dataset.append(['老年','否','是','好','是'])
    dataset.append(['老年','是','否','好','是'])
    dataset.append(['老年','是','否','非常好','是'])
    dataset.append(['老年','否','否','一般','否'])
    dataset = np.array(dataset)
    return dataset, feature_name

In [3]:
def load_data_2():
    feature = load_iris().data
    label = load_iris().target.reshape(-1, 1)
    iris_name = load_iris().feature_names
    iris = np.concatenate((feature, label), axis = 1)
    return iris, iris_name

In [4]:
def value_calculation(dataset):
    values = []
    for i in range(dataset.shape[1]):
        value = np.unique(dataset[:, i])
        values.append(value)
    return values

In [5]:
def class_prob(dataset):
    label_values = np.unique(dataset[:,-1])
    subdata = {}
    prob = {}
    N = len(label_values)
    #根据label取的值将数据集分成几个类， 并计算每个类的概率
    for valu in label_values:
        subdata[valu] = dataset[dataset[:, -1] == valu]
        prob[valu] = (len(subdata[valu])+1)/(len(dataset)+N)
        
    return subdata, prob

In [6]:
def naive_bayes(dataset, feature_name, distribution):
    '''
    构建多重字典，计算各个特征的概率分布
    对于连续特征，假设为正态分布，计算均值和标准差
    对于类别特征，假设为伯努利分布，计算每个类别的概率
    
    '''
    values = value_calculation(dataset)
    subdata, label_prob = class_prob(dataset)
    label_key = list(label_prob.keys())
    prob_dict = {}
    
    #按标签值不同将数据集分为几部分
    for num, data in enumerate(subdata.values()):
        count = len(data)
        data_prob = {}
        
        #对于类别特征，计算每部分数据中该特征取不同值时的条件概率
        feat_count = len(data[0,:])-1
        for i in range(feat_count):
            feat_value = values[i]
            n = len(feat_value)
            feat_prob = {}
            
            if distribution == 'bernoulli':
                for valu in feat_value:
                    prob = (np.sum(data[:, i] == valu)+1) / (count+n)
                    feat_prob[valu] = prob
                
            elif distribution == 'normal':
                feat_prob['mean'] = np.mean(data[:,i])
                feat_prob['std'] = np.std(data[:, i])
                
            else:
                raise Exception('distribution == normal or bernoulli')
                
            data_prob[feature_name[i]] = feat_prob
        prob_dict[label_key[num]] = data_prob
        
    return prob_dict, label_prob

In [7]:
def normal_prob_density(mean, sd, x):
    prob = 1/(np.sqrt(2*np.pi) * sd) * np.exp(-0.5 * (np.square(x - mean)) / np.square(sd))    
    return prob

In [8]:
def predict(X, feature_name, model, label_prob, distribution):
    results = []
    
    for example in X:
        max_prob = 0.

        #计算每个样本取每个标签值的概率
        for label in list(model.keys()):
            prob = 1.
        
            for i, name in enumerate(feature_name):
                valu = example[i]
                
                if distribution == 'bernoulli':
                #简单起见，如果测试样本中出现训练集特定标签的子集下没有的值，将概率设为1e-3，当然也可以用拉普拉斯平滑设置基准数
                    if valu not in model[label][name].keys():
                        feat_prob = 1e-4
                    else:
                        feat_prob = model[label][name][valu] * label_prob[label]
                        
                elif distribution == 'normal':
                    mean = model[label][name]['mean']
                    sd = model[label][name]['std']
                    feat_prob = normal_prob_density(mean, sd, valu) * label_prob[label]
                
                else:
                    raise Exception('distribution == normal or bernoulli')
                        
                #将每个特征的概率连乘得到最终概率
                prob = prob * feat_prob

            #选出最大的概率值，并返回此时的标签作为预测结果
            if prob >= max_prob:
                max_prob = prob
                result = label

        results.append(result)
    return results

In [9]:
def main(data):
    if data == 'iris':
        dataset, feature_name = load_data_2()
        model, label_prob = naive_bayes(dataset, feature_name, 'normal')
        pred = predict(dataset[:, :4], feature_name, model, label_prob, 'normal')
        
    elif data == 'loan':
        dataset, feature_name = load_data_1()
        model, label_prob = naive_bayes(dataset, feature_name, 'bernoulli')
        pred = predict(dataset[:, :4], feature_name, model, label_prob, 'bernoulli')
    
    return pred

In [10]:
if __name__ == '__main__':
    result1 = main('loan')
    result2 = main('iris')
    print(result1)
    print(result2)

['否', '否', '是', '是', '否', '否', '是', '是', '是', '是', '是', '是', '是', '是', '否']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]
