# 作业二：补充贝叶斯估计算法

### 班级：人工智能与机器人班

### 学号：201700171080

### 姓名：毛冬辉

## 1 作业说明
在估计步骤，避免极大似然估计分母为零造成计算错误的情况出现，根据贝叶斯估计计算公式，在分子分母加上了一个常数。其他部分，根据计算式补充。

## 2 代码实现

In [2]:
#encoding=utf-8
import pandas as pd
import numpy as np
import cv2
import sys
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 二值化
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img,50,1,cv2.THRESH_BINARY_INV,cv_img)
    return cv_img

def Train(trainset,train_labels):
    '''
    输入：训练数据集与训练标签
    输出：先验概率与条件概率
    '''
    # P(Y=ck), ck = class_num
    # P(X^(j)=ajl|Y=ck)
    # j=0,1,...,784
    # ajl = 0,1
    prior_probability = np.zeros(class_num)                         # 先验概率
    conditional_probability = np.zeros((class_num,feature_len,2))   # 条件概率

    # 计算先验概率及条件概率
    for i in range(len(train_labels)):
        img = binaryzation(trainset[i])     # 图片二值化
        label = train_labels[i]
        # count y=(0,1,2,3,..,9)
        prior_probability[label] += 1

        # 单个样本的特征向量各分量的像素归类
        for j in range(feature_len):

            conditional_probability[label][j][img[j]] = conditional_probability[label][j][img[j]]+1

    # 将概率归到[1.10001]
    for i in range(class_num):
        for j in range(feature_len):

            # 经过二值化后图像只有0，1两种取值
            pix_0 = conditional_probability[i][j][0]
            pix_1 = conditional_probability[i][j][1]
            lambda_ = 1
            # 计算0，1像素点对应的条件概率
            # 使用贝叶斯估计，避免极大似然估计中分母出现0的错误
            probalility_0 = (float(pix_0+lambda_)/float(pix_0+pix_1+lambda_*2))*10000+1
            probalility_1 = (float(pix_1+lambda_)/float(pix_0+pix_1+lambda_*2))*10000+1
            conditional_probability[i][j][0] = probalility_0
            conditional_probability[i][j][1] = probalility_1

    return prior_probability,conditional_probability

# 计算概率
def calculate_probability(img,label):
    probability = int(prior_probability[label])

    for i in range(len(img)):
        probability *= int(conditional_probability[label][i][img[i]])

    return probability

def Predict(testset,prior_probability,conditional_probability):
    predict = []

    i=0

    for img in testset:
        i +=1
        # 图像二值化
        img = binaryzation(img)

        max_label = 0
        max_probability = calculate_probability(img,0)

        for j in range(1,10):
            probability = calculate_probability(img,j)
            if max_probability < probability:
                max_label = j
                max_probability = probability
        predict.append(max_label)

    return np.array(predict)


class_num = 10
feature_len = 784

if __name__ == '__main__':

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('./train_binary.csv',header=0)
    # data szie = (42000, 785)
    # the firt value is label
    # the left values are pixels of the imgs
    data = raw_data.values

    # imgs size = (42000, 784)
    imgs = data[:,1::]
    # label size = (42000,784)
    labels = data[:,0]

    # 选取 2/3 数据作为训练集， 1/3 数据作为测试集
    train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)
    # print train_features.shape
    # print train_features.shape

    time_2 = time.time()
    print('read data cost {} second'.format(time_2 - time_1))

    print('Start training')

    # 先验概率和条件概率
    prior_probability,conditional_probability = Train(train_features,train_labels)
    time_3 = time.time()
    print('\r training cost {} second'.format(time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features,prior_probability,conditional_probability)
    time_4 = time.time()
    print('\r predicting cost {} second'.format(time_4 - time_3))

    score = accuracy_score(test_labels,test_predict)
    print("The accruacy socre is {}".format(score))

Start read data
read data cost 3.7020955085754395 second
Start training
 training cost 59.77711820602417 second
Start predicting
 predicting cost 175.7184338569641 second
The accruacy socre is 0.9611832611832611
