In [3]:
import pandas as pd
import numpy as np
import logging
import math
from sklearn.model_selection import train_test_split

In [4]:
# Configure the root logger: timestamped records, DEBUG level and above.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s: %(message)s',
)
class Bayes_classfier:
    """Gaussian Bayes classifier with one full covariance matrix per class.

    Workflow:
        - ``data_processing()`` reads the CSV and splits it into training
          and validation parts.
        - ``fit()`` estimates the per-class parameters.
        - ``predict()`` scores the validation part and prints the results.
    """

    def __init__(self, path):
        """Remember the dataset location and create the parameter store.

        Args:
            path: Path of the header-less CSV file read by
                ``data_processing``.
        """
        # Path of the dataset file.
        self.data_path = path
        # Fitted model parameters, filled in by fit().
        self.params = dict()

    def data_processing(self):
        """Read the dataset and split it into training and validation sets.

        Sets ``self.df``, ``self.X_train``, ``self.X_valid``,
        ``self.y_train`` and ``self.y_valid``.  The ``species`` column is
        kept inside ``X_train``/``X_valid`` because ``fit`` and ``predict``
        group those frames by it.
        """
        self.df = pd.read_csv(self.data_path, header=None)
        logging.info('loading datasets:volume:{}'.format(self.df.shape))
        # Name the columns: four measurements followed by the class label.
        self.df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
        # The "features" frame intentionally still carries the label column.
        features = self.df
        label = self.df['species']
        # Stratified split with a fixed seed so the split is reproducible.
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            features, label, test_size=2/15, random_state=300, stratify=label)
        logging.info('split datasets,train_datasets:{},test_datasets:{}'.format(self.X_train.shape, self.X_valid.shape))

    def fit(self):
        """Estimate the model parameters from the training set.

        Stores in ``self.params``:
            - ``'priori_prob'``: Laplace-smoothed class priors (a Series
              indexed by class label).
            - ``'class_mu'``: dict mapping class label to the feature means.
            - ``'class_sigma_inv'``: dict mapping class label to the inverse
              of the class covariance matrix.
            - ``'class_det'``: dict mapping class label to the determinant
              of the class covariance matrix.

        Raises:
            numpy.linalg.LinAlgError: If a class covariance matrix is
                singular and cannot be inverted.
        """
        # Laplace smoothing: add 1 to every class count and the number of
        # classes to the total, so the smoothed priors stay normalised for
        # any number of classes (not only three).
        class_counts = self.y_train.value_counts()
        n_classes = class_counts.shape[0]
        priori_prob = (class_counts + 1) / (self.y_train.shape[0] + n_classes)
        logging.info('计算训练集先验概率...')
        print(priori_prob)
        # Keep the priors for use at prediction time.
        self.params['priori_prob'] = priori_prob

        # Per-class mean, inverse covariance and covariance determinant.
        feature_cols = [x for x in self.df.columns if x not in ['species']]
        mu_ = dict()
        sigma_inv = dict()
        det_ = dict()
        logging.info('计算mu,sigma等矩阵运算...')
        # Group by the column name itself, not a one-element list: with a
        # list, newer pandas yields 1-tuples as group keys, which would not
        # match the plain labels used for lookups in predict().
        for _label, groups in self.X_train.groupby('species'):
            groups = groups[feature_cols]
            mu_[_label] = groups.mean(axis=0)
            # bias=True normalises by N (maximum-likelihood estimate).
            sigma_ = np.cov(groups.T, bias=True)
            sigma_inv[_label] = np.linalg.inv(sigma_)
            det_[_label] = np.linalg.det(sigma_)
        self.params['class_mu'] = mu_
        self.params['class_sigma_inv'] = sigma_inv
        self.params['class_det'] = det_

    def g(self, x, mu, sigma_inv, det, priori_prob):
        """Evaluate the discriminant function of one class for one sample.

        Computes ``-0.5 * (x - mu)^T @ sigma_inv @ (x - mu)
        - 0.5 * log(det) + log(priori_prob)``, where ``@`` is numpy matrix
        multiplication.

        Args:
            x: Feature vector of the sample.
            mu: Mean feature vector of the class.
            sigma_inv: Inverse covariance matrix of the class.
            det: Determinant of the class covariance matrix.
            priori_prob: Prior probability of the class.

        Returns:
            The discriminant value; the class with the largest value wins.
        """
        return -0.5 * (x - mu).T @ sigma_inv @ (x - mu) - 0.5 * math.log(det) + math.log(priori_prob)

    def predict(self):
        """Classify every validation sample and print per-class accuracy.

        For each true class in ``self.X_valid`` every row is scored against
        all classes with ``g``; the prediction is the class with the highest
        score.  Each comparison and the accuracy per class are printed.
        """
        feature_cols = [x for x in self.df.columns if x not in ['species']]
        # Candidate classes are the same for every sample; compute them once.
        candidate_labels = self.df['species'].unique()
        # Scalar grouper so that _label is the plain class label (see fit()).
        for _label, test_group in self.X_valid.groupby('species'):
            logging.info('test the {},volume:{}'.format(_label, test_group.shape[0]))
            test_feature = test_group[feature_cols]
            print(_label)
            # Whether each prediction in this class was correct.
            test_res = []

            for _, row in test_feature.iterrows():
                score = dict()

                # Score the sample against every class.
                for _l in candidate_labels:
                    score[_l] = self.g(np.array(row.values),
                                       self.params['class_mu'][_l].values,
                                       self.params['class_sigma_inv'][_l],
                                       self.params['class_det'][_l],
                                       self.params['priori_prob'][_l])
                predict_res = max(score, key=score.get)

                print('[测试]当前预测类别:{},实际类别:{},{}'.format(predict_res, _label, predict_res == _label))
                test_res.append(predict_res == _label)
            print('{}类别测试准确率:{}'.format(_label, test_res.count(True) / len(test_res)))
# Run the whole pipeline on the local iris file. The order matters:
# data_processing() creates the train/validation split that fit() reads,
# and predict() needs the parameters stored by fit().
classfier=Bayes_classfier('./iris.txt')
classfier.data_processing()
classfier.fit()
classfier.predict()


2022-12-06 17:58:16,621 - INFO: loading datasets:volume:(150, 5)
2022-12-06 17:58:16,624 - INFO: split datasets,train_datasets:(130, 5),test_datasets:(20, 5)
2022-12-06 17:58:16,625 - INFO: 计算训练集先验概率...
2022-12-06 17:58:16,628 - INFO: 计算mu,sigma等矩阵运算...
  for _label,groups in self.X_train.groupby(['species']):
  for _label,test_group in self.X_valid.groupby(['species']):
2022-12-06 17:58:16,636 - INFO: test the setosa,volume:6
2022-12-06 17:58:16,638 - INFO: test the versicolor,volume:7
2022-12-06 17:58:16,641 - INFO: test the virginica,volume:7


setosa        0.338346
versicolor    0.330827
virginica     0.330827
Name: species, dtype: float64
setosa
[测试]当前预测类别:setosa,实际类别:setosa,True
[测试]当前预测类别:setosa,实际类别:setosa,True
[测试]当前预测类别:setosa,实际类别:setosa,True
[测试]当前预测类别:setosa,实际类别:setosa,True
[测试]当前预测类别:setosa,实际类别:setosa,True
[测试]当前预测类别:setosa,实际类别:setosa,True
setosa类别测试准确率:1.0
versicolor
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
[测试]当前预测类别:versicolor,实际类别:versicolor,True
versicolor类别测试准确率:1.0
virginica
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
[测试]当前预测类别:virginica,实际类别:virginica,True
virginica类别测试准确率:1.0
