In [32]:
import numpy as np, pandas as pd
import math, re, datetime, random
import pylab as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [553]:
def rescaling(x, d1, d2):
    if (x <= d1):
        return 'small'
    elif (x <= d2):
        return 'mid'
    else:
        return 'big'
     
def get_data3(ratio = 0.8):
    data = pd.read_csv('./data3.csv')
    data['好瓜'] = data['好瓜'].apply(lambda x: True if x == '是' else False)
    data = data.drop(axis=1,columns=['编号'])
    feature_rescale = ['含糖率', '密度']
    for fea in feature_rescale:
        sorted_feature_data = sorted(list(data[fea]))
        n = len(sorted_feature_data)
        data[fea] = data[fea].apply(rescaling,d1=sorted_feature_data[n//3],d2=sorted_feature_data[2*n//3])
    
    idx = np.random.randint(0, len(data))
    return (data, data.iloc[idx])

In [457]:
class Naive_Bayes():
    def __init__(self, data_train, sample_test):
        from collections import defaultdict
        self.data_train = data_train
        self.sample_test = sample_test
        self.c0 = len(data_train[data_train[data_train.columns[-1]] == False])
        self.c1 = len(data_train[data_train[data_train.columns[-1]] == True])
        self.table = ({}, {})
        
    def fit(self):
        cols = self.data_train.columns
        self.table[0][cols[-1]] = (self.c0 + 1) / (self.c0 + self.c1 + 2)
        self.table[1][cols[-1]] = (self.c1 + 1) / (self.c0 + self.c1 + 2)
        
        for fea in cols[:-1]:
            fea_dic = [{}, {}]
            groups = self.data_train.groupby(axis = 0, by = fea)
            N = len(groups)
            
            fea_dic[0] = defaultdict(lambda: 1 / float(self.c0 + N))
            fea_dic[1] = defaultdict(lambda: 1 / float(self.c1 + N))
            
            for (val, tab) in groups:
                n0, n1 = len(tab[tab[tab.columns[-1]] == False]), len(tab[tab[tab.columns[-1]] == True])
                fea_dic[0][val] = (n0 + 1) / float(self.c0 + N)
                fea_dic[1][val] = (n1 + 1) / float(self.c1 + N)
            
            (self.table[0][fea], self.table[1][fea]) = fea_dic
    
    def test(self):
        cols = self.data_train.columns
        sample = self.sample_test
        logl = np.zeros(2)
        for c in range(2):
            logl[c] += np.log(self.table[c][cols[-1]])
            for fea in cols[:-1]:
                val = sample[fea]
                logl[c] += np.log(self.table[c][fea][val])
        output = [sample[-1], logl[1] >= logl[0], (logl[0], logl[1])]
        print ('{}, {} <-- {} '.format(output[0], output[1], output[2]))

In [616]:
class AODE_Bayes():
    def __init__(self, data_train, sample_test):
        from collections import defaultdict
        self.data_train = data_train
        self.sample_test = sample_test
        self.c = (len(data_train[data_train[data_train.columns[-1]]==False]), \
                  len(data_train[data_train[data_train.columns[-1]]==True]))
        self.ctable = ({}, {})
        self.table = ({}, {})
        
    def fit(self):
        cols = self.data_train.columns
        
        for C in range(2):
            
            data = self.data_train[self.data_train[cols[-1]] == (True if C == 1 else False)]
            
            for par in cols[:-1]:
                groups_par = data.groupby(axis = 0, by = par)
                N = len(groups_par)
                par_dic = defaultdict(lambda: 1 / float(self.c[0] + self.c[1] + 2 * N))
                self.table[C][par] = {}
                
                for (val, data_par) in groups_par:
                    NC = len(data_par)
                    par_dic[val] = (NC + 1) / float(self.c[0] + self.c[1] + 2 * N)
                    self.table[C][par][val] = {}
                    
                    for son in cols[:-1]:
                        if (son == par): 
                            continue
                        else:
                            groups_son = data_par.groupby(axis = 0, by = son)
                            n = len(groups_son)
                            son_dic = defaultdict(lambda: 1 / float(NC + n))
                            for (ual, data_son) in groups_son:
                                nC = len(data_son)
                                son_dic[ual] = (nC + 1) / (NC + n)
                        self.table[C][par][val][son] = son_dic
            
                self.ctable[C][par] = par_dic
            
    def test(self):
        cols = data_train.columns
        sample = self.sample_test
        logl = np.zeros(2)
        for C in range(2):
            for par in cols[:-1]:
                val = sample[par]
                if par not in self.table[C] or val not in self.table[C][par]: continue    
                p = self.ctable[C][par][val]
                for son in cols[:-1]:
                    if son == par: continue
                    ual = sample[son]
                    if son not in self.table[C][par][val]: continue
#                         print('par, val, son, ual = ({},{},{},{})'.format(par,val,son,ual))
                    p *= self.table[C][par][val][son][ual]
                logl[C] += p
        
        output = [sample[-1], logl[1] >= logl[0], (logl[0], logl[1])]
        print ('{}, {} <-- {} '.format(output[0], output[1], output[2]))

In [675]:
(data_train, sample_test) = get_data3()

nb = Naive_Bayes(data_train, sample_test)
nb.fit()
output = nb.test()

snb = AODE_Bayes(data_train, sample_test)
snb.fit()
snb.test()

False, False <-- (-8.220054892307417, -9.923610682227556) 
False, False <-- (0.04100403825039061, 0.03131964203555818) 
