In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold,StratifiedKFold
import sys

In [81]:
def readWaterMelon():
    df=pd.read_csv('watermelon3.csv',delimiter=',')
    df.iloc[:,-1] = pd.Categorical(df.iloc[:,-1])
    df['code'] = df.iloc[:,-1].cat.codes
    df['code']=df['code'].astype(str)
    
    df=df.drop([df.columns[0],df.columns[-2]],axis=1)
    
    sfolder = StratifiedKFold(n_splits=4,random_state=0)
    sfolder.get_n_splits(df)
    train_index, test_index=next(sfolder.split(df,df.iloc[:,-1].values))
    train_df,val_df=df.iloc[train_index],df.iloc[test_index]
    
    dsct=[]
    ctns=[]
    for col in df.columns[:-1]:
        if df[col].dtype=='object':
            dsct.append(col)
        else:
            ctns.append(col)
    return df,dsct,ctns
#     return train_df,val_df,dsct,ctns

def get_val_df(columns):
    val_df=pd.DataFrame(columns=columns)
    val_df.loc[0]=['青绿','蜷缩','浊响','清晰','凹陷','硬滑',0.691,0.460,np.nan]
    return val_df

In [86]:
class NaiveBayes:
    
    def __init__(self,args={}):
        self.args=args
    
    def train(self,df,dsct,ctns):
        label_key=df.iloc[:,-1].value_counts()
        
        self.args['dsct']=dsct
        self.args['ctns']=ctns
        self.args['labels']=label_key
        self.args['bayes']={}
        
        # p(c), prior probability
        for label,count in label_key.items():
            self.args['bayes'][label]=(count+1)/(df.shape[0]+len(label_key))
        
        # p(c/x_i), with Lapras smooth
        
        
        for attr in dsct:
            attrColumn=df[attr]
            attr_N=len(attrColumn.value_counts())
            self.args['bayes'][attr]={}
            for label in label_key.keys():
                temp=df[df['code']==label][attr].value_counts()
                self.args['bayes'][attr][label]={}
                for value,count in temp.items():
                    self.args['bayes'][attr][label][value]=(count+1)/(label_key[label]+attr_N)
        
        for attr in ctns:
            self.args['bayes'][attr]={}
            for label in label_key.keys():
                attrColumn=df[df['code']==label][attr]
                attrStd=attrColumn.std()
                attrMean=attrColumn.mean()
                self.args['bayes'][attr][label]=(attrMean,attrStd)
        
        
    def predict(self,test):
        total=test.shape[0]
        correct=0
        for xi in range(total):
            probs={}
            
            for label in self.args['labels'].keys():
                curScore=self.args['bayes'][label]
                
                # calculate discrete attributes
                for attr in self.args['dsct']:
                    curScore*=self.args['bayes'][attr][label][test[attr].iloc[xi]]
            
                # calculate continuous attributes, ...[1] is variance, while [0] is class-wise mean of attr
                for attr in self.args['ctns']:
                    curScore*=1/(np.sqrt(2*np.pi)*self.args['bayes'][attr][label][1])*\
                                np.exp(-(test[attr].iloc[xi]-self.args['bayes'][attr][label][0])**2/
                                           (2*(self.args['bayes'][attr][label][1])**2))
                probs[label]=curScore
        return probs
            
        

In [87]:
df,dsct,ctns=readWaterMelon()
val_df=get_val_df(df.columns)
model=NaiveBayes()
model.train(df,dsct,ctns)
print(model.predict(val_df))

{'0': 7.96801546535171e-05, '1': 0.026762967462757723}


In [144]:
class AODE:
    
    def __init__(self,args={}):
        self.args=args
    
    def train(self,df,dsct,ctns):
        """
        Note: for convenience, we only count for discrete attributes this time.
        """
        label_key=df.iloc[:,-1].value_counts()
        
        self.args['dsct']=dsct
        self.args['ctns']=ctns
        self.args['labels']=label_key
        self.args['bayes']={}
        
        
        # p(c,xi)
        for label in label_key.keys():
            curSubset=df[df['code']==label]
            self.args['bayes'][label]={}
            for attr in dsct:
                self.args['bayes'][label][attr]={}
                attrCount=df[attr].value_counts()
                attr_N=len(attrCount)
                
                for key in attrCount.keys():
                    D_c_xi=curSubset[curSubset[attr]==key].shape[0]
                    # calculate p(c,xi)
                    self.args['bayes'][label][attr][key]=(D_c_xi+1)/(df.shape[0]+attr_N) 
                    
                    for attr2 in dsct:
                        if attr2 not in self.args['bayes'][label][attr]:
                            self.args['bayes'][label][attr][attr2]={}
                        
                        attr2Count=df[attr2].value_counts()
                        attr2_N=len(attr2Count)
                        
                        for key2 in attr2Count.keys():
                            D_c_xi_xj=curSubset[(curSubset[attr]==key)&(curSubset[attr2]==key2)].shape[0]
                            # calculate p(xj|c,xi)
                            self.args['bayes'][label][attr][attr2][(key,key2)]=(D_c_xi_xj+1)/(D_c_xi+attr2_N)     
        
    def predict(self,test):
        # Because continuous multiply may cause a problem of scalar vanishing, we adopt a log operation to avoid.
        probs={}
        total=test.shape[0]
        for xi in range(total):
            for label in self.args['labels'].keys():
                curScore=0.0
                for attr in self.args['dsct']:
                    # p(c|xi)
                    curScore+=np.log(self.args['bayes'][label][attr][test[attr].iloc[xi]])
                        # p(xj|c,xi)
                    for attr2 in self.args['dsct']:

                        curScore+=np.log(self.args['bayes'][label][attr][attr2]\
                                         [(test[attr].iloc[xi],test[attr2].iloc[xi])])
                
                probs[label]=curScore

        return probs
            
        

In [145]:
df,dsct,ctns=readWaterMelon()
val_df=get_val_df(df.columns)
model=AODE()
model.train(df,dsct,ctns)
print(model.predict(val_df))

{'0': -46.15920157800021, '1': -25.41822571093543}


In [147]:
# AdaBoost ensemble with decision tree


1.1843977824369188e-11


In [148]:
print(42/200)

0.21
