In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
dataroot = 'watermelon.csv'
df = pd.read_csv(dataroot)

In [3]:
samples = np.zeros((17, 8))
labels = np.zeros((17, 1))

In [4]:
feature1 = {'青绿': 1, '乌黑': 2, '浅白': 3}
feature2 = {'蜷缩': 1, '稍蜷': 2, '硬挺': 3}
feature3 = {'浊响': 1, '沉闷': 2, '清脆': 3}
feature4 = {'清晰': 1, '稍糊': 2, '模糊': 3}
feature5 = {'凹陷': 1, '稍凹': 2, '平坦': 3}
feature6 = {'硬滑': 1, '软粘': 2}


for row in range(df.shape[0]):
    samples[row][0] = feature1[df.iloc[row][1]]
    samples[row][1] = feature2[df.iloc[row][2]]
    samples[row][2] = feature3[df.iloc[row][3]]
    samples[row][3] = feature4[df.iloc[row][4]]
    samples[row][4] = feature5[df.iloc[row][5]]
    samples[row][5] = feature6[df.iloc[row][6]]
    samples[row][6] = df.iloc[row][7]
    samples[row][7] = df.iloc[row][8]
    labels[row][0] = 1 if df.iloc[row][9] == '是' else -1
    
labels = np.reshape(labels, -1)
flag = np.array([0, 0, 0, 0, 0, 0, 1, 1])

**Two Class Naive Bayes Classifier**

In [57]:
class TwoClassNaiveBayesClassifier():
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
    
    
    def train(self, training_samples, flag, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        conti_feature = np.where(flag==1)
        
        dis_feature = np.where(flag==0)
        
        prob_positive = np.zeros((sample_dim, 3))
        prob_negative = np.zeros((sample_dim, 3))
        
        
        mean_list = np.zeros((sample_dim, 2))
        std_list = np.zeros((sample_dim, 2))
        class_prior = np.zeros((2))
        
        
        class_prior[0] = np.count_nonzero(training_labels==-1) / sample_num
        class_prior[1] = np.count_nonzero(training_labels==1) / sample_num
        
        for dim in conti_feature[0]:
            mean_list[dim, 0] = np.mean(training_samples[training_labels==-1, dim])
            mean_list[dim, 1] = np.mean(training_samples[training_labels==1, dim])
            std_list[dim, 0] = np.std(training_samples[training_labels==-1, dim])
            std_list[dim, 1] = np.std(training_samples[training_labels==1, dim])
            
        pos = np.where(training_labels==1)
        neg = np.where(training_labels==-1)
        num_positive = len(pos[0])
        num_negative = len(neg[0])
        
        for dim in dis_feature[0]:
            feature = training_samples[training_labels==1][:,dim]
            nums = np.unique(feature)
            for i in range(len(nums)):
                prob_positive[dim][i] = np.count_nonzero(feature==nums[i])/num_positive
                
            feature = training_samples[training_labels==-1][:,dim]
            nums = np.unique(feature)
            for i in range(len(nums)):
                prob_negative[dim][i] = np.count_nonzero(feature==nums[i])/num_negative
            
        self.class_prior = class_prior
        self.mean_list = mean_list
        self.std_list = std_list
        self.prob_positive = prob_positive
        self.prob_negative = prob_negative
        
    
    def test(self, testing_samples, flag, testing_labels=None):
        
        sample_dim = self.sample_dim
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        predicted_labels = np.zeros((sample_num))
        
        class_prior = self.class_prior
        mean_list = self.mean_list
        std_list = self.std_list
        prob_positive = self.prob_positive
        prob_negative = self.prob_negative
        
        conti_feature = np.where(flag==1)
        dis_feature = np.where(flag==0)
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            xi_posterior_prob = [1, 1]
            
            for dim in conti_feature[0]:
                xi_prob1 = self.Gaussian(xi[dim], mean_list[dim, 0], std_list[dim, 0])
                xi_posterior_prob[0] *= xi_prob1
                
                xi_prob2 = self.Gaussian(xi[dim], mean_list[dim, 1], std_list[dim, 1])
                xi_posterior_prob[1] *= xi_prob2
                
            for dim in dis_feature[0]:

                xi_prob1 = prob_negative[dim][int(xi[dim])-1]
                xi_posterior_prob[0] *= xi_prob1
                
                xi_prob2 = prob_positive[dim][int(xi[dim])-1]
                xi_posterior_prob[1] *= xi_prob2
                
            xi_posterior_prob[0] *= class_prior[0]
            xi_posterior_prob[1] *= class_prior[1]
            
            if xi_posterior_prob[0] > xi_posterior_prob[1]:
                predicted_labels[i] = -1
            else:
                predicted_labels[i] = 1
                
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
                
        return predicted_labels
                
                
    def Gaussian(self, x, mean, std):
        return np.exp(- 1 / 2 * np.dot((x - mean).T, (x - mean)) / std) / (2 * np.pi * np.sqrt(np.abs(std)))

In [58]:
def NBClassifier(training_samples, training_labels, testing_samples, testing_labels, flag):
    
    sample_dim = len(flag)
    
    NBC = TwoClassNaiveBayesClassifier(sample_dim)
    NBC.train(training_samples, flag, training_labels)
    pred = NBC.test(testing_samples, flag, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
    return [test_num, correct_num, correct_num / test_num]

**k重交叉验证**

In [59]:
def Cross_validation(samples, labels, flag, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0

    for i in range(0, k):
        k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
        k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

        k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
        k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

        res = NBClassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels, flag)
        
        correct_classification += res[1]
        total += res[0]
        print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

**数据集乱序**

In [60]:
idx = list(range(17))
np.random.shuffle(idx)

samples = samples[idx]
labels = labels[idx]

In [61]:
Cross_validation(samples, labels, flag, k=5)

ACC of 0th validation : 0.667
ACC of 1th validation : 0.667
ACC of 2th validation : 1.000
ACC of 3th validation : 0.000
ACC of 4th validation : 0.333


0.5333333333333333