In [60]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [61]:
dataroot = 'watermelon.csv'
df = pd.read_csv(dataroot)

**Data Processing**

In [62]:
samples = np.zeros((17, 8))
labels = np.zeros((17, 1))

In [63]:
feature1 = {'青绿': 1, '乌黑': 2, '浅白': 3}
feature2 = {'蜷缩': 1, '稍蜷': 2, '硬挺': 3}
feature3 = {'浊响': 1, '沉闷': 2, '清脆': 3}
feature4 = {'清晰': 1, '稍糊': 2, '模糊': 3}
feature5 = {'凹陷': 1, '稍凹': 2, '平坦': 3}
feature6 = {'硬滑': 1, '软粘': 2}


for row in range(df.shape[0]):
    samples[row][0] = feature1[df.iloc[row][1]]
    samples[row][1] = feature2[df.iloc[row][2]]
    samples[row][2] = feature3[df.iloc[row][3]]
    samples[row][3] = feature4[df.iloc[row][4]]
    samples[row][4] = feature5[df.iloc[row][5]]
    samples[row][5] = feature6[df.iloc[row][6]]
    samples[row][6] = df.iloc[row][7]
    samples[row][7] = df.iloc[row][8]
    labels[row][0] = 1 if df.iloc[row][9] == '是' else 0
    
labels = np.reshape(labels, -1)
flag = np.array([0, 0, 0, 0, 0, 0, 1, 1])

**数据集乱序**

In [64]:
idx = list(range(17))
np.random.shuffle(idx)

samples = samples[idx]
labels = labels[idx]

**LDA**

In [65]:
class LinearDiscriminantAnalysis(object):
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
        
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        positive = training_samples[training_labels==1]
        negative = training_samples[training_labels==0]
        
        # 计算均值
        u0 = negative.mean(0)
        u1 = positive.mean(0)
        
        # 计算协方差
        tmp0 = np.zeros_like(negative[0] - u0).dot((negative[0] - u0).T)
        for row in negative:
            tmp0 += (row - u0).dot((row - u0).T)
        sigma0 = 1 / (negative.shape[1] - 1) * tmp0
        
        tmp1 = np.zeros_like(positive[0] - u1).dot((positive[0] - u1).T)
        for row in positive:
            tmp1 += (row - u1).dot((row - u1).T)
        sigma1 = 1 / (positive.shape[1] - 1) * tmp1
        
        # 类内离散度矩阵
        Sw = sigma0 + sigma1
        
        if isinstance(Sw, np.float64): 
            w = (1 / Sw) * (u0 - u1)
        else:
            w = np.linalg.inv(Sw).dot(u0 - u1)
            
        negative_center = (w.T).dot(u0)
        positive_center = (w.T).dot(u1)
        
        self.w = w
        self.negative_center = negative_center
        self.positive_center = positive_center
        
    
    def test(self, testing_samples, testing_labels=None):
        
        sample_dim = self.sample_dim
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        w = self.w
        negative_center = self.negative_center
        positive_center = self.positive_center
        
        predicted_labels = np.zeros((sample_num))
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            if abs((w.T).dot(xi) - positive_center) > abs((w.T).dot(xi) - negative_center):
                predicted_labels[i] = 0
            else:
                predicted_labels[i] = 1
        
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
                
        return predicted_labels

In [66]:
def LDAclassifier(training_samples, training_labels, testing_samples, testing_labels):
    
    sample_dim = len(training_samples[0])
    
    LDA = LinearDiscriminantAnalysis(sample_dim)
    LDA.train(training_samples, training_labels)
    pred = LDA.test(testing_samples, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1

    return [test_num, correct_num, correct_num / test_num]

In [67]:
def Cross_validation(samples, labels, multiclass=False, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0
    
    if multiclass:
        for label in set(labels):
            print('one vs rest for label_%d: ' % (label))
            ones = np.ones_like(labels)
            zeros = np.zeros_like(labels)
            new_labels = np.where(labels==label, ones, zeros)

            for i in range(0, k):
                k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
                k_train_labels = np.hstack([new_labels[0 : i * batch_size], new_labels[(i + 1) * batch_size:]])

                k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
                k_val_labels = new_labels[i * batch_size : (i + 1) * batch_size]

                res = LDAclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
                correct_classification += res[1]
                total += res[0]
                print('ACC of %dth validation : %.3f' % (i, res[2]))
                
    else:
        for i in range(0, k):
            k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
            k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

            k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
            k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

            res = LDAclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
            correct_classification += res[1]
            total += res[0]
            print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

In [68]:
Cross_validation(samples, labels, multiclass=False, k=5)

ACC of 0th validation : 0.667
ACC of 1th validation : 1.000
ACC of 2th validation : 1.000
ACC of 3th validation : 0.667
ACC of 4th validation : 0.333


0.7333333333333333

**Logistic Regression**

In [69]:
class LogisticRegression(object):
    
    
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
        self.threshold = 0.5
    
    
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
            
        w = np.ones_like(training_samples[0])
        b = 1
        prev_w = w
        prev_b = b
        
        lr = 0.001
        iteration = 10000
        cnt = 0

        while cnt < iteration:
            prev_w = w
            prev_b = b
            w = w - lr * self.derivative_over_w(training_labels, training_samples, w, b)
            b = b - lr * self.derivative_over_b(training_labels, training_samples, w, b)
            if abs(sum(prev_w-w))<1e-5:
                break
            cnt += 1
            
        self.w = w
        self.b = b
            
            
    def test(self, testing_samples, testing_labels=None):
        sample_dim = self.sample_dim
        threshold = self.threshold
        
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        predicted_labels = np.zeros((sample_num))
            
        w = self.w
        b = self.b
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            out = self.sigmoid(xi, b, w)
            predicted_labels[i] = 1 if out > threshold else 0
            
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
            
        return predicted_labels
    
    
    def sigmoid(self, x, b, w):
        return 1 / (1 + np.exp(- (w.T).dot(x) - b))
    
    
    def prob_positive(self, w, x, b):
        tmp = np.exp((w.T).dot(x) + b)
        return tmp / (1 + tmp)
    
    
    def derivative_over_w(self, y, x, w, b):
        D = np.zeros_like(x[0])
        for i in range(x.shape[0]):
            D += (x[i] * y[i] - x[i] * self.prob_positive(w, x[i], b))
        return -D
    
    
    def derivative_over_b(self, y, x, w, b):
        D = 0
        for i in range(x.shape[0]):
            D += (y[i] - self.prob_positive(w, x[i], b))
        return -D

In [70]:
def LRclassifier(training_samples, training_labels, testing_samples, testing_labels):
    
    sample_dim = len(training_samples[0])
    
    LR = LogisticRegression(sample_dim)
    LR.train(training_samples, training_labels)
    pred = LR.test(testing_samples, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
#     print(confusion_matrix(testing_labels, pred))

    return [test_num, correct_num, correct_num / test_num]

In [71]:
def Cross_validation(samples, labels, multiclass=False, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0
    
    if multiclass:
        for label in set(labels):
            print('one vs rest for label_%d: ' % (label))
            ones = np.ones_like(labels)
            zeros = np.zeros_like(labels)
            new_labels = np.where(labels==label, ones, zeros)

            for i in range(0, k):
                k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
                k_train_labels = np.hstack([new_labels[0 : i * batch_size], new_labels[(i + 1) * batch_size:]])

                k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
                k_val_labels = new_labels[i * batch_size : (i + 1) * batch_size]

                res = LRclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
                correct_classification += res[1]
                total += res[0]
                print('ACC of %dth validation : %.3f' % (i, res[2]))
                
    else:
        for i in range(0, k):
            k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
            k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

            k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
            k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

            res = LRclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
            correct_classification += res[1]
            total += res[0]
            print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

In [72]:
Cross_validation(samples, labels, multiclass=False, k=5)

ACC of 0th validation : 0.667
ACC of 1th validation : 1.000
ACC of 2th validation : 0.333
ACC of 3th validation : 0.667
ACC of 4th validation : 0.667


0.6666666666666666

**NaiveBayes**

In [80]:
class TwoClassNaiveBayesClassifier():
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
    
    
    def train(self, training_samples, flag, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        conti_feature = np.where(flag==1)
        
        dis_feature = np.where(flag==0)
        
        prob_positive = np.zeros((sample_dim, 3))
        prob_negative = np.zeros((sample_dim, 3))
        
        
        mean_list = np.zeros((sample_dim, 2))
        std_list = np.zeros((sample_dim, 2))
        class_prior = np.zeros((2))
        
        
        class_prior[0] = np.count_nonzero(training_labels==-1) / sample_num
        class_prior[1] = np.count_nonzero(training_labels==1) / sample_num
        
        for dim in conti_feature[0]:
            mean_list[dim, 0] = np.mean(training_samples[training_labels==0, dim])
            mean_list[dim, 1] = np.mean(training_samples[training_labels==1, dim])
            std_list[dim, 0] = np.std(training_samples[training_labels==0, dim])
            std_list[dim, 1] = np.std(training_samples[training_labels==1, dim])
            
        pos = np.where(training_labels==1)
        neg = np.where(training_labels==0)
        num_positive = len(pos[0])
        num_negative = len(neg[0])
        
        for dim in dis_feature[0]:
            feature = training_samples[training_labels==1][:,dim]
            nums = np.unique(feature)
            for i in range(len(nums)):
                prob_positive[dim][i] = feature[feature==nums[i]].shape[0] / num_positive
                
            feature = training_samples[training_labels==-1][:,dim]
            nums = np.unique(feature)
            for i in range(len(nums)):
                prob_negative[dim][i] = feature[feature==nums[i]].shape[0] / num_negative
            
        self.class_prior = class_prior
        self.mean_list = mean_list
        self.std_list = std_list
        self.prob_positive = prob_positive
        self.prob_negative = prob_negative
        
    
    def test(self, testing_samples, flag, testing_labels=None):
        
        sample_dim = self.sample_dim
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        predicted_labels = np.zeros((sample_num))
        
        class_prior = self.class_prior
        mean_list = self.mean_list
        std_list = self.std_list
        prob_positive = self.prob_positive
        prob_negative = self.prob_negative
        
        conti_feature = np.where(flag==1)
        dis_feature = np.where(flag==0)
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            xi_posterior_prob = [1, 1]
            
            for dim in conti_feature[0]:
                xi_prob1 = self.Gaussian(xi[dim], mean_list[dim, 0], std_list[dim, 0])
                xi_posterior_prob[0] *= xi_prob1
                
                xi_prob2 = self.Gaussian(xi[dim], mean_list[dim, 1], std_list[dim, 1])
                xi_posterior_prob[1] *= xi_prob2
                
            for dim in dis_feature[0]:

                xi_prob1 = prob_negative[dim][int(xi[dim])-1]
                xi_posterior_prob[0] *= xi_prob1
                
                xi_prob2 = prob_positive[dim][int(xi[dim])-1]
                xi_posterior_prob[1] *= xi_prob2
                
            xi_posterior_prob[0] *= class_prior[0]
            xi_posterior_prob[1] *= class_prior[1]
            
            if xi_posterior_prob[0] > xi_posterior_prob[1]:
                predicted_labels[i] = 0
            else:
                predicted_labels[i] = 1
                
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
                
        return predicted_labels
                
                
    def Gaussian(self, x, mean, std):
        return np.exp(- 1 / 2 * np.dot((x - mean).T, (x - mean)) / std) / (2 * np.pi * np.sqrt(np.abs(std)))

In [81]:
def NBClassifier(training_samples, training_labels, testing_samples, testing_labels, flag):
    
    sample_dim = len(flag)
    
    NBC = TwoClassNaiveBayesClassifier(sample_dim)
    NBC.train(training_samples, flag, training_labels)
    pred = NBC.test(testing_samples, flag, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
    return [test_num, correct_num, correct_num / test_num]

In [82]:
def Cross_validation(samples, labels, flag, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0

    for i in range(0, k):
        k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
        k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

        k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
        k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

        res = NBClassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels, flag)
        
        correct_classification += res[1]
        total += res[0]
        print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

In [83]:
Cross_validation(samples, labels, flag, k=5)

ACC of 0th validation : 0.333
ACC of 1th validation : 0.333
ACC of 2th validation : 0.667
ACC of 3th validation : 0.667
ACC of 4th validation : 0.333


0.4666666666666667

**SVM**

In [77]:
def SVMclassifier(kernel, training_samples, training_labels, testing_samples, testing_labels):
    clf = svm.SVC(kernel=kernel, C=0.1)
    clf.fit(training_samples, training_labels)
    pred = clf.predict(testing_samples)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
    return [test_num, correct_num, correct_num / test_num]

In [78]:
def Cross_validation(samples, labels, k=5, kernel='linear'):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0

    for i in range(0, k):
        k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
        k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

        k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
        k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

        res = SVMclassifier(kernel, k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
        correct_classification += res[1]
        total += res[0]
        print('ACC of %dth validation : %.3f' % (i, res[2]))
        
    print('total acc: %.3f' % (correct_classification / total))

In [79]:
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    print('kernel: %s' % (kernel))
    Cross_validation(samples, labels, k=5, kernel=kernel)

kernel: linear
ACC of 0th validation : 0.667
ACC of 1th validation : 0.667
ACC of 2th validation : 1.000
ACC of 3th validation : 0.667
ACC of 4th validation : 0.333
total acc: 0.667
kernel: rbf
ACC of 0th validation : 0.667
ACC of 1th validation : 0.667
ACC of 2th validation : 0.333
ACC of 3th validation : 0.333
ACC of 4th validation : 0.333
total acc: 0.467
kernel: poly
ACC of 0th validation : 0.667
ACC of 1th validation : 1.000
ACC of 2th validation : 1.000
ACC of 3th validation : 0.667
ACC of 4th validation : 0.333
total acc: 0.733
kernel: sigmoid
ACC of 0th validation : 0.333
ACC of 1th validation : 0.000
ACC of 2th validation : 0.333
ACC of 3th validation : 0.333
ACC of 4th validation : 0.000
total acc: 0.200
