In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

**数据预处理**

In [2]:
dataroot = 'watermelon.csv'
df = pd.read_csv(dataroot)

In [3]:
samples = np.zeros((17, 8))
labels = np.zeros((17, 1))

feature1 = {'青绿': 1, '乌黑': 2, '浅白': 3}
feature2 = {'蜷缩': 1, '稍蜷': 2, '硬挺': 3}
feature3 = {'浊响': 1, '沉闷': 2, '清脆': 3}
feature4 = {'清晰': 1, '稍糊': 2, '模糊': 3}
feature5 = {'凹陷': 1, '稍凹': 2, '平坦': 3}
feature6 = {'硬滑': 1, '软粘': 2}

for row in range(df.shape[0]):
    samples[row][0] = feature1[df.iloc[row][1]]
    samples[row][1] = feature2[df.iloc[row][2]]
    samples[row][2] = feature3[df.iloc[row][3]]
    samples[row][3] = feature4[df.iloc[row][4]]
    samples[row][4] = feature5[df.iloc[row][5]]
    samples[row][5] = feature6[df.iloc[row][6]]
    samples[row][6] = df.iloc[row][7]
    samples[row][7] = df.iloc[row][8]
    labels[row][0] = 1 if df.iloc[row][9] == '是' else -1
    

labels = np.reshape(labels, -1)
flag = np.array([0, 0, 0, 0, 0, 0, 1, 1])

**LDA**

In [70]:
class LinearDiscriminantAnalysis(object):
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
        
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        positive = training_samples[training_labels==1]
        negative = training_samples[training_labels==-1]
        
        # 计算均值
        u0 = negative.mean(0)
        u1 = positive.mean(0)
        
        # 计算协方差
        tmp0 = np.zeros_like(negative[0] - u0).dot((negative[0] - u0).T)
        for row in negative:
            tmp0 += (row - u0).dot((row - u0).T)
        sigma0 = 1 / (negative.shape[1] - 1) * tmp0
        
        tmp1 = np.zeros_like(positive[0] - u1).dot((positive[0] - u1).T)
        for row in positive:
            tmp1 += (row - u1).dot((row - u1).T)
        sigma1 = 1 / (positive.shape[1] - 1) * tmp1
        
        # 类内离散度矩阵
        Sw = sigma0 + sigma1
        
        if isinstance(Sw, np.float64): 
            w = (1 / Sw) * (u0 - u1)
        else:
            w = np.linalg.inv(Sw).dot(u0 - u1)
            
        negative_center =  (w.T).dot(u0)
        positive_center =  (w.T).dot(u1)
        
        self.w = w
        self.negative_center = negative_center
        self.positive_center = positive_center
        
    
    def test(self, testing_samples, testing_labels=None):
        
        sample_dim = self.sample_dim
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        w = self.w
        negative_center = self.negative_center
        positive_center = self.positive_center
        
        predicted_labels = np.zeros((sample_num))
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            if abs((w.T).dot(xi) - positive_center) > abs((w.T).dot(xi) - negative_center):
                predicted_labels[i] = -1
            else:
                predicted_labels[i] = 1
        
        if testing_labels is not None:
            print('ACC of test:{}'.format(accuracy_score(testing_labels, predicted_labels)))
                
        return predicted_labels

In [71]:
LDA = LinearDiscriminantAnalysis(8)

In [84]:
idx = list(range(17))
np.random.shuffle(idx)

training_idx = idx[:13]
training_samples = samples[training_idx]
training_labels = labels[training_idx]

testing_idx = idx[13:]
testing_samples = samples[testing_idx]
testing_labels = labels[testing_idx]

In [85]:
LDA.train(training_samples, training_labels)

In [86]:
pred = LDA.test(testing_samples, testing_labels)

ACC of test:1.0


In [87]:
pred

array([ 1., -1.,  1.,  1.])

In [88]:
testing_labels

array([ 1., -1.,  1.,  1.])

In [89]:
accuracy_score(testing_labels, pred)

1.0