In [287]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [257]:
Iris_dataset = pd.read_table('iris.data', delimiter=',',header=None)

**Data Processing**

In [258]:
samples = np.zeros((Iris_dataset.shape[0], 4))
labels = np.zeros((Iris_dataset.shape[0], 1))

In [259]:
dic = {}
for i,category in enumerate(Iris_dataset[4].unique()):
    dic[category] = i

In [260]:
for row in range(Iris_dataset.shape[0]):
    samples[row][0] = Iris_dataset.iloc[row][0]
    samples[row][1] = Iris_dataset.iloc[row][1]
    samples[row][2] = Iris_dataset.iloc[row][2]
    samples[row][3] = Iris_dataset.iloc[row][3]
                               
    labels[row][0] = dic[Iris_dataset.iloc[row][4]]
    
labels = np.reshape(labels, -1)

**One vs Rest**

In [23]:
def ovr(samples, labels):
    for label in set(labels):
        ones = np.ones_like(labels)
        zeros = np.zeros_like(labels)
        new_labels = np.where(labels==label, ones, zeros)

**LDA**

In [115]:
class LinearDiscriminantAnalysis(object):
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
        
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        positive = training_samples[training_labels==1]
        negative = training_samples[training_labels==0]
        
        # 计算均值
        u0 = negative.mean(0)
        u1 = positive.mean(0)
        
        # 计算协方差
        tmp0 = np.zeros_like(negative[0] - u0).dot((negative[0] - u0).T)
        for row in negative:
            tmp0 += (row - u0).dot((row - u0).T)
        sigma0 = 1 / (negative.shape[1] - 1) * tmp0
        
        tmp1 = np.zeros_like(positive[0] - u1).dot((positive[0] - u1).T)
        for row in positive:
            tmp1 += (row - u1).dot((row - u1).T)
        sigma1 = 1 / (positive.shape[1] - 1) * tmp1
        
        # 类内离散度矩阵
        Sw = sigma0 + sigma1
        
        if isinstance(Sw, np.float64): 
            w = (1 / Sw) * (u0 - u1)
        else:
            w = np.linalg.inv(Sw).dot(u0 - u1)
            
        negative_center = (w.T).dot(u0)
        positive_center = (w.T).dot(u1)
        
        self.w = w
        self.negative_center = negative_center
        self.positive_center = positive_center
        
    
    def test(self, testing_samples, testing_labels=None):
        
        sample_dim = self.sample_dim
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        w = self.w
        negative_center = self.negative_center
        positive_center = self.positive_center
        
        predicted_labels = np.zeros((sample_num))
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            if abs((w.T).dot(xi) - positive_center) > abs((w.T).dot(xi) - negative_center):
                predicted_labels[i] = 0
            else:
                predicted_labels[i] = 1
        
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
                
        return predicted_labels

In [116]:
def LDAclassifier(training_samples, training_labels, testing_samples, testing_labels):
    
    sample_dim = len(training_samples[0])
    
    LDA = LinearDiscriminantAnalysis(sample_dim)
    LDA.train(training_samples, training_labels)
    pred = LDA.test(testing_samples, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1

    return [test_num, correct_num, correct_num / test_num]

In [117]:
def Cross_validation(samples, labels, multiclass=False, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0
    
    if multiclass:
        for label in set(labels):
            print('one vs rest for label_%d: ' % (label))
            ones = np.ones_like(labels)
            zeros = np.zeros_like(labels)
            new_labels = np.where(labels==label, ones, zeros)

            for i in range(0, k):
                k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
                k_train_labels = np.hstack([new_labels[0 : i * batch_size], new_labels[(i + 1) * batch_size:]])

                k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
                k_val_labels = new_labels[i * batch_size : (i + 1) * batch_size]

                res = LDAclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
                correct_classification += res[1]
                total += res[0]
                print('ACC of %dth validation : %.3f' % (i, res[2]))
                
    else:
        for i in range(0, k):
            k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
            k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

            k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
            k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

            res = LDAlassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels, flag)
        
            correct_classification += res[1]
            total += res[0]
            print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

In [264]:
idx = list(range(samples.shape[0]))
np.random.shuffle(idx)

samples = samples[idx]
labels = labels[idx]

In [119]:
Cross_validation(samples, labels, multiclass=True, k=5)

one vs rest for label_0: 
ACC of 0th validation : 0.967
ACC of 1th validation : 1.000
ACC of 2th validation : 0.967
ACC of 3th validation : 1.000
ACC of 4th validation : 1.000
one vs rest for label_1: 
ACC of 0th validation : 0.500
ACC of 1th validation : 0.667
ACC of 2th validation : 0.667
ACC of 3th validation : 0.600
ACC of 4th validation : 0.667
one vs rest for label_2: 
ACC of 0th validation : 0.833
ACC of 1th validation : 0.700
ACC of 2th validation : 0.900
ACC of 3th validation : 0.767
ACC of 4th validation : 0.933


0.8111111111111111

**Logistic Regression**

In [140]:
class LogisticRegression(object):
    
    
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
        self.threshold = 0.5
    
    
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
            
        w = np.ones_like(training_samples[0])
        b = 1
        prev_w = w
        prev_b = b
        
        lr = 0.001
        iteration = 10000
        cnt = 0

        while cnt < iteration:
            prev_w = w
            prev_b = b
            w = w - lr * self.derivative_over_w(training_labels, training_samples, w, b)
            b = b - lr * self.derivative_over_b(training_labels, training_samples, w, b)
            if abs(sum(prev_w-w))<1e-3:
                break
            cnt += 1
            
        self.w = w
        self.b = b
            
            
    def test(self, testing_samples, testing_labels=None):
        sample_dim = self.sample_dim
        threshold = self.threshold
        
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        predicted_labels = np.zeros((sample_num))
            
        w = self.w
        b = self.b
        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            out = self.sigmoid(xi, b, w)
            predicted_labels[i] = 1 if out > threshold else 0
            
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
            
        return predicted_labels
    
    
    def sigmoid(self, x, b, w):
        return 1 / (1 + np.exp(- (w.T).dot(x) - b))
    
    
    def prob_positive(self, w, x, b):
        tmp = np.exp((w.T).dot(x) + b)
        return tmp / (1 + tmp)
    
    
    def derivative_over_w(self, y, x, w, b):
        D = np.zeros_like(x[0])
        for i in range(x.shape[0]):
            D += (x[i] * y[i] - x[i] * self.prob_positive(w, x[i], b))
        return -D
    
    
    def derivative_over_b(self, y, x, w, b):
        D = 0
        for i in range(x.shape[0]):
            D += (y[i] - self.prob_positive(w, x[i], b))
        return -D

In [152]:
def LRclassifier(training_samples, training_labels, testing_samples, testing_labels):
    
    sample_dim = len(training_samples[0])
    
    LR = LogisticRegression(sample_dim)
    LR.train(training_samples, training_labels)
    pred = LR.test(testing_samples, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
#     print(confusion_matrix(testing_labels, pred))

    return [test_num, correct_num, correct_num / test_num]

In [153]:
def Cross_validation(samples, labels, multiclass=False, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0
    
    if multiclass:
        for label in set(labels):
            print('one vs rest for label_%d: ' % (label))
            ones = np.ones_like(labels)
            zeros = np.zeros_like(labels)
            new_labels = np.where(labels==label, ones, zeros)

            for i in range(0, k):
                k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
                k_train_labels = np.hstack([new_labels[0 : i * batch_size], new_labels[(i + 1) * batch_size:]])

                k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
                k_val_labels = new_labels[i * batch_size : (i + 1) * batch_size]

                res = LRclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
                correct_classification += res[1]
                total += res[0]
                print('ACC of %dth validation : %.3f' % (i, res[2]))
                
    else:
        for i in range(0, k):
            k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
            k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

            k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
            k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

            res = LRclassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels, flag)
        
            correct_classification += res[1]
            total += res[0]
            print('ACC of %dth validation : %.3f' % (i, res[2]))
                    
    return correct_classification / total

In [154]:
Cross_validation(samples, labels, multiclass=True, k=5)

one vs rest for label_0: 
ACC of 0th validation : 1.000
ACC of 1th validation : 1.000
ACC of 2th validation : 1.000
ACC of 3th validation : 1.000
ACC of 4th validation : 1.000
one vs rest for label_1: 
ACC of 0th validation : 0.800
ACC of 1th validation : 0.567
ACC of 2th validation : 0.633
ACC of 3th validation : 0.600
ACC of 4th validation : 0.700
one vs rest for label_2: 
ACC of 0th validation : 1.000
ACC of 1th validation : 0.900
ACC of 2th validation : 0.933
ACC of 3th validation : 1.000
ACC of 4th validation : 0.967


0.8733333333333333

**NaiveBayes**

In [323]:
class NaiveBayesClassifier():
    
    def __init__(self, sample_dim):
        self.sample_dim = sample_dim
    
    
    def train(self, training_samples, training_labels):
        sample_dim = self.sample_dim
        sample_num = training_samples.shape[0]
        num_class = len(set(training_labels))
        
        if sample_dim != training_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
        
        mean_list = np.zeros((sample_dim, num_class))
        std_list = np.zeros((sample_dim, num_class))
        class_prior = np.zeros((num_class))
        
        for i in range(num_class):
            class_prior[i] = training_samples[training_labels==i].shape[0] / sample_num
        
        for dim in range(sample_dim):
            for i in range(num_class):
                mean_list[dim, i] = np.mean(training_samples[training_labels==i, dim])
                std_list[dim, i] = np.std(training_samples[training_labels==i, dim])
            
        self.class_prior = class_prior
        self.mean_list = mean_list
        self.std_list = std_list
        self.num_class = num_class
        
    
    def test(self, testing_samples, testing_labels=None):
        
        sample_dim = self.sample_dim
        num_class = self.num_class
        sample_num = testing_samples.shape[0]
        if sample_dim != testing_samples.shape[1]:
            raise Exception("Input samples are not compatible with this classifier!")
            
        predicted_labels = np.zeros((sample_num))
        
        class_prior = self.class_prior
        mean_list = self.mean_list
        std_list = self.std_list

        
        for i in range(sample_num):
            xi = testing_samples[i]
            
            xi_posterior_prob = [1] * num_class
            
            for idx in range(num_class):
                for dim in range(sample_dim):
                    xi_prob = self.Gaussian(xi[dim], mean_list[dim, idx], std_list[dim, idx])
                    xi_posterior_prob[idx] *= xi_prob
            
            for idx in range(num_class):
                xi_posterior_prob[idx] *= class_prior[idx]
            
            predicted_labels[i] = np.argmax(np.array(xi_posterior_prob))
                
        if testing_labels is not None:
            acc = accuracy_score(testing_labels, predicted_labels)
                
        return predicted_labels

    def Gaussian(self, x, mean, std):
        return np.exp(- 1 / 2 * np.dot((x - mean).T, (x - mean)) / std) / (2 * np.pi * np.sqrt(np.abs(std)))

In [324]:
def NBClassifier(training_samples, training_labels, testing_samples, testing_labels):
    
    sample_dim = len(training_samples[0])
    
    NBC = NaiveBayesClassifier(sample_dim)
    NBC.train(training_samples, training_labels)
    pred = NBC.test(testing_samples, testing_labels)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1

    return [test_num, correct_num, correct_num / test_num]

In [327]:
def Cross_validation(samples, labels, k=5):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0

    for i in range(0, k):
        k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
        k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

        k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
        k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

        res = NBClassifier(k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
        correct_classification += res[1]
        total += res[0]
        print('ACC of %dth validation : %.3f' % (i, res[2]))
        
    return correct_classification / total

In [328]:
Cross_validation(samples, labels, k=5)

ACC of 0th validation : 0.967
ACC of 1th validation : 0.900
ACC of 2th validation : 1.000
ACC of 3th validation : 1.000
ACC of 4th validation : 0.833


0.94

**SVM**

In [294]:
def SVMclassifier(kernel, training_samples, training_labels, testing_samples, testing_labels):
    clf = svm.SVC(kernel=kernel, C=0.1)
    clf.fit(training_samples, training_labels)
    pred = clf.predict(testing_samples)
    
    test_num = len(testing_labels)
    correct_num = 0
    for i in range(test_num):
        if pred[i] == testing_labels[i]:
            correct_num += 1
            
    return [test_num, correct_num, correct_num / test_num]

In [297]:
def Cross_validation(samples, labels, k=5, kernel='linear'):
    
    batch_size = int(samples.shape[0] / k)
    correct_classification = 0
    total = 0

    for i in range(0, k):
        k_train_samples = np.vstack([samples[0 : i * batch_size], samples[(i + 1) * batch_size :]])
        k_train_labels = np.hstack([labels[0 : i * batch_size], labels[(i + 1) * batch_size:]])

        k_val_samples = samples[i * batch_size : (i + 1) * batch_size]
        k_val_labels = labels[i * batch_size : (i + 1) * batch_size]

        res = SVMclassifier(kernel, k_train_samples, k_train_labels, k_val_samples, k_val_labels)
        
        correct_classification += res[1]
        total += res[0]
        print('ACC of %dth validation : %.3f' % (i, res[2]))
        
    print('total acc: %.3f' % (correct_classification / total))

In [298]:
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    print('kernel: %s' % (kernel))
    Cross_validation(samples, labels, k=5, kernel=kernel)

kernel: linear
ACC of 0th validation : 1.000
ACC of 1th validation : 0.900
ACC of 2th validation : 1.000
ACC of 3th validation : 1.000
ACC of 4th validation : 0.967
total acc: 0.973
kernel: rbf
ACC of 0th validation : 0.967
ACC of 1th validation : 0.900
ACC of 2th validation : 1.000
ACC of 3th validation : 1.000
ACC of 4th validation : 0.800
total acc: 0.933
kernel: poly
ACC of 0th validation : 1.000
ACC of 1th validation : 0.933
ACC of 2th validation : 1.000
ACC of 3th validation : 0.967
ACC of 4th validation : 0.933
total acc: 0.967
kernel: sigmoid
ACC of 0th validation : 0.233
ACC of 1th validation : 0.133
ACC of 2th validation : 0.133
ACC of 3th validation : 0.300
ACC of 4th validation : 0.200
total acc: 0.200
