## 原代码-原始形式

In [5]:

import pandas as pd
import numpy as np
import cv2
import random
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#%matplotlib inline

class Perceptron(object):

    def __init__(self):
        self.learning_step = 0.00001
        self.max_iteration = 5000

    def predict_(self, x):
        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
        return int(wx > 0)

    def train(self, features, labels):
        self.w = [0.0] * (len(features[0]) + 1)    #w，b都放在里面

        correct_count = 0
        time = 0

        while time < self.max_iteration:
            index = random.randint(0, len(labels) - 1)    #随机取训练数据
            x = list(features[index])
            x.append(1.0)     #添加1为了和b对应
            y = 2 * labels[index] - 1     #输出变为1与-1
            wx = sum([self.w[j] * x[j] for j in range(len(self.w))])

            if wx * y > 0:
                correct_count += 1
                if correct_count > self.max_iteration:
                    break
                continue

            for i in range(len(self.w)):
                self.w[i] += self.learning_step * (y * x[i])

    def predict(self,features):
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)
            labels.append(self.predict_(x))
        return labels

print('Start read data')

time_1 = time.time()

raw_data = pd.read_csv('data/train_binary.csv', header=0)
data = raw_data.values

imgs = data[0:, 1:]
labels = data[:, 0]

# 选取 2/3 数据作为训练集， 1/3 数据作为测试集
train_features, test_features, train_labels, test_labels = train_test_split(
    imgs, labels, test_size=0.33, random_state=23323)


time_2 = time.time()
print('read data cost {}'.format(time_2 - time_1))

print('Start training')
p = Perceptron()
p.train(train_features, train_labels)

time_3 = time.time()
print('training cost {}'.format(time_3 - time_2))

print('Start predicting')
test_predict = p.predict(test_features)
time_4 = time.time()
print('predicting cost {}'.format(time_4 - time_3))

score = accuracy_score(test_labels, test_predict)
print('The accruacy socre is {}'.format(score))


Start read data
read data cost 8.202415704727173
Start training
training cost 4.731271505355835
Start predicting
predicting cost 10.869202852249146
The accruacy socre is 0.9836219336219336


## 改成对偶形式的代码

In [4]:

import pandas as pd
import numpy as np
import cv2
import random
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#%matplotlib inline

def get_gram(x):    
    #计算Gram矩阵
    g = np.zeros((len(x), len(x)))
    for i in range(len(x)):
        for j in range(len(x)):
            g[i][j] = np.dot(x[i,:], x[j,:].T)   
    return g

class Perceptron(object):

    def __init__(self):
        self.learning_step = 0.00001
        self.max_iteration = 5000

    def predict_(self, x):
        wx = np.dot(self.w ,x.T) + self.b
        return int(wx > 0)

    #对偶的训练函数
    def train(self, features, labels):
        self.a = [0.0] * (len(features))
        self.b = 0.0
        self.x = features
        self.y = 2 * labels - 1     #输出变为1与-1
        
        Gram = get_gram(features)
        print('get gram')
        correct_count = 0
        time = 0

        while time < self.max_iteration:
            index = random.randint(0, len(labels) - 1)    #随机取训练数据
            ax = np.dot(self.a * self.y, Gram[index]) + self.b
            
            if ax*self.y[index] > 0:
                correct_count += 1
                if correct_count > self.max_iteration:
                    break
                continue   #满足条件就继续，不进行下面的更新参数语句

            self.a[index] += self.learning_step
            self.b += self.y[index]
        self.w = np.dot(self.a*self.y, self.x)  
    
    def predict(self,features):
        labels = []
        for feature in features:
            x = np.array(feature)
            labels.append(self.predict_(x))
        return labels

print('Start read data')

time_1 = time.time()

raw_data = pd.read_csv('data/train_binary.csv', header=0)
data = raw_data.values

imgs = data[0:3000, 1:]   #取出数据   数据过多，求解Gram是电脑存不下，所以只取一部分
labels = data[0:3000, 0]  #取出标签    求gram的过程十分浪费时间，所以要减少数据量
# 选取 2/3 数据作为训练集， 1/3 数据作为测试集
train_features, test_features, train_labels, test_labels = train_test_split(
    imgs, labels, test_size=0.33, random_state=23323)


time_2 = time.time()
print('read data cost {}'.format(time_2 - time_1))

print('Start training')
p = Perceptron()
p.train(train_features, train_labels)

time_3 = time.time()
print('training cost {}'.format(time_3 - time_2))

print('Start predicting')
test_predict = p.predict(test_features)
time_4 = time.time()
print('predicting cost {}'.format(time_4 - time_3))

score = accuracy_score(test_labels, test_predict)
print('The accruacy socre is {}'.format(score))


Start read data
read data cost 9.54023790359497
Start training
get gram
training cost 51.82147216796875
Start predicting
predicting cost 0.028051137924194336
The accruacy socre is 0.9858585858585859


**数据过多时，在计算gram的时候会很浪费时间，因为有一万条数据就要循环一亿次，所以我减少了数据量，只从中取了3000个进行训练和测试**