In [94]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [95]:
# Hyperparameter: number of outer training iterations run by PLA_pocket.
epochs = 200

In [96]:
# Model parameters: weight vector w (two features) and bias b, both zero.
# d(x) reads these from module scope.
w = np.array([0, 0])
b = 0

In [97]:
# Decision function d(x)
def d(x):
  """Return the raw perceptron activation w . x + b for one sample x.

  The weight vector w and bias b are read from module scope.
  """
  projection = np.dot(w, x)
  return projection + b

In [98]:
# Sign function
def sign(x):
  """Return 1 when x is greater than or equal to zero, otherwise -1."""
  if x >= 0:
    return 1
  return -1

In [99]:
# Hypothesis function h(x)
def h(x):
  """Predict the label of sample x as sign(d(x)), i.e. 1 or -1."""
  activation = d(x)
  return sign(activation)

In [100]:
def clf_score(X, y):
  """Return the sum of yi * h(xi) over the paired samples of X and y.

  When the labels are -1 / +1, every correct prediction contributes +1 and
  every wrong one -1, so the result equals (#correct - #wrong).
  """
  return sum(label * h(sample) for sample, label in zip(X, y))

In [101]:
# Pocket algorithm for the perceptron
def PLA_pocket(X, y):
  """Train the perceptron with the pocket algorithm.

  Each outer iteration scans the samples for the first one with
  yi * d(xi) <= 0, applies a single perceptron update for it, and keeps the
  new (w, b) "in the pocket" when it raises clf_score(X, y) above the best
  score seen so far. At most `epochs` (module-level) updates are performed.

  Args:
    X: sequence of feature vectors (one row per sample), e.g. a 2-D array.
    y: sequence of labels aligned with X; expected to be -1 or +1.

  Returns:
    None. The module-level w and b are overwritten with the best pair found,
    because d(x) reads them from module scope.
  """
  global w, b

  # Size the weight vector from the data instead of hard-coding two
  # features; fall back to 2 when there are no samples.
  n_features = len(X[0]) if len(X) else 2
  w, b = np.zeros(n_features), 0
  # The pocket starts with the initial parameters. Unpacking `w` alone here
  # would split the weight vector into two scalars and leave w as a scalar
  # whenever no update improved on the initial score.
  best_w, best_b = w, b
  best_cs = clf_score(X, y)
  for _ in range(epochs):
    for xi, yi in zip(X, y):
      if yi*d(xi) <= 0:
        # Rebinding (rather than in-place +=) creates a new array, so best_w
        # never aliases the working weights.
        w, b = w + yi*xi, b + yi
        cs = clf_score(X, y)
        if cs > best_cs:
          best_cs = cs
          best_w, best_b = w, b
        break  # one update per outer iteration, then rescan from the start
    else:
      # A full pass found no misclassified sample; further iterations would
      # change nothing, so stop early.
      break
  w, b = best_w, best_b

In [102]:
# Start training
# Load the iris dataset
iris = datasets.load_iris()

# Take the last 100 samples, keep only the last two features, and fetch the
# matching class labels
sampleNumber = 100
sample_rows = slice(50, 50 + sampleNumber)
X = iris.data[sample_rows, [2, 3]]

# The iris classes are 0, 1, 2; to use our perceptron implementation, relabel
# the last two classes as -1 and 1
raw_labels = iris.target[sample_rows]
y = np.where(raw_labels == 1, -1, 1)

# Use train_test_split for a random 8 : 2 split into a train+validation set
# and a test set
rs = 42
X_tv, X_test, y_tv, y_test = train_test_split(
  X, y, test_size=0.2, random_state=rs, shuffle=True
)

split_report = r'总共有 {} 个数据，其中训练验证集有 {} 个数据，测试集中有 {} 个数据。'
print(split_report.format(len(X), len(X_tv), len(X_test)))

总共有 100 个数据，其中训练验证集有 80 个数据，测试集中有 20 个数据。


In [104]:
# Run k-fold cross-validation on X_tv
k = 10
kf = KFold(n_splits=k, random_state=rs, shuffle=True)

test_accuracy = 0
for idx, (train_index, val_index) in enumerate(kf.split(X_tv)):
  # Fold data is stored in X_val / y_val so the held-out X_test / y_test
  # produced by train_test_split are not overwritten by the loop.
  X_train, y_train = X_tv[train_index], y_tv[train_index]
  X_val, y_val = X_tv[val_index], y_tv[val_index]
  PLA_pocket(X_train, y_train)
  # clf_score is (#correct - #wrong), hence accuracy = 1 - (n - score) / (2n).
  split_train_accuracy = 1 - (len(X_train) - clf_score(X_train, y_train))/2/len(X_train)
  split_test_accuracy = 1 - (len(X_val) - clf_score(X_val, y_val))/2/len(X_val)
  print(r'第 {} 折，训练集准确率：{:.2%}，验证集准确率 {:.2%}'.format(idx + 1, split_train_accuracy, split_test_accuracy))
  test_accuracy += split_test_accuracy
print(r'epochs = {}，验证集准确率的平均值为 {:.2%}。'.format(epochs, test_accuracy / k))

第 1 折，训练集准确率：79.17%，验证集准确率 87.50%
第 2 折，训练集准确率：94.44%，验证集准确率 87.50%
第 3 折，训练集准确率：95.83%，验证集准确率 75.00%
第 4 折，训练集准确率：94.44%，验证集准确率 100.00%
第 5 折，训练集准确率：94.44%，验证集准确率 87.50%
第 6 折，训练集准确率：94.44%，验证集准确率 100.00%
第 7 折，训练集准确率：94.44%，验证集准确率 100.00%
第 8 折，训练集准确率：95.83%，验证集准确率 87.50%
第 9 折，训练集准确率：94.44%，验证集准确率 100.00%
第 10 折，训练集准确率：97.22%，验证集准确率 87.50%
epochs = 200，验证集准确率的平均值为 91.25%。
