# 4.5 Hyper Parameters

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load scikit-learn's digits dataset and unpack its feature matrix and label vector.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: fit a k=3 classifier (fit() returns the estimator itself) and
# display its score on the held-out split as the cell output.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_clf.score(X_test, y_test)

0.9888888888888889

### 4.5.1 超参数，在算法运行前需要决定的参数

- 超参数：在算法运行前需要决定的参数（调参）
- 模型参数：算法过程中学习的参数
- kNN算法没有模型参数，而kNN算法中的k是典型的超参数
- 寻找好的超参数
    - 领域知识
    - 经验数值（例如sklearn中的k为5）
    - 实验搜索

### 4.5.2 寻找最好的k
- 可通过循环的形式来找最好的k
- 如果找到的最好k在寻找边界上时，边界外可能有更好的超参数
- 假如在[1, 11)间找到的最好k是10（即落在搜索边界上），应该扩大范围，例如在[8, 20]间再搜索一次

In [5]:
# Try every k in [1, 10] and remember the one with the highest score.
# The strict ">" means the smallest k wins when several k tie.
# NOTE(review): the winner is picked by its score on X_test, so best_score is
# likely an optimistic estimate of generalisation performance.
best_k, best_score = -1, 0.0
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_k, best_score = k, score

print("best_k =", best_k)
print("best_score = ", best_score)

best_k = 4
best_score =  0.9916666666666667


### 4.5.3 考虑距离？不考虑距离？

- kNN的另一个超参数
- 距离的权重
    - 一般是距离的倒数
    - 而且可以解决平票问题

In [6]:
# Grid search over the vote-weighting scheme and k in [1, 10].
# "uniform" is searched before "distance" and the comparison is a strict ">",
# so on a tie the earlier combination (uniform, then smaller k) is kept.
best_method, best_k, best_score = "", -1, 0.0
for method in ("uniform", "distance"):
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method).fit(
            X_train, y_train
        )
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_method, best_k, best_score = method, k, score

print("best_method =", best_method)
print("best_k =", best_k)
print("best_score = ", best_score)

best_method = uniform
best_k = 4
best_score =  0.9916666666666667


### 4.5.4 搜索明可夫斯基距离相应的p

- 曼哈顿距离、欧氏距离（欧几里得距离）、明可夫斯基距离
- p = 1：曼哈顿距离
- p = 2：欧氏距离
- p 取一般值时：明可夫斯基距离 Minkowski distance，即 $\left(\sum_{i=1}^{n} \left|x_i^{(a)} - x_i^{(b)}\right|^{p}\right)^{1/p}$

In [7]:
%%time
best_p = -1
best_score = 0.0
best_k = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p
print("best_p =", best_p)
print("best_k =", best_k)
print("best_score = ", best_score)

best_p = 2
best_k = 3
best_score =  0.9888888888888889
Wall time: 33.7 s
