### 交叉验证

In [1]:
# Load the handwritten-digits dataset and split it into features and labels
import numpy as np
from sklearn import datasets

digits = datasets.load_digits()
X, y = digits.data, digits.target

### 1. 使用 train_test_split 的方法验证数据


In [2]:
# Hold out a test split so the kNN classifier can be evaluated on unseen data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,  # fixed seed so the split is reproducible
)

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Brute-force hyperparameter search: every (n_neighbors, p) pair is fitted on
# the training split and scored on the test split. The first pair reaching the
# strictly highest score is kept.
best_k, best_p, best_score = 0, 0, 0
search_space = [(k, p) for k in range(2, 11) for p in range(1, 7)]
for k, p in search_space:
    knn_clf = KNeighborsClassifier(
        n_neighbors=k, p=p, weights="distance"
    ).fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_k, best_p, best_score = k, p, score

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)

Best K = 5
Best P = 2
Best Score = 0.9866666666666667


### 2.使用交叉验证
这里使用的是三折交叉验证（3-fold cross validation）：将训练数据分为三份，轮流取其中一份作为验证集，其余两份用于训练。

In [4]:
# First, see how 3-fold cross-validation is used
from sklearn.model_selection import cross_val_score

knn_clf = KNeighborsClassifier()
# cv=3 is passed explicitly: scikit-learn changed the default number of folds
# from 3 to 5 in version 0.22, so relying on the default would not give the
# three folds this section describes.
# The result holds one score per fold (three here). No explicit fit() call is
# needed at this point.
cross_val_score(knn_clf, X_train, y_train, cv=3)



array([0.98896247, 0.98210291, 0.98210291])

In [6]:
# Silence warnings.
# NOTE(review): this is a blanket, kernel-wide filter; consider narrowing it to
# the specific warning category once the scikit-learn version is pinned.
import warnings
warnings.filterwarnings("ignore")
# Use cross-validation on the training data to choose the hyperparameters
from sklearn.neighbors import KNeighborsClassifier
# Try every (n_neighbors, p) combination and keep the best-scoring one
best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 7):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        # cv=3 is explicit because the default number of folds changed from
        # 3 to 5 in scikit-learn 0.22; this keeps the search 3-fold.
        scores = cross_val_score(knn_clf, X_train, y_train, cv=3)
        # Average the three per-fold scores
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_k = k
            best_p = p
print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)

Best K = 5
Best P = 3
Best Score = 0.9866166891368011


In [7]:
# Cross-validating on the training data gave more trustworthy hyperparameters.
# Now train a classifier with them on the full training split and evaluate it
# on the test split.
tuned_params = {"weights": "distance", "n_neighbors": best_k, "p": best_p}
knn_clf = KNeighborsClassifier(**tuned_params).fit(X_train, y_train)
# The accuracy is not the highest seen so far, but it is a more reliable
# estimate because the test data played no part in choosing the hyperparameters.
knn_clf.score(X_test, y_test)

0.9822222222222222

### 网格搜索
网格搜索所调用的类 GridSearchCV 内部就使用了交叉验证：类名末尾的 CV 正是 Cross Validation 的缩写。

In [8]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored by the search: 9 values of n_neighbors times
# 5 values of p, always with distance weighting (45 candidates).
param_grid = [
    {
        'weights': ['distance'],
        'n_neighbors': list(range(2, 11)),
        'p': list(range(1, 6)),
    }
]

# cv=3 is explicit because the default number of folds changed from 3 to 5 in
# scikit-learn 0.22; this keeps the grid search 3-fold like the manual loop.
grid_search = GridSearchCV(knn_clf, param_grid, cv=3, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  1.6min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=3,
                                            weights='distance'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [9]:
# Best mean cross-validation score found by the grid search; comparable to
# best_score from the manual cross-validation loop above
grid_search.best_score_

0.9866369710467706

In [10]:
# Same values as best_k and best_p found by the manual cross-validation loop above
grid_search.best_params_

{'n_neighbors': 5, 'p': 3, 'weights': 'distance'}

In [11]:
# Finally, evaluate the model produced by the grid search on the test data.
# best_estimator_ is the knn_clf configured with the best hyperparameters found.
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test, y_test)

0.9822222222222222

### 交叉验证分不同的 k---CV 参数
上面是将训练数据分为了三份，但训练数据也可以分为 k 份（k 折交叉验证），份数通过 cv 参数指定。

In [12]:
# Split the training data into 5 folds instead of 3; one score per fold is returned
cross_val_score(estimator=knn_clf, X=X_train, y=y_train, cv=5)

array([0.99270073, 0.99264706, 0.98507463, 0.98127341, 0.98120301])

In [None]:
# The number of folds can be set for the grid search as well, via cv
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)