### Grid_search 网格搜索

In [8]:
# 导包
import numpy as np
from sklearn import datasets

In [9]:
# 得到手写数据集
digits = datasets.load_digits()
X = digits.data
y = digits.target

In [15]:
# 形成测试集和模型集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=666)

In [16]:
# 用默认参数生成模型得到准确率
from sklearn.neighbors import KNeighborsClassifier

sk_knn_clf = KNeighborsClassifier(n_neighbors=4, weights="uniform")
sk_knn_clf.fit(X_train, y_train)
sk_knn_clf.score(X_test, y_test)

0.9916666666666667

In [None]:
### 使用网格搜索建立适合的模型

In [17]:
# 1. 定义网格参数搜索的数据，整体式一个列表，内部是字典
param_grid = [
    {
        'weights': ['uniform'], 
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)], 
        'p': [i for i in range(1, 6)]
    }
]

In [19]:
# 2.定义实体类 knn_clf,这里不用传任何参数
knn_clf = KNeighborsClassifier()

In [33]:
# 3. 调用创建模型中的 GridSearchCV 方法，knn_clf 与 网格参数数据结合
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf,param_grid)

In [34]:
%time
#4.拟合数据，寻找模型,这个过程非常的漫长
grid_search.fit(X_train,y_train)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.34 µs




GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [42]:
#得到最好的分类器，将最好的分类器给 knn_clf ，方便新数据用分类器进行预测
knn_clf = grid_search.best_estimator_

In [36]:
# 显示最好的成绩
grid_search.best_score_

0.9853862212943633

In [38]:
# 得到最好的参数
grid_search.best_params_

{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}

In [43]:
# 从上面得到最好的分类器后就可以用分类器对 test 中的数据进行测试
knn_clf.score(X_test,y_test)

0.9833333333333333

### 其他参数的使用

In [48]:
# n_jobs：制定在进行模型建立的时候使用 CPU 的核数，值大于1就是准确的核数，值=-1 就是分配适当的核数
#verbose 就是在建模的时候显示过程，值越大信息越详细
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=3, p=3,
                                            weights='distance'),
             iid='warn', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)