In [1]:
import numpy as np, humanfriendly as hf
import time
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
# 只要把参数输进去，就能给出最优化的结果和参数。但是这个方法适合于小数据集
from sklearn.model_selection import RandomizedSearchCV
# 超参数优化也就是常说的调参，python-sklearn里常用的有GridSearchCV和RandomizedSearchCV可以用。
# 其中GridSearchCV的原理很简明，就是程序去挨个尝试每一组超参数，然后选取最好的那一组。
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
def get_scores(model, Xtrain, ytrain, Xtest, ytest):
    y_pred = model.predict(Xtrain)
    train = accuracy_score(ytrain, y_pred)
    y_pred = model.predict(Xtest)
    test = accuracy_score(ytest, y_pred)
    return train, test, model.__class__.__name__

In [3]:
def get_cross(model, data, target, groups=10):
    return cross_val_score(model, data, target, cv=groups)

In [4]:
def see_time(note,start):
    end = time.perf_counter()
    elapsed = end - start
    print (note,hf.format_timespan(elapsed, detailed=True))

In [5]:
iris = load_iris()
X = iris.data
y = iris.target
targets = iris.target_names
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
knn_model = KNeighborsClassifier()

In [7]:
# 定义调节参数，均以列表存储，然后打包成字典
distances = [1, 2, 3, 4, 5]
k_range = list(range(1, 31))
leaf = [10]
param_grid = dict(n_neighbors=k_range, p=distances,leaf_size=leaf)

In [8]:
start = time.perf_counter()
grid = GridSearchCV(knn_model, param_grid, cv=10,scoring='accuracy')
grid.fit(X, y)
see_time('GridSearchCV total tuning time:',start=start)
bp = grid.best_params_# 找到最好参数
print ('best parameters:',bp)

GridSearchCV total tuning time: 2 seconds, 890 milliseconds, 904 microseconds and 300 nanoseconds
best parameters: {'leaf_size': 10, 'n_neighbors': 6, 'p': 3}


In [9]:
knn_best_model = KNeighborsClassifier(**bp).fit(X_train, y_train)# 用最好的参数学习
train_score, test_score, name = get_scores(knn_best_model, X_train, y_train,X_test, y_test)

In [10]:
print (name, 'train/test scores (GridSearchCV):')
print(train_score, test_score)

KNeighborsClassifier train/test scores (GridSearchCV):
0.9732142857142857 0.9736842105263158


In [11]:
scores = get_cross(knn_best_model, X, y)
print('cross-val scores:',scores)
print('avg cross-val score:', np.mean(scores))

cross-val scores: [1.         0.93333333 1.         1.         0.93333333 1.
 0.93333333 1.         1.         1.        ]
avg cross-val score: 0.9800000000000001


In [12]:
d = grid.cv_results_
print(d.keys())
print ('mean grid score:', np.mean(d['mean_test_score']))

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_leaf_size', 'param_n_neighbors', 'param_p', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
mean grid score: 0.9673333333333333


In [15]:
vector = [[3, 5, 4, 2]]
vectors = [[2, 5, 3, 5], [1, 4, 2, 1]]
y_pred = knn_best_model.predict(vector)
print (targets[y_pred])

['versicolor']


In [17]:
y_preds = knn_best_model.predict(vectors)
print (targets[y_preds])

['versicolor' 'setosa']


In [18]:
# 随机搜索
start = time.perf_counter()
randS_model = RandomizedSearchCV(knn_model, param_grid, cv=10,random_state=0,scoring='accuracy', n_iter=10)
randS_model.fit(X, y)
see_time('RandomizedSearchCV total tuning time:',start=start)

RandomizedSearchCV total tuning time: 195 milliseconds, 257 microseconds and 400 nanoseconds


In [19]:
bp = randS_model.best_params_
print ('best parameters:',bp)
knn_best = KNeighborsClassifier(**bp).fit(X_train, y_train)
train, test, name = get_scores(knn_best, X_train, y_train,X_test, y_test)
print (name, 'train/test scores (RandomizedSearchCV):')
print (train, test)

best parameters: {'p': 3, 'n_neighbors': 13, 'leaf_size': 10}
KNeighborsClassifier train/test scores (RandomizedSearchCV):
0.9642857142857143 0.9736842105263158
