In [1]:
import numpy as np, humanfriendly as hf, random
import time
from sklearn.model_selection import train_test_split,RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def get_scores(model, xtrain, ytrain, xtest, ytest):
    """Score a fitted model on the training and test splits.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    xtrain, ytrain : training features and labels.
    xtest, ytest : test features and labels.

    Returns
    -------
    tuple
        ``(name, train, test)`` where ``name`` is the model's class name and
        ``train`` / ``test`` are the values returned by ``model.score`` on the
        respective split.
    """
    train = model.score(xtrain, ytrain)
    # Score against the ``ytest`` argument, not a notebook-level ``y_test``
    # global, so the function works for any split passed in.
    test = model.score(xtest, ytest)
    name = model.__class__.__name__
    return (name, train, test)

In [7]:
def get_cross(model, data, target, groups=10):
    """Cross-validate ``model`` on ``data`` / ``target``.

    ``groups`` is forwarded to ``cross_val_score`` as its ``cv`` argument.
    Returns whatever ``cross_val_score`` returns (one score per fold).
    """
    fold_scores = cross_val_score(estimator=model, X=data, y=target, cv=groups)
    return fold_scores


In [8]:
def prep_data(data, target):
    """Pair every sample with its label.

    Both arguments are indexed positionally (``data[0]``, ``data[1]``, ...).
    The result is a list of ``(sample, label)`` tuples, truncated to the
    shorter of the two inputs.
    """
    samples = [data[idx] for idx in range(len(data))]
    labels = [target[idx] for idx in range(len(target))]
    paired = []
    for sample, label in zip(samples, labels):
        paired.append((sample, label))
    return paired

In [9]:
def create_sample(data, samples, replace='yes'):
    """Draw a random sample of ``(features, label)`` pairs and split it into arrays.

    Parameters
    ----------
    data : sequence of ``(features, label)`` pairs, e.g. the list built by
        ``prep_data``.
    samples : int
        Number of pairs to draw.
    replace : str, default 'yes'
        ``'yes'`` samples with replacement (a pair may be drawn more than
        once); any other value samples without replacement.

    Returns
    -------
    (Xs, ys) : tuple of numpy.ndarray
        The features and the labels of the drawn pairs, in draw order.

    Raises
    ------
    ValueError
        From ``random.sample`` when sampling without replacement and
        ``samples`` exceeds ``len(data)``.
    IndexError
        From ``random.choices`` when sampling with replacement from empty
        ``data`` with ``samples > 0``.
    """
    if replace == 'yes':
        # With replacement: each draw is independent.
        s = random.choices(data, k=samples)
    else:
        # Without replacement: every pair appears at most once.
        s = random.sample(data, samples)
    # s is the collection of (X, y) pairs; split it into parallel lists.
    Xs = [row[0] for row in s]
    ys = [row[1] for row in s]
    return np.array(Xs), np.array(ys)

In [19]:
def see_time(note,start):
    """Print ``note`` followed by the time elapsed since ``start``.

    ``start`` is a ``time.perf_counter()`` reading taken earlier; the
    difference to the current reading is formatted with
    ``humanfriendly.format_timespan(..., detailed=True)``.
    """
    duration = time.perf_counter() - start
    print(note, hf.format_timespan(duration, detailed=True))

In [11]:
# Load the arrays later used as features (X) and labels (y) for model fitting.
X = np.load('data/X_bank.npy')
# np.load rejects pickled object data by default, so allow_pickle=True has to
# be passed for this file. Unpickling can execute arbitrary code: only load
# .npy files that come from a trusted source.
y = np.load('data/y_bank.npy', allow_pickle=True)


In [12]:
# Seed Python's RNG so create_sample draws the same rows on every run;
# otherwise the sample (and every score computed from it) changes per run.
random.seed(0)
sample_size = 4000
# Pair each feature row with its label, then draw a sample of those pairs.
data = prep_data(X, y)
Xs, ys = create_sample(data, sample_size, replace='no')
# Split the sample into train/test parts (train_test_split defaults) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, random_state=0)

In [16]:
# Baseline: KNN with default hyper-parameters, fitted on the training split.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_model_scores = get_scores(knn_model, X_train, y_train, X_test, y_test)
# knn_model_scores is (class name, train score, test score).
print(f"{knn_model_scores[0]} (train, test):", *knn_model_scores[1:])

KNeighborsClassifier (train, test): 0.9303333333333333 0.922


In [17]:
# Search space for KNN tuning: odd neighbour counts 1, 3, ..., 29 and the
# Euclidean metric only.
param_grid = {
    'n_neighbors': np.arange(1, 31, 2),
    'metric': ['euclidean'],
}

In [21]:
# Randomized hyper-parameter search over param_grid with 3-fold CV on all
# available cores; the wall-clock time is reported through see_time.
# NOTE(review): the search is fitted on the full X, y, which also contains the
# rows sampled into X_test — confirm that tuning on the full data is intended.
start = time.perf_counter()
randS_model = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_grid,
    cv=3,
    n_jobs=-1,
    random_state=0,
    verbose=2,
)
randS_model.fit(X, y)
see_time('RandomizedSearchCV total tuning time:', start)
# Keep the winning parameter combination for the tuned model below.
bp = randS_model.best_params_
print(bp)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   30.3s finished


RandomizedSearchCV total tuning time: 31 seconds, 369 milliseconds, 824 microseconds and 1000 nanoseconds
{'n_neighbors': 29, 'metric': 'euclidean'}


In [23]:
# Persist the best parameters found by the search (np.save appends '.npy'
# to the file name when it is missing).
file = 'data/bp_bank'
np.save(file, bp)
# Refit KNN with the tuned parameters on the training split and report scores.
knn_tuned_model = KNeighborsClassifier(**bp)
knn_tuned_model.fit(X_train, y_train)
knn_tuned_model_scores = get_scores(knn_tuned_model, X_train, y_train, X_test, y_test)
print(f"{knn_tuned_model_scores[0]} (train, test):", *knn_tuned_model_scores[1:])

KNeighborsClassifier (train, test): 0.9146666666666666 0.917


In [24]:
# Cross-validate a default KNN on the sampled data (get_cross defaults to
# groups=10) and report the mean of the per-fold scores.
print('cross-validation score:')
knn = KNeighborsClassifier()
scores = get_cross(knn, Xs, ys)
print(scores.mean())

cross-validation score:
0.908
