In [1]:
import numpy as np, humanfriendly as hf, random
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier

In [2]:
def get_scores(model, xtrain, ytrain, xtest, ytest):
    """Score a fitted model on its training and test data.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    xtrain, ytrain : training features and labels.
    xtest, ytest : held-out features and labels.

    Returns
    -------
    tuple
        ``(class name of model, train score, test score)``, where the
        scores are whatever ``model.score`` returns.
    """
    train = model.score(xtrain, ytrain)
    # Score against the ytest argument, not a same-named notebook global.
    test = model.score(xtest, ytest)
    name = model.__class__.__name__
    return (name, train, test)

In [3]:
def get_cross(model, data, target, groups=10):
    """Cross-validate ``model`` on ``data``/``target``.

    ``groups`` is forwarded as the ``cv`` argument of ``cross_val_score``;
    the value returned by ``cross_val_score`` is handed back unchanged.
    """
    fold_scores = cross_val_score(model, data, target, cv=groups)
    return fold_scores

In [4]:
def prep_data(data, target):
    """Pair each item of ``data`` with the matching item of ``target``.

    Parameters
    ----------
    data, target : iterables of equal length (lists, arrays, ...).

    Returns
    -------
    list of tuple
        ``[(data[0], target[0]), (data[1], target[1]), ...]``. If the
        lengths differ, the result stops at the shorter input.
    """
    # zip walks both inputs in lockstep; no index-based copies are needed.
    return list(zip(data, target))

In [5]:
def create_sample(d, n, replace='yes'):
    """Draw a random sample of ``n`` (features, label) pairs from ``d``.

    Parameters
    ----------
    d : sequence of ``(x, y)`` pairs, e.g. the output of ``prep_data``.
    n : number of pairs to draw.
    replace : ``'yes'`` (or ``True``) samples with replacement, so the same
        pair may be drawn more than once and ``n`` may exceed ``len(d)``.
        Any other value samples without replacement.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The sampled features and the sampled labels, in matching order.

    Raises
    ------
    ValueError
        From ``random.sample`` when sampling without replacement and
        ``n > len(d)``.
    """
    if replace == 'yes' or replace is True:
        # random.choices draws independently, i.e. with replacement.
        s = random.choices(d, k=n)
    else:
        # random.sample never returns the same element twice.
        s = random.sample(d, n)
    Xs = [row[0] for row in s]
    ys = [row[1] for row in s]
    return np.array(Xs), np.array(ys)

In [6]:
def see_time(note,start):
    """Print ``note`` followed by the time elapsed since ``start``.

    ``start`` is an earlier ``time.perf_counter()`` reading; the difference
    is formatted with ``humanfriendly.format_timespan(..., detailed=True)``.
    """
    elapsed = time.perf_counter() - start
    readable = hf.format_timespan(elapsed, detailed=True)
    print (note, readable)

In [7]:
# Base paths (without the '.npy' extension) of the saved feature and label arrays.
X_file = 'data/X_mnist'
y_file = 'data/y_mnist'
# Build the file names from the variables above so the paths are defined once.
X = np.load(X_file + '.npy')
y = np.load(y_file + '.npy')

In [8]:
# Seed the stdlib RNG used by create_sample so the sample (and therefore the
# tuning results below) is reproducible across runs.
random.seed(0)
X = X.astype(np.float32)
data = prep_data(X, y)
sample_size = 7000
# Tune on a small random subset to keep the hyperparameter search fast.
Xs, ys = create_sample(data, sample_size)
Xs

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [9]:
# Find the best parameters for the random forest on the sample.
rf_model = RandomForestClassifier(random_state=0,n_estimators=100)
params = {'class_weight': ['balanced'],'max_depth': [10, 30]}
# Named rf_search rather than `random`: rebinding `random` would shadow the
# stdlib module imported in the first cell and break create_sample on re-run.
rf_search = RandomizedSearchCV(rf_model, param_distributions = params,cv=3, n_iter=2, random_state=0)
start = time.perf_counter()
rf_search.fit(Xs, ys)
see_time('RandomizedSearchCV total tuning time:',start)
bp = rf_search.best_params_
print (bp)

RandomizedSearchCV total tuning time: 17 seconds, 609 milliseconds, 806 microseconds and 800 nanoseconds
{'max_depth': 30, 'class_weight': 'balanced'}


In [10]:
# Train with the best parameters on the train split of the full data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
rf = RandomForestClassifier(**bp, random_state=0,n_estimators=100)
start = time.perf_counter()
rf.fit(X_train, y_train)
rf_scores = get_scores(rf, X_train, y_train,X_test, y_test)
# The reported time covers both fitting and scoring.
see_time('total time:',start)
# Pass the name as its own argument so print separates it from the label.
print (rf_scores[0], 'train:',rf_scores[1], 'test_score:',rf_scores[2])

total time: 42 seconds, 786 milliseconds, 835 microseconds and 300 nanoseconds
RandomForestClassifiertrain: 0.9999809523809524 test_score: 0.9701142857142857


In [11]:
# Find the best parameters for the extra-trees classifier on the sample.
et = ExtraTreesClassifier(random_state=0, n_estimators=200)
params = {'class_weight': ['balanced'],'max_depth': [10, 30]}
# Named et_search rather than `random`: rebinding `random` would shadow the
# stdlib module imported in the first cell and break create_sample on re-run.
et_search = RandomizedSearchCV(et, param_distributions = params,cv=3, n_iter=2, random_state=0)
start = time.perf_counter()
et_search.fit(Xs, ys)
see_time('RandomizedSearchCV total tuning time:',start)
bp = et_search.best_params_
print (bp)

RandomizedSearchCV total tuning time: 28 seconds, 394 milliseconds, 547 microseconds and 600 nanoseconds
{'max_depth': 30, 'class_weight': 'balanced'}


In [None]:
# Train the extra-trees classifier with the best parameters on the train split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
et = ExtraTreesClassifier(**bp, random_state=0,n_estimators=200)
start = time.perf_counter()
et.fit(X_train, y_train)
et_scores = get_scores(et, X_train, y_train,X_test, y_test)
# The reported time covers both fitting and scoring.
see_time('total time:',start)
# Pass the name as its own argument so print separates it from the label.
print (et_scores[0], 'train:',et_scores[1],'test_score:',et_scores[2])

In [None]:
# Persist the best parameters and read them back.
file = 'data/bp_mnist_et'
# np.save appends '.npy' to the name; the dict is stored as a pickled
# 0-d object array.
np.save(file, bp)
# allow_pickle=True is needed to read object arrays; unpickling can execute
# arbitrary code, so only load files this notebook wrote itself.
# .item() unwraps the 0-d array back into the original dict.
bp = np.load(file + '.npy', allow_pickle=True).item()
print ('best parameters:',bp)
