In [2]:
import numpy as np, humanfriendly as hf, warnings, sys
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error

In [3]:
def get_error(model, Xtest, ytest):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(Xtest)``.
    Xtest : inputs handed to ``model.predict``.
    ytest : true targets the predictions are compared against.

    Returns
    -------
    tuple
        ``(rmse, class_name)``: the square root of the mean squared error
        between ``ytest`` and the predictions, and the model's class name.
    """
    predictions = model.predict(Xtest)
    mse = mean_squared_error(ytest, predictions)
    root_mse = np.sqrt(mse)
    return root_mse, model.__class__.__name__

In [4]:
def see_time(note, start):
    """Print ``note`` followed by the time elapsed since ``start``.

    ``start`` is an earlier ``time.perf_counter()`` reading. The elapsed span
    is rendered as text by ``humanfriendly.format_timespan`` with
    ``detailed=True``. Nothing is returned.
    """
    seconds_taken = time.perf_counter() - start
    readable = hf.format_timespan(seconds_taken, detailed=True)
    print(note, readable)

In [5]:
def get_cross(model, data, target, groups=10):
    """Cross-validate ``model`` on ``data``/``target`` and return the scores.

    ``groups`` is forwarded to ``cross_val_score`` as ``cv``. Scoring uses
    'neg_mean_squared_error', so the returned values are negated MSEs.
    """
    fold_scores = cross_val_score(
        model,
        data,
        target,
        cv=groups,
        scoring='neg_mean_squared_error',
    )
    return fold_scores

In [6]:
# Features and target are read from pre-saved NumPy array files.
X, y = np.load('data/X_boston.npy'), np.load('data/y_boston.npy')
# No test_size is given, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [7]:
# Baseline: gradient boosting with library-default hyperparameters
# (fit returns the estimator itself, so construction and fitting chain).
gbr_model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
# Score the baseline on the held-out split.
rmse, name = get_error(gbr_model, X_test, y_test)
print(name + '(rmse):', rmse)

GradientBoostingRegressor(rmse): 3.1941117128039194


In [8]:
# Hyperparameter grid for the gradient-boosting search.
# 'squared_error' and 'absolute_error' are the current names of the losses
# formerly spelled 'ls' and 'lad'; the old aliases were deprecated in
# scikit-learn 1.0 and removed in 1.2, so they fail on current releases.
loss = ['squared_error', 'absolute_error', 'huber']
lr = [1e-2, 1e-1, 1e-0]
n_est = [150, 200, 300, 500]
alpha = [0.9]  # per the scikit-learn docs, only used by the 'huber'/'quantile' losses
params = {
    'loss': loss,
    'learning_rate': lr,
    'n_estimators': n_est,
    'alpha': alpha,
}
# refit=False: only the winning parameter set is needed here; a fresh model
# is built from best_params_ afterwards.
grid = GridSearchCV(gbr_model, params, cv=5, n_jobs=-1, verbose=1, refit=False)
start = time.perf_counter()
grid.fit(X_train, y_train)
see_time('training time:', start)
bp = grid.best_params_
print(bp)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.8s


training time: 18 seconds, 82 milliseconds, 626 microseconds and 200 nanoseconds
{'alpha': 0.9, 'learning_rate': 0.1, 'loss': 'huber', 'n_estimators': 300}


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   18.0s finished


In [10]:
# Rebuild the regressor from the grid-search winner (bp) and fit it on the
# training split; fit returns the estimator itself.
gbr_bp_model = GradientBoostingRegressor(random_state=0, **bp).fit(X_train, y_train)
# Score the tuned model on the held-out split.
rmse, name = get_error(gbr_bp_model, X_test, y_test)
print(name + '(rmse):', rmse)

GradientBoostingRegressor(rmse): 3.0839764165411934


In [12]:
# Cross-validate the tuned model on the full data set (get_cross uses its
# default number of folds) and time the run.
start = time.perf_counter()
scores = get_cross(gbr_bp_model, X, y)
# see_time prints an elapsed duration, so label that line as a time rather
# than as an RMSE.
see_time('cross-validation time:', start)
# scores are negated MSEs: flip the sign of their mean before taking the root.
rmse = np.sqrt(-np.mean(scores))
print('cross-validation rmse:', rmse)

cross-validation rmse: 6 seconds, 313 milliseconds, 944 microseconds and 1000 nanoseconds
3.7929403445012064
