In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection    import train_test_split, ParameterSampler, RandomizedSearchCV
from sklearn.base               import TransformerMixin, BaseEstimator, clone
from sklearn.ensemble           import *
from sklearn.metrics            import mean_squared_error, make_scorer
from sklearn.pipeline           import Pipeline, make_pipeline
from sklearn.preprocessing      import *
from sklearn.impute             import *
from sklearn.compose            import *
from sklearn.feature_selection  import SelectFromModel
from sklearn.decomposition      import PCA
from sklearn.linear_model       import Lasso, Ridge
from sklearn                    import tree

from xgboost                    import XGBRegressor
from lightgbm                   import LGBMRegressor

%run helper

In [24]:
data = pd.read_csv('concrete_data.csv')

In [25]:
y = data.loc[:, '28-d']
X = data.drop('28-d', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
search_space = [# Lasso
                {'regressor': [Lasso()],
                 'regressor__alpha': np.linspace(1, 10, 10), # strength of regulation, when alpha = 0, lasso degrade to regular linear regression
                 'regressor__max_iter': [1000, 10000, 100000], # iteration to find the minimum point
                },
                # Ridge
                {'regressor': [Ridge()],
                 'regressor__alpha': np.linspace(0, 1, 10), # strength of regulation, when alpha = 0, lasso degrade to regular linear regression
                 'regressor__max_iter': [1000, 10000, 100000], # iteration to find the minimum point
                },
                # RandomForestRegressor
                {
                 'regressor': [RandomForestRegressor(n_jobs=-1)],
                 'regressor__n_estimators': [50, 70, 100, 150, 200, 300], # due to the bootstrapping, one tree only see a subset of data, thus we need to try multiple trees
#                  'regressor__max_depth': [6,9, None], # set the max_depth to prevent the random forest to be too deep to prevent overfitting
                 'regressor__min_samples_leaf': [1,2,3,4,5] # also prevent the random forest goes too specific and prevent over-fitting
                },
                #ExtraTreesRegressor is a modfied version of random forest. Instead of bootstrapping, it uses all data and randomly select one split
                {
                 'regressor': [ExtraTreesRegressor(n_jobs=-1)],
                 'regressor__n_estimators': [50, 70, 100, 150, 200, 300],
#                  'regressor__max_depth': [6,9, None],
                 'regressor__min_samples_leaf': [1,2,3,4,5],
                },
                #XGBRegressor 
                {
                 'regressor': [XGBRegressor(booster='gbtree', n_jobs=-1)],
                 'regressor__n_estimators': [50, 70, 100, 150, 200, 300],
                 'regressor__learning_rate': np.logspace(-2, 0, 10),
                 'regressor__max_depth': np.arange(1, 5, 1),
#                  'regressor__reg_lambda': np.linspace(1, 10, 10),
#                  'regressor__gamma': np.linspace(0.5, 0.9, 5),
                 },
                #LGBMRegressor
                {
                 'regressor': [LGBMRegressor(boosting_type='gbdt', n_jobs=-1)],
                 'regressor__n_estimators': [50, 70, 100, 150, 200, 300],
                 'regressor__learning_rate': np.logspace(-2, 0, 10),
                 'regressor__max_depth': np.arange(1, 5, 1),
                 }]

In [27]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [30]:
pipe = Pipeline([
                    ('ss', StandardScaler()),# for linear regressions, we need standarize the data at first
                    ('regressor', DummyEstimator())
                        
                ])

regressor_rand_cv = RandomizedSearchCV(estimator=pipe, 
                                  param_distributions=search_space, 
                                  n_iter=80, 
                                  cv=5, 
                                  n_jobs=-1,
                                  verbose=False,
                                  scoring='neg_root_mean_squared_error',
                                )

regressor_rand_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ss', StandardScaler()),
                                             ('regressor', DummyEstimator())]),
                   n_iter=80, n_jobs=-1,
                   param_distributions=[{'regressor': [Lasso()],
                                         'regressor__alpha': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
                                         'regressor__max_iter': [1000, 10000,
                                                                 100000]},
                                        {'regressor': [Ridge()],
                                         'regressor__alpha': array([0.        , 0.11111111, 0.22222222, 0.3...
                                         'regressor__n_estimators': [50, 70,
                                                                     100, 150,
                                                                     200,
                                       

In [31]:
pd.DataFrame(regressor_rand_cv.cv_results_)[:50]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__n_estimators,param_regressor__max_depth,param_regressor__learning_rate,param_regressor,param_regressor__min_samples_leaf,param_regressor__max_iter,param_regressor__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.629641,0.081181,0.012335,0.004234,300.0,4.0,0.077426,"XGBRegressor(base_score=0.5, booster='gbtree',...",,,,"{'regressor__n_estimators': 300, 'regressor__m...",-6.434915,-7.09139,-6.526013,-7.165766,-7.051043,-6.853825,0.308411,1
1,5.799708,0.051925,0.010022,0.000539,200.0,4.0,1.0,"XGBRegressor(base_score=0.5, booster='gbtree',...",,,,"{'regressor__n_estimators': 200, 'regressor__m...",-10.31659,-9.890655,-9.16224,-8.399667,-9.02523,-9.358876,0.673806,57
2,0.271769,0.06315,0.022418,0.002238,70.0,,,"ExtraTreesRegressor(min_samples_leaf=4, n_esti...",1.0,,,"{'regressor__n_estimators': 70, 'regressor__mi...",-6.783579,-7.857286,-6.127648,-7.567749,-7.038884,-7.075029,0.606379,5
3,1.857061,0.129576,0.006295,0.000187,50.0,3.0,0.077426,LGBMRegressor(),,,,"{'regressor__n_estimators': 50, 'regressor__ma...",-8.785002,-8.437821,-8.063939,-9.193323,-8.443066,-8.58463,0.38036,38
4,0.668594,0.045232,0.008997,0.000844,50.0,2.0,1.0,"XGBRegressor(base_score=0.5, booster='gbtree',...",,,,"{'regressor__n_estimators': 50, 'regressor__ma...",-8.972059,-9.140576,-9.866526,-8.143125,-8.497436,-8.923945,0.588097,47
5,1.302483,0.013612,0.008302,0.000273,150.0,1.0,0.215443,"XGBRegressor(base_score=0.5, booster='gbtree',...",,,,"{'regressor__n_estimators': 150, 'regressor__m...",-9.201786,-8.754619,-8.485319,-9.850652,-8.683629,-8.995201,0.487731,49
6,3.409773,0.038604,0.007298,0.000378,300.0,1.0,0.046416,LGBMRegressor(),,,,"{'regressor__n_estimators': 300, 'regressor__m...",-9.980932,-9.53206,-9.322609,-11.282697,-9.646305,-9.952921,0.698215,64
7,9.26814,0.081317,0.00766,0.000825,300.0,3.0,1.0,LGBMRegressor(),,,,"{'regressor__n_estimators': 300, 'regressor__m...",-7.762328,-9.525189,-9.383408,-8.632085,-8.558528,-8.772308,0.636505,45
8,3.562829,0.111924,0.006579,0.000132,70.0,4.0,1.0,LGBMRegressor(),,,,"{'regressor__n_estimators': 70, 'regressor__ma...",-8.164568,-9.159433,-8.997421,-8.513598,-8.843569,-8.735718,0.356428,41
9,0.372383,0.041038,0.057724,0.003153,200.0,,,"ExtraTreesRegressor(min_samples_leaf=4, n_esti...",4.0,,,"{'regressor__n_estimators': 200, 'regressor__m...",-7.583295,-8.15662,-6.611348,-8.061533,-7.659038,-7.614367,0.548271,24


In [32]:
regressor_rand_cv.best_estimator_

Pipeline(steps=[('ss', StandardScaler()),
                ('regressor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0, gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='',
                              learning_rate=0.0774263682681127,
                              max_delta_step=0, max_depth=4, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=300, n_jobs=-1, num_parallel_tree=1,
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])

In [None]:
fig, ax = plt.subplots(1, figsize=(7,6))
ax.boxplot(best_score_coefvar_concat)
# ax.set_xlabel('y_pred')
ax.set_ylabel('coefficient_of_variation (%)')
ax.set_xticklabels(['Lasso', 'Random Forest', 'ExtraTreesRegressor'])
plt.show()