## Halving Grid Search CV

#### 1. Libraries

In [22]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import VarianceThreshold

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import multiprocessing

from sklearn import set_config                   #to change the display
from sklearn.utils import estimator_html_repr    #to save the diagram into HTML format
from IPython.core.display import display, HTML   #to visualize pipeline

from sklearn.metrics import mean_squared_log_error, make_scorer

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV


In [2]:
train_df = pd.read_csv('train.csv').set_index('Id')
train_df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
y_train= train_df['SalePrice']

#### 2.Creating Pipeline & Model

In [4]:
# globals

n_threads = multiprocessing.cpu_count()
n_estim = 1000
np.random.seed(123)

#create column transformer

na_transformer = FunctionTransformer(lambda x: x.fillna(np.nan))

select_numeric_features = make_column_selector(dtype_include=np.number)

numeric_pipe = make_pipeline( na_transformer,
                            SimpleImputer(strategy='median',add_indicator=True))

select_oh_features = make_column_selector(dtype_exclude=np.number)

oh_pipe = make_pipeline( na_transformer,
                       SimpleImputer(strategy='constant'),
                       OneHotEncoder(handle_unknown='ignore'))
column_transformer = ColumnTransformer([('numeric_pipe', numeric_pipe, select_numeric_features),
                                       ('ohe_pipe', oh_pipe, select_oh_features)],
                                      n_jobs = n_threads)

#create model

model = CatBoostRegressor(thread_count = n_threads, n_estimators=n_estim, verbose=False)
                    
#create pipeline

pipe = Pipeline([('column transformer', column_transformer),
                ('variance threashold', VarianceThreshold(threshold=0.0)),
                ('model', model)])

In [6]:
set_config(display='diagram')
display(HTML(estimator_html_repr(pipe)))

#### 3. Experimental Controls

In [17]:
np.random.seed(123) #set a global seed
pd.set_option('display.precision',4)

root_mean_squared_log_error = lambda y_true, y_pred: np.sqrt(mean_squared_log_error(y_true, y_pred))
scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)

param_grid = {'model__max_depth': [5,6,7],
             'model__learning_rate':[0.1,0.03,0.06],
             'model__subsample':[.7,.8,.9],
             'model__colsample_bylevel':[.8,.9,1]}
            
grid_search_params = dict(estimator=pipe,
                         param_grid=param_grid,
                         scoring=scorer,
                         cv=3,
                         n_jobs=-1,
                         verbose=2)

#### 4.Tests

* Grid Search CV

In [18]:
%%time
full_results = GridSearchCV(**grid_search_params).fit(train_df, y_train)
pd.DataFrame(full_results.best_params_, index=[0]).assign(RMSLE = -full_results.best_score_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


5 fits failed out of a total of 243.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Prerana\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Prerana\anaconda3\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Prerana\anaconda3\lib\site-packages\catboost\core.py", line 5734, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "C:\Users\Prerana\anaconda3\lib\si

Wall time: 9min 14s


Unnamed: 0,model__colsample_bylevel,model__learning_rate,model__max_depth,model__subsample,RMSLE
0,1,0.03,5,0.9,0.0264


* Halving GridSearch CV with n_samples

In [20]:
%%time
FACTOR = 2
MAX_RESOURCE_DIVISOR = 4

n_samples = len(train_df)
halving_results_n_samples = HalvingGridSearchCV(resource = 'n_samples',
                                               min_resources = n_samples //MAX_RESOURCE_DIVISOR,
                                               factor = FACTOR,
                                               **grid_search_params).fit(train_df, y_train)

n_iterations: 3
n_required_iterations: 7
n_possible_iterations: 3
min_resources_: 365
max_resources_: 1460
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 81
n_resources: 365
Fitting 3 folds for each of 81 candidates, totalling 243 fits
----------
iter: 1
n_candidates: 41
n_resources: 730
Fitting 3 folds for each of 41 candidates, totalling 123 fits
----------
iter: 2
n_candidates: 21
n_resources: 1460
Fitting 3 folds for each of 21 candidates, totalling 63 fits
Wall time: 12min 6s


In [29]:
def find_rank(full_results, halving_results):
    best_halving_results_df = pd.DataFrame(halving_results.best_params, index=[0]).rename(columns = lambda x:'param_'+ x)
    
    return pd.DataFrame(full_results.cv_results).filter(regex='param_|rank_test_score').merge(best_halving_results_df)\
                                                                                       .loc[:,'rank_test_score'].values

def compare_cv_best_params(full_results,*halving_results):
    cv_results =[full_results]+list(halving_results)
    df_list=[]
    
    for cv_result in cv_results:
        best_params_score = pd.DataFrame(cv_result.best_params_,
                                        index=[cv_result])\
                                        .assign(RMSLE=-cv_result.best_score_,
                                               full_grid_search_rank=find_rank(full_results, cv_result) if cv_result != full_results else np.nan)\
                                        .pipe(lambda df: pd.concat([df.iloc[:,-2:],df.iloc[:,:-2]],axis=1))
        df_list.append(best_params_score)
        
        return pd.concat(df_list).reset_index()

In [30]:
compare_cv_best_params(full_results, *[halving_results_n_samples])\
 .style.applymap(lambda cell: 'background: pink' if cell == 9 else '')

Unnamed: 0,index,RMSLE,full_grid_search_rank,model__colsample_bylevel,model__learning_rate,model__max_depth,model__subsample
0,"GridSearchCV(cv=3,  estimator=Pipeline(steps=[('column transformer',  ColumnTransformer(n_jobs=12,  transformers=[('numeric_pipe',  Pipeline(steps=[('functiontransformer',  FunctionTransformer(func= at 0x000001CF33C23AF0>)),  ('simpleimputer',  SimpleImputer(add_indicator=True,  strategy='median'))]),  )])),  ('variance threashold',  VarianceThreshold()),  ('model',  )]),  n_jobs=-1,  param_grid={'model__colsample_bylevel': [0.8, 0.9, 1],  'model__learning_rate': [0.1, 0.03, 0.06],  'model__max_depth': [5, 6, 7],  'model__subsample': [0.7, 0.8, 0.9]},  scoring=make_scorer(, greater_is_better=False), verbose=2)",0.026444,,1,0.03,5,0.9


* Halving GridSearchCV with n_estimators

In [24]:
%%time

halving_results_n_estimator = HalvingGridSearchCV(resource='model__n_estimators',
                                                 max_resources=1000,
                                                 min_resources=1000//MAX_RESOURCE_DIVISOR,
                                                 factor=FACTOR,
                                                 **grid_search_params).fit(train_df,y_train)

n_iterations: 3
n_required_iterations: 7
n_possible_iterations: 3
min_resources_: 250
max_resources_: 1000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 81
n_resources: 250
Fitting 3 folds for each of 81 candidates, totalling 243 fits
----------
iter: 1
n_candidates: 41
n_resources: 500
Fitting 3 folds for each of 41 candidates, totalling 123 fits
----------
iter: 2
n_candidates: 21
n_resources: 1000
Fitting 3 folds for each of 21 candidates, totalling 63 fits
Wall time: 6min 12s


In [32]:
compare_cv_best_params(full_results, *[halving_results_n_samples,
                                      halving_results_n_estimator]).style.apply(lambda row : row.apply(lambda col: 'background:lightgreen' if row.name ==2 else ''),axis=1)

Unnamed: 0,index,RMSLE,full_grid_search_rank,model__colsample_bylevel,model__learning_rate,model__max_depth,model__subsample
0,"GridSearchCV(cv=3,  estimator=Pipeline(steps=[('column transformer',  ColumnTransformer(n_jobs=12,  transformers=[('numeric_pipe',  Pipeline(steps=[('functiontransformer',  FunctionTransformer(func= at 0x000001CF33C23AF0>)),  ('simpleimputer',  SimpleImputer(add_indicator=True,  strategy='median'))]),  )])),  ('variance threashold',  VarianceThreshold()),  ('model',  )]),  n_jobs=-1,  param_grid={'model__colsample_bylevel': [0.8, 0.9, 1],  'model__learning_rate': [0.1, 0.03, 0.06],  'model__max_depth': [5, 6, 7],  'model__subsample': [0.7, 0.8, 0.9]},  scoring=make_scorer(, greater_is_better=False), verbose=2)",0.026444,,1,0.03,5,0.9


In [33]:
#end