In [1]:
from zoish.feature_selectors.optunashap import OptunaShapFeatureSelector
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score
    )
from zoish.utils.helper_funcs import catboost
import matplotlib.pyplot as plt
import optuna
import logging
from sklearn.model_selection import StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import lightgbm

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
[32m[I 2022-09-07 06:26:37,568][0m A new study created in memory with name: no-name-74e0506b-1c56-48a2-a238-8d16a48ded29[0m


<Figure size 432x288 with 0 Axes>

# Example 1 :Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# column names
col_names=[
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47


# Train test split

In [3]:
X = data.loc[:, data.columns != "PRP"]
y = data.loc[:, data.columns == "PRP"]


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


# Find feature types for later use

In [4]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


#  Define Feature selector and set its arguments  

In [5]:
optuna_regression_xgb = OptunaShapFeatureSelector(
        # general argument setting        
        verbose=1,
        random_state=0,
        logging_basicConfig = None,
        # general argument setting        
        n_features=4,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
        # shap argument setting        
        estimator=xgboost.XGBRegressor(),
        estimator_params={
        "max_depth": [4, 5],
         "min_child_weight": [0.1, 0.9],
         "gamma": [1, 9],
        },
        # shap arguments
        model_output="raw", 
        feature_perturbation="interventional", 
        algorithm="auto", 
        shap_n_jobs=-1, 
        memory_tolerance=-1, 
        feature_names=None, 
        approximate=False, 
        shortcut=False, 
        plot_shap_summary=False,
        save_shap_summary_plot=False,
        path_to_save_plot = './summary_plot.png',
        shap_fig = plt.figure(),
        ## optuna params
        test_size=0.33,
        with_stratified = False,
        performance_metric = 'r2',
        # optuna study init params
        study = optuna.create_study(
            storage = None,
            sampler = TPESampler(),
            pruner= HyperbandPruner(),
            study_name  = None,
            direction = "maximize",
            load_if_exists = False,
            directions  = None,
        ),
        # optuna optimization params
        study_optimize_objective = None,
        study_optimize_objective_n_trials=10, 
        study_optimize_objective_timeout=600,
        study_optimize_n_jobs = -1,
        study_optimize_catch= (),
        study_optimize_callbacks = None,
        study_optimize_gc_after_trial = False,
        study_optimize_show_progress_bar=False,

)




[32m[I 2022-09-07 06:26:38,020][0m A new study created in memory with name: no-name-26b98147-ae09-40cc-b3f8-674b96734f73[0m
root - INFO - Setting value for logging_basicConfig
root - INFO - Setting value for verbose
root - INFO - Setting value for xgbse_focus
root - INFO - Setting value for random_state
root - INFO - Setting value for n_features
root - INFO - Setting value for list_of_obligatory_features_that_must_be_in_model
root - INFO - Setting value for list of features to drop before any selection
root - INFO - Setting value for estimator
root - INFO - Getting value for estimator
root - INFO - XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=None,
             gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=N

<Figure size 432x288 with 0 Axes>

# Build sklearn Pipeline

In [6]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # feature selection
            ('SFC_CATREG_OPTUNA', optuna_regression_xgb),
            # add any regression model from sklearn e.g., LinearRegression
            ('regression', LinearRegression())


 ])



# Run Pipeline

In [7]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


root - INFO - Getting value for estimator
root - INFO - Getting value for performance metric
root - INFO - Getting value for estimator_params
root - INFO - Getting value for verbose
root - INFO - Getting value for test_size
root - INFO - Getting value for random_state
root - INFO - Getting value for study
root - INFO - Getting value for study optimize objective
root - INFO - Getting value for study_optimize_objective_n_trials
root - INFO - Getting value for study_optimize_objective_timeout
root - INFO - Getting value for study_optimize_n_jobs
root - INFO - Getting value for study_optimize_catch
root - INFO - Getting value for study_optimize_callbacks
root - INFO - Getting value for study_optimize_gc_after_trial
root - INFO - Getting value for study_optimize_show_progress_bar
root - INFO - Getting value for with_stratified
root - INFO - Getting value for xgbse_focus


[0]	validation-rmse:77.40249
[0]	validation-rmse:77.40249
[1]	validation-rmse:62.34344
[1]	validation-rmse:62.34344
[0]	validation-rmse:77.40249
[2]	validation-rmse:53.79209
[1]	validation-rmse:62.34344[2]	validation-rmse:53.79209

[0]	validation-rmse:77.40249
[3]	validation-rmse:47.81294
[0]	validation-rmse:77.40249
[0]	validation-rmse:77.40249


`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.
The distribution is specified by [0.1, 0.9] and step=1, but the range is not divisible by `step`. It will be replaced by [0.1, 0.1].


[2]	validation-rmse:53.79209[1]	validation-rmse:62.34344

[4]	validation-rmse:44.03545
[0]	validation-rmse:77.40249
[1]	validation-rmse:62.34344
[3]	validation-rmse:47.81294
[1]	validation-rmse:62.34344
[3]	validation-rmse:47.81294
[0]	validation-rmse:77.40249
[2]	validation-rmse:53.79209
[5]	validation-rmse:42.37902[2]	validation-rmse:53.79209

[4]	validation-rmse:44.03545[2]	validation-rmse:53.79209

[1]	validation-rmse:62.34344[4]	validation-rmse:44.03545

[1]	validation-rmse:62.34344
[3]	validation-rmse:47.81294
[6]	validation-rmse:41.28446
[3]	validation-rmse:47.81294
[3]	validation-rmse:47.81294[5]	validation-rmse:42.37833

[2]	validation-rmse:53.79209
[5]	validation-rmse:42.37833
[2]	validation-rmse:53.79209
[4]	validation-rmse:44.03545
[7]	validation-rmse:40.94810[4]	validation-rmse:44.03545

[3]	validation-rmse:47.84469
[3]	validation-rmse:47.81294
[6]	validation-rmse:41.27909[6]	validation-rmse:41.27909

[5]	validation-rmse:42.37833
[4]	validation-rmse:44.03545
[5]	validation

[32m[I 2022-09-07 06:26:38,316][0m Trial 7 finished with value: 0.6831619768299979 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 2.4069389046795555}. Best is trial 7 with value: 0.6831619768299979.[0m


[7]	validation-rmse:40.94330
[9]	validation-rmse:40.42190[9]	validation-rmse:40.42190

[7]	validation-rmse:40.94330
[6]	validation-rmse:41.27909
[8]	validation-rmse:40.55347
[7]	validation-rmse:41.07709[8]	validation-rmse:40.55347



The distribution is specified by [0.1, 0.9] and step=1, but the range is not divisible by `step`. It will be replaced by [0.1, 0.1].
[32m[I 2022-09-07 06:26:38,339][0m Trial 5 finished with value: 0.6842270614320075 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 7.786356990334456}. Best is trial 5 with value: 0.6842270614320075.[0m


[0]	validation-rmse:77.40249[9]	validation-rmse:40.42190
[8]	validation-rmse:40.66812

[9]	validation-rmse:40.40822
[7]	validation-rmse:40.94330
[8]	validation-rmse:40.55347


[32m[I 2022-09-07 06:26:38,339][0m Trial 4 finished with value: 0.6842270614320075 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 7.174123876842897}. Best is trial 5 with value: 0.6842270614320075.[0m
[32m[I 2022-09-07 06:26:38,352][0m Trial 0 finished with value: 0.6842270614320075 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 8.765294978454811}. Best is trial 5 with value: 0.6842270614320075.[0m


[9]	validation-rmse:40.09971


[32m[I 2022-09-07 06:26:38,356][0m Trial 2 finished with value: 0.6844407220415731 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 5.179173856296075}. Best is trial 2 with value: 0.6844407220415731.[0m


[8]	validation-rmse:40.55347[1]	validation-rmse:62.34344
[9]	validation-rmse:40.42190



The distribution is specified by [0.1, 0.9] and step=1, but the range is not divisible by `step`. It will be replaced by [0.1, 0.1].
[32m[I 2022-09-07 06:26:38,367][0m Trial 6 finished with value: 0.6892407650251735 and parameters: {'max_depth': 4, 'min_child_weight': 0, 'gamma': 7.809310211212768}. Best is trial 6 with value: 0.6892407650251735.[0m


[2]	validation-rmse:53.79209
[9]	validation-rmse:40.40822


[32m[I 2022-09-07 06:26:38,375][0m Trial 3 finished with value: 0.6842270614320075 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 7.070781770783633}. Best is trial 6 with value: 0.6892407650251735.[0m


[0]	validation-rmse:77.40249


[32m[I 2022-09-07 06:26:38,384][0m Trial 1 finished with value: 0.6844407220415731 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 4.6672496337571365}. Best is trial 6 with value: 0.6892407650251735.[0m


[3]	validation-rmse:47.81294
[1]	validation-rmse:62.34344
[4]	validation-rmse:44.03545
[2]	validation-rmse:53.79209
[5]	validation-rmse:42.37902
[3]	validation-rmse:47.84469
[6]	validation-rmse:41.28446
[4]	validation-rmse:44.00719
[7]	validation-rmse:40.94810[5]	validation-rmse:42.18096

[6]	validation-rmse:41.58481
[8]	validation-rmse:40.55204
[7]	validation-rmse:41.07709
[9]	validation-rmse:40.49001
[8]	validation-rmse:40.66812


[32m[I 2022-09-07 06:26:38,425][0m Trial 8 finished with value: 0.6831619768299979 and parameters: {'max_depth': 5, 'min_child_weight': 0, 'gamma': 1.2080750126183943}. Best is trial 6 with value: 0.6892407650251735.[0m


[9]	validation-rmse:40.09971


[32m[I 2022-09-07 06:26:38,442][0m Trial 9 finished with value: 0.6892407650251735 and parameters: {'max_depth': 4, 'min_child_weight': 0, 'gamma': 6.6819078002011825}. Best is trial 6 with value: 0.6892407650251735.[0m


{'max_depth': 4, 'min_child_weight': 0, 'gamma': 7.809310211212768}
[0]	validation-rmse:77.40249
[1]	validation-rmse:62.34344
[2]	validation-rmse:53.79209
[3]	validation-rmse:47.84469
[4]	validation-rmse:44.00719
[5]	validation-rmse:42.18096
[6]	validation-rmse:41.58481
[7]	validation-rmse:41.07709
[8]	validation-rmse:40.66812
[9]	validation-rmse:40.09971


root - INFO - Setting value for best estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for best estimator
root - INFO - Getting value for model_output


XGBRegressor


root - INFO - Getting value for feature perturbation
root - INFO - Getting value for algorithm
root - INFO - Getting value for shap_n_jobs
root - INFO - Getting value for memory_tolerance
root - INFO - Getting value for feature_names
root - INFO - Getting value for approximate
root - INFO - Getting value for shortcut


.values =
array([[ -3.59863474,   0.51441557,   1.07934671, ...,   1.84107763,
          0.        ,  16.94330525],
       [ -0.32208528,  -1.08593572,  -1.42568661, ...,  -2.91570303,
          0.        , -16.32149129],
       [ -0.32208528,   0.64088982,   1.56615729, ...,  -2.91570303,
          0.        , -22.26379588],
       ...,
       [ -0.13597406,  -0.86066333,   2.1451464 , ...,  -2.86840362,
          0.        , -19.70238415],
       [ -0.13597406,  -2.57465348,  -1.5314788 , ...,  -2.86840362,
          0.        , -17.09876057],
       [ -0.13597406,  -1.08593572,  -1.92273977, ...,  -2.86840362,
          0.        , -20.10936706]])

.base_values =
array([[82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.30139077],
       [82.301390

root - INFO - Getting value for plot shap summary
root - INFO - Getting value for save_shap_summary_plot
root - INFO - Setting value for importance_df
root - INFO - Getting value for importance_df





root - INFO - Getting value for importance_df
root - INFO - Getting value for importance_df
root - INFO - Getting value for importance_df
root - INFO - Setting value for importance_df
root - INFO - Getting value for importance_df


             0          1
0  vendor name   0.326608
1   Model Name   1.178685
2         MYCT   1.650027
3         MMIN  36.215308
4         MMAX   3.410175
5         CACH   5.939296
6        CHMIN        0.0
7        CHMAX  26.440059
   column_name shap_importance
0  vendor name        0.326608
1   Model Name        1.178685
2         MYCT        1.650027
3         MMIN       36.215308
4         MMAX        3.410175
5         CACH        5.939296
6        CHMIN             0.0
7        CHMAX       26.440059
   column_name shap_importance
3         MMIN       36.215308
7        CHMAX       26.440059
5         CACH        5.939296
4         MMAX        3.410175
2         MYCT        1.650027
1   Model Name        1.178685
0  vendor name        0.326608
6        CHMIN             0.0


root - INFO - Getting value for importance_df
root - INFO - Getting value for n_features
root - INFO - Getting value for n_features
root - INFO - Getting value for importance_df
root - INFO - Getting value for importance_df


  column_name shap_importance
3        MMIN       36.215308
7       CHMAX       26.440059
5        CACH        5.939296
4        MMAX        3.410175


# Check performance of the Pipeline

In [8]:
print('r2 score : ')
print(r2_score(y_test,y_pred))


r2 score : 
0.9338172940898798
