In [1]:
from zoish.feature_selectors.optunashap import OptunaShapFeatureSelector
import xgboost
from optuna.pruners import HyperbandPruner
from optuna.samplers._tpe.sampler import TPESampler
from sklearn.model_selection import KFold,train_test_split
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score
    )
from zoish.utils.helper_funcs import catboost
import matplotlib.pyplot as plt
import optuna
import logging
from sklearn.model_selection import StratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import lightgbm

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
[32m[I 2022-08-09 23:09:10,215][0m A new study created in memory with name: no-name-86fb686a-7788-415b-811b-6b53baa69c92[0m


<Figure size 432x288 with 0 Axes>

# Example 1 :Computer Hardware Data Set (a regression problem)
  
https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

In [2]:
urldata= "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data"
# column names
col_names=[
    "vendor name",
    "Model Name",
    "MYCT",
    "MMIN",
    "MMAX",
    "CACH",
    "CHMIN",
    "CHMAX",
    "PRP"
]
# read data
data = pd.read_csv(urldata,header=None,names=col_names,sep=',')
data

Unnamed: 0,vendor name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47


# Train test split

In [3]:
X = data.loc[:, data.columns != "PRP"]
y = data.loc[:, data.columns == "PRP"]


X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.33, random_state=42)


# Find feature types for later use

In [4]:
int_cols =  X_train.select_dtypes(include=['int']).columns.tolist()
float_cols =  X_train.select_dtypes(include=['float']).columns.tolist()
cat_cols =  X_train.select_dtypes(include=['object']).columns.tolist()


#  Define Feature selector and set its arguments  

In [5]:
optuna_regression_xgb = OptunaShapFeatureSelector(
        # general argument setting        
        verbose=1,
        random_state=0,
        logging_basicConfig = None,
        # general argument setting        
        n_features=4,
        list_of_obligatory_features_that_must_be_in_model=[],
        list_of_features_to_drop_before_any_selection=[],
        # shap argument setting        
        estimator=lightgbm.LGBMRegressor(),
        estimator_params={
        "max_depth": [4, 5],
        # "min_child_weight": [0.1, 0.9],
        # "gamma": [1, 9],
        },
        # shap arguments
        model_output="raw", 
        feature_perturbation="interventional", 
        algorithm="auto", 
        shap_n_jobs=-1, 
        memory_tolerance=-1, 
        feature_names=None, 
        approximate=False, 
        shortcut=False, 
        plot_shap_summary=False,
        save_shap_summary_plot=False,
        path_to_save_plot = './summary_plot.png',
        shap_fig = plt.figure(),
        ## optuna params
        test_size=0.33,
        with_stratified = False,
        performance_metric = 'r2',
        # optuna study init params
        study = optuna.create_study(
            storage = None,
            sampler = TPESampler(),
            pruner= HyperbandPruner(),
            study_name  = None,
            direction = "maximize",
            load_if_exists = False,
            directions  = None,
        ),
        # optuna optimization params
        study_optimize_objective = None,
        study_optimize_objective_n_trials=10, 
        study_optimize_objective_timeout=600,
        study_optimize_n_jobs = -1,
        study_optimize_catch= (),
        study_optimize_callbacks = None,
        study_optimize_gc_after_trial = False,
        study_optimize_show_progress_bar=False,

)




[32m[I 2022-08-09 23:09:10,646][0m A new study created in memory with name: no-name-84213702-a382-4490-b00c-ac37145430fe[0m
root - INFO - Setting value for logging_basicConfig
root - INFO - Setting value for verbose
root - INFO - Setting value for random_state
root - INFO - Setting value for n_features
root - INFO - Setting value for list_of_obligatory_features_that_must_be_in_model
root - INFO - Setting value for list of features to drop before any selection
root - INFO - Setting value for estimator
root - INFO - Getting value for estimator
root - INFO - LGBMRegressor()
root - INFO - Getting value for estimator
root - INFO - Setting value for estimator_params
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for es

<Figure size 432x288 with 0 Axes>

# Build sklearn Pipeline

In [6]:


pipeline =Pipeline([
            # int missing values imputers
            ('intimputer', MeanMedianImputer(
                imputation_method='median', variables=int_cols)),
            # category missing values imputers
            ('catimputer', CategoricalImputer(variables=cat_cols)),
            #
            ('catencoder', OrdinalEncoder()),
            # feature selection
            ('SFC_CATREG_OPTUNA', optuna_regression_xgb),
            # add any regression model from sklearn e.g., LinearRegression
            ('regression', LinearRegression())


 ])



# Run Pipeline

In [7]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)


root - INFO - Getting value for estimator
root - INFO - Getting value for performance metric
root - INFO - Getting value for estimator_params
root - INFO - Getting value for verbose
root - INFO - Getting value for test_size
root - INFO - Getting value for random_state
root - INFO - Getting value for study
root - INFO - Getting value for study optimize objective
root - INFO - Getting value for study_optimize_objective_n_trials
root - INFO - Getting value for study_optimize_objective_timeout
root - INFO - Getting value for study_optimize_n_jobs
root - INFO - Getting value for study_optimize_catch
root - INFO - Getting value for study_optimize_callbacks
root - INFO - Getting value for study_optimize_gc_after_trial
root - INFO - Getting value for study_optimize_show_progress_bar
root - INFO - Getting value for with_stratified
`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.
'verbose' argument 

You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Total Bins 141
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 93, number of used features: 8
[LightGBM] [Info] Number of data points in the train set: 93, number of used features: 8
[LightGBM] [Info] Number of data points in the train set: 93, number of used features: 8
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_col_wise=true` to remove the overhead.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 9

[32m[I 2022-08-09 23:09:11,365][0m Trial 2 finished with value: 0.2691269267186439 and parameters: {'max_depth': 5}. Best is trial 2 with value: 0.2691269267186439.[0m


[77]	valid_0's l2: 3402.92
[76]	valid_0's l2: 3425.73

[67]	valid_0's l2: 3223.99
[77]	valid_0's l2: 3402.92
[78]	valid_0's l2: 3426.43
[78]	valid_0's l2: 3426.43
[77]	valid_0's l2: 3402.92
[67]	valid_0's l2: 3223.99
[78]	valid_0's l2: 3426.43
[68]	valid_0's l2: 3202.28
[78]	valid_0's l2: 3426.43
[79]	valid_0's l2: 3376.04
[78]	valid_0's l2: 3426.43
[79]	valid_0's l2: 3376.04
[79]	valid_0's l2: 3376.04
[68]	valid_0's l2: 3202.28
[78]	valid_0's l2: 3426.43
[79]	valid_0's l2: 3376.04
[79]	valid_0's l2: 3376.04
[80]	valid_0's l2: 3438.92
[69]	valid_0's l2: 3276.09
[80]	valid_0's l2: 3438.92
[79]	valid_0's l2: 3376.04
[80]	valid_0's l2: 3438.92
[80]	valid_0's l2: 3438.92
[81]	valid_0's l2: 3390.58
[69]	valid_0's l2: 3276.09
[80]	valid_0's l2: 3438.92
[70]	valid_0's l2: 3254.19
[81]	valid_0's l2: 3390.58
[81]	valid_0's l2: 3390.58
[80]	valid_0's l2: 3438.92
[81]	valid_0's l2: 3390.58
[81]	valid_0's l2: 3390.58
[70]	valid_0's l2: 3254.19
[82]	valid_0's l2: 3452.35
[71]	valid_0's l2: 3257.13


[32m[I 2022-08-09 23:09:11,382][0m Trial 1 finished with value: 0.2691269267186439 and parameters: {'max_depth': 4}. Best is trial 2 with value: 0.2691269267186439.[0m
'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
[32m[I 2022-08-09 23:09:11,392][0m Trial 4 finished with value: 0.2691269267186439 and parameters: {'max_depth': 4}. Best is trial 2 with value: 0.2691269267186439.[0m
'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
[32m[I 2022-08-09 23:09:11,394][0m Trial 7 finished with value: 0.2691269267186439 and parameters: {'max_depth': 4}. Best is trial 2 with value: 0.2691269267186439.[0m
[32m[I 2022-08-09 23:09:11,397][0m Trial 3 finished with value: 0.2691269267186439 and parameters: {'max_depth': 5}. Best is trial 2 with value: 0.2691269267186439.[0m
[32m[I 20

[92]	valid_0's l2: 3481.61
[91]	valid_0's l2: 3426.64
[93]	valid_0's l2: 3440.63
[94]	valid_0's l2: 3484.8
No further splits with positive gain, best gain: -inf
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 93, number of used features: 8
[93]	valid_0's l2: 3440.63
[95]	valid_0's l2: 3444.45
[LightGBM] [Info] Start training from score 89.806452
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 93, number of used features: 8
[94]	valid_0's l2: 3484.8
[96]	valid_0's l2: 3466.54
[LightGBM] [Info] Start training from score 89.806452
[1]	valid_0's l2: 4295.71
[95]	valid_0's l2: 3444.45
[1]	valid_0's l2: 4295.71
[97]	valid_0's l2: 3477.32
[2]	valid_0's l2: 3630.05

[2]	valid_0's l2: 3630.05
[3]	valid_0's l2: 3168.82
[98]	valid_0's l2: 3437.54
[3]	valid_0's l2: 3168.82
[97]	valid_0's l2: 3477.32

[32m[I 2022-08-09 23:09:11,620][0m Trial 8 finished with value: 0.2691269267186439 and parameters: {'max_depth': 4}. Best is trial 2 with value: 0.2691269267186439.[0m
[32m[I 2022-08-09 23:09:11,631][0m Trial 9 finished with value: 0.2691269267186439 and parameters: {'max_depth': 4}. Best is trial 2 with value: 0.2691269267186439.[0m
root - INFO - Setting value for best estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for estimator
root - INFO - Getting value for best estimator
root - INFO - Getting value for model_output
root - INFO - Getting value for feature perturbation
root - INFO - Getting value for algorithm
root - INFO - Getting value for shap_n_jobs
root - INFO - Getting value for memory_tolerance
root - INFO - Getting value for feature_names
root - INFO - Getting value for approximate
root - INFO - Getting value for shortcut
root - INFO - Getting value for plot shap summary
root - INFO - Getting value for save_shap_summary_plot
root - INFO

[93]	valid_0's l2: 3440.63
[94]	valid_0's l2: 3484.8
[95]	valid_0's l2: 3444.45
[94]	valid_0's l2: 3484.8
[96]	valid_0's l2: 3466.54
[95]	valid_0's l2: 3444.45
[96]	valid_0's l2: 3466.54
[97]	valid_0's l2: 3477.32
[98]	valid_0's l2: 3437.54
[97]	valid_0's l2: 3477.32
[98]	valid_0's l2: 3437.54
[99]	valid_0's l2: 3494.75
[100]	valid_0's l2: 3457.49
[99]	valid_0's l2: 3494.75
[100]	valid_0's l2: 3457.49
{'max_depth': 5}
LGBMRegressor
.values =
array([[-4.72144036e-01, -3.22159794e+01,  9.57700656e+00, ...,
         2.29664889e+01,  2.46718379e+01,  7.26002643e+01],
       [ 1.59457244e+00, -8.55346764e+00, -4.68599306e+00, ...,
        -5.57597093e+00, -3.28928982e-01, -2.56118125e+01],
       [ 2.61583332e+00,  3.00965838e+01, -4.32470130e-01, ...,
        -1.05618054e+01, -1.05871706e+01, -2.32576154e+01],
       ...,
       [-1.80365794e+00, -2.02134372e+01,  4.14133443e-02, ...,
        -5.47939392e+00, -1.56068437e+01, -2.43846816e+01],
       [-6.85402499e+00, -1.35939193e+01, -5.1

# Check performance of the Pipeline

In [8]:
print('r2 score : ')
print(r2_score(y_test,y_pred))


r2 score : 
0.9400389432530891
