## Imports
- Place `%%pycodestyle` at the top of any cell to check its Python code against the PEP 8 style guide

In [1]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


lightgbm is installed...but failed to load!


## Define the Experiments

In [2]:
# Example dataset
boston_data = load_boston()

# Extract features and target as fresh numpy arrays.
# The target is reshaped once into a single-column 2-D array, so no further
# reshaping is needed after the split.
X = np.array(boston_data['data'])
y = np.array(boston_data['target']).reshape(-1, 1)

# Train/test split (80/20, fixed seed so the split is reproducible)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)

# An okay model fit to the data.
# Reuse a model that already exists in the kernel; only a missing name should
# trigger a new fit, so catch NameError specifically instead of every error.
# NOTE(review): this relies on kernel state - after a restart the model is
# always rebuilt.
try:
    xgb_model
except NameError:
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline: three toggleable transformations followed by the model
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
                 ('min_max_scaler', MinMaxScalerTransform()),
                 ('binarizer', BinarizerTransform()),
                 ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Default: every toggle undecided
default_values = [None]*num_features

# Example hand-written configurations (not passed to the search below)
min_max_values = [None]*num_features
min_max_values[4] = True

binarizer_values = [False]*num_features
binarizer_values[3] = True
binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet.
# Each transformation gets its OWN copy of the defaults: sharing one list
# object between the keys would make a later in-place edit of one
# transformation's toggles silently change the other two as well.
param_distributions = {
    'standard_scaler': list(default_values),
    'min_max_scaler': list(default_values),
    'binarizer': list(default_values)
}

# Randomly search the toggle space; the number of runs is set by n_experiments
experiments_results = RandomizedGridSearch(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error')

# Drop score so only the toggle columns remain as model inputs
experiments_X_df = experiments_results.drop(['score'], axis=1)

# Get column names (reused below to label the SHAP values)
X_column_names = experiments_X_df.columns

# Convert to numpy; the double brackets keep the score as a single-column 2-D array
experiments_X = experiments_X_df.values
experiments_y = experiments_results[['score']].values

# Create an XGBoost model tuned with the experiments data
# (a surrogate that predicts the score from the toggle settings)
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
# NOTE(review): whether get_xgboost_model already returns a fitted model is not
# visible here - confirm this second fit is needed.
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values of the surrogate model for every experiment
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe, with the experiment score appended as the last column
shap_values_of_experiments = pd.DataFrame(shap_values, columns=X_column_names)
shap_values_of_experiments['score'] = experiments_y

print("Done")

100%|██████████| 100/100 [00:04<00:00, 22.23it/s]


Done


## Experiment Scores

In [3]:
# Display settings: allow up to 200 columns, truncate long frames to 6 rows
pd.options.display.max_columns = 200
pd.options.display.max_rows = 6

# Render the SHAP values of every experiment (score is the last column)
shap_values_of_experiments


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
0,0.254102,0.003926,0.013600,-0.001972,0.088296,0.055862,0.067326,0.133516,0.076371,-0.009725,0.135583,0.044000,0.007211,0.103226,0.020356,-0.002358,0.279421,-0.030359,0.003896,0.032406,0.036513,0.016740,0.254391,-0.040521,-0.024511,-0.039844,0.449511,-0.083150,0.637837,0.239618,1.986626,5.312289,0.018804,3.239763,-0.021537,0.014827,1.498042,-0.014833,0.673414,31.428364
1,0.278325,-0.009348,0.013600,0.001410,0.089542,-0.053018,-0.011025,0.163266,0.115963,0.007346,-0.047035,0.005766,0.035483,-0.045253,0.003269,0.002527,0.233852,-0.029122,-0.002572,-0.034387,0.038764,-0.017547,0.320066,-0.028802,-0.019542,-0.040569,0.286328,0.099223,0.575775,0.241249,1.912419,5.203274,0.023777,3.001658,0.041870,-0.007917,1.527898,0.045644,0.768267,29.593080
2,0.123020,0.014630,0.013600,-0.003083,-0.092999,-0.037540,0.057005,0.060453,-0.034057,-0.004690,-0.066866,-0.009451,-0.014572,0.017665,0.001251,-0.002358,0.191753,-0.031203,0.014761,0.065857,0.034349,-0.001279,-0.158735,0.040961,-0.024923,0.064882,-0.314812,-0.081509,0.528129,0.212476,1.881346,5.584935,-0.002956,3.196544,-0.048191,-0.009935,1.384665,0.028563,0.654514,27.700466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.029491,-0.023066,-0.001485,0.009780,0.095191,-0.001146,0.001885,-0.018137,-0.018502,0.004368,-0.084190,0.006512,-0.026799,-0.001243,-0.001521,-0.004626,-0.064613,0.082503,0.011439,-0.022545,-0.057713,0.036993,-0.070959,0.080049,0.079550,0.063149,-0.490895,-0.093236,0.040861,-0.000300,-0.944060,-3.436132,0.001927,-1.798802,-0.051461,0.039588,-0.593730,0.013006,-0.258380,6.701158
98,-0.087159,-0.010791,0.001194,-0.007606,-0.165738,0.029661,0.025007,0.053104,-0.021240,-0.008593,-0.069087,0.007522,-0.019569,0.030412,0.003908,-0.002358,0.073387,0.096677,0.009683,-0.042877,0.005507,-0.020239,0.034670,-0.076619,0.071974,0.058585,-0.534086,0.132604,0.092358,-0.010041,-0.867615,-3.712751,-0.007301,-1.755169,0.122496,-0.099630,-0.723866,-0.017464,-0.203017,6.303939
99,0.105972,0.007777,-0.002360,0.009780,-0.188747,0.017303,0.003693,-0.010485,0.018726,-0.024988,0.072288,-0.025630,-0.039539,-0.026350,0.003846,-0.004626,-0.061137,-0.049653,-0.008533,-0.039566,-0.046815,0.002706,0.098735,-0.124307,-0.058692,0.032225,-0.625420,-0.106746,0.079178,-0.014358,-0.890148,-3.564973,-0.008380,-1.844623,-0.053388,-0.074386,-0.617787,0.020926,-0.224070,5.849342


In [4]:
# Show the raw experiment table: the toggle columns plus the resulting score
experiments_results


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
20,True,True,False,True,True,False,False,False,True,True,False,False,False,True,False,True,True,True,False,True,False,True,True,True,False,False,True,False,True,True,True,True,True,True,False,False,True,False,True,31.428364
39,True,False,False,False,True,True,True,False,True,False,True,True,True,False,False,False,True,True,False,False,False,False,True,True,False,False,True,True,True,True,True,True,False,True,True,False,True,True,True,29.593080
36,True,True,False,True,False,True,False,False,False,True,True,False,False,False,True,True,True,True,True,True,False,False,False,False,False,True,False,False,True,True,True,True,False,True,False,False,True,True,True,27.700466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,True,False,True,False,True,False,False,True,False,False,True,False,True,True,True,True,False,False,True,False,True,True,False,False,True,True,False,False,True,True,False,False,True,False,False,True,False,True,True,6.701158
82,False,False,True,True,False,False,False,False,False,True,True,False,False,False,True,True,True,False,True,False,False,False,True,True,True,True,False,True,True,False,False,False,False,False,True,False,False,False,True,6.303939
52,True,True,True,False,False,False,True,False,True,True,False,True,False,True,False,True,False,True,False,False,True,True,True,True,False,True,False,False,True,False,False,False,True,False,False,False,False,True,True,5.849342


In [5]:
# Function to support analysis
def find_significance_from_experiments_results(importance_threshold=0.05, max_toggles_to_lock_per_series=5,
                                               shap_df=None, results_df=None):
    """Rank toggles by how strongly their SHAP values argue for locking them.

    For each experiment, a toggle's SHAP value is kept as is when the toggle
    was on and negated when it was off (any falsy setting counts as off). The
    signed values are summed per toggle. Toggles with a positive total are
    candidates to be set to False, toggles with a negative total are
    candidates to be set to True. Totals are normalised by the sum of all
    absolute totals so each value is a share between 0 and 1.

    Parameters
    ----------
    importance_threshold : float
        Only toggles whose normalised share is strictly greater than this
        value are returned.
    max_toggles_to_lock_per_series : int
        Maximum number of entries returned in each of the two series.
    shap_df : pandas.DataFrame, optional
        SHAP values, one column per toggle plus a 'score' column. Defaults to
        the notebook-level ``shap_values_of_experiments``.
    results_df : pandas.DataFrame, optional
        Experiment settings with the same toggle column labels and the same
        row order as ``shap_df``. Defaults to the notebook-level
        ``experiments_results``.

    Returns
    -------
    tuple of pandas.Series
        ``(options_to_set_to_false, options_to_set_to_true)``, each indexed by
        toggle column label and holding the normalised share, largest first.
    """
    if shap_df is None:
        shap_df = shap_values_of_experiments
    if results_df is None:
        results_df = experiments_results

    # Only the toggle columns take part; 'score' is excluded on both sides so a
    # score of exactly 0 can no longer be mistaken for a switched-off toggle.
    feature_shap = shap_df.drop(['score'], axis=1)

    # Rows are matched by position, not by index label (the two frames carry
    # different indexes), hence the conversion to a plain boolean array.
    toggled_off = ~results_df[feature_shap.columns].astype(bool).to_numpy()

    # Flip the sign of the SHAP value wherever the toggle was off
    signed_shap = feature_shap.mask(toggled_off, -feature_shap)

    totals = signed_shap.sum().sort_values()
    options_to_set_to_false = totals[totals > 0]
    # Ascending sort of the negative totals means the magnitudes come out largest first
    options_to_set_to_true = totals[totals < 0].abs()

    # Normalise both groups by the combined magnitude
    total_magnitude = options_to_set_to_false.sum() + options_to_set_to_true.sum()
    options_to_set_to_false = options_to_set_to_false / total_magnitude
    options_to_set_to_true = options_to_set_to_true / total_magnitude

    options_to_set_to_false = options_to_set_to_false[
        options_to_set_to_false > importance_threshold].sort_values(ascending=False)
    options_to_set_to_true = options_to_set_to_true[
        options_to_set_to_true > importance_threshold]

    return (options_to_set_to_false.iloc[:max_toggles_to_lock_per_series],
            options_to_set_to_true.iloc[:max_toggles_to_lock_per_series])

# Call function with its default threshold and per-series limit;
# the first result holds toggles to lock to False, the second toggles to lock to True
options_to_set_to_false, options_to_set_to_true = find_significance_from_experiments_results()


In [6]:
# Labels of the toggles recommended to be locked to True
options_to_set_to_true.index

Index([], dtype='object')

In [7]:
# Toggles recommended to be locked to False, with their normalised share
options_to_set_to_false

binarizer__5     0.399580
binarizer__7     0.201509
binarizer__4     0.113185
binarizer__10    0.077935
dtype: float32

In [18]:
# Options
pd.set_option('display.max_rows', 100)

# Labels look like "<transformation>__<feature index>"
transformation_and_value = options_to_set_to_true.keys()

# Split on the LAST "__" so a transformation name containing "__" still works
true_label_parts = [label.rsplit("__", 1) for label in transformation_and_value]

# Make a dataframe in one step; an empty series simply yields an empty frame
# with the same three columns, so no special-casing is needed
options_to_set_to_true_df = pd.DataFrame({
    "transformation": [parts[0] for parts in true_label_parts],
    "value": [parts[1] for parts in true_label_parts],
    "significance": options_to_set_to_true.values,
})
options_to_set_to_true_df


Unnamed: 0,transformation,value,significance


In [20]:
# Options
pd.set_option('display.max_rows', 100)

# Labels look like "<transformation>__<feature index>"
transformation_and_value = options_to_set_to_false.keys()

# Split on the LAST "__" so a transformation name containing "__" still works
false_label_parts = [label.rsplit("__", 1) for label in transformation_and_value]

# Make a dataframe in one step; an empty series simply yields an empty frame
# with the same three columns, so no special-casing is needed
options_to_set_to_false_df = pd.DataFrame({
    "transformation": [parts[0] for parts in false_label_parts],
    "value": [parts[1] for parts in false_label_parts],
    "significance": options_to_set_to_false.values,
})
options_to_set_to_false_df


Unnamed: 0,transformation,value,significance
0,binarizer,5,0.39958
1,binarizer,7,0.201509
2,binarizer,4,0.113185
3,binarizer,10,0.077935


## Set Fields

In [30]:
# Lock the significant toggles: first the ones to set to True, then the ones
# to set to False.
# Each toggle list is copied before it is edited, because several keys of
# param_distributions may refer to the same list object; editing it in place
# would then change that feature index for every transformation at once.
for locked_value, options_df in ((True, options_to_set_to_true_df),
                                 (False, options_to_set_to_false_df)):
    for transformation_name, feature_index in zip(options_df['transformation'],
                                                  options_df['value']):
        toggles = list(param_distributions[transformation_name])
        toggles[int(feature_index)] = locked_value
        param_distributions[transformation_name] = toggles