## Imports
- Place %%pycodestyle at the top of any cell to check python syntax

In [27]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


## Define the Experiments

In [28]:
# Example dataset
boston_data = load_boston()

# Extract pandas dataframe and target
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)
train_X, test_X = train_X, test_X
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# An okay model fit to the data
try:
    xgb_model
except:
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
                 ('min_max_scaler', MinMaxScalerTransform()),
                 ('binarizer', BinarizerTransform()), 
                 ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Default 
default_values = [None]*num_features

min_max_values = [None]*num_features
min_max_values[4] = True

binarizer_values = [False]*num_features
binarizer_values[3] = True
binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet
param_distributions = {
    'standard_scaler': default_values,
    'min_max_scaler': default_values,
    'binarizer': default_values
}

# Randomly search the space n_iter times
experiments_results = RandomizedGridSearch(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error')

# Drop score
experiments_X_df = experiments_results.drop(['score'], axis=1)

# Get column names
X_column_names = experiments_X_df.columns

# Convert to numpy
experiments_X = experiments_X_df.values
experiments_y = experiments_results[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe
shap_values_of_experiments = pd.DataFrame(shap_values, columns=X_column_names)
shap_values_of_experiments['score'] = experiments_y

print("Done")

100%|██████████| 100/100 [00:04<00:00, 22.02it/s]


Done


## Experiment Scores

In [29]:
# Options
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 6)

# Print
shap_values_of_experiments


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
0,0.117997,0.066690,-0.013306,0.071029,-0.000951,0.132078,0.595510,0.128647,0.007092,0.187166,0.191435,-0.003593,0.230703,0.138018,0.309880,0.186131,0.013843,-0.011964,0.097177,-0.001429,0.078609,0.013423,0.304088,0.399382,0.176807,-0.013638,-0.167355,0.155114,1.186053,-0.002758,0.568764,5.716842,-0.016843,2.666203,0.053186,0.148849,1.534725,-0.064548,0.426662,31.484121
1,0.163466,0.065699,-0.018588,0.019901,-0.008849,-0.020472,0.437365,0.036497,0.003575,0.186489,0.140293,-0.007087,-0.114872,-0.036435,-0.127951,0.189272,0.010168,0.002145,0.095483,0.003192,-0.000173,0.031372,-0.117435,-0.065662,0.071526,-0.023527,-0.178023,-0.093347,1.097373,-0.013614,-0.596163,5.878191,0.015375,2.595448,0.049301,-0.029214,1.229462,0.073912,0.415573,24.332242
2,0.099606,-0.001947,0.047477,-0.015094,-0.015528,-0.019215,0.542934,0.062646,0.019772,0.190067,0.195133,0.002481,0.085706,0.135804,-0.067711,-0.036995,-0.005593,0.137961,0.106617,0.000982,0.079443,-0.002397,0.322287,0.236306,-0.070220,0.033287,0.443103,0.367981,1.029089,0.008927,0.533158,6.115623,-0.015971,-1.522497,0.006896,-0.033738,1.377427,-0.064518,-0.069281,23.663579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,-0.006273,-0.003962,0.050910,-0.052503,0.035635,0.019658,0.126350,0.027489,0.001680,0.007148,-0.087340,-0.002312,-0.036247,-0.029363,0.120111,-0.084610,0.015029,-0.048913,0.014021,0.003192,0.031788,0.040262,0.042462,-0.113707,0.046459,-0.042759,-0.276035,-0.098083,0.191201,0.011385,-0.984260,-3.240623,-0.016835,-2.056223,0.014429,0.047402,-0.302494,0.042896,-0.297964,6.155096
98,-0.120123,0.004848,-0.017885,0.037791,0.023565,0.030223,-0.114624,-0.019669,0.004465,-0.042593,-0.019988,0.002481,-0.044398,-0.006376,-0.092388,-0.068429,-0.003580,-0.038610,0.032013,0.000481,-0.028586,0.031747,0.131793,0.127064,-0.022501,-0.015946,-0.308573,0.073506,0.236532,-0.004282,-1.076278,-3.523926,-0.014520,-2.215182,-0.013870,-0.056471,0.478136,0.057140,-0.186865,6.031164
99,-0.101194,0.004848,-0.022020,-0.012984,-0.030753,0.033384,0.170022,-0.018121,0.019950,0.030976,-0.008174,-0.001682,-0.077366,0.032977,-0.077539,-0.067178,0.033669,-0.038610,0.028968,0.000982,-0.035659,-0.027315,-0.043718,0.086085,-0.024129,0.053081,-0.278731,-0.086359,0.203499,-0.006020,-1.032326,-3.072928,0.022331,-2.195768,-0.017152,-0.037062,-0.287760,-0.084162,-0.305016,5.785716


In [30]:
# Print
experiments_results


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
96,True,True,False,True,False,True,False,False,True,False,True,True,False,True,True,True,False,True,True,False,False,False,True,False,False,True,False,True,True,False,True,True,False,True,False,True,True,False,True,31.484121
69,True,True,False,True,False,False,False,False,False,False,True,True,True,False,False,True,False,True,True,True,True,False,False,True,False,True,False,False,True,False,False,True,True,True,False,False,True,True,True,24.332242
11,True,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,True,True,True,True,True,False,True,False,True,True,True,True,True,True,False,False,True,False,True,False,True,23.663579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,True,False,True,False,True,True,False,False,True,True,False,True,True,False,True,False,False,True,True,True,True,False,True,True,False,True,False,False,True,True,False,False,False,False,True,True,False,True,True,6.155096
49,False,True,False,True,True,True,True,False,False,True,True,False,True,True,False,False,False,True,True,True,False,False,True,False,True,True,False,True,True,True,False,False,False,False,False,False,True,True,True,6.031164
65,False,True,False,False,False,True,False,False,False,True,False,True,True,True,False,False,False,True,True,True,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,5.785716


In [31]:
# Function to support analysis
def find_significance_from_experiments_results(importance_threshold=0.05, max_toggles_to_lock_per_series=5):
    temp_df = shap_values_of_experiments.drop(['score'], axis=1).copy()
    for i in range(0, len(shap_values_of_experiments.index)):
        for j in range(0, len(shap_values_of_experiments.columns)):
            if not experiments_results.iloc[i, j]:
                temp_df.iloc[i, j] = -1 * shap_values_of_experiments.iloc[i, j]
    temp_df = temp_df.sum().sort_values()
    options_to_set_to_false = temp_df[temp_df > 0]
    options_to_set_to_true = temp_df[temp_df < 0]    
    sum_value = (options_to_set_to_false.sum() + abs(options_to_set_to_true.sum()))
    options_to_set_to_false = options_to_set_to_false / sum_value
    options_to_set_to_true = abs(options_to_set_to_true) / sum_value
    options_to_set_to_false = options_to_set_to_false[options_to_set_to_false > importance_threshold].sort_values(ascending=False)
    options_to_set_to_true = options_to_set_to_true[options_to_set_to_true > importance_threshold]
    return options_to_set_to_false[0:max_toggles_to_lock_per_series], options_to_set_to_true[0:max_toggles_to_lock_per_series]

# Call function
options_to_set_to_false, options_to_set_to_true = find_significance_from_experiments_results()


In [35]:
options_to_set_to_true.keys()

Index([], dtype='object')

In [38]:
options_to_set_to_false

binarizer__5    0.403020
binarizer__7    0.213330
binarizer__4    0.085259
dtype: float32

In [36]:
# Options
pd.set_option('display.max_rows', 100)

# Make a dataframe
options_to_set_to_true_df = pd.DataFrame()
transformation, value = options_to_set_to_true.keys().split("__")
options_to_set_to_true_df["transformation"] = transformation
options_to_set_to_true_df["value"] = value
options_to_set_to_true_df["significance"] = options_to_set_to_true.values
options_to_set_to_true_df


AttributeError: 'Index' object has no attribute 'split'

In [34]:
# Make a dataframe
options_to_set_to_false_df = pd.DataFrame()
transformation, value = [for x in options_to_set_to_false.keys()]
options_to_set_to_false_df["transformation"] = transformation
options_to_set_to_false_df["value"] = value
options_to_set_to_false_df["significance"] = options_to_set_to_false.values
options_to_set_to_false_df


SyntaxError: invalid syntax (<ipython-input-34-02ef892c8dc5>, line 6)

## Set Fields

In [None]:
param_distributions

In [None]:
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]


In [None]:
#### IDEAS:


# Much later on: Every 3rd experiment should be to try and remove the existing assumptions keeping the rest on false
    # In other words, six steps forward, one step back