## Imports
- Place %%pycodestyle at the top of any cell to check that cell's Python code against the PEP 8 style guide

In [1]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


lightgbm is installed...but failed to load!


## Define the Experiments

In [2]:
# Example dataset
boston_data = load_boston()

# Features and target as numpy arrays, taken from copied DataFrames
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split (test_size=0.20, fixed random_state for repeatability)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, random_state=42, test_size=0.20)

# Keep both targets as single-column 2-D arrays
train_y = train_y.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)

# An okay model fit to the data.
# The bare name lookup raises NameError only when this cell has not yet run
# in the current kernel, so an existing model is reused on re-runs. Catching
# NameError specifically (instead of a bare except) avoids swallowing
# KeyboardInterrupt or any unrelated failure.
try:
    xgb_model
except NameError:
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline: three transform steps followed by the model
pipe = Pipeline(steps=[
    ('standard_scaler', StandardScalerTransform()),
    ('min_max_scaler', MinMaxScalerTransform()),
    ('binarizer', BinarizerTransform()),
    ('model', xgb_model),
])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Default: one undecided (None) entry per feature
default_values = [None]*num_features

# Min-max variant with only feature 4 switched on.
# NOTE(review): not referenced again in the cells shown here.
min_max_values = [None]*num_features
min_max_values[4] = True

# Binarizer variant with features 3 and 12 switched on, the rest off.
# NOTE(review): not referenced again in the cells shown here.
binarizer_values = [False]*num_features
binarizer_values[3] = True
binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet.
# Each step gets its own copy of the defaults: sharing one list object
# between the three keys would make an in-place edit of one step's choices
# silently change the other two as well.
param_distributions = {
    'standard_scaler': list(default_values),
    'min_max_scaler': list(default_values),
    'binarizer': list(default_values)
}

# Randomly search the space: n_experiments=100 runs of the pipeline, with
# scoring set to 'neg_mean_squared_error'.
# NOTE(review): the result is used below like a pandas DataFrame that has a
# 'score' column - confirm against RandomizedGridSearch.
experiments_results = RandomizedGridSearch(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error')

# Drop score so only the experiment settings remain as model inputs
experiments_X_df = experiments_results.drop(['score'], axis=1)

# Get column names (reused below to label the SHAP values)
X_column_names = experiments_X_df.columns

# Convert to numpy; the double brackets keep the score as a 2-D column
experiments_X = experiments_X_df.values
experiments_y = experiments_results[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
# NOTE(review): if get_xgboost_model already returns a fitted model this
# second fit may be redundant - confirm.
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values for every experiment row
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe. The frame is built without an explicit index, so its
# rows correspond to experiments_results by position rather than by the
# index labels of experiments_results.
shap_values_of_experiments = pd.DataFrame(shap_values, columns=X_column_names)
shap_values_of_experiments['score'] = experiments_y

print("Done")

100%|██████████| 100/100 [00:04<00:00, 21.38it/s]


Done


## Experiment Scores

In [3]:
# Display options: allow up to 200 columns but cap the rows shown at 6
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 6)

# Display the SHAP values per experiment (last column is the score)
shap_values_of_experiments


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
0,0.183224,0.013532,0.138556,0.153134,0.010298,-0.036428,0.022605,0.139664,-0.003776,0.131023,0.039490,-0.009482,0.007270,-0.015949,0.008449,-0.036028,-0.015162,0.221078,-0.009836,0.005050,0.208649,0.116858,-0.067064,0.098398,-0.009178,0.025613,-0.057015,0.135546,0.946465,-0.022503,1.327569,4.802039,0.143803,2.260792,0.251424,0.350091,1.193060,0.304643,1.001423,31.419274
1,-0.089396,-0.016080,-0.113274,0.093269,0.009751,0.105347,0.000819,0.061711,-0.004941,0.105409,0.013454,0.010323,0.007434,0.029884,0.003868,-0.045801,-0.020377,0.154124,-0.009855,0.099392,0.208616,0.113196,0.041138,-0.032154,0.009873,-0.000047,0.117175,0.134881,-0.505679,0.020293,1.131981,4.375022,-0.030863,2.022226,-0.114216,-0.218831,0.858270,0.269461,0.732714,24.513539
2,-0.090567,0.033926,0.165550,0.189252,0.037913,0.016719,0.018725,0.003991,0.050716,0.101875,-0.012860,0.024127,0.022363,0.015605,0.008024,0.134467,0.046806,-0.063802,0.018548,-0.005042,0.125624,-0.128899,0.085683,-0.023273,0.033415,-0.007719,-0.058920,-0.113489,0.483088,-0.025678,-1.268344,4.860224,0.209147,2.178597,0.120504,0.287782,1.089190,-0.085268,0.832442,24.438593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,-0.181565,0.037865,0.037687,0.052565,0.037257,-0.090523,-0.006946,-0.009585,-0.028583,0.051311,-0.030468,-0.016985,0.000512,-0.025904,-0.010288,0.102946,-0.056724,0.038472,0.014662,0.088255,-0.112520,0.056092,0.043493,-0.024784,0.008113,-0.019823,0.188544,-0.126633,-0.212209,0.033574,-1.079651,-4.203960,-0.037249,-2.062873,0.025980,-0.061292,-0.536725,-0.103873,-0.086254,6.366184
98,0.073685,0.034707,-0.033570,-0.057446,0.008021,-0.026925,0.003203,-0.012878,-0.020036,0.101566,0.003373,0.021824,-0.002363,-0.035558,-0.003270,0.064969,0.030547,0.057810,0.025645,0.048965,-0.068800,-0.125777,-0.111908,0.040425,0.009124,-0.001523,-0.192455,0.089118,-0.338341,0.007492,-1.245699,-4.850580,-0.071688,-2.074586,-0.092529,0.123650,0.356311,0.050988,0.032466,6.350471
99,-0.113491,-0.031907,-0.039849,0.057469,0.015005,-0.033636,0.001794,0.000527,-0.028349,0.046560,0.010816,0.016846,-0.002763,0.016841,-0.010823,-0.136446,0.035560,-0.077904,0.021669,-0.021016,-0.056690,0.070050,-0.109266,-0.017510,0.022223,0.000810,-0.165773,-0.172394,-0.395150,-0.008677,-1.188901,-4.686068,0.087306,-2.184285,0.079670,0.143215,0.488811,-0.119385,0.043960,6.226279


In [4]:
# Display the sampled settings of each experiment alongside its score
experiments_results


Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
80,True,False,True,False,True,False,True,False,False,False,True,True,True,True,False,False,False,False,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,True,True,True,31.419274
43,False,False,False,False,True,True,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,True,False,False,True,True,False,True,True,True,True,True,True,False,True,True,True,24.513539
34,False,True,True,False,False,True,True,True,True,False,False,False,False,False,True,True,True,True,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,True,False,True,True,False,True,24.438593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,False,True,True,False,False,False,True,True,False,False,False,True,False,True,False,True,False,False,False,True,True,True,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,6.366184
84,True,True,False,True,False,False,True,True,False,False,True,False,False,True,True,True,True,False,False,True,True,False,True,False,False,False,False,True,False,True,False,False,True,False,True,True,True,True,True,6.350471
37,False,False,False,False,False,False,True,True,False,False,True,False,True,False,False,False,True,True,False,False,True,True,True,True,False,True,False,False,False,False,False,False,False,False,False,True,True,False,True,6.226279


In [21]:
# Function to support analysis
def find_significance_from_experiments_results(
        importance_threshold=0.05, max_toggles_to_lock_per_series=5,
        shap_values_df=None, experiments_df=None):
    """Rank transformation toggles by their net signed SHAP contribution.

    For every experiment row, a toggle's SHAP value is counted as-is when
    the toggle was truthy in that experiment and with its sign flipped when
    it was falsy. The per-toggle sums are split by sign, normalised so that
    all absolute sums add up to 1, filtered by ``importance_threshold`` and
    truncated to ``max_toggles_to_lock_per_series`` entries.

    Parameters
    ----------
    importance_threshold : float
        A toggle is reported only when its normalised share is strictly
        greater than this value.
    max_toggles_to_lock_per_series : int
        Maximum number of entries returned in each series.
    shap_values_df : pandas.DataFrame, optional
        SHAP values per experiment plus a 'score' column. Defaults to the
        notebook-level ``shap_values_of_experiments``.
    experiments_df : pandas.DataFrame, optional
        Toggle settings per experiment, in the same row order as
        ``shap_values_df`` and containing the same toggle column names.
        Defaults to the notebook-level ``experiments_results``.

    Returns
    -------
    tuple of pandas.Series
        ``(options_to_set_to_false, options_to_set_to_true)``: toggles with
        a positive net sum and toggles with a negative net sum, each mapped
        to its normalised share, largest share first.
    """
    if shap_values_df is None:
        shap_values_df = shap_values_of_experiments
    if experiments_df is None:
        experiments_df = experiments_results

    contributions = shap_values_df.drop(['score'], axis=1)

    # Only the toggle columns take part. The earlier element-wise loop also
    # visited the 'score' column and indexed past the end of the score-less
    # frame whenever a score was falsy (e.g. 0.0).
    # Rows are matched by position (.values), columns by name.
    toggled_on = experiments_df[contributions.columns].astype(bool).values

    # Keep the SHAP value where the toggle was on, negate it where it was off
    signed = contributions.where(toggled_on, -contributions)
    net_effect = signed.sum().sort_values()

    options_to_set_to_false = net_effect[net_effect > 0]
    # Ascending sort + abs() leaves this series ordered largest share first
    options_to_set_to_true = net_effect[net_effect < 0].abs()

    # Normalise both groups against the combined absolute mass
    total = options_to_set_to_false.sum() + options_to_set_to_true.sum()
    options_to_set_to_false = options_to_set_to_false / total
    options_to_set_to_true = options_to_set_to_true / total

    options_to_set_to_false = options_to_set_to_false[
        options_to_set_to_false > importance_threshold
    ].sort_values(ascending=False)
    options_to_set_to_true = options_to_set_to_true[
        options_to_set_to_true > importance_threshold]

    return (options_to_set_to_false.iloc[0:max_toggles_to_lock_per_series],
            options_to_set_to_true.iloc[0:max_toggles_to_lock_per_series])

# Call function with its defaults (threshold 0.05, at most 5 toggles per
# series); it reads the notebook-level SHAP and experiment frames
options_to_set_to_false, options_to_set_to_true = find_significance_from_experiments_results()


In [22]:
# Options
pd.set_option('display.max_rows', 100)

# Two-column table: toggle name next to its significance share
options_to_set_to_true_df = (
    pd.DataFrame()
    .assign(transformation=options_to_set_to_true.keys())
    .assign(significance=options_to_set_to_true.values)
)
options_to_set_to_true_df


Unnamed: 0,transformation,significance


In [23]:
# Options
pd.set_option('display.max_rows', 100)

# Two-column table: toggle name next to its significance share
options_to_set_to_false_df = (
    pd.DataFrame()
    .assign(transformation=options_to_set_to_false.keys())
    .assign(significance=options_to_set_to_false.values)
)
options_to_set_to_false_df


Unnamed: 0,transformation,significance
0,binarizer__5,0.397537
1,binarizer__7,0.18281
2,binarizer__4,0.106864
3,binarizer__10,0.06518


In [None]:
#### IDEAS:


# Much later on: Every 3rd experiment should be to try and remove the existing assumptions keeping the rest on false
    # In other words, six steps forward, one step back

In [15]:


# Splits positive and negative

# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending experiments vs continue to use the same results for analysis ; keep together for now for review

# When there were 3

Series([], dtype: float32) 
 min_max_scaler__custom_values___4    1.0
dtype: float32


In [16]:
# just weighting based on feature length etc

In [17]:
# NOTE(review): `positive_fields` and `negative_fields` are not defined in any
# cell shown in this notebook - this cell depends on earlier kernel state.
# NOTE(review): this loop never uses `key`; every iteration drops the last
# choice of the same hard-coded standard_scaler entry - confirm the intent.
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

# NOTE(review): this loop reads the choices from param_distributions[key] but
# writes the trimmed list (first choice dropped) under the hard-coded
# standard_scaler entry - confirm which entry should be updated.
# The recorded run failed on the lookup below with
# KeyError: 'min_max_scaler__custom_values___4'.
for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]

KeyError: 'min_max_scaler__custom_values___4'

In [18]:
# Inspect the min-max scaler entry; the recorded run raised KeyError for
# this key
param_distributions['min_max_scaler__column_indices_to_replace']

KeyError: 'min_max_scaler__column_indices_to_replace'

In [19]:
# always consider all features
#

In [20]:
# features to consider,
# num of features
# Zeroes;
# could
# default distribution ()
# weighting by feature towards up or down

In [None]:
# Print each negative field's name up to (not including) the first "___"
for key in negative_fields.keys():
    print(key.partition("___")[0])

In [None]:
# Scratch list for trying out the slices in the following cells
a = [1, 2, 3, 4]

In [None]:
# Slice from index 1 onward: every element except the first
a[1:]

In [None]:
# Slice up to (not including) the last index: every element except the last
a[:-1]