## Imports
- Place `%%pycodestyle` at the top of any cell to check its Python code against the PEP 8 style guide

In [1]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


lightgbm is installed...but failed to load!


## Define the Experiments

In [2]:
# Example dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn or a replacement dataset.
boston_data = load_boston()

# Features and target as numpy arrays; routing the target through a
# one-column DataFrame yields a 2-D (n_samples, 1) array
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split (80/20, fixed seed so the split is reproducible)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# An okay model fit to the data.
# Reuse the model left in the kernel by a previous run of this cell so that
# re-running the cell does not call get_xgboost_model again. Only a missing
# name triggers a rebuild; any other error is allowed to surface.
if 'xgb_model' not in globals():
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline: three transform steps followed by the model
pipe = Pipeline(steps=[
    ('standard_scaler', StandardScalerTransform()),
    ('min_max_scaler', MinMaxScalerTransform()),
    ('binarizer', BinarizerTransform()),
    ('model', xgb_model),
])

# Number of feature columns in the training matrix
num_features = train_X.shape[1]

# Testing with these indices (one per feature column)
indices = [*range(num_features)]

# Default: one undecided (None) entry per feature
default_values = [None for _ in range(num_features)]

# Alternative setting: undecided everywhere except feature 4, which is on
min_max_values = [None for _ in range(num_features)]
min_max_values[4] = True

# Alternative setting: off everywhere except features 3 and 12
binarizer_values = [False for _ in range(num_features)]
binarizer_values[3] = binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet.
# All three steps point at the very same default_values list object.
param_distributions = dict.fromkeys(
    ('standard_scaler', 'min_max_scaler', 'binarizer'), default_values)

# Run 100 randomised experiments over the pipeline configuration
experiments_results = RandomizedGridSearch(
    pipe=pipe,
    param_distributions=param_distributions,
    n_experiments=100,
    train_X=train_X, train_y=train_y,
    test_X=test_X, test_y=test_y,
    scoring='neg_mean_squared_error',
)

# Feature frame: every column of the results except the score
experiments_X_df = experiments_results.drop(columns=['score'])

# Column names of the feature frame
X_column_names = experiments_X_df.columns

# Numpy versions; the double brackets keep the score as a 2-D column
experiments_X = experiments_X_df.values
experiments_y = experiments_results[['score']].values

# XGBoost model tuned with the experiments data, then fitted on the frame
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# SHAP values of that model for every experiment row
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# SHAP values as a dataframe, with the experiment score appended
shap_values_of_experiments = pd.DataFrame(
    shap_values, columns=X_column_names).assign(score=experiments_y)

print("Done")

100%|██████████| 100/100 [00:04<00:00, 20.97it/s]


Done


## Experiment Scores

In [3]:
# Display options: up to 200 columns, but only 6 rows per frame
pd.set_option('display.max_columns', 200, 'display.max_rows', 6)

# Show the SHAP frame as the cell output
shap_values_of_experiments


Unnamed: 0,standard_scaler___0,standard_scaler___1,standard_scaler___2,standard_scaler___3,standard_scaler___4,standard_scaler___5,standard_scaler___6,standard_scaler___7,standard_scaler___8,standard_scaler___9,standard_scaler___10,standard_scaler___11,standard_scaler___12,min_max_scaler___0,min_max_scaler___1,min_max_scaler___2,min_max_scaler___3,min_max_scaler___4,min_max_scaler___5,min_max_scaler___6,min_max_scaler___7,min_max_scaler___8,min_max_scaler___9,min_max_scaler___10,min_max_scaler___11,min_max_scaler___12,binarizer___0,binarizer___1,binarizer___2,binarizer___3,binarizer___4,binarizer___5,binarizer___6,binarizer___7,binarizer___8,binarizer___9,binarizer___10,binarizer___11,binarizer___12,score
0,-0.023171,-0.008146,0.196190,0.087615,-0.037440,0.019905,0.073489,0.078291,-0.021138,0.037570,0.171541,0.012693,-0.002811,0.200938,-0.002446,-0.037490,0.053520,0.194104,0.016524,0.351592,0.257750,0.499308,0.038794,0.233820,-0.027575,0.221791,0.561211,0.116634,0.600092,0.384735,1.358900,3.617033,0.129876,2.220209,0.332330,-0.056955,1.683662,0.013947,0.300804,31.265291
1,0.019451,0.004001,0.163968,0.086764,0.044592,0.002562,0.015035,-0.079374,-0.057591,-0.027228,0.128923,-0.001513,-0.010796,0.173158,0.000397,0.027911,0.050795,0.139859,0.013624,0.327844,-0.067555,0.500966,0.033916,-0.116750,0.054105,0.186789,0.565673,0.029090,0.553269,0.288596,1.105266,3.536520,0.094861,2.193604,0.207106,0.066522,1.535025,0.041640,0.286691,26.835432
2,0.038413,-0.004723,-0.102879,0.021583,0.103127,-0.001664,0.064762,0.109045,0.246317,-0.000825,0.066883,0.013118,0.005254,0.146718,0.000431,0.135004,0.055220,-0.052745,-0.017669,0.334127,0.193844,0.499829,-0.013677,0.151274,0.024171,-0.045804,0.579130,0.100553,0.698003,0.519562,1.238793,4.126551,0.095023,-1.478003,-0.010391,0.063245,1.206585,0.036625,0.172302,24.646689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,-0.040187,-0.007767,-0.044035,0.012750,0.105606,-0.001231,-0.036111,-0.024927,0.122055,-0.045687,-0.010263,-0.000968,0.018893,0.023039,-0.000353,0.031862,-0.006123,-0.049178,0.015668,0.098019,-0.038192,0.126792,0.010515,-0.027638,-0.058913,-0.012486,0.221329,0.023487,-0.195186,-0.194713,-1.316223,-4.043306,-0.018416,-1.903037,0.024411,0.072620,-0.432084,-0.034376,-0.103532,6.721216
98,0.021307,0.003540,0.050235,-0.054769,-0.059421,-0.019058,0.040782,-0.024154,0.123519,-0.057224,0.003067,-0.000755,-0.010796,-0.012614,-0.008066,0.037000,0.038647,-0.042017,-0.024470,0.023959,-0.020315,0.167503,0.017597,-0.086005,-0.074850,-0.000689,-0.290718,0.019335,-0.341778,-0.162833,-0.999420,-4.599297,-0.085081,-2.091398,0.047585,-0.165880,0.386754,0.053768,-0.048358,6.371816
99,0.046771,0.004092,-0.093871,-0.000794,0.086837,0.002205,-0.064405,0.014589,-0.210285,-0.045609,-0.018198,0.004187,0.018893,0.001291,0.003022,-0.038878,0.001199,-0.022250,-0.049019,-0.024610,-0.065180,0.131639,0.012064,0.049965,0.018064,0.032268,-0.318360,0.047380,-0.269698,0.109342,-1.129861,-4.080911,0.003427,-1.983090,0.069681,-0.077364,-0.437282,0.007905,-0.177372,6.226373


In [4]:
# Show the raw experiment results (toggle columns plus the score) as the
# cell output
experiments_results


Unnamed: 0,standard_scaler___0,standard_scaler___1,standard_scaler___2,standard_scaler___3,standard_scaler___4,standard_scaler___5,standard_scaler___6,standard_scaler___7,standard_scaler___8,standard_scaler___9,standard_scaler___10,standard_scaler___11,standard_scaler___12,min_max_scaler___0,min_max_scaler___1,min_max_scaler___2,min_max_scaler___3,min_max_scaler___4,min_max_scaler___5,min_max_scaler___6,min_max_scaler___7,min_max_scaler___8,min_max_scaler___9,min_max_scaler___10,min_max_scaler___11,min_max_scaler___12,binarizer___0,binarizer___1,binarizer___2,binarizer___3,binarizer___4,binarizer___5,binarizer___6,binarizer___7,binarizer___8,binarizer___9,binarizer___10,binarizer___11,binarizer___12,score
25,False,True,False,True,False,True,False,True,True,True,True,False,False,True,True,False,False,True,False,False,False,True,True,False,False,False,True,True,True,True,True,True,False,True,False,True,True,True,True,31.605373
29,False,True,True,True,True,True,True,False,False,True,True,True,True,True,False,False,False,True,False,True,True,True,False,True,True,True,False,False,False,True,True,True,True,True,False,False,True,False,True,25.979007
17,True,False,False,False,True,True,False,True,True,False,True,False,False,False,True,False,False,True,True,True,False,False,False,True,False,False,True,True,False,False,True,True,False,True,True,False,True,False,True,25.745124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,False,False,True,True,True,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,True,True,True,False,False,False,False,True,True,True,False,False,True,False,True,True,True,False,True,6.470658
31,False,False,False,True,True,True,True,True,False,True,False,True,True,False,True,True,True,False,True,True,False,True,True,True,False,False,False,False,False,True,False,False,True,False,True,True,True,False,True,6.248851
58,True,True,True,True,True,True,False,True,False,True,True,False,False,False,False,False,False,True,True,True,False,True,False,True,True,True,False,False,True,False,False,False,True,False,True,True,True,True,True,6.062313


In [5]:
# Function to support analysis
def find_significance_from_experiments_results(
        importance_threshold=0.05, max_toggles_to_lock_per_series=3,
        shap_values=None, results=None):
    """Rank the toggles that the experiment SHAP values suggest locking.

    For every experiment and every toggle column the SHAP value is kept
    when the toggle was truthy in that experiment and negated when it was
    falsy. Summing per column gives a net signed contribution for each
    toggle. Toggles with a positive net contribution are reported in the
    "set to False" series, toggles with a negative one in the
    "set to True" series. Each series is normalised so its values sum to 1.

    Parameters
    ----------
    importance_threshold : float
        Only toggles whose normalised share is strictly greater than this
        value are returned.
    max_toggles_to_lock_per_series : int
        Maximum number of toggles returned in each series.
    shap_values : pandas.DataFrame, optional
        SHAP values, one column per toggle. A 'score' column, if present,
        is ignored. Defaults to the notebook-level
        ``shap_values_of_experiments``.
    results : pandas.DataFrame, optional
        Toggle values with the same column names as ``shap_values`` and
        rows in the same positional order. Defaults to the notebook-level
        ``experiments_results``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(options_to_set_to_false, options_to_set_to_true)``, indexed by
        toggle name, largest share first.
    """
    if shap_values is None:
        shap_values = shap_values_of_experiments
    if results is None:
        results = experiments_results

    # The score is not a toggle: leave it out of the sign flipping entirely
    toggle_columns = [
        column for column in shap_values.columns if column != 'score']
    contributions = shap_values[toggle_columns]

    # Rows are matched by position, not by index label, because the two
    # frames are not guaranteed to share an index
    switched_on = results[toggle_columns].astype(bool).values
    signed = contributions.where(switched_on, -contributions)

    net_effect = signed.sum().sort_values()

    positive = net_effect[net_effect > 0]
    negative = net_effect[net_effect < 0]
    share_to_false = positive / positive.sum()
    # Dividing by the (negative) total turns these into positive shares;
    # the ascending sort above already puts the largest share first
    share_to_true = negative / negative.sum()

    share_to_false = share_to_false[
        share_to_false > importance_threshold].sort_values(ascending=False)
    share_to_true = share_to_true[share_to_true > importance_threshold]

    return (share_to_false.iloc[:max_toggles_to_lock_per_series],
            share_to_true.iloc[:max_toggles_to_lock_per_series])

# Call function with its defaults (importance threshold 0.05, at most 3
# toggles per series)
options_to_set_to_false, options_to_set_to_true = find_significance_from_experiments_results()


In [6]:
# Options: allow up to 100 rows to be displayed
pd.set_option('display.max_rows', 100)

# Two-column frame: toggle name and its normalised share
options_to_set_to_true_df = pd.DataFrame({
    "transformation": options_to_set_to_true.keys(),
    "Significance": options_to_set_to_true.values,
})
options_to_set_to_true_df


Unnamed: 0,transformation,Significance
0,standard_scaler___8,0.190328
1,min_max_scaler___2,0.152694
2,min_max_scaler___7,0.121739


In [73]:
# Options: allow up to 100 rows to be displayed
pd.set_option('display.max_rows', 100)

# Two-column frame: toggle name and its normalised share
options_to_set_to_false_df = pd.DataFrame({
    "transformation": options_to_set_to_false.keys(),
    "Significance": options_to_set_to_false.values,
})
options_to_set_to_false_df


Unnamed: 0,Fields to set to False,Importance
0,binarizer__custom_values___5,0.412933
1,binarizer__custom_values___7,0.226616
2,binarizer__custom_values___4,0.111091


In [None]:
#### IDEAS:


# Much later on: every 3rd experiment should try removing the existing assumptions while keeping the rest set to False
    # In other words: six steps forward, one step back

In [15]:


# Splits positive and negative

# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending new experiments vs. continuing to use the same results for analysis; keep them together for now for review

# When there were 3

Series([], dtype: float32) 
 min_max_scaler__custom_values___4    1.0
dtype: float32


In [16]:
# just weighting based on feature length etc

In [17]:
# Scratch: shrink the list of candidate choices stored in param_distributions
# for the fields flagged as positive / negative.
# NOTE(review): positive_fields and negative_fields are not defined in any
# cell visible in this notebook - confirm where they come from.

# NOTE(review): `key` is never used inside this loop; every iteration drops
# the last element from the same hard-coded entry.
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

# NOTE(review): the choices are read from param_distributions[key], but the
# shortened list (first element dropped) is written to the hard-coded
# 'standard_scaler__column_indices_to_replace' entry instead. The recorded
# run of this cell ended in KeyError: 'min_max_scaler__custom_values___4',
# presumably raised by the lookup by `key` - verify the intended key format.
for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]

KeyError: 'min_max_scaler__custom_values___4'

In [18]:
# NOTE(review): this lookup raised KeyError in the recorded run; the
# param_distributions dict built in the "Define the Experiments" cell only
# has the keys 'standard_scaler', 'min_max_scaler' and 'binarizer'.
param_distributions['min_max_scaler__column_indices_to_replace']

KeyError: 'min_max_scaler__column_indices_to_replace'

In [19]:
# always consider all features
#

In [20]:
# features to consider,
# num of features
# Zeroes;
# could
# default distribution ()
# weighting by feature towards up or down

In [None]:
# Print the part of each negative-field name that precedes the first "___"
for key in negative_fields.keys():
    print(key.partition("___")[0])

In [None]:
# Scratch list for trying out the slices below
a = [1, 2, 3, 4]

In [None]:
# Everything except the first element
a[1:]

In [None]:
# Everything except the last element
a[:-1]