## Imports
- Place %%pycodestyle at the top of any cell to check that cell's code against the PEP 8 style guide

In [9]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


## Define the Experiments

In [60]:
# Example dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn or a replacement dataset.
boston_data = load_boston()

# Features as a 2-D array and target as an (n, 1) column vector.
# np.array() copies, so X and y are independent of the dataset object.
X = np.array(boston_data['data'])
y = np.array(boston_data['target']).reshape(-1, 1)

# Train/test split (fixed random_state so the split is reproducible).
# y is already a column vector, so train_y and test_y come out as (n, 1).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)

# An okay model fit to the data.
# Re-use the model left in the kernel by an earlier run of this cell and only
# build it when the name is not defined yet. Catch NameError specifically: a
# bare `except:` would also swallow KeyboardInterrupt and unrelated errors.
try:
    xgb_model
except NameError:
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline: the three transform steps followed by the model
pipe = Pipeline(steps=[
    ('standard_scaler', StandardScalerTransform()),
    ('min_max_scaler', MinMaxScalerTransform()),
    ('binarizer', BinarizerTransform()),
    ('model', xgb_model),
])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# One entry per feature; None means "not decided yet"
default_values = [None]*num_features

# NOTE(review): min_max_values and binarizer_values are not referenced by
# param_distributions below. The hard-coded positions 4, 3 and 12 require at
# least 13 features.
min_max_values = [None]*num_features
min_max_values[4] = True

binarizer_values = [False]*num_features
binarizer_values[3] = True
binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet.
# Each step gets its own copy of the defaults, so changing one step's list
# can never silently change the other two (or default_values itself).
param_distributions = {
    'standard_scaler__custom_values': list(default_values),
    'min_max_scaler__custom_values': list(default_values),
    'binarizer__custom_values': list(default_values)
}

# Run n_experiments experiments over the search space and rank the result
# table by score, highest first
experiments_results = RandomizedGridSearch(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error',
).sort_values(by=['score'], ascending=False)

# Everything except the score column describes the experiment configuration
experiments_X_df = experiments_results.drop(columns=['score'])
X_column_names = experiments_X_df.columns

# Numpy versions for model fitting; the score is kept 2-D with shape (n, 1)
experiments_X = experiments_X_df.values
experiments_y = experiments_results.loc[:, ['score']].values

# Create an XGBoost model tuned with the experiments data.
# This is a surrogate model: its inputs are the experiment configuration
# columns and its target is the experiment score.
# NOTE(review): get_xgboost_model is a project helper; whether it already
# returns a fitted model (which would make the fit below redundant) cannot be
# told from this notebook - confirm.
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
# (the helper above received the numpy array; fit receives the DataFrame)
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values for the same rows the model was fitted on
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe: one column per configuration column, plus the score.
# Rows follow the order of experiments_X_df but get a fresh 0..n-1 index.
shap_values_of_experiments = pd.DataFrame(shap_values, columns=X_column_names)
shap_values_of_experiments['score'] = experiments_y

print("Done")

100%|██████████| 100/100 [00:04<00:00, 21.37it/s]


Done


## Experiment Scores

In [11]:
# Display options: show up to 200 columns, truncate long frames to 6 rows
for option_name, option_value in (('display.max_columns', 200),
                                  ('display.max_rows', 6)):
    pd.set_option(option_name, option_value)

# Display the SHAP-value table (shown as the cell's last expression)
shap_values_of_experiments


Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12,score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758120
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758120
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758026
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758026
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.758026


In [12]:
# Display the experiment configurations together with their scores
experiments_results


Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12,score
37,False,False,False,False,False,False,False,True,False,False,True,True,True,True,True,True,False,True,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758120
66,True,False,True,False,True,True,False,True,False,False,False,False,False,True,False,False,False,True,True,True,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758120
89,False,False,True,True,True,False,False,True,False,True,True,False,True,True,False,True,True,True,True,True,False,True,False,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,False,True,False,False,True,True,False,True,False,False,True,False,False,True,True,True,False,True,False,True,True,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758026
70,False,False,False,False,True,True,False,False,True,False,True,True,False,True,True,True,True,True,False,False,True,True,False,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758026
99,False,False,True,True,False,False,True,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.758026


In [63]:
# Transformations to support analysis
def transformations_to_support_analysis(importance_threshold=0.05,
                                        shap_values=None,
                                        results=None):
    """Rank configuration fields by their summed, sign-adjusted SHAP value.

    For every experiment row, the SHAP value of a field is kept as-is when
    the field was truthy in that experiment and negated when it was falsy
    (False, None or 0). The adjusted values are summed per field, split into
    positive and negative totals, and each group is normalised to sum to 1.

    Parameters
    ----------
    importance_threshold : float, default 0.05
        Only fields whose normalised share is strictly greater than this
        value are returned.
    shap_values : pandas.DataFrame, optional
        One row per experiment, one column per field, plus a 'score' column.
        Defaults to the notebook-level ``shap_values_of_experiments``.
    results : pandas.DataFrame, optional
        Field settings of the same experiments, in the same row order and
        with the same field column names. Defaults to the notebook-level
        ``experiments_results``.

    Returns
    -------
    (fields_to_set_to_false, fields_to_set_to_true) : tuple of pandas.Series
        Shares of the fields with a positive total and of the fields with a
        negative total, indexed by field name, largest share first.

    Raises
    ------
    ValueError
        If the two frames do not have the same number of rows.
    """
    # Fall back to the notebook globals so existing no-argument calls work
    if shap_values is None:
        shap_values = shap_values_of_experiments
    if results is None:
        results = experiments_results

    signed = shap_values.drop(['score'], axis=1)
    if len(results.index) != len(signed.index):
        raise ValueError(
            "shap_values and results must have the same number of rows")

    # Settings are looked up by field name, so the 'score' column is never
    # consulted. The earlier positional loop also visited the score column
    # and raised IndexError whenever a score was exactly 0.
    # Rows are matched by position because the two frames carry different
    # index labels.
    was_enabled = results[signed.columns].values.astype(bool)
    signed = signed.where(was_enabled, -signed)

    totals = signed.sum().sort_values()

    fields_to_set_to_false = totals[totals > 0]
    fields_to_set_to_false = (
        fields_to_set_to_false / fields_to_set_to_false.sum())
    fields_to_set_to_false = fields_to_set_to_false[
        fields_to_set_to_false > importance_threshold
    ].sort_values(ascending=False)

    # totals is ascending, so the most negative field comes first and the
    # normalised shares below are already in descending order.
    fields_to_set_to_true = totals[totals < 0]
    fields_to_set_to_true = (
        fields_to_set_to_true / fields_to_set_to_true.sum())
    fields_to_set_to_true = fields_to_set_to_true[
        fields_to_set_to_true > importance_threshold]

    return fields_to_set_to_false, fields_to_set_to_true

# Run the analysis with its default importance threshold
(fields_to_set_to_false,
 fields_to_set_to_true) = transformations_to_support_analysis()


In [64]:
pd.set_option('display.max_rows', 100)

# Table of the fields to set to True, with their normalised importance
tweaks_to_change = pd.DataFrame({
    "Fields to set to True": fields_to_set_to_true.keys(),
    "Importance": fields_to_set_to_true.values,
})
tweaks_to_change

Unnamed: 0,Fields to set to True,Importance
0,standard_scaler__custom_values___9,0.167547
1,binarizer__custom_values___3,0.142092
2,min_max_scaler__custom_values___1,0.099714
3,standard_scaler__custom_values___8,0.095044
4,standard_scaler__custom_values___0,0.094088
5,min_max_scaler__custom_values___3,0.089687
6,binarizer__custom_values___12,0.077059
7,standard_scaler__custom_values___7,0.059641
8,min_max_scaler__custom_values___0,0.058673


In [65]:
pd.set_option('display.max_rows', 100)

# Table of the fields to set to False, with their normalised importance
tweaks_to_change = pd.DataFrame({
    "Fields to set to False": fields_to_set_to_false.keys(),
    "Importance": fields_to_set_to_false.values,
})
tweaks_to_change

Unnamed: 0,Fields to set to False,Importance
0,binarizer__custom_values___5,0.412933
1,binarizer__custom_values___7,0.226616
2,binarizer__custom_values___4,0.111091
3,binarizer__custom_values___10,0.064508


In [15]:


# Splits positive and negative

# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending new experiments vs. continuing to use the same results for analysis; keep them together for now for review

# When there were 3

Series([], dtype: float32) 
 min_max_scaler__custom_values___4    1.0
dtype: float32


In [16]:
# just weighting based on feature length etc

In [17]:
# Narrow the option list stored at position 2 of a param_distributions entry:
# drop its last option for keys of positive_fields and its first option for
# keys of negative_fields, as long as more than one option remains.
#
# NOTE(review): this cell does not run as written.
# - positive_fields / negative_fields are not defined in the cells visible in
#   this review (presumably left over in the kernel) - confirm.
# - The first loop never uses `key`; every iteration edits the same
#   hard-coded 'standard_scaler__column_indices_to_replace' entry.
# - The second loop reads its choices from param_distributions[key] but
#   writes the result to the hard-coded standard_scaler entry.
# - The recorded run failed with
#   KeyError: 'min_max_scaler__custom_values___4': the field names carry a
#   '___<index>' suffix that the param_distributions keys do not have.
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]

KeyError: 'min_max_scaler__custom_values___4'

In [18]:
# NOTE(review): param_distributions as built earlier in this notebook has no
# such key (its keys end in '__custom_values'); the recorded output of this
# cell is a KeyError.
param_distributions['min_max_scaler__column_indices_to_replace']

KeyError: 'min_max_scaler__column_indices_to_replace'

In [19]:
# always consider all features
#

In [20]:
# features to consider,
# num of features
# Zeroes;
# could
# default distribution ()
# weighting by feature towards up or down

In [None]:
# Show the part of each field name that precedes the first "___"
for key in negative_fields.keys():
    print(key.partition("___")[0])

In [None]:
# Scratch list 1..4 used by the slicing checks below
a = list(range(1, 5))

In [None]:
# Everything except the first element
a[1:]

In [None]:
# Everything except the last element
a[:-1]