## Imports
- Place `%%pycodestyle` at the top of any cell to check that cell's Python code against the PEP 8 style guide

In [32]:
%load_ext pycodestyle_magic

# Common imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap

# Import from my GitHub
from getxgboostmodel.getxgboostmodel import get_xgboost_model
from randomizedgridsearch.randomizedgridsearch import RandomizedGridSearch
from transformers.transformers import *


The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


## Initialize

In [68]:
# Load the example dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably needs an older scikit-learn release - confirm the pinned version.
boston_data = load_boston()

# Round-trip through pandas to obtain independent NumPy copies of the
# feature matrix and the target
features_frame = pd.DataFrame(boston_data['data'])
target_frame = pd.DataFrame(boston_data['target'])
X = features_frame.copy().values
y = target_frame.copy().values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
train_X, test_X, train_y, test_y = split_parts

# Make sure both targets are explicit single-column arrays
train_y = train_y.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)

# An okay model fit to the data.
# Reuse the model left over from an earlier run of this cell when there is
# one; get_xgboost_model is only called when the name does not exist yet.
try:
    xgb_model
except NameError:
    # Only an undefined name means "first run"; any other error should surface
    xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline: three transform steps followed by the model. The step names are
# the keys used by param_distributions further down.
pipe = Pipeline([
    ('standard_scaler', StandardScalerTransform()),
    ('min_max_scaler', MinMaxScalerTransform()),
    ('binarizer', BinarizerTransform()),
    ('model', xgb_model),
])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Default: every toggle starts out undecided
default_values = [None] * num_features

# Possible configurations [None, True, or False] - None means not decided yet.
# Every step gets its OWN copy of the defaults: the experiment loop locks a
# toggle by assigning into these lists in place, so sharing one list object
# between the keys would lock the same feature index for all three steps.
param_distributions = {
    'standard_scaler': list(default_values),
    'min_max_scaler': list(default_values),
    'binarizer': list(default_values),
}

# Accumulates the results of every experiment batch
experiments_results = pd.DataFrame()

## Run the Experiments

In [69]:
for iteration in range(20):
    # Run one RandomizedGridSearch batch (n_experiments=100) using the
    # current state of param_distributions
    experiments_results_temp = RandomizedGridSearch(
        n_experiments=100,
        pipe=pipe,
        param_distributions=param_distributions,
        train_X=train_X,
        train_y=train_y,
        test_X=test_X,
        test_y=test_y,
        scoring='neg_mean_squared_error')

    # Append to experiment results. DataFrame.append was removed in
    # pandas 2.0; pd.concat does the same job on old and new versions.
    experiments_results = pd.concat(
        [experiments_results, experiments_results_temp])

    # Drop score so only the toggle columns remain
    experiments_X_df = experiments_results.drop(['score'], axis=1)

    # Get column names
    X_column_names = experiments_X_df.columns

    # Convert to numpy
    experiments_X = experiments_X_df.values
    experiments_y = experiments_results[['score']].values

    # Create an XGBoost model tuned with the experiments data: always on the
    # first pass (the name does not exist yet), afterwards only every 7th
    # iteration. Only NameError is caught so that a failure inside
    # get_xgboost_model is reported instead of being silently retried.
    try:
        xgb_experiments_model
    except NameError:
        retune_model = True
    else:
        # Tune hyperparameters every once in a while
        retune_model = iteration % 7 == 6
    if retune_model:
        xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

    # Fit the model on everything collected so far
    xgb_experiments_model.fit(experiments_X_df, experiments_y)

    # Extract shap values
    explainer = shap.TreeExplainer(xgb_experiments_model)
    shap_values = explainer.shap_values(experiments_X_df)

    # Shap as dataframe (fresh positional index, one row per experiment)
    shap_values_of_experiments = pd.DataFrame(shap_values, columns=X_column_names)
    shap_values_of_experiments['score'] = experiments_y
    # Function to support analysis
    def find_significance_from_experiments_results(
            importance_threshold=0.05, max_toggles_to_lock_per_series=5,
            shap_df=None, results_df=None):
        """Pick the toggles whose SHAP contribution is large enough to lock.

        For every toggle column the SHAP values are summed over all
        experiments, with the sign flipped on rows where the toggle was
        falsy. The positive and negative sums are then normalised by the
        total absolute sum, so each value is that toggle's share of it.

        NOTE(review): a positive sum is reported as "set to False", which
        treats a higher score as worse - confirm this against the score
        convention of RandomizedGridSearch.

        Args:
            importance_threshold: minimum share a toggle needs to be returned.
            max_toggles_to_lock_per_series: maximum entries per returned
                Series.
            shap_df: SHAP values per experiment plus a 'score' column.
                Defaults to the enclosing ``shap_values_of_experiments``.
            results_df: toggle values per experiment, with the same toggle
                column labels and row order as ``shap_df``. Defaults to the
                enclosing ``experiments_results``.

        Returns:
            ``(options_to_set_to_false, options_to_set_to_true)``: two Series
            indexed by toggle column name, holding shares in descending
            order.
        """
        if shap_df is None:
            shap_df = shap_values_of_experiments
        if results_df is None:
            results_df = experiments_results

        # Work on the toggle columns only. Looking the toggles up by label
        # keeps the 'score' column out of the computation regardless of
        # where it sits (the old positional loop also visited it and could
        # index past the last toggle column).
        toggle_shap = shap_df.drop(['score'], axis=1)
        toggle_is_on = results_df[toggle_shap.columns].astype(bool).values

        # Keep the SHAP value where the toggle was on, negate it where it
        # was off. Rows are matched by position, as results_df may carry a
        # repeated index.
        signed_shap = toggle_shap.where(toggle_is_on, -toggle_shap)
        net_effect = signed_shap.sum().sort_values()

        options_to_set_to_false = net_effect[net_effect > 0]
        options_to_set_to_true = net_effect[net_effect < 0].abs()

        # Normalise both groups by the same total so the shares are comparable
        total_effect = options_to_set_to_false.sum() + options_to_set_to_true.sum()
        options_to_set_to_false = options_to_set_to_false / total_effect
        options_to_set_to_true = options_to_set_to_true / total_effect

        # The ascending sort above leaves the "true" group largest-first
        # after abs(); the "false" group needs an explicit descending sort.
        options_to_set_to_false = options_to_set_to_false[
            options_to_set_to_false > importance_threshold].sort_values(ascending=False)
        options_to_set_to_true = options_to_set_to_true[
            options_to_set_to_true > importance_threshold]

        return (options_to_set_to_false.iloc[:max_toggles_to_lock_per_series],
                options_to_set_to_true.iloc[:max_toggles_to_lock_per_series])

    # Call function
    options_to_set_to_false, options_to_set_to_true = find_significance_from_experiments_results()

    # Make the set to true DF
    options_to_set_to_true_df = pd.DataFrame()
    transformation, value = None, None
    try:
        transformation_and_value = options_to_set_to_true.keys()
    except:
        transformation, value = [], []
    if len(transformation_and_value) > 0:
        options_to_set_to_true_df["transformation"] = [x.split("__")[0] for x in transformation_and_value]
        options_to_set_to_true_df["value"] = [x.split("__")[1] for x in transformation_and_value]
    else:
        options_to_set_to_true_df["transformation"] = []
        options_to_set_to_true_df["value"] = []
    options_to_set_to_true_df["significance"] = options_to_set_to_true.values

    # Make the false DF
    options_to_set_to_false_df = pd.DataFrame()
    transformation, value = None, None
    try:
        transformation_and_value = options_to_set_to_false.keys()
    except:
        transformation, value = [], []
    if len(transformation_and_value) > 0:
        options_to_set_to_false_df["transformation"] = [x.split("__")[0] for x in transformation_and_value]
        options_to_set_to_false_df["value"] = [x.split("__")[1] for x in transformation_and_value]
    else:
        options_to_set_to_false_df["transformation"] = []
        options_to_set_to_false_df["value"] = []
    options_to_set_to_false_df["significance"] = options_to_set_to_false.values

    # Set to True
    for index, row in options_to_set_to_true_df.iterrows():
        param_distributions[row['transformation']][int(row['value'])] = True

    # Set to False
    for index, row in options_to_set_to_false_df.iterrows():
        param_distributions[row['transformation']][int(row['value'])] = False        



print("Done")

100%|██████████| 100/100 [00:04<00:00, 22.57it/s]
100%|██████████| 100/100 [00:04<00:00, 22.42it/s]
100%|██████████| 100/100 [00:04<00:00, 21.20it/s]
100%|██████████| 100/100 [00:04<00:00, 22.38it/s]
100%|██████████| 100/100 [00:04<00:00, 21.70it/s]


Done


## Final Results

In [70]:
# Display the results accumulated over every experiment batch
experiments_results

Unnamed: 0,standard_scaler__0,standard_scaler__1,standard_scaler__2,standard_scaler__3,standard_scaler__4,standard_scaler__5,standard_scaler__6,standard_scaler__7,standard_scaler__8,standard_scaler__9,standard_scaler__10,standard_scaler__11,standard_scaler__12,min_max_scaler__0,min_max_scaler__1,min_max_scaler__2,min_max_scaler__3,min_max_scaler__4,min_max_scaler__5,min_max_scaler__6,min_max_scaler__7,min_max_scaler__8,min_max_scaler__9,min_max_scaler__10,min_max_scaler__11,min_max_scaler__12,binarizer__0,binarizer__1,binarizer__2,binarizer__3,binarizer__4,binarizer__5,binarizer__6,binarizer__7,binarizer__8,binarizer__9,binarizer__10,binarizer__11,binarizer__12,score
75,True,False,True,False,True,False,False,False,True,False,False,True,True,False,True,True,False,True,False,False,True,False,True,False,True,False,False,True,True,False,False,True,True,True,False,False,True,False,True,24.953231
2,True,True,True,True,True,False,False,True,True,False,False,True,True,False,True,True,True,True,True,True,True,False,False,True,False,True,True,False,True,False,True,True,True,True,False,True,True,False,False,24.823891
40,True,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,True,True,False,True,False,True,True,False,True,False,True,24.565076
49,False,True,True,False,False,False,True,False,False,True,True,False,True,True,True,False,False,True,True,True,False,True,True,False,False,True,True,False,True,False,True,True,False,True,True,True,True,True,False,23.094115
68,False,False,True,True,True,False,False,True,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,True,False,True,False,False,False,True,True,False,True,True,True,True,True,True,23.010342
17,False,False,True,False,False,False,True,False,True,True,True,False,False,False,False,True,True,False,False,False,False,False,False,True,False,True,True,True,False,True,True,True,True,True,False,False,False,False,True,21.983834
85,True,False,True,False,False,False,False,True,True,False,True,False,False,True,True,True,True,False,False,True,True,True,True,True,False,True,False,True,True,False,True,True,True,True,False,True,False,False,True,21.856153
31,True,False,False,True,False,False,False,True,False,False,True,True,True,False,True,False,False,False,False,True,False,True,True,True,False,True,False,False,True,True,True,True,True,True,False,True,False,False,True,21.569542
20,False,False,True,False,False,True,True,False,True,False,False,True,False,True,False,True,False,True,True,True,True,True,False,True,False,False,False,True,True,False,False,True,False,False,False,True,True,True,True,21.459317
54,True,True,False,True,True,False,True,True,True,False,False,True,False,False,False,False,False,True,False,True,True,True,False,False,False,True,True,False,True,False,True,True,True,True,True,False,False,False,True,21.400177
