In [18]:
%load_ext pycodestyle_magic
# %%pycodestyle


In [50]:
###### import pandas as pd
import numpy as np
import shap
import pandas as pd
from pandasql import sqldf
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston



# Lambdas

def q(x):
    return sqldf(x, globals())

# Randomize given the value options


def randomize_value_choice(value_options):
    """
    The value_options format is either:
        [True, False] - True or False
        [1,2,3,4,5] - 1 to 5
        [[0,1,4],5] - Any subset of [0,1,2,3,4] and 5
            represents the number of features in the dataframe
    """
    value = None
    # Check to make sure there are value options
    if len(value_options) > 0:
        # Make sure it isn't a 2D List
        if type(value_options[0]) != list:
            # Simple choice from the list -
                # In the future this might include a distribution mean+std
            np.random.choice(value_options)
        else:
            value = value_options[0]

            
            # Delete this later:
#         else:
#             # If the option is a 2D array it means we take
#                 # each element with 20% probability
#             value = []
#             for element in value_options[0]:
#                 random_number = np.random.rand(1)[0]
#                 if random_number <= 0.70:
#                     value.append(element)
    return value

# My own version of RandomizedSearchCV


def RandomizedGridSearchCV(n_experiments, pipe,
                           param_distributions, train_X, train_y,
                           test_X, test_y,
                           scoring='neg_mean_squared_error', cv=2):
    train_X = train_X.copy()
    train_y = train_y.copy()
    test_X = test_X.copy()
    test_y = test_y.copy()

    # Transform the param_distributions into four arrays
    key_list = []
    transform_class_list = []
    parameter_name_list = []
    value_options_list = []
    for key, value_options in param_distributions.items():
        class_key, parameter_name = key.split("__")
        transform_class = pipe.named_steps[class_key]
        key_list.append(key)
        transform_class_list.append(transform_class)
        parameter_name_list.append(parameter_name)
        value_options_list.append(value_options)

    # Initialize experiments dictionary
    experiments_info = {}
    for key, transform_class, parameter_name, value_options in \
        zip(key_list, transform_class_list, parameter_name_list,
            value_options_list):
        if type(value_options[0]) != list:
            experiments_info[key] = []          
        else:
            # value_options[1] is the number of features
            for i in range(value_options[1]):
                experiments_info[key + "__" + str(i)] = []
    experiments_info['score'] = []

    # Iterate over the experiments
    for iteration in range(n_experiments):
        print("Iteration: ", iteration)

        # Updates the transform parameters
        for key, transform_class, parameter_name, value_options in \
            zip(key_list, transform_class_list, parameter_name_list,
                value_options_list):

            # Get the random value
            value = randomize_value_choice(value_options)

            # If the value option is a 2D array it is treated differently
            if type(value_options[0]) != list:
                # Save input data for the experiments dataframe output
                experiments_info[key].append(value)             
            else:
                custom_values = {}
                for feature_i in range(value_options[1]):
                    # Choose a random value if that index will be used
                    custom_value = np.random.choice(value_options[2]) if feature_i in list(value) else 0.0
                    
                    # Save input data for the experiments dataframe output
                    experiments_info[key + "__" + str(feature_i)].append(custom_value)
                    
                    # Append to collection to provide custom_values (A parameter for the transformation class)
                    if custom_value != 0.0:
                        custom_values[feature_i] = custom_value
                    
                # Set parameters for the transformation class (typically numeric fields)
                setattr(transform_class, "custom_values", custom_values)
            
            # Set parameters for the transformation class (typically index values)
            setattr(transform_class, parameter_name, value)        

        # Fit
        pipe.fit(train_X, train_y)
        
        # Predict
        pred_y = pipe.predict(test_X)
        
        # Scoring
        if scoring == 'neg_mean_squared_error':
            score = mean_squared_error(pred_y, test_y)
        else:
            raise Exception('Scoring type not implemented')
        
        # Appending the score
        experiments_info["score"].append(score)

    experiments_df = pd.DataFrame(experiments_info)
    return experiments_df

# Standard scaler data preparation class


class StandardScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_indices_to_replace=[]):
        self.standard_scalers = {}
        
    def fit(self, X, y=None):
        for index in self.column_indices_to_replace:
            self.standard_scalers[index] = StandardScaler()
            self.standard_scalers[index].fit(X[:, index:index+1])
        return self

    def transform(self, X, y=None):
        for index in self.column_indices_to_replace:
            X[:, index:index+1] = self.standard_scalers[index].transform(X[:, index:index+1])
        return np.c_[X]

# Min-max scaler data preparation class


class MinMaxScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_indices_to_replace=[]):
        self.min_max_scalers = {}

    def fit(self, X, y=None):
        for index in self.column_indices_to_replace:
            self.min_max_scalers[index] = MinMaxScaler()
            self.min_max_scalers[index].fit(X[:, index:index+1])
        return self

    def transform(self, X, y=None):
        for index in self.column_indices_to_replace:
            X[:, index:index+1] = self.min_max_scalers[index].transform(X[:, index:index+1])
        return np.c_[X]

# Binarizer scaler data preparation class


class BinarizerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_indices_to_replace=[], custom_values={}):
        self.thresholds = {}

    def fit(self, X, y=None):
        for index in self.column_indices_to_replace:
            self.thresholds[index] = np.quantile(X[:, index:index+1], self.custom_values[index])
        return self

    def transform(self, X, y=None):
        for index in self.column_indices_to_replace:
            X[:, index:index+1] = X[:, index:index+1] > self.thresholds[index]
        return np.c_[X]

    
    
# Generic xgboost fit using several grid searches


def get_xgboost_model(train_X, train_y):
    model = Pipeline([('xgb', XGBRegressor())])

    # 1) Tune max depth
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [1, 2, 4, 6, 8],
        'xgb__subsample': [1.00]
    }]
    gs1 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs1 = gs1.fit(train_X, train_y)
    max_depth = gs1.best_params_['xgb__max_depth']
    # print(gs1.best_score_)
    # print(gs1.best_params_)

    # 2) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00]
    }]
    gs2 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs2 = gs2.fit(train_X, train_y)
    subsample = gs2.best_params_['xgb__subsample']
    # print(gs2.best_score_)
    # print(gs2.best_params_)

    # 3) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [50, 100, 150, 200],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs3 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs3 = gs3.fit(train_X, train_y)
    n_estimators = gs3.best_params_['xgb__n_estimators']
    # print(gs3.best_score_)
    # print(gs3.best_params_)

    # 4) Tune learning rate
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    },
        {
        'xgb__n_estimators': [n_estimators * 3],
        'xgb__learning_rate': [0.03],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs4 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs4 = gs4.fit(train_X, train_y)
    n_estimators = gs4.best_params_['xgb__n_estimators']
    learning_rate = gs4.best_params_['xgb__learning_rate']
    # print(gs4.best_score_)
    # print(gs4.best_params_)

    # 5) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [
            int(0.8 * n_estimators),
            int(1.0 * n_estimators),
            int(1.2 * n_estimators)
        ],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs5 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs5 = gs5.fit(train_X, train_y)
    n_estimators = gs5.best_params_['xgb__n_estimators']
    # print(gs5.best_score_)
    # print(gs5.best_params_)

    # 6) Tune sampling by tree
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs6 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs6 = gs6.fit(train_X, train_y)
    colsample_bytree = gs6.best_params_['xgb__colsample_bytree']
    # print(gs6.best_score_)
    # print(gs6.best_params_)

    # 7) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs7 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs7 = gs7.fit(train_X, train_y)
    subsample = gs7.best_params_['xgb__subsample']
    # print(gs7.best_score_)
    # print(gs7.best_params_)

    # 8) Tune sampling by level
    n_estimators = gs7.best_params_['xgb__n_estimators']
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }]
    gs8 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs8 = gs8.fit(train_X, train_y)
    colsample_bylevel = gs8.best_params_['xgb__colsample_bylevel']
    # print(gs8.best_score_)
    # print(gs8.best_params_)

    # 9) Tune sampling fields
    n_estimators = gs8.best_params_['xgb__n_estimators']
    subsample = 0.9 if subsample == 1.0 else subsample
    colsample_bytree = 0.6 if colsample_bytree == 0.5 else colsample_bytree
    colsample_bylevel = 0.9 if colsample_bylevel == 1.0 else colsample_bylevel
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample, subsample + 0.1],
        'xgb__colsample_bytree': [colsample_bytree - 0.1, colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel, colsample_bylevel + 0.1]
    }]
    gs9 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs9 = gs9.fit(train_X, train_y)
    subsample = gs9.best_params_['xgb__subsample']
    colsample_bytree = gs9.best_params_['xgb__colsample_bytree']
    colsample_bylevel = gs9.best_params_['xgb__colsample_bylevel']
    # print(gs9.best_score_)
    # print(gs9.best_params_)

    # 10) Tune alpha
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel],
        'xgb__reg_lambda': [0.001, 0.01, 0.1, 0.3, 1, 3, 10, 100, 1000]
    }]
    gs10 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs10 = gs10.fit(train_X, train_y)
    # print(gs10.best_score_)
    # print(gs10.best_params_)

    # Find the best model
    # Sometimes the best model isn't the last one, so checking all of them
    best_model = gs1
    best_model_score = gs1.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs3.best_score_ > best_model_score:
        best_model = gs3
        best_model_score = gs3.best_score_
    if gs4.best_score_ > best_model_score:
        best_model = gs4
        best_model_score = gs4.best_score_
    if gs5.best_score_ > best_model_score:
        best_model = gs5
        best_model_score = gs5.best_score_
    if gs6.best_score_ > best_model_score:
        best_model = gs6
        best_model_score = gs6.best_score_
    if gs7.best_score_ > best_model_score:
        best_model = gs7
        best_model_score = gs7.best_score_
    if gs8.best_score_ > best_model_score:
        best_model = gs8
        best_model_score = gs8.best_score_
    if gs9.best_score_ > best_model_score:
        best_model = gs9
        best_model_score = gs9.best_score_
    if gs10.best_score_ > best_model_score:
        best_model = gs10
        best_model_score = gs10.best_score_

    # Return the best model
    return XGBRegressor(**best_model.best_params_)

In [51]:
# Example dataset
boston_data = load_boston()

# Extract pandas dataframe and target
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=42)
train_X, test_X = train_X, test_X
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# An okay model fit to the data
try:
    xgb_model
except:
    xgb_model = get_xgboost_model(train_X, train_y)
# linear_regression = LinearRegression(normalize=False)

# Pipeline
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
                 ('min_max_scaler', MinMaxScalerTransform()),
                 ('binarizer', BinarizerTransform()),
                 ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Value options
binary_value_options = [1.0]
small_value_options = [0.16, 0.33, 0.5, 0.66, 0.83]

# Possible configurations
param_distributions = {
    'standard_scaler__column_indices_to_replace': [indices, num_features, binary_value_options],
    'min_max_scaler__column_indices_to_replace': [indices, num_features, binary_value_options],
    'binarizer__column_indices_to_replace': [indices, num_features, small_value_options],
}

# Randomly search the space n_iter times
experiments_df = RandomizedGridSearchCV(
    n_experiments=10,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error',
    cv=2)

# Sort the scores
experiments_df.sort_values(by=['score'], ascending=False, inplace=True)

# Drop score
experiments_X_df = experiments_df.drop(['score'], axis=1)

# Get column names
X_column_names = experiments_X_df.columns

# Convert to numpy
experiments_X = experiments_X_df.values
experiments_y = experiments_df[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe
pandas_shap_df = pd.DataFrame(shap_values, columns=X_column_names)
pandas_shap_df


Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9


Unnamed: 0,standard_scaler__column_indices_to_replace__0,standard_scaler__column_indices_to_replace__1,standard_scaler__column_indices_to_replace__2,standard_scaler__column_indices_to_replace__3,standard_scaler__column_indices_to_replace__4,standard_scaler__column_indices_to_replace__5,standard_scaler__column_indices_to_replace__6,standard_scaler__column_indices_to_replace__7,standard_scaler__column_indices_to_replace__8,standard_scaler__column_indices_to_replace__9,standard_scaler__column_indices_to_replace__10,standard_scaler__column_indices_to_replace__11,standard_scaler__column_indices_to_replace__12,min_max_scaler__column_indices_to_replace__0,min_max_scaler__column_indices_to_replace__1,min_max_scaler__column_indices_to_replace__2,min_max_scaler__column_indices_to_replace__3,min_max_scaler__column_indices_to_replace__4,min_max_scaler__column_indices_to_replace__5,min_max_scaler__column_indices_to_replace__6,min_max_scaler__column_indices_to_replace__7,min_max_scaler__column_indices_to_replace__8,min_max_scaler__column_indices_to_replace__9,min_max_scaler__column_indices_to_replace__10,min_max_scaler__column_indices_to_replace__11,min_max_scaler__column_indices_to_replace__12,binarizer__column_indices_to_replace__0,binarizer__column_indices_to_replace__1,binarizer__column_indices_to_replace__2,binarizer__column_indices_to_replace__3,binarizer__column_indices_to_replace__4,binarizer__column_indices_to_replace__5,binarizer__column_indices_to_replace__6,binarizer__column_indices_to_replace__7,binarizer__column_indices_to_replace__8,binarizer__column_indices_to_replace__9,binarizer__column_indices_to_replace__10,binarizer__column_indices_to_replace__11,binarizer__column_indices_to_replace__12
0,-0.031388,0.434092,0.002751,-0.083869,-0.012671,-0.003476,-6.5e-05,0.411042,-0.006389,0.0,0.0,0.032942,0.029878,0.0,0.0,-0.015014,0.512023,0.0,0.0,0.0,0.05856,0.0,0.0,0.0,-0.001031,0.0,1.242635,0.0,0.0,0.0,-0.009965,2.402712,9.494497,0.001348,0.489883,0.0,-0.003377,2.278624,-0.010813
1,0.09712,0.434092,0.00283,0.020967,-0.023252,-0.011136,-0.000101,0.421635,-0.00355,0.0,0.0,-0.161777,0.016599,0.0,0.0,0.025159,0.523997,0.0,0.0,0.0,-0.100554,0.0,0.0,0.0,0.000741,0.0,1.242635,0.0,0.0,0.0,0.060018,2.301387,9.799907,0.002975,0.811353,0.0,-0.007197,-0.479122,-0.010813
2,0.111048,0.487125,0.002576,-0.083869,0.213685,0.041952,0.000203,0.488879,0.02356,0.0,0.0,-0.161777,0.073449,0.0,0.0,0.040117,0.596158,0.0,0.0,0.0,0.138152,0.0,0.0,0.0,0.000342,0.0,1.5331,0.0,0.0,0.0,-0.00969,2.402405,0.359319,0.001276,0.809411,0.0,0.013312,0.075673,0.041556
3,-0.20518,0.191009,0.006635,-0.083869,-0.026711,0.025183,0.000203,0.476704,-0.015707,0.0,0.0,0.078459,0.073449,0.0,0.0,-0.035758,0.590885,0.0,0.0,0.0,0.129435,0.0,0.0,0.0,0.000741,0.0,1.365514,0.0,0.0,0.0,-0.017772,0.466945,-3.035956,-0.001701,0.637821,0.0,-0.006531,0.041624,-0.024934
4,-0.048826,0.199585,-0.010084,0.062901,-0.023118,-0.004302,-0.000101,0.267903,0.02356,0.0,0.0,0.08575,0.073449,0.0,0.0,0.040117,0.59765,0.0,0.0,0.0,0.146642,0.0,0.0,0.0,0.000741,0.0,1.365514,0.0,0.0,0.0,0.070202,-1.520629,-3.019686,0.002975,-0.577578,0.0,0.018796,0.042196,0.054241
5,0.025907,0.144813,0.006635,0.062901,-0.028361,-0.013511,-0.000416,-0.678083,-0.005768,0.0,0.0,0.08575,0.026973,0.0,0.0,-0.020978,0.556965,0.0,0.0,0.0,-0.217869,0.0,0.0,0.0,-0.002222,0.0,1.365514,0.0,0.0,0.0,-0.021729,-0.933213,-3.197255,-0.003967,0.386443,0.0,-0.005145,-0.804163,-0.024934
6,-0.048826,-0.460531,0.007149,0.062901,-0.02715,-0.006439,0.000203,0.196527,0.008652,0.0,0.0,0.08575,0.026973,0.0,0.0,-0.027858,-0.849073,0.0,0.0,0.0,-0.251973,0.0,0.0,0.0,-0.002222,0.0,1.365514,0.0,0.0,0.0,-0.021729,-1.522538,-2.665477,0.002975,-0.586696,0.0,-0.009106,0.003764,-0.015286
7,0.025907,0.144813,0.007149,-0.083869,-0.028178,-0.011986,0.000833,-1.655551,-0.015707,0.0,0.0,0.08575,-0.110173,0.0,0.0,0.014752,-0.849861,0.0,0.0,0.0,0.089109,0.0,0.0,0.0,0.000741,0.0,1.365514,0.0,0.0,0.0,-0.021729,-0.126198,-3.116265,0.002975,-0.659476,0.0,0.007055,0.005793,-0.024934
8,0.025907,-1.340303,0.00309,0.062901,-0.026839,-0.013385,-0.000101,0.205548,-0.015707,0.0,0.0,0.039647,-0.110173,0.0,0.0,-0.027858,-0.806027,0.0,0.0,0.0,0.093753,0.0,0.0,0.0,0.000741,0.0,-5.609564,0.0,0.0,0.0,-0.021729,-1.522538,-1.422105,-0.003967,-0.652864,0.0,-0.009106,-0.754394,0.025477
9,-0.048826,0.144813,-0.027862,-0.251606,-0.028312,-0.00732,-0.000416,-0.690828,0.008652,0.0,0.0,-0.428752,0.026973,0.0,0.0,0.014752,-0.807957,0.0,0.0,0.0,-0.233399,0.0,0.0,0.0,0.000741,0.0,-5.609564,0.0,0.0,0.0,-0.021729,-1.038049,-2.575473,-0.003967,-0.607891,0.0,-0.005145,-0.766139,-0.032544


In [52]:
xgb_experiments_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgb__colsample_bylevel=1.0,
       xgb__colsample_bytree=0.9, xgb__learning_rate=0.1, xgb__max_depth=1,
       xgb__n_estimators=240, xgb__subsample=1.0)

In [53]:
pd.set_option('display.max_columns', 200)

experiments_df

Unnamed: 0,standard_scaler__column_indices_to_replace__0,standard_scaler__column_indices_to_replace__1,standard_scaler__column_indices_to_replace__2,standard_scaler__column_indices_to_replace__3,standard_scaler__column_indices_to_replace__4,standard_scaler__column_indices_to_replace__5,standard_scaler__column_indices_to_replace__6,standard_scaler__column_indices_to_replace__7,standard_scaler__column_indices_to_replace__8,standard_scaler__column_indices_to_replace__9,standard_scaler__column_indices_to_replace__10,standard_scaler__column_indices_to_replace__11,standard_scaler__column_indices_to_replace__12,min_max_scaler__column_indices_to_replace__0,min_max_scaler__column_indices_to_replace__1,min_max_scaler__column_indices_to_replace__2,min_max_scaler__column_indices_to_replace__3,min_max_scaler__column_indices_to_replace__4,min_max_scaler__column_indices_to_replace__5,min_max_scaler__column_indices_to_replace__6,min_max_scaler__column_indices_to_replace__7,min_max_scaler__column_indices_to_replace__8,min_max_scaler__column_indices_to_replace__9,min_max_scaler__column_indices_to_replace__10,min_max_scaler__column_indices_to_replace__11,min_max_scaler__column_indices_to_replace__12,binarizer__column_indices_to_replace__0,binarizer__column_indices_to_replace__1,binarizer__column_indices_to_replace__2,binarizer__column_indices_to_replace__3,binarizer__column_indices_to_replace__4,binarizer__column_indices_to_replace__5,binarizer__column_indices_to_replace__6,binarizer__column_indices_to_replace__7,binarizer__column_indices_to_replace__8,binarizer__column_indices_to_replace__9,binarizer__column_indices_to_replace__10,binarizer__column_indices_to_replace__11,binarizer__column_indices_to_replace__12,score
5,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.5,0.83,0.16,0.5,0.66,0.33,0.83,0.33,0.16,0.83,0.5,0.66,0.33,38.231993
7,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.83,0.83,0.5,0.83,0.16,0.83,0.66,0.66,0.16,0.83,0.33,0.16,35.821404
8,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.5,0.33,0.33,0.5,0.33,0.16,0.66,0.66,0.66,0.33,0.0,0.0,0.66,28.110204
2,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.16,0.33,0.0,0.0,0.0,0.5,0.0,0.0,0.66,0.83,0.83,0.0,0.33,21.495189
1,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.33,0.16,0.0,0.0,0.83,0.0,0.33,0.16,0.0,0.0,0.0,0.0,0.66,18.714723
9,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.5,0.5,0.83,0.66,0.33,0.0,0.0,0.0,0.5,0.0,0.16,0.16,0.33,17.542804
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.33,0.16,0.16,0.0,0.33,0.0,0.16,0.66,0.0,0.5,0.5,0.0,0.5,16.109253
6,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.66,0.83,0.33,0.66,0.33,0.83,0.16,0.66,0.0,0.66,0.0,0.0,0.0,15.90409
0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.33,0.0,0.16,0.5,0.0,0.66,0.0,0.0,0.33,0.5,0.16,0.66,8.971616
4,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.5,0.0,0.83,0.0,0.0,0.5,0.0,0.0,0.5,0.66,0.5,0.0,7.848513


In [54]:
# Transformation to polarized groups of shap values
polarized_df = pandas_shap_df.copy()
for i in range(0, len(pandas_shap_df.index)):
    for j in range(0, len(pandas_shap_df.columns)):
        if not experiments_df.iloc[i, j]:
            polarized_df.iloc[i, j] = -1 * pandas_shap_df.iloc[i, j]
polarized_df


Unnamed: 0,standard_scaler__column_indices_to_replace__0,standard_scaler__column_indices_to_replace__1,standard_scaler__column_indices_to_replace__2,standard_scaler__column_indices_to_replace__3,standard_scaler__column_indices_to_replace__4,standard_scaler__column_indices_to_replace__5,standard_scaler__column_indices_to_replace__6,standard_scaler__column_indices_to_replace__7,standard_scaler__column_indices_to_replace__8,standard_scaler__column_indices_to_replace__9,standard_scaler__column_indices_to_replace__10,standard_scaler__column_indices_to_replace__11,standard_scaler__column_indices_to_replace__12,min_max_scaler__column_indices_to_replace__0,min_max_scaler__column_indices_to_replace__1,min_max_scaler__column_indices_to_replace__2,min_max_scaler__column_indices_to_replace__3,min_max_scaler__column_indices_to_replace__4,min_max_scaler__column_indices_to_replace__5,min_max_scaler__column_indices_to_replace__6,min_max_scaler__column_indices_to_replace__7,min_max_scaler__column_indices_to_replace__8,min_max_scaler__column_indices_to_replace__9,min_max_scaler__column_indices_to_replace__10,min_max_scaler__column_indices_to_replace__11,min_max_scaler__column_indices_to_replace__12,binarizer__column_indices_to_replace__0,binarizer__column_indices_to_replace__1,binarizer__column_indices_to_replace__2,binarizer__column_indices_to_replace__3,binarizer__column_indices_to_replace__4,binarizer__column_indices_to_replace__5,binarizer__column_indices_to_replace__6,binarizer__column_indices_to_replace__7,binarizer__column_indices_to_replace__8,binarizer__column_indices_to_replace__9,binarizer__column_indices_to_replace__10,binarizer__column_indices_to_replace__11,binarizer__column_indices_to_replace__12
0,-0.031388,0.434092,0.002751,0.083869,-0.012671,-0.003476,6.5e-05,0.411042,-0.006389,0.0,0.0,0.032942,0.029878,0.0,0.0,-0.015014,0.512023,-0.0,0.0,0.0,-0.05856,0.0,0.0,-0.0,0.001031,0.0,1.242635,0.0,0.0,0.0,-0.009965,2.402712,9.494497,0.001348,0.489883,0.0,-0.003377,2.278624,-0.010813
1,-0.09712,0.434092,0.00283,0.020967,-0.023252,-0.011136,0.000101,0.421635,-0.00355,0.0,0.0,0.161777,0.016599,0.0,0.0,-0.025159,0.523997,-0.0,-0.0,-0.0,-0.100554,0.0,0.0,0.0,0.000741,0.0,1.242635,0.0,0.0,0.0,0.060018,2.301387,9.799907,0.002975,0.811353,0.0,-0.007197,-0.479122,-0.010813
2,-0.111048,0.487125,0.002576,0.083869,-0.213685,-0.041952,0.000203,0.488879,-0.02356,0.0,0.0,0.161777,0.073449,0.0,0.0,-0.040117,0.596158,0.0,0.0,0.0,-0.138152,0.0,0.0,0.0,0.000342,-0.0,1.5331,0.0,0.0,0.0,-0.00969,2.402405,0.359319,0.001276,0.809411,0.0,-0.013312,-0.075673,0.041556
3,-0.20518,0.191009,0.006635,0.083869,-0.026711,-0.025183,0.000203,0.476704,-0.015707,0.0,0.0,0.078459,0.073449,0.0,0.0,-0.035758,0.590885,-0.0,0.0,0.0,-0.129435,0.0,0.0,0.0,0.000741,-0.0,1.365514,0.0,-0.0,-0.0,0.017772,0.466945,3.035956,0.001701,0.637821,0.0,-0.006531,-0.041624,-0.024934
4,-0.048826,0.199585,0.010084,0.062901,-0.023118,-0.004302,0.000101,0.267903,-0.02356,0.0,0.0,0.08575,0.073449,0.0,0.0,-0.040117,0.59765,0.0,0.0,0.0,-0.146642,0.0,-0.0,0.0,0.000741,0.0,1.365514,0.0,-0.0,-0.0,0.070202,1.520629,-3.019686,0.002975,0.577578,-0.0,-0.018796,-0.042196,0.054241
5,-0.025907,0.144813,0.006635,0.062901,-0.028361,-0.013511,0.000416,0.678083,-0.005768,0.0,0.0,0.08575,0.026973,0.0,0.0,-0.020978,0.556965,-0.0,0.0,-0.0,-0.217869,-0.0,0.0,0.0,0.002222,0.0,1.365514,0.0,0.0,0.0,-0.021729,0.933213,3.197255,0.003967,0.386443,-0.0,-0.005145,-0.804163,-0.024934
6,-0.048826,0.460531,0.007149,0.062901,-0.02715,-0.006439,0.000203,0.196527,-0.008652,-0.0,0.0,0.08575,0.026973,0.0,0.0,-0.027858,0.849073,0.0,0.0,0.0,-0.251973,0.0,0.0,0.0,0.002222,0.0,1.365514,0.0,0.0,-0.0,-0.021729,1.522538,-2.665477,0.002975,0.586696,0.0,-0.009106,-0.003764,-0.015286
7,-0.025907,0.144813,0.007149,0.083869,-0.028178,-0.011986,0.000833,1.655551,-0.015707,0.0,0.0,0.08575,0.110173,0.0,-0.0,-0.014752,0.849861,-0.0,0.0,0.0,-0.089109,-0.0,0.0,0.0,0.000741,0.0,1.365514,0.0,0.0,0.0,-0.021729,-0.126198,-3.116265,0.002975,0.659476,0.0,-0.007055,-0.005793,0.024934
8,-0.025907,1.340303,0.00309,0.062901,-0.026839,-0.013385,0.000101,0.205548,-0.015707,-0.0,0.0,0.039647,0.110173,0.0,0.0,-0.027858,0.806027,0.0,0.0,0.0,-0.093753,0.0,0.0,0.0,0.000741,0.0,5.609564,0.0,-0.0,0.0,-0.021729,1.522538,-1.422105,0.003967,0.652864,0.0,-0.009106,-0.754394,0.025477
9,-0.048826,0.144813,0.027862,0.251606,-0.028312,-0.00732,0.000416,0.690828,-0.008652,0.0,0.0,0.428752,0.026973,-0.0,0.0,-0.014752,0.807957,0.0,0.0,0.0,-0.233399,0.0,-0.0,-0.0,0.000741,0.0,5.609564,0.0,-0.0,0.0,0.021729,1.038049,-2.575473,0.003967,0.607891,0.0,-0.005145,-0.766139,0.032544


In [55]:
# Certainly, I set a feature to True for large positive values
# Also, I set a feature to False for large negative values
# Otherwise, it is set to True or False
polarized_shap_result = polarized_df.sum()
polarized_shap_result.sort_values()


min_max_scaler__column_indices_to_replace__7      -1.459448
binarizer__column_indices_to_replace__11          -0.694243
standard_scaler__column_indices_to_replace__0     -0.668938
standard_scaler__column_indices_to_replace__4     -0.438277
min_max_scaler__column_indices_to_replace__2      -0.262363
standard_scaler__column_indices_to_replace__5     -0.138691
standard_scaler__column_indices_to_replace__8     -0.127253
binarizer__column_indices_to_replace__10          -0.084769
binarizer__column_indices_to_replace__1            0.000000
binarizer__column_indices_to_replace__3            0.000000
min_max_scaler__column_indices_to_replace__12      0.000000
min_max_scaler__column_indices_to_replace__10      0.000000
min_max_scaler__column_indices_to_replace__9       0.000000
min_max_scaler__column_indices_to_replace__8       0.000000
binarizer__column_indices_to_replace__9            0.000000
binarizer__column_indices_to_replace__2            0.000000
min_max_scaler__column_indices_to_replac

In [56]:
# Splits positive and negative

positive_fields = polarized_shap_result[polarized_shap_result>0]
positive_fields = positive_fields / positive_fields.sum()
negative_fields = polarized_shap_result[polarized_shap_result<0]
negative_fields = negative_fields / negative_fields.sum()
positive_fields = positive_fields[positive_fields>0.05]
negative_fields = negative_fields[negative_fields>0.05]

# Each iteration, find anything above % number

In [57]:
print(positive_fields, '\n', negative_fields)

# Splits positive and negative


# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending experiments vs continue to use the same results for analysis ; keep together for now for review

# When there were 3

standard_scaler__column_indices_to_replace__1    0.053461
standard_scaler__column_indices_to_replace__7    0.073759
min_max_scaler__column_indices_to_replace__3     0.089845
binarizer__column_indices_to_replace__0          0.296302
binarizer__column_indices_to_replace__5          0.187788
binarizer__column_indices_to_replace__6          0.175752
binarizer__column_indices_to_replace__8          0.083518
dtype: float32 
 standard_scaler__column_indices_to_replace__0    0.172674
standard_scaler__column_indices_to_replace__4    0.113134
min_max_scaler__column_indices_to_replace__2     0.067724
min_max_scaler__column_indices_to_replace__7     0.376731
binarizer__column_indices_to_replace__11         0.179207
dtype: float32


In [58]:
positive_fields.keys()

Index(['standard_scaler__column_indices_to_replace__1',
       'standard_scaler__column_indices_to_replace__7',
       'min_max_scaler__column_indices_to_replace__3',
       'binarizer__column_indices_to_replace__0',
       'binarizer__column_indices_to_replace__5',
       'binarizer__column_indices_to_replace__6',
       'binarizer__column_indices_to_replace__8'],
      dtype='object')

In [60]:
if len(param_distributions['standard_scaler__column_indices_to_replace'][2]) > 1:
    

In [61]:
param_distributions['standard_scaler__column_indices_to_replace']