In [74]:
%load_ext pycodestyle_magic
# %%pycodestyle


In [57]:
###### import pandas as pd
import numpy as np
import shap
import pandas as pd
from pandasql import sqldf
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston

# Lambdas


def q(x):
    return sqldf(x, globals())


# My own version of RandomizedSearchCV


def RandomizedGridSearchCV(n_experiments,
                           pipe,
                           param_distributions,
                           train_X,
                           train_y,
                           test_X,
                           test_y,
                           scoring='neg_mean_squared_error',
                           cv=2):
    # Copy data
    train_X, train_y = train_X.copy(), train_y.copy()
    test_X, test_y = test_X.copy(), test_y.copy()

    # Transform the param_distributions into four arrays
    key_list, transform_class_list = [], []
    parameter_name_list, features_list = [], []
    for key, features in param_distributions.items():
        class_key, parameter_name = key.split("__")
        transform_class = pipe.named_steps[class_key]
        key_list.append(key)
        transform_class_list.append(transform_class)
        parameter_name_list.append(parameter_name)
        features_list.append(features)

    # Initialize experiments dictionary
    experiments_info = {}
    for key, transform_class, parameter_name, features in \
        zip(key_list, transform_class_list, parameter_name_list,
            features_list):
        for i in range(len(features)):
            experiments_info[key + "___" + str(i)] = []
    experiments_info['score'] = []

    # Iterate over the experiments
    for iteration in range(n_experiments):
        print("Iteration: ", iteration)

        # Updates the transform parameters
        for key, transform_class, parameter_name, features in \
            zip(key_list, transform_class_list, parameter_name_list,
                features_list):

            # Copy features
            features = features.copy()

            # Loop over features
            for feature_i in range(len(features)):

                # Replace features
                features[feature_i] = np.random.choice([
                    True, False
                ]) if features[feature_i] == None else features[feature_i]

                # Save input data for the experiments dataframe output
                experiments_info[key + "___" +
                                 str(feature_i)].append(features[feature_i])

            # Set parameters for the transformation class (typically numeric fields)
            setattr(transform_class, "features", features)

        # Fit
        pipe.fit(train_X, train_y)

        # Predict
        pred_y = pipe.predict(test_X)

        # Scoring
        if scoring == 'neg_mean_squared_error':
            score = mean_squared_error(pred_y, test_y)
        else:
            raise Exception('Scoring type not implemented')

        # Appending the score
        experiments_info["score"].append(score)

    experiments_df = pd.DataFrame(experiments_info)
    return experiments_df


# Standard scaler data preparation class


class StandardScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, features=[]):
        self.standard_scalers = {}

    def fit(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] > 0.5:
                self.standard_scalers[index] = StandardScaler()
                self.standard_scalers[index].fit(X[:, index:index + 1])
        return self

    def transform(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] > 0.5:
                X[:, index:index + 1] = self.standard_scalers[index].transform(
                    X[:, index:index + 1])
        return np.c_[X]


# Min-max scaler data preparation class


class MinMaxScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, features=[]):
        self.min_max_scalers = {}

    def fit(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] > 0.5:
                self.min_max_scalers[index] = MinMaxScaler()
                self.min_max_scalers[index].fit(X[:, index:index + 1])
        return self

    def transform(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] > 0.5:
                X[:, index:index + 1] = self.min_max_scalers[index].transform(
                    X[:, index:index + 1])
        return np.c_[X]


# Binarizer scaler data preparation class


class BinarizerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, features={}):
        self.thresholds = {}

    def fit(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] == True:
                self.thresholds[index] = np.quantile(X[:, index:index + 1],
                                                 0.50)
        return self

    def transform(self, X, y=None):
        for index in range(len(self.features)):
            if self.features[index] == True:
                X[:, index:index +
                  1] = X[:, index:index + 1] > self.thresholds[index]
        return np.c_[X]


# Generic xgboost fit using several grid searches


def get_xgboost_model(train_X, train_y):
    model = Pipeline([('xgb', XGBRegressor())])

    # 1) Tune max depth
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [1, 2, 4, 6, 8],
        'xgb__subsample': [1.00]
    }]
    gs1 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs1 = gs1.fit(train_X, train_y)
    max_depth = gs1.best_params_['xgb__max_depth']
    # print(gs1.best_score_)
    # print(gs1.best_params_)

    # 2) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00]
    }]
    gs2 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs2 = gs2.fit(train_X, train_y)
    subsample = gs2.best_params_['xgb__subsample']
    # print(gs2.best_score_)
    # print(gs2.best_params_)

    # 3) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [50, 100, 150, 200],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs3 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs3 = gs3.fit(train_X, train_y)
    n_estimators = gs3.best_params_['xgb__n_estimators']
    # print(gs3.best_score_)
    # print(gs3.best_params_)

    # 4) Tune learning rate
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    },
                  {
                      'xgb__n_estimators': [n_estimators * 3],
                      'xgb__learning_rate': [0.03],
                      'xgb__max_depth': [max_depth],
                      'xgb__subsample': [subsample]
                  }]
    gs4 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs4 = gs4.fit(train_X, train_y)
    n_estimators = gs4.best_params_['xgb__n_estimators']
    learning_rate = gs4.best_params_['xgb__learning_rate']
    # print(gs4.best_score_)
    # print(gs4.best_params_)

    # 5) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [
            int(0.8 * n_estimators),
            int(1.0 * n_estimators),
            int(1.2 * n_estimators)
        ],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs5 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs5 = gs5.fit(train_X, train_y)
    n_estimators = gs5.best_params_['xgb__n_estimators']
    # print(gs5.best_score_)
    # print(gs5.best_params_)

    # 6) Tune sampling by tree
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs6 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs6 = gs6.fit(train_X, train_y)
    colsample_bytree = gs6.best_params_['xgb__colsample_bytree']
    # print(gs6.best_score_)
    # print(gs6.best_params_)

    # 7) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs7 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs7 = gs7.fit(train_X, train_y)
    subsample = gs7.best_params_['xgb__subsample']
    # print(gs7.best_score_)
    # print(gs7.best_params_)

    # 8) Tune sampling by level
    n_estimators = gs7.best_params_['xgb__n_estimators']
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }]
    gs8 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs8 = gs8.fit(train_X, train_y)
    colsample_bylevel = gs8.best_params_['xgb__colsample_bylevel']
    # print(gs8.best_score_)
    # print(gs8.best_params_)

    # 9) Tune sampling fields
    n_estimators = gs8.best_params_['xgb__n_estimators']
    subsample = 0.9 if subsample == 1.0 else subsample
    colsample_bytree = 0.6 if colsample_bytree == 0.5 else colsample_bytree
    colsample_bylevel = 0.9 if colsample_bylevel == 1.0 else colsample_bylevel
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample, subsample + 0.1],
        'xgb__colsample_bytree': [colsample_bytree - 0.1, colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel, colsample_bylevel + 0.1]
    }]
    gs9 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs9 = gs9.fit(train_X, train_y)
    subsample = gs9.best_params_['xgb__subsample']
    colsample_bytree = gs9.best_params_['xgb__colsample_bytree']
    colsample_bylevel = gs9.best_params_['xgb__colsample_bylevel']
    # print(gs9.best_score_)
    # print(gs9.best_params_)

    # 10) Tune alpha
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel],
        'xgb__reg_lambda': [0.001, 0.01, 0.1, 0.3, 1, 3, 10, 100, 1000]
    }]
    gs10 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs10 = gs10.fit(train_X, train_y)
    # print(gs10.best_score_)
    # print(gs10.best_params_)

    # Find the best model
    # Sometimes the best model isn't the last one, so checking all of them
    best_model = gs1
    best_model_score = gs1.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs3.best_score_ > best_model_score:
        best_model = gs3
        best_model_score = gs3.best_score_
    if gs4.best_score_ > best_model_score:
        best_model = gs4
        best_model_score = gs4.best_score_
    if gs5.best_score_ > best_model_score:
        best_model = gs5
        best_model_score = gs5.best_score_
    if gs6.best_score_ > best_model_score:
        best_model = gs6
        best_model_score = gs6.best_score_
    if gs7.best_score_ > best_model_score:
        best_model = gs7
        best_model_score = gs7.best_score_
    if gs8.best_score_ > best_model_score:
        best_model = gs8
        best_model_score = gs8.best_score_
    if gs9.best_score_ > best_model_score:
        best_model = gs9
        best_model_score = gs9.best_score_
    if gs10.best_score_ > best_model_score:
        best_model = gs10
        best_model_score = gs10.best_score_

    # Return the best model
    return XGBRegressor(**best_model.best_params_)


In [65]:
# Example dataset
boston_data = load_boston()

# Extract pandas dataframe and target
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)
train_X, test_X = train_X, test_X
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# An okay model fit to the data
try:
    xgb_model
except:
    xgb_model = get_xgboost_model(train_X, train_y)
# linear_regression = LinearRegression(normalize=False)

# Pipeline
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
                 ('min_max_scaler', MinMaxScalerTransform()),
                 ('binarizer', BinarizerTransform()), 
                 ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

binarizer_values = [False]*num_features
binarizer_values[3] = True
binarizer_values[12] = True

# Possible configurations [None, True, or False] - None means not decided yet
param_distributions = {
    'standard_scaler__custom_values': [None]*num_features,
    'min_max_scaler__custom_values': [None]*num_features,
    'binarizer__custom_values': binarizer_values
}

# Randomly search the space n_iter times
experiments_df = RandomizedGridSearchCV(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error',
    cv=2)

# Sort the scores
experiments_df.sort_values(by=['score'], ascending=False, inplace=True)

# Drop score
experiments_X_df = experiments_df.drop(['score'], axis=1)

# Get column names
X_column_names = experiments_X_df.columns

# Convert to numpy
experiments_X = experiments_X_df.values
experiments_y = experiments_df[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe
pandas_shap_df = pd.DataFrame(shap_values, columns=X_column_names)
pandas_shap_df


Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
xgb_experiments_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgb__colsample_bylevel=1.0,
       xgb__colsample_bytree=1.0, xgb__learning_rate=0.1, xgb__max_depth=1,
       xgb__n_estimators=180, xgb__reg_lambda=0.001, xgb__subsample=0.9)

In [67]:
xgb_experiments_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgb__colsample_bylevel=1.0,
       xgb__colsample_bytree=1.0, xgb__learning_rate=0.1, xgb__max_depth=1,
       xgb__n_estimators=180, xgb__reg_lambda=0.001, xgb__subsample=0.9)

In [68]:
pd.set_option('display.max_columns', 200)

experiments_df

Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12,score
99,False,False,False,False,False,False,False,True,False,False,True,True,False,False,True,True,False,False,True,True,False,True,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
43,True,True,False,True,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,True,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
25,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,True,False,True,False,True,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
29,False,True,False,False,True,False,True,True,True,True,True,True,True,True,False,False,True,False,True,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
30,True,False,True,True,False,True,False,True,False,True,True,True,False,True,True,False,False,False,True,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
78,False,True,True,False,True,False,True,True,True,True,True,True,True,True,False,False,True,False,True,True,True,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
76,True,True,True,True,True,True,False,False,False,True,True,False,False,False,False,False,True,False,True,False,True,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
70,False,True,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,True,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
69,False,False,True,True,True,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,True,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208
45,False,True,False,False,True,False,True,True,True,True,True,False,True,True,False,True,True,False,True,True,True,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,5.761208


In [51]:
# Transformation to polarized groups of shap values
polarized_df = pandas_shap_df.copy()
for i in range(0, len(pandas_shap_df.index)):
    for j in range(0, len(pandas_shap_df.columns)):
        if not experiments_df.iloc[i, j]:
            polarized_df.iloc[i, j] = -1 * pandas_shap_df.iloc[i, j]
polarized_df

Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12
0,0.010183,0.021914,0.283968,0.079860,-0.019874,0.069330,-0.145917,0.040984,0.033354,0.094337,0.118435,0.011698,0.002925,0.006906,0.331510,0.026539,0.055566,-0.059149,-0.030219,0.098213,0.052999,-0.004603,-0.148643,-0.215091,-0.006371,-0.013617,0.328519,0.104492,0.795108,-0.011210,1.205111,4.071423,0.049127,2.535123,0.008034,0.013089,1.119209,0.019716,0.135908
1,0.007907,0.031098,0.263356,0.023188,-0.028872,-0.006669,-0.086385,0.039111,0.032378,0.043222,0.029932,0.036707,0.000683,0.006906,0.110188,0.018921,-0.012856,-0.017793,-0.004347,0.059513,0.109477,-0.008127,-0.042852,-0.239387,0.001325,-0.002211,0.179038,0.083390,0.711308,-0.022427,0.957864,4.428466,0.056537,1.998772,0.046629,0.007040,0.684253,0.007476,0.138300
2,0.005294,0.038855,0.104296,-0.000916,-0.036020,0.064116,-0.150409,-0.167718,0.038266,0.092791,0.114197,0.024405,0.000879,0.007534,0.365725,0.030233,0.047034,-0.016754,-0.033513,0.087622,0.093852,-0.008763,-0.185235,-0.112285,0.001952,-0.013896,0.283055,0.081499,0.354477,-0.012422,1.356869,3.964359,0.195052,2.246801,0.142900,-0.009494,0.840050,0.001294,-0.154343
3,0.094703,0.023012,0.024235,0.117884,-0.040118,-0.028311,-0.037423,-0.039790,0.142354,0.106027,-0.062360,0.008784,-0.004425,0.025409,0.036850,-0.087771,0.011702,-0.015444,0.028003,0.074397,0.087165,-0.029633,-0.087191,-0.219770,-0.010601,-0.002886,0.178132,0.297196,1.028365,-0.030238,0.535527,4.569502,0.048848,1.192893,0.031965,-0.037323,1.099536,0.009802,-0.048515
4,-0.004231,0.033269,0.018201,0.128909,-0.045953,0.026577,-0.022019,-0.082954,0.054928,0.119354,-0.072448,0.028370,0.000026,0.007059,-0.045424,-0.030510,0.002029,-0.103340,-0.008033,0.261227,0.062854,-0.022142,-0.065346,-0.114308,0.065094,-0.007696,0.360894,0.313677,0.936543,0.027310,0.275427,4.713697,0.024904,1.013567,0.019963,0.002921,1.191036,0.026181,-0.018017
5,0.003213,0.037227,0.094090,-0.000916,-0.036573,0.079853,-0.147455,-0.097524,0.027594,0.094231,0.115820,0.014915,-0.000148,0.025409,0.312398,0.029607,0.055566,-0.027098,-0.022174,0.130536,0.084735,-0.002009,-0.091562,-0.103227,0.004634,-0.010641,0.236410,0.078290,0.286738,0.001947,1.241422,3.423822,0.130466,2.287157,0.128170,-0.023904,0.427125,0.030210,-0.092029
6,0.028554,0.040606,0.015743,0.059082,-0.089746,0.031003,-0.022246,-0.011063,0.082875,0.121889,-0.041477,0.028370,-0.004805,0.003729,0.025549,-0.061424,0.007678,-0.032119,-0.007200,0.210591,0.055731,-0.013127,-0.077387,-0.129396,0.060636,-0.004093,0.331704,0.321411,0.967491,0.023831,0.401834,4.636808,0.060843,1.064237,0.022842,0.005102,1.146445,0.054149,-0.015627
7,0.014834,0.031098,0.104296,0.049937,-0.035252,-0.023218,-0.129678,0.005108,0.045724,0.061442,0.035717,0.026737,0.003879,0.025409,0.151407,0.025781,-0.005131,-0.018380,0.009105,0.027086,0.103578,-0.001970,-0.065789,-0.118806,-0.008550,-0.004699,0.294044,0.086683,0.711308,-0.017547,0.812157,4.218779,0.048534,1.746910,0.029150,0.008537,0.719795,-0.003533,0.118711
8,0.026683,0.036370,0.102583,-0.000916,-0.035582,0.019499,-0.145474,-0.028269,0.049958,0.088203,0.034446,0.026131,-0.000148,0.007534,0.155702,0.008148,0.026857,-0.007224,-0.018572,0.060374,0.127435,0.005644,-0.076986,-0.274818,0.004223,-0.006720,0.196499,0.087514,0.304268,-0.022299,1.155969,3.480126,0.163774,2.342073,0.140264,-0.014324,0.463467,0.020296,-0.227646
9,0.007415,0.017025,0.263356,0.024467,-0.026961,0.023440,-0.137187,-0.029412,0.037400,0.061442,0.034549,0.039940,-0.003495,0.025409,0.311858,0.023104,0.055566,-0.010921,-0.003464,0.063333,0.070545,-0.000772,-0.044133,-0.188353,0.001661,-0.013090,0.277433,0.087608,0.654719,-0.012303,0.968478,3.904459,0.088023,2.406008,0.029968,-0.025749,0.372565,0.017952,-0.043344


In [52]:
# Certainly, I set a feature to True for large positive values
# Also, I set a feature to False for large negative values
# Otherwise, it is set to True or False
polarized_shap_result = polarized_df.sum()
polarized_shap_result.sort_values()

min_max_scaler__custom_values___10     -14.813402
binarizer__custom_values___12          -12.207275
standard_scaler__custom_values___6      -6.635370
min_max_scaler__custom_values___9       -5.772208
standard_scaler__custom_values___7      -5.007137
min_max_scaler__custom_values___2       -3.929888
min_max_scaler__custom_values___4       -3.601721
standard_scaler__custom_values___4      -3.546767
min_max_scaler__custom_values___12      -1.794590
binarizer__custom_values___3            -1.724637
min_max_scaler__custom_values___8       -1.345573
standard_scaler__custom_values___10     -0.594102
min_max_scaler__custom_values___5       -0.143985
standard_scaler__custom_values___12      0.146587
binarizer__custom_values___9             0.825929
min_max_scaler__custom_values___0        1.095158
min_max_scaler__custom_values___3        1.242120
standard_scaler__custom_values___0       2.256149
binarizer__custom_values___11            2.334642
min_max_scaler__custom_values___11       2.359535


In [53]:
# Splits positive and negative

positive_fields = polarized_shap_result[polarized_shap_result > 0]
positive_fields = positive_fields / positive_fields.sum()
negative_fields = polarized_shap_result[polarized_shap_result < 0]
negative_fields = negative_fields / negative_fields.sum()
positive_fields = positive_fields[positive_fields > 0.05]
negative_fields = negative_fields[negative_fields > 0.05]

# Each iteration, find anything above % number

In [54]:
print(positive_fields, '\n', negative_fields)

# Splits positive and negative

# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending experiments vs continue to use the same results for analysis ; keep together for now for review

# When there were 3

binarizer__custom_values___4     0.094216
binarizer__custom_values___5     0.454678
binarizer__custom_values___7     0.186912
binarizer__custom_values___10    0.065510
dtype: float32 
 standard_scaler__custom_values___4    0.058033
standard_scaler__custom_values___6    0.108569
standard_scaler__custom_values___7    0.081928
min_max_scaler__custom_values___2     0.064301
min_max_scaler__custom_values___4     0.058932
min_max_scaler__custom_values___9     0.094446
min_max_scaler__custom_values___10    0.242379
binarizer__custom_values___12         0.199737
dtype: float32


In [22]:
# just weighting based on feature length etc

In [23]:
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]

In [24]:
param_distributions['min_max_scaler__column_indices_to_replace']

KeyError: 'min_max_scaler__column_indices_to_replace'

In [37]:
# always consider all features
#

In [None]:
# featuers to consider,
# num of features
# Zeroes;
# could
# default distribution ()
# weighting by feature towards up or down

In [106]:
for key in negative_fields.keys():
    print(key.split("___")[0])

In [107]:
a = [1, 2, 3, 4]

In [108]:
a[1:]

In [109]:
a[:-1]