In [74]:
%load_ext pycodestyle_magic
# %%pycodestyle


In [65]:
###### import pandas as pd
import numpy as np
import shap
import pandas as pd
from pandasql import sqldf
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston

# Lambdas


def q(x):
    return sqldf(x, globals())

# My own version of RandomizedSearchCV


def RandomizedGridSearchCV(n_experiments, pipe,
                           param_distributions,
                           train_X, train_y,
                           test_X, test_y,
                           scoring='neg_mean_squared_error',
                           cv=2):
    # Copy data
    train_X, train_y = train_X.copy(), train_y.copy()
    test_X, test_y = test_X.copy(), test_y.copy()

    # Transform the param_distributions into four arrays
    key_list, transform_class_list = [], []
    parameter_name_list, columns_list = [], []
    for key, columns in param_distributions.items():
        class_key, parameter_name = key.split("__")
        transform_class = pipe.named_steps[class_key]
        key_list.append(key)
        transform_class_list.append(transform_class)
        parameter_name_list.append(parameter_name)
        columns_list.append(columns)

    # Initialize experiments dictionary
    experiments_info = {}
    for key, transform_class, parameter_name, columns in \
        zip(key_list, transform_class_list, parameter_name_list,
            columns_list):
        for i in range(len(columns)):
            experiments_info[key + "___" + str(i)] = []
    experiments_info['score'] = []

    # Iterate over the experiments
    for iteration in range(n_experiments):
        print("Iteration: ", iteration)

        # Updates the transform parameters
        for key, transform_class, parameter_name, columns in \
            zip(key_list, transform_class_list, parameter_name_list,
                columns_list):

            # If the value option is a 2D array it is treated differently
            for feature_i in range(len(columns)):
                
                # Lock it in if it's near a bound
                if columns[feature_i] <= 0.1:
                    new_value = 0.0
                elif columns[feature_i] >= 0.9:
                    new_value = 1.0
                else:
                    # Randomize +/- 0.40 --- columns varies from [0 - 1.0]
                    new_value = columns[feature_i] + ((np.random.rand(1)[0] - 0.5) * 0.8)
                    new_value = min(1.0, max(0.0, new_value))

                # Replace columns
                columns[feature_i] = new_value

                # Save input data for the experiments dataframe output
                experiments_info[key + "___" +
                                 str(feature_i)].append(new_value)

            # Set parameters for the transformation class (typically numeric fields)
            setattr(transform_class, "columns", columns)

        # Fit
        pipe.fit(train_X, train_y)

        # Predict
        pred_y = pipe.predict(test_X)

        # Scoring
        if scoring == 'neg_mean_squared_error':
            score = mean_squared_error(pred_y, test_y)
        else:
            raise Exception('Scoring type not implemented')

        # Appending the score
        experiments_info["score"].append(score)

    experiments_df = pd.DataFrame(experiments_info)
    return experiments_df


# Standard scaler data preparation class


class StandardScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[]):
        self.standard_scalers = {}

    def fit(self, X, y=None):
        for index in range(len(self.columns)):
            if self.columns[index] > 0.5:
                self.standard_scalers[index] = StandardScaler()
                self.standard_scalers[index].fit(X[:, index:index + 1])
        return self

    def transform(self, X, y=None):
        for index in range(len(self.columns)):
            if self.columns[index] > 0.5:
                X[:, index:index + 1] = self.standard_scalers[index].transform(
                    X[:, index:index + 1])
        return np.c_[X]


# Min-max scaler data preparation class


class MinMaxScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[]):
        self.min_max_scalers = {}

    def fit(self, X, y=None):
        for index in range(len(self.columns)):
            if self.columns[index] > 0.5:
                self.min_max_scalers[index] = MinMaxScaler()
                self.min_max_scalers[index].fit(X[:, index:index + 1])
        return self

    def transform(self, X, y=None):
        for index in range(len(self.columns)):
            if self.columns[index] > 0.5:
                X[:, index:index + 1] = self.min_max_scalers[index].transform(
                    X[:, index:index + 1])
        return np.c_[X]


# Binarizer scaler data preparation class


class BinarizerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, columns={}):
        self.thresholds = {}

    def fit(self, X, y=None):        
        for index in range(len(self.columns)):
            if self.columns[index] == 0.0:
                continue
            self.thresholds[index] = np.quantile(X[:, index:index + 1],
                                                 self.columns[index])
        return self

    def transform(self, X, y=None):
        for index in range(len(self.columns)):
            if self.columns[index] == 0.0:
                continue
            X[:, index:index +
              1] = X[:, index:index + 1] > self.thresholds[index]
        return np.c_[X]


# Generic xgboost fit using several grid searches


def get_xgboost_model(train_X, train_y):
    model = Pipeline([('xgb', XGBRegressor())])

    # 1) Tune max depth
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [1, 2, 4, 6, 8],
        'xgb__subsample': [1.00]
    }]
    gs1 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs1 = gs1.fit(train_X, train_y)
    max_depth = gs1.best_params_['xgb__max_depth']
    # print(gs1.best_score_)
    # print(gs1.best_params_)

    # 2) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00]
    }]
    gs2 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs2 = gs2.fit(train_X, train_y)
    subsample = gs2.best_params_['xgb__subsample']
    # print(gs2.best_score_)
    # print(gs2.best_params_)

    # 3) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [50, 100, 150, 200],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs3 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs3 = gs3.fit(train_X, train_y)
    n_estimators = gs3.best_params_['xgb__n_estimators']
    # print(gs3.best_score_)
    # print(gs3.best_params_)

    # 4) Tune learning rate
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    },
                  {
                      'xgb__n_estimators': [n_estimators * 3],
                      'xgb__learning_rate': [0.03],
                      'xgb__max_depth': [max_depth],
                      'xgb__subsample': [subsample]
                  }]
    gs4 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs4 = gs4.fit(train_X, train_y)
    n_estimators = gs4.best_params_['xgb__n_estimators']
    learning_rate = gs4.best_params_['xgb__learning_rate']
    # print(gs4.best_score_)
    # print(gs4.best_params_)

    # 5) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [
            int(0.8 * n_estimators),
            int(1.0 * n_estimators),
            int(1.2 * n_estimators)
        ],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs5 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs5 = gs5.fit(train_X, train_y)
    n_estimators = gs5.best_params_['xgb__n_estimators']
    # print(gs5.best_score_)
    # print(gs5.best_params_)

    # 6) Tune sampling by tree
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs6 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs6 = gs6.fit(train_X, train_y)
    colsample_bytree = gs6.best_params_['xgb__colsample_bytree']
    # print(gs6.best_score_)
    # print(gs6.best_params_)

    # 7) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs7 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs7 = gs7.fit(train_X, train_y)
    subsample = gs7.best_params_['xgb__subsample']
    # print(gs7.best_score_)
    # print(gs7.best_params_)

    # 8) Tune sampling by level
    n_estimators = gs7.best_params_['xgb__n_estimators']
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }]
    gs8 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs8 = gs8.fit(train_X, train_y)
    colsample_bylevel = gs8.best_params_['xgb__colsample_bylevel']
    # print(gs8.best_score_)
    # print(gs8.best_params_)

    # 9) Tune sampling fields
    n_estimators = gs8.best_params_['xgb__n_estimators']
    subsample = 0.9 if subsample == 1.0 else subsample
    colsample_bytree = 0.6 if colsample_bytree == 0.5 else colsample_bytree
    colsample_bylevel = 0.9 if colsample_bylevel == 1.0 else colsample_bylevel
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample, subsample + 0.1],
        'xgb__colsample_bytree': [colsample_bytree - 0.1, colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel, colsample_bylevel + 0.1]
    }]
    gs9 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs9 = gs9.fit(train_X, train_y)
    subsample = gs9.best_params_['xgb__subsample']
    colsample_bytree = gs9.best_params_['xgb__colsample_bytree']
    colsample_bylevel = gs9.best_params_['xgb__colsample_bylevel']
    # print(gs9.best_score_)
    # print(gs9.best_params_)

    # 10) Tune alpha
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel],
        'xgb__reg_lambda': [0.001, 0.01, 0.1, 0.3, 1, 3, 10, 100, 1000]
    }]
    gs10 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs10 = gs10.fit(train_X, train_y)
    # print(gs10.best_score_)
    # print(gs10.best_params_)

    # Find the best model
    # Sometimes the best model isn't the last one, so checking all of them
    best_model = gs1
    best_model_score = gs1.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs3.best_score_ > best_model_score:
        best_model = gs3
        best_model_score = gs3.best_score_
    if gs4.best_score_ > best_model_score:
        best_model = gs4
        best_model_score = gs4.best_score_
    if gs5.best_score_ > best_model_score:
        best_model = gs5
        best_model_score = gs5.best_score_
    if gs6.best_score_ > best_model_score:
        best_model = gs6
        best_model_score = gs6.best_score_
    if gs7.best_score_ > best_model_score:
        best_model = gs7
        best_model_score = gs7.best_score_
    if gs8.best_score_ > best_model_score:
        best_model = gs8
        best_model_score = gs8.best_score_
    if gs9.best_score_ > best_model_score:
        best_model = gs9
        best_model_score = gs9.best_score_
    if gs10.best_score_ > best_model_score:
        best_model = gs10
        best_model_score = gs10.best_score_

    # Return the best model
    return XGBRegressor(**best_model.best_params_)


In [72]:
# Example dataset
boston_data = load_boston()

# Extract pandas dataframe and target
X = pd.DataFrame(boston_data['data']).copy().values
y = pd.DataFrame(boston_data['target']).copy().values

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.20, random_state=42)
train_X, test_X = train_X, test_X
train_y, test_y = train_y.reshape(-1, 1), test_y.reshape(-1, 1)

# An okay model fit to the data
try:
    xgb_model
except:
    xgb_model = get_xgboost_model(train_X, train_y)
# linear_regression = LinearRegression(normalize=False)

# Pipeline
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
                 ('min_max_scaler', MinMaxScalerTransform()),
                 ('binarizer', BinarizerTransform()), ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# binarizer_options
binarizer_options = [0.0]*num_features
binarizer_options[3] = 1.0
binarizer_options[6] = 1.0
binarizer_options[8] = 1.0


# Possible configurations
param_distributions = {
    'standard_scaler__custom_values': [0.5]*num_features,
    'min_max_scaler__custom_values': [0.5]*num_features,
    'binarizer__custom_values': [0.5]*num_features,
}

# Randomly search the space n_iter times
experiments_df = RandomizedGridSearchCV(
    n_experiments=100,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error',
    cv=2)

# Sort the scores
experiments_df.sort_values(by=['score'], ascending=False, inplace=True)

# Drop score
experiments_X_df = experiments_df.drop(['score'], axis=1)

# Get column names
X_column_names = experiments_X_df.columns

# Convert to numpy
experiments_X = experiments_X_df.values
experiments_y = experiments_df[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe
pandas_shap_df = pd.DataFrame(shap_values, columns=X_column_names)
pandas_shap_df


Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration:  61
Iteration:  62
Iteration:  63
Iteration:  64
Iteration:  65
Iteration:  66
Itera

Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12
0,-0.003369,5.313730,0.0,0.923289,0.0,0.0,0.0,0.235631,7.492890,0.0,0.0,0.0,0.0,-0.011038,0.0,0.0,0.0,0.023955,0.0,0.0,0.0,-0.001268,0.0,0.001475,0.0,0.0,0.0,0.001783,0.0,4.104317,0.0,0.0,0.022095,0.0,0.0,0.0,0.0,0.0,-0.000184
1,-0.000799,5.318821,0.0,0.999578,0.0,0.0,0.0,0.251685,1.586242,0.0,0.0,0.0,0.0,0.621249,0.0,0.0,0.0,0.037575,0.0,0.0,0.0,0.082970,0.0,0.004143,0.0,0.0,0.0,0.004969,0.0,4.202542,0.0,0.0,0.022018,0.0,0.0,0.0,0.0,0.0,-0.000183
2,0.040823,5.780961,0.0,1.028382,0.0,0.0,0.0,0.267682,-0.039373,0.0,0.0,0.0,0.0,-0.018534,0.0,0.0,0.0,0.037846,0.0,0.0,0.0,0.082970,0.0,0.004143,0.0,0.0,0.0,0.004969,0.0,4.152992,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,0.017632
3,-0.001238,5.413830,0.0,1.028382,0.0,0.0,0.0,0.267682,-0.042232,0.0,0.0,0.0,0.0,-0.020328,0.0,0.0,0.0,0.002243,0.0,0.0,0.0,-0.002627,0.0,0.000309,0.0,0.0,0.0,-0.009938,0.0,4.140565,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,-0.000371
4,0.040823,4.985659,0.0,1.028382,0.0,0.0,0.0,0.267682,-0.069194,0.0,0.0,0.0,0.0,-0.019669,0.0,0.0,0.0,0.002333,0.0,0.0,0.0,0.082970,0.0,0.000309,0.0,0.0,0.0,-0.009938,0.0,4.140565,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,-0.000371
5,-0.000539,3.925432,0.0,-0.091316,0.0,0.0,0.0,0.193859,-0.127453,0.0,0.0,0.0,0.0,-0.022948,0.0,0.0,0.0,0.031625,0.0,0.0,0.0,-0.002627,0.0,0.004143,0.0,0.0,0.0,0.004969,0.0,4.152948,0.0,0.0,-0.003466,0.0,0.0,0.0,0.0,0.0,-0.000371
6,0.008032,0.857951,0.0,1.028382,0.0,0.0,0.0,0.267682,-0.116792,0.0,0.0,0.0,0.0,1.417612,0.0,0.0,0.0,0.073737,0.0,0.0,0.0,-0.001303,0.0,0.004143,0.0,0.0,0.0,0.004969,0.0,4.156640,0.0,0.0,-0.001713,0.0,0.0,0.0,0.0,0.0,-0.000183
7,-0.009371,0.842937,0.0,0.340701,0.0,0.0,0.0,0.128033,-0.042501,0.0,0.0,0.0,0.0,-0.021560,0.0,0.0,0.0,0.028747,0.0,0.0,0.0,-0.002518,0.0,-0.012430,0.0,0.0,0.0,0.004969,0.0,1.856932,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,0.017632
8,-0.001238,0.473105,0.0,0.491508,0.0,0.0,0.0,-1.968550,-0.129619,0.0,0.0,0.0,0.0,-0.022457,0.0,0.0,0.0,0.023288,0.0,0.0,0.0,-0.002627,0.0,-0.012430,0.0,0.0,0.0,-0.009938,0.0,3.159342,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,-0.000371
9,-0.009419,0.846693,0.0,0.995175,0.0,0.0,0.0,0.136102,-0.098669,0.0,0.0,0.0,0.0,-0.022615,0.0,0.0,0.0,0.009700,0.0,0.0,0.0,-0.002518,0.0,0.004143,0.0,0.0,0.0,0.000413,0.0,-0.381786,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,-0.000371


In [59]:
xgb_experiments_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgb__colsample_bylevel=0.8,
       xgb__colsample_bytree=1.0, xgb__learning_rate=0.1, xgb__max_depth=2,
       xgb__n_estimators=240, xgb__reg_lambda=100, xgb__subsample=1.0)

In [60]:
pd.set_option('display.max_columns', 200)

experiments_df

Unnamed: 0,standard_scaler__column_indices_to_replace___0,standard_scaler__column_indices_to_replace___1,standard_scaler__column_indices_to_replace___2,standard_scaler__column_indices_to_replace___3,standard_scaler__column_indices_to_replace___4,standard_scaler__column_indices_to_replace___5,standard_scaler__column_indices_to_replace___6,standard_scaler__column_indices_to_replace___7,standard_scaler__column_indices_to_replace___8,standard_scaler__column_indices_to_replace___9,standard_scaler__column_indices_to_replace___10,standard_scaler__column_indices_to_replace___11,standard_scaler__column_indices_to_replace___12,min_max_scaler__column_indices_to_replace___0,min_max_scaler__column_indices_to_replace___1,min_max_scaler__column_indices_to_replace___2,min_max_scaler__column_indices_to_replace___3,min_max_scaler__column_indices_to_replace___4,min_max_scaler__column_indices_to_replace___5,min_max_scaler__column_indices_to_replace___6,min_max_scaler__column_indices_to_replace___7,min_max_scaler__column_indices_to_replace___8,min_max_scaler__column_indices_to_replace___9,min_max_scaler__column_indices_to_replace___10,min_max_scaler__column_indices_to_replace___11,min_max_scaler__column_indices_to_replace___12,binarizer__column_indices_to_replace___0,binarizer__column_indices_to_replace___1,binarizer__column_indices_to_replace___2,binarizer__column_indices_to_replace___3,binarizer__column_indices_to_replace___4,binarizer__column_indices_to_replace___5,binarizer__column_indices_to_replace___6,binarizer__column_indices_to_replace___7,binarizer__column_indices_to_replace___8,binarizer__column_indices_to_replace___9,binarizer__column_indices_to_replace___10,binarizer__column_indices_to_replace___11,binarizer__column_indices_to_replace___12,score
81,0.739462,0.000000,0.483499,1.000000,0.775462,0.022469,0.268715,0.379865,0.363926,0.000000,0.329832,0.049618,0.954915,0.000000,0.000000,1.000000,1.000000,0.010316,0.324818,0.443893,0.000000,0.579013,0.979257,0.310855,1.000000,1.000000,0.295166,0.823935,0.215455,0.801511,0.875588,0.134253,0.000000,0.495196,0.507970,0.983661,0.740446,0.024894,1.000000,57.507306
918,0.000000,0.899820,0.067433,0.000000,0.000000,0.209231,0.251225,0.283278,0.457725,0.279228,0.735419,0.434053,0.960646,0.625110,0.375340,0.218585,0.758704,0.254986,0.100409,0.566902,0.464013,0.633452,0.394626,0.137327,0.034322,0.264409,0.374202,0.907860,0.349305,0.930489,1.000000,0.067357,0.373261,1.000000,0.000000,0.988573,0.355487,0.907442,1.000000,53.600973
920,0.000000,0.564135,0.233661,0.386780,0.291281,0.437691,0.756977,0.790679,0.381231,0.328088,0.976270,0.000000,1.000000,0.453851,0.897718,0.139096,0.501614,0.039150,0.067631,0.783122,0.000000,0.558348,0.441788,0.437659,0.000000,0.588400,0.868222,1.000000,0.556746,1.000000,0.857668,0.260638,0.388559,1.000000,0.358647,1.000000,1.000000,1.000000,0.996873,52.828455
817,0.035047,1.000000,0.590644,0.340589,0.536113,0.000000,0.071233,0.515505,0.258615,0.000000,1.000000,1.000000,0.716480,1.000000,0.281179,0.412794,0.728864,0.000000,0.578720,0.531810,0.166758,0.917949,1.000000,0.212287,0.366829,0.615251,0.770902,0.855631,1.000000,0.000000,0.073633,0.206843,0.210858,0.609873,0.821847,1.000000,0.627255,0.862733,1.000000,51.061695
755,0.000000,0.684164,0.347542,0.065757,0.728474,1.000000,0.406926,0.802763,0.502330,0.000000,0.051396,0.356115,0.795375,0.172355,0.499309,0.979809,0.799368,0.390405,0.013709,0.579746,1.000000,1.000000,1.000000,0.154676,0.239665,0.978372,0.256679,1.000000,0.958293,0.065351,0.847620,0.034493,0.675694,0.169206,0.241812,0.790965,0.243312,0.333650,0.988450,50.972017
858,1.000000,0.928152,0.123581,0.436853,0.676142,0.507789,0.623261,0.238509,0.958098,0.592283,0.000000,0.103063,0.379840,0.387554,1.000000,0.460545,1.000000,0.848604,0.464720,0.000000,0.590109,0.615889,0.032513,1.000000,0.909160,0.699627,1.000000,0.012269,0.690795,0.709739,0.714301,0.977809,0.175143,0.898046,0.000000,0.563335,0.468788,1.000000,1.000000,49.768749
71,0.177307,0.502195,0.249001,0.715082,0.000000,0.480851,0.648274,0.238499,0.182257,1.000000,0.160481,0.738983,1.000000,0.374299,0.690735,0.621268,0.651274,0.076433,0.945977,0.000000,0.167461,0.000000,0.898033,0.962685,0.870742,0.280911,0.128918,0.000000,0.881398,0.000000,0.933076,1.000000,0.822827,0.605362,1.000000,1.000000,0.909299,1.000000,0.980893,49.561188
919,0.085793,0.542129,0.000000,0.000000,0.000000,0.122436,0.380567,0.414440,0.370177,0.431492,0.846377,0.070782,0.854294,0.547925,0.627676,0.000000,0.380083,0.329770,0.000000,0.553171,0.075544,0.403239,0.363373,0.103035,0.000000,0.558073,0.755529,1.000000,0.441815,0.961655,0.712556,0.049925,0.554225,1.000000,0.000000,1.000000,0.744779,1.000000,0.763999,49.084804
80,0.627579,0.000000,0.344929,0.821016,0.888548,0.040039,0.486513,0.149564,0.495053,0.000000,0.000000,0.076883,0.932429,0.153064,0.358235,1.000000,1.000000,0.288096,0.000000,0.840618,0.027717,0.701926,0.759908,0.067619,0.916295,1.000000,0.262209,0.787144,0.143052,1.000000,1.000000,0.033311,0.166805,0.544015,0.441702,0.656668,0.434580,0.044769,1.000000,49.031023
590,0.895425,0.312969,0.835668,0.758404,0.875824,1.000000,0.287074,0.162537,0.289134,0.000000,1.000000,1.000000,0.800131,0.616693,0.991612,0.262219,0.787765,0.982948,0.000000,0.728992,0.668691,0.385728,0.000000,0.377846,0.902715,0.318033,0.496534,0.264539,0.426685,0.513601,0.056915,0.043358,0.445491,0.662526,1.000000,0.387187,0.447520,0.418454,0.006162,48.969463


In [73]:
# Transformation to polarized groups of shap values
polarized_df = pandas_shap_df.copy()
for i in range(0, len(pandas_shap_df.index)):
    for j in range(0, len(pandas_shap_df.columns)):
        if not experiments_df.iloc[i, j]:
            polarized_df.iloc[i, j] = -1 * pandas_shap_df.iloc[i, j]
polarized_df

Unnamed: 0,standard_scaler__custom_values___0,standard_scaler__custom_values___1,standard_scaler__custom_values___2,standard_scaler__custom_values___3,standard_scaler__custom_values___4,standard_scaler__custom_values___5,standard_scaler__custom_values___6,standard_scaler__custom_values___7,standard_scaler__custom_values___8,standard_scaler__custom_values___9,standard_scaler__custom_values___10,standard_scaler__custom_values___11,standard_scaler__custom_values___12,min_max_scaler__custom_values___0,min_max_scaler__custom_values___1,min_max_scaler__custom_values___2,min_max_scaler__custom_values___3,min_max_scaler__custom_values___4,min_max_scaler__custom_values___5,min_max_scaler__custom_values___6,min_max_scaler__custom_values___7,min_max_scaler__custom_values___8,min_max_scaler__custom_values___9,min_max_scaler__custom_values___10,min_max_scaler__custom_values___11,min_max_scaler__custom_values___12,binarizer__custom_values___0,binarizer__custom_values___1,binarizer__custom_values___2,binarizer__custom_values___3,binarizer__custom_values___4,binarizer__custom_values___5,binarizer__custom_values___6,binarizer__custom_values___7,binarizer__custom_values___8,binarizer__custom_values___9,binarizer__custom_values___10,binarizer__custom_values___11,binarizer__custom_values___12
0,-0.003369,5.313730,0.0,0.923289,-0.0,0.0,-0.0,0.235631,7.492890,-0.0,0.0,-0.0,0.0,-0.011038,-0.0,0.0,0.0,0.023955,-0.0,-0.0,-0.0,-0.001268,0.0,0.001475,0.0,-0.0,0.0,-0.001783,-0.0,4.104317,0.0,0.0,0.022095,0.0,0.0,0.0,0.0,-0.0,-0.000184
1,-0.000799,5.318821,0.0,0.999578,-0.0,0.0,-0.0,0.251685,1.586242,-0.0,0.0,-0.0,0.0,0.621249,-0.0,0.0,0.0,0.037575,-0.0,-0.0,-0.0,0.082970,0.0,0.004143,0.0,-0.0,0.0,0.004969,-0.0,4.202542,0.0,0.0,0.022018,0.0,0.0,0.0,0.0,-0.0,-0.000183
2,0.040823,5.780961,0.0,1.028382,-0.0,0.0,0.0,0.267682,-0.039373,0.0,0.0,-0.0,0.0,-0.018534,-0.0,0.0,0.0,0.037846,0.0,0.0,-0.0,0.082970,0.0,0.004143,0.0,-0.0,0.0,0.004969,-0.0,4.152992,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,-0.0,0.017632
3,-0.001238,5.413830,0.0,1.028382,-0.0,0.0,-0.0,0.267682,-0.042232,-0.0,0.0,-0.0,0.0,-0.020328,-0.0,0.0,0.0,0.002243,0.0,-0.0,-0.0,-0.002627,0.0,0.000309,0.0,-0.0,0.0,-0.009938,-0.0,4.140565,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,-0.0,-0.000371
4,0.040823,4.985659,0.0,1.028382,0.0,0.0,0.0,0.267682,-0.069194,0.0,0.0,0.0,0.0,-0.019669,0.0,0.0,0.0,0.002333,0.0,0.0,0.0,0.082970,0.0,0.000309,0.0,0.0,0.0,-0.009938,0.0,4.140565,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,0.0,-0.000371
5,-0.000539,3.925432,0.0,0.091316,-0.0,-0.0,-0.0,0.193859,-0.127453,-0.0,-0.0,-0.0,0.0,-0.022948,-0.0,0.0,0.0,0.031625,-0.0,-0.0,-0.0,-0.002627,0.0,0.004143,0.0,-0.0,0.0,-0.004969,-0.0,4.152948,0.0,-0.0,-0.003466,0.0,0.0,0.0,0.0,-0.0,-0.000371
6,0.008032,0.857951,0.0,1.028382,-0.0,-0.0,-0.0,0.267682,-0.116792,-0.0,-0.0,-0.0,0.0,1.417612,-0.0,0.0,0.0,0.073737,-0.0,-0.0,-0.0,-0.001303,0.0,0.004143,0.0,-0.0,0.0,-0.004969,-0.0,4.156640,0.0,-0.0,-0.001713,0.0,0.0,0.0,0.0,-0.0,-0.000183
7,-0.009371,0.842937,0.0,0.340701,-0.0,0.0,-0.0,0.128033,-0.042501,-0.0,0.0,-0.0,0.0,-0.021560,-0.0,0.0,0.0,0.028747,-0.0,-0.0,-0.0,-0.002518,0.0,-0.012430,0.0,-0.0,0.0,-0.004969,-0.0,1.856932,0.0,-0.0,0.044565,0.0,0.0,0.0,0.0,-0.0,0.017632
8,-0.001238,0.473105,0.0,0.491508,-0.0,0.0,-0.0,-1.968550,-0.129619,0.0,0.0,-0.0,0.0,-0.022457,-0.0,0.0,0.0,0.023288,0.0,0.0,-0.0,-0.002627,0.0,-0.012430,0.0,-0.0,0.0,-0.009938,-0.0,3.159342,0.0,0.0,0.044565,0.0,0.0,0.0,0.0,-0.0,-0.000371
9,-0.009419,0.846693,0.0,0.995175,-0.0,-0.0,-0.0,0.136102,-0.098669,-0.0,-0.0,-0.0,0.0,-0.022615,-0.0,0.0,0.0,0.009700,-0.0,-0.0,-0.0,-0.002518,0.0,0.004143,0.0,-0.0,0.0,-0.000413,-0.0,-0.381786,0.0,-0.0,0.044565,0.0,0.0,0.0,0.0,-0.0,-0.000371


In [68]:
# Certainly, I set a feature to True for large positive values
# Also, I set a feature to False for large negative values
# Otherwise, it is set to True or False
polarized_shap_result = polarized_df.sum()
polarized_shap_result.sort_values()

binarizer__column_indices_to_replace___8           -0.040293
standard_scaler__column_indices_to_replace___3     -0.000245
binarizer__column_indices_to_replace___12           0.000000
binarizer__column_indices_to_replace___0            0.000000
min_max_scaler__column_indices_to_replace___12      0.000000
min_max_scaler__column_indices_to_replace___11      0.000000
min_max_scaler__column_indices_to_replace___10      0.000000
min_max_scaler__column_indices_to_replace___9       0.000000
min_max_scaler__column_indices_to_replace___8       0.000000
min_max_scaler__column_indices_to_replace___7       0.000000
binarizer__column_indices_to_replace___11           0.000000
min_max_scaler__column_indices_to_replace___5       0.000000
min_max_scaler__column_indices_to_replace___4       0.000000
binarizer__column_indices_to_replace___3            0.000000
binarizer__column_indices_to_replace___4            0.000000
binarizer__column_indices_to_replace___5            0.000000
min_max_scaler__column_i

In [69]:
# Splits positive and negative

positive_fields = polarized_shap_result[polarized_shap_result > 0]
positive_fields = positive_fields / positive_fields.sum()
negative_fields = polarized_shap_result[polarized_shap_result < 0]
negative_fields = negative_fields / negative_fields.sum()
positive_fields = positive_fields[positive_fields > 0.05]
negative_fields = negative_fields[negative_fields > 0.05]

# Each iteration, find anything above % number

In [70]:
print(positive_fields, '\n', negative_fields)

# Splits positive and negative

# Each iteration, find anything above 5% and either remove a low value or remove a high value from their options

# Continue until 0 things were removed (0 will be removed if one option for each)

# When there are X choices yet

# Try appending experiments vs continue to use the same results for analysis ; keep together for now for review

# When there were 3

standard_scaler__column_indices_to_replace___0    0.338096
standard_scaler__column_indices_to_replace___1    0.505821
binarizer__column_indices_to_replace___9          0.141051
dtype: float32 
 binarizer__column_indices_to_replace___8    0.993958
dtype: float32


In [71]:
# just weighting based on feature length etc

In [54]:
for key in positive_fields.keys():
    choices = param_distributions[
        'standard_scaler__column_indices_to_replace'][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[:-1]

for key in negative_fields.keys():
    choices = param_distributions[key][2]
    if len(choices) > 1:
        param_distributions['standard_scaler__column_indices_to_replace'][
            2] = choices[1:]

TypeError: object of type 'numpy.float64' has no len()

In [55]:
param_distributions['min_max_scaler__column_indices_to_replace']

In [37]:
# always consider all features
#

In [None]:
# featuers to consider,
# num of features
# Zeroes;
# could
# default distribution ()
# weighting by feature towards up or down

In [106]:
for key in negative_fields.keys():
    print(key.split("___")[0])

In [107]:
a = [1, 2, 3, 4]

In [108]:
a[1:]

In [109]:
a[:-1]