In [75]:
%load_ext pycodestyle_magic
# %%pycodestyle


In [174]:
import pandas as pd
import numpy as np
import shap
from pandasql import sqldf
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston



# Lambdas

def q(x):
    return sqldf(x, globals())

# Randomize given the value options


def randomize_value_choice(value_options):
    """
    The value_options format is either:
        [True, False] - True or False
        [1,2,3,4,5] - 1 to 5
        [[0,1,4],5] - Any subset of [0,1,2,3,4] and 5
            represents the number of features in the dataframe
    """
    value = None
    # Check to make sure there are value options
    if len(value_options) > 0:
        # Make sure it isn't a 2D List
        if type(value_options[0]) != list:
            # Simple choice from the list -
                # In the future this might include a distribution mean+std
            np.random.choice(value_options)
        else:
            # If the option is a 2D array it means we take
                # each element with 50% probability
            value = []
            for element in value_options[0]:
                random_number = np.random.rand(1)[0]
                if random_number > 0.5:
                    value.append(element)
    return value

# My own version of RandomizedSearchCV


def RandomizedGridSearchCV(n_experiments, pipe,
                           param_distributions, train_X, train_y,
                           test_X, test_y,
                           scoring='neg_mean_squared_error', cv=2):

    # Transform the param_distributions into four arrays
    key_list = []
    transform_class_list = []
    parameter_name_list = []
    value_options_list = []
    for key, value_options in param_distributions.items():
        class_key, parameter_name = key.split("__")
        transform_class = pipe.named_steps[class_key]
        key_list.append(key)
        transform_class_list.append(transform_class)
        parameter_name_list.append(parameter_name)
        value_options_list.append(value_options)

    # Initialize experiments dictionary
    experiments = {}
    for key, transform_class, parameter_name, value_options in \
        zip(key_list, transform_class_list, parameter_name_list,
            value_options_list):
        if type(value_options[0]) != list:
            experiments[key] = []
        else:
            # value_options[1] is the number of features
            for i in range(value_options[1]):
                experiments[key + "_" + str(i)] = []
    experiments['score'] = []

    # Iterate over the experiments
    for iteration in range(n_experiments):
        print("Iteration: ", iteration)

        # Updates the transform parameters
        for key, transform_class, parameter_name, value_options in \
            zip(key_list, transform_class_list, parameter_name_list,
                value_options_list):

            # Get the random value
            value = randomize_value_choice(value_options)

            # Set the value
            setattr(transform_class, parameter_name, value)

            # If the value option is a 2D array it is treated differently
            if type(value_options[0]) != list:
                experiments[key].append(value)
            else:
                for i in range(value_options[1]):
                    experiments[key + "_" + str(i)].append(value)
                    
        # Fit
        pipe.fit(train_X, train_y)
        
        # Predict
        pred_y = pipe.predict(test_X)
        
        # Scoring
        if scoring == 'neg_mean_squared_error':
            score = mean_squared_error(pred_y, test_y)
        else:
            raise Exception('Scoring type not implemented')
        
        # Appending the score
        experiments["score"].append(score)

    experiments_df = pd.DataFrame(experiments)
    return experiments_df

# Standard scaler data preparation class


class StandardScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_indices_to_replace=[]):
        print(np.c_[X])        
        self.column_indices_to_replace = column_indices_to_replace
        self.standard_scalers = {}

    def fit(self, X, y=None):
        for index in self.column_indices_to_replace:
            self.standard_scalers[index] = StandardScaler()
            self.standard_scalers[index].fit(X[:, index])
        return self

    def transform(self, X, y=None):
        for index in self.column_indices_to_replace:
            X[:, index] = self.standard_scalers[index].transform(X[:, index])        
        return np.c_[X]

# Min-max scaler data preparation class


class MinMaxScalerTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_indices_to_replace=[]):
        self.column_indices_to_replace = column_indices_to_replace
        self.min_max_scalers = {}

    def fit(self, X, y=None):
        for index in self.column_indices_to_replace:
            self.min_max_scalers[index] = MinMaxScaler()
            self.min_max_scalers[index].fit(X[:, index])
        return self

    def transform(self, X, y=None):
        for index in self.column_indices_to_replace:
            X[:, index] = self.min_max_scalers[index].transform(X[:, index])
        return np.c_[X]

# Generic xgboost fit using several grid searches


def get_xgboost_model(train_X, train_y):
    model = Pipeline([('xgb', XGBRegressor())])

    # 1) Tune max depth
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [1, 2, 4, 6, 8],
        'xgb__subsample': [1.00]
    }]
    gs1 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs1 = gs1.fit(train_X, train_y)
    max_depth = gs1.best_params_['xgb__max_depth']
    # print(gs1.best_score_)
    # print(gs1.best_params_)

    # 2) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [100],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00]
    }]
    gs2 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs2 = gs2.fit(train_X, train_y)
    subsample = gs2.best_params_['xgb__subsample']
    # print(gs2.best_score_)
    # print(gs2.best_params_)

    # 3) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [50, 100, 150, 200],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs3 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs3 = gs3.fit(train_X, train_y)
    n_estimators = gs3.best_params_['xgb__n_estimators']
    # print(gs3.best_score_)
    # print(gs3.best_params_)

    # 4) Tune learning rate
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [0.1],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    },
        {
        'xgb__n_estimators': [n_estimators * 3],
        'xgb__learning_rate': [0.03],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs4 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs4 = gs4.fit(train_X, train_y)
    n_estimators = gs4.best_params_['xgb__n_estimators']
    learning_rate = gs4.best_params_['xgb__learning_rate']
    # print(gs4.best_score_)
    # print(gs4.best_params_)

    # 5) Tune n_estimators
    param_grid = [{
        'xgb__n_estimators': [
            int(0.8 * n_estimators),
            int(1.0 * n_estimators),
            int(1.2 * n_estimators)
        ],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample]
    }]
    gs5 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs5 = gs5.fit(train_X, train_y)
    n_estimators = gs5.best_params_['xgb__n_estimators']
    # print(gs5.best_score_)
    # print(gs5.best_params_)

    # 6) Tune sampling by tree
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs6 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs6 = gs6.fit(train_X, train_y)
    colsample_bytree = gs6.best_params_['xgb__colsample_bytree']
    # print(gs6.best_score_)
    # print(gs6.best_params_)

    # 7) Tune subsample
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [1.0]
    }]
    gs7 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs7 = gs7.fit(train_X, train_y)
    subsample = gs7.best_params_['xgb__subsample']
    # print(gs7.best_score_)
    # print(gs7.best_params_)

    # 8) Tune sampling by level
    n_estimators = gs7.best_params_['xgb__n_estimators']
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }]
    gs8 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs8 = gs8.fit(train_X, train_y)
    colsample_bylevel = gs8.best_params_['xgb__colsample_bylevel']
    # print(gs8.best_score_)
    # print(gs8.best_params_)

    # 9) Tune sampling fields
    n_estimators = gs8.best_params_['xgb__n_estimators']
    subsample = 0.9 if subsample == 1.0 else subsample
    colsample_bytree = 0.6 if colsample_bytree == 0.5 else colsample_bytree
    colsample_bylevel = 0.9 if colsample_bylevel == 1.0 else colsample_bylevel
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample, subsample + 0.1],
        'xgb__colsample_bytree': [colsample_bytree - 0.1, colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel, colsample_bylevel + 0.1]
    }]
    gs9 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs9 = gs9.fit(train_X, train_y)
    subsample = gs9.best_params_['xgb__subsample']
    colsample_bytree = gs9.best_params_['xgb__colsample_bytree']
    colsample_bylevel = gs9.best_params_['xgb__colsample_bylevel']
    # print(gs9.best_score_)
    # print(gs9.best_params_)

    # 10) Tune alpha
    param_grid = [{
        'xgb__n_estimators': [n_estimators],
        'xgb__learning_rate': [learning_rate],
        'xgb__max_depth': [max_depth],
        'xgb__subsample': [subsample],
        'xgb__colsample_bytree': [colsample_bytree],
        'xgb__colsample_bylevel': [colsample_bylevel],
        'xgb__reg_lambda': [0.001, 0.01, 0.1, 0.3, 1, 3, 10, 100, 1000]
    }]
    gs10 = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=2)
    gs10 = gs10.fit(train_X, train_y)
    # print(gs10.best_score_)
    # print(gs10.best_params_)

    # Find the best model
    # Sometimes the best model isn't the last one, so checking all of them
    best_model = gs1
    best_model_score = gs1.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs2.best_score_ > best_model_score:
        best_model = gs2
        best_model_score = gs2.best_score_
    if gs3.best_score_ > best_model_score:
        best_model = gs3
        best_model_score = gs3.best_score_
    if gs4.best_score_ > best_model_score:
        best_model = gs4
        best_model_score = gs4.best_score_
    if gs5.best_score_ > best_model_score:
        best_model = gs5
        best_model_score = gs5.best_score_
    if gs6.best_score_ > best_model_score:
        best_model = gs6
        best_model_score = gs6.best_score_
    if gs7.best_score_ > best_model_score:
        best_model = gs7
        best_model_score = gs7.best_score_
    if gs8.best_score_ > best_model_score:
        best_model = gs8
        best_model_score = gs8.best_score_
    if gs9.best_score_ > best_model_score:
        best_model = gs9
        best_model_score = gs9.best_score_
    if gs10.best_score_ > best_model_score:
        best_model = gs10
        best_model_score = gs10.best_score_

    # Return the best model
    return XGBRegressor(**best_model.best_params_)

In [175]:
# Example dataset
boston_data = load_boston()

# Extract pandas dataframe and target
X = pd.DataFrame(boston_data['data'])
y = pd.DataFrame(boston_data['target'])

# Train/test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=42)
train_X, test_X = train_X.values, test_X.values
train_y, test_y = train_y.values.reshape(-1, 1), test_y.values.reshape(-1, 1)

# An okay model fit to the data
# xgb_model = get_xgboost_model(train_X, train_y)

# Pipeline
pipe = Pipeline([('standard_scaler', StandardScalerTransform()),
               #  ('min_max_scaler', MinMaxScalerTransform()),
                 ('model', xgb_model)])

# Find the number of features
num_features = train_X.shape[1]

# Testing with these indices
indices = list(range(num_features))

# Possible configurations
param_distributions = {
    'standard_scaler__column_indices_to_replace': [indices, num_features] # ,
#    'min_max_scaler__column_indices_to_replace': [indices, num_features]
}

# Randomly search the space n_iter times
experiments_df = RandomizedGridSearchCV(
    n_experiments=10,
    pipe=pipe,
    param_distributions=param_distributions,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
    scoring='neg_mean_squared_error',
    cv=2)

# Sort the scores
experiments_df.sort_values(by=['score'], ascending=False, inplace=True)

# Drop score
experiments_X_df = experiments_df.drop(['score'], axis=1)

# Get column names
X_column_names = experiments_X_df.columns

# Convert to numpy
experiments_X = experiments_X_df.values
experiments_y = experiments_df[['score']].values

# Create an XGBoost model tuned with the experiments data
xgb_experiments_model = get_xgboost_model(experiments_X, experiments_y)

# Fit the model
xgb_experiments_model.fit(experiments_X_df, experiments_y)

# Extract shap values
explainer = shap.TreeExplainer(xgb_experiments_model)
shap_values = explainer.shap_values(experiments_X_df)

# Shap as dataframe
pandas_shap_df = pd.DataFrame(shap_values, columns=X_column_names)
pandas_shap_df


[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]
Iteration:  0


ValueError: Expected 2D array, got 1D array instead:
array=[1.50234e+01 6.27390e-01 3.46600e-02 7.05042e+00 7.25800e-01 1.91860e-01
 3.96100e-02 2.05500e-02 1.51772e+01 1.44383e+01 3.73800e-02 6.88800e-02
 4.12380e-01 1.39134e+01 6.58800e-02 8.40540e-01 1.73310e-01 8.24400e-02
 2.06080e-01 1.40300e-01 7.35341e+01 1.50980e-01 1.41500e-01 3.51140e-01
 1.87000e-02 9.10300e-02 3.53501e+00 3.57800e-02 3.87350e-01 6.72400e-02
 1.35472e+00 2.22120e-01 2.33099e+00 6.44405e+00 3.30600e-02 1.43200e-02
 1.43900e-02 7.50260e-01 7.84200e-01 6.46600e-02 4.37900e-02 3.75780e-01
 4.15292e+01 4.29400e-02 1.41385e+00 9.72418e+00 9.88430e-01 5.26930e-01
 5.58107e+00 9.92485e+00 2.98500e-02 1.31580e-01 1.71420e-01 1.05393e+00
 1.55757e+01 4.54192e+00 3.23700e-02 6.79208e+01 6.04700e-02 1.49320e-01
 1.07930e-01 1.81590e-01 7.61620e-01 1.00245e+00 5.20140e-01 1.02330e+01
 6.71910e-01 1.44550e-01 1.11320e-01 1.28020e-01 8.01400e-02 1.22358e+00
 3.56868e+00 1.30580e-01 1.42310e-01 6.66400e-02 8.66400e-02 1.14600e-01
 2.77974e+00 1.11081e+01 7.99248e+00 8.98296e+00 6.12700e-02 3.58090e-01
 6.71772e+00 1.62864e+00 5.66998e+00 5.78900e-02 3.83684e+00 2.30040e+00
 1.77830e-01 1.33598e+01 2.50461e+01 2.18700e-02 1.90730e-01 2.63630e-01
 1.10874e+01 2.37934e+00 4.20300e-02 1.12658e+00 6.23560e-01 5.51500e-02
 3.55100e-02 1.64390e-01 2.92400e+00 1.51902e+00 3.15000e-02 4.62960e-01
 7.89600e-02 7.90410e-01 4.75237e+00 3.68940e-01 1.44760e-01 9.06000e-03
 9.26600e-02 2.81838e+00 3.84970e+00 2.48017e+01 2.98190e-01 5.34120e-01
 5.11830e-01 2.43938e+01 4.87141e+00 9.74400e-02 4.01100e-02 5.44520e-01
 4.89822e+00 1.96570e-01 3.87100e-02 2.36482e+01 1.03280e-01 1.00840e-01
 5.30200e-02 7.85700e-01 8.82900e-02 3.47428e+00 6.07600e-02 1.30100e-02
 1.34284e+00 1.65660e+00 5.42500e-02 7.67202e+00 8.30800e-02 4.02020e-01
 2.24890e-01 2.00849e+01 2.11610e-01 4.46200e-02 1.75050e-01 2.45220e-01
 1.80028e+00 6.39312e+00 5.56100e-02 5.37200e-02 3.76800e-02 9.82349e+00
 2.15505e+00 5.87205e+00 2.36862e+00 7.36711e+00 4.29700e-02 1.50380e-01
 2.07460e-01 1.15040e-01 4.09740e+00 9.25200e-02 9.60400e-02 1.20830e-01
 1.70900e-02 9.29900e-02 1.00080e-01 2.17700e-02 3.39830e-01 2.37857e+00
 3.53700e-02 4.30100e-02 5.11358e+01 9.91655e+00 1.96500e-02 1.69020e-01
 5.47900e-02 6.14700e-01 1.20482e+01 1.14250e-01 8.81250e-01 8.79212e+00
 7.88600e-02 5.02300e-02 8.89762e+01 5.82401e+00 5.20177e+00 1.41030e-01
 8.19900e-02 6.53876e+00 1.36781e+01 1.23290e-01 5.78000e-02 2.63548e+00
 2.49800e-02 5.08300e-02 4.83567e+00 8.20058e+00 3.31470e-01 3.69200e-01
 2.24236e+00 3.22640e-01 4.66600e-02 6.63510e-01 5.75290e-01 1.71340e-01
 6.89900e-02 7.24400e-02 3.15330e-01 2.07162e+01 6.15100e-02 2.59150e-01
 1.09600e-02 1.80846e+01 1.31170e-01 1.84982e+01 7.52601e+00 3.29820e-01
 1.35222e+01 1.22690e-01 1.78990e-01 3.58400e-02 1.50100e-02 5.73500e-02
 1.02900e-01 5.60200e-02 1.58603e+01 1.42502e+00 9.37800e-02 6.41700e-02
 7.72990e-01 1.20742e+00 3.32105e+00 9.59571e+00 2.89900e-02 4.07710e-01
 1.22040e-01 4.33700e-02 1.13290e-01 1.52880e+01 9.18702e+00 6.64200e-02
 1.27440e-01 2.20511e+01 5.29305e+00 2.29690e-01 6.12900e-02 4.81900e-02
 1.08342e+01 6.90500e-02 1.53800e-02 8.24809e+00 1.48660e-01 3.82140e-01
 1.00623e+01 1.40520e-01 1.22472e+01 2.31390e+00 8.18700e-02 3.61500e-02
 1.98020e-01 1.71710e-01 2.29270e-01 1.38799e+00 5.78340e-01 2.41030e-01
 1.77800e-02 5.44114e+00 9.55770e-01 8.64476e+00 5.37000e-01 5.40110e-01
 4.59000e-02 1.83377e+00 9.33889e+00 2.49800e-01 1.10270e-01 5.57780e-01
 3.25430e-01 5.73116e+00 2.11240e-01 3.03470e-01 1.30751e+01 1.95100e-02
 4.41700e-02 6.37960e-01 2.44668e+00 3.35900e-02 1.78667e+01 3.16360e+00
 1.19511e+01 4.56000e-02 2.10380e-01 9.39063e+00 1.09590e-01 3.04100e-02
 5.20580e-01 2.51990e-01 2.17190e-01 1.29320e-01 6.65492e+00 2.14090e-01
 2.79570e-01 7.83932e+00 1.00000e-01 6.21100e-02 9.06500e-02 3.44500e-02
 1.46336e+00 1.59360e-01 7.01300e-02 1.42362e+01 9.06800e-02 3.49400e-01
 6.56650e-01 1.32620e-01 4.98100e-02 8.15174e+00 2.73100e-02 6.28807e+00
 1.50860e-01 2.19770e-01 1.18123e+01 4.11300e-02 1.36420e-01 1.61282e+00
 8.49213e+00 8.25260e-01 3.76619e+01 3.69695e+00 3.93200e-02 5.49700e-02
 1.43337e+01 5.36000e-02 3.11300e-02 5.50070e-01 1.06120e-01 6.29760e-01
 2.53560e-01 5.66000e-02 2.25971e+01 2.21880e-01 2.01019e+00 6.61700e-02
 2.39120e-01 9.76170e-01 7.50300e-02 5.69175e+00 4.75470e-01 1.27570e-01
 1.36000e-02 4.22239e+00 8.87300e-02 3.69311e+00 8.44700e-02 1.06718e+01
 8.37000e-02 4.52700e-02 5.82115e+00 7.87500e-02 2.44953e+00 1.54450e-01
 2.53870e-01 3.04900e-02 3.30450e-01 8.22100e-02 8.52040e-01 2.69380e-01
 6.80117e+00 1.27346e+00 1.04690e-01 9.96654e+00 6.91100e-02 1.68118e+01
 8.26500e-02 2.86558e+01 2.54300e-02 6.11540e-01 4.92980e-01 2.73397e+00
 3.40060e-01 1.49632e+00 4.26131e+00 6.86000e-02 8.26725e+00 7.15100e-02
 7.75223e+00 4.54400e-02 2.89550e-01 3.77498e+00 7.16500e-02 4.74100e-02
 1.25179e+00 1.25790e-01 1.58760e-01 1.71200e-01 2.99160e-01 1.50100e-02
 1.11604e+01 2.28760e-01].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [170]:
xgb_experiments_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, xgb__learning_rate=0.03, xgb__max_depth=1,
       xgb__n_estimators=450, xgb__subsample=1.0)

In [161]:
experiments_df

Unnamed: 0,score
0,7.266728
1,7.266728
2,7.266728
3,7.266728
4,7.266728
5,7.266728
6,7.266728
7,7.266728
8,7.266728
9,7.266728


In [10]:
# Transformation to polarized groups of shap values
polarized_shap_df = pandas_shap_df.copy()
for i in range(0, len(pandas_shap_df.index)):
    for j in range(0, len(pandas_shap_df.columns)):
        if not experiments_df.iloc[i, j]:
            polarized_df.iloc[i, j] = -1 * pandas_shap_df.iloc[i, j]
polarized_df


NameError: name 'polarized_df' is not defined

In [None]:
# Certainly, I set a feature to True for large positive values
# Also, I set a feature to False for large negative values
# Otherwise, it is set to True or False
polarized_shap_result = polarized_df.sum()
polarized_shap_result
