# Force-Plot Shapley Values using Plotly
- Uses a pre-trained H2O model

## Import dependencies

In [1]:
import sklearn
import pandas as pd
import numpy as np
import shap
import h2o
from h2o.automl import H2OAutoML

from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go

# Enable inline rendering for plotly's offline iplot(), which force_plot() uses below.
# NOTE(review): connected=True is documented by plotly as loading plotly.js from a CDN,
# so viewing the figures presumably needs network access - confirm for offline use.
init_notebook_mode(connected=True)

## initialize h2o

In [2]:
%%capture
# Attach to the H2O cluster needed by h2o.load_model / H2OFrame further down.
# The %%capture magic at the top of this cell suppresses the startup output.
h2o.init()

## h2o prediction wrapper

In [3]:
class H2OProbWrapper:
    """Adapter that lets array-based callers (e.g. shap) score rows with an H2O model.

    Attributes set by ``__init__``:
        h2o_model: object whose ``predict`` accepts an ``h2o.H2OFrame``.
        feature_names: column labels applied to the incoming data.

    Attributes set by each ``predict_binary_prob`` call:
        dataframe: the pandas frame built from the most recent input.
        predictions: the raw prediction table (as a numpy array) for that input.
    """

    def __init__(self, h2o_model, feature_names):
        self.feature_names = feature_names
        self.h2o_model = h2o_model

    def predict_binary_prob(self, X):
        """Score ``X`` and return the last prediction column as float64.

        Args:
            X: a ``pd.Series`` (treated as a single row) or anything
               ``pd.DataFrame`` accepts together with ``feature_names`` as columns.

        Returns:
            1-D float64 array taken from the last column of the model's
            prediction table (the probability of the True class).
        """
        # A Series is one observation: turn it into a 1 x n_features row.
        rows = X.values.reshape(1, -1) if isinstance(X, pd.Series) else X

        # Both intermediates stay on the instance, exactly as before.
        self.dataframe = pd.DataFrame(rows, columns=self.feature_names)
        h2o_frame = h2o.H2OFrame(self.dataframe)
        scored = self.h2o_model.predict(h2o_frame)
        self.predictions = scored.as_data_frame().values

        as_float = self.predictions.astype('float64')
        return as_float[:, -1]

## Force-plot and helper functions

In [4]:
def svg_path_headless(start,shap_value,arrow_width):
    """Build the Plotly ``path`` shape for an arrow whose base edge is flat.

    The shape occupies y in [0, 1]. Its base is the vertical line at ``start``;
    the opposite edge is notched inwards by ``arrow_width`` at y = 0.5.

    Args:
        start: x coordinate of the flat base edge.
        shap_value: signed contribution. Negative values extend to the right of
            ``start`` and are red; all other values (including 0) extend to the
            left and are blue.
        arrow_width: horizontal depth of the notch.

    Returns:
        dict with keys ``type``, ``path``, ``fillcolor`` and ``line``.
    """
    if shap_value < 0:
        far_edge = start + shap_value * -1
        notch = far_edge - arrow_width
        fill, stroke = 'rgba(255, 0, 0, 0.5)', 'rgb(255, 0, 0)'
    else:
        far_edge = start - shap_value
        notch = far_edge + arrow_width
        fill, stroke = 'rgba(0, 0, 255, 0.5)', 'rgb(0, 0, 255)'

    # SVG path commands, in drawing order; str() keeps the original number formatting.
    segments = [
        ' M' + str(start) + ',0',
        ' L' + str(start) + ',1',
        ' L' + str(far_edge) + ',1',
        ' L' + str(notch) + ',0.5',
        ' L' + str(far_edge) + ',0',
        ' Z',
    ]

    return dict(
        type='path',
        path=''.join(segments),
        fillcolor=fill,
        line=dict(color=stroke, width=1),
    )

def svg_path(start,shap_value,arrow_width):
    """Build the Plotly ``path`` shape for a chevron (both edges notched).

    Same geometry as a flat-based arrow, except that the edge at ``start`` is
    also notched by ``arrow_width`` at y = 0.5 so consecutive shapes interlock.

    Args:
        start: x coordinate of the edge nearest the previous shape.
        shap_value: signed contribution. Negative values extend to the right of
            ``start`` and are red; all other values (including 0) extend to the
            left and are blue.
        arrow_width: horizontal depth of both notches.

    Returns:
        dict with keys ``type``, ``path``, ``fillcolor`` and ``line``.
    """
    is_negative = shap_value < 0
    if is_negative:
        shap_value = shap_value * -1
        near_notch = start - arrow_width
        far_edge = start + shap_value
        far_notch = far_edge - arrow_width
        rgb = '255, 0, 0'
    else:
        near_notch = start + arrow_width
        far_edge = start - shap_value
        far_notch = far_edge + arrow_width
        rgb = '0, 0, 255'

    # %s formats each coordinate with str(), matching plain string concatenation.
    outline = ' M%s,0 L%s,0.5 L%s,1 L%s,1 L%s,0.5 L%s,0 Z' % (
        start, near_notch, start, far_edge, far_notch, far_edge,
    )

    shape = {'type': 'path', 'path': outline}
    shape['fillcolor'] = 'rgba(' + rgb + ', 0.5)'
    shape['line'] = {'color': 'rgb(' + rgb + ')', 'width': 1}
    return shape

def annot(x,y,text,ay,ax,color):
    """Return a one-element list holding a Plotly annotation dict.

    The annotation points at data coordinates (``x``, ``y``) with an arrow;
    ``ax`` / ``ay`` are passed through as the arrow-tail offsets, and ``color``
    is applied to both the arrow and the 10pt label font.

    A list is returned so callers can concatenate results with ``+``.
    """
    label_font = {'color': color, 'size': 10}
    annotation = {
        'x': x,
        'y': y,
        'xref': 'x',
        'yref': 'y',
        'text': text,
        'showarrow': True,
        'arrowhead': 7,
        'arrowcolor': color,
        'ax': ax,
        'ay': ay,
        'opacity': 0.6,
        'font': label_font,
    }
    return [annotation]
    
    
def force_plot(shap_values,X,output_value,base_value,display_values=None):
    """Render a SHAP force plot for a single prediction as an inline Plotly figure.

    Negative SHAP values are drawn as red arrows stacked to the right of
    ``output_value`` (labelled "lower"); positive ones as blue arrows stacked
    to the left (labelled "higher"). On each side the values are ordered by
    decreasing magnitude, starting next to ``output_value``.

    Parameters
    ----------
    shap_values : array-like
        One SHAP value per column of ``X``. Flattened to 1-D before use.
    X : pandas.DataFrame
        Only ``X.columns`` is read, to name the features.
    output_value : float
        Model output for the explained row; both arrow stacks start here.
    base_value : float
        Only used for the "base_value" text label.
    display_values : mapping or pandas.Series, optional
        Feature name -> value shown in each annotation ("name=value").
        When omitted, falls back to the notebook-level globals
        ``X_test_display.iloc[person]`` (the previous, implicit behaviour).

    Raises
    ------
    ValueError
        If the number of SHAP values differs from the number of columns.

    Returns
    -------
    None. The figure is displayed through ``iplot``.
    """
    feature_cols = X.columns

    # Accept lists and (1, n) arrays as well as the usual 1-D array.
    shap_values = np.asarray(shap_values).ravel()
    if len(shap_values) != len(feature_cols):
        raise ValueError(
            'force_plot: got %d SHAP values for %d feature columns'
            % (len(shap_values), len(feature_cols))
        )

    if display_values is None:
        # Legacy fallback: rely on the notebook globals, as earlier versions did.
        display_values = X_test_display.iloc[person, :]

    gap = 0.0025        # white space between neighbouring arrows
    arrow_width = 0.01  # depth of the arrow notch
    ays = [75, 100]     # alternating vertical label offsets to reduce overlap
    axs = [15, 20]      # alternating horizontal label offsets (red side)

    # Negative side: most negative first.
    neg_mask = shap_values < 0.
    neg_order = np.argsort(shap_values[neg_mask])
    neg_shap_values = shap_values[neg_mask][neg_order]
    neg_X_cols = feature_cols[neg_mask][neg_order].tolist()

    # Positive side: largest first. The SAME reversed order is applied to the
    # column names so every arrow keeps its own label (the names were
    # previously left in ascending order while the values were reversed).
    pos_mask = shap_values > 0.
    pos_order = np.argsort(shap_values[pos_mask])[::-1]
    pos_shap_values = shap_values[pos_mask][pos_order]
    pos_X_cols = feature_cols[pos_mask][pos_order].tolist()

    # Axis limits: total extent of each stack plus its gaps and a 0.1 margin.
    # The left limit uses the count of positive values (it used the negative count).
    x_max = output_value + np.sum(-1 * neg_shap_values) + len(neg_shap_values) * gap + 0.1
    x_min = output_value - np.sum(pos_shap_values) - len(pos_shap_values) * gap - 0.1

    # Lists start empty so a prediction with no negative (or no positive)
    # contributions still produces a figure instead of an UnboundLocalError.
    shape_list_neg = []
    neg_annot = []
    offset = output_value
    for indx, shap_val in enumerate(neg_shap_values):
        if indx == 0:
            shape_list_neg.append(svg_path_headless(offset, shap_val, arrow_width))
            neg_text_xcoord = offset - shap_val / 2
        else:
            # Previous value is negative, so this moves the start to the right.
            offset = offset - neg_shap_values[indx - 1] + gap
            shape_list_neg.append(svg_path(offset, shap_val, arrow_width))
            neg_text_xcoord = offset - gap * 2
        parity = indx % 2  # replaces np.int(((-1)**(indx+1)+1)/2); np.int no longer exists
        col = neg_X_cols[indx]
        neg_annot = neg_annot + annot(x=neg_text_xcoord, y=0.5, ay=ays[parity],
                                      ax=axs[parity] * (indx + 1), color='red',
                                      text=str(col) + '=' + str(display_values[col]))

    shape_list_pos = []
    pos_annot = []
    offset = output_value
    for indx, shap_val in enumerate(pos_shap_values):
        if indx == 0:
            shape_list_pos.append(svg_path_headless(offset, shap_val, arrow_width))
            pos_text_xcoord = offset - shap_val / 2
        else:
            offset = offset - pos_shap_values[indx - 1] - gap
            shape_list_pos.append(svg_path(offset, shap_val, arrow_width))
            pos_text_xcoord = offset + gap * 2
        col = pos_X_cols[indx]
        pos_annot = pos_annot + annot(x=pos_text_xcoord, y=0.5, ay=ays[indx % 2],
                                      ax=-20 * (indx + 1), color='blue',
                                      text=str(col) + '=' + str(display_values[col]))

    # Text-only trace carrying the two numeric labels above the arrows.
    trace0 = go.Scatter(
        x=[output_value, base_value],
        y=[1.5, 1.5],
        text=['output_value = ' + '<br />' + str(np.round(output_value, 3)),
              'base_value = ' + '<br />' + str(np.round(base_value, 3)),
             ],
        mode='text'
    )

    # Direction legend anchored at the output value.
    direction_annot = [
        dict(
            x=output_value,
            y=2.3,
            xref='x',
            yref='y',
            text='lower',
            showarrow=True,
            arrowhead=1,
            ax=50,
            ay=0,
            opacity=0.6,
            arrowcolor='red',
            font=dict(color='red', size=10)
        ),
        dict(
            x=output_value,
            y=2.3,
            xref='x',
            yref='y',
            text='higher',
            showarrow=True,
            arrowhead=1,
            ax=-50,
            ay=0,
            opacity=0.6,
            arrowcolor='blue',
            font=dict(color='blue', size=10)
        ),
    ]

    layout = {
        'title': 'Force Plot',
        'xaxis': {
            'range': [x_min, x_max],
            'zeroline': False,
        },
        'yaxis': {
            'range': [-0.5, 2.4],
            'showgrid': False,
            'showline': False,
            'ticks': '',
            'showticklabels': False
        },
        'shapes': shape_list_neg + shape_list_pos,
        'width': 1200,
        'height': 250,
        'annotations': pos_annot + neg_annot + direction_annot,
    }
    fig = {
        'data': [trace0],
        'layout': layout
    }
    iplot(fig, filename='force-plot')

## Import dataset

In [5]:
# Load the pre-split dataset from files in the current working directory.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - only load files from a trusted source.
X = pd.read_pickle('X.pkl')
X_train = pd.read_pickle('X_train.pkl')
X_test = pd.read_pickle('X_test.pkl')
# The *_display frames are loaded alongside the model inputs; X_test_display supplies the
# values printed in the force-plot annotations (presumably the human-readable encoding - confirm).
X_train_display = pd.read_pickle('X_train_display.pkl')
X_test_display = pd.read_pickle('X_test_display.pkl')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')
y_train_display = np.load('y_train_display.npy')
# y_test_display is what the prediction cells print as ground truth.
y_test_display = np.load('y_test_display.npy')

In [6]:
# Training-frame column names, in order; H2OProbWrapper uses them to label incoming data.
feature_names = list(X_train.columns)

## Import h2o model

In [7]:
# Load a previously saved H2O model from a path relative to the working directory.
# NOTE(review): the file name suggests an AutoML stacked ensemble - confirm against the training notebook.
bst_model = h2o.load_model('StackedEnsemble_AllModels_AutoML_20190317_062616')

In [8]:
# Wrap the model so its predict_binary_prob method can be handed to shap as a scoring function.
h2o_wrapper = H2OProbWrapper(bst_model,feature_names) 

## Get base value

In [9]:
# Build a KernelExplainer around the wrapped model, using the first 100 training rows
# as the background data passed to shap.
explainer = shap.KernelExplainer(h2o_wrapper.predict_binary_prob, X_train.iloc[:100,:])
# Stored once and reused as the base_value label in every force plot below.
base_value = explainer.expected_value

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


## Model Prediction for first person in dataset

In [10]:
# NOTE: `person` is also read as a global inside force_plot() to pick the row of
# X_test_display shown in the annotations, so it must be set before the call.
person = 0 # first person in test dataset


# SHAP values for this single row, estimated with 500 samples.
shap_values = explainer.shap_values(X_test.iloc[person,:], nsamples=500)
# predict_binary_prob returns an array; [0] extracts the single probability.
output_value = h2o_wrapper.predict_binary_prob(X_test.iloc[person])[0]

print('prediction (probability that this person earns more than $50k/year) =', output_value)
print('ground_truth (this person earns more than $50k/year) =', y_test_display[person])

# X is passed only for its column names; assumes X has the same columns as X_test - TODO confirm.
force_plot(shap_values,X,output_value,base_value)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%



l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!



Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
prediction (probability that this person earns more than $50k/year) = 0.04360118550084032
ground_truth (this person earns more than $50k/year) = False


## Second prediction
- second person in the test dataset

In [11]:
# NOTE: `person` is also read as a global inside force_plot() to pick the row of
# X_test_display shown in the annotations, so it must be set before the call.
person = 1 # second person in test dataset


# SHAP values for this single row, estimated with 500 samples.
shap_values = explainer.shap_values(X_test.iloc[person,:], nsamples=500)
# predict_binary_prob returns an array; [0] extracts the single probability.
output_value = h2o_wrapper.predict_binary_prob(X_test.iloc[person])[0]

print('prediction (probability that this person earns more than $50k/year) =', output_value)
print('ground_truth (this person earns more than $50k/year) =', y_test_display[person])

# X is passed only for its column names; assumes X has the same columns as X_test - TODO confirm.
force_plot(shap_values,X,output_value,base_value)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%



l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!



Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
prediction (probability that this person earns more than $50k/year) = 0.8012822261020577
ground_truth (this person earns more than $50k/year) = True


## shutdown h2o

In [12]:
# Shut down the H2O cluster this notebook attached to; h2o calls after this point
# need h2o.init() to be run again.
h2o.cluster().shutdown()

H2O session _sid_aa47 closed.
