In [23]:
import pandas as pd
import altair as alt
from altair.expr import datum
import warnings
warnings.filterwarnings('ignore')

In [24]:
## Load the dataframes
# Both inputs are pickles addressed relative to the notebook's directory.
subset_analysis_filename = '../Data/4vis/subset_analysis.pkl'
data_filename = '../Data/Model/data.pkl'

# Table that every chart below is drawn from.
subset_analysis_df = pd.read_pickle(subset_analysis_filename)

# Modelling table; everything except the 'target' column is treated as a
# feature, and X.columns is what the plotting helpers use as feature names.
df = pd.read_pickle(data_filename)
y = df['target']
X = df.drop(columns=['target'])

In [25]:
# One row per attribution method:
#   (method name, attribution-column suffix, rank-column suffix)
# Row order is the order in which the methods are laid out in the plots.
_METHOD_COLUMN_SUFFIXES = (
    ('shap', 'shap_value', 'shap_rank'),
    ('lime', 'lime_weight', 'lime_rank'),
    ('ig', 'ig_attr', 'ig_rank'),
    ('deepLift', 'deepLift_attr', 'deepLift_rank'),
)

# method name -> suffix of the columns holding that method's attribution values
method_name_to_attr_name = {
    method: attr_suffix for method, attr_suffix, _ in _METHOD_COLUMN_SUFFIXES
}
# method name -> suffix of the columns holding that method's feature ranks
method_name_to_feature_rank = {
    method: rank_suffix for method, _, rank_suffix in _METHOD_COLUMN_SUFFIXES
}

In [26]:
def Get_method_to_dicts(dict_name_to_mode, feature_names=None):
    """Build, per method, the feature-name <-> absolute-column-name lookups.

    For every method in ``dict_name_to_mode`` and every feature name, the
    absolute column name is ``<feature>_abs_<suffix>``, where ``<suffix>`` is
    the value ``dict_name_to_mode`` maps that method to.

    Parameters
    ----------
    dict_name_to_mode : dict
        Maps a method name to the column suffix used for that method.
    feature_names : iterable of str, optional
        Feature names to create entries for. When omitted, the columns of the
        notebook-level ``X`` frame are used.

    Returns
    -------
    tuple of (dict, dict)
        ``(method_to_featNames2abs, method_to_abs2featNames)``. Both are keyed
        by method name; the first holds ``{feature: abs_column}`` dicts and
        the second holds the inverse ``{abs_column: feature}`` dicts.
    """
    if feature_names is None:
        feature_names = X.columns
    # Materialise once so a one-shot iterable can be reused for every method.
    feature_names = list(feature_names)

    method_to_featNames2abs = {}
    method_to_abs2featNames = {}
    for method_name, suffix in dict_name_to_mode.items():
        featNames2abs = {name: name + '_abs_' + suffix for name in feature_names}
        method_to_featNames2abs[method_name] = featNames2abs
        method_to_abs2featNames[method_name] = {
            abs_name: name for name, abs_name in featNames2abs.items()
        }

    return method_to_featNames2abs, method_to_abs2featNames


In [27]:
# Lookup tables in both directions (feature -> abs column, abs column -> feature),
# once for the attribution-value columns and once for the rank columns.
(method_to_featNames2abs_attr,
 method_to_abs2featNames_attr) = Get_method_to_dicts(method_name_to_attr_name)
(method_to_featNames2abs_rank,
 method_to_abs2featNames_rank) = Get_method_to_dicts(method_name_to_feature_rank)

In [28]:
# for fixing the x and y ranges of the visualization
# (column-wise extremes of the analysis table; looked up by the plotting helpers)
max_df = subset_analysis_df.max()
min_df = subset_analysis_df.min()

# base common chart
# The brush is attached to the feature-value histogram, whose y channel is the
# aggregate count(). Only the x channel carries a data field, so the brush is
# restricted to x explicitly (the pattern Altair's histogram cross-filter
# examples use) instead of relying on the default x+y projection.
brush = alt.selection_interval(encodings=['x'])
base = alt.Chart(subset_analysis_df).encode(
    # Marks inside the brush are fully opaque, everything else is dimmed.
    opacity=alt.condition(brush, alt.value(1.0), alt.value(0.5))
).add_selection(
    brush
).properties(
    width=350,
    height=350
)


In [29]:
def Get_sliced_mean_attr_plot_by_method(subset_analysis_df, feature_name, method_name, rank_mode):
    """Bar chart of the per-feature mean of one method's absolute columns.

    The ``<feature>_abs_<suffix>`` columns of ``subset_analysis_df`` are
    renamed to the bare feature names, folded into ``key``/``value`` pairs and
    averaged per feature. Only the rows selected by the notebook-level
    ``brush`` are included.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table to plot. It is not modified; the chart is built from a renamed
        copy.
    feature_name : str
        Not used by this function. Kept so existing callers keep working.
    method_name : str
        Key of ``method_name_to_attr_name`` / ``method_name_to_feature_rank``.
    rank_mode : bool
        If true, the rank columns are averaged; otherwise the attribution
        columns are.

    Returns
    -------
    alt.Chart

    Raises
    ------
    KeyError
        If ``method_name`` is not a known method.
    """
    if rank_mode:
        suffix_by_method = method_name_to_feature_rank
        x_title = 'mean absolute rank ' + method_name
    else:
        suffix_by_method = method_name_to_attr_name
        x_title = 'mean absolute ' + method_name + ' attribute'

    # Only the mapping for the requested method is needed, so build just that
    # instead of regenerating the tables for every method on every call.
    feature_names = X.columns.tolist()
    abs_suffix = '_abs_' + suffix_by_method[method_name]
    abs_column_to_feature = {name + abs_suffix: name for name in feature_names}

    # Renaming lets the fold emit plain feature names as the bar labels.
    # All other columns are kept because the brush filter tests a column of
    # the chart the brush is drawn on.
    renamed_df = subset_analysis_df.rename(columns=abs_column_to_feature)

    return alt.Chart(renamed_df).transform_fold(
        feature_names,
        as_=['key', 'value']
    ).mark_bar().encode(
        y=alt.Y('key:N', title='feature_name'),
        x=alt.X('mean(value):Q', title=x_title)
    ).transform_filter(
        brush
    )

In [30]:
def Get_feature_value_vs_mean_attr_plot(subset_analysis_df, feature_name, rank_mode):
    """Histogram of one feature's values next to per-method mean-|attr| bars.

    The left panel is a histogram of ``<feature_name>_value`` coloured by
    ``target`` and carrying the notebook-level ``brush``. The right panel is a
    grid, two charts per row, with one chart per method in
    ``method_name_to_attr_name``; each chart comes from
    ``Get_sliced_mean_attr_plot_by_method``.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table to plot. Used for the histogram as well as the bar charts.
    feature_name : str
        Feature whose ``<feature_name>_value`` column is histogrammed.
    rank_mode : bool
        Passed through to ``Get_sliced_mean_attr_plot_by_method``.

    Returns
    -------
    alt.HConcatChart
    """
    # distribution of feature value plot
    feature_value_name = feature_name + '_value'
    # The extent comes from the notebook-level min/max tables, not from the
    # frame passed in (those tables are there to fix the axis ranges).
    min_range_feature_value = min_df[feature_value_name]
    max_range_feature_value = max_df[feature_value_name]
    feature_value_bin_step = (max_range_feature_value - min_range_feature_value) / 20

    if feature_value_bin_step > 0:
        value_bin = alt.Bin(
            step=feature_value_bin_step,
            extent=[min_range_feature_value, max_range_feature_value],
        )
    else:
        # A constant column gives a step of 0 and an empty one gives NaN;
        # neither is a usable bin width, so let Vega-Lite choose the bins.
        value_bin = alt.Bin(maxbins=20)

    # Re-point the shared base chart at the frame that was passed in, so the
    # histogram and the bar charts are always drawn from the same rows.
    view_feature_value = base.properties(data=subset_analysis_df).mark_bar().encode(
        x=alt.X(feature_value_name, bin=value_bin),
        y=alt.Y('count()'),
        color='target:N'
    )

    method_charts = [
        Get_sliced_mean_attr_plot_by_method(subset_analysis_df, feature_name, method_name, rank_mode)
        for method_name in method_name_to_attr_name
    ]

    # Lay the per-method charts out two to a row.
    charts_per_row = 2
    rows = [
        alt.hconcat(*method_charts[start:start + charts_per_row])
        for start in range(0, len(method_charts), charts_per_row)
    ]
    view_sliced_mean_attr = alt.vconcat(*rows)

    return view_feature_value | view_sliced_mean_attr

## Feature Value Distribution vs. Mean Absolute Attribute

In [31]:
# Histogram of glucose values beside each method's mean absolute attribution.
Get_feature_value_vs_mean_attr_plot(
    subset_analysis_df,
    feature_name='glucose',
    rank_mode=False,
)

SchemaValidationError: Invalid specification

        altair.vegalite.v4.schema.channels.OpacityValue, validating 'additionalProperties'

        Additional properties are not allowed ('selection' was unexpected)
        

alt.HConcatChart(...)

## Feature Value Distribution vs. Mean Absolute Rank


In [None]:
# Same view as above, but averaging each method's feature ranks.
Get_feature_value_vs_mean_attr_plot(
    subset_analysis_df,
    feature_name='glucose',
    rank_mode=True,
)

In [None]:
# Preview the first rows only instead of rendering the whole analysis table.
subset_analysis_df.head()

In [None]:
#feature_value_vs_attr_plot distribution

In [None]:
# Get_feature_value_vs_signed_rank_plot('Pedigree')