In [None]:
import pandas as pd
import altair as alt
from altair.expr import datum
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two pickled DataFrames used throughout the notebook.
# NOTE: pd.read_pickle can execute code embedded in the file -- only load
# pickles that were produced by a trusted pipeline.
subset_analysis_filename = '../Data/4vis/subset_analysis.pkl'
data_filename = '../Data/Model/data.pkl'

subset_analysis_df = pd.read_pickle(subset_analysis_filename)
df = pd.read_pickle(data_filename)

# Split the model data into the feature columns and the 'target' column.
y = df['target']
X = df.drop(columns=['target'])
# Column-name suffix holding each method's attribution value; columns are
# addressed as '<feature>_<suffix>' (e.g. '<feature>_shap_value').
method_name_to_attr_name = dict(
    shap='shap_value',
    lime='lime_weight',
    ig='ig_attr',
    deepLift='deepLift_attr',
)

# Column-name suffix holding each method's feature rank; columns are
# addressed as '<feature>_<suffix>' (e.g. '<feature>_ig_rank').
method_name_to_feature_rank = dict(
    shap='shap_rank',
    lime='lime_rank',
    ig='ig_rank',
    deepLift='deepLift_rank',
)

In [None]:
# Load the melted mean-attribution table.
mean_attr_filename = '../Data/4vis/mean_attr.pkl'
mean_attr_melted = pd.read_pickle(mean_attr_filename)

# Feature names ordered by descending 'feature_attr' among the rows whose
# 'method' is 'sum_rank'; this fixes the left-to-right order of the rank plots.
sorted_feature = (
    mean_attr_melted
    .loc[mean_attr_melted['method'] == 'sum_rank']
    .sort_values(by='feature_attr', ascending=False)['feature_name']
    .to_list()
)

In [None]:
# Per-method lookup tables between plain feature names and their
# '<feature>_abs_<attribution suffix>' column names, in both directions.
method_to_featNames2abs = {}
method_to_abs2featNames = {}
for method_name, attr_name in method_name_to_attr_name.items():
    abs_suffix = '_abs_' + attr_name

    # feature name -> absolute-attribution column name
    featNames2abs = {}
    for feature_name in X.columns:
        featNames2abs[feature_name] = feature_name + abs_suffix

    # absolute-attribution column name -> feature name (inverse of the above)
    abs2featNames = {abs_name: feat for feat, abs_name in featNames2abs.items()}

    method_to_abs2featNames[method_name] = abs2featNames
    method_to_featNames2abs[method_name] = featNames2abs

In [None]:
# Per-feature lookup tables between absolute-rank column names and method names.
# E.g. for feature 'BMI':
#   ['BMI_abs_shap_rank', 'BMI_abs_lime_rank', 'BMI_abs_ig_rank', 'BMI_abs_deepLift_rank']
#   <-> ['shap', 'lime', 'ig', 'deepLift']
feat_to_absRank2method = {}
feat_to_method2absRank = {}  # inverse direction, for renaming back

for feature_name in X.columns:
    rank_prefix = feature_name + '_abs_'

    # method name -> '<feature>_abs_<rank suffix>' column name
    method2absRank = {
        method: rank_prefix + rank_suffix
        for method, rank_suffix in method_name_to_feature_rank.items()
    }
    # column name -> method name (inverse of the above)
    absRank2method = {abs_rank: method for method, abs_rank in method2absRank.items()}

    feat_to_absRank2method[feature_name] = absRank2method
    feat_to_method2absRank[feature_name] = method2absRank

In [None]:
# Column-wise extrema of the analysis table; the chart builders read them to
# pin bin extents so the x and y ranges stay fixed.
max_df = subset_analysis_df.max()
min_df = subset_analysis_df.min()

# Interval brush driven by shift + drag (both drawing and translating it).
brush = alt.selection_interval(
    translate='[mousedown[event.shiftKey], mouseup] > mousemove!',
    on='[mousedown[event.shiftKey], mouseup] > mousemove',
)

# Interval selection driven by a plain drag (no shift) and bound to the
# scales, i.e. pan/zoom.
pan_zoom = alt.selection_interval(
    bind='scales',
    translate='[mousedown[!event.shiftKey], mouseup] > mousemove!',
    on='[mousedown[!event.shiftKey], mouseup] > mousemove',
)

# Common base chart: carries the brush and dims marks outside of it.
base = alt.Chart(subset_analysis_df).add_selection(
    brush
).encode(
    opacity=alt.condition(brush, alt.value(1.0), alt.value(0.3))
)

In [None]:
def Get_mean_rank_plot(subset_analysis_df):
    """Build one mean-absolute-rank bar chart per feature, concatenated horizontally.

    For every feature in ``sorted_feature`` the feature's
    ``<feature>_abs_<method>_rank`` columns are renamed to the bare method
    names (via ``feat_to_absRank2method``) so they can be folded into
    ``key``/``value`` pairs. Each chart shows ``mean(value)`` per method for
    the rows selected by ``brush``.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table containing the absolute-rank columns. It is not modified.

    Returns
    -------
    Altair horizontally concatenated chart with one bar chart per feature,
    in ``sorted_feature`` order.
    """
    # NOTE(review): the fixed (1, 8) domain presumably matches the number of
    # ranked features -- confirm before reusing with another data set.
    rank_domain = (1, 8)

    view_rank_plot = alt.hconcat()

    for feature_name in sorted_feature:
        # rename() returns a new frame, so every chart gets its own renamed
        # copy of the untouched input and nothing has to be renamed back.
        feature_df = subset_analysis_df.rename(columns=feat_to_absRank2method[feature_name])
        view_rank_plot |= alt.Chart(feature_df).transform_fold(
            list(method_name_to_attr_name),
            as_=['key', 'value']
        ).mark_bar().encode(
            x=alt.X('key:N', title=''),
            y=alt.Y('mean(value):Q', title='Mean Absolute Rank', scale=alt.Scale(domain=rank_domain)),
            color=alt.Color('mean(value):Q', scale=alt.Scale(domain=rank_domain, scheme='viridis')),
        ).transform_filter(
            brush
        ).properties(
            title=feature_name
        ).resolve_scale(
            color='independent'
        )

    return view_rank_plot


In [None]:
def Get_sliced_mean_attr_plot_by_method(subset_analysis_df, feature_name, method_name):
    """Bar chart of the mean absolute attribution of every feature for one method.

    The ``<feature>_abs_<suffix>`` columns of ``method_name`` are renamed to
    the plain feature names (via ``method_to_abs2featNames``), folded into
    ``key``/``value`` pairs over ``X.columns``, and plotted as ``mean(value)``
    per feature for the rows selected by ``brush``.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table containing the absolute-attribution columns. It is not modified.
    feature_name : str
        Not used by this function; kept so existing call sites keep working.
    method_name : str
        Key of ``method_to_abs2featNames``; also used in the x-axis title.

    Returns
    -------
    Altair chart (80 px high, 200 px wide) with bars sorted by descending x.
    """
    x_title = 'mean absolute ' + method_name + ' attribute'

    # rename() returns a new frame, so the caller's table keeps its original
    # column names and no rename-back step is required afterwards.
    renamed_df = subset_analysis_df.rename(columns=method_to_abs2featNames[method_name])

    return alt.Chart(renamed_df).transform_fold(
        X.columns.tolist(),
        as_=['key', 'value']
    ).mark_bar().encode(
        y=alt.Y('key:N', title='', sort='-x'),
        x=alt.X('mean(value):Q', title=x_title)
    ).transform_filter(
        brush
    ).properties(
        height=80,
        width=200,
    )

In [None]:
def Get_feature_value_vs_mean_attr_plot(anchor_chart, subset_analysis_df):
    """Compose the linked dashboard around an anchor chart.

    Layout:
      * top row: ``anchor_chart`` next to a 2x2 grid holding one
        mean-absolute-attribution bar chart per method (first two methods in
        the first grid row, the remaining ones in the second);
      * bottom row: the per-feature mean-absolute-rank plots.

    Parameters
    ----------
    anchor_chart : Altair chart
        Chart placed at the top left.
    subset_analysis_df : pandas.DataFrame
        Table forwarded to ``Get_sliced_mean_attr_plot_by_method`` and
        ``Get_mean_rank_plot``.

    Returns
    -------
    Altair vertically concatenated chart.
    """
    row1 = alt.hconcat()
    row2 = alt.hconcat()

    for i, method_name in enumerate(method_name_to_attr_name):
        # The sliced plot spans every feature and does not use its
        # feature_name argument, so pass None explicitly instead of depending
        # on a module-level loop variable that happens to be left over.
        method_chart = Get_sliced_mean_attr_plot_by_method(subset_analysis_df, None, method_name)
        if i < 2:
            row1 |= method_chart
        else:
            row2 |= method_chart

    view_sliced_mean_attr = alt.vconcat(row1, row2)

    view_row1 = anchor_chart | view_sliced_mean_attr
    view_row2 = Get_mean_rank_plot(subset_analysis_df)

    return alt.vconcat(view_row1, view_row2)

In [None]:
def Get_feature_value_distr_chart(feature_name):
    """Histogram of one feature's values, coloured by target, built on ``base``.

    The ``<feature_name>_value`` column is binned into 20 equal steps spanning
    the column's minimum and maximum as stored in ``min_df`` / ``max_df``.

    Returns the resulting 300x200 Altair bar chart.
    """
    value_column = feature_name + '_value'
    upper = max_df[value_column]
    lower = min_df[value_column]

    # Fixed step and extent keep the bins stable regardless of the selection.
    x_binning = alt.Bin(step=(upper - lower) / 20, extent=[lower, upper])

    histogram = base.mark_bar().encode(
        x=alt.X(value_column, bin=x_binning),
        y=alt.Y('count()'),
        color='target:N',
    )
    return histogram.properties(width=300, height=200)


In [None]:
def Get_tsne_chart():
    """Scatter plot of the 'data_x-tsne' / 'data_y-tsne' columns, coloured by target.

    Built on ``base`` as a 400x300 circle chart, with the ``pan_zoom``
    selection added on top.
    """
    encodings = {
        'x': alt.X('data_x-tsne'),
        'y': alt.Y('data_y-tsne'),
        'color': 'target:N',
    }
    scatter = base.mark_circle().encode(**encodings)
    scatter = scatter.properties(width=400, height=300)
    return scatter.add_selection(pan_zoom)

In [None]:
def Get_Heat_Map(feature1, feature2):
    """2-D binned heat map of two features' values, coloured by row count.

    Each axis is split into 20 equal bins spanning that feature's own minimum
    and maximum as stored in ``min_df`` / ``max_df``.

    Parameters
    ----------
    feature1 : str
        Feature on the x axis (column ``<feature1>_value``).
    feature2 : str
        Feature on the y axis (column ``<feature2>_value``).

    Returns
    -------
    Altair rect chart derived from ``base``.
    """
    feature_value_name1 = feature1 + '_value'
    feature_value_name2 = feature2 + '_value'

    # Each axis takes both of its bounds from its own feature; mixing in the
    # other feature's minimum would yield a wrong extent and bin step.
    max_range_feature_value1 = max_df[feature_value_name1]
    min_range_feature_value1 = min_df[feature_value_name1]
    feature_value_bin_step1 = (max_range_feature_value1 - min_range_feature_value1)/20

    max_range_feature_value2 = max_df[feature_value_name2]
    min_range_feature_value2 = min_df[feature_value_name2]
    feature_value_bin_step2 = (max_range_feature_value2 - min_range_feature_value2)/20

    view_heat_map = base.mark_rect().encode(
        x=alt.X(feature_value_name1, bin=alt.Bin(step=feature_value_bin_step1, extent=[min_range_feature_value1, max_range_feature_value1])),
        y=alt.Y(feature_value_name2, bin=alt.Bin(step=feature_value_bin_step2, extent=[min_range_feature_value2, max_range_feature_value2])),
        color=alt.Color('count():Q')
    ).resolve_scale(
        color='independent'
    )
    return view_heat_map
    

# Visualization for Signed Attribute 

In [None]:
# Feature-value distribution vs. signed-attribution distributions
def Get_feature_value_distr__vs__attr_plot_chart(subset_analysis_df, feature_name):
    """Feature-value histogram next to a 2x2 grid of signed-attribution histograms.

    The left chart (built on ``base``) bins ``<feature_name>_value`` into 20
    steps. For every method in ``method_name_to_attr_name`` a histogram of
    ``<feature_name>_<attr suffix>`` (30 steps) is drawn for the rows selected
    by ``brush``, coloured by whether the attribution is strictly positive.
    The first two methods go into the first grid row, the rest into the second.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table containing the attribution columns. It is not modified.
    feature_name : str
        Feature whose value and attribution columns are plotted.

    Returns
    -------
    Altair horizontally concatenated chart.
    """
    # distribution of feature value plot
    feature_value_name = feature_name + '_value'
    max_range_feature_value = max_df[feature_value_name]
    min_range_feature_value = min_df[feature_value_name]
    feature_value_bin_step = (max_range_feature_value - min_range_feature_value)/20

    view_feature_value = base.mark_bar().encode(
        x=alt.X(feature_value_name, bin=alt.Bin(step=feature_value_bin_step, extent=[min_range_feature_value, max_range_feature_value])),
        y=alt.Y('count()'),
    ).resolve_scale(
        color='independent'
    )

    row1 = alt.hconcat()
    row2 = alt.hconcat()

    for i, (method_name, attr_name) in enumerate(method_name_to_attr_name.items()):
        # distribution of feature attr plot
        feature_attr = feature_name + '_' + attr_name
        max_range_feature_attr = max_df[feature_attr]
        min_range_feature_attr = min_df[feature_attr]
        feature_attr_bin_step = (max_range_feature_attr - min_range_feature_attr)/30

        # assign() returns a new frame: each chart gets its own 'is_positive'
        # column and the caller's DataFrame is left untouched.
        chart_df = subset_analysis_df.assign(
            is_positive=subset_analysis_df[feature_attr] > 0
        )

        chart = alt.Chart(chart_df).mark_bar().encode(
            x=alt.X(feature_attr + ':Q', bin=alt.Bin(step=feature_attr_bin_step, extent=[min_range_feature_attr, max_range_feature_attr])),
            y=alt.Y('count()'),
            color=alt.Color('is_positive', legend=None, scale=alt.Scale(range=['crimson', 'steelblue']))
        ).transform_filter(
            brush
        ).properties(
            title=method_name,
            width=300,
            height=200
        ).resolve_scale(
            color='independent'
        )

        if i < 2:
            row1 |= chart
        else:
            row2 |= chart

    view_feature_attr = alt.vconcat(row1, row2)

    return view_feature_value | view_feature_attr

In [None]:
# Feature-value distribution vs. signed-rank distributions
def Get_feature_value_distr__vs__signed_rank_plot_chart(subset_analysis_df, feature_name):
    """Feature-value histogram next to a 2x2 grid of signed-rank histograms.

    The left chart (built on ``base``) bins ``<feature_name>_value`` into 20
    steps. For every method in ``method_name_to_feature_rank`` a histogram of
    ``<feature_name>_<rank suffix>`` (30 steps) is drawn for the rows selected
    by ``brush``, coloured by whether the rank value is strictly positive.
    The first two methods go into the first grid row, the rest into the second.

    Parameters
    ----------
    subset_analysis_df : pandas.DataFrame
        Table containing the rank columns. It is not modified.
    feature_name : str
        Feature whose value and rank columns are plotted.

    Returns
    -------
    Altair horizontally concatenated chart.
    """
    # distribution of feature value plot
    feature_value_name = feature_name + '_value'
    max_range_feature_value = max_df[feature_value_name]
    min_range_feature_value = min_df[feature_value_name]
    feature_value_bin_step = (max_range_feature_value - min_range_feature_value)/20

    view_feature_value = base.mark_bar().encode(
        x=alt.X(feature_value_name, bin=alt.Bin(step=feature_value_bin_step, extent=[min_range_feature_value, max_range_feature_value])),
        y=alt.Y('count()'),
    ).resolve_scale(
        color='independent'
    )

    row1 = alt.hconcat()
    row2 = alt.hconcat()

    for i, (method_name, rank_name) in enumerate(method_name_to_feature_rank.items()):
        # distribution of feature rank plot
        feature_rank_name = feature_name + '_' + rank_name
        max_range_feature_rank = max_df[feature_rank_name]
        min_range_feature_rank = min_df[feature_rank_name]
        feature_rank_bin_step = (max_range_feature_rank - min_range_feature_rank)/30

        # assign() returns a new frame: each chart gets its own 'is_positive'
        # column and the caller's DataFrame is left untouched.
        chart_df = subset_analysis_df.assign(
            is_positive=subset_analysis_df[feature_rank_name] > 0
        )

        chart = alt.Chart(chart_df).mark_bar().encode(
            x=alt.X(feature_rank_name + ':Q', bin=alt.Bin(step=feature_rank_bin_step, extent=[min_range_feature_rank, max_range_feature_rank])),
            y=alt.Y('count()'),
            color=alt.Color('is_positive', legend=None, scale=alt.Scale(range=['crimson', 'steelblue']))
        ).transform_filter(
            brush
        ).properties(
            title=method_name,
            width=300,
            height=200
        ).resolve_scale(
            color='independent'
        )

        if i < 2:
            row1 |= chart
        else:
            row2 |= chart

    view_feature_rank = alt.vconcat(row1, row2)

    return view_feature_value | view_feature_rank

# Feature Distribution vs. Mean Absolute Attribute

In [None]:
# Comment the chart calls out before committing: the rendered charts make the
# notebook too large to store on GitHub.
anchor_chart = Get_feature_value_distr_chart(feature_name='glucose')
Get_feature_value_vs_mean_attr_plot(
    anchor_chart=anchor_chart,
    subset_analysis_df=subset_analysis_df,
)