# Aim to recover patterns in the artificial dataset (target is the effect of gender)

In [3]:
import shapSD as ssd
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Display the full content of every DataFrame cell. ``None`` means "do not
# truncate"; the legacy ``-1`` sentinel was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)


In [4]:
# Load the German credit data; the first CSV column is used as the row index.
# Rows with missing values are kept (no dropna is applied).
file_path = '../data/german_credit_data.csv'
original_credit = pd.read_csv(filepath_or_buffer=file_path, index_col=0)

# Encode the table with shapSD's label encoder (presumably maps categorical
# values to integer codes -- see the preview below).
encoder = ssd.DataEncoder(original_credit)
origin_credit = encoder.label_encoding()
origin_credit.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,1,2,1,-1,0,1169,6,5,1
1,22,0,2,1,0,1,5951,48,5,0
2,49,1,1,1,0,-1,2096,12,3,1
3,45,1,2,0,0,0,7882,42,4,1
4,53,1,2,0,0,0,4870,24,1,0


In [5]:
# Split features and label. The label is selected as a 1-D Series because
# scikit-learn estimators expect y of shape (n_samples,); a one-column
# DataFrame makes fit() emit a DataConversionWarning.
x = origin_credit.drop('Risk', axis=1)
y = origin_credit['Risk']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

# Baseline model whose 'Sex' coefficient is overwritten later in the notebook.
clf = LogisticRegression(random_state=42, solver='lbfgs',
                         multi_class='multinomial').fit(x_train, y_train)
clf.coef_

array([[ 1.50022001e-02,  1.52404615e-01, -1.38894618e-03,
         3.30780191e-02,  3.14319983e-02, -2.46628134e-01,
        -1.75061099e-05, -1.44755122e-02,  4.51303067e-02]])

In [6]:
# Score the baseline classifier on the held-out split.
clf.score(X=x_test, y=y_test)

0.7

In [7]:
# Build the augmented feature table: the original features plus one binary
# 'Artificial_attribute' column and ten binary noise columns.
new_x = x.copy()
col_names = list(new_x.columns)

np.random.seed(20)
# Draw order is part of the seeded result: artificial attribute first, then
# the noise matrix.
artificial_values = np.random.randint(2, size=len(new_x))
rand_values = np.random.randint(2, size=(len(new_x), 10))

noise_names = ['noise_{}'.format(i) for i in range(1, 11)]
# Reuse new_x's index so the column-wise concat below lines up row by row even
# when the index is not a plain 0..n-1 range (e.g. after dropping rows).
df_noise = pd.DataFrame(rand_values, columns=noise_names, index=new_x.index)

new_x['Artificial_attribute'] = artificial_values
new_x = pd.concat([new_x, df_noise], axis=1)
# Keep the added columns in table order; a set difference would yield an
# order that can change from one interpreter run to the next.
new_col_names = [name for name in new_x.columns if name not in col_names]
new_x.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Artificial_attribute,noise_1,noise_2,noise_3,noise_4,noise_5,noise_6,noise_7,noise_8,noise_9,noise_10
0,67,1,2,1,-1,0,1169,6,5,1,1,1,0,0,0,0,1,0,1,0
1,22,0,2,1,0,1,5951,48,5,0,1,1,0,0,1,1,0,0,0,1
2,49,1,1,1,0,-1,2096,12,3,1,0,0,1,0,0,0,0,1,1,1
3,45,1,2,0,0,0,7882,42,4,1,0,0,0,0,0,0,1,0,1,0
4,53,1,2,0,0,0,4870,24,1,0,1,0,0,1,0,0,0,1,1,1


In [8]:
# Two independent copies of the fitted model that differ only in the weight
# given to "Sex".
clf_a = copy.deepcopy(clf)
clf_b = copy.deepcopy(clf)

# Locate the "Sex" coefficient by column name rather than a hard-coded
# position, so a change in column order cannot silently modify another weight.
sex_idx = list(x_train.columns).index('Sex')
clf_a.coef_[0, sex_idx] = 0.8
clf_b.coef_[0, sex_idx] = 0.1

# Partition the rows by the artificial attribute (1 -> group a, 0 -> group b).
credit_a = new_x.loc[new_x['Artificial_attribute'] == 1]
credit_b = new_x.loc[new_x['Artificial_attribute'] == 0]

In [9]:
def get_gender_effect(data, clf, measure='binary'):
    """Compute a per-row effect of the 'Sex' attribute on ``clf``'s output.

    Args:
        data: feature table handed to ``ssd.LocalExplainer``; must support
            ``len()`` and, for the LIME measure, ``.copy()`` and column
            assignment (a pandas DataFrame in this notebook).
        clf: fitted classifier handed to ``ssd.LocalExplainer``.
        measure: which explanation to compute:
            'binary' -- ``binary_flip_explanation`` on 'Sex', with the
                'old_prediction' and 'new_prediction' columns dropped;
            'SHAP'   -- ``shap_values_with_attr`` with the 'Kernel' explainer,
                using 50 background samples when ``data`` has more than 200
                rows and all rows otherwise;
            'LIME'   -- a copy of ``data`` with a 'Sex_lime_weight' column
                taken from the 'Sex' column of ``lime_explanation_as_df``.

    Returns:
        The table produced by the selected measure.

    Raises:
        ValueError: if ``measure`` is not one of the supported names.
    """
    flip_attr = 'Sex'
    supported = ('binary', 'SHAP', 'LIME')
    # Fail early with a clear message; previously an unknown measure reached
    # the final return and raised UnboundLocalError.
    if measure not in supported:
        raise ValueError(
            'Unsupported measure {!r}; expected one of {}'.format(measure, supported))

    local_exp = ssd.LocalExplainer(data, clf)

    if measure == 'binary':
        effect = local_exp.binary_flip_explanation(flip_attr)
        return effect.drop(['old_prediction', 'new_prediction'], axis=1)

    if measure == 'SHAP':
        # Cap the background set for larger inputs (presumably to keep
        # KernelSHAP tractable).
        background_sample = 50 if len(data) > 200 else len(data)
        return local_exp.shap_values_with_attr(
            attr=flip_attr, background_sample=background_sample, explainer_type='Kernel')

    # measure == 'LIME'
    effect = data.copy()
    lime_effect = local_exp.lime_explanation_as_df(instance_interval=(0, len(data) - 1))
    # .values assigns positionally, ignoring the index of the LIME frame.
    effect['{}_lime_weight'.format(flip_attr)] = lime_effect[flip_attr].values
    return effect

In [10]:
def subgroup_discovery(df_effect, target_name, label_name, selected_attr, measure, inverse_effect=False, statistic_is_positive=True):
    """Run a beam-search subgroup discovery on a numeric target column.

    Args:
        df_effect: table holding the descriptive attributes and the target.
        target_name: name of the numeric target column.
        label_name: column excluded from the search space (may be None).
        selected_attr: further column excluded from the search space.
        measure: quality function object passed to the task as ``qf``.
        inverse_effect: forwarded positionally to ``BeamSearch.execute``.
        statistic_is_positive: forwarded positionally to ``BeamSearch.execute``.

    Returns:
        The result table restricted to the columns 'quality', 'subgroup',
        'size_sg', 'mean_sg', 'mean_dataset' and 'mean_lift'. The task asks
        for a result set of size 10 and selectors are built with 10 bins.
    """
    numeric_target = ssd.NumericTarget(target_name)
    excluded_columns = [target_name, label_name, selected_attr]
    selectors = ssd.create_selectors(df_effect, nbins=10, ignore=excluded_columns)
    discovery_task = ssd.SubgroupDiscoveryTask(
        df_effect,
        numeric_target,
        selectors,
        qf=measure,
        result_set_size=10,
    )
    found = ssd.BeamSearch().execute(discovery_task, inverse_effect, statistic_is_positive)
    summary = ssd.as_df(df_effect, found, statistics_to_show=ssd.all_statistics_numeric)
    reported_columns = ['quality', 'subgroup', 'size_sg', 'mean_sg', 'mean_dataset', 'mean_lift']
    return summary[reported_columns]

## Binary flip approach

In [11]:
# Baseline: binary-flip effect of 'Sex' under the unmodified classifier,
# computed on the original feature columns only.
base_effect = get_gender_effect(data=new_x[col_names], clf=clf, measure='binary')
base_effect.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Sex_abs_effect
0,67,0,2,1,-1,0,1169,6,5,0.021868
1,22,1,2,1,0,1,5951,48,5,0.067748
2,49,0,1,1,0,-1,2096,12,3,0.029502
3,45,0,2,0,0,0,7882,42,4,0.072441
4,53,0,2,0,0,0,4870,24,1,0.058772


In [12]:
# Subgroup discovery on the baseline binary-flip effect.
target_name = 'Sex_abs_effect'
base_quality = ssd.GAStandardQFNumeric(0.5)
df_base_result = subgroup_discovery(
    base_effect,
    target_name,
    label_name=None,
    selected_attr='Sex',
    measure=base_quality,
)
df_base_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.00564,Checking account=1,269.0,0.068722,0.057848,1.18797
1,0.004905,Duration>=36,170.0,0.069743,0.057848,1.20563
2,0.003445,Checking account=2,63.0,0.071572,0.057848,1.237244
3,0.003367,Credit amount>=7174,101.0,0.068443,0.057848,1.183147
4,0.002712,Age: [23:26[,133.0,0.065286,0.057848,1.128573
5,0.002564,Purpose=0,97.0,0.066081,0.057848,1.142316
6,0.002165,Credit amount: [4716:7174[,100.0,0.064693,0.057848,1.118333
7,0.001914,Duration: [30:36[,43.0,0.067078,0.057848,1.159564
8,0.001787,Credit amount: [3590:4716[,100.0,0.063499,0.057848,1.097691
9,0.001786,Age: [26:28[,101.0,0.063466,0.057848,1.097125


In [13]:
# Binary-flip effect per group: group a is explained with clf_a, group b with
# clf_b, each on the original feature columns only.
flip_a = get_gender_effect(data=credit_a[col_names], clf=clf_a, measure='binary')
flip_b = get_gender_effect(data=credit_b[col_names], clf=clf_b, measure='binary')
flip_effect = pd.concat([flip_a, flip_b])

# Re-attach the artificial and noise columns by row index.
flip_effect = pd.merge(
    left=flip_effect,
    right=new_x[new_col_names],
    how='outer',
    left_index=True,
    right_index=True,
)
flip_effect.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Sex_abs_effect,...,noise_1,noise_6,noise_9,noise_5,noise_3,noise_4,noise_10,Artificial_attribute,noise_2,noise_7
0,67,0,2,1,-1,0,1169,6,5,0.069846,...,1,0,1,0,0,0,0,1,1,1
1,22,1,2,1,0,1,5951,48,5,0.043695,...,1,1,0,1,0,0,1,0,1,0
2,49,0,1,1,0,-1,2096,12,3,0.096222,...,0,0,1,0,1,0,1,1,0,0
3,45,0,2,0,0,0,7882,42,4,0.296505,...,0,0,1,0,0,0,0,1,0,1
4,53,0,2,0,0,0,4870,24,1,0.039532,...,1,0,1,0,0,1,1,0,0,0


In [14]:
# Subgroup discovery on the combined binary-flip effect of both groups.
target_name = 'Sex_abs_effect'
flip_quality = ssd.GAStandardQFNumeric(1)
binary_flip_result = subgroup_discovery(
    flip_effect,
    target_name,
    label_name=None,
    selected_attr='Sex',
    measure=flip_quality,
    inverse_effect=False,
    statistic_is_positive=True,
)
binary_flip_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.048262,Artificial_attribute=1,516.0,0.231913,0.138382,1.675894
1,0.011979,Checking account=1,269.0,0.182913,0.138382,1.321805
2,0.010924,Duration>=36,170.0,0.202641,0.138382,1.464361
3,0.010323,Checking account=1 AND Artificial_attribute=1,143.0,0.304099,0.138382,2.197538
4,0.008977,Duration>=36 AND Artificial_attribute=1,95.0,0.326412,0.138382,2.358784
5,0.005929,Credit amount>=7174,101.0,0.197087,0.138382,1.424228
6,0.004687,Credit amount>=7174 AND Artificial_attribute=1,57.0,0.314141,0.138382,2.270105
7,0.004517,noise_9=0,501.0,0.147397,0.138382,1.065151
8,0.003599,noise_4=1,489.0,0.145741,0.138382,1.05318
9,0.003524,noise_9=0 AND noise_4=1,245.0,0.161781,0.138382,1.169093


## LIME approach

In [14]:
# Baseline: LIME weight of 'Sex' under the unmodified classifier.
lime_base_effect = get_gender_effect(data=new_x[col_names], clf=clf, measure='LIME')
lime_base_effect.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Sex_lime_weight
0,67,1,2,1,-1,0,1169,6,5,0.098481
1,22,0,2,1,0,1,5951,48,5,-0.099835
2,49,1,1,1,0,-1,2096,12,3,0.095811
3,45,1,2,0,0,0,7882,42,4,0.098794
4,53,1,2,0,0,0,4870,24,1,0.097584


In [15]:
# Subgroup discovery on the baseline LIME weights.
lime_target_name = 'Sex_lime_weight'
lime_base_quality = ssd.GAStandardQFNumeric(0.5)
lime_base_result = subgroup_discovery(
    lime_base_effect,
    lime_target_name,
    label_name=None,
    selected_attr='Sex',
    measure=lime_base_quality,
)
lime_base_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.009416,Age: [45:52[,96.0,0.068033,0.037642,1.807383
1,0.008737,Age: [36:39[,92.0,0.066447,0.037642,1.765238
2,0.008725,Housing=0,108.0,0.064192,0.037642,1.705343
3,0.007426,Duration>=36,170.0,0.055654,0.037642,1.478501
4,0.007359,Age: [39:45[,119.0,0.058974,0.037642,1.566716
5,0.007098,Purpose=0,97.0,0.060433,0.037642,1.605467
6,0.006402,Age: [30:33[,112.0,0.05677,0.037642,1.508169
7,0.006269,Credit amount: [4716:7174[,100.0,0.057466,0.037642,1.526647
8,0.005858,Housing=1,713.0,0.044579,0.037642,1.184298
9,0.005085,Housing=1 AND Duration: [24:30[,140.0,0.058169,0.037642,1.545332


In [None]:
# LIME weight of 'Sex' per group: group a with clf_a, group b with clf_b.
lime_a = get_gender_effect(data=credit_a[col_names], clf=clf_a, measure='LIME')
lime_b = get_gender_effect(data=credit_b[col_names], clf=clf_b, measure='LIME')
lime_effect = pd.concat([lime_a, lime_b])

# Re-attach the artificial and noise columns by row index.
df_lime_effect = pd.merge(
    left=lime_effect,
    right=new_x[new_col_names],
    how='outer',
    left_index=True,
    right_index=True,
)
df_lime_effect.head()

In [20]:
# Subgroup discovery on the combined LIME weights of both groups.
lime_target_name = 'Sex_lime_weight'
lime_quality = ssd.GAStandardQFNumeric(1)
lime_result = subgroup_discovery(
    df_lime_effect,
    lime_target_name,
    label_name=None,
    selected_attr='Sex',
    measure=lime_quality,
)
lime_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.02442,Artificial_attribute=1,516.0,0.112782,0.065456,1.723019
1,0.009307,Housing=1,713.0,0.078509,0.065456,1.199418
2,0.009144,Housing=1 AND Artificial_attribute=1,362.0,0.138043,0.065456,2.108942
3,0.00707,Duration>=36,170.0,0.107044,0.065456,1.635364
4,0.005971,Duration>=36 AND Artificial_attribute=1,95.0,0.175638,0.065456,2.6833
5,0.004971,Age: [45:52[,96.0,0.117236,0.065456,1.791068
6,0.004786,Housing=1 AND Artificial_attribute=1 AND noise_9=1,178.0,0.16493,0.065456,2.519716
7,0.004728,Credit amount: [4716:7174[,100.0,0.112738,0.065456,1.722355
8,0.004521,Credit amount: [4716:7174[ AND Artificial_attribute=1,52.0,0.199729,0.065456,3.051355
9,0.004498,Purpose=5 AND Artificial_attribute=1,145.0,0.143804,0.065456,2.196952


## KernelSHAP approach

In [15]:
# Baseline: KernelSHAP value of 'Sex' under the unmodified classifier.
shap_base_effect = get_gender_effect(data=new_x[col_names], clf=clf, measure='SHAP')
shap_base_effect.head()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Sex_shap_values
0,67,1,2,1,-1,0,1169,6,5,0.000688
1,22,0,2,1,0,1,5951,48,5,-0.067196
2,49,1,1,1,0,-1,2096,12,3,0.000765
3,45,1,2,0,0,0,7882,42,4,0.001187
4,53,1,2,0,0,0,4870,24,1,0.001051


In [16]:
# Subgroup discovery on the baseline SHAP values (uses StandardQFNumeric,
# unlike the binary-flip and LIME cells).
shap_target_name = 'Sex_shap_values'
shap_base_quality = ssd.StandardQFNumeric(0.5)
shap_base_result = subgroup_discovery(
    shap_base_effect,
    shap_target_name,
    label_name=None,
    selected_attr='Sex',
    measure=shap_base_quality,
)
shap_base_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.102424,Housing=0 AND Job=2,63.0,-0.004639,-0.017544,0.264445
1,0.09734,Checking account=-1 AND Age: [45:52[,43.0,-0.002699,-0.017544,0.153863
2,0.092595,Age: [45:52[,96.0,-0.008093,-0.017544,0.461312
3,0.090075,Age: [45:52[ AND Job=2,58.0,-0.005716,-0.017544,0.325826
4,0.089217,Age: [36:39[,92.0,-0.008242,-0.017544,0.469804
5,0.087741,Duration>=36 AND Housing=0,40.0,-0.00367,-0.017544,0.20922
6,0.086646,Age: [45:52[ AND Job=2 AND Checking account=-1,27.0,-0.000868,-0.017544,0.049505
7,0.086257,Age: [45:52[ AND Purpose=1,37.0,-0.003363,-0.017544,0.191689
8,0.085633,Age: [36:39[ AND Saving accounts=-1,26.0,-0.00075,-0.017544,0.042725
9,0.085556,Housing=1 AND Age: [45:52[,69.0,-0.007244,-0.017544,0.412907


In [17]:
# KernelSHAP value of 'Sex' per group: group a with clf_a, group b with clf_b.
shap_a = get_gender_effect(data=credit_a[col_names], clf=clf_a, measure='SHAP')
shap_b = get_gender_effect(data=credit_b[col_names], clf=clf_b, measure='SHAP')
shap_effect = pd.concat([shap_a, shap_b])

# Re-attach the artificial and noise columns by row index (the original row
# index is kept, not reset).
df_shap_effect = pd.merge(
    left=shap_effect,
    right=new_x[new_col_names],
    how='outer',
    left_index=True,
    right_index=True,
)
df_shap_effect.head()

HBox(children=(IntProgress(value=0, max=516), HTML(value='')))




HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Sex_shap_values,...,noise_1,noise_6,noise_9,noise_5,noise_3,noise_4,noise_10,Artificial_attribute,noise_2,noise_7
0,67,1,2,1,-1,0,1169,6,5,0.019294,...,1,0,1,0,0,0,0,1,1,1
1,22,0,2,1,0,1,5951,48,5,-0.03919,...,1,1,0,1,0,0,1,0,1,0
2,49,1,1,1,0,-1,2096,12,3,0.021319,...,0,0,1,0,1,0,1,1,0,0
3,45,1,2,0,0,0,7882,42,4,0.035564,...,0,0,1,0,0,0,0,1,0,1
4,53,1,2,0,0,0,4870,24,1,0.004935,...,1,0,1,0,0,1,1,0,0,0


In [21]:
# shap_target_name = 'Sex_shap_values'
# shap_result = subgroup_discovery(df_shap_effect, shap_target_name, label_name=None, 
#                                selected_attr='Sex', measure=ssd.StandardQFNumeric(0.5))
# shap_result

In [18]:
# Subgroup discovery on the combined SHAP values of both groups, run with
# statistic_is_positive=False.
shap_target_name = 'Sex_shap_values'
shap_quality = ssd.StandardQFNumeric(1)
shap_result = subgroup_discovery(
    df_shap_effect,
    shap_target_name,
    label_name=None,
    selected_attr='Sex',
    measure=shap_quality,
    inverse_effect=False,
    statistic_is_positive=False,
)
shap_result

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,-7.348002,Artificial_attribute=1,516.0,-0.037525,-0.023285,1.611563
1,-4.252105,Age: [23:26[,133.0,-0.055256,-0.023285,2.373011
2,-2.280686,Age<23,57.0,-0.063297,-0.023285,2.718353
3,-0.800954,Job=0,22.0,-0.059692,-0.023285,2.563531
4,-0.793798,Saving accounts=0,603.0,-0.024602,-0.023285,1.056535
5,-0.636434,Age: [28:30[,80.0,-0.031241,-0.023285,1.341653
6,-0.631614,Saving accounts=1,103.0,-0.029417,-0.023285,1.263352
7,-0.497053,Job=1,200.0,-0.02577,-0.023285,1.106732
8,-0.478647,Age: [26:28[,101.0,-0.028024,-0.023285,1.203524
9,0.0,Dataset,1000.0,-0.023285,-0.023285,1.0
