# Aim: recover known patterns planted in an artificial dataset (the target is the effect of gender)

In [1]:
import sys
sys.path.append('..')
import shapSD as ssd
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LogisticRegression
# Display the full content of every DataFrame cell instead of truncating it.
# None means "no limit"; the legacy value -1 has been deprecated since
# pandas 1.0 and is no longer accepted by newer pandas releases.
pd.set_option('display.max_colwidth', None)

Using TensorFlow backend.


In [2]:
# Load the Adult data set; the first CSV column becomes the row index.
file_path = '../data/adult.csv'
original_adult = pd.read_csv(file_path, index_col=0)
# Restrict the experiment to the first 5000 rows and to the four features
# plus the 'income' label.
original_adult = original_adult.iloc[:5000]
original_adult = original_adult[
    ['age', 'education-num', 'sex', 'hours-per-week', 'income']
]
# Label-encode via the project's DataEncoder; the preview printed below shows
# 'sex' and 'income' as integer codes afterwards.
origin_adult = ssd.DataEncoder(original_adult).label_encoding()
origin_adult.head()

Unnamed: 0,age,education-num,sex,hours-per-week,income
0,39,13,1,40,0
1,50,13,1,13,0
2,38,9,1,40,0
3,53,7,1,40,0
4,28,13,0,40,0


In [3]:
# Feature matrix: every column except the label. The label is kept as a
# one-column DataFrame, exactly as before.
x_train = origin_adult.drop(columns='income')
y_train = origin_adult[['income']]
# Baseline logistic regression; get_effect later re-weights deep copies of it.
clf = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
clf.fit(x_train, y_train)
clf.coef_

array([[0.02384174, 0.16325068, 0.49430803, 0.02033189]])

## Construct patterns by modifying the coefficients of the selected feature

In [4]:
# Inspect the influence of the flipped attribute ("sex" by default)
def get_effect(clf, constrain_feature, critical_v, flip_attr='sex',
               weight_a=0.8, weight_b=0.1):
    """Plant an artificial pattern in the flip effect of ``flip_attr``.

    The rows of the module-level ``origin_adult`` frame are split on
    ``constrain_feature``. Each part is scored with its own deep copy of
    ``clf`` whose coefficient for ``flip_attr`` has been overwritten, so the
    two parts get a different flip effect by construction.

    Args:
        clf: fitted linear classifier exposing ``coef_``. It is never mutated;
            only deep copies are modified. Assumes it was fitted on
            ``origin_adult.drop('income', axis=1)`` so that coefficient order
            matches the feature column order.
        constrain_feature: column of ``origin_adult`` used for the split.
        critical_v: threshold; rows with ``constrain_feature < critical_v``
            form group A, rows with ``constrain_feature >= critical_v`` group B.
        flip_attr: binary feature whose value is flipped and whose coefficient
            is overwritten.
        weight_a: coefficient assigned to ``flip_attr`` for group A.
        weight_b: coefficient assigned to ``flip_attr`` for group B.

    Returns:
        The results of ``calc_abs_flip_effect()`` for group A followed by
        group B, concatenated and re-indexed from 0.
    """
    # Look up the coefficient position from the feature columns instead of
    # hard-coding it, so a change in column order cannot silently re-weight
    # the wrong feature.
    feature_columns = origin_adult.drop('income', axis=1).columns
    coef_idx = feature_columns.get_loc(flip_attr)

    clf_a = copy.deepcopy(clf)
    clf_b = copy.deepcopy(clf)
    # Modify the weight for the flipped attribute in each copy
    clf_a.coef_[0][coef_idx] = weight_a
    clf_b.coef_[0][coef_idx] = weight_b

    adult_a = origin_adult.loc[origin_adult[constrain_feature] < critical_v]
    adult_b = origin_adult.loc[origin_adult[constrain_feature] >= critical_v]

    adult_a = adult_a.reset_index(drop=True)
    adult_b = adult_b.reset_index(drop=True)

    # Construct one BinaryFlip per group, each with its own re-weighted model
    flip_a = ssd.BinaryFlip(adult_a.drop('income', axis=1), clf_a, flip_attr)
    flip_b = ssd.BinaryFlip(adult_b.drop('income', axis=1), clf_b, flip_attr)

    effect_a = flip_a.calc_abs_flip_effect()
    effect_b = flip_b.calc_abs_flip_effect()

    df_effect = pd.concat([effect_a, effect_b])
    df_effect = df_effect.reset_index(drop=True)
    return df_effect

## Subgroup discovery to discover patterns

In [5]:
def subgroup_discovery(df_effect, target_name, label_name, selected_attr, measure):
    """Run a beam-search subgroup discovery on a numeric target column.

    Args:
        df_effect: DataFrame holding the candidate describing columns and the
            numeric target column.
        target_name: name of the numeric target column.
        label_name: name of the label column; excluded from the search space.
        selected_attr: name of the attribute under study; excluded from the
            search space.
        measure: quality function handed to the task as ``qf``.

    Returns:
        The top-10 result set as a DataFrame restricted to the columns
        quality, subgroup, size_sg, mean_sg, mean_dataset and mean_lift.
    """
    numeric_target = ssd.NumericTarget(target_name)
    # The target, the label and the studied attribute must not describe subgroups.
    excluded = [target_name, label_name, selected_attr]
    selectors = ssd.create_selectors(df_effect, nbins=10, ignore=excluded)
    sd_task = ssd.SubgroupDiscoveryTask(
        df_effect,
        numeric_target,
        selectors,
        qf=measure,
        result_set_size=10,
    )
    found = ssd.BeamSearch().execute(sd_task)
    # NOTE: an overlap filter (ssd.overlap_filter, similarity_level=0.85) was
    # tried at this point and is currently disabled.
    summary = ssd.as_df(
        df_effect, found, statistics_to_show=ssd.all_statistics_numeric
    )
    shown_columns = [
        'quality', 'subgroup', 'size_sg', 'mean_sg', 'mean_dataset', 'mean_lift'
    ]
    return summary[shown_columns]

### Original pattern (without modification on coefficient)

In [6]:
# Baseline reference: coefficients of the fitted classifier before any
# artificial re-weighting is applied.
clf.coef_

array([[0.02384174, 0.16325068, 0.49430803, 0.02033189]])

In [7]:
# Baseline: flip 'sex' for every record under the unmodified classifier.
# The result carries a 'sex_abs_effect' column (see the preview below),
# presumably the absolute change in the model output caused by the flip.
flip_attr = 'sex'
sex_flip = ssd.BinaryFlip(origin_adult.drop(columns='income'), clf, flip_attr)
sex_effect = sex_flip.calc_abs_flip_effect()
df_effect = sex_effect.reset_index(drop=True)
df_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,sex_abs_effect
0,39,13,0,40,0.213443
1,50,13,0,13,0.165025
2,38,9,0,40,0.09814
3,53,7,0,40,0.102866
4,28,13,1,40,0.16948


In [8]:
# Subgroups with an unusually high flip effect under the unmodified model;
# show the five best-ranked ones.
target_name = 'sex_abs_effect'
df_sd = subgroup_discovery(
    df_effect,
    target_name,
    label_name='income',
    selected_attr='sex',
    measure=ssd.GAStandardQFNumeric(1),
)
df_sd[:5]

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.018278,education-num>=13,1228.0,0.207028,0.132607,1.561214
1,0.006204,hours-per-week>=55,525.0,0.191691,0.132607,1.445556
2,0.004701,age: [50:58[,568.0,0.173991,0.132607,1.31208
3,0.00452,hours-per-week: [50:55[,495.0,0.178259,0.132607,1.344264
4,0.004039,age>=58,503.0,0.172752,0.132607,1.302732


In [9]:
# target_name = df_effect.columns.tolist()[-1]
# draw_decision_tree('sex', target_name, df_effect)

### Assumed pattern: when "age" < 25, the effect of gender is particularly large

In [10]:
# Planted condition: when age < 25, the effect of "sex" is large
# (get_effect gives that group the larger 'sex' coefficient).
df_age_effect = get_effect(clf, constrain_feature='age', critical_v=25)
df_age_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,sex_abs_effect
0,23,13,1,30,0.226294
1,19,9,0,40,0.101292
2,23,12,0,52,0.303753
3,20,10,0,44,0.155285
4,22,10,0,15,0.062812


In [11]:
# Check whether subgroup discovery recovers the planted age < 25 pattern.
target_name = 'sex_abs_effect'
df_age_sd = subgroup_discovery(
    df_age_effect,
    target_name,
    label_name='income',
    selected_attr='sex',
    measure=ssd.GAStandardQFNumeric(1),
)
df_age_sd[:5]

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.007806,age: [22:26[,510.0,0.11721,0.040685,2.880919
1,0.004212,age<22,473.0,0.085206,0.040685,2.094302
2,0.00366,education-num>=13,1228.0,0.055588,0.040685,1.366299
3,0.00235,education-num: [10:11[,1125.0,0.051128,0.040685,1.256689
4,0.001795,age: [22:26[ AND education-num>=13,110.0,0.1988,0.040685,4.886351


### Assumed pattern: when "hours-per-week" < 20, the effect of gender is particularly large

In [12]:
# Planted condition: when hours-per-week < 20, the effect of "sex" is large.
df_hour_effect = get_effect(
    clf, constrain_feature='hours-per-week', critical_v=20
)
df_hour_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,sex_abs_effect
0,50,13,0,13,0.307321
1,49,5,1,16,0.047538
2,22,10,0,15,0.062812
3,67,6,0,2,0.082779
4,71,10,0,2,0.249957


In [13]:
# Check whether subgroup discovery recovers the planted hours-per-week < 20
# pattern. target_name is set here explicitly so the cell does not depend on
# a value left behind by an earlier cell.
target_name = 'sex_abs_effect'
df_hour_sd = subgroup_discovery(df_hour_effect, target_name, label_name='income',
                                selected_attr='sex', measure=ssd.GAStandardQFNumeric(1))
df_hour_sd[:5]

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.004661,education-num>=13,1228.0,0.047636,0.028657,1.662261
1,0.004499,hours-per-week<25,494.0,0.074192,0.028657,2.588939
2,0.004263,age>=58,503.0,0.071033,0.028657,2.478687
3,0.002649,hours-per-week<25 AND age>=58,133.0,0.173772,0.028657,6.063792
4,0.00141,education-num>=13 AND hours-per-week<25,70.0,0.17492,0.028657,6.103863


In [14]:
def draw_decision_tree(attr, target_name, credit_shap):
    """Fit a depth-3 regression tree on an effect column and visualise it.

    Args:
        attr: name of the attribute whose effect is analysed. The column is
            removed from the predictors and the name is used to build the
            output file name.
        target_name: name of the column in ``credit_shap`` holding the effect
            values to be explained.
        credit_shap: DataFrame containing the predictor columns, ``attr`` and
            ``target_name``.

    Returns:
        Whatever ``DecisionTreeExplain.visualize_dtree`` returns for the
        fitted estimator.
    """
    predictors = credit_shap.drop([attr, target_name], axis=1)
    # Select the target by name. The previous positional lookup
    # (``iloc[:, -1]``) silently used the wrong column whenever the effect
    # column was not the last one in the frame.
    dtree_exp = ssd.DecisionTreeExplain(predictors, credit_shap[target_name])
    estimator = dtree_exp.dtree_reg_model(max_depth=3)
    # File name: the first two space-separated words of attr joined by '_',
    # or attr itself when it contains no space.
    file_name = '_'.join(attr.split(' ')[:2])
    return dtree_exp.visualize_dtree(estimator, '{}_effect'.format(file_name))