# Aim to recover patterns in the artificial dataset (target is the effect of gender)

In [1]:
import sys
sys.path.append('..')
import shapSD as ssd
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LogisticRegression
# Display the full content of every DataFrame cell. None disables truncation;
# the former sentinel -1 is deprecated since pandas 1.0 and rejected by
# pandas 2.x.
pd.set_option('display.max_colwidth', None)

Using TensorFlow backend.


In [2]:
# Read adult.csv, keep the first 5000 rows and the five columns used in this
# experiment, then encode them with the project's DataEncoder.
file_path = '../data/adult.csv'
kept_columns = ['age', 'education-num', 'sex', 'hours-per-week', 'income']
original_adult = pd.read_csv(file_path, index_col=0).iloc[:5000][kept_columns]
# presumably maps categorical values (e.g. sex, income) to integer codes --
# confirm against ssd.DataEncoder
origin_adult = ssd.DataEncoder(original_adult).label_encoding()
origin_adult.head()

Unnamed: 0,age,education-num,sex,hours-per-week,income
0,39,13,1,40,0
1,50,13,1,13,0
2,38,9,1,40,0
3,53,7,1,40,0
4,28,13,0,40,0


In [3]:
# Features are every column except the label; the label is kept as a
# one-column DataFrame in y_train.
x_train, y_train = origin_adult.drop('income', axis=1), origin_adult[['income']]
# fit() expects a 1-D target; passing the one-column DataFrame directly makes
# scikit-learn emit a DataConversionWarning, so flatten it explicitly.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases;
# it is kept here because dropping it may change the fitted coefficients for
# this binary problem -- confirm before removing.
clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(x_train, y_train.values.ravel())
clf.coef_

array([[0.02384174, 0.16325068, 0.49430803, 0.02033189]])

## Construct patterns by modifying the coefficients of the selected feature

In [4]:
# inspect the influence of "Sex"
def get_effect(clf, constrain_feature, critical_v, flip_attr='sex',
               coef_a=0.8, coef_b=0.1, data=None, label_name='income'):
    """Plant an artificial pattern in the flip effect of ``flip_attr``.

    The data are split on ``constrain_feature`` at ``critical_v``. Rows below
    the threshold (group A) are scored with a copy of ``clf`` whose
    ``flip_attr`` coefficient is overwritten with ``coef_a``; the remaining
    rows (group B) use a copy whose coefficient is overwritten with
    ``coef_b``. ``clf`` itself is not modified (deep copies are edited).

    Args:
        clf: fitted classifier exposing ``coef_`` indexable as
            ``coef_[0][i]``; assumed to have been trained on the columns of
            ``data`` minus ``label_name``, in that order.
        constrain_feature: name of the column used to split the data.
        critical_v: threshold; ``< critical_v`` goes to group A and
            ``>= critical_v`` to group B.
        flip_attr: name of the attribute whose coefficient is replaced and
            which is handed to ``ssd.BinaryFlip``.
        coef_a: coefficient written into the group-A model.
        coef_b: coefficient written into the group-B model.
        data: encoded data frame including ``label_name``; defaults to the
            notebook-level ``origin_adult``.
        label_name: column dropped before the data are handed to
            ``ssd.BinaryFlip``.

    Returns:
        The ``calc_abs_flip_effect()`` results of group A followed by those
        of group B, concatenated and re-indexed from 0.
    """
    if data is None:
        data = origin_adult
    features = data.drop(label_name, axis=1)

    # Look the coefficient position up by name instead of hard-coding it, so
    # a change in column order cannot silently modify the wrong weight.
    coef_idx = features.columns.get_loc(flip_attr)

    # modify the weight for the flipped attribute on independent copies
    clf_a = copy.deepcopy(clf)
    clf_b = copy.deepcopy(clf)
    clf_a.coef_[0][coef_idx] = coef_a
    clf_b.coef_[0][coef_idx] = coef_b

    adult_a = features.loc[data[constrain_feature] < critical_v]
    adult_b = features.loc[data[constrain_feature] >= critical_v]
    adult_a = adult_a.reset_index(drop=True)
    adult_b = adult_b.reset_index(drop=True)

    # construct one BinaryFlip per group, each with its own modified model
    flip_a = ssd.BinaryFlip(adult_a, clf_a, flip_attr)
    flip_b = ssd.BinaryFlip(adult_b, clf_b, flip_attr)

    effect_a = flip_a.calc_abs_flip_effect()
    effect_b = flip_b.calc_abs_flip_effect()

    df_effect = pd.concat([effect_a, effect_b])
    return df_effect.reset_index(drop=True)

## Subgroup discovery to discover patterns

In [5]:
def subgroup_discovery(df_effect, target_name, label_name, selected_attr, measure,
                       nbins=10, result_set_size=10, extra_ignore=None):
    """Run a beam-search subgroup discovery on a numeric target column.

    Args:
        df_effect: data frame holding the candidate attributes and the
            target column.
        target_name: name of the numeric target column.
        label_name: column name excluded from the search space.
        selected_attr: column name excluded from the search space (the
            attribute whose effect is being analysed).
        measure: quality function passed to the task as ``qf``.
        nbins: value forwarded to ``ssd.create_selectors`` as ``nbins``.
        result_set_size: value forwarded to ``ssd.SubgroupDiscoveryTask``.
        extra_ignore: optional column name or iterable of column names that
            are additionally excluded from the search space (for example
            derived prediction columns).

    Returns:
        A data frame with the columns quality, subgroup, size_sg, mean_sg,
        mean_dataset and mean_lift taken from ``ssd.as_df``.
    """
    ignored = [target_name, label_name, selected_attr]
    if extra_ignore:
        # accept a single column name as well as an iterable of names
        if isinstance(extra_ignore, str):
            extra_ignore = [extra_ignore]
        ignored.extend(extra_ignore)

    target = ssd.NumericTarget(target_name)
    search_space = ssd.create_selectors(df_effect, nbins=nbins, ignore=ignored)
    task = ssd.SubgroupDiscoveryTask(df_effect, target, search_space, qf=measure,
                                     result_set_size=result_set_size)
    result = ssd.BeamSearch().execute(task)
    # NOTE: filtering overlapping subgroups with ssd.overlap_filter was tried
    # here previously and is intentionally not applied.
    df_result = ssd.as_df(df_effect, result, statistics_to_show=ssd.all_statistics_numeric)
    return df_result[['quality', 'subgroup', 'size_sg', 'mean_sg', 'mean_dataset', 'mean_lift']]

### Original pattern (without modification on coefficient)

In [6]:
# Coefficients of the unmodified model, in the column order of x_train
# (age, education-num, sex, hours-per-week).
clf.coef_

array([[0.02384174, 0.16325068, 0.49430803, 0.02033189]])

In [7]:
# Flip effect of "sex" under the unmodified classifier, used as the baseline.
flip_attr = 'sex'
baseline_features = origin_adult.drop(columns='income')
sex_flip = ssd.BinaryFlip(baseline_features, clf, flip_attr)
sex_effect = sex_flip.calc_abs_flip_effect()
df_effect = sex_effect.reset_index(drop=True)
df_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,old_prediction,new_prediction,sex_abs_effect
0,39,13,0,40,0.438841,0.225397,0.213443
1,50,13,0,13,0.305918,0.140894,0.165025
2,38,9,0,40,0.168043,0.069903,0.09814
3,53,7,0,40,0.176927,0.074061,0.102866
4,28,13,1,40,0.146915,0.316395,0.16948


In [8]:
# Subgroup discovery on the baseline flip effect; show the first five rows.
target_name = 'sex_abs_effect'
quality_fn = ssd.GAStandardQFNumeric(1)
df_sd = subgroup_discovery(
    df_effect,
    target_name,
    label_name='income',
    selected_attr='sex',
    measure=quality_fn,
)
df_sd.head(5)

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.018278,education-num>=13,1228.0,0.207028,0.132607,1.561214
1,0.009187,old_prediction: [0.41:0.56[,499.0,0.224666,0.132607,1.694219
2,0.008843,old_prediction>=0.56,502.0,0.220689,0.132607,1.664236
3,0.008449,new_prediction: [0.31:0.43[,500.0,0.217098,0.132607,1.637152
4,0.008317,new_prediction>=0.43,501.0,0.215615,0.132607,1.625965


In [9]:
# target_name = df_effect.columns.tolist()[-1]
# draw_decision_tree('sex', target_name, df_effect)

### Assumed pattern: when "age" < 25, the effect of gender is particularly large

In [9]:
# Condition: when age < 25, the effect of "sex" is large.
df_age_effect = get_effect(clf, constrain_feature='age', critical_v=25)
df_age_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,old_prediction,new_prediction,sex_abs_effect
0,23,13,1,30,0.082864,0.309158,0.226294
1,19,9,0,40,0.130771,0.029479,0.101292
2,23,12,0,52,0.441283,0.13753,0.303753
3,20,10,0,44,0.204676,0.049392,0.155285
4,22,10,0,15,0.080084,0.017273,0.062812


In [10]:
# Subgroup discovery on the age-conditioned flip effect; show the first five rows.
target_name = 'sex_abs_effect'
quality_fn = ssd.GAStandardQFNumeric(1)
df_age_sd = subgroup_discovery(
    df_age_effect,
    target_name,
    label_name='income',
    selected_attr='sex',
    measure=quality_fn,
)
df_age_sd.head(5)

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.007806,age: [22:26[,510.0,0.11721,0.040685,2.880919
1,0.004212,age<22,473.0,0.085206,0.040685,2.094302
2,0.00366,education-num>=13,1228.0,0.055588,0.040685,1.366299
3,0.002734,old_prediction>=0.40,501.0,0.067969,0.040685,1.670615
4,0.00235,education-num: [10:11[,1125.0,0.051128,0.040685,1.256689


### Assumed pattern: when "hours-per-week" < 20, the effect of gender is particularly large

In [11]:
# Condition: when hours-per-week < 20, the modified "sex" weight is the larger one.
df_hour_effect = get_effect(clf, constrain_feature='hours-per-week', critical_v=20)
df_hour_effect.head()

Unnamed: 0,age,education-num,sex,hours-per-week,old_prediction,new_prediction,sex_abs_effect
0,50,13,0,13,0.448214,0.140894,0.307321
1,49,5,1,16,0.012798,0.060336,0.047538
2,22,10,0,15,0.080084,0.017273,0.062812
3,67,6,0,2,0.106207,0.023429,0.082779
4,71,10,0,2,0.346753,0.096796,0.249957


In [12]:
# Define the target here so this cell does not depend on an earlier cell
# having set it (the other subgroup-discovery cells do the same).
target_name = 'sex_abs_effect'
df_hour_sd = subgroup_discovery(df_hour_effect, target_name, label_name='income',
                                selected_attr='sex', measure=ssd.GAStandardQFNumeric(1))
df_hour_sd[:5]

Unnamed: 0,quality,subgroup,size_sg,mean_sg,mean_dataset,mean_lift
0,0.004661,education-num>=13,1228.0,0.047636,0.028657,1.662261
1,0.004499,hours-per-week<25,494.0,0.074192,0.028657,2.588939
2,0.004263,age>=58,503.0,0.071033,0.028657,2.478687
3,0.0033,new_prediction>=0.36,503.0,0.061456,0.028657,2.144517
4,0.003263,old_prediction>=0.39,501.0,0.061222,0.028657,2.136335
