In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from functions import*

In [4]:
import os
# Project root = parent of the directory the notebook is launched from;
# every data path below is built relative to it.
path=os.path.split(os.getcwd())[0]

In [5]:
# K and e are forwarded unchanged to baseline() / partial_repair() in the
# experiment loop; their exact meaning is defined in the `functions` module.
K=200
e=0.01

# Feature columns used for the classifier and for the distribution analysis.
var_list=['hoursperweek','age','capitalgain','capitalloss','education-num']
var_dim=len(var_list)
# Protected attribute column and the 0/1 encoding applied to its values.
pa='sex'
pa_dict={'Male':1,'Female':0,'White':1,'Black':0}
pd.set_option('future.no_silent_downcasting', True)

# Read only the columns we need and rename them to S (protected attribute)
# and Y (label) so the rest of the notebook is attribute-agnostic.
messydata=(
    pd.read_csv(path+'/data/adult_csv.csv',usecols=var_list+[pa,'class'])
    .rename(columns={pa:'S','class':'Y'})
)
messydata=messydata.assign(
    S=lambda frame: frame['S'].replace(pa_dict),
    Y=lambda frame: frame['Y'].replace({'>50K':1,'<=50K':0}),
)

# Keep only rows whose protected attribute was mapped to one of the two groups.
in_group_one=messydata['S']==1
in_group_zero=messydata['S']==0
messydata=messydata[in_group_one|in_group_zero]

# Every feature, S and Y are treated as categorical from here on.
messydata=messydata.astype({col:'category' for col in var_list+['S','Y']})
# Unit weight per raw record; weights are summed when records are grouped later.
messydata['W']=1

X=messydata[var_list+['S','W']].to_numpy() # [X,S,W]
y=messydata['Y'].to_numpy() #[Y]

In [6]:
# Features whose total-variation distance between the S==0 and S==1
# distributions exceeds this value are selected into x_list.
TV_THRESHOLD=0.08

# Per-feature total-variation distance between the two groups' distributions.
tv_dist=dict()
for x_name in var_list:
    # The pivot is only used to enumerate the feature's categories in index order.
    # observed=False is passed explicitly: the grouping key is categorical, and
    # relying on the default emits a pandas FutureWarning (the default is
    # scheduled to change to observed=True).
    x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'],observed=False)['W'].index)
    dist=rdata_analysis(messydata,x_range_single,x_name)
    # TV distance = half the L1 distance between the two group distributions.
    tv_dist[x_name]=sum(abs(dist['x_0']-dist['x_1']))/2

x_list=[name for name,tv in tv_dist.items() if tv>TV_THRESHOLD]

  x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot0=pd.pivot_table(rdata[rdata['S']==0],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot1=pd.pivot_table(rdata[rdata['S']==1],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot0=pd.pivot_table(rdata[rdata['S']==0],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot1=pd.pivot_table(rdata[rdata['S']==1],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np

In [7]:
REPORT_COLUMNS=['DI','f1 macro','f1 micro','f1 weighted','TV distance','method']
N_REPEATS=3 # number of independent train/test splits evaluated


def _as_column_matrix(p):
    """Return distribution `p` as a column np.matrix.

    If any entry of `p` is exactly zero, 1.0e-9 is added to every entry and
    the result is renormalised before the conversion; otherwise `p` is
    converted unchanged.
    """
    if np.any(p==0):
        return np.matrix((p+1.0e-9)/sum(p+1.0e-9)).T
    return np.matrix(p).T


def _report_row(df,y_hat,tv_distance,method):
    """Build one report record for the predictions `y_hat` on `df`.

    Parameters
    ----------
    df : DataFrame with the label column 'Y' and the weight column 'W'
        (also handed to DisparateImpact_postprocess).
    y_hat : predictions aligned with the rows of `df`.
    tv_distance : value stored in the 'TV distance' column.
    method : label stored in the 'method' column.

    Returns
    -------
    dict keyed by REPORT_COLUMNS.
    """
    row={'DI':DisparateImpact_postprocess(df,y_hat)}
    for average in ('macro','micro','weighted'):
        row['f1 '+average]=f1_score(df['Y'],y_hat,average=average,sample_weight=df['W'])
    row['TV distance']=tv_distance
    row['method']=method
    return row


# Records are collected in a list and turned into a DataFrame once, instead of
# concatenating onto an (initially empty) frame on every iteration.
report_rows=[]
try:
    if not x_list:
        raise ValueError('x_list is empty; at least one feature is required to build X')

    for rep in range(N_REPEATS):
        # NOTE: the split is not seeded, so every repetition (and every re-run
        # of this cell) evaluates a different random partition.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
        # The classifier only sees the first var_dim columns (the features), not S or W.
        clf=RandomForestClassifier(max_depth=5, random_state=0).fit(X_train[:,0:var_dim],y_train)

        # Collapse identical (features, S, Y) test records into one row; W becomes the count.
        df_test=pd.DataFrame(np.concatenate((X_test,y_test.reshape(-1,1)), axis=1),columns=var_list+['S','W','Y'])
        df_test=df_test.groupby(by=var_list+['S','Y'],as_index=False).sum()

        if len(x_list)>1:
            # Several selected features: X is the tuple of their values.
            df_test['X']=[tuple(df_test[x_list].values[r]) for r in range(df_test.shape[0])]
            x_range=list(set(df_test['X']))
            weight=list(1/(df_test[x_list].max()-df_test[x_list].min())) # because 'education-num' range from 1 to 16 while others 1 to 4
            C=c_generate_higher(x_range,weight)
        else:
            # Single selected feature: copy that column (as a Series) into X.
            df_test['X']=df_test[x_list[0]]
            x_range=list(set(df_test['X']))
            C=c_generate(x_range)

        n_bins=len(x_range)
        var_range=list(pd.pivot_table(df_test,index=var_list,values=['S','W','Y']).index)
        dist=rdata_analysis(df_test,x_range,'X')
        dist['t_x']=dist['x'] # target distribution; alternative tried: dist['x_0']*0.5+dist['x_1']*0.5
        dist['v']=[(dist['x_0'][i]-dist['x_1'][i])/dist['x'][i] for i in range(n_bins)]
        px=np.matrix(dist['x']).T
        ptx=np.matrix(dist['t_x']).T
        p0=_as_column_matrix(dist['x_0'])
        p1=_as_column_matrix(dist['x_1'])
        V=np.matrix(dist['v']).T

        # Couplings. A 1.0e-2 partial-repair level ('partial repair2') was used
        # in earlier runs and is currently disabled.
        coupling_base=baseline(C,e,px,ptx,V,K)
        coupling_bary=baseline(C,e,p0,p1,V,K)
        coupling_part3=partial_repair(C,e,px,ptx,V,1.0e-3,K)
        coupling_total=partial_repair(C,e,px,ptx,V,1.0e-5,K)

        tv_base=assess_tv(df_test,coupling_base,x_range,x_list,var_list)
        tv_part3=assess_tv(df_test,coupling_part3,x_range,x_list,var_list)
        tv_total=assess_tv(df_test,coupling_total,x_range,x_list,var_list)

        # Predictions: unrepaired first, then one per coupling.
        y_pred=clf.predict(np.array(df_test[var_list]))
        y_pred_base=postprocess(df_test,coupling_base,x_list,x_range,var_list,var_range,clf)
        y_pred_bary,tv_bary=postprocess_bary(df_test,coupling_bary,x_list,x_range,var_list,var_range,clf)
        y_pred_part3=postprocess(df_test,coupling_part3,x_list,x_range,var_list,var_range,clf)
        y_pred_total=postprocess(df_test,coupling_total,x_list,x_range,var_list,var_range,clf)

        tv_origin=sum(abs(dist['x_0']-dist['x_1']))/2
        report_rows.append(_report_row(df_test,y_pred,tv_origin,'origin'))
        report_rows.append(_report_row(df_test,y_pred_base,tv_base,'baseline'))
        report_rows.append(_report_row(df_test,y_pred_bary,tv_bary,'barycentre'))
        report_rows.append(_report_row(df_test,y_pred_part3,tv_part3,'partial repair3'))
        report_rows.append(_report_row(df_test,y_pred_total,tv_total,'total repair'))
finally:
    # Built in `finally` so the rows of completed repetitions stay available
    # in `report` even when a later repetition raises.
    report=pd.DataFrame(report_rows,columns=REPORT_COLUMNS)

  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot0=pd.pivot_table(rdata[rdata['S']==0],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot1=pd.pivot_table(rdata[rdata['S']==1],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  df_t=pd.concat([df_t,sub],ignore_index=True) #pd.concat([df_t,samples_groupby(sub,x_list)], ignore_index=True)
  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot0=pd.pivot_table(rdata[rdata['S']==0],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  pivot1=pd.pivot_table(rdata[rdata['S']==1],index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  df_t=pd.concat([df_t,sub],ignore_index=True) #pd.concat([df_t,samples_groupby(sub,x_list)], ignore_index=True)
  pivot=pd.pivot_table(rdata,index=x_name,values=['W'],aggfunc=[np.sum],observed=False)[("sum",'W')]
  p

ZeroDivisionError: float division by zero

In [19]:
# Debug: call the disparate-impact helper on the total-repair predictions left
# over from the last executed loop iteration (the recorded run raised
# ZeroDivisionError here).
DisparateImpact_postprocess(df_test,y_pred_total)

ZeroDivisionError: float division by zero

In [26]:
# Debug: recompute by hand the weighted share of rows predicted as 1 within
# each protected group (presumably the two quantities whose ratio
# DisparateImpact_postprocess returns; confirm against `functions`).
df_test_tmp=df_test[:]
df_test_tmp.insert(loc=0, column='f', value=y_pred_total)

is_s0=df_test_tmp['S']==0
is_s1=df_test_tmp['S']==1
predicted_one=df_test_tmp['f']==1

# Positive-prediction rate in group S==0.
numerator=sum(df_test_tmp.loc[is_s0&predicted_one,'W'])/sum(df_test_tmp.loc[is_s0,'W'])
# Positive-prediction rate in group S==1.
denominator=sum(df_test_tmp.loc[is_s1&predicted_one,'W'])/sum(df_test_tmp.loc[is_s1,'W'])

In [16]:
# Debug: weights of the grouped test rows that belong to protected group S==0.
df_test.loc[df_test['S']==0,'W']

0        1
1        1
3        4
5       29
7       76
        ..
2096     3
2099     4
2100     1
2114     1
2125     1
Name: W, Length: 804, dtype: int64

In [8]:
# Display the metrics table accumulated by the experiment loop.
report

Unnamed: 0,DI,f1 macro,f1 micro,f1 weighted,TV distance,method
0,0.476469,0.677845,0.817884,0.788748,0.213168,origin
1,0.476469,0.677845,0.817884,0.788748,0.213168,baseline
2,0.816524,0.51392,0.597226,0.618989,0.001553,barycentre
3,0.785138,0.657732,0.786815,0.767481,0.012225,partial repair3
4,0.792288,0.657744,0.786456,0.767334,0.000154,total repair


In [7]:
# Display the metrics table. NOTE(review): the output recorded below lists
# 'partial repair2' rows and no 'total repair' rows, so it appears to come from
# an earlier version of the loop — re-run to refresh.
report

Unnamed: 0,DI,f1 macro,f1 micro,f1 weighted,TV distance,method
0,0.422121,0.679841,0.819164,0.789879,0.205342,origin
1,0.422121,0.679841,0.819164,0.789879,0.205342,baseline
2,1.137156,0.548193,0.646926,0.658234,9.5e-05,barycentre
3,0.612484,0.673078,0.807442,0.782275,0.104571,partial repair2
4,0.753164,0.66123,0.788504,0.769415,0.012275,partial repair3
5,0.41162,0.688898,0.823566,0.797154,0.196117,origin
6,0.41162,0.688898,0.823566,0.797154,0.196117,baseline
7,0.76884,0.546791,0.640631,0.655862,0.00162,barycentre
8,0.58569,0.678863,0.810155,0.787464,0.095489,partial repair2
9,0.717355,0.665875,0.790295,0.773712,0.012288,partial repair3


In [9]:
# Write the metrics table next to the input data; the protected attribute
# name is appended to the file name.
report_file=path+'/data/report_postprocess_bary'+str(pa)+'.csv'
report.to_csv(report_file,index=None)