# Imports & Pre-processing

In [35]:
import sys
sys.path.append("../src")

from utils import *
from mitigation_fct import *
from distance import *
from Gems_Wasserstein import *

In [36]:
import pandas as pd
import torch
import torch.nn as nn
import schedulefree
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
from scipy import stats
from pandas.api.types import is_string_dtype

In [None]:
# Load the raw Adult data set; the path is relative to the notebook's working directory.
df = pd.read_csv('../Data/adult.csv')
# Path prefix for every artefact this notebook reads or writes
# (network checkpoint and the saved .npy result arrays).
result_path = '../Result/ADULT/'

In [38]:
# Discard every row holding a '?' entry in any column.
rows_with_question_mark = (df == '?').any(axis = 1)
df = df[~rows_with_question_mark]

# Remove the columns that are not used as features.
df = df.drop(columns = ['fnlwgt', 'relationship', 'native.country', 'education'])

# Harmonise column names; keys that are absent from the frame are ignored by pandas.
new_column_names = {
    'Age' : 'age',
    'Workclass' : 'workclass',
    'education.num' : 'educ_years',
    'occupation' : 'job',
    'marital.status' : 'marital',
    'sex' : 'gender',
    'capital.gain' : 'gain',
    'capital.loss' : 'loss',
    'hours.per.week' : 'hours_per_week',
    'Country' : 'country',
    'income' : 'y',
    'Pred target' : 'pred',
}
df = df.rename(columns = new_column_names)

# Binary encodings: y = 1 for '>50K', gender = 1 for 'Male', White = 1 for race 'White'.
df['y'] = (df['y'] == '>50K').astype('int64')
df['gender'] = (df['gender'] == 'Male').astype('int64')
df['White'] = (df['race'] == 'White').astype('int64')
df = df.drop(columns = ['race'])

# Share of positive targets, then the frame itself as the cell output.
print(df.y.mean())
df

0.24892248524633645


Unnamed: 0,age,workclass,educ_years,marital,job,gender,gain,loss,hours_per_week,y,White
1,82,Private,9,Widowed,Exec-managerial,0,0,4356,18,0,1
3,54,Private,4,Divorced,Machine-op-inspct,0,0,3900,40,0,1
4,41,Private,10,Separated,Prof-specialty,0,0,3900,40,0,1
5,34,Private,9,Divorced,Other-service,0,0,3770,45,0,1
6,38,Private,6,Separated,Adm-clerical,1,0,3770,40,0,1
...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,10,Never-married,Protective-serv,1,0,0,40,0,1
32557,27,Private,12,Married-civ-spouse,Tech-support,0,0,0,38,0,1
32558,40,Private,9,Married-civ-spouse,Machine-op-inspct,1,0,0,40,1,1
32559,58,Private,9,Widowed,Adm-clerical,0,0,0,40,0,1


In [39]:
# Ratio of positive-target rates between gender == 0 and gender == 1 in the raw labels.
positive_rate_gender0 = df.loc[df.gender == 0, 'y'].mean()
positive_rate_gender1 = df.loc[df.gender == 1, 'y'].mean()
positive_rate_gender0 / positive_rate_gender1

np.float64(0.36222035623624405)

In [40]:
# Column names of the prediction target and of the sensitive attribute S.
target_name, S_variable_name = 'y', 'gender'

In [41]:
# One-hot encode the categorical feature columns.
cols_to_dummy = ['marital', 'job', 'workclass']#, 'State'
for col in cols_to_dummy:
    df[col] = df[col].astype("category")

# The column list must be passed as `columns=`: the second positional
# parameter of pd.get_dummies is `prefix`, so a positional list is applied as
# prefixes to the encoded columns in frame order, which mislabels the dummy
# columns whenever the list order differs from the frame order.
# The columns are listed in frame order so that the dummy blocks keep the same
# position they had with the positional call; only the column names change.
cols_in_frame_order = [col for col in df.columns if col in cols_to_dummy]
df = pd.get_dummies(df, columns = cols_in_frame_order, dtype = int)

In [42]:
# Feature columns: every column except the target and the sensitive attribute.
non_feature_columns = [target_name, S_variable_name]
X_col = [name for name in df.columns if name not in non_feature_columns]

# Positional index of every column of the frame (target and S included).
dic_col_name_index = {name: position for position, name in enumerate(df.columns)}

In [43]:
# Experiment settings.
test_percentage = 0.2# 0.05
epochs = 3
batch_size = 2048

# Features, target and sensitive attribute as NumPy arrays.
X = df[X_col].to_numpy()
Y = df[target_name].to_numpy()
S = df[S_variable_name].to_numpy()

# Column-wise centring and scaling; the statistics are taken over all rows.
column_mean = X.mean(axis = 0)
column_std = X.var(axis = 0) ** (1/2)
X_cr = (X - column_mean) / column_std

# Split sizes; the train size is shown as the cell output.
test_size = int(len(X) * test_percentage)
train_size = len(X) - test_size
train_size

24130

In [44]:
# Seeded random permutation of the row indices.
seed = 1234
np.random.seed(seed)
indexs = np.arange(len(X))
np.random.shuffle(indexs)

# The first `train_size` shuffled rows form the training set, the rest the test set.
train_rows = indexs[:train_size]
test_rows = indexs[train_size:]

X_cr_train, X_cr_test = X_cr[train_rows], X_cr[test_rows]
Y_train, Y_test = Y[train_rows], Y[test_rows]
S_train, S_test = S[train_rows], S[test_rows]


def _to_float32_tensor(array):
    """Return `array` as a float32 torch tensor."""
    return torch.from_numpy(array.astype(np.float32))


# Features and targets become tensors; S_train / S_test stay NumPy arrays.
X_train = _to_float32_tensor(X_cr_train)
X_test = _to_float32_tensor(X_cr_test)
Y_train = _to_float32_tensor(Y_train)
Y_test = _to_float32_tensor(Y_test)

# Neural network training and Disparate Impact (DI)

In [45]:
# Cutoff used below to binarise the test labels (`Y_test > threshold`).
threshold = 0.5

In [46]:
# Build the classifier; `Network` comes from the project modules
# (its printed structure is the output of this cell).
model = Network(X_train.shape[1],
                activation_bool = True,
                n_nodes=256,
                n_loop = 2)

optimizer = schedulefree.AdamWScheduleFree(model.parameters(),
                                           lr=0.001) #torch.optim.AdamW(model.parameters(), lr = 0.001)
net_path = result_path + 'net_state_dic.pt'

# Use the cached weights when they exist; otherwise train and cache them, so
# the notebook still runs from a fresh checkout without editing this cell.
try:
    # map_location keeps the load working for a checkpoint saved from another device.
    cached_state = torch.load(net_path, weights_only=True, map_location='cpu')
except FileNotFoundError:
    # NOTE(review): schedule-free optimizers expect optimizer.train() /
    # optimizer.eval() around training and checkpointing — confirm that
    # training_network_threshold takes care of it.
    training_network_threshold(model,
                               optimizer,
                               threshold=threshold,
                               X_train    = X_train,
                               Y_train    = Y_train,
                               X_test     = X_test,
                               Y_test     = Y_test,
                               epochs     = epochs,
                               batch_size = batch_size,
                               )
    torch.save(model.state_dict(), net_path)
else:
    model.load_state_dict(cached_state)

# Switch to inference mode (the BatchNorm layers then use their running statistics).
model.eval()

Network(
  (seq): Sequential(
    (0): BatchNorm1d(34, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=34, out_features=256, bias=True)
    (2): ReLU()
    (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Linear(in_features=256, out_features=256, bias=True)
    (8): ReLU()
    (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=256, out_features=1, bias=True)
  )
  (activation): Sigmoid()
)

In [47]:
# Binarised ground truth and model scores on the test set.
label = (Y_test > threshold).float()
proba = model(X_test.float()).squeeze()

# Decision cutoff: the (1 - training positive rate) quantile of the test
# scores, lowered by a small epsilon.
train_positive_rate = Y_train.mean()
prob_threshold = np.quantile(proba.detach().numpy(), 1 - train_positive_rate) - 1e-7
pred = (proba > prob_threshold).float()
acc = (pred == label).float().mean().item()

# Subset of the test set used for the mitigation experiments.
number_expe = 5000
inputs = X_test[:number_expe]
groups = S_test[:number_expe]

# Disparate Impact: positive-prediction rate of group 0 over that of group 1.
pred_expe = pred[:number_expe]
positive_rate_group0 = pred_expe[groups==0].mean()
positive_rate_group1 = pred_expe[groups==1].mean()
DI = (positive_rate_group0 / positive_rate_group1).item()
print(f'pred mean : {np.round(pred.mean().item(), 3)}, with an accuracy of {np.round(acc,3)}, and the Disparate Impact is {np.round(DI,3)}')

pred mean : 0.251, with an accuracy of 0.84, and the Disparate Impact is 0.3


# Mitigation

In [48]:
# Wrap the experiment subset: features as a float32 NumPy array and the group
# labels. `Stresser` is not defined in this notebook (presumably it comes from
# the project modules imported with `*`); later cells use stresser.X,
# stresser.S and stresser.W.
stresser = Stresser(X = inputs.float().numpy(), S = groups)

In [None]:
# Settings shared by the four gradient-based mitigation runs; only
# `look_alike` and `delta_type` vary between them.
miti_grad_settings = dict(prob_threshold = prob_threshold,
                          threshold = 0.8,
                          threshold_augm_constr = 0.005,
                          threshold_lr_cdt = 0.1,
                          iteration_threshold = 200,
                          verbose = False,
                          CONSTR_REGU = 0.01,
                          lr = 0.1)


def run_DI_miti_grad(look_alike, delta_type):
    """Run one `stresser.W.DI_miti_grad` mitigation and return its `W.t`.

    Parameters
    ----------
    look_alike : bool
        Forwarded unchanged to `DI_miti_grad`.
    delta_type : str
        Forwarded unchanged to `DI_miti_grad` ('mean' or 'num' in this notebook).

    Returns
    -------
    numpy.ndarray
        A copy of `stresser.W.t` taken right after the run.
    """
    stresser.W.DI_miti_grad(model,
                            look_alike = look_alike,
                            delta_type = delta_type,
                            **miti_grad_settings)
    # Snapshot the result: every run goes through the same `stresser.W`
    # object, and DI_miti_grad is not visible here, so a bare reference could
    # be overwritten by a later run if `t` is updated in place.
    return np.copy(stresser.W.t)


# Same call order as before: regular/mean, regular/num, look-alike/mean, look-alike/num.
t_reg_me = run_DI_miti_grad(look_alike = False, delta_type = 'mean')
t_reg_nu = run_DI_miti_grad(look_alike = False, delta_type = 'num')
t_la_me = run_DI_miti_grad(look_alike = True, delta_type = 'mean')
t_la_nu = run_DI_miti_grad(look_alike = True, delta_type = 'num')

In [50]:
def _predict_with_cutoff(batch):
    """Return 1. where the model score of `batch` exceeds `prob_threshold`, else 0."""
    return (model(batch) > prob_threshold)*1.


# Predictions on the untouched experiment inputs.
pred_ori = _predict_with_cutoff(torch.tensor(stresser.X).float())

# Predictions after adding each mitigation result to the inputs.
tensor_reg_me = torch.from_numpy(t_reg_me + stresser.X).float()
pred_reg_me = _predict_with_cutoff(tensor_reg_me)

tensor_reg_nu = torch.from_numpy(t_reg_nu + stresser.X).float()
pred_reg_nu = _predict_with_cutoff(tensor_reg_nu)

tensor_la_me = torch.from_numpy(t_la_me + stresser.X).float()
pred_la_me = _predict_with_cutoff(tensor_la_me)

tensor_la_nu = torch.from_numpy(t_la_nu + stresser.X).float()
pred_la_nu = _predict_with_cutoff(tensor_la_nu)

In [52]:
def _features_group_pred(features, pred):
    """Concatenate `[features | S | pred]` column-wise into one array.

    The group labels are reshaped by their own length rather than by
    `number_expe`: slicing with `[:number_expe]` silently returns fewer rows
    when the test set is smaller, and a reshape to `number_expe` rows would
    then raise.
    """
    group_column = stresser.S.reshape(len(stresser.S), -1)
    return np.concatenate([features, group_column, pred], axis = 1)


# Layout of every array: feature columns, then S (index -2), then the prediction (index -1).
arr_start       = _features_group_pred(stresser.X, pred_ori)

arr_grad_reg_me = _features_group_pred(stresser.X + t_reg_me, pred_reg_me)
arr_grad_reg_nu = _features_group_pred(stresser.X + t_reg_nu, pred_reg_nu)

arr_grad_la_me  = _features_group_pred(stresser.X + t_la_me, pred_la_me)
arr_grad_la_nu  = _features_group_pred(stresser.X + t_la_nu, pred_la_nu)

In [27]:
# Run find_translation_DI on the original array; S and the prediction are
# addressed from the end of `arr_start` (second-to-last and last column).
translation_settings = dict(
    arr = arr_start,
    S_index = -2,
    Y_index = -1,
    threshold = 0.8,
    speed = 1,
    bool_return_all = True,
    verbose = False,
)
new_arr_mod_SF, bins, translation = find_translation_DI(**translation_settings)

In [28]:
# Run find_sampling_wasserstein_DI on the original array, with S in the
# second-to-last column and the prediction in the last one.
# verbose = True prints one value per line while it runs.
sampling_outputs = find_sampling_wasserstein_DI(
    arr = arr_start,
    S_index = -2,
    Y_index = -1,
    verbose = True,
)
new_arr, swaps, dic_number_swap_done, wass_distance = sampling_outputs

0.29990912182693
0.3017853219011458
0.3036618838283534
0.30553881422254364
0.3074161197082541
0.30929380692065866
0.31117188250565825
0.3130503531199714
0.31492922543122576
0.3168085061180494
0.3186882018701624
0.32056831938846986
0.3224488653851535
0.3243298465837654
0.32621126971932096
0.32809314153839264
0.32997546879920386
0.331858258271724
0.33374151673776237
0.33562525099106416
0.3375094678374056
0.3393941740946903
0.3412793765930451
0.34316508217491715
0.3450512976951711
0.3469380300211868
0.34882528603295676
0.350713072623185
0.35260139669738555
0.3544902651739821
0.35637968498440686
0.3582696630732013
0.3601602063981163
0.3620513219302131
0.3639430166539646
0.3658352975673573
0.367728171681993
0.36962164602319214
0.37151572763009644
0.3734104235557724
0.37530574086731555
0.3772016866459547
0.3790982679871569
0.38099549200073285
0.38289336581094274
0.3847918965566024
0.38669109139119034
0.3885909574829547
0.3904915020150212
0.3923927321855013
0.39429465520760104
0.3961972783097

In [29]:
''' 
arr_miti_gems_KL_regular = Gems_regular_mitigation_arr(arr_start, 
                            S_column_index = -2,
                            Pred_column_index = -1, 
                            DI_target = 0.8)

arr_miti_gems_KL_fair = Gems_fair_mitigation_arr(arr=arr_start, 
                            S_column_index = -2,
                            Pred_column_index = -1, 
                            Y_column_index = None,
                            DI_target = 0.8)
''' 
# The string literal above only holds disabled variants; it is never executed.

# Arguments common to both Gems runs below; only `delta_type` differs.
gems_fair_settings = dict(arr = arr_start,
                          S_column_index = -2,
                          Pred_column_index = -1,
                          Y_column_index = None,
                          DI_target = 0.8)

arr_miti_gems_KL_number = Gems_fair_mitigation_arr(delta_type = 'number',
                                                   **gems_fair_settings)

arr_miti_gems_KL_mean = Gems_fair_mitigation_arr(delta_type = 'mean',
                                                 **gems_fair_settings)

In [None]:
# File name -> object to persist under `result_path`, written in this order.
arrays_to_save = {
    # Scalars describing the experiment.
    'DI.npy'              : DI,
    'X_number_column.npy' : X_train.shape[1],
    'threshold.npy'       : prob_threshold,
    # Original features / S / prediction array.
    'original.npy'        : arr_start,
    # Gradient-based mitigation results.
    'Grad_reg_me.npy'     : arr_grad_reg_me,
    'Grad_reg_nu.npy'     : arr_grad_reg_nu,
    'Grad_la_me.npy'      : arr_grad_la_me,
    'Grad_la_nu.npy'      : arr_grad_la_nu,
    # Sampling, translation and Gems mitigation results.
    'Miti_sampling_X.npy' : new_arr,
    'Miti_mod_SF.npy'     : new_arr_mod_SF,
    'Miti_Gems_number.npy': arr_miti_gems_KL_number,
    'Miti_Gems_mean.npy'  : arr_miti_gems_KL_mean,
}

for file_name, content in arrays_to_save.items():
    np.save(file = result_path + file_name, arr = content)