# Preparation

## Import

https://github.com/socialfoundations/folktables/tree/main

In [1]:
import sys
sys.path.append("../src")

from utils import *
from mitigation_fct import *
from distance import *
from Gems_Wasserstein import *

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import schedulefree
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
from scipy import stats

In [3]:
def _two_minus(S):
    """Return ``2 - S`` (a code of 1 stays 1, a code of 2 becomes 0)."""
    return 2 - S


def _minus_one(S):
    """Return ``S - 1`` (a code of 1 becomes 0, a code of 2 becomes 1)."""
    return S - 1


def _above_25(S):
    """Return ``S > 25`` (True for values strictly greater than 25)."""
    return S > 25


def _task_entry(path, s_variable_name, target_name, threshold, s_fct):
    """Build one task description with the keys read by the rest of the notebook."""
    return {
        'path' : path,
        'S_variable_name' : s_variable_name,
        'target_name' : target_name,
        'threshold' : threshold,
        'S_fct' : s_fct,
    }


# One entry per task:
#   path            - CSV file to load
#   S_variable_name - column holding the sensitive attribute
#   target_name     - column holding the raw target
#   threshold       - cut-off applied to the raw target to obtain binary labels
#   S_fct           - transform applied to the sensitive column to obtain the group label
# NOTE(review): the first path is spelled 'ASCIncome' while the others use 'ACS...' - confirm against the files on disk.
dic_file = {
    'INC' : _task_entry('../Data/ASCIncome_2023.csv', 'SEX', 'PINCP', 125000, _two_minus),
    'TRA' : _task_entry('../Data/ACSTravelTime_2023.csv', 'SEX', 'JWMNP', 25, _two_minus),
    'MOB' : _task_entry('../Data/ACSMobility_2023.csv', 'AGEP', 'MIG', 0.5, _above_25),
    'EMP' : _task_entry('../Data/ACSEmploymentFiltered_2023.csv', 'DIS', 'ESR', 0.5, _minus_one),
    'PUC' : _task_entry('../Data/ACSPublicCoverage_2023.csv', 'DIS', 'PUBCOV', 0.5, _two_minus),
}

## Pre-processing

In [None]:
# Key of the `dic_file` entry processed by this run.
file_obj = 'MOB'
# Prefix of every file this notebook loads from / saves to for that task.
result_path = f'../Result/ASC_{file_obj}/'

In [71]:
# Load the CSV of the task selected by `file_obj`.
df_all = pd.read_csv(dic_file[file_obj]['path'])

# 'ESP' always takes the same value (original author's note); a constant column
# would have zero variance in the standardisation that follows, so it is dropped.
# `errors='ignore'` makes this a no-op for datasets without that column, without
# hiding unrelated failures the way a bare `except:` would.
df_all = df_all.drop(columns = 'ESP', errors = 'ignore')

In [72]:
# The target and the sensitive attribute are kept out of the feature matrix.
non_feature_columns = (dic_file[file_obj]['target_name'],
                       dic_file[file_obj]['S_variable_name'])

# Feature column names, in their order of appearance in `df_all`.
X_col = [name for name in df_all.columns if name not in non_feature_columns]

# Position of every column of `df_all` (features, target and sensitive attribute alike).
dic_col_name_index = {name: position for position, name in enumerate(df_all.columns)}

In [73]:
# Share of rows held out for testing (0.05 was an earlier setting) and training hyper-parameters.
test_percentage = 0.2
epochs, batch_size = 3, 2048

# Raw feature matrix, raw target and raw sensitive attribute as NumPy arrays.
X = df_all[X_col].values
Y = df_all[dic_file[file_obj]['target_name']].values
S = df_all[dic_file[file_obj]['S_variable_name']].values

# Column-wise standardisation; the mean and standard deviation are taken over
# ALL rows, i.e. before the train/test split.
X_cr = (X - X.mean(axis = 0)) / X.std(axis = 0)

# Split sizes: the test share is truncated to an integer, the remainder is for training.
test_size = int(test_percentage * len(X))
train_size = len(X) - test_size
train_size

2188379

In [None]:
# Seeded shuffle of the row positions so the split is reproducible.
seed = 1234
np.random.seed(seed)
indexs = np.arange(len(X))
np.random.shuffle(indexs)

# First `train_size` shuffled rows go to training, the remainder to testing.
train_rows, test_rows = indexs[:train_size], indexs[train_size:]
X_cr_train, Y_train, S_train = X_cr[train_rows], Y[train_rows], S[train_rows]
X_cr_test, Y_test, S_test = X_cr[test_rows], Y[test_rows], S[test_rows]


def _to_float32_tensor(array):
    """Cast a NumPy array to float32 and wrap it as a torch tensor."""
    return torch.from_numpy(array.astype(np.float32))


# Features and targets become float32 tensors; S_train / S_test stay NumPy arrays.
X_train, X_test = _to_float32_tensor(X_cr_train), _to_float32_tensor(X_cr_test)
Y_train, Y_test = _to_float32_tensor(Y_train), _to_float32_tensor(Y_test)

In [75]:
# Cut-off on the raw target for the selected task; binary labels are built from it as `Y > threshold`.
threshold = dic_file[file_obj]['threshold']

In [76]:
# Set to True to re-train the network and overwrite the saved weights.
# With False (the default) the cell only loads the weights stored at `net_path`.
RETRAIN_MODEL = False

model = Network(X_train.shape[1],
                activation_bool = True,
                n_nodes = 256,
                n_loop = 2)

# Only used when RETRAIN_MODEL is True.
# NOTE(review): schedule-free optimizers expect optimizer.train() / optimizer.eval()
# mode switches - confirm that training_network_threshold takes care of them.
optimizer = schedulefree.AdamWScheduleFree(model.parameters(),
                                           lr = 0.001)
net_path = result_path + 'net_state_dic.pt'

if RETRAIN_MODEL:
    training_network_threshold(model,
                               optimizer,
                               threshold  = threshold,
                               X_train    = X_train,
                               Y_train    = Y_train,
                               X_test     = X_test,
                               Y_test     = Y_test,
                               epochs     = 1,
                               batch_size = batch_size,
                               )

    torch.save(model.state_dict(), net_path)

# `map_location='cpu'` lets a checkpoint written from a GPU session load on a
# CPU-only machine; the model itself is created on the CPU and never moved.
model.load_state_dict(torch.load(net_path, map_location = 'cpu', weights_only = True))
# Switch to inference mode; as the last expression this also displays the architecture.
model.eval()

Network(
  (seq): Sequential(
    (0): BatchNorm1d(19, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=19, out_features=256, bias=True)
    (2): ReLU()
    (3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=256, out_features=256, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Linear(in_features=256, out_features=256, bias=True)
    (8): ReLU()
    (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Linear(in_features=256, out_features=1, bias=True)
  )
  (activation): Sigmoid()
)

test result

In [77]:
# Binary ground truth on the test set.
label = (Y_test > threshold)*1.

# Pure inference: no autograd graph is needed for the whole test set.
with torch.no_grad():
    proba = model(X_test.float()).squeeze()

# Probability cut-off chosen so that the share of positive predictions on the test
# set matches the share of positive labels in the training set (small offset of 1e-7).
prob_threshold = np.quantile(proba.numpy(),  1 - ((Y_train > threshold)*1.).mean()) - 1e-7
pred = ( proba > prob_threshold)*1.
acc = ( (pred == label)*1.).mean().item()

# Sub-sample used for the mitigation experiments: first `number_expe` test rows
# and their group labels (sensitive attribute mapped through the task's `S_fct`).
number_expe = 20000
inputs, groups = X_test[:number_expe].clone(), dic_file[file_obj]['S_fct'](S_test[:number_expe].copy())

# Disparate Impact on that sub-sample: positive rate of group 0 over positive rate of group 1.
DI = (pred[:number_expe][groups==0].mean() / pred[:number_expe][groups==1].mean()).item()
print(f'pred mean : {np.round(pred.mean().item(), 3)}, with an accuracy of {np.round(acc,3)}, and the Disparate Impact is {np.round(DI,3)}')

pred mean : 0.888, with an accuracy of 0.845, and the Disparate Impact is 0.46


In [None]:
# `arr_grad_la_nu` is only built in the mitigation section further down, so on a
# fresh kernel (Restart & Run All) this cell used to raise a NameError.
# It is therefore skipped until the array exists; the export cell at the end of
# the notebook writes the same 'Grad_la_nu.npy' file.
if 'arr_grad_la_nu' in globals():
    np.save(file = result_path + 'Grad_la_nu.npy',
            arr = arr_grad_la_nu)

# Mitigation

In [65]:
# Wrap the experiment sub-sample for the mitigation runs: `inputs` (first `number_expe`
# test rows) as a float NumPy array, together with their group labels `groups`.
stresser = Stresser(X = inputs.float().numpy(), S = groups)

In [None]:
# Settings shared by the four gradient-based mitigation runs.
shared_settings = dict(prob_threshold = prob_threshold,
                       threshold = 0.8,
                       threshold_augm_constr = 0.005,
                       threshold_lr_cdt = 0.1,
                       iteration_threshold = 200,
                       lr = 0.1)

# The runs differ in `look_alike` (False / True) and `delta_type` ('mean' / 'num').
# After each run the result is read back from `stresser.W.t`.
# NOTE(review): if DI_miti_grad updates `W.t` in place instead of rebinding it,
# the four `t_*` names below would alias one array - confirm.

# Run 1: look_alike off, delta on the mean (the only verbose run).
stresser.W.DI_miti_grad(model,
                        look_alike = False,
                        delta_type = 'mean',
                        CONSTR_REGU = 0.3,
                        verbose = True,
                        **shared_settings)
t_reg_me = stresser.W.t

# Run 2: look_alike off, delta on the number.
stresser.W.DI_miti_grad(model,
                        look_alike = False,
                        delta_type = 'num',
                        CONSTR_REGU = 0.3,
                        verbose = False,
                        **shared_settings)
t_reg_nu = stresser.W.t

# Run 3: look_alike on, delta on the mean.
stresser.W.DI_miti_grad(model,
                        look_alike = True,
                        delta_type = 'mean',
                        CONSTR_REGU = 0.3,
                        verbose = False,
                        **shared_settings)
t_la_me = stresser.W.t

# Run 4: look_alike on, delta on the number.
# NOTE(review): CONSTR_REGU is 0.1 here versus 0.3 in the other three runs - confirm this is intended.
stresser.W.DI_miti_grad(model,
                        look_alike = True,
                        delta_type = 'num',
                        CONSTR_REGU = 0.1,
                        verbose = False,
                        **shared_settings)
t_la_nu = stresser.W.t

P(Y=1|S=0) = 0.2423, P(Y=1|S=1) = 0.7663
former DI is 0.316, we thus have a difference of 0.484 to mitigate
We want to change it to new_P(Y=1|S=0) = 0.4483, and new_P(Y=1|S=1) = 0.5603
0.2718462316262054 0.7312000758385233
the starting mean is 0.0, we are 0.272 away
Thus the direction is 1.0

0
the contraint value is 0.25539
the euclidian distance between is 0.0001257729163626209, 

1
the contraint value is 0.24279
the euclidian distance between is 0.00041885662358254194, 

2
the contraint value is 0.23108
the euclidian distance between is 0.0007932198932394385, 

3
the contraint value is 0.22048
the euclidian distance between is 0.0011948085157200694, 

4
the contraint value is 0.21222
the euclidian distance between is 0.0015941844321787357, 

5
the contraint value is 0.20451
the euclidian distance between is 0.0019735880196094513, 

6
the contraint value is 0.19793
the euclidian distance between is 0.0023226740304380655, 

7
the contraint value is 0.19111
the euclidian distance betwe

In [92]:
def _hard_predictions(features):
    """Score `features` with `model` and binarise at `prob_threshold` (1. above, 0. otherwise)."""
    return (model(features) > prob_threshold)*1.


# Predictions on the untouched sub-sample.
pred_ori = _hard_predictions(torch.tensor(stresser.X).type(torch.float32))

# Predictions after adding each mitigation result to the features.
tensor_reg_me = torch.from_numpy(t_reg_me + stresser.X).type(torch.float32)
pred_reg_me = _hard_predictions(tensor_reg_me)

tensor_reg_nu = torch.from_numpy(t_reg_nu + stresser.X).type(torch.float32)
pred_reg_nu = _hard_predictions(tensor_reg_nu)

tensor_la_me = torch.from_numpy(t_la_me + stresser.X).type(torch.float32)
pred_la_me = _hard_predictions(tensor_la_me)

tensor_la_nu = torch.from_numpy(t_la_nu + stresser.X).type(torch.float32)
pred_la_nu = _hard_predictions(tensor_la_nu)

In [93]:
# Group labels as a column with `number_expe` rows.
group_column = stresser.S.reshape(number_expe, -1)


def _assemble(features, predictions):
    """Concatenate column-wise: features, then the group column, then the predictions."""
    return np.concatenate([features, group_column, predictions], axis = 1)


# Layout of every array below: [features..., group, prediction].
arr_start = _assemble(stresser.X, pred_ori)

arr_grad_reg_me = _assemble(stresser.X + t_reg_me, pred_reg_me)
arr_grad_reg_nu = _assemble(stresser.X + t_reg_nu, pred_reg_nu)

arr_grad_la_me = _assemble(stresser.X + t_la_me, pred_la_me)
arr_grad_la_nu = _assemble(stresser.X + t_la_nu, pred_la_nu)

In [94]:
# In `arr_start` the group label is the second-to-last column and the model
# prediction the last one, hence S_index = -2 and Y_index = -1.
translation_outputs = find_translation_DI(
    arr = arr_start,
    S_index = -2,
    Y_index = -1,
    threshold = 0.8,
    speed = 1,
    bool_return_all = True,
    verbose = False,
)
new_arr_mod_SF, bins, translation = translation_outputs

In [95]:
# Same column convention as `arr_start`: group label at -2, model prediction at -1.
sampling_outputs = find_sampling_wasserstein_DI(
    arr = arr_start,
    S_index = -2,
    Y_index = -1,
    verbose = False,
)
new_arr, swaps, dic_number_swap_done, wass_distance = sampling_outputs

In [96]:
# Arguments common to both Gems runs: group label at column -2, prediction at
# column -1, no separate ground-truth column, DI target of 0.8.
gems_settings = dict(arr = arr_start,
                     S_column_index = -2,
                     Pred_column_index = -1,
                     Y_column_index = None,
                     DI_target = 0.8)

# The two runs differ only in `delta_type`.
arr_miti_gems_KL_number = Gems_fair_mitigation_arr(delta_type = 'number', **gems_settings)

arr_miti_gems_KL_mean = Gems_fair_mitigation_arr(delta_type = 'mean', **gems_settings)

In [None]:
# File name (relative to `result_path`) -> object to store, in write order.
exports = {
    # Scalars describing the run.
    'DI.npy'               : DI,
    'X_number_column.npy'  : X_train.shape[1],
    'threshold.npy'        : prob_threshold,
    # Unmitigated sub-sample: [features..., group, prediction].
    'original.npy'         : arr_start,
    # Gradient-based mitigation results.
    'Grad_reg_me.npy'      : arr_grad_reg_me,
    'Grad_reg_nu.npy'      : arr_grad_reg_nu,
    'Grad_la_me.npy'       : arr_grad_la_me,
    'Grad_la_nu.npy'       : arr_grad_la_nu,
    # Other mitigation methods.
    'Miti_sampling_X.npy'  : new_arr,
    'Miti_mod_SF.npy'      : new_arr_mod_SF,
    'Miti_Gems_number.npy' : arr_miti_gems_KL_number,
    'Miti_Gems_mean.npy'   : arr_miti_gems_KL_mean,
}

for file_name, content in exports.items():
    np.save(file = result_path + file_name,
            arr  = content)