In [1]:
import torch
import random
from seeker.random import RandomSelectPairSeeker, RandomSelectSeeker, RangeGenSeeker, DistributionGenSeeker
from seeker.gradiant_based import WhiteboxSeeker, BlackboxSeeker, FoolSeeker
from utils import UnfairMetric, load_model, get_L_matrix
from data import adult
from train_dnn import get_data
from models.model import MLP, RandomForest
from distances.normalized_mahalanobis_distances import SquaredEuclideanDistance, ProtectedSEDistances
from distances.sensitive_subspace_distances import LogisticRegSensitiveSubspace
from distances.binary_distances import BinaryDistance
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from IPython.display import display

%load_ext autoreload
%autoreload 2

In [2]:
# Experiment configuration: seed, whether the protected attribute is used as a
# feature, and which column(s) are treated as protected.
rand_seed = 0
use_protected_attr = True
protected_vars = ['race_White']
# protected_vars = ['sex_Male', 'race_White']

# get_data hands back the dataset object plus two loaders (bound here as
# train_dl / test_dl).
dataset, train_dl, test_dl = get_data(
    adult,
    rand_seed,
    protected_vars=protected_vars,
)
dataset.use_protected_attr = use_protected_attr

# Input size is read from the dataset after the flag above has been set;
# the output size is fixed at 2.
in_dim = dataset.dim_feature()
out_dim = 2

# prepare data
all_X = dataset.get_all_data()
all_y = dataset.labels

adult_gen = adult.Generator(
    sensitive_columns=dataset.protected_idxs,
    include_protected_feature=use_protected_attr,
)

In [3]:
# Pick which classifier to audit; switch by (un)commenting one of the two lines.
# model_name = 'MLP'
model_name = 'RandomForest'

if model_name == 'MLP':
    model = MLP(in_dim, out_dim)
    trainer_name = 'STDTrainer'
elif model_name == 'RandomForest':
    model = RandomForest()
    trainer_name = 'RandomForestTrainer'
else:
    # Without this branch an unknown name would leave `model`/`trainer_name`
    # unbound and surface later as a confusing NameError.
    raise ValueError(f'unsupported model_name: {model_name!r}')

# Presumably restores the previously trained state into `model` in place
# (the return value is not assigned) — confirm against utils.load_model.
load_model(
    model, model_name, 'adult', trainer_name,
    use_protected_attr=use_protected_attr,
    protected_vars=protected_vars,
    id=rand_seed,
)

In [4]:
# for tree in model.random_forest.estimators_:
#     print(tree.feature_importances_, tree.feature_importances_[26])

In [5]:
# prepare distances
distance_x_NSE = SquaredEuclideanDistance()
distance_x_Causal = ProtectedSEDistances()
distance_x_LR = LogisticRegSensitiveSubspace()
distance_y = BinaryDistance()

distance_x_NSE.fit(num_dims=dataset.dim_feature(), data_gen=adult_gen)
distance_x_Causal.fit(num_dims=dataset.dim_feature(), data_gen=adult_gen, protected_idx=dataset.protected_idxs)
distance_x_LR.fit(all_X, adult_gen, protected_idxs=dataset.protected_idxs)

In [6]:
# def rand_gen():
#     return {
#         'age': random.randint(15, 60),
#         'capital_gain': 0,
#         'capital_loss': 0,
#         'education_num': random.randint(1, 15),
#         'hours_per_week': random.randint(10, 50),
#         'race_white': random.choice([0, 1]),
#         'sex_male': random.choice([0, 1]),
#         'marital_status': random.choice(list(range(7))),
#         'occupation': random.choice(list(range(14))),
#         'relationship': random.choice(list(range(6))),
#         'workclass': random.choice(list(range(7)))
#     }

# def perturb_pair(x, pert_features, pert_func):
#     pair = dict()
#     for k, v in x.items():
#         pair[k] = torch.tensor([v, v])
#         if k in pert_features:
#             pair[k][1] = pert_func(pair[k][0])
#     return pair

# x = rand_gen()

In [7]:
# pair = adult.generate_from_origin(**perturb_pair(x, ['capital_loss'], lambda x:1+x))
# adult.get_original_feature(pair)

In [8]:
# dist = distance_x_LR(all_X[0], all_X[0] + torch.eye(all_X.shape[1]), itemwise_dist=False).squeeze()
# print(dist)
# print(dist[dataset.protected_idxs])

Next, set how much each dimension is allowed to change.

In [9]:
# The larger epsilon is, the smaller dx must be for a pair to count as unfair,
# i.e. the stricter the criterion.
# epsilon = 1e10
epsilon = 9e9
unfair_metric = UnfairMetric(
    dx=distance_x_LR,
    dy=distance_y,
    epsilon=epsilon,
)

In [10]:
def show_result(result):
    """Display the outcome of a single seeker run.

    Args:
        result: Indexable sequence of length 2 or 3. ``result[0]`` is the
            found pair or ``None``; ``result[1]`` is shown next to the pair
            (named ``n_query`` here, presumably the number of model queries);
            when a third item is present it is printed as ``n_iters``.

    Returns:
        None. Output is produced through ``print`` and ``display``.
    """
    pair, n_query = result[0], result[1]
    if len(result) == 3:
        print(f'n_iters = {result[2]}')
    # Identity test instead of `!= None`: the latter invokes the pair's rich
    # comparison, which is element-wise (or ambiguous) for array-like objects.
    if pair is not None:
        display(adult.get_original_feature(pair), n_query)
    else:
        display('not found')

In [11]:
# random.seed(422)
# torch.manual_seed(422)

# select_seeker = RandomSelectPairSeeker(model=model, unfair_metric=unfair_metric, data=all_X)
# for _ in range(3):
#     show_result(select_seeker.seek(dx_constraint=True, max_query=1e6))

In [13]:
# Counterfactual check: flip the first protected column (x -> 1 - x) for every
# row and count how many predictions differ from the unmodified rows.
# NOTE(review): this rebinds `all_X`, which an earlier cell set from
# dataset.get_all_data() — confirm that dataset.data is the intended input.
all_X = dataset.data
protected_col = dataset.protected_idxs[0]

all_X_conter = all_X.clone()
all_X_conter[:, protected_col] = 1 - all_X_conter[:, protected_col]

all_pred = model.get_prediction(all_X)
all_pred_conter = model.get_prediction(all_X_conter)
(all_pred != all_pred_conter).sum()

tensor(1)

In [12]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

select_seeker = RandomSelectSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data=all_X,
    data_gen=adult_gen,
)
# Three consecutive searches with max_query=1e6 (presumably the query budget).
for _ in range(3):
    seek_result = select_seeker.seek(max_query=1e6)
    show_result(seek_result)

KeyboardInterrupt: 

In [14]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

distribution_seeker = DistributionGenSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data_gen=adult_gen,
)
# Three consecutive searches with max_query=1e6 (presumably the query budget).
for _ in range(3):
    seek_result = distribution_seeker.seek(max_query=1e6)
    show_result(seek_result)

KeyboardInterrupt: 

In [15]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

range_seeker = RangeGenSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data_gen=adult_gen,
)
# Three consecutive searches with max_query=1e6 (presumably the query budget).
for _ in range(3):
    seek_result = range_seeker.seek(max_query=1e6)
    show_result(seek_result)

Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,race_White,sex_Male,marital-status,occupation,relationship,workclass
0,44.0,98669.0,1444.0,3.0,80.0,0.0,0.0,5.0,0.0,2.0,0.0
1,44.0,98669.0,1444.0,3.0,80.0,1.0,0.0,5.0,0.0,2.0,0.0


54

KeyboardInterrupt: 

In [None]:
# random.seed(422)
# torch.manual_seed(422)

# test_seeker = WhiteboxSeeker(model=model, unfair_metric=unfair_metric, data_gen=adult_gen)
# for _ in range(3):
#     show_result(test_seeker.seek(origin_lr=1, max_query=1e6, lamb=1))

In [16]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

# NOTE(review): the recorded run of this cell (easy=True, RandomForest model)
# stopped with "ValueError: Input X contains NaN." — the NaN source needs to be
# tracked down before this cell can complete.
test_seeker = BlackboxSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data_gen=adult_gen,
    easy=True,
)
for _ in range(3):
    seek_result = test_seeker.seek(origin_lr=1, max_query=3000, lamb=1)
    show_result(seek_result)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

# Same search as the easy=True variant, with easy=False.
test_seeker = BlackboxSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data_gen=adult_gen,
    easy=False,
)
for _ in range(3):
    seek_result = test_seeker.seek(origin_lr=1, max_query=3000, lamb=1)
    show_result(seek_result)

n_iters = 1


'not found'

n_iters = 1


'not found'

n_iters = 1


'not found'

In [17]:
# Seed Python's and torch's RNGs before searching.
random.seed(422)
torch.manual_seed(422)

test_seeker = FoolSeeker(
    model=model,
    unfair_metric=unfair_metric,
    data_gen=adult_gen,
)
# Three consecutive searches with the same hyper-parameters as the Blackbox runs.
for _ in range(3):
    seek_result = test_seeker.seek(origin_lr=1, max_query=3000, lamb=1)
    show_result(seek_result)

n_iters = 9


'not found'

n_iters = 39


'not found'

n_iters = 8


'not found'