In [None]:
import torch
import random
from torch.utils.data import SubsetRandomSampler, DataLoader
from data.data_utils import ProtectedDataset
from data import adult
from models.trainer import RandomForestTrainer
from models.model import RandomForest
from distances.normalized_mahalanobis_distances import SquaredEuclideanDistance, ProtectedSEDistances
from distances.sensitive_subspace_distances import LogisticRegSensitiveSubspace
from distances.binary_distances import BinaryDistance

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

from IPython.display import display
import matplotlib.pyplot as plt

from data.adult import Generator
from utils import UnfairMetric
from seeker.gradiant_based import BlackboxSeeker, FoolSeeker

%load_ext autoreload
%autoreload 2


def get_data(data, rand_seed, protected_vars, train_frac=0.8,
             train_batch_size=64, test_batch_size=1000):
    """Load a dataset and build seeded train/test loaders over a random split.

    Parameters
    ----------
    data : object exposing ``load_data(protected_vars=...)`` that returns
        ``(X, y, protected_idxs)``.
    rand_seed : seed given to both ``torch.manual_seed`` and ``random.seed``.
    protected_vars : forwarded unchanged to ``data.load_data``.
    train_frac : fraction of the samples assigned to the training split
        (default 0.8, the value that used to be hard-coded).
    train_batch_size : batch size of the training loader (default 64).
    test_batch_size : batch size of the test loader (default 1000).

    Returns
    -------
    (dataset, train_loader, test_loader)
        One ``ProtectedDataset`` holding every sample, and two ``DataLoader``s
        over it that are restricted to disjoint index subsets through
        ``SubsetRandomSampler``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be in [0, 1], got {train_frac!r}')

    torch.manual_seed(rand_seed)
    random.seed(rand_seed)

    X, y, protected_idxs = data.load_data(protected_vars=protected_vars)

    # Randomly split into train/test: shuffle all indices with the seeded
    # `random` module, then cut the shuffled list at `train_size`.
    total_samples = len(X)
    train_size = int(total_samples * train_frac)

    indices = list(range(total_samples))
    random.shuffle(indices)
    train_sampler = SubsetRandomSampler(indices[:train_size])
    test_sampler = SubsetRandomSampler(indices[train_size:])

    # Both loaders share one dataset object; only the samplers differ.
    dataset = ProtectedDataset(X, y, protected_idxs)
    train_loader = DataLoader(dataset, batch_size=train_batch_size, sampler=train_sampler)
    test_loader = DataLoader(dataset, batch_size=test_batch_size, sampler=test_sampler)

    return dataset, train_loader, test_loader

In [None]:
# Experiment configuration.
protected_vars = ['race_White']
rand_seed = 0
use_protected_attr = True

# Pass `rand_seed` itself (not a second literal) so the split follows the
# configured seed.
dataset, train_dl, test_dl = get_data(adult, rand_seed, protected_vars)
dataset.use_protected_attr = use_protected_attr
feature_dim = dataset.dim_feature()
output_dim = 2

# Fit a depth-limited random forest on the training loader; the accuracy
# lines in the cell output are printed during `trainer.train()`.
model = RandomForest(max_depth=10)
trainer = RandomForestTrainer(model, train_dl, test_dl)
trainer.train()

Train Accuracy: 0.8673466444015503
Test Accuracy: 0.8564953207969666


In [None]:


# Display name for each model input column, in column order (41 names).
# The numbered groups are presumably one-hot encodings of the categorical
# columns — TODO confirm against the data loader.
features = ['age', 'capital-gain', 'capital-loss', 'education-num', 'hours-per-week']
features += [f'marital-status{k}' for k in range(1, 8)]    # columns 5..11
features += [f'occupation{k}' for k in range(1, 15)]       # columns 12..25
features += ['race_White']                                 # column 26
features += [f'relationship{k}' for k in range(1, 7)]      # columns 27..32
features += ['sex_Male']                                   # column 33
features += [f'workclass{k}' for k in range(1, 8)]         # columns 34..40

import numpy as np
def f(clf):
    """Collect the split feature and threshold of every internal node of a tree.

    Parameters
    ----------
    clf : fitted estimator exposing a ``tree_`` attribute with the parallel
        per-node arrays ``children_left``, ``children_right``, ``feature``
        and ``threshold``.

    Returns
    -------
    (real_feature, real_threshold) : tuple of two lists
        Parallel lists, in node-id order, holding the feature index and the
        threshold of each split node. Leaf nodes are left out.
    """
    tree = clf.tree_
    children_left = np.asarray(tree.children_left)
    children_right = np.asarray(tree.children_right)

    # A node is a split node exactly when its two child ids differ; a leaf
    # stores the same id for both children. The mask replaces an explicit
    # root-to-leaf traversal whose depth bookkeeping was never used.
    is_split_node = children_left != children_right

    real_feature = list(np.asarray(tree.feature)[is_split_node])
    real_threshold = list(np.asarray(tree.threshold)[is_split_node])
    return real_feature, real_threshold

# for tree in model.random_forest.estimators_:
#     # print(f'classes: {tree.classes_},\nfeature_importances: {tree.feature_importances_, tree.feature_importances_[26]},\nmax_features_: {tree.max_features_}, \
#     #       \nn_class: {tree.n_classes_}, \nn_features_in: {tree.n_features_in_}, \
#     #       \nn_outputs: {tree.n_outputs_}, \ntree: {tree.tree_}')
#     # print(tree.get_depth())
#     if -2 in tree.tree_.feature or 39 in tree.tree_.feature:
#         print(tree.tree_.feature)
#         for i in tree.tree_.feature:
#             print(features[i])
#         print(tree.tree_.threshold)
#         plt.figure(figsize=(12, 8))
#         plot_tree(tree, feature_names=features, filled=True)
#         plt.show()
#         input()

In [None]:
# th[j] accumulates every threshold at which any tree of the forest splits on
# feature j (repeats included).
forest = model.random_forest
th = [[] for _ in range(forest.n_features_in_)]
for estimator in forest.estimators_:
    for feat_idx, cut in zip(*f(estimator)):
        th[feat_idx].append(cut)

# One line per feature: index, name, number of distinct thresholds, and the
# distinct thresholds in ascending order.
for feat_idx, cuts in enumerate(th):
    distinct_cuts = sorted(set(cuts))
    print(feat_idx, features[feat_idx], len(distinct_cuts), distinct_cuts)

0 age 135 [18.5, 19.0, 19.5, 20.0, 20.5, 21.0, 21.5, 22.0, 22.5, 23.0, 23.5, 24.0, 24.5, 25.0, 25.5, 26.0, 26.5, 27.0, 27.5, 28.0, 28.5, 29.0, 29.5, 30.0, 30.5, 31.0, 31.5, 32.0, 32.5, 33.0, 33.5, 34.0, 34.5, 35.0, 35.5, 36.0, 36.5, 37.0, 37.5, 38.0, 38.5, 39.0, 39.5, 40.0, 40.5, 41.0, 41.5, 42.0, 42.5, 43.0, 43.5, 44.0, 44.5, 45.0, 45.5, 46.0, 46.5, 47.0, 47.5, 48.0, 48.5, 49.0, 49.5, 50.0, 50.5, 51.0, 51.5, 52.0, 52.5, 53.0, 53.5, 54.0, 54.5, 55.0, 55.5, 56.0, 56.5, 57.0, 57.5, 58.0, 58.5, 59.0, 59.5, 60.0, 60.5, 61.0, 61.5, 62.0, 62.5, 63.0, 63.5, 64.0, 64.5, 65.0, 65.5, 66.0, 66.5, 67.0, 67.5, 68.0, 68.5, 69.0, 69.5, 70.0, 70.5, 71.0, 71.5, 72.0, 72.5, 73.0, 73.5, 74.0, 74.5, 75.0, 75.5, 76.0, 76.5, 77.0, 77.5, 78.0, 78.5, 79.0, 79.5, 80.0, 80.5, 81.0, 81.5, 82.5, 83.0, 83.5, 85.0, 85.5, 86.5, 87.0, 89.0]
1 capital-gain 408 [200.5, 297.0, 457.0, 527.5, 543.0, 575.5, 586.5, 704.5, 712.0, 727.5, 753.0, 898.5, 924.0, 1018.0, 1025.0, 1052.5, 1087.0, 1088.0, 1101.0, 1114.0, 1145.0, 1164

In [None]:

# Reuse the configured `protected_vars` instead of repeating the column name,
# so the generator and the loaded dataset cannot drift apart.
data_gen = Generator(sensitive_columns=protected_vars, include_protected_feature=True)

In [None]:
# Draw one sample from the generator (presumably a random point inside the
# feature ranges — confirm against Generator.gen_by_range) and show it as a
# feature table.
data_point = data_gen.gen_by_range()
data_gen.feature_dataframe(data=data_point)

Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,race_White,sex_Male,marital-status,occupation,relationship,workclass
0,69,28328,1354,14,8,0,0,4,1,0,2


In [None]:

def iter_along_dim(x, dim, threshold, data_range):
    """Plot the model output for `x` while feature `dim` sweeps its range.

    The range ``[l, u]`` of the feature is cut at every distinct threshold
    recorded for it, giving consecutive sections. For each section a copy of
    `x` gets feature `dim` set to the floor of the section's upper edge, the
    notebook-level ``model`` is called on it, and the value at ``[0][1]`` of
    the result is drawn as a flat segment over that section. A red dashed
    vertical line marks the value `x` itself has in feature `dim`.

    Parameters
    ----------
    x : tensor indexed as ``x[:, dim]`` and ``x[0, dim]``; only row 0 of the
        model output is plotted (assumes a single-row input — TODO confirm).
    dim : index of the feature column to sweep.
    threshold : sequence indexed by feature; ``threshold[dim]`` is the
        collection of thresholds for that feature (may be empty).
    data_range : indexed as ``data_range[:, dim]``; element 0 is used as the
        lower bound and element 1 as the upper bound.

    Notes
    -----
    Reads the globals ``model``, ``np`` and ``plt`` from the notebook.
    """
    cuts = sorted(set(threshold[dim]))
    bounds = data_range[:, dim]
    l, u = bounds[0].item(), bounds[1].item()

    # Section edges: lower bound, every threshold, upper bound. With no
    # thresholds this yields the single section [l, u] instead of failing on
    # an index into an empty list.
    edges = [l] + cuts + [u]

    sections = []
    posterior = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        xx = x.clone()
        # Probe the section at the floor of its upper edge.
        xx[:, dim] = np.floor(hi)
        p = model(xx)[0][1].item()
        # Two x-coordinates and two equal y-values draw one flat step.
        sections.extend([lo, hi])
        posterior.extend([p, p])

    x_value = x[0, dim].item()
    plt.figure(figsize=(8, 6))
    plt.plot(sections, posterior)
    plt.axvline(x=x_value, color='red', linestyle='--', label=f'x[{dim}] = {x_value}')
    plt.xlabel(f'feature {dim}')
    plt.ylabel('model(x)[0][1]')
    plt.legend()
    plt.show()
    

In [None]:
# Sweep feature 3 ('education-num' in `features`) for the sampled point, using
# the thresholds collected from the forest in `th`.
iter_along_dim(data_point, dim=3, threshold=th, data_range=data_gen.get_range('data'))

In [9]:

# Data and labels taken from the dataset object (presumably the full feature
# matrix — confirm against ProtectedDataset.get_all_data).
all_X, all_y = dataset.get_all_data(), dataset.labels

# Candidate input-space distances (dx) and the output-space distance (dy).
distance_x_Causal = ProtectedSEDistances()
distance_x_LR = LogisticRegSensitiveSubspace()
distance_y = BinaryDistance()

# distance_x_NSE.fit(num_dims=dataset.dim_feature(), data_gen=adult_gen)
# Fit both dx candidates. With the protected attribute in use, its column
# indices are passed to both fits; otherwise the protected columns are sliced
# out of dataset.data and given to the logistic-regression distance, and the
# other distance is fitted with an empty protected index list.
if use_protected_attr:
    distance_x_Causal.fit(num_dims=dataset.dim_feature(), data_gen=data_gen, protected_idx=dataset.protected_idxs)
    distance_x_LR.fit(all_X, data_gen, protected_idxs=dataset.protected_idxs)
else:
    sensitive_ = dataset.data[:, dataset.protected_idxs]
    distance_x_Causal.fit(num_dims=dataset.dim_feature(), data_gen=data_gen, protected_idx=[])
    distance_x_LR.fit(all_X, data_gen, data_SensitiveAttrs=sensitive_)

# The dx actually handed to the metric below.
chosen_dx = distance_x_Causal

# NOTE(review): epsilon is very large (1e10); how it is used is defined inside
# UnfairMetric — confirm this value is intended.
epsilon = 1e10
unfair_metric = UnfairMetric(dx=chosen_dx, dy=distance_y, epsilon=epsilon)

In [10]:
def show_result(result):
    """Display the outcome of one seeker run.

    Parameters
    ----------
    result : sequence
        ``(pair, n_query)`` or ``(pair, n_query, n_iters)``. When a third
        element is present it is printed as the iteration count. `pair` is
        ``None`` when nothing was found.

    Shows the pair as a feature table followed by ``n_query``, or the text
    ``'not found'``. Reads the notebook global ``data_gen``.
    """
    pair, n_query = result[0], result[1]
    if len(result) == 3:
        print(f'n_iters = {result[2]}')
    # Identity test: `!= None` would go through the object's own comparison
    # operator, which array-like values may evaluate element-wise.
    if pair is not None:
        display(data_gen.feature_dataframe(data=pair), n_query)
    else:
        display('not found')

In [11]:
# Re-seed both RNGs right before the search so the runs below are repeatable.
random.seed(422)
torch.manual_seed(422)

# Seeker built on the current `model`, the unfairness metric and the data
# generator (presumably it searches for a pair that violates the metric —
# confirm against FoolSeeker).
test_seeker = FoolSeeker(model=model, unfair_metric=unfair_metric, data_gen=data_gen, easy=True)
# show_result(white_seeker.seek())
# Three searches in a row; each outcome is shown under its 'try: i' header.
for i in range(3):
    display(f'try: {i}')
    show_result(test_seeker.seek(origin_lr=0.1, max_query=1e6, lamb=1))

'try: 0'

n_iters = 22


Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,race_White,sex_Male,marital-status,occupation,relationship,workclass
0,17,8029,618,3,1,0,0,5,13,0,3
1,17,8028,618,3,1,0,0,5,13,0,3


40

'try: 1'

n_iters = 32


'not found'

'try: 2'

n_iters = 35


'not found'

In [12]:
# NOTE(review): all_X is rebound here to dataset.data; an earlier cell took it
# from dataset.get_all_data() — confirm the two are interchangeable.
all_X = dataset.data
# Counterfactual copy: replace the first protected column v with 1 - v.
# assumes that column is 0/1-encoded — TODO confirm
all_X_conter = all_X.clone()
all_X_conter[:, dataset.protected_idxs[0]] = 1 - all_X_conter[:, dataset.protected_idxs[0]]

# Predictions of the current `model` on the original rows and on the flipped rows.
all_pred = model.get_prediction(all_X)
all_pred_conter = model.get_prediction(all_X_conter)

In [13]:
# i = 100
# pair = torch.concat([all_X[i].unsqueeze(0), all_X_conter[i].unsqueeze(0)])
# adult.get_original_feature(pair)


In [14]:
# Number of predictions, i.e. the denominator for the flip count in the next cell.
len(all_pred)

45222

In [15]:
# Count of rows whose prediction changes when the protected column is flipped.
(all_pred != all_pred_conter).sum()

tensor(346)

In [16]:
# NOTE(review): sensitive_columns receives dataset.protected_idxs here, whereas
# `data_gen` in an earlier cell was built with the column-name list
# ['race_White'] — confirm Generator accepts both forms.
adult_gen = adult.Generator(sensitive_columns=dataset.protected_idxs, include_protected_feature=True)

# Fresh distance objects. This rebinds distance_x_Causal, distance_x_LR and
# distance_y; rebinding the names does not alter the objects that were already
# passed to `unfair_metric`.
distance_x_NSE = SquaredEuclideanDistance()
distance_x_Causal = ProtectedSEDistances()
distance_x_LR = LogisticRegSensitiveSubspace()
distance_y = BinaryDistance()

# Fit all three dx candidates against the new generator.
distance_x_NSE.fit(num_dims=dataset.dim_feature(), data_gen=adult_gen)
distance_x_Causal.fit(num_dims=dataset.dim_feature(), data_gen=adult_gen, protected_idx=dataset.protected_idxs)
distance_x_LR.fit(all_X, adult_gen, protected_idxs=dataset.protected_idxs)

In [17]:
from models.model import MLP
from utils import load_model

in_dim = dataset.dim_feature()
out_dim = 2

# Select which saved model to load; switch by (un)commenting.
model_name = 'MLP'
# model_name = 'RandomForest'

# NOTE: this rebinds the notebook-level `model`, so every later cell (and any
# helper that reads the global `model`) uses the model loaded here.
if model_name == 'MLP':
    model = MLP(in_dim, out_dim)
    trainer_name = 'STDTrainer'
elif model_name == 'RandomForest':
    model = RandomForest()
    trainer_name = 'RandomForestTrainer'
else:
    # Without this branch an unknown name would silently load weights into
    # whatever `model` was left over from an earlier cell.
    raise ValueError(f'unsupported model_name: {model_name!r}')
load_model(model, model_name, 'adult', trainer_name, use_protected_attr=True,
           protected_vars=protected_vars, id=rand_seed)

In [18]:
# Recompute both prediction sets with the model loaded in the previous cell,
# reusing the same original and flipped inputs.
all_pred = model.get_prediction(all_X)
all_pred_conter = model.get_prediction(all_X_conter)

In [19]:
# Number of predictions from the loaded model.
len(all_pred)

45222

In [20]:
# Count of rows whose prediction from the loaded model changes when the protected column is flipped.
(all_pred != all_pred_conter).sum()

tensor(440)