In [1]:
# Load all necessary packages
import sys
import numpy as np
import pandas as pd

sys.path.append("../")
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
                import load_preproc_data_adult, load_preproc_data_compas

from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from variable_cep import VariableCEP as CalibratedEqOddsPostprocessing #modified for varying weight
from variable_cep import normed_rates
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve

from scipy import optimize

pip install 'aif360[AdversarialDebiasing]'


In [2]:
## import dataset - use compas
dataset_used = "compas" # "adult", "german", "compas"
protected_attribute_used = 2 # 1, 2

In [3]:
# code to identify the protected attributes from all of the dataset features
if dataset_used == "adult":
    dataset_orig = AdultDataset()
#     dataset_orig = load_preproc_data_adult()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]
    
elif dataset_used == "german":
    dataset_orig = GermanDataset()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'age': 1}]
        unprivileged_groups = [{'age': 0}]
    
elif dataset_used == "compas":
#     dataset_orig = CompasDataset()
    dataset_orig = load_preproc_data_compas()
    if protected_attribute_used == 1:
        privileged_groups = [{'sex': 1}]
        unprivileged_groups = [{'sex': 0}]
    else:
        privileged_groups = [{'race': 1}]
        unprivileged_groups = [{'race': 0}]  

In [4]:
#random seed for calibrated equal odds prediction
randseed = 12345679 

#train validation&test split
dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.6], shuffle=True)

In [5]:
# Placeholder for predicted and transformed datasets
dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)

# Logistic regression classifier and predictions for training data
scale_orig = StandardScaler()
X_train = scale_orig.fit_transform(dataset_orig_train.features)
y_train = dataset_orig_train.labels.ravel()
lmod = LogisticRegression() #logregression

#fit original model
lmod.fit(X_train, y_train)

fav_idx = np.where(lmod.classes_ == dataset_orig_train.favorable_label)[0][0]
y_train_pred_prob = lmod.predict_proba(X_train)[:,fav_idx]

# Prediction probs for training data
class_thresh = 0.5
dataset_orig_train_pred.scores = y_train_pred_prob.reshape(-1,1)

y_train_pred = np.zeros_like(dataset_orig_train_pred.labels)
y_train_pred[y_train_pred_prob >= class_thresh] = dataset_orig_train_pred.favorable_label
y_train_pred[~(y_train_pred_prob >= class_thresh)] = dataset_orig_train_pred.unfavorable_label
dataset_orig_train_pred.labels = y_train_pred

In [6]:
#
#
#
# set up tradeoff cost-benefit calculation
include = False
N_reps = 1
N_values = 100

pbar = tqdm(total=(N_reps*N_values))

negs = []
accs = []
fps = []
fns = []

privileged_options = [True,False,None]

#whether to include all fp and all fn cost functions (True)
if include == True:
    n_range = np.linspace(0.00,1.00,N_values)
if include == False:
    n_range = np.linspace(0.01,0.99,N_values)

  0%|          | 0/100 [00:00<?, ?it/s]

In [7]:
#set up equalized odds processing

#to be replaced/substituted:
cpp = CalibratedEqOddsPostprocessing(privileged_groups = privileged_groups,
                                        unprivileged_groups = unprivileged_groups,
                                        seed=randseed)

#NP_rate replace with metric1_weight, metric2_weight to generalize
#replace cpp with wrapper for arbitrary function to optimize for in post-processing

In [8]:
metric_to_test = 'false_positive_rate'
metric_to_test_2 = None
#add in second metric

PR = None
#change

#can set to any available

In [9]:
def result_given_np(NP,args):

    dataset_orig_vt,cpp,N_reps,metric1 = args[0],args[1],args[2],args[3]

    #change/improve neg and improve optim_function!

    dataset_orig_valid_pred,dataset_orig_test_pred, dataset_new_valid_pred, dataset_new_test_pred = None,None,None,None
    cpp.set_NP(NP)

    #should be able to refit
    normed_p,normed_n = normed_rates(NP[1],NP[0])

    del dataset_orig_valid_pred
    output = 0
    
    split = False
        
    for repeat in range(N_reps):
        if (N_reps == 1) and (split == True):
            #If there's only 1 repeat then we use the same validation/test split for each
            pass
        else:
            ##########

            # New Validation/test set reshuffle and prediction for each
            if split == True:
                del dataset_orig_valid_pred
                del dataset_orig_test_pred
                del dataset_new_valid_pred
                del dataset_new_test_pred

            dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True)#validation_test split

            #change to be deterministic k fold

            dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
            dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)

            dataset_new_valid_pred = dataset_orig_valid.copy(deepcopy=True)
            dataset_new_test_pred = dataset_orig_test.copy(deepcopy=True)

            X_valid = scale_orig.transform(dataset_orig_valid.features)
            y_valid_pred_prob = lmod.predict_proba(X_valid)[:,fav_idx]

            X_test = scale_orig.transform(dataset_orig_test.features)
            y_test_pred_prob = lmod.predict_proba(X_test)[:,fav_idx]

            dataset_orig_valid_pred.scores = y_valid_pred_prob.reshape(-1,1)
            dataset_orig_test_pred.scores = y_test_pred_prob.reshape(-1,1)

            y_valid_pred = np.zeros_like(dataset_orig_valid_pred.labels)
            y_valid_pred[y_valid_pred_prob >= class_thresh] = dataset_orig_valid_pred.favorable_label
            y_valid_pred[~(y_valid_pred_prob >= class_thresh)] = dataset_orig_valid_pred.unfavorable_label
            dataset_orig_valid_pred.labels = y_valid_pred
                
            y_test_pred = np.zeros_like(dataset_orig_test_pred.labels)
            y_test_pred[y_test_pred_prob >= class_thresh] = dataset_orig_test_pred.favorable_label
            y_test_pred[~(y_test_pred_prob >= class_thresh)] = dataset_orig_test_pred.unfavorable_label
            dataset_orig_test_pred.labels = y_test_pred
            split=True

        # Odds equalizing post-processing algorithm

        ##########


        # Learn parameters to equalize odds and apply to create a new dataset
        cpp = cpp.fit(dataset_orig_valid, dataset_orig_valid_pred)

        dataset_transf_valid_pred = cpp.predict(dataset_orig_valid_pred)
        dataset_transf_test_pred = cpp.predict(dataset_orig_test_pred)

        cm_transf_valid = ClassificationMetric(dataset_orig_valid, dataset_transf_valid_pred,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)

        cm_transf_test = ClassificationMetric(dataset_orig_test, dataset_transf_test_pred,
                                    unprivileged_groups=unprivileged_groups,
                                    privileged_groups=privileged_groups)

        #cm_transf_test.difference
        
        test_set_metric = lambda metric : getattr(cm_transf_test,metric)()
        
        PR = None #CHANGE!
        
        output += test_set_metric(metric=metric1)
    
    output = output/N_reps

    print("NP:{}\n{}:{}\n\n".format(NP,metric1,output))

    return output


In [10]:
N_reps = 10

args = ([dataset_orig_vt,cpp,N_reps,metric_to_test])
x0 = np.array([0.1,0.1])

minimizer_kwargs = {'args':args}

In [11]:
o = optimize.basinhopping(func=result_given_np,x0 = x0, minimizer_kwargs=minimizer_kwargs)
display(o)

160771  10.5025956 ]
false_positive_rate:0.5875000000000001


NP:[-12.70160773  10.50259561]
false_positive_rate:0.5875000000000001


NP:[-13.11570831  10.86315213]
false_positive_rate:0.5864019916142558


NP:[-13.11570831  10.86315213]
false_positive_rate:0.5875000000000001


NP:[-13.1157083   10.86315213]
false_positive_rate:0.5875000000000001


NP:[-13.11570831  10.86315215]
false_positive_rate:0.5875000000000001


NP:[-13.52980889  11.22370867]
false_positive_rate:0.5864019916142558


NP:[-13.52980889  11.22370867]
false_positive_rate:0.5875000000000001


NP:[-13.52980888  11.22370867]
false_positive_rate:0.5875000000000001


NP:[-13.52980889  11.22370869]
false_positive_rate:0.5875000000000001


NP:[-13.94390948  11.58426521]
false_positive_rate:0.5864019916142558


NP:[-13.94390948  11.58426521]
false_positive_rate:0.5875000000000001


NP:[-13.94390946  11.58426521]
false_positive_rate:0.5875000000000001


NP:[-13.94390948  11.58426523]
false_positive_rate:0.5875000000000001


NP

                        fun: 0.4571658805031446
 lowest_optimization_result:       fun: 0.4571658805031446
 hess_inv: array([[1, 0],
       [0, 1]])
      jac: array([0., 0.])
  message: 'Optimization terminated successfully.'
     nfev: 4
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([-0.27859025, -0.31410058])
                    message: ['requested number of basinhopping iterations completed successfully']
      minimization_failures: 0
                       nfev: 404
                        nit: 100
                       njev: 101
                          x: array([-0.27859025, -0.31410058])