In [1]:
import numpy as np
import tensorflow as tf

In [2]:
import os
from os import listdir
from os.path import isfile, join
import sys
from itertools import product

In [3]:
from tensorflow.python.summary.summary_iterator import summary_iterator

# Adult Figures

In this notebook, we generate the Figures for the adult data set that are found in the appendix.  
This involves using data from the baseline boosting methods as well as the BuDRO tensorboard data.

These data files are not included with this code.  Please contact the authors if you would like access to these data.

In [294]:
# Location of the BuDRO tensorboard results.  Sub-paths are appended to this
# string by plain concatenation further down, so it should end with a path
# separator.
cpath = 'PATH_TO_BUDRO_RESULTS'

# Prefixes for the baseline result arrays and their standard deviations; the
# .npy file names are appended by plain string concatenation as well.
bpath = 'PATH_TO_BASELINE RESULTS'
bspath = 'PATH_TO_BASELINE_STDDEVS'

In [6]:
# `path` is the root used by the processing loops and by load_data_from_params.
path = cpath

In [7]:
# Column order of the per-step result arrays.  The order is fixed here so
# that saved files stay reproducible; do not reorder.
keyList = [
    'step',
    'train balanced',
    'inner training loss',
    'test balanced',
    'p0',
    'p1',
    'spouse cons',
    'gr cons',
    'sex-TPR-prot',
    'sex-TNR-prot',
    'sex-TPR-priv',
    'sex-TNR-priv',
    'sex-gap-RMS',
    'sex-gap-MAX',
    'sex-ave-odds-diff',
    'sex-eq-opp-diff',
    'sex-stat-parity',
    'race-TPR-prot',
    'race-TNR-prot',
    'race-TPR-priv',
    'race-TNR-priv',
    'race-gap-RMS',
    'race-gap-MAX',
    'race-ave-odds-diff',
    'race-eq-opp-diff',
    'race-stat-parity',
    'slack',
]

numCols = len(keyList)

# Column indices of the quantities that we want to save in our optimal grid
(balind, sconsi, grconsi, grmsi, gmaxi, rrmsi, rmaxi) = (
    keyList.index(tag)
    for tag in (
        'test balanced',
        'spouse cons',
        'gr cons',
        'sex-gap-RMS',
        'sex-gap-MAX',
        'race-gap-RMS',
        'race-gap-MAX',
    )
)

In [62]:
# Column order of the baseline result arrays.
basenames = [
    'max_depth', 'eta', 'min_child_weight', 'lambda', 'iter',
    'acc', 'bl_acc',
    'grcons', 'scons',
    'RMS(G)', 'MAX(G)', 'AOD(G)', 'EOD(G)', 'SPD(G)',
    'RMS(R)', 'MAX(R)', 'AOD(R)', 'EOD(R)', 'SPD(R)',
]

# Column indices of the quantities that we want to save in our optimal grid
(fbalind, fsconsi, fgrconsi, fgrmsi, fgmaxi, frrmsi, frmaxi) = (
    basenames.index(tag)
    for tag in ('bl_acc', 'scons', 'grcons', 'RMS(G)', 'MAX(G)', 'RMS(R)', 'MAX(R)')
)

Create bins and related functions for processing the BuDRO data

In [None]:
# Parameters for our graphs: the accuracy range [lowacc, highacc) is cut into
# `numbins` equally wide bins; `bins` holds the left edge of every bin.
lowacc = .76
highacc = .84
numbins = 50
binsize = (highacc-lowacc)/numbins
# np.arange with a non-integer step may produce numbins+1 edges because of
# floating point rounding; linspace always returns exactly `numbins` edges.
bins = np.linspace(lowacc, highacc, num=numbins, endpoint=False)
# Number of points that landed in each bin.
bindepths = np.zeros(numbins)

In [11]:
# One (initially empty) list per accuracy bin.
binned_params = [[] for _ in bins]
binned_values = [[] for _ in bins]

In [12]:
# One (initially empty) list of standard-deviation rows per accuracy bin.
binned_stds = [[] for _ in bins]

In [13]:
# Track the lowest and highest accuracies that fall outside the binned range,
# to get an idea of whether the graph parameters are correct.
# The running minimum has to start above every possible accuracy and the
# running maximum below it; otherwise the `acc < minacc` / `acc > maxacc`
# updates in the processing loop can never trigger.
minacc = float('inf')
maxacc = float('-inf')
minaccparms = []
maxaccparms = []

In [14]:
# Per-bin optimum tables, one column per objective:
# scons, grcons, grms, gmax, rrms, rmax, averms
# The first two columns are maximised (so they start at 0); the remaining
# five are minimised (so they start at 1).
params = np.zeros((numbins, 7), dtype=object)
opsacc = np.zeros((numbins, 7))
ops = np.concatenate([np.zeros((numbins, 2)), np.ones((numbins, 5))], axis=1)

In [315]:
# add a condition to the points that we test
# Each condition is [column name from keyList, comparison operator, threshold].
conds = [['spouse cons', '>', 0.93]]

In [15]:
# No conditions: every row is accepted by testConds.
conds = list()

In [16]:
def testConds(row, conds, keyList=keyList):
    """Check whether `row` satisfies every condition in `conds`.

    Parameters
    ----------
    row : indexable
        One row of results whose columns are ordered like `keyList`.
    conds : iterable
        Conditions of the form ``[column name, operator, threshold]`` where
        operator is one of ``'>'``, ``'>='``, ``'<'``, ``'<='``.  Conditions
        with any other operator are ignored.
    keyList : list of str
        Column names used to translate a condition's name into an index.

    Returns
    -------
    bool
        True when all recognised conditions hold (always True for an empty
        `conds`).
    """
    success = True
    for cond in conds:
        ind = keyList.index(cond[0])
        op = cond[1]
        value = row[ind]
        threshold = cond[2]

        # Compare operator strings by value: `is` tests object identity and
        # only works by accident when both strings happen to be interned.
        if op == '>':
            success = success and value > threshold
        elif op == '>=':
            success = success and value >= threshold
        elif op == '<':
            success = success and value < threshold
        elif op == '<=':
            success = success and value <= threshold

    return success
    

In [17]:
# Find the bin that a certain accuracy belongs to 
# return -1 if too high and -2 if too low
def whichBin(acc, bins, binsize):
    """Return the index of the bin that `acc` falls into.

    `bins` holds the left edge of each bin and `binsize` their common width.
    The chosen bin is the first one whose right edge is >= `acc`.

    Returns -1 when `acc` lies above the last bin and -2 when it lies below
    the first bin.
    """
    right_edges = bins + binsize
    candidates = np.nonzero(right_edges >= acc)[0]
    if len(candidates) == 0:
        # above the right edge of every bin
        return -1

    first = candidates.min()
    if first == 0 and acc < bins[0]:
        # below the left edge of the first bin
        return -2
    return first

In [18]:
# Full hyperparameter grids.  The processing loops below take the Cartesian
# product of these lists, and the values are formatted into directory/file
# names, so they must be written exactly as they were when the runs were made.

# sgd parameters
sd_init_grid = [0.1]
epoch_grid = [200]
momentum_grid = [0.9]
lr_grid = [0.001, 0.0001]

# xgboost parameters
lambda_grid = [0.000001, 0.0001, 0.01]
depth_grid = [6,8,10,12,14]
eta_grid = [0.005, 0.001]
weight_grid = [0.1, 1.] # store as fraction of default weight
pos_grid = [0.0] # store as offset
n_iter = 200


# individual fairness parameters
eps_grid = [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4]
gamma_grid = [0.00005]

In [20]:
# TEST PARAMETERS
# NOTE: this cell rebinds the same names as the full grid cell.  If it runs
# after that cell (e.g. with "Run All"), the loops below only see this single
# configuration.  Skip it when processing the full set of runs.
# sgd parameters
sd_init_grid = [0.1]
epoch_grid = [200]
momentum_grid = [0.9]
lr_grid = [0.001]

# xgboost parameters
lambda_grid = [0.000001]
depth_grid = [12]
eta_grid = [0.001]
weight_grid = [1.] # store as fraction of default weight
pos_grid = [0.0] # store as offset
n_iter = 200


# individual fairness parameters
eps_grid = [1.4]
gamma_grid = [0.00005]

In [19]:
# use_dual: selects whether 6 or 7 leading hyperparameters are unpacked from
#           each configuration in the processing loops.
# use_sgd:  when True the last four entries of each configuration are unpacked
#           as the sgd parameters.
# save_data: when True the processing loop writes its per-configuration
#           results to text files.
use_dual = True
use_sgd = True
save_data = True

In [20]:
# `hypers` and `names` are parallel lists: the i-th name labels the i-th grid.
# The names and values are zipped to build the 'name:value' experiment
# descriptors used in directory and file names, so both orders must agree.
hypers = [eps_grid, depth_grid, eta_grid, weight_grid, lambda_grid, pos_grid]
names = ['eps', 'depth', 'eta', 'weight', 'lamb', 'pos']

# Note: `+=` extends the lists in place, so re-running only this part without
# the two assignments above would append the sgd entries a second time.
hypers += [sd_init_grid, epoch_grid, momentum_grid, lr_grid]
names += ['init', 'epoch', 'momentum', 'lr']

#names += ['seed']

In [21]:
# Seeds of the individual runs; each seed appears in a result directory name
# ('..._result:<seed>/') in the processing loop below.
seeds = np.load('PATH_TO_SEEDS/adult_seeds.npz')['seeds']

In [22]:
len(keyList)

27

Load all of the data from the tensorboard files, process it, and possibly save it (depending on the `save_data` variable). 

We save the averages or the standard deviations (not both).  I ran this twice, because I wasn't originally thinking about the standard deviations.  You may want to tweak it to save both.

This takes about 40 minutes to process all the data from the ~10000 trials that are arranged in `submit.py`

In [None]:
#%%time
# Read every tensorboard run, aggregate over seeds for each hyperparameter
# configuration, optionally save the per-step standard deviations, and sort
# the per-step seed averages into the accuracy bins.
num_files_processed = 0
numRows = 200  # expected number of logged steps per run (checked below)
numCols = 27   # number of tracked quantities; equals len(keyList)

tseeds = seeds
nseeds = len(tseeds)

# OUTER LOOP: over hyperparameter selections
for pack in product(*hypers):
    values = list(pack)
    #values.append(seed)
    
    # One (numRows x numCols) table per seed for the current configuration.
    currAves = np.zeros((nseeds, numRows, numCols))
    
    # INNER LOOP: over seeds to collect the averages
    for iseed, seed in enumerate(tseeds):
        
        ## SET UP VARIABLES
        # The individually unpacked hyperparameters are not used again in this
        # cell; the descriptor below is built from `names` and `values`.
        if use_dual:
            (
                eps,
                max_depth,
                eta,
                min_child_weight,
                lambda_reg,
                scale_pos_weight
            ) = pack[:6]

        else:
            (
                eps,
                max_depth,
                eta,
                min_child_weight,
                lambda_reg,
                scale_pos_weight,
                gamma_reg
            ) = pack[:7]

        if use_sgd:
            (
                init,
                epoch,
                momentum,
                lr
            ) = pack[-4:]

        # GET DIRECTORY AND LIST OF FILEs
        # Descriptor entries look like 'name:value' and are joined with '_'.
        exp_descriptor = []
        for n, v in zip(names, values):
            exp_descriptor.append(':'.join([n,str(v)]))

        exp_name = path + '_'.join(exp_descriptor) + '_result:%d/' % seed
        print(exp_name)
        onlyfiles = [f for f in listdir(exp_name) if isfile(join(exp_name, f))]
        
        # GET THE DATA FROM THE FILE
        # Only the first file returned by listdir is read.
        # NOTE(review): presumably each run directory holds exactly one event
        # file -- confirm, since listdir order is not guaranteed.
        currStep = -1
        currData = dict()
        currData['step'] = np.array([])
        for summary in summary_iterator(path=exp_name  + onlyfiles[0]):
            # Sanity checks: steps must be non-negative and non-decreasing.
            if summary.step < 0: print("Uh oh this is not working")
            if summary.step < currStep: print("Uh oh not in order")
            elif summary.step > currStep: 
                # first event of a new step: record the step number once
                currStep = summary.step
                currData['step'] = np.hstack((currData['step'], np.array([currStep])))
            

            # Append every logged value to the series of its tag.
            for v in summary.summary.value:       
                if v.tag not in currData:
                    currData[v.tag] = tf.make_ndarray(v.tensor)
                else:
                    currData[v.tag] = np.hstack((currData[v.tag], tf.make_ndarray(v.tensor)))
        
        num_files_processed += 1
                    
        ### GET THE RESULTS IN AN NP.ARRAY
        saveRes = np.zeros((currData['step'].shape[0], numCols))      
        saveRes[:,0] = currData['step']
        
        # A mismatch is only reported here; execution continues.
        if saveRes.shape[0] != numRows:
            print("ERROR: Incorrect number of steps in data file")
            print("Number of steps in data: {}".format(saveRes.shape[0]))
        
        if keyList.index('step') != 0:
            print("Your processing pipeline is messed up right now")
        # Fill the columns in keyList order (this rewrites column 0 as well).
        for ikey, ckey in enumerate(keyList):
            saveRes[:, ikey] = currData[ckey]
        
        currAves[iseed] = np.copy(saveRes)
    
    ## MAKE IT AN AVERAGE
    # Per-step mean and standard deviation over the seeds.
    aves = currAves.mean(axis=0)
    devs = np.std(currAves, axis=0)
    # Standard deviation (over seeds) of the mean of the gender and race RMS
    # gaps; stored as an additional last column of `fulldevs`.
    grmss = currAves[:,:,grmsi]
    rrmss = currAves[:,:,rrmsi]
    mrms = (grmss+rrmss)/2
    avedev = np.std(mrms, axis=0)
    fulldevs = np.zeros((numRows, numCols+1))
    fulldevs[:,:-1] = devs
    fulldevs[:,-1] = avedev
    
    # SAVE 
    # Only the standard deviations are written here; `aves` is not saved.
    if save_data:
        savepath = path + 'txt/devs-' + '_'.join(exp_descriptor)
        savepath += '.dat'
        
        np.savetxt(savepath, fulldevs, delimiter='\t')


    ## UPDATE OPTIMAL VALUES FROM THIS SET OF HYPERPARAMETERS USING THE AVERAGES OVER THE SEEDS
    for irow, row in enumerate(aves):
        if not testConds(row, conds): continue
        acc = row[balind]
        aggrow = whichBin(acc, bins, binsize)
        if aggrow < 0:
            # Accuracy outside the binned range: update the running extremes
            # (these depend on the initial values of minacc / maxacc).
            if acc < minacc:
                minacc = acc
                minaccparms.append(exp_descriptor)
            if acc > maxacc:
                maxacc = acc
                maxaccparms.append(exp_descriptor)

        else: # aggrow >= 0:
            bindepths[aggrow] += 1
            # `comp` is not used by any active code in this block.
            comp = ops[aggrow]

            # Hyperparameter values followed by the step number (column 0).
            binned_params[aggrow].append(np.array(values + [row[0]]))
            # Same column order as the `quantities` list defined further down.
            binned_values[aggrow].append(np.array([
                row[balind],
                row[sconsi], 
                row[grconsi],
                row[grmsi],
                row[gmaxi],
                row[rrmsi],
                row[rmaxi],
                (row[rrmsi] + row[grmsi])/2
            ]))
            binned_stds[aggrow].append(np.array([
                devs[irow][balind],
                devs[irow][sconsi], 
                devs[irow][grconsi],
                devs[irow][grmsi],
                devs[irow][gmaxi],
                devs[irow][rrmsi],
                devs[irow][rmaxi],
                fulldevs[irow][-1]  # calculate the deviation of the mean separately
            ]))

#             #scons, grcons, grms, gmax, rrms, rmax
#             if comp[0] < row[sconsi]:
#                 ops[aggrow, 0] = row[sconsi]
#                 opsacc[aggrow, 0] = acc
#                 params[aggrow, 0] = exp_descriptor

#             if comp[1] < row[grconsi]:
#                 ops[aggrow, 1] = row[grconsi]
#                 opsacc[aggrow, 1] = acc
#                 params[aggrow, 1] = exp_descriptor

#             if comp[2] > row[grmsi]:
#                 ops[aggrow, 2] = row[grmsi]
#                 opsacc[aggrow, 2] = acc
#                 params[aggrow, 2] = exp_descriptor

#             if comp[3] > row[gmaxi]:
#                 ops[aggrow, 3] = row[gmaxi]
#                 opsacc[aggrow, 3] = acc
#                 params[aggrow, 3] = exp_descriptor

#             if comp[4] > row[rrmsi]:
#                 ops[aggrow, 4] = row[rrmsi]
#                 opsacc[aggrow, 4] = acc
#                 params[aggrow, 4] = exp_descriptor

#             if comp[5] > row[rmaxi]:
#                 ops[aggrow, 5] = row[rmaxi]
#                 opsacc[aggrow, 5] = acc
#                 params[aggrow, 5] = exp_descriptor

#             if comp[6] > (row[rrmsi] + row[grmsi])/2:
#                 ops[aggrow, 6] = (row[rrmsi] + row[grmsi])/2
#                 opsacc[aggrow, 6] = acc
#                 params[aggrow, 6] = exp_descriptor

In [25]:
num_files_processed

10800

### Script to work with the saved data


Load the saved averages or standard deviations without needing to process all of the tensorboard data.  This runs a lot faster, but you need to tweak it to get what you want out of it.


In [None]:
#%%time
# Walk over the saved per-configuration text files (seed averages and
# standard deviations) instead of the raw tensorboard data, and count how many
# rows pass the conditions and fall into the accuracy bins.
# Requires load_data_from_params to be defined before this cell is run.
num_files_processed = 0
looked_at = 0       # rows that pass `conds` and fall into a bin
not_looked_at = 0   # rows that pass `conds` but lie outside the binned range
stupid = 0          # rows rejected by `conds`
exit = False        # set to True to stop after the first configuration
numRows = 200
numCols = 27

tseeds = seeds
nseeds = len(tseeds)

dayta2 = []
#stddat = []

# OUTER LOOP: over hyperparameter selections
for pack in product(*hypers):
    values = list(pack)
    #values.append(seed)

    # INNER LOOP: over seeds to collect the averages
    #for seed in tseeds:

    ## SET UP VARIABLES
    if use_dual:
        (
            eps,
            max_depth,
            eta,
            min_child_weight,
            lambda_reg,
            scale_pos_weight
        ) = pack[:6]

    else:
        (
            eps,
            max_depth,
            eta,
            min_child_weight,
            lambda_reg,
            scale_pos_weight,
            gamma_reg
        ) = pack[:7]

    if use_sgd:
        (
            init,
            epoch,
            momentum,
            lr
        ) = pack[-4:]

    # GET DIRECTORY AND LIST OF FILEs
    # Descriptor entries look like 'name:value'.
    exp_descriptor = [':'.join([n, str(v)]) for n, v in zip(names, values)]

    #exp_name = path + '_'.join(exp_descriptor) + '_result:%d/' % seed
    #print(exp_name)
    #onlyfiles = [f for f in listdir(exp_name) if isfile(join(exp_name, f))]

    # right now: get binned standard deviations
    savepath = path + 'txt/devs-' + '_'.join(exp_descriptor)
    savepath += '.dat'
    print(savepath)

    currAves = load_data_from_params(exp_descriptor, filepath = path + 'txt/aves-')
    currDevs = load_data_from_params(exp_descriptor, filepath = path + 'txt/devs-')
    num_files_processed += 1

    # The devs files written by the tensorboard-processing loop carry one
    # extra trailing column (std of the averaged RMS gap), so accept both
    # numCols and numCols + 1 columns.
    if (currDevs.ndim != 2
            or currDevs.shape[0] != numRows
            or currDevs.shape[1] not in (numCols, numCols + 1)):
        print("ERROR: unexpected shape {} in {} (expected {} rows and {} or {} columns)".format(
            currDevs.shape, savepath, numRows, numCols, numCols + 1))
        break

    #dayta2.append(currAves)

    if exit: break

    for irow, row in enumerate(currDevs):
        #if currAves[irow][sconsi] > 0.93: looked_at += 1
        if not testConds(currAves[irow], conds):
            stupid += 1
            continue
        acc = currAves[irow][balind]
        aggrow = whichBin(acc, bins, binsize)
        if aggrow < 0:
            # Outside the binned range (too low or too high).  Counting both
            # cases keeps stupid + looked_at + not_looked_at equal to the
            # total number of rows.
            not_looked_at += 1
        else:
            looked_at += 1
#             binned_stds[aggrow].append(np.array([
#                 row[balind],
#                 row[sconsi], 
#                 row[grconsi],
#                 row[grmsi],
#                 row[gmaxi],
#                 row[rrmsi],
#                 row[rmaxi],
#                 (row[rrmsi] + row[grmsi])/2
#             ]))


In [325]:
num_files_processed

1080

In [29]:
# Turn each bin's list of rows into a single array (in place).
for i in range(len(binned_params)):
    binned_params[i], binned_values[i], binned_stds[i] = (
        np.array(binned_params[i]),
        np.array(binned_values[i]),
        np.array(binned_stds[i]),
    )

In [348]:
stupid + looked_at + not_looked_at - 200*1080

0

In [31]:
import pickle

In [32]:
# filename = 'binvals-adult-full.pkl'
# outfile = open(filename, 'wb')
# pickle.dump(binned_values, outfile)
# outfile.close()

In [33]:
# filename = 'binparms-adult-full.pkl'
# outfile = open(filename, 'wb')
# pickle.dump(binned_params, outfile)
# outfile.close()

In [34]:
# filename = 'binstds-adult-full.pkl'
# outfile = open(filename, 'wb')
# pickle.dump(binned_stds, outfile)
# outfile.close()

In [152]:
# binned_values = pickle.load( open('binvals-adult.pkl', 'rb') )
# binned_params = pickle.load( open('binparms-adult.pkl', 'rb') )

In [39]:
names

['eps',
 'depth',
 'eta',
 'weight',
 'lamb',
 'pos',
 'init',
 'epoch',
 'momentum',
 'lr']

In [40]:
# Names of the columns of each binned_values row, in the order in which the
# processing loop stores them; restrict_values looks columns up by these names.
quantities = [ 'balacc', 'scons', 'grcons', 'grms', 'gmax', 'rrms', 'rmax', 'averms' ]

In [35]:
from copy import deepcopy

### Some scripts to restrict the binned_values and binned_params by a given condition

These let us explore the results and plot specific regions only.  For example, we may not want to consider points whose GR consistency is too low.

In [41]:
# Restrict the parameters that we are currently looking at

def restrict(name, values, binned_params, binned_values, binned_stds=None, res=False):
    """Keep only the binned points whose hyperparameter `name` is in `values`.

    Parameters
    ----------
    name : str
        Hyperparameter name; its column is looked up in the global `names`.
    values : array-like
        Accepted values for that hyperparameter.
    binned_params, binned_values : list of arrays
        Per-bin parameter rows and value rows (same row order within a bin).
    binned_stds : list of arrays, optional
        Per-bin standard-deviation rows, filtered alongside when given.
    res : bool
        When True, additionally require column 1 of the value rows to be
        at least 0.93.

    Returns
    -------
    (params, values) or (params, values, stds)
        Filtered copies; the inputs are not modified.  Empty bins are copied
        unchanged.
    """
    col = names.index(name)
    with_stds = binned_stds is not None

    nbp = deepcopy(binned_params)
    nbv = deepcopy(binned_values)
    nbs = deepcopy(binned_stds) if with_stds else None

    for b, bin_params in enumerate(binned_params):
        if bin_params.shape[0] <= 0:
            continue

        mask = np.isin(bin_params[:, col], values)
        if res:
            mask = np.logical_and(binned_values[b][:, 1] >= 0.93, mask)

        nbp[b] = bin_params[mask]
        nbv[b] = binned_values[b][mask]
        if with_stds:
            nbs[b] = binned_stds[b][mask]

    return (nbp, nbv, nbs) if with_stds else (nbp, nbv)

In [42]:
# Restrict the values that we are currently looking at

def restrict_values(quan, cond, binned_params, binned_values, binned_stds=None):
    """Keep only the binned points whose quantity `quan` satisfies `cond`.

    Parameters
    ----------
    quan : str
        Quantity name; its column is looked up in the global `quantities`.
    cond : callable
        Called with a single value; a truthy result keeps the row.
    binned_params, binned_values : list of arrays
        Per-bin parameter rows and value rows (same row order within a bin).
    binned_stds : list of arrays, optional
        Per-bin standard-deviation rows, filtered alongside when given.

    Returns
    -------
    (params, values) or (params, values, stds)
        Filtered copies; the inputs are not modified.  Empty bins are copied
        unchanged.
    """
    col = quantities.index(quan)
    with_stds = binned_stds is not None

    nbp = deepcopy(binned_params)
    nbv = deepcopy(binned_values)
    nbs = deepcopy(binned_stds) if with_stds else None

    for b, bin_params in enumerate(binned_params):
        if bin_params.shape[0] <= 0:
            continue

        keep = np.array([cond(entry) for entry in binned_values[b][:, col]])

        nbp[b] = bin_params[keep]
        nbv[b] = binned_values[b][keep]
        if with_stds:
            nbs[b] = binned_stds[b][keep]

    return (nbp, nbv, nbs) if with_stds else (nbp, nbv)

In [418]:
nbp, nbv, nbs = restrict_values('scons', lambda x: x > 0.94, binned_params, binned_values, binned_stds)

In [44]:
stuff = []
# Number of bins that still contain at least one point after restriction.
nzrows = sum(1 for bin_vals in nbv if bin_vals.shape[0] > 0)
#         for row in nbv[i]: 
# #             if row[1] < 0.96:
# #                 print("Badness 10000")

In [45]:
nzrows

44

In [46]:
def calc_obj(binned_values, binned_params, binned_stds=None):
    """Pick, for every bin, the best point for each of the seven objectives.

    The value rows are expected to have eight columns: column 0 is the
    accuracy, columns 1-2 are maximised and columns 3-7 are minimised.

    Parameters
    ----------
    binned_values : list of arrays
        Per-bin value rows.
    binned_params : list of arrays
        Per-bin parameter rows, in the same row order as `binned_values`.
    binned_stds : list of arrays, optional
        Per-bin standard-deviation rows with the same layout as the value
        rows.  When omitted, no standard deviations are returned (this
        mirrors the optional `binned_stds` of restrict / restrict_values).

    Returns
    -------
    ret_val : (n_bins, 7) array
        Optimal value of each objective in each bin.
    acc_val : (n_bins, 7) array
        Accuracy of the point that attains the optimum.
    par_val : (n_bins, 7, n_par) array
        Parameter row of the point that attains the optimum.
    std_val : (n_bins, 7) array
        Standard deviation of the optimal value; only returned when
        `binned_stds` is given.

    Rows belonging to empty bins stay zero.
    """
    n_bins = len(binned_values)
    n_obj = 7
    cons_cols = [1, 2]           # maximised columns
    gap_cols = [3, 4, 5, 6, 7]   # minimised columns

    # Width of a parameter row: taken from the first non-empty bin, falling
    # back to the historical value of 11 when every bin is empty.
    n_par = 11
    for bin_params in binned_params:
        if np.ndim(bin_params) == 2 and np.shape(bin_params)[0] > 0:
            n_par = np.shape(bin_params)[1]
            break

    ret_val = np.zeros((n_bins, n_obj))
    acc_val = np.zeros((n_bins, n_obj))
    par_val = np.zeros((n_bins, n_obj, n_par))
    std_val = None if binned_stds is None else np.zeros((n_bins, n_obj))

    for i in range(n_bins):
        bin_vals = binned_values[i]
        if bin_vals.shape[0] == 0:
            continue

        # Row index of the best point for every objective column.
        consinds = np.argmax(bin_vals[:, 1:3], axis=0)
        gapinds = np.argmin(bin_vals[:, 3:], axis=0)

        ret_val[i, :2] = bin_vals[consinds, cons_cols]
        ret_val[i, 2:] = bin_vals[gapinds, gap_cols]

        acc_val[i, :2] = bin_vals[consinds, 0]
        acc_val[i, 2:] = bin_vals[gapinds, 0]

        par_val[i, :2] = binned_params[i][consinds]
        par_val[i, 2:] = binned_params[i][gapinds]

        if std_val is not None:
            std_val[i, :2] = binned_stds[i][consinds, cons_cols]
            std_val[i, 2:] = binned_stds[i][gapinds, gap_cols]

    if std_val is None:
        return ret_val, acc_val, par_val
    return ret_val, acc_val, par_val, std_val

In [419]:
vals, accs, parms, stds = calc_obj(nbv, nbp, nbs)

# Scripts for processing baselines

These are similar to the scripts for processing the BuDRO data.

In [363]:
# Load the data that you would like to look at for baselines.
# `vanilla` holds the baseline results (columns ordered like `basenames`) and
# `vanillastd` the matching standard deviations; both are indexed with the
# same row indices below.
vanilla = np.load(bpath + "Rw.npy")
vanillastd = np.load(bspath + "Rw_std.npy")

In [364]:
vanilla[:,6].max()

0.8386703691999999

Bin the vanilla parameters by index since we now have an order.  We also bin without conditions to keep things simpler (conditions are easy to enforce afterwards).

We are currently ignoring the average RMS since we cannot get the std for that

In [2]:
# Parameters for our graphs - NOTE CURRENTLY USE STANDARD BINSIZE
# This is only for the baseline - projecting and reweighing should use the bins from the original BuDRO processing.
# This cell needs numpy and `binsize` from the bin-setup cell above.
# NOTE(review): the baseline binning cells that follow use `bins`, not
# `vbins` -- confirm whether `vbins` is still needed.
vlowacc = .79
vhighacc = .85
#vbinsize = (vhighacc-vlowacc)/numbins
vbins = np.arange(start=vlowacc, stop=vhighacc, step=binsize)

NameError: name 'np' is not defined

In [365]:
# One (initially empty) list of baseline row indices per accuracy bin.
vanilla_bins = [[] for _ in bins]

In [366]:
# Sort the baseline rows into the accuracy bins by balanced accuracy, storing
# row indices only.  `seen` counts binned rows, `unseen` rows outside the
# binned range.
seen, unseen = 0, 0

for irow, row in enumerate(vanilla):
    acc = row[fbalind]
    aggrow = whichBin(acc, bins, binsize)
    if aggrow >= 0:
        seen += 1
        vanilla_bins[aggrow].append(irow)
    else:
        unseen += 1

In [367]:
# Turn each bin's list of row indices into an array (in place).
for irow, bin_rows in enumerate(vanilla_bins):
    vanilla_bins[irow] = np.array(bin_rows)

In [368]:
seen

7200

In [369]:
# Restrict the values that we are currently looking at

def fget_values(quan, cond, binned_inds, values, stds):
    """Collect the baseline rows of every bin whose `quan` satisfies `cond`.

    Parameters
    ----------
    quan : str
        Column name; its index is looked up in the global `basenames`.
    cond : callable
        Called with a single value; a truthy result keeps the row.
    binned_inds : list of arrays
        Per-bin arrays of row indices into `values` / `stds`.
    values, stds : 2-D arrays
        Baseline results and their standard deviations (same row order).

    Returns
    -------
    nbi, nbv, nbs : lists with one entry per bin
        Kept row indices, the matching rows of `values`, and the matching
        rows of `stds`.  Empty bins yield empty arrays.  All three lists have
        the same length as `binned_inds`, which is left unmodified.
    """
    ind = basenames.index(quan)

    nbi = []
    nbv = []
    nbs = []

    for bin_inds in binned_inds:
        if bin_inds.shape[0] > 0:
            currVals = values[bin_inds]
            mask = np.array([cond(val) for val in currVals[:, ind]])

            nbi.append(bin_inds[mask])
            nbv.append(currVals[mask])
            nbs.append(stds[bin_inds][mask])
        else:
            # Exactly one entry per bin in every list, so that the three
            # outputs stay aligned.
            nbi.append(np.array(bin_inds))
            nbv.append(np.array([]))
            nbs.append(np.array([]))

    return nbi, nbv, nbs

In [370]:
fbi, fbv, fbs = fget_values('scons', lambda x: x > 0.7, vanilla_bins, vanilla, vanillastd)

In [372]:
stuff = []
# Number of bins that contain at least one baseline point.  Iterate over
# `fbv` itself: the loop bound previously came from `nbv`, a different list.
nzrows = sum(1 for bin_vals in fbv if bin_vals.shape[0] > 0)
#         for row in nbv[i]: 
# #             if row[1] < 0.96:
# #                 print("Badness 10000")

In [373]:
nzrows

33

In [374]:
def fcalc_obj(binned_inds, binned_values, binned_stds):
    """Pick, for every bin, the best baseline point for each of six objectives.

    The columns of interest are selected through the global indices
    fbalind, fsconsi, fgrconsi, fgrmsi, fgmaxi, frrmsi and frmaxi: the first
    is the accuracy, the next two are maximised, the last four minimised.

    Parameters
    ----------
    binned_inds : list of arrays
        Per-bin row indices of the points.
    binned_values, binned_stds : list of arrays
        Per-bin full result rows and standard-deviation rows.

    Returns
    -------
    ret_inds, ret_val, acc_val, std_val : (n_bins, 6) arrays
        Row index, optimal value, accuracy and standard deviation of the
        optimal point per objective.  Rows of empty bins stay zero.
    """
    n_bins = len(binned_values)
    ret_inds, ret_val, acc_val, std_val = (np.zeros((n_bins, 6)) for _ in range(4))

    interest = np.array([fbalind, fsconsi, fgrconsi, fgrmsi, fgmaxi, frrmsi, frmaxi])
    cons_cols = [1, 2]        # positions within `interest` that are maximised
    gap_cols = [3, 4, 5, 6]   # positions within `interest` that are minimised

    for b in range(n_bins):
        if binned_values[b].shape[0] <= 0:
            continue

        picked = binned_values[b][:, interest]

        best_cons = np.argmax(picked[:, 1:3], axis=0)
        best_gap = np.argmin(picked[:, 3:], axis=0)

        ret_inds[b, :2] = binned_inds[b][best_cons]
        ret_inds[b, 2:] = binned_inds[b][best_gap]

        ret_val[b, :2] = picked[best_cons, cons_cols]
        ret_val[b, 2:] = picked[best_gap, gap_cols]

        acc_val[b, :2] = picked[best_cons, 0]
        acc_val[b, 2:] = picked[best_gap, 0]

        # The std rows keep the full column layout, so map back via `interest`.
        std_val[b, :2] = binned_stds[b][best_cons, interest[cons_cols]]
        std_val[b, 2:] = binned_stds[b][best_gap, interest[gap_cols]]

    return ret_inds, ret_val, acc_val, std_val

In [375]:
# Baseline column indices of accuracy followed by the six plotted quantities
# (same order as used inside fcalc_obj).
interest = np.array([fbalind, fsconsi, fgrconsi, fgrmsi, fgmaxi, frrmsi, frmaxi])

In [376]:
vinds, vvals, vaccs, vstds = fcalc_obj(fbi, fbv, fbs)

In [377]:
# Total number of baseline points kept across all bins.
tot = sum(thing.shape[0] for thing in fbi)

tot

7200

Explore some of these data

In [283]:
mask = vaccs[:,2] != 0

In [284]:
vinds[:,2][mask]

array([   0.,  101.,  600.,  700., 1800.,  103., 1802., 1952., 3851.,
       3601., 3602., 2056., 2157., 3450., 2401., 2801., 5550., 1157.,
        959.,  910.,  711.,  864., 1067.,  619.,  622., 1662., 1215.,
       1218.,  846.,  849., 1396., 1747., 4488.])

In [285]:
vaccs[:,0][mask]

array([0.78321605, 0.78501378, 0.78751257, 0.78960807, 0.79135505,
       0.79325285, 0.79442808, 0.79520238, 0.80140245, 0.80228587,
       0.80478091, 0.80480168, 0.80733044, 0.80955955, 0.80960739,
       0.81135191, 0.8138907 , 0.81579746, 0.81674892, 0.81864923,
       0.81921728, 0.82211821, 0.82300158, 0.82412022, 0.82640525,
       0.82870542, 0.82964115, 0.83068627, 0.8320844 , 0.83366053,
       0.83522358, 0.83741737, 0.83844952])

In [286]:
vstds[:,2][mask]

array([0.05517623, 0.03155663, 0.02485135, 0.02468848, 0.02305336,
       0.03151132, 0.02305704, 0.02250387, 0.01910143, 0.02041007,
       0.02040313, 0.0220585 , 0.02159554, 0.02038217, 0.0203368 ,
       0.02028065, 0.0149806 , 0.02364736, 0.02406813, 0.02418018,
       0.02455941, 0.02428719, 0.0238893 , 0.02474724, 0.0247515 ,
       0.02317756, 0.0236355 , 0.02363712, 0.02440577, 0.02440647,
       0.02350755, 0.02311765, 0.0160256 ])

In [293]:
vanilla[vinds[:,2][mask].astype('int')][:,fsconsi]

array([1.        , 1.        , 0.89315644, 0.90775014, 0.85385296,
       0.951089  , 0.86386954, 0.86442233, 0.8502377 , 0.84501935,
       0.86129353, 0.89378662, 0.90323936, 0.89273632, 0.88982863,
       0.88988391, 0.85103372, 0.80736318, 0.8019016 , 0.80348259,
       0.80090658, 0.80077391, 0.79927032, 0.79607518, 0.79646213,
       0.80202322, 0.80448867, 0.80689884, 0.80964069, 0.81063571,
       0.828911  , 0.82983969, 0.87043671])

Check to make sure that we are collecting everything correctly

In [30]:
# Every bin must hold as many std rows as value rows.
for i in range(len(binned_params)):
    n_values = binned_values[i].shape[0]
    n_stds = binned_stds[i].shape[0]
    if n_values != n_stds:
        print("Wrong stds")

# Plot some stuff

In [48]:
import plotly
import plotly.graph_objs as go
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

In [49]:
import plotly.io as pio

In [50]:
def load_data_from_params(exp_descriptor, filepath = path + 'txt/'):
    """Load the saved text table that belongs to one hyperparameter setting.

    Parameters
    ----------
    exp_descriptor : sequence
        Either ready-made 'name:value' strings, or raw hyperparameter values
        in the order of the global `names`.  In the latter case the entries
        at positions 1 and 7 are converted to int before formatting.
    filepath : str
        Prefix put in front of the joined descriptor (directory plus optional
        file-name prefix).

    Returns
    -------
    numpy array read with np.loadtxt from ``filepath + descriptor + '.dat'``.
    """
    if type(exp_descriptor[0]) is not str:
        int_positions = (1, 7)
        parts = [
            n + ':' + str(int(v) if i in int_positions else v)
            for i, (n, v) in enumerate(zip(names, exp_descriptor))
        ]
    else:
        parts = exp_descriptor

    return np.loadtxt(filepath + '_'.join(parts) + '.dat')

In [51]:
# Smoothing helper (called by the plotting code below).
def running_ave(arr, num):
    """Centered moving average of a 1-D array.

    Element i of the result is the mean of arr[i-num : i+num+1], with the
    window clipped at both ends of the array.
    """
    n = arr.shape[0]
    smoothed = np.zeros(n)

    for idx in range(n):
        lo = max(idx - num, 0)
        hi = min(idx + num + 1, n)
        smoothed[idx] = arr[lo:hi].mean()

    return smoothed

The following cells select hyperparameters to use on the test seeds.  See also `adult-params-highacc.txt` for some of the hyperparameters selected in this way.

In [98]:
nbp, nbv = restrict_values('scons', lambda x: x > 0.95, binned_params, binned_values)

In [99]:
nbp, nbv = restrict_values('grcons', lambda x: x > 0.95, nbp, nbv)

In [100]:
stuff = []  # placeholder; the plotting cells further down assign the real value

# Count the entries of nbv that still contain at least one row after the
# 'scons' and 'grcons' restrictions applied in the two cells above.
nzrows = sum(1 for kept in nbv if kept.shape[0] > 0)

In [101]:
nzrows

44

In [102]:
vals, accs, parms = calc_obj(nbv, nbp)

Plots for the vanilla (baseline) methods.  The average RMS is not looked at here.

In [378]:
# Plot one fairness metric against balanced accuracy for a baseline method and,
# when `corr` is True, add the remaining metrics taken from the same rows.
# NOTE: `smooth` and `layout` are defined in later cells of this notebook and
# must already exist in the kernel when this cell runs.
plot = 'grms'   # metric shown as the main trace; must be one of `plots`
corr = True     # also draw the other metrics
plots = ['scons', 'grcons', 'grms', 'gmax', 'rrms', 'rmax']
plotnames = ['S-cons', 'GR-cons', 'Gender gap RMS', 'Gender gap MAX', 'Race gap RMS', 'Race gap MAX']
vcolInds = interest[1:]

plotaccs = vaccs
plotops = vvals
plotstds = vstds
plotinds = vinds

otherInds = list(range(len(plots)))
# Compare strings by value: `is` tests object identity and only works by
# accident of string interning.
if plot == 'averms':
    # Previously this branch only printed and then continued with an
    # undefined (or stale) plotInd; fail loudly instead.
    raise NotImplementedError("AveRMS not implemented for baseline methods")
plotInd = plots.index(plot)
otherInds.remove(plotInd)

# get indices that we want to plot (entries with a positive accuracy)
inds = np.where(plotaccs[:,plotInd] > 0)[0]

# this is the main plot
xaccs = plotaccs[:,plotInd][inds]
yfair = plotops[:,plotInd][inds]
yerr = plotstds[:,plotInd][inds]

if plotInd < len(plots):
    traces0 = [
        go.Scatter(
            x=xaccs,
            y=running_ave(yfair,smooth),
            error_y = dict(
                type='data',
                array = yerr,
                visible=True,
            ),
            mode='lines+markers',
            name=plotnames[plotInd])
    ]
else: traces0 = []

traces = []
# if we want to look at other stuff, get it from the values
if corr:
    # Rows of the baseline tables that produced the plotted points.
    stuff = vanilla[ plotinds[:,plotInd][inds].astype('int') ]
    stuffstd = vanillastd[ plotinds[:,plotInd][inds].astype('int') ]

    # Columns of the metrics that are not the main one.
    other = stuff[:,vcolInds[otherInds]]
    otherstd = stuffstd[:,vcolInds[otherInds]]

    for i, pl in enumerate(otherInds):
        traces += [
            go.Scatter(
                x=xaccs,
                y=running_ave(other[:,i],smooth),
                error_y = dict(
                    type='data',
                    array = otherstd[:,i],
                    visible=True,
                ),
                mode='lines+markers',
                name=plotnames[pl]
            )
        ]

    # Put the main trace back at its position in `plots` so trace order matches plotnames.
    plotTraces = traces[:plotInd] + traces0 + traces[plotInd:]

else: plotTraces = traces0

fig = go.Figure(data=plotTraces, layout=layout)
iplot(fig)

In [315]:
stuff = vanilla[ plotinds[:,plotInd][inds].astype('int') ]

In [317]:
stuff.shape

(33, 19)

Getting the scatter objects needed for the plots used in the appendix.

In [342]:
# Keep independent copies of the current main trace and of the full trace list,
# so re-running the plotting cell does not change them.
# NOTE(review): presumably taken right after the baseline plot cell was run on the
# vanilla results; `deepcopy` is not imported in the visible cells -- confirm both.
van = deepcopy(traces0)
allvan = deepcopy(plotTraces)

In [361]:
# Independent copies of the current main trace and full trace list.
# NOTE(review): presumably taken after re-running the baseline plot cell with the
# projecting baseline loaded into `vanilla`/`vaccs`/... -- confirm.
proj = deepcopy(traces0)
allproj = deepcopy(plotTraces)

In [379]:
# Independent copies of the current main trace and full trace list; the next cell
# renames the main trace to "Reweighing".
# NOTE(review): presumably taken after re-running the baseline plot cell with the
# reweighing baseline loaded -- confirm.
rw = deepcopy(traces0)
allrw = deepcopy(plotTraces)

In [381]:
rw[0]['name'] = "Reweighing"

Plotting the BuDRO data

In [436]:
# Plot fairness metrics against balanced accuracy for the BuDRO runs.
# `plot` is either one of `plots` (that metric becomes the main trace) or
# 'averms', which selects points via the extra column after the six metrics
# and draws all six individual metrics as companion traces.
# NOTE: `layout` is defined in a later cell and must already exist in the kernel.
plot = 'averms'
corr = True
plots = ['scons', 'grcons', 'grms', 'gmax', 'rrms', 'rmax']
plotnames = ['S-cons', 'GR-cons', 'Gender gap RMS', 'Gender gap MAX', 'Race gap RMS', 'Race gap MAX']
colInds = np.array([sconsi, grconsi, grmsi, gmaxi, rrmsi, rmaxi])

# Half-width of the running average applied to the y values (0 = no smoothing).
smooth = 0

plotaccs = accs
plotops = vals
plotstds = stds
plotparms = parms
# plotaccs = opsacc
# plotops = ops
# plotparms = params

# Get index of fair metric we want to plot, plus other indices
otherInds = list(range(len(plots)))
# Compare strings by value: `is` tests object identity and only works by
# accident of string interning.
if plot == 'averms':
    plotInd = len(plots)   # column 6: the slot after the six individual metrics
else:
    plotInd = plots.index(plot)
    otherInds.remove(plotInd)


# get indices that we want to plot (entries with a positive accuracy)
inds = np.where(plotaccs[:,plotInd] > 0)[0]

# this is the main plot
xaccs = plotaccs[:,plotInd][inds]
yfair = plotops[:,plotInd][inds]
yerr = plotstds[:,plotInd][inds]

if plotInd < len(plots):
    traces0 = [
        go.Scatter(
            x=xaccs,
            y=running_ave(yfair,smooth),
            error_y = dict(
                type='data',
                array = yerr,
                visible=True,
            ),
            mode='lines+markers',
            name=plotnames[plotInd])
    ]
else: traces0 = []   # 'averms' has no main trace of its own

traces = []
# if we want to look at other stuff, get it from the files
if corr:
    other = np.zeros((inds.shape[0], len(otherInds)))
    otherstd = np.zeros((inds.shape[0], len(otherInds)))

    currParams = plotparms[:,plotInd][inds]


    for i in range(inds.shape[0]):
        # Report points that have no hyperparameters recorded.
        if len(np.array(currParams[i])) == 0:
            print(i)

        # Per-step averages and standard deviations for this hyperparameter setting.
        res = load_data_from_params(currParams[i], filepath = path + 'txt/aves-')
        restd = load_data_from_params(currParams[i], filepath = path + 'txt/devs-')

        # Locate the row of the file that produced the plotted point: same
        # balanced accuracy and same value of the selection metric.
        if plotInd == len(plots):
            # 'averms': the selection value is the mean of the gender and race gap RMS.
            corrInd = np.where(
                np.logical_and(
                    res[:,balind] == xaccs[i],
                    np.abs((res[:, grmsi] + res[:, rrmsi])/2 - yfair[i]) <= 10**-8
                ))[0][0]
        else:
            corrInd = np.where(
                np.logical_and(
                    res[:,balind] == xaccs[i],
                    res[:, colInds[plotInd]] ==  yfair[i]
                ))[0][0]

        stuff = res[corrInd]
        other[i] = stuff[colInds[otherInds]]

        stuffstd = restd[corrInd]
        otherstd[i] = stuffstd[colInds[otherInds]]

    for i, pl in enumerate(otherInds):
        traces += [
            go.Scatter(
                x=xaccs,
                y=running_ave(other[:,i],smooth),
                error_y = dict(
                    type='data',
                    array = otherstd[:,i],
                    visible=True,
                ),
                mode='lines+markers',
                name=plotnames[pl]
            )
        ]

    # Put the main trace (if any) back at its position in `plots`.
    plotTraces = traces[:plotInd] + traces0 + traces[plotInd:]

else: plotTraces = traces0

fig = go.Figure(data=plotTraces, layout=layout)
iplot(fig)

In [422]:
plotTraces[3]

Scatter({
    'error_y': {'array': array([0.00833282, 0.00820268, 0.0088902 , 0.00854202, 0.01256412, 0.00634418,
                                0.00816836, 0.00841868, 0.00839925, 0.00884791, 0.00945526, 0.00938507,
                                0.00762432, 0.00821873, 0.00763702, 0.00748397, 0.00755734, 0.0083481 ,
                                0.00974687, 0.01085863, 0.00884463, 0.01114555, 0.0092872 , 0.01053772,
                                0.01002463, 0.01313935, 0.00889693, 0.01099011, 0.00991107, 0.01000412,
                                0.01040106, 0.00969966, 0.01001808, 0.00942929, 0.00926244, 0.0091891 ,
                                0.00818733, 0.00697889, 0.00823393, 0.01620153, 0.01077284, 0.00692526,
                                0.01006103, 0.00813933]),
                'type': 'data',
                'visible': True},
    'mode': 'lines+markers',
    'name': 'Gender gap MAX',
    'x': array([0.76113645, 0.76235126, 0.76443675, 0.76533298, 0.76677662, 0.7

In [401]:
traces0[0]['name'] = "BuDRO"

In [410]:
plotTraces[0]['name'] = "BuDRO"

In [413]:
allrw[0]['name'] = "Reweighing"

The figure from the appendix

In [405]:
iplot(go.Figure(data=traces0+van+proj+rw, layout=layout))

In [416]:
iplot(go.Figure(data=[plotTraces[0], allvan[0], allproj[0], allrw[0]], layout=layout))

How to find the appropriate hyperparameters corresponding to one of the points in the graphs

In [107]:
np.argmin(np.abs(xaccs - .8097))

31

In [117]:
xaccs[37]

0.8200081944465637

In [118]:
plotparms[:,2][37][-1]

162.0

In [119]:
# Rebuild the 'name:value' tokens for the hyperparameters stored at
# plotparms[:,2][37], pairing them positionally with `names`
# (same token format that load_data_from_params uses for file names).
fname = []
for i, (n, v) in enumerate(zip(names, plotparms[:,2][37])):
    # Positions 1 and 7 are cast to int before formatting.
    if i == 1 or i==7: v = int(v)
    fname.append(':'.join([n, str(v)]))

In [120]:
fname

['eps:0.3',
 'depth:14',
 'eta:0.005',
 'weight:1.0',
 'lamb:1e-06',
 'pos:0.0',
 'init:0.1',
 'epoch:200',
 'momentum:0.9',
 'lr:0.001']

In [335]:
names

['eps',
 'depth',
 'eta',
 'weight',
 'lamb',
 'pos',
 'init',
 'epoch',
 'momentum',
 'lr']

In [432]:
# Figure layout passed to the go.Figure calls in the plotting cells: CMU Serif
# font, legend hidden, no grid lines, black axis lines mirrored with their ticks,
# and a transparent plot background.  The commented-out `range` / `tick0` /
# `dtick` entries are alternative axis settings.
# NOTE: the plotting cells reference `layout`, so this cell has to be executed
# before them even though it appears later in the notebook.
layout = go.Layout(
    #colorway = [colorMap[meth] for meth in ['rankCorr'] + meths],
    font = dict(
            family='CMU Serif'
    ),
    showlegend=False,
    xaxis=dict(
        title='Balanced accuracy',
        showgrid=False,
        ticks='inside',
        showline=True,
        linecolor='black',
        mirror='ticks',
        #range=[10,170],
        #range=[0,420],
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    yaxis=dict(
        title='Gap size',
        showgrid=False,
        ticks='inside',
        showline=True,
        linecolor='black',
        mirror='ticks',
        #range=[0.03,0.16], # rfc zoom
        #range=[0.04,0.16], # ncc zoom
        #range=[0.74,1.00], # SCONS
        #tick0=0.77, # rfc mcc tics
        #dtick=0.02,
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=20,
            color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    ),
    margin=go.layout.Margin(
        l=100,
        r=10,
        b=90,
        t=10,
        #pad=4
    ),
    plot_bgcolor='rgba(0,0,0,0)'
)