In [1]:
import numpy as np
import copy
import os

## Read in Adult data from fair_bench/data

In [2]:
os.getcwd()

'/nas/longleaf/home/xianwen/fairness/fair_bench/laftr'

In [3]:
# Make the fair_bench directory the working directory; the relative
# './data/...' and './laftr/...' paths in the following cells depend on it.
# NOTE(review): this absolute path is machine-specific -- adjust before re-running elsewhere.
os.chdir('/nas/longleaf/home/xianwen/fairness/fair_bench')

In [4]:
from data.process_data import get_adult_data

# Load the Adult train/test frames together with the list of feature names
# and the name of the label column. The path is relative to the working directory.
train_df, test_df, feature_names, label_column = get_adult_data('./data/adult')

In [5]:
train_df.shape

(32561, 123)

In [6]:
test_df.shape

(16281, 123)

In [7]:
feature_names

['workclass_?',
 'workclass_Federal-gov',
 'workclass_Local-gov',
 'workclass_Never-worked',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'education_10th',
 'education_11th',
 'education_12th',
 'education_1st-4th',
 'education_5th-6th',
 'education_7th-8th',
 'education_9th',
 'education_Assoc-acdm',
 'education_Assoc-voc',
 'education_Bachelors',
 'education_Doctorate',
 'education_HS-grad',
 'education_Masters',
 'education_Preschool',
 'education_Prof-school',
 'education_Some-college',
 'marital_status_Divorced',
 'marital_status_Married-AF-spouse',
 'marital_status_Married-civ-spouse',
 'marital_status_Married-spouse-absent',
 'marital_status_Never-married',
 'marital_status_Separated',
 'marital_status_Widowed',
 'occupation_?',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-

In [8]:
np.mean(train_df['gender_Female']+train_df['gender_Male'])

1.0

In [9]:
np.mean(test_df['gender_Female']+test_df['gender_Male'])

1.0

In [10]:
train_df[label_column].unique()

array([0, 1])

In [11]:
test_df[label_column].unique()

array([0, 1])

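**What we do:**
* Encode Y as two columns, `[1 - y, y]`
* Keep A as a single column
* Standardize all the columns in X (subtract the training mean, divide by the training standard deviation)
* Get indices for the training and validation sets
* Save X, Y, A, train_inds, and valid_inds as npz.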

### Align Data

In [12]:
def whiten(X, mn, std, EPS=1e-8):
    """Standardize ``X`` with the supplied per-column statistics.

    ``mn`` and ``std`` are repeated once per row of ``X``. The repeated
    ``std`` is clipped from below at ``EPS`` so that a zero standard
    deviation does not lead to a division by zero.

    Returns the element-wise result of ``(X - mn) / max(std, EPS)``.
    """
    row_reps = (X.shape[0], 1)
    centered = X - np.tile(mn, row_reps)
    safe_scale = np.maximum(np.tile(std, row_reps), EPS)
    return np.divide(centered, safe_scale)

def centralization(X_train, X_test=None):
    """Standardize the training data and, optionally, the test data.

    The per-column mean and standard deviation are computed on ``X_train``
    only and reused for ``X_test``, so both sets go through the same
    transformation.

    Returns a pair ``(X_train_whiten, X_test_whiten)``; the second entry is
    ``None`` when no test data is given.
    """
    train_mean = np.mean(X_train, axis=0)
    train_std = np.std(X_train, axis=0)

    train_scaled = whiten(X_train, train_mean, train_std)
    if X_test is None:
        return train_scaled, None
    return train_scaled, whiten(X_test, train_mean, train_std)

In [13]:
def split_senstitive_attrs(df, label_name, sensitive_attrs):
    """
    Separate a data frame into features, labels and sensitive columns.

    The input frame is left untouched; all columns are removed from a deep copy.

    Input:
        df: data frame holding features, sensitive attributes and the label
        label_name: name of the label column
        sensitive_attrs: list of sensitive attribute column names

    Output:
        nonsenstitive_df: copy of df without the sensitive columns that were
            found and without the label column
        labels: label values as a numpy array
        control_dicn: {'sensitive_attr': values as numpy array, ...}; names
            missing from df are reported with a printed warning and skipped
    """
    features = copy.deepcopy(df)
    known_columns = df.columns

    # pull every sensitive column that exists out of the copy
    sensitive_values = {}
    for name in sensitive_attrs:
        if name in known_columns:
            sensitive_values[name] = features.pop(name).to_numpy()
            continue
        print("Warning: Column {} does NOT exist!".format(name))

    # the label column is removed last
    label_values = features.pop(label_name).to_numpy()

    return features, label_values, sensitive_values


In [14]:
def process_train_test_df_laftr(train_df, test_df, label_name,
                                senstitive_attrs, indicators,
                                valid_ratio=0.2, seed=0,
                                central=False):
    """
    Turn the training and testing data frames into the X, Y and A arrays.

    Input:
        train_df, test_df: data frames with features, sensitive columns and the label
        label_name: name of the label column
        senstitive_attrs: sensitive columns to strip from the features
        indicators: the sensitive columns retained as the group matrix A.
            Usually the group that is potentially discriminated against
            (e.g. Female), as by default A=1 indicates the protected group.
        valid_ratio: fraction of training rows assigned to the validation split
        seed: value passed to np.random.seed before the row indices are shuffled
        central: when True, standardize x with the training-set statistics

    Output:
        x_train, y_train, a_train, x_test, y_test, a_test, train_inds, valid_inds
        where y_* has the two columns [1 - y, y], a_* has one column per
        indicator, and train_inds / valid_inds are disjoint row indices
        into x_train.
    """
    def to_group_matrix(control):
        # one float column per retained indicator attribute
        return np.array([np.array(control[name], dtype=float) for name in indicators]).T

    def to_two_column(labels):
        # dummy encoding: [1 - y, y] side by side
        positive = np.array(labels, dtype=float)
        return np.array([1 - positive, positive]).T

    train_feats, train_labels, train_control = split_senstitive_attrs(train_df, label_name, senstitive_attrs)
    test_feats, test_labels, test_control = split_senstitive_attrs(test_df, label_name, senstitive_attrs)

    a_train = to_group_matrix(train_control)
    a_test = to_group_matrix(test_control)

    y_train = to_two_column(train_labels)
    y_test = to_two_column(test_labels)

    # features as float arrays, optionally standardized
    x_train = np.array(train_feats, dtype=float)
    x_test = np.array(test_feats, dtype=float)
    if central:
        x_train, x_test = centralization(x_train, x_test)

    # shuffle the training row indices; the first chunk becomes the validation set
    np.random.seed(seed)
    n_train = x_train.shape[0]
    shuffled = np.random.permutation(n_train)
    n_valid = int(n_train * valid_ratio)
    valid_inds, train_inds = shuffled[:n_valid], shuffled[n_valid:]

    return x_train, y_train, a_train, x_test, y_test, a_test, train_inds, valid_inds

In [15]:
# Build the arrays for Adult. The label column name comes from the loader
# (label_column) instead of a hard-coded 'label', mirroring the COMPAS call.
# Both gender columns are stripped from X; only gender_Female is kept as A.
x_train, y_train, a_train, x_test, y_test, a_test, train_inds, valid_inds = process_train_test_df_laftr(
    train_df, test_df, label_column,
    ['gender_Male', 'gender_Female'], ['gender_Female'],
    valid_ratio=0.2, seed=0,
    central=True)

In [16]:
x_train.shape

(32561, 120)

In [17]:
y_train.shape

(32561, 2)

In [18]:
a_train.shape

(32561, 1)

In [19]:
train_inds.shape

(26049,)

In [20]:
# Write the processed Adult arrays into a single .npz archive.
# The path is relative, so it resolves against the current working directory.
f_out_np = './laftr/data/adult/adult.npz'
np.savez(
    f_out_np,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    attr_train=a_train,
    attr_test=a_test,
    train_inds=train_inds,
    valid_inds=valid_inds,
)

### Process COMPAS

In [21]:
from data.process_data import get_compas_data

# Load the COMPAS train/test frames together with the list of feature names
# and the name of the label column. This rebinds train_df, test_df,
# feature_names and label_column, which held the Adult data until here.
train_df, test_df, feature_names, label_column = get_compas_data('./data/compas')

In [22]:
feature_names

['days_b_screening_arrest',
 'c_charge_degree_F',
 'c_charge_degree_M',
 'race_African-American',
 'race_Caucasian',
 'score_text_High',
 'score_text_Low',
 'score_text_Medium',
 'sex_Female',
 'sex_Male',
 'age_0',
 'age_1',
 'age_2',
 'age_3',
 'age_4',
 'age_5',
 'decile_score_0',
 'decile_score_1',
 'decile_score_2',
 'decile_score_3',
 'decile_score_4',
 'decile_score_5',
 'priors_count_0.0',
 'priors_count_1.0',
 'priors_count_2.0',
 'priors_count_3.0',
 'priors_count_4.0']

In [23]:
np.mean(train_df['race_African-American']+train_df['race_Caucasian'])

1.0

In [24]:
np.mean(test_df['race_African-American']+test_df['race_Caucasian'])

1.0

In [25]:
label_column

'two_year_norecid'

In [26]:
train_df[label_column].unique()

array([0., 1.])

In [27]:
test_df[label_column].unique()

array([0., 1.])

In [28]:
# Build the arrays for COMPAS. Both race columns are stripped from X;
# only race_African-American is kept as A.
x_train, y_train, a_train, x_test, y_test, a_test, train_inds, valid_inds = process_train_test_df_laftr(
    train_df, test_df, label_column,
    ['race_Caucasian', 'race_African-American'], ['race_African-American'],
    valid_ratio=0.2, seed=0,
    central=True)

In [29]:
x_train.shape

(3483, 25)

In [30]:
y_train.shape

(3483, 2)

In [31]:
a_train.shape

(3483, 1)

In [32]:
train_inds.shape

(2787,)

In [33]:
# Write the processed COMPAS arrays into a single .npz archive.
# The path is relative, so it resolves against the current working directory.
f_out_np = './laftr/data/compas/compas.npz'
np.savez(
    f_out_np,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    attr_train=a_train,
    attr_test=a_test,
    train_inds=train_inds,
    valid_inds=valid_inds,
)

## Double Check

In [44]:
X_adult = np.load('./laftr/data/adult/adult.npz')['x_train']

In [46]:
X_adult.shape

(32561, 120)

In [49]:
X_compas = np.load('./laftr/data/compas/compas.npz')['x_train']

In [50]:
X_compas.shape

(3483, 25)

## Check Original Input Data

In [35]:
adult_repr_files = np.load('./laftr/data/adult_original.npz')

In [36]:
adult_repr_files.files

['x_train',
 'x_test',
 'y_train',
 'y_test',
 'attr_train',
 'attr_test',
 'train_inds',
 'valid_inds']

In [37]:
original_x_train = adult_repr_files['x_train']

In [38]:
original_x_train.shape

(30162, 112)

In [39]:
original_y_train = adult_repr_files['y_train']

In [40]:
original_y_train.shape

(30162, 2)

In [41]:
original_A_train = adult_repr_files['attr_train']

In [42]:
original_A_train.shape

(30162, 1)

In [43]:
original_train_inds = adult_repr_files['train_inds']
len(original_train_inds)

24130

## Check output

In [36]:
os.chdir('/nas/longleaf/home/xianwen/fairness/fair_bench')

In [37]:
adult_repr_folder = './laftr/experiments/laftr/adult/npz'

In [38]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError for Z.npz,
# while the cells after it carry lower execution counts -- presumably they were run
# in an earlier session when the file existed. Confirm before relying on them.
adult_repr_files = np.load(os.path.join(adult_repr_folder, 'Z.npz'))
print(adult_repr_files.files)

FileNotFoundError: [Errno 2] No such file or directory: './laftr/experiments/laftr/adult/npz/Z.npz'

In [6]:
adult_repr = adult_repr_files['X']

In [7]:
type(adult_repr)

numpy.ndarray

In [8]:
adult_repr.shape

(16256, 8)

In [9]:
adult_Y_files = np.load(os.path.join(adult_repr_folder, 'Y.npz'))
print(adult_Y_files.files)
adult_A_files = np.load(os.path.join(adult_repr_folder, 'A.npz'))
print(adult_A_files.files)

['X']
['X']


In [10]:
adult_Y = adult_Y_files['X']
adult_A = adult_A_files['X']

In [11]:
adult_Y.shape

(16256, 1)

In [12]:
adult_A.shape

(16256, 1)

In [17]:
adult_repr[0:10]

array([[  2.36800551,   0.51488698,  -6.43933821,   6.79883909,
          0.74656892,  -7.35176182,  -2.16689491,   2.14770246],
       [  7.53996086,  -4.16956949,   6.33677673,  -1.14231133,
          6.37594318,   2.90546417,  -3.3146472 ,  -2.78796458],
       [ -8.4270525 ,   3.16050935,  -2.48712921,   7.05414772,
         -1.6574012 ,  -9.32408428,   3.33772087,   3.27774405],
       [-13.54678631,   3.83303809,   1.67139149,   6.61534023,
         -2.2252059 ,  -6.50504494,   5.50798273,   6.998456  ],
       [ -0.551359  ,  -0.43889219,  -6.03802681,   2.55526471,
         -3.26362801,  -1.40658844,  -0.77790767,   3.21221495],
       [  1.39876151,   1.26004899,  -5.35414743,   5.25246954,
          0.6025672 ,  -1.66529596,  -1.46500587,   4.44032478],
       [  6.9448843 ,  -5.41585302,  -3.94848156,   6.16019726,
          2.49489999,  -8.06780243,  -5.9266572 ,  -0.92539805],
       [ -1.38108146,  -2.6848805 ,   5.42455244,  -1.83300877,
          0.47482109,  -1.3840144