In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scipy.stats as ss
from sklearn.metrics import classification_report

In [2]:
# Load the raw COMPAS two-year recidivism table
compas = pd.read_csv(
    '../data/compas-scores-two-years.csv',
    encoding='latin-1',
)

In [3]:
# Filter data for useful rows
# All conditions are combined into one mask so the cell reads as a single rule set.
keep_row = (
    # screening within 30 days (inclusive) of the arrest; missing values are excluded
    compas['days_b_screening_arrest'].between(-30, 30)
    & (compas['is_recid'] != -1)
    & (compas['c_charge_degree'] != "O")
    # read_csv (with its default NA handling) parses the literal text "N/A" as NaN,
    # so the string comparison alone would never exclude a row; test for missing too.
    & compas['score_text'].notna()
    & (compas['score_text'] != "N/A")
)
compas = compas[keep_row]
compas.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
5,7,marsha miles,marsha,miles,2013-11-30,Male,1971-08-22,44,25 - 45,Other,...,1,Low,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0
6,8,edward riddle,edward,riddle,2014-02-19,Male,1974-07-23,41,25 - 45,Caucasian,...,2,Low,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1


In [4]:
# Keep only the columns used downstream, for the two race groups being compared
compas = compas.loc[
    compas["race"].isin(["Caucasian", "African-American"]),
    ["sex", "age", "race", "decile_score", "priors_count",
     "c_charge_degree", "two_year_recid", "c_jail_in", "c_jail_out"],
]

In [5]:
# Convert features to 0/1 indicators
compas_encoded = compas.copy()
# get_dummies yields bool columns on pandas >= 2.0 (uint8 before); cast so the
# indicators are plain integers regardless of the installed version.
compas_encoded["sex"] = pd.get_dummies(compas["sex"])["Female"].astype(int)
compas_encoded["race"] = pd.get_dummies(compas["race"])["African-American"].astype(int)
compas_encoded["c_charge_degree"] = pd.get_dummies(compas["c_charge_degree"])["F"].astype(int)

# Calculate length of stay to use in model (log hours)
jail_in = pd.to_datetime(compas_encoded['c_jail_in'])
jail_out = pd.to_datetime(compas_encoded['c_jail_out'])
# .astype('timedelta64[h]') raises on pandas >= 2.0 (hour resolution is unsupported);
# derive whole hours from the elapsed seconds instead.
stay_hours = np.floor((jail_out - jail_in).dt.total_seconds() / 3600)
# log() of zero/negative stays gives -inf/NaN on purpose: those rows are removed
# just below, so the resulting RuntimeWarnings are silenced here.
with np.errstate(divide='ignore', invalid='ignore'):
    compas_encoded['los'] = np.log(stay_hours)

# Drop the raw timestamps, then every row with a missing or non-finite value
compas_encoded = (
    compas_encoded
    .drop(["c_jail_in", "c_jail_out"], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
# Remove exact duplicate rows (first occurrence kept), then preview the result.
# head() must be the last expression of the cell to be displayed at all.
compas_encoded = compas_encoded.drop_duplicates(keep = "first")
compas_encoded.head()

In [7]:
# X = features, S = sensitive attributes, E = explanatory attribute, Y = response
# These are column names of compas_encoded; the helper functions below read
# S, E and Y as notebook-level globals.
X = ['sex','age','decile_score','priors_count', 'los']
S = "race"
E = "c_charge_degree"
Y = "two_year_recid"

# Every model input column: the features in X plus the S and E columns.
# The list is spelled out literally, so its column order is fixed here.
X_ALL = ['sex','age','decile_score','priors_count','race','c_charge_degree','los']

In [8]:
# 5:1:1 split
# Neither split passes random_state, so reproducibility relies on seeding NumPy's
# global RNG right before them (presumably the stream scikit-learn draws from when
# random_state is None - confirm in its docs). Keep the two calls in this order.
np.random.seed(5243)
# 1/7 of all rows go to test ...
train, test = train_test_split(compas_encoded, test_size=1/7)
# ... and 1/6 of the remaining 6/7 (= 1/7 of the total) go to validation.
train, val = train_test_split(train, test_size=1/6)

In [9]:
# Row/column counts of the train, test and validation frames, in that order
print(*(split.shape for split in (train, test, val)))

(3610, 8) (723, 8) (723, 8)


In [10]:
# For each split: all model inputs (X_*), plain features (L_*), sensitive
# attribute (s_*), explanatory attribute (e_*) and response (y_*).
X_train, L_train, s_train, e_train, y_train = (
    train[columns] for columns in (X_ALL, X, S, E, Y)
)

X_val, L_val, s_val, e_val, y_val = (
    val[columns] for columns in (X_ALL, X, S, E, Y)
)

X_test, L_test, s_test, e_test, y_test = (
    test[columns] for columns in (X_ALL, X, S, E, Y)
)

In [11]:
# Logistic regression on the original training labels; accuracy on validation
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
clf.score(X_val, y_val)

0.686030428769018

## Create functions used in pseudocode

In [12]:
def PARTITION(X, e=None):
    """Create a list of partitions: one for each unique value of the explanatory attribute.

    Parameters
    ----------
    X : pandas.DataFrame
        The full dataset (in our case, train).
    e : str, optional
        Name of the explanatory-attribute column to partition on.
        Defaults to the notebook-level constant ``E``.

    Returns
    -------
    list of pandas.DataFrame
        One sub-frame per unique value of ``X[e]``, in the ascending order
        produced by ``np.unique``.
    """
    # Only fall back to the global when the caller did not name a column
    column = E if e is None else e
    return [X[X[column] == e_i] for e_i in np.unique(X[column])]

In [13]:
def DELTA(X, X_ei, s_i, sensitive=None, label=None):
    """Number of labels to change for group ``s_i`` inside one partition.

    Computes ``round(G_i * |P - P*|)`` where ``G_i`` is the number of rows of
    ``X_ei`` whose sensitive value equals ``s_i``, ``P`` is the share of those
    rows labelled 1, and ``P*`` is the mean of ``P`` and the share of
    label-1 rows among all remaining rows of ``X_ei``.

    Parameters
    ----------
    X : object
        Unused; kept so existing calls ``DELTA(train, X_ei, s)`` keep working.
    X_ei : pandas.DataFrame
        One partition of the data.
    s_i : scalar
        Value of the sensitive attribute identifying the group.
    sensitive : str, optional
        Sensitive-attribute column; defaults to the notebook-level ``S``.
    label : str, optional
        Response column; defaults to the notebook-level ``Y``.

    Returns
    -------
    int
        Number of labels to change. 0 when either the group or its complement
        is empty, since no rate difference can be measured in that case.
    """
    s_col = S if sensitive is None else sensitive
    y_col = Y if label is None else label

    in_group = (X_ei[s_col] == s_i).to_numpy()
    positive = (X_ei[y_col] == 1).to_numpy()

    n_group = int(in_group.sum())
    n_rest = int((~in_group).sum())
    # Guard against division by zero when a partition contains only one group
    if n_group == 0 or n_rest == 0:
        return 0

    p_group = int((positive & in_group).sum()) / n_group
    p_rest = int((positive & ~in_group).sum()) / n_rest
    p_star = (p_group + p_rest) / 2

    return int(round(n_group * abs(p_group - p_star)))

# Local Massaging

In [14]:
def _positions_to_flip(X_ei, ranker, group, from_label, n_flip):
    """Row positions (into ``X_ei``) whose label should be flipped away from ``from_label``.

    Candidates are the rows of ``group`` (value of the ``S`` column) whose observed
    label and whose label predicted by ``ranker`` both equal ``from_label``. They are
    ranked by ``ranker.decision_function`` - ascending when ``from_label`` is 1,
    descending otherwise - and those with rank below ``n_flip`` are returned.
    """
    nothing = np.array([], dtype=int)
    if n_flip <= 0:
        return nothing

    # flatnonzero always yields a 1-D array; np.squeeze(np.where(...)) collapses to
    # a 0-d array when exactly one row matches and then cannot be iterated.
    group_pos = np.flatnonzero((X_ei[S] == group).to_numpy())
    if group_pos.size == 0:
        return nothing

    predicted = ranker.predict(X_ei.iloc[group_pos][X_ALL])
    predicted_pos = group_pos[predicted == from_label]
    observed = X_ei[Y].to_numpy()
    candidate_pos = predicted_pos[observed[predicted_pos] == from_label]
    if candidate_pos.size == 0:
        return nothing

    scores = ranker.decision_function(X_ei.iloc[candidate_pos][X_ALL])
    ordered = scores if from_label == 1 else -scores
    ranks = (ss.rankdata(ordered) - 1).astype(int)
    return candidate_pos[ranks < n_flip]


relabeled_X_ei = list()

for X_ei in PARTITION(train):
    X_ei_copy = X_ei.copy()

    # Ranker fitted on this partition only
    ranker_model = LogisticRegression(random_state=0).fit(X_ei[X_ALL], X_ei[Y])

    delta_afam = DELTA(train, X_ei, 1)
    delta_cauca = DELTA(train, X_ei, 0)

    # Group 1 (African-American): label 1 -> 0; group 0 (Caucasian): label 0 -> 1
    afam_tochange_idx = _positions_to_flip(X_ei, ranker_model, 1, 1, delta_afam)
    cauca_tochange_idx = _positions_to_flip(X_ei, ranker_model, 0, 0, delta_cauca)

    # Positional assignment, so the relabelling does not depend on index labels
    y_position = X_ei_copy.columns.get_loc(Y)
    X_ei_copy.iloc[afam_tochange_idx, y_position] = 0
    X_ei_copy.iloc[cauca_tochange_idx, y_position] = 1

    relabeled_X_ei.append(X_ei_copy)

    print("DELTA(African American) = ", delta_afam, "African Americans changed from 1 to 0")
    print("DELTA(Caucasian) = ", delta_cauca, "Caucasians changed from 0 to 1")

local_massaging = pd.concat(relabeled_X_ei)

DELTA(African American) =  44 African Americans changed from 1 to 0
DELTA(Caucasian) =  38 Caucasians changed from 0 to 1
DELTA(African American) =  99 African Americans changed from 1 to 0
DELTA(Caucasian) =  57 Caucasians changed from 0 to 1


In [15]:
# Model inputs and (relabelled) response of the massaged training set
lm_X_train, lm_Y_train = local_massaging[X_ALL], local_massaging[Y]

In [16]:
# Same classifier, trained on the massaged labels; accuracy on validation
clf = LogisticRegression(random_state=0)
clf.fit(lm_X_train, lm_Y_train)
clf.score(X_val[X_ALL], y_val)

0.6708160442600276

In [17]:
# Total number changed values should be sum of all DELTAs shown above
# Both label columns are put in index order so they can be compared pairwise.
labels_before = train.sort_index()["two_year_recid"]
labels_after = pd.DataFrame(lm_Y_train).sort_index()["two_year_recid"]
res = [1 for before, after in zip(labels_before, labels_after) if before != after]
len(res)

238

# Evaluation

In [41]:
def PARITY(X, Y_PRED, sensitive=None):
    """Absolute gap in positive-prediction rate between the two sensitive groups.

    Prints the share of rows predicted 1 for the group coded 1 and the group
    coded 0, then their absolute difference.

    Parameters
    ----------
    X : pandas.DataFrame
        Must include the sensitive feature.
    Y_PRED : array-like
        Predictions, positionally matching the rows of ``X``.
    sensitive : str, optional
        Sensitive-attribute column; defaults to the notebook-level ``S``.

    Returns
    -------
    float
        ``|rate(group 1) - rate(group 0)|``.

    Raises
    ------
    ValueError
        If either group has no rows (the rate would be undefined).
    """
    s_col = S if sensitive is None else sensitive
    group = X[s_col].to_numpy()
    preds = np.asarray(Y_PRED)

    def positive_rate(value):
        members = group == value
        n_members = int(members.sum())
        if n_members == 0:
            raise ValueError(f"no rows with {s_col} == {value}; rate is undefined")
        return float(preds[members].sum() / n_members)

    rate_afam = positive_rate(1)
    rate_cauca = positive_rate(0)

    print("P_c(recid = 1 | race = African American) =", rate_afam)
    print("P_c(recid = 1 | race = Caucasian) =", rate_cauca)
    parity = abs(rate_afam - rate_cauca)
    print("Parity =", parity)

    return parity

In [46]:
def CALIBRATION(X, Y_TRUE, Y_PRED, sensitive=None):
    """Absolute gap in prediction accuracy between the two sensitive groups.

    For the group coded 1 and the group coded 0, prints the share of rows whose
    prediction equals the true label, then the absolute difference of the two.

    Parameters
    ----------
    X : pandas.DataFrame
        Must include the sensitive feature.
    Y_TRUE : array-like
        True labels, positionally matching the rows of ``X``.
    Y_PRED : array-like
        Predictions, positionally matching the rows of ``X``.
    sensitive : str, optional
        Sensitive-attribute column; defaults to the notebook-level ``S``.

    Returns
    -------
    float
        ``|accuracy(group 1) - accuracy(group 0)|``. Returned as well as
        printed, so the result can be stored like the one from PARITY.

    Raises
    ------
    ValueError
        If either group has no rows (the accuracy would be undefined).
    """
    s_col = S if sensitive is None else sensitive
    group = X[s_col].to_numpy()
    correct = np.asarray(Y_TRUE) == np.asarray(Y_PRED)

    def accuracy(value):
        members = group == value
        n_members = int(members.sum())
        if n_members == 0:
            raise ValueError(f"no rows with {s_col} == {value}; accuracy is undefined")
        return float(correct[members].sum() / n_members)

    acc_afam = accuracy(1)
    acc_cauca = accuracy(0)

    print("P_c(recid predicted correctly | race = African American) =", acc_afam)
    print("P_c(recid predicted correctly | race = Caucasian) =", acc_cauca)
    calibration = abs(acc_afam - acc_cauca)
    print("Calibration =", calibration)

    return calibration

In [None]:
def EQUALITY_OF_ODDS(X, Y_TRUE, Y_PRED, sensitive=None):
    """Equalized-odds gap between the two sensitive groups.

    For the group coded 1 and the group coded 0, computes the true-positive rate
    ``P(pred = 1 | true = 1)`` and the false-positive rate ``P(pred = 1 | true = 0)``,
    prints them, and returns the larger of the two absolute between-group gaps.

    Parameters
    ----------
    X : pandas.DataFrame
        Must include the sensitive feature.
    Y_TRUE : array-like
        True labels, positionally matching the rows of ``X``.
    Y_PRED : array-like
        Predictions, positionally matching the rows of ``X``.
    sensitive : str, optional
        Sensitive-attribute column; defaults to the notebook-level ``S``.

    Returns
    -------
    float
        ``max(|TPR_1 - TPR_0|, |FPR_1 - FPR_0|)``.

    Raises
    ------
    ValueError
        If a group has no rows with a given true label (rate undefined).
    """
    s_col = S if sensitive is None else sensitive
    group = X[s_col].to_numpy()
    truth = np.asarray(Y_TRUE)
    preds = np.asarray(Y_PRED)

    def predicted_positive_rate(value, actual):
        subset = (group == value) & (truth == actual)
        n_subset = int(subset.sum())
        if n_subset == 0:
            raise ValueError(
                f"no rows with {s_col} == {value} and true label {actual}; rate is undefined"
            )
        return float((preds[subset] == 1).sum() / n_subset)

    tpr_afam = predicted_positive_rate(1, 1)
    tpr_cauca = predicted_positive_rate(0, 1)
    fpr_afam = predicted_positive_rate(1, 0)
    fpr_cauca = predicted_positive_rate(0, 0)

    print("P_c(recid = 1 | true recid = 1, race = African American) =", tpr_afam)
    print("P_c(recid = 1 | true recid = 1, race = Caucasian) =", tpr_cauca)
    print("P_c(recid = 1 | true recid = 0, race = African American) =", fpr_afam)
    print("P_c(recid = 1 | true recid = 0, race = Caucasian) =", fpr_cauca)

    equality_of_odds = max(abs(tpr_afam - tpr_cauca), abs(fpr_afam - fpr_cauca))
    print("Equality of odds =", equality_of_odds)

    return equality_of_odds

## Baseline

In [33]:
# Baseline: fit on the original labels, keep the validation predictions
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
baseline_pred = clf.predict(X_val[X_ALL])
clf.score(X_val, y_val)

0.686030428769018

In [34]:
# Per-class precision/recall of the current classifier on the validation set
print(
    classification_report(
        y_val,
        clf.predict(X_val[X_ALL]),
    )
)

              precision    recall  f1-score   support

           0       0.68      0.75      0.71       377
           1       0.69      0.62      0.65       346

    accuracy                           0.69       723
   macro avg       0.69      0.68      0.68       723
weighted avg       0.69      0.69      0.68       723



In [42]:
# Parity of the baseline predictions on the validation set
PARITY(X=X_val, Y_PRED=baseline_pred)

P_c(recid = 1 | race = African American) = 0.5011286681715575
P_c(recid = 1 | race = Caucasian) = 0.31785714285714284
Parity =  0.1832715253144147


0.1832715253144147

In [48]:
# Calibration of the baseline predictions on the validation set
CALIBRATION(X=X_val, Y_TRUE=y_val, Y_PRED=baseline_pred)

P_c(recid predicted correctly | race = African American) = 0.6884875846501128
P_c(recid predicted correctly | race = Caucasian) = 0.6821428571428572
Calibration = 0.006344727507255676


## Local Massaging

In [36]:
# Local massaging: fit on the relabelled training set, keep the validation predictions
clf = LogisticRegression(random_state=0)
clf.fit(lm_X_train, lm_Y_train)
lm_pred = clf.predict(X_val[X_ALL])
clf.score(X_val[X_ALL], y_val)

0.6708160442600276

In [37]:
# Per-class precision/recall of the current classifier on the validation set
print(
    classification_report(
        y_val,
        clf.predict(X_val[X_ALL]),
    )
)

              precision    recall  f1-score   support

           0       0.67      0.74      0.70       377
           1       0.68      0.60      0.63       346

    accuracy                           0.67       723
   macro avg       0.67      0.67      0.67       723
weighted avg       0.67      0.67      0.67       723



In [43]:
# Parity of the local-massaging predictions on the validation set
PARITY(X=X_val, Y_PRED=lm_pred)

P_c(recid = 1 | race = African American) = 0.3905191873589165
P_c(recid = 1 | race = Caucasian) = 0.46785714285714286
Parity =  0.07733795549822636


0.07733795549822636

In [50]:
# Calibration of the local-massaging predictions on the validation set
CALIBRATION(X=X_val, Y_TRUE=y_val, Y_PRED=lm_pred)

P_c(recid predicted correctly | race = African American) = 0.672686230248307
P_c(recid predicted correctly | race = Caucasian) = 0.6678571428571428
Calibration = 0.004829087391164166
