In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scipy.stats as ss

In [2]:
compas = pd.read_csv('../data/compas-scores-two-years.csv', encoding='latin-1')

In [3]:
compas = compas[["sex","age","race","decile_score","priors_count","c_charge_degree","two_year_recid","c_jail_in", "c_jail_out"]]
compas = compas.loc[compas.race.isin(["Caucasian", "African-American"])]

In [4]:
compas_encoded = compas.copy()
compas_encoded.sex = pd.get_dummies(compas["sex"])["Female"]
compas_encoded.race = pd.get_dummies(compas["race"])["African-American"]
compas_encoded.c_charge_degree = pd.get_dummies(compas["c_charge_degree"])["F"]


compas_encoded['c_jail_in'] = pd.to_datetime(compas_encoded['c_jail_in'])
compas_encoded['c_jail_out'] = pd.to_datetime(compas_encoded['c_jail_out'])
compas_encoded['los'] = np.log((compas_encoded['c_jail_out']-compas_encoded['c_jail_in']).astype('timedelta64[h]'))#use log hours

compas_encoded.drop(["c_jail_in", "c_jail_out"], axis = 1, inplace=True)

compas_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
compas_encoded.dropna(inplace=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
compas_encoded.head()
compas_encoded.drop_duplicates(keep = "first", inplace = True)

In [6]:
X = ['sex','age','decile_score','priors_count', 'los']
S = "race"
E = "c_charge_degree"
Y = "two_year_recid"

X_ALL = ['sex','age','decile_score','priors_count','race','c_charge_degree','los']


In [7]:
# 5:1:1 split
np.random.seed(5243)
train, test = train_test_split(compas_encoded, test_size=1/7)
train, val = train_test_split(train, test_size=1/6)


In [8]:
print(train.shape, test.shape, val.shape)

(4045, 8) (810, 8) (809, 8)


In [9]:
X_train = train[X_ALL]
L_train = train[X]
s_train = train[S]
e_train = train[E]
y_train = train[Y]

X_val = val[X_ALL]
L_val = val[X]
s_val = val[S]
e_val = val[E]
y_val = val[Y]

X_test = test[X_ALL]
L_test = test[X]
s_test = test[S]
e_test = test[E]
y_test = test[Y]

In [10]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
clf.score(X_val, y_val)

0.6650185414091471

## Create functions used in pseudocode

In [11]:
# Creates a list of partitions: 1 for each unique value of e

# X is the full dataset (in our case, train)
# e is the 
def PARTITION(X):
    partitions = list()
    
    for e_i in np.unique(X[E]):
        partitions.append(X[X[E]==e_i])
    
    return partitions

In [12]:
def DELTA(X, X_ei, s_i):
    Gi = sum(X_ei[S] == s_i)
    
    X_ei_si = X_ei[X_ei[S] == s_i]
    
    P_denom = X_ei_si.shape[0]
    P_num = sum(X_ei_si[Y] == 1)
    P = P_num/P_denom
    
    X_ei_not_si = X_ei[X_ei[S] != s_i]
    
    Ps_2 = sum(X_ei_not_si[Y] == 1)/X_ei_not_si.shape[0]
    
    Ps = (P+Ps_2)/2
    
    d = int(round(Gi * abs(P - Ps)))
    
    return(d)

# Local Massaging

In [35]:
relabeled_X_ei = list()

for X_ei in PARTITION(train):
    X_ei_copy = X_ei.copy()
    
    ranker_model = LogisticRegression(random_state=0).fit(X_ei[X_ALL], X_ei[Y])
    
    afam_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 1)) if v]
    afam = X_ei[X_ei[S] == 1].copy()
    delta_afam = DELTA(train, X_ei, 1)
    afam_predicted_1_index = [afam_index[v] for v in np.squeeze(np.where(ranker_model.predict(afam[X_ALL]) == 1))]
    afam_predicted_1 = X_ei.iloc[afam_predicted_1_index]
    
    afam_ranks = (ss.rankdata(ranker_model.decision_function(afam_predicted_1[X_ALL]))-1).astype(int)
    afam_tochange = [i for (i, v) in zip(list(range(len(afam_ranks))), afam_ranks < delta_afam) if v]
    afam_tochange_idx = [afam_predicted_1_index[v] for v in afam_tochange]
    
    cauca_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 0)) if v]
    cauca = X_ei[X_ei[S] == 0].copy()
    delta_cauca = DELTA(train, X_ei, 0)
    cauca_predicted_0_index = [cauca_index[v] for v in np.squeeze(np.where(ranker_model.predict(cauca[X_ALL]) == 0))]
    cauca_predicted_0 = X_ei.iloc[cauca_predicted_0_index]
    
    cauca_ranks = (ss.rankdata(-ranker_model.decision_function(cauca_predicted_0[X_ALL]))-1).astype(int)
    cauca_tochange = [i for (i, v) in zip(list(range(len(cauca_ranks))), cauca_ranks < delta_cauca) if v]
    cauca_tochange_idx = [cauca_predicted_0_index[v] for v in cauca_tochange]
    
    for i in afam_tochange_idx:
        X_ei_copy.loc[X_ei_copy.index[i], Y] = 0
    for i in cauca_tochange_idx:
        X_ei_copy.loc[X_ei_copy.index[i], Y] = 1
    
    relabeled_X_ei.append(X_ei_copy)
    
    print("DELTA(African American) = ", delta_afam, "African Americans changed from 1 to 0")
    print("DELTA(Caucasian) = ", delta_cauca, "Caucasians changed from 0 to 1")
    
local_massaging = pd.concat(relabeled_X_ei)

DELTA(African American) =  48 African Americans changed from 1 to 0
DELTA(Caucasian) =  40 Caucasians changed from 0 to 1
DELTA(African American) =  77 African Americans changed from 1 to 0
DELTA(Caucasian) =  46 Caucasians changed from 0 to 1


In [36]:
lm_X_train = local_massaging[X_ALL]
lm_Y_train = local_massaging[Y]

In [37]:
clf = LogisticRegression(random_state=0).fit(lm_X_train, lm_Y_train)
clf.score(X_val[X_ALL], y_val)

0.6625463535228677

In [38]:
len(afam_tochange_idx)


77

In [39]:
res = [1 for i, j in zip(X_ei[Y], X_ei_copy[Y]) if i != j]
sum(res)


62

In [40]:
77+46

123

In [42]:
res = [1 for i, j in zip(train.sort_index()["two_year_recid"], pd.DataFrame(lm_Y_train).sort_index()["two_year_recid"]) if i != j]
sum(res)

105

In [20]:
pd.concat([train.sort_index()[S],train.sort_index()["two_year_recid"],pd.DataFrame(lm_Y_train).sort_index()["two_year_recid"]], axis = 1)[150:200]

Unnamed: 0,race,two_year_recid,two_year_recid.1
257,1,1,0
264,1,1,1
267,0,0,0
269,1,0,0
275,0,0,0
279,0,0,0
281,0,0,0
282,1,0,0
284,1,0,0
285,1,1,1


In [21]:
train.sort_index()

Unnamed: 0,sex,age,race,decile_score,priors_count,c_charge_degree,two_year_recid,los
1,0,34,1,3,0,1,1,5.484797
2,0,24,1,4,4,1,1,3.258097
6,0,41,0,6,14,1,1,5.017280
8,1,39,0,1,0,0,0,4.248495
9,0,21,0,3,1,1,1,3.135494
...,...,...,...,...,...,...,...,...
7207,0,30,1,2,0,0,1,3.091042
7208,0,20,1,9,0,1,0,2.995732
7209,0,23,1,7,0,1,0,3.806662
7210,0,23,1,3,0,1,0,3.784190


In [22]:
pd.concat([lm_X_train, pd.DataFrame(lm_Y_train)], axis = 1).sort_index()

Unnamed: 0,sex,age,decile_score,priors_count,race,c_charge_degree,los,two_year_recid
1,0,34,3,0,1,1,5.484797,1
2,0,24,4,4,1,1,3.258097,0
6,0,41,6,14,0,1,5.017280,1
8,1,39,1,0,0,0,4.248495,0
9,0,21,3,1,0,1,3.135494,1
...,...,...,...,...,...,...,...,...
7207,0,30,2,0,1,0,3.091042,1
7208,0,20,9,0,1,1,2.995732,0
7209,0,23,7,0,1,1,3.806662,0
7210,0,23,3,0,1,1,3.784190,0
