In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scipy.stats as ss

In [22]:
compas = pd.read_csv('../data/compas-scores-two-years.csv', encoding='latin-1')

In [23]:
# Filter data for useful rows
compas = compas[compas['days_b_screening_arrest'] >= -30]
compas = compas[compas['days_b_screening_arrest'] <= 30]
compas = compas[compas['is_recid'] != -1]
compas = compas[compas['c_charge_degree'] != "O"]
compas = compas[compas['score_text'] != "N/A"]
compas.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
5,7,marsha miles,marsha,miles,2013-11-30,Male,1971-08-22,44,25 - 45,Other,...,1,Low,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0
6,8,edward riddle,edward,riddle,2014-02-19,Male,1974-07-23,41,25 - 45,Caucasian,...,2,Low,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1


In [24]:
compas = compas[["sex","age","race","decile_score","priors_count",
                 "c_charge_degree","two_year_recid","c_jail_in", "c_jail_out"]]

compas = compas.loc[compas.race.isin(["Caucasian", "African-American"])]

In [148]:
# Convert features to categorical type
compas_encoded = compas.copy()
compas_encoded.sex = pd.get_dummies(compas["sex"])["Female"]
compas_encoded.race = pd.get_dummies(compas["race"])["African-American"]
compas_encoded.c_charge_degree = pd.get_dummies(compas["c_charge_degree"])["F"]

# Calculate length of stay to use in model (log hours)
compas_encoded['c_jail_in'] = pd.to_datetime(compas_encoded['c_jail_in'])
compas_encoded['c_jail_out'] = pd.to_datetime(compas_encoded['c_jail_out'])
compas_encoded['los'] = np.log((compas_encoded['c_jail_out']-compas_encoded['c_jail_in']).astype('timedelta64[h]'))

compas_encoded.drop(["c_jail_in", "c_jail_out"], axis = 1, inplace=True)

compas_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
compas_encoded.dropna(inplace=True)

1       African-American
2       African-American
6              Caucasian
8              Caucasian
10             Caucasian
              ...       
7207    African-American
7208    African-American
7209    African-American
7210    African-American
7212    African-American
Name: race, Length: 5278, dtype: object
1       1
2       1
6       0
8       0
10      0
       ..
7207    1
7208    1
7209    1
7210    1
7212    1
Name: race, Length: 5278, dtype: uint8


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [26]:
compas_encoded.head()
compas_encoded.drop_duplicates(keep = "first", inplace = True)

In [32]:
# X = features, S = sensitive attributes, E = explanatory attribute, Y = response
X = ['sex','age','decile_score','priors_count', 'los']
S = "race"
E = "c_charge_degree"
Y = "two_year_recid"

X_ALL = ['sex','age','decile_score','priors_count','race','c_charge_degree','los']

In [33]:
# 5:1:1 split
np.random.seed(5243)
train, test = train_test_split(compas_encoded, test_size=1/7)
train, val = train_test_split(train, test_size=1/6)

In [34]:
print(train.shape, test.shape, val.shape)

(3610, 8) (723, 8) (723, 8)


In [35]:
X_train = train[X_ALL]
L_train = train[X]
s_train = train[S]
e_train = train[E]
y_train = train[Y]

X_val = val[X_ALL]
L_val = val[X]
s_val = val[S]
e_val = val[E]
y_val = val[Y]

X_test = test[X_ALL]
L_test = test[X]
s_test = test[S]
e_test = test[E]
y_test = test[Y]

In [36]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
clf.score(X_val, y_val)

0.686030428769018

## Create functions used in pseudocode

In [37]:
# Creates a list of partitions: 1 for each unique value of e

# X is the full dataset (in our case, train)
# e is the 
def PARTITION(X):
    partitions = list()
    
    for e_i in np.unique(X[E]):
        partitions.append(X[X[E]==e_i])
    
    return partitions

In [38]:
# Delta function returns the number of observations (i.e. people) who are incorrectly classified 
# based on theoretical probabilities of reciding, calculated as the average rate of reciding
# for each explanatory varaible (in our case, type of crime comittied, c_charge_degree)

def DELTA(X, X_ei, s_i):
    
    # Gi is the number of observations for each race
    # Don't we need to pass S as a parameter for this function?
    Gi = sum(X_ei[S] == s_i)
    
    # X_ei_si is the dataset that contains the observations for each race
    X_ei_si = X_ei[X_ei[S] == s_i]
    
    # P_denom is the number of people in group 
    # P_num is number of observations who recid
    P_denom = X_ei_si.shape[0]
    P_num = sum(X_ei_si[Y] == 1)
    
    # P is the probability of reciding for one race
    # It is calculated by taking number of people who recid in each group 
    # dividied by total number of people in that group
    P = P_num/P_denom
    
    # All other observations (for the other group)
    X_ei_not_si = X_ei[X_ei[S] != s_i]
    
    # The probability of reciding for the other group (same calculation as above)
    Ps_2 = sum(X_ei_not_si[Y] == 1)/X_ei_not_si.shape[0]
    
    # Ps is P*, which is the theoretical true probability of reciding
    # Calculated by the average 
    Ps = (P+Ps_2)/2
    
    # Calcualte the number of incorrectly classified people
    d = int(round(Gi * abs(P - Ps)))
    
    return(d)

# Local Massaging

In [39]:
relabeled_X_ei = list()

for X_ei in PARTITION(train):
    X_ei_copy = X_ei.copy()
    
    ranker_model = LogisticRegression(random_state=0).fit(X_ei[X_ALL], X_ei[Y])
    
    afam_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 1)) if v]
    afam = X_ei[X_ei[S] == 1].copy()
    delta_afam = DELTA(train, X_ei, 1)
    afam_predicted_1_index = [afam_index[v] for v in np.squeeze(np.where(ranker_model.predict(afam[X_ALL]) == 1))]
    afam_predicted_1 = X_ei.iloc[afam_predicted_1_index]
    
    afam_ranks = (ss.rankdata(ranker_model.decision_function(afam_predicted_1[X_ALL]))-1).astype(int)
    afam_tochange = [i for (i, v) in zip(list(range(len(afam_ranks))), afam_ranks < delta_afam) if v]
    afam_tochange_idx = [afam_predicted_1_index[v] for v in afam_tochange]
    
    cauca_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 0)) if v]
    cauca = X_ei[X_ei[S] == 0].copy()
    delta_cauca = DELTA(train, X_ei, 0)
    cauca_predicted_0_index = [cauca_index[v] for v in np.squeeze(np.where(ranker_model.predict(cauca[X_ALL]) == 0))]
    cauca_predicted_0 = X_ei.iloc[cauca_predicted_0_index]
    
    cauca_ranks = (ss.rankdata(-ranker_model.decision_function(cauca_predicted_0[X_ALL]))-1).astype(int)
    cauca_tochange = [i for (i, v) in zip(list(range(len(cauca_ranks))), cauca_ranks < delta_cauca) if v]
    cauca_tochange_idx = [cauca_predicted_0_index[v] for v in cauca_tochange]
    
    for i in afam_tochange_idx:
        X_ei_copy.loc[X_ei_copy.index[i], Y] = 0
    for i in cauca_tochange_idx:
        X_ei_copy.loc[X_ei_copy.index[i], Y] = 1
    
    relabeled_X_ei.append(X_ei_copy)
    
    print("DELTA(African American) = ", delta_afam, "African Americans changed from 1 to 0")
    print("DELTA(Caucasian) = ", delta_cauca, "Caucasians changed from 0 to 1")
    
local_massaging = pd.concat(relabeled_X_ei)

DELTA(African American) =  44 African Americans changed from 1 to 0
DELTA(Caucasian) =  38 Caucasians changed from 0 to 1
DELTA(African American) =  99 African Americans changed from 1 to 0
DELTA(Caucasian) =  57 Caucasians changed from 0 to 1


In [40]:
lm_X_train = local_massaging[X_ALL]
lm_Y_train = local_massaging[Y]

In [41]:
clf = LogisticRegression(random_state=0).fit(lm_X_train, lm_Y_train)
clf.score(X_val[X_ALL], y_val)

0.6915629322268326

In [42]:
len(afam_tochange_idx)

99

In [43]:
res = [1 for i, j in zip(X_ei[Y], X_ei_copy[Y]) if i != j]
sum(res)

72

In [44]:
77+46

123

In [45]:
res = [1 for i, j in zip(train.sort_index()["two_year_recid"], pd.DataFrame(lm_Y_train).sort_index()["two_year_recid"]) if i != j]
sum(res)

109

In [46]:
pd.concat([train.sort_index()[S],train.sort_index()["two_year_recid"],pd.DataFrame(lm_Y_train).sort_index()["two_year_recid"]], axis = 1)[150:200]

Unnamed: 0,race,two_year_recid,two_year_recid.1
307,1,1,1
308,0,0,0
310,0,0,1
315,0,1,1
316,0,1,1
317,0,0,0
319,1,0,0
320,1,1,1
321,0,0,0
322,0,0,0


In [47]:
train.sort_index()

Unnamed: 0,sex,age,race,decile_score,priors_count,c_charge_degree,two_year_recid,los
2,0,24,1,4,4,1,1,3.258097
6,0,41,0,6,14,1,1,5.017280
8,1,39,0,1,0,0,0,4.248495
10,0,27,0,4,0,1,0,3.218876
11,0,23,1,6,3,0,1,4.605170
...,...,...,...,...,...,...,...,...
7200,1,20,1,7,0,0,1,4.762174
7204,0,30,1,4,2,0,0,4.941642
7206,0,21,0,6,0,0,1,4.317488
7208,0,20,1,9,0,1,0,2.995732


In [48]:
pd.concat([lm_X_train, pd.DataFrame(lm_Y_train)], axis = 1).sort_index()

Unnamed: 0,sex,age,decile_score,priors_count,race,c_charge_degree,los,two_year_recid
2,0,24,4,4,1,1,3.258097,1
6,0,41,6,14,0,1,5.017280,1
8,1,39,1,0,0,0,4.248495,0
10,0,27,4,0,0,1,3.218876,0
11,0,23,6,3,1,0,4.605170,1
...,...,...,...,...,...,...,...,...
7200,1,20,7,0,1,0,4.762174,1
7204,0,30,4,2,1,0,4.941642,0
7206,0,21,6,0,0,0,4.317488,1
7208,0,20,9,0,1,1,2.995732,0


## Local Preferential Sampling

In [67]:
ranker_model = LogisticRegression(random_state=0).fit(X_ei[X_ALL], X_ei[Y])
afam = X_ei[X_ei[S] == 1].copy()
afam_y = X_

preds = ranker_model.predict(afam[X_ALL])
print(preds)
print(np.squeeze(X_ei[Y]))

# Identify incorrect results
results_df = pd.DataFrame(afam[X_ALL])
results_df["actual"] = X_ei[Y]
results_df["predicted"] = preds
results_df

#incorrect = df[df["actual"] != df["predicted"]]
#incorrect

[1 1 1 ... 1 1 0]


AttributeError: module 'numpy' has no attribute 'summary'

In [201]:
recomp_X_ei = list()

# for each partition (explanatory variable)
for X_ei in PARTITION(train):
    
    print("start partition")
    X_ei_copy = X_ei.copy()
    print("X_ei shape:", X_ei_copy.shape)
    
    # learn a ranker Hi : Xi -> Yi
    ranker_model = LogisticRegression(random_state=0).fit(X_ei[X_ALL], X_ei[Y])
    
    # Calculate half delta (AA: S_i = 1, AA: S_i = 0)
    half_delta_afam = DELTA(train, X_ei, 1) // 2
    half_delta_cauc = DELTA(train, X_ei, 0) // 2
    print("Half Delta(AA):", half_delta_afam)
    print("Half Delta(Cauc):", half_delta_cauc)
    
    # store indicies
    afam_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 1)) if v]
    c_index = [i for (i, v) in zip(list(range(X_ei.shape[0])), list(X_ei[S] == 0)) if v]
    print("Total AAs:", len(afam_index))
    print("Total Cs:", len(c_index))
    
    # get subset of data to work with
    afam = X_ei[X_ei[S] == 1].copy()
    c = X_ei[X_ei[S] == 0].copy()
    print("afam dataset shape:", afam.shape)
    print("c dataset shape:", c.shape)
    
    # rank AA
    afam.reset_index(drop=True, inplace=True)
    rank = pd.DataFrame(ranker_model.decision_function(afam[X_ALL]), columns = ['rank'])
    afam_with_rank = pd.concat([afam, rank], axis=1)
    
    # rank C
    c.reset_index(drop=True, inplace=True)
    rank = pd.DataFrame(ranker_model.decision_function(c[X_ALL]), columns = ['rank'])
    c_with_rank = pd.concat([c, rank], axis=1)
    
    # sort values, reset indices
    afam_with_rank = afam_with_rank.sort_values(['rank'])
    afam_with_rank.reset_index(drop = True, inplace = True)

    c_with_rank = c_with_rank.sort_values(['rank'])
    c_with_rank.reset_index(drop = True, inplace = True)

    # AA 
    
    # find rows to delete/duplicate
    recid = sum(afam_with_rank['rank'] > 0)
    no_recid = sum(afam_with_rank['rank'] < 0)    
    total = len(afam_with_rank)
    print("recid:", recid, 
          "no_recid:", no_recid, 
          "total:", total, 
          "half_delta:", half_delta_afam)
    
    # Delete values that are almost classified as recid
    low_vals = np.full(total - recid, False) # array of 5 Falses
    middle_vals = np.full(half_delta_afam, True) # array of 2 trues
    high_vals = np.full(recid - half_delta_afam, False) # array of 8 Falses
    delete = np.invert(np.concatenate([low_vals, middle_vals, high_vals])) 

    # Duplicate values that are alsmost classified as non-recid 
    False_3 = np.full(total - recid - half_delta_afam, False)
    False_4 = np.full(recid, False)
    duplicate = np.concatenate([False_3, middle_vals, False_4]) # boolean array
    
    dupes = afam_with_rank[duplicate]
    deletes = afam_with_rank[delete]
    deletes = pd.concat([afam_with_rank, dupes], axis=0)
    recomp_X_ei.append(afam_with_rank)
    
    # C
    
    recid = sum(c_with_rank['rank'] < 0)
    no_recid = sum(c_with_rank['rank'] > 0)    
    total = len(c_with_rank)
    
    print("recid:", recid, 
          "no_recid:", no_recid, 
          "total:", total, 
          "half_delta:", half_delta_cauc)
    
    # Delete values that are almost classified as recid
    low_vals = np.full(total - recid, False) # array of 5 Falses
    middle_vals = np.full(half_delta_cauc, True) # array of 2 trues
    high_vals = np.full(recid - half_delta_cauc, False) # array of 8 Falses
    delete = np.invert(np.concatenate([low_vals, middle_vals, high_vals])) 

    # Duplicate values that are alsmost classified as non-recid 
    False_3 = np.full(total - recid - half_delta_cauc, False)
    False_4 = np.full(recid, False)
    duplicate = np.concatenate([False_3, middle_vals, False_4]) # boolean array
    
    dupes = c_with_rank[duplicate]
    deletes = c_with_rank[delete]
    deletes = pd.concat([c_with_rank, dupes], axis=0)
    recomp_X_ei.append(c_with_rank)
    
    print("end partition")

local_pref_sampling = pd.concat(recomp_X_ei)

start partition
X_ei shape: (1245, 8)
Half Delta(AA): 22
Half Delta(Cauc): 19
Total AAs: 664
Total Cs: 581
afam dataset shape: (664, 8)
c dataset shape: (581, 8)
recid: 253 no_recid: 411 total: 664 half_delta: 22
recid: 480 no_recid: 101 total: 581 half_delta: 19
end partition
start partition
X_ei shape: (2365, 8)
Half Delta(AA): 49
Half Delta(Cauc): 28
Total AAs: 1504
Total Cs: 861
afam dataset shape: (1504, 8)
c dataset shape: (861, 8)
recid: 897 no_recid: 607 total: 1504 half_delta: 49
recid: 575 no_recid: 286 total: 861 half_delta: 28
end partition
