# Algorithm 6: Handling Conditional Discrimination

In [304]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [305]:
# Load the raw COMPAS two-year recidivism table, then list the columns that
# contain no missing values at all.
df = pd.read_csv('../data/compas-scores-two-years.csv')
column_has_missing = df.isna().any()
df.columns[~column_has_missing]

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count', 'c_charge_degree',
       'is_recid', 'is_violent_recid', 'type_of_assessment', 'decile_score.1',
       'score_text', 'screening_date', 'v_type_of_assessment',
       'v_decile_score', 'v_score_text', 'v_screening_date', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [306]:
# Columns kept for the analysis. 'two_year_recid' (last entry) is the label
# predicted throughout the notebook; 'race' is the sensitive attribute.
# NOTE(review): 'is_recid' and 'is_violent_recid' look closely tied to the
# label — possible target leakage; confirm they are meant to be features.
selected_features = [
    'sex',
    'age_cat',
    'race',
    'priors_count',
    'c_charge_degree',
    'is_recid',
    'is_violent_recid',
    'two_year_recid',
]

In [307]:
# Keep only the two racial groups compared in this notebook, restricted to the
# selected columns, and renumber the rows 0..n-1 so that index labels and row
# positions coincide.
race_mask = df['race'].isin(['Caucasian', 'African-American'])
df_cleaned = df.loc[race_mask, selected_features].reset_index(drop=True)
df_cleaned

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,is_recid,is_violent_recid,two_year_recid
0,Male,25 - 45,African-American,0,F,1,1,1
1,Male,Less than 25,African-American,4,F,1,0,1
2,Male,Less than 25,African-American,1,F,0,0,0
3,Male,25 - 45,Caucasian,14,F,1,0,1
4,Female,25 - 45,Caucasian,0,M,0,0,0
...,...,...,...,...,...,...,...,...
6145,Male,25 - 45,African-American,0,M,1,0,1
6146,Male,Less than 25,African-American,0,F,0,0,0
6147,Male,Less than 25,African-American,0,F,0,0,0
6148,Male,Less than 25,African-American,0,F,0,0,0


In [308]:
def Dexpl(df,expl,out='two_year_recid'):
    """Return the part of the between-race outcome gap explained by ``expl``.

    For every distinct value ``v`` of the column ``expl`` the following term
    is accumulated::

        (P(expl = v | African-American) - P(expl = v | Caucasian)) * P*(v)

    where ``P*(v)`` is the unweighted average of the two groups' rates of
    ``out == 1`` among rows with ``expl == v``. Any rate that evaluates to
    NaN (empty selection) is counted as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``race``, ``expl`` and ``out``.
    expl : str
        Name of the explanatory column.
    out : str, default 'two_year_recid'
        Name of the outcome column; the value 1 is the counted outcome.

    Returns
    -------
    The sum of the per-value terms (0 when ``expl`` has no values).
    """
    def _rate(flags):
        # Mean of a boolean Series, with NaN replaced by 0.
        value = flags.mean()
        return 0 if np.isnan(value) else value

    is_aa = df['race'] == 'African-American'
    is_ca = df['race'] == 'Caucasian'

    explained = 0
    for level in df[expl].unique():
        at_level = df[expl] == level
        # Difference in how often each group takes this level of `expl`.
        share_gap = _rate(at_level[is_aa]) - _rate(at_level[is_ca])
        # Outcome rate at this level, averaged over the two groups.
        rate_aa = _rate(df[is_aa & at_level][out] == 1)
        rate_ca = _rate(df[is_ca & at_level][out] == 1)
        explained = explained + share_gap * ((rate_aa + rate_ca) / 2)
    return explained

In [309]:
def delta(df_i,race):
    """Return how many records of ``race`` should be altered in partition ``df_i``.

    The value is ``G * |p_race - p*|`` where ``G`` is the number of rows of
    ``race`` in ``df_i``, ``p_race`` is that group's rate of
    ``two_year_recid == 1`` and ``p*`` is the unweighted average of the
    African-American and Caucasian rates. A rate that evaluates to NaN
    (group absent from ``df_i``) is counted as 0.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Partition with ``race`` and ``two_year_recid`` columns.
    race : str
        'African-American' or 'Caucasian'; for any other value the function
        returns None.

    Returns
    -------
    A (generally fractional) non-negative number, or None.
    """
    positive_rate = {}
    for group in ('African-American', 'Caucasian'):
        rate = (df_i[df_i['race'] == group]['two_year_recid'] == 1).mean()
        positive_rate[group] = 0 if np.isnan(rate) else rate

    if race not in positive_rate:
        return None

    group_size = (df_i['race'] == race).sum()
    target_rate = (positive_rate['African-American'] + positive_rate['Caucasian']) / 2
    return group_size * abs(positive_rate[race] - target_rate)
    
def LocalMassaging(df,expl,numerical=['priors_count']):
    """Relabel borderline records inside each stratum of ``expl`` (local massaging).

    For every distinct value of ``expl`` a logistic-regression ranker is
    fitted on that stratum, using all columns except ``race``,
    ``two_year_recid`` and ``expl`` (categorical columns one-hot encoded,
    ``numerical`` columns appended unchanged). Working outwards from the 0.5
    decision boundary, in a copy of ``df``:

    * African-American records the ranker predicts as positive get
      ``two_year_recid = 0``;
    * Caucasian records the ranker predicts as negative get
      ``two_year_recid = 1``.

    For each group, relabelling in a stratum stops as soon as the number of
    relabelled records reaches ``delta(stratum, group)``; a budget of 0
    therefore changes nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``race``, ``two_year_recid`` (assumed 0/1), ``expl`` and the
        ``numerical`` columns. Index labels are assumed unique — TODO confirm.
    expl : str
        Explanatory column that defines the strata.
    numerical : list of str
        Columns passed to the ranker without encoding.

    Returns
    -------
    pandas.DataFrame
        A relabelled copy; ``df`` itself is left untouched.
    """
    numerical = list(numerical)
    log_half = np.log(0.5)
    df_copy = df.copy()

    for attr in np.unique(df[expl]):
        X_df = df[df[expl]==attr]
        if len(X_df) < 2:
            # Too few records to train a ranker on this stratum.
            continue

        # NOTE: `sparse=False` is the spelling used by the scikit-learn
        # version this notebook was written against.
        encoded = OneHotEncoder(drop='if_binary',sparse=False).fit_transform(
            X_df.drop(['race','two_year_recid',expl]+numerical,axis=1))
        X = np.hstack((encoded,X_df[numerical].to_numpy()))

        # Take the labels as they are. One-hot encoding them would turn a
        # stratum that only contains 0s into a column of 1s. The copy keeps
        # the injected label below from leaking back into `df`.
        y = X_df['two_year_recid'].to_numpy(copy=True)

        # LogisticRegression needs both classes: if the stratum has only one,
        # flip a single reproducibly chosen label. A local generator gives the
        # same draw as seeding the global one, without the side effect.
        rng = np.random.RandomState(0)
        if not y.any():
            y[rng.randint(low=0,high=len(y))] = 1
        elif y.all():
            y[rng.randint(low=0,high=len(y))] = 0

        clf_logistic = LogisticRegression(random_state=0,max_iter=1000).fit(X, y)
        log_p_positive = clf_logistic.predict_log_proba(X)[:,1]

        # Positions within the stratum, ascending by predicted P(label = 1).
        order = np.argsort(log_p_positive)
        ranked = log_p_positive[order]
        positive_index_sorted = order[ranked > log_half]
        negative_index_sorted = order[ranked < log_half]

        num_a = delta(X_df,'African-American')
        num_c = delta(X_df,'Caucasian')
        races = X_df['race'].to_numpy()
        row_labels = X_df.index

        # NOTE(review): candidates are picked by the ranker's prediction, not
        # by their current label, so a record that already carries the new
        # label still consumes budget — confirm this is intended.

        # Predicted positives, weakest first (closest to the boundary).
        count_a = 0
        for pos in positive_index_sorted:
            if count_a >= num_a:
                break
            if races[pos] == 'African-American':
                df_copy.at[row_labels[pos],'two_year_recid'] = 0
                count_a += 1

        # Predicted negatives, strongest first (closest to the boundary).
        count_c = 0
        for pos in negative_index_sorted[::-1]:
            if count_c >= num_c:
                break
            if races[pos] == 'Caucasian':
                df_copy.at[row_labels[pos],'two_year_recid'] = 1
                count_c += 1

    return df_copy
            
def LocalPreferentialSampling(df,expl,numerical=['priors_count']):
    """Drop and duplicate borderline records inside each stratum of ``expl``.

    For every distinct value of ``expl`` a logistic-regression ranker is
    fitted on that stratum, using all columns except ``race``,
    ``two_year_recid`` and ``expl`` (categorical columns one-hot encoded,
    ``numerical`` columns appended unchanged). Working outwards from the 0.5
    decision boundary, and with a per-group budget of
    ``0.5 * delta(stratum, group)`` for each action:

    * predicted-positive African-American records are dropped and
      predicted-negative African-American records are duplicated;
    * predicted-positive Caucasian records are duplicated and
      predicted-negative Caucasian records are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``race``, ``two_year_recid`` (assumed 0/1), ``expl`` and the
        ``numerical`` columns. Index labels are assumed unique — TODO confirm.
    expl : str
        Explanatory column that defines the strata.
    numerical : list of str
        Columns passed to the ranker without encoding.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the dropped rows, followed by the duplicated rows (which
        keep their original index labels, so labels repeat in the result).
    """
    numerical = list(numerical)
    log_half = np.log(0.5)
    index_to_drop_a,index_to_add_a = [],[]
    index_to_drop_c,index_to_add_c = [],[]

    for attr in np.unique(df[expl]):
        X_df = df[df[expl]==attr]
        if len(X_df) < 2:
            # Too few records to train a ranker on this stratum.
            continue

        encoded = OneHotEncoder(drop='if_binary',sparse=False).fit_transform(
            X_df.drop(['race','two_year_recid',expl]+numerical,axis=1))
        X = np.hstack((encoded,X_df[numerical].to_numpy()))

        # Take the labels as they are. One-hot encoding them would turn a
        # stratum that only contains 0s into a column of 1s. The copy keeps
        # the injected label below from leaking back into `df`.
        y = X_df['two_year_recid'].to_numpy(copy=True)

        # LogisticRegression needs both classes: if the stratum has only one,
        # flip a single reproducibly chosen label. A local generator gives the
        # same draw as seeding the global one, without the side effect.
        rng = np.random.RandomState(0)
        if not y.any():
            y[rng.randint(low=0,high=len(y))] = 1
        elif y.all():
            y[rng.randint(low=0,high=len(y))] = 0

        clf_logistic = LogisticRegression(random_state=0,max_iter=1000).fit(X, y)
        log_p_positive = clf_logistic.predict_log_proba(X)[:,1]

        # Positions within the stratum, ascending by predicted P(label = 1).
        order = np.argsort(log_p_positive)
        ranked = log_p_positive[order]
        positive_index_sorted = order[ranked > log_half]
        negative_index_sorted = order[ranked < log_half]

        # Each group's budget is split evenly between dropping and duplicating.
        half_a = 0.5*delta(X_df,'African-American')
        half_c = 0.5*delta(X_df,'Caucasian')
        races = X_df['race'].to_numpy()
        row_labels = X_df.index
        count_drop_a,count_add_a,count_drop_c,count_add_c = 0,0,0,0

        # Predicted positives, weakest first (closest to the boundary).
        for pos in positive_index_sorted:
            if races[pos] == 'African-American' and count_drop_a < half_a:
                index_to_drop_a.append(row_labels[pos])
                count_drop_a += 1
            if races[pos] == 'Caucasian' and count_add_c < half_c:
                index_to_add_c.append(row_labels[pos])
                count_add_c += 1
            if count_add_c >= half_c and count_drop_a >= half_a:
                break

        # Predicted negatives, strongest first (closest to the boundary).
        for pos in negative_index_sorted[::-1]:
            if races[pos] == 'African-American' and count_add_a < half_a:
                index_to_add_a.append(row_labels[pos])
                count_add_a += 1
            if races[pos] == 'Caucasian' and count_drop_c < half_c:
                index_to_drop_c.append(row_labels[pos])
                count_drop_c += 1
            if count_drop_c >= half_c and count_add_a >= half_a:
                break

    # The collected values are index LABELS, so select with .loc; positional
    # .iloc is only equivalent when the index happens to be 0..n-1.
    df_added = df.loc[index_to_add_a+index_to_add_c]
    df_kept = df.drop(index_to_drop_a+index_to_drop_c)
    return pd.concat([df_kept,df_added])


## Baseline

#### With `race`

In [310]:
# Baseline design matrix WITH race: every non-numerical column of df_cleaned
# (race included) is one-hot encoded and priors_count is appended as the last
# column of X.
numerical = ['priors_count']
enc = OneHotEncoder(drop='if_binary',sparse=False)
categorical_columns = df_cleaned.drop(columns=['two_year_recid']+numerical)
X = np.hstack((enc.fit_transform(categorical_columns),df_cleaned[numerical].to_numpy()))
y = df_cleaned['two_year_recid'].to_numpy()

In [311]:
# Hold out 15% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [312]:
# 10-fold grid search over the SVM penalty C and the kernel, using every
# encoded feature (race included). SVC is imported in the notebook's import
# cell.
svc = SVC(random_state=0)
parameters = {'C':(0.5,0.75,1),
              'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
clf = GridSearchCV(svc, parameters,cv=10)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=SVC(random_state=0),
             param_grid={'C': (0.5, 0.75, 1),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [316]:
# Hyper-parameter combination with the best mean cross-validated score.
clf.best_params_

{'C': 0.5, 'kernel': 'linear'}

In [317]:
# Refit an SVM with the hyper-parameters the grid search selected (read from
# clf.best_params_ rather than copied by hand), then report cross-validated
# accuracy on the training split and accuracy on the held-out split.
best_clf = SVC(random_state=0, **clf.best_params_)
best_clf.fit(X_train,y_train)
y_test_pred = best_clf.predict(X_test)
print('10-fold cross validation accuracy is: ',cross_val_score(best_clf, X_train, y_train, cv=10).mean())
print('Test accuracy is: ',best_clf.score(X_test,y_test))

10-fold cross validation accuracy is:  0.9684380563064549
Test accuracy is:  0.9685807150595883


In [318]:
# Recover each test row's race from the one-hot block (every column of X_test
# except the last, which holds priors_count); race is the third column the
# encoder was fitted on.
race_test = enc.inverse_transform(X_test[:,:-1])[:,2]
aa_index = race_test == 'African-American'
ca_index = race_test == 'Caucasian'

# Per-group accuracy and the absolute gap between the two groups.
is_correct = y_test_pred == y_test
aa_acc = is_correct[aa_index].mean()
ca_acc = is_correct[ca_index].mean()
print('Test accuracy in African-American group: ', aa_acc)
print('Test accuracy in Caucasian group: ',ca_acc)
print('Accuracy difference in two groups: ', abs(aa_acc-ca_acc))

Test accuracy in African-American group:  0.963963963963964
Test accuracy in Caucasian group:  0.9755434782608695
Accuracy difference in two groups:  0.011579514296905513


#### Without `race`

In [319]:
# List the encoded column names to locate the race indicator: it is
# 'x2_Caucasian', i.e. column index 4 of the design matrix.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of get_feature_names_out(); kept as-is for the version this
# notebook was run with.
enc.get_feature_names()

array(['x0_Male', 'x1_25 - 45', 'x1_Greater than 45', 'x1_Less than 25',
       'x2_Caucasian', 'x3_M', 'x4_1', 'x5_1'], dtype=object)

In [320]:
# Same 10-fold grid search as before, but on the training matrix with the
# race indicator (column 4) removed. SVC is imported in the notebook's import
# cell.
svc = SVC(random_state=0)
parameters = {'C':(0.5,0.75,1),
              'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
clf = GridSearchCV(svc, parameters,cv=10)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
clf.fit(X_train[:,non_race_columns], y_train)

GridSearchCV(cv=10, estimator=SVC(random_state=0),
             param_grid={'C': (0.5, 0.75, 1),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [321]:
# Hyper-parameter combination with the best mean cross-validated score.
clf.best_params_

{'C': 0.5, 'kernel': 'linear'}

In [322]:
# Refit an SVM with the hyper-parameters the grid search selected (read from
# clf.best_params_ rather than copied by hand) on the matrices without the
# race indicator (column 4), then report accuracies.
best_clf = SVC(random_state=0, **clf.best_params_)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
X_train_no_race = X_train[:,non_race_columns]
X_test_no_race = X_test[:,non_race_columns]
best_clf.fit(X_train_no_race,y_train)
y_test_pred = best_clf.predict(X_test_no_race)
print('10-fold cross validation accuracy is: ',cross_val_score(best_clf, X_train_no_race, y_train, cv=10).mean())
print('Test accuracy is: ',best_clf.score(X_test_no_race,y_test))

10-fold cross validation accuracy is:  0.9684380563064549
Test accuracy is:  0.9685807150595883


In [323]:
# Recover each test row's race from the one-hot block (every column of X_test
# except the last); race is the third column the encoder was fitted on. The
# full X_test is used here because the race column is needed for grouping.
race_test = enc.inverse_transform(X_test[:,:-1])[:,2]
aa_index = race_test == 'African-American'
ca_index = race_test == 'Caucasian'

# Per-group accuracy and the absolute gap between the two groups.
is_correct = y_test_pred == y_test
aa_acc = is_correct[aa_index].mean()
ca_acc = is_correct[ca_index].mean()
print('Test accuracy in African-American group: ', aa_acc)
print('Test accuracy in Caucasian group: ',ca_acc)
print('Accuracy difference in two groups: ', abs(aa_acc-ca_acc))

Test accuracy in African-American group:  0.963963963963964
Test accuracy in Caucasian group:  0.9755434782608695
Accuracy difference in two groups:  0.011579514296905513


## Local Massaging

In [324]:
# Massage the labels within each age_cat stratum, then rebuild the design
# matrix the same way as for the baseline (one-hot block + priors_count).
# NOTE(review): the split is taken from the massaged frame, so y_test holds
# massaged labels rather than the original ones — confirm that is the
# intended evaluation target.
numerical = ['priors_count']
df_massaged = LocalMassaging(df_cleaned,'age_cat',numerical=numerical)
enc = OneHotEncoder(drop='if_binary',sparse=False)
categorical_columns = df_massaged.drop(columns=['two_year_recid']+numerical)
X = np.hstack((enc.fit_transform(categorical_columns),df_massaged[numerical].to_numpy()))
y = df_massaged['two_year_recid'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [325]:
# 10-fold grid search over the SVM penalty C and the kernel on the massaged
# training data. The model fitted and evaluated in this section does not see
# the race indicator, so the search is run on the same race-free matrix
# (column 4 removed — assumes the encoder's column order is unchanged from
# the baseline). SVC is imported in the notebook's import cell.
svc = SVC(random_state=0)
parameters = {'C':(0.5,0.75,1),
              'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
clf = GridSearchCV(svc, parameters,cv=10)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
clf.fit(X_train[:,non_race_columns], y_train)

GridSearchCV(cv=10, estimator=SVC(random_state=0),
             param_grid={'C': (0.5, 0.75, 1),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [326]:
# Hyper-parameter combination with the best mean cross-validated score.
clf.best_params_

{'C': 0.5, 'kernel': 'linear'}

In [327]:
# Refit an SVM with the hyper-parameters the grid search selected (read from
# clf.best_params_ rather than copied by hand) on the massaged data without
# the race indicator (column 4), then report accuracies.
best_clf = SVC(random_state=0, **clf.best_params_)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
X_train_no_race = X_train[:,non_race_columns]
X_test_no_race = X_test[:,non_race_columns]
best_clf.fit(X_train_no_race,y_train)
y_test_pred = best_clf.predict(X_test_no_race)
print('10-fold cross validation accuracy is: ',cross_val_score(best_clf, X_train_no_race, y_train, cv=10).mean())
print('Test accuracy is: ',best_clf.score(X_test_no_race,y_test))

10-fold cross validation accuracy is:  0.9250089741617401
Test accuracy is:  0.9187432286023836


In [328]:
# Recover each test row's race from the one-hot block (every column of X_test
# except the last); race is the third column the encoder was fitted on.
race_test = enc.inverse_transform(X_test[:,:-1])[:,2]
aa_index = race_test == 'African-American'
ca_index = race_test == 'Caucasian'

# Per-group accuracy and the absolute gap between the two groups.
is_correct = y_test_pred == y_test
aa_acc = is_correct[aa_index].mean()
ca_acc = is_correct[ca_index].mean()
print('Test accuracy in African-American group: ', aa_acc)
print('Test accuracy in Caucasian group: ',ca_acc)
print('Accuracy difference in two groups: ', abs(aa_acc-ca_acc))

Test accuracy in African-American group:  0.9225225225225225
Test accuracy in Caucasian group:  0.9130434782608695
Accuracy difference in two groups:  0.009479044261653025


## Local Preferential Sampling

In [329]:
# Resample within each age_cat stratum, then rebuild the design matrix the
# same way as for the baseline (one-hot block + priors_count).
# NOTE(review): df_resampled contains duplicated rows and the split is taken
# afterwards, so copies of one record can land in both the training and the
# test part — confirm that is acceptable for the reported test accuracy.
numerical = ['priors_count']
df_resampled = LocalPreferentialSampling(df_cleaned,'age_cat',numerical=numerical)
enc = OneHotEncoder(drop='if_binary',sparse=False)
categorical_columns = df_resampled.drop(columns=['two_year_recid']+numerical)
X = np.hstack((enc.fit_transform(categorical_columns),df_resampled[numerical].to_numpy()))
y = df_resampled['two_year_recid'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [330]:
# 10-fold grid search over the SVM penalty C and the kernel on the resampled
# training data. The model fitted and evaluated in this section does not see
# the race indicator, so the search is run on the same race-free matrix
# (column 4 removed — assumes the encoder's column order is unchanged from
# the baseline). SVC is imported in the notebook's import cell.
svc = SVC(random_state=0)
parameters = {'C':(0.5,0.75,1),
              'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
clf = GridSearchCV(svc, parameters,cv=10)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
clf.fit(X_train[:,non_race_columns], y_train)

GridSearchCV(cv=10, estimator=SVC(random_state=0),
             param_grid={'C': (0.5, 0.75, 1),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [331]:
# Refit an SVM with the hyper-parameters the grid search selected (read from
# clf.best_params_ rather than copied by hand) on the resampled data without
# the race indicator (column 4), then report accuracies.
best_clf = SVC(random_state=0, **clf.best_params_)
non_race_columns = [i for i in range(X_train.shape[1]) if i != 4]
X_train_no_race = X_train[:,non_race_columns]
X_test_no_race = X_test[:,non_race_columns]
best_clf.fit(X_train_no_race,y_train)
y_test_pred = best_clf.predict(X_test_no_race)
print('10-fold cross validation accuracy is: ',cross_val_score(best_clf, X_train_no_race, y_train, cv=10).mean())
print('Test accuracy is: ',best_clf.score(X_test_no_race,y_test))

10-fold cross validation accuracy is:  0.9674761726848494
Test accuracy is:  0.9750812567713976


In [332]:
# Recover each test row's race from the one-hot block (every column of X_test
# except the last); race is the third column the encoder was fitted on.
race_test = enc.inverse_transform(X_test[:,:-1])[:,2]
aa_index = race_test == 'African-American'
ca_index = race_test == 'Caucasian'

# Per-group accuracy and the absolute gap between the two groups.
is_correct = y_test_pred == y_test
aa_acc = is_correct[aa_index].mean()
ca_acc = is_correct[ca_index].mean()
print('Test accuracy in African-American group: ', aa_acc)
print('Test accuracy in Caucasian group: ',ca_acc)
print('Accuracy difference in two groups: ', abs(aa_acc-ca_acc))

Test accuracy in African-American group:  0.9768270944741533
Test accuracy in Caucasian group:  0.9723756906077348
Accuracy difference in two groups:  0.004451403866418513
