This notebook trains a constrained SVM model based on fairness constraints outlined in this paper: https://arxiv.org/pdf/1507.05259.pdf on COMPAS data, using race as the sensitive attribute

In [1]:
import numpy as np
from numpy.core.fromnumeric import transpose
import pandas as pd

In [2]:
# df = pd.read_csv('COMPAS_preprocessed.csv')
# df.head(5)
df = pd.read_csv('compas-scores-two-years.csv')
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_case_number,c_offense_date,c_arrest_date,c_days_from_compas,c_charge_degree,c_charge_desc,is_recid,r_case_number,r_charge_degree,r_days_from_arrest,r_offense_date,r_charge_desc,r_jail_in,r_jail_out,violent_recid,is_violent_recid,vr_case_number,vr_charge_degree,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score.1,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,0,1,0,0,0,-1.0,2013-08-13 06:03:42,2013-08-14 05:41:20,13011352CF10A,2013-08-13,,1.0,F,Aggravated Assault w/Firearm,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-08-14,Risk of Violence,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,0,3,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,13001275CF10A,2013-01-26,,1.0,F,Felony Battery w/Prior Convict,1,13009779CF10A,(F3),,2013-07-05,Felony Battery (Dom Strang),,,,1,13009779CF10A,(F3),2013-07-05,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,2013-01-27,Risk of Violence,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,0,4,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,13005330CF10A,2013-04-13,,1.0,F,Possession of Cocaine,1,13011511MM10A,(M1),0.0,2013-06-16,Driving Under The Influence,2013-06-16,2013-06-16,,0,,,,,Risk of Recidivism,4,Low,2013-04-14,Risk of Violence,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,0,8,1,0,1,,,,13000570CF10A,2013-01-12,,1.0,F,Possession of Cannabis,0,,,,,,,,,0,,,,,Risk of Recidivism,8,High,2013-01-13,Risk of Violence,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,0,1,0,0,2,,,,12014130CF10A,,2013-01-09,76.0,F,arrest case no charge,0,,,,,,,,,0,,,,,Risk of Recidivism,1,Low,2013-03-26,Risk of Violence,1,Low,2013-03-26,,,2,0,1102,0,0


In [3]:
#data wrangling 
df['race'] = df['race'].replace('African-American', 1).replace('Caucasian', 0)

df = df[(df['race'] == 0) | (df['race'] == 1)]

df['sex'] = df['sex'].replace('Male', 1).replace('Female', 0)

df['score_text'] = df['score_text'].replace('High', 1).replace('Medium', 0).replace('Low', -1)

df['c_charge_degree'] = df['c_charge_degree'].replace('M',1).replace('F',0)

df['days_in_jail'] = (pd.to_datetime(df['c_jail_out'])-pd.to_datetime(df['c_jail_in'])).dt.days

In [4]:
# cols =  ['age', 'c_charge_degree', 'race', 'score_text', 'sex',
#        'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
#        'two_year_recid', 'days_in_jail']
# df = df[cols]
# df.head(5)

We've imported our dataset, and we only select the columns we need, as well as our target variable: two_year_recid and race. The data column is consistent with our LFR model, in which the data selection is based on data correlation.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# col_X = ['age', 'c_charge_degree', 'score_text', 'sex',
#        'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 
#         'days_in_jail']
col_X=['age', 'c_charge_degree', 'score_text', 'sex','c_days_from_compas','is_violent_recid','v_decile_score',
       'priors_count','juv_fel_count', 'juv_misd_count','juv_other_count','days_b_screening_arrest', 'decile_score', 'is_recid']
X = df[col_X]
Z = df['race']
y = df['two_year_recid']
X_train, X_test, Z_train, Z_test, y_train, y_test = train_test_split( X, Z, y, test_size=0.33, random_state=42)

In [6]:
scaler_train = preprocessing.StandardScaler().fit(X_train)
scaler_test = preprocessing.StandardScaler().fit(X_test)

We use scikit learn's framework for optimization: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html#scipy.optimize.minimize https://scipy-lectures.org/advanced/mathematical_optimization/auto_examples/plot_non_bounds_constraints.html

Each of the constraints required for the framework to work needs to be programmed s.t. >= 0 for it to be true.

In [7]:
import scipy
from scipy.optimize import minimize
from numpy import linalg as LA

k = X_train.shape[1]
N = X_train.shape[0]

#equation 9 from paper
def upper_theta_constraint(params, X, Z, c, k):
    theta = params[:k]
    return ((-1/len(Z)) * np.matmul(np.matmul(transpose(Z - Z.mean()), X), theta)) + c

def lower_theta_constraint(params, X, Z, c, k):
    theta = params[:k]
    return ((np.matmul(np.matmul(transpose(Z - Z.mean()), X), theta))/len(Z)) + c  

def svm_loss(params, X, y, C, k):
    theta = params[:k]
    phi = params[k:]
    y_hat = y * np.dot(X,theta) 
    y_hat = np.maximum(np.zeros_like(y_hat), (1-y_hat)) 
    
    return C*sum(y_hat)

def phi_constraint(params,k):
  #theta = params[:k]
    phi = params[k:]

    return phi 

def phi_constraint2(params, k, y , X):
    theta = params[:k]
    phi = params[k:]
    return np.dot( transpose(theta), np.matmul(y,X)) - 1 + sum(phi)

In [8]:
theta = np.array(np.random.uniform(size=k)).reshape(-1, 1)
phi = np.array(np.random.uniform(size=N))
params = np.append(theta.flatten(), phi.flatten())
X_train_scaled = scaler_train.transform(X_train)
res = scipy.optimize.minimize(svm_loss, x0=params, args=(X_train_scaled, y_train, 0.8, k), 
                        method='SLSQP', 
                        constraints=(
                            {'type': 'ineq', 'fun': upper_theta_constraint, 'args': (X_train_scaled, Z_train , 0.8, k)},
                            {'type': 'ineq', 'fun': lower_theta_constraint, 'args': (X_train_scaled, Z_train , 0.8, k)},
                            {'type': 'ineq', 'fun': phi_constraint, 'args': [k]},
                            {'type': 'ineq', 'fun': phi_constraint2, 'args': [k, y_train, scaler_train.transform(X_train)]}
                                    ))

In [9]:
pd.DataFrame(res.x).to_csv('svm_result.csv')

In [10]:
import scipy
from scipy.optimize import minimize
from numpy import linalg as LA

k = X_train_scaled.shape[1]
N = X_train_scaled.shape[0]

params_hat = pd.read_csv('svm_result.csv')
theta_hat = params_hat['0'][0:k]

Now we evaluate for our evaluation metrics on our test accordingly.

In [11]:
X_test_scaled = scaler_test.transform(X_test)
yhat_test = np.matmul(theta_hat, np.transpose(X_test_scaled))

In [12]:
yhat_test[yhat_test < 0]

array([-0.23782367, -3.61548941, -1.8838733 , ..., -0.75211819,
       -0.35780863, -0.62709293])

# eval metric 1: Accuracy
Now we evaluate for accuracy

In [27]:
len(yhat_test[((yhat_test > 0) & (y_test > 0)) | ((yhat_test <= 0) & (y_test <= 0))])/len(yhat_test)

0.7024630541871921

# eval metric 2: Calibration 
Now we evaluate for calibration

In [14]:
odds_pos = yhat_test[Z_test == 1]
odds_neg = yhat_test[Z_test == 0]
y_pos = y_test[Z_test == 1]
y_neg = y_test[Z_test == 0]

calibration_pos = len(odds_pos[((odds_pos > 0) & (y_pos > 0)) |
    ((odds_pos <= 0) & (y_pos <= 0))])/len(odds_pos)
calibration_neg = len(odds_neg[((odds_neg > 0) & (y_neg > 0)) |
    ((odds_neg <= 0) & (y_neg <= 0))])/len(odds_neg)
calibration_pos, calibration_neg   

(0.6884709730171709, 0.7236679058240396)

The sensitive group is 70.6% calibrated and the the nonprotected group is 72.1% accurate, meaning the model is able to accurately identify both groups

# eval metric 3: equality of odds

In [15]:
#protected: y = 1, then y =0 
print(len(y_pos[(odds_pos > 0) & (y_pos > 0)]) / len(y_pos[y_pos > 0]), len(y_pos[(odds_pos > 0) & (y_pos <= 0)]) / len(y_pos[y_pos <= 0]))
#nonprotected: y = 1, then y =0 
print(len(y_neg[(odds_neg > 0) & (y_neg > 0)]) / len(y_neg[y_neg > 0]), len(y_neg[(odds_neg > 0) & (y_neg <= 0)]) / len(y_neg[y_neg <= 0]))

0.6937212863705973 0.22982456140350876
0.48184818481848185 0.08134920634920635


The CSVM model is able to predict protected groups within the postive target variable 72.1% of the time, and 52.4 for non protected group. For the negative target, it has 22.3% for the protected group and 11.1% for the non protected group

# eval metric 4: parity

In [16]:
(len(odds_neg[odds_neg > 0])/len(odds_neg),
  len(odds_pos[odds_pos > 0])/len(odds_pos))

(0.23172242874845106, 0.47751430907604253)

The model predict that 26.6% of the nonprotected group will return to criminal behaviour and 48.9% of the protected will do so as well. this is slightly deviate to the true values: 37.5% and 53.4% respectively.

In [17]:
(len(y_neg[y_neg > 0])/len(y_neg),
  len(y_pos[y_pos > 0])/len(y_pos))

(0.3754646840148699, 0.5339329517579722)