This notebook trains a logistic regression model based on fairness constraints outlined in this paper:
https://arxiv.org/pdf/1507.05259.pdf
on COMPAS data, using race as the sensitive attribute

In [1]:
import numpy as np
from numpy.core.fromnumeric import transpose
import pandas as pd
df = pd.read_csv('COMPAS_preprocessed.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,age,c_charge_degree,race,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,c_jail_in,c_jail_out,days_in_jail
0,1,34,0,1,-1,1,0,-1.0,3,1,1,2013-01-26 03:45:27,2013-02-05 05:36:53,10
1,2,24,0,1,-1,1,4,-1.0,4,1,1,2013-04-13 04:58:34,2013-04-14 07:02:04,1
2,6,41,0,0,0,1,14,-1.0,6,1,1,2014-02-18 05:08:24,2014-02-24 12:18:30,6
3,8,39,1,0,-1,0,0,-1.0,1,0,0,2014-03-15 05:35:34,2014-03-18 04:28:46,2
4,10,27,0,0,-1,1,0,-1.0,4,0,0,2013-11-25 06:31:06,2013-11-26 08:26:57,1


We've imported our dataset, and we only select the columns we need, as well as our target variable: two_year_recid
Feature seelction is based on this notebook:
https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb

In [2]:
cols =  ['age', 'c_charge_degree', 'race', 'score_text', 'sex',
       'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
       'two_year_recid', 'days_in_jail']
df = df[cols]
df.head(5)

Unnamed: 0,age,c_charge_degree,race,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,days_in_jail
0,34,0,1,-1,1,0,-1.0,3,1,1,10
1,24,0,1,-1,1,4,-1.0,4,1,1,1
2,41,0,0,0,1,14,-1.0,6,1,1,6
3,39,1,0,-1,0,0,-1.0,1,0,0,2
4,27,0,0,-1,1,0,-1.0,4,0,0,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
#only select features, minus target and sensitive attribute
col_X = ['age', 'c_charge_degree', 'score_text', 'sex',
       'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 
        'days_in_jail']
X = df[col_X]
Z = df['race']
y = df['two_year_recid']
X_train, X_test, Z_train, Z_test, y_train, y_test = train_test_split( X, Z, y, test_size=0.33, random_state=42)

here we normalize our data, as well as set other parameters

In [None]:
k = X_train.shape[1]
N =  X_train.shape[0]
theta = np.array(np.random.uniform(size=k))

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
scaler.transform(X_train)

array([[ 1.10233552,  1.38213171, -0.87509104, ..., -1.28278951,
        -1.02057358, -0.29957006],
       [ 0.3235945 , -0.72352005,  1.63246318, ...,  1.86062263,
         0.97984116, -0.2366014 ],
       [-0.28209296, -0.72352005,  1.63246318, ...,  1.1620866 ,
         0.97984116, -0.29957006],
       ...,
       [-0.62820008, -0.72352005,  0.37868607, ...,  0.46355057,
         0.97984116, -0.27858051],
       [ 0.84275518, -0.72352005,  0.37868607, ...,  0.11428255,
         0.97984116, -0.29957006],
       [-0.02251262, -0.72352005,  0.37868607, ...,  0.46355057,
         0.97984116, -0.32055961]])

We use scikit learn's framework for optimization:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html
https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html#scipy.optimize.minimize
https://scipy-lectures.org/advanced/mathematical_optimization/auto_examples/plot_non_bounds_constraints.html

Each of the constraints required for the framework to work needs to be programmed s.t. >= 0 for it to be true.

In [None]:
import scipy
from scipy.optimize import minimize
from numpy import linalg as LA

#equations 6 from the paper
def upper_theta_constraint(theta, X, Z, c):
  return ((-1/10) * np.matmul(np.matmul(transpose(Z - Z.mean()), X), theta)) + c

def lower_theta_constraint(theta, X, Z, c):
  return ((1/10) * (np.matmul(np.matmul(transpose(Z - Z.mean()), X), theta))) + c  

def logreg_loss(theta,X):
  odds = 1/(1 + np.exp(np.matmul(X, theta)))
  return (-1) * sum(np.log(odds))

res = scipy.optimize.minimize(logreg_loss, args=(scaler.transform(X_train)), x0=theta,
                        method='SLSQP', 
                        constraints=({'type': 'ineq', 'fun': upper_theta_constraint, 'args': (scaler.transform(X_train), Z_train , 0.8)},
                                     {'type': 'ineq', 'fun': lower_theta_constraint, 'args': (scaler.transform(X_train), Z_train , 0.8)}),
            options={"maxiter": 100000})



Now we have our parameters, we'll start to evaluate the four evaluation metrics after normalizing our test data

In [5]:
theta = res.x
"""
theta = [-1.54444915e-06, -1.01807221e-06,  1.96578786e-05, -3.34814076e-06,
        2.13276951e-06, -5.72891800e-07, -1.79066259e-05, -1.49020887e-06,
       -3.01595836e-06]
"""

In [7]:
scaler = preprocessing.StandardScaler().fit(X_test)
X_scaled_test = scaler.transform(X_test)

In [8]:
"""calculate parity P(Y_hat = 1 | S = 0) = P(Y_hat = 1 | S = 1)"""

#1) segment test dataset into sensitive and non sensitive
X_pos = scaler.transform(X_test.loc[(Z==1)])
X_neg = scaler.transform(X_test.loc[(Z==0)])
y_pos = y_test.loc[(Z==1)]
y_neg = y_test.loc[(Z==0)]


#2) calculate Y_hat for neg & pos
odds_pos = 1/(1+np.exp(np.matmul(X_pos, theta)))
odds_neg = 1/(1+np.exp(np.matmul(X_neg, theta)))
#3) find y_hat = 1 for both neg and positive
len(odds_pos[odds_pos > 0.5])/len(odds_pos), len(odds_neg[odds_neg > 0.5])/len(odds_neg)

(0.4830827067669173, 0.45132743362831856)

#parity (eval #1)
P(Yhat = 1 | sensitive attribute = 1) = 48.3%
P(Yhat = 1 | sensitive attribute = 0) = 45.1%
The algorithm does a close job of predicting chances of recid after two years for both groups

In [None]:
"""evaluation metric 2: equality of odds"""

#protected: y = 1, then y =0 
print(len(odds_pos[(y_pos > 0.5) & (odds_pos > 0.5)]) / len(odds_pos), len(odds_pos[(y_pos <= 0.5) & (odds_pos > 0.5)]) / len(odds_pos)) #0.5122180451127819 0.3746312684365782
#nonprotected: y = 1, then y =0 
print(len(odds_neg[(y_neg > 0.5) & (odds_neg > 0.5)]) / len(odds_neg), len(odds_neg[(y_neg <= 0.5) & (odds_neg > 0.5)]) / len(odds_neg))

0.2819548872180451 0.20112781954887218
0.22566371681415928 0.22566371681415928


#equality of odds (eval #2)
For when true value of y = 0
the model predicts the sensitive group accurately 20.1% of the time, and 22.5% accurately for the non-protected group

For when y = 1
the model predicts the sensitive group accurately 28.2% of the time, and 22.6% accurately for the non-protected group

The model is still more likely to predict the protect group as y = 1, but this disparity is not seen with the non-protected group

In [None]:
"""evaluation metric 3: calibration"""


calibration_pos = len(odds_pos[((odds_pos > 0.5) & (y_pos > 0.5)) |
    ((odds_pos <= 0.5) & (y_pos <= 0.5))])/len(odds_pos)
calibration_neg = len(odds_neg[((odds_neg > 0.5) & (y_neg > 0.5)) |
    ((odds_neg <= 0.5) & (y_neg <= 0.5))])/len(odds_neg)
calibration_pos, calibration_neg   

(0.568609022556391, 0.6253687315634219)

#eval 3: calibration
The protected group is 56.8% calibrated, and the non protected group is 62.5% calibrated -- meaning the model is less likely to predict the protected group correctly but by a somewhat small margin

# eval 4: accuracy
We see below that the model is 59.1% accurate

In [13]:
total_odds = 1/(1+np.exp(np.matmul(scaler.transform(X_test), theta)))
total_odds[total_odds > 0.5] = 1
total_odds[total_odds <= 0.5] = 0
len(total_odds[total_odds == y_test])/len(y_test)

0.5907003444316877