In [410]:
# Import required packages
import numpy as np
import pandas as pd
import cv2 as cv2
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import time
import scipy.optimize as optim

## 1) Load the dataset

For this project we are using the COMPAS-scores-two-years dataset, a COMPAS dataset that contains the criminal history, jail and prison time, demographics, and COMPAS risk scores for defendants from Broward county from 2013 and 2014, as well as the ground truth on whether or not these individuals actually recidivated within two years after the screening.  

There are 7214 data in total.

In [411]:
raw_data = pd.read_csv('../data/compas-scores-two-years.csv')

In [412]:
raw_data

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


## 2) Data processing

### 2.1) Subset of data with race "African-American" or "Caucasian"

We want to keep only the rows of the dataset that correspond to "African-American" or "Caucasian" race.

In [413]:
print("The dataset includes defendants of the following races: {}".format(raw_data['race'].unique()))

The dataset includes defendants of the following races: ['Other' 'African-American' 'Caucasian' 'Hispanic' 'Native American'
 'Asian']


In [433]:
processed_data = raw_data.loc[raw_data['race'].isin(["African-American", "Caucasian"])]

In [434]:
print("The original dataset includes {} African-American and Caucasian defendants.".format(processed_data.shape[0]))

The original dataset includes 6150 African-American and Caucasian defendants.


In [435]:
processed_data

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
6,8,edward riddle,edward,riddle,2014-02-19,Male,1974-07-23,41,25 - 45,Caucasian,...,2,Low,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1
8,10,elizabeth thieme,elizabeth,thieme,2014-03-16,Female,1976-06-03,39,25 - 45,Caucasian,...,1,Low,2014-03-16,2014-03-15,2014-03-18,0,2,747,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,10994,jarred payne,jarred,payne,2014-05-10,Male,1985-07-31,30,25 - 45,African-American,...,2,Low,2014-05-10,2015-10-22,2015-10-22,0,0,529,1,1
7208,10995,raheem smith,raheem,smith,2013-10-20,Male,1995-06-28,20,Less than 25,African-American,...,9,High,2013-10-20,2014-04-07,2014-04-27,0,0,169,0,0
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0


### 2.2) Remove unuseful data

Remove unuseful columns (columns with multiple missing data).

In [436]:
processed_data = processed_data[['sex', 'age', 'age_cat', 'race', 'decile_score', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
            'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_charge_degree', 'is_recid', 
             'score_text', 'two_year_recid']]

In [437]:
processed_data

Unnamed: 0,sex,age,age_cat,race,decile_score,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_jail_in,c_jail_out,c_charge_degree,is_recid,score_text,two_year_recid
1,Male,34,25 - 45,African-American,3,0,0,0,0,-1.0,2013-01-26 03:45:27,2013-02-05 05:36:53,F,1,Low,1
2,Male,24,Less than 25,African-American,4,0,0,1,4,-1.0,2013-04-13 04:58:34,2013-04-14 07:02:04,F,1,Low,1
3,Male,23,Less than 25,African-American,8,0,1,0,1,,,,F,0,High,0
6,Male,41,25 - 45,Caucasian,6,0,0,0,14,-1.0,2014-02-18 05:08:24,2014-02-24 12:18:30,F,1,Medium,1
8,Female,39,25 - 45,Caucasian,1,0,0,0,0,-1.0,2014-03-15 05:35:34,2014-03-18 04:28:46,M,0,Low,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,Male,30,25 - 45,African-American,2,0,0,0,0,-1.0,2014-05-09 10:01:33,2014-05-10 08:28:12,M,1,Low,1
7208,Male,20,Less than 25,African-American,9,0,0,0,0,-1.0,2013-10-19 11:17:15,2013-10-20 08:13:06,F,0,High,0
7209,Male,23,Less than 25,African-American,7,0,0,0,0,-1.0,2013-11-22 05:18:27,2013-11-24 02:59:20,F,0,Medium,0
7210,Male,23,Less than 25,African-American,3,0,0,0,0,-1.0,2014-01-31 07:13:54,2014-02-02 04:03:52,F,0,Low,0


According to the ProPublica COMPAS notebook (https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb) there are a number of reasons to remove rows because of missing data:
- If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we can assume that because of data quality reasons, that we do not have the right offense.
- The recidivist flag (is_recid) should be -1 if we could not find a compas case at all.
- Ordinary traffic offenses (c_charge_degree = 'O') will not result in Jail time and hence are removed (only two of them).
- We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

In [438]:
# If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, 
# we can assume that because of data quality reasons, that we do not have the right offense.

processed_data = processed_data.loc[processed_data['days_b_screening_arrest'] <= 30]
processed_data = processed_data.loc[processed_data['days_b_screening_arrest'] >= -30]

In [439]:
# The recidivist flag (is_recid) should be -1 if we could not find a compas case at all.

processed_data = processed_data.loc[processed_data['is_recid'] != -1]

In [440]:
# Ordinary traffic offenses (c_charge_degree = 'O') will not result in Jail time and hence are removed 
# (only two of them).

processed_data = processed_data.loc[processed_data['c_charge_degree'] != 'O']

In [441]:
# score_text shouldn't be 'N/A'

processed_data = processed_data.loc[processed_data['score_text'] != 'N/A']

In [442]:
processed_data['length_of_stay'] = (pd.to_datetime(processed_data['c_jail_out'])-pd.to_datetime(processed_data['c_jail_in'])).apply(lambda x: x.days)

In [443]:
processed_data = processed_data.drop(columns=['c_jail_in', 'c_jail_out'])

### 2.3) Create indicator values out of columns

In [444]:
# replace the values of the sensitive attribute race as follows: Caucasian -> 1, African-American -> 0
processed_data = processed_data.replace({'race': 'Caucasian'}, 1)
processed_data = processed_data.replace({'race': 'African-American'}, 0)

In [445]:
# replace the values of sex as follows
processed_data = processed_data.replace({'sex': 'Male'}, 1)
processed_data = processed_data.replace({'sex': 'Female'}, 0)

# replace the values of age_cat as follows
processed_data = processed_data.replace({'age_cat': 'Less than 25'}, 0)
processed_data = processed_data.replace({'age_cat': '25 - 45'}, 1)
processed_data = processed_data.replace({'age_cat': 'Greater than 45'}, 2)

# replace the values of c_charge_degree as follows
processed_data = processed_data.replace({'c_charge_degree': 'F'}, 0)
processed_data = processed_data.replace({'c_charge_degree': 'M'}, 1)

# replace the values of score_text as follows
processed_data = processed_data.replace({'score_text': 'Low'}, 0)
processed_data = processed_data.replace({'score_text': 'Medium'}, 1)
processed_data = processed_data.replace({'score_text': 'High'}, 2)

### 2.4) Check for NaN values

In [446]:
# check whether there are NaN values in the final dataset as well as the number of unique values per column

unique_NAN_df = pd.DataFrame(columns=['column name', '# of unique values', '# of NaN values'])
for item in processed_data.columns:
    unique_NAN_df = unique_NAN_df.append({
        'column name': item, 
        '# of unique values': len(processed_data[item].unique()),
        '# of NaN values': sum(processed_data[item].isna() == True)}, ignore_index = True)
    
unique_NAN_df = unique_NAN_df.style.hide_index()
unique_NAN_df

column name,# of unique values,# of NaN values
sex,2,0
age,62,0
age_cat,3,0
race,2,0
decile_score,10,0
juv_fel_count,9,0
juv_misd_count,10,0
juv_other_count,8,0
priors_count,36,0
days_b_screening_arrest,56,0


In [447]:
# move two_year_recid to the end

cols = list(processed_data.columns.values)
cols.pop(cols.index('two_year_recid'))
processed_data = processed_data[cols+['two_year_recid']]

In [448]:
# move race to the first column

race_column = processed_data.pop('race')
processed_data.insert(0, 'race', race_column)

In [449]:
processed_data

Unnamed: 0,race,sex,age,age_cat,decile_score,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_charge_degree,is_recid,score_text,length_of_stay,two_year_recid
1,0,1,34,1,3,0,0,0,0,-1.0,0,1,0,10,1
2,0,1,24,0,4,0,0,1,4,-1.0,0,1,0,1,1
6,1,1,41,1,6,0,0,0,14,-1.0,0,1,1,6,1
8,1,0,39,1,1,0,0,0,0,-1.0,1,0,0,2,0
10,1,1,27,1,4,0,0,0,0,-1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,0,1,30,1,2,0,0,0,0,-1.0,1,1,0,0,1
7208,0,1,20,0,9,0,0,0,0,-1.0,0,0,2,0,0
7209,0,1,23,0,7,0,0,0,0,-1.0,0,0,1,1,0
7210,0,1,23,0,3,0,0,0,0,-1.0,0,0,0,1,0


In [587]:
processed_data = processed_data.drop(columns=['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count'])

In [588]:
# save final data set to csv

processed_data.to_csv("../output/processed-compas-scores-two-years.csv", index=False)

## 3) Split data

We will first get the labels and the sensitive data.

In [589]:
data = np.array(processed_data)
y = np.array(data[:,-1]).flatten()
data = data[:,:-1]
sensitive = data[:,0]
data = preprocessing.scale(data)
data = data[:,1:]

In [590]:
len(data[0])

9

Split data into sensitive and nonsensitive data (sensitive --> race: Caucasian)

In [591]:
sensitive_idx = np.array(np.where(sensitive==1))[0].flatten()
nonsensitive_idx = np.array(np.where(sensitive!=1))[0].flatten()
data_sensitive = data[sensitive_idx,:]
data_nonsensitive = data[nonsensitive_idx,:]
y_sensitive = y[sensitive_idx]
y_nonsensitive = y[nonsensitive_idx]

Split data into training, validation, and testing sets (training: validation: testing = 6:2:2).

In [592]:
# split sensitive data into training, validation, and testing sets

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(data_sensitive, y_sensitive, test_size= 0.2, random_state=42)
X_train_s, X_valid_s, y_train_s, y_valid_s = train_test_split(X_train_s, y_train_s, test_size = 0.25, random_state=42)

In [593]:
# split non-sensitive data into training, validation, and testing sets

X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(data_nonsensitive, y_nonsensitive, test_size= 0.2, random_state=42)
X_train_n, X_valid_n, y_train_n, y_valid_n = train_test_split(X_train_n, y_train_n, test_size = 0.25, random_state=42)

In [594]:
# create final training, validation, and testing sets

X_train = np.concatenate((X_train_s, X_train_n))
X_valid = np.concatenate((X_valid_s, X_valid_n))
X_test = np.concatenate((X_test_s, X_test_n))

Y_train = np.concatenate((y_train_s, y_train_n))
Y_valid = np.concatenate((y_valid_s, y_valid_n))
Y_test = np.concatenate((y_test_s, y_test_n))

##  4) Learning Fair Representations (LFR)

In [595]:
# this function returns the distance matrix
def distances(X, v, alpha, N, P, k):
    dists = np.zeros((N, k))
    for i in range(N):
        for p in range(P):
            for j in range(k):    
                dists[i, j] += (X[i, p] - v[j, p]) * (X[i, p] - v[j, p]) * alpha[p]
    return dists

# this function returns the M_nk
def M_nk(dists, N, k):
    M_nk = np.zeros((N, k))
    exp = np.zeros((N, k))
    denom = np.zeros(N)
    for i in range(N):
        for j in range(k):
            exp[i, j] = np.exp(-1 * dists[i, j])
            denom[i] += exp[i, j]
        for j in range(k):
            if denom[i]:
                M_nk[i, j] = exp[i, j] / denom[i]
            else:
                M_nk[i, j] = exp[i, j] / 1e-6
    return M_nk
 
# this function returns the M_k array
def M_k(M_nk, N, k):
    M_k = np.zeros(k)
    for j in range(k):
        for i in range(N):
            M_k[j] += M_nk[i, j]
        M_k[j] /= N
    return M_k

# this function reconstructs of X to x_n_hat and L_x
def x_n_hat(X, M_nk, v, N, P, k):
    x_n_hat = np.zeros((N, P))
    L_x = 0.0
    for i in range(N):
        for p in range(P):
            for j in range(k):
                x_n_hat[i, p] += M_nk[i, j] * v[j, p]
            L_x += (X[i, p] - x_n_hat[i, p]) * (X[i, p] - x_n_hat[i, p])
    return x_n_hat, L_x

# this function returns a list of prediction
def yhat(M_nk, y, w, N, k):
    yhat = np.zeros(N)
    L_y = 0.0
    for i in range(N):
        for j in range(k):
            yhat[i] += M_nk[i, j] * w[j]
        yhat[i] = 1e-6 if yhat[i] <= 0 else yhat[i]
        yhat[i] = 0.999 if yhat[i] >= 1 else yhat[i]
        L_y += -1 * y[i] * np.log(yhat[i]) - (1.0 - y[i]) * np.log(1.0 - yhat[i])
    return yhat, L_y

In [599]:
# this function returns the objective function we want to minimize
def LFR_objective(params, data_sensitive, data_nonsensitive, y_sensitive, 
        y_nonsensitive,  k=10, A_x = 1e-4, A_y = 0.1, A_z = 1000):
    LFR_objective.iters += 1 
    Ns, P = data_sensitive.shape
    Nns, _ = data_nonsensitive.shape
    
    alpha0 = params[:P]
    alpha1 = params[P : 2 * P]
    w = params[2 * P : (2 * P) + k]
    v = np.matrix(params[(2 * P) + k:]).reshape((k, P))
        
    dists_sensitive = distances(data_sensitive, v, alpha0, Ns, P, k)
    dists_nonsensitive = distances(data_nonsensitive, v, alpha1, Nns, P, k)

    M_nk_sensitive = M_nk(dists_sensitive, Ns, k)
    M_nk_nonsensitive = M_nk(dists_nonsensitive, Nns, k)
    
    M_k_sensitive = M_k(M_nk_sensitive, Ns, k)
    M_k_nonsensitive = M_k(M_nk_nonsensitive, Nns, k)
    
    L_z = 0.0
    for j in range(k):
        L_z += abs(M_k_sensitive[j] - M_k_nonsensitive[j])

    x_n_hat_sensitive, L_x_sen = x_n_hat(data_sensitive, M_nk_sensitive, v, Ns, P, k)
    x_n_hat_nonsensitive, L_x_nsen = x_n_hat(data_nonsensitive, M_nk_nonsensitive, v, Nns, P, k)
    L_x = L_x_sen + L_x_nsen

    yhat_sensitive, L_y_sen = yhat(M_nk_sensitive, y_sensitive, w, Ns, k)
    yhat_nonsensitive, L_y_nsen = yhat(M_nk_nonsensitive, y_nonsensitive, w, Nns, k)
    L_y = L_y_sen + L_y_nsen

    objective = A_x * L_x + A_y * L_y + A_z * L_z
    return objective

LFR_objective.iters = 0

In [600]:
def LFR(X_train_s, X_train_n, y_train_s, y_train_n, K=10, A_x = 1e-4, A_y = 0.1, A_z = 1000):
    rez = np.random.uniform(size=data.shape[1] * 2 + K + data.shape[1] * K)
    bnd = []
    for i, k2 in enumerate(rez):
        if i < data.shape[1] * 2 or i >= data.shape[1] * 2 + K:
            bnd.append((None, None))
        else:
            bnd.append((0, 1))
    
    # minimize the metric by parameters alpha, w and v
    para, min_L, d = optim.fmin_l_bfgs_b(LFR_objective, x0=rez, epsilon=1e-5, 
                                         args=(X_train_s, X_train_n, y_train_s, y_train_n, K, A_z, A_x, A_y), 
                                         bounds = bnd, approx_grad=True, 
                                         maxfun=150, maxiter=150)
    
    return para

In [601]:
start = time.time()
final_parameters = LFR(X_train_s, X_train_n, y_train_s, y_train_n, 10, 0.3, 0.3, 0.4)
print( f"Total training time: {time.time() - start}")

# TODO: takes too much time --> we should keep only a few columns

Total training time: 295.70304012298584


In [604]:
final_parameters

array([ 0.72418526,  1.10281443,  0.83385323,  1.01941046,  0.75045468,
        0.66180699,  0.38657395,  0.41670306,  0.61934767,  0.74703014,
        1.22999615,  0.93824682,  1.18782623,  0.78592103,  0.73697726,
        0.57994651,  0.87899898,  0.8734584 ,  0.72098701,  0.69640556,
        0.62336661,  0.19376819,  0.42367338,  0.18366534,  0.80244202,
        0.62497762,  0.93647379,  0.3573076 , -0.03019849, -1.55353123,
        0.53165918, -0.486171  , -0.0405762 , -0.36268593,  0.29000827,
       -0.00825785, -0.60782731,  1.08676948,  0.86120349,  0.18026651,
        1.25589241,  0.7287432 ,  0.63810597, -0.08902166,  0.74436267,
        0.98297588,  0.12718441,  0.64386292,  0.02466943,  0.81018697,
        0.38421418, -0.30615283,  0.70073279, -0.17998717,  0.27579651,
        1.1425859 ,  0.29769654,  0.3361893 ,  1.04741125,  1.01496017,
        0.73369038,  0.18771022,  0.19761567,  0.99087182, -0.75896973,
       -0.63475463, -0.92934965, -1.10451039,  0.58718814,  0.54

## 5) Evaluation