# Data Cleaning

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
# Read the COMPAS two-year scores CSV from ../data into a DataFrame.
compas = pd.read_csv(filepath_or_buffer="../data/compas-scores-two-years.csv")

In [4]:
# Show the column labels of the loaded table.
compas.columns

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [5]:
# Keep only the columns the rest of the notebook works with: race, age bracket,
# charge degree, sex, prior count, the two jail timestamps and the label.
kept_columns = [
    'race', 'age_cat', 'c_charge_degree', 'sex',
    'priors_count', 'c_jail_in', 'c_jail_out', 'two_year_recid',
]
compas = compas[kept_columns]

In [6]:
# Discard every row that has a missing value in any column.
compas = compas.dropna(axis=0, how="any")

In [7]:
# Display the table after column selection and removal of rows with missing values.
compas

Unnamed: 0,race,age_cat,c_charge_degree,sex,priors_count,c_jail_in,c_jail_out,two_year_recid
0,Other,Greater than 45,F,Male,0,2013-08-13 06:03:42,2013-08-14 05:41:20,0
1,African-American,25 - 45,F,Male,0,2013-01-26 03:45:27,2013-02-05 05:36:53,1
2,African-American,Less than 25,F,Male,4,2013-04-13 04:58:34,2013-04-14 07:02:04,1
5,Other,25 - 45,M,Male,0,2013-11-30 04:50:18,2013-12-01 12:28:56,0
6,Caucasian,25 - 45,F,Male,14,2014-02-18 05:08:24,2014-02-24 12:18:30,1
...,...,...,...,...,...,...,...,...
7209,African-American,Less than 25,F,Male,0,2013-11-22 05:18:27,2013-11-24 02:59:20,0
7210,African-American,Less than 25,F,Male,0,2014-01-31 07:13:54,2014-02-02 04:03:52,0
7211,Other,Greater than 45,F,Male,0,2014-01-13 05:48:01,2014-01-14 07:49:46,0
7212,African-American,25 - 45,M,Female,3,2014-03-08 08:06:02,2014-03-09 12:18:04,0


In [7]:

# Encode each feature on a small numeric scale:
#   age_cat      : 0 = "Less than 25", 0.5 = "25 - 45", 1 = any other value
#                  ("Greater than 45" in this data set).
#   charge_cat   : 1 = felony ("F"), 0 = any other value (misdemeanor, "M").
#   gender_cat   : 1 = "Female", 0 = any other value ("Male").
#   priors_count : 0 = no priors, 0.5 = 1 to 3 priors, 1 = anything else (> 3).
#   length_stay  : 0 = at most 7 days (1 week), 0.5 = more than 7 and at most
#                  90 days (3 months), 1 = more than 90 days.

# Only consider individuals who are African-American (race_cat = 0) or
# Caucasian (race_cat = 1).  The explicit .copy() detaches the filtered rows from
# the unfiltered frame, so the column assignments below operate on an independent
# DataFrame instead of triggering SettingWithCopyWarning.
keep_races = (compas["race"] == 'Caucasian') | (compas["race"] == 'African-American')
compas = compas[keep_races].copy()

# New columns are appended in this exact order; the feature order of `df`
# (and therefore of the NumPy matrix built from it) depends on it.
compas["race_cat"] = compas["race"].apply(lambda x: 1 if x == "Caucasian" else 0)
compas["gender_cat"] = compas["sex"].apply(lambda x: 1 if x == "Female" else 0)
compas["charge_cat"] = compas["c_charge_degree"].apply(lambda x: 1 if x == "F" else 0)

# Days component of (release time - booking time), then bucketed.
stay_days = (pd.to_datetime(compas["c_jail_out"]) - pd.to_datetime(compas["c_jail_in"])).dt.days
compas["length_stay"] = stay_days.apply(lambda x: 0 if x <= 7 else 0.5 if 7 < x <= 90 else 1)

# The raw columns have all been re-encoded above; drop them in one go.
compas = compas.drop(columns=["race", "sex", "c_charge_degree", "c_jail_in", "c_jail_out"])

compas["priors_count"] = compas["priors_count"].apply(lambda x: 0 if x == 0 else 0.5 if 1 <= x <= 3 else 1)
compas["age_cat"] = compas["age_cat"].apply(lambda x: 0 if x == "Less than 25" else 0.5 if x == "25 - 45" else 1)

compas = compas.dropna()
compas.to_csv("cleaned_compas.csv")

# Split into label, protected attribute and the remaining feature columns.
y = compas["two_year_recid"]
protected_attribute = compas["race_cat"]
df = compas.drop(columns=["two_year_recid", "race_cat"])


In [9]:
# Display the encoded feature table (label and race_cat columns removed).
df

Unnamed: 0,age_cat,priors_count,gender_cat,charge_cat,length_stay
1,0.5,0.0,0,1,0.5
2,0.0,1.0,0,1,0.0
6,0.5,1.0,0,1,0.0
8,0.5,0.0,1,0,0.0
9,0.0,0.5,0,1,0.0
...,...,...,...,...,...
7207,0.5,0.0,0,0,0.0
7208,0.0,0.0,0,1,0.0
7209,0.0,0.0,0,1,0.0
7210,0.0,0.0,0,1,0.0


In [10]:
from sklearn.utils import shuffle

# Convert the feature table, label and protected attribute to NumPy arrays, then
# shuffle all three with the same seeded permutation so their rows stay aligned.
# Note: `df` is rebound from a DataFrame to an ndarray here.
df, y, protected_attribute = (frame.to_numpy() for frame in (df, y, protected_attribute))
y, protected_attribute, X = shuffle(y, protected_attribute, df, random_state=0)

# A7 FFS

In [11]:
# Use the first 80% of the shuffled rows for training and the remainder for testing.
train_index = int(0.8 * len(X))
x_train, y_train, race_train = (arr[:train_index] for arr in (X, y, protected_attribute))
x_test, y_test, race_test = (arr[train_index:] for arr in (X, y, protected_attribute))

In [12]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import log_loss
from scipy import optimize
import copy
import itertools
import math
from sklearn.svm import SVC
from sklearn.utils import shuffle


In [15]:
# import my library to calculate A7
import sys
import os
sys.path.insert(1, '../lib')

from A7_calculation import *

In [16]:
# Calculate the Shapley accuracy and discrimination coefficients for each
# feature over the training data, then collect them in a table sorted by accuracy.

# Display names, listed in the column order of the feature matrix
# (age_cat, priors_count, gender_cat, charge_cat, length_stay).
feature_names = ["Age", "Prior Count", "Gender", "Charge Degree", "Length of Stay"]

shap_acc = []
shap_disc = []
# Loop over the named features rather than a hard-coded range(5) so the number
# of coefficients always matches the number of labels.
for i in range(len(feature_names)):
    acc_i = get_shapley_acc_i(y_train, x_train, race_train, i)
    disc_i = get_shapley_disc_i(y_train, x_train, race_train, i)

    shap_acc.append(acc_i)
    shap_disc.append(disc_i)

# Build Shapley output
shapley_df = pd.DataFrame(list(zip(feature_names, shap_acc, shap_disc)),
                          columns=["Feature", "Accuracy", "Discrimination"])

# `drop` must be passed by keyword: since pandas 2.0 every reset_index argument
# except `level` is keyword-only, so the former reset_index(0, True) raises TypeError.
shapley_df = shapley_df.sort_values(by=["Accuracy"], ascending=[False]).reset_index(drop=True)
shapley_df.to_csv("compas-data-shapley-table.csv")

In [17]:
# Display the per-feature Shapley table, sorted by accuracy coefficient.
shapley_df

Unnamed: 0,Feature,Accuracy,Discrimination
0,Prior Count,1.264639,52111.625488
1,Age,1.235037,51820.286134
2,Length of Stay,1.09467,51358.709692
3,Charge Degree,1.083787,42019.119913
4,Gender,0.994515,41921.965331


In [18]:
# Fairness-utility score:
#   F = shapley_acc - alpha * shapley_disc
# with alpha = 0.00001 (one hundred-thousandth).
ALPHA = 0.00001
shapley_df["F"] = shapley_df["Accuracy"] - ALPHA * shapley_df["Discrimination"]

# Format Discrimination in scientific notation.  This turns the column into
# strings, so it must happen after F is computed, and this cell cannot be re-run
# without first rebuilding shapley_df.
shapley_df["Discrimination"] = shapley_df["Discrimination"].apply(lambda x: "%E" % x)

# `drop` must be passed by keyword: since pandas 2.0 every reset_index argument
# except `level` is keyword-only, so the former reset_index(0, True) raises TypeError.
shapley_df = shapley_df.sort_values(by=["F"], ascending=[False]).reset_index(drop=True)

In [19]:
# Write the scored table to disk (same file name as the earlier export, so that
# file is overwritten), then display it.
shapley_df.to_csv(path_or_buf="compas-data-shapley-table.csv")
shapley_df

Unnamed: 0,Feature,Accuracy,Discrimination,F
0,Prior Count,1.264639,52111.63,0.743523
1,Age,1.235037,51820.29,0.716834
2,Charge Degree,1.083787,42019.12,0.663596
3,Length of Stay,1.09467,51358.71,0.581083
4,Gender,0.994515,41921.97,0.575295
