In [67]:
import pandas as pd
import numpy as np
# ProPublica's two-year recidivism extract of the COMPAS data, fetched straight from GitHub.
addr = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
online_df = pd.read_csv(filepath_or_buffer=addr)

In [68]:
# Preview the first five raw rows.
online_df.head(n=5)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [69]:
# Rows without a screening-to-arrest offset cannot be range-checked below, so drop them first.
df = online_df.dropna(subset=["days_b_screening_arrest"])

# --- Filtering the data ---
# These filters follow ProPublica's analysis (https://github.com/propublica/compas-analysis).
# The masks are built on the pandas columns directly. Round-tripping through
# np.array(list) would coerce NaN in string columns to the literal string 'nan'.

# If the charge date of a defendant's COMPAS-scored crime was not within 30 days of the
# arrest, assume (for data-quality reasons) that we do not have the right offense.
within_30_days = df["days_b_screening_arrest"].between(-30, 30)  # inclusive on both ends

# The recidivist flag -- is_recid -- is coded -1 when no COMPAS case could be found at all.
has_compas_case = df["is_recid"] != -1

# Ordinary traffic offenses -- c_charge_degree of 'O' -- do not result in jail time and are
# removed. The remaining degrees are F (felony) and M (misdemeanor).
not_traffic_offense = df["c_charge_degree"] != "O"

# Drop rows with no usable COMPAS score text. pd.read_csv parses 'N/A' and 'NA' as NaN by
# default, so a plain string comparison would never exclude anything; test for missing
# values as well as the literal markers.
has_score_text = df["score_text"].notna() & ~df["score_text"].isin(["N/A", "NA"])

# Only African-American and Caucasian defendants are considered in this analysis.
is_black_or_white = df["race"].isin(["African-American", "Caucasian"])

# Combined boolean mask over the rows of df (kept as a numpy array named idx, as before).
idx = (
    within_30_days
    & has_compas_case
    & not_traffic_offense
    & has_score_text
    & is_black_or_white
).to_numpy()

# Column name -> numpy array holding only the rows that satisfy every criterion.
data = {col: df[col].to_numpy()[idx] for col in df.columns}

In [70]:
# Sanity check: report which container type holds the filtered columns.
print(data.__class__)

<class 'dict'>


In [71]:
# Rebuild a DataFrame from the column -> array mapping (dict keys become columns).
filtered_df = pd.DataFrame(data)
filtered_df.head(n=5)

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
1,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
2,8,edward riddle,edward,riddle,2014-02-19,Male,1974-07-23,41,25 - 45,Caucasian,...,2,Low,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1
3,10,elizabeth thieme,elizabeth,thieme,2014-03-16,Female,1976-06-03,39,25 - 45,Caucasian,...,1,Low,2014-03-16,2014-03-15,2014-03-18,0,2,747,0,0
4,14,benjamin franc,benjamin,franc,2013-11-26,Male,1988-06-01,27,25 - 45,Caucasian,...,4,Low,2013-11-26,2013-11-25,2013-11-26,0,0,857,0,0


In [72]:
# Light preprocessing, modelled on the data-preparation script at
# https://github.com/mbilalzafar/fair-classification/blob/master/disparate_impact/adult_data_demo/prepare_adult_data.py

# The decision variable.
Target_field = "two_year_recid"

# Columns retained for the exported dataset; everything else is discarded.
keep_field_set = {
    "age_cat",
    "race",
    "sex",
    "priors_count",
    "c_charge_degree",
    "two_year_recid",
}
all_fields = set(filtered_df.columns)
unwanted_fields = [name for name in all_fields if name not in keep_field_set]
filtered_df = filtered_df.drop(unwanted_fields, axis=1)

# Turn the numeric target into string labels (1 = recidivated, i.e. the bad outcome).
filtered_df[Target_field] = (
    filtered_df[Target_field]
    .replace(1, 'Recidivated')
    .replace(0, 'Not Recidivated')
)




In [73]:
# Preview the reduced frame after column selection and target relabelling.
filtered_df.head(n=5)

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,two_year_recid
0,Male,25 - 45,African-American,0,F,Recidivated
1,Male,Less than 25,African-American,4,F,Recidivated
2,Male,25 - 45,Caucasian,14,F,Recidivated
3,Female,25 - 45,Caucasian,0,M,Not Recidivated
4,Male,25 - 45,Caucasian,0,F,Not Recidivated


In [74]:
# Distinct values present in the sex column.
filtered_df.sex.unique()

array(['Male', 'Female'], dtype=object)

In [75]:
# Distinct age buckets present in the age_cat column.
filtered_df.age_cat.unique()

array(['25 - 45', 'Less than 25', 'Greater than 45'], dtype=object)

In [76]:
# Distinct values left in the race column after filtering.
filtered_df.race.unique()

array(['African-American', 'Caucasian'], dtype=object)

In [77]:
# Distinct prior-offense counts observed.
filtered_df.priors_count.unique()

array([ 0,  4, 14,  3,  1,  7,  6,  5, 13,  8,  9, 21,  2, 15, 10, 28, 19,
       11, 23, 25, 36, 12, 33, 16, 18, 20, 17, 22, 30, 24, 27, 26, 37, 29,
       38, 31])

In [78]:
# Distinct charge degrees remaining after filtering.
filtered_df.c_charge_degree.unique()

array(['F', 'M'], dtype=object)

In [79]:
# Distinct target labels after the string relabelling.
filtered_df.two_year_recid.unique()

array(['Recidivated', 'Not Recidivated'], dtype=object)

In [80]:

# Write the processed frame to disk without the index column, then load it back
# so the saved file can be inspected.
output_csv = 'compas.csv'
filtered_df.to_csv(output_csv, index=False)

file_df = pd.read_csv(output_csv)



In [81]:
# Preview the first rows of the frame reloaded from disk.
file_df.head(n=5)

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,two_year_recid
0,Male,25 - 45,African-American,0,F,Recidivated
1,Male,Less than 25,African-American,4,F,Recidivated
2,Male,25 - 45,Caucasian,14,F,Recidivated
3,Female,25 - 45,Caucasian,0,M,Not Recidivated
4,Male,25 - 45,Caucasian,0,F,Not Recidivated


In [82]:
# Summarise the target column of the reloaded frame (row count, nulls, dtype).
file_df.loc[:, ['two_year_recid']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5278 entries, 0 to 5277
Data columns (total 1 columns):
two_year_recid    5278 non-null object
dtypes: object(1)
memory usage: 41.3+ KB
