In [None]:
### 6-7 preprocessing 
### 7-7:30 split

In [23]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("./compas-scores-two-years.csv")
#print(data.columns())
print(data.columns.values)

['id' 'name' 'first' 'last' 'compas_screening_date' 'sex' 'dob' 'age'
 'age_cat' 'race' 'juv_fel_count' 'decile_score' 'juv_misd_count'
 'juv_other_count' 'priors_count' 'days_b_screening_arrest' 'c_jail_in'
 'c_jail_out' 'c_case_number' 'c_offense_date' 'c_arrest_date'
 'c_days_from_compas' 'c_charge_degree' 'c_charge_desc' 'is_recid'
 'r_case_number' 'r_charge_degree' 'r_days_from_arrest' 'r_offense_date'
 'r_charge_desc' 'r_jail_in' 'r_jail_out' 'violent_recid'
 'is_violent_recid' 'vr_case_number' 'vr_charge_degree' 'vr_offense_date'
 'vr_charge_desc' 'type_of_assessment' 'decile_score.1' 'score_text'
 'screening_date' 'v_type_of_assessment' 'v_decile_score' 'v_score_text'
 'v_screening_date' 'in_custody' 'out_custody' 'priors_count.1' 'start'
 'end' 'event' 'two_year_recid']


In [3]:
idx = np.where((data['days_b_screening_arrest']<=30) & (data['days_b_screening_arrest']>=-30)
 & (data['is_recid']!=-1) & (data['c_charge_degree']!="O") & (data['score_text']!="N/A") &
 ((data['race']=="African-American") |(data['race']=="Caucasian")))

However not all of the rows are useable for the first round of analysis.
<br>
<br>
There are a number of reasons remove rows because of missing data:
<br>
<br>
If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
<br>
<br>
We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
<br>
<br>
In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed (only two of them).
<br>
<br>
We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

In [4]:
df=data.iloc[idx]
df=df.filter(items=['raw_data', 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 'priors_count', 
                    'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out'])
df

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,c_jail_in,c_jail_out
1,34,F,African-American,25 - 45,Low,Male,0,-1.0,3,1,1,2013-01-26 03:45:27,2013-02-05 05:36:53
2,24,F,African-American,Less than 25,Low,Male,4,-1.0,4,1,1,2013-04-13 04:58:34,2013-04-14 07:02:04
6,41,F,Caucasian,25 - 45,Medium,Male,14,-1.0,6,1,1,2014-02-18 05:08:24,2014-02-24 12:18:30
8,39,M,Caucasian,25 - 45,Low,Female,0,-1.0,1,0,0,2014-03-15 05:35:34,2014-03-18 04:28:46
10,27,F,Caucasian,25 - 45,Low,Male,0,-1.0,4,0,0,2013-11-25 06:31:06,2013-11-26 08:26:57
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,30,M,African-American,25 - 45,Low,Male,0,-1.0,2,1,1,2014-05-09 10:01:33,2014-05-10 08:28:12
7208,20,F,African-American,Less than 25,High,Male,0,-1.0,9,0,0,2013-10-19 11:17:15,2013-10-20 08:13:06
7209,23,F,African-American,Less than 25,Medium,Male,0,-1.0,7,0,0,2013-11-22 05:18:27,2013-11-24 02:59:20
7210,23,F,African-American,Less than 25,Low,Male,0,-1.0,3,0,0,2014-01-31 07:13:54,2014-02-02 04:03:52


In [28]:
# change class label 0 to -1 as in the paper the predictions are either 1 or. -1
y = df['two_year_recid']
y[y==0] = -1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
features = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"] #features to be used for classification
cont = ["priors_count"] # continuous features, will need to be handled separately from categorical features, categorical features will be encoded using one-hot
class_feature = "two_year_recid" # the decision variable
sensitive_feature = ["race"]

In [20]:
#"age_cat", "race", "sex", "priors_count", "c_charge_degree"]
df1=df[["sex","age_cat",'decile_score','priors_count', 'c_jail_in', 'c_jail_out','c_charge_degree','score_text',"race"]]


df1['length_of_stay']=df1['c_jail_out'].apply(pd.to_datetime) - df1['c_jail_in'].apply(pd.to_datetime)
df1['length_of_stay']=df1['length_of_stay'].dt.days
df1['length_of_stay'] = df1.length_of_stay.apply(lambda x:'greater than 100 days'  if x >100 else ('10-100 days' if x >10 else 'less than 10 days'))

# Label Encoding 
df1.loc[df1.race=='Caucasian','race']=1
df1.loc[df1.race=='African-American','race']=0

categorical_variables = ["c_charge_degree","race","sex","age_cat","score_text",'length_of_stay']
for var in categorical_variables:
    df1[var] = df1[var].astype('category').cat.codes
    
#df1=df1[["sex","age_cat",'decile_score','priors_count','days_b_screening_arrest','c_charge_degree','is_recid','score_text','length_of_stay',"race", 'two_year_recid']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

In [21]:
df1

Unnamed: 0,sex,age_cat,decile_score,priors_count,c_jail_in,c_jail_out,c_charge_degree,score_text,race,length_of_stay
1,1,0,3,0,2013-01-26 03:45:27,2013-02-05 05:36:53,0,1,0,2
2,1,2,4,4,2013-04-13 04:58:34,2013-04-14 07:02:04,0,1,0,2
6,1,0,6,14,2014-02-18 05:08:24,2014-02-24 12:18:30,0,2,1,2
8,0,0,1,0,2014-03-15 05:35:34,2014-03-18 04:28:46,1,1,1,2
10,1,0,4,0,2013-11-25 06:31:06,2013-11-26 08:26:57,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...
7207,1,0,2,0,2014-05-09 10:01:33,2014-05-10 08:28:12,1,1,0,2
7208,1,2,9,0,2013-10-19 11:17:15,2013-10-20 08:13:06,0,0,0,2
7209,1,2,7,0,2013-11-22 05:18:27,2013-11-24 02:59:20,0,2,0,2
7210,1,2,3,0,2014-01-31 07:13:54,2014-02-02 04:03:52,0,1,0,2


## Split train,test,valid

In [30]:
X_train, X_rem, y_train, y_rem = train_test_split(df1,y, train_size=5/7.0, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5,random_state=1)