In [1]:
import pandas as pd
from datetime import datetime
import csv
import math

In [2]:
# Read the COMPAS two-year recidivism scores and report how many rows were loaded
raw_data = pd.read_csv("../data/compas-scores-two-years.csv")
print(len(raw_data))

7214


In [3]:
# Show one full record (index label 3) to see which fields are available
raw_data.loc[3, :]

id                                              5
name                                  marcu brown
first                                       marcu
last                                        brown
compas_screening_date                  2013-01-13
sex                                          Male
dob                                    1993-01-21
age                                            23
age_cat                              Less than 25
race                             African-American
juv_fel_count                                   0
decile_score                                    8
juv_misd_count                                  1
juv_other_count                                 0
priors_count                                    1
days_b_screening_arrest                       NaN
c_jail_in                                     NaN
c_jail_out                                    NaN
c_case_number                       13000570CF10A
c_offense_date                         2013-01-12


In [4]:
# Restrict the analysis to Caucasian and African-American defendants and keep
# only the columns used downstream.
# The explicit .copy() makes `df` an independent frame: later cells assign new
# columns into it, and doing that on a selection of `raw_data` can trigger
# pandas' SettingWithCopyWarning.
df = raw_data.loc[
    raw_data['race'].isin(['Caucasian', 'African-American']),
    ['sex', 'age_cat', 'race', 'priors_count', 'c_charge_degree', 'c_jail_in', 'c_jail_out', 'two_year_recid'],
].copy()

In [5]:
# Parse the jail booking/release timestamps, derive the stay length in whole
# days, and discard rows for which it cannot be computed
jail_in = pd.to_datetime(df['c_jail_in'])
jail_out = pd.to_datetime(df['c_jail_out'])
df['c_jail_in'] = jail_in
df['c_jail_out'] = jail_out
df['length_of_stay'] = (jail_out - jail_in).dt.days
df = df.dropna(axis=0, subset=['length_of_stay'])
# (The variables are turned into categorical codes in a later cell.)

In [6]:
# Preview the first five rows after adding length_of_stay
df.head(5)

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,c_jail_in,c_jail_out,two_year_recid,length_of_stay
1,Male,25 - 45,African-American,0,F,2013-01-26 03:45:27,2013-02-05 05:36:53,1,10.0
2,Male,Less than 25,African-American,4,F,2013-04-13 04:58:34,2013-04-14 07:02:04,1,1.0
6,Male,25 - 45,Caucasian,14,F,2014-02-18 05:08:24,2014-02-24 12:18:30,1,6.0
8,Female,25 - 45,Caucasian,0,M,2014-03-15 05:35:34,2014-03-18 04:28:46,0,2.0
9,Male,Less than 25,Caucasian,1,F,2015-01-06 03:55:34,2015-01-07 03:38:44,1,0.0


In [7]:
# List the distinct charge-degree codes present before encoding them
print(df.loc[:, "c_charge_degree"].unique())

['F' 'M']


In [8]:
def prior_count(x):
    """Bucket a prior-offence count into three ordinal codes.

    Returns 0 when ``x == 0``, 1 when ``x <= 3`` (and not zero), and 2 for
    everything else. A value for which both comparisons are false (e.g. NaN)
    therefore also gets 2.
    """
    if x == 0:
        bucket = 0
    elif x <= 3:
        bucket = 1
    else:
        bucket = 2
    return bucket
def length_of_stay(x):
    """Bucket a jail stay (in days) into three ordinal codes.

    Returns 0 when ``x <= 7``, 1 when ``x <= 90``, and 2 otherwise. A value
    for which both comparisons are false (e.g. NaN) also gets 2.
    """
    return 0 if x <= 7 else (1 if x <= 90 else 2)
def age_cat(x):
    """Encode an age-category label as an integer code.

    ``"Less than 25"`` -> 0, ``"25 - 45"`` -> 1; any other value -> 2.
    """
    if x == "Less than 25":
        code = 0
    elif x == "25 - 45":
        code = 1
    else:
        # Catch-all: every label other than the two above.
        code = 2
    return code
def race(x):
    """Encode race as a binary code: ``"African-American"`` -> 0, anything else -> 1."""
    return 0 if x == "African-American" else 1
def sex(x):
    """Encode sex as a binary code: ``"Male"`` -> 0, anything else -> 1."""
    return 0 if x == "Male" else 1
def c_charge_degree(x):
    """Encode the charge degree as a binary code: ``"M"`` -> 0, anything else -> 1."""
    if x == "M":
        code = 0
    else:
        code = 1
    return code

In [9]:
# Encode every feature as a small integer code using the helper functions
# defined in the previous cell, and mark the target as categorical.
#
# Guard against running this cell twice: the helpers send any unrecognised
# value to their catch-all code, so re-encoding already-encoded integers would
# silently corrupt the columns (e.g. every `race` value would become 1).
already_encoded = [
    col for col in ('sex', 'age_cat', 'race', 'c_charge_degree')
    if pd.api.types.is_numeric_dtype(df[col])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; "
        "re-run the notebook from the data-loading cell before encoding again."
    )

# Pass the helpers to .map directly (no lambda wrapper needed) and build the
# encoded frame in one step.
df = df.assign(
    two_year_recid=df['two_year_recid'].astype('category'),
    age_cat=df['age_cat'].map(age_cat),
    c_charge_degree=df['c_charge_degree'].map(c_charge_degree),
    sex=df['sex'].map(sex),
    priors_count=df['priors_count'].map(prior_count),
    length_of_stay=df['length_of_stay'].map(length_of_stay),
    race=df['race'].map(race),
)

In [10]:
# Drop the raw jail timestamps now that length_of_stay has been derived from them.
# Rebinding `df` (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['c_jail_in', 'c_jail_out'], errors='ignore')

In [11]:
# Preview the first five rows of the fully encoded frame
df.head(n=5)

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,two_year_recid,length_of_stay
1,0,1,0,0,1,1,1
2,0,0,0,2,1,1,0
6,0,1,1,2,1,1,0
8,1,1,1,0,0,0,0
9,0,0,1,1,1,1,0
