In [1]:
#imports
import os, sys
import pandas as pd
import numpy as np
sys.path.insert(1, '../lib/')
import util as ut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Data Clean, Split

In [2]:
#load data
compas = pd.read_csv('../data/compas-scores-two-years.csv')
print(compas.shape)
print(compas.columns)

(7214, 53)
Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


In [3]:
#clean data
compas = compas[compas.race.isin(['Caucasian','African-American'])]
compas = compas[(compas.days_b_screening_arrest <= 30) & (compas.days_b_screening_arrest >= -30)]
compas = compas[compas.is_recid != -1]
compas = compas[compas.c_charge_degree != "O"]
compas = compas[compas.score_text != "N/A"]

cleanup_nums = {"sex":     {"Male": 0, "Female": 1},
                "age_cat": {"25 - 45": 0, "Less than 25": 1, "Greater than 45": 2},
                "race": {"African-American": 1, "Caucasian": 0},
                "c_charge_degree": {"F": 0, "M": 1},
                "score_text": {"Low": 0, "Medium": 1, "High": 2}}
compas = compas.replace(cleanup_nums)

compas["length_of_stay"] = (pd.to_datetime(compas.c_jail_out) - pd.to_datetime(compas.c_jail_in)).astype('timedelta64[D]')

X = ["age", "c_charge_degree", "age_cat", "sex", "priors_count", "length_of_stay"]
S = ["race"]
features = ["age", "c_charge_degree", "age_cat", "sex", "priors_count", "length_of_stay", "race"]
Y = ["two_year_recid"]

compas.to_csv('../output/compas-scores-two-years_cleaned.csv')

In [4]:
#prepare training
x_train, x_test, y_train, y_test = train_test_split(compas[features], compas[Y], test_size=0.1, random_state=42)

x_train=x_train.reset_index()
x_test=x_test.reset_index()
y_train=y_train.reset_index()
y_test=y_test.reset_index()


X_train = x_train[X]
S_train = x_train[S]

X_test = x_test[X]
S_test = x_test[S]

## Baseline Model

In [5]:
model_base = LogisticRegression(fit_intercept=False).fit(x_train,y_train[Y].values.flatten())
base_y_train = model_base.predict(x_train)
base_y_test = model_base.predict(x_test)

#metrics
print('train accuracy: ' + str(model_base.score(x_train,y_train[Y].values.flatten())))
print('test accuracy: ' + str(model_base.score(x_test,y_test[Y].values.flatten())))

base_calib_train = ut.calibration(x_train, base_y_train, y_train[Y].values.flatten())
base_calib_test = ut.calibration(x_test, base_y_test, y_test[Y].values.flatten())

print('train calibration: ' + str(base_calib_train))
print('test calibration: ' + str(base_calib_test))

train accuracy: 0.6738947368421052
test accuracy: 0.6609848484848485
train calibration: 0.007050538563553199
test calibration: 0.011778846153846168


In [6]:
theta_star = model_base.coef_

## Gamma

In [7]:
theta_g = ut.model_gamma(x_train,y_train[Y].values.flatten(),S_train[S].values.flatten(),base_y_train)

#to use score
model_gamma = LogisticRegression()
model_gamma.coef_ = theta_g.reshape((1,-1))
model_gamma.intercept_ = 0
model_gamma.classes_ = np.array([0,1])
g_y_train = model_gamma.predict(x_train)
g_y_test = model_gamma.predict(x_test)

#metrics
print('train accuracy: ' + str(model_gamma.score(x_train,y_train[Y].values.flatten())))
print('test accuracy: ' + str(model_gamma.score(x_test,y_test[Y].values.flatten())))

g_calib_train = ut.calibration(x_train, g_y_train, y_train[Y].values.flatten())
g_calib_test = ut.calibration(x_test, g_y_test, y_test[Y].values.flatten())

print('train calibration difference: ' + str(g_calib_train))
print('test calibration difference: ' + str(g_calib_test))

train accuracy: 0.4686315789473684
test accuracy: 0.48674242424242425
train calibration difference: 0.13350091724466173
test calibration difference: 0.12091346153846161


## Fine-grained Gamma

In [8]:
theta_fg = ut.model_fg(x_train.values,y_train[Y].values.flatten(),S_train[S].values.flatten(),base_y_train)

#to use score
model_fine_gamma = LogisticRegression()
model_fine_gamma.coef_ = theta_fg.reshape((1,-1))
model_fine_gamma.intercept_ = 0
model_fine_gamma.classes_ = np.array([0,1])
fg_y_train = model_fine_gamma.predict(x_train)
fg_y_test = model_fine_gamma.predict(x_test)

#metrics
print('train accuracy: ' + str(model_fine_gamma.score(x_train,y_train[Y].values.flatten())))
print('test accuracy: ' + str(model_fine_gamma.score(x_test,y_test[Y].values.flatten())))

fg_calib_train = ut.calibration(x_train, fg_y_train, y_train[Y].values.flatten())
fg_calib_test = ut.calibration(x_test, fg_y_test, y_test[Y].values.flatten())

print('train calibration difference: ' + str(fg_calib_train))
print('test calibration difference: ' + str(fg_calib_test))

train accuracy: 0.4686315789473684
test accuracy: 0.48674242424242425
train calibration difference: 0.13350091724466173
test calibration difference: 0.12091346153846161
