In [1]:
import warnings
warnings.filterwarnings('ignore')

#imports
import os, sys
import pandas as pd
import numpy as np
sys.path.insert(1, '../lib/')
import util as ut

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Data Clean, Split

In [2]:
#load data
raw_data = '../data/compas-scores-two-years.csv'
df = pd.read_csv(raw_data)

In [3]:
#clean data

data = df[(df['race']=='African-American')|(df['race']=='Caucasian')]
data["length_of_stay"] = (pd.to_datetime(data.c_jail_out) - pd.to_datetime(data.c_jail_in)).astype('timedelta64[D]')
data = data[(data.days_b_screening_arrest <= 30) & (data.days_b_screening_arrest >= -30)]
data = data[data.is_recid != -1]
data = data[data.c_charge_degree != "O"]
data = data[data.score_text != "N/A"]
data.drop(['first','last','c_case_number','c_charge_desc','violent_recid','vr_charge_degree','vr_case_number','vr_offense_date','vr_charge_desc',
          'c_arrest_date','r_jail_out','r_jail_in','r_days_from_arrest','r_charge_desc',
          'r_offense_date','r_case_number','r_charge_degree'], axis=1,inplace=True)
data['c_days_from_compas'] = data['c_days_from_compas'] .fillna(data['c_days_from_compas'].mode()[0])
data = data.dropna()
X = data.drop(['id','two_year_recid','dob','name','v_type_of_assessment','type_of_assessment'], axis=1)
X.set_index(data.id,inplace=True)
y = data[['two_year_recid']]
y.set_index(data.id,inplace=True)
X = X.drop(['c_offense_date','c_jail_in','c_jail_out','out_custody','in_custody','screening_date','compas_screening_date','v_screening_date'],axis=1)
#convert categorical variables into numeric
X['race']= X['race'].replace(['African-American','Caucasian'],[0,1.0])
X['sex'] = X['sex'].replace(['Male','Female'],[0,1.0])
X['age_cat'] = X['age_cat'].replace(['25 - 45','Less than 25', 'Greater than 45'],[0,1.0,2.0])
X['c_charge_degree'] = X['c_charge_degree'].replace(['M','F'],[0,1.0])
X['score_text'] = X['score_text'].replace(['Low', 'High', 'Medium'],[0,1.0,2.0])
X['v_score_text'] = X['v_score_text'].replace(['Low', 'High', 'Medium'],[0,1.0,2.0])
X = X.fillna(0)

#compas = compas[compas.race.isin(['Caucasian','African-American'])]
#compas = compas[(compas.days_b_screening_arrest <= 30) & (compas.days_b_screening_arrest >= -30)]
#compas = compas[compas.is_recid != -1]
#compas = compas[compas.c_charge_degree != "O"]
#compas = compas[compas.score_text != "N/A"]

#cleanup_nums = {"sex":     {"Male": 0, "Female": 1},
#                "age_cat": {"25 - 45": 0, "Less than 25": 1, "Greater than 45": 2},
#                "race": {"African-American": 1, "Caucasian": 0},
#                "c_charge_degree": {"F": 0, "M": 1},
#                "score_text": {"Low": 0, "Medium": 1, "High": 2}}
#compas = compas.replace(cleanup_nums)

#compas["length_of_stay"] = (pd.to_datetime(compas.c_jail_out) - pd.to_datetime(compas.c_jail_in)).astype('timedelta64[D]')

#X = ["age", "c_charge_degree", "age_cat", "sex", "priors_count", "length_of_stay"]
#S = ["race"]
#features = ["age", "c_charge_degree", "age_cat", "sex", "priors_count", "length_of_stay", "race"]
#Y = ["two_year_recid"]

#compas.to_csv('../output/compas-scores-two-years_cleaned.csv')

In [4]:
#prepare training

y=np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1,random_state=44)
x_sensitive_tr = X_train.loc[:, 'race']
x_sensitive_te = X_test.loc[:, 'race']
X_train = X_train.loc[:, X_train.columns != 'race']
X_test = X_test.loc[:, X_test.columns != 'race']

X_train=X_train.reset_index()
X_test=X_test.reset_index()
x_sensitive_tr=x_sensitive_tr.reset_index()
x_sensitive_te=x_sensitive_te.reset_index()

X_train=X_train.drop(['id'], axis=1)
X_test=X_test.drop(['id'], axis=1)
x_sensitive_tr=x_sensitive_tr.drop(['id'], axis=1)
x_sensitive_te=x_sensitive_te.drop(['id'], axis=1)

#X_train = x_train[X]
#S_train = x_train[S]

#X_test = x_test[X]
#S_test = x_test[S]

## Baseline Model

In [5]:
model_base = LogisticRegression(fit_intercept=False).fit(X_train,y_train.flatten())
base_y_train = model_base.predict(X_train)
base_y_test = model_base.predict(X_test)

theta_star = model_base.coef_.flatten()

#metrics
print('train accuracy: ' + str(model_base.score(X_train,y_train.flatten())))
print('test accuracy: ' + str(model_base.score(X_test,y_test.flatten())))

base_calib_train = ut.calibration(base_y_train, y_train.flatten(),x_sensitive_tr['race'].values)
base_calib_test = ut.calibration(base_y_test, y_test.flatten(),x_sensitive_tr['race'].values)

print('train calibration: ' + str(base_calib_train))
print('test calibration: ' + str(base_calib_test))

train accuracy: 0.9639225181598063
test accuracy: 0.9564270152505446
train calibration: 0.0013216642186006933
test calibration: -0.010360962566844933


## Gamma

In [6]:
import imp
imp.reload(ut)

for g in [0,0.001,0.01,0.05,0.1,0.15,0.2]:
    print('Using gamma = ',g)

    theta_g = ut.model_gamma(X_train,y_train.flatten(),x_sensitive_tr['race'].values,theta_star,g)
   
    #to use score
    model_gamma = LogisticRegression(fit_intercept=False)
    model_gamma.coef_ = theta_g.reshape((1,-1))
    model_gamma.intercept_ = 0
    model_gamma.classes_ = np.array([0,1.0])
    g_y_train = model_gamma.predict(X_train)
    g_y_test = model_gamma.predict(X_test)

    #metrics
    print('train accuracy: ' + str(model_gamma.score(X_train,y_train.flatten())))
    print('test accuracy: ' + str(model_gamma.score(X_test,y_test.flatten())))

    g_calib_train = ut.calibration(g_y_train, y_train.flatten(),x_sensitive_tr['race'].values)
    g_calib_test = ut.calibration(g_y_test, y_test.flatten(),x_sensitive_tr['race'].values)

    print('train calibration difference: ' + str(g_calib_train))
    print('test calibration difference: ' + str(g_calib_test))

Using gamma =  0
train accuracy: 0.47893462469733655
test accuracy: 0.5032679738562091
train calibration difference: 0.13196630217683586
test calibration difference: 0.05514705882352944
Using gamma =  0.001
train accuracy: 0.47893462469733655
test accuracy: 0.5032679738562091
train calibration difference: 0.13196630217683586
test calibration difference: 0.05514705882352944
Using gamma =  0.01
train accuracy: 0.47893462469733655
test accuracy: 0.5032679738562091
train calibration difference: 0.13196630217683586
test calibration difference: 0.05514705882352944
Using gamma =  0.05
train accuracy: 0.47893462469733655
test accuracy: 0.5032679738562091
train calibration difference: 0.13196630217683586
test calibration difference: 0.05514705882352944
Using gamma =  0.1
train accuracy: 0.47893462469733655
test accuracy: 0.5032679738562091
train calibration difference: 0.13196630217683586
test calibration difference: 0.05514705882352944
Using gamma =  0.15
train accuracy: 0.47893462469733655
te

## Fine-grained Gamma

In [7]:
import imp
imp.reload(ut)

for g in [0.1]:
    
    print('Using gamma = ',g)
    
    
    theta_fg = ut.model_fg(X_train.values,y_train.flatten(),x_sensitive_tr['race'].values,theta_star,g)

    #to use score
    model_fine_gamma = LogisticRegression(fit_intercept=False)
    model_fine_gamma.coef_ = theta_fg.reshape((1,-1))
    model_fine_gamma.intercept_ = 0
    model_fine_gamma.classes_ = np.array([0,1.0])
    fg_y_train = model_fine_gamma.predict(X_train)
    fg_y_test = model_fine_gamma.predict(X_test)

    #metrics
    print('train accuracy: ' + str(model_fine_gamma.score(X_train,y_train.flatten())))
    print('test accuracy: ' + str(model_fine_gamma.score(X_test,y_test.flatten())))

    fg_calib_train = ut.calibration(fg_y_train, y_train.flatten(),x_sensitive_tr['race'].values)
    fg_calib_test = ut.calibration(fg_y_test, y_test.flatten(),x_sensitive_tr['race'].values)

    print('train calibration difference: ' + str(fg_calib_train))
    print('test calibration difference: ' + str(fg_calib_test))

Using gamma =  0.1
train accuracy: 0.47046004842615013
test accuracy: 0.49019607843137253
train calibration difference: 0.13672643056429867
test calibration difference: 0.05113636363636359
