In [1]:
import re
import json
import pickle
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [31]:
def clean_str(s):
    return re.sub(r'\s', '', s)

def are_equal(a , b):
    return clean_str(a) == clean_str(b)

def a_in_b(a , b):
    return clean_str(a) in clean_str(b)

def add_cols (df, columns, default_value = 0):
    for k, e in enumerate(columns):
        i = common_data([e], df.columns, are_equal)

        if i == -1:
            df[e] = default_value
    return df

def common_data(list1, list2, comparisonf): 
    result = -1
  
    for k, x in enumerate(list1): 
        for y in list2: 
            
            if comparisonf(x,y):
                result = k
                return result  
                  
    return result

def prepare_data(df, feat2dummie, cols_to_pow, columns, drop_first = True, max_pow = 3):
    
    for i in feat2dummie:
        one_hot = pd.get_dummies(df[i], prefix=i, drop_first=drop_first)
        df = df.drop(i,axis = 1)
        df = df.join(one_hot)

    is_recid, is_violent_recid = df['is_recid'], df['is_violent_recid']

    columns_to_drop = ['is_recid', 'is_violent_recid']
    df = df.drop(columns_to_drop, axis=1)
    
    dft = df.copy()

    for k,i in enumerate(cols_to_pow):
        for j in range(1, max_pow):
            dft[i+str(j+1)] = pow(df[i], j+1)
    
    dft = add_cols(dft, columns)
    
    return dft, is_recid, is_violent_recid

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
data_folder = './data'
model_folder = './model'

train_dataset = 'train_compas_processed.xlsx'
validate_dataset = 'validate_compas_processed.xlsx'
model_colums_sample = 'model_colums_sample.xlsx'

model_file_name = 'rf_recidivism_prediction.sav'

### Load data

In [6]:
train_df_init = pd.read_excel (data_folder+'/'+train_dataset)
validate_df_init = pd.read_excel (data_folder+'/'+validate_dataset)

train_df = train_df_init
validate_df = validate_df_init

In [30]:
columns = pd.read_excel (data_folder+'/'+model_colums_sample).columns

In [8]:
train_df.shape

(8918, 14)

In [9]:
validate_df.shape

(470, 14)

# Bias detection in the data

## Oleg O

# Feature selection from the data

## Oleg M

### Prepare data for prediction

In [33]:
max_pow = 3
drop_first = True

feat2dummie = ['sex','age_cat','race','c_charge_degree','c_cat','weapon_firearm']
cols_to_pow = ['age','decile_score','priors_count','juv_count']

df, is_recid, is_violent_recid = prepare_data(validate_df, feat2dummie, cols_to_pow, columns, drop_first = True, max_pow = 3)

In [None]:
df.shape

In [34]:
df.head(10)

Unnamed: 0,age,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,juv_count,sex_Male,age_cat_Greater than 45,age_cat_Less than 25,race_Asian,race_Caucasian,race_Hispanic,race_Other,c_charge_degree_M,c_cat_battery,c_cat_burglary,c_cat_cannabis,c_cat_driving,c_cat_grand theft,c_cat_mischief,c_cat_no charge,c_cat_other,c_cat_poss,c_cat_tampering,weapon_firearm_True,age2,age3,decile_score2,decile_score3,priors_count2,priors_count3,juv_count2,juv_count3,sex_Female,age_cat_25 - 45,race_African-American,race_Native American,c_charge_degree_F,c_cat_assault,c_cat_lewdness,c_cat_resisting,c_cat_sexual,weapon_firearm_False
0,51,1,3,-1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2601,132651,1,1,9,27,0,0,0,0,0,0,0,0,0,0,0,0
1,24,2,0,-1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,576,13824,4,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,55,1,0,-1,1,0,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,3025,166375,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,35,4,7,-1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1225,42875,16,64,49,343,0,0,0,0,0,0,0,0,0,0,0,0
4,51,2,3,-1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,2601,132651,4,8,9,27,0,0,0,0,0,0,0,0,0,0,0,0
5,52,8,5,-1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2704,140608,64,512,25,125,0,0,0,0,0,0,0,0,0,0,0,0
6,29,2,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,841,24389,4,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,57,1,0,-1,2,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3249,185193,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,28,2,1,-1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,784,21952,4,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0
9,28,9,1,-1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,784,21952,81,729,1,1,0,0,0,0,0,0,0,0,0,0,0,0


### Load model

In [37]:
loaded_model = pickle.load(open(model_folder+'/'+model_file_name, 'rb'))

In [39]:
pred_result = loaded_model.predict(df)

In [40]:
result = roc_auc_score(is_recid, pred_result)

print(result)

0.5057466708941027


# Result interpretability

## Valerii & Andrew

In [None]:
from treeinterpreter import treeinterpreter as ti

for i,row in test.iterrows():

    data_point = pd.DataFrame([row])
    data_point.set_axis(['value_variable'], inplace=True) # Once transposed, it will be the column name
    prediction, bias, contributions = ti.predict(model, data_point)
    
    local_interpretation = data_point.append(
        pd.DataFrame([[round(c[1],3) for c in contributions[0]]], columns=data_point.columns.tolist(), index=['contribution_variable'])
    ).T.sort_values('contribution_variable', ascending=False)
# print(local_interpretation)

In [None]:
print(local_interpretation)

In [None]:
from sklearn import tree

In [None]:
dot_data = tree.export_graphviz(model,
                                out_file=None,
                                filled=True,
                                rounded=True)

In [None]:

graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('tree.png')

In [None]:
data_point

In [None]:
estimator = model.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = df.feature_names,
                class_names = df.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')