In [6]:
import re
import json
import pickle
import numpy as np
import pandas as pd

In [7]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [8]:
def clean_str(s):
    return re.sub(r'\s', '', s)

def are_equal(a , b):
    return clean_str(a) == clean_str(b)

def a_in_b(a , b):
    return clean_str(a) in clean_str(b)

def add_cols (df, columns, default_value = 0):
    for k, e in enumerate(columns):
        i = common_data([e], df.columns, are_equal)

        if i == -1:
            df[e] = default_value
    return df

def common_data(list1, list2, comparisonf): 
    result = -1
  
    for k, x in enumerate(list1): 
        for y in list2: 
            
            if comparisonf(x,y):
                result = k
                return result  
                  
    return result

def prepare_data(df, feat2dummie, cols_to_pow, columns, drop_first = True, max_pow = 3):
    
    for i in feat2dummie:
        one_hot = pd.get_dummies(df[i], prefix=i, drop_first=drop_first)
        df = df.drop(i,axis = 1)
        df = df.join(one_hot)

    is_recid, is_violent_recid = df['is_recid'], df['is_violent_recid']

    columns_to_drop = ['is_recid', 'is_violent_recid']
    df = df.drop(columns_to_drop, axis=1)
    
    dft = df.copy()

    for k,i in enumerate(cols_to_pow):
        for j in range(1, max_pow):
            dft[i+str(j+1)] = pow(df[i], j+1)
    
    dft = add_cols(dft, columns)
    
    return dft, is_recid, is_violent_recid

In [9]:
pd.set_option('display.max_columns', None)

In [10]:
data_folder = './data'
model_folder = './model'

train_dataset = 'train_compas_processed.xlsx'
validate_dataset = 'validate_compas_processed.xlsx'
model_colums_sample = 'model_colums_sample.xlsx'

model_file_name = 'rf_recidivism_prediction.sav'

### Load data

In [11]:
train_df_init = pd.read_excel (data_folder+'/'+train_dataset)
validate_df_init = pd.read_excel (data_folder+'/'+validate_dataset)

train_df = train_df_init
validate_df = validate_df_init

In [12]:
columns = pd.read_excel (data_folder+'/'+model_colums_sample).columns

In [13]:
train_df.shape

(8918, 14)

In [14]:
validate_df.shape

(470, 14)

#### Prepare data for prediction

In [45]:
max_pow = 3
drop_first = True

feat2dummie = ['sex','age_cat','race','c_charge_degree','c_cat','weapon_firearm']
cols_to_pow = ['age','decile_score','priors_count','juv_count']

validate_df_processed, is_recid, is_violent_recid = prepare_data(validate_df, feat2dummie, cols_to_pow, columns, drop_first = True, max_pow = 3)

In [16]:
#df.shape

In [17]:
#df.head(10)

### Load model

In [18]:
rfc = pickle.load(open(model_folder+'/'+model_file_name, 'rb'))

In [38]:
train_df_processed, is_recid2, is_violent_recid2 = prepare_data(train_df, feat2dummie, cols_to_pow, columns, drop_first = True, max_pow = 3)

train_df_processed['prediction'] = rfc.predict(train_df_processed)

In [39]:
train_df_processed.head(10)

Unnamed: 0,age,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,juv_count,sex_Male,age_cat_Greater than 45,age_cat_Less than 25,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_M,c_cat_battery,c_cat_burglary,c_cat_cannabis,c_cat_driving,c_cat_grand theft,c_cat_lewdness,c_cat_mischief,c_cat_no charge,c_cat_other,c_cat_poss,c_cat_resisting,c_cat_sexual,c_cat_tampering,weapon_firearm_True,age2,age3,decile_score2,decile_score3,priors_count2,priors_count3,juv_count2,juv_count3,sex_Female,age_cat_25 - 45,race_African-American,c_charge_degree_F,c_cat_assault,weapon_firearm_False,prediction
0,22,9,2,-1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,484,10648,81,729,4,8,1,1,0,0,0,0,0,0,0
1,22,4,0,-1,1,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,484,10648,16,64,0,0,0,0,0,0,0,0,0,0,0
2,46,9,2,-1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2116,97336,81,729,4,8,0,0,0,0,0,0,0,0,0
3,66,1,1,-1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4356,287496,1,1,1,1,0,0,0,0,0,0,0,0,0
4,51,2,2,-23,23,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2601,132651,4,8,4,8,0,0,0,0,0,0,0,0,0
5,28,5,10,-20,20,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,784,21952,25,125,100,1000,0,0,0,0,0,0,0,0,0
6,29,7,5,-1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,841,24389,49,343,25,125,0,0,0,0,0,0,0,0,0
7,27,10,7,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,729,19683,100,1000,49,343,0,0,0,0,0,0,0,0,0
8,48,1,1,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2304,110592,1,1,1,1,0,0,0,0,0,0,0,0,0
9,45,1,2,-1,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2025,91125,1,1,4,8,0,0,0,0,0,0,0,0,0


In [40]:
train_df['prediction'] = train_df_processed['prediction']

## 1. Bias detection in the data

### Oleg O

You should use train_df for audit 

In [41]:
train_df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,is_recid,is_violent_recid,juv_count,c_cat,weapon_firearm,prediction
0,Female,22,Less than 25,African-American,9,2,-1,1,F,0,0,1,battery,True,0
1,Male,22,Less than 25,Caucasian,4,0,-1,1,M,0,0,0,battery,False,0
2,Male,46,Greater than 45,African-American,9,2,-1,1,F,0,0,0,grand theft,False,0
3,Male,66,Greater than 45,Caucasian,1,1,-1,1,M,0,0,0,no charge,False,0
4,Male,51,Greater than 45,Caucasian,2,2,-23,23,F,0,0,0,other,False,0


In [42]:
train_df_processed.head()

Unnamed: 0,age,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,juv_count,sex_Male,age_cat_Greater than 45,age_cat_Less than 25,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_M,c_cat_battery,c_cat_burglary,c_cat_cannabis,c_cat_driving,c_cat_grand theft,c_cat_lewdness,c_cat_mischief,c_cat_no charge,c_cat_other,c_cat_poss,c_cat_resisting,c_cat_sexual,c_cat_tampering,weapon_firearm_True,age2,age3,decile_score2,decile_score3,priors_count2,priors_count3,juv_count2,juv_count3,sex_Female,age_cat_25 - 45,race_African-American,c_charge_degree_F,c_cat_assault,weapon_firearm_False,prediction
0,22,9,2,-1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,484,10648,81,729,4,8,1,1,0,0,0,0,0,0,0
1,22,4,0,-1,1,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,484,10648,16,64,0,0,0,0,0,0,0,0,0,0,0
2,46,9,2,-1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2116,97336,81,729,4,8,0,0,0,0,0,0,0,0,0
3,66,1,1,-1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4356,287496,1,1,1,1,0,0,0,0,0,0,0,0,0
4,51,2,2,-23,23,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2601,132651,4,8,4,8,0,0,0,0,0,0,0,0,0


## 2. Feature selection from the data

### Oleg M

In [21]:
train_df.head()

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,is_recid,is_violent_recid,juv_count,c_cat,weapon_firearm
0,Female,22,Less than 25,African-American,9,2,-1,1,F,0,0,1,battery,True
1,Male,22,Less than 25,Caucasian,4,0,-1,1,M,0,0,0,battery,False
2,Male,46,Greater than 45,African-American,9,2,-1,1,F,0,0,0,grand theft,False
3,Male,66,Greater than 45,Caucasian,1,1,-1,1,M,0,0,0,no charge,False
4,Male,51,Greater than 45,Caucasian,2,2,-23,23,F,0,0,0,other,False
5,Male,28,25 - 45,African-American,5,10,-20,20,F,1,0,0,poss,False
6,Male,29,25 - 45,African-American,7,5,-1,1,F,1,0,0,cannabis,False
7,Male,27,25 - 45,African-American,10,7,0,1,F,0,0,0,battery,True
8,Male,48,Greater than 45,Caucasian,1,1,0,1,M,1,0,0,driving,False
9,Male,45,Greater than 45,Hispanic,1,2,-1,1,F,0,0,0,battery,False


In [43]:
train_df_processed.head()

Unnamed: 0,age,decile_score,priors_count,days_b_screening_arrest,c_days_from_compas,juv_count,sex_Male,age_cat_Greater than 45,age_cat_Less than 25,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,c_charge_degree_M,c_cat_battery,c_cat_burglary,c_cat_cannabis,c_cat_driving,c_cat_grand theft,c_cat_lewdness,c_cat_mischief,c_cat_no charge,c_cat_other,c_cat_poss,c_cat_resisting,c_cat_sexual,c_cat_tampering,weapon_firearm_True,age2,age3,decile_score2,decile_score3,priors_count2,priors_count3,juv_count2,juv_count3,sex_Female,age_cat_25 - 45,race_African-American,c_charge_degree_F,c_cat_assault,weapon_firearm_False,prediction
0,22,9,2,-1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,484,10648,81,729,4,8,1,1,0,0,0,0,0,0,0
1,22,4,0,-1,1,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,484,10648,16,64,0,0,0,0,0,0,0,0,0,0,0
2,46,9,2,-1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2116,97336,81,729,4,8,0,0,0,0,0,0,0,0,0
3,66,1,1,-1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4356,287496,1,1,1,1,0,0,0,0,0,0,0,0,0
4,51,2,2,-23,23,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2601,132651,4,8,4,8,0,0,0,0,0,0,0,0,0


In [44]:
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

## 3. Result interpretability

### Valerii & Andrew

In [25]:
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [46]:
pred_result = rfc.predict(validate_df_processed)

In [47]:
result = roc_auc_score(is_recid, pred_result)

print(result)

0.5
