In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dython.nominal import associations
from mlxtend.plotting import plot_decision_regions
from itertools import chain
import math
from random import seed, shuffle
from scipy.optimize import minimize 
from multiprocessing import Pool, Process, Queue
from collections import defaultdict
from copy import deepcopy
import matplotlib.pyplot as plt 
from sklearn.metrics import accuracy_score as accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.calibration import CalibratedClassifierCV

# Use seaborn's dark-grid theme for every figure drawn in this notebook.
sns.set_style('darkgrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
import time
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn warnings that may point at real problems.
warnings.filterwarnings("ignore")

# Project 4 - A3 and A6

## A3 Method

## A6 Method

In [None]:
# Load the COMPAS two-year recidivism table and keep only the two race groups compared below.
raw_data = '../data/compas-scores-two-years.csv'
df = pd.read_csv(raw_data)
# .copy() makes `data` an independent frame, so the assignments below never write into a slice of `df`.
data = df[(df['race']=='African-American')|(df['race']=='Caucasian')].copy()

# Drop the columns that are not used in this analysis.
data = data.drop(columns=['violent_recid','vr_charge_degree','vr_case_number','vr_offense_date','vr_charge_desc',
                          'c_arrest_date','r_jail_out','r_jail_in','r_days_from_arrest','r_charge_desc',
                          'r_offense_date','r_case_number','r_charge_degree'])

# Parse every date-like column into datetime64.
date_columns = ['c_offense_date','c_jail_in','c_jail_out','out_custody','in_custody',
                'screening_date','compas_screening_date','v_screening_date']
for col in date_columns:
    data[col] = pd.to_datetime(data[col])

# Fill missing c_days_from_compas values with the most frequent value.
data['c_days_from_compas'] = data['c_days_from_compas'].fillna(data['c_days_from_compas'].mode()[0])

# Feature matrix: everything except identifiers and the target, indexed by defendant id.
X = data.drop(['id','two_year_recid','c_case_number','sex','dob','name'], axis=1).set_index(data['id'])

# Observed two-year recidivism rate inside each race group.
af = data[data.race=='African-American']
ca = data[data.race =='Caucasian']
rate_af = round(af[af['two_year_recid']==1].shape[0]/af.shape[0],2)
# Each group's rate is normalised by its own size (the Caucasian rate used to be divided by af.shape[0]).
rate_ca = round(ca[ca['two_year_recid']==1].shape[0]/ca.shape[0],2)
print(f'The rate of Recidivism for African-American is {rate_af}\nThe rate of Recidivism for Caucasian is {rate_ca}')
print(f'The Corrected Recidivism rate should be {(rate_af+rate_ca)/2:,} ')

# Target as a one-column frame that shares the id index with X.
y = data[['two_year_recid']].set_index(data['id'])

In [None]:
# Pairwise correlations between the numeric feature columns only.
# X also holds string and datetime columns; recent pandas versions raise when
# DataFrame.corr meets non-numeric data, so restrict the frame explicitly.
corr = X.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15,12))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title('Correlation between numeric features')
plt.show()

In [None]:
# One-hot encode every column of X, then hold out 30% of the rows for testing.
enc = OneHotEncoder(handle_unknown='ignore')
X_new = enc.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=.3,
    random_state=44,
)


In [None]:
# Baseline: random forest trained on the original labels.
forest = RandomForestClassifier(random_state=44)
forest.fit(X_train, y_train)
y_pred_test = forest.predict(X_test)

# Overall accuracy and confusion matrix on the held-out rows.
print(accuracy_score(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

# Re-attach true and predicted labels to the raw test rows so they can be split by race.
cali_X = X.loc[y_test.index]
cali_X['label'] = y_test
cali_X['pred'] = y_pred_test
af_X = cali_X[cali_X['race'] == 'African-American']
ca_X = cali_X[cali_X['race'] == 'Caucasian']

# Absolute difference between the two groups' accuracies.
print(np.abs(
    accuracy_score(af_X['label'], af_X['pred'])
    - accuracy_score(ca_X['label'], ca_X['pred'])
))

The model predicts a higher recidivism rate for African-American defendants than for Caucasian defendants, whereas in reality the two rates should be nearly the same.

In [None]:
# Names of the non-numeric columns of X that are examined for association below.
categorical_features = [
    'first',
    'last',
    'age_cat',
    'race',
    'c_charge_degree',
    'c_charge_desc',
    'type_of_assessment',
    'score_text',
    'v_type_of_assessment',
    'v_score_text',
]

In [None]:
# Pull out the categorical columns and keep an independent deep copy of them.
selected_column = X[categorical_features]
categorical_df = selected_column.copy(deep=True)

In [None]:
# Compute and plot the pairwise associations between the categorical columns;
# the figure is also written to categorical_correlation.png.
categorical_correlation = associations(
    categorical_df,
    filename='categorical_correlation.png',
    figsize=(10, 10),
)

Here we set the correlation threshold at 0.2 (only attributes with a correlation of at least 0.2 are kept), which gives the explanatory attributes $e_{i}$: first, last, c_charge_desc, score_text and v_score_text.

In [None]:
# Explanatory attributes e_i.
e_list = [
    'first',
    'last',
    'c_charge_desc',
    'score_text',
    'v_score_text',
]

# Remaining feature columns (neither an explanatory attribute nor `race`).
exclude_e_s = [
    'compas_screening_date', 'age', 'age_cat', 'juv_fel_count',
    'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count',
    'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_offense_date',
    'c_days_from_compas', 'c_charge_degree', 'is_recid', 'is_violent_recid',
    'type_of_assessment', 'decile_score.1', 'screening_date', 'v_type_of_assessment',
    'v_decile_score', 'v_screening_date', 'in_custody', 'out_custody',
    'priors_count.1', 'start', 'end', 'event',
]

# e: explanatory attributes, new_X: remaining features, s: sensitive attribute (one-column frame).
e, new_X, s = X[e_list], X[exclude_e_s], X[['race']]

### Handling Conditional Discrimination

![a3.jpg](../figs/a3.jpg)

In [None]:
def partition(X,e):
    """Build one feature frame per explanatory attribute.

    For every key of ``e`` a fresh copy of ``X`` is made and only that one
    explanatory column is added to it.

    Previously the caller's ``X`` was modified in place and the same object
    was appended on every iteration. Every entry of the returned list was
    therefore the identical frame containing all explanatory columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Base feature frame. It is not modified.
    e : pandas.DataFrame or mapping
        Explanatory attributes; ``e.keys()`` yields the column names and
        ``e[name]`` the corresponding column values.

    Returns
    -------
    list of pandas.DataFrame
        One independent frame per key of ``e``, in key order.
    """
    X_i = []
    for name in e.keys():
        part = X.copy()          # independent frame for this attribute
        part[name] = e[name]     # add only the current explanatory column
        X_i.append(part)
    return X_i

In [None]:
# Build the per-attribute feature frames from new_X and the explanatory attributes in e.
# NOTE(review): `temp` is not referenced again in the cells visible here.
temp = partition(new_X,e)

![a4.jpg](../figs/a4.jpg)

For Algorithm 4, $p^\star(+|e_i)$ and $p(+|e_i,\text{gender})$ change for each $i$ (in this notebook the sensitive attribute is race rather than gender). The description of $G_i$ is too abstract for me to implement directly, so I define $G_i$ as a constant $C$ and treat it as a hyperparameter.

In [None]:
def ceildiv(a, b):
    """Return ``a / b`` rounded up to the next whole number."""
    quotient, remainder = divmod(a, b)
    # Floor division already rounds down; step up by one whenever something is left over.
    if remainder:
        return quotient + 1
    return quotient
def delta(item, y_pred, y_prob, G_i = 12):
    """Relabel borderline predictions to narrow the gap in positive rates between race groups.

    A prediction is "borderline" when ``|P(Yes) - P(No)| <= G_i * threshold``,
    where ``threshold = |rate_af - p_star|`` and ``p_star`` is the mean of the
    two groups' predicted positive rates (each rounded to two decimals).
    Borderline African-American rows with ``P(Yes) > P(No)`` are set to 0, and
    borderline Caucasian rows with ``P(Yes) < P(No)`` are set to 1.

    The relabelling uses boolean masks and a single ``.loc`` assignment.
    The earlier chained assignments (``temp.label.iloc[i] = ...``) are silently
    ignored when pandas Copy-on-Write is active.

    Parameters
    ----------
    item : pandas.DataFrame
        Feature frame with a ``race`` column. It is not modified.
    y_pred : array-like, length ``len(item)``
        Predicted labels (0 or 1), row-aligned with ``item``.
    y_prob : array-like, shape ``(len(item), 2)``
        Class probabilities; column 0 is "No", column 1 is "Yes".
    G_i : float, default 12
        Multiplier that widens or narrows the borderline band.

    Returns
    -------
    pandas.DataFrame
        Copy of ``item`` with a ``label`` column holding the adjusted labels.
        If either race group is absent there is no gap to correct, and the
        labels are returned unchanged.
    """
    item_copy = item.copy()
    item_copy['label'] = y_pred

    race = item_copy['race'].to_numpy()
    labels = item_copy['label'].to_numpy()
    is_af = race == 'African-American'
    is_ca = race == 'Caucasian'
    n_af, n_ca = int(is_af.sum()), int(is_ca.sum())
    if n_af == 0 or n_ca == 0:
        # A missing group would otherwise cause a division by zero below.
        return item_copy

    rate_af = round(int((is_af & (labels == 1)).sum()) / n_af, 2)
    rate_ca = round(int((is_ca & (labels == 1)).sum()) / n_ca, 2)
    p_star = (rate_af + rate_ca) / 2
    threshold = np.abs(rate_af - p_star)

    prob = np.asarray(y_prob, dtype=float)
    margin = prob[:, 1] - prob[:, 0]            # P(Yes) - P(No)
    borderline = np.abs(margin) <= G_i * threshold

    demote = borderline & is_af & (margin > 0)   # African-American, leaning "Yes" -> 0
    promote = borderline & is_ca & (margin < 0)  # Caucasian, leaning "No" -> 1
    item_copy.loc[demote, 'label'] = 0
    item_copy.loc[promote, 'label'] = 1

    return item_copy
def delta2(item, y_pred, y_prob, G_i = 6):
    """Relabel a limited quota of borderline predictions in each race group.

    The borderline band is ``|P(Yes) - P(No)| <= G_i * threshold``, where
    ``threshold = |rate_af - p_star|`` and ``p_star`` is the mean of the two
    groups' predicted positive rates (each rounded to two decimals).
    Inside the band, for each race group:

    * ``quota = min(ceil(n_pos / 2), ceil(n_neg / 2))``, where ``n_pos`` and
      ``n_neg`` count the group's borderline rows labelled 1 and 0;
    * the first ``quota`` borderline African-American rows labelled 1 are set
      to 0, and the first ``quota`` borderline Caucasian rows labelled 0 are
      set to 1.

    Earlier the inner loop had no stop condition and flipped every candidate
    row, so the computed quota was ignored. It also relied on chained
    assignment, which is a no-op when pandas Copy-on-Write is active.

    Parameters
    ----------
    item : pandas.DataFrame
        Feature frame with a ``race`` column. It is not modified.
    y_pred : array-like, length ``len(item)``
        Predicted labels (0 or 1), row-aligned with ``item``.
    y_prob : array-like, shape ``(len(item), 2)``
        Class probabilities; column 0 is "No", column 1 is "Yes".
    G_i : float, default 6
        Multiplier that widens or narrows the borderline band.

    Returns
    -------
    pandas.DataFrame
        Copy of ``item`` with a ``label`` column holding the adjusted labels.
        Labels are returned unchanged if either race group is absent.
    """
    item_copy = item.copy()
    item_copy['label'] = y_pred

    race = item_copy['race'].to_numpy()
    labels = item_copy['label'].to_numpy()
    is_af = race == 'African-American'
    is_ca = race == 'Caucasian'
    n_af, n_ca = int(is_af.sum()), int(is_ca.sum())
    if n_af == 0 or n_ca == 0:
        # A missing group would otherwise cause a division by zero below.
        return item_copy

    rate_af = round(int((is_af & (labels == 1)).sum()) / n_af, 2)
    rate_ca = round(int((is_ca & (labels == 1)).sum()) / n_ca, 2)
    p_star = (rate_af + rate_ca) / 2
    threshold = np.abs(rate_af - p_star)

    prob = np.asarray(y_prob, dtype=float)
    borderline = np.abs(prob[:, 1] - prob[:, 0]) <= G_i * threshold

    new_labels = labels.copy()
    # (group mask, label to flip from, label to flip to)
    for group_mask, old_label, new_label in ((is_af, 1, 0), (is_ca, 0, 1)):
        in_band = borderline & group_mask
        n_pos = int((in_band & (labels == 1)).sum())
        n_neg = int((in_band & (labels == 0)).sum())
        # (n + 1) // 2 is ceil(n / 2) for non-negative integers.
        quota = min((n_pos + 1) // 2, (n_neg + 1) // 2)
        # NOTE(review): candidates are taken in row order, as the original loop
        # visited them; ranking by |P(Yes) - P(No)| may be preferable.
        candidates = np.flatnonzero(in_band & (labels == old_label))[:quota]
        new_labels[candidates] = new_label

    item_copy['label'] = new_labels

    return item_copy

## Algorithm 1

![a1.jpg](../figs/a1.jpg)

In [None]:
def local_massaging(X,s,e,y):
    """Produce massaged labels by majority vote over one model per explanatory attribute.

    For every frame returned by ``partition(X, e)`` the features are one-hot
    encoded, a random forest is fitted on a 70% split, predictions for all
    rows are adjusted with ``delta``, and the adjusted labels count as one
    vote. A row's ``new_label`` is 1 when a strict majority of votes is 1.

    The function now works on a copy of ``X``. Previously it wrote ``race``
    and ``new_label`` into the caller's frame, so a later call on the same
    frame saw ``new_label`` as a feature.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified.
    s : pandas.Series or one-column pandas.DataFrame
        Sensitive attribute, stored in the ``race`` column of the working copy.
    e : pandas.DataFrame or mapping
        Explanatory attributes, forwarded to ``partition``.
    y : array-like
        Target labels, row-aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``race`` and ``new_label`` (0.0 / 1.0) columns.
    """
    X = X.copy()
    X['race'] = s
    y_flat = np.ravel(y)  # 1-D target, whether y is a Series or a one-column frame

    votes = []
    for item in partition(X, e):
        enc = OneHotEncoder(handle_unknown='ignore')
        X_new = enc.fit_transform(item)
        # Only the training part of the split is used; predictions cover every row.
        X_train, _, y_train, _ = train_test_split(X_new, y_flat, test_size=.3, random_state=5)
        forest = RandomForestClassifier(random_state=44).fit(X_train, y_train)
        y_pred = forest.predict(X_new)
        y_prob = forest.predict_proba(X_new)
        votes.append(delta(item, y_pred, y_prob)[['label']])

    # Majority vote: with five partitions this is exactly "sum >= 3 -> 1, else 0".
    result = pd.concat(votes, axis=1)
    majority = result.sum(axis=1) > result.shape[1] / 2
    # Stored as float to match the dtype produced by the earlier implementation.
    X['new_label'] = majority.astype(float)
    return X

In [None]:
# Run local massaging on the selected features; the returned frame carries a `new_label` column.
New_X = local_massaging(new_X,s,e,y)

In [None]:
# Share of rows with new_label == 1 inside each race group, rounded to two decimals.
af = New_X[New_X['race'] == 'African-American']
ca = New_X[New_X['race'] == 'Caucasian']
rate_af = round(len(af[af['new_label'] == 1]) / len(af), 2)
rate_ca = round(len(ca[ca['new_label'] == 1]) / len(ca), 2)

In [None]:
# Display the two group rates computed in the previous cell.
rate_af,rate_ca 

In [None]:
# We can see that the discrimination is being balanced.

In [None]:
# Retrain on the massaged labels: new_label is the target, every other column is a feature.
New_y = New_X['new_label']
New_X_copy = New_X.drop(columns=['new_label'])
X_new = enc.fit_transform(New_X_copy)
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    New_y,
    test_size=.3,
    random_state=42,
)
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
y_pred_test = forest.predict(X_test)

Accuracy

In [None]:
# Overall accuracy and confusion matrix on the held-out rows.
print(accuracy_score(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

# Re-attach true and predicted labels to the test rows so they can be split by race.
cali_X = New_X.loc[y_test.index]
cali_X['label'] = y_test
cali_X['pred'] = y_pred_test
af_X = cali_X[cali_X['race'] == 'African-American']
ca_X = cali_X[cali_X['race'] == 'Caucasian']

# Absolute difference between the two groups' accuracies.
print(np.abs(
    accuracy_score(af_X['label'], af_X['pred'])
    - accuracy_score(ca_X['label'], ca_X['pred'])
))

### Algorithm 2

![a2.jpg](../figs/a2.jpg)

In [None]:
def local_preferential_sampling(X,s,e,y):
    """Produce adjusted labels by majority vote, using ``delta2`` on each partition.

    For every frame returned by ``partition(X, e)`` the features are one-hot
    encoded, a random forest is fitted on a 70% split, predictions for all
    rows are adjusted with ``delta2`` (``G_i = 6``), and the adjusted labels
    count as one vote. A row's ``new_label`` is 1 when a strict majority of
    votes is 1.

    The function was defined under the misspelled name
    ``local_preferential_sampking`` while the notebook calls
    ``local_preferential_sampling``; the correct spelling is now the
    definition and the old spelling is kept as an alias. It also works on a
    copy of ``X`` instead of writing ``race`` / ``new_label`` into the
    caller's frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is not modified.
    s : pandas.Series or one-column pandas.DataFrame
        Sensitive attribute, stored in the ``race`` column of the working copy.
    e : pandas.DataFrame or mapping
        Explanatory attributes, forwarded to ``partition``.
    y : array-like
        Target labels, row-aligned with ``X``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``race`` and ``new_label`` (0.0 / 1.0) columns.
    """
    X = X.copy()
    X['race'] = s
    y_flat = np.ravel(y)  # 1-D target, whether y is a Series or a one-column frame

    votes = []
    for item in partition(X, e):
        enc = OneHotEncoder(handle_unknown='ignore')
        X_new = enc.fit_transform(item)
        # Only the training part of the split is used; predictions cover every row.
        X_train, _, y_train, _ = train_test_split(X_new, y_flat, test_size=.3, random_state=42)
        forest = RandomForestClassifier(random_state = 88).fit(X_train, y_train)
        y_pred = forest.predict(X_new)
        y_prob = forest.predict_proba(X_new)
        votes.append(delta2(item, y_pred, y_prob, 6)[['label']])

    # Majority vote: with five partitions this is exactly "sum >= 3 -> 1, else 0".
    result = pd.concat(votes, axis=1)
    majority = result.sum(axis=1) > result.shape[1] / 2
    # Stored as float to match the dtype produced by the earlier implementation.
    X['new_label'] = majority.astype(float)

    return X


# Backward-compatible alias for the original (misspelled) name.
local_preferential_sampking = local_preferential_sampling

In [None]:
# Run local preferential sampling, then report the share of new_label == 1 per race group.
X_New = local_preferential_sampling(new_X, s, e, y)
af = X_New[X_New['race'] == 'African-American']
ca = X_New[X_New['race'] == 'Caucasian']
rate_af = round(len(af[af['new_label'] == 1]) / len(af), 2)
rate_ca = round(len(ca[ca['new_label'] == 1]) / len(ca), 2)
rate_af, rate_ca


In [None]:
# Retrain on the adjusted labels: new_label is the target, every other column is a feature.
y_New = X_New['new_label']
X_New_copy = X_New.drop(columns=['new_label'])
X_new = enc.fit_transform(X_New_copy)
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y_New,
    test_size=.3,
    random_state=42,
)
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
y_pred_test = forest.predict(X_test)

In [None]:
# Overall accuracy and confusion matrix on the held-out rows.
print(accuracy_score(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))

# Re-attach true and predicted labels to the test rows so they can be split by race.
cali_X = X_New.loc[y_test.index]
cali_X['label'] = y_test
cali_X['pred'] = y_pred_test
af_X = cali_X[cali_X['race'] == 'African-American']
ca_X = cali_X[cali_X['race'] == 'Caucasian']

# Absolute difference between the two groups' accuracies.
print(np.abs(
    accuracy_score(af_X['label'], af_X['pred'])
    - accuracy_score(ca_X['label'], ca_X['pred'])
))

## Conclusion:
We can see that accuracy and calibration decreased slightly after applying the modified algorithms, but in return the unwanted discrimination was largely eliminated. We conclude that the drop in accuracy is a worthwhile trade-off.