In [9]:
# importing libraries ...

import pandas as pd
import math
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings

In [34]:
# ALGORITHM 1: LOCAL MASSAGING
# inputs - dataframe, sensitive attribute ("race" in this case), explanatory attribute (v_decile_score), label attribute (two_year_recid),
#          param (multiplier applied to the number of labels to switch)
# outputs - output_dictionary (a dictionary of dataframes subgrouped by explanatory attribute)

def local_massage(dataframe, s_attribute, e_attribute, y_attribute, param):
    """Relabel borderline rows inside every explanatory-attribute partition.

    The frame is split by ``e_attribute``. Within each partition a logistic
    regression is fitted on all non-label columns and used as a ranker; the
    rows whose predicted probability is closest to 0.5 are relabelled:

    * rows with ``s_attribute == 1`` and label 0 are switched to label 1;
    * rows with ``s_attribute == 0`` and label 1 are switched to label 0.

    (In the COMPAS usage of this notebook, 1 is Caucasian and 0 is
    African-American.)  The direction of the switch is fixed.
    NOTE(review): this presumably relies on group 0 having the higher
    positive rate in every partition -- confirm against the data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Training data; every column except ``y_attribute`` is used as a
        feature for the ranker.
    s_attribute : str
        Name of the binary (0/1) sensitive column.
    e_attribute : str
        Name of the explanatory column used for partitioning.
    y_attribute : str
        Name of the binary (0/1) label column.
    param : float
        Multiplier applied to the count returned by ``subroutine``.

    Returns
    -------
    dict
        Maps each partition key to a DataFrame holding that partition's rows
        with the new labels plus the helper columns ``prob_0``, ``prob_1``
        and ``dist`` (``dist`` is NaN for rows that were never relabel
        candidates).
    """
    partitions = partition(dataframe, e_attribute)
    output_dictionary = {}

    for key, part in partitions.items():
        # Copy so the probability columns are written to an independent frame
        # rather than to a filtered slice (avoids SettingWithCopyWarning).
        tmp_df = part.copy()

        # Learn a ranker H on this partition only.
        features = tmp_df.drop(columns=[y_attribute])
        model = LogisticRegression()
        model.fit(features, tmp_df[y_attribute])
        tmp_df['prob_0'], tmp_df['prob_1'] = model.predict_proba(features).T

        # Number of labels to switch per group (algorithm 4); never negative.
        no_of_group1_switch = max(
            0, int(subroutine(dataframe, s_attribute, 1, e_attribute, key, y_attribute) * param))
        no_of_group0_switch = max(
            0, int(subroutine(dataframe, s_attribute, 0, e_attribute, key, y_attribute) * param))

        in_group1 = tmp_df[s_attribute] == 1
        in_group0 = tmp_df[s_attribute] == 0
        is_pos = tmp_df[y_attribute] == 1
        is_neg = tmp_df[y_attribute] == 0

        # These two subsets are passed through untouched.
        group1_pos = tmp_df[in_group1 & is_pos]
        group0_neg = tmp_df[in_group0 & is_neg]

        # Relabel candidates, ordered by distance to the decision boundary.
        group1_neg = (
            tmp_df[in_group1 & is_neg]
            .assign(dist=lambda frame: (frame['prob_0'] - 0.5).abs())
            .sort_values(by='dist')
            .reset_index(drop=True)
        )
        group0_pos = (
            tmp_df[in_group0 & is_pos]
            .assign(dist=lambda frame: (frame['prob_0'] - 0.5).abs())
            .sort_values(by='dist')
            .reset_index(drop=True)
        )

        # Slicing the index caps the count at the number of candidates, so a
        # request larger than the subset can no longer append all-NaN rows
        # (which is what label-based `.at[i, ...]` assignment does for a
        # missing label).
        group1_neg.loc[group1_neg.index[:no_of_group1_switch], y_attribute] = 1
        group0_pos.loc[group0_pos.index[:no_of_group0_switch], y_attribute] = 0

        # Join the four subsets (now with new labels).
        output_dictionary[key] = pd.concat(
            [group1_pos, group1_neg, group0_pos, group0_neg], ignore_index=True)

    return output_dictionary

In [35]:
# ALGORITHM 2: LOCAL PREFERENTIAL SAMPLING
# inputs - dataframe, sensitive attribute ("race" in this case), explanatory attribute (v_decile_score), label attribute (two_year_recid),
#          param (multiplier applied to the number of rows to delete/duplicate)
# outputs - output_dictionary (a dictionary of dataframes subgrouped by explanatory attribute)

def local_preferential_sampling(dataframe, s_attribute, e_attribute, y_attribute, param):
    """Resample borderline rows inside every explanatory-attribute partition.

    The frame is split by ``e_attribute``. Within each partition a logistic
    regression is fitted on all non-label columns and used as a ranker. With
    rows ordered by how close their predicted probability is to 0.5:

    * ``s_attribute == 1``: the closest label-0 rows are deleted and the same
      number of closest label-1 rows are duplicated;
    * ``s_attribute == 0``: the closest label-1 rows are deleted and the same
      number of closest label-0 rows are duplicated.

    (In the COMPAS usage of this notebook, 1 is Caucasian and 0 is
    African-American.)  The per-group count is
    ``int(subroutine(...) / 2 * param)``; it is clamped at zero and, through
    slicing, at the number of rows available.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Training data; every column except ``y_attribute`` is used as a
        feature for the ranker.
    s_attribute : str
        Name of the binary (0/1) sensitive column.
    e_attribute : str
        Name of the explanatory column used for partitioning.
    y_attribute : str
        Name of the binary (0/1) label column.
    param : float
        Multiplier applied to the count derived from ``subroutine``.

    Returns
    -------
    dict
        Maps each partition key to the resampled DataFrame, which carries the
        helper columns ``prob_0``, ``prob_1`` and ``dist``.
    """
    partitions = partition(dataframe, e_attribute)
    output_dictionary = {}

    for key, part in partitions.items():
        # Copy so the helper columns are written to an independent frame
        # rather than to a filtered slice (avoids SettingWithCopyWarning).
        tmp_df = part.copy()

        # Learn a ranker H on this partition only.
        features = tmp_df.drop(columns=[y_attribute])
        model = LogisticRegression()
        model.fit(features, tmp_df[y_attribute])
        tmp_df['prob_0'], tmp_df['prob_1'] = model.predict_proba(features).T

        # Distance to the decision boundary, needed by all four subsets.
        tmp_df['dist'] = (tmp_df['prob_0'] - 0.5).abs()

        # Number of rows to delete/duplicate per group (algorithm 4).
        no_of_group1_switch = max(
            0, int((subroutine(dataframe, s_attribute, 1, e_attribute, key, y_attribute) / 2) * param))
        no_of_group0_switch = max(
            0, int((subroutine(dataframe, s_attribute, 0, e_attribute, key, y_attribute) / 2) * param))

        in_group1 = tmp_df[s_attribute] == 1
        in_group0 = tmp_df[s_attribute] == 0
        is_pos = tmp_df[y_attribute] == 1
        is_neg = tmp_df[y_attribute] == 0

        # Each subset is ordered closest-to-boundary first.
        group1_neg = tmp_df[in_group1 & is_neg].sort_values(by='dist').reset_index(drop=True)
        group1_pos = tmp_df[in_group1 & is_pos].sort_values(by='dist').reset_index(drop=True)
        group0_neg = tmp_df[in_group0 & is_neg].sort_values(by='dist').reset_index(drop=True)
        group0_pos = tmp_df[in_group0 & is_pos].sort_values(by='dist').reset_index(drop=True)

        # Delete the closest rows. A positional slice replaces the
        # one-row-at-a-time drop loop and simply yields an empty frame when
        # the count exceeds the rows available (the loop raised IndexError).
        group1_neg = group1_neg.iloc[no_of_group1_switch:]
        group0_pos = group0_pos.iloc[no_of_group0_switch:]

        # Duplicate the closest rows of the complementary subsets.
        group1_pos = pd.concat([group1_pos, group1_pos.iloc[:no_of_group1_switch]])
        group0_neg = pd.concat([group0_neg, group0_neg.iloc[:no_of_group0_switch]])

        # Join the four subsets.
        output_dictionary[key] = pd.concat(
            [group1_pos, group1_neg, group0_pos, group0_neg], ignore_index=True)

    return output_dictionary

In [36]:
# ALGORITHM 3: PARTITION
# input - dataframe (pandas dataframe), e_attribute (string, explanatory attribute)
# output - a dictionary of dataframes where key is a unique e_attribute, and the dataframes
# contain all rows from the original dataframe where e_attribute is the key

def partition(dataframe, e_attribute):
    """Split ``dataframe`` into one sub-frame per distinct ``e_attribute`` value.

    Returns a dict whose keys are the distinct values of the column, in order
    of first appearance, and whose values are the rows of ``dataframe`` where
    the column equals that key (original row index preserved).
    """
    column = dataframe[e_attribute]
    return {value: dataframe[column == value] for value in column.unique()}

In [37]:
# ALGORITHM 4: SUBROUTINE DELTA(s_attribute)
# input - dataframe (entire pandas dataframe), s_attribute (name of s_attribute), s_attribute_value (value of s_attribute)
# e_attribute (name of e_attribute), e_attribute_value (value of e_attribute), y_attribute (name of y_attribute)
# output - a float giving the number of people whose labels should be flipped (callers truncate it to an int)

def subroutine(dataframe, s_attribute, s_attribute_value, e_attribute, e_attribute_value, y_attribute):
    """Return how many rows of one sensitive group should change in a partition.

    Only rows with ``e_attribute == e_attribute_value`` are considered. With
    ``G_i`` the number of those rows that have
    ``s_attribute == s_attribute_value``, ``p_1`` the fraction of them with
    ``y_attribute == 1``, and ``p_2`` the unweighted mean of the positive
    fractions of the ``s_attribute == 1`` and ``s_attribute == 0`` groups,
    the result is ``G_i * abs(p_1 - p_2)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The full data set (not a single partition).
    s_attribute, e_attribute, y_attribute : str
        Names of the sensitive, explanatory and label columns.
    s_attribute_value
        Sensitive-group value the count is computed for.
    e_attribute_value
        Explanatory value selecting the partition.

    Returns
    -------
    float
        The (fractional) number of rows; ``0.0`` when the partition lacks the
        requested group or either of the 0/1 groups, because the target rate
        is undefined there and nothing can be equalised. Previously that case
        raised ZeroDivisionError.
    """
    # Filter the partition once instead of re-filtering the whole frame for
    # every count.
    in_partition = dataframe[dataframe[e_attribute] == e_attribute_value]
    sensitive = in_partition[s_attribute]
    is_positive = in_partition[y_attribute] == 1

    def group_counts(value):
        # (group size, number of positives in the group) as plain ints
        members = sensitive == value
        return int(members.sum()), int((members & is_positive).sum())

    G_i, positives_i = group_counts(s_attribute_value)
    size_1, positives_1 = group_counts(1)
    size_0, positives_0 = group_counts(0)

    if G_i == 0 or size_1 == 0 or size_0 == 0:
        return 0.0

    p_1 = positives_i / G_i
    p_2 = (positives_1 / size_1 + positives_0 / size_0) / 2

    return float(G_i * abs(p_1 - p_2))

In [46]:
# Script to get the accuracy and calibration of each model
# Also returns the model in case you want to see confusion matrix
# Random train/test not implemented in this script, but the code can be easily adjusted
# PARAM is optional, but generally we are only concerned with param=1 because this is what the paper says
def evaluate(algorithm, param=1, data_path='../output/cat_encoded_df.csv'):
    """Train a logistic regression on (optionally debiased) data and report it.

    The CSV is loaded, three columns are dropped, and the rows are split
    80/20 in file order (no shuffling). The training part is transformed by
    the selected algorithm, a ``LogisticRegression`` is fitted on every
    column except ``two_year_recid`` (``race`` is kept as a feature), and the
    overall test accuracy and the "calibration" (absolute difference between
    the test accuracies of the ``race == 0`` and ``race == 1`` rows) are
    printed.

    NOTE: this notebook defines ``evaluate`` in two cells; the other
    definition additionally excludes ``race`` from the features. Whichever
    cell was executed last is the one in effect.

    Parameters
    ----------
    algorithm : int
        1 = local massaging, 2 = local preferential sampling,
        3 = baseline (untransformed training data).
    param : float, optional
        Multiplier forwarded to the debiasing algorithm (default 1); unused
        for the baseline.
    data_path : str, optional
        Location of the encoded CSV file.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.

    Raises
    ------
    ValueError
        If ``algorithm`` is not 1, 2 or 3.
    """
    # Kept as in the original: silences warnings for the rest of the session.
    warnings.filterwarnings('ignore')

    dataframe = pd.read_csv(data_path)

    # 'vr_charge_degree' has missing values; the other two columns were also
    # dropped by the original cell.
    dataframe = dataframe.drop(columns=['vr_charge_degree', 'is_recid', 'is_violent_recid'])

    # Deterministic 80/20 split in file order.
    split_at = int(dataframe.shape[0] * 0.8)
    train = dataframe[:split_at]
    test = dataframe[split_at:]

    aa_test = test[test['race'] == 0]
    cau_test = test[test['race'] == 1]

    if algorithm == 1:
        partitions = local_massage(train, "race", "v_decile_score", "two_year_recid", param)
    elif algorithm == 2:
        partitions = local_preferential_sampling(train, "race", "v_decile_score", "two_year_recid", param)
    elif algorithm == 3:
        partitions = None
    else:
        raise ValueError("algorithm must be 1, 2 or 3, got " + repr(algorithm))

    if partitions is None:
        output = train
    else:
        # Aggregate whichever partitions exist, in ascending key order (the
        # same order as the former hard-coded 1..10 lookup, without the
        # KeyError when a score value is absent from the training split).
        output = pd.concat([partitions[key] for key in sorted(partitions)])
        # Drop the helper columns added by the debiasing step.
        output = output.drop(columns=['prob_0', 'prob_1', 'dist'])

    non_feature_columns = ['two_year_recid']

    model = LogisticRegression()
    model.fit(output.drop(columns=non_feature_columns), output['two_year_recid'])

    def accuracy_on(frame):
        # Accuracy of `model` on the rows of `frame`.
        predictions = model.predict(frame.drop(columns=non_feature_columns))
        return accuracy_score(frame['two_year_recid'].values, predictions)

    accuracy = accuracy_on(test)
    calibration = abs(accuracy_on(aa_test) - accuracy_on(cau_test))

    print("Accuracy: " + str(accuracy))
    # Previously computed but never reported by this definition.
    print("Calibration: " + str(calibration))

    return model

In [20]:
# Script to get the accuracy and calibration of each model
# Also returns the model in case you want to see confusion matrix
# Random train/test not implemented in this script, but the code can be easily adjusted
# PARAM is optional, but generally we are only concerned with param=1 because this is what the paper says
def evaluate(algorithm, param=1, data_path='../output/cat_encoded_df.csv'):
    """Train a race-blind logistic regression on (optionally debiased) data.

    The CSV is loaded, three columns are dropped, and the rows are split
    80/20 in file order (no shuffling). The training part is transformed by
    the selected algorithm, a ``LogisticRegression`` is fitted on every
    column except ``two_year_recid`` and ``race``, and the overall test
    accuracy and the "calibration" (absolute difference between the test
    accuracies of the ``race == 0`` and ``race == 1`` rows) are printed.

    NOTE: this notebook defines ``evaluate`` in two cells; the other
    definition keeps ``race`` as a model feature. Whichever cell was executed
    last is the one in effect.

    Parameters
    ----------
    algorithm : int
        1 = local massaging, 2 = local preferential sampling,
        3 = baseline (untransformed training data).
    param : float, optional
        Multiplier forwarded to the debiasing algorithm (default 1); unused
        for the baseline.
    data_path : str, optional
        Location of the encoded CSV file.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.

    Raises
    ------
    ValueError
        If ``algorithm`` is not 1, 2 or 3.
    """
    # Kept as in the original: silences warnings for the rest of the session.
    warnings.filterwarnings('ignore')

    dataframe = pd.read_csv(data_path)

    # 'vr_charge_degree' has missing values; the other two columns were also
    # dropped by the original cell.
    dataframe = dataframe.drop(columns=['vr_charge_degree', 'is_recid', 'is_violent_recid'])

    # Deterministic 80/20 split in file order.
    split_at = int(dataframe.shape[0] * 0.8)
    train = dataframe[:split_at]
    test = dataframe[split_at:]

    aa_test = test[test['race'] == 0]
    cau_test = test[test['race'] == 1]

    if algorithm == 1:
        partitions = local_massage(train, "race", "v_decile_score", "two_year_recid", param)
    elif algorithm == 2:
        partitions = local_preferential_sampling(train, "race", "v_decile_score", "two_year_recid", param)
    elif algorithm == 3:
        partitions = None
    else:
        raise ValueError("algorithm must be 1, 2 or 3, got " + repr(algorithm))

    if partitions is None:
        output = train
    else:
        # Aggregate whichever partitions exist, in ascending key order (the
        # same order as the former hard-coded 1..10 lookup, without the
        # KeyError when a score value is absent from the training split).
        output = pd.concat([partitions[key] for key in sorted(partitions)])
        # Drop the helper columns added by the debiasing step.
        output = output.drop(columns=['prob_0', 'prob_1', 'dist'])

    # The sensitive attribute is withheld from the classifier.
    non_feature_columns = ['two_year_recid', 'race']

    model = LogisticRegression()
    model.fit(output.drop(columns=non_feature_columns), output['two_year_recid'])

    def accuracy_on(frame):
        # Accuracy of `model` on the rows of `frame`.
        predictions = model.predict(frame.drop(columns=non_feature_columns))
        return accuracy_score(frame['two_year_recid'].values, predictions)

    accuracy = accuracy_on(test)
    calibration = abs(accuracy_on(aa_test) - accuracy_on(cau_test))

    print("Accuracy: " + str(accuracy))
    # Previously printed only for the baseline; now reported for every algorithm.
    print("Calibration: " + str(calibration))

    return model

In [43]:
# Run Local Massaging: algorithm=1 selects local_massage, param=1 applies the switch counts unscaled
model = evaluate(1, 1)

Accuracy: 0.651219512195122


In [44]:
# Run Local Preferential Sampling: algorithm=2 selects local_preferential_sampling, param=1 applies the counts unscaled
model = evaluate(2, 1)

Accuracy: 0.6471544715447154


In [47]:
# Run Baseline: algorithm=3 fits the model on the unmodified training split (param is not used on this path)
model = evaluate(3, 1)

Accuracy: 0.6504065040650406
