# A5-Prejudice Remover

In [1]:
## Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2 as cv2
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns

### Data processing

In [2]:
## Load raw data
# Read the compas-scores-two-years CSV; the path is relative to the notebook's working directory.
raw_data = pd.read_csv(filepath_or_buffer='../data/compas-scores-two-years.csv')

In [3]:
## Data cleaning & data processing

# Columns kept for the analysis; the other raw columns (many with missing data) are discarded.
kept_columns = ['sex', 'age', 'age_cat', 'race', 'decile_score', 'juv_fel_count', 'juv_misd_count',
                'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_jail_in', 'c_jail_out',
                'c_charge_degree', 'is_recid', 'score_text', 'two_year_recid']

# Keep only the "African-American" and "Caucasian" rows, restricted to the columns above.
processed_data = raw_data.loc[raw_data['race'].isin(["African-American", "Caucasian"]), kept_columns]

valid_rows = (
    # If the charge date of a defendant's COMPAS-scored crime was not within 30 days of the arrest,
    # we assume, for data-quality reasons, that we do not have the right offense.
    # (`between` is inclusive on both ends and is False for NaN, like the pair of <= / >= filters.)
    processed_data['days_b_screening_arrest'].between(-30, 30)
    # The recidivist flag (is_recid) is -1 when no COMPAS case could be found at all.
    & (processed_data['is_recid'] != -1)
    # Ordinary traffic offenses (c_charge_degree == 'O') do not result in jail time and are removed.
    & (processed_data['c_charge_degree'] != 'O')
    # score_text must not be 'N/A'. pandas.read_csv parses the literal 'N/A' as NaN by default,
    # so a plain string comparison would never drop such rows; missing values are excluded explicitly.
    & processed_data['score_text'].notna()
    & (processed_data['score_text'] != 'N/A')
)

# .copy() gives an independent frame, so the column assignment below does not write
# into a filtered slice (which is what triggers SettingWithCopyWarning).
processed_data = processed_data.loc[valid_rows].copy()

# Length of the jail stay in whole days (vectorised instead of a per-row lambda).
jail_in = pd.to_datetime(processed_data['c_jail_in'])
jail_out = pd.to_datetime(processed_data['c_jail_out'])
processed_data['length_of_stay'] = (jail_out - jail_in).dt.days

# The raw timestamps are no longer needed once the stay length is derived.
processed_data = processed_data.drop(columns=['c_jail_in', 'c_jail_out'])

In [4]:
# Preview the cleaned frame (pandas renders long frames truncated, as in the output below).
processed_data

Unnamed: 0,sex,age,age_cat,race,decile_score,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_charge_degree,is_recid,score_text,two_year_recid,length_of_stay
1,Male,34,25 - 45,African-American,3,0,0,0,0,-1.0,F,1,Low,1,10
2,Male,24,Less than 25,African-American,4,0,0,1,4,-1.0,F,1,Low,1,1
6,Male,41,25 - 45,Caucasian,6,0,0,0,14,-1.0,F,1,Medium,1,6
8,Female,39,25 - 45,Caucasian,1,0,0,0,0,-1.0,M,0,Low,0,2
10,Male,27,25 - 45,Caucasian,4,0,0,0,0,-1.0,F,0,Low,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,Male,30,25 - 45,African-American,2,0,0,0,0,-1.0,M,1,Low,1,0
7208,Male,20,Less than 25,African-American,9,0,0,0,0,-1.0,F,0,High,0,0
7209,Male,23,Less than 25,African-American,7,0,0,0,0,-1.0,F,0,Medium,0,1
7210,Male,23,Less than 25,African-American,3,0,0,0,0,-1.0,F,0,Low,0,1


In [5]:
# Integer codes for the categorical columns, listed as (column, original label, code).
# The replacements are applied one at a time, in the order given.
label_codes = [
    # sensitive attribute race: Caucasian -> 1, African-American -> 0
    ('race', 'Caucasian', 1),
    ('race', 'African-American', 0),
    # sex: Male -> 1, Female -> 0
    ('sex', 'Male', 1),
    ('sex', 'Female', 0),
    # age_cat: ordered from youngest (0) to oldest (2)
    ('age_cat', 'Less than 25', 0),
    ('age_cat', '25 - 45', 1),
    ('age_cat', 'Greater than 45', 2),
    # c_charge_degree: F -> 0, M -> 1
    ('c_charge_degree', 'F', 0),
    ('c_charge_degree', 'M', 1),
    # score_text: Low -> 0, Medium -> 1, High -> 2
    ('score_text', 'Low', 0),
    ('score_text', 'Medium', 1),
    ('score_text', 'High', 2),
]

for column_name, label, code in label_codes:
    # {column: label} limits the replacement to that one column.
    processed_data = processed_data.replace({column_name: label}, code)

In [6]:
# Check whether there are NaN values in the final dataset as well as the number of unique values per column

# Collect one record per column and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
summary_records = [
    {
        'column name': item,
        # dropna=False counts NaN as a value, matching len(series.unique())
        '# of unique values': processed_data[item].nunique(dropna=False),
        '# of NaN values': int(processed_data[item].isna().sum()),
    }
    for item in processed_data.columns
]
unique_NAN_df = pd.DataFrame(summary_records,
                             columns=['column name', '# of unique values', '# of NaN values'])

# Render without the index. Styler.hide_index was removed in pandas 2.0 in favour of
# Styler.hide(axis='index'); fall back to the old method on versions that predate `hide`.
summary_styler = unique_NAN_df.style
if hasattr(summary_styler, 'hide'):
    unique_NAN_df = summary_styler.hide(axis='index')
else:
    unique_NAN_df = summary_styler.hide_index()
unique_NAN_df

column name,# of unique values,# of NaN values
sex,2,0
age,62,0
age_cat,3,0
race,2,0
decile_score,10,0
juv_fel_count,9,0
juv_misd_count,10,0
juv_other_count,8,0
priors_count,36,0
days_b_screening_arrest,56,0


In [7]:
# Reorder the columns so that the label, two_year_recid, comes last.

cols = processed_data.columns.tolist()
cols.remove('two_year_recid')  # `cols` now holds every column except the label
processed_data = processed_data[[*cols, 'two_year_recid']]

In [8]:
# Put the race column at position 0.

# pop() removes the column from the frame in place and hands it back as a Series ...
race_column = processed_data.pop(item='race')
# ... which is then re-inserted as the left-most column.
processed_data.insert(loc=0, column='race', value=race_column)

In [9]:
# Inspect the frame after the label encoding and column reordering above.
processed_data

Unnamed: 0,race,sex,age,age_cat,decile_score,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_charge_degree,is_recid,score_text,length_of_stay,two_year_recid
1,0,1,34,1,3,0,0,0,0,-1.0,0,1,0,10,1
2,0,1,24,0,4,0,0,1,4,-1.0,0,1,0,1,1
6,1,1,41,1,6,0,0,0,14,-1.0,0,1,1,6,1
8,1,0,39,1,1,0,0,0,0,-1.0,1,0,0,2,0
10,1,1,27,1,4,0,0,0,0,-1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,0,1,30,1,2,0,0,0,0,-1.0,1,1,0,0,1
7208,0,1,20,0,9,0,0,0,0,-1.0,0,0,2,0,0
7209,0,1,23,0,7,0,0,0,0,-1.0,0,0,1,1,0
7210,0,1,23,0,3,0,0,0,0,-1.0,0,0,0,1,0


In [10]:
# Features excluded from modelling: the raw age and the three juvenile-offense counts.
dropped_features = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count']
processed_data = processed_data.drop(columns=dropped_features)

In [11]:
# save final data set to csv
# processed_data.to_csv("../output/processed-compas-scores-two-years.csv", index=False)

### Split the data

In [12]:
# Work on a plain array copy of the frame from here on.
full_matrix = np.array(processed_data)

# The last column is the label.
y = full_matrix[:, -1].flatten()

# The first column is the sensitive attribute; it is read from the unscaled array.
sensitive = full_matrix[:, 0]

# Standardise every non-label column, then discard the first (sensitive) column so that
# only the remaining features are left in `data`.
scaled_matrix = preprocessing.scale(full_matrix[:, :-1])
data = scaled_matrix[:, 1:]

In [13]:
# Row positions of the two groups: rows whose sensitive value equals 1, and all other rows.
sensitive_idx = np.where(sensitive == 1)[0]
nonsensitive_idx = np.where(sensitive != 1)[0]

# Features and labels of each group, selected with those positions.
data_sensitive, y_sensitive = data[sensitive_idx, :], y[sensitive_idx]
data_nonsensitive, y_nonsensitive = data[nonsensitive_idx, :], y[nonsensitive_idx]

In [14]:
# Split the sensitive group into training, validation, and testing sets.
# First hold out 20% for testing, then take 25% of the remaining 80% for validation,
# which gives a 60/20/20 split overall.

X_rest_s, X_test_s, y_rest_s, y_test_s = train_test_split(
    data_sensitive, y_sensitive, test_size=0.2, random_state=42)
X_train_s, X_valid_s, y_train_s, y_valid_s = train_test_split(
    X_rest_s, y_rest_s, test_size=0.25, random_state=42)

In [15]:
# Split the non-sensitive group into training, validation, and testing sets.
# First hold out 20% for testing, then take 25% of the remaining 80% for validation,
# which gives a 60/20/20 split overall.

X_rest_n, X_test_n, y_rest_n, y_test_n = train_test_split(
    data_nonsensitive, y_nonsensitive, test_size=0.2, random_state=42)
X_train_n, X_valid_n, y_train_n, y_valid_n = train_test_split(
    X_rest_n, y_rest_n, test_size=0.25, random_state=42)

In [16]:
# Create the final training, validation, and testing sets by stacking the two groups.
# In every array the sensitive-group rows come first, followed by the non-sensitive rows.

# Features
X_train = np.concatenate([X_train_s, X_train_n], axis=0)
X_valid = np.concatenate([X_valid_s, X_valid_n], axis=0)
X_test = np.concatenate([X_test_s, X_test_n], axis=0)

# Labels, stacked in the same group order as the features
Y_train = np.concatenate([y_train_s, y_train_n], axis=0)
Y_valid = np.concatenate([y_valid_s, y_valid_n], axis=0)
Y_test = np.concatenate([y_test_s, y_test_n], axis=0)

### Prejudice Remover 

In [None]:
import sys

sys.path.append('../lib/')
import LFR

sys.path.append('../lib/')
from EvalMetrics import *

sys.path.append('../lib/')
%run '../lib/LFR.py'

sys.path.append('../lib/')
%run '../lib/EvalMetrics.py'