## COMPAS Recidivism Racial Bias EDA and DE

In [10]:
import numpy as np
import pandas as pd
import os

In [2]:
work_dir = os.getcwd()
print("Working directory %s" % work_dir)

Working directory C:\Users\omisko\Desktop\University\UCU\Responsible-Ds-2019\noble-model-audit


In [3]:
data_dir = os.path.join(work_dir, 'data')
print("Data directory %s" % data_dir)

Data directory C:\Users\omisko\Desktop\University\UCU\Responsible-Ds-2019\noble-model-audit\data


In [4]:
compas_data_filename = 'compas-scores-raw.csv'
compas_data_filepath = os.path.join(data_dir, compas_data_filename)
compas_data = pd.read_csv(compas_data_filepath)

In [7]:
compas_data.head(5)

Unnamed: 0,Person_ID,AssessmentID,Case_ID,Agency_Text,LastName,FirstName,MiddleName,Sex_Code_Text,Ethnic_Code_Text,DateOfBirth,...,RecSupervisionLevel,RecSupervisionLevelText,Scale_ID,DisplayText,RawScore,DecileScore,ScoreText,AssessmentType,IsCompleted,IsDeleted
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,7,Risk of Violence,-2.08,4,Low,New,1,0
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,8,Risk of Recidivism,-1.06,2,Low,New,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New,1,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,7,Risk of Violence,-2.84,2,Low,New,1,0
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,8,Risk of Recidivism,-1.5,1,Low,New,1,0


First of all, we need to rename the columns for convenience.

In [9]:
compas_data_columns = ['person_id', 'assessment_id', 'case_id', 'agency_text', 
                       'last_name', 'first_name', 'middle_name', 'sex', 'nationality', 'birth_date',
                       'scale_set_id', 'scale_set', 'assessment_reason', 'language',
                       'legal_status', 'custody_status', 'marital_status', 'screening_date',
                       'rec_supervision_level_num', 'rec_supervision_level', 'scale_id',
                       'display_text', 'raw_score', 'decile_score', 'score_text', 'assessment_type',
                       'is_completed', 'is_deleted']

compas_data.columns = compas_data_columns
compas_data.head()

Unnamed: 0,person_id,assessment_id,case_id,agency_text,last_name,first_name,middle_name,sex,nationality,birth_date,...,rec_supervision_level_num,rec_supervision_level,scale_id,display_text,raw_score,decile_score,score_text,assessment_type,is_completed,is_deleted
0,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,7,Risk of Violence,-2.08,4,Low,New,1,0
1,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,8,Risk of Recidivism,-1.06,2,Low,New,1,0
2,50844,57167,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,...,1,Low,18,Risk of Failure to Appear,15.0,1,Low,New,1,0
3,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,7,Risk of Violence,-2.84,2,Low,New,1,0
4,50848,57174,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,...,1,Low,8,Risk of Recidivism,-1.5,1,Low,New,1,0


A lot of columns provide unnecessary information for exploratory data analysis.

For example **language** column contains **100% of English data**. 
**assessment_reason** has only **intake** values.
Also, **is_completed** and **is_deleted** columns would not provide us a lot of valuable information.

Therefore, we need to remove some columns from the data frame.

In [12]:
columns_to_drop = ['person_id', 'assessment_id', 'scale_set_id', 
                   'language', 'screening_date', 'rec_supervision_level_num', 
                   'scale_id', 'is_completed', 'is_deleted']

In [14]:
compas_data = compas_data.drop(columns_to_drop, axis=1)
compas_data.head()

Unnamed: 0,case_id,agency_text,last_name,first_name,middle_name,sex,nationality,birth_date,scale_set,assessment_reason,legal_status,custody_status,marital_status,rec_supervision_level,display_text,raw_score,decile_score,score_text,assessment_type
0,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,Low,Risk of Violence,-2.08,4,Low,New
1,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,Low,Risk of Recidivism,-1.06,2,Low,New
2,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,Low,Risk of Failure to Appear,15.0,1,Low,New
3,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Married,Low,Risk of Violence,-2.84,2,Low,New
4,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Married,Low,Risk of Recidivism,-1.5,1,Low,New


Now, after taking a look at the data we can see that we have plenty of duplicates already in the first five rows. Therefore, it is necessary to remove duplicate data. We can safely search duplicates by **case_id** since it should be unique for each case.

In [16]:
compas_data = compas_data.drop_duplicates(subset = 'case_id')
compas_data.head()

Unnamed: 0,case_id,agency_text,last_name,first_name,middle_name,sex,nationality,birth_date,scale_set,assessment_reason,legal_status,custody_status,marital_status,rec_supervision_level,display_text,raw_score,decile_score,score_text,assessment_type
0,51950,PRETRIAL,Fisher,Kevin,,Male,Caucasian,12/05/92,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,Low,Risk of Violence,-2.08,4,Low,New
3,51956,PRETRIAL,KENDALL,KEVIN,,Male,Caucasian,09/16/84,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Married,Low,Risk of Violence,-2.84,2,Low,New
6,51963,PRETRIAL,DAYES,DANIEL,,Male,African-American,08/25/94,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,High,Risk of Violence,-1.2,8,High,New
9,51958,PRETRIAL,Debe,Mikerlie,George,Female,African-American,10/09/94,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Significant Other,Medium,Risk of Violence,-1.29,7,Medium,New
12,51945,PRETRIAL,McLaurin,Stephanie,Nicole,Female,African-American,06/29/85,Risk and Prescreen,Intake,Pretrial,Jail Inmate,Single,Low,Risk of Violence,-2.9,2,Low,New
