In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [18]:
# Read data and calculate length of stay
df_raw = pd.read_csv("../data/compas-scores-two-years.csv")
length_of_stay = pd.to_datetime(df_raw["c_jail_out"]) - pd.to_datetime(df_raw["c_jail_in"])
# Completed whole hours in jail, expressed in days. Floor-dividing the elapsed
# seconds replaces `.astype('timedelta64[h]')`, which pandas >= 2.0 rejects
# with a TypeError (only 's', 'ms', 'us' and 'ns' resolutions are supported).
# Rows with a missing jail-in/jail-out timestamp stay NaN here and are
# excluded by the `> 0` comparison below.
length_of_stay_days = length_of_stay.dt.total_seconds() // 3600 / 24

# Select relevant features and filter data
# Keep only the two race groups under comparison and stays of at least one
# full hour (shorter stays floor to 0 and are dropped together with
# non-positive durations).
filtered_df = df_raw.loc[
    (df_raw["race"].isin(["African-American", "Caucasian"])) &
    (length_of_stay_days > 0)
].assign(
    # `assign` aligns on the index, so only the surviving rows get a value.
    length_of_stay=length_of_stay_days
)[[
    "two_year_recid", "race", "sex", "age", "c_charge_degree", "priors_count", "length_of_stay"
]]

# Drop missing values
cleaned_df = filtered_df.dropna()

# Display cleaned data
cleaned_df.head()

Unnamed: 0,two_year_recid,race,sex,age,c_charge_degree,priors_count,length_of_stay
1,1,African-American,Male,34,F,0,10.041667
2,1,African-American,Male,24,F,4,1.083333
6,1,Caucasian,Male,41,F,14,6.291667
8,0,Caucasian,Female,39,M,0,2.916667
9,1,Caucasian,Male,21,F,1,0.958333


In [19]:
# Define encoding functions
def encode_race(race):
    """Binary-encode a race label: 0 for 'African-American', 1 for any other value."""
    if race == 'African-American':
        return 0
    return 1

def encode_sex(sex):
    """Binary-encode a sex label: 0 for 'Female', 1 for any other value."""
    if sex == 'Female':
        return 0
    return 1

def encode_age(age):
    """Bucket an age into three ordinal groups.

    Returns 0 when ``age`` is below 25, 2 when it is above 45, and 1 for
    everything in between (25 and 45 themselves fall in the middle bucket).
    """
    # Guard clauses for the two outer buckets; the middle one is the fallback.
    if age < 25:
        return 0
    if age > 45:
        return 2
    return 1

def encode_c_charge_degree(c_charge_degree):
    """Binary-encode a charge degree: 0 for 'M', 1 for any other value."""
    if c_charge_degree == 'M':
        return 0
    return 1

def encode_priors_count(priors_count):
    """Bucket a prior-offence count into three ordinal groups.

    Returns 0 for exactly zero priors, 2 for more than three, and 1 for
    every other value.
    """
    # Guard clauses for the two explicit buckets; the rest fall through to 1.
    if priors_count == 0:
        return 0
    if priors_count > 3:
        return 2
    return 1

def standardize_length_of_stay(length_of_stay, mean, std):
    """Return the z-score of ``length_of_stay``: its offset from ``mean`` in units of ``std``."""
    deviation = length_of_stay - mean
    return deviation / std

# Encode features
# Guard against re-running this cell on an already-encoded frame: the encoders
# compare against the raw string labels, so numeric codes would all fall into
# their "else" branch and silently corrupt the data.
if not cleaned_df['race'].isin(['African-American', 'Caucasian']).all():
    raise ValueError(
        "cleaned_df does not hold raw race labels (already encoded?); "
        "re-run the data-cleaning cell before this one."
    )

# Standardize length_of_stay
# Statistics are taken from the cleaned rows before any column is replaced.
length_of_stay_mean = cleaned_df['length_of_stay'].mean()
length_of_stay_std = cleaned_df['length_of_stay'].std()

# Build the encoded frame in one expression instead of assigning columns in
# place: this avoids chained-assignment warnings on the derived frame and
# never leaves it half-encoded if one step fails.
cleaned_df = cleaned_df.assign(
    race=cleaned_df['race'].apply(encode_race),
    sex=cleaned_df['sex'].apply(encode_sex),
    age=cleaned_df['age'].apply(encode_age),
    c_charge_degree=cleaned_df['c_charge_degree'].apply(encode_c_charge_degree),
    priors_count=cleaned_df['priors_count'].apply(encode_priors_count),
    # The standardizer is plain arithmetic, so it works on the whole column.
    length_of_stay=standardize_length_of_stay(
        cleaned_df['length_of_stay'], length_of_stay_mean, length_of_stay_std
    ),
).reset_index(drop=True)  # renumber rows 0..n-1 after the earlier filtering

# Display encoded data
cleaned_df.head()


Unnamed: 0,two_year_recid,race,sex,age,c_charge_degree,priors_count,length_of_stay
0,1,0,1,1,1,0,-0.187151
1,1,0,1,0,1,2,-0.356541
2,1,1,1,1,1,2,-0.258059
3,0,1,0,1,0,0,-0.321875
4,1,1,1,0,1,1,-0.358905


In [20]:
# Persist the encoded frame for downstream use.
# NOTE(review): the file name starts with "ccompas" — possibly a typo for
# "compas"; kept unchanged because other code may read this exact path.
# NOTE(review): with pandas' default `index=True` the row index is written as
# an unnamed first column — confirm that consumers expect it.
encoded_csv_path = "../data/ccompas_scores_two_years_encoded.csv"
cleaned_df.to_csv(encoded_csv_path)