In [32]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import math
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [33]:
# Load the raw scores table from a CSV given by a relative path (resolved
# against the kernel's working directory).
compas_score = pd.read_csv("compas-scores-two-years.csv")

In [34]:
# (rows, columns) of the raw table.
compas_score.shape


(7214, 53)

In [35]:
# Summary statistics for the numeric columns of the raw table.
compas_score.describe()

Unnamed: 0,id,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,is_recid,r_days_from_arrest,violent_recid,is_violent_recid,decile_score.1,v_decile_score,priors_count.1,start,end,event,two_year_recid
count,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,6907.0,7192.0,7214.0,2316.0,0.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0,7214.0
mean,5501.255753,34.817993,0.06723,4.509565,0.090934,0.109371,3.472415,3.304763,57.731368,0.481148,20.26943,,0.113529,4.509565,3.691849,3.472415,11.465068,553.436651,0.382867,0.450652
std,3175.70687,11.888922,0.473972,2.856396,0.485239,0.501586,4.882538,75.809505,329.740215,0.499679,74.871668,,0.317261,2.856396,2.510148,4.882538,46.954563,399.020583,0.48612,0.497593
min,1.0,18.0,0.0,1.0,0.0,0.0,0.0,-414.0,0.0,0.0,-1.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2735.25,25.0,0.0,2.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,,0.0,2.0,1.0,0.0,0.0,148.25,0.0,0.0
50%,5509.5,31.0,0.0,4.0,0.0,0.0,2.0,-1.0,1.0,0.0,0.0,,0.0,4.0,3.0,2.0,0.0,530.5,0.0,0.0
75%,8246.5,42.0,0.0,7.0,0.0,0.0,5.0,0.0,2.0,1.0,1.0,,0.0,7.0,5.0,5.0,1.0,914.0,1.0,1.0
max,11001.0,96.0,20.0,10.0,13.0,17.0,38.0,1057.0,9485.0,1.0,993.0,,1.0,10.0,10.0,38.0,937.0,1186.0,1.0,1.0


In [36]:
# Columns pulled from the raw table for the rest of the notebook.
cols_to_keep = [
    'sex', 'age', 'race',
    'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count',
    'priors_count', 'c_charge_degree',
    'is_recid', 'is_violent_recid', 'two_year_recid',
    'c_jail_in', 'c_jail_out',
    'score_text', 'days_b_screening_arrest',
]

# Work on an independent copy of the selection; 'is_recid' is selected above
# but discarded straight away, leaving 15 columns.
df1 = compas_score[cols_to_keep].copy().drop(columns=['is_recid'])

print("Shape: ", df1.shape)
# Missing-value count per retained column.
df1.isna().sum()



Shape:  (7214, 15)


sex                          0
age                          0
race                         0
juv_fel_count                0
decile_score                 0
juv_misd_count               0
juv_other_count              0
priors_count                 0
c_charge_degree              0
is_violent_recid             0
two_year_recid               0
c_jail_in                  307
c_jail_out                 307
score_text                   0
days_b_screening_arrest    307
dtype: int64

In [37]:
# List the retained column names, then preview the first rows.
print(df1.columns)
df1.head()


Index(['sex', 'age', 'race', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'c_charge_degree',
       'is_violent_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out',
       'score_text', 'days_b_screening_arrest'],
      dtype='object')


Unnamed: 0,sex,age,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_charge_degree,is_violent_recid,two_year_recid,c_jail_in,c_jail_out,score_text,days_b_screening_arrest
0,Male,69,Other,0,1,0,0,0,F,0,0,2013-08-13 06:03:42,2013-08-14 05:41:20,Low,-1.0
1,Male,34,African-American,0,3,0,0,0,F,1,1,2013-01-26 03:45:27,2013-02-05 05:36:53,Low,-1.0
2,Male,24,African-American,0,4,0,1,4,F,0,1,2013-04-13 04:58:34,2013-04-14 07:02:04,Low,-1.0
3,Male,23,African-American,0,8,1,0,1,F,0,0,,,High,
4,Male,43,Other,0,1,0,0,2,F,0,0,,,Low,


In [38]:
# Condition 1 - remove petty traffic offenses that do not cause jail time
cond_1 = (df1.c_charge_degree != 'O')

# Condition 2 - remove rows which do not have a compas score.
# pandas.read_csv turns the literal text "N/A" into NaN by default, and
# NaN != 'N/A' evaluates to True, so a missing label has to be rejected
# explicitly; the string comparison alone would let it through.
cond_2 = df1.score_text.notna() & (df1.score_text != 'N/A')

# Condition 3 - keep only rows whose days_b_screening_arrest lies within
# [-30, 30]. Rows where the value is missing fail both comparisons and are
# therefore dropped as well.
cond_3 = (df1.days_b_screening_arrest <= 30) & (df1.days_b_screening_arrest >= -30)

# Independent copy so later column assignments do not touch df1.
df2 = df1[(cond_1 & cond_2 & cond_3)].copy()

print("Shape: ", df2.shape)
# Missing-value count per column after filtering.
df2.isna().sum()

Shape:  (6172, 15)


sex                        0
age                        0
race                       0
juv_fel_count              0
decile_score               0
juv_misd_count             0
juv_other_count            0
priors_count               0
c_charge_degree            0
is_violent_recid           0
two_year_recid             0
c_jail_in                  0
c_jail_out                 0
score_text                 0
days_b_screening_arrest    0
dtype: int64

In [39]:
# One-hot encode the categorical columns with pandas.get_dummies.
# The indicator columns are named after the raw category values (no prefix)
# and are appended on the right in this order: charge degree, race, sex,
# score_text. The four source columns are removed.
categorical_cols = ['c_charge_degree', 'race', 'sex', 'score_text']

# Change variable types
df2 = df2.astype({col: 'category' for col in categorical_cols})

# One indicator frame per categorical column.
dummies_c_charge_degree = pd.get_dummies(df2['c_charge_degree'])
dummies_race = pd.get_dummies(df2['race'])
dummies_sex = pd.get_dummies(df2['sex'])
# score_text keeps all of its levels: one indicator column per distinct value
# (it is NOT collapsed into a binary low/high flag).
dummies_score = pd.get_dummies(df2['score_text'])

# Swap the raw categorical columns for their indicator columns in one step.
df2 = pd.concat(
    [
        df2.drop(columns=categorical_cols),
        dummies_c_charge_degree,
        dummies_race,
        dummies_sex,
        dummies_score,
    ],
    axis=1,
)

In [40]:
# Dimensions after one-hot encoding, followed by the missing-value count per column.
print(df2.shape)
df2.isna().sum()

(6172, 24)


age                        0
juv_fel_count              0
decile_score               0
juv_misd_count             0
juv_other_count            0
priors_count               0
is_violent_recid           0
two_year_recid             0
c_jail_in                  0
c_jail_out                 0
days_b_screening_arrest    0
F                          0
M                          0
African-American           0
Asian                      0
Caucasian                  0
Hispanic                   0
Native American            0
Other                      0
Female                     0
Male                       0
High                       0
Low                        0
Medium                     0
dtype: int64

In [41]:
# Independent copy of the encoded frame; the jail-duration feature is added to this copy.
extra_df = df2.copy()
def jail_time(row):
    """Return the day count between a row's jail entry and jail release.

    ``row`` must support key lookup for 'c_jail_in' and 'c_jail_out'. Both
    values are parsed with ``pd.to_datetime`` and the ``.days`` attribute of
    (release - entry) is returned.
    """
    booked_at, released_at = (
        pd.to_datetime(row[key]) for key in ('c_jail_in', 'c_jail_out')
    )
    stay = released_at - booked_at
    return stay.days
# A jail duration needs BOTH timestamps, so keep only rows where the entry
# and the release time are present.
has_both_timestamps = extra_df['c_jail_in'].notna() & extra_df['c_jail_out'].notna()
extra_df = extra_df[has_both_timestamps].copy()

# Row-wise day count between jail entry and release.
extra_df['days_in_jail'] = extra_df.apply(jail_time, axis=1)
extra_df.shape

(6172, 25)

In [42]:
# The raw timestamps are no longer needed once days_in_jail exists.
extra_df = extra_df.drop(columns=['c_jail_in', 'c_jail_out'])

print(extra_df.columns)
extra_df

Index(['age', 'juv_fel_count', 'decile_score', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'is_violent_recid', 'two_year_recid',
       'days_b_screening_arrest', 'F', 'M', 'African-American', 'Asian',
       'Caucasian', 'Hispanic', 'Native American', 'Other', 'Female', 'Male',
       'High', 'Low', 'Medium', 'days_in_jail'],
      dtype='object')


Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,is_violent_recid,two_year_recid,days_b_screening_arrest,F,...,Caucasian,Hispanic,Native American,Other,Female,Male,High,Low,Medium,days_in_jail
0,69,0,1,0,0,0,0,0,-1.0,1,...,0,0,0,1,0,1,0,1,0,0
1,34,0,3,0,0,0,1,1,-1.0,1,...,0,0,0,0,0,1,0,1,0,10
2,24,0,4,0,1,4,0,1,-1.0,1,...,0,0,0,0,0,1,0,1,0,1
5,44,0,1,0,0,0,0,0,0.0,0,...,0,0,0,1,0,1,0,1,0,1
6,41,0,6,0,0,14,0,1,-1.0,1,...,1,0,0,0,0,1,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,23,0,7,0,0,0,0,0,-1.0,1,...,0,0,0,0,0,1,0,0,1,1
7210,23,0,3,0,0,0,0,0,-1.0,1,...,0,0,0,0,0,1,0,1,0,1
7211,57,0,1,0,0,0,0,0,-1.0,1,...,0,0,0,1,0,1,0,1,0,1
7212,33,0,2,0,0,3,0,0,-1.0,0,...,0,0,0,0,1,0,0,1,0,1


In [43]:
# Columns standardised by the scaler; decile_score and the 0/1 columns are
# left untouched.
numerical_cols = [
    "age",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "days_b_screening_arrest",
    "days_in_jail",
]

# NOTE(review): the scaler is fitted on every row here, before the train/test
# split that happens later in the notebook, so held-out rows contribute to
# the scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(extra_df[numerical_cols])
extra_df[numerical_cols] = scaled_values

In [44]:
# Display the frame after scaling.
extra_df

Unnamed: 0,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,is_violent_recid,two_year_recid,days_b_screening_arrest,F,...,Caucasian,Hispanic,Native American,Other,Female,Male,High,Low,Medium,days_in_jail
0,2.938237,-0.127923,1,-0.183232,-0.235102,-0.684413,0,0,0.145601,1,...,0,0,0,1,0,1,0,1,0,-0.313191
1,-0.045568,-0.127923,3,-0.183232,-0.235102,-0.684413,1,1,0.145601,1,...,0,0,0,0,0,1,0,1,0,-0.099011
2,-0.898084,-0.127923,4,-0.183232,1.889425,0.158866,0,1,0.145601,1,...,0,0,0,0,0,1,0,1,0,-0.291773
5,0.806948,-0.127923,1,-0.183232,-0.235102,-0.684413,0,0,0.342285,0,...,0,0,0,1,0,1,0,1,0,-0.291773
6,0.551193,-0.127923,6,-0.183232,-0.235102,2.267065,0,1,0.145601,1,...,1,0,0,0,0,1,0,0,1,-0.184683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,-0.983335,-0.127923,7,-0.183232,-0.235102,-0.684413,0,0,0.145601,1,...,0,0,0,0,0,1,0,0,1,-0.291773
7210,-0.983335,-0.127923,3,-0.183232,-0.235102,-0.684413,0,0,0.145601,1,...,0,0,0,0,0,1,0,1,0,-0.291773
7211,1.915218,-0.127923,1,-0.183232,-0.235102,-0.684413,0,0,0.145601,1,...,0,0,0,1,0,1,0,1,0,-0.291773
7212,-0.130819,-0.127923,2,-0.183232,-0.235102,-0.051954,0,0,0.145601,0,...,0,0,0,0,1,0,0,1,0,-0.291773


In [45]:
#compas_score["jail_duration"] = compas_score["c_jail_out"].combine(compas_score["c_jail_in"], lambda x1, x2: (pd.to_datetime(x1) - pd.to_datetime(x2)).days)
#compas_score.drop(["c_jail_in", "c_jail_out"], axis=1, inplace=True)

In [46]:
# Missing-value count per column of the final feature frame.
extra_df.isna().sum()


age                        0
juv_fel_count              0
decile_score               0
juv_misd_count             0
juv_other_count            0
priors_count               0
is_violent_recid           0
two_year_recid             0
days_b_screening_arrest    0
F                          0
M                          0
African-American           0
Asian                      0
Caucasian                  0
Hispanic                   0
Native American            0
Other                      0
Female                     0
Male                       0
High                       0
Low                        0
Medium                     0
days_in_jail               0
dtype: int64

In [47]:
# Separate the label column from the features, then hold out 33.3% of the
# rows for testing with a fixed seed.
y = extra_df["two_year_recid"]
X = extra_df.drop(columns=["two_year_recid"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.333
)

In [48]:
# Persist each part of the split as a CSV without the index column.
for split_part, csv_path in (
    (X_train, "compas_train.csv"),
    (X_test, "compas_test.csv"),
    (y_train, "compas_train_labels.csv"),
    (y_test, "compas_test_labels.csv"),
):
    split_part.to_csv(csv_path, index=False)

In [49]:
# Reload the persisted split from the CSV files in the working directory.
X_train = pd.read_csv("compas_train.csv")
X_test = pd.read_csv("compas_test.csv")
# Each label file holds a single column, which read_csv returns as a
# one-column DataFrame. Squeeze it to a 1-D Series: fitting on the 2-D frame
# makes scikit-learn emit a DataConversionWarning (column_or_1d) before it
# flattens the labels itself.
y_train = pd.read_csv("compas_train_labels.csv").squeeze("columns")
y_test = pd.read_csv("compas_test_labels.csv").squeeze("columns")

# Features given to the model: every column of the split files except the
# race indicator columns, which stay in X_train / X_test but are not selected.
# NOTE(review): 'is_violent_recid' looks like an outcome rather than a
# pre-screening attribute - confirm it does not leak the label.
selected_columns = ['age', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 
                    'is_violent_recid','days_b_screening_arrest','F','M','Female','Male','High','Low','Medium', 'days_in_jail'
]
X_train_protected = X_train[selected_columns]
X_test_protected = X_test[selected_columns]

# Fit a logistic regression (library defaults) on the selected training features.
model = LogisticRegression()
model.fit(X_train_protected, y_train)

# Predict the label for every held-out row.
y_pred = model.predict(X_test_protected)

# Fraction of held-out rows whose prediction matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7144941634241245


  y = column_or_1d(y, warn=True)


In [50]:
# Print the African-American indicator column of the reloaded test features.
print(X_test['African-American'])

0       1
1       0
2       1
3       0
4       0
       ..
2051    1
2052    0
2053    0
2054    0
2055    1
Name: African-American, Length: 2056, dtype: int64


In [51]:
# Build the evaluation frame on an explicit copy of the test features.
# pd.DataFrame(X_test) does not copy by default, so columns added through it
# could also appear on X_test; .copy() keeps X_test untouched.
X_test_df = X_test.copy()

# Attach the true label and the model's prediction next to the features
# (all 22 feature columns, including the race indicators).
X_test_df['two_year_recid'] = y_test
X_test_df['y_pred'] = y_pred


In [52]:
# (rows, columns) of the evaluation frame, including the two added columns.
X_test_df.shape

(2056, 24)

In [53]:
# Number of test rows flagged African-American, then Caucasian.
for group_col in ('African-American', 'Caucasian'):
    print(int((X_test_df[group_col] == 1).sum()))

1057
708


In [54]:
# African-American rows whose true label is 0 (no two-year recidivism).
aa_true_negative_label = (X_test_df['African-American'] == 1) & (X_test_df['two_year_recid'] == 0)

# Of those rows: predicted 1 -> false positive, predicted 0 -> true negative.
FP_AA = int((aa_true_negative_label & (X_test_df['y_pred'] == 1)).sum())
TN_AA = int((aa_true_negative_label & (X_test_df['y_pred'] == 0)).sum())
print(FP_AA, TN_AA, sep="\n")

131
386


In [55]:
# Caucasian rows whose true label is 0 (no two-year recidivism).
cc_true_negative_label = (X_test_df['Caucasian'] == 1) & (X_test_df['two_year_recid'] == 0)

# Of those rows: predicted 1 -> false positive, predicted 0 -> true negative.
FP_CC = int((cc_true_negative_label & (X_test_df['y_pred'] == 1)).sum())
TN_CC = int((cc_true_negative_label & (X_test_df['y_pred'] == 0)).sum())
print(FP_CC, TN_CC, sep="\n")

44
383


In [56]:
# False-positive rate for African-American rows: FP / (FP + TN).
FP_AA_rate = FP_AA / (TN_AA + FP_AA)
print(FP_AA_rate)

0.25338491295938104


In [57]:
# False-positive rate for Caucasian rows: FP / (FP + TN).
FP_CC_rate = FP_CC / (TN_CC + FP_CC)
print(FP_CC_rate)

0.10304449648711944


In [58]:
# Per group: rows predicted positive (y_pred == 1), and how many of those
# rows have a true label of 1.
aa_predicted_positive = (X_test_df['African-American'] == 1) & (X_test_df['y_pred'] == 1)
cali_AA_t = int(aa_predicted_positive.sum())
cali_AA = int((aa_predicted_positive & (X_test_df['two_year_recid'] == 1)).sum())

cc_predicted_positive = (X_test_df['Caucasian'] == 1) & (X_test_df['y_pred'] == 1)
cali_CC_t = int(cc_predicted_positive.sum())
cali_CC = int((cc_predicted_positive & (X_test_df['two_year_recid'] == 1)).sum())

In [59]:
# African-American: correct positive predictions, then all positive predictions.
print(cali_AA, cali_AA_t, sep="\n")

361
492


In [60]:
# Caucasian: correct positive predictions, then all positive predictions.
print(cali_CC, cali_CC_t, sep="\n")

131
175


In [61]:
# Share of African-American rows predicted positive (y_pred == 1) whose true
# label is 1, i.e. the positive predictive value for this group.
calibration_value_AA = cali_AA/cali_AA_t
print(calibration_value_AA)

0.733739837398374


In [62]:
# Share of Caucasian rows predicted positive (y_pred == 1) whose true label
# is 1, i.e. the positive predictive value for this group.
calibration_value_CC = cali_CC/cali_CC_t
print(calibration_value_CC)

0.7485714285714286
