In [1]:
# import compas.db
import sqlite3
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
os.chdir('..')
os.getcwd()

'/Users/sarahkurihara/Documents/projects/ads-spring-2022-prj4-group-8-1'

In [3]:
#Import data
df = pd.read_csv('data/compas-scores-two-years.csv')
cols = ["age", "c_charge_degree", "race", "age_cat", "score_text", "sex", "priors_count", 
        "days_b_screening_arrest", "decile_score", "is_recid", "two_year_recid", "c_jail_in", 
        "c_jail_out"]
df = df.filter(items = cols)
df = df[(df.race=='African-American') | (df.race=='Caucasian')]
df

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,c_jail_in,c_jail_out
1,34,F,African-American,25 - 45,Low,Male,0,-1.0,3,1,1,2013-01-26 03:45:27,2013-02-05 05:36:53
2,24,F,African-American,Less than 25,Low,Male,4,-1.0,4,1,1,2013-04-13 04:58:34,2013-04-14 07:02:04
3,23,F,African-American,Less than 25,High,Male,1,,8,0,0,,
6,41,F,Caucasian,25 - 45,Medium,Male,14,-1.0,6,1,1,2014-02-18 05:08:24,2014-02-24 12:18:30
8,39,M,Caucasian,25 - 45,Low,Female,0,-1.0,1,0,0,2014-03-15 05:35:34,2014-03-18 04:28:46
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,30,M,African-American,25 - 45,Low,Male,0,-1.0,2,1,1,2014-05-09 10:01:33,2014-05-10 08:28:12
7208,20,F,African-American,Less than 25,High,Male,0,-1.0,9,0,0,2013-10-19 11:17:15,2013-10-20 08:13:06
7209,23,F,African-American,Less than 25,Medium,Male,0,-1.0,7,0,0,2013-11-22 05:18:27,2013-11-24 02:59:20
7210,23,F,African-American,Less than 25,Low,Male,0,-1.0,3,0,0,2014-01-31 07:13:54,2014-02-02 04:03:52


In [5]:
df.dtypes

age                          int64
c_charge_degree             object
race                        object
age_cat                     object
score_text                  object
sex                         object
priors_count                 int64
days_b_screening_arrest    float64
decile_score                 int64
is_recid                     int64
two_year_recid               int64
c_jail_in                   object
c_jail_out                  object
dtype: object

In [6]:
df.nunique()

age                          63
c_charge_degree               2
race                          2
age_cat                       3
score_text                    3
sex                           2
priors_count                 37
days_b_screening_arrest     381
decile_score                 10
is_recid                      2
two_year_recid                2
c_jail_in                  5915
c_jail_out                 5893
dtype: int64

In [7]:
list(df.columns)

['age',
 'c_charge_degree',
 'race',
 'age_cat',
 'score_text',
 'sex',
 'priors_count',
 'days_b_screening_arrest',
 'decile_score',
 'is_recid',
 'two_year_recid',
 'c_jail_in',
 'c_jail_out']

In [8]:
# Change columns to factors
df['sex'] = (df['sex'].values == 'Female').astype(int)
df['race'] = (df['race'].values == 'African-American').astype(int)
df['race'] = df['race'].astype('category')
df['c_charge_degree'] = df['c_charge_degree'].astype('category')
df = pd.concat([df, pd.get_dummies(df.age_cat, drop_first=False)], axis=1)
df = df.rename(columns={"25 - 45": "middle_age", "Greater than 45": "old", "Less than 25": "young"})

# Filter data
df['days_b_screening_arrest'] = df['days_b_screening_arrest'] >= -30
df['days_b_screening_arrest'] = df['days_b_screening_arrest'] <= 30
df['is_recid'] = df['is_recid'] != -1
df['c_charge_degree'] = df['c_charge_degree'] != "O"
df['score_text'] = df['score_text'] != 'N/A'

df

Unnamed: 0,age,c_charge_degree,race,age_cat,score_text,sex,priors_count,days_b_screening_arrest,decile_score,is_recid,two_year_recid,c_jail_in,c_jail_out,middle_age,old,young
1,34,True,1,25 - 45,True,0,0,True,3,True,1,2013-01-26 03:45:27,2013-02-05 05:36:53,1,0,0
2,24,True,1,Less than 25,True,0,4,True,4,True,1,2013-04-13 04:58:34,2013-04-14 07:02:04,0,0,1
3,23,True,1,Less than 25,True,0,1,True,8,True,0,,,0,0,1
6,41,True,0,25 - 45,True,0,14,True,6,True,1,2014-02-18 05:08:24,2014-02-24 12:18:30,1,0,0
8,39,True,0,25 - 45,True,1,0,True,1,True,0,2014-03-15 05:35:34,2014-03-18 04:28:46,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207,30,True,1,25 - 45,True,0,0,True,2,True,1,2014-05-09 10:01:33,2014-05-10 08:28:12,1,0,0
7208,20,True,1,Less than 25,True,0,0,True,9,True,0,2013-10-19 11:17:15,2013-10-20 08:13:06,0,0,1
7209,23,True,1,Less than 25,True,0,0,True,7,True,0,2013-11-22 05:18:27,2013-11-24 02:59:20,0,0,1
7210,23,True,1,Less than 25,True,0,0,True,3,True,0,2014-01-31 07:13:54,2014-02-02 04:03:52,0,0,1


In [11]:
# Build logistic regression model

features = ["sex", "middle_age", "young", "old", "race", "priors_count", "c_charge_degree", "decile_score"]
response = ['two_year_recid']

X = df[features]
y = df[response]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(random_state=0).fit(X_train, y_train.values.ravel())

In [12]:
# Fit logistic regression model
clf.score(X_test, y_test)

0.6788177339901478

In [10]:
# Get predictions
y_pred = clf.predict(X_test)

# Identify incorrect results
results_df = pd.DataFrame(X_test)
results_df["actual"] = y_test
results_df["predicted"] = y_pred
results_df

#incorrect = df[df["actual"] != df["predicted"]]
#incorrect

Unnamed: 0,sex,middle_age,young,old,race,priors_count,c_charge_degree,decile_score,actual,predicted
3299,0,0,1,0,1,1,True,7,1,1
7192,0,1,0,0,0,0,True,2,0,0
6147,0,0,0,1,1,10,True,3,1,0
5591,0,0,1,0,1,1,True,10,1,1
5846,0,1,0,0,1,0,True,4,1,0
...,...,...,...,...,...,...,...,...,...,...
6949,0,0,0,1,0,0,True,2,1,0
2155,0,1,0,0,0,6,True,5,0,1
4770,1,0,1,0,1,5,True,9,1,1
5811,0,0,0,1,0,12,True,2,1,0
