In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

# PS 08
David Gao
## 1 Is COMPAS fair?
#### 1.1 

In [2]:
compas = pd.read_csv(r"compas-score-data.csv", sep="\t")
compas.head(5)

Unnamed: 0,age,c_charge_degree,race,age_cat,sex,priors_count,decile_score,two_year_recid
0,69,F,Other,Greater than 45,Male,0,1,0
1,34,F,African-American,25 - 45,Male,0,3,1
2,24,F,African-American,Less than 25,Male,4,4,1
3,44,M,Other,25 - 45,Male,0,1,0
4,41,F,Caucasian,25 - 45,Male,14,6,1


In [3]:
compas.isna().sum()

age                0
c_charge_degree    0
race               0
age_cat            0
sex                0
priors_count       0
decile_score       0
two_year_recid     0
dtype: int64

In [4]:
compas.describe()

Unnamed: 0,age,priors_count,decile_score,two_year_recid
count,6172.0,6172.0,6172.0,6172.0
mean,34.534511,3.246436,4.418503,0.45512
std,11.730938,4.74377,2.839463,0.498022
min,18.0,0.0,1.0,0.0
25%,25.0,0.0,2.0,0.0
50%,31.0,1.0,4.0,0.0
75%,42.0,4.0,7.0,1.0
max,96.0,38.0,10.0,1.0


#### 1.2

In [5]:
temp = compas[(compas.race == "Caucasian") | (compas.race == "African-American")].copy()
temp.sample(5)

Unnamed: 0,age,c_charge_degree,race,age_cat,sex,priors_count,decile_score,two_year_recid
3073,33,F,African-American,25 - 45,Male,3,5,1
3084,40,M,African-American,25 - 45,Male,0,1,0
3574,29,F,Caucasian,25 - 45,Female,6,7,1
5225,61,M,African-American,Greater than 45,Male,2,8,0
1946,53,M,African-American,Greater than 45,Male,1,2,0


#### 1.3

In [6]:
# high score variable: "0" for low score 1 to 4, and "1" for high score 5 to 10
temp["high_score"] = pd.cut(temp.decile_score,
                                      bins = [ 1, 5, 11], 
                                      labels = [0, 1],
                                      right = False)
temp.sample(5)

Unnamed: 0,age,c_charge_degree,race,age_cat,sex,priors_count,decile_score,two_year_recid,high_score
5388,18,F,African-American,Less than 25,Male,4,5,1,1
5961,33,F,Caucasian,25 - 45,Male,2,4,0,0
2565,52,M,African-American,Greater than 45,Male,0,1,1,0
4765,25,F,African-American,25 - 45,Male,2,4,0,0
1273,40,F,Caucasian,25 - 45,Male,0,2,0,0


#### 1.4
#### (a)

In [7]:
low_risk = temp[temp.high_score == 0].two_year_recid.mean()
low_risk

0.3200145296040683

In [8]:
high_risk = temp[temp.high_score == 1].two_year_recid.mean()
high_risk

0.6344554455445545

The recidivism rate for low risk individuals is approximately 32%. And the recidivism rate for high risk individuals is approximately 63%.
#### (b)

In [9]:
caucasians = temp[temp.race == "Caucasian"].two_year_recid.mean()
caucasians

0.3908701854493581

In [10]:
african_americans = temp[temp.race == "African-American"].two_year_recid.mean()
african_americans

0.5231496062992126

The recidivism rate for Caucasian individuals is approximately 39%. And the recidivism rate for African American individuals is approximately 52%.
#### 1.5

In [26]:
cm = pd.crosstab(temp.two_year_recid+0, temp.high_score)
cm

high_score,0,1
two_year_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1872,923
1,881,1602


In [12]:
temp.shape

(5278, 9)

In [13]:
(1872 + 1602)/5278

0.6582038651004168

In [14]:
1 - (1872 + 1602)/5278

0.34179613489958316

In [15]:
881/5278

0.16691928760894278

In [16]:
923/5278

0.1748768472906404

In [135]:
# FPR
923/(1872 + 923)

0.3302325581395349

In [136]:
# FNR
881/(1602 + 881)

0.35481272654047524

COMPAS predicts 65% of individuals accurately, and predicts 34% of individials incorrectly. And 16% of individuals are falsely classified as low risk, where as 17% of individuals are falsely classified as high risk. 
#### 1.6
The accuracy is 0.6582, and the FPR is 0.33 and FNR is 0.35. I feel uncomfortable about this result as COMPAS only predict about half individuals correctly. For me, error less than 20% would be mroe acceptable. I think judges might do better based on their experience, but will not have very large difference as COMPAS.
#### 1.7

In [27]:
Caucasians = temp[temp.race == "Caucasian"]
Caucasians.head(3)

Unnamed: 0,age,c_charge_degree,race,age_cat,sex,priors_count,decile_score,two_year_recid,high_score
4,41,F,Caucasian,25 - 45,Male,14,6,1,1
6,39,M,Caucasian,25 - 45,Female,0,1,0,0
7,27,F,Caucasian,25 - 45,Male,0,4,0,0


In [28]:
African_A = temp[temp.race == "African-American"]
African_A.head(3)

Unnamed: 0,age,c_charge_degree,race,age_cat,sex,priors_count,decile_score,two_year_recid,high_score
1,34,F,African-American,25 - 45,Male,0,3,1,0
2,24,F,African-American,Less than 25,Male,4,4,1,0
8,23,M,African-American,Less than 25,Male,3,6,1,1


In [29]:
cm_C = pd.crosstab(Caucasians.two_year_recid+0, Caucasians.high_score)
cm_C

high_score,0,1
two_year_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,999,282
1,408,414


In [30]:
cm_A = pd.crosstab(African_A.two_year_recid+0, African_A.high_score)
cm_A

high_score,0,1
two_year_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,873,641
1,473,1188


In [31]:
from sklearn.metrics import accuracy_score
accuracy_C = accuracy_score(Caucasians.two_year_recid+0, Caucasians.high_score)
accuracy_C

0.6718972895863052

In [32]:
from sklearn.metrics import accuracy_score
accuracy_A = accuracy_score(African_A.two_year_recid+0, African_A.high_score)
accuracy_A

0.6491338582677165

#### (a)
The accuracy for COMPAS classification on African-American individuals is 0.671897. And the accuracy on Caucasians is 0.649133.
#### (b)


In [33]:
282/(999 + 282) # FPR for Caucasians

0.22014051522248243

In [34]:
641/(873 + 641) # FPR for African American

0.4233817701453104

Caucasians have lower False Positive rate.
#### (c)

In [35]:
408/(414 + 408) # FNR for Caucasians

0.49635036496350365

In [37]:
473/(471 + 1188) # FNR for African American

0.28511151295961423

African Americans have lower False Negarive rate.
#### 1.8
The COMPAS algorithm is unfair. Though the accuracy is fairly similar for both Caucasians and African Americans, The False positive rate and False negative rate are different. Caucasians have lower False Positive rate, which means relatively more African Americans with low risk were incorrectly identified as high risk. And African Americans have lower False Negarive rate, which means relatively more high risk Caucasians are falsly identified as low risk. As a result, African American is more likely to be identified as high risk compared with Caucasians. Thus, COMAPS is unfair.
## 2 Can you beat COMPAS
#### 2.1
I think appropriate model performance measures are Accuracy and Recall. A fair algorithm should identify true low risk individuals as low risk and true high risk individuals as high risk correctly, thus accuracy should be a good measurement of this. And in particular, we are more likely want true high risk individuals be identified correctly, thus Recall shows how many identified high risk is correct.
#### 2.2

In [42]:
temp_reg = temp[["age", "c_charge_degree", "age_cat", "priors_count", "two_year_recid"]]
temp_reg.head(3)

Unnamed: 0,age,c_charge_degree,age_cat,priors_count,two_year_recid
1,34,F,25 - 45,0,1
2,24,F,Less than 25,4,1
4,41,F,25 - 45,14,1


In [45]:
# create dummies for categorical variables
X = temp_reg[["age", "c_charge_degree", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
X.head(3)

Unnamed: 0,age,priors_count,c_charge_degree_M,age_cat_Greater than 45,age_cat_Less than 25
1,34,0,0,0,0
2,24,4,0,0,1
4,41,14,0,0,0


In [48]:
y = temp_reg.two_year_recid
y.head(3)

1    1
2    1
4    1
Name: two_year_recid, dtype: int64

In [49]:
from sklearn.linear_model import LogisticRegression
m = LogisticRegression()
_ = m.fit(X, y)

In [52]:
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.6703291213846242

In [53]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.6161630392537893

Accuracy for logistic regression model is 0.6703, and Recall is 0.6162.
#### 2.3

In [54]:
# include only age categorical variable
X = temp_reg[["c_charge_degree", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
m = LogisticRegression()
_ = m.fit(X, y)

In [55]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.6614218561324823

In [56]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.5980421686746988

In [58]:
# include only age numerical variable
X = temp_reg[["age", "c_charge_degree", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
m = LogisticRegression()
_ = m.fit(X, y)

In [59]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.674883201081019

In [60]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.5851745692447208

In [61]:
# regression without considering any age variables
X = temp_reg[["c_charge_degree", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
m = LogisticRegression()
_ = m.fit(X, y)

In [62]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.6354666206658617

In [67]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.5996761238502396

In [92]:
# change categories of age
X = temp_reg.copy()
X["age_cat"] = pd.cut(X.age, 
                  bins = [ 18, 25, 35, 45, 55, np.inf], 
                  labels = ["under 25", "25 - 35", "35 - 45", "45 - 55", "more than 45"],
                  right = False)


In [93]:
# includes new age categories and exclued c_charge_degree
X = X[["age", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
m = LogisticRegression()
_ = m.fit(X, y)

In [94]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.67392724685182

In [95]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.6246178261432829

I tried 4 differnet models, which includes only age categorical variable, includes only age numerical variable, exculdes all age variables, and includes new age categories and exclued c_charge_degree. The best model is the last one that includes new age categories and exclued c_charge_degree, and this model is slightly better on Recall and almost the same on Accuracy.
#### 2.4

In [97]:
X = temp[["age", "sex", "c_charge_degree", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
X.head(3)

Unnamed: 0,age,priors_count,sex_Male,c_charge_degree_M,age_cat_Greater than 45,age_cat_Less than 25
1,34,0,1,0,0,0
2,24,4,1,0,0,1
4,41,14,1,0,0,0


In [98]:
m = LogisticRegression()
_ = m.fit(X, y)

In [99]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.6731682335690876

In [100]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.6137453037958285

The new model including sex does not have much improvement.
#### 2.5

In [101]:
X = temp[["age", "race", "sex", "c_charge_degree", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
X.head(3)

Unnamed: 0,age,priors_count,race_Caucasian,sex_Male,c_charge_degree_M,age_cat_Greater than 45,age_cat_Less than 25
1,34,0,0,1,0,0,0
2,24,4,0,1,0,0,1
4,41,14,1,1,0,0,0


In [102]:
m = LogisticRegression()
_ = m.fit(X, y)

In [103]:
accuracy = cross_val_score(m, X, y, scoring="accuracy", cv=10)
np.mean(accuracy)

0.6727894456902995

In [104]:
recall = cross_val_score(m, X, y, scoring="recall", cv=10)
np.mean(recall)

0.6141501489830288

This model including sex and race does not have much improvement.
#### 2.6
My besd model, the one includes new age categories and exclued c_charge_degree, and models from 2.4 and 2.5 managed to be equally good as COMPAS on Accuracy and Recall, but none of these new model really beats COMPAS model. Thus, sex and gender does not help on improving predictions. As a result, judges can use these new medels from 2.3, 2.4, and 2.5, and judges can use them exactly the same as COMPAS.
## 3 Is your model more fair
#### 3.1

In [110]:
# recover the best model
# change categories of age
X_temp = temp.copy()
X_temp["age_cat"] = pd.cut(X.age, 
                  bins = [ 18, 25, 35, 45, 55, np.inf], 
                  labels = ["under 25", "25 - 35", "35 - 45", "45 - 55", "more than 45"],
                  right = False)
# includes new age categories and exclued c_charge_degree
X = X_temp[["age", "age_cat", "priors_count"]]
X = pd.get_dummies(X, drop_first=True)
m = LogisticRegression()
_ = m.fit(X, y)

In [122]:
# separate data by race
# data for Caucasians
X_C = X_temp[X_temp.race == "Caucasian"][["age", "age_cat", "priors_count"]]
X_C = pd.get_dummies(X_C, drop_first=True)

In [123]:
# data for African Americans
X_A = X_temp[X_temp.race == "African-American"][["age", "age_cat", "priors_count"]]
X_A = pd.get_dummies(X_A, drop_first=True)

In [126]:
# predict
predict_C = m.predict(X_C)
predict_A = m.predict(X_A)

In [127]:
cm_C = pd.crosstab(Caucasians.two_year_recid+0, predict_C)
cm_C

col_0,0,1
two_year_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1034,247
1,456,366


In [128]:
cm_A = pd.crosstab(African_A.two_year_recid+0, predict_A)
cm_A

col_0,0,1
two_year_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,952,562
1,460,1201


In [131]:
# FPR for Caucasians
247/(1034 + 247)

0.19281811085089773

In [130]:
# FPR for African Americans
562/(952 + 562) 

0.3712021136063408

In [132]:
# FNR for Caucasians
456/(456 + 366)

0.5547445255474452

In [133]:
# FNR for African Americans
460/(460 + 1201)

0.2769416014449127

#### 3.2
The False positive rate and False negative rate are still different in my model. False Positive rate for both Caucasians and African Americans decrease, and new False Negative rates are similar as those for COMPAS. Caucasians have lower False Positive rate, which means relatively more African Americans with low risk were incorrectly identified as high risk. And African Americans have lower False Negarive rate, which means relatively more high risk Caucasians are falsly identified as low risk. As a result, African American is more likely to be identified as high risk compared with Caucasians. 

I spent 6 hours on this PS.