In [1]:
import pandas as pd

# Remote CSV to load; the URL points at the `master` branch of the
# propublica/compas-analysis GitHub repository, so the contents are not pinned
# to a specific commit.
url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"

# Download and parse the CSV into a DataFrame (requires network access).
df = pd.read_csv(url)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [3]:
# Keep only rows that satisfy every data-quality condition below. `.copy()`
# detaches the result from `df` so later edits cannot touch the raw frame.
filtered = df[
    # Screening within 30 days either side of the arrest; rows where this
    # value is missing compare as False and are dropped (same as the original
    # pair of <= / >= comparisons).
    df["days_b_screening_arrest"].between(-30, 30)
    # Drop rows flagged with is_recid == -1.
    & (df["is_recid"] != -1)
    # Drop rows whose charge degree is "O".
    & (df["c_charge_degree"] != "O")
    # pd.read_csv treats the literal "N/A" as a missing value by default, so
    # such scores arrive as NaN and `NaN != "N/A"` is True: the string
    # comparison alone would never remove them. Test for missing explicitly.
    & df["score_text"].notna()
    # Still exclude a literal "N/A" in case the frame was loaded with the
    # default NA parsing disabled.
    & (df["score_text"] != "N/A")
].copy()

print("Original shape:", df.shape)
print("Filtered shape:", filtered.shape)

# Per-race row counts after filtering (displayed as the cell output).
filtered["race"].value_counts()


Original shape: (7214, 53)
Filtered shape: (6172, 53)


Unnamed: 0_level_0,count
race,Unnamed: 1_level_1
African-American,3175
Caucasian,2103
Hispanic,509
Other,343
Asian,31
Native American,11


In [4]:
# Column used as the binary label.
target = "two_year_recid"

# Columns used as model inputs.
features = [
    "sex",
    "age",
    "age_cat",
    "race",
    "priors_count",
    "c_charge_degree",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
]

# Design matrix and integer-typed label vector, both taken from the filtered frame.
X = filtered.loc[:, features]
y = filtered.loc[:, target].astype(int)

# The mean of the label column is the overall fraction of positive labels.
print("Base rate (overall):", y.mean())


Base rate (overall): 0.4551198963058976


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns that get one-hot encoded; every other feature is passed through as-is.
cat_cols = ["sex", "age_cat", "race", "c_charge_degree"]
num_cols = [name for name in features if name not in cat_cols]

# Column-wise preprocessing: one-hot encode the categorical columns (categories
# unseen during fit are ignored at transform time) and forward the rest unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# 70/30 split with a fixed seed, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

print(X_train.shape, X_test.shape)


(4320, 9) (1852, 9)


In [6]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Preprocessing + logistic regression in a single estimator.
# Pipeline does not copy its steps, so passing `preprocess` directly would make
# this pipeline share one transformer instance with any other pipeline built
# from it; fitting either one would then refit the other's preprocessing in
# place. `clone` gives this pipeline its own unfitted copy with the same
# parameters.
log_model = Pipeline([
    ("pre", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=2000))
])

log_model.fit(X_train, y_train)
print("Logistic Regression trained.")


Logistic Regression trained.


In [7]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Preprocessing + random forest (300 trees, fixed seed) in a single estimator.
# Pipeline does not copy its steps, so passing `preprocess` directly would make
# this pipeline share one transformer instance with any other pipeline built
# from it; fitting this one would then refit the other's preprocessing in
# place. `clone` gives this pipeline its own unfitted copy with the same
# parameters.
rf_model = Pipeline([
    ("pre", clone(preprocess)),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42))
])

rf_model.fit(X_train, y_train)
print("Random Forest trained.")


Random Forest trained.


In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def eval_model(model, X_test, y_test, threshold=0.5):
    """Score a fitted probabilistic classifier on a held-out set.

    Parameters
    ----------
    model : estimator exposing ``predict_proba``
        Column 1 of its output is used as the positive-class score
        (assumes a binary classifier whose second class is the positive one).
    X_test : features to score.
    y_test : true labels aligned with ``X_test``.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as 1.

    Returns
    -------
    dict
        ``"acc"``  - accuracy of the thresholded predictions,
        ``"auc"``  - ROC AUC computed from the raw scores (threshold-independent),
        ``"cm"``   - confusion matrix of the thresholded predictions,
        ``"pred"`` - the thresholded 0/1 predictions as an integer array.
    """
    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= threshold).astype(int)
    return {
        "acc": accuracy_score(y_test, pred),
        "auc": roc_auc_score(y_test, proba),
        "cm": confusion_matrix(y_test, pred),
        "pred": pred
    }

# Score both fitted pipelines on the same held-out split so the numbers are
# directly comparable.
log_res = eval_model(log_model, X_test, y_test)
rf_res  = eval_model(rf_model,  X_test, y_test)

# Logistic regression: accuracy, ROC AUC, then the confusion matrix.
print("LogReg Acc:", log_res["acc"], "AUC:", log_res["auc"])
print("LogReg Confusion Matrix:\n", log_res["cm"])

# Random forest: same three metrics.
print("\nRF Acc:", rf_res["acc"], "AUC:", rf_res["auc"])
print("RF Confusion Matrix:\n", rf_res["cm"])


LogReg Acc: 0.6862850971922246 AUC: 0.7329914517856493
LogReg Confusion Matrix:
 [[780 229]
 [352 491]]

RF Acc: 0.6252699784017278 AUC: 0.6763135340652984
RF Confusion Matrix:
 [[712 297]
 [397 446]]


In [9]:
import numpy as np

def fairness_by_race(y_true, y_pred, race_series,
                     groups=("African-American", "Caucasian")):
    """Print per-group prediction-rate and error-rate metrics.

    For each requested group the rows whose race equals that group are
    selected positionally, and the following are printed: number of rows,
    share of rows predicted positive, false positive rate and true positive
    rate. Nothing is returned.

    Parameters
    ----------
    y_true : array-like of 0/1 true labels.
    y_pred : array-like of 0/1 predicted labels.
    race_series : array-like of group labels.
        All three inputs must have the same length and row order; the
        selection is positional, index labels are not used for alignment.
    groups : iterable of str, default ("African-American", "Caucasian")
        Group labels to report, in this order.
    """
    # Convert once, outside the loop; also lets plain lists/arrays be passed.
    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)
    race_arr = np.asarray(race_series)

    for group in groups:
        idx = race_arr == group
        yt = y_true_arr[idx]
        yp = y_pred_arr[idx]

        # labels=[0, 1] forces a 2x2 matrix even if a class is absent in the group.
        tn, fp, fn, tp = confusion_matrix(yt, yp, labels=[0,1]).ravel()

        # Guard every ratio against an empty denominator (e.g. a group with no
        # rows in this split) and report NaN instead of warning or dividing by zero.
        pos_rate = yp.mean() if len(yp) else np.nan
        fpr = fp / (fp + tn) if (fp + tn) else np.nan
        tpr = tp / (tp + fn) if (tp + fn) else np.nan

        print("\nGroup:", group)
        print("Count:", len(yt))
        print("Positive prediction rate:", round(pos_rate, 4))
        print("FPR:", round(fpr, 4))
        print("TPR:", round(tpr, 4))

# Per-group metrics for the logistic regression predictions; group membership
# is read from the race column of the held-out features.
print("=== Logistic Regression Fairness ===")
fairness_by_race(y_test, log_res["pred"], X_test["race"])

# Same report for the random forest predictions.
print("\n=== Random Forest Fairness ===")
fairness_by_race(y_test, rf_res["pred"], X_test["race"])


=== Logistic Regression Fairness ===

Group: African-American
Count: 956
Positive prediction rate: 0.523
FPR: 0.3146
TPR: 0.7045

Group: Caucasian
Count: 645
Positive prediction rate: 0.2682
FPR: 0.1827
TPR: 0.4125

=== Random Forest Fairness ===

Group: African-American
Count: 956
Positive prediction rate: 0.4927
FPR: 0.3528
TPR: 0.6145

Group: Caucasian
Count: 645
Positive prediction rate: 0.2837
FPR: 0.2395
TPR: 0.3583
