In [1]:
! git clone https://github.com/propublica/compas-analysis.git

Cloning into 'compas-analysis'...
remote: Enumerating objects: 31, done.[K
remote: Total 31 (delta 0), reused 0 (delta 0), pack-reused 31[K
Unpacking objects: 100% (31/31), done.


In [2]:
!ls

compas-analysis  sample_data


In [3]:
! ls compas-analysis/

'Compas Analysis.ipynb'
 compas.db
 compas-scores.csv
 compas-scores-raw.csv
 compas-scores-two-years.csv
 compas-scores-two-years-violent.csv
 cox-parsed.csv
 cox-violent-parsed.csv
'Cox with interaction term and independent variables.ipynb'
 README
 truth_tables.py


In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("compas-analysis/compas-scores-two-years.csv")

In [6]:
# select features
df_selected = df[['two_year_recid', 'race', 'age', 'priors_count', 'decile_score']]
# select only African-Amereican and Caucasian
df_selected = df_selected.loc[df_selected['race'].isin(('African-American', 'Caucasian'))]
# change African-American = 0, Caucasian = 1
df_selected.loc[df_selected["race"] == "African-American", "race"] = 0
df_selected.loc[df_selected["race"] == "Caucasian", "race"] = 1

df_selected.head()

Unnamed: 0,two_year_recid,race,age,priors_count,decile_score
1,1,0,34,0,3
2,1,0,24,4,4
3,0,0,23,1,8
6,1,1,41,14,6
8,0,1,39,0,1


In [25]:
df_selected.shape

(6150, 5)

In [26]:
from sklearn.datasets import make_classification
X, y = make_classification(n_features=4, n_redundant=0, 
                           n_informative=2, random_state=1, 
                           n_clusters_per_class=1)

def sigmoid(z):
    return 1.0/(1 + np.exp(-z))

def loss(y, y_hat):
    loss = -np.mean(y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat))
    return loss

def gradients(X, y, y_hat):
    
    # X --> Input.
    # y --> true/target value.
    # y_hat --> hypothesis/predictions.
    # w --> weights (parameter).
    # b --> bias (parameter).
    
    # m-> number of training examples.
    m = X.shape[0]
    
    # Gradient of loss w.r.t weights.
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    
    # Gradient of loss w.r.t bias.
    db = (1/m)*np.sum((y_hat - y)) 
    
    return dw, db

def normalize(X):
    
    # X --> Input.
    
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Normalizing all the n features of X.
    for i in range(n):
        X = (X - X.mean(axis=0))/X.std(axis=0)
        
    return X

## Train Function

In [27]:
def train(X, y, bs, epochs, lr):
    
    # X --> Input.
    # y --> true/target value.
    # bs --> Batch Size.
    # epochs --> Number of iterations.
    # lr --> Learning rate.
        
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Initializing weights and bias to zeros.
    w = np.zeros((n,1))
    b = 0
    
    # Reshaping y.
    y = y.reshape(m,1)
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Empty list to store losses.
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):
            
            # Defining batches. SGD.
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            
            # Calculating hypothesis/prediction.
            y_hat = sigmoid(np.dot(xb, w) + b)
            
            # Getting the gradients of loss w.r.t parameters.
            dw, db = gradients(xb, yb, y_hat)
            
            # Updating the parameters.
            w -= lr*dw
            b -= lr*db
        
        # Calculating loss and appending it in the list.
        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)
        
    # returning weights, bias and losses(List).
    return w, b, losses

## Predict Function

In [28]:
def predict(X):
    
    # X --> Input.
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Calculating presictions/y_hat.
    preds = sigmoid(np.dot(X, w) + b)
    
    # Empty List to store predictions.
    pred_class = []
    # if y_hat >= 0.5 --> round up to 1
    # if y_hat < 0.5 --> round up to 1
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

## Train and predict

In [29]:
# Training 
X = df_selected[['race', 'age', 'priors_count', 'decile_score']].to_numpy(dtype="float32")
y = df_selected['two_year_recid'].to_numpy(dtype="float32")
w, b, l = train(X, y, bs=1000, epochs=1000, lr=0.01)


In [30]:
from sklearn.metrics import accuracy_score
accuracy_score(y,predict(X))


0.5642276422764227

## Use A5

In [148]:
def lossA5(y, y_hat, X, s, lam, Py_hat, w_s0, w_s1, b0, b1):
    loss = -np.mean(y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat))
    for i in range(s.shape[0]):
      if s[i] == 0:
        loss -= sigmoid(np.dot(X[i], w_s0) + b0)*np.log(R_pr(X, s, w_s0, w_s1, b0, b1, Py_hat)/Py_hat) + lam*np.sum(w_s0)
      else:
        loss -= sigmoid(np.dot(X[i], w_s1) + b1)*np.log(R_pr(X, s, w_s0, w_s1, b0, b1, Py_hat)/Py_hat) + lam*np.sum(w_s1)
    return loss

def Py_given_s_hat(X, s, w_s0, w_s1, b0, b1):
    Xs = df_selected.loc[df_selected['race'] == s]
    return sum(predictA5(X, s, w_s0, w_s1, b0, b1)) / Xs.shape[1]


def R_pr(X, s, w_s0, w_s1, b0, b1, Py_hat):
    return sum(predictA5(X, s, w_s0, w_s1, b0, b1))* np.log(Py_given_s_hat(X, s, w_s0, w_s1, b0, b1)/Py_hat)

def gradientsA5(X, y, y_hat, s, w_s0, w_s1, b0, b1, lam, Py_hat):
    
    # X --> Input.
    # y --> true/target value.
    # y_hat --> hypothesis/predictions.
    # w --> weights (parameter).
    # b --> bias (parameter).
    
    # m-> number of training examples.
    m = X.shape[0]
    
    # Gradient of loss w.r.t weights.
    dw_s0 = np.array([])
    dw_s1 = np.array([])

    s0_indice = []
    s1_indice = []
    for i in range(X.shape[0]):
        if s[i] == 0:
          s0_indice.append(i)
        else:
          s1_indice.append(i)
    
    ys0 = (y_hat[s0_indice].reshape(len(s0_indice), 1) - y[s0_indice])
    ys1 = (y_hat[s1_indice].reshape(len(s1_indice), 1) - y[s1_indice])
    
    X0 = X[s0_indice]
    X1 = X[s1_indice]
    #print(X0.T @ ys0.reshape(ys0.shape[0], 1))
    dw_s0 = (1/m)*np.dot(X0.T, ys0) #+ np.dot(X.T, (y_hat - y))*np.log(R_pr(X, s, w_s0, w_s1, b0, b1, Py_hat)/Py_hat) + lam*np.sum(w_s0, w_s1)
    dw_s1 = (1/m)*np.matmul(X1.T, ys1) #+ np.dot(X.T, (y_hat - y))*np.log(R_pr(X, s, w_s0, w_s1, b0, b1, Py_hat)/Py_hat) + lam*np.sum(w_s0, w_s1)
    
    # Gradient of loss w.r.t bias.
    db_0 = (1/m)*np.sum((y_hat - y)) 
    db_1 = (1/m)*np.sum((y_hat - y)) 
    
    return dw_s0, dw_s1, db_0, db_1


## Train with A5

In [146]:
def trainA5(X, y, s, bs, epochs, lr, lam):
    
    # X --> Input.
    # y --> true/target value.
    # bs --> Batch Size.
    # epochs --> Number of iterations.
    # lr --> Learning rate.
        
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Initializing weights and bias to zeros.
    w_s0 = np.zeros((n,1))
    w_s1 = np.zeros((n,1))
    b0 = 0
    b1 = 0
    
    Py_hat = sum(predictA5(X, s, w_s0, w_s1, b0, b1)) / X.shape[1]
    
    # Reshaping y.
    y = y.reshape(m,1)
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Empty list to store losses.
    losses = []
    
    # Training loop.
    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):
            
            # Defining batches. SGD.
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            sb = s[start_i:end_i]
            
            # Calculating hypothesis/prediction.
            y_hat = predictA5(xb, sb, w_s0, w_s1, b0, b1)
            
            # Getting the gradients of loss w.r.t parameters.
            dw_s0, dw_s1, db_0, db_1 = gradientsA5(xb, yb, y_hat, s, w_s0, w_s1, b0, b1, lam, Py_hat)
            
            # Updating the parameters.
            w_s0 -= lr*dw_s0
            w_s1 -= lr*dw_s1
            b0 -= lr*db_0
            b1 -= lr*db_1
            Py_hat = sum(predictA5(X, s, w_s0, w_s1, b0, b1)) / X.shape[1]
        
        # Calculating loss and appending it in the list.
        y_hat = predictA5(xb, sb, w_s0, w_s1, b0, b1)
        l = lossA5(y, y_hat, X, s, lam, Py_hat, w_s0, w_s1, b0, b1)
        losses.append(l)
        
    # returning weights, bias and losses(List).
    return w_s0, w_s1, b0, b1, l

## Predict with A5

In [67]:
def predictA5(X, s, w_s0, w_s1, b0, b1):
    
    # X --> Input.
    
    # Normalizing the inputs.
    x = normalize(X)
    
    # Calculating presictions/y_hat.
    preds = []
    for i in range(X.shape[0]):
      if s[i] == 0:
        preds.append(sigmoid(np.dot(X[i], w_s0) + b0))
      else:
        preds.append(sigmoid(np.dot(X[i], w_s1) + b1))
    
    # Empty List to store predictions.
    pred_class = []
    # if y_hat >= 0.5 --> round up to 1
    # if y_hat < 0.5 --> round up to 1
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

In [None]:
X = df_selected[['age', 'priors_count', 'decile_score']].to_numpy(dtype="float32")
y = df_selected['two_year_recid'].to_numpy(dtype="float32")
S = df_selected['race'].to_numpy(dtype="float32")

#w_s0, w_s1, b0, b1, l = trainA5(X, y, S, bs=1000, epochs=10, lr=0.01, lam=0.0001)

In [150]:
df0 = df_selected.loc[df_selected['race'] == 0]
df1 = df_selected.loc[df_selected['race'] == 1]
X0 = df0[['age', 'priors_count', 'decile_score']].to_numpy(dtype="float32")
X1 = df1[['age', 'priors_count', 'decile_score']].to_numpy(dtype="float32")
y0 = df0['two_year_recid'].to_numpy(dtype="float32")
y1 = df1['two_year_recid'].to_numpy(dtype="float32")

In [152]:
w0, b0, l0 = train(X0, y0, bs=1000, epochs=1000, lr=0.01)

In [153]:
w1, b1, l1 = train(X1, y1, bs=1000, epochs=1000, lr=0.01)

In [50]:
def predictA5(X, w0, w1, b0, b1, S):
    
    # X --> Input.
    
    # Normalizing the inputs.
    x = normalize(X)
    preds = []
    # Calculating presictions/y_hat.
    for i in range(S.shape[0]):
      if S[i] == 0:
          preds.append(sigmoid(np.dot(X[i], w0) + b0)[0])
      else:
          preds.append(sigmoid(np.dot(X[i], w1) + b1)[0])
    
    
    
    # Empty List to store predictions.
    pred_class = []
    # if y_hat >= 0.5 --> round up to 1
    # if y_hat < 0.5 --> round up to 1
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

In [179]:
from sklearn.metrics import accuracy_score
accuracy_score(y, predictA5(X, w0, w1, b0, b1, S))

0.6022764227642277

0.04 percent improvement compared with logistic regression

## n = 5 features

In [45]:
df_selected2 = df[['two_year_recid', 'race', 'age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count']]
#df_selected2 = df_selected2.select_dtypes(include=[np.number])

# select only African-Amereican and Caucasian
df_selected2 = df_selected2.loc[df_selected2['race'].isin(('African-American', 'Caucasian'))]
# change African-American = 0, Caucasian = 1
df_selected2.loc[df_selected2["race"] == "African-American", "race"] = 0
df_selected2.loc[df_selected2["race"] == "Caucasian", "race"] = 1


In [46]:
# Training 
X = df_selected2[['race', 'age', 'priors_count', 'decile_score', 'juv_fel_count', 'is_violent_recid']].to_numpy(dtype="float32")
y = df_selected2['two_year_recid'].to_numpy(dtype="float32")

w, b, l = train(X, y, bs=1000, epochs=1000, lr=0.01)

In [47]:
accuracy_score(y, predict(X))

0.619349593495935

In [52]:
X = df_selected2[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count']].to_numpy(dtype="float32")
y = df_selected2['two_year_recid'].to_numpy(dtype="float32")
S = df_selected2['race'].to_numpy(dtype="float32")


df0 = df_selected2.loc[df_selected['race'] == 0]
df1 = df_selected2.loc[df_selected['race'] == 1]
X0 = df0[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count']].to_numpy(dtype="float32")
X1 = df1[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count']].to_numpy(dtype="float32")
y0 = df0['two_year_recid'].to_numpy(dtype="float32")
y1 = df1['two_year_recid'].to_numpy(dtype="float32")

w0, b0, l0 = train(X0, y0, bs=1000, epochs=1000, lr=0.01)

w1, b1, l1 = train(X1, y1, bs=1000, epochs=1000, lr=0.01)


In [53]:
accuracy_score(y, predictA5(X, w0, w1, b0, b1, S))

0.6226016260162601

## 7 features

In [55]:
df_selected3 = df[['two_year_recid', 'race', 'age', 'priors_count', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'decile_score', 'is_violent_recid']]
# select only African-Amereican and Caucasian
df_selected3 = df_selected3.loc[df_selected3['race'].isin(('African-American', 'Caucasian'))]
# change African-American = 0, Caucasian = 1
df_selected3.loc[df_selected3["race"] == "African-American", "race"] = 0
df_selected3.loc[df_selected3["race"] == "Caucasian", "race"] = 1

In [56]:
# Training 
X = df_selected3[['race', 'age', 'priors_count', 'decile_score', 'juv_fel_count', 'is_violent_recid', 'juv_other_count', 'juv_misd_count']].to_numpy(dtype="float32")
y = df_selected3['two_year_recid'].to_numpy(dtype="float32")

w, b, l = train(X, y, bs=1000, epochs=1000, lr=0.01)

In [57]:
accuracy_score(y, predict(X))

0.6208130081300813

In [58]:
X = df_selected3[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count', 'juv_other_count', 'juv_misd_count']].to_numpy(dtype="float32")
y = df_selected3['two_year_recid'].to_numpy(dtype="float32")
S = df_selected3['race'].to_numpy(dtype="float32")


df0 = df_selected3.loc[df_selected['race'] == 0]
df1 = df_selected3.loc[df_selected['race'] == 1]
X0 = df0[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count', 'juv_other_count', 'juv_misd_count']].to_numpy(dtype="float32")
X1 = df1[['age', 'priors_count', 'decile_score', 'is_violent_recid', 'juv_fel_count', 'juv_other_count', 'juv_misd_count']].to_numpy(dtype="float32")
y0 = df0['two_year_recid'].to_numpy(dtype="float32")
y1 = df1['two_year_recid'].to_numpy(dtype="float32")

w0, b0, l0 = train(X0, y0, bs=1000, epochs=1000, lr=0.01)

w1, b1, l1 = train(X1, y1, bs=1000, epochs=1000, lr=0.01)


In [59]:
accuracy_score(y, predictA5(X, w0, w1, b0, b1, S))

0.6239024390243902