In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
import c_svm
from c_svm import SVM


In [2]:
df = pd.read_csv("data/compas-scores-two-years_cleaned.csv")
df.head()

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,two_year_recid,length_of_stay
0,Male,25 - 45,African-American,-0.733607,F,1,-0.167773
1,Male,< 25,African-American,0.055928,F,1,-0.340654
2,Male,25 - 45,Caucasian,2.029767,F,1,-0.244609
3,Female,25 - 45,Caucasian,-0.733607,M,0,-0.321445
4,Male,< 25,Caucasian,-0.536224,F,1,-0.359864


In [3]:
df.columns

Index(['sex', 'age_cat', 'race', 'priors_count', 'c_charge_degree',
       'two_year_recid', 'length_of_stay'],
      dtype='object')

In [4]:
df['race'].value_counts()

African-American    3537
Caucasian           2378
Name: race, dtype: int64

Data Processing

In [5]:
sensitive_attribute = 'race'
target = 'two_year_recid'
features = ['sex', 'age_cat', 'priors_count', 'c_charge_degree', 'length_of_stay']

In [6]:
categorical_features = ['sex', 'age_cat', 'race', 'c_charge_degree']
df['sex'] = df['sex'].apply(lambda x: 0 if x == 'Female' else 1)
df['age_cat'] = df['age_cat'].apply(lambda x: 0 if x == '< 25' else(2 if x == '> 45' else 1))
df['race'] = df['race'].apply(lambda x: 0 if x == 'African-American' else 1)
df['c_charge_degree'] = df['c_charge_degree'].apply(lambda x: 0 if x == 'M' else 1)
df.head()

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,two_year_recid,length_of_stay
0,1,1,0,-0.733607,1,1,-0.167773
1,1,0,0,0.055928,1,1,-0.340654
2,1,1,1,2.029767,1,1,-0.244609
3,0,1,1,-0.733607,0,0,-0.321445
4,1,0,1,-0.536224,1,1,-0.359864


Training and Testing Set

In [16]:
features = df[['sex', 'age_cat', 'c_charge_degree', 'length_of_stay',"priors_count"]]
sensitive = df['race']
target = df['two_year_recid']
X_train, X_test, y_train, y_test, race_train, race_test = \
    train_test_split(features, target, sensitive, test_size=0.3, random_state=6, shuffle = True)

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

SVM Without Constraint

In [9]:
def calibration(y_true, y_pred, sensitive_features):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    c_index = np.where(sensitive_features == 1)[0]           
    a_index = np.where(sensitive_features == 0)[0]       
    
    y_pred_c = y_pred[c_index]           
    y_true_c = y_true[c_index] 
    acc_c = sum(y_pred_c == y_true_c)/len(y_pred_c)

    y_pred_a = y_pred[a_index]   
    y_true_a = y_true[a_index]
    acc_a = sum(y_pred_a == y_true_a)/len(y_pred_a)

    calibration = abs(acc_c - acc_a) * 100
    return(calibration)


In [10]:
def p_rule(sensitive_var, y_pred):
    protected = np.where(sensitive_var == 1)[0]
    not_protected = np.where(sensitive_var == 0)[0]
    protected_pred = np.where(y_pred[protected] == 1)
    not_protected_pred = np.where(y_pred[not_protected] == 1)
    protected_percent = protected_pred[0].shape[0]/protected.shape[0]
    not_protected_percent = not_protected_pred[0].shape[0]/not_protected.shape[0]
    ratio = min(protected_percent/not_protected_percent, not_protected_percent/protected_percent)
    
    return ratio, protected_percent, not_protected_percent

In [11]:
unconstrained_svm = SVC(kernel='linear', probability = True,)
unconstrained_svm = unconstrained_svm.fit(X_train, y_train)
y_pred_unconstrained = unconstrained_svm.predict(X_train)
result = {"Classifier": ["SVM", "SVM"],
        "Set": ["Train", "Test"],
        "Accuracy (%)": [unconstrained_svm.score(X_train, y_train) * 100, unconstrained_svm.score(X_test, y_test) * 100],
        "Calibration(%)": [calibration(y_train, y_pred_unconstrained, race_train), calibration(y_test, y_pred_unconstrained, race_test)],
         "P-rule (%)": [p_rule(race_train, y_pred_unconstrained)[0]*100, p_rule(race_test, y_pred_unconstrained)[0]*100],
        "Protected (%)": [p_rule(race_train, y_pred_unconstrained)[1]*100, p_rule(race_test, y_pred_unconstrained)[1]*100],
        "Not protected (%)": [p_rule(race_train, y_pred_unconstrained)[2]*100, p_rule(race_test, y_pred_unconstrained)[2]*100]}



pd.DataFrame(result)

Unnamed: 0,Classifier,Set,Accuracy (%),Calibration(%),P-rule (%),Protected (%),Not protected (%)
0,SVM,Train,66.145393,1.098249,55.630231,30.655391,55.105634
1,SVM,Test,65.426881,1.817903,99.055846,47.325103,47.776184


With Constraint

In [17]:
svm = SVM()
x_control_train = {'race': race_train}
weights = svm.train(X_train, y_train, x_control_train, C=1, max_iter=100, lamb=1, gamma=None, apply_fairness_constraints=1, sensitive_attrs=['race'], sensitive_attrs_to_cov_thresh={'race': 0})
pred_y_test = np.sign(np.dot(X_test, weights))
pred_y_train = np.sign(np.dot(X_train, weights))
csvm_test_acc = sum(pred_y_test == y_test)/len(y_test)
csvm_train_acc = sum(pred_y_train == y_train)/len(y_train)


Running custom model


In [18]:
result = {"Classifier": ["C-SVM", "C-SVM"],
        "Set": ["Train", "Test"],
        "Accuracy (%)": [csvm_test_acc * 100, csvm_train_acc * 100],
        "Calibration(%)": [calibration(y_train, pred_y_train, race_train),
                                 calibration(y_test, pred_y_test, race_test)],
         "P-rule (%)": [p_rule(race_train, pred_y_train)[0]*100, p_rule(race_test, pred_y_test)[0]*100],
        "Protected (%)": [p_rule(race_train, pred_y_train)[1]*100, p_rule(race_test, pred_y_test)[1]*100],
        "Not protected (%)": [p_rule(race_train, pred_y_train)[2]*100, p_rule(race_test, pred_y_test)[2]*100]}



pd.DataFrame(result)

Unnamed: 0,Classifier,Set,Accuracy (%),Calibration(%),P-rule (%),Protected (%),Not protected (%)
0,C-SVM,Train,48.84507,13.713531,99.963059,99.760622,99.797489
1,C-SVM,Test,47.077295,10.186922,99.904201,99.717115,99.812734
