# ADS Project 4: Machine Learning Fairness
## Fall 2022
#### Maximizing Accuracy under Fairness Constraints (C-LR and C-SVM)

Group 5: Christopher Halim

## 1. Background

+ The dataset that is used in this notebook has been cleaned and is stored under the output Github file. 

+ This notebook algorithm is on maximizing accuracy under fairness constraints using C-LR and C-SVM (A2)

+ We used `utils.py` and `loss_funcs.py` as additional modules for our analysis to help build our algorithm

## 2. Data Preparation and EDA

In [20]:
# Load modules
import os, sys
import numpy as np
import pandas as pd
sys.path.insert(0, "/Users/christopherhalim888/Library/CloudStorage/OneDrive-Personal/Master/Fall 2022/STAT 5243")
import utils1 as ut
import loss_funcs as lf
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import log_loss
from utils12 import *
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [21]:
print(dir(ut))

['Pool', 'Process', 'Queue', 'SEED', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'add_intercept', 'check_accuracy', 'check_binary', 'compute_cross_validation_error', 'compute_p_rule', 'deepcopy', 'defaultdict', 'get_avg_correlation_dict', 'get_constraint_list_cov', 'get_correlations', 'get_line_coordinates', 'get_one_hot_encoding', 'lf', 'minimize', 'np', 'plot_cov_thresh_vs_acc_pos_ratio', 'plt', 'print_classifier_fairness_stats', 'print_covariance_sensitive_attrs', 'seed', 'shuffle', 'split_into_train_test', 'sys', 'test_sensitive_attr_constraint_cov', 'train_model']


In [22]:
# Load data
df = pd.read_csv('/Users/christopherhalim888/Library/CloudStorage/OneDrive-Personal/Master/Fall 2022/STAT 5243/compas-scores-two-years(cleaned).csv')
df.head()

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,length_of_stay,two_year_recid
0,Male,25 - 45,African-American,-0.733607,F,-0.177294,1
1,Male,< 25,African-American,0.055928,F,-0.350235,1
2,Male,25 - 45,Caucasian,2.029767,F,-0.254156,1
3,Female,25 - 45,Caucasian,-0.733607,M,-0.311803,0
4,Male,< 25,Caucasian,-0.536224,F,-0.350235,1


Encode categorical variables with dummy variables:
+ `sex`: 1 for male and 0 for female
+ `age_cat`: 2 for > 45, 1 for 25 - 45 and 0 for < 25
+ `race`: 1 for caucasian and 0 for african-american
+ `c_charge_degree`: 1 for F and 0 for M

In [23]:
# Encode variables with dummy variables
df['sex'] = df['sex'].apply(lambda sex: 0 if sex == 'Female' else 1)
df['age_cat'] = df['age_cat'].apply(lambda age_cat: 2 if age_cat == '> 45' else(1 if age_cat == '25 - 45' else 0))
df['race'] = df['race'].apply(lambda race: 0 if race == 'African-American' else 1)
df['c_charge_degree'] = df['c_charge_degree'].apply(lambda c_charge_degree: 0 if c_charge_degree == 'M' else 1)
df.head()

Unnamed: 0,sex,age_cat,race,priors_count,c_charge_degree,length_of_stay,two_year_recid
0,1,1,0,-0.733607,1,-0.177294,1
1,1,0,0,0.055928,1,-0.350235,1
2,1,1,1,2.029767,1,-0.254156,1
3,0,1,1,-0.733607,0,-0.311803,0
4,1,0,1,-0.536224,1,-0.350235,1


In [24]:
# Vars to store features
variables = ['sex', 'age_cat', 'priors_count', 'c_charge_degree', 'length_of_stay']
dependent = 'race'
target = 'two_year_recid'

# Function to process the data
def shuffle_data(df):
    y_label = df[target]
    protected = df[dependent]
    df_new = df[variables]
    y_label, protected, df_new = shuffle(y_label, protected, df_new, random_state = 617)
    
    return y_label.to_numpy(), protected.to_numpy(), df_new.to_numpy()

# Split data into train and test, with 80:20 ratio
y_label, protected, df_new =  shuffle_data(df)
train_index = int(len(df_new) * 0.8)
x_train, y_train, race_train = df_new[:train_index], y_label[:train_index], protected[:train_index]
x_test, y_test, race_test = df_new[train_index:], y_label[train_index:],protected[train_index:]

Construct a function to determine the p% and a function to compute calibration.

+ **Protected**: Caucasians (i.e., `race == 1`)
+ **Not protected**: African-Americans (i.e., `race == 0`)

In [25]:
# Function to compute p-rule
def p_rule(var, y_pred):
    protected = np.where(var == 1)[0]
    not_protected = np.where(var == 0)[0]
    protected_pred = np.where(y_pred[protected] == 1)
    not_protected_pred = np.where(y_pred[not_protected] == 1)
    protected_percent = protected_pred[0].shape[0]/protected.shape[0]
    not_protected_percent = not_protected_pred[0].shape[0]/not_protected.shape[0]
    ratio = min(protected_percent/not_protected_percent, not_protected_percent/protected_percent)
    
    return ratio, protected_percent, not_protected_percent

In [26]:
# Function to compute calibration
def calibrate(var, y_pred, y_true):
    protected_point = np.where(var == 1)[0]
    y_predcau = y_pred[protected_point]
    y_truecau = y_true[protected_point]
    
    pcau = sum(y_predcau==y_truecau)/len(y_truecau)
    not_protected_point = np.where(var == 0)[0]
    
    y_predafa = y_pred[not_protected_point]
    y_trueafa = y_true[not_protected_point]
    
    pafa = sum(y_predafa==y_trueafa)/len(y_trueafa)
    calibrate = abs(pcau-pafa)
    
    return(calibrate)

## 3. Logistic Regression <a class="anchor" id="lr"></a>

### 3.1 Training unconstrained classifier

In [27]:
# Train model and print results
model = LogisticRegression(random_state = 0).fit(x_train, y_train)
coeff = model.coef_
intercept = model.intercept_
optimal_loss = log_loss(y_train, model.predict_proba(x_train))
results_lr = {"Classifier": ["LR", "LR"], 
              "Set": ["Train", "Test"],
              "P-rule (%)": [p_rule(race_train, m.predict(x_train))[0]*100, p_rule(race_test, m.predict(x_test))[0]*100],
              "Accuracy (%)": [model.score(x_train, y_train)*100, model.score(x_test, y_test)*100],
              "Protected (%)": [p_rule(race_train, model.predict(x_train))[1]*100, p_rule(race_test, model.predict(x_test))[1]*100],
              "Not protected (%)": [p_rule(race_train, model.predict(x_train))[2]*100, p_rule(race_test, model.predict(x_test))[2]*100],
              "Calibration (%)": [calibrate(race_train, model.predict(x_train), y_train)*100, calibrate(race_test, model.predict(x_test), y_test)*100]
             }

pd.DataFrame(results_lr)

Unnamed: 0,Classifier,Set,P-rule (%),Accuracy (%),Protected (%),Not protected (%),Calibration (%)
0,LR,Train,99.947862,66.462384,29.752501,54.429933,1.417935
1,LR,Test,100.0,65.426881,33.611691,53.977273,3.36965


### 3.2 Optimizing classifier accuracy subject to fairness constraints

Optimize our accuracy subject to fairness constraints. By setting 'race': 0', we assume that the classifier should achieve 0 covariance with respect to 'race' value and distance to the decision boundary. A 0 covariance means that no correlation exists between the two variables.

In [28]:
# Setting flags
apply_fairness_constraints = 1 # set this flag to 1 since we want to optimize accuracy subject to fairness constraints
apply_accuracy_constraint = 0
sep_constraint = 0
gamma = None
sensitive_attrs = ['race']
sensitive_attrs_to_cov_thresh = {'race': 0}
x_control = {'race': race_train}

# Train model
np.random.seed(100)
w = ut.train_model(x_train,
                   y_train,
                   x_control,
                   lf._logistic_loss,
                   apply_fairness_constraints,
                   apply_accuracy_constraint,
                   sep_constraint,
                   sensitive_attrs,
                   sensitive_attrs_to_cov_thresh,
                   gamma)

In [29]:
# Fit coefficients/weights into logistic regression in sklearn
m = LogisticRegression()
m.coef_= w.reshape((1,-1))
m.intercept_ = 0
m.classes_ = np.array([0, 1])

In [30]:
# Print results
result_CLR = {"Classifier": ["C-LR", "C-LR"],
               "Set": ["Train", "Test"],
               "P-rule (%)": [p_rule(race_train, m.predict(x_train))[0]*100, p_rule(race_test, m.predict(x_test))[0]*100],
               "Accuracy (%)": [m.score(x_train, y_train)*100, m.score(x_test, y_test)*100], 
               "Protected (%)": [p_rule(race_train, m.predict(x_train))[1]*100, p_rule(race_test, m.predict(x_test))[1]*100],
               "Not protected (%)": [p_rule(race_train, m.predict(x_train))[2]*100, p_rule(race_test, m.predict(x_test))[2]*100],
               "Calibration (%)": [calibrate(race_train, m.predict(x_train), y_train)*100, calibrate(race_test, m.predict(x_test), y_test)*100]
             }
pd.DataFrame(result_CLR)

Unnamed: 0,Classifier,Set,P-rule (%),Accuracy (%),Protected (%),Not protected (%),Calibration (%)
0,C-LR,Train,99.947862,48.013525,99.842022,99.894105,13.261964
1,C-LR,Test,100.0,46.661031,100.0,100.0,10.000415


## 4. Support Vector Machine (SVM) <a class="anchor" id="svm"></a>

The following SVM codes are an adaptation of the paper on [Fairness Constraints: Mechanisms for Fair Classification](https://arxiv.org/abs/1507.05259). Additional helper functions like `SVM_scratch.py`, `datapreprocess.py` and `helper.py`were adapted from this GitHub [repo](https://github.com/SreeranjaniD/Fairness-in-Classification-using-SVM).

In [31]:
# Load modules
from SVM_scratch import *
from datapreprocess import *
from helper import *

# Set variables needed for training
x_control_train = {'race': race_train}
x_control_test = {'race': race_test}

Here we define a function for the classifier, which trains the model based on the fairness constraints.

In [32]:
# Define function for classifier
def classifier(apply_fairness_constraints,loss_function,c,sensitive_attrs,max_iter=1000,epoch=50,lamb=1,lr=0.1,C=1,gamma=None):
    svm =SVM()
    w = svm.training(x_train,y_train, x_control_train, loss_function,C,max_iter,lamb,epoch,lr, apply_fairness_constraints, sensitive_attrs, c,gamma)
    train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(w, x_train, y_train, x_test, y_test, None, None)
    distances_hyperplane_test = (np.dot(x_test, w)).tolist()
    all_class_labels_assigned_test = np.sign(distances_hyperplane_test)
    correlation_test_dict = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
    cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_hyperplane_test, x_control_test, sensitive_attrs)
    ut.print_classifier_fairness_stats([test_score],correlation_test_dict, [cov_dict_test], sensitive_attrs[0])
    

### 4.1 T Train unconstrained SVM classifier

In [33]:
svm_model = SVC(kernel = 'linear', probability = True)

# Train model and print results
model_1 = svm_model.fit(x_train, y_train)
optimal_loss = log_loss(y_train, model_1.predict_proba(x_train))
results_svm = {"Classifier": ["SVM", "SVM"],
               "Set": ["Train", "Test"],
               "P-rule (%)": [p_rule(race_train, model_1.predict(x_train))[0]*100, p_rule(race_test, model_1.predict(x_test))[0]*100],
               "Accuracy (%)": [model_1.score(x_train, y_train)*100, model_1.score(x_test, y_test)*100],
               "Calibration (%)": [calibrate(race_train, model_1.predict(x_train), y_train)*100, calibrate(race_test, model_1.predict(x_test), y_test)*100],
               "Protected (%)": [p_rule(race_train, model_1.predict(x_train))[1]*100, p_rule(race_test, model_1.predict(x_test))[1]*100],
               "Not protected (%)": [p_rule(race_train, model_1.predict(x_train))[2]*100, p_rule(race_test, model_1.predict(x_test))[2]*100]
               }

print_result = pd.DataFrame(results_svm)
print_result

Unnamed: 0,Classifier,Set,P-rule (%),Accuracy (%),Calibration (%),Protected (%),Not protected (%)
0,SVM,Train,52.057837,66.208791,0.81844,26.276988,50.476527
1,SVM,Test,63.109384,65.511412,3.227605,31.106472,49.289773


### 4.2 Optimize SVM Classifier Accuracy Subject to Fairness Constraints

Now we optimize accuracy subject to fairness constraints. 

In [34]:
# Subject to fairness constraints
apply_fairness_constraints = 1 # set this flag to one since we want to optimize accuracy subject to fairness constraints
loss_function = lf._hinge_loss
sensitive_attrs = ['race']
c = {'race': 0} # covariance threshold
C = 1 # penalty term

# gamma controls how much loss in accuracy we are willing to incur to achieve fairness 
# Increase in gamma will decrease the accuracy to a certain limit 
# Set gamma to None because we are not use the Gamma to tune our analysis
gamma = None
epochs = 1000 # Number of epochs 
lamb = 1 # lambda 
lr = 0.1  # learning rate 
max_iter = 1000

We defined a 'csvm' function that returns the train and test scores, and the predicted values for train and test so that we can compute the accuracy and calibration scores later.

In [35]:
# Define function for training that returns scores
def csvm(x_train, y_train, x_test, y_test, C, max_iter, lamb, epochs,lr, apply_fairness_constraints, 
         sensitive_attrs, sensitive_attrs_to_cov_thresh=c, gamma=None):
    
    svm = SVM()
    w = svm.training(x_train, y_train, x_control_train, loss_function, C, max_iter, lamb, epochs, lr, 
                     apply_fairness_constraints, sensitive_attrs, sensitive_attrs_to_cov_thresh, gamma)
    y_test_pred = np.sign(np.dot(x_test, w))
    y_train_pred = np.sign(np.dot(x_train, w))
    
    def accuracy(y, Y_pred):
        correct_answers = (Y_pred == y).astype(int) # will have 1 when the prediction and the actual label match
        accuracy = float(sum(correct_answers)) / float(len(correct_answers))
        return accuracy, sum(correct_answers)

    train_score, correct_answers_train = accuracy(y_train, y_train_pred)
    test_score, correct_answers_test = accuracy(y_test, y_test_pred)
    return train_score, test_score, correct_answers_train, correct_answers_test, y_test_pred, y_train_pred

In [36]:

train_score, test_score, correct_answers_train, correct_answers_test, y_test_pred, y_train_pred = csvm(x_train, y_train, x_test, y_test, C, max_iter, lamb, epochs,lr, apply_fairness_constraints, sensitive_attrs, sensitive_attrs_to_cov_thresh=c, gamma=None)


Running custom model with fairness constraints


In [38]:
# Results
results_csvm = {"Classifier": ["C-SVM", "C-SVM"],
                "Set": ["Train", "Test"],
                "P-rule (%)": [p_rule(race_train, y_train_pred)[0]*100, p_rule(race_test, y_test_pred)[0]*100],
                "Accuracy (%)": [train_score*100, test_score*100],
                "Calibration (%)": [calibrate(race_train, y_train_pred, y_train)*100, calibrate(race_test, y_test_pred, y_test)*100],
                "Protected (%)": [p_rule(race_train, y_train_pred)[1]*100, p_rule(race_test, y_test_pred)[1]*100],
                "Not protected (%)": [p_rule(race_train, y_train_pred)[2]*100, p_rule(race_test, y_test_pred)[2]*100]
                }

pd.DataFrame(results_csvm)

Unnamed: 0,Classifier,Set,P-rule (%),Accuracy (%),Calibration (%),Protected (%),Not protected (%)
0,C-SVM,Train,99.930962,47.886729,13.314047,99.684044,99.752912
1,C-SVM,Test,99.724117,46.49197,10.067138,99.582463,99.857955
