# Maximize Fairness Under Accuraacy constraints - Logistic Regression

## 1) Preprocess Bank marketing Data

In [2]:
#Load data 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import log_loss

fp = "drive/MyDrive/data/bank-full.csv"
bank_df = pd.read_csv(fp, delimiter=";")

In [59]:

def process_data(bank_df): 
  
  y_actual = bank_df["y"].apply(lambda x: 1 if x == "yes" else -1)
  age = bank_df["age"].apply(lambda x: 1 if 25<= x <= 60 else 0)
  bank_df = bank_df.drop(columns = ["age", "y", "day", "month"])

  #Have more than two categories - one hot encode
  columns = ['job', 'marital', 'education','contact','poutcome']
  for c in columns: 
    dummies = pd.get_dummies(bank_df[c])
    bank_df = pd.merge(bank_df, dummies, left_index = True, right_index = True)
    bank_df = bank_df.drop(columns = [c])

  #Binary variables, apply 1/0 for yes/no
  binary_vars = ['default', 'housing', 'loan']
  for b in binary_vars: 
    bank_df[b] = bank_df[b].apply(lambda x: 1 if x =="yes" else 0 )

  bank_df, y_actual, age = shuffle(bank_df, y_actual, age, random_state=0)

  return bank_df.to_numpy(), y_actual.to_numpy(), age.to_numpy()

def accuracy(w, x, y):
  
  pred = np.dot(x, w.reshape(35,1))
  pred_prob = 1/(1+ 2.718**(-pred))
  
  pred_prob[pred_prob>=0.5] = 1
  pred_prob[pred_prob<0.5] = -1

  matches = np.where(pred_prob==y_train.reshape(pred_prob.shape))

  return (matches[0].shape[0]/pred_prob.shape[0]), pred_prob

In [4]:
X, y, age = process_data(bank_df)

#X,y, and age shuffled already, split into train and test tests
train_index = int(len(X)*.80)
x_train, y_train, age_train = X[:train_index], y[:train_index], age[:train_index]
x_test, y_test, age_test = X[train_index:], y[train_index:], age[train_index:]

In [5]:
#Get optimal coefficients by just training normal LR model 
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
print(f"Logistic Regression Accuracy: {clf.score(x_test, y_test)}")

coeff = clf.coef_
intercept = clf.intercept_
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))


Logistic Regression Accuracy: 0.8973791883224593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [6]:
#Reshape arrays to calcualte dist from decision boundary 
ind = x_train.shape[0]
lift = np.ones(ind).reshape(ind, 1)
x_train = np.concatenate((x_train, lift), axis = 1)

optimal_weights = np.concatenate((coeff, intercept.reshape(1,1)), axis = 1)

In [7]:
#Create Constraints for optimization problem 
#clf.predict_proba(x_train)
def logisitc_loss(weights, X, y):

  dp = np.dot(X, weights.reshape(35,1))
  dp =dp.astype(np.float64)
  pred_prob = 1/(1+ 2.718**(-dp))

  pred_classes = np.concatenate((1-pred_prob, pred_prob), axis = 1)
  loss = log_loss(y, pred_classes)

  return loss 

In [8]:
def constraint1(weights, x, y):
  
  upd_loss = logisitc_loss(weights, x, y)

  return (1+gamma)*optimal_loss - upd_loss #Accuracy constraint function from paper 

In [40]:
#Optmization function to minimize 
def opt_function(w, x, protected_var):
  
  dist_bound = np.dot(w, x_train.T)
  protected_cov = (protected_var - np.mean(protected_var)) * dist_bound
  
  return float(abs(np.sum(protected_cov))) / float(x_train.shape[0])

In [101]:
#Determine p% rule ratio - number of protected in the positive class: not protected in positive class
def p_rule(age_var, predicted_y):
  
  not_protected = np.where(age_var != 1)[0]
  protected = np.where(age_var == 1)[0] 

  protected_preds = np.where(predicted_y[protected] == 1)
  nonpro_preds = np.where(predicted_y[not_protected] == 1)

  perc_ratio = (protected_preds[0].shape[0]/protected.shape[0])/(nonpro_preds[0].shape[0]/not_protected.shape[0])

  return perc_ratio



In [42]:
#https://towardsdatascience.com/optimization-with-scipy-and-application-ideas-to-machine-learning-81d39c7938b8
from scipy import optimize
gamma = 0.5
cons = {'type':'ineq', 'fun': constraint1, 'args': (x_train, y_train)}

result = optimize.minimize(opt_function,
                           x0=optimal_weights,
                           args= (x_train,age_train),
                           method='SLSQP',
                           constraints=cons,
                           options={'maxiter':10})


  import sys
  import sys
  import sys
  import sys


In [36]:
result

     fun: 0.029138651963161573
     jac: array([-4.52482840e-04,  2.42700321e+01, -1.35448407e-02, -4.90969815e-03,
        1.54766212e+00, -2.73674491e-02,  4.97470845e-01,  1.30606962e-02,
       -2.54969136e-03, -5.69617329e-03, -1.11719617e-03,  1.29369786e-04,
       -6.12392649e-03,  1.72699948e-02, -8.18497268e-04, -2.09253910e-03,
        6.33785198e-03, -4.80537978e-03, -7.10967695e-04,  1.77153386e-04,
       -7.09632877e-04, -3.29249771e-03,  4.00213036e-03,  3.76235601e-03,
       -1.46457413e-03, -4.81153652e-03,  2.51375418e-03,  1.56183960e-04,
        6.63564773e-03, -6.79183216e-03, -1.02818478e-04,  8.27180455e-04,
        4.38000355e-03, -5.10436622e-03,  0.00000000e+00])
 message: 'Iteration limit exceeded'
    nfev: 77
     nit: 2
    njev: 2
  status: 9
 success: False
       x: array([-3.18810944e-02, -6.23799635e-04, -8.67269924e-01, -2.94533431e-01,
        6.23096097e-03, -1.14287494e-01,  1.24018258e-03, -2.10365207e-03,
       -5.80513178e-02, -3.39050346e-0

In [103]:
val, pred_y = accuracy(result.x, x_train, y_train)

In [104]:
val

0.8734240212342402

In [105]:
p_rule(age_train, pred_y)


1.1166947094337898