# Maximize Fairness Under Accuraacy constraints - Logistic Regression

## 1) Preprocess Bank marketing Data

In [1]:
#Load data 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import log_loss

fp = "drive/MyDrive/data/bank-full.csv"
bank_df = pd.read_csv(fp, delimiter=";")

In [2]:

def process_data(bank_df): 
  
  y_actual = bank_df["y"].apply(lambda x: 1 if x == "yes" else -1)
  age = bank_df["age"].apply(lambda x: 1 if 25<= x <= 60 else 0)
  bank_df = bank_df.drop(columns = ["age", "y", "day", "month"])

  #Have more than two categories - one hot encode
  columns = ['job', 'marital', 'education','contact','poutcome']
  for c in columns: 
    dummies = pd.get_dummies(bank_df[c])
    bank_df = pd.merge(bank_df, dummies, left_index = True, right_index = True)
    bank_df = bank_df.drop(columns = [c])

  #Binary variables, apply 1/0 for yes/no
  binary_vars = ['default', 'housing', 'loan']
  for b in binary_vars: 
    bank_df[b] = bank_df[b].apply(lambda x: 1 if x =="yes" else 0 )

  bank_df, y_actual, age = shuffle(bank_df, y_actual, age, random_state=0)

  return bank_df.to_numpy(dtype=np.float), y_actual.to_numpy(dtype=np.float), age.to_numpy(dtype=np.float)



In [3]:
X, y, age = process_data(bank_df)

#X,y, and age shuffled already, split into train and test tests
train_index = int(len(X)*.80)
x_train, y_train, age_train = X[:train_index], y[:train_index], age[:train_index]
x_test, y_test, age_test = X[train_index:], y[train_index:], age[train_index:]

In [4]:
#Get optimal coefficients by just training normal LR model 
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
print(f"Logistic Regression Accuracy: {clf.score(x_test, y_test)}")

coeff = clf.coef_
intercept = clf.intercept_
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))


Logistic Regression Accuracy: 0.8973791883224593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
#Reshape arrays to calcualte dist from decision boundary 
ind = x_train.shape[0]
lift = np.ones(ind).reshape(ind, 1)
x_train = np.concatenate((x_train, lift), axis = 1)

optimal_weights = np.concatenate((coeff, intercept.reshape(1,1)), axis = 1)

In [6]:
#Create Constraints for optimization problem 
#clf.predict_proba(x_train)
def logisitc_loss(weights, X, y):

  dp = np.dot(X, weights.reshape(35,1))
  pred_prob = 1/(1+ 2.718**(-dp))

  pred_classes = np.concatenate((1-pred_prob, pred_prob), axis = 1)
  loss = log_loss(y, pred_classes)

  return loss 

In [7]:
def constraint1(weights, x, y):
  
  upd_loss = logisitc_loss(weights, x, y)

  return (1+gamma)*optimal_loss - upd_loss #Accuracy constraint function from paper 

In [8]:
#Optmization function to minimize 
def opt_function(w, x, protected_var):
  
  sens_diff = protected_var - np.mean(protected_var)
  dist_boundary = np.dot(x, w.reshape(35,1))
  sum_x = np.sum(abs(sens_diff*dist_boundary))

  return sum_x/x.shape[0]

In [9]:
#https://towardsdatascience.com/optimization-with-scipy-and-application-ideas-to-machine-learning-81d39c7938b8
from scipy import optimize
gamma = 0.5
cons = {'type':'ineq', 'fun': constraint1, 'args': (x_train, y_train)}

result = optimize.minimize(opt_function,
                           x0=optimal_weights,
                           args= (x_train,age_train),
                           method='SLSQP',
                           constraints=cons,
                           options={'maxiter':10})


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
 

In [209]:
opt_function(optimal_weights, x_train, age_train)

8013.946353216141

In [214]:
sens_diff = age_train - np.mean(age_train)
dist_boundary = np.dot(x_train, optimal_weights.reshape(35,1))
sum_x = np.sum(abs(sens_diff  *dist_boundary))

In [215]:
sum_x

289848411.70312136

In [213]:
sens_diff

array([0.04432095, 0.04432095, 0.04432095, ..., 0.04432095, 0.04432095,
       0.04432095])