In [1]:
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
import rpy2

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

from rpy2 import robjects
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector
import rpy2.robjects.numpy2ri as numpy2ri

from aif360.sklearn.datasets import fetch_adult
from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr
from aif360.sklearn.metrics import generalized_fnr, difference, statistical_parity_difference

import fairadapt

  from pandas.core.index import Index as PandasIndex
pip install 'aif360[AdversarialDebiasing]'


## Load a dataset

In [2]:
# Features removed from the Adult data before modelling.
DROPPED_COLUMNS = ['education', 'capital-gain', 'capital-loss', 'relationship']
# Number of leading rows kept — presumably to keep the demo fast; TODO confirm.
N_SAMPLES = 5000

X, y, sample_weight = fetch_adult()

# Explicit positional slicing (.iloc) takes the same leading rows from the
# features and the labels, so the two stay row-aligned.
X = X.drop(columns=DROPPED_COLUMNS).iloc[:N_SAMPLES]
y = y.iloc[:N_SAMPLES]
# NOTE(review): sample_weight is NOT truncated here, so it no longer lines up
# with X / y. It is unused in the cells visible in this notebook.

# Last expression of the cell -> rendered as the cell output.
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,workclass,education-num,marital-status,occupation,race,sex,hours-per-week,native-country
Unnamed: 0_level_1,race,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Non-white,Male,25.0,Private,7.0,Never-married,Machine-op-inspct,Non-white,Male,40.0,United-States
1,White,Male,38.0,Private,9.0,Married-civ-spouse,Farming-fishing,White,Male,50.0,United-States
2,White,Male,28.0,Local-gov,12.0,Married-civ-spouse,Protective-serv,White,Male,40.0,United-States
3,Non-white,Male,44.0,Private,10.0,Married-civ-spouse,Machine-op-inspct,Non-white,Male,40.0,United-States
5,White,Male,34.0,Private,6.0,Never-married,Other-service,White,Male,30.0,United-States


In [3]:
# 80% of the rows go to training, the rest to testing; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1234567,
)

## Train an LGBM classifier and compute accuracy and parity gap

In [4]:
# Baseline classifier trained on the original (non-adapted) features.
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# accuracy — arguments follow scikit-learn's (y_true, y_pred) order, and the
# value is printed explicitly because a bare expression in the middle of a
# cell is never displayed.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy equals {:.2f}%".format(100 * accuracy))

# parity gap
par_gap = statistical_parity_difference(y_test, y_pred, prot_attr = "sex",
                                         priv_group = "Male", pos_label = ">50K")
print("Parity gap equals {:.2f}%".format(100 * par_gap))

Parity gap equals -20.00%


## Construct the adjacency matrix (causal graph)

In [5]:
# Construct an all-zero square adjacency matrix whose row and column labels
# are the training columns (the features plus the outcome column).
train_df = pd.concat([X_train, y_train], axis=1)
adj_mat = pd.DataFrame(
    np.zeros((len(train_df.columns), len(train_df.columns)), dtype=int),
    index = train_df.columns.values,
    columns = train_df.columns.values
)

# Fill in the edges of the causal graph.
# NOTE(review): presumably a 1 at [row, column] means "row -> column" (row is
# the cause), matching fairadapt's convention — confirm.
#
# `.loc` is used rather than `.at`: `.at` is the scalar accessor and is only
# defined for a single row/column label pair, whereas every assignment below
# targets a list of labels on at least one axis.
adj_mat.loc[["sex", "age", "native-country"],
            ["marital-status", "education-num", "workclass", "hours-per-week", "occupation", "annual-income"]] = 1
adj_mat.loc["marital-status",
            ["education-num", "workclass", "hours-per-week", "occupation", "annual-income"]] = 1
adj_mat.loc["education-num",
            ["workclass", "hours-per-week", "occupation", "annual-income"]] = 1
adj_mat.loc[["workclass", "hours-per-week", "occupation"],
            "annual-income"] = 1

## Instantiate and run fairadapt

In [6]:
# Enable rpy2's pandas conversion before fairadapt is used
# (presumably required by the fairadapt wrapper — confirm).
pandas2ri.activate()

FA = fairadapt.fairadapt(
    prot_attr="sex",
    adj_mat=adj_mat,
    outcome="annual-income",
)

# Returns the adapted training features, training labels and test features.
Xf_train, yf_train, Xf_test = FA.fit_transform(X_train, y_train, X_test)

## Retrain LGBM and check whether discrimination was removed

In [7]:
# Same model class as the baseline, this time fitted on the adapted data.
clf_fair = lgb.LGBMClassifier()
clf_fair.fit(Xf_train, yf_train)
yf_pred = clf_fair.predict(Xf_test)

# Parity gap of the predictions made from the adapted test features.
fair_gap = statistical_parity_difference(
    y_test,
    yf_pred,
    prot_attr="sex",
    priv_group="Male",
    pos_label=">50K",
)
fair_gap_message = "Fair parity gap equals {:.2f}%"
print(fair_gap_message.format(100 * fair_gap))

Fair parity gap equals -5.36%
