In [66]:
import pandas as pd
import numpy as np

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import count, \
                              false_positive_rate, \
                              selection_rate, equalized_odds_ratio
from fairlearn.preprocessing import CorrelationRemover
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds

 
                              
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [67]:
# Let's load the dataset
data = pd.read_csv('./../data/synth_data_for_training.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12645 entries, 0 to 12644
Columns: 316 entries, adres_aantal_brp_adres to checked
dtypes: int64(316)
memory usage: 30.5 MB


In [68]:
data.columns

Index(['adres_aantal_brp_adres', 'adres_aantal_verschillende_wijken',
       'adres_aantal_verzendadres', 'adres_aantal_woonadres_handmatig',
       'adres_dagen_op_adres', 'adres_recentst_onderdeel_rdam',
       'adres_recentste_buurt_groot_ijsselmonde',
       'adres_recentste_buurt_nieuwe_westen', 'adres_recentste_buurt_other',
       'adres_recentste_buurt_oude_noorden',
       ...
       'typering_hist_aantal', 'typering_hist_inburgeringsbehoeftig',
       'typering_hist_ind', 'typering_hist_sector_zorg', 'typering_ind',
       'typering_indicatie_geheime_gegevens', 'typering_other',
       'typering_transport__logistiek___tuinbouw',
       'typering_zorg__schoonmaak___welzijn', 'checked'],
      dtype='object', length=316)

In [69]:

protected_variables = ["persoon_geslacht_vrouw",'belemmering_ind']
output_variable = ["checked"]

# Simple preprocessing
X = data.drop(output_variable, axis=1)
X = X.astype(np.float32)
y = data[output_variable]
A = X[protected_variables]

# Train test split
# Let's split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, A, test_size=0.25, random_state=42)



In [70]:
model = GradientBoostingClassifier()

model.fit(X_train, y_train.ravel())

AttributeError: 'DataFrame' object has no attribute 'ravel'

In [None]:
y_pred = model.predict(X_test)

In [None]:
X_test[protected_variables]

Unnamed: 0,persoon_geslacht_vrouw,belemmering_ind
1688,1.0,0.0
7251,1.0,0.0
5329,1.0,0.0
1697,1.0,1.0
8200,1.0,0.0
...,...,...
5646,1.0,1.0
10391,0.0,1.0
4083,0.0,0.0
4023,0.0,1.0


In [None]:

# Construct a function dictionary
my_metrics = {
    'tpr' : recall_score,
    'fpr' : false_positive_rate,
    'sel' : selection_rate,
    'count' : count
}

# Construct a MetricFrame
mf = MetricFrame(
    metrics=my_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test[protected_variables]
)
mf.overall

tpr         0.467320
fpr         0.005602
sel         0.050285
count    3162.000000
dtype: float64

In [None]:
mf.by_group


Unnamed: 0_level_0,Unnamed: 1_level_0,tpr,fpr,sel,count
persoon_geslacht_vrouw,belemmering_ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.0,0.423729,0.006525,0.043155,672.0
0.0,1.0,0.504587,0.007928,0.0625,992.0
1.0,0.0,0.3125,0.007421,0.032368,587.0
1.0,1.0,0.533333,0.001218,0.053787,911.0


In [None]:
mf.difference(method='to_overall')

tpr         0.154820
fpr         0.004384
sel         0.017917
count    2575.000000
dtype: float64

In [None]:
print(equalized_odds_ratio(y_test,
                               y_pred,
                               sensitive_features=X_test[protected_variables]))

0.15364538019836438


In [None]:
cr = CorrelationRemover(sensitive_feature_ids=protected_variables)

pipeline = Pipeline(
    steps=[
        ("preprocessor", cr),
        (
            "classifier",
            GradientBoostingClassifier(solver="liblinear", fit_intercept=True),
        ),
        
    ]
)

In [None]:
exponentiated_gradient = ExponentiatedGradient(
    estimator=pipeline,
    constraints=EqualizedOdds(),
    sample_weight_name="classifier__sample_weight",
)
exponentiated_gradient.fit(X_train, y_train, sensitive_features=A_train)
print(exponentiated_gradient.predict(X_test))