In [21]:
import pandas as pd
import numpy as np

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import count, \
                              false_positive_rate, \
                              selection_rate, equalized_odds_ratio
from fairlearn.preprocessing import CorrelationRemover
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds

 
                              
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from fairlearn.metrics import (
    demographic_parity_difference,
    equalized_odds_difference,
)
from fairlearn.reductions import (  # noqa
    DemographicParity,
    EqualizedOdds,
    ExponentiatedGradient,
)

from constants import protected_attributes


# Settings
np.random.seed(0)
import warnings
warnings.filterwarnings("ignore")

In [23]:

ds_train = pd.read_csv('./../data/train.csv')
ds_test = pd.read_csv('./../data/test.csv')

# Let's specify the features and the target
y_train = ds_train["checked"]
X_train = ds_train.drop(['checked'], axis=1)
X_train = X_train.astype(np.float32)

# Let's specify the features and the target
y_test = ds_test["checked"]
X_test = ds_test.drop(['checked'], axis=1)
X_test = X_test.astype(np.float32)

In [24]:

constraint = DemographicParity()

# Define a gradient boosting classifier
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

print(X_train.shape)
print(y_train.shape)
print(len(protected_attributes))

mitigator = ExponentiatedGradient(classifier, constraint)
mitigator.fit(
    X_train, y_train, sensitive_features=X_train[protected_attributes[:2]])

(10116, 315)
(10116,)
45


In [40]:
params = mitigator.estimator.get_params()
print(params)

model = mitigator.estimator


{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 1.0, 'loss': 'log_loss', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 0, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [41]:
empty_df = pd.DataFrame({}, columns=X_train.columns)
for col in X_train.columns:
    empty_df[col] = empty_df[col].astype(X_train[col].dtypes.name)

# Result: An empty DataFrame with matching columns and data types
print(empty_df)
empty_df_y = pd.DataFrame({}, columns=['checked'])

# Result: An empty DataFrame with matching columns and data types
print(empty_df, empty_df_y)

Empty DataFrame
Columns: [adres_aantal_brp_adres, adres_aantal_verschillende_wijken, adres_aantal_verzendadres, adres_aantal_woonadres_handmatig, adres_dagen_op_adres, adres_recentst_onderdeel_rdam, adres_recentste_buurt_groot_ijsselmonde, adres_recentste_buurt_nieuwe_westen, adres_recentste_buurt_other, adres_recentste_buurt_oude_noorden, adres_recentste_buurt_vreewijk, adres_recentste_plaats_other, adres_recentste_plaats_rotterdam, adres_recentste_wijk_charlois, adres_recentste_wijk_delfshaven, adres_recentste_wijk_feijenoord, adres_recentste_wijk_ijsselmonde, adres_recentste_wijk_kralingen_c, adres_recentste_wijk_noord, adres_recentste_wijk_other, adres_recentste_wijk_prins_alexa, adres_recentste_wijk_stadscentru, adres_unieke_wijk_ratio, afspraak_aanmelding_afgesloten, afspraak_aantal_woorden, afspraak_afgelopen_jaar_afsprakenplan, afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel, afspraak_afgelopen_jaar_ontheffing, afspraak_afgelopen_j

In [42]:
model.fit(empty_df, empty_df_y)

ValueError: Found array with 0 sample(s) (shape=(0, 315)) while a minimum of 1 is required by GradientBoostingClassifier.

In [43]:
y_pred = model.predict(X_test)

AttributeError: 'GradientBoostingClassifier' object has no attribute 'estimators_'

In [None]:
X_test[protected_variables]

Unnamed: 0,persoon_geslacht_vrouw,belemmering_ind
1688,1.0,0.0
7251,1.0,0.0
5329,1.0,0.0
1697,1.0,1.0
8200,1.0,0.0
...,...,...
5646,1.0,1.0
10391,0.0,1.0
4083,0.0,0.0
4023,0.0,1.0


In [None]:

# Construct a function dictionary
my_metrics = {
    'tpr' : recall_score,
    'fpr' : false_positive_rate,
    'sel' : selection_rate,
    'count' : count
}

# Construct a MetricFrame
mf = MetricFrame(
    metrics=my_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test[protected_variables]
)
mf.overall

tpr         0.467320
fpr         0.005602
sel         0.050285
count    3162.000000
dtype: float64

In [None]:
mf.by_group


Unnamed: 0_level_0,Unnamed: 1_level_0,tpr,fpr,sel,count
persoon_geslacht_vrouw,belemmering_ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.0,0.423729,0.006525,0.043155,672.0
0.0,1.0,0.504587,0.007928,0.0625,992.0
1.0,0.0,0.3125,0.007421,0.032368,587.0
1.0,1.0,0.533333,0.001218,0.053787,911.0


In [None]:
mf.difference(method='to_overall')

tpr         0.154820
fpr         0.004384
sel         0.017917
count    2575.000000
dtype: float64

In [None]:
print(equalized_odds_ratio(y_test,
                               y_pred,
                               sensitive_features=X_test[protected_variables]))

0.15364538019836438


In [None]:
cr = CorrelationRemover(sensitive_feature_ids=protected_variables)

pipeline = Pipeline(
    steps=[
        ("preprocessor", cr),
        (
            "classifier",
            GradientBoostingClassifier(solver="liblinear", fit_intercept=True),
        ),
        
    ]
)

In [None]:
exponentiated_gradient = ExponentiatedGradient(
    estimator=pipeline,
    constraints=EqualizedOdds(),
    sample_weight_name="classifier__sample_weight",
)
exponentiated_gradient.fit(X_train, y_train, sensitive_features=A_train)
print(exponentiated_gradient.predict(X_test))