In [87]:
# Copyright (c) Fairlearn contributors.
# Licensed under the MIT License.

"""
==========================================
Passing pipelines to mitigation techniques
==========================================
"""
# %%
# This notebook shows how to pass :class:`sklearn.pipeline.Pipeline` to
# mitigation techniques from Fairlearn. Note that the notebook is not to be
# used as an example for how to assess and mitigate fairness. It is merely a
# demonstration of the technical aspects of passing
# :class:`sklearn.pipeline.Pipeline`. For more information around proper
# fairness assessment and mitigation please refer to the :ref:`user_guide`.

import json
import pandas as pd
import numpy as np

import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType


from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import *


from fairlearn.reductions import EqualizedOdds, ExponentiatedGradient
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import *

In [88]:

# %%
# Below we load the "Adult" census dataset and split its features, sensitive
# features, and labels into train and test sets.

# Let's load the dataset
data = pd.read_csv('./../data/synth_data_for_training.csv')
data.info()

protected_variables = ["persoon_geslacht_vrouw"]
output_variable = ["checked"]

# Simple preprocessing
X = pd.get_dummies(data.drop(output_variable, axis=1))
X = X.astype(np.float32)
y = data[output_variable]
A = data[protected_variables]

(X_train, X_test, y_train, y_test, A_train, A_test) = train_test_split(
    X, y, A, test_size=0.3, random_state=12345, stratify=y
)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12645 entries, 0 to 12644
Columns: 316 entries, adres_aantal_brp_adres to checked
dtypes: int64(316)
memory usage: 30.5 MB


In [89]:

# %%
# To illustrate Fairlearn's compatibility with
# :class:`~sklearn.pipeline.Pipeline` we first need to build our pipeline.
# In the following we assemble a pipeline by combining preprocessing steps
# with an estimator. The preprocessing steps include imputing, scaling for
# numerical features and one-hot encoding for categorical features.

numeric_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)


# cr = CorrelationRemover(sensitive_feature_ids=protected_variables)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # ("cr", cr),
        (
            "classifier",
            GradientBoostingClassifier(),
        ),
    ]
)



In [90]:

# %%
# Similarly, :class:`fairlearn.reductions.ExponentiatedGradient` works with
# pipelines. Since it requires the :code:`sample_weight` parameter of the
# underlying estimator internally we need to provide it with the correct
# way of passing :code:`sample_weight` to just the :code:`"classifier"` step
# using the step name followed by two underscores and :code:`sample_weight`.

exponentiated_gradient = ExponentiatedGradient(
    estimator=pipeline,
    constraints=EqualizedOdds(),
    sample_weight_name="classifier__sample_weight",
)
exponentiated_gradient.fit(X_train, y_train, sensitive_features=A_train)
y_pred = exponentiated_gradient.predict(X_test)

In [91]:
# Construct a function dictionary
my_metrics = {
    'recall_score' : recall_score,
    'false_positive_ratio' : false_positive_rate,
    'selection_rate' : selection_rate,
    'count' : count,
}

# Construct a MetricFrame
mf = MetricFrame(
    metrics=my_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test[protected_variables]
)
mf.overall

recall_score               0.444737
false_positive_ratio       0.004101
selection_rate             0.048234
count                   3794.000000
dtype: float64

In [92]:
equalized_odds_difference(y_true=y_test,
    y_pred=y_pred,
    sensitive_features=A_test)

0.04621010638297873

In [93]:
equalized_odds_ratio(y_true=y_test,
    y_pred=y_pred,
    sensitive_features=A_test)

0.5883929768127597

In [94]:
print(y_test.shape)
print(y_pred.shape)
y_pred = y_pred.reshape(y_test.shape[0], 1)
print(y_pred.shape)

false_negative_rate(y_true=y_test, y_pred=y_pred)

(3794, 1)
(3794,)
(3794, 1)


0.5552631578947368

In [95]:
params = exponentiated_gradient.get_params()

save_model = GradientBoostingClassifier(init=params)

In [99]:


# Let's train a simple model
pipeline.fit(X_train, y_train)


# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

# Let's save the model


onnx.save(onnx_model, "model/exponentiated_gradient_model_1.onnx")

RuntimeError: Unable to find column name 'adres_aantal_brp_adres' among names ['X']. Make sure the input names specified with parameter initial_types fits the column names specified in the pipeline to convert. This may happen because a ColumnTransformer follows a transformer without any mapped converter in a pipeline.