From 8cfa9de08ad9a1e92430244bdd06058287229023 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 16 May 2019 15:01:12 -0400 Subject: [PATCH 01/61] Initial sklearn-compatible datasets and metrics --- aif360/sklearn/__init__.py | 0 aif360/sklearn/datasets/__init__.py | 2 + aif360/sklearn/datasets/openml_datasets.py | 140 +++++++++++++ aif360/sklearn/datasets/utils.py | 108 +++++++++++ aif360/sklearn/metrics/__init__.py | 1 + aif360/sklearn/metrics/metrics.py | 216 +++++++++++++++++++++ docs/source/conf.py | 1 + 7 files changed, 468 insertions(+) create mode 100644 aif360/sklearn/__init__.py create mode 100644 aif360/sklearn/datasets/__init__.py create mode 100644 aif360/sklearn/datasets/openml_datasets.py create mode 100644 aif360/sklearn/datasets/utils.py create mode 100644 aif360/sklearn/metrics/__init__.py create mode 100644 aif360/sklearn/metrics/metrics.py diff --git a/aif360/sklearn/__init__.py b/aif360/sklearn/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aif360/sklearn/datasets/__init__.py b/aif360/sklearn/datasets/__init__.py new file mode 100644 index 00000000..1a5a27f0 --- /dev/null +++ b/aif360/sklearn/datasets/__init__.py @@ -0,0 +1,2 @@ +from aif360.sklearn.datasets.utils import * +from aif360.sklearn.datasets.openml_datasets import * diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py new file mode 100644 index 00000000..b902b436 --- /dev/null +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -0,0 +1,140 @@ +import os + +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_openml + +from aif360.sklearn.datasets.utils import standarize_dataset + + +# cache location +DATA_HOME = os.path.join(os.path.dirname(os.path.abspath(__file__)), + '..', 'data', 'raw') +# name -> data_id mapping +DATA_ID = {'adult': 1590, + 'german': 31, + 'bank': 1461 # TODO: this seems to be an old version +} + +def fetch_and_format_openml(name): + """Fetch openml dataset by 
name and format categorical features. + + Args: + name ({'adult', 'german', or 'bank'}): Name of OpenML dataset. Converted + to data_id using `DATA_ID` mapping. + + Returns: + pandas.DataFrame: A DataFrame containing all data, including target, + with categorical features converted to 'category' dtypes. + """ + def categorize(item): + return cats[int(item)] if not pd.isna(item) else item + + data_id = DATA_ID[name] + data = fetch_openml(data_id=data_id, data_home=DATA_HOME, target_column=None) + df = pd.DataFrame(data.data, columns=data.feature_names) + + for col, cats in data.categories.items(): + df[col] = df[col].apply(categorize).astype('category') + + return df + +def load_adult(usecols=[], dropcols=[], numeric_only=False, dropna=True): + """Load the Adult Census Income Dataset. + + Args: + usecols (single label or list-like, optional): Column name(s) to keep. + All others are dropped. + dropcols (single label or list-like, optional): Column name(s) to drop. + numeric_only (bool): Drop all non-numeric feature columns. + dropna (bool): Drop rows with NAs. + + Returns: + namedtuple: Tuple containing X, y, and sample_weights for the Adult + dataset accessible by index or name. + + Examples: + >>> adult = load_adult() + >>> adult.X.shape + (45222, 13) + + >>> adult_num = load_adult(numeric_only=True) + >>> adult_num.X.shape + (48842, 5) + + >>> privileged = adult.xs('White', level='race', drop_level=False) + >>> privileged = adult.query('race == "White"') + """ + return standarize_dataset(fetch_and_format_openml('adult'), + protected_attributes=['race', 'sex'], + target='class', pos_label='>50K', + sample_weight='fnlwgt', usecols=usecols, + dropcols=dropcols, numeric_only=numeric_only, + dropna=dropna) + +def load_german(usecols=[], dropcols=[], numeric_only=False, dropna=True): + """Load the German Credit Dataset. + + Args: + usecols (single label or list-like, optional): Column name(s) to keep. + All others are dropped. 
+ dropcols (single label or list-like, optional): Column name(s) to drop. + numeric_only (bool): Drop all non-numeric feature columns. + dropna (bool): Drop rows with NAs. + + Returns: + namedtuple: Tuple containing X and y for the German dataset accessible + by index or name. + + Examples: + >>> german = load_german() + >>> german.X.shape + (1000, 21) + + >>> german_num = load_german(numeric_only=True) + >>> german_num.X.shape + (1000, 7) + """ + df = fetch_and_format_openml('german') + # Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female' + # and all others => 'male' + personal_status = df.pop('personal_status').str.split(expand=True) + personal_status.columns = ['sex', 'marital_status'] + df = df.join(personal_status.astype('category')) + return standarize_dataset(df, protected_attributes=['sex', 'age'], + target='class', pos_label='good', + usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) + +def load_bank(usecols=[], dropcols='duration', numeric_only=False, dropna=False): + """Load the Bank Marketing Dataset. + + Args: + usecols (single label or list-like, optional): Column name(s) to keep. + All others are dropped. + dropcols (single label or list-like, optional): Column name(s) to drop. + numeric_only (bool): Drop all non-numeric feature columns. + dropna (bool): Drop rows with NAs. + + Returns: + namedtuple: Tuple containing X and y for the Bank dataset accessible by + index or name. 
+ + Examples: + >>> bank = load_bank() + >>> bank.X.shape + (45211, 15) + + >>> bank_num = load_bank(numeric_only=True) + >>> bank_num.X.shape + (45211, 6) + """ + df = fetch_and_format_openml('bank') + df.columns = ['age', 'job', 'marital', 'education', 'default', 'balance', + 'housing', 'loan', 'contact', 'day', 'month', 'duration', + 'campaign', 'pdays', 'previous', 'poutcome', 'y'] + # df = df.replace({'unknown': None}) # TODO: this messes up the categories + # df.select_dtypes('object').astype('category', inplace=True) + return standarize_dataset(df, protected_attributes=['age'], target='y', + pos_label='2', usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py new file mode 100644 index 00000000..65239813 --- /dev/null +++ b/aif360/sklearn/datasets/utils.py @@ -0,0 +1,108 @@ +from collections import namedtuple + +import numpy as np +import pandas as pd +from pandas.core.dtypes.common import is_list_like +from sklearn.compose import make_column_transformer +from sklearn.preprocessing import OneHotEncoder + +# TODO: binarize protected_attributes option? +def standarize_dataset(df, *, protected_attributes, target, pos_label=None, + sample_weight=None, usecols=[], dropcols=[], + numeric_only=False, dropna=True): + """Separate data, targets, and possibly sample weights and populate + protected attributes as sample properties. + + Args: + df (pandas.DataFrame): DataFrame with features and target together. + protected_attributes (single label or list-like): Label or list of + labels corresponding to protected attribute columns. Even if these + are dropped from the features, they remain in the index. + target (single label or list-like): Column label of the target (outcome) + variable. + pos_label (scalar, list-like, or function, optional): A value, list of + values, or function designating the positive binary label from the + raw data. 
+ sample_weight (single label, optional): Name of the column containing + sample weights. + usecols (single label or list-like, optional): Column(s) to keep. All + others are dropped. + dropcols (single label or list-like, optional): Column(s) to drop. + numeric_only (bool): Drop all non-numeric feature columns. + dropna (bool): Drop rows with NAs. + + Returns: + (X, y, [sample_weight]): + + * `pandas.DataFrame`: Feature array. + + * `pandas.DataFrame` or `pandas.Series`: Target array. + + * `pandas.Series`, optional: Sample weights. + + Note: + The order of execution for the dropping parameters is: dropcols -> + usecols -> numeric_only -> dropna. + + Examples: + >>> import pandas as pd + >>> from sklearn.linear_model import LinearRegression + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['X', 'y', 'Z']) + >>> train = standarize_dataset(df, protected_attributes='Z', target='y') + >>> reg = LinearRegression().fit(*train) + + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5))) + >>> X, y = standarize_dataset(df, protected_attributes=0, target=5) + >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) + """ + df = df.set_index(protected_attributes, drop=False) # append=True? + + y = df.pop(target) + if pos_label is not None: + if not callable(pos_label): + pos = pos_label if is_list_like(pos_label) else [pos_label] + pos = np.array(pos) + # find all instances which match any of the favorable classes + def pos_label(val): + # return np.logical_or.reduce(np.equal.outer(pos, col), axis=(0, 2)) + return np.logical_or.reduce(pos == val) + + # TODO: won't work for multilabel (target is list) case, try DataFrame.eval()? 
+ y = y.apply(pos_label).astype('int') + + # Column-wise drops + df = df.drop(dropcols, axis=1) + if usecols: + df = df[usecols] + if numeric_only: + df = df.select_dtypes(['number', 'bool']) + # upcast all feature dimensions to a consistent numerical dtype + df = df.apply(pd.to_numeric, axis=1) + # Index-wise drops + if dropna: + notna = df.notna().all(axis=1) & y.notna() + df = df.loc[notna] + y = y.loc[notna] + + if sample_weight is not None: + sample_weight = df.pop(sample_weight) + # return namedtuple('Dataset', ['X', 'y', 'sample_weight'])(df, y, sample_weight) + # TODO: is this less readable? + return namedtuple('Dataset', 'X y sample_weight')(df, y, sample_weight) + return namedtuple('Dataset', ['X', 'y'])(df, y) + +def make_onehot_transformer(X): + """Shortcut for encoding categorical features as one-hot vectors. + + Note: This changes the column order as well as removes DataFrame formatting. + + Returns: + sklearn.compose.ColumnTransformer: Class capable of transforming + categorical features in X to one-hot features. + """ + return make_column_transformer((OneHotEncoder(), X.dtypes == 'category'), + remainder='passthrough') diff --git a/aif360/sklearn/metrics/__init__.py b/aif360/sklearn/metrics/__init__.py new file mode 100644 index 00000000..ceaef288 --- /dev/null +++ b/aif360/sklearn/metrics/__init__.py @@ -0,0 +1 @@ +from aif360.sklearn.metrics.metrics import * diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py new file mode 100644 index 00000000..ced11a34 --- /dev/null +++ b/aif360/sklearn/metrics/metrics.py @@ -0,0 +1,216 @@ +from functools import partial + +import numpy as np +from sklearn.metrics import make_scorer, recall_score +from sklearn.neighbors import NearestNeighbors + + +# # ============================== VALIDATORS ==================================== +# def validate_index_match(arg1, arg2): +# """ +# Raises: +# ValueError: If arg1 and arg2 do not have equivalent indices. 
+# """ +# if not arg1.index.equals(arg2.index): +# raise ValueError("Indices must match to perform a valid comparison.") + + +# ============================= META-METRICS =================================== +def difference(func, y_true, y_pred=None, *, priv_expr): + """Compute the difference between unprivileged and privileged subsets for an + arbitrary metric. + + Note: The optimal value of a difference is 0. To make it a scorer, one must + take the absolute value and set `greater_is_better` to False. + + Unprivileged group is taken to be the inverse of the privileged group. + + Args: + func (function): A metric function from `aif360.sklearn.metrics` or + `sklearn.metrics`. Keyword args should be filled in with partial. + y_true (pandas.Series): Ground truth (correct) target values. + y_pred (array-like, optional): Estimated targets as returned by a + classifier. + priv_expr (string, keyword-only): A query expression describing the + privileged group (see `pandas.DataFrame.eval` and + `pandas.DataFrame.query` for details). + + Returns: + scalar: Difference in metric value for unprivileged and privileged groups. + + Examples: + >>> X, y = load_german(numeric_only=True) + >>> y_pred = LogisticRegression().fit(X, y).predict(X) + >>> difference(precision_score, y, y_pred, priv_expr='sex == "male"') + -0.06955430006277463 + """ + # Note: provide blank name because if index name clashes with column name, + # column name gets preference + priv = y_true.to_frame('').eval(priv_expr) + if y_pred is None: + return func(y_true[~priv]) - func(y_true[priv]) + return func(y_true[~priv], y_pred[~priv]) - func(y_true[priv], y_pred[priv]) + +def ratio(func, y_true, y_pred=None, *, priv_expr): + """Compute the ratio between unprivileged and privileged subsets for an + arbitrary metric. + + Note: The optimal value of a ratio is 1. To make it a scorer, one must + subtract 1, take the absolute value, and set `greater_is_better` to False. 
+ + Unprivileged group is taken to be the inverse of the privileged group. + + Args: + func (function): A metric function from `aif360.sklearn.metrics` or + `sklearn.metrics`. Keyword args should be filled in with partial. + y_true (pandas.Series): Ground truth (correct) target values. + y_pred (array-like, optional): Estimated targets as returned by a + classifier. + priv_expr (string, keyword-only): A query expression describing the + privileged group (see `pandas.DataFrame.eval` and + `pandas.DataFrame.query` for details). + + Returns: + scalar: Ratio of metric values for unprivileged and privileged groups. + """ + # Note: provide blank name because if index name clashes with column name, + # column name gets preference + priv = y_true.to_frame('').eval(priv_expr) + if y_pred is None: + return func(y_true[~priv]) - func(y_true[priv]) + return func(y_true[~priv], y_pred[~priv]) / func(y_true[priv], y_pred[priv]) + + +# =========================== SCORER FACTORIES ================================= +def make_difference_scorer(func): + return make_scorer(lambda y, y_pred, **kw: abs(func(y, y_pred, **kw)), + greater_is_better=False) + +def make_ratio_scorer(func): + return make_scorer(lambda y, y_pred, **kw: abs(func(y, y_pred, **kw) - 1), + greater_is_better=False) + + +# ================================ HELPERS ===================================== +def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): + """Compute the specificity or true negative rate. + + Args: + y_true: + y_pred: + neg_label (scalar, optional): The class to report. Note: the data should + be binary. 
+ """ + # neg_labels = np.setdiff1d(np.unique(np.hstack((y_true, y_pred))), + # np.array([pos_label])) + # if neg_labels.size != 2: + # raise ValueError("This function only applies to binary classification.") + return recall_score(y_true, y_pred, pos_label=neg_label, + sample_weight=sample_weight) + +def base_rate(y, y_pred=None, pos_label=1, sample_weight=None): + y = np.array(y) + if sample_weight is not None: + return ((y == pos_label) * sample_weight).sum() / sample_weight.sum() + return (y == pos_label).sum() / len(y) + +def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): + return base_rate(y_pred, pos_label=pos_label, sample_weight=sample_weight) + + +# ============================ GROUP FAIRNESS ================================== +def statistical_parity_difference(*y, priv_expr, pos_label=1, sample_weight=None): + rate = base_rate if len(y) == 1 or y[1] is None else selection_rate + rate = partial(rate, pos_label=pos_label, sample_weight=sample_weight) + return difference(rate, *y, priv_expr=priv_expr) + +def disparate_impact_ratio(*y, priv_expr, pos_label=1, sample_weight=None): + rate = base_rate if len(y) == 1 or y[1] is None else selection_rate + rate = partial(rate, pos_label=pos_label, sample_weight=sample_weight) + return ratio(rate, *y, priv_expr=priv_expr) + + +def equal_opportunity_difference(y_true, y_pred, priv_expr, pos_label=1, + sample_weight=None): + rec = partial(recall_score, pos_label=pos_label, + sample_weight=sample_weight) + return difference(rec, y_true, y_pred, priv_expr=priv_expr) + +def average_odds_difference(y_true, y_pred, priv_expr, pos_label=1, + sample_weight=None): + tnr = partial(specificity_score, labels=labels, pos_label=pos_label, + sample_weight=sample_weight) + tpr = partial(recall_score, labels=labels, pos_label=pos_label, + sample_weight=sample_weight) + return 0.5 * (difference(tnr, y_true, y_pred, priv_expr=priv_expr) + + difference(tpr, y_true, y_pred, priv_expr=priv_expr)) + +def 
average_odds_error(y_true, y_pred, priv_expr, pos_label=1, + sample_weight=None): + tnr = partial(specificity_score, pos_label=pos_label, + sample_weight=sample_weight) + tpr = partial(recall_score, pos_label=pos_label, sample_weight=sample_weight) + return 0.5 * (abs(difference(tnr, y_true, y_pred, priv_expr=priv_expr)) + + abs(difference(tpr, y_true, y_pred, priv_expr=priv_expr))) + + +# ================================ INDICES ===================================== +def generalized_entropy_index(b, alpha=2): + if alpha == 0: + return -(np.log(b / b.mean()) / b.mean()).mean() + elif alpha == 1: + # moving the b inside the log allows for 0 values + return (np.log((b / b.mean())**b) / b.mean()).mean() + else: + return ((b / b.mean())**alpha - 1).mean() / (alpha * (alpha - 1)) + +def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): + # sample_weight=None): + b = 1 + (y_pred == pos_label) - (y_true == pos_label) + return generalized_entropy_index(b, alpha=alpha) + +def between_group_generalized_entropy_error(priv_expr, y_true, y_pred, alpha=2, + pos_label=1): + b = np.empty_like(y_true, dtype='float') + priv = y_true.to_frame().eval(priv_expr) + b[priv] = (1 + (y_pred.loc[priv] == pos_label) + - (y_true.loc[priv] == pos_label)).mean() + b[~priv] = (1 + (y_pred.loc[~priv] == pos_label) + - (y_true.loc[~priv] == pos_label)).mean() + return generalized_entropy_index(b, alpha=alpha) + +def theil_index(b): + return generalized_entropy_index(b, alpha=1) + +def coefficient_of_variation(b): + return 2 * np.sqrt(generalized_entropy_index(b, alpha=2)) + + +# ========================== INDIVIDUAL FAIRNESS =============================== +# TODO: not technically a scorer but you should be allowed to score transformers +# Is consistency_difference posible? 
+def consistency_score(X, y, n_neighbors=5): + # learn a KNN on the features + nbrs = NearestNeighbors(n_neighbors, algorithm='ball_tree').fit(X) + _, indices = nbrs.kneighbors(X) + + # compute consistency score + return 1 - abs(y - y[indices].mean(axis=1)).mean() + + +# ================================ ALIASES ===================================== +def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): + """Alias of `sklearn.metrics.recall_score` for binary classes only.""" + return recall_score(y_true, y_pred, pos_label=pos_label, + sample_weight=sample_weight) + +# def false_negative_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): +# return 1 - recall_score(y_true, y_pred, pos_label=pos_label, +# sample_weight=sample_weight) + +# def false_positive_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): +# return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, +# sample_weight=sample_weight) + +mean_difference = statistical_parity_difference +mean_difference.__doc__ = """Alias of :meth:`statistical_parity_difference`.""" diff --git a/docs/source/conf.py b/docs/source/conf.py index c96ac4c0..1c302d1e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -41,6 +41,7 @@ intersphinx_mapping = {'numpy': ('http://docs.scipy.org/doc/numpy/', None), 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), + 'sklearn': ('https://scikit-learn.org/stable/modules/generated/', None), 'python': ('https://docs.python.org/{}.{}'.format(*sys.version_info), None)} autoclass_content = 'both' From 1f4ae57756be3b23808ce84f0616bb24b2b0ce6f Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 16 May 2019 17:43:44 -0400 Subject: [PATCH 02/61] added initial dataset tests --- aif360/sklearn/datasets/openml_datasets.py | 3 - aif360/sklearn/datasets/utils.py | 19 +++-- aif360/sklearn/tests/test_datasets.py | 84 ++++++++++++++++++++++ 3 files changed, 96 
insertions(+), 10 deletions(-) create mode 100644 aif360/sklearn/tests/test_datasets.py diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index b902b436..9d9986ea 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -61,9 +61,6 @@ def load_adult(usecols=[], dropcols=[], numeric_only=False, dropna=True): >>> adult_num = load_adult(numeric_only=True) >>> adult_num.X.shape (48842, 5) - - >>> privileged = adult.xs('White', level='race', drop_level=False) - >>> privileged = adult.query('race == "White"') """ return standarize_dataset(fetch_and_format_openml('adult'), protected_attributes=['race', 'sex'], diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 65239813..84d3551e 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -32,13 +32,16 @@ def standarize_dataset(df, *, protected_attributes, target, pos_label=None, dropna (bool): Drop rows with NAs. Returns: - (X, y, [sample_weight]): + namedtuple: - * `pandas.DataFrame`: Feature array. + A tuple-like object where items can be accessed by index or name. + Contains the following attributes: - * `pandas.DataFrame` or `pandas.Series`: Target array. + * `pandas.DataFrame`: X: Feature array. - * `pandas.Series`, optional: Sample weights. + * `pandas.DataFrame` or `pandas.Series`: y: Target array. + + * `pandas.Series`, optional: sample_weight: Sample weights. 
Note: The order of execution for the dropping parameters is: dropcols -> @@ -77,6 +80,9 @@ def pos_label(val): # Column-wise drops df = df.drop(dropcols, axis=1) if usecols: + if not is_list_like(usecols): + # make sure we don't return a Series instead of a DataFrame + usecols = [usecols] df = df[usecols] if numeric_only: df = df.select_dtypes(['number', 'bool']) @@ -90,9 +96,8 @@ def pos_label(val): if sample_weight is not None: sample_weight = df.pop(sample_weight) - # return namedtuple('Dataset', ['X', 'y', 'sample_weight'])(df, y, sample_weight) - # TODO: is this less readable? - return namedtuple('Dataset', 'X y sample_weight')(df, y, sample_weight) + return namedtuple('WeightedDataset', ['X', 'y', 'sample_weight'])( + df, y, sample_weight) return namedtuple('Dataset', ['X', 'y'])(df, y) def make_onehot_transformer(X): diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py new file mode 100644 index 00000000..1b997c72 --- /dev/null +++ b/aif360/sklearn/tests/test_datasets.py @@ -0,0 +1,84 @@ +from functools import partial + +import numpy as np +import pandas as pd +import pytest + +from aif360.sklearn.datasets import * + + +df = pd.DataFrame([[1, 2, 3, 'a'], [5, 6, 7, 'b'], [np.NaN, 10, 11, 'c']], + columns=['X1', 'X2', 'y', 'Z']) +basic = partial(standarize_dataset, df=df, protected_attributes='Z', target='y', + dropna=False) + +def test_standardize_dataset_basic(): + dataset = basic() + X, y = dataset + X, y = dataset.X, dataset.y + with pytest.raises(ValueError): + X, y, sample_weight = dataset + with pytest.raises(AttributeError): + dataset.sample_weight + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + assert X.index.equals(y.index) + assert X.shape == (3, 3) + +def test_sample_weight_basic(): + with_weights = basic(sample_weight='X2') + assert len(with_weights) == 3 + assert with_weights.X.shape == (3, 2) + +def test_pos_label_basic(): + assert (basic().y == [3, 7, 11]).all() + assert 
(basic(pos_label=3).y == [1, 0, 0]).all() + assert (basic(pos_label=[3, 7, 11]).y == 1).all() + assert (basic(pos_label=lambda y: 10 > y > 5).y == [0, 1, 0]).all() + +def test_usecols_dropcols_basic(): + assert basic(usecols='X1').X.columns.to_list() == ['X1'] + assert basic(usecols=['X1', 'Z']).X.columns.to_list() == ['X1', 'Z'] + + assert basic(dropcols='X1').X.columns.to_list() == ['X2', 'Z'] + assert basic(dropcols=['X1', 'Z']).X.columns.to_list() == ['X2'] + + assert basic(usecols='X1', dropcols=['X2']).X.columns.to_list() == ['X1'] + with pytest.raises(KeyError): + basic(usecols=['X1', 'X2'], dropcols='X2') + +def test_dropna_basic(): + basic_dropna = partial(standarize_dataset, df=df, protected_attributes='Z', + target='y', dropna=True) + assert basic_dropna().X.shape == (2, 3) + assert basic(dropcols='X1').X.shape == (3, 2) + +def test_numeric_only_basic(): + assert basic(numeric_only=True).X.shape == (3, 2) + assert (basic(numeric_only=True).X.dtypes == 'float').all() + assert basic(dropcols='Z', numeric_only=True).X.shape == (3, 2) + assert (basic(dropcols='X1', numeric_only=True).X.dtypes == 'int').all() + +def test_fetch_and_format_openml(): + df = fetch_and_format_openml('german') + assert df.equals(df.select_dtypes(['number', 'category'])) + +def test_load_adult(): + adult = load_adult() + assert len(adult) == 3 + assert adult.X.shape == (45222, 13) + assert load_adult(dropna=False).X.shape == (48842, 13) + assert load_adult(numeric_only=True).X.shape == (48842, 5) + +def test_load_german(): + german = load_german() + assert len(german) == 2 + assert german.X.shape == (1000, 21) + assert load_german(numeric_only=True).X.shape == (1000, 7) + +def test_load_bank(): + bank = load_bank() + assert len(bank) == 2 + assert bank.X.shape == (45211, 15) + assert load_bank(dropcols=[]).X.shape == (45211, 16) + assert load_bank(numeric_only=True).X.shape == (45211, 6) From 2aef3fca622d12384832d91bdcfe442c799d2f75 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman 
Date: Thu, 16 May 2019 20:46:30 -0400 Subject: [PATCH 03/61] fixed to_list for older pandas versions --- aif360/sklearn/tests/test_datasets.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 1b997c72..9b00e801 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -37,13 +37,13 @@ def test_pos_label_basic(): assert (basic(pos_label=lambda y: 10 > y > 5).y == [0, 1, 0]).all() def test_usecols_dropcols_basic(): - assert basic(usecols='X1').X.columns.to_list() == ['X1'] - assert basic(usecols=['X1', 'Z']).X.columns.to_list() == ['X1', 'Z'] + assert basic(usecols='X1').X.columns.tolist() == ['X1'] + assert basic(usecols=['X1', 'Z']).X.columns.tolist() == ['X1', 'Z'] - assert basic(dropcols='X1').X.columns.to_list() == ['X2', 'Z'] - assert basic(dropcols=['X1', 'Z']).X.columns.to_list() == ['X2'] + assert basic(dropcols='X1').X.columns.tolist() == ['X2', 'Z'] + assert basic(dropcols=['X1', 'Z']).X.columns.tolist() == ['X2'] - assert basic(usecols='X1', dropcols=['X2']).X.columns.to_list() == ['X1'] + assert basic(usecols='X1', dropcols=['X2']).X.columns.tolist() == ['X1'] with pytest.raises(KeyError): basic(usecols=['X1', 'X2'], dropcols='X2') From 2b1799a4bd8871f4de74c859d46dc545af9f4998 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 17 May 2019 18:41:36 -0400 Subject: [PATCH 04/61] added metrics tests --- aif360/datasets/adult_dataset.py | 2 +- aif360/sklearn/datasets/openml_datasets.py | 1 - aif360/sklearn/metrics/metrics.py | 108 +++++++++++---------- aif360/sklearn/tests/test_metrics.py | 71 ++++++++++++++ 4 files changed, 129 insertions(+), 53 deletions(-) create mode 100644 aif360/sklearn/tests/test_metrics.py diff --git a/aif360/datasets/adult_dataset.py b/aif360/datasets/adult_dataset.py index e0c432b1..ea4b73f7 100644 --- a/aif360/datasets/adult_dataset.py +++ b/aif360/datasets/adult_dataset.py @@ 
-99,7 +99,7 @@ def __init__(self, label_name='income-per-year', import sys sys.exit(1) - df = pd.concat([train, test], ignore_index=True) + df = pd.concat([test, train], ignore_index=True) super(AdultDataset, self).__init__(df=df, label_name=label_name, favorable_classes=favorable_classes, diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 9d9986ea..51646a73 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -1,6 +1,5 @@ import os -import numpy as np import pandas as pd from sklearn.datasets import fetch_openml diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index ced11a34..2a4da306 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -16,7 +16,7 @@ # ============================= META-METRICS =================================== -def difference(func, y_true, y_pred=None, *, priv_expr): +def difference(func, y, *args, priv_expr, sample_weight=None, **kwargs): """Compute the difference between unprivileged and privileged subsets for an arbitrary metric. @@ -27,13 +27,15 @@ def difference(func, y_true, y_pred=None, *, priv_expr): Args: func (function): A metric function from `aif360.sklearn.metrics` or - `sklearn.metrics`. Keyword args should be filled in with partial. - y_true (pandas.Series): Ground truth (correct) target values. - y_pred (array-like, optional): Estimated targets as returned by a - classifier. + `sklearn.metrics`. + y (pandas.Series): Outcome vector with protected attributes as index. + *args: Additional positional args to be passed through to `func`. priv_expr (string, keyword-only): A query expression describing the privileged group (see `pandas.DataFrame.eval` and `pandas.DataFrame.query` for details). + sample_weight (array-like, optional): Sample weights passed through to + `func`. + **kwargs: Additional keyword args to be passed through to `func`. 
Returns: scalar: Difference in metric value for unprivileged and privileged groups. @@ -44,14 +46,18 @@ def difference(func, y_true, y_pred=None, *, priv_expr): >>> difference(precision_score, y, y_pred, priv_expr='sex == "male"') -0.06955430006277463 """ + args = (y,) + args # Note: provide blank name because if index name clashes with column name, # column name gets preference - priv = y_true.to_frame('').eval(priv_expr) - if y_pred is None: - return func(y_true[~priv]) - func(y_true[priv]) - return func(y_true[~priv], y_pred[~priv]) - func(y_true[priv], y_pred[priv]) + idx = y.to_frame('').eval(priv_expr) + unpriv = map(lambda a: a[~idx], args) + priv = map(lambda a: a[idx], args) + if sample_weight is not None: + return (func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) + - func(*priv, sample_weight=sample_weight[idx], **kwargs)) + return func(*unpriv, **kwargs) - func(*priv, **kwargs) -def ratio(func, y_true, y_pred=None, *, priv_expr): +def ratio(func, y, *args, priv_expr, sample_weight=None, **kwargs): """Compute the ratio between unprivileged and privileged subsets for an arbitrary metric. @@ -62,23 +68,27 @@ def ratio(func, y_true, y_pred=None, *, priv_expr): Args: func (function): A metric function from `aif360.sklearn.metrics` or - `sklearn.metrics`. Keyword args should be filled in with partial. - y_true (pandas.Series): Ground truth (correct) target values. - y_pred (array-like, optional): Estimated targets as returned by a - classifier. + `sklearn.metrics`. + y (pandas.Series): Outcome vector with protected attributes as index. + *args: Additional positional args to be passed through to `func`. priv_expr (string, keyword-only): A query expression describing the privileged group (see `pandas.DataFrame.eval` and `pandas.DataFrame.query` for details). + sample_weight (array-like, optional): Sample weights passed through to + `func`. + **kwargs: Additional keyword args to be passed through to `func`. 
Returns: scalar: Ratio of metric values for unprivileged and privileged groups. """ - # Note: provide blank name because if index name clashes with column name, - # column name gets preference - priv = y_true.to_frame('').eval(priv_expr) - if y_pred is None: - return func(y_true[~priv]) - func(y_true[priv]) - return func(y_true[~priv], y_pred[~priv]) / func(y_true[priv], y_pred[priv]) + args = (y,) + args + idx = y.to_frame('').eval(priv_expr) + unpriv = map(lambda a: a[~idx], args) + priv = map(lambda a: a[idx], args) + if sample_weight is not None: + return (func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) + / func(*priv, sample_weight=sample_weight[idx], **kwargs)) + return func(*unpriv, **kwargs) / func(*priv, **kwargs) # =========================== SCORER FACTORIES ================================= @@ -109,10 +119,7 @@ def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): sample_weight=sample_weight) def base_rate(y, y_pred=None, pos_label=1, sample_weight=None): - y = np.array(y) - if sample_weight is not None: - return ((y == pos_label) * sample_weight).sum() / sample_weight.sum() - return (y == pos_label).sum() / len(y) + return np.average(y == pos_label, weights=sample_weight) def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): return base_rate(y_pred, pos_label=pos_label, sample_weight=sample_weight) @@ -121,37 +128,35 @@ def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): # ============================ GROUP FAIRNESS ================================== def statistical_parity_difference(*y, priv_expr, pos_label=1, sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - rate = partial(rate, pos_label=pos_label, sample_weight=sample_weight) - return difference(rate, *y, priv_expr=priv_expr) + return difference(rate, *y, priv_expr=priv_expr, pos_label=pos_label, + sample_weight=sample_weight) def disparate_impact_ratio(*y, priv_expr, pos_label=1, 
sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - rate = partial(rate, pos_label=pos_label, sample_weight=sample_weight) - return ratio(rate, *y, priv_expr=priv_expr) + return ratio(rate, *y, priv_expr=priv_expr, pos_label=pos_label, + sample_weight=sample_weight) def equal_opportunity_difference(y_true, y_pred, priv_expr, pos_label=1, sample_weight=None): - rec = partial(recall_score, pos_label=pos_label, - sample_weight=sample_weight) - return difference(rec, y_true, y_pred, priv_expr=priv_expr) + return difference(recall_score, y_true, y_pred, priv_expr=priv_expr, + pos_label=pos_label, sample_weight=sample_weight) -def average_odds_difference(y_true, y_pred, priv_expr, pos_label=1, +def average_odds_difference(y_true, y_pred, priv_expr, pos_label=1, neg_label=0, sample_weight=None): - tnr = partial(specificity_score, labels=labels, pos_label=pos_label, - sample_weight=sample_weight) - tpr = partial(recall_score, labels=labels, pos_label=pos_label, - sample_weight=sample_weight) - return 0.5 * (difference(tnr, y_true, y_pred, priv_expr=priv_expr) - + difference(tpr, y_true, y_pred, priv_expr=priv_expr)) - -def average_odds_error(y_true, y_pred, priv_expr, pos_label=1, + tnr_diff = difference(specificity_score, y_true, y_pred, priv_expr=priv_expr, + neg_label=neg_label, sample_weight=sample_weight) + tpr_diff = difference(recall_score, y_true, y_pred, priv_expr=priv_expr, + pos_label=pos_label, sample_weight=sample_weight) + return (tpr_diff - tnr_diff) / 2 + +def average_odds_error(y_true, y_pred, priv_expr, pos_label=1, neg_label=0, sample_weight=None): - tnr = partial(specificity_score, pos_label=pos_label, - sample_weight=sample_weight) - tpr = partial(recall_score, pos_label=pos_label, sample_weight=sample_weight) - return 0.5 * (abs(difference(tnr, y_true, y_pred, priv_expr=priv_expr)) - + abs(difference(tpr, y_true, y_pred, priv_expr=priv_expr))) + tnr_diff = difference(specificity_score, y_true, y_pred, 
priv_expr=priv_expr, + neg_label=neg_label, sample_weight=sample_weight) + tpr_diff = difference(recall_score, y_true, y_pred, priv_expr=priv_expr, + pos_label=pos_label, sample_weight=sample_weight) + return (abs(tnr_diff) + abs(tpr_diff)) / 2 # ================================ INDICES ===================================== @@ -169,14 +174,14 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): b = 1 + (y_pred == pos_label) - (y_true == pos_label) return generalized_entropy_index(b, alpha=alpha) -def between_group_generalized_entropy_error(priv_expr, y_true, y_pred, alpha=2, +def between_group_generalized_entropy_error(y_true, y_pred, priv_expr, alpha=2, pos_label=1): b = np.empty_like(y_true, dtype='float') - priv = y_true.to_frame().eval(priv_expr) - b[priv] = (1 + (y_pred.loc[priv] == pos_label) - - (y_true.loc[priv] == pos_label)).mean() - b[~priv] = (1 + (y_pred.loc[~priv] == pos_label) - - (y_true.loc[~priv] == pos_label)).mean() + priv = y_true.to_frame('').eval(priv_expr) + b[priv] = (1 + (y_pred[priv] == pos_label) + - (y_true[priv] == pos_label)).mean() + b[~priv] = (1 + (y_pred[~priv] == pos_label) + - (y_true[~priv] == pos_label)).mean() return generalized_entropy_index(b, alpha=alpha) def theil_index(b): @@ -189,6 +194,7 @@ def coefficient_of_variation(b): # ========================== INDIVIDUAL FAIRNESS =============================== # TODO: not technically a scorer but you should be allowed to score transformers # Is consistency_difference posible? +# use sample_weight? 
def consistency_score(X, y, n_neighbors=5): # learn a KNN on the features nbrs = NearestNeighbors(n_neighbors, algorithm='ball_tree').fit(X) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py new file mode 100644 index 00000000..5c263303 --- /dev/null +++ b/aif360/sklearn/tests/test_metrics.py @@ -0,0 +1,71 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression + +from aif360.datasets import AdultDataset +from aif360.sklearn.datasets import load_adult +from aif360.metrics import ClassificationMetric +from aif360.sklearn.metrics import * + + +X, y, sample_weight = load_adult(numeric_only=True) +X.insert(2, 'race', X.index.get_level_values('race').to_series(index=X.index).map( + lambda r: r == 'White').astype('float')) +X.insert(3, 'sex', X.index.get_level_values('sex').to_series(index=X.index).map( + {'Male': 1, 'Female': 0}).astype('float')) +y_pred = LogisticRegression(solver='liblinear').fit(X, y, + sample_weight=sample_weight).predict(X) +priv = 'sex == "Male"' +adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], + features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', + 'hours-per-week'], features_to_drop=[]) +adult_pred = adult.copy() +adult_pred.labels = y_pred +cm = ClassificationMetric(adult, adult_pred, + unprivileged_groups=[{'sex': 0}], + privileged_groups=[{'sex': 1}]) + +def test_dataset_equality(): + # print(X.shape, adult.features.shape) + # print(X.head()) + # print(adult.feature_names) + # print(adult.features[:5]) + assert (adult.features == X.values).all() + +def test_consistency(): + assert consistency_score(X, y) == cm.consistency() + +def test_specificity(): + assert specificity_score(y, y_pred, sample_weight=sample_weight) == cm.specificity() + +def test_selection_rate(): + assert selection_rate(y, y_pred, sample_weight=sample_weight) == cm.selection_rate() + +def test_disparate_impact(): + assert disparate_impact_ratio(y, y_pred, 
priv_expr=priv, + sample_weight=sample_weight) == cm.disparate_impact() + +def test_statistical_parity(): + assert statistical_parity_difference(y, y_pred, priv_expr=priv, + sample_weight=sample_weight) == cm.statistical_parity_difference() + +def test_equal_opportunity(): + assert equal_opportunity_difference(y, y_pred, priv_expr=priv, + sample_weight=sample_weight) == cm.equal_opportunity_difference() + +def test_average_odds_difference(): + assert np.isclose(average_odds_difference(y, y_pred, priv_expr=priv, + sample_weight=sample_weight), + cm.average_odds_difference()) + +def test_average_odds_error(): + assert np.isclose(average_odds_error(y, y_pred, priv_expr=priv, + sample_weight=sample_weight), + cm.average_abs_odds_difference()) + +def test_generalized_entropy_index(): + assert np.isclose(generalized_entropy_error(y, y_pred), + cm.generalized_entropy_index()) + +def test_between_group_generalized_entropy_index(): + assert between_group_generalized_entropy_error(y, y_pred, priv_expr=priv) \ + == cm.between_group_generalized_entropy_index() From 9da5abd7dc30106bba4742ad0ab5308d8c8b0c11 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 21 May 2019 12:02:25 -0400 Subject: [PATCH 05/61] added README and docs --- aif360/sklearn/README.md | 41 ++++++++++++++++++++++ aif360/sklearn/datasets/openml_datasets.py | 8 ++--- aif360/sklearn/datasets/utils.py | 15 ++++---- aif360/sklearn/metrics/metrics.py | 8 +++-- docs/source/conf.py | 6 ++-- docs/source/index.rst | 1 + docs/source/modules/sklearn.rst | 21 +++++++++++ 7 files changed, 83 insertions(+), 17 deletions(-) create mode 100644 aif360/sklearn/README.md create mode 100644 docs/source/modules/sklearn.rst diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md new file mode 100644 index 00000000..14a4c318 --- /dev/null +++ b/aif360/sklearn/README.md @@ -0,0 +1,41 @@ +## `aif360.sklearn` + +This is a wholly separate interface for interacting with data, viewing metrics, +and running debiasing 
algorithms than the main AIF360 package. The purpose of +this sub-package is to match scikit-learn paradigms/APIs for easier integration +in typical machine learning workflows. + +To do: + +- [x] Reformat datasets as separate X and y (and sample_weight) DataFrame +objects with sample properties (protected attributes) as the index +- [ ] Load included datasets in the above format (partially done) + - [x] Use `sklearn.datasets.fetch_openml` to load UCI datasets (#53) + - [ ] COMPAS + - [ ] MEPS +- [ ] Implement metrics as individual functions instead of instance methods +(mostly done) + - [x] Make certain metrics compatible as sklearn scorers + - [ ] Generalized confusion matrix + - [ ] Sample distortion metrics +- [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s + - [ ] Adversarial debiasing + - [ ] Meta-fair classifier + - [ ] Prejudice remover +- [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s + - [ ] Add functionality to modify X and y (worst case: just `predict()` + + `transform()` separately) + - [ ] Disparate impact remover + - [ ] Learning fair representations + - [ ] Optimized preprocessing + - [ ] Reweighing + - [ ] Use dynamic object to pass sample_weight to estimator, etc. after they + are fitted +- [ ] Make postprocessing algorithms compatible + - [ ] Allow `fit()` on `y_true`,`y_pred` + - [ ] Calibrated equalized odds postprocessing + - [ ] Equalized odds postprocessing + - [ ] Reject option classification +- [ ] Miscellaneous: + - [ ] LIME encoder + - [ ] Explainers diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 51646a73..c65f3784 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -24,7 +24,7 @@ def fetch_and_format_openml(name): Returns: pandas.DataFrame: A DataFrame containing all data, including target, - with categorical features converted to 'category' dtypes. 
+ with categorical features converted to 'category' dtypes. """ def categorize(item): return cats[int(item)] if not pd.isna(item) else item @@ -50,7 +50,7 @@ def load_adult(usecols=[], dropcols=[], numeric_only=False, dropna=True): Returns: namedtuple: Tuple containing X, y, and sample_weights for the Adult - dataset accessible by index or name. + dataset accessible by index or name. Examples: >>> adult = load_adult() @@ -80,7 +80,7 @@ def load_german(usecols=[], dropcols=[], numeric_only=False, dropna=True): Returns: namedtuple: Tuple containing X and y for the German dataset accessible - by index or name. + by index or name. Examples: >>> german = load_german() @@ -114,7 +114,7 @@ def load_bank(usecols=[], dropcols='duration', numeric_only=False, dropna=False) Returns: namedtuple: Tuple containing X and y for the Bank dataset accessible by - index or name. + index or name. Examples: >>> bank = load_bank() diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 84d3551e..3db33c11 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -7,7 +7,7 @@ from sklearn.preprocessing import OneHotEncoder # TODO: binarize protected_attributes option? -def standarize_dataset(df, *, protected_attributes, target, pos_label=None, +def standarize_dataset(df, protected_attributes, target, pos_label=None, sample_weight=None, usecols=[], dropcols=[], numeric_only=False, dropna=True): """Separate data, targets, and possibly sample weights and populate @@ -32,16 +32,16 @@ def standarize_dataset(df, *, protected_attributes, target, pos_label=None, dropna (bool): Drop rows with NAs. Returns: - namedtuple: + collections.namedtuple: A tuple-like object where items can be accessed by index or name. Contains the following attributes: - * `pandas.DataFrame`: X: Feature array. + * **X** (`pandas.DataFrame`) -- Feature array. - * `pandas.DataFrame` or `pandas.Series`: y: Target array. 
+ * **y** (`pandas.DataFrame` or `pandas.Series`) -- Target array. - * `pandas.Series`, optional: sample_weight: Sample weights. + * **sample_weight** (`pandas.Series`, optional) -- Sample weights. Note: The order of execution for the dropping parameters is: dropcols -> @@ -103,11 +103,12 @@ def pos_label(val): def make_onehot_transformer(X): """Shortcut for encoding categorical features as one-hot vectors. - Note: This changes the column order as well as removes DataFrame formatting. + Note: + This changes the column order as well as removes DataFrame formatting. Returns: sklearn.compose.ColumnTransformer: Class capable of transforming - categorical features in X to one-hot features. + categorical features in X to one-hot features. """ return make_column_transformer((OneHotEncoder(), X.dtypes == 'category'), remainder='passthrough') diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 2a4da306..4d02a310 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -206,7 +206,7 @@ def consistency_score(X, y, n_neighbors=5): # ================================ ALIASES ===================================== def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): - """Alias of `sklearn.metrics.recall_score` for binary classes only.""" + """Alias of :func:`sklearn.metrics.recall_score` for binary classes only.""" return recall_score(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight) @@ -218,5 +218,7 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): # return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, # sample_weight=sample_weight) -mean_difference = statistical_parity_difference -mean_difference.__doc__ = """Alias of :meth:`statistical_parity_difference`.""" +def mean_difference(*y, priv_expr, pos_label=1, sample_weight=None): + """Alias of :func:`statistical_parity_difference`.""" + return statistical_parity_difference(*y, 
priv_expr=priv_expr, + pos_label=pos_label, sample_weight=sample_weight) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1c302d1e..66493140 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -38,10 +38,10 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax'] -intersphinx_mapping = {'numpy': ('http://docs.scipy.org/doc/numpy/', None), - 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), +intersphinx_mapping = {'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'sklearn': ('https://scikit-learn.org/stable/modules/generated/', None), + 'sklearn': ('https://scikit-learn.org/stable/', None), 'python': ('https://docs.python.org/{}.{}'.format(*sys.version_info), None)} autoclass_content = 'both' diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b780a61..37ba7078 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,6 +14,7 @@ Welcome to AI Fairness 360's documentation! modules/datasets modules/explainers modules/metrics + modules/sklearn Indices and tables diff --git a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst new file mode 100644 index 00000000..e6234ddf --- /dev/null +++ b/docs/source/modules/sklearn.rst @@ -0,0 +1,21 @@ +:mod:`aif360.sklearn` +===================== + +.. automodule:: aif360.sklearn + +Datasets +-------- + +.. automodule:: aif360.sklearn.datasets.utils + :members: + +.. automodule:: aif360.sklearn.datasets.openml_datasets + :members: + +Metrics +------- + +.. automodule:: aif360.sklearn.metrics.metrics + :members: + +.. 
autofunction:: aif360.sklearn.metrics.mean_difference From 025ecc168f1481942718a16c7ee6803b1b0a6f5c Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 23 May 2019 15:16:01 -0400 Subject: [PATCH 06/61] simpler dataset loading and 'groups' for metrics * dataset loading is more similar to sklearn.datasets * label binarization is now done outside standardize_dataset * metrics use 'groups' and 'priv_group' to signify priv/unpriv split --- aif360/sklearn/README.md | 5 +- aif360/sklearn/datasets/openml_datasets.py | 160 +++++++++++++++------ aif360/sklearn/datasets/utils.py | 41 +++--- aif360/sklearn/metrics/metrics.py | 127 ++++++++-------- aif360/sklearn/tests/test_metrics.py | 1 + docs/source/modules/sklearn.rst | 2 - requirements.txt | 2 +- 7 files changed, 198 insertions(+), 140 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 14a4c318..c5bd0d8c 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -9,13 +9,14 @@ To do: - [x] Reformat datasets as separate X and y (and sample_weight) DataFrame objects with sample properties (protected attributes) as the index -- [ ] Load included datasets in the above format (partially done) +- [ ] Load included datasets in the above format - [x] Use `sklearn.datasets.fetch_openml` to load UCI datasets (#53) - [ ] COMPAS - [ ] MEPS - [ ] Implement metrics as individual functions instead of instance methods -(mostly done) - [x] Make certain metrics compatible as sklearn scorers + - [ ] Use "groups" and "priv_group" keywords to specify protected attributes to + functions (partially done) - [ ] Generalized confusion matrix - [ ] Sample distortion metrics - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index c65f3784..6da3838c 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -7,20 +7,17 @@ # cache 
location -DATA_HOME = os.path.join(os.path.dirname(os.path.abspath(__file__)), - '..', 'data', 'raw') -# name -> data_id mapping -DATA_ID = {'adult': 1590, - 'german': 31, - 'bank': 1461 # TODO: this seems to be an old version -} +DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)), + '..', 'data', 'raw') -def fetch_and_format_openml(name): - """Fetch openml dataset by name and format categorical features. +def to_dataframe(data): + """Format an OpenML dataset Bunch as a DataFrame with categorical features + if needed. Args: - name ({'adult', 'german', or 'bank'}): Name of OpenML dataset. Converted - to data_id using `DATA_ID` mapping. + data (Bunch): Dict-like object containing `data`, `feature_names` and, + optionally, `categories` attributes. Note: `data` should contain + both X and y data. Returns: pandas.DataFrame: A DataFrame containing all data, including target, @@ -29,22 +26,33 @@ def fetch_and_format_openml(name): def categorize(item): return cats[int(item)] if not pd.isna(item) else item - data_id = DATA_ID[name] - data = fetch_openml(data_id=data_id, data_home=DATA_HOME, target_column=None) - df = pd.DataFrame(data.data, columns=data.feature_names) - - for col, cats in data.categories.items(): + df = pd.DataFrame(data['data'], columns=data['feature_names']) + for col, cats in data['categories'].items(): df[col] = df[col].apply(categorize).astype('category') return df -def load_adult(usecols=[], dropcols=[], numeric_only=False, dropna=True): +def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], + dropcols=[], numeric_only=False, dropna=True): """Load the Adult Census Income Dataset. + Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). + The other protected attribute is 'sex' ('Male' is privileged and 'Female' is + unprivileged). The outcome variable is '>50K' (favorable) or '<=50K' + (unfavorable). + Args: - usecols (single label or list-like, optional): Column name(s) to keep. 
- All others are dropped. - dropcols (single label or list-like, optional): Column name(s) to drop. + subset ({'train', 'test', or 'all'}, optional): Select the dataset to + load: 'train' for the training set, 'test' for the test set, 'all' + for both. + data_home (string, optional): Specify another download and cache folder + for the datasets. By default all AIF360 datasets are stored in + 'aif360/sklearn/data/raw' subfolders. + binary_race (bool, optional): Group all non-white races together. + usecols (single label or list-like, optional): Feature column(s) to + keep. All others are dropped. + dropcols (single label or list-like, optional): Feature column(s) to + drop. numeric_only (bool): Drop all non-numeric feature columns. dropna (bool): Drop rows with NAs. @@ -53,25 +61,57 @@ def load_adult(usecols=[], dropcols=[], numeric_only=False, dropna=True): dataset accessible by index or name. Examples: - >>> adult = load_adult() + >>> adult = fetch_adult() >>> adult.X.shape (45222, 13) - >>> adult_num = load_adult(numeric_only=True) + >>> adult_num = fetch_adult(numeric_only=True) >>> adult_num.X.shape (48842, 5) """ - return standarize_dataset(fetch_and_format_openml('adult'), - protected_attributes=['race', 'sex'], - target='class', pos_label='>50K', - sample_weight='fnlwgt', usecols=usecols, - dropcols=dropcols, numeric_only=numeric_only, - dropna=dropna) + if subset not in {'train', 'test', 'all'}: + raise ValueError("subset must be either 'train', 'test', or 'all'; " + "cannot be {}".format(subset)) + df = to_dataframe(fetch_openml(data_id=1590, data_home=data_home or + DATA_HOME_DEFAULT, target_column=None)) + if subset == 'train': + df = df.iloc[16281:] + elif subset == 'test': + df = df.iloc[:16281] + + df.class = df.class.cat.as_ordered() # '<=50K' < '>50K' + df = df.rename(columns={'class': 'annual-income'}) # more descriptive name + + # binarize protected attributes + if binary_race: + df.race = df.race.cat.set_categories(['Non-white', 'White'], + 
ordered=True).fillna('Non-white') + df.sex = df.sex.cat.as_ordered() # 'Female' < 'Male' + + return standarize_dataset(df, protected_attributes=['race', 'sex'], + target='annual-income', sample_weight='fnlwgt', + usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) -def load_german(usecols=[], dropcols=[], numeric_only=False, dropna=True): +def fetch_german(data_home=None, usecols=[], dropcols=[], numeric_only=False, + dropna=True): """Load the German Credit Dataset. + Protected attributes are 'sex' ('male' is privileged and 'female' is + unprivileged) and 'age' (left as continuous but [1]_ recommends `age >= 25` + be considered privileged and `age < 25` be considered unprivileged; this can + be done at metric evaluation time). The outcome variable is 'good' + (favorable) or 'bad' (unfavorable). + + References: + .. [1] F. Kamiran and T. Calders, "Classifying without + discriminating," 2nd International Conference on Computer, + Control and Communication, 2009. + Args: + data_home (string, optional): Specify another download and cache folder + for the datasets. By default all AIF360 datasets are stored in + 'aif360/sklearn/data/raw' subfolders. usecols (single label or list-like, optional): Column name(s) to keep. All others are dropped. dropcols (single label or list-like, optional): Column name(s) to drop. @@ -83,54 +123,84 @@ def load_german(usecols=[], dropcols=[], numeric_only=False, dropna=True): by index or name. 
Examples: - >>> german = load_german() + >>> german = fetch_german() >>> german.X.shape (1000, 21) - >>> german_num = load_german(numeric_only=True) + >>> german_num = fetch_german(numeric_only=True) >>> german_num.X.shape (1000, 7) + + >>> DISPARATE IMPACT AGE EXAMPLE HERE """ - df = fetch_and_format_openml('german') + df = to_dataframe(fetch_openml(data_id=31, data_home=data_home or + DATA_HOME_DEFAULT, target_column=None)) + + df.class = df.class.cat.as_ordered() # 'bad' < 'good' + df = df.rename(columns={'class': 'credit-risk'}) # more descriptive name + # Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female' # and all others => 'male' personal_status = df.pop('personal_status').str.split(expand=True) personal_status.columns = ['sex', 'marital_status'] df = df.join(personal_status.astype('category')) + df.sex = df.sex.cat.as_ordered() # 'female' < 'male' + return standarize_dataset(df, protected_attributes=['sex', 'age'], - target='class', pos_label='good', - usecols=usecols, dropcols=dropcols, - numeric_only=numeric_only, dropna=dropna) + target='credit-risk', usecols=usecols, + dropcols=dropcols, numeric_only=numeric_only, + dropna=dropna) -def load_bank(usecols=[], dropcols='duration', numeric_only=False, dropna=False): +def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', + numeric_only=False, dropna=False): """Load the Bank Marketing Dataset. + The protected attribute is 'age' (left as continuous). The outcome variable + is 'yes' or 'no'. TODO: which is favorable? + Args: + data_home (string, optional): Specify another download and cache folder + for the datasets. By default all AIF360 datasets are stored in + 'aif360/sklearn/data/raw' subfolders. + percent10 (bool, optional): Download the reduced version (10% of data). usecols (single label or list-like, optional): Column name(s) to keep. All others are dropped. dropcols (single label or list-like, optional): Column name(s) to drop. 
numeric_only (bool): Drop all non-numeric feature columns. - dropna (bool): Drop rows with NAs. + dropna (bool): Drop rows with NAs. Note: this is False by default for + this dataset. Returns: namedtuple: Tuple containing X and y for the Bank dataset accessible by index or name. Examples: - >>> bank = load_bank() + >>> bank = fetch_bank() >>> bank.X.shape (45211, 15) - >>> bank_num = load_bank(numeric_only=True) + >>> bank_nona = fetch_bank(dropna=True) + >>> bank_nona.X.shape + (7842, 15) + + >>> bank_num = fetch_bank(numeric_only=True) >>> bank_num.X.shape (45211, 6) """ - df = fetch_and_format_openml('bank') + # TODO: this seems to be an old version + df = to_dataframe(fetch_openml(data_id=1558 if percent10 else 1461, + data_home=data_home or DATA_HOME_DEFAULT, + target_column=None)) df.columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', - 'campaign', 'pdays', 'previous', 'poutcome', 'y'] - # df = df.replace({'unknown': None}) # TODO: this messes up the categories - # df.select_dtypes('object').astype('category', inplace=True) - return standarize_dataset(df, protected_attributes=['age'], target='y', - pos_label='2', usecols=usecols, dropcols=dropcols, + 'campaign', 'pdays', 'previous', 'poutcome', 'deposit'] + # remap target + df.deposit = df.deposit.cat.rename_categories({'1': 'no', '2': 'yes'}) + # df.deposit = df.deposit.cat.as_ordered() + # replace 'unknown' marker with NaN + df.select_dtypes('category').apply(lambda s: s.cat.remove_categories('unknown') + if 'unknown' in s.cat.categories else s, + inplace=True) + return standarize_dataset(df, protected_attributes='age', target='deposit', + usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 3db33c11..60d61e37 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -1,15 +1,12 @@ from 
collections import namedtuple -import numpy as np import pandas as pd from pandas.core.dtypes.common import is_list_like from sklearn.compose import make_column_transformer from sklearn.preprocessing import OneHotEncoder -# TODO: binarize protected_attributes option? -def standarize_dataset(df, protected_attributes, target, pos_label=None, - sample_weight=None, usecols=[], dropcols=[], - numeric_only=False, dropna=True): +def standarize_dataset(df, protected_attributes, target, sample_weight=None, + usecols=[], dropcols=[], numeric_only=False, dropna=True): """Separate data, targets, and possibly sample weights and populate protected attributes as sample properties. @@ -20,9 +17,11 @@ def standarize_dataset(df, protected_attributes, target, pos_label=None, are dropped from the features, they remain in the index. target (single label or list-like): Column label of the target (outcome) variable. - pos_label (scalar, list-like, or function, optional): A value, list of - values, or function designating the positive binary label from the - raw data. + # pos_label (scalar, list-like, or function, optional): A value, list of + # values, or boolean function (True if positive) designating the + # positive binary label from the raw data. All others will be + # considered negative. The resulting target array will have value 1 if + # positive and 0 if negative. sample_weight (single label, optional): Name of the column containing sample weights. usecols (single label or list-like, optional): Column(s) to keep. All @@ -62,20 +61,17 @@ def standarize_dataset(df, protected_attributes, target, pos_label=None, >>> X, y = standarize_dataset(df, protected_attributes=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ - df = df.set_index(protected_attributes, drop=False) # append=True? + df = df.set_index(protected_attributes, drop=False) # TODO: append=True? + # TODO: convert to 1/0 if numeric_only? 
y = df.pop(target) - if pos_label is not None: - if not callable(pos_label): - pos = pos_label if is_list_like(pos_label) else [pos_label] - pos = np.array(pos) - # find all instances which match any of the favorable classes - def pos_label(val): - # return np.logical_or.reduce(np.equal.outer(pos, col), axis=(0, 2)) - return np.logical_or.reduce(pos == val) - - # TODO: won't work for multilabel (target is list) case, try DataFrame.eval()? - y = y.apply(pos_label).astype('int') + # if not callable(pos_label): + # if not is_list_like(pos_label): + # pos_label = [pos_label] + # # find all instances which match any of the favorable classes + # y = y.isin(pos_label).astype('int') + # else: + # y = y.apply(pos_label).astype('int') # Column-wise drops df = df.drop(dropcols, axis=1) @@ -85,6 +81,11 @@ def pos_label(val): usecols = [usecols] df = df[usecols] if numeric_only: + # binary categorical columns -> 1/0 + for col in df.select_dtypes('category'): + # TODO: allow any size ordered categorical? + if len(df[col].cat.categories) == 2 and df[col].cat.ordered: + df[col] = df[col].cat.factorize(sort=True)[0] df = df.select_dtypes(['number', 'bool']) # upcast all feature dimensions to a consistent numerical dtype df = df.apply(pd.to_numeric, axis=1) diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 4d02a310..79c91c19 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -1,22 +1,10 @@ -from functools import partial - import numpy as np from sklearn.metrics import make_scorer, recall_score from sklearn.neighbors import NearestNeighbors -# # ============================== VALIDATORS ==================================== -# def validate_index_match(arg1, arg2): -# """ -# Raises: -# ValueError: If arg1 and arg2 do not have equivalent indices. 
-# """ -# if not arg1.index.equals(arg2.index): -# raise ValueError("Indices must match to perform a valid comparison.") - - # ============================= META-METRICS =================================== -def difference(func, y, *args, priv_expr, sample_weight=None, **kwargs): +def difference(func, y, *args, groups, priv_group=1, sample_weight=None, **kwargs): """Compute the difference between unprivileged and privileged subsets for an arbitrary metric. @@ -30,9 +18,9 @@ def difference(func, y, *args, priv_expr, sample_weight=None, **kwargs): `sklearn.metrics`. y (pandas.Series): Outcome vector with protected attributes as index. *args: Additional positional args to be passed through to `func`. - priv_expr (string, keyword-only): A query expression describing the - privileged group (see `pandas.DataFrame.eval` and - `pandas.DataFrame.query` for details). + groups (array-like, keyword-only): Group labels (protected attributes) + for the samples. + priv_group (scalar, optional): Label value for the privileged group. sample_weight (array-like, optional): Sample weights passed through to `func`. **kwargs: Additional keyword args to be passed through to `func`. 
@@ -43,21 +31,19 @@ def difference(func, y, *args, priv_expr, sample_weight=None, **kwargs): Examples: >>> X, y = load_german(numeric_only=True) >>> y_pred = LogisticRegression().fit(X, y).predict(X) - >>> difference(precision_score, y, y_pred, priv_expr='sex == "male"') + >>> sex = X.index.get_level_values('sex') + >>> difference(precision_score, y, y_pred, groups=sex, priv_group='male') -0.06955430006277463 """ - args = (y,) + args - # Note: provide blank name because if index name clashes with column name, - # column name gets preference - idx = y.to_frame('').eval(priv_expr) - unpriv = map(lambda a: a[~idx], args) - priv = map(lambda a: a[idx], args) + idx = (groups == priv_group) + unpriv = map(lambda a: a[~idx], (y,) + args) + priv = map(lambda a: a[idx], (y,) + args) if sample_weight is not None: return (func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) - func(*priv, sample_weight=sample_weight[idx], **kwargs)) return func(*unpriv, **kwargs) - func(*priv, **kwargs) -def ratio(func, y, *args, priv_expr, sample_weight=None, **kwargs): +def ratio(func, y, *args, groups, priv_group=1, sample_weight=None, **kwargs): """Compute the ratio between unprivileged and privileged subsets for an arbitrary metric. @@ -71,9 +57,9 @@ def ratio(func, y, *args, priv_expr, sample_weight=None, **kwargs): `sklearn.metrics`. y (pandas.Series): Outcome vector with protected attributes as index. *args: Additional positional args to be passed through to `func`. - priv_expr (string, keyword-only): A query expression describing the - privileged group (see `pandas.DataFrame.eval` and - `pandas.DataFrame.query` for details). + groups (array-like, keyword-only): Group labels (protected attributes) + for the samples. + priv_group (scalar, optional): Label value for the privileged group. sample_weight (array-like, optional): Sample weights passed through to `func`. **kwargs: Additional keyword args to be passed through to `func`. 
@@ -81,10 +67,9 @@ def ratio(func, y, *args, priv_expr, sample_weight=None, **kwargs): Returns: scalar: Ratio of metric values for unprivileged and privileged groups. """ - args = (y,) + args - idx = y.to_frame('').eval(priv_expr) - unpriv = map(lambda a: a[~idx], args) - priv = map(lambda a: a[idx], args) + idx = (groups == priv_group) + unpriv = map(lambda a: a[~idx], (y,) + args) + priv = map(lambda a: a[idx], (y,) + args) if sample_weight is not None: return (func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) / func(*priv, sample_weight=sample_weight[idx], **kwargs)) @@ -106,15 +91,11 @@ def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): """Compute the specificity or true negative rate. Args: - y_true: - y_pred: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. neg_label (scalar, optional): The class to report. Note: the data should be binary. """ - # neg_labels = np.setdiff1d(np.unique(np.hstack((y_true, y_pred))), - # np.array([pos_label])) - # if neg_labels.size != 2: - # raise ValueError("This function only applies to binary classification.") return recall_score(y_true, y_pred, pos_label=neg_label, sample_weight=sample_weight) @@ -126,40 +107,46 @@ def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): # ============================ GROUP FAIRNESS ================================== -def statistical_parity_difference(*y, priv_expr, pos_label=1, sample_weight=None): +def statistical_parity_difference(*y, groups, priv_group=1, pos_label=1, + sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - return difference(rate, *y, priv_expr=priv_expr, pos_label=pos_label, - sample_weight=sample_weight) + return difference(rate, *y, groups=groups, priv_group=priv_group, + pos_label=pos_label, sample_weight=sample_weight) -def disparate_impact_ratio(*y, priv_expr, pos_label=1, sample_weight=None): +def 
disparate_impact_ratio(*y, groups, priv_group=1, pos_label=1, + sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - return ratio(rate, *y, priv_expr=priv_expr, pos_label=pos_label, - sample_weight=sample_weight) + return ratio(rate, *y, groups=groups, priv_group=priv_group, + pos_label=pos_label, sample_weight=sample_weight) +def equal_opportunity_difference(y_true, y_pred, groups, priv_group=1, + pos_label=1, sample_weight=None): + return difference(recall_score, y_true, y_pred, groups=groups, + priv_group=priv_group, pos_label=pos_label, + sample_weight=sample_weight) -def equal_opportunity_difference(y_true, y_pred, priv_expr, pos_label=1, - sample_weight=None): - return difference(recall_score, y_true, y_pred, priv_expr=priv_expr, - pos_label=pos_label, sample_weight=sample_weight) - -def average_odds_difference(y_true, y_pred, priv_expr, pos_label=1, neg_label=0, - sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, priv_expr=priv_expr, - neg_label=neg_label, sample_weight=sample_weight) - tpr_diff = difference(recall_score, y_true, y_pred, priv_expr=priv_expr, - pos_label=pos_label, sample_weight=sample_weight) +def average_odds_difference(y_true, y_pred, groups, priv_group=1, pos_label=1, + neg_label=0, sample_weight=None): + tnr_diff = difference(specificity_score, y_true, y_pred, groups=groups, + priv_group=priv_group, neg_label=neg_label, + sample_weight=sample_weight) + tpr_diff = difference(recall_score, y_true, y_pred, groups=groups, + priv_group=priv_group, pos_label=pos_label, + sample_weight=sample_weight) return (tpr_diff - tnr_diff) / 2 -def average_odds_error(y_true, y_pred, priv_expr, pos_label=1, neg_label=0, - sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, priv_expr=priv_expr, - neg_label=neg_label, sample_weight=sample_weight) - tpr_diff = difference(recall_score, y_true, y_pred, priv_expr=priv_expr, - pos_label=pos_label, 
sample_weight=sample_weight) +def average_odds_error(y_true, y_pred, groups, priv_group=1, pos_label=1, + neg_label=0, sample_weight=None): + tnr_diff = difference(specificity_score, y_true, y_pred, groups=groups, + priv_group=priv_group, neg_label=neg_label, + sample_weight=sample_weight) + tpr_diff = difference(recall_score, y_true, y_pred, groups=groups, + priv_group=priv_group, pos_label=pos_label, + sample_weight=sample_weight) return (abs(tnr_diff) + abs(tpr_diff)) / 2 -# ================================ INDICES ===================================== +# ========================== INDIVIDUAL FAIRNESS =============================== def generalized_entropy_index(b, alpha=2): if alpha == 0: return -(np.log(b / b.mean()) / b.mean()).mean() @@ -174,14 +161,15 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): b = 1 + (y_pred == pos_label) - (y_true == pos_label) return generalized_entropy_index(b, alpha=alpha) -def between_group_generalized_entropy_error(y_true, y_pred, priv_expr, alpha=2, +def between_group_generalized_entropy_error(y_true, y_pred, groups, + priv_group=None, alpha=2, pos_label=1): b = np.empty_like(y_true, dtype='float') - priv = y_true.to_frame('').eval(priv_expr) - b[priv] = (1 + (y_pred[priv] == pos_label) - - (y_true[priv] == pos_label)).mean() - b[~priv] = (1 + (y_pred[~priv] == pos_label) - - (y_true[~priv] == pos_label)).mean() + if priv_group is not None: + groups = [1 if g == priv_group else 0 for g in groups] + for g in np.unique(groups): + b[groups == g] = (1 + (y_pred[groups == g] == pos_label) + - (y_true[groups == g] == pos_label)).mean() return generalized_entropy_index(b, alpha=alpha) def theil_index(b): @@ -191,7 +179,6 @@ def coefficient_of_variation(b): return 2 * np.sqrt(generalized_entropy_index(b, alpha=2)) -# ========================== INDIVIDUAL FAIRNESS =============================== # TODO: not technically a scorer but you should be allowed to score transformers # Is consistency_difference posible? 
# use sample_weight? @@ -218,7 +205,7 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): # return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, # sample_weight=sample_weight) -def mean_difference(*y, priv_expr, pos_label=1, sample_weight=None): +def mean_difference(*y, groups, priv_group=1, pos_label=1, sample_weight=None): """Alias of :func:`statistical_parity_difference`.""" - return statistical_parity_difference(*y, priv_expr=priv_expr, + return statistical_parity_difference(*y, groups=groups, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index 5c263303..ed3636f8 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -30,6 +30,7 @@ def test_dataset_equality(): # print(adult.feature_names) # print(adult.features[:5]) assert (adult.features == X.values).all() + assert (adult.labels == y.values).all() def test_consistency(): assert consistency_score(X, y) == cm.consistency() diff --git a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst index e6234ddf..273f8256 100644 --- a/docs/source/modules/sklearn.rst +++ b/docs/source/modules/sklearn.rst @@ -17,5 +17,3 @@ Metrics .. automodule:: aif360.sklearn.metrics.metrics :members: - -.. 
autofunction:: aif360.sklearn.metrics.mean_difference diff --git a/requirements.txt b/requirements.txt index 76daec2e..767db283 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ ipywidgets tqdm numpy>=1.16 matplotlib -pandas>=0.23.3 +pandas>=0.24 pytest>=3.5.0 scipy scikit-learn From 8e96177c887f9f56dcc7f06625c262c10db6d226 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 5 Jun 2019 11:11:47 -0400 Subject: [PATCH 07/61] fixes to categoricals --- aif360/sklearn/datasets/openml_datasets.py | 4 ++-- aif360/sklearn/datasets/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 6da3838c..d6082840 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -79,8 +79,8 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], elif subset == 'test': df = df.iloc[:16281] - df.class = df.class.cat.as_ordered() # '<=50K' < '>50K' df = df.rename(columns={'class': 'annual-income'}) # more descriptive name + df['annual-income'] = df['annual-income'].cat.as_ordered() # '<=50K' < '>50K' # binarize protected attributes if binary_race: @@ -136,8 +136,8 @@ def fetch_german(data_home=None, usecols=[], dropcols=[], numeric_only=False, df = to_dataframe(fetch_openml(data_id=31, data_home=data_home or DATA_HOME_DEFAULT, target_column=None)) - df.class = df.class.cat.as_ordered() # 'bad' < 'good' df = df.rename(columns={'class': 'credit-risk'}) # more descriptive name + df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good' # Note: marital_status directly implies sex. i.e. 
'div/dep/mar' => 'female' # and all others => 'male' diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 60d61e37..b5fff624 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -85,7 +85,7 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, for col in df.select_dtypes('category'): # TODO: allow any size ordered categorical? if len(df[col].cat.categories) == 2 and df[col].cat.ordered: - df[col] = df[col].cat.factorize(sort=True)[0] + df[col] = df[col].factorize(sort=True)[0] df = df.select_dtypes(['number', 'bool']) # upcast all feature dimensions to a consistent numerical dtype df = df.apply(pd.to_numeric, axis=1) From 8abb897dfd41e8d6c406da333edf147fca4e4888 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 5 Jun 2019 16:03:10 -0400 Subject: [PATCH 08/61] fixes for tests, updated README --- .travis.yml | 4 +- aif360/sklearn/README.md | 7 +- aif360/sklearn/datasets/openml_datasets.py | 6 +- aif360/sklearn/examples/Getting Started.ipynb | 468 ++++++++++++++++++ aif360/sklearn/metrics/metrics.py | 2 +- aif360/sklearn/tests/test_datasets.py | 38 +- aif360/sklearn/tests/test_metrics.py | 27 +- 7 files changed, 510 insertions(+), 42 deletions(-) create mode 100644 aif360/sklearn/examples/Getting Started.ipynb diff --git a/.travis.yml b/.travis.yml index 9aa44262..fdfa087e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ env: branches: only: - - master + - sklearn-compat install: - pip install -r requirements.txt @@ -28,4 +28,4 @@ before_script: script: # stop the build if there are Python syntax errors or undefined names - flake8 . 
--count --select=E901,E999,F821,F822,F823 --show-source --statistics - - travis_wait pytest tests + - travis_wait python -m pytest aif360/sklearn/tests diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index c5bd0d8c..fbaf9adc 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -5,6 +5,9 @@ and running debiasing algorithms than the main AIF360 package. The purpose of this sub-package is to match scikit-learn paradigms/APIs for easier integration in typical machine learning workflows. +See [Getting Started](examples/Getting%20Started.ipynb) to see `aif360.sklearn` +in action. + To do: - [x] Reformat datasets as separate X and y (and sample_weight) DataFrame @@ -15,8 +18,8 @@ objects with sample properties (protected attributes) as the index - [ ] MEPS - [ ] Implement metrics as individual functions instead of instance methods - [x] Make certain metrics compatible as sklearn scorers - - [ ] Use "groups" and "priv_group" keywords to specify protected attributes to - functions (partially done) + - [x] Use "groups" and "priv_group" keywords to specify protected attributes to + functions - [ ] Generalized confusion matrix - [ ] Sample distortion metrics - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index d6082840..ac8c32d1 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -198,9 +198,9 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', df.deposit = df.deposit.cat.rename_categories({'1': 'no', '2': 'yes'}) # df.deposit = df.deposit.cat.as_ordered() # replace 'unknown' marker with NaN - df.select_dtypes('category').apply(lambda s: s.cat.remove_categories('unknown') - if 'unknown' in s.cat.categories else s, - inplace=True) + df.select_dtypes('category').apply(lambda s: + s.cat.remove_categories('unknown', inplace=True) + if 
'unknown' in s.cat.categories else s) return standarize_dataset(df, protected_attributes='age', target='deposit', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) diff --git a/aif360/sklearn/examples/Getting Started.ipynb b/aif360/sklearn/examples/Getting Started.ipynb new file mode 100644 index 00000000..58031a8d --- /dev/null +++ b/aif360/sklearn/examples/Getting Started.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import recall_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from aif360.sklearn.datasets import fetch_adult\n", + "from aif360.sklearn.metrics import disparate_impact_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Datasets are formatted as separate `X` (# samples x # features) and `y` (# samples x # labels) DataFrames. The index of each DataFrame contains protected attribute values per sample. Datasets may also load a `sample_weight` object to be used with certain algorithms/metrics. All of this makes it so that aif360 is compatible with scikit-learn objects.\n", + "\n", + "For example, we can easily load the Adult dataset from UCI with the following line:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageworkclasseducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
racesex
Non-whiteMale25.0Private11th7.0Never-marriedMachine-op-inspctOwn-childNon-whiteMale0.00.040.0United-States
WhiteMale38.0PrivateHS-grad9.0Married-civ-spouseFarming-fishingHusbandWhiteMale0.00.050.0United-States
Male28.0Local-govAssoc-acdm12.0Married-civ-spouseProtective-servHusbandWhiteMale0.00.040.0United-States
Non-whiteMale44.0PrivateSome-college10.0Married-civ-spouseMachine-op-inspctHusbandNon-whiteMale7688.00.040.0United-States
WhiteMale34.0Private10th6.0Never-marriedOther-serviceNot-in-familyWhiteMale0.00.030.0United-States
\n", + "
" + ], + "text/plain": [ + " age workclass education education-num \\\n", + "race sex \n", + "Non-white Male 25.0 Private 11th 7.0 \n", + "White Male 38.0 Private HS-grad 9.0 \n", + " Male 28.0 Local-gov Assoc-acdm 12.0 \n", + "Non-white Male 44.0 Private Some-college 10.0 \n", + "White Male 34.0 Private 10th 6.0 \n", + "\n", + " marital-status occupation relationship \\\n", + "race sex \n", + "Non-white Male Never-married Machine-op-inspct Own-child \n", + "White Male Married-civ-spouse Farming-fishing Husband \n", + " Male Married-civ-spouse Protective-serv Husband \n", + "Non-white Male Married-civ-spouse Machine-op-inspct Husband \n", + "White Male Never-married Other-service Not-in-family \n", + "\n", + " race sex capital-gain capital-loss hours-per-week \\\n", + "race sex \n", + "Non-white Male Non-white Male 0.0 0.0 40.0 \n", + "White Male White Male 0.0 0.0 50.0 \n", + " Male White Male 0.0 0.0 40.0 \n", + "Non-white Male Non-white Male 7688.0 0.0 40.0 \n", + "White Male White Male 0.0 0.0 30.0 \n", + "\n", + " native-country \n", + "race sex \n", + "Non-white Male United-States \n", + "White Male United-States \n", + " Male United-States \n", + "Non-white Male United-States \n", + "White Male United-States " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y, sample_weight = fetch_adult()\n", + "X.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also easily load a version of the dataset which only contains numeric or binary columns and split it with scikit-learn:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageeducation-numracesexcapital-gaincapital-losshours-per-week
racesex
Non-whiteFemale18.07.00.00.00.00.020.0
WhiteMale55.09.01.01.00.00.040.0
Female43.09.01.00.00.00.040.0
Male44.011.01.01.04386.00.040.0
Male41.09.01.01.00.00.055.0
\n", + "
" + ], + "text/plain": [ + " age education-num race sex capital-gain capital-loss \\\n", + "race sex \n", + "Non-white Female 18.0 7.0 0.0 0.0 0.0 0.0 \n", + "White Male 55.0 9.0 1.0 1.0 0.0 0.0 \n", + " Female 43.0 9.0 1.0 0.0 0.0 0.0 \n", + " Male 44.0 11.0 1.0 1.0 4386.0 0.0 \n", + " Male 41.0 9.0 1.0 1.0 0.0 0.0 \n", + "\n", + " hours-per-week \n", + "race sex \n", + "Non-white Female 20.0 \n", + "White Male 40.0 \n", + " Female 40.0 \n", + " Male 40.0 \n", + " Male 55.0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y, _ = fetch_adult(numeric_only=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)\n", + "X_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the data in this format, we can easily train a scikit-learn model and get predictions for the test data:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can analyze our predictions and quickly calculate the disparate impact for females vs.
males:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.19176335549523604" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sex = y_test.index.get_level_values('sex')\n", + "disparate_impact_ratio(y_test, y_pred, groups=sex, priv_group='Male', pos_label='>50K')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Debiasing algorithms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Not yet implemented." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 79c91c19..a0e4f813 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -29,7 +29,7 @@ def difference(func, y, *args, groups, priv_group=1, sample_weight=None, **kwarg scalar: Difference in metric value for unprivileged and privileged groups. 
Examples: - >>> X, y = load_german(numeric_only=True) + >>> X, y = fetch_german(numeric_only=True) >>> y_pred = LogisticRegression().fit(X, y).predict(X) >>> sex = X.index.get_level_values('sex') >>> difference(precision_score, y, y_pred, groups=sex, priv_group='male') diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 9b00e801..3e5c8a4a 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -30,11 +30,11 @@ def test_sample_weight_basic(): assert len(with_weights) == 3 assert with_weights.X.shape == (3, 2) -def test_pos_label_basic(): - assert (basic().y == [3, 7, 11]).all() - assert (basic(pos_label=3).y == [1, 0, 0]).all() - assert (basic(pos_label=[3, 7, 11]).y == 1).all() - assert (basic(pos_label=lambda y: 10 > y > 5).y == [0, 1, 0]).all() +# def test_pos_label_basic(): +# assert (basic().y == [3, 7, 11]).all() +# assert (basic(pos_label=3).y == [1, 0, 0]).all() +# assert (basic(pos_label=[3, 7, 11]).y == 1).all() +# assert (basic(pos_label=lambda y: 10 > y > 5).y == [0, 1, 0]).all() def test_usecols_dropcols_basic(): assert basic(usecols='X1').X.columns.tolist() == ['X1'] @@ -59,26 +59,26 @@ def test_numeric_only_basic(): assert basic(dropcols='Z', numeric_only=True).X.shape == (3, 2) assert (basic(dropcols='X1', numeric_only=True).X.dtypes == 'int').all() -def test_fetch_and_format_openml(): - df = fetch_and_format_openml('german') - assert df.equals(df.select_dtypes(['number', 'category'])) +# def test_fetch_and_format_openml(): +# df = fetch_and_format_openml('german') +# assert df.equals(df.select_dtypes(['number', 'category'])) -def test_load_adult(): - adult = load_adult() +def test_fetch_adult(): + adult = fetch_adult() assert len(adult) == 3 assert adult.X.shape == (45222, 13) - assert load_adult(dropna=False).X.shape == (48842, 13) - assert load_adult(numeric_only=True).X.shape == (48842, 5) + assert fetch_adult(dropna=False).X.shape == (48842, 13) + assert 
fetch_adult(numeric_only=True).X.shape == (48842, 7) -def test_load_german(): - german = load_german() +def test_fetch_german(): + german = fetch_german() assert len(german) == 2 assert german.X.shape == (1000, 21) - assert load_german(numeric_only=True).X.shape == (1000, 7) + assert fetch_german(numeric_only=True).X.shape == (1000, 8) -def test_load_bank(): - bank = load_bank() +def test_fetch_bank(): + bank = fetch_bank() assert len(bank) == 2 assert bank.X.shape == (45211, 15) - assert load_bank(dropcols=[]).X.shape == (45211, 16) - assert load_bank(numeric_only=True).X.shape == (45211, 6) + assert fetch_bank(dropcols=[]).X.shape == (45211, 16) + assert fetch_bank(numeric_only=True).X.shape == (45211, 6) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index ed3636f8..9edf9146 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -2,19 +2,16 @@ from sklearn.linear_model import LogisticRegression from aif360.datasets import AdultDataset -from aif360.sklearn.datasets import load_adult +from aif360.sklearn.datasets import fetch_adult from aif360.metrics import ClassificationMetric from aif360.sklearn.metrics import * -X, y, sample_weight = load_adult(numeric_only=True) -X.insert(2, 'race', X.index.get_level_values('race').to_series(index=X.index).map( - lambda r: r == 'White').astype('float')) -X.insert(3, 'sex', X.index.get_level_values('sex').to_series(index=X.index).map( - {'Male': 1, 'Female': 0}).astype('float')) +X, y, sample_weight = fetch_adult(numeric_only=True) +y = y.factorize(sort=True)[0] y_pred = LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight).predict(X) -priv = 'sex == "Male"' +priv = X.index.get_level_values('sex') adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], features_to_drop=[]) @@ -30,10 +27,10 @@ def 
test_dataset_equality(): # print(adult.feature_names) # print(adult.features[:5]) assert (adult.features == X.values).all() - assert (adult.labels == y.values).all() + assert (adult.labels.ravel() == y).all() def test_consistency(): - assert consistency_score(X, y) == cm.consistency() + assert np.isclose(consistency_score(X, y), cm.consistency()) def test_specificity(): assert specificity_score(y, y_pred, sample_weight=sample_weight) == cm.specificity() @@ -42,24 +39,24 @@ def test_selection_rate(): assert selection_rate(y, y_pred, sample_weight=sample_weight) == cm.selection_rate() def test_disparate_impact(): - assert disparate_impact_ratio(y, y_pred, priv_expr=priv, + assert disparate_impact_ratio(y, y_pred, groups=priv, priv_group='Male', sample_weight=sample_weight) == cm.disparate_impact() def test_statistical_parity(): - assert statistical_parity_difference(y, y_pred, priv_expr=priv, + assert statistical_parity_difference(y, y_pred, groups=priv, priv_group='Male', sample_weight=sample_weight) == cm.statistical_parity_difference() def test_equal_opportunity(): - assert equal_opportunity_difference(y, y_pred, priv_expr=priv, + assert equal_opportunity_difference(y, y_pred, groups=priv, priv_group='Male', sample_weight=sample_weight) == cm.equal_opportunity_difference() def test_average_odds_difference(): - assert np.isclose(average_odds_difference(y, y_pred, priv_expr=priv, + assert np.isclose(average_odds_difference(y, y_pred, groups=priv, priv_group='Male', sample_weight=sample_weight), cm.average_odds_difference()) def test_average_odds_error(): - assert np.isclose(average_odds_error(y, y_pred, priv_expr=priv, + assert np.isclose(average_odds_error(y, y_pred, groups=priv, priv_group='Male', sample_weight=sample_weight), cm.average_abs_odds_difference()) @@ -68,5 +65,5 @@ def test_generalized_entropy_index(): cm.generalized_entropy_index()) def test_between_group_generalized_entropy_index(): - assert between_group_generalized_entropy_error(y, y_pred, 
priv_expr=priv) \ + assert between_group_generalized_entropy_error(y, y_pred, groups=priv, priv_group='Male') \ == cm.between_group_generalized_entropy_index() From 15a8eb2de59e934ceff7db02eeb4408002d5f093 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 5 Jun 2019 22:49:46 -0400 Subject: [PATCH 09/61] added travis badge to README --- aif360/sklearn/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index fbaf9adc..59298f3f 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -1,5 +1,7 @@ ## `aif360.sklearn` +[![Build Status](https://travis-ci.org/IBM/AIF360.svg?branch=sklearn-compat)](https://travis-ci.org/IBM/AIF360) + This is a wholly separate interface for interacting with data, viewing metrics, and running debiasing algorithms than the main AIF360 package. The purpose of this sub-package is to match scikit-learn paradigms/APIs for easier integration From 3f594a42e5c04823a8ff6f1da9350783f03894bf Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 13 Jun 2019 10:06:25 -0400 Subject: [PATCH 10/61] updated todo with external blockers --- aif360/sklearn/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 59298f3f..695b5eff 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -25,12 +25,15 @@ objects with sample properties (protected attributes) as the index - [ ] Generalized confusion matrix - [ ] Sample distortion metrics - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s + - [ ] **[External]** `get_feature_names()` from data preprocessing + steps that would remove DataFrame formatting + - [ ] SLEP008? 
+ - [ ] Prejudice remover - [ ] Adversarial debiasing - [ ] Meta-fair classifier - - [ ] Prejudice remover - [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s - - [ ] Add functionality to modify X and y (worst case: just `predict()` + - `transform()` separately) + - [ ] **[External]** Add functionality to modify X and y + - [ ] SLEP001 - [ ] Disparate impact remover - [ ] Learning fair representations - [ ] Optimized preprocessing @@ -38,7 +41,8 @@ objects with sample properties (protected attributes) as the index - [ ] Use dynamic object to pass sample_weight to estimator, etc. after they are fitted - [ ] Make postprocessing algorithms compatible - - [ ] Allow `fit()` on `y_true`,`y_pred` + - [ ] **[External]** Allow for `fit(y_true, y_pred)` + - [ ] New SLEP? - [ ] Calibrated equalized odds postprocessing - [ ] Equalized odds postprocessing - [ ] Reject option classification From 7754b32e1bf4bb1e2c6700311efdf35b063e55e7 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 13 Jun 2019 11:37:40 -0400 Subject: [PATCH 11/61] added reweighing workaround to example --- aif360/sklearn/examples/Getting Started.ipynb | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/aif360/sklearn/examples/Getting Started.ipynb b/aif360/sklearn/examples/Getting Started.ipynb index 58031a8d..0df0db33 100644 --- a/aif360/sklearn/examples/Getting Started.ipynb +++ b/aif360/sklearn/examples/Getting Started.ipynb @@ -15,10 +15,12 @@ "source": [ "import numpy as np\n", "import pandas as pd\n", + "from sklearn.pipeline import make_pipeline\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import recall_score\n", "from sklearn.model_selection import train_test_split\n", "\n", + "from aif360.sklearn.algorithms.preprocessing import Reweighing\n", "from aif360.sklearn.datasets import fetch_adult\n", "from aif360.sklearn.metrics import disparate_impact_ratio" ] @@ -441,7 +443,13 @@ "execution_count": null, "metadata": 
{}, "outputs": [], - "source": [] + "source": [ + "pipe = make_pipeline(Reweighing(), LinearRegression())\n", + "# sample_weight_ will be updated after it is fit\n", + "fit_params = {'linearregression__sample_weight':\n", + " pipe.named_steps.reweighing.sample_weight_}\n", + "pipe.fit(X, y, **fit_params)" + ] } ], "metadata": { From 17b0c952c3ac776d15c46946359824bf3cdf8d16 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 18 Jun 2019 16:55:49 -0400 Subject: [PATCH 12/61] added Reweighing algorithm --- aif360/sklearn/README.md | 11 +- aif360/sklearn/datasets/openml_datasets.py | 28 +++-- aif360/sklearn/preprocessing/__init__.py | 1 + aif360/sklearn/preprocessing/reweighing.py | 113 +++++++++++++++++++++ aif360/sklearn/tests/test_reweighing.py | 57 +++++++++++ docs/source/modules/sklearn.rst | 6 ++ 6 files changed, 203 insertions(+), 13 deletions(-) create mode 100644 aif360/sklearn/preprocessing/__init__.py create mode 100644 aif360/sklearn/preprocessing/reweighing.py create mode 100644 aif360/sklearn/tests/test_reweighing.py diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 695b5eff..7912bb4c 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -27,19 +27,20 @@ objects with sample properties (protected attributes) as the index - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s - [ ] **[External]** `get_feature_names()` from data preprocessing steps that would remove DataFrame formatting - - [ ] SLEP008? + - [ ] SLEP008 - [ ] Prejudice remover - [ ] Adversarial debiasing - [ ] Meta-fair classifier - [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s - [ ] **[External]** Add functionality to modify X and y - - [ ] SLEP001 + - [ ] SLEP005 - Resampling API - [ ] Disparate impact remover - [ ] Learning fair representations - [ ] Optimized preprocessing - - [ ] Reweighing - - [ ] Use dynamic object to pass sample_weight to estimator, etc. 
after they - are fitted + - [X] Reweighing + - [X] Use dynamic object to pass sample_weight to estimator, etc. after they + are fitted (NOTE: does not work with GridSearchCV) + - [ ] **[External]** SLEP006 - Sample properties - [ ] Make postprocessing algorithms compatible - [ ] **[External]** Allow for `fit(y_true, y_pred)` - [ ] New SLEP? diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index ac8c32d1..1aac923a 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -93,18 +93,18 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) -def fetch_german(data_home=None, usecols=[], dropcols=[], numeric_only=False, - dropna=True): +def fetch_german(data_home=None, binary_age=False, usecols=[], dropcols=[], + numeric_only=False, dropna=True): """Load the German Credit Dataset. Protected attributes are 'sex' ('male' is privileged and 'female' is - unprivileged) and 'age' (left as continuous but [1]_ recommends `age >= 25` - be considered privileged and `age < 25` be considered unprivileged; this can - be done at metric evaluation time). The outcome variable is 'good' - (favorable) or 'bad' (unfavorable). + unprivileged) and 'age' (left as continuous but [#kamiran09]_ recommends + `age >= 25` be considered privileged and `age < 25` be considered + unprivileged; this can be done at metric evaluation time). The outcome + variable is 'good' (favorable) or 'bad' (unfavorable). References: - .. [1] F. Kamiran and T. Calders, "Classifying without + .. [#kamiran09] F. Kamiran and T. Calders, "Classifying without discriminating," 2nd International Conference on Computer, Control and Communication, 2009. 
@@ -131,7 +131,15 @@ def fetch_german(data_home=None, usecols=[], dropcols=[], numeric_only=False, >>> german_num.X.shape (1000, 7) - >>> DISPARATE IMPACT AGE EXAMPLE HERE + + + >>> X, y = fetch_german(numeric_only=True) + >>> y_pred = LogisticRegression().fit(X, y).predict(X) + >>> age = X.index.get_level_values('age') >= 25 + >>> disparate_impact_ratio(y, y_pred, groups=age, priv_group=True, + ... pos_label='good') + 0.9483094846144106 + """ df = to_dataframe(fetch_openml(data_id=31, data_home=data_home or DATA_HOME_DEFAULT, target_column=None)) @@ -139,6 +147,10 @@ def fetch_german(data_home=None, usecols=[], dropcols=[], numeric_only=False, df = df.rename(columns={'class': 'credit-risk'}) # more descriptive name df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good' + # binarize protected attributes + if binary_age: + df.age = pd.cut(df.age, [0, 25, 100], right=False, labels=['young', 'aged']) + # Note: marital_status directly implies sex. i.e. 'div/dep/mar' => 'female' # and all others => 'male' personal_status = df.pop('personal_status').str.split(expand=True) diff --git a/aif360/sklearn/preprocessing/__init__.py b/aif360/sklearn/preprocessing/__init__.py new file mode 100644 index 00000000..8cac812f --- /dev/null +++ b/aif360/sklearn/preprocessing/__init__.py @@ -0,0 +1 @@ +from aif360.sklearn.preprocessing.reweighing import * diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py new file mode 100644 index 00000000..f61d7643 --- /dev/null +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -0,0 +1,113 @@ +from warnings import warn + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin +from sklearn.base import clone +from sklearn.utils.metaestimators import if_delegate_has_method + +class Reweighing(BaseEstimator, TransformerMixin): + """Reweighing is a preprocessing technique that weights the examples in each + (group, label) combination differently 
to ensure fairness before + classification [#kamiran12]_. + + Attributes: + groups_ (array, shape (n_groups,)): A list of group labels known to the + transformer. + classes_ (array, shape (n_classes,)): A list of class labels known to + the transformer. + sample_weight_ (array, shape (n_samples,)): New sample weights after + transformation. See examples for details. + reweigh_factors_ (array, shape (n_groups, n_labels)): Reweighing factors + for each combination of group and class labels used to debias + samples. Existing sample weights are multiplied by the corresponding + factor for that sample's group and class. + + Examples: + >>> pipe = make_pipeline(Reweighing(), LinearRegression()) + >>> # sample_weight_ will be used after it is fit + >>> fit_params = {'linearregression__sample_weight': + ... pipe['reweighing'].sample_weight_} + >>> pipe.fit(X, y, **fit_params) + + References: + .. [#kamiran12] F. Kamiran and T. Calders, "Data Preprocessing + Techniques for Classification without Discrimination," Knowledge and + Information Systems, 2012. + """ + # TODO: binary option for groups/labels? + def __init__(self): + self.sample_weight_ = np.empty(0) # dynamic object for use in Pipeline + + def fit(self, X, y=None): + raise NotImplementedError("Only 'fit_transform' is allowed.") + + def transform(self, X): + raise NotImplementedError("Only 'fit_transform' is allowed.") + + def fit_transform(self, X, y, groups, sample_weight=None): + """Compute the factors for reweighing the dataset and transform the + sample weights. + + Args: + X (array-like): Training samples. + y (array-like): Training labels. + groups (array-like): Protected attributes corresponding to samples. + sample_weight (array-like, optional): Sample weights. + + Returns: + X: Unchanged samples. Only the sample weights are different after + transformation (see the `sample_weight_` attribute). 
+ """ + if sample_weight is None: + sample_weight = np.ones(y.shape) + # resize all references (might be part of a Pipeline) + self.sample_weight_.resize(sample_weight.shape, refcheck=False) + self.groups_ = np.unique(groups) + self.classes_ = np.unique(y) + + def N_(i): return sample_weight[i].sum() + + N = sample_weight.sum() + for g in self.groups_: + for c in self.classes_: + g_and_c = (groups == g) & (y == c) + if np.any(g_and_c): + W_gc = N_(groups == g) * N_(y == c) / (N * N_(g_and_c)) + self.sample_weight_[g_and_c] = W_gc * sample_weight[g_and_c] + return X + + +class ReweighingMeta(BaseEstimator, MetaEstimatorMixin): + def __init__(self, estimator): + self.reweigher = Reweighing() + self.estimator = estimator + + def fit(self, X, y, pa_groups, sample_weight=None): + self.reweigher_ = clone(self.reweigher) + self.estimator_ = clone(self.estimator) + + self.reweigher_.fit_transform(X, y, pa_groups, sample_weight=sample_weight) + try: + self.estimator_.fit(X, y, sample_weight=self.reweigher_.sample_weight_) + except TypeError: + raise ValueError("'estimator' ({}) does not incorporate " + "'sample_weight' in 'fit()''.".format( + type(self.estimator_))) + return self + + @if_delegate_has_method('estimator') + def predict(self, X): + return self.estimator_.predict(X) + + @if_delegate_has_method('estimator') + def predict_proba(self, X): + return self.estimator_.predict_proba(X) + + @if_delegate_has_method('estimator') + def predict_log_proba(self, X): + return self.estimator_.predict_log_proba(X) + + # TODO: sample_weight isn't passed by GridSearchCV.score() + @if_delegate_has_method('estimator') + def score(self, X, y, sample_weight=None): + return self.estimator_.score(X, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/tests/test_reweighing.py b/aif360/sklearn/tests/test_reweighing.py new file mode 100644 index 00000000..ec42c0e9 --- /dev/null +++ b/aif360/sklearn/tests/test_reweighing.py @@ -0,0 +1,57 @@ +import numpy as np +from 
sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import make_pipeline + +from aif360.datasets import GermanDataset +from aif360.sklearn.datasets import fetch_german +from aif360.algorithms.preprocessing import Reweighing as OrigReweighing +from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta + + +X, y = fetch_german(numeric_only=True, binary_age=True, dropcols='duration') +german = GermanDataset(categorical_features=[], features_to_keep=[ + 'credit_amount', 'investment_as_income_percentage', 'residence_since', + 'age', 'number_of_credits', 'people_liable_for', 'sex']) + +def test_dataset_equality(): + assert (german.features == X.values).all() + +def test_reweighing_sex(): + orig_rew = OrigReweighing(unprivileged_groups=[{'sex': 0}], + privileged_groups=[{'sex': 1}]) + german_fair = orig_rew.fit_transform(german) + rew = Reweighing() + rew.fit_transform(X, y, groups=X.index.get_level_values('sex')) + + # assert orig_rew.w_up_unfav == rew.reweigh_factors_[0, 0] + # assert orig_rew.w_up_fav == rew.reweigh_factors_[0, 1] + # assert np.isclose(orig_rew.w_p_unfav, rew.reweigh_factors_[1, 0]) + # assert orig_rew.w_p_fav, rew.reweigh_factors_[1, 1] + assert np.allclose(german_fair.instance_weights, rew.sample_weight_) + +def test_reweighing_intersection(): + rew = Reweighing() + rew.fit_transform(X, y, groups=X.index.to_flat_index()) + # assert rew.reweigh_factors_.shape == (4, 2) + assert len(rew.groups_) == 4 + +def test_pipeline(): + logreg = LogisticRegression(solver='liblinear') + pipe = make_pipeline(Reweighing(), logreg) + fit_params = {'logisticregression__sample_weight': pipe[0].sample_weight_, + 'reweighing__groups': X.index.get_level_values('sex')} + pipe.fit(X, y, **fit_params) + assert (logreg.fit(X, y, sample_weight=pipe[0].sample_weight_).coef_ + == pipe[-1].coef_).all() + +def test_gridsearch(): + rew = ReweighingMeta(LogisticRegression(solver='liblinear')) + params = 
{'estimator__C': [1, 10]} + clf = GridSearchCV(rew, params, cv=5) + # TODO: 'groups' name clashes with CV splitter + fit_params = {'pa_groups': X.index.get_level_values('sex'), + 'sample_weight': np.random.random(y.shape)} + clf.fit(X, y, **fit_params) + # print(clf.score(X, y)) + assert len(clf.best_estimator_.reweigher_.groups_) == 2 diff --git a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst index 273f8256..5a9fdb15 100644 --- a/docs/source/modules/sklearn.rst +++ b/docs/source/modules/sklearn.rst @@ -17,3 +17,9 @@ Metrics .. automodule:: aif360.sklearn.metrics.metrics :members: + +Preprocessing +------------- + +.. autoclass:: aif360.sklearn.preprocessing.reweighing.Reweighing + :members: From cc9246f9cb9e1a0845965a61cd546b045668bea1 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 18 Jun 2019 16:59:07 -0400 Subject: [PATCH 13/61] clean up comments --- aif360/sklearn/datasets/utils.py | 12 ------------ aif360/sklearn/metrics/metrics.py | 4 ++-- aif360/sklearn/tests/test_datasets.py | 10 ---------- aif360/sklearn/tests/test_metrics.py | 4 ---- aif360/sklearn/tests/test_reweighing.py | 6 +----- 5 files changed, 3 insertions(+), 33 deletions(-) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index b5fff624..e9cce52f 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -17,11 +17,6 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, are dropped from the features, they remain in the index. target (single label or list-like): Column label of the target (outcome) variable. - # pos_label (scalar, list-like, or function, optional): A value, list of - # values, or boolean function (True if positive) designating the - # positive binary label from the raw data. All others will be - # considered negative. The resulting target array will have value 1 if - # positive and 0 if negative. 
sample_weight (single label, optional): Name of the column containing sample weights. usecols (single label or list-like, optional): Column(s) to keep. All @@ -65,13 +60,6 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, # TODO: convert to 1/0 if numeric_only? y = df.pop(target) - # if not callable(pos_label): - # if not is_list_like(pos_label): - # pos_label = [pos_label] - # # find all instances which match any of the favorable classes - # y = y.isin(pos_label).astype('int') - # else: - # y = y.apply(pos_label).astype('int') # Column-wise drops df = df.drop(dropcols, axis=1) diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index a0e4f813..d4da5c81 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -16,7 +16,7 @@ def difference(func, y, *args, groups, priv_group=1, sample_weight=None, **kwarg Args: func (function): A metric function from `aif360.sklearn.metrics` or `sklearn.metrics`. - y (pandas.Series): Outcome vector with protected attributes as index. + y (array-like): Outcome vector with protected attributes as index. *args: Additional positional args to be passed through to `func`. groups (array-like, keyword-only): Group labels (protected attributes) for the samples. @@ -55,7 +55,7 @@ def ratio(func, y, *args, groups, priv_group=1, sample_weight=None, **kwargs): Args: func (function): A metric function from `aif360.sklearn.metrics` or `sklearn.metrics`. - y (pandas.Series): Outcome vector with protected attributes as index. + y (array-like): Outcome vector with protected attributes as index. *args: Additional positional args to be passed through to `func`. groups (array-like, keyword-only): Group labels (protected attributes) for the samples. 
diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 3e5c8a4a..eab905f3 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -30,12 +30,6 @@ def test_sample_weight_basic(): assert len(with_weights) == 3 assert with_weights.X.shape == (3, 2) -# def test_pos_label_basic(): -# assert (basic().y == [3, 7, 11]).all() -# assert (basic(pos_label=3).y == [1, 0, 0]).all() -# assert (basic(pos_label=[3, 7, 11]).y == 1).all() -# assert (basic(pos_label=lambda y: 10 > y > 5).y == [0, 1, 0]).all() - def test_usecols_dropcols_basic(): assert basic(usecols='X1').X.columns.tolist() == ['X1'] assert basic(usecols=['X1', 'Z']).X.columns.tolist() == ['X1', 'Z'] @@ -59,10 +53,6 @@ def test_numeric_only_basic(): assert basic(dropcols='Z', numeric_only=True).X.shape == (3, 2) assert (basic(dropcols='X1', numeric_only=True).X.dtypes == 'int').all() -# def test_fetch_and_format_openml(): -# df = fetch_and_format_openml('german') -# assert df.equals(df.select_dtypes(['number', 'category'])) - def test_fetch_adult(): adult = fetch_adult() assert len(adult) == 3 diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index 9edf9146..a2db21ec 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -22,10 +22,6 @@ privileged_groups=[{'sex': 1}]) def test_dataset_equality(): - # print(X.shape, adult.features.shape) - # print(X.head()) - # print(adult.feature_names) - # print(adult.features[:5]) assert (adult.features == X.values).all() assert (adult.labels.ravel() == y).all() diff --git a/aif360/sklearn/tests/test_reweighing.py b/aif360/sklearn/tests/test_reweighing.py index ec42c0e9..30e8f37a 100644 --- a/aif360/sklearn/tests/test_reweighing.py +++ b/aif360/sklearn/tests/test_reweighing.py @@ -24,17 +24,13 @@ def test_reweighing_sex(): rew = Reweighing() rew.fit_transform(X, y, groups=X.index.get_level_values('sex')) - # assert 
orig_rew.w_up_unfav == rew.reweigh_factors_[0, 0] - # assert orig_rew.w_up_fav == rew.reweigh_factors_[0, 1] - # assert np.isclose(orig_rew.w_p_unfav, rew.reweigh_factors_[1, 0]) - # assert orig_rew.w_p_fav, rew.reweigh_factors_[1, 1] assert np.allclose(german_fair.instance_weights, rew.sample_weight_) def test_reweighing_intersection(): rew = Reweighing() rew.fit_transform(X, y, groups=X.index.to_flat_index()) - # assert rew.reweigh_factors_.shape == (4, 2) assert len(rew.groups_) == 4 + assert len(rew.classes_) == 2 def test_pipeline(): logreg = LogisticRegression(solver='liblinear') From 8c58f650f12ceb04a2f066b791ec1b03c331ea85 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 18 Jun 2019 16:59:50 -0400 Subject: [PATCH 14/61] fixed package version in docs --- docs/source/conf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 66493140..03058220 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,10 +67,10 @@ # |version| and |release|, also used in various other places throughout the # built documents. # -# The short X.Y version. -version = u'0.1' # The full version, including alpha/beta/rc tags. -release = u'0.1.0' +release = aif360.__version__ +# The short X.Y version. +version = '.'.join(release.split('.')[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 1e7899c42498e30ccf2db3435e127014f5fe80a0 Mon Sep 17 00:00:00 2001 From: Animesh Singh Date: Wed, 19 Jun 2019 22:48:43 -0700 Subject: [PATCH 15/61] adding hyperlinks to SLEPs --- aif360/sklearn/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 7912bb4c..558ac562 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -27,13 +27,13 @@ objects with sample properties (protected attributes) as the index - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s - [ ] **[External]** `get_feature_names()` from data preprocessing steps that would remove DataFrame formatting - - [ ] SLEP008 + - [ ] [SLEP008](https://github.com/scikit-learn/enhancement_proposals/pull/18 )? - [ ] Prejudice remover - [ ] Adversarial debiasing - [ ] Meta-fair classifier - [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s - [ ] **[External]** Add functionality to modify X and y - - [ ] SLEP005 - Resampling API + - [ ] [SLEP001](https://github.com/scikit-learn/enhancement_proposals/blob/master/slep001/proposal.rst) - [ ] Disparate impact remover - [ ] Learning fair representations - [ ] Optimized preprocessing From c1c1e4052d6738b70d78c862f8935185bf24ecec Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 10:46:03 -0400 Subject: [PATCH 16/61] added binary_age opt to german; fixed NAs in bank --- aif360/sklearn/datasets/openml_datasets.py | 37 +++++++++++----------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 1aac923a..98d0aa31 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -72,8 +72,8 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], if subset not in {'train', 'test', 'all'}: raise ValueError("subset must be either 'train', 'test', or 'all'; 
" "cannot be {}".format(subset)) - df = to_dataframe(fetch_openml(data_id=1590, data_home=data_home or - DATA_HOME_DEFAULT, target_column=None)) + df = to_dataframe(fetch_openml(data_id=1590, target_column=None, + data_home=data_home or DATA_HOME_DEFAULT)) if subset == 'train': df = df.iloc[16281:] elif subset == 'test': @@ -93,15 +93,15 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) -def fetch_german(data_home=None, binary_age=False, usecols=[], dropcols=[], +def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], numeric_only=False, dropna=True): """Load the German Credit Dataset. Protected attributes are 'sex' ('male' is privileged and 'female' is - unprivileged) and 'age' (left as continuous but [#kamiran09]_ recommends - `age >= 25` be considered privileged and `age < 25` be considered - unprivileged; this can be done at metric evaluation time). The outcome - variable is 'good' (favorable) or 'bad' (unfavorable). + unprivileged) and 'age' (binarized by default as recommended by + [#kamiran09]_: `age >= 25` is considered privileged and `age < 25` is + considered unprivileged; see the `binary_age` flag to keep this continuous). + The outcome variable is 'good' (favorable) or 'bad' (unfavorable). References: .. [#kamiran09] F. Kamiran and T. Calders, "Classifying without @@ -112,6 +112,9 @@ def fetch_german(data_home=None, binary_age=False, usecols=[], dropcols=[], data_home (string, optional): Specify another download and cache folder for the datasets. By default all AIF360 datasets are stored in 'aif360/sklearn/data/raw' subfolders. + binary_age (bool, optional): If `True`, split protected attribute, + `age`, into 'aged' (privileged) and 'youth' (unprivileged). The + `age` feature remains continuous. usecols (single label or list-like, optional): Column name(s) to keep. All others are dropped. 
dropcols (single label or list-like, optional): Column name(s) to drop. @@ -135,21 +138,20 @@ def fetch_german(data_home=None, binary_age=False, usecols=[], dropcols=[], >>> X, y = fetch_german(numeric_only=True) >>> y_pred = LogisticRegression().fit(X, y).predict(X) - >>> age = X.index.get_level_values('age') >= 25 - >>> disparate_impact_ratio(y, y_pred, groups=age, priv_group=True, + >>> disparate_impact_ratio(y, y_pred, prot_attr='age', priv_group=True, ... pos_label='good') 0.9483094846144106 """ - df = to_dataframe(fetch_openml(data_id=31, data_home=data_home or - DATA_HOME_DEFAULT, target_column=None)) + df = to_dataframe(fetch_openml(data_id=31, target_column=None, + data_home=data_home or DATA_HOME_DEFAULT)) df = df.rename(columns={'class': 'credit-risk'}) # more descriptive name df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good' - # binarize protected attributes - if binary_age: - df.age = pd.cut(df.age, [0, 25, 100], right=False, labels=['young', 'aged']) + # binarize protected attribute (but not corresponding feature) + age = (pd.cut(df.age, [0, 25, 100], right=False, labels=['young', 'aged']) + if binary_age else 'age') # Note: marital_status directly implies sex. i.e. 
'div/dep/mar' => 'female' # and all others => 'male' @@ -158,7 +160,7 @@ def fetch_german(data_home=None, binary_age=False, usecols=[], dropcols=[], df = df.join(personal_status.astype('category')) df.sex = df.sex.cat.as_ordered() # 'female' < 'male' - return standarize_dataset(df, protected_attributes=['sex', 'age'], + return standarize_dataset(df, protected_attributes=['sex', age], target='credit-risk', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) @@ -210,9 +212,8 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', df.deposit = df.deposit.cat.rename_categories({'1': 'no', '2': 'yes'}) # df.deposit = df.deposit.cat.as_ordered() # replace 'unknown' marker with NaN - df.select_dtypes('category').apply(lambda s: - s.cat.remove_categories('unknown', inplace=True) - if 'unknown' in s.cat.categories else s) + df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) + if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) return standarize_dataset(df, protected_attributes='age', target='deposit', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) From 93a7cdf00bb606ff91ff2f9bd0a02aec54f71765 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 10:53:53 -0400 Subject: [PATCH 17/61] modified onehot_transformer to return DataFrame --- aif360/sklearn/datasets/utils.py | 47 ++++++++++++++++++++++----- aif360/sklearn/tests/test_datasets.py | 7 +++- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index e9cce52f..4566c983 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -2,8 +2,9 @@ import pandas as pd from pandas.core.dtypes.common import is_list_like -from sklearn.compose import make_column_transformer +from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder +from sklearn.utils.validation import 
check_is_fitted def standarize_dataset(df, protected_attributes, target, sample_weight=None, usecols=[], dropcols=[], numeric_only=False, dropna=True): @@ -56,7 +57,9 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, >>> X, y = standarize_dataset(df, protected_attributes=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ - df = df.set_index(protected_attributes, drop=False) # TODO: append=True? + df = df.set_index(protected_attributes, drop=False, append=True) + # df = df.set_index(sample_weight or np.ones(df.shape[0]), append=True) + # df.index = df.index.set_names('sample_weight', level=-1) # TODO: convert to 1/0 if numeric_only? y = df.pop(target) @@ -84,20 +87,48 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, y = y.loc[notna] if sample_weight is not None: - sample_weight = df.pop(sample_weight) return namedtuple('WeightedDataset', ['X', 'y', 'sample_weight'])( - df, y, sample_weight) + df, y, df.pop(sample_weight).rename('sample_weight')) return namedtuple('Dataset', ['X', 'y'])(df, y) -def make_onehot_transformer(X): +def make_onehot_transformer(): """Shortcut for encoding categorical features as one-hot vectors. Note: - This changes the column order as well as removes DataFrame formatting. + This changes the column order. Returns: sklearn.compose.ColumnTransformer: Class capable of transforming categorical features in X to one-hot features. 
""" - return make_column_transformer((OneHotEncoder(), X.dtypes == 'category'), - remainder='passthrough') + class PandasOutOneHotTransformer(ColumnTransformer): + def __init__(self): + ohe = ('onehotencoder', OneHotEncoder(), + lambda X: X.dtypes == 'category') + super().__init__([ohe], remainder='passthrough') + + def get_feature_names(self): + check_is_fitted(self, 'transformers_') + dummies = self.named_transformers_.onehotencoder.get_feature_names( + input_features=self.ohe_input_features_) + passthroughs = self.passthrough_features_ + return list(dummies) + list(passthroughs) + + def fit(self, X, y=None): + self.ohe_input_features_ = X.columns[X.dtypes == 'category'] + self.passthrough_features_ = X.columns[X.dtypes != 'category'] + return super().fit(X, y=y) + + def fit_transform(self, X, y=None): + Xt = super().fit_transform(X, y=y) + self.ohe_input_features_ = X.columns[X.dtypes == 'category'] + self.passthrough_features_ = X.columns[X.dtypes != 'category'] + columns = self.get_feature_names() + return pd.DataFrame(Xt, columns=columns, index=X.index) + + def transform(self, X): + Xt = super().transform(X) + columns = self.get_feature_names() + return pd.DataFrame(Xt, columns=columns, index=X.index) + + return PandasOutOneHotTransformer() diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index eab905f3..4253bcd8 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -4,7 +4,8 @@ import pandas as pd import pytest -from aif360.sklearn.datasets import * +from aif360.sklearn.datasets import fetch_adult, fetch_bank, fetch_german +from aif360.sklearn.datasets import standarize_dataset, make_onehot_transformer df = pd.DataFrame([[1, 2, 3, 'a'], [5, 6, 7, 'b'], [np.NaN, 10, 11, 'c']], @@ -72,3 +73,7 @@ def test_fetch_bank(): assert bank.X.shape == (45211, 15) assert fetch_bank(dropcols=[]).X.shape == (45211, 16) assert fetch_bank(numeric_only=True).X.shape == (45211, 6) + +def 
test_onehot_transformer(): + X, y = fetch_german() + assert len(make_onehot_transformer().fit_transform(X).columns) == 63 From 8e52268d69a0739a757f29a9a2975b65eae61e77 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 10:58:46 -0400 Subject: [PATCH 18/61] tweaks to reweighing to conform with sklearn --- aif360/sklearn/preprocessing/__init__.py | 2 +- aif360/sklearn/preprocessing/reweighing.py | 100 ++++++++++++++------- aif360/sklearn/tests/test_reweighing.py | 71 +++++++-------- 3 files changed, 105 insertions(+), 68 deletions(-) diff --git a/aif360/sklearn/preprocessing/__init__.py b/aif360/sklearn/preprocessing/__init__.py index 8cac812f..f49b7673 100644 --- a/aif360/sklearn/preprocessing/__init__.py +++ b/aif360/sklearn/preprocessing/__init__.py @@ -1 +1 @@ -from aif360.sklearn.preprocessing.reweighing import * +from aif360.sklearn.preprocessing.reweighing import Reweighing, ReweighingMeta diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index f61d7643..58cb13eb 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -1,11 +1,23 @@ -from warnings import warn - import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin -from sklearn.base import clone +from pandas.core.dtypes.common import is_list_like +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone +from sklearn.utils import check_consistent_length from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils.validation import column_or_1d, has_fit_parameter + + +def check_inputs(X, y, sample_weight): + if not hasattr(X, 'index'): + raise TypeError("Expected `DataFrame`, got {} instead.".format(type(X))) + y = column_or_1d(y) + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + else: + sample_weight = np.ones(X.shape[0]) + check_consistent_length(X, y, sample_weight) + return X, y, 
sample_weight -class Reweighing(BaseEstimator, TransformerMixin): +class Reweighing(BaseEstimator): """Reweighing is a preprocessing technique that weights the examples in each (group, label) combination differently to ensure fairness before classification [#kamiran12]_. @@ -34,65 +46,90 @@ class Reweighing(BaseEstimator, TransformerMixin): Techniques for Classification without Discrimination," Knowledge and Information Systems, 2012. """ - # TODO: binary option for groups/labels? - def __init__(self): - self.sample_weight_ = np.empty(0) # dynamic object for use in Pipeline - def fit(self, X, y=None): - raise NotImplementedError("Only 'fit_transform' is allowed.") + def __init__(self, prot_attr=None): + """ + Args: + prot_attr (single label or list-like, optional): Protected + attribute(s) to use as sensitive attribute(s) in the reweighing + process. If more than one attribute, all combinations of values + (intersections) are considered. Default is `None` meaning all + protected attributes from the dataset are used. + """ + self.prot_attr = prot_attr - def transform(self, X): - raise NotImplementedError("Only 'fit_transform' is allowed.") + def fit(self, X, y, sample_weight=None): + self.fit_transform(X, y, sample_weight=sample_weight) + return self - def fit_transform(self, X, y, groups, sample_weight=None): + def fit_transform(self, X, y, sample_weight=None): """Compute the factors for reweighing the dataset and transform the sample weights. Args: X (array-like): Training samples. y (array-like): Training labels. - groups (array-like): Protected attributes corresponding to samples. sample_weight (array-like, optional): Sample weights. Returns: X: Unchanged samples. Only the sample weights are different after transformation (see the `sample_weight_` attribute). 
""" - if sample_weight is None: - sample_weight = np.ones(y.shape) - # resize all references (might be part of a Pipeline) - self.sample_weight_.resize(sample_weight.shape, refcheck=False) + X, y, sample_weight = check_inputs(X, y, sample_weight) + + all_prot_attrs = X.index.names[1:] + if self.prot_attr is None: + self.prot_attr_ = all_prot_attrs + elif not is_list_like(self.prot_attr): + self.prot_attr_ = [self.prot_attr] + else: + self.prot_attr_ = self.prot_attr + + if any(p not in X.index.names for p in self.prot_attr_): + raise ValueError("Some of the attributes provided are not present " + "in the dataset. Expected a subset of:\n{}\nGot:\n" + "{}".format(all_prot_attrs, self.prot_attr_)) + + self.sample_weight_ = np.empty_like(sample_weight) + groups = X.index.droplevel(list(set(X.index.names) + - set(self.prot_attr_))).to_flat_index() + # TODO: maintain categorical ordering self.groups_ = np.unique(groups) self.classes_ = np.unique(y) + n_groups = len(self.groups_) + n_classes = len(self.classes_) + self.reweigh_factors_ = np.full((n_groups, n_classes), np.nan) def N_(i): return sample_weight[i].sum() - N = sample_weight.sum() - for g in self.groups_: - for c in self.classes_: + for i, g in enumerate(self.groups_): + for j, c in enumerate(self.classes_): g_and_c = (groups == g) & (y == c) if np.any(g_and_c): W_gc = N_(groups == g) * N_(y == c) / (N * N_(g_and_c)) self.sample_weight_[g_and_c] = W_gc * sample_weight[g_and_c] + self.reweigh_factors_[i, j] = W_gc return X class ReweighingMeta(BaseEstimator, MetaEstimatorMixin): - def __init__(self, estimator): - self.reweigher = Reweighing() + def __init__(self, estimator, reweigher=Reweighing()): + if not has_fit_parameter(estimator, 'sample_weight'): + raise TypeError("`estimator` (type: {}) does not have fit parameter" + " `sample_weight`.".format(type(estimator))) + self.reweigher = reweigher self.estimator = estimator - def fit(self, X, y, pa_groups, sample_weight=None): + @property + def 
_estimator_type(self): + return self.estimator._estimator_type + + def fit(self, X, y, sample_weight=None): self.reweigher_ = clone(self.reweigher) self.estimator_ = clone(self.estimator) - self.reweigher_.fit_transform(X, y, pa_groups, sample_weight=sample_weight) - try: - self.estimator_.fit(X, y, sample_weight=self.reweigher_.sample_weight_) - except TypeError: - raise ValueError("'estimator' ({}) does not incorporate " - "'sample_weight' in 'fit()''.".format( - type(self.estimator_))) + self.reweigher_.fit_transform(X, y, sample_weight=sample_weight) + self.estimator_.fit(X, y, sample_weight=self.reweigher_.sample_weight_) return self @if_delegate_has_method('estimator') @@ -107,7 +144,6 @@ def predict_proba(self, X): def predict_log_proba(self, X): return self.estimator_.predict_log_proba(X) - # TODO: sample_weight isn't passed by GridSearchCV.score() @if_delegate_has_method('estimator') def score(self, X, y, sample_weight=None): return self.estimator_.score(X, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/tests/test_reweighing.py b/aif360/sklearn/tests/test_reweighing.py index 30e8f37a..f1e2a223 100644 --- a/aif360/sklearn/tests/test_reweighing.py +++ b/aif360/sklearn/tests/test_reweighing.py @@ -1,53 +1,54 @@ import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV -from sklearn.pipeline import make_pipeline +from sklearn.metrics import accuracy_score, make_scorer -from aif360.datasets import GermanDataset -from aif360.sklearn.datasets import fetch_german +from aif360.datasets import AdultDataset +from aif360.sklearn.datasets import fetch_adult from aif360.algorithms.preprocessing import Reweighing as OrigReweighing from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta -X, y = fetch_german(numeric_only=True, binary_age=True, dropcols='duration') -german = GermanDataset(categorical_features=[], features_to_keep=[ - 'credit_amount', 'investment_as_income_percentage', 
'residence_since', - 'age', 'number_of_credits', 'people_liable_for', 'sex']) - -def test_dataset_equality(): - assert (german.features == X.values).all() +# X, y = fetch_german(numeric_only=True, dropcols='duration') +# X.age = (X.age >= 25).astype('int') +# german = GermanDataset(categorical_features=[], features_to_keep=[ +# 'credit_amount', 'investment_as_income_percentage', 'residence_since', +# 'age', 'number_of_credits', 'people_liable_for', 'sex']) +X, y, sample_weight = fetch_adult(numeric_only=True) +adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], + features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', + 'hours-per-week'], features_to_drop=[]) def test_reweighing_sex(): orig_rew = OrigReweighing(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) - german_fair = orig_rew.fit_transform(german) - rew = Reweighing() - rew.fit_transform(X, y, groups=X.index.get_level_values('sex')) + adult_fair = orig_rew.fit_transform(adult) + rew = Reweighing('sex') + rew.fit_transform(X, y, sample_weight=sample_weight) - assert np.allclose(german_fair.instance_weights, rew.sample_weight_) + # assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav], + # [orig_rew.w_p_unfav, orig_rew.w_p_fav]], + # rew.reweigh_factors_) + assert np.allclose(adult_fair.instance_weights, rew.sample_weight_) def test_reweighing_intersection(): rew = Reweighing() - rew.fit_transform(X, y, groups=X.index.to_flat_index()) - assert len(rew.groups_) == 4 - assert len(rew.classes_) == 2 - -def test_pipeline(): - logreg = LogisticRegression(solver='liblinear') - pipe = make_pipeline(Reweighing(), logreg) - fit_params = {'logisticregression__sample_weight': pipe[0].sample_weight_, - 'reweighing__groups': X.index.get_level_values('sex')} - pipe.fit(X, y, **fit_params) - assert (logreg.fit(X, y, sample_weight=pipe[0].sample_weight_).coef_ - == pipe[-1].coef_).all() + rew.fit_transform(X, y) + assert rew.reweigh_factors_.shape == (4, 2) 
def test_gridsearch(): - rew = ReweighingMeta(LogisticRegression(solver='liblinear')) - params = {'estimator__C': [1, 10]} - clf = GridSearchCV(rew, params, cv=5) - # TODO: 'groups' name clashes with CV splitter - fit_params = {'pa_groups': X.index.get_level_values('sex'), - 'sample_weight': np.random.random(y.shape)} - clf.fit(X, y, **fit_params) - # print(clf.score(X, y)) - assert len(clf.best_estimator_.reweigher_.groups_) == 2 + # logreg = LogisticRegression(solver='lbfgs', max_iter=500) + # rew = ReweighingMeta(estimator=logreg, reweigher=Reweighing('sex')) + rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear')) + + # UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597 + def score_func(y_true, y_pred, sample_weight): + idx = y_true.index.to_flat_index() + return accuracy_score(y_true, y_pred, sample_weight=sample_weight[idx]) + scoring = make_scorer(score_func, **{'sample_weight': sample_weight}) + + params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']} + + clf = GridSearchCV(rew, params, scoring=scoring, cv=5, iid=False) + clf.fit(X, y, **{'sample_weight': sample_weight}) + # print(clf.best_score_) From 0183449ddb87366d0c1cc8719078da8f8157e52b Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 12:32:22 -0400 Subject: [PATCH 19/61] updated README --- aif360/sklearn/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 558ac562..da318ced 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -27,7 +27,7 @@ objects with sample properties (protected attributes) as the index - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s - [ ] **[External]** `get_feature_names()` from data preprocessing steps that would remove DataFrame formatting - - [ ] [SLEP008](https://github.com/scikit-learn/enhancement_proposals/pull/18 )? 
+ - [ ] SLEP007/8 - [ ] Prejudice remover - [ ] Adversarial debiasing - [ ] Meta-fair classifier @@ -38,8 +38,7 @@ objects with sample properties (protected attributes) as the index - [ ] Learning fair representations - [ ] Optimized preprocessing - [X] Reweighing - - [X] Use dynamic object to pass sample_weight to estimator, etc. after they - are fitted (NOTE: does not work with GridSearchCV) + - [X] Meta-estimator workaround - [ ] **[External]** SLEP006 - Sample properties - [ ] Make postprocessing algorithms compatible - [ ] **[External]** Allow for `fit(y_true, y_pred)` From 89b4a79253c5009b8a17bcb543369815db2887a0 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 19:48:56 -0400 Subject: [PATCH 20/61] fixed docstring formatting --- aif360/sklearn/datasets/openml_datasets.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 98d0aa31..37122b17 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -15,9 +15,9 @@ def to_dataframe(data): if needed. Args: - data (Bunch): Dict-like object containing `data`, `feature_names` and, - optionally, `categories` attributes. Note: `data` should contain - both X and y data. + data (Bunch): Dict-like object containing ``data``, ``feature_names`` + and, optionally, ``categories`` attributes. Note: ``data`` should + contain both X and y data. Returns: pandas.DataFrame: A DataFrame containing all data, including target, @@ -99,9 +99,10 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], Protected attributes are 'sex' ('male' is privileged and 'female' is unprivileged) and 'age' (binarized by default as recommended by - [#kamiran09]_: `age >= 25` is considered privileged and `age < 25` is - considered unprivileged; see the `binary_age` flag to keep this continuous). 
- The outcome variable is 'good' (favorable) or 'bad' (unfavorable). + [#kamiran09]_: ``age >= 25`` is considered privileged and ``age < 25`` is + considered unprivileged; see the ``binary_age`` flag to keep this + continuous). The outcome variable is 'good' (favorable) or 'bad' + (unfavorable). References: .. [#kamiran09] F. Kamiran and T. Calders, "Classifying without @@ -113,8 +114,8 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], for the datasets. By default all AIF360 datasets are stored in 'aif360/sklearn/data/raw' subfolders. binary_age (bool, optional): If `True`, split protected attribute, - `age`, into 'aged' (privileged) and 'youth' (unprivileged). The - `age` feature remains continuous. + ``age``, into 'aged' (privileged) and 'youth' (unprivileged). The + ``age`` feature remains continuous. usecols (single label or list-like, optional): Column name(s) to keep. All others are dropped. dropcols (single label or list-like, optional): Column name(s) to drop. 
From d57b6df1ec2bcc313b344d924d828f409c3429fb Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 19:50:38 -0400 Subject: [PATCH 21/61] changed metrics to use prot_attr --- aif360/sklearn/metrics/metrics.py | 86 +++++++++++++--------- aif360/sklearn/preprocessing/reweighing.py | 41 ++--------- aif360/sklearn/tests/test_metrics.py | 55 ++++++++------ aif360/sklearn/utils.py | 53 +++++++++++++ 4 files changed, 145 insertions(+), 90 deletions(-) create mode 100644 aif360/sklearn/utils.py diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index d4da5c81..4d87490e 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -2,39 +2,52 @@ from sklearn.metrics import make_scorer, recall_score from sklearn.neighbors import NearestNeighbors +from aif360.sklearn.utils import check_groups + + +__all__ = [ + 'consistency_score', 'specificity_score', 'selection_rate', + 'disparate_impact_ratio', 'statistical_parity_difference', + 'equal_opportunity_difference', 'average_odds_difference', + 'average_odds_error', 'generalized_entropy_error', + 'between_group_generalized_entropy_error' +] # ============================= META-METRICS =================================== -def difference(func, y, *args, groups, priv_group=1, sample_weight=None, **kwargs): +def difference(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, + **kwargs): """Compute the difference between unprivileged and privileged subsets for an arbitrary metric. Note: The optimal value of a difference is 0. To make it a scorer, one must - take the absolute value and set `greater_is_better` to False. + take the absolute value and set ``greater_is_better`` to False. Unprivileged group is taken to be the inverse of the privileged group. Args: - func (function): A metric function from `aif360.sklearn.metrics` or - `sklearn.metrics`. 
+ func (function): A metric function from :mod:`sklearn.metrics` or + :mod:`aif360.sklearn.metrics.metrics`. y (array-like): Outcome vector with protected attributes as index. - *args: Additional positional args to be passed through to `func`. - groups (array-like, keyword-only): Group labels (protected attributes) - for the samples. + *args: Additional positional args to be passed through to ``func``. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y`` are used. priv_group (scalar, optional): Label value for the privileged group. sample_weight (array-like, optional): Sample weights passed through to - `func`. - **kwargs: Additional keyword args to be passed through to `func`. + ``func``. + **kwargs: Additional keyword args to be passed through to ``func``. Returns: - scalar: Difference in metric value for unprivileged and privileged groups. + scalar: Difference in metric value for unprivileged and privileged + groups. Examples: >>> X, y = fetch_german(numeric_only=True) >>> y_pred = LogisticRegression().fit(X, y).predict(X) - >>> sex = X.index.get_level_values('sex') - >>> difference(precision_score, y, y_pred, groups=sex, priv_group='male') + >>> difference(precision_score, y, y_pred, prot_attr='sex', + ... priv_group='male') -0.06955430006277463 """ + groups, _ = check_groups(y, prot_attr) idx = (groups == priv_group) unpriv = map(lambda a: a[~idx], (y,) + args) priv = map(lambda a: a[idx], (y,) + args) @@ -43,30 +56,32 @@ def difference(func, y, *args, groups, priv_group=1, sample_weight=None, **kwarg - func(*priv, sample_weight=sample_weight[idx], **kwargs)) return func(*unpriv, **kwargs) - func(*priv, **kwargs) -def ratio(func, y, *args, groups, priv_group=1, sample_weight=None, **kwargs): +def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, + **kwargs): """Compute the ratio between unprivileged and privileged subsets for an arbitrary metric. 
Note: The optimal value of a ratio is 1. To make it a scorer, one must - subtract 1, take the absolute value, and set `greater_is_better` to False. + subtract 1, take the absolute value, and set ``greater_is_better`` to False. Unprivileged group is taken to be the inverse of the privileged group. Args: - func (function): A metric function from `aif360.sklearn.metrics` or - `sklearn.metrics`. + func (function): A metric function from :mod:`sklearn.metrics` or + :mod:`aif360.sklearn.metrics.metrics`. y (array-like): Outcome vector with protected attributes as index. - *args: Additional positional args to be passed through to `func`. + *args: Additional positional args to be passed through to ``func``. groups (array-like, keyword-only): Group labels (protected attributes) for the samples. priv_group (scalar, optional): Label value for the privileged group. sample_weight (array-like, optional): Sample weights passed through to - `func`. - **kwargs: Additional keyword args to be passed through to `func`. + ``func``. + **kwargs: Additional keyword args to be passed through to ``func``. Returns: scalar: Ratio of metric values for unprivileged and privileged groups. 
""" + groups, _ = check_groups(y, prot_attr) idx = (groups == priv_group) unpriv = map(lambda a: a[~idx], (y,) + args) priv = map(lambda a: a[idx], (y,) + args) @@ -107,40 +122,40 @@ def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): # ============================ GROUP FAIRNESS ================================== -def statistical_parity_difference(*y, groups, priv_group=1, pos_label=1, +def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - return difference(rate, *y, groups=groups, priv_group=priv_group, + return difference(rate, *y, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) -def disparate_impact_ratio(*y, groups, priv_group=1, pos_label=1, +def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): rate = base_rate if len(y) == 1 or y[1] is None else selection_rate - return ratio(rate, *y, groups=groups, priv_group=priv_group, + return ratio(rate, *y, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) -def equal_opportunity_difference(y_true, y_pred, groups, priv_group=1, +def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): - return difference(recall_score, y_true, y_pred, groups=groups, + return difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) -def average_odds_difference(y_true, y_pred, groups, priv_group=1, pos_label=1, +def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, neg_label=0, sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, groups=groups, + tnr_diff = difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, neg_label=neg_label, 
sample_weight=sample_weight) - tpr_diff = difference(recall_score, y_true, y_pred, groups=groups, + tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) return (tpr_diff - tnr_diff) / 2 -def average_odds_error(y_true, y_pred, groups, priv_group=1, pos_label=1, +def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, neg_label=0, sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, groups=groups, + tnr_diff = difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, neg_label=neg_label, sample_weight=sample_weight) - tpr_diff = difference(recall_score, y_true, y_pred, groups=groups, + tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) return (abs(tnr_diff) + abs(tpr_diff)) / 2 @@ -157,13 +172,14 @@ def generalized_entropy_index(b, alpha=2): return ((b / b.mean())**alpha - 1).mean() / (alpha * (alpha - 1)) def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): - # sample_weight=None): + # sample_weight=None): b = 1 + (y_pred == pos_label) - (y_true == pos_label) return generalized_entropy_index(b, alpha=alpha) -def between_group_generalized_entropy_error(y_true, y_pred, groups, +def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, priv_group=None, alpha=2, pos_label=1): + groups = check_groups(y_true, prot_attr) b = np.empty_like(y_true, dtype='float') if priv_group is not None: groups = [1 if g == priv_group else 0 for g in groups] @@ -205,7 +221,7 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): # return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, # sample_weight=sample_weight) -def mean_difference(*y, groups, priv_group=1, pos_label=1, sample_weight=None): +def mean_difference(*y, prot_attr=None, priv_group=1, 
pos_label=1, sample_weight=None): """Alias of :func:`statistical_parity_difference`.""" - return statistical_parity_difference(*y, groups=groups, priv_group=priv_group, + return statistical_parity_difference(*y, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index 58cb13eb..5a80c457 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -1,21 +1,10 @@ import numpy as np -from pandas.core.dtypes.common import is_list_like from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone -from sklearn.utils import check_consistent_length from sklearn.utils.metaestimators import if_delegate_has_method -from sklearn.utils.validation import column_or_1d, has_fit_parameter +from sklearn.utils.validation import has_fit_parameter +from aif360.sklearn.utils import check_inputs, check_groups -def check_inputs(X, y, sample_weight): - if not hasattr(X, 'index'): - raise TypeError("Expected `DataFrame`, got {} instead.".format(type(X))) - y = column_or_1d(y) - if sample_weight is not None: - sample_weight = column_or_1d(sample_weight) - else: - sample_weight = np.ones(X.shape[0]) - check_consistent_length(X, y, sample_weight) - return X, y, sample_weight class Reweighing(BaseEstimator): """Reweighing is a preprocessing technique that weights the examples in each @@ -53,7 +42,7 @@ def __init__(self, prot_attr=None): prot_attr (single label or list-like, optional): Protected attribute(s) to use as sensitive attribute(s) in the reweighing process. If more than one attribute, all combinations of values - (intersections) are considered. Default is `None` meaning all + (intersections) are considered. Default is ``None`` meaning all protected attributes from the dataset are used. 
""" self.prot_attr = prot_attr @@ -77,22 +66,8 @@ def fit_transform(self, X, y, sample_weight=None): """ X, y, sample_weight = check_inputs(X, y, sample_weight) - all_prot_attrs = X.index.names[1:] - if self.prot_attr is None: - self.prot_attr_ = all_prot_attrs - elif not is_list_like(self.prot_attr): - self.prot_attr_ = [self.prot_attr] - else: - self.prot_attr_ = self.prot_attr - - if any(p not in X.index.names for p in self.prot_attr_): - raise ValueError("Some of the attributes provided are not present " - "in the dataset. Expected a subset of:\n{}\nGot:\n" - "{}".format(all_prot_attrs, self.prot_attr_)) - self.sample_weight_ = np.empty_like(sample_weight) - groups = X.index.droplevel(list(set(X.index.names) - - set(self.prot_attr_))).to_flat_index() + groups, self.prot_attr_ = check_groups(X, self.prot_attr) # TODO: maintain categorical ordering self.groups_ = np.unique(groups) self.classes_ = np.unique(y) @@ -132,18 +107,18 @@ def fit(self, X, y, sample_weight=None): self.estimator_.fit(X, y, sample_weight=self.reweigher_.sample_weight_) return self - @if_delegate_has_method('estimator') + @if_delegate_has_method('estimator_') def predict(self, X): return self.estimator_.predict(X) - @if_delegate_has_method('estimator') + @if_delegate_has_method('estimator_') def predict_proba(self, X): return self.estimator_.predict_proba(X) - @if_delegate_has_method('estimator') + @if_delegate_has_method('estimator_') def predict_log_proba(self, X): return self.estimator_.predict_log_proba(X) - @if_delegate_has_method('estimator') + @if_delegate_has_method('estimator_') def score(self, X, y, sample_weight=None): return self.estimator_.score(X, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index a2db21ec..e470f32e 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -4,17 +4,22 @@ from aif360.datasets import AdultDataset from aif360.sklearn.datasets import 
fetch_adult from aif360.metrics import ClassificationMetric -from aif360.sklearn.metrics import * +from aif360.sklearn.metrics import ( + consistency_score, specificity_score, selection_rate, + disparate_impact_ratio, statistical_parity_difference, + equal_opportunity_difference, average_odds_difference, + average_odds_error, generalized_entropy_error, + between_group_generalized_entropy_error) X, y, sample_weight = fetch_adult(numeric_only=True) y = y.factorize(sort=True)[0] y_pred = LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight).predict(X) -priv = X.index.get_level_values('sex') adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], - features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', - 'hours-per-week'], features_to_drop=[]) + features_to_keep=['age', 'education-num', 'capital-gain', + 'capital-loss', 'hours-per-week'], + features_to_drop=[]) adult_pred = adult.copy() adult_pred.labels = y_pred cm = ClassificationMetric(adult, adult_pred, @@ -29,37 +34,43 @@ def test_consistency(): assert np.isclose(consistency_score(X, y), cm.consistency()) def test_specificity(): - assert specificity_score(y, y_pred, sample_weight=sample_weight) == cm.specificity() + spec = specificity_score(y, y_pred, sample_weight=sample_weight) + assert spec == cm.specificity() def test_selection_rate(): - assert selection_rate(y, y_pred, sample_weight=sample_weight) == cm.selection_rate() + select = selection_rate(y, y_pred, sample_weight=sample_weight) + assert select == cm.selection_rate() def test_disparate_impact(): - assert disparate_impact_ratio(y, y_pred, groups=priv, priv_group='Male', - sample_weight=sample_weight) == cm.disparate_impact() + di = disparate_impact_ratio(y, y_pred, prot_attr='sex', priv_group='Male', + sample_weight=sample_weight) + assert di == cm.disparate_impact() def test_statistical_parity(): - assert statistical_parity_difference(y, y_pred, groups=priv, priv_group='Male', - 
sample_weight=sample_weight) == cm.statistical_parity_difference() + stat = statistical_parity_difference(y, y_pred, prot_attr='sex', + priv_group='Male', sample_weight=sample_weight) + assert stat == cm.statistical_parity_difference() def test_equal_opportunity(): - assert equal_opportunity_difference(y, y_pred, groups=priv, priv_group='Male', - sample_weight=sample_weight) == cm.equal_opportunity_difference() + eopp = equal_opportunity_difference(y, y_pred, prot_attr='sex', + priv_group='Male', sample_weight=sample_weight) + assert eopp == cm.equal_opportunity_difference() def test_average_odds_difference(): - assert np.isclose(average_odds_difference(y, y_pred, groups=priv, priv_group='Male', - sample_weight=sample_weight), - cm.average_odds_difference()) + aod = average_odds_difference(y, y_pred, prot_attr='sex', priv_group='Male', + sample_weight=sample_weight) + assert np.isclose(aod, cm.average_odds_difference()) def test_average_odds_error(): - assert np.isclose(average_odds_error(y, y_pred, groups=priv, priv_group='Male', - sample_weight=sample_weight), - cm.average_abs_odds_difference()) + aoe = average_odds_error(y, y_pred, prot_attr='sex', priv_group='Male', + sample_weight=sample_weight) + assert np.isclose(aoe, cm.average_abs_odds_difference()) def test_generalized_entropy_index(): - assert np.isclose(generalized_entropy_error(y, y_pred), - cm.generalized_entropy_index()) + gei = generalized_entropy_error(y, y_pred) + assert np.isclose(gei, cm.generalized_entropy_index()) def test_between_group_generalized_entropy_index(): - assert between_group_generalized_entropy_error(y, y_pred, groups=priv, priv_group='Male') \ - == cm.between_group_generalized_entropy_index() + bggei = between_group_generalized_entropy_error(y, y_pred, prot_attr='sex', + priv_group='Male') + assert bggei == cm.between_group_generalized_entropy_index() diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py new file mode 100644 index 00000000..e18646bf --- /dev/null +++ 
b/aif360/sklearn/utils.py @@ -0,0 +1,53 @@ +from pandas.core.dtypes.common import is_list_like +from sklearn.utils import check_consistent_length +from sklearn.utils.validation import column_or_1d + + +def check_inputs(X, y, sample_weight): + if not hasattr(X, 'index'): + raise TypeError("Expected `DataFrame`, got {} instead.".format( + type(X).__name__)) + y = column_or_1d(y) + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + else: + sample_weight = np.ones(X.shape[0]) + check_consistent_length(X, y, sample_weight) + return X, y, sample_weight + +def check_groups(X, prot_attr): + """Validates ``X`` and returns ``groups`` and ``prot_attr``. + + Args: + X (`pandas.Series` or `pandas.DataFrame`): . + prot_attr (single label or list-like): Protected attribute(s). If + ``None``, all protected attributes in ``X`` are used. + + Returns: + (`pandas.Index`, list-like): + + * **groups** (`pandas.Index`) -- Label (or tuple of labels) of + protected attribute for each sample in ``X``. + * **prot_attr** (list-like) -- Modified input. If input is a single + label, returns single-item list. If input is ``None`` returns list + of all protected attributes. + """ + if not hasattr(X, 'index'): + raise TypeError( + "Expected `Series` or `DataFrame`, got {} instead.".format( + type(X).__name__)) + + all_prot_attrs = X.index.names[1:] + if prot_attr is None: + prot_attr = all_prot_attrs + elif not is_list_like(prot_attr): + prot_attr = [prot_attr] + + if any(p not in X.index.names for p in prot_attr): + raise ValueError("Some of the attributes provided are not present " + "in the dataset. 
Expected a subset of:\n{}\nGot:\n" + "{}".format(all_prot_attrs, prot_attr)) + + groups = X.index.droplevel(list(set(X.index.names) - set(prot_attr))) + + return groups.to_flat_index(), prot_attr From d8958bbd67f6f899d5e42adc76f346fab31f4d38 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 24 Jun 2019 19:51:44 -0400 Subject: [PATCH 22/61] added __all__ to __init__s --- aif360/sklearn/metrics/__init__.py | 19 ++++++++++++++++++- aif360/sklearn/preprocessing/__init__.py | 4 ++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/aif360/sklearn/metrics/__init__.py b/aif360/sklearn/metrics/__init__.py index ceaef288..84aa3f1e 100644 --- a/aif360/sklearn/metrics/__init__.py +++ b/aif360/sklearn/metrics/__init__.py @@ -1 +1,18 @@ -from aif360.sklearn.metrics.metrics import * +from aif360.sklearn.metrics.metrics import consistency_score +from aif360.sklearn.metrics.metrics import specificity_score +from aif360.sklearn.metrics.metrics import selection_rate +from aif360.sklearn.metrics.metrics import disparate_impact_ratio +from aif360.sklearn.metrics.metrics import statistical_parity_difference +from aif360.sklearn.metrics.metrics import equal_opportunity_difference +from aif360.sklearn.metrics.metrics import average_odds_difference +from aif360.sklearn.metrics.metrics import average_odds_error +from aif360.sklearn.metrics.metrics import generalized_entropy_error +from aif360.sklearn.metrics.metrics import between_group_generalized_entropy_error + +__all__ = [ + 'consistency_score', 'specificity_score', 'selection_rate', + 'disparate_impact_ratio', 'statistical_parity_difference', + 'equal_opportunity_difference', 'average_odds_difference', + 'average_odds_error', 'generalized_entropy_error', + 'between_group_generalized_entropy_error' +] diff --git a/aif360/sklearn/preprocessing/__init__.py b/aif360/sklearn/preprocessing/__init__.py index f49b7673..61a0431d 100644 --- a/aif360/sklearn/preprocessing/__init__.py +++ 
b/aif360/sklearn/preprocessing/__init__.py @@ -1 +1,5 @@ from aif360.sklearn.preprocessing.reweighing import Reweighing, ReweighingMeta + +__all__ = [ + 'Reweighing', 'ReweighingMeta' +] From 0bd3837bfd8ac6f4ca9fc6f4d2876affd8fdadc1 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 27 Jun 2019 12:35:31 -0400 Subject: [PATCH 23/61] updated notebook with reweighing example --- aif360/sklearn/examples/Getting Started.ipynb | 208 ++++++++++++------ 1 file changed, 146 insertions(+), 62 deletions(-) diff --git a/aif360/sklearn/examples/Getting Started.ipynb b/aif360/sklearn/examples/Getting Started.ipynb index 0df0db33..b65f8f78 100644 --- a/aif360/sklearn/examples/Getting Started.ipynb +++ b/aif360/sklearn/examples/Getting Started.ipynb @@ -15,12 +15,11 @@ "source": [ "import numpy as np\n", "import pandas as pd\n", - "from sklearn.pipeline import make_pipeline\n", "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import recall_score\n", - "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, recall_score, make_scorer\n", + "from sklearn.model_selection import GridSearchCV, train_test_split\n", "\n", - "from aif360.sklearn.algorithms.preprocessing import Reweighing\n", + "from aif360.sklearn.preprocessing import ReweighingMeta\n", "from aif360.sklearn.datasets import fetch_adult\n", "from aif360.sklearn.metrics import disparate_impact_ratio" ] @@ -68,6 +67,7 @@ " \n", " \n", " \n", + " \n", " age\n", " workclass\n", " education\n", @@ -83,6 +83,7 @@ " native-country\n", " \n", " \n", + " \n", " race\n", " sex\n", " \n", @@ -102,6 +103,7 @@ " \n", " \n", " \n", + " 0\n", " Non-white\n", " Male\n", " 25.0\n", @@ -119,7 +121,8 @@ " United-States\n", " \n", " \n", - " White\n", + " 1\n", + " White\n", " Male\n", " 38.0\n", " Private\n", @@ -136,6 +139,8 @@ " United-States\n", " \n", " \n", + " 2\n", + " White\n", " Male\n", " 28.0\n", " Local-gov\n", @@ -152,6 +157,7 @@ " 
United-States\n", " \n", " \n", + " 3\n", " Non-white\n", " Male\n", " 44.0\n", @@ -169,6 +175,7 @@ " United-States\n", " \n", " \n", + " 5\n", " White\n", " Male\n", " 34.0\n", @@ -190,37 +197,37 @@ "" ], "text/plain": [ - " age workclass education education-num \\\n", - "race sex \n", - "Non-white Male 25.0 Private 11th 7.0 \n", - "White Male 38.0 Private HS-grad 9.0 \n", - " Male 28.0 Local-gov Assoc-acdm 12.0 \n", - "Non-white Male 44.0 Private Some-college 10.0 \n", - "White Male 34.0 Private 10th 6.0 \n", + " age workclass education education-num \\\n", + " race sex \n", + "0 Non-white Male 25.0 Private 11th 7.0 \n", + "1 White Male 38.0 Private HS-grad 9.0 \n", + "2 White Male 28.0 Local-gov Assoc-acdm 12.0 \n", + "3 Non-white Male 44.0 Private Some-college 10.0 \n", + "5 White Male 34.0 Private 10th 6.0 \n", "\n", - " marital-status occupation relationship \\\n", - "race sex \n", - "Non-white Male Never-married Machine-op-inspct Own-child \n", - "White Male Married-civ-spouse Farming-fishing Husband \n", - " Male Married-civ-spouse Protective-serv Husband \n", - "Non-white Male Married-civ-spouse Machine-op-inspct Husband \n", - "White Male Never-married Other-service Not-in-family \n", + " marital-status occupation relationship \\\n", + " race sex \n", + "0 Non-white Male Never-married Machine-op-inspct Own-child \n", + "1 White Male Married-civ-spouse Farming-fishing Husband \n", + "2 White Male Married-civ-spouse Protective-serv Husband \n", + "3 Non-white Male Married-civ-spouse Machine-op-inspct Husband \n", + "5 White Male Never-married Other-service Not-in-family \n", "\n", - " race sex capital-gain capital-loss hours-per-week \\\n", - "race sex \n", - "Non-white Male Non-white Male 0.0 0.0 40.0 \n", - "White Male White Male 0.0 0.0 50.0 \n", - " Male White Male 0.0 0.0 40.0 \n", - "Non-white Male Non-white Male 7688.0 0.0 40.0 \n", - "White Male White Male 0.0 0.0 30.0 \n", + " race sex capital-gain capital-loss hours-per-week \\\n", + " race sex 
\n", + "0 Non-white Male Non-white Male 0.0 0.0 40.0 \n", + "1 White Male White Male 0.0 0.0 50.0 \n", + "2 White Male White Male 0.0 0.0 40.0 \n", + "3 Non-white Male Non-white Male 7688.0 0.0 40.0 \n", + "5 White Male White Male 0.0 0.0 30.0 \n", "\n", - " native-country \n", - "race sex \n", - "Non-white Male United-States \n", - "White Male United-States \n", - " Male United-States \n", - "Non-white Male United-States \n", - "White Male United-States " + " native-country \n", + " race sex \n", + "0 Non-white Male United-States \n", + "1 White Male United-States \n", + "2 White Male United-States \n", + "3 Non-white Male United-States \n", + "5 White Male United-States " ] }, "execution_count": 2, @@ -242,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -267,6 +274,7 @@ " \n", " \n", " \n", + " \n", " age\n", " education-num\n", " race\n", @@ -276,6 +284,7 @@ " hours-per-week\n", " \n", " \n", + " \n", " race\n", " sex\n", " \n", @@ -289,6 +298,7 @@ " \n", " \n", " \n", + " 7916\n", " Non-white\n", " Female\n", " 18.0\n", @@ -300,7 +310,8 @@ " 20.0\n", " \n", " \n", - " White\n", + " 26447\n", + " White\n", " Male\n", " 55.0\n", " 9.0\n", @@ -311,6 +322,8 @@ " 40.0\n", " \n", " \n", + " 20889\n", + " White\n", " Female\n", " 43.0\n", " 9.0\n", @@ -321,6 +334,8 @@ " 40.0\n", " \n", " \n", + " 30145\n", + " White\n", " Male\n", " 44.0\n", " 11.0\n", @@ -331,6 +346,8 @@ " 40.0\n", " \n", " \n", + " 7473\n", + " White\n", " Male\n", " 41.0\n", " 9.0\n", @@ -345,31 +362,33 @@ "" ], "text/plain": [ - " age education-num race sex capital-gain capital-loss \\\n", - "race sex \n", - "Non-white Female 18.0 7.0 0.0 0.0 0.0 0.0 \n", - "White Male 55.0 9.0 1.0 1.0 0.0 0.0 \n", - " Female 43.0 9.0 1.0 0.0 0.0 0.0 \n", - " Male 44.0 11.0 1.0 1.0 4386.0 0.0 \n", - " Male 41.0 9.0 1.0 1.0 0.0 0.0 \n", + " age education-num race sex capital-gain \\\n", + " race sex \n", + "7916 Non-white Female 18.0 7.0 0.0 
0.0 0.0 \n", + "26447 White Male 55.0 9.0 1.0 1.0 0.0 \n", + "20889 White Female 43.0 9.0 1.0 0.0 0.0 \n", + "30145 White Male 44.0 11.0 1.0 1.0 4386.0 \n", + "7473 White Male 41.0 9.0 1.0 1.0 0.0 \n", "\n", - " hours-per-week \n", - "race sex \n", - "Non-white Female 20.0 \n", - "White Male 40.0 \n", - " Female 40.0 \n", - " Male 40.0 \n", - " Male 55.0 " + " capital-loss hours-per-week \n", + " race sex \n", + "7916 Non-white Female 0.0 20.0 \n", + "26447 White Male 0.0 40.0 \n", + "20889 White Female 0.0 40.0 \n", + "30145 White Male 0.0 40.0 \n", + "7473 White Male 0.0 55.0 " ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X, y, _ = fetch_adult(numeric_only=True)\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)\n", + "X, y, sample_weight = fetch_adult(numeric_only=True)\n", + "(X_train, X_test,\n", + " y_train, y_test,\n", + " sw_train, sw_test) = train_test_split(X, y, sample_weight, train_size=0.7, random_state=123)\n", "X_train.head()" ] }, @@ -421,7 +440,7 @@ ], "source": [ "sex = y_test.index.get_level_values('sex')\n", - "disparate_impact_ratio(y_test, y_pred, groups=sex, priv_group='Male', pos_label='>50K')" + "disparate_impact_ratio(y_test, y_pred, prot_attr='sex', priv_group='Male', pos_label='>50K')" ] }, { @@ -435,21 +454,86 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Not yet implemented." 
+ "`ReweighingMeta` is a workaround until changing sample weights can be handled properly in `Pipeline`/`GridSearchCV`" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index([(7916, 'Non-white', 'Female'), (26447, 'White', 'Male'),\n", + " (20889, 'White', 'Female'), (30145, 'White', 'Male'),\n", + " (7473, 'White', 'Male'), (29361, 'White', 'Male'),\n", + " (12277, 'White', 'Male'), (44372, 'White', 'Male'),\n", + " (32291, 'White', 'Female'), (44411, 'White', 'Female'),\n", + " ...\n", + " (38298, 'White', 'Male'), (4173, 'White', 'Male'),\n", + " (7854, 'White', 'Male'), (16424, 'White', 'Female'),\n", + " (2087, 'White', 'Male'), (16120, 'White', 'Male'),\n", + " (24476, 'White', 'Male'), (8295, 'White', 'Female'),\n", + " (1449, 'White', 'Male'), (33323, 'White', 'Male')],\n", + " dtype='object', length=6838)\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'accuracy_score' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mclf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrew\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m 
\u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'sample_weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msw_train\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;34m\"\"\"Search all candidates in 
param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0;31m# remaining jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 920\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 921\u001b[0;31m \u001b[0;32mif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 922\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 923\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 758\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 759\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 760\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 761\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0mjob_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 717\u001b[0m \u001b[0;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[0;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 182\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 183\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[0;31m# Don't delay the application, to 
avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 550\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m 226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m 226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[0;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0mfit_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 553\u001b[0m \u001b[0;31m# _score will return dict if is_multimetric is True\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 554\u001b[0;31m \u001b[0mtest_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 555\u001b[0m \u001b[0mscore_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0mstart_time\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mfit_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_score\u001b[0;34m(estimator, X_test, y_test, scorer, is_multimetric)\u001b[0m\n\u001b[1;32m 595\u001b[0m \"\"\"\n\u001b[1;32m 596\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_multimetric_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 598\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_multimetric_score\u001b[0;34m(estimator, X_test, y_test, scorers)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 627\u001b[0;31m \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 628\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 629\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'item'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/metrics/scorer.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m return self._sign * self._score_func(y_true, y_pred,\n\u001b[0;32m---> 97\u001b[0;31m **self._kwargs)\n\u001b[0m\u001b[1;32m 98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mscore_func\u001b[0;34m(y_true, y_pred, sample_weight)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my_true\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_flat_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mscoring\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mmake_scorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore_func\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'sample_weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'accuracy_score' is not defined" + ] + } + ], "source": [ - "pipe = make_pipeline(Reweighing(), LinearRegression())\n", - "# sample_weight_ will be updated after it is fit\n", - "fit_params = {'linearregression__sample_weight':\n", - " pipe.named_steps.reweighing.sample_weight_}\n", - "pipe.fit(X, y, **fit_params)" + "rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))\n", + "\n", + "# UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597\n", + "def score_func(y_true, y_pred, sample_weight):\n", + " idx = y_true.index.to_flat_index()\n", + " print(idx)\n", + " return accuracy_score(y_true, y_pred, sample_weight=sample_weight[idx])\n", + "scoring = make_scorer(score_func, **{'sample_weight': sample_weight})\n", + "\n", + "params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']}\n", + "\n", + "clf = GridSearchCV(rew, params, scoring=scoring, cv=5)\n", + "clf.fit(X_train, y_train, **{'sample_weight': sw_train})\n", + "clf.score(X_test, y_test)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [] } ], "metadata": { From 4107dd71c514700f72efe97ef0a93c4321e2237b Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 11 Jul 2019 16:13:51 -0400 Subject: [PATCH 24/61] initial adversarial debiasing port --- aif360/sklearn/inprocessing/__init__.py | 5 + .../inprocessing/adversarial_debiasing.py | 228 ++++++++++++++++++ aif360/sklearn/utils.py | 1 + 3 files changed, 234 insertions(+) create mode 100644 aif360/sklearn/inprocessing/__init__.py 
create mode 100644 aif360/sklearn/inprocessing/adversarial_debiasing.py diff --git a/aif360/sklearn/inprocessing/__init__.py b/aif360/sklearn/inprocessing/__init__.py new file mode 100644 index 00000000..863d3676 --- /dev/null +++ b/aif360/sklearn/inprocessing/__init__.py @@ -0,0 +1,5 @@ +from aif360.sklearn.inprocessing.adversarial_debiasing import AdversarialDebiasing + +__all__ = [ + 'AdversarialDebiasing' +] diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py new file mode 100644 index 00000000..e82e287d --- /dev/null +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -0,0 +1,228 @@ +import numpy as np +from scipy.special import softmax +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_is_fitted, check_random_state +import tensorflow as tf + +from aif360.sklearn.utils import check_inputs, check_groups + + +class AdversarialDebiasing(BaseEstimator, ClassifierMixin): + """Adversarial debiasing is an in-processing technique that learns a + classifier to maximize prediction accuracy and simultaneously reduce an + adversary's ability to determine the protected attribute from the + predictions [#zhang18]_. This approach leads to a fair classifier as the + predictions cannot carry any group discrimination information that the + adversary can exploit. + + References: + .. [#zhang18] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating + Unwanted Biases with Adversarial Learning," AAAI/ACM Conference on + Artificial Intelligence, Ethics, and Society, 2018. 
+ """ + + def __init__(self, prot_attr=None, adversary_loss_weight=0.1, num_epochs=50, + batch_size=128, classifier_num_hidden_units=200, debias=True, + verbose=True, random_state=None): + + self.prot_attr = prot_attr + self.adversary_loss_weight = adversary_loss_weight + self.num_epochs = num_epochs + self.batch_size = batch_size + self.classifier_num_hidden_units = classifier_num_hidden_units + self.debias = debias + self.verbose = verbose + self.random_state = random_state + + @property + def classifier_logits_(self): + check_is_fitted(self, ['input_ph', 'keep_prob']) + with tf.variable_scope('classifier_model'): + W1 = tf.get_variable( + 'W1', [self.input_ph.shape[1], self.classifier_num_hidden_units], + initializer=tf.contrib.layers.xavier_initializer()) + b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), + name='b1') + + h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1) + h1 = tf.nn.dropout(h1, keep_prob=self.keep_prob) + + W2 = tf.get_variable( + 'W2', [self.classifier_num_hidden_units, 1], + initializer=tf.contrib.layers.xavier_initializer()) + b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') + + pred_logits = tf.matmul(h1, W2) + b2 + + return pred_logits + + @property + def adversary_logits_(self): + """Compute the adversary predictions for the protected attribute.""" + check_is_fitted(self, ['classifier_logits_', 'true_labels_ph']) + with tf.variable_scope("adversary_model"): + c = tf.get_variable('c', initializer=tf.constant(1.0)) + s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits) + + W2 = tf.get_variable('W2', [3, 1], + initializer=tf.contrib.layers.xavier_initializer()) + b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') + + pred_prot_attr_logits = tf.matmul( + tf.concat([s, s * self.true_labels_ph, s * (1.0 - self.true_labels_ph)], axis=1), + W2) + b2 + # pred_prot_attr_labels = tf.sigmoid(pred_prot_attr_logit) + + return pred_prot_attr_logits + + def _train(self, X, y, groups): + + + def fit(self, X, y): + rng = 
check_random_state(self.random_state) + # tf.random.seed(random_state) + + groups, self.prot_attr_ = check_groups(X, self.prot_attr) + lb = LabelBinarizer() + y = lb.fit_transform(y) + # TODO: LabelEncoder for groups + self.groups_ = np.unique(groups) + self.classes_ = lb.classes_ + self.sess_ = tf.Session() + + n_samples, n_features = X.shape + n_classes = len(self.classes_) + n_groups = len(self.groups_) + + with tf.variable_scope('adversarial_debiasing'): + n_samples, n_features = X.shape + n_classes = len(self.classes_) + n_groups = len(self.groups_) + + # Setup placeholders + self.input_ph = tf.placeholder(tf.float32, shape=[None, n_features]) + self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, n_groups]) + self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, n_classes]) + self.keep_prob = tf.placeholder(tf.float32) + + # Obtain classifier loss + loss_fn = (tf.nn.sigmoid_cross_entropy_with_logits if n_classes == 1 + else tf.nn.softmax_cross_entropy_with_logits) + # clf_loss = loss_fn(labels=self.true_labels_ph, logits=self.classifier_logits_) + # clf_loss = tf.reduce_sum(sample_weight * clf_loss) / tf.reduce_sum(sample_weight) + clf_loss = tf.reduce_mean(loss_fn(labels=self.true_labels_ph, + logits=self.classifier_logits_)) + + if self.debias: + # Obtain adversary loss + loss_fn = (tf.nn.sigmoid_cross_entropy_with_logits if n_groups == 1 + else tf.nn.softmax_cross_entropy_with_logits) + adv_loss = tf.reduce_mean(loss_fn(labels=self.prot_attr_ph, + logits=self.adversary_logits_)) + + # Setup optimizers with learning rates + global_step = tf.Variable(0, trainable=False) + starter_learning_rate = 0.001 + learning_rate = tf.train.exponential_decay( + starter_learning_rate, global_step, 1000, 0.96, staircase=True) + clf_opt = tf.train.AdamOptimizer(learning_rate) + if self.debias: + adv_opt = tf.train.AdamOptimizer(learning_rate) + + clf_vars = [var for var in tf.trainable_variables() + if 'classifier_model' in var.name] + if self.debias: + 
adv_vars = [var for var in tf.trainable_variables() + if 'adversary_model' in var.name] + # Update classifier parameters + adv_grads = {var: grad for (grad, var) in + adv_opt.compute_gradients(adv_loss, var_list=clf_vars)} + + normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny) + + clf_grads = [] + for (grad, var) in clf_opt.compute_gradients(clf_loss, var_list=clf_vars): + if self.debias: + unit_adv_grad = normalize(adv_grads[var]) + # proj_{adv_grad} clf_grad: + grad -= tf.reduce_sum(grad * unit_adv_grad) * unit_adv_grad + grad -= self.adversary_loss_weight * adv_grads[var] + clf_grads.append((grad, var)) + clf_minimizer = clf_opt.apply_gradients(clf_grads, global_step=global_step) + + if self.debias: + # Update adversary parameters + adv_minimizer = adv_opt.minimize(adv_loss, var_list=adv_vars, + global_step=global_step) + + self.sess_.run(tf.global_variables_initializer()) + self.sess_.run(tf.local_variables_initializer()) + + # Begin training + for epoch in range(self.num_epochs): + # TODO: why rng.choice(n_samples, n_samples)? 
+ shuffled_ids = rng.shuffle(np.arange(n_samples)) + for i in range(n_samples // self.batch_size): + batch_ids = shuffled_ids[self.batch_size * i:self.batch_size * (i+1)] + batch_features = X[batch_ids] + batch_labels = y[batch_ids] + batch_prot_attr = groups[batch_ids] + batch_feed_dict = {self.input_ph: batch_features, + self.true_labels_ph: batch_labels, + self.prot_attr_ph: batch_prot_attr, + self.keep_prob: 0.8} + if self.debias: + _, _, clf_loss_value, adv_loss_value = ( + self.sess_.run([clf_minimizer, adv_minimizer, + clf_loss, adv_loss], + feed_dict=batch_feed_dict)) + if i % 200 == 0 and self.verbose: + print("epoch {}; iter: {}; batch classifier loss: " + "{}; batch adversarial loss: {}".format( + epoch, i, clf_loss_value, + adv_loss_value)) + else: + _, clf_loss_value = self.sess_.run( + [clf_minimizer, clf_loss], + feed_dict=batch_feed_dict) + if i % 200 == 0 and self.verbose: + print("epoch {}; iter: {}; batch classifier loss: " + "{}".format(epoch, i, clf_loss_value)) + + return self + + def decision_function(self, X): + check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob', 'classifier_logits_']) + n_samples = X.shape[0] + groups, _ = check_groups(X, self.prot_attr_) + + samples_covered = 0 + scores = np.empty((n_samples, len(self.classes_))) + while samples_covered < n_samples: + start = samples_covered + end = samples_covered + self.batch_size + if end > n_samples: + end = n_samples + + batch_ids = np.arange(start, end) + batch_features = X[batch_ids] + batch_prot_attr = groups[batch_ids] + + batch_feed_dict = {self.input_ph: batch_features, + self.keep_prob: 1.0} + + # batch_logits = self.sess_.run(self.classifier_logits_, feed_dict=batch_feed_dict) + scores[batch_ids] = self.sess_.run(self.classifier_logits_, + feed_dict=batch_feed_dict) + samples_covered += len(batch_features) + + return scores + + def predict_proba(self, X): + decision = self.decision_function(X) + return softmax(decision, axis=1) + + def predict(self, X): + indices = 
self.decision_function(X).argmax(axis=1) + return self.classes_[indices] diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index e18646bf..6c850e70 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -1,3 +1,4 @@ +import numpy as np from pandas.core.dtypes.common import is_list_like from sklearn.utils import check_consistent_length from sklearn.utils.validation import column_or_1d From df85e42318a11b6d84ea8b264c6173f03acf5ed5 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 15 Jul 2019 22:27:44 -0400 Subject: [PATCH 25/61] multiclass/multigroup support for adv debiasing --- .../inprocessing/adversarial_debiasing.py | 120 ++++++++---------- 1 file changed, 56 insertions(+), 64 deletions(-) diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py index e82e287d..2d4bc7a0 100644 --- a/aif360/sklearn/inprocessing/adversarial_debiasing.py +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -22,10 +22,11 @@ class AdversarialDebiasing(BaseEstimator, ClassifierMixin): Artificial Intelligence, Ethics, and Society, 2018. 
""" - def __init__(self, prot_attr=None, adversary_loss_weight=0.1, num_epochs=50, - batch_size=128, classifier_num_hidden_units=200, debias=True, - verbose=True, random_state=None): + def __init__(self, sess, prot_attr=None, adversary_loss_weight=0.1, + num_epochs=50, batch_size=128, classifier_num_hidden_units=200, + debias=True, verbose=True, random_state=None): + self.sess = sess self.prot_attr = prot_attr self.adversary_loss_weight = adversary_loss_weight self.num_epochs = num_epochs @@ -37,10 +38,12 @@ def __init__(self, prot_attr=None, adversary_loss_weight=0.1, num_epochs=50, @property def classifier_logits_(self): - check_is_fitted(self, ['input_ph', 'keep_prob']) + check_is_fitted(self, ['input_ph', 'keep_prob', 'classes_']) + n_features = self.input_ph.shape[1] + n_classes = len(self.classes_) with tf.variable_scope('classifier_model'): W1 = tf.get_variable( - 'W1', [self.input_ph.shape[1], self.classifier_num_hidden_units], + 'W1', [n_features, self.classifier_num_hidden_units], initializer=tf.contrib.layers.xavier_initializer()) b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1') @@ -49,9 +52,9 @@ def classifier_logits_(self): h1 = tf.nn.dropout(h1, keep_prob=self.keep_prob) W2 = tf.get_variable( - 'W2', [self.classifier_num_hidden_units, 1], + 'W2', [self.classifier_num_hidden_units, n_classes], initializer=tf.contrib.layers.xavier_initializer()) - b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') + b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2') pred_logits = tf.matmul(h1, W2) + b2 @@ -60,72 +63,62 @@ def classifier_logits_(self): @property def adversary_logits_(self): """Compute the adversary predictions for the protected attribute.""" - check_is_fitted(self, ['classifier_logits_', 'true_labels_ph']) + check_is_fitted(self, ['classifier_logits_', 'true_labels_ph', 'groups_']) + n_groups = len(self.groups_) with tf.variable_scope("adversary_model"): c = tf.get_variable('c', initializer=tf.constant(1.0)) - s = 
tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits) + s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_) - W2 = tf.get_variable('W2', [3, 1], - initializer=tf.contrib.layers.xavier_initializer()) - b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') + W2 = tf.get_variable('W2', [3, n_groups], + initializer=tf.contrib.layers.xavier_initializer()) + b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2') pred_prot_attr_logits = tf.matmul( - tf.concat([s, s * self.true_labels_ph, s * (1.0 - self.true_labels_ph)], axis=1), + tf.concat([s, s * self.true_labels_ph, + s * (1.0 - self.true_labels_ph)], axis=1), W2) + b2 - # pred_prot_attr_labels = tf.sigmoid(pred_prot_attr_logit) return pred_prot_attr_logits - def _train(self, X, y, groups): - - def fit(self, X, y): rng = check_random_state(self.random_state) # tf.random.seed(random_state) groups, self.prot_attr_ = check_groups(X, self.prot_attr) - lb = LabelBinarizer() - y = lb.fit_transform(y) - # TODO: LabelEncoder for groups - self.groups_ = np.unique(groups) - self.classes_ = lb.classes_ - self.sess_ = tf.Session() + le = LabelEncoder() + y = le.fit_transform(y) + self.classes_ = le.classes_ + groups = le.fit_transform(groups) + self.groups_ = le.classes_ n_samples, n_features = X.shape - n_classes = len(self.classes_) - n_groups = len(self.groups_) with tf.variable_scope('adversarial_debiasing'): - n_samples, n_features = X.shape - n_classes = len(self.classes_) - n_groups = len(self.groups_) - # Setup placeholders self.input_ph = tf.placeholder(tf.float32, shape=[None, n_features]) - self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, n_groups]) - self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, n_classes]) + self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, 1]) + self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, 1]) self.keep_prob = tf.placeholder(tf.float32) + global_step = tf.train.get_or_create_global_step() + starter_learning_rate = 0.001 + learning_rate = 
tf.train.exponential_decay(starter_learning_rate, + global_step, 1000, 0.96, staircase=True) + # Obtain classifier loss - loss_fn = (tf.nn.sigmoid_cross_entropy_with_logits if n_classes == 1 - else tf.nn.softmax_cross_entropy_with_logits) - # clf_loss = loss_fn(labels=self.true_labels_ph, logits=self.classifier_logits_) - # clf_loss = tf.reduce_sum(sample_weight * clf_loss) / tf.reduce_sum(sample_weight) - clf_loss = tf.reduce_mean(loss_fn(labels=self.true_labels_ph, - logits=self.classifier_logits_)) + clf_loss = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=self.true_labels_ph, + logits=self.classifier_logits_)) if self.debias: # Obtain adversary loss - loss_fn = (tf.nn.sigmoid_cross_entropy_with_logits if n_groups == 1 - else tf.nn.softmax_cross_entropy_with_logits) - adv_loss = tf.reduce_mean(loss_fn(labels=self.prot_attr_ph, - logits=self.adversary_logits_)) + adv_loss = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=self.prot_attr_ph, + logits=self.adversary_logits_)) - # Setup optimizers with learning rates - global_step = tf.Variable(0, trainable=False) - starter_learning_rate = 0.001 - learning_rate = tf.train.exponential_decay( - starter_learning_rate, global_step, 1000, 0.96, staircase=True) + # Setup optimizers clf_opt = tf.train.AdamOptimizer(learning_rate) if self.debias: adv_opt = tf.train.AdamOptimizer(learning_rate) @@ -135,9 +128,11 @@ def fit(self, X, y): if self.debias: adv_vars = [var for var in tf.trainable_variables() if 'adversary_model' in var.name] - # Update classifier parameters + # Compute grad wrt classifier parameters adv_grads = {var: grad for (grad, var) in adv_opt.compute_gradients(adv_loss, var_list=clf_vars)} + # Update adversary parameters (don't increment global step yet) + adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars) normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny) @@ -149,22 +144,17 @@ def fit(self, X, y): grad -= tf.reduce_sum(grad * 
unit_adv_grad) * unit_adv_grad grad -= self.adversary_loss_weight * adv_grads[var] clf_grads.append((grad, var)) - clf_minimizer = clf_opt.apply_gradients(clf_grads, global_step=global_step) - - if self.debias: - # Update adversary parameters - adv_minimizer = adv_opt.minimize(adv_loss, var_list=adv_vars, - global_step=global_step) + clf_min = clf_opt.apply_gradients(clf_grads, global_step=global_step) - self.sess_.run(tf.global_variables_initializer()) - self.sess_.run(tf.local_variables_initializer()) + self.sess.run(tf.global_variables_initializer()) # Begin training for epoch in range(self.num_epochs): # TODO: why rng.choice(n_samples, n_samples)? shuffled_ids = rng.shuffle(np.arange(n_samples)) for i in range(n_samples // self.batch_size): - batch_ids = shuffled_ids[self.batch_size * i:self.batch_size * (i+1)] + batch_ids = shuffled_ids[self.batch_size * i: + self.batch_size * (i+1)] batch_features = X[batch_ids] batch_labels = y[batch_ids] batch_prot_attr = groups[batch_ids] @@ -174,8 +164,8 @@ def fit(self, X, y): self.keep_prob: 0.8} if self.debias: _, _, clf_loss_value, adv_loss_value = ( - self.sess_.run([clf_minimizer, adv_minimizer, - clf_loss, adv_loss], + self.sess.run([clf_min, adv_min, + clf_loss, adv_loss], feed_dict=batch_feed_dict)) if i % 200 == 0 and self.verbose: print("epoch {}; iter: {}; batch classifier loss: " @@ -183,8 +173,8 @@ def fit(self, X, y): epoch, i, clf_loss_value, adv_loss_value)) else: - _, clf_loss_value = self.sess_.run( - [clf_minimizer, clf_loss], + _, clf_loss_value = self.sess.run( + [clf_min, clf_loss], feed_dict=batch_feed_dict) if i % 200 == 0 and self.verbose: print("epoch {}; iter: {}; batch classifier loss: " @@ -193,9 +183,12 @@ def fit(self, X, y): return self def decision_function(self, X): - check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob', 'classifier_logits_']) + check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob', + 'classifier_logits_']) n_samples = X.shape[0] groups, _ = check_groups(X, 
self.prot_attr_) + le = LabelEncoder().fit(self.groups_) + groups = le.transform(groups) samples_covered = 0 scores = np.empty((n_samples, len(self.classes_))) @@ -212,9 +205,8 @@ def decision_function(self, X): batch_feed_dict = {self.input_ph: batch_features, self.keep_prob: 1.0} - # batch_logits = self.sess_.run(self.classifier_logits_, feed_dict=batch_feed_dict) - scores[batch_ids] = self.sess_.run(self.classifier_logits_, - feed_dict=batch_feed_dict) + scores[batch_ids] = self.sess.run(self.classifier_logits_, + feed_dict=batch_feed_dict) samples_covered += len(batch_features) return scores From d2d0ddcdc2fa9cfba108315d742f0430907b82a3 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 30 Jul 2019 14:15:28 -0400 Subject: [PATCH 26/61] fix build errors --- aif360/sklearn/datasets/utils.py | 5 +---- aif360/sklearn/metrics/metrics.py | 16 +++++++++++----- aif360/sklearn/tests/test_metrics.py | 4 +++- aif360/sklearn/utils.py | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 4566c983..f3e10117 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -23,7 +23,7 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, usecols (single label or list-like, optional): Column(s) to keep. All others are dropped. dropcols (single label or list-like, optional): Column(s) to drop. - numeric_only (bool): Drop all non-numeric feature columns. + numeric_only (bool): Drop all non-numeric, non-binary feature columns. dropna (bool): Drop rows with NAs. 
Returns: @@ -58,10 +58,7 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ df = df.set_index(protected_attributes, drop=False, append=True) - # df = df.set_index(sample_weight or np.ones(df.shape[0]), append=True) - # df.index = df.index.set_names('sample_weight', level=-1) - # TODO: convert to 1/0 if numeric_only? y = df.pop(target) # Column-wise drops diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 4d87490e..eca1bf95 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.metrics import make_scorer, recall_score from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_X_y from aif360.sklearn.utils import check_groups @@ -62,7 +63,8 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, arbitrary metric. Note: The optimal value of a ratio is 1. To make it a scorer, one must - subtract 1, take the absolute value, and set ``greater_is_better`` to False. + take the minimum of the ratio and its inverse, subtract it from 1, and set + ``greater_is_better`` to False. Unprivileged group is taken to be the inverse of the privileged group. 
@@ -97,8 +99,10 @@ def make_difference_scorer(func): greater_is_better=False) def make_ratio_scorer(func): - return make_scorer(lambda y, y_pred, **kw: abs(func(y, y_pred, **kw) - 1), - greater_is_better=False) + def score_fn(y, y_pred, **kwargs): + ratio = func(y, y_pred, **kwargs) + return 1 - min(ratio, 1/ratio) + return make_scorer(score_fn, greater_is_better=False) # ================================ HELPERS ===================================== @@ -179,7 +183,7 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, priv_group=None, alpha=2, pos_label=1): - groups = check_groups(y_true, prot_attr) + groups, _ = check_groups(y_true, prot_attr) b = np.empty_like(y_true, dtype='float') if priv_group is not None: groups = [1 if g == priv_group else 0 for g in groups] @@ -199,9 +203,11 @@ def coefficient_of_variation(b): # Is consistency_difference posible? # use sample_weight? def consistency_score(X, y, n_neighbors=5): + # cast as ndarrays + X, y = check_X_y(X, y) # learn a KNN on the features nbrs = NearestNeighbors(n_neighbors, algorithm='ball_tree').fit(X) - _, indices = nbrs.kneighbors(X) + indices = nbrs.kneighbors(X, return_distance=False) # compute consistency score return 1 - abs(y - y[indices].mean(axis=1)).mean() diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index e470f32e..0c040edd 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from sklearn.linear_model import LogisticRegression from aif360.datasets import AdultDataset @@ -13,7 +14,8 @@ X, y, sample_weight = fetch_adult(numeric_only=True) -y = y.factorize(sort=True)[0] +# y = y.cat.rename_categories(range(len(y.cat.categories))) +y = pd.Series(y.factorize(sort=True)[0], name=y.name, index=y.index) y_pred = LogisticRegression(solver='liblinear').fit(X, y, 
sample_weight=sample_weight).predict(X) adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index 6c850e70..bfec0351 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -38,7 +38,7 @@ def check_groups(X, prot_attr): "Expected `Series` or `DataFrame`, got {} instead.".format( type(X).__name__)) - all_prot_attrs = X.index.names[1:] + all_prot_attrs = [name for name in X.index.names if name] # not None or '' if prot_attr is None: prot_attr = all_prot_attrs elif not is_list_like(prot_attr): From 7a2414a3f1c048094c780aff33e253b5d487eed9 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 12 Aug 2019 14:44:16 -0400 Subject: [PATCH 27/61] Add ensure_binary option to check_groups --- aif360/sklearn/utils.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index bfec0351..1fb75d3f 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -16,39 +16,48 @@ def check_inputs(X, y, sample_weight): check_consistent_length(X, y, sample_weight) return X, y, sample_weight -def check_groups(X, prot_attr): - """Validates ``X`` and returns ``groups`` and ``prot_attr``. +def check_groups(arr, prot_attr, ensure_binary=False): + """Validates ``arr`` and returns ``groups`` and ``prot_attr``. Args: - X (`pandas.Series` or `pandas.DataFrame`): . + arr (`pandas.Series` or `pandas.DataFrame`): A Pandas object containing + protected attribute information in the index. prot_attr (single label or list-like): Protected attribute(s). If - ``None``, all protected attributes in ``X`` are used. + ``None``, all protected attributes in ``arr`` are used. + ensure_binary (bool): Raise an error if the resultant groups are not + binary. 
Returns: - (`pandas.Index`, list-like): + tuple: * **groups** (`pandas.Index`) -- Label (or tuple of labels) of - protected attribute for each sample in ``X``. + protected attribute for each sample in ``arr``. * **prot_attr** (list-like) -- Modified input. If input is a single label, returns single-item list. If input is ``None`` returns list of all protected attributes. """ - if not hasattr(X, 'index'): + if not hasattr(arr, 'index'): raise TypeError( "Expected `Series` or `DataFrame`, got {} instead.".format( - type(X).__name__)) + type(arr).__name__)) - all_prot_attrs = [name for name in X.index.names if name] # not None or '' + all_prot_attrs = [name for name in arr.index.names if name] # not None or '' if prot_attr is None: prot_attr = all_prot_attrs elif not is_list_like(prot_attr): prot_attr = [prot_attr] - if any(p not in X.index.names for p in prot_attr): + if any(p not in arr.index.names for p in prot_attr): raise ValueError("Some of the attributes provided are not present " "in the dataset. 
Expected a subset of:\n{}\nGot:\n" "{}".format(all_prot_attrs, prot_attr)) - groups = X.index.droplevel(list(set(X.index.names) - set(prot_attr))) + groups = arr.index.droplevel(list(set(arr.index.names) - set(prot_attr))) + groups = groups.to_flat_index() + + n_unique = groups.nunique() + if ensure_binary and n_unique != 2: + raise ValueError("Expected 2 protected attribute groups, got {}".format( + groups.unique() if n_unique > 5 else n_unique)) - return groups.to_flat_index(), prot_attr + return groups, prot_attr From aac9954e87f3666af7f60a540648d52d71d560c0 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 15:02:48 -0400 Subject: [PATCH 28/61] `numeric_only` converts index and label as well --- aif360/sklearn/datasets/openml_datasets.py | 11 ++- aif360/sklearn/datasets/utils.py | 81 ++++------------------ 2 files changed, 20 insertions(+), 72 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 37122b17..562cd734 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -88,7 +88,7 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], ordered=True).fillna('Non-white') df.sex = df.sex.cat.as_ordered() # 'Female' < 'Male' - return standarize_dataset(df, protected_attributes=['race', 'sex'], + return standarize_dataset(df, prot_attr=['race', 'sex'], target='annual-income', sample_weight='fnlwgt', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) @@ -161,10 +161,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df = df.join(personal_status.astype('category')) df.sex = df.sex.cat.as_ordered() # 'female' < 'male' - return standarize_dataset(df, protected_attributes=['sex', age], - target='credit-risk', usecols=usecols, - dropcols=dropcols, numeric_only=numeric_only, - dropna=dropna) + return standarize_dataset(df, prot_attr=['sex', age], 
target='credit-risk', + usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', numeric_only=False, dropna=False): @@ -215,6 +214,6 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) - return standarize_dataset(df, protected_attributes='age', target='deposit', + return standarize_dataset(df, prot_attr='age', target='deposit', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index f3e10117..964f34d9 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -1,21 +1,17 @@ from collections import namedtuple -import pandas as pd from pandas.core.dtypes.common import is_list_like -from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder -from sklearn.utils.validation import check_is_fitted -def standarize_dataset(df, protected_attributes, target, sample_weight=None, - usecols=[], dropcols=[], numeric_only=False, dropna=True): +def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], + dropcols=[], numeric_only=False, dropna=True): """Separate data, targets, and possibly sample weights and populate protected attributes as sample properties. Args: df (pandas.DataFrame): DataFrame with features and target together. - protected_attributes (single label or list-like): Label or list of - labels corresponding to protected attribute columns. Even if these - are dropped from the features, they remain in the index. + prot_attr (single label or list-like): Label or list of labels + corresponding to protected attribute columns. 
Even if these are + dropped from the features, they remain in the index. target (single label or list-like): Column label of the target (outcome) variable. sample_weight (single label, optional): Name of the column containing @@ -47,85 +43,38 @@ def standarize_dataset(df, protected_attributes, target, sample_weight=None, >>> from sklearn.linear_model import LinearRegression >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['X', 'y', 'Z']) - >>> train = standarize_dataset(df, protected_attributes='Z', target='y') + >>> train = standarize_dataset(df, prot_attr='Z', target='y') >>> reg = LinearRegression().fit(*train) >>> import numpy as np >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5))) - >>> X, y = standarize_dataset(df, protected_attributes=0, target=5) + >>> X, y = standarize_dataset(df, prot_attr=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ - df = df.set_index(protected_attributes, drop=False, append=True) - - y = df.pop(target) - # Column-wise drops - df = df.drop(dropcols, axis=1) + df = df.drop(columns=dropcols) if usecols: if not is_list_like(usecols): # make sure we don't return a Series instead of a DataFrame usecols = [usecols] df = df[usecols] + if numeric_only: - # binary categorical columns -> 1/0 for col in df.select_dtypes('category'): - # TODO: allow any size ordered categorical? 
- if len(df[col].cat.categories) == 2 and df[col].cat.ordered: + if df[col].cat.ordered: df[col] = df[col].factorize(sort=True)[0] df = df.select_dtypes(['number', 'bool']) - # upcast all feature dimensions to a consistent numerical dtype - df = df.apply(pd.to_numeric, axis=1) + # Index-wise drops if dropna: - notna = df.notna().all(axis=1) & y.notna() - df = df.loc[notna] - y = y.loc[notna] + df.dropna() + + df = df.set_index(prot_attr, drop=False, append=True) + y = df.pop(target) if sample_weight is not None: return namedtuple('WeightedDataset', ['X', 'y', 'sample_weight'])( df, y, df.pop(sample_weight).rename('sample_weight')) return namedtuple('Dataset', ['X', 'y'])(df, y) - -def make_onehot_transformer(): - """Shortcut for encoding categorical features as one-hot vectors. - - Note: - This changes the column order. - - Returns: - sklearn.compose.ColumnTransformer: Class capable of transforming - categorical features in X to one-hot features. - """ - class PandasOutOneHotTransformer(ColumnTransformer): - def __init__(self): - ohe = ('onehotencoder', OneHotEncoder(), - lambda X: X.dtypes == 'category') - super().__init__([ohe], remainder='passthrough') - - def get_feature_names(self): - check_is_fitted(self, 'transformers_') - dummies = self.named_transformers_.onehotencoder.get_feature_names( - input_features=self.ohe_input_features_) - passthroughs = self.passthrough_features_ - return list(dummies) + list(passthroughs) - - def fit(self, X, y=None): - self.ohe_input_features_ = X.columns[X.dtypes == 'category'] - self.passthrough_features_ = X.columns[X.dtypes != 'category'] - return super().fit(X, y=y) - - def fit_transform(self, X, y=None): - Xt = super().fit_transform(X, y=y) - self.ohe_input_features_ = X.columns[X.dtypes == 'category'] - self.passthrough_features_ = X.columns[X.dtypes != 'category'] - columns = self.get_feature_names() - return pd.DataFrame(Xt, columns=columns, index=X.index) - - def transform(self, X): - Xt = super().transform(X) - 
columns = self.get_feature_names() - return pd.DataFrame(Xt, columns=columns, index=X.index) - - return PandasOutOneHotTransformer() From dc317cf496b767a0e41dee4a8ad8a2c990993642 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 15:06:35 -0400 Subject: [PATCH 29/61] changed Reweighing to return X, sample_weight removed Reweighing.sample_weight_ attribute --- aif360/sklearn/preprocessing/reweighing.py | 26 ++++++++++++---------- aif360/sklearn/tests/test_reweighing.py | 4 ++-- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index 5a80c457..c73b96fe 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -16,8 +16,6 @@ class Reweighing(BaseEstimator): transformer. classes_ (array, shape (n_classes,)): A list of class labels known to the transformer. - sample_weight_ (array, shape (n_samples,)): New sample weights after - transformation. See examples for details. reweigh_factors_ (array, shape (n_groups, n_labels)): Reweighing factors for each combination of group and class labels used to debias samples. Existing sample weights are multiplied by the corresponding @@ -61,12 +59,14 @@ def fit_transform(self, X, y, sample_weight=None): sample_weight (array-like, optional): Sample weights. Returns: - X: Unchanged samples. Only the sample weights are different after - transformation (see the `sample_weight_` attribute). + tuple: + + **X** -- Unchanged samples. + **sample_weight** -- Transformed sample weights. 
""" X, y, sample_weight = check_inputs(X, y, sample_weight) - self.sample_weight_ = np.empty_like(sample_weight) + sample_weight_t = np.empty_like(sample_weight) groups, self.prot_attr_ = check_groups(X, self.prot_attr) # TODO: maintain categorical ordering self.groups_ = np.unique(groups) @@ -82,16 +82,13 @@ def N_(i): return sample_weight[i].sum() g_and_c = (groups == g) & (y == c) if np.any(g_and_c): W_gc = N_(groups == g) * N_(y == c) / (N * N_(g_and_c)) - self.sample_weight_[g_and_c] = W_gc * sample_weight[g_and_c] + sample_weight_t[g_and_c] = W_gc * sample_weight[g_and_c] self.reweigh_factors_[i, j] = W_gc - return X + return X, sample_weight_t class ReweighingMeta(BaseEstimator, MetaEstimatorMixin): def __init__(self, estimator, reweigher=Reweighing()): - if not has_fit_parameter(estimator, 'sample_weight'): - raise TypeError("`estimator` (type: {}) does not have fit parameter" - " `sample_weight`.".format(type(estimator))) self.reweigher = reweigher self.estimator = estimator @@ -100,11 +97,16 @@ def _estimator_type(self): return self.estimator._estimator_type def fit(self, X, y, sample_weight=None): + if not has_fit_parameter(self.estimator, 'sample_weight'): + raise TypeError("`estimator` (type: {}) does not have fit parameter" + " `sample_weight`.".format(type(self.estimator))) + self.reweigher_ = clone(self.reweigher) self.estimator_ = clone(self.estimator) - self.reweigher_.fit_transform(X, y, sample_weight=sample_weight) - self.estimator_.fit(X, y, sample_weight=self.reweigher_.sample_weight_) + X, sample_weight = self.reweigher_.fit_transform(X, y, + sample_weight=sample_weight) + self.estimator_.fit(X, y, sample_weight=sample_weight) return self @if_delegate_has_method('estimator_') diff --git a/aif360/sklearn/tests/test_reweighing.py b/aif360/sklearn/tests/test_reweighing.py index f1e2a223..97631043 100644 --- a/aif360/sklearn/tests/test_reweighing.py +++ b/aif360/sklearn/tests/test_reweighing.py @@ -24,12 +24,12 @@ def test_reweighing_sex(): 
privileged_groups=[{'sex': 1}]) adult_fair = orig_rew.fit_transform(adult) rew = Reweighing('sex') - rew.fit_transform(X, y, sample_weight=sample_weight) + _, new_sample_weight = rew.fit_transform(X, y, sample_weight=sample_weight) # assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav], # [orig_rew.w_p_unfav, orig_rew.w_p_fav]], # rew.reweigh_factors_) - assert np.allclose(adult_fair.instance_weights, rew.sample_weight_) + assert np.allclose(adult_fair.instance_weights, new_sample_weight) def test_reweighing_intersection(): rew = Reweighing() From 0f184c3b5e71bd59e9dd291eb5f4b410a60f9ce5 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 15:08:34 -0400 Subject: [PATCH 30/61] made sample_weight optional in check_inputs --- aif360/sklearn/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index 1fb75d3f..28db1e61 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -4,7 +4,7 @@ from sklearn.utils.validation import column_or_1d -def check_inputs(X, y, sample_weight): +def check_inputs(X, y, sample_weight=None): if not hasattr(X, 'index'): raise TypeError("Expected `DataFrame`, got {} instead.".format( type(X).__name__)) From ec4a1de138adb74c08fc7f773358ddd1e1fef722 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 15:11:46 -0400 Subject: [PATCH 31/61] matched tests to new numeric dataset format --- aif360/sklearn/tests/test_metrics.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index 0c040edd..c0a1c6e9 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -1,5 +1,4 @@ import numpy as np -import pandas as pd from sklearn.linear_model import LogisticRegression from aif360.datasets import AdultDataset @@ -14,8 +13,6 @@ X, y, sample_weight = fetch_adult(numeric_only=True) -# y = 
y.cat.rename_categories(range(len(y.cat.categories))) -y = pd.Series(y.factorize(sort=True)[0], name=y.name, index=y.index) y_pred = LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight).predict(X) adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], @@ -44,27 +41,27 @@ def test_selection_rate(): assert select == cm.selection_rate() def test_disparate_impact(): - di = disparate_impact_ratio(y, y_pred, prot_attr='sex', priv_group='Male', + di = disparate_impact_ratio(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert di == cm.disparate_impact() def test_statistical_parity(): stat = statistical_parity_difference(y, y_pred, prot_attr='sex', - priv_group='Male', sample_weight=sample_weight) + sample_weight=sample_weight) assert stat == cm.statistical_parity_difference() def test_equal_opportunity(): eopp = equal_opportunity_difference(y, y_pred, prot_attr='sex', - priv_group='Male', sample_weight=sample_weight) + sample_weight=sample_weight) assert eopp == cm.equal_opportunity_difference() def test_average_odds_difference(): - aod = average_odds_difference(y, y_pred, prot_attr='sex', priv_group='Male', + aod = average_odds_difference(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert np.isclose(aod, cm.average_odds_difference()) def test_average_odds_error(): - aoe = average_odds_error(y, y_pred, prot_attr='sex', priv_group='Male', + aoe = average_odds_error(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert np.isclose(aoe, cm.average_abs_odds_difference()) @@ -73,6 +70,5 @@ def test_generalized_entropy_index(): assert np.isclose(gei, cm.generalized_entropy_index()) def test_between_group_generalized_entropy_index(): - bggei = between_group_generalized_entropy_error(y, y_pred, prot_attr='sex', - priv_group='Male') + bggei = between_group_generalized_entropy_error(y, y_pred, prot_attr='sex') assert bggei == cm.between_group_generalized_entropy_index() From 
f8c4fc5cda17ad79fb194a29219a39e6853047c4 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 17:20:07 -0400 Subject: [PATCH 32/61] added generalized_fnr/fpr metrics also added warning when dividing by zero in ratio --- aif360/sklearn/metrics/__init__.py | 19 +------- aif360/sklearn/metrics/metrics.py | 77 +++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/aif360/sklearn/metrics/__init__.py b/aif360/sklearn/metrics/__init__.py index 84aa3f1e..ceaef288 100644 --- a/aif360/sklearn/metrics/__init__.py +++ b/aif360/sklearn/metrics/__init__.py @@ -1,18 +1 @@ -from aif360.sklearn.metrics.metrics import consistency_score -from aif360.sklearn.metrics.metrics import specificity_score -from aif360.sklearn.metrics.metrics import selection_rate -from aif360.sklearn.metrics.metrics import disparate_impact_ratio -from aif360.sklearn.metrics.metrics import statistical_parity_difference -from aif360.sklearn.metrics.metrics import equal_opportunity_difference -from aif360.sklearn.metrics.metrics import average_odds_difference -from aif360.sklearn.metrics.metrics import average_odds_error -from aif360.sklearn.metrics.metrics import generalized_entropy_error -from aif360.sklearn.metrics.metrics import between_group_generalized_entropy_error - -__all__ = [ - 'consistency_score', 'specificity_score', 'selection_rate', - 'disparate_impact_ratio', 'statistical_parity_difference', - 'equal_opportunity_difference', 'average_odds_difference', - 'average_odds_error', 'generalized_entropy_error', - 'between_group_generalized_entropy_error' -] +from aif360.sklearn.metrics.metrics import * diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index eca1bf95..4adadda0 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -1,17 +1,20 @@ +import warnings + import numpy as np from sklearn.metrics import make_scorer, recall_score from sklearn.neighbors import NearestNeighbors 
from sklearn.utils import check_X_y +from sklearn.exceptions import UndefinedMetricWarning from aif360.sklearn.utils import check_groups __all__ = [ - 'consistency_score', 'specificity_score', 'selection_rate', + 'base_rate', 'consistency_score', 'specificity_score', 'selection_rate', 'disparate_impact_ratio', 'statistical_parity_difference', 'equal_opportunity_difference', 'average_odds_difference', - 'average_odds_error', 'generalized_entropy_error', - 'between_group_generalized_entropy_error' + 'average_odds_error', 'generalized_entropy_error', 'generalized_fnr', + 'between_group_generalized_entropy_error', 'generalized_fpr' ] # ============================= META-METRICS =================================== @@ -88,9 +91,18 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, unpriv = map(lambda a: a[~idx], (y,) + args) priv = map(lambda a: a[idx], (y,) + args) if sample_weight is not None: - return (func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) - / func(*priv, sample_weight=sample_weight[idx], **kwargs)) - return func(*unpriv, **kwargs) / func(*priv, **kwargs) + numerator = func(*unpriv, sample_weight=sample_weight[~idx], **kwargs) + denominator = func(*priv, sample_weight=sample_weight[idx], **kwargs) + else: + numerator = func(*unpriv, **kwargs) + denominator = func(*priv, **kwargs) + + if denominator == 0: + warnings.warn("The ratio is ill-defined and being set to 0.0 because " + "the {} for privileged samples is 0.".format(func.__name__), + UndefinedMetricWarning) + + return numerator / denominator # =========================== SCORER FACTORIES ================================= @@ -106,6 +118,7 @@ def score_fn(y, y_pred, **kwargs): # ================================ HELPERS ===================================== +# TODO: make this more general def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): """Compute the specificity or true negative rate. 
@@ -118,12 +131,32 @@ def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): return recall_score(y_true, y_pred, pos_label=neg_label, sample_weight=sample_weight) -def base_rate(y, y_pred=None, pos_label=1, sample_weight=None): - return np.average(y == pos_label, weights=sample_weight) +def base_rate(y_true, y_pred=None, pos_label=1, sample_weight=None): + return np.average(y_true == pos_label, weights=sample_weight) def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): return base_rate(y_pred, pos_label=pos_label, sample_weight=sample_weight) +def generalized_fpr(y_true, y_pred, pos_label=1, sample_weight=None): + idx = (y_true != pos_label) + if not np.any(idx): + warnings.warn("generalized_fpr is ill-defined because there are no true" + " negatives in y_true.", UndefinedMetricWarning) + return 0. + if sample_weight is None: + return y_pred[idx].mean() + return np.average(y_pred[idx], weights=sample_weight[idx]) + +def generalized_fnr(y_true, y_pred, pos_label=1, sample_weight=None): + idx = (y_true == pos_label) + if not np.any(idx): + warnings.warn("generalized_fnr is ill-defined because there are no true" + " positives in y_true.", UndefinedMetricWarning) + return 0. 
+ if sample_weight is None: + return 1 - y_pred[idx].mean() + return 1 - np.average(y_pred[idx], weights=sample_weight[idx]) + # ============================ GROUP FAIRNESS ================================== def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, @@ -144,25 +177,25 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) -def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, - neg_label=0, sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, - priv_group=priv_group, neg_label=neg_label, - sample_weight=sample_weight) +def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, + pos_label=1, neg_label=0, sample_weight=None): + fpr_diff = -difference(specificity_score, y_true, y_pred, + prot_attr=prot_attr, priv_group=priv_group, + neg_label=neg_label, sample_weight=sample_weight) tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) - return (tpr_diff - tnr_diff) / 2 + return (tpr_diff + fpr_diff) / 2 -def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, - neg_label=0, sample_weight=None): - tnr_diff = difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, - priv_group=priv_group, neg_label=neg_label, - sample_weight=sample_weight) +def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, + pos_label=1, neg_label=0, sample_weight=None): + fpr_diff = -difference(specificity_score, y_true, y_pred, + prot_attr=prot_attr, priv_group=priv_group, + neg_label=neg_label, sample_weight=sample_weight) tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) - return (abs(tnr_diff) + abs(tpr_diff)) / 2 + return 
(abs(tpr_diff) + abs(fpr_diff)) / 2 # ========================== INDIVIDUAL FAIRNESS =============================== @@ -223,8 +256,8 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): # return 1 - recall_score(y_true, y_pred, pos_label=pos_label, # sample_weight=sample_weight) -# def false_positive_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): -# return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, +# def false_positive_rate_error(y_true, y_pred, neg_label=0, sample_weight=None): +# return 1 - specificity_score(y_true, y_pred, neg_label=neg_label, # sample_weight=sample_weight) def mean_difference(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): From 7ce2f42f0e7cd713e9d5100a01f1dc72a53143eb Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 17:27:12 -0400 Subject: [PATCH 33/61] fixed dataset_processing changed bank dataset target to return bool --- aif360/sklearn/datasets/openml_datasets.py | 3 +-- aif360/sklearn/datasets/utils.py | 28 ++++++++++++---------- aif360/sklearn/tests/test_datasets.py | 16 ++++++------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 562cd734..45d8cd7f 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -209,8 +209,7 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit'] # remap target - df.deposit = df.deposit.cat.rename_categories({'1': 'no', '2': 'yes'}) - # df.deposit = df.deposit.cat.as_ordered() + df.deposit = df.deposit.map({'1': False, '2': True}) # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) diff --git 
a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 964f34d9..e714026b 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -35,8 +35,8 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], * **sample_weight** (`pandas.Series`, optional) -- Sample weights. Note: - The order of execution for the dropping parameters is: dropcols -> - usecols -> numeric_only -> dropna. + The order of execution for the dropping parameters is: numeric_only -> + dropcols -> usecols -> dropna. Examples: >>> import pandas as pd @@ -53,6 +53,17 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], >>> X, y = standarize_dataset(df, prot_attr=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ + # TODO: warn user if label in prot_attr, target, or dropcols is already dropped + # TODO: error message if label in usecols is already dropped + if numeric_only: + for col in df.select_dtypes('category'): + if df[col].cat.ordered: + df[col] = df[col].factorize(sort=True)[0] + df = df.select_dtypes(['number', 'bool']) + + df = df.set_index(prot_attr, drop=False, append=True) + y = df.pop(target) + # Column-wise drops df = df.drop(columns=dropcols) if usecols: @@ -61,18 +72,11 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], usecols = [usecols] df = df[usecols] - if numeric_only: - for col in df.select_dtypes('category'): - if df[col].cat.ordered: - df[col] = df[col].factorize(sort=True)[0] - df = df.select_dtypes(['number', 'bool']) - # Index-wise drops if dropna: - df.dropna() - - df = df.set_index(prot_attr, drop=False, append=True) - y = df.pop(target) + notna = df.notna().all(axis=1) & y.notna() + df = df.loc[notna] + y = y.loc[notna] if sample_weight is not None: return namedtuple('WeightedDataset', ['X', 'y', 'sample_weight'])( diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 
4253bcd8..05974f1e 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -5,12 +5,12 @@ import pytest from aif360.sklearn.datasets import fetch_adult, fetch_bank, fetch_german -from aif360.sklearn.datasets import standarize_dataset, make_onehot_transformer +from aif360.sklearn.datasets import standarize_dataset df = pd.DataFrame([[1, 2, 3, 'a'], [5, 6, 7, 'b'], [np.NaN, 10, 11, 'c']], columns=['X1', 'X2', 'y', 'Z']) -basic = partial(standarize_dataset, df=df, protected_attributes='Z', target='y', +basic = partial(standarize_dataset, df=df, prot_attr='Z', target='y', dropna=False) def test_standardize_dataset_basic(): @@ -43,16 +43,16 @@ def test_usecols_dropcols_basic(): basic(usecols=['X1', 'X2'], dropcols='X2') def test_dropna_basic(): - basic_dropna = partial(standarize_dataset, df=df, protected_attributes='Z', + basic_dropna = partial(standarize_dataset, df=df, prot_attr='Z', target='y', dropna=True) assert basic_dropna().X.shape == (2, 3) assert basic(dropcols='X1').X.shape == (3, 2) def test_numeric_only_basic(): - assert basic(numeric_only=True).X.shape == (3, 2) - assert (basic(numeric_only=True).X.dtypes == 'float').all() - assert basic(dropcols='Z', numeric_only=True).X.shape == (3, 2) - assert (basic(dropcols='X1', numeric_only=True).X.dtypes == 'int').all() + assert basic(prot_attr='X2', numeric_only=True).X.shape == (3, 2) + with pytest.raises(KeyError): + assert (basic(prot_attr='X2', dropcols='Z', numeric_only=True).X.shape + == (3, 2)) def test_fetch_adult(): adult = fetch_adult() @@ -76,4 +76,4 @@ def test_fetch_bank(): def test_onehot_transformer(): X, y = fetch_german() - assert len(make_onehot_transformer().fit_transform(X).columns) == 63 + assert len(pd.get_dummies(X).columns) == 63 From 973a7741f04790334b3673412103b6678576bf09 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 29 Oct 2019 17:32:24 -0400 Subject: [PATCH 34/61] initial calibrated equalized odds port bug fix in old implementation 
(weighted cost was calculated incorrectly) --- .../calibrated_eq_odds_postprocessing.py | 2 +- aif360/sklearn/postprocessing/__init__.py | 123 ++++++++++++++++++ .../calibrated_equalized_odds.py | 114 ++++++++++++++++ .../tests/test_calibrated_equalized_odds.py | 47 +++++++ 4 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 aif360/sklearn/postprocessing/__init__.py create mode 100644 aif360/sklearn/postprocessing/calibrated_equalized_odds.py create mode 100644 aif360/sklearn/tests/test_calibrated_equalized_odds.py diff --git a/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py b/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py index 471e2b66..4bae2ed9 100644 --- a/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py +++ b/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py @@ -208,4 +208,4 @@ def weighted_cost(fp_rate, fn_rate, cm, privileged): * (1 - cm.base_rate(privileged=privileged))) + (fn_rate / norm_const * cm.generalized_false_negative_rate(privileged=privileged) - * (1 - cm.base_rate(privileged=privileged)))) + * cm.base_rate(privileged=privileged))) diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py new file mode 100644 index 00000000..49e89d42 --- /dev/null +++ b/aif360/sklearn/postprocessing/__init__.py @@ -0,0 +1,123 @@ +from logging import warning + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone +from sklearn.model_selection import train_test_split +from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils.validation import check_is_fitted + +from aif360.sklearn.postprocessing.calibrated_equalized_odds import CalibratedEqualizedOdds + + +class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): + """ + Attributes: + estimator_: Cloned ``estimator``. + postprocessor_: Cloned ``postprocessor``. 
+ use_proba_ (bool): Determined depending on the postprocessor type if + `use_proba` is None. + """ + + def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), + use_proba=None, val_size=0.25, **options): + """ + Args: + estimator (sklearn.BaseEstimator): Original estimator. + postprocessor: Post-processing algorithm. + use_proba (bool): Use ``self.estimator_.predict_proba()`` instead of + ``self.estimator_.predict()`` as input to postprocessor. If + ``None``, defaults to ``True`` if the postprocessor supports it. + val_size (int or float): Size of validation set used to fit the + postprocessor. The estimator fits on the remainder of the + training set. + See :func:`~sklearn.model_selection.train_test_split` for + details. + **options: Keyword options passed through to + :func:`~sklearn.model_selection.train_test_split`. + Note: 'train_size' and 'test_size' will be ignored in favor of + ``val_size``. + """ + self.estimator = estimator + self.postprocessor = postprocessor + self.val_size = val_size + self.options = options + + @property + def _estimator_type(self): + return self.postprocessor._estimator_type + + def fit(self, X, y, pos_label=1, sample_weight=None): + self.pos_label_ = pos_label + self.use_proba_ = isinstance(self.postprocessor, CalibratedEqualizedOdds) + if self.use_proba_ and not hasattr(self.estimator, 'predict_proba'): + raise TypeError("`estimator` (type: {}) does not implement method " + "`predict_proba()`.".format(type(self.estimator))) + + if 'train_size' in self.options or 'test_size' in self.options: + warning("'train_size' and 'test_size' are ignored in favor of 'val_size'") + options_ = self.options.copy() + options_['test_size'] = self.val_size + if 'train_size' in options_: + del options_['train_size'] + + self.estimator_ = clone(self.estimator) + self.postprocessor_ = clone(self.postprocessor) + + if sample_weight is not None: + X_est, X_post, y_est, y_post, sw_est, sw_post = train_test_split( + X, y, sample_weight, 
**options_) + self.estimator_.fit(X_est, y_est, sample_weight=sw_est) + else: + X_est, X_post, y_est, y_post = train_test_split(X, y, **options_) + self.estimator_.fit(X_est, y_est) + + pos_idx = np.nonzero(self.estimator_.classes_ == pos_label)[0][0] + y_pred = (self.estimator_.predict(X_post) if not self.use_proba_ else + self.estimator_.predict_proba(X_post)[:, pos_idx]) + self.postprocessor_.fit(y_post, y_pred, pos_label=pos_label, + sample_weight=None if sample_weight is None else sw_post) + return self + + @property + def classes_(self): + # order of postprocessor.classes_ may differ from estimator_.classes_ + check_is_fitted(self.postprocessor_, 'classes_') + return self.postprocessor_.classes_ + + @if_delegate_has_method('postprocessor_') + def predict(self, X): + pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + self.estimator_.predict_proba(X)[:, pos_idx]) + y_pred = pd.Series(y_pred, index=X.index) + return self.postprocessor_.predict(y_pred) + + @if_delegate_has_method('postprocessor_') + def predict_proba(self, X): + pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + self.estimator_.predict_proba(X)[:, pos_idx]) + y_pred = pd.Series(y_pred, index=X.index) + return self.postprocessor_.predict_proba(y_pred) + + @if_delegate_has_method('postprocessor_') + def predict_log_proba(self, X): + pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + self.estimator_.predict_proba(X)[:, pos_idx]) + y_pred = pd.Series(y_pred, index=X.index) + return self.postprocessor_.predict_log_proba(y_pred) + + @if_delegate_has_method('postprocessor_') + def score(self, X, y, sample_weight=None): + pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + y_pred = (self.estimator_.predict(X) if not 
self.use_proba_ else + self.estimator_.predict_proba(X)[:, pos_idx]) + y_pred = pd.Series(y_pred, index=X.index) + return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight) + + +__all__ = [ + 'CalibratedEqualizedOdds', 'PostProcessingMeta' +] diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py new file mode 100644 index 00000000..322d331a --- /dev/null +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -0,0 +1,114 @@ +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_random_state + +from aif360.sklearn.metrics import base_rate, generalized_fnr, generalized_fpr +from aif360.sklearn.utils import check_groups + + +class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): + """Calibrated equalized odds postprocessing is a post-processing technique + that optimizes over calibrated classifier score outputs to find + probabilities with which to change output labels with an equalized odds + objective [#pleiss17]_. + + References: + .. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and + K. Q. Weinberger, "On Fairness and Calibration," Conference on Neural + Information Processing Systems, 2017. + `_ + + Adapted from: + https://github.com/gpleiss/equalized_odds_and_calibration/blob/master/calib_eq_odds.py + """ + def __init__(self, prot_attr=None, cost_constraint='weighted', + random_state=None): + """ + Args: + prot_attr (single label or list-like, optional): Protected + attribute(s) to use as sensitive attribute(s) in the post- + processing. If more than one attribute, all combinations of + values (intersections) are considered. Default is ``None`` + meaning all protected attributes from the dataset are used. + Note: This algorithm requires there be exactly 2 groups + (privileged and unprivileged). 
+ cost_constraint ('fpr', 'fnr', or 'weighted'): + random_state (int or numpy.RandomState, optional): + """ + self.prot_attr = prot_attr + self.cost_constraint = cost_constraint + self.random_state = random_state + + def fit(self, y_true, y_pred, pos_label=1, sample_weight=None): + groups, self.prot_attr_ = check_groups(y_true, self.prot_attr) + self.classes_ = np.unique(y_true) + self.groups_ = np.unique(groups) + + if pos_label not in self.classes_: + raise ValueError('pos_label={} is not present in y_true. The valid ' + 'values are:\n{}'.format(pos_label, self.classes_)) + + if len(self.groups_) != 2: + raise ValueError('prot_attr={}\nyielded {} groups:\n{}\nbut this ' + 'algorithm requires a binary division of the ' + 'data.'.format(self.prot_attr_, len(self.groups_), + self.groups_)) + + # ensure self.classes_ = [neg_label, pos_label] + self.classes_ = np.append(np.delete(self.classes_, pos_label), pos_label) + + def args(grp_idx, triv=False): + i = (groups == self.groups_[grp_idx]) + pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else + y_pred) + return dict(y_true=y_true[i], y_pred=pred[i], pos_label=pos_label, + sample_weight=sample_weight[i] if sample_weight is not None else None) + + self.base_rates_ = [base_rate(**args(i)) for i in range(2)] + + def weighted_cost(grp_idx, triv=False): + fpr = generalized_fpr(**args(grp_idx, triv=triv)) + fnr = generalized_fnr(**args(grp_idx, triv=triv)) + base_rate = self.base_rates_[grp_idx] + if self.cost_constraint == 'fpr': + return fpr + elif self.cost_constraint == 'fnr': + return fnr + elif self.cost_constraint == 'weighted': + return fpr * (1 - base_rate) + fnr * base_rate + else: + raise ValueError("`cost_constraint` must be one of: 'fpr', " + "'fnr', or 'weighted'") + + costs = [weighted_cost(i) for i in range(2)] + self.mix_rates_ = [(costs[1] - costs[0]) + / (weighted_cost(0, triv=True) - costs[0]), + (costs[0] - costs[1]) + / (weighted_cost(1, triv=True) - costs[1])] + 
self.mix_rates_[np.argmax(costs)] = 0 + + return self + + def predict_proba(self, y_pred): + rng = check_random_state(self.random_state) + + groups, _ = check_groups(y_pred, self.prot_attr_) + if not set(np.unique(groups)) <= set(self.groups_): + raise ValueError('The protected groups from y_pred:\n{}\ndo not ' + 'match those from the training set:\n{}'.format( + np.unique(groups), self.groups_)) + + yt = np.empty_like(y_pred) + for grp_idx in range(2): + i = (groups == self.groups_[grp_idx]) + to_replace = (rng.rand(sum(i)) < self.mix_rates_[grp_idx]) + new_preds = y_pred[i].copy() + new_preds[to_replace] = self.base_rates_[grp_idx] + yt[i] = new_preds + + return np.stack([1 - yt, yt], axis=-1) + + def predict(self, y_pred): + scores = self.predict_proba(y_pred) + return self.classes_[scores.argmax(axis=1)] diff --git a/aif360/sklearn/tests/test_calibrated_equalized_odds.py b/aif360/sklearn/tests/test_calibrated_equalized_odds.py new file mode 100644 index 00000000..247ba4c8 --- /dev/null +++ b/aif360/sklearn/tests/test_calibrated_equalized_odds.py @@ -0,0 +1,47 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression + +from aif360.datasets import AdultDataset +from aif360.sklearn.datasets import fetch_adult +from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing +from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta + + +X, y, sample_weight = fetch_adult(numeric_only=True) +adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], + features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', + 'hours-per-week'], features_to_drop=[]) + +def test_calib_eq_odds_sex(): + logreg = LogisticRegression(solver='lbfgs', max_iter=500) + y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X)[:, 1] + adult_pred = adult.copy() + adult_pred.scores = y_pred + orig_cal_eq_odds = CalibratedEqOddsPostprocessing( + unprivileged_groups=[{'sex': 0}], 
privileged_groups=[{'sex': 1}]) + orig_cal_eq_odds.fit(adult, adult_pred) + cal_eq_odds = CalibratedEqualizedOdds('sex') + cal_eq_odds.fit(y, y_pred, sample_weight=sample_weight) + + assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1]) + assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0]) + +def test_postprocessingmeta(): + logreg = LogisticRegression(solver='lbfgs', max_iter=500) + + adult_est, adult_post = adult.split([0.75], shuffle=False) + logreg.fit(adult_est.features, adult_est.labels.ravel()) + y_pred = logreg.predict_proba(adult_post.features)[:, 1] + adult_pred = adult_post.copy() + adult_pred.scores = y_pred + orig_cal_eq_odds = CalibratedEqOddsPostprocessing( + unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) + orig_cal_eq_odds.fit(adult_post, adult_pred) + + cal_eq_odds = PostProcessingMeta(estimator=logreg, + postprocessor=CalibratedEqualizedOdds('sex'), shuffle=False) + cal_eq_odds.fit(X, y, sample_weight=sample_weight) + + assert np.allclose([orig_cal_eq_odds.unpriv_mix_rate, + orig_cal_eq_odds.priv_mix_rate], + cal_eq_odds.postprocessor_.mix_rates_) From 40cad96cc4c7340c5d3380483aedc78f1f0cb2f6 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 30 Oct 2019 16:46:25 -0400 Subject: [PATCH 35/61] fixed adversarial debiasing reproducibility --- .../inprocessing/adversarial_debiasing.py | 15 +- .../inprocessing/adversarial_debiasing.py | 195 ++++++++++-------- .../tests/test_adversarial_debiasing.py | 77 +++++++ requirements.txt | 2 +- 4 files changed, 198 insertions(+), 91 deletions(-) create mode 100644 aif360/sklearn/tests/test_adversarial_debiasing.py diff --git a/aif360/algorithms/inprocessing/adversarial_debiasing.py b/aif360/algorithms/inprocessing/adversarial_debiasing.py index 02da1217..3297a96b 100644 --- a/aif360/algorithms/inprocessing/adversarial_debiasing.py +++ b/aif360/algorithms/inprocessing/adversarial_debiasing.py @@ -80,14 +80,14 @@ def _classifier_model(self, 
features, features_dim, keep_prob): """ with tf.variable_scope("classifier_model"): W1 = tf.get_variable('W1', [features_dim, self.classifier_num_hidden_units], - initializer=tf.contrib.layers.xavier_initializer()) + initializer=tf.contrib.layers.xavier_initializer(seed=self.seed1)) b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1') h1 = tf.nn.relu(tf.matmul(features, W1) + b1) - h1 = tf.nn.dropout(h1, keep_prob=keep_prob) + h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2) W2 = tf.get_variable('W2', [self.classifier_num_hidden_units, 1], - initializer=tf.contrib.layers.xavier_initializer()) + initializer=tf.contrib.layers.xavier_initializer(seed=self.seed3)) b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') pred_logit = tf.matmul(h1, W2) + b2 @@ -103,7 +103,7 @@ def _adversary_model(self, pred_logits, true_labels): s = tf.sigmoid((1 + tf.abs(c)) * pred_logits) W2 = tf.get_variable('W2', [3, 1], - initializer=tf.contrib.layers.xavier_initializer()) + initializer=tf.contrib.layers.xavier_initializer(seed=self.seed4)) b2 = tf.Variable(tf.zeros(shape=[1]), name='b2') pred_protected_attribute_logit = tf.matmul(tf.concat([s, s * true_labels, s * (1.0 - true_labels)], axis=1), W2) + b2 @@ -123,6 +123,8 @@ def fit(self, dataset): """ if self.seed is not None: np.random.seed(self.seed) + ii32 = np.iinfo(np.int32) + self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(ii32.min, ii32.max, size=4) # Map the dataset labels to 0 and 1. 
temp_labels = dataset.labels.copy() @@ -177,14 +179,15 @@ def fit(self, dataset): if self.debias: # Update adversary parameters - adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars, global_step=global_step) + with tf.control_dependencies([classifier_minimizer]): + adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step) self.sess.run(tf.global_variables_initializer()) self.sess.run(tf.local_variables_initializer()) # Begin training for epoch in range(self.num_epochs): - shuffled_ids = np.random.choice(num_train_samples, num_train_samples) + shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False) for i in range(num_train_samples//self.batch_size): batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)] batch_features = dataset.features[batch_ids] diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py index 2d4bc7a0..1ba8a248 100644 --- a/aif360/sklearn/inprocessing/adversarial_debiasing.py +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -1,8 +1,9 @@ import numpy as np -from scipy.special import softmax +import scipy.special from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.preprocessing import LabelEncoder -from sklearn.utils import check_is_fitted, check_random_state +from sklearn.utils import check_random_state +from sklearn.utils.validation import check_is_fitted import tensorflow as tf from aif360.sklearn.utils import check_inputs, check_groups @@ -22,12 +23,13 @@ class AdversarialDebiasing(BaseEstimator, ClassifierMixin): Artificial Intelligence, Ethics, and Society, 2018. 
""" - def __init__(self, sess, prot_attr=None, adversary_loss_weight=0.1, - num_epochs=50, batch_size=128, classifier_num_hidden_units=200, - debias=True, verbose=True, random_state=None): + def __init__(self, prot_attr=None, scope_name='classifier', + adversary_loss_weight=0.1, num_epochs=50, batch_size=128, + classifier_num_hidden_units=200, debias=True, verbose=False, + random_state=None): - self.sess = sess self.prot_attr = prot_attr + self.scope_name = scope_name self.adversary_loss_weight = adversary_loss_weight self.num_epochs = num_epochs self.batch_size = batch_size @@ -36,87 +38,103 @@ def __init__(self, sess, prot_attr=None, adversary_loss_weight=0.1, self.verbose = verbose self.random_state = random_state - @property - def classifier_logits_(self): - check_is_fitted(self, ['input_ph', 'keep_prob', 'classes_']) - n_features = self.input_ph.shape[1] - n_classes = len(self.classes_) - with tf.variable_scope('classifier_model'): - W1 = tf.get_variable( - 'W1', [n_features, self.classifier_num_hidden_units], - initializer=tf.contrib.layers.xavier_initializer()) - b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), - name='b1') - - h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1) - h1 = tf.nn.dropout(h1, keep_prob=self.keep_prob) - - W2 = tf.get_variable( - 'W2', [self.classifier_num_hidden_units, n_classes], - initializer=tf.contrib.layers.xavier_initializer()) - b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2') - - pred_logits = tf.matmul(h1, W2) + b2 - - return pred_logits - - @property - def adversary_logits_(self): - """Compute the adversary predictions for the protected attribute.""" - check_is_fitted(self, ['classifier_logits_', 'true_labels_ph', 'groups_']) - n_groups = len(self.groups_) - with tf.variable_scope("adversary_model"): - c = tf.get_variable('c', initializer=tf.constant(1.0)) - s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_) - - W2 = tf.get_variable('W2', [3, n_groups], - 
initializer=tf.contrib.layers.xavier_initializer()) - b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2') - - pred_prot_attr_logits = tf.matmul( - tf.concat([s, s * self.true_labels_ph, - s * (1.0 - self.true_labels_ph)], axis=1), - W2) + b2 - - return pred_prot_attr_logits - def fit(self, X, y): + X, y, _ = check_inputs(X, y) rng = check_random_state(self.random_state) - # tf.random.seed(random_state) + ii32 = np.iinfo(np.int32) + seed1, seed2, seed3, seed4 = rng.randint(ii32.min, ii32.max, size=4) + + tf.reset_default_graph() + self.sess_ = tf.Session() groups, self.prot_attr_ = check_groups(X, self.prot_attr) le = LabelEncoder() y = le.fit_transform(y) self.classes_ = le.classes_ + groups = groups.map(str) # BUG: LabelEncoder converts to ndarray which removes tuple formatting groups = le.fit_transform(groups) self.groups_ = le.classes_ + n_classes = len(self.classes_) + n_groups = len(self.groups_) + # use sigmoid for binary case + if n_classes == 2: + n_classes = 1 + if n_groups == 2: + n_groups = 1 + n_samples, n_features = X.shape - with tf.variable_scope('adversarial_debiasing'): + with tf.variable_scope(self.scope_name): # Setup placeholders self.input_ph = tf.placeholder(tf.float32, shape=[None, n_features]) self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, 1]) self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, 1]) self.keep_prob = tf.placeholder(tf.float32) - global_step = tf.train.get_or_create_global_step() - starter_learning_rate = 0.001 - learning_rate = tf.train.exponential_decay(starter_learning_rate, - global_step, 1000, 0.96, staircase=True) + # Create classifier + with tf.variable_scope('classifier_model'): + W1 = tf.get_variable( + 'W1', [n_features, self.classifier_num_hidden_units], + initializer=tf.initializers.glorot_uniform(seed=seed1)) + b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), + name='b1') + + h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1) + h1 = tf.nn.dropout(h1, 
rate=1-self.keep_prob, seed=seed2) + + W2 = tf.get_variable( + 'W2', [self.classifier_num_hidden_units, n_classes], + initializer=tf.initializers.glorot_uniform(seed=seed3)) + b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2') + + self.classifier_logits_ = tf.matmul(h1, W2) + b2 # Obtain classifier loss - clf_loss = tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=self.true_labels_ph, - logits=self.classifier_logits_)) + if self.classifier_logits_.shape[1] == 1: + clf_loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=self.true_labels_ph, + logits=self.classifier_logits_)) + else: + clf_loss = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=tf.squeeze(tf.cast(self.true_labels_ph, + tf.int32)), + logits=self.classifier_logits_)) if self.debias: + # Create adversary + with tf.variable_scope("adversary_model"): + c = tf.get_variable('c', initializer=tf.constant(1.0)) + s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_) + + W2 = tf.get_variable('W2', [3, n_groups], + initializer=tf.initializers.glorot_uniform(seed=seed4)) + b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2') + + self.adversary_logits_ = tf.matmul( + tf.concat([s, s * self.true_labels_ph, + s * (1.0 - self.true_labels_ph)], axis=1), + W2) + b2 + # Obtain adversary loss - adv_loss = tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=self.prot_attr_ph, - logits=self.adversary_logits_)) + if self.adversary_logits_.shape[1] == 1: + adv_loss = tf.reduce_mean( + tf.nn.sigmoid_cross_entropy_with_logits( + labels=self.prot_attr_ph, + logits=self.adversary_logits_)) + else: + adv_loss = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=tf.squeeze(tf.cast(self.prot_attr_ph, + tf.int32)), + logits=self.adversary_logits_)) + + global_step = tf.train.get_or_create_global_step() + starter_learning_rate = 0.001 + learning_rate = tf.train.exponential_decay(starter_learning_rate, 
+ global_step, 1000, 0.96, staircase=True) # Setup optimizers clf_opt = tf.train.AdamOptimizer(learning_rate) @@ -131,8 +149,6 @@ def fit(self, X, y): # Compute grad wrt classifier parameters adv_grads = {var: grad for (grad, var) in adv_opt.compute_gradients(adv_loss, var_list=clf_vars)} - # Update adversary parameters (don't increment global step yet) - adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars) normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny) @@ -144,27 +160,30 @@ def fit(self, X, y): grad -= tf.reduce_sum(grad * unit_adv_grad) * unit_adv_grad grad -= self.adversary_loss_weight * adv_grads[var] clf_grads.append((grad, var)) + clf_min = clf_opt.apply_gradients(clf_grads, global_step=global_step) + if self.debias: + with tf.control_dependencies([clf_min]): + adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars) - self.sess.run(tf.global_variables_initializer()) + self.sess_.run(tf.global_variables_initializer()) # Begin training for epoch in range(self.num_epochs): - # TODO: why rng.choice(n_samples, n_samples)? 
- shuffled_ids = rng.shuffle(np.arange(n_samples)) + shuffled_ids = rng.permutation(n_samples) for i in range(n_samples // self.batch_size): batch_ids = shuffled_ids[self.batch_size * i: self.batch_size * (i+1)] - batch_features = X[batch_ids] - batch_labels = y[batch_ids] - batch_prot_attr = groups[batch_ids] + batch_features = X.iloc[batch_ids] + batch_labels = y[batch_ids][:, np.newaxis] + batch_prot_attr = groups[batch_ids][:, np.newaxis] batch_feed_dict = {self.input_ph: batch_features, self.true_labels_ph: batch_labels, self.prot_attr_ph: batch_prot_attr, self.keep_prob: 0.8} if self.debias: _, _, clf_loss_value, adv_loss_value = ( - self.sess.run([clf_min, adv_min, + self.sess_.run([clf_min, adv_min, clf_loss, adv_loss], feed_dict=batch_feed_dict)) if i % 200 == 0 and self.verbose: @@ -173,7 +192,7 @@ def fit(self, X, y): epoch, i, clf_loss_value, adv_loss_value)) else: - _, clf_loss_value = self.sess.run( + _, clf_loss_value = self.sess_.run( [clf_min, clf_loss], feed_dict=batch_feed_dict) if i % 200 == 0 and self.verbose: @@ -186,12 +205,12 @@ def decision_function(self, X): check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob', 'classifier_logits_']) n_samples = X.shape[0] - groups, _ = check_groups(X, self.prot_attr_) - le = LabelEncoder().fit(self.groups_) - groups = le.transform(groups) + n_classes = len(self.classes_) + if n_classes == 2: + n_classes = 1 samples_covered = 0 - scores = np.empty((n_samples, len(self.classes_))) + scores = np.empty((n_samples, n_classes)) while samples_covered < n_samples: start = samples_covered end = samples_covered + self.batch_size @@ -199,22 +218,30 @@ def decision_function(self, X): end = n_samples batch_ids = np.arange(start, end) - batch_features = X[batch_ids] - batch_prot_attr = groups[batch_ids] + batch_features = X.iloc[batch_ids] batch_feed_dict = {self.input_ph: batch_features, self.keep_prob: 1.0} - scores[batch_ids] = self.sess.run(self.classifier_logits_, + scores[batch_ids] = 
self.sess_.run(self.classifier_logits_, feed_dict=batch_feed_dict) samples_covered += len(batch_features) - return scores + return scores.ravel() if scores.shape[1] == 1 else scores def predict_proba(self, X): decision = self.decision_function(X) - return softmax(decision, axis=1) + + if decision.ndim == 1: + decision_2d = np.c_[np.zeros_like(decision), decision] + else: + decision_2d = decision + return scipy.special.softmax(decision_2d, axis=1) def predict(self, X): - indices = self.decision_function(X).argmax(axis=1) + scores = self.decision_function(X) + if scores.ndim == 1: + indices = (scores > 0).astype(np.int) + else: + indices = scores.argmax(axis=1) return self.classes_[indices] diff --git a/aif360/sklearn/tests/test_adversarial_debiasing.py b/aif360/sklearn/tests/test_adversarial_debiasing.py new file mode 100644 index 00000000..c28fb17c --- /dev/null +++ b/aif360/sklearn/tests/test_adversarial_debiasing.py @@ -0,0 +1,77 @@ +import numpy as np +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import accuracy_score +import tensorflow as tf + +from aif360.datasets import AdultDataset +from aif360.sklearn.datasets import fetch_adult +from aif360.algorithms.inprocessing import AdversarialDebiasing as OldAdversarialDebiasing +from aif360.sklearn.inprocessing import AdversarialDebiasing + + +X, y, sample_weight = fetch_adult(numeric_only=True) +adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], + features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', + 'hours-per-week'], features_to_drop=[]) + +def test_adv_debias_old_reproduce(): + sess = tf.Session() + old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}], + privileged_groups=[{'sex': 1}], + scope_name='old_classifier', + sess=sess, num_epochs=5, seed=123) + old_preds = old_adv_deb.fit_predict(adult) + sess.close() + tf.reset_default_graph() + sess = tf.Session() + old_adv_deb2 = 
OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}], + privileged_groups=[{'sex': 1}], + scope_name='old_classifier', + sess=sess, num_epochs=5, seed=123) + old_preds2 = old_adv_deb2.fit_predict(adult) + sess.close() + + assert np.allclose(old_preds.labels, old_preds2.labels) + +def test_adv_debias_old(): + tf.reset_default_graph() + sess = tf.Session() + old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}], + privileged_groups=[{'sex': 1}], + scope_name='old_classifier', + sess=sess, num_epochs=5, seed=123) + old_preds = old_adv_deb.fit_predict(adult) + sess.close() + adv_deb = AdversarialDebiasing('sex', num_epochs=5, random_state=123) + new_preds = adv_deb.fit(X, y).predict(X) + adv_deb.sess_.close() + assert np.allclose(old_preds.labels.flatten(), new_preds) + +def test_adv_debias_reproduce(): + adv_deb = AdversarialDebiasing('sex', num_epochs=5, random_state=123) + new_preds = adv_deb.fit(X, y).predict(X) + adv_deb.sess_.close() + new_acc = accuracy_score(y, new_preds) + + adv_deb2 = AdversarialDebiasing('sex', num_epochs=5, random_state=123) + new_preds = adv_deb2.fit(X, y).predict(X) + adv_deb.sess_.close() + + assert new_acc == accuracy_score(y, new_preds) + +def test_adv_debias_intersection(): + adv_deb = AdversarialDebiasing(scope_name='intersect', num_epochs=5) + adv_deb.fit(X, y) + adv_deb.sess_.close() + assert adv_deb.adversary_logits_.shape[1] == 4 + +def test_adv_debias_grid(): + adv_deb = AdversarialDebiasing('sex', num_epochs=10, random_state=123) + + params = {'debias': [True, False]} + + clf = GridSearchCV(adv_deb, params, cv=3) + clf.fit(X, y) + + clf.best_estimator_.sess_.close() + assert clf.best_params_ == {'debias': False} diff --git a/requirements.txt b/requirements.txt index 767db283..bf52ab8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ numpy>=1.16 matplotlib pandas>=0.24 pytest>=3.5.0 -scipy +scipy>=1.2.0 scikit-learn cvxpy>=1.0 scs==2.1.0 From dc410a2e86b53ed85c046ad1d38bbc1002b5286f Mon 
Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 30 Oct 2019 18:06:47 -0400 Subject: [PATCH 36/61] updated Getting Started notebook --- aif360/sklearn/examples/Getting Started.ipynb | 452 +++++++++++++----- 1 file changed, 337 insertions(+), 115 deletions(-) diff --git a/aif360/sklearn/examples/Getting Started.ipynb b/aif360/sklearn/examples/Getting Started.ipynb index b65f8f78..026bf790 100644 --- a/aif360/sklearn/examples/Getting Started.ipynb +++ b/aif360/sklearn/examples/Getting Started.ipynb @@ -13,15 +13,20 @@ "metadata": {}, "outputs": [], "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", + "import tensorflow as tf\n", "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import accuracy_score, recall_score, make_scorer\n", + "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import GridSearchCV, train_test_split\n", "\n", "from aif360.sklearn.preprocessing import ReweighingMeta\n", + "from aif360.sklearn.inprocessing import AdversarialDebiasing\n", + "from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta\n", "from aif360.sklearn.datasets import fetch_adult\n", - "from aif360.sklearn.metrics import disparate_impact_ratio" + "from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr, generalized_fnr" ] }, { @@ -249,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -298,88 +303,88 @@ " \n", " \n", " \n", - " 7916\n", - " Non-white\n", - " Female\n", - " 18.0\n", + " 0\n", + " 0\n", + " 1\n", + " 25.0\n", " 7.0\n", + " 0\n", + " 1\n", " 0.0\n", " 0.0\n", - " 0.0\n", - " 0.0\n", - " 20.0\n", + " 40.0\n", " \n", " \n", - " 26447\n", - " White\n", - " Male\n", - " 55.0\n", + " 1\n", + " 1\n", + " 1\n", + " 38.0\n", " 9.0\n", - " 1.0\n", - " 1.0\n", + " 1\n", + " 1\n", " 0.0\n", " 0.0\n", - " 
40.0\n", + " 50.0\n", " \n", " \n", - " 20889\n", - " White\n", - " Female\n", - " 43.0\n", - " 9.0\n", - " 1.0\n", - " 0.0\n", + " 2\n", + " 1\n", + " 1\n", + " 28.0\n", + " 12.0\n", + " 1\n", + " 1\n", " 0.0\n", " 0.0\n", " 40.0\n", " \n", " \n", - " 30145\n", - " White\n", - " Male\n", + " 3\n", + " 0\n", + " 1\n", " 44.0\n", - " 11.0\n", - " 1.0\n", - " 1.0\n", - " 4386.0\n", + " 10.0\n", + " 0\n", + " 1\n", + " 7688.0\n", " 0.0\n", " 40.0\n", " \n", " \n", - " 7473\n", - " White\n", - " Male\n", - " 41.0\n", - " 9.0\n", - " 1.0\n", - " 1.0\n", + " 4\n", + " 1\n", + " 0\n", + " 18.0\n", + " 10.0\n", + " 1\n", + " 0\n", " 0.0\n", " 0.0\n", - " 55.0\n", + " 30.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " age education-num race sex capital-gain \\\n", - " race sex \n", - "7916 Non-white Female 18.0 7.0 0.0 0.0 0.0 \n", - "26447 White Male 55.0 9.0 1.0 1.0 0.0 \n", - "20889 White Female 43.0 9.0 1.0 0.0 0.0 \n", - "30145 White Male 44.0 11.0 1.0 1.0 4386.0 \n", - "7473 White Male 41.0 9.0 1.0 1.0 0.0 \n", + " age education-num race sex capital-gain capital-loss \\\n", + " race sex \n", + "0 0 1 25.0 7.0 0 1 0.0 0.0 \n", + "1 1 1 38.0 9.0 1 1 0.0 0.0 \n", + "2 1 1 28.0 12.0 1 1 0.0 0.0 \n", + "3 0 1 44.0 10.0 0 1 7688.0 0.0 \n", + "4 1 0 18.0 10.0 1 0 0.0 0.0 \n", "\n", - " capital-loss hours-per-week \n", - " race sex \n", - "7916 Non-white Female 0.0 20.0 \n", - "26447 White Male 0.0 40.0 \n", - "20889 White Female 0.0 40.0 \n", - "30145 White Male 0.0 40.0 \n", - "7473 White Male 0.0 55.0 " + " hours-per-week \n", + " race sex \n", + "0 0 1 40.0 \n", + "1 1 1 50.0 \n", + "2 1 1 40.0 \n", + "3 0 1 40.0 \n", + "4 1 0 30.0 " ] }, - "execution_count": 9, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -387,11 +392,43 @@ "source": [ "X, y, sample_weight = fetch_adult(numeric_only=True)\n", "(X_train, X_test,\n", - " y_train, y_test,\n", - " sw_train, sw_test) = train_test_split(X, y, sample_weight, train_size=0.7, 
random_state=123)\n", + " y_train, y_test) = train_test_split(X, y, train_size=0.7, shuffle=False)\n", "X_train.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "the protected attribute information is replicated in the labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " race sex\n", + "0 0 1 0\n", + "1 1 1 0\n", + "2 1 1 1\n", + "3 0 1 1\n", + "4 1 0 0\n", + "Name: annual-income, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -408,11 +445,23 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.823858595509452" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "y_pred = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)" + "y_pred = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)\n", + "accuracy_score(y_test, y_pred)" ] }, { @@ -424,23 +473,54 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.19176335549523604" + "0.19826239080897468" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sex = y_test.index.get_level_values('sex')\n", - "disparate_impact_ratio(y_test, y_pred, prot_attr='sex', priv_group='Male', pos_label='>50K')" + "disparate_impact_ratio(y_test, y_pred, prot_attr='sex')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And similarly, we can assess how close the predictions are to equality of odds.\n", + "\n", + "`average_odds_error()` computes the (unweighted) average of the absolute values 
of the true positive rate (TPR) difference and false positive rate (FPR) difference, i.e.:\n", + "\n", + "$\\tfrac{1}{2}\\left(|FPR_{D = \\text{unprivileged}} - FPR_{D = \\text{privileged}}|\n", + " + |TPR_{D = \\text{unprivileged}} - TPR_{D = \\text{privileged}}|\\right)$" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.12427040384779571" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_odds_error(y_test, y_pred, prot_attr='sex')" ] }, { @@ -459,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "scrolled": false }, @@ -468,70 +548,212 @@ "name": "stdout", "output_type": "stream", "text": [ - "Index([(7916, 'Non-white', 'Female'), (26447, 'White', 'Male'),\n", - " (20889, 'White', 'Female'), (30145, 'White', 'Male'),\n", - " (7473, 'White', 'Male'), (29361, 'White', 'Male'),\n", - " (12277, 'White', 'Male'), (44372, 'White', 'Male'),\n", - " (32291, 'White', 'Female'), (44411, 'White', 'Female'),\n", - " ...\n", - " (38298, 'White', 'Male'), (4173, 'White', 'Male'),\n", - " (7854, 'White', 'Male'), (16424, 'White', 'Female'),\n", - " (2087, 'White', 'Male'), (16120, 'White', 'Male'),\n", - " (24476, 'White', 'Male'), (8295, 'White', 'Female'),\n", - " (1449, 'White', 'Male'), (33323, 'White', 'Male')],\n", - " dtype='object', length=6838)\n" + "0.8147819559134648\n", + "{'estimator__C': 10, 'reweigher__prot_attr': 'sex'}\n" ] - }, + } + ], + "source": [ + "rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))\n", + "\n", + "params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']}\n", + "\n", + "clf = GridSearchCV(rew, params, scoring='accuracy', cv=5)\n", + "clf.fit(X_train, y_train)\n", + "print(clf.score(X_test, y_test))\n", + "print(clf.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "0.639237550613212" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "disparate_impact_ratio(y_test, clf.predict(X_test), prot_attr='sex')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rather than trying to weight accuracy and fairness, we can try a fair in-processing algorithm:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ { - "ename": "NameError", - "evalue": "name 'accuracy_score' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mclf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrew\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'sample_weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msw_train\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 685\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 687\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 689\u001b[0m \u001b[0;31m# For multi-metric evaluation, store the best_index_, best_params_ and\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[0;34m(self, evaluate_candidates)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mevaluate_candidates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_search.py\u001b[0m in 
\u001b[0;36mevaluate_candidates\u001b[0;34m(candidate_params)\u001b[0m\n\u001b[1;32m 664\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m in product(candidate_params,\n\u001b[0;32m--> 666\u001b[0;31m cv.split(X, y, groups)))\n\u001b[0m\u001b[1;32m 667\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0;31m# remaining jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 920\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 921\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 922\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 923\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, 
iterator)\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 758\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 759\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 760\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 761\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[0mjob_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 716\u001b[0;31m \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 717\u001b[0m \u001b[0;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[0;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/_parallel_backends.py\u001b[0m in 
\u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 181\u001b[0m \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 182\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 183\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[0;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 550\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m 226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[0;32m--> 225\u001b[0;31m for func, args, kwargs in self.items]\n\u001b[0m\u001b[1;32m 226\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in 
\u001b[0;36m_fit_and_score\u001b[0;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0mfit_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 553\u001b[0m \u001b[0;31m# _score will return dict if is_multimetric is True\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 554\u001b[0;31m \u001b[0mtest_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 555\u001b[0m \u001b[0mscore_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mfit_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_score\u001b[0;34m(estimator, X_test, y_test, scorer, is_multimetric)\u001b[0m\n\u001b[1;32m 595\u001b[0m \"\"\"\n\u001b[1;32m 596\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 597\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0m_multimetric_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 598\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my_test\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_multimetric_score\u001b[0;34m(estimator, X_test, y_test, scorers)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 627\u001b[0;31m \u001b[0mscore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 628\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 629\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'item'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/anaconda/envs/aif360/lib/python3.5/site-packages/sklearn/metrics/scorer.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[1;32m 95\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m return self._sign * self._score_func(y_true, y_pred,\n\u001b[0;32m---> 97\u001b[0;31m **self._kwargs)\n\u001b[0m\u001b[1;32m 98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mscore_func\u001b[0;34m(y_true, y_pred, sample_weight)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my_true\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_flat_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0maccuracy_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_weight\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mscoring\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_scorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore_func\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'sample_weight'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'accuracy_score' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /anaconda/envs/aif360/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from 
tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Colocations handled automatically by placer.\n" ] + }, + { + "data": { + "text/plain": [ + "0.8218794786050638" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))\n", + "adv_deb = AdversarialDebiasing(prot_attr='sex')\n", + "adv_deb.fit(X_train, y_train)\n", + "adv_deb.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.022611763594614448" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_odds_error(y_test, adv_deb.predict(X_test), prot_attr='sex')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that `AdversarialDebiasing` creates a TensorFlow session which we should close when we're finished to free up resources:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "adv_deb.sess_.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's try a post-processor, `CalibratedEqualizedOdds`.\n", "\n", - "# UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597\n", - "def score_func(y_true, y_pred, sample_weight):\n", - " idx = y_true.index.to_flat_index()\n", - " print(idx)\n", - " return accuracy_score(y_true, y_pred, sample_weight=sample_weight[idx])\n", - "scoring = make_scorer(score_func, **{'sample_weight': sample_weight})\n", + "Since the post-processor needs to be trained on data unseen by the original estimator, we will use the `PostProcessingMeta` class which splits the data and trains the estimator and post-processor with their own split." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7676926226711254" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cal_eq_odds = CalibratedEqualizedOdds('sex', cost_constraint='fnr')\n", + "log_reg = LogisticRegression(solver='liblinear')\n", + "postproc = PostProcessingMeta(estimator=log_reg, postprocessor=cal_eq_odds)\n", "\n", - "params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']}\n", + "postproc.fit(X_train, y_train)\n", + "accuracy_score(y_test, postproc.predict(X_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfUAAAEKCAYAAAALjMzdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3Xdck1f7P/DPCXsLiGwRgRDCcFEUR92K/VkUseJotY66H63Vjm+X1qq1j9papFq11Yrax1W1rmprK9hqawVF2UslArIEwoaEnN8fSWyAAEEJCXjer1dekHvlusM4Oec+93URSikYhmEYhun8OJoOgGEYhmGY9sEadYZhGIbpIlijzjAMwzBdBGvUGYZhGKaLYI06wzAMw3QRrFFnGIZhmC5CrY06ISSIEJJKCMkghLynZH1PQsgVQshtQshdQshLCuv+T7ZfKiFkvDrjZBiGYZiugKjrPnVCiA6ANABjAWQDuAlgBqU0SWGbPQBuU0p3EUL4AC5QSnvJvv8fgAAADgAuA+BSSuvVEizDMAzDdAHq7KkHAMiglN6jlNYBOAJgUqNtKABz2fcWAHJl308CcIRSWkspvQ8gQ3Y8hmEYhmGaoavGYzsCeKjwPBvAwEbbrAPwCyHkPwBMAIxR2PfvRvs6Nn4BQshCAAsBwMTEZACPx2uXwDUtIQEwNgZ699Z0JAzTUGxsbBGl1EbTcTAMo5w6G3VVzADwPaV0GyEkEMBBQoiPqjtTSvcA2AMA/v7+NCYmRk1hdhyJRNqgv/EG8Pnnmo6GYRoihGRpOgaGYZqnzkY9B4CzwnMn2TJF8wEEAQCl9C9CiCGA7iru2yUVFgK1tUDPnpqOhGEYhuls1HlN/SYAD0KIKyFEH8B0AGcabSMAMBoACCFeAAwBFMq2m04IMSCEuALwAPCPGmPVGlmyfhBr1BmGYZi2UltPnVIqJoQsB3AJgA6AfZTSRELIegAxlNIzAFYD2EsIWQXppLnXqXQ6fiIh5BiAJABiAMuel5nvAoH0q4uLZuNgGIZhOh+1XlOnlF4AcKHRso8Vvk8CMKSZfTcC2KjO+LSRvFFnPXWmK4mNje2hq6v7LQAfsKRXDPO0JAASxGLxggEDBhQo20DTE+WYRrKyADMzwMJC05EwTPvR1dX91s7OzsvGxqaEw+GoJzkGw3RxEomEFBYW8vPy8r4FEKxsG/aJWcsIBNKh
d0I0HQnDtCsfGxubMtagM8zT43A41MbGRgjpiJfybTowHkYFAgEbeme6JA5r0Bnm2cn+jpptu1mjrmVYo84wDMM8Ldaoa5HKSqCoiDXqDMMwzNNhjboWeShLqstuZ2MY9Th48GA3QsiA27dvG8qXpaam6nt4eHgDwLlz58xGjhzp/qyvExoa2mv//v2WABAWFuYSGxtrCADGxsb9nuW4586dM/v1119N2rqfo6Oj76NHj1SaGB0eHm49e/bsdutaDB8+3L2oqEgHADZs2NCjd+/e3sHBwa6HDx+2eP/99+3a63XkJBIJBg0axC0uLuYAgI6OzgAej8eXP1JTU/Xb+zXlnva9y83N1R02bJhHe8TAZr9rEXY7G8Oo15EjR6z69+9fERkZadWvX7/c1vd4dkePHm1Tal2RSAQ9PT2l637//XczU1PT+rFjx1a2S3AdIDo6OkP+/XfffWdz+fLlNDc3N5FskVDV47T0vig6duyYhbe3d7WVlZUEAAwMDCQpKSlJre2nSQ4ODmJbW1vRL7/8YjJu3Lhn+tmyRl2LsGxyzPNg3jw4JyTAuD2P6eODqn37GhSQakIoFHJu3rxpevny5dTg4GCPL7/8UuVGXSwWY+nSpU5XrlyxIITQOXPmFH3wwQcFa9assb948WK32tpajr+/f8Xhw4ezOJyGA6ABAQGeW7duffjiiy9WAcD8+fOdo6OjzW1sbEQ//vjjPQcHB3FAQICnj49P1T///GMaGhpa7OnpWbN582Z7kUjEsbS0FB89evReVVUVJzIy0obD4dBjx45Zb9++XeDn51czd+5cl5ycHH0A+OKLLwTjxo2rzMvL0wkNDe2dn5+vP2DAgIrmSmyfOHHC/OOPP3asr68nVlZW4r/++itNcf0PP/xg0TgOZ2dn8fnz501Xr17dEwAIIbh+/XpKWVmZTmhoaO+Kigqd+vp6smPHjqygoKAKR0dH35iYmOTVq1c7ZGdnG0yYMMFj1qxZRZaWlvUxMTEmkZGRgtzcXF1l5/HWW2853Lt3z0AgEBg4OjrWrl279tHcuXNdRSIRkUgk+PHHHzN9fX1rFWM+fPiw1aJFi4pa+3kuW7bM6dq1a2Z1dXXkjTfeKHj77beLzp07Z/bJJ584mJubi1NTU42Dg4OLfX19q3fu3GlbW1tLTp06lent7V3b3Pui+BrNnZOy987S0lIyefLk0sjISOtnbdTZ8LsWEQgAHR3AwUHTkTBM1/PDDz90GzFihNDPz6/W0tJS/Mcff6j8wWLbtm02AoFAPykpKTEtLS1pwYIFjwHg7bffLkhISEhOT09PrK6u5hw5cqTFDBPV1dUcf3//yoyMjMQhQ4aUv/fee0/+2uvq6khCQkLyJ598kj927NiKuLi4lOTk5KSpU6cWr1+/3s7T07Nu9uzZhYsXL85PSUlJCgoKqli0aJHzW2+9lZ+QkJB86tSpzMWLF/cCgPfee88hMDCwIiMjIzEkJKT00aNHTYacc3NzdZcvX97r5MmTmampqUmnT5/ObLyNsjhk74ddeHh4VkpKStLff/+dYmpqKtm3b5/V6NGjhSkpKUnJycmJAwcOrGr0/gt69Oghio6OTlu7dm2DxCnNnQcApKenG169ejX17Nmz93fs2GGzdOnS/JSUlKS7d+8mu7q61jWOOTY21nTIkCFPGsba2lqOfOh97NixbgCwffv27hYWFvUJCQnJd+7cST5w4IBNSkqKPgCkpKQY7du3T5Cenp5w4sQJ67S0NMP4+Pjk1157rWjbtm09WnpfVDknZe8dAAwZMqTyn3/+MVXya9MmrKeuRQQCwNER0GU/FaYLa61HrS7Hjh2zWrFiRQEAhIaGFh88eNBq2LBhVa3tBwC///67+eLFiwvlw7+2trb1APDzzz+bffHFF3Y1NTWc0tJSXT6fX40WhpQ5HA4WLFhQDADz5s17PGXKlCfX72fMmFEs//7+/fv6kydPdiosLNSrq6vjODs71yo73rVr18zT
09ON5M8rKip0hEIh5++//zY7efJkBgBMnz5duGjRoiZptqOiokwCAgLKeTxeneI5KWoujkGDBlWsWbPGedq0acUzZswocXNzkwwaNKhy0aJFvUQiEWfq1KklgwcPrm75XW39PAAgKCio1NTUlAJAYGBg5datW+2zs7P1p0+fXtK4lw4AQqFQ19LSUiJ/rmz4/fLly+YpKSnGZ86csQSA8vJynaSkJEN9fX3q6+tb6eLiIgKAnj171k6YMEEIAH369KmOjo42a+l9UeWclL13gHQIvqCg4Jmv97OeuhbJymJD7wyjDvn5+Tp///232bJly1wcHR19IyIi7M6ePWspkUha37kZVVVVZPXq1S4nT57MTEtLS3r11VeLampq2vQ/lShkmTIzM3sSzPLly3suXbq0IC0tLSkiIiKrtrZW6XEppbh161ZySkpKUkpKSlJBQcFdCwuLpz+pRpqLY9OmTXnffvttVnV1NWfYsGG827dvG06YMKHi6tWrqY6OjnXz5s1zjYiIsFb1dVo6DxMTkyfns3jx4uKffvopw8jISDJx4kSPM2fOmDU+lo6ODq2vb7lUCKWUbNu2TSB/vZycnPgpU6aUAYCBgcGTaxUcDgeGhoZU/n19fT1p6X1R5ZyUvXeA9PfJwMDgmX92rFHXIvJscgzDtK+DBw9ahoSEFOfm5sbn5OTE5+Xl3XVycqq7dOmSSsOdo0ePLtu9e3d3kUg6vys/P1+nqqqKAwB2dnZioVDIOXv2rGVrx5FIJJDPiv/++++tAwICypVtV15ertOzZ0+RfDv5cjMzs/ry8nId+fOhQ4eWffbZZz3kz69fv24EAIMGDSqX73fs2DHzsrIyHTQyYsSIyn/++cdMPuycn5/fZJvm4khMTDQICAio3rhxY56fn19lQkKCYVpamr6Tk5No9erVRbNnzy68deuWypc3mjuPxpKSkvS9vLxqP/zww4Lx48eXxsXFNdnO1dW1Jjk52aCl1xs7dqxw165dNrW1tQQA7t69a1BWVqZye9jc+6LKOSl77wAgISHBkMvlqjy60RzWqGuJ+nogO5v11BlGHY4fP241ZcqUEsVlkyZNKjl06JCVKvuvWrWq0MnJqY7H43l7enryv/vuO6vu3bvXz5o1q9DLy8t75MiR3D59+rQ6wcnIyEjyzz//mHh4eHhfvXrV7LPPPnukbLsPPvggd8aMGW7e3t5e1tbWTyZghYaGlp4/f74bj8fjX7x40XTPnj0Pb926ZcLlcvlubm7eERERNgCwefPm3GvXrpm6u7t7nzx50tLe3r7JtWcHBwdxeHj4g5CQEHdPT09+SEhIb1Xj+O9//9vDw8PDm8vl8vX09OjUqVOFly5dMvPy8vL28vLi//jjj1bvvPNOvirvLQA0dx6NHTp0yIrL5XrzeDx+cnKy0aJFix433mbcuHHCX375pUkPXtGqVauKeDxeja+vr5eHh4f3G2+84SISiVROzt3c+6LKOSl77wDg119/NQsKClL5boDmkOZmRXY2/v7+NCYmRtNhPLXcXOn19F27gMWLNR0NwyhHCImllPq3db87d+486NOnT4szkhmmPWRlZenNmDGj1/Xr19M1HUtb+Pv7e/78888ZNjY2rZYZv3PnTvc+ffr0UraO9dS1BLudjWEY5tm5uLiI5s2bVyRPPtMZ5Obm6q5cuTJflQa9NWyetZaQJ55h19QZhmGezYIFC0pa30p7ODg4iF977bXS9jhWp/kk09XJG3VnZ83GwTAMw3RerFHXEllZQLdugLm5piNhGIZhOiu1NuqEkCBCSCohJIMQ8p6S9V8SQuJkjzRCSKnCunqFdWfUGac2YLezMQzDMM9KbdfUCSE6AL4GMBZANoCbhJAzlNInmX0opasUtv8PAMUKRtWU0r7qik/bsDrqDMMwzLNSZ089AEAGpfQepbQOwBEAk1rYfgaA/6kxHq3GsskxjPqx0qut62qlVwkhAyZNmuQqXy8SiWBpadmntZ/z
0/4u1NTUEH9/f095oqKOps7Z745AgxzP2QAGKtuQEOICwBXA7wqLDQkhMQDEADZTSk+rK1BNKysDSkvZ8DvDqBsrvdrxNF161cjISJKammpUUVFBTE1N6alTp8xtbW3V1uIaGhrS4cOHl3377bdWS5YsKW59j/alLRPlpgM4QSlVvEfPRZbkYiaA7YQQt8Y7EUIWEkJiCCExhYWFHRVru3so++jDeurMc2HePGcEBHi262PevFbvG5GXXt2/f/+DU6dOqZRJTk4sFmPhwoVO8kxgGzdu7AEAa9assffx8fHy8PDwnjFjhouyXPIBAQGeV69efZIydf78+c7u7u7egYGB3NzcXF35NvPmzXP28fHx2rBhg+0PP/xg4efnx/Py8uIPHjyY+/DhQ93U1FT9yMhIm2+++cZWnlEuNzdXd/z48W4+Pj5ePj4+Xr/88osJAOTl5ekMGTLEw93d3TssLMylpdKrfD7fy9PTkx8YGMhtvF5ZHABw/vx5U3nlMy8vL35JSQknKytLz9/f35PH4/E9PDy8L168aAr8O0owc+bMnvLSq5988kkPxRGB5s7jrbfecpg8ebJr//79eVOmTHGNiYkx9PX19eLxeHwul8uPj49vkg728OHDViEhIQ1uDxszZozw+PHj3QDgf//7n1VoaOiTxvbKlSvGffv25Xl5efH79evHu3PnTpNjlpWVcV555ZVevr6+Xl5eXvxDhw51A4Dm4pk6dWrpkSNH2vQ71l7U2ajnAFD8Q3OSLVNmOhoNvVNKc2Rf7wGIQsPr7fJt9lBK/Sml/jY2SrMKdgry29lYo84w6sNKrzb0vJReBYDXXnut+OjRo5ZVVVUkOTnZODAw8Mn6Pn361Ny8eTMlOTk5ae3atTnvvPOOU+Njvv/++/YjR44si4+PT/7jjz9SP/zwQ6eysjJOc/G88MIL1Xfv3m3zZZL2oM7h95sAPAghrpA25tMh7XU3QAjhAbAE8JfCMksAVZTSWkJIdwBDAPxXjbFqFMsmxzxX9u1jpVfBSq+qch7As5deBYCBAwdWZ2dnG+zdu9dqzJgxDX4+xcXFOmFhYa4PHjwwJIRQZTngo6KizC9dutQtPDzcDgBqa2tJRkaGfnPx6OrqQk9Pj5aUlHAax6JuauupU0rFAJYDuAQgGcAxSmkiIWQ9ISRYYdPpAI7QhuNDXgBiCCF3AFyB9Jp6g3q4XYlAAOjpAfb2mo6EYbomVnr16XSl0qtBQUGla9eudZ49e3aD69zvvvuu4/Dhw8vT09MTz549m1FXV6e0jOqJEycy5PE9evQovn///jUtxSMSiYixsXGHF1dR6zV1SukFSimXUupGKd0oW/YxpfSMwjbrKKXvNdrvOqXUl1LaR/b1O3XGqWkCAeDkBHC0ZYYDw3QxrPQqK726ZMmSojVr1uQGBAQ0GEEoKyvTcXJyqgOA3bt3d1f2uiNHjizbtm2brfxD4LVr14xaiicvL0+nW7duYsXa7B2FNSNagN3OxjDqxUqvstKrbm5uog8//LCg8fJ33303b926dU5eXl58sVhpFVVs3rw5VywWEx6Px3d3d/f+8MMPHVuK5+effzZvPMzfUVjpVS3g4gKMGAEcOKDpSBimZaz0KqPttKH06rhx49y2bt2a7efnp3QuxLNipVe1mFgM5OSwnjrDMEx70HTp1ZqaGhIcHFyqrga9Naz0qobl5gL19axRZxiGaS+aLL1qaGhIly9f3uSyQEdhPXUNY3XUGYZhmPbCGnUNY4lnGIZhmPbCGnUNkzfqzq0muWQYhmGYlrFGXcOysgBra8BEIwkFGUY7SSTAb7/BJDIS3X77DSbPkCPmiczMTL3Ro0e7ubi4+Dg7O/vMnTvXuaampkn2MAB48OCBXlBQUJNbvBpTrEDWVm+99ZbDxx9/bKvq9s9a4U3Rf//7Xxt5cpjbt28bynO4JyYmGvTr14/3rMcPCgrqnZSUpA9Ic79zuVy+PFf801SZU1VnrazWnlij
rmECAbuezjCKjh6FhYMD/IKDwV26FL1efhlcBwf4HT2KFvOqt0QikWDy5MnuwcHBpVlZWQn3799PqKys5KxcudKx8bYikQi9evUSXbx48V5rx42Ojs7o3r170/RlWu6dd94plE/mOn78eLfg4OCS5OTkJG9v79rbt2+nqHociUSCxtnbYmJiDOvr6wmfz39yb3x0dHSaPBubNlaYU6yspulYnhVr1DVMIGDX0xlG7uhRWMyZg975+dCrqgKnshI61dXg5OdDb84c9H7ahv3s2bNmBgYGkpUrVz4GpLm5v/nmm4dHjx7tXl5ezgkPD7ceNWqU+6BBg7iDBw/2VKyxXl5eznnppZd6u7m5eY8dO9bNz8+PJ6+6Jq9Alpqaqt+7d2/v6dOnu7i7u3sPGTLEo6KiggDAtm3buvv4+Hh5enryx48f71ZeXt7i/92HDx/qjh071s3T05Pv6enZpGcrFAo5gYGBXD6f78Xlcp9UDCsrK+OMGDHC3dPTk+/h4eG9d+9eSwBYunSpo5ubmzeXy+UvXLjQCfh3lODo0aMWe/bssf3+++9tBg4cyAUajgh89NFHtj4+Pl5cLpe/atUqB0Baf75Xr14+ISEhvbhcrndmZmaDYjHff/+99csvv9ygSpoyzR3b1dXVOzQ0tFevXr18goODXU+fPm3Wv39/nouLi8+VK1eMga5XWa09sUZdgyhl2eQYRk4iAVasgEttrfL/S7W14KxcCZenGYqPj4836tOnT4PiLVZWVhJ7e/u6pKQkAwBITEw0/umnnzJv3ryZqrjdli1bbLp161afmZmZuGnTppykpCSlw8cCgcBwxYoVBRkZGYkWFhb1kZGRlgAwa9askoSEhOTU1NQkT0/P6vDwcKWpSOUWL17cc9iwYeWpqalJiYmJSf37969RXG9sbCw5f/58RlJSUnJ0dHTa+++/7ySRSHDy5ElzOzs7UWpqalJ6enrilClTyvLy8nQuXLhgmZ6enpiWlpa0adOmBhnswsLChPLKbzdu3EhTXHfy5EnzjIwMw7t37yYnJycnxcXFGf/888+msnM1WL58eWFGRkYil8ttkK3uxo0bpoMGDWrwXg8fPpzL4/H4fn5+vNaO/fDhQ8N33303PzMzMyEzM9Pw8OHD1jExMSkbN27M3rhxoz3Q9SqrtSd2n7oGCYVAeTkbfmcYALhyBSYVFWjx+nR5OXSiomAyahTafQh32LBhZcoqlV2/ft105cqVBQDwwgsv1HC5XKWV3RwdHWvllcn69etX9eDBAwMAiI2NNfr4448dy8vLdSorK3WGDx/eYvrQ69evm504ceI+IB1RsLa2bhCTRCIhb775ptPff/9tyuFwUFBQoJ+dna3bv3//6g8++MB5yZIljpMmTRIGBQVViEQiGBgYSMLCwnpNnDixNCwsTOXUpRcvXjS/evWqOZ/P5wNAVVUVJyUlxbB379519vb2daNHj1b6MygsLNSzs7NrcHE6Ojo6zd7e/kkO1paO7ejoWCvPz87lcqtHjRpVxuFw0L9//6oNGzY4AF2vslp7Yj11DWK3szHMv3JyoEcIWsxbTQhodjb02npsHx+f6jt37jQoMFJcXMx59OiRPp/PrwWkPeC2HleRvr7+k9h1dHSoWCwmALBw4ULXiIgIQVpaWtK7776b21zFNVXt3r3b6vHjx7rx8fHJKSkpSdbW1qLq6mqOn59f7a1bt5J8fX2rP/roI8c1a9bY6+npIS4uLnnq1Kkl586d6zZixAgPVV+HUoo333zzkfxauEAgSFi1alUR0PJ7ZWBgIKmurm7xHFs6tuL7yOFwYGhoSAFAR0cH9fX1BOh6ldXaE2vUNYjVUWeYfzk6QiSRQOlsdDlKQZyc0OYpysHBweU1NTUc+YxvsViMpUuXOr/yyitFiiVPlQkMDKw4cuSIJQDExsYapqWlKa0g1pyqqipOz549RbW1tUSVa7ZDhgwp37Jli408zsePHzcYvRAKhTrdu3cXGRgY
0LNnz5rl5ubqA9IZ+2ZmZpKlS5cWv/XWW3lxcXHGQqGQI+vVCr/55puHKSkpKldOmzBhQtnBgwe7y+ua379/Xy8nJ6fV0V0PDw+lVdLa49hyXa2yWntiw+8axLLJMcy/Ro5EpZkZ6qurm+9smJmhfsSItg+9czgcnD59OmPhwoUuW7ZssZdIJBg1apQwPDw8p7V933777cJp06b1cnNz83Zzc6txd3evsbS0VHnG+3vvvZcbEBDgZWVlJe7fv39FRUVFi5cYdu3aJXj99ddduFxudw6Hg4iIiKwxY8Y8OecFCxYUT5gwwZ3L5fL9/PyqXF1dawDpMP///d//OXE4HOjq6tKdO3dmlZaW6kycONG9traWAMCnn376UNW4p0yZUpaYmGj4wgsv8ABp7/zw4cP3dXV1W2z0JkyYUPr777+bTZ48WWlZ2Wc5tty7776bt2DBAtfPP//cYezYsUon5W3evDl34cKFPXk8Hl8ikRBnZ+faK1euZBw6dMjq2LFj1rq6utTGxkb06aefPgI0W1mtPbEqbRr07rvAV18BVVWsljrTOai7Spt89ruyyXIGBpAcOIB7YWHo0H+8YrEYdXV1xNjYmCYmJhqMGzeOm5mZmSAfFmYaqqioIEOGDPGMjY1N0dXtPP1GdVdWa08tVWnrPO94FyQQSDPJsQadYaRkDfa9lSvhUl4OHUJAKQUxM0P9V18hq6MbdEB6S9uwYcM8RSIRoZTiyy+/zGINevNMTU3pxx9/nHv//n19Dw+PJnXctZGmK6u1J9ZT16DBgwEjI+C33zQdCcOopqPqqUskQFQUTLKzoefkBNGIEahkH34ZRor11LWUQACMG6fpKBhG+3A4gDpuW2OYrk6tn30JIUGEkFRCSAYh5D0l678khMTJHmmEkFKFdXMIIemyxxx1xqkJIpG0ljqb+d6FPH4M/PWXNKsQwzCMBqitUSeE6AD4GsAEAHwAMwghfMVtKKWrKKV9KaV9AewAcFK2rxWAtQAGAggAsJYQYqmuWDUhO1v6v5816l3I3r3SayoZGZqOhGGY55Q6e+oBADIopfcopXUAjgCY1ML2MwD8T/b9eAC/UkqLKaUlAH4FEKTGWDscu52tixGLgV27gFGjAA+V83swDMO0K3U26o4AFO+JzJYta4IQ4gLAFcDvbdmXELKQEBJDCIkpLCxsl6A7Cssm18WcPSv9of7nP5qOpGuQ1l41QWRkN/z2mwnaofYqK736r44uvTpgwABPxfU8Ho8vL5jTHMWiOm01ePBgbmFh4VP9XDo7bZlPOh3ACUppm0oYUkr3UEr9KaX+NjY2agpNPeSNulOTMgRMpxQRIf2ENnGipiPp/I4etYCDgx+Cg7lYurQXXn6ZCwcHPxw9ykqvtpOOLr1aWVmpk5GRoQcAt27dMmyn02jWjBkzHm/durVzNQrtRJ2Neg4AZ4XnTrJlykzHv0Pvbd23U8rKAnr0kN7SxnRyiYnA778DS5cCnSjZhlY6etQCc+b0Rn6+HqqqOKis1EF1NQf5+XqYM6f30zbsrPSqZkuvTp48uTgyMtIKACIjI61CQ0OL5etSU1P1BwwY4Mnn8734fL5X4/MFpAmAFi1a5CSPZcuWLd0BICsrS8/f399T3vO/ePGiKQBMnz699OTJk9Ytvc9dlTob9ZsAPAghroQQfUgb7jONNyKE8ABYAvhLYfElAOMIIZayCXLjZMu6DIGAXU/v9OQ9lIgIwMAAmD9fs/F0dtLaqy5oruBJbS0HK1e6PM1QPCu9qtnSqzNmzCg5e/asJQBcunSp25QpU540+g4ODuI//vgjLSkpKfno0aP3Vq1a1eSi5Pbt27tbWFjUJyQkJN+5cyf5wIEDNikpKfr79u2zGj16tDAlJSUpOTlVXsC/AAAgAElEQVQ5ceDAgVUAYGNjU19XV0fy8vKeuyF4tXUrKKViQshySBtjHQD7KKWJhJD1AGIopfIGfjqAI1QhCw6ltJgQ8imkHwwAYD2l
tBhdiEAA8PkNl9XXAzrP3a9gJ5WSAgQGAr/8AkRGAjNnAt1b/F/NtObKFRO0khcd5eU6iIoywahRrPRqJyq92qNHj3oLCwvxnj17LN3d3atNTU2ffDKrq6sj8+fPd0lKSjLicDjIyspqUgzm8uXL5ikpKcZnzpyxBIDy8nKdpKQkw0GDBlUuWrSol0gk4kydOrVE/v4DgLW1tVggEOjb2dlVNz5eV6bWa+qU0guUUi6l1I1SulG27GOFBh2U0nWU0ib3sFNK91FK3WWP/eqMs6NRKh1+V5wkl5IibRNSU5vfj9ESlALz5gFlZcC0adLk/cuXazqqzi8nRw+EtHyTPyEU2dms9GonLL06derUknfeecdlxowZDTpoGzdutO3Ro4coOTk5KT4+PkkkEikro0q2bdsmkMeSk5MTP2XKlLIJEyZUXL16NdXR0bFu3rx5rvLJf4C0fvqz/kw7I22ZKPdcKS6WtgPy4XfFNmLePJa7ROv9+CNw9650uDgrC/D0BPr313RUnZ+jowgSSYulV0EpgZMTK73aCUuvzpo1q2TZsmV5U6ZMKWt8Pvb29iIdHR3s3LnTuvHEOwAYO3ascNeuXTbyanN37941KCsr46Slpek7OTmJVq9eXTR79uzCW7duGQPSCXyFhYV6np6enT6Xe1uxWT0a0Ph2thMngNu3pW3ErVvSNmPqVM3Fx7SgogJYvBiolI08Ugrk5Eifmyi91MqoauTISpiZ1UNJL+8JM7N6jBjBSq92wtKrlpaWko0bN+Y13v7NN98sCA0NdTty5Ij1qFGjhEZGRk0+ZK1atarowYMHBr6+vl6UUmJlZSW6cOFC5qVLl8zCw8PtdHV1qbGxcf3hw4fvA8Cff/5p3K9fv0o9vTYP6nR6rKCLBpw+DYSEADExwIMH0hFcxbk/HA5w/DgwZYrGQmSas3q1NMlMtcJlOkND6cz3bds0F1cHUXtBF/nsd2VD1AYGEhw4cA9tuC7cHljp1bbRhtKrc+fOdZ48eXLppEmTmq3p3pm1VNCFDb9rgLynnpYGhIWhyWReiUTa0F+40PGxMS1ISWnaoANATY10OZsQ8ezCwoQ4cOAebG1FMDaWwMSkHsbGEtjaijTRoAPSW9oCAgJ4np6e/JCQEDdWerVliqVXNRWDj49PdVdt0FvTZXrqrq6udO3atQ2WeXt744UXXoBIJMLhw4eb7NO3b1/07dsXVVVVOHbsWJP1/v7+8PHxgVAoxKlTp5qsDwwMhKenJ4qKinDu3Lkm61988UX07t0beXl5uHjx4pPlmZnSYi43b45GfLwznJ0fYvTopvVX//knCAkJdrh//x6uXr3aZP3EiRPRvXt3pKam4q+//mqyPiQkBBYWFkhISICyUYxp06bB2NgYcXFxiIuLa7J+1qxZ0NPTw82bN5GYmNhk/euvvw4AuH79OtLSGtwNAz09PcyaNQsAEB0djfv37zdYb2xsjGnTpgEALl++jOzs7Abrzc3NMUU2VHHx4kXk5TUctbO2tsbLL78MADh79iweP37cYL2dnR2CgqSZhU+ePImysgaX8eDk5IQxY8YAAI4dO4aqqoYTml1dXTF8+HAAwOHDhyESiaTXSGTH4aalYfD16wCA72XvA8zNgX7SW3y19XdPbvTo0XB2dsbDhw/xm5Lav0FBQbCzs8O9ew1/9+bOndshpVdltVdNkJ2tBycnEUaMqASrvcowAFjpVa1TWwvo6QHCVvocJSXAjRvSJDWMhlVXA+WtfPAvL5duxzIKPTtp7VVWepVh2qjL9NQ70zX1gQOlHb4UFZIx7twJLFmi/piYVlAKDBki/ZSlLPkJhwMMGgT8+SdAWp7A3Zmp/Zp6I2IxS9LHMI2xa+paRiCQFvJqbTSRw3kymstoGiHAvn2AfjOXCQ0MpOu7cIPe0W7fhqGVFfreuYMmt0cxDKMca9Q7WG0tkJcH+Pu3noDMxkbaq2e0BI+nvGCLkZF0OMXTs+k65qlIJMDcuehV
UQGd119Hr3Yo0sYwzwXWqHewh7K7RF1cgP37m08Lq6PDOn5aqa6u6Q/F2BhYv14z8XRRBw7AMi0NRpQCqakwjoxEt2c9po6OzgB54Y8JEyb0bq2wijLr16/v8TT7dQZtLXUaGhraa//+/Zbt8dqNS92+/PLLrlwul//JJ5/0ePPNNx1Onz5t9izHP3jwYLc1a9bYA9JiNj169PDj8Xh8Ho/HX7p0qdKS4O1FXvSnrfstXLjQ6cyZM20+7y75y6nN5LezubgAL70EHDvWdBiew5Euf+mljo+PaYFAAJw7B0ya9G+iGRMTYPdulnimHQmF4KxahZ7V1dL/T9XV4Lz5JlzKyp7t/5WBgYEkJSUlKT09PVFPT49u27atzaU5d+/ebVtRUfHUcSgrVcqgQalbgUCge+fOHZO0tLSktWvXFmzfvj23cSKblohETRMOfvHFF3arV68ulD9fvHhxvjzl7M6dO7WyAuiaNWsKPv/8c7u27sca9Q7WOJvclCnAkSP/Tpg2NASOHmWJZ7TSrl3Sr19+Cfj5ST999enDfljtbM0aONTUNPzfVFMDzurVcGiv1xg6dGhFRkaGAQCsW7fO1sPDw9vDw8N7/fr1PQDlZUw3bNjQo6CgQG/48OFceZlSReHh4dajR492CwgI8HRxcfFZvXq1PaC8VOnu3butuFwu38PDw3vJkiVPeoonTpww5/P5Xp6envzAwECuPJZXXnmll6+vr5eXl9eTUqsxMTGGvr6+Xjwej8/lcvnx8fEGzZVf/eOPP4xfeOEFT29vb6+hQ4d6ZGVl6cmXy0u8fvHFF83eZ/PBBx/Ycblcvqenp9Ke7Zo1a+x9fHy8PDw8vGfMmOEikV0v2bBhQw952deJEyf2BoDz58+bynvJXl5e/JKSEo7iKMGYMWO4BQUF+jwej3/x4kVTxRGB5s4jICDAc968ec4+Pj5eGzZssFWM7e7duwb6+voSe3t7cUu/Ey0de/78+c4+Pj5evXv39o6OjjYeN26cm4uLi8+KFSue/E6OGTPGzdvb28vd3d1769atSi+u7ty500r+M5s5c6aLWCyGWCxGaGhoLw8PD2/56AQAcLncutLSUl2BQNC2Xj6ltNkHpNXVrrS0jbY8BgwYQDuDdesoJYTS2tp/l0kklAYGUsrhUDp4sPQ5o2Wqqym1tqY0JET6PDmZ0m7dKE1J0WxcHQzSCott/vuMi4t7QCmNae1x6xZNMDCgEuntBg0fBgZUEhdH41U5jrKHkZFRPaU0pq6uLmbUqFElmzdvzrp69WqSh4dHlVAovFVaWnrLzc2t+s8//0zcv39/RlhYWKF836KiotuU0hgHB4fa3NzcOGXH/+qrr+5379697tGjR7fLy8tj3d3dq6Ojo5NSUlLuEkLo5cuXkymlMffv379jZ2dXm5OTE1dXVxczcODAssjIyIycnJw4W1vbuuTk5LuU0pi8vLzblNKYZcuWPfr666/vUUpjCgsLb7u4uNQIhcJbs2fPzt+5c+c9SmlMdXV1bHl5eayyuGtqamL79u1bkZOTE0cpjdmzZ0/m1KlTiyilMR4eHlUXLlxIoZTGLFy4MM/d3b268XkdPXo0rW/fvhVlZWW3FOOaMmVK0b59+zIVl1FKYyZNmvT48OHD6ZTSGBsbm7qqqqpYeeyU0piRI0eWXrp0KZlSGlNaWnqrrq4uJiUl5a78tRW/V3ydls7jhRdeKJ81a1aBsp/L9u3b7y9YsCBP/nzVqlW5NjY2dZ6enlWenp5VJ06cSGvt2IsXL35EKY1Zv369wMbGpu7Bgwd3qqqqYnv06FH36NGj24rvgfxnL18u/52JjY1NGDlyZGlNTU0spTRm1qxZBTt27Lh/9erVpMDAQKE8Pvn7RCmNCQsLK9y/f39G43OS/T0p/Vtr8RMApbSeECIhhFhQSjs8k1NXJBAA9vYNJ1HLJ1YHBrLr6FrryBHg8eN/q7HxeEBREauV247kk+OUjJ4CAEQi4PXX0Ss2
FqlPk4emtraWw+Px+AAwcODA8pUrVxZt2bLF5qWXXio1NzeXAMD/+3//r+TKlStmwcHBwsZlTFV5jaFDh5bZ2dnVy48VFRVlGhYWVqpYqvTPP/80GTRoULmDg4MYAMLCwoqjo6NNdXR0aEBAQDmPx6sDAHkZ2KioKPNLly51Cw8Pt5OdB8nIyNAPDAys3Lp1q312drb+9OnTS3x9fWuVlV+9efOmYXp6utGoUaO40vdZAhsbG1FRUZFOeXm5zoQJEyoAYN68eY9///13i8bn9Ouvv5q/+uqrTwrfKCtP+/PPP5t98cUXdjU1NZzS0lJdPp9fDUDo6elZHRIS4hocHFw6a9asUgAYNGhQxZo1a5ynTZtWPGPGjBI3NzeVpkHevXvXQNl5yNc3rv4m9+jRIz0bG5sGvfTFixfnr1+/Pl/+vLn3SL4+JCSkFAD69OlT7e7uXu3i4iICAGdn59p79+7p29nZVX/++ee258+f7wYAeXl5eomJiYZ2dnZPci1cvHjRLCEhwbhPnz5eAFBTU8Pp0aOHOCwsrPThw4cGc+bMcX755ZeFISEhTzJl2djYiHNyctqUmU+Vbn0FgHhCyK8AngRIKV3RlhdipASChiVX5VgbocUoBXbsALy9gZEj/13OfljtKikJBgkJMGluprtEAhIfD9PkZBh4e6PN1bfk19RV2VZexvTHH3+0+OijjxwvX75ctnXr1keK20RGRnbbtGmTAwDs2bPnAQCQRp/I5c+fpQQopRQnTpzI6NOnT4Nz7t+/f82wYcMqT506ZTFx4kSPHTt2ZAUHB5c3jnvatGml7u7u1XFxcQ0yYxQVFbXLL3BVVRVZvXq1y40bN5Lc3d1Fb731lkNNTQ0HAK5cuZL+888/m/30008WW7dutU9NTU3ctGlT3uTJk4U//fSTxbBhw3jnz59PV+X9oZQSZech11y1PSMjI4lQKGytA9viseVpgTkcDgwMDJ4kd+FwOBCLxeTcuXNm0dHRZjExMSlmZmaSgIAAz8blZyml5JVXXnn89ddfN7mGn5CQkHTq1Cnzb775xubo0aNWx48ffwAANTU1RFmBm5ao8nn3JICPAFwFEKvwYJ5Cc406wNoIrfX339LyecuXs2EUNeLzUevjg0oOB0ozYnE4oL6+qPDyanuD3pyRI0dWXLhwoVt5eTmnrKyMc+HCBcuRI0eWKytjCgAmJib18lKks2fPLpVPtnrxxRerAODPP/80z8/P16moqCAXLlzoNnz48CY9/GHDhlXeuHHD7NGjR7pisRjHjx+3GjFiRMWIESMq//nnH7OUlBR9AMjPz9eRxVi2bds2W/l16mvXrhkBQFJSkr6Xl1fthx9+WDB+/PjSuLg4I2Vx+/n51RQXF+tevnzZBJD29GNiYgy7d+9eb2ZmVn/p0iVTAPj++++VloUdP3582aFDh7rLZ/3L45KrqqriAICdnZ1YKBRyzp49awkA9fX1yMzM1H/55ZfLv/7665yKigodoVCok5iYaBAQEFC9cePGPD8/v8qEhARDVX5WzZ1Ha/t5e3vXZGZmtpjr4GmPLVdaWqpjYWFRb2ZmJrl9+7bhnTt3msycDQoKKjt37pylvHxtfn6+Tlpamv6jR4906+vr8frrr5d+9tlnOfHx8U/K42ZmZhr26dOnuvGxWtJqT51SeqAtB2SaR6m0UQ8O1nQkTJtERAAWFsCrr2o6ki6NwwH278eDwEDwa5U023p6wPff40F7poAfOnRo1cyZMx/379/fCwBee+21wiFDhlT/+OOP5o3LmALAnDlzioKCgri2trZ1N27cSGt8PD8/v8rg4GC3vLw8/alTpz5+8cUXq1JTUxsMn7q4uIjWrl2bM3z4cC6llIwZM6b01VdfLQWA8PDwByEhIe4SiQTW1tai69evp2/evDl34cKFPXk8Hl8ikRBnZ+faK1euZBw6dMjq2LFj1rq6utTGxkb06aefPvrzzz9NGsdtaGhIjxw5krlixYqe5eXl
OvX19WTJkiX5/v7+Nd99992DBQsW9CKEYMSIEWWNzwcApk6dWnbr1i3jvn37eunp6dExY8YIIyIinvQ2u3fvXj9r1qxCLy8vbxsbG3GfPn0qAUAsFpOZM2e6lpeX61BKyYIFCwq6d+9ev3r1aofr16+bE0Kop6dn9dSpU4UCgaDVGqktnUdL+40fP77ivffec5ZIJOA088vztMeWCw0NFe7Zs8emd+/e3r17966RvweKBgwYUPPhhx/mjB49miuRSKCnp0fDw8MFxsbGkvnz5/eSSCQEANavX58NSD9YPHjwwODFF19sU7rkVtPEEkKGAFgHwAXSDwEEAKWU9m5pv47WGdLEFhQAtrbSkVz5pVlGy+XlSYdWli2Tznp/znVEmtg33oDTwYOwqa39dyTRwACS115D4d69yG5pX00KDw+3jomJMYmMjBRoOhamoblz5zpPmjSptC23xmlaZGRkt9jYWOOvvvoqt/G6Z00T+x2ALwAMBfACAH/Z11YRQoIIIamEkAxCyHvNbDONEJJECEkkhPygsLyeEBIne5xR5fW0XePb2ZhOYPdu6QytpUs1HclzY9s25BoaosF1RENDSLZtQ5N/bgyjivXr1z+qrKzsVLdwi8Vi8tFHH+W3vmVDqkyUE1JKf27rgQkhOgC+BjAWQDaAm4SQM5TSJIVtPAD8H4AhlNISQojifZLVlNK+bX1dbZaVJf3KGvVOoq4O+OYbYMIEabJ+pkOYm0Py5ZcQLFuGXtXV4BgZQbJ9O7LMzaHVyWJXrFjxGMDjVjdkOpyzs7N41qxZneoOrnnz5pU8zX6qfHK5QgjZQggJJIT0lz9U2C8AQAal9B6ltA7AEQCTGm3zBoCvKaUlAEApLWhT9J2MYjY5phM4eVI6/M6ulXS4OXNQwuWimhDA0xNVs2ejVNMxMUxnoEpPXV5SRPE6GgUwqpX9HAE8VHierXAsOS4AEEKuQZroZh2l9KJsnSEhJAaAGMBmSunpxi9ACFkIYCEA9OwE3V+BADA1Bbo9cxZrpkNERADu7kBQkKYjee7IJ82NfFHM+/573XadHMcwXVmzjTohZCWl9CsAH1FK/1Tj63sAGAHACcBVQogvpbQUgAulNIcQ0hvA74SQeEpppuLOlNI9APYA0olyaoqx3WRlSYfe2V1RncDt28C1a9LJcaxF0Yh+uI0SMhwEfwDoo+lwGKZTaOm/1VzZ1/CnPHYOAGeF506yZYqyAZyhlIoopfcBpEHayINSmiP7eg9AFIBOX1lcIGBD751GRIS0+trrr2s6kueTNL1cL1JRoYPXX+8FVnuVYVTSUqOeTAhJB+BJCLmr8IgnhNxV4dg3AXgQQlwJIfoApgNoPIv9NKS9dBBCukM6HH+PEGJJCDFQWD4EgEqZoLRZS4lnGC3y+DHwww/A7NnsWommHDhgibQ0I0hrrxojMpKVXlWz56n0KiFkQEJCwpOENOvXr+9BCBlw9epV4+aPIi3u0to2ymzatMlm+/bt1m2PvO2a/eWklM4AMAxABoCXFR4TZV9bRCkVA1gO4BKAZADHKKWJhJD1hBB5+pVLAB4TQpIAXAHwNqX0MQAvADGEkDuy5ZsVZ813RtXVQGEha9Q7hW+/BWpqpPemMx1PKORg1aqekKfZrK7m4M03XVBWxkqvdlEdXXrVw8OjOjIy8kkGvdOnT1u5u7urlGjmafznP/95vHv3btvWt3x2Lf5yUkrzKKV9KKVZjR+qHJxSeoFSyqWUulFKN8qWfUwpPSP7nlJK36KU8imlvpTSI7Ll12XP+8i+fvesJ6pp7B71TqK+Hti5U5rj3cdH09E8n9ascYAsd/gTNTUcrF7NSq+y0qvtUnr1pZdeKr1w4UI3AEhMTDQwMzMTW1paPlk/a9asnj4+Pl7u7u7eq1atUvp7d/LkSfO+ffvy+Hy+14QJE3rL0wcvXbrUUX7OCxcudAKkeemdnJxqr1y50uZeflu1rU4r89TY7WydxNmz0h8Wyx6n
GbdvG+LgwR6orW04nbS2loODB3tg+fJCNCps0lYikQiXLl0yHzduXNkff/xh/MMPP1jHxsYmU2kJZ6/Ro0eXp6enG9jZ2YmioqIyAODx48c61tbW9bt27bKNjo5Oa6429927d03i4+MTTU1NJf369eNPmjRJaGtrKxYIBAbffffd/dGjRz948OCB3rp16xxjY2OTbWxsxMOGDeMePHiw2+jRoyuWL1/eKyoqKoXH49XJc6y///779iNHjiw7fvz4g6KiIh1/f3+v4ODgsh07dtgsXbo0f8mSJcU1NTVELBbjxIkTFo3jrq2tJStWrOh5/vz5DAcHB/HevXst16xZ43j8+PEH8+fP7/XVV18JJkyYULFo0SInZed07Ngx8wsXLnSLjY1NMTMzkzTO/Q4Ab7/9doG84M3kyZNdjxw5YjFz5kxheHi4XVZWVryRkRGVF5DZtm2bXXh4eNa4ceMqhUIhx9jYWFJQ8O/dzGfPns2YOHGih7z4zt69e7sD0rSpzZ0HANTV1ZGEhITkxrFduXLF1M/Pr0pxmbm5eb2Dg0PdzZs3DU+cONFt6tSpJQcPHnxSA/2LL77IsbW1rReLxRg8eLDnjRs3jAYOHPgkB/ujR490N23aZH/16tU0c3NzyQcffGD36aef2q5Zs6bgwoULlvfu3UvgcDgNiub079+/MioqymzkyJENYmlvXfLakDZiPfVOIiICcHZmCfo1QTY5Di3XXn3qSXPy0qu+vr58JyenupUrVxZFRUWZykuvWlhYSOSlV/v371/9xx9/mC9ZssTx4sWLptbW1iqNmctLr5qamlJ56VUAaK70qp6e3pPSq1FRUSbNlV798ssv7Xk8Hn/o0KGeiqVXt23bZv/BBx/Ypaen65uamlJlcSuWLOXxePwtW7bY5+bm6ikrvarsnFQtvern58fjcrn869evmyUkJBgBgLz06s6dO6309PQo8G/p1Q0bNvQoKirS0dNrNe07gIalVxXPQ76+LaVXAWDatGnFBw8etDp//rzlrFmzGiR6OXDggBWfz/fi8/n89PR0wzt37jQo7hIVFWWSmZlpGBAQwOPxePwjR45YCwQCfWtr63oDAwNJWFhYrwMHDnQzNTV98svao0cPsWK86sJ66h0kK0t6Z5RDuw0gMu0uKQn47Tfgs88AXfan0eGSkgyQkGDSbKMtkRDEx5siOdkA3t6s9CorvdpAW0uvhoWFCT/++GMnX1/fKisrqyf7pqSk6EdERNjKRlLqQ0NDe9U0uhxEKcXQoUPLzp49e7/xcePi4pLPnDljfuLECctdu3b1+Pvvv9MAaf30tpZRfRrN9tQJIWcJIWeae6g7sK5GIAAcHaWVphgt9fXXgIEBsGCBpiN5PvH5tfDxqQSHozznBIdD4etbAS8vVnoVrPTqs5ZeNTMzk6xbty77o48+avBhraSkRMfIyEhiZWVV//DhQ92oqCiLxvuOGDGiMiYmxlQ+g76srIxz9+5dA6FQyCkuLtYJCwsTfvPNNw9TUlKeXENPS0sz8PHxaVMZ1afRUndkq+zrFAB2AA7Jns8A0OYk8887djublhMKgQMHgBkzgO7dW9+eaX/SNHIPEBjIR/O1Vx+0ZzIgVnr1+S69unDhwib51QMDA6t9fHyq3NzcfOzt7esGDBjQ5IOZg4ODePfu3Q+mT5/eu66ujgDA2rVrcywsLCQTJ050r5XNCfn000+fZFW9efOm6eeff672okSqlF6NaVxqUdkyTdP20qtubsDAgdLbnxkt9NVXwJtvAjExwIABmo5Ga3VE6VW88YYTDh60QW3tv/+BDQwkeO21Quzdy0qvMm2m6dKr165dM9qyZYvd6dOnmwzXP41nLb1qIkvVCgAghLgCMGmPwJ4XEgnw8CGb+a61JBLp0HtgIGvQtcG2bbkwNGx47dHQUIJt21jpVeapaLr0akFBgd7nn3/eOKOqWqgyG2gVgChCyD0ABIALgEVqjaqLyc+XTtxlw+9a6pdf
gPR0YN06TUfCAIC5uQRffinAsmW9UF3NgZGRBNu3Z8HcXKtzxbLSq9pL06VXQ0JClF7aUIdWP7nIqqZ5AFgJYAUAT0rpJXUH1pWw29m0XEQEYGcHTJ2q6Ui6MolEIlG9lNGcOSXgcqshrb1ahdmzWelVhgEg+ztq9gNuq406IcQYwNsAllNK7wDoSQiZ2H4hdn1Zsvx7rFHXQpmZwIULwKJFgL5+69szTyuhsLDQQuWGXT5pztS0vr0nxzFMZyWRSEhhYaEFgITmtlFl+H0/gFgAgbLnOQCOAzj3zBE+J1g2OS22cyegowMsXKjpSLo0sVi8IC8v79u8vDwfqJr0isMBoqKyAZjhzp1nKujBMF2EBECCWCxu9r5bVRp1N0ppGCFkBgBQSqtI4wwLTIsEAsDCAjA313QkTAOVlcC+fdJhd5YVSK0GDBhQAICl6WMYNVPlE3MdIcQIAAUAQogbgHZL/vA8yMpiQ+9a6dAhoLQUWL5c05EwDMO0C1V66usAXATgTAg5DGlt89fVGFOXIxCwoXetQ6l0gly/fsDgwZqOhmEYpl202qhTSn8hhMQCGATpLW0rKaWqJZFgAEgbddZuaJnoaCAhAfjuO4BdTWIYpotQZfb7bwAGUkrPU0rPUUqLCCF7OiC2LqGiAiguZsPvWiciArC2lqaFZRiG6SJUuabuCuBdQshahWValSJWm7GZ71ro4UPg9Glp4RYjI01HwzAM025UadRLAYwGYCur3NakYg3TPJZ4Rgt98430mvqSJZqOhGEYpl2p0qgTSqmYUroUwI8A/gTQQ5WDE0KCCCGphJAMQsh7zWwzjXt1Np4AABokSURBVBCSRAhJJIT8oLB8DiEkXfaYo8rraSPWqGuZmhpgzx4gOJgNnzAM0+WoMvv9G/k3lNLvCSHxAJa1thMhRAfA1wDGAsgGcJMQcoZSmqSwjQeA/wMwhFJaQgjpIVtuBWAtpMP8FECsbN8mZfK0XVYWoKsL2NtrOhIGAHD0KFBUxG5jYximS2q2p04IkadKOU4IsZI/ANwHsEaFYwcAyKCU3qOU1gE4AmBSo23eAPC1vLGmlBbIlo8H8CultFi27lcAQSqflRYRCAAnJ2nSMkbDKAV27AC8vIBRozQdDcMwTLtrqaf+A4CJkKaIpZDeziZHAfRWtpMCRwAPFZ5nAxjYaBsuABBCrgHQAbBOVkBG2b6OjV+AELIQwEIA6Kml49sCARt61xo3bgCxsdIyq+w2NoZhuqBmG3VK6UTZV1c1v74HgBEAnABcJYT4qrozpXQPgD0A4O/vT9UR4LPKygKGDdN0FAwA6W1s5ubA7NmajoRhGEYtmm3UCSH9W9qRUnqrlWPnAHBWeO4kW6YoG8ANSqkIwH1CSBqkjXwOpA294r5Rrbye1qmvB7Kz2XwsrZCXBxw7BixdCpiaajoahmEYtWhp+H1bC+sogNYuSt4E4EEIcYW0kZ4OYGajbU4DmAFgPyGkO6TD8fcAZALYRAixlG03DtIJdZ3Ko0fShp0Nv2uBvXsBkUjaqDMMw3RRLQ2/j3yWA1NKxYSQ5QAuQXq9fB+lNJEQsh5ADKX0jGzdOEJIEoB6AG9TSh8DACHkU0g/GADAekpp8bPEownsdjYtIRJJ700PCgK4XE1HwzAMozaq3NIGQogPAD4AQ/kySmlka/tRSi8AuNBo2ccK31MAb8kejffdB2CfKvFpq6ws6VfWqGvYyZNAbq70/nSGYZgurNVGXZYedgSkjfoFABMgTUDTaqP+vGM9dS0REQH07g1MmKDpSBiGYdRKlYxyUyFNE5tHKZ0LoA8AlipWBQIBYGXF5mVpVFwc8OefwLJlAEeVX3eGYZjOS5X/ctWUUgkAsSwhTQEazmpnmpGVxXrpGhcRARgbA/PmaToShmEYtVPlmnoMIaQbgL2QJqKpAPCXWqPqIgQC6agvoyGPHwOHDwNz5gDdumk6GoZhGLVrsVEnhBAAn1FKSwF8Qwi5CMCcUnq3
Q6Lr5AQCYMQITUfxHNu3T1rAZVmrpQoYhmG6hBYbdUopJYRcAOAre/6gI4LqCoRC6YMNv2tIfT2wc6f0U5WvykkKGYZhOjVVrqnfIoS8oPZIuhj5zHeWTU5Dzp0DHjxg1dgYhnmuqHJNfSCAWYSQLACVkBZ2oZRSP7VG1smx29k0LCJCWh5vUuPCgAzDMF2XKo36eLVH0QWxRl2DkpOBy5eBjRulxewZhmGeE60Ov1NKsyC9hW2U7PsqVfZ73mVlAfr6gK2tpiN5Dn39NWBgALzxhqYjYRiG6VCtNs6yjHLv4t+CKnoADqkzqK5AIACcnVm+kw5XVgYcOABMnw7Y2Gg6GoZhmA6lSpMTAiAY0uvpoJTmAjBTZ1BdgUDAht414sABoKKCTZBjGOa5pEqjXicrvEIBgBBiot6QugaWTU4DJBLpBLlBgwB/f01HwzAM0+FUmUV0jBCyG0A3QsgbAOZBml2OaYZIJC0Kxm5n62C//gqkpQGH2NUhhmGeT6026pTSrYSQsQDKAHgC+JhS+qvaI+vEcnOlnUbWU+9gERHSmYmvvKLpSBiGYTRCpft9ZI04a8hVxOqoa8C9e8D588CHH0pvO2AYhnkOqTL7fQohJJ0QIiSElBFCygkhZR0RXGfFsslpwM6dgI4OsHixpiNhGIbRGFV66v8F8DKlNFndwXQV8kbdmRWo7RiVlcB33wGhoYCDg6ajYRiG0RhVZr/nP22DTggJIoSkEkIyCCHvKVn/OiGkkBASJ3ssUFhXr7D8zNO8vqYIBNJbpI2MNB3Jc+KHH4DSUnYbG8Mwzz1V66kfBXAaQK18IaX0ZEs7EUJ0AHwNYCyAbAA3CSFnKKVJjTY9SilV9t+4mlLaV4X4tA67na0DUQrs2AH07QsMGaLpaBiGYTRKlUbdHNLUsOMUllEALTbqAAIAZFBK7wEAIeQIgEkAGjfqXY5AAPB4mo7iOXH1KhAfD3z7LUCIpqNhGIbRKFVuaZv7lMd2BPBQ4Xk2pBXfGgslhLwIIA3AKkqpfB9DQkgMADGAzZTS0413JIQsBLAQAHpqSdeYUmmjPm5c69sy7SAiArCyAmbO1HQkDMMwGqfK7HcuIeQ3QkiC7LkfIeTDdnr9swB6ycq4/grggMI6F0qpP4CZALYTQtwa70wp3UMp9aeU+ttoSZ7vkhJpllIt+YzRtT18CJw6BcyfzyYwMAzDQLWJcnshLeYiAgD6/9u7/2iryjqP4+8PKJCCgwaGgs3FQs1KTS/YL5nWSk1LwdLK0hUsLGIp1RonZ2rV1KR/TL+Ws5oLEmBkZqn5YymOOGpNpjkJXFMxTSZU7gW0QH6JCsiF7/yx92WdbvfHPvecffc5535ea511795n732/z4V1v+fZ+3m+T8Qq4MIM520gWd2t04R0334RsTkiOp/TXwucUvLehvTrc8ADwLsy/MzCeTrbAFq4MKnyc+mlRUdiZlYTsiT1gyJiRZd9HRnOWwlMkjRR0jCSDwJ/NYpd0hElm9OAP6b7D5U0PP1+DPA+6uRZvNdRHyC7dsGiRXDuudDUVHQ0ZmY1IctAuZfSW9+dC7pcALzY10kR0SFpLnAvMBRYEhFPSboSaI2IpcAXJU0j+ZCwBZiZnv42YKGkfSQfPL7dzaj5muRqcgPklltg0yb4wheKjsTMrGYoWYCtlwOko4FFwHuBrcDzwMURsTb36MrQ3Nwcra2tRYfBFVckY7dee82DsXM1ZQrs2AFPP+1f9ACS9Gg61sXMalCW0e/PAaenS64OiYgd+YdVvzrXUXeeydHy5bByZfLpyb9oM7P9+kzqki7vsg2wHXg0Ih7PKa661ZnULUfz5sGoUfCZzxQdiZlZTckyUK4ZmEMy73w88HngLGCxpH/OMba65GpyOfvLX+Dmm2HmzCSxm5nZflkGyk0ATo6IVwAkfRO4G5gKPEqy4IsBu3fDiy96OluuFi+GPXvgssuKjsTMrOZk6akfTknNd5L56m+KiJ1d9g96
G9JZ+O6p52TPHliwICnXd+yxRUdjZlZzsvTUfwYsl3Rnun0u8PN04FxdTDMbKJ7OlrM77oAXXkiKzpiZ2d/IMvr9Kkn3kBSAAZgTEZ1zxy7KLbI65GpyOWtpgaOPhrPPLjoSM7OalKWnTprEi58EXuM6k/qECcXG0ZCeeAIeegi+/30YOrToaMzMalKWZ+qWUVsbjBsHw4cXHUkDmjcvWbRl1qyiIzEzq1lO6lXU3u5b77nYsgV+9jO4+GI49NCiozEzq1lO6lXkwjM5WbIEdu6EuXOLjsTMrKY5qVdJhJN6LvbuhfnzYepUOOGEoqMxM6tpTupV8tJLSWfSSb3Kli2DtWu9GpuZWQZO6lXi6Ww5aWlJphOcd17RkZiZ1Twn9SrpTOruqVfRM8/A/ffDnDlwQKbZl2Zmg5qTepW4mlwO5s+HYcPgc58rOhIzs7rgpF4l7e1w8MFw2GFFR9IgXn4ZrrsOPvlJOPzwoqMxM6sLTupV0jnyPVlu3ip2/fXwyiseIGdmVoZck7qksyStlrRG0le6eX+mpE2SHk9fny15b4akP6WvGXnGWQ1eR72K9u1LKshNmQKTJxcdjZlZ3cgtqUsaCswHzgaOBz4l6fhuDr05Ik5KX9em5x4GfBM4FZgCfFNSTZcSczW5KvrlL2H1anjPe+CRR5IiAGZm1qc8e+pTgDUR8VxEvA7cBEzPeO6HgPsjYktEbAXuB87KKc6K7dwJGze6p14Vy5bB9OnJc4wlS+CMM5Jf7LJlRUdmZlbz8kzq44F1Jdvr031dnS9plaRbJR1VzrmSZktqldS6adOmasVdtvXrk69O6hVatgw+9jHYtSvpne/YkTxXX78eLrjAid3MrA9FD5S7C2iKiBNIeuM/KefkiFgUEc0R0Tx27NhcAszC09mqIAJmz4bdu7t/f+dO+PznfSvezKwXeSb1DcBRJdsT0n37RcTmiOj8K34tcErWc2uJq8lVwfLlsG1b78ds2wYrVgxMPGZmdSjPpL4SmCRpoqRhwIXA0tIDJB1RsjkN+GP6/b3AmZIOTQfInZnuq0nt7ckj4PHdPVywbF58MVm8pTdDhsALLwxMPGZmdSi32psR0SFpLkkyHgosiYinJF0JtEbEUuCLkqYBHcAWYGZ67hZJV5F8MAC4MiK25BVrpdra4Mgj4cADi46kjo0bB6+/3vsx+/Ylv2gzM+tWrgW1I2IZsKzLvm+UfP9V4Ks9nLsEWJJnfNXi6WxVsGdPkrR7M3p0MnfdzMy6VfRAuYbgddSrYN48GDkSRozo/v03vAEWLnTJPjOzXjipV2jfPli3zkm9IuvXw+23J6ux3XZbstTqyJFwyCHJ1wkT4NZb4cMfLjpSM7Oa5vUsK7RxYzILy0m9AgsXJp+OLr0UJk5Mbn2sWJEMijvyyOSWu3voZmZ9clKvkKezVWj3bli0CM45J0nokCTwU08tNi4zszrk2+8V6kzq7qn30y23JLc7vBqbmVnFnNQr5GpyFWppgWOPhQ9+sOhIzMzqnpN6hdrbk/Fco0cXHUkdWrEiec2dmxSWMTOzivgvaYU8na0C8+bBqFEwY0bRkZiZNQQn9Qq1tTmp98vGjXDzzUlCHzWq6GjMzBqCk3qFXE2unxYvTsrCXnZZ0ZGYmTUMJ/UKvPoqbN7snnrZ9uyBBQvgjDPguOOKjsbMrGF4nnoF1q1Lvjqpl+nOO2HDhiSxm5lZ1binXgFPZ+unlhZoanLZVzOzKnNSr4CryfXDqlXw4IPJs/ShQ4uOxsysoTipV6C9PclLRxxRdCR1ZN68ZMW1WbOKjsTMrOE4qVegrQ3Gj4cDPDIhmy1b4IYb4KKL4LDDio7GzKzhOKlXwNPZyvTjH8POnUkFOTMzqzon9Qq4mlwZ9u6F+fPhtNPgxBOLjsbMrCHlmtQlnSVptaQ1kr7Sy3HnSwpJzel2k6Sdkh5PXz/MM87+2Ls3mdLmpJ7RPffA8897NTYzsxzl9jRY0lBgPnAGsB5Y
KWlpRDzd5bhRwJeA5V0u8WxEnJRXfJX685+ho8O33zNraUkGIJx3XtGRmJk1rDx76lOANRHxXES8DtwETO/muKuA7wC7coyl6ryOehlWr4b77oM5c+DAA4uOxsysYeWZ1McD60q216f79pN0MnBURNzdzfkTJT0m6TeSTssxzn5xUi/D/PkwbBjMnl10JGZmDa2wyViShgBXAzO7eftF4M0RsVnSKcAdkt4eES93ucZsYDbAmwc4u7qaXEY7dsB118EnPgGHH150NGZmDS3PnvoG4KiS7Qnpvk6jgHcAD0haC7wbWCqpOSJ2R8RmgIh4FHgWOKbrD4iIRRHRHBHNY8eOzakZ3Wtvh0MP9aqhfbr++iSxe4CcmVnu8kzqK4FJkiZKGgZcCCztfDMitkfEmIhoiogm4BFgWkS0ShqbDrRD0tHAJOC5HGMtm6ezZRCRVJCbPBmmTCk6GjOzhpfb7feI6JA0F7gXGAosiYinJF0JtEbE0l5OnwpcKWkPsA+YExFb8oq1P9raPPK9T7/6FTzzTNJbNzOz3OX6TD0ilgHLuuz7Rg/HfqDk+9uA2/KMrVLt7TB1atFR1LiWFhg7NnmebmZmuXNFuX54+WXYts2333u1di3cdVcy4n348KKjMTMbFJzU+8HT2TK45hoYMiSZm25mZgPCSb0fvI56H157Da69Fj76UZgwoehozMwGDSf1fnBPvQ833ghbt3oam5nZAHNS74f29qTa6bhxRUdSgyKSAXLvfGeyIpuZmQ2YwirK1bO2tuSu8hB/JPpbDz8MTzwBixaBVHQ0ZmaDitNSP7S3+3l6j1paYPRo+PSni47EzGzQcVLvB1eT68GGDXD77XDJJXDwwUVHY2Y26Dipl6mjI8ldTurdWLgQ9u6FSy8tOhIzs0HJSb1ML7yQ5C3ffu9i9+4kqX/kI3D00UVHY2Y2KDmpl8nT2Xpw662wcaOnsZmZFchJvUxeR70HLS1wzDFw+ulFR2JmNmg5qZfJPfVurFwJy5fD3Lme52dmViD/BS5TezuMGQMHHVR0JDVk3jwYORJmzCg6EjOzQc1JvUyeztbFpk1w001JQj/kkKKjMTMb1JzUy9TW5qT+VxYvhtdfT269m5lZoZzUyxCRJHVPZ0t1dMCCBcnguOOOKzoaM7NBz7Xfy7B9O7zyinvq+915J6xfD/PnFx2JmZnhnnpZPJ2ti5YWaGpKCs6YmVnhck3qks6StFrSGklf6eW48yWFpOaSfV9Nz1st6UN5xplV53Q2334HnnwSfvObpCTs0KFFR2NmZuR4+13SUGA+cAawHlgpaWlEPN3luFHAl4DlJfuOBy4E3g4cCfxS0jERsTeveLPwHPUS8+bBiBEwa1bRkZiZWSrPnvoUYE1EPBcRrwM3AdO7Oe4q4DvArpJ904GbImJ3RDwPrEmvV6j2dhg+HMaOLTqSgm3dCjfcABddBG98Y9HRmJlZKs+BcuOBdSXb64FTSw+QdDJwVETcLemKLuc+0uXc8V1/gKTZwOx0c7ekP1Qj8L4UcLd5DPDSgP/UvvzoR8mr/2qzXdXRqG07tugAzKxnhY1+lzQEuBqY2d9rRMQiYFF6vdaIaO7jlLrUqG1r1HZB47ZNUmvRMZhZz/JM6huAo0q2J6T7Oo0C3gE8IAlgHLBU0rQM55qZmVkXeT5TXwlMkjRR0jCSgW9LO9+MiO0RMSYimiKiieR2+7SIaE2Pu1DScEkTgUnAihxjNTMzq3u59dQjokPSXOBeYCiwJCKeknQl0BoRS3s59ylJvwCeBjqAyzKMfF9UrdhrUKO2rVHbBY3btkZtl1lDUEQUHYOZmZlVgSvKmZmZNQgndTMzswZRd0m9r9Kz6eC6m9P3l0tqGvgoy5ehXVMl/V5Sh6QLioixvzK07XJJT0taJelXkuqiEG+Gds2R9KSkxyX9Nq2UWBcqKfFsZsWpq6ReUnr2bOB44FPd/KG8BNgaEW8F/oOkWl1Ny9iudpI5/T8f2Ogq
k7FtjwHNEXECcCvw3YGNsnwZ2/XziHhnRJxE0qarBzjMfsnYtm5LPJtZseoqqZOt9Ox04Cfp97cCH1Q6Eb6G9dmuiFgbEauAfUUEWIEsbft1RLyWbj5CUpeg1mVp18slmwcD9TIqtZISz2ZWoHpL6t2Vnu1aPnb/MRHRAWwHar1AeZZ21aty23YJcE+uEVVHpnZJukzSsyQ99S8OUGyV6rNtpSWeBzIwM+tdvSV1a2CSLgaage8VHUu1RMT8iHgL8C/A14uOpxpKSjz/U9GxmNlfq7eknqV87P5jJB0A/B2weUCi679GLoubqW2STge+RlJVcPcAxVaJcv/NbgLOyzWi6imnxPNa4N0kJZ49WM6sYPWW1HstPZtaCsxIv78A+J+o/Qo7WdpVr/psm6R3AQtJEvrGAmLsjyztmlSy+RHgTwMYXyUqKfFsZgWqq6SePiPvLD37R+AXnaVn04VgAH4EvFHSGuByoMfpOLUiS7skTZa0Hvg4sFDSU8VFnF3Gf7PvASOBW9LpXzX/gSZju+ZKekrS4yT/F2f0cLmakrFtZlaDXCbWzMysQdRVT93MzMx65qRuZmbWIJzUzczMGoSTupmZWYNwUjczM2sQTuo24CQ90FmoRNIySaMrvN4HJP1XD+/dmK7+9o+V/Awzs3pwQNEBWONJF9BRRPS5+ExEfDjHOMYBk9MV+7Kec0A6T9vMrO64pz5ISPrXdH3s36a91y+n+98i6b8lPSrpIUnHpfuvk/Sfkv5X0nOla7hLukLSyrQH/K10X1N6/euBPwBHSVogqTUtwPKtHuJaK2lMuvb44+nreUm/Tt8/U9LvlKwlf4ukken+syQ9I+n3wMd6aPZ9wPj0mqeldwh+kG7/QdKU9Fr/Jumnkh4GflqN37eZWRGc1AcBSZOB84ETSdbILq3RvQj4QkScAnwZuKbkvSOA9wPnAN9Or3UmMIlkec6TgFMkTU2PnwRcExFvj4g24GsR0QycAPyDpBN6ijEifpiuOz6ZZFWwqyWNIVkE5fSIOBloBS6XNAJYDJwLnAKM6+Gy04BnI+KkiHgo3XdQ+nMuBZaUHHt8+nM+1VOMZma1zrffB4f3AXdGxC5gl6S7ANJe73tJyrN2Hju85Lw70lvoT0t6U7rvzPT1WLo9kiSZtwNtEfFIyfmfkDSb5P/ZESSJc1Ufsf6ApF7/XZLOSc95OI1vGPA74Djg+Yj4U9qOG4DZGX8XNwJExIOSDil5nr80InZmvIaZWU1yUh/chgDb0p5rd0pXS1PJ13+PiIWlB0pqAl4t2Z5I0vOfHBFbJV0HjOgtGEkzgb8nqTve+bPu79p7ltRTvFl0rYvcuf1q1wPNzOqNb78PDg8D50oakfbOzwGIiJeB5yV9HJIBbpJO7ONa9wKzSp5tj5d0eDfHHUKSKLenvfyze7uopM7b/xeXDLB7BHifpLemxxws6RjgGaBJ0lvS48q5Zf7J9FrvB7ZHxPYyzjUzq2nuqQ8CEbEyXflsFfAX4EmgM5ldBCyQ9HXgQJJ1v5/o5Vr3SXob8Lv0lvgrwMXA3i7HPSHpMZIEvI7kg0Vv5gKHAb9Or9saEZ9Ne+83Sup8LPD1iPi/9Lb+3ZJeAx4iWeM7i11pXAcCszKeY2ZWF7xK2yAhaWREvCLpIOBBYHZE/L7ouAaSpAeAL3vdbzNrVO6pDx6LJB1P8lz7J4MtoZuZDQbuqZuZmTUID5QzMzNrEE7qZmZmDcJJ3czMrEE4qZuZmTUIJ3UzM7MG8f+4tntQUvTdwQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "y_pred = postproc.predict_proba(X_test)[:, 1]\n", + "y_lr = postproc.estimator_.predict_proba(X_test)[:, 1]\n", + "br = postproc.postprocessor_.base_rates_\n", + "i = X_test.sex == 1\n", + "\n", + "plt.plot([0, br[0]], [0, 1-br[0]], '-b', label='All calibrated classifiers (Females)')\n", + "plt.plot([0, br[1]], [0, 1-br[1]], '-r', label='All calibrated classifiers (Males)')\n", + "\n", + "plt.scatter(generalized_fpr(y_test[~i], y_lr[~i]),\n", + " generalized_fnr(y_test[~i], y_lr[~i]),\n", + " 300, c='b', marker='.', label='Original classifier (Females)')\n", + "plt.scatter(generalized_fpr(y_test[i], y_lr[i]),\n", + " generalized_fnr(y_test[i], y_lr[i]),\n", + " 300, c='r', marker='.', label='Original classifier (Males)')\n", + " \n", + "plt.scatter(generalized_fpr(y_test[~i], y_pred[~i]),\n", + " generalized_fnr(y_test[~i], y_pred[~i]),\n", + " 100, c='b', marker='d', label='Post-processed classifier (Females)')\n", + "plt.scatter(generalized_fpr(y_test[i], y_pred[i]),\n", + " generalized_fnr(y_test[i], y_pred[i]),\n", + " 100, c='r', marker='d', label='Post-processed classifier (Males)')\n", + "\n", + "plt.plot([0, 1], [generalized_fnr(y_test, y_pred)]*2, '--', c='0.5')\n", "\n", - "clf = GridSearchCV(rew, params, scoring=scoring, cv=5)\n", - "clf.fit(X_train, y_train, **{'sample_weight': sw_train})\n", - "clf.score(X_test, y_test)" + "plt.axis('square')\n", + "plt.xlim([0, 0.4])\n", + "plt.ylim([0.4, 0.8])\n", + "plt.xlabel('generalized fpr');\n", + "plt.ylabel('generalized fnr');\n", + "plt.legend(bbox_to_anchor=(1.04,1), loc='upper left');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see the generalized false negative rate is approximately equalized and the classifiers remain close to the calibration lines." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [] } From e0856e370ba3f8edeff881b199970e77f5c57ee9 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 31 Oct 2019 09:49:07 -0400 Subject: [PATCH 37/61] updated readme --- aif360/sklearn/README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index da318ced..98497eb9 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -20,16 +20,16 @@ objects with sample properties (protected attributes) as the index - [ ] MEPS - [ ] Implement metrics as individual functions instead of instance methods - [x] Make certain metrics compatible as sklearn scorers - - [x] Use "groups" and "priv_group" keywords to specify protected attributes to + - [x] Use "prot_attr" and "priv_group" keywords to specify protected attributes to functions - - [ ] Generalized confusion matrix + - [x] Generalized confusion matrix - [ ] Sample distortion metrics - [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s + - [x] Adversarial debiasing - [ ] **[External]** `get_feature_names()` from data preprocessing steps that would remove DataFrame formatting - [ ] SLEP007/8 - [ ] Prejudice remover - - [ ] Adversarial debiasing - [ ] Meta-fair classifier - [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s - [ ] **[External]** Add functionality to modify X and y @@ -41,11 +41,9 @@ objects with sample properties (protected attributes) as the index - [X] Meta-estimator workaround - [ ] **[External]** SLEP006 - Sample properties - [ ] Make postprocessing algorithms compatible - - [ ] **[External]** Allow for `fit(y_true, y_pred)` - - [ ] New SLEP? 
- - [ ] Calibrated equalized odds postprocessing + - [x] Calibrated equalized odds postprocessing + - [x] Meta-estimator workaround again - [ ] Equalized odds postprocessing - [ ] Reject option classification - [ ] Miscellaneous: - - [ ] LIME encoder - [ ] Explainers From 8f8cd760d0692dc5afe5c9328420212e8ff780c1 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 31 Oct 2019 17:37:50 -0400 Subject: [PATCH 38/61] fixed tests and added additional tests --- aif360/sklearn/datasets/openml_datasets.py | 2 +- .../tests/test_calibrated_equalized_odds.py | 10 ++++++++++ aif360/sklearn/tests/test_metrics.py | 19 +++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 45d8cd7f..c1c30b02 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -209,7 +209,7 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit'] # remap target - df.deposit = df.deposit.map({'1': False, '2': True}) + df.deposit = df.deposit.map({'1': False, '2': True}).astype('bool') # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) diff --git a/aif360/sklearn/tests/test_calibrated_equalized_odds.py b/aif360/sklearn/tests/test_calibrated_equalized_odds.py index 247ba4c8..f1a6f3b3 100644 --- a/aif360/sklearn/tests/test_calibrated_equalized_odds.py +++ b/aif360/sklearn/tests/test_calibrated_equalized_odds.py @@ -1,5 +1,6 @@ import numpy as np from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split from aif360.datasets import AdultDataset from aif360.sklearn.datasets import fetch_adult @@ -26,6 +27,15 @@ def test_calib_eq_odds_sex(): 
assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1]) assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0]) +def test_split(): + adult_est, adult_post = adult.split([0.75], shuffle=False) + X_est, X_post, y_est, y_post = train_test_split(X, y, shuffle=False) + + assert np.all(adult_est.features == X_est) + assert np.all(adult_est.labels.ravel() == y_est) + assert np.all(adult_post.features == X_post) + assert np.all(adult_post.labels.ravel() == y_post) + def test_postprocessingmeta(): logreg = LogisticRegression(solver='lbfgs', max_iter=500) diff --git a/aif360/sklearn/tests/test_metrics.py b/aif360/sklearn/tests/test_metrics.py index c0a1c6e9..326c7c8b 100644 --- a/aif360/sklearn/tests/test_metrics.py +++ b/aif360/sklearn/tests/test_metrics.py @@ -6,6 +6,7 @@ from aif360.metrics import ClassificationMetric from aif360.sklearn.metrics import ( consistency_score, specificity_score, selection_rate, + base_rate, generalized_fpr, generalized_fnr, disparate_impact_ratio, statistical_parity_difference, equal_opportunity_difference, average_odds_difference, average_odds_error, generalized_entropy_error, @@ -13,14 +14,16 @@ X, y, sample_weight = fetch_adult(numeric_only=True) -y_pred = LogisticRegression(solver='liblinear').fit(X, y, - sample_weight=sample_weight).predict(X) +lr = LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight) +y_pred = lr.predict(X) +y_proba = lr.predict_proba(X)[:, 1] adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], features_to_drop=[]) adult_pred = adult.copy() adult_pred.labels = y_pred +adult_pred.scores = y_proba cm = ClassificationMetric(adult, adult_pred, unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) @@ -36,10 +39,22 @@ def test_specificity(): spec = specificity_score(y, y_pred, sample_weight=sample_weight) assert spec == 
cm.specificity() +def test_base_rate(): + base = base_rate(y, y_pred, sample_weight=sample_weight) + assert base == cm.base_rate() + def test_selection_rate(): select = selection_rate(y, y_pred, sample_weight=sample_weight) assert select == cm.selection_rate() +def test_generalized_fpr(): + gfpr = generalized_fpr(y, y_proba, sample_weight=sample_weight) + assert np.isclose(gfpr, cm.generalized_false_positive_rate()) + +def test_generalized_fnr(): + gfnr = generalized_fnr(y, y_proba, sample_weight=sample_weight) + assert np.isclose(gfnr, cm.generalized_false_negative_rate()) + def test_disparate_impact(): di = disparate_impact_ratio(y, y_pred, prot_attr='sex', sample_weight=sample_weight) From e01f23fe60087d2135ad087a5d04783dd01ea81c Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 11 Nov 2019 16:49:32 -0500 Subject: [PATCH 39/61] added COMPAS and other dataset fixes* fixed german dataset to match paper* added ColumnAlreadyDroppedWarnings in standardize_dataset* added compas test and fixed old tests to match new drop warnings --- aif360/sklearn/datasets/__init__.py | 1 + aif360/sklearn/datasets/compas_dataset.py | 72 ++++++++++++++++++++++ aif360/sklearn/datasets/openml_datasets.py | 2 +- aif360/sklearn/datasets/utils.py | 45 +++++++++++--- aif360/sklearn/tests/test_datasets.py | 19 ++++-- 5 files changed, 125 insertions(+), 14 deletions(-) create mode 100644 aif360/sklearn/datasets/compas_dataset.py diff --git a/aif360/sklearn/datasets/__init__.py b/aif360/sklearn/datasets/__init__.py index 1a5a27f0..43168666 100644 --- a/aif360/sklearn/datasets/__init__.py +++ b/aif360/sklearn/datasets/__init__.py @@ -1,2 +1,3 @@ from aif360.sklearn.datasets.utils import * from aif360.sklearn.datasets.openml_datasets import * +from aif360.sklearn.datasets.compas_dataset import fetch_compas diff --git a/aif360/sklearn/datasets/compas_dataset.py b/aif360/sklearn/datasets/compas_dataset.py new file mode 100644 index 00000000..31f2a14e --- /dev/null +++ 
b/aif360/sklearn/datasets/compas_dataset.py @@ -0,0 +1,72 @@ +import os + +import pandas as pd + +from aif360.sklearn.datasets.utils import standarize_dataset + + +# cache location +DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)), + '..', 'data', 'raw') +COMPAS_URL = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv' + +def fetch_compas(data_home=None, binary_race=False, + usecols=['sex', 'age', 'age_cat', 'race', 'juv_fel_count', + 'juv_misd_count', 'juv_other_count', 'priors_count', + 'c_charge_degree', 'c_charge_desc'], + dropcols=[], numeric_only=False, dropna=True): + """Load the COMPAS Recidivism Risk Scores dataset. + + Optionally binarizes 'race' to 'Caucasian' (privileged) or 'African-American' + (unprivileged). The other protected attribute is 'sex' ('Male' is + _unprivileged_ and 'Female' is _privileged_). The outcome variable is + 'no recid.' (favorable) if the person was not accused of a crime within two + years or 'did recid.' (unfavorable) if they were. + + Args: + data_home (string, optional): Specify another download and cache folder + for the datasets. By default all AIF360 datasets are stored in + 'aif360/sklearn/data/raw' subfolders. + binary_race (bool, optional): Filter only White and Black defendants. + usecols (single label or list-like, optional): Feature column(s) to + keep. All others are dropped. + dropcols (single label or list-like, optional): Feature column(s) to + drop. + numeric_only (bool): Drop all non-numeric feature columns. + dropna (bool): Drop rows with NAs. + + Returns: + namedtuple: Tuple containing X and y for the COMPAS dataset accessible + by index or name. 
+ """ + cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, + os.path.basename(COMPAS_URL)) + if os.path.isfile(cache_path): + df = pd.read_csv(cache_path, index_col='id') + else: + df = pd.read_csv(COMPAS_URL, index_col='id') + df.to_csv(cache_path) + + # Perform the same preprocessing as the original analysis: + # https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb + df = df[(df.days_b_screening_arrest <= 30) + & (df.days_b_screening_arrest >= -30) + & (df.is_recid != -1) + & (df.c_charge_degree != 'O') + & (df.score_text != 'N/A')] + + for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']: + df[col] = df[col].astype('category') + + df.two_year_recid = df.two_year_recid.replace({0: 'no recid.', 1: 'did recid.'}).astype('category').cat.as_ordered() # 'did recid' < 'no recid' + + if binary_race: + df.race = df.race.cat.set_categories(['African-American', 'Caucasian'], + ordered=True) # 'African-American' < 'Caucasian' + + df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male' + + return standarize_dataset(df, prot_attr=['sex', 'race'], + target='two_year_recid', usecols=usecols, + dropcols=dropcols, numeric_only=numeric_only, + dropna=dropna) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index c1c30b02..1cb4b9a1 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -151,7 +151,7 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good' # binarize protected attribute (but not corresponding feature) - age = (pd.cut(df.age, [0, 25, 100], right=False, labels=['young', 'aged']) + age = (pd.cut(df.age, [0, 25, 100], labels=numeric_only and ['young', 'aged']) if binary_age else 'age') # Note: marital_status directly implies sex. i.e. 
'div/dep/mar' => 'female' diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index e714026b..703ad13f 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -1,7 +1,25 @@ from collections import namedtuple +import warnings +import numpy as np from pandas.core.dtypes.common import is_list_like + +class ColumnAlreadyDroppedWarning(UserWarning): + """Warning used if a column is attempted to be dropped twice.""" + +def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only', + warn=True): + if not is_list_like(labels): + labels = [labels] + labels = [c for c in labels if isinstance(c, str)] + already_dropped = dropped_cols.intersection(labels) + if warn and already_dropped.any(): + warnings.warn("Some column labels from `{}` were already dropped by " + "`{}`:\n{}".format(name, dropped_by, already_dropped.tolist()), + ColumnAlreadyDroppedWarning, stacklevel=2) + return [c for c in labels if c not in already_dropped] + def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], dropcols=[], numeric_only=False, dropna=True): """Separate data, targets, and possibly sample weights and populate @@ -36,7 +54,7 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], Note: The order of execution for the dropping parameters is: numeric_only -> - dropcols -> usecols -> dropna. + usecols -> dropcols -> dropna. 
Examples: >>> import pandas as pd @@ -53,24 +71,35 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], >>> X, y = standarize_dataset(df, prot_attr=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ - # TODO: warn user if label in prot_attr, target, or dropcols is already dropped - # TODO: error message if label in usecols is already dropped + orig_cols = df.columns if numeric_only: for col in df.select_dtypes('category'): if df[col].cat.ordered: df[col] = df[col].factorize(sort=True)[0] + df[col] = df[col].replace(-1, np.nan) df = df.select_dtypes(['number', 'bool']) + nonnumeric = orig_cols.difference(df.columns) + prot_attr = check_already_dropped(prot_attr, nonnumeric, 'prot_attr') + if len(prot_attr) == 0: + raise ValueError("At least one protected attribute must be present.") df = df.set_index(prot_attr, drop=False, append=True) - y = df.pop(target) + + target = check_already_dropped(target, nonnumeric, 'target') + if len(target) == 0: + raise ValueError("At least one target must be present.") + y = df.pop(target if len(target) > 1 else target[0]) # maybe return Series # Column-wise drops - df = df.drop(columns=dropcols) + orig_cols = df.columns if usecols: - if not is_list_like(usecols): - # make sure we don't return a Series instead of a DataFrame - usecols = [usecols] + usecols = check_already_dropped(usecols, nonnumeric, 'usecols') df = df[usecols] + unused = orig_cols.difference(df.columns) + + dropcols = check_already_dropped(dropcols, nonnumeric, 'dropcols', warn=False) + dropcols = check_already_dropped(dropcols, unused, 'dropcols', 'usecols', False) + df = df.drop(columns=dropcols) # Index-wise drops if dropna: diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 05974f1e..5e2f00ad 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -6,6 +6,7 @@ from aif360.sklearn.datasets import fetch_adult, fetch_bank, 
fetch_german from aif360.sklearn.datasets import standarize_dataset +from aif360.sklearn.datasets import fetch_compas, ColumnAlreadyDroppedWarning df = pd.DataFrame([[1, 2, 3, 'a'], [5, 6, 7, 'b'], [np.NaN, 10, 11, 'c']], @@ -39,8 +40,8 @@ def test_usecols_dropcols_basic(): assert basic(dropcols=['X1', 'Z']).X.columns.tolist() == ['X2'] assert basic(usecols='X1', dropcols=['X2']).X.columns.tolist() == ['X1'] - with pytest.raises(KeyError): - basic(usecols=['X1', 'X2'], dropcols='X2') + assert isinstance(basic(usecols='X2', dropcols=['X1', 'X2'])[0], + pd.DataFrame) def test_dropna_basic(): basic_dropna = partial(standarize_dataset, df=df, prot_attr='Z', @@ -50,9 +51,8 @@ def test_dropna_basic(): def test_numeric_only_basic(): assert basic(prot_attr='X2', numeric_only=True).X.shape == (3, 2) - with pytest.raises(KeyError): - assert (basic(prot_attr='X2', dropcols='Z', numeric_only=True).X.shape - == (3, 2)) + assert (basic(prot_attr='X2', dropcols='Z', numeric_only=True).X.shape + == (3, 2)) def test_fetch_adult(): adult = fetch_adult() @@ -74,6 +74,15 @@ def test_fetch_bank(): assert fetch_bank(dropcols=[]).X.shape == (45211, 16) assert fetch_bank(numeric_only=True).X.shape == (45211, 6) +@pytest.mark.filterwarnings('error', category=ColumnAlreadyDroppedWarning) +def test_fetch_compas(): + compas = fetch_compas() + assert len(compas) == 2 + assert compas.X.shape == (6167, 10) + assert fetch_compas(binary_race=True).X.shape == (5273, 10) + with pytest.raises(ColumnAlreadyDroppedWarning): + assert fetch_compas(numeric_only=True).X.shape == (6172, 6) + def test_onehot_transformer(): X, y = fetch_german() assert len(pd.get_dummies(X).columns) == 63 From e92f84663c8f3754821568fd143f94a64263d39e Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 12 Nov 2019 10:55:03 -0500 Subject: [PATCH 40/61] fix more edge cases in metrics --- aif360/sklearn/metrics/metrics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 4adadda0..50632c81 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -101,6 +101,7 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, warnings.warn("The ratio is ill-defined and being set to 0.0 because " "the {} for privileged samples is 0.".format(func.__name__), UndefinedMetricWarning) + return 0. return numerator / denominator @@ -132,7 +133,13 @@ def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): sample_weight=sample_weight) def base_rate(y_true, y_pred=None, pos_label=1, sample_weight=None): - return np.average(y_true == pos_label, weights=sample_weight) + idx = (y_true == pos_label) + if not np.any(idx): + warnings.warn("base_rate is ill-defined because there are no samples " + "with value {} in y_true.".format(pos_label), + UndefinedMetricWarning) + return 0. + return np.average(idx, weights=sample_weight) def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): return base_rate(y_pred, pos_label=pos_label, sample_weight=sample_weight) From 27aa55c0e98895a3b1adc9115fb28940b719ea63 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Tue, 12 Nov 2019 10:55:32 -0500 Subject: [PATCH 41/61] removed unused import --- aif360/sklearn/postprocessing/calibrated_equalized_odds.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py index 322d331a..143ed423 100644 --- a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -1,6 +1,5 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.preprocessing import LabelEncoder from sklearn.utils import check_random_state from aif360.sklearn.metrics import base_rate, generalized_fnr, generalized_fpr @@ 
-56,14 +55,16 @@ def fit(self, y_true, y_pred, pos_label=1, sample_weight=None): self.groups_)) # ensure self.classes_ = [neg_label, pos_label] - self.classes_ = np.append(np.delete(self.classes_, pos_label), pos_label) + self.classes_ = np.append(np.delete(self.classes_, pos_label), + pos_label) def args(grp_idx, triv=False): i = (groups == self.groups_[grp_idx]) pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else y_pred) return dict(y_true=y_true[i], y_pred=pred[i], pos_label=pos_label, - sample_weight=sample_weight[i] if sample_weight is not None else None) + sample_weight=None if sample_weight is None + else sample_weight[i]) self.base_rates_ = [base_rate(**args(i)) for i in range(2)] From 831775c5532a4ef70664085ba81d071e363dde0c Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Mon, 9 Dec 2019 14:35:45 -0500 Subject: [PATCH 42/61] make cache dir if necessary --- aif360/sklearn/datasets/compas_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aif360/sklearn/datasets/compas_dataset.py b/aif360/sklearn/datasets/compas_dataset.py index 31f2a14e..be649864 100644 --- a/aif360/sklearn/datasets/compas_dataset.py +++ b/aif360/sklearn/datasets/compas_dataset.py @@ -45,6 +45,7 @@ def fetch_compas(data_home=None, binary_race=False, df = pd.read_csv(cache_path, index_col='id') else: df = pd.read_csv(COMPAS_URL, index_col='id') + os.makedirs(os.path.dirname(cache_path), exist_ok=True) df.to_csv(cache_path) # Perform the same preprocessing as the original analysis: From a0e56b09876717b7e64c97f9be099f3b336fb18d Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 13 Dec 2019 13:22:19 -0500 Subject: [PATCH 43/61] docstring, formatting, and typo fixes --- aif360/sklearn/datasets/__init__.py | 10 + aif360/sklearn/datasets/compas_dataset.py | 12 +- aif360/sklearn/datasets/openml_datasets.py | 31 +- aif360/sklearn/datasets/utils.py | 22 +- aif360/sklearn/inprocessing/__init__.py | 3 + aif360/sklearn/metrics/__init__.py | 6 + 
aif360/sklearn/metrics/metrics.py | 388 ++++++++++++++++++--- aif360/sklearn/preprocessing/__init__.py | 3 + aif360/sklearn/preprocessing/reweighing.py | 108 +++++- aif360/sklearn/tests/test_datasets.py | 6 +- docs/Makefile | 6 +- docs/source/conf.py | 94 ++++- docs/source/modules/sklearn.rst | 214 +++++++++++- docs/source/modules/standard_datasets.rst | 1 + docs/source/static/style.css | 12 + docs/source/templates/base.rst | 6 + docs/source/templates/class.rst | 29 ++ 17 files changed, 842 insertions(+), 109 deletions(-) create mode 100644 docs/source/static/style.css create mode 100644 docs/source/templates/base.rst create mode 100644 docs/source/templates/class.rst diff --git a/aif360/sklearn/datasets/__init__.py b/aif360/sklearn/datasets/__init__.py index 43168666..5aac86b8 100644 --- a/aif360/sklearn/datasets/__init__.py +++ b/aif360/sklearn/datasets/__init__.py @@ -1,3 +1,13 @@ +""" +The dataset format for ``aif360.sklearn`` is a :class:`pandas.DataFrame` with +protected attributes in the index. + +Warning: + Currently, while all scikit-learn classes will accept DataFrames as inputs, + most classes will return a :class:`numpy.ndarray`. Therefore, many pre- + processing steps, when placed before an ``aif360.sklearn`` step in a + Pipeline, will cause errors. 
+""" from aif360.sklearn.datasets.utils import * from aif360.sklearn.datasets.openml_datasets import * from aif360.sklearn.datasets.compas_dataset import fetch_compas diff --git a/aif360/sklearn/datasets/compas_dataset.py b/aif360/sklearn/datasets/compas_dataset.py index be649864..76a0d9df 100644 --- a/aif360/sklearn/datasets/compas_dataset.py +++ b/aif360/sklearn/datasets/compas_dataset.py @@ -2,7 +2,7 @@ import pandas as pd -from aif360.sklearn.datasets.utils import standarize_dataset +from aif360.sklearn.datasets.utils import standardize_dataset # cache location @@ -19,7 +19,7 @@ def fetch_compas(data_home=None, binary_race=False, Optionally binarizes 'race' to 'Caucasian' (privileged) or 'African-American' (unprivileged). The other protected attribute is 'sex' ('Male' is - _unprivileged_ and 'Female' is _privileged_). The outcome variable is + *unprivileged* and 'Female' is *privileged*). The outcome variable is 'no recid.' (favorable) if the person was not accused of a crime within two years or 'did recid.' (unfavorable) if they were. 
@@ -67,7 +67,7 @@ def fetch_compas(data_home=None, binary_race=False, df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male' - return standarize_dataset(df, prot_attr=['sex', 'race'], - target='two_year_recid', usecols=usecols, - dropcols=dropcols, numeric_only=numeric_only, - dropna=dropna) + return standardize_dataset(df, prot_attr=['sex', 'race'], + target='two_year_recid', usecols=usecols, + dropcols=dropcols, numeric_only=numeric_only, + dropna=dropna) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 1cb4b9a1..6decfcb7 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -3,7 +3,7 @@ import pandas as pd from sklearn.datasets import fetch_openml -from aif360.sklearn.datasets.utils import standarize_dataset +from aif360.sklearn.datasets.utils import standardize_dataset # cache location @@ -38,8 +38,8 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). The other protected attribute is 'sex' ('Male' is privileged and 'Female' is - unprivileged). The outcome variable is '>50K' (favorable) or '<=50K' - (unfavorable). + unprivileged). The outcome variable is 'annual-income': '>50K' (favorable) + or '<=50K' (unfavorable). 
Args: subset ({'train', 'test', or 'all'}, optional): Select the dataset to @@ -88,7 +88,7 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], ordered=True).fillna('Non-white') df.sex = df.sex.cat.as_ordered() # 'Female' < 'Male' - return standarize_dataset(df, prot_attr=['race', 'sex'], + return standardize_dataset(df, prot_attr=['race', 'sex'], target='annual-income', sample_weight='fnlwgt', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) @@ -101,19 +101,20 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], unprivileged) and 'age' (binarized by default as recommended by [#kamiran09]_: ``age >= 25`` is considered privileged and ``age < 25`` is considered unprivileged; see the ``binary_age`` flag to keep this - continuous). The outcome variable is 'good' (favorable) or 'bad' - (unfavorable). + continuous). The outcome variable is 'credit-risk': 'good' (favorable) or + 'bad' (unfavorable). References: - .. [#kamiran09] F. Kamiran and T. Calders, "Classifying without + .. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without discriminating," 2nd International Conference on Computer, Control and Communication, 2009. + `_ Args: data_home (string, optional): Specify another download and cache folder for the datasets. By default all AIF360 datasets are stored in 'aif360/sklearn/data/raw' subfolders. - binary_age (bool, optional): If `True`, split protected attribute, + binary_age (bool, optional): If ``True``, split protected attribute, ``age``, into 'aged' (privileged) and 'youth' (unprivileged). The ``age`` feature remains continuous. usecols (single label or list-like, optional): Column name(s) to keep. 
@@ -161,16 +162,16 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df = df.join(personal_status.astype('category')) df.sex = df.sex.cat.as_ordered() # 'female' < 'male' - return standarize_dataset(df, prot_attr=['sex', age], target='credit-risk', - usecols=usecols, dropcols=dropcols, - numeric_only=numeric_only, dropna=dropna) + return standardize_dataset(df, prot_attr=['sex', age], target='credit-risk', + usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', numeric_only=False, dropna=False): """Load the Bank Marketing Dataset. The protected attribute is 'age' (left as continuous). The outcome variable - is 'yes' or 'no'. TODO: which is favorable? + is 'deposit': ``True`` or ``False``. Args: data_home (string, optional): Specify another download and cache folder @@ -213,6 +214,6 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) - return standarize_dataset(df, prot_attr='age', target='deposit', - usecols=usecols, dropcols=dropcols, - numeric_only=numeric_only, dropna=dropna) + return standardize_dataset(df, prot_attr='age', target='deposit', + usecols=usecols, dropcols=dropcols, + numeric_only=numeric_only, dropna=dropna) diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index 703ad13f..db88ea46 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -10,6 +10,22 @@ class ColumnAlreadyDroppedWarning(UserWarning): def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only', warn=True): + """Check if columns have already been dropped and return only those that + haven't. + + Args: + labels (single label or list-like): Column labels to check. 
+ dropped_cols (set or pandas.Index): Columns that were already dropped. + name (str): Original arg that triggered the check (e.g. ``dropcols``). + dropped_by (str, optional): Original arg that caused ``dropped_cols`` + (e.g. ``numeric_only``). + warn (bool, optional): If ``True``, produces a + :class:`ColumnAlreadyDroppedWarning` if there are columns in the + intersection of ``dropped_cols`` and ``labels``. + + Returns: + list: Columns in ``labels`` which are not in ``dropped_cols``. + """ if not is_list_like(labels): labels = [labels] labels = [c for c in labels if isinstance(c, str)] @@ -20,7 +36,7 @@ def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only', ColumnAlreadyDroppedWarning, stacklevel=2) return [c for c in labels if c not in already_dropped] -def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], +def standardize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], dropcols=[], numeric_only=False, dropna=True): """Separate data, targets, and possibly sample weights and populate protected attributes as sample properties. 
@@ -61,14 +77,14 @@ def standarize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], >>> from sklearn.linear_model import LinearRegression >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['X', 'y', 'Z']) - >>> train = standarize_dataset(df, prot_attr='Z', target='y') + >>> train = standardize_dataset(df, prot_attr='Z', target='y') >>> reg = LinearRegression().fit(*train) >>> import numpy as np >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5))) - >>> X, y = standarize_dataset(df, prot_attr=0, target=5) + >>> X, y = standardize_dataset(df, prot_attr=0, target=5) >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y) """ orig_cols = df.columns diff --git a/aif360/sklearn/inprocessing/__init__.py b/aif360/sklearn/inprocessing/__init__.py index 863d3676..18df48c4 100644 --- a/aif360/sklearn/inprocessing/__init__.py +++ b/aif360/sklearn/inprocessing/__init__.py @@ -1,3 +1,6 @@ +""" +In-processing algorithms train a fair classifier (data in, predictions out). +""" from aif360.sklearn.inprocessing.adversarial_debiasing import AdversarialDebiasing __all__ = [ diff --git a/aif360/sklearn/metrics/__init__.py b/aif360/sklearn/metrics/__init__.py index ceaef288..a0778b80 100644 --- a/aif360/sklearn/metrics/__init__.py +++ b/aif360/sklearn/metrics/__init__.py @@ -1 +1,7 @@ +""" +``aif360.sklearn`` implements a number of fairness metrics for group fairness +and individual fairness. For guidance on which metric to use for a given +application, see our +`Guidance `_ page. 
+""" from aif360.sklearn.metrics.metrics import * diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 50632c81..f100a012 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -10,11 +10,23 @@ __all__ = [ - 'base_rate', 'consistency_score', 'specificity_score', 'selection_rate', - 'disparate_impact_ratio', 'statistical_parity_difference', + # meta-metrics + 'difference', 'ratio', + # scorer factories + 'make_difference_scorer', 'make_ratio_scorer', + # helpers + 'specificity_score', 'base_rate', 'selection_rate', 'generalized_fpr', + 'generalized_fnr', + # group fairness + 'statistical_parity_difference', 'disparate_impact_ratio', 'equal_opportunity_difference', 'average_odds_difference', - 'average_odds_error', 'generalized_entropy_error', 'generalized_fnr', - 'between_group_generalized_entropy_error', 'generalized_fpr' + 'average_odds_error', + # individual fairness + 'generalized_entropy_index', 'generalized_entropy_error', + 'between_group_generalized_entropy_error', 'theil_index', + 'coefficient_of_variation', 'consistency_score', + # aliases + 'sensitivity_score', 'mean_difference', ] # ============================= META-METRICS =================================== @@ -35,7 +47,7 @@ def difference(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, *args: Additional positional args to be passed through to ``func``. prot_attr (array-like, keyword-only): Protected attribute(s). If ``None``, all protected attributes in ``y`` are used. - priv_group (scalar, optional): Label value for the privileged group. + priv_group (scalar, optional): The label of the privileged group. sample_weight (array-like, optional): Sample weights passed through to ``func``. **kwargs: Additional keyword args to be passed through to ``func``. @@ -66,8 +78,7 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, arbitrary metric. Note: The optimal value of a ratio is 1. 
To make it a scorer, one must - take the minimum of the ratio and its inverse, subtract it from 1, and set - ``greater_is_better`` to False. + take the minimum of the ratio and its inverse. Unprivileged group is taken to be the inverse of the privileged group. @@ -76,9 +87,9 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, :mod:`aif360.sklearn.metrics.metrics`. y (array-like): Outcome vector with protected attributes as index. *args: Additional positional args to be passed through to ``func``. - groups (array-like, keyword-only): Group labels (protected attributes) - for the samples. - priv_group (scalar, optional): Label value for the privileged group. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y`` are used. + priv_group (scalar, optional): The label of the privileged group. sample_weight (array-like, optional): Sample weights passed through to ``func``. **kwargs: Additional keyword args to be passed through to ``func``. @@ -99,7 +110,7 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, if denominator == 0: warnings.warn("The ratio is ill-defined and being set to 0.0 because " - "the {} for privileged samples is 0.".format(func.__name__), + "'{}' for privileged samples is 0.".format(func.__name__), UndefinedMetricWarning) return 0. @@ -107,15 +118,40 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, # =========================== SCORER FACTORIES ================================= -def make_difference_scorer(func): - return make_scorer(lambda y, y_pred, **kw: abs(func(y, y_pred, **kw)), +def make_difference_scorer(diff_func): + """Make a scorer from a 'difference' metric (e.g. + :func:`statistical_parity_difference`). + + Since the optimal value of a difference metric is 0, this function takes the + absolute value and sets ``greater_is_better`` to ``False``. 
+ + See also: + :func:`~sklearn.metrics.make_scorer` + + Args: + diff_func (callable): A difference metric with signature + ``diff_func(y, y_pred, **kwargs)``. + """ + return make_scorer(lambda y, y_pred, **kw: abs(diff_func(y, y_pred, **kw)), greater_is_better=False) -def make_ratio_scorer(func): +def make_ratio_scorer(ratio_func): + """Make a scorer from a 'ratio' metric (e.g. :func:`disparate_impact_ratio`) + + Since the optimal value of a ratio metric is 1, this function takes the + minimum of the ratio and its inverse. + + See also: + :func:`~sklearn.metrics.make_scorer` + + Args: + ratio_func (callable): A ratio metric with signature + ``ratio_func(y, y_pred, **kwargs)``. + """ def score_fn(y, y_pred, **kwargs): - ratio = func(y, y_pred, **kwargs) - return 1 - min(ratio, 1/ratio) - return make_scorer(score_fn, greater_is_better=False) + ratio = ratio_func(y, y_pred, **kwargs) + return min(ratio, 1/ratio) + return make_scorer(score_fn) # ================================ HELPERS ===================================== @@ -126,66 +162,208 @@ def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): Args: y_true (array-like): Ground truth (correct) target values. y_pred (array-like): Estimated targets as returned by a classifier. - neg_label (scalar, optional): The class to report. Note: the data should - be binary. + neg_label (scalar, optional): The label of the negative class. Note: + the data should be binary. + sample_weight (array-like, optional): Sample weights. """ return recall_score(y_true, y_pred, pos_label=neg_label, sample_weight=sample_weight) def base_rate(y_true, y_pred=None, pos_label=1, sample_weight=None): + r"""Compute the base rate, :math:`Pr(Y = \text{pos_label}) = \frac{P}{P+N}`. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like, optional): Estimated targets. Ignored. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. 
+ + Returns: + float: Base rate. + """ idx = (y_true == pos_label) - if not np.any(idx): - warnings.warn("base_rate is ill-defined because there are no samples " - "with value {} in y_true.".format(pos_label), - UndefinedMetricWarning) - return 0. return np.average(idx, weights=sample_weight) def selection_rate(y_true, y_pred, pos_label=1, sample_weight=None): + r"""Compute the selection rate, :math:`Pr(\hat{Y} = \text{pos_label}) = + \frac{TP + FP}{P + N}`. + + Args: + y_true (array-like): Ground truth (correct) target values. Ignored. + y_pred (array-like): Estimated targets as returned by a classifier. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Selection rate. + """ return base_rate(y_pred, pos_label=pos_label, sample_weight=sample_weight) -def generalized_fpr(y_true, y_pred, pos_label=1, sample_weight=None): +def generalized_fpr(y_true, probas_pred, pos_label=1, sample_weight=None): + r"""Return the ratio of generalized false positives to negative examples in + the dataset, :math:`GFPR = \tfrac{GFP}{N}`. + + The generalized confusion matrix is calculated by summing the probabilities + of the positive class instead of the hard predictions. + + Args: + y_true (array-like): Ground-truth (correct) target values. + probas_pred (array-like): Probability estimates of the positive class. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Generalized false positive rate. If there are no negative samples + in ``y_true``, this will raise an + :class:`~sklearn.exceptions.UndefinedMetricWarning` and return 0. 
+ """ idx = (y_true != pos_label) if not np.any(idx): - warnings.warn("generalized_fpr is ill-defined because there are no true" - " negatives in y_true.", UndefinedMetricWarning) + warnings.warn("generalized_fpr is ill-defined because there are no " + "negative samples in y_true.", UndefinedMetricWarning) return 0. if sample_weight is None: - return y_pred[idx].mean() - return np.average(y_pred[idx], weights=sample_weight[idx]) + return probas_pred[idx].mean() + return np.average(probas_pred[idx], weights=sample_weight[idx]) + +def generalized_fnr(y_true, probas_pred, pos_label=1, sample_weight=None): + r"""Return the ratio of generalized false negatives to positive examples in + the dataset, :math:`GFNR = \tfrac{GFN}{P}`. -def generalized_fnr(y_true, y_pred, pos_label=1, sample_weight=None): + The generalized confusion matrix is calculated by summing the probabilities + of the positive class instead of the hard predictions. + + Args: + y_true (array-like): Ground-truth (correct) target values. + probas_pred (array-like): Probability estimates of the positive class. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Generalized false negative rate. If there are no positive samples + in ``y_true``, this will raise an + :class:`~sklearn.exceptions.UndefinedMetricWarning` and return 0. + """ idx = (y_true == pos_label) if not np.any(idx): - warnings.warn("generalized_fnr is ill-defined because there are no true" - " positives in y_true.", UndefinedMetricWarning) + warnings.warn("generalized_fnr is ill-defined because there are no " + "positive samples in y_true.", UndefinedMetricWarning) return 0. 
if sample_weight is None: - return 1 - y_pred[idx].mean() - return 1 - np.average(y_pred[idx], weights=sample_weight[idx]) + return 1 - probas_pred[idx].mean() + return 1 - np.average(probas_pred[idx], weights=sample_weight[idx]) # ============================ GROUP FAIRNESS ================================== def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): + r"""Difference in selection rates. + + .. math:: + Pr(\hat{Y} = \text{pos_label} | D = \text{unprivileged}) + - Pr(\hat{Y} = \text{pos_label} | D = \text{privileged}) + + Note: + If only ``y_true`` is provided, this will return the difference in base + rates (statistical parity difference of the original dataset). + + Args: + y_true (array-like): Ground truth (correct) target values. If ``y_pred`` + is provided, this is ignored. + y_pred (array-like, optional): Estimated targets as returned by a + classifier. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Statistical parity difference. + """ rate = base_rate if len(y) == 1 or y[1] is None else selection_rate return difference(rate, *y, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): + r"""Ratio of selection rates. + + .. math:: + \frac{Pr(\hat{Y} = \text{pos_label} | D = \text{unprivileged})} + {Pr(\hat{Y} = \text{pos_label} | D = \text{privileged})} + + Note: + If only ``y_true`` is provided, this will return the ratio of base rates + (disparate impact of the original dataset). + + Args: + y_true (array-like): Ground truth (correct) target values. 
If ``y_pred`` + is provided, this is ignored. + y_pred (array-like, optional): Estimated targets as returned by a + classifier. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Disparate impact. + """ rate = base_rate if len(y) == 1 or y[1] is None else selection_rate return ratio(rate, *y, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): + r"""A relaxed version of equality of opportunity. + + Returns the difference in recall scores (TPR) between the unprivileged and + privileged groups. A value of 0 indicates equality of opportunity. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Equal opportunity difference. + """ return difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, neg_label=0, sample_weight=None): + r"""A relaxed version of equality of odds. + + Returns the average of the difference in FPR and TPR for the unprivileged + and privileged groups: + + .. 
math:: + + \dfrac{(FPR_{D = \text{unprivileged}} - FPR_{D = \text{privileged}}) + + (TPR_{D = \text{unprivileged}} - TPR_{D = \text{privileged}})}{2} + + A value of 0 indicates equality of odds. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Average odds difference. + """ fpr_diff = -difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, neg_label=neg_label, sample_weight=sample_weight) @@ -196,6 +374,30 @@ def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, pos_label=1, neg_label=0, sample_weight=None): + r"""A relaxed version of equality of odds. + + Returns the average of the absolute difference in FPR and TPR for the + unprivileged and privileged groups: + + .. math:: + + \dfrac{|FPR_{D = \text{unprivileged}} - FPR_{D = \text{privileged}}| + + |TPR_{D = \text{unprivileged}} - TPR_{D = \text{privileged}}|}{2} + + A value of 0 indicates equality of odds. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. + prot_attr (array-like, keyword-only): Protected attribute(s). If + ``None``, all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Average odds error. 
+ """ fpr_diff = -difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, neg_label=neg_label, sample_weight=sample_weight) @@ -207,6 +409,23 @@ def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, # ========================== INDIVIDUAL FAIRNESS =============================== def generalized_entropy_index(b, alpha=2): + r"""Generalized entropy index measures inequality over a population. + + .. math:: + + \mathcal{E}(\alpha) = \begin{cases} + \frac{1}{n \alpha (\alpha-1)}\sum_{i=1}^n\left[\left(\frac{b_i}{\mu}\right)^\alpha - 1\right],& \alpha \ne 0, 1,\\ + \frac{1}{n}\sum_{i=1}^n\frac{b_{i}}{\mu}\ln\frac{b_{i}}{\mu},& \alpha=1,\\ + -\frac{1}{n}\sum_{i=1}^n\ln\frac{b_{i}}{\mu},& \alpha=0. + \end{cases} + + Args: + b (array-like): Parameter over which to calculate the entropy index. + alpha (scalar): Parameter that regulates the weight given to distances + between values at different parts of the distribution. A value of 0 + is equivalent to the mean log deviation, 1 is the Theil index, and 2 + is half the squared coefficient of variation. + """ if alpha == 0: return -(np.log(b / b.mean()) / b.mean()).mean() elif alpha == 1: @@ -217,12 +436,65 @@ def generalized_entropy_index(b, alpha=2): def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): # sample_weight=None): + r"""Compute the generalized entropy. + + Generalized entropy index is proposed as a unified individual and + group fairness measure in [#speicher18]_. + + Uses :math:`b_i = \hat{y}_i - y_i + 1`. See + :func:`generalized_entropy_index` for details. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. + alpha (scalar, optional): Parameter that regulates the weight given to + distances between values at different parts of the distribution. 
A + value of 0 is equivalent to the mean log deviation, 1 is the Theil + index, and 2 is half the squared coefficient of variation. + pos_label (scalar, optional): The label of the positive class. + + References: + .. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca, + K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified + Approach to Quantifying Algorithmic Unfairness: Measuring Individual + and Group Unfairness via Inequality Indices," ACM SIGKDD + International Conference on Knowledge Discovery and Data Mining, + 2018. `_ + """ b = 1 + (y_pred == pos_label) - (y_true == pos_label) return generalized_entropy_index(b, alpha=alpha) def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, - priv_group=None, alpha=2, - pos_label=1): + priv_group=None, alpha=2, pos_label=1): + r"""Compute the between-group generalized entropy. + + Between-group generalized entropy index is proposed as a group + fairness measure in [#speicher18]_ and is one of two terms that the + generalized entropy index decomposes to. + + Args: + y_true (array-like): Ground truth (correct) target values. + y_pred (array-like): Estimated targets as returned by a classifier. + prot_attr (array-like, optional): Protected attribute(s). If ``None``, + all protected attributes in ``y_true`` are used. + priv_group (scalar, optional): The label of the privileged group. If + provided, the index will be computed between only the privileged and + unprivileged groups. Otherwise, the index will be computed between + all groups defined by the ``prot_attr``. + alpha (scalar, optional): Parameter that regulates the weight given to + distances between values at different parts of the distribution. A + value of 0 is equivalent to the mean log deviation, 1 is the Theil + index, and 2 is half the squared coefficient of variation. + pos_label (scalar, optional): The label of the positive class. + + References: + .. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca, + K. P. 
Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified + Approach to Quantifying Algorithmic Unfairness: Measuring Individual + and Group Unfairness via Inequality Indices," ACM SIGKDD + International Conference on Knowledge Discovery and Data Mining, + 2018. `_ + """ groups, _ = check_groups(y_true, prot_attr) b = np.empty_like(y_true, dtype='float') if priv_group is not None: @@ -233,16 +505,46 @@ def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, return generalized_entropy_index(b, alpha=alpha) def theil_index(b): + r"""The Theil index is the :func:`generalized_entropy_index` with + :math:`\alpha = 1`. + + Args: + b (array-like): Parameter over which to calculate the entropy index. + """ return generalized_entropy_index(b, alpha=1) def coefficient_of_variation(b): + r"""The coefficient of variation is two times the square root of the + :func:`generalized_entropy_index` with :math:`\alpha = 2`. + + Args: + b (array-like): Parameter over which to calculate the entropy index. + """ return 2 * np.sqrt(generalized_entropy_index(b, alpha=2)) -# TODO: not technically a scorer but you should be allowed to score transformers -# Is consistency_difference posible? -# use sample_weight? +# TODO: use sample_weight? def consistency_score(X, y, n_neighbors=5): + r"""Compute the consistency score. + + Individual fairness metric from [#zemel13]_ that measures how similar the + labels are for similar instances. + + .. math:: + 1 - \frac{1}{n\cdot\text{n_neighbors}}\sum_{i=1}^n |\hat{y}_i - + \sum_{j\in\mathcal{N}_{\text{n_neighbors}}(x_i)} \hat{y}_j| + + Args: + X (array-like): Sample features. + y (array-like): Sample targets. + n_neighbors (int, optional): Number of neighbors for the knn + computation. + + References: + .. [#zemel13] `R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork, + "Learning Fair Representations," International Conference on Machine + Learning, 2013. 
`_ + """ # cast as ndarrays X, y = check_X_y(X, y) # learn a KNN on the features @@ -267,7 +569,9 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): # return 1 - specificity_score(y_true, y_pred, neg_label=neg_label, # sample_weight=sample_weight) -def mean_difference(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): +def mean_difference(*y, prot_attr=None, priv_group=1, pos_label=1, + sample_weight=None): """Alias of :func:`statistical_parity_difference`.""" - return statistical_parity_difference(*y, prot_attr=prot_attr, priv_group=priv_group, - pos_label=pos_label, sample_weight=sample_weight) + return statistical_parity_difference(*y, prot_attr=prot_attr, + priv_group=priv_group, pos_label=pos_label, + sample_weight=sample_weight) diff --git a/aif360/sklearn/preprocessing/__init__.py b/aif360/sklearn/preprocessing/__init__.py index 61a0431d..c47dda96 100644 --- a/aif360/sklearn/preprocessing/__init__.py +++ b/aif360/sklearn/preprocessing/__init__.py @@ -1,3 +1,6 @@ +""" +Pre-processing algorithms modify a dataset to be more fair (data in, data out). +""" from aif360.sklearn.preprocessing.reweighing import Reweighing, ReweighingMeta __all__ = [ diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index c73b96fe..dcb1d906 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -7,11 +7,25 @@ class Reweighing(BaseEstimator): - """Reweighing is a preprocessing technique that weights the examples in each + """Sample reweighing. + + Reweighing is a preprocessing technique that weights the examples in each (group, label) combination differently to ensure fairness before classification [#kamiran12]_. + Note: + This breaks the scikit-learn API by returning new sample weights from + ``fit_transform()``. See :class:`ReweighingMeta` for a workaround. + + References: + .. [#kamiran12] `F. Kamiran and T. 
Calders, "Data Preprocessing + Techniques for Classification without Discrimination," Knowledge and + Information Systems, 2012. + `_ + Attributes: + prot_attr_ (str or list(str)): Protected attribute(s) used for + reweighing. groups_ (array, shape (n_groups,)): A list of group labels known to the transformer. classes_ (array, shape (n_classes,)): A list of class labels known to @@ -20,32 +34,21 @@ class Reweighing(BaseEstimator): for each combination of group and class labels used to debias samples. Existing sample weights are multiplied by the corresponding factor for that sample's group and class. - - Examples: - >>> pipe = make_pipeline(Reweighing(), LinearRegression()) - >>> # sample_weight_ will be used after it is fit - >>> fit_params = {'linearregression__sample_weight': - ... pipe['reweighing'].sample_weight_} - >>> pipe.fit(X, y, **fit_params) - - References: - .. [#kamiran12] F. Kamiran and T. Calders, "Data Preprocessing - Techniques for Classification without Discrimination," Knowledge and - Information Systems, 2012. """ def __init__(self, prot_attr=None): """ Args: prot_attr (single label or list-like, optional): Protected - attribute(s) to use as sensitive attribute(s) in the reweighing - process. If more than one attribute, all combinations of values - (intersections) are considered. Default is ``None`` meaning all - protected attributes from the dataset are used. + attribute(s) to use in the reweighing process. If more than one + attribute, all combinations of values (intersections) are + considered. Default is ``None`` meaning all protected attributes + from the dataset are used. 
""" self.prot_attr = prot_attr def fit(self, X, y, sample_weight=None): + """Only ``fit_transform`` is allowed for this algorithm.""" self.fit_transform(X, y, sample_weight=sample_weight) return self @@ -88,7 +91,22 @@ def N_(i): return sample_weight[i].sum() class ReweighingMeta(BaseEstimator, MetaEstimatorMixin): + """A meta-estimator which wraps a given estimator with a reweighing + preprocessing step. + + This is necessary for use in a Pipeline, etc. + + Attributes: + estimator_ (sklearn.BaseEstimator): The fitted underlying estimator. + reweigher_: The fitted underlying reweigher. + """ def __init__(self, estimator, reweigher=Reweighing()): + """ + Args: + estimator (sklearn.BaseEstimator): Estimator to be wrapped. + reweigher: Preprocessor which returns new sample weights from + ``transform()``. + """ self.reweigher = reweigher self.estimator = estimator @@ -97,6 +115,18 @@ def _estimator_type(self): return self.estimator._estimator_type def fit(self, X, y, sample_weight=None): + """Performs ``self.reweigher_.fit_transform(X, y, sample_weight)`` and + then ``self.estimator_.fit(X, y, sample_weight)`` using the reweighed + samples. + + Args: + X (array-like): Training samples. + y (array-like): Training labels. + sample_weight (array-like, optional): Sample weights. + + Returns: + ReweighingMeta: self. + """ if not has_fit_parameter(self.estimator, 'sample_weight'): raise TypeError("`estimator` (type: {}) does not have fit parameter" " `sample_weight`.".format(type(self.estimator))) @@ -111,16 +141,60 @@ def fit(self, X, y, sample_weight=None): @if_delegate_has_method('estimator_') def predict(self, X): + """Predict class labels for the given samples using ``self.estimator_``. + + Args: + X (array-like): Test samples. + + Returns: + array: Predicted class label per sample. + """ return self.estimator_.predict(X) @if_delegate_has_method('estimator_') def predict_proba(self, X): + """Probability estimates from ``self.estimator_``. 
+ + The returned estimates for all classes are ordered by the label of + classes. + + Args: + X (array-like): Test samples. + + Returns: + array: Returns the probability of the sample for each class in the + model, where classes are ordered as they are in ``self.classes_``. + """ return self.estimator_.predict_proba(X) @if_delegate_has_method('estimator_') def predict_log_proba(self, X): + """Log of probability estimates from ``self.estimator_``. + + The returned estimates for all classes are ordered by the label of + classes. + + Args: + X (array-like): Test samples. + + Returns: + array: Returns the log-probability of the sample for each class in + the model, where classes are ordered as they are in + ``self.classes_``. + """ return self.estimator_.predict_log_proba(X) @if_delegate_has_method('estimator_') def score(self, X, y, sample_weight=None): + """Returns the output of the estimator's score function on the given + test data and labels. + + Args: + X (array-like): Test samples. + y (array-like): True labels for ``X``. + sample_weight (array-like, optional): Sample weights. 
+ + Returns: + float: ``self.estimator_.score(X, y, sample_weight)`` + """ return self.estimator_.score(X, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/tests/test_datasets.py b/aif360/sklearn/tests/test_datasets.py index 5e2f00ad..0cd13a6c 100644 --- a/aif360/sklearn/tests/test_datasets.py +++ b/aif360/sklearn/tests/test_datasets.py @@ -5,13 +5,13 @@ import pytest from aif360.sklearn.datasets import fetch_adult, fetch_bank, fetch_german -from aif360.sklearn.datasets import standarize_dataset +from aif360.sklearn.datasets import standardize_dataset from aif360.sklearn.datasets import fetch_compas, ColumnAlreadyDroppedWarning df = pd.DataFrame([[1, 2, 3, 'a'], [5, 6, 7, 'b'], [np.NaN, 10, 11, 'c']], columns=['X1', 'X2', 'y', 'Z']) -basic = partial(standarize_dataset, df=df, prot_attr='Z', target='y', +basic = partial(standardize_dataset, df=df, prot_attr='Z', target='y', dropna=False) def test_standardize_dataset_basic(): @@ -44,7 +44,7 @@ def test_usecols_dropcols_basic(): pd.DataFrame) def test_dropna_basic(): - basic_dropna = partial(standarize_dataset, df=df, prot_attr='Z', + basic_dropna = partial(standardize_dataset, df=df, prot_attr='Z', target='y', dropna=True) assert basic_dropna().X.shape == (2, 3) assert basic(dropcols='X1').X.shape == (3, 2) diff --git a/docs/Makefile b/docs/Makefile index 3d5de195..f417938a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -12,7 +12,11 @@ BUILDDIR = build help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -.PHONY: help Makefile +.PHONY: help clean Makefile + +clean: + -rm -rf $(BUILDDIR)/* + -rm -rf source/modules/generated/* # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
diff --git a/docs/source/conf.py b/docs/source/conf.py index 03058220..0f850880 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,7 +33,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.viewcode', + 'sphinx.ext.autosummary', + 'sphinx.ext.linkcode', 'sphinx.ext.napoleon', 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax'] @@ -44,10 +45,22 @@ 'sklearn': ('https://scikit-learn.org/stable/', None), 'python': ('https://docs.python.org/{}.{}'.format(*sys.version_info), None)} +napoleon_include_init_with_doc = True +napoleon_use_ivar = True +napoleon_use_rtype = False + autoclass_content = 'both' # Add any paths that contain templates here, relative to this directory. -templates_path = [] +templates_path = ['templates'] + +# generate autosummary even if no references +autosummary_generate = True + +autodoc_default_options = { + 'members': True, + 'inherited-members': True +} # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: @@ -60,7 +73,7 @@ # General information about the project. project = u'aif360' -copyright = u'2018, IBM Corporation' +copyright = u'2018 - 2019, IBM Corporation' author = u'aif360 developers' # The version info for the project you're documenting, acts as replacement for @@ -82,7 +95,14 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] +exclude_patterns = ['templates'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +default_role = 'literal' + +# If true, '()' will be appended to :func: etc. cross-reference text. +add_function_parentheses = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' @@ -95,8 +115,8 @@ # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. -# -# html_theme = 'alabaster' + +# html_theme = 'bizstyle' if os.environ.get('READTHEDOCS') != 'True': try: import sphinx_rtd_theme @@ -115,7 +135,10 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] +html_static_path = ['static'] + +def setup(app): + app.add_stylesheet('style.css') # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -188,3 +211,60 @@ author, 'aif360', 'One line description of project.', 'Miscellaneous'), ] + + +# -- Options for linkcode ------------------------------------------------- +# taken from numpy/doc/source/conf.py: +import inspect +from os.path import relpath, dirname +def linkcode_resolve(domain, info): + """ + Determine the URL corresponding to Python object + """ + if domain != 'py': + return None + + modname = info['module'] + fullname = info['fullname'] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split('.'): + try: + obj = getattr(obj, part) + except Exception: + return None + + # strip decorators, which would resolve to the source of the decorator + # possibly an upstream bug in getsourcefile, bpo-1764286 + try: + unwrap = inspect.unwrap + except AttributeError: + pass + else: + obj = unwrap(obj) + + try: + fn = inspect.getsourcefile(obj) + except Exception: + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except Exception: + lineno = None + + if lineno: + linespec = "#L%d-L%d" % (lineno, lineno + len(source) - 1) + else: + linespec = "" + + fn = relpath(fn, start=dirname(aif360.__file__)) + + return "https://github.com/IBM/AIF360/blob/master/aif360/%s%s" % ( + fn, linespec) diff --git 
a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst index 5a9fdb15..757c0ef8 100644 --- a/docs/source/modules/sklearn.rst +++ b/docs/source/modules/sklearn.rst @@ -1,25 +1,209 @@ -:mod:`aif360.sklearn` -===================== +======================================= +`scikit-learn`-Compatible API Reference +======================================= -.. automodule:: aif360.sklearn +This is the class and function reference for the `scikit-learn`-compatible +version of the AIF360 API. It is functionally equivalent to the normal API but +it uses scikit-learn paradigms (where possible) and Pandas `DataFrames` for +datasets. Not all functionality from AIF360 is supported yet. See +`Getting Started `_ +for a demo of the capabilities. -Datasets --------- +Note: This is under active development. Visit our +`GitHub page `_ if you'd like to contribute! -.. automodule:: aif360.sklearn.datasets.utils - :members: -.. automodule:: aif360.sklearn.datasets.openml_datasets - :members: +:mod:`aif360.sklearn.datasets`: Dataset loading functions +========================================================= -Metrics +.. automodule:: aif360.sklearn.datasets + :no-members: + :no-inherited-members: + +Utils +----- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + datasets.ColumnAlreadyDroppedWarning + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + datasets.check_already_dropped + datasets.standardize_dataset + datasets.to_dataframe + +Loaders +------- + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + datasets.fetch_adult + datasets.fetch_german + datasets.fetch_bank + datasets.fetch_compas + +:mod:`aif360.sklearn.metrics`: Fairness metrics +=============================================== + +.. automodule:: aif360.sklearn.metrics + :no-members: + :no-inherited-members: + +Meta-metrics +------------ +.. currentmodule:: aif360.sklearn + +.. 
autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + metrics.difference + metrics.ratio + +Scorers ------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + metrics.make_difference_scorer + metrics.make_ratio_scorer + +Generic metrics +--------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + metrics.specificity_score + metrics.sensitivity_score + metrics.base_rate + metrics.selection_rate + metrics.generalized_fpr + metrics.generalized_fnr + +Group fairness metrics +---------------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: -.. automodule:: aif360.sklearn.metrics.metrics - :members: + metrics.statistical_parity_difference + metrics.mean_difference + metrics.disparate_impact_ratio + metrics.equal_opportunity_difference + metrics.average_odds_difference + metrics.average_odds_error + metrics.between_group_generalized_entropy_error -Preprocessing +Individual fairness metrics +--------------------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: base.rst + :nosignatures: + + metrics.generalized_entropy_index + metrics.generalized_entropy_error + metrics.theil_index + metrics.coefficient_of_variation + metrics.consistency_score + +:mod:`aif360.sklearn.preprocessing`: Pre-processing Algorithms +============================================================== + +.. automodule:: aif360.sklearn.preprocessing + :no-members: + :no-inherited-members: + +Pre-processors +-------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + preprocessing.Reweighing + +Meta-Estimator +-------------- +.. currentmodule:: aif360.sklearn + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + preprocessing.ReweighingMeta + +:mod:`aif360.sklearn.inprocessing`: In-processing Algorithms +============================================================ + +.. automodule:: aif360.sklearn.inprocessing + :no-members: + :no-inherited-members: + +In-processors ------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + inprocessing.AdversarialDebiasing + +:mod:`aif360.sklearn.postprocessing`: Post-processing Algorithms +================================================================ + +.. automodule:: aif360.sklearn.postprocessing + :no-members: + :no-inherited-members: + +Post-processors +--------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: + + postprocessing.CalibratedEqualizedOdds + +Meta-Estimator +-------------- +.. currentmodule:: aif360.sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + :nosignatures: -.. autoclass:: aif360.sklearn.preprocessing.reweighing.Reweighing - :members: + postprocessing.PostProcessingMeta \ No newline at end of file diff --git a/docs/source/modules/standard_datasets.rst b/docs/source/modules/standard_datasets.rst index 6209bc86..3f6f5622 100644 --- a/docs/source/modules/standard_datasets.rst +++ b/docs/source/modules/standard_datasets.rst @@ -1,4 +1,5 @@ .. 
module:: aif360.datasets + :noindex: Base Class ---------- diff --git a/docs/source/static/style.css b/docs/source/static/style.css new file mode 100644 index 00000000..460fdebc --- /dev/null +++ b/docs/source/static/style.css @@ -0,0 +1,12 @@ +/* .wy-nav-content { + max-width: 1000px !important; +} */ + +/* override table width restrictions */ +.wy-table-responsive table td, .wy-table-responsive table th { + white-space: normal !important; +} + +.wy-table-responsive { + overflow: visible !important; +} \ No newline at end of file diff --git a/docs/source/templates/base.rst b/docs/source/templates/base.rst new file mode 100644 index 00000000..ba0aa5f3 --- /dev/null +++ b/docs/source/templates/base.rst @@ -0,0 +1,6 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}==================== + +.. currentmodule:: {{ module }} + +.. auto{{ objtype }}:: {{ objname }} diff --git a/docs/source/templates/class.rst b/docs/source/templates/class.rst new file mode 100644 index 00000000..5f46cabb --- /dev/null +++ b/docs/source/templates/class.rst @@ -0,0 +1,29 @@ +:mod:`{{module}}`.{{objname}} +{{ underline }}============== + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + .. rubric:: Methods + + .. autosummary:: + :nosignatures: + {% for item in methods %} + {% if item != '__init__' %} + ~{{ name }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + .. rubric:: Attributes + + .. 
autosummary:: + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endblock %} From 0e48ead05132a66be6fab4393832e50df9bdbdf4 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 13 Dec 2019 13:29:07 -0500 Subject: [PATCH 44/61] more gitignores --- .gitignore | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5b50f9b1..b74b5f5a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,23 @@ .cache/ .ipynb_checkpoints/ .pytest_cache/ +__pycache__/ + .idea/ +.vscode/ + +.eggs/ +aif360.egg-info +build/ +dist/ + +.coverage* +coverage.txt + docs/build/ +docs/source/modules/generated + +aif360/version.py aif360/data/raw/** !aif360/data/raw/*/*.md -aif360/version.py +aif360/sklearn/data/ \ No newline at end of file From 0cbc3f4154a3cf877b46155972692b45763e1b88 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 13 Dec 2019 13:52:40 -0500 Subject: [PATCH 45/61] docstrings and add alpha=sqrt(global_step) option --- .../inprocessing/adversarial_debiasing.py | 154 ++++++++++++++---- 1 file changed, 124 insertions(+), 30 deletions(-) diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py index 1ba8a248..66014d6c 100644 --- a/aif360/sklearn/inprocessing/adversarial_debiasing.py +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -10,7 +10,9 @@ class AdversarialDebiasing(BaseEstimator, ClassifierMixin): - """Adversarial debiasing is an in-processing technique that learns a + """Debiasing with adversarial learning. + + Adversarial debiasing is an in-processing technique that learns a classifier to maximize prediction accuracy and simultaneously reduce an adversary's ability to determine the protected attribute from the predictions [#zhang18]_. This approach leads to a fair classifier as the @@ -18,15 +20,55 @@ class AdversarialDebiasing(BaseEstimator, ClassifierMixin): adversary can exploit. References: - .. 
[#zhang18] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating + .. [#zhang18] `B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted Biases with Adversarial Learning," AAAI/ACM Conference on Artificial Intelligence, Ethics, and Society, 2018. + `_ + + Attributes: + prot_attr_ (str or list(str)): Protected attribute(s) used for + debiasing. + groups_ (array, shape (n_groups,)): A list of group labels known to the + classifier. + classes_ (array, shape (n_classes,)): A list of class labels known to + the classifier. + sess_ (tensorflow.Session): The TensorFlow Session used for the + computations. Note: this can be manually closed to free up resources + with `self.sess_.close()`. + classifier_logits_ (tensorflow.Tensor): Tensor containing output logits + from the classifier. + adversary_logits_ (tensorflow.Tensor): Tensor containing output logits + from the adversary. """ def __init__(self, prot_attr=None, scope_name='classifier', adversary_loss_weight=0.1, num_epochs=50, batch_size=128, classifier_num_hidden_units=200, debias=True, verbose=False, random_state=None): + r""" + Args: + prot_attr (single label or list-like, optional): Protected + attribute(s) to use in the debiasing process. If more than one + attribute, all combinations of values (intersections) are + considered. Default is ``None`` meaning all protected attributes + from the dataset are used. + scope_name (str, optional): TensorFlow "variable_scope" name for the + entire model (classifier and adversary). + adversary_loss_weight (float or ``None``, optional): If ``None``, + this will use the suggestion from the paper: + :math:`\alpha = \sqrt{global\_step}` with inverse time decay on + the learning rate. Otherwise, it uses the provided coefficient + with exponential learning rate decay. + num_epochs (int, optional): Number of epochs for which to train. + batch_size (int, optional): Size of mini-batch for training.
+ classifier_num_hidden_units (int, optional): Number of hidden units + in the classifier. + debias (bool, optional): If ``False``, learn a classifier without an + adversary. + verbose (bool, optional): If ``True``, print losses every 200 steps. + random_state (int or numpy.RandomState, optional): Seed of pseudo- + random number generator for shuffling data. + """ self.prot_attr = prot_attr self.scope_name = scope_name @@ -39,10 +81,20 @@ def __init__(self, prot_attr=None, scope_name='classifier', self.random_state = random_state def fit(self, X, y): + """Train the classifier and adversary (if ``debias == True``) with the + given training data. + + Args: + X (array-like): Training samples. + y (array-like): Training labels. + + Returns: + AdversarialDebiasing: self. + """ X, y, _ = check_inputs(X, y) rng = check_random_state(self.random_state) ii32 = np.iinfo(np.int32) - seed1, seed2, seed3, seed4 = rng.randint(ii32.min, ii32.max, size=4) + s1, s2, s3, s4 = rng.randint(ii32.min, ii32.max, size=4) tf.reset_default_graph() self.sess_ = tf.Session() @@ -51,7 +103,8 @@ def fit(self, X, y): le = LabelEncoder() y = le.fit_transform(y) self.classes_ = le.classes_ - groups = groups.map(str) # BUG: LabelEncoder converts to ndarray which removes tuple formatting + # BUG: LabelEncoder converts to ndarray which removes tuple formatting + groups = groups.map(str) groups = le.fit_transform(groups) self.groups_ = le.classes_ @@ -76,16 +129,16 @@ def fit(self, X, y): with tf.variable_scope('classifier_model'): W1 = tf.get_variable( 'W1', [n_features, self.classifier_num_hidden_units], - initializer=tf.initializers.glorot_uniform(seed=seed1)) - b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), - name='b1') + initializer=tf.initializers.glorot_uniform(seed=s1)) + b1 = tf.Variable(tf.zeros( + shape=[self.classifier_num_hidden_units]), name='b1') h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1) - h1 = tf.nn.dropout(h1, rate=1-self.keep_prob, seed=seed2) + h1 = 
tf.nn.dropout(h1, rate=1-self.keep_prob, seed=s2) W2 = tf.get_variable( 'W2', [self.classifier_num_hidden_units, n_classes], - initializer=tf.initializers.glorot_uniform(seed=seed3)) + initializer=tf.initializers.glorot_uniform(seed=s3)) b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2') self.classifier_logits_ = tf.matmul(h1, W2) + b2 @@ -110,12 +163,12 @@ def fit(self, X, y): s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_) W2 = tf.get_variable('W2', [3, n_groups], - initializer=tf.initializers.glorot_uniform(seed=seed4)) + initializer=tf.initializers.glorot_uniform(seed=s4)) b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2') self.adversary_logits_ = tf.matmul( tf.concat([s, s * self.true_labels_ph, - s * (1.0 - self.true_labels_ph)], axis=1), + s * (1. - self.true_labels_ph)], axis=1), W2) + b2 # Obtain adversary loss @@ -131,10 +184,14 @@ def fit(self, X, y): tf.int32)), logits=self.adversary_logits_)) - global_step = tf.train.get_or_create_global_step() - starter_learning_rate = 0.001 - learning_rate = tf.train.exponential_decay(starter_learning_rate, + global_step = tf.Variable(0., trainable=False) + init_learning_rate = 0.001 + if self.adversary_loss_weight is not None: + learning_rate = tf.train.exponential_decay(init_learning_rate, global_step, 1000, 0.96, staircase=True) + else: + learning_rate = tf.train.inverse_time_decay(init_learning_rate, + global_step, 1000, 0.1, staircase=True) # Setup optimizers clf_opt = tf.train.AdamOptimizer(learning_rate) @@ -153,15 +210,20 @@ def fit(self, X, y): normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny) clf_grads = [] - for (grad, var) in clf_opt.compute_gradients(clf_loss, var_list=clf_vars): + for (grad, var) in clf_opt.compute_gradients(clf_loss, + var_list=clf_vars): if self.debias: unit_adv_grad = normalize(adv_grads[var]) # proj_{adv_grad} clf_grad: grad -= tf.reduce_sum(grad * unit_adv_grad) * unit_adv_grad - grad -= self.adversary_loss_weight * adv_grads[var] + if 
self.adversary_loss_weight is not None: + grad -= self.adversary_loss_weight * adv_grads[var] + else: + grad -= tf.sqrt(global_step) * adv_grads[var] clf_grads.append((grad, var)) - clf_min = clf_opt.apply_gradients(clf_grads, global_step=global_step) + clf_min = clf_opt.apply_gradients(clf_grads, + global_step=global_step) if self.debias: with tf.control_dependencies([clf_min]): adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars) @@ -182,26 +244,37 @@ def fit(self, X, y): self.prot_attr_ph: batch_prot_attr, self.keep_prob: 0.8} if self.debias: - _, _, clf_loss_value, adv_loss_value = ( - self.sess_.run([clf_min, adv_min, - clf_loss, adv_loss], - feed_dict=batch_feed_dict)) + _, _, clf_loss_val, adv_loss_val = self.sess_.run( + [clf_min, adv_min, clf_loss, adv_loss], + feed_dict=batch_feed_dict) + if i % 200 == 0 and self.verbose: - print("epoch {}; iter: {}; batch classifier loss: " - "{}; batch adversarial loss: {}".format( - epoch, i, clf_loss_value, - adv_loss_value)) + print("epoch {:>3d}; iter: {:>4d}; batch classifier" + " loss: {:.4f}; batch adversarial loss: " + "{:.4f}".format(epoch, i, clf_loss_val, + adv_loss_val)) else: - _, clf_loss_value = self.sess_.run( - [clf_min, clf_loss], + _, clf_loss_val = self.sess_.run([clf_min, clf_loss], feed_dict=batch_feed_dict) + if i % 200 == 0 and self.verbose: - print("epoch {}; iter: {}; batch classifier loss: " - "{}".format(epoch, i, clf_loss_value)) + print("epoch {:>3d}; iter: {:>4d}; batch classifier" + " loss: {:.4f}".format(epoch, i, + clf_loss_val)) return self def decision_function(self, X): + """Soft prediction scores. + + Args: + X (array-like): Test samples. + + Returns: + numpy.ndarray: Confidence scores per (sample, class) combination. In + the binary case, confidence score for ``self.classes_[1]`` where >0 + means this class would be predicted. 
+ """ check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob', 'classifier_logits_']) n_samples = X.shape[0] @@ -224,12 +297,25 @@ def decision_function(self, X): self.keep_prob: 1.0} scores[batch_ids] = self.sess_.run(self.classifier_logits_, - feed_dict=batch_feed_dict) + feed_dict=batch_feed_dict) samples_covered += len(batch_features) return scores.ravel() if scores.shape[1] == 1 else scores def predict_proba(self, X): + """Probability estimates. + + The returned estimates for all classes are ordered by the label of + classes. + + Args: + X (array-like): Test samples. + + Returns: + numpy.ndarray: Returns the probability of the sample for each class + in the model, where classes are ordered as they are in + ``self.classes_``. + """ decision = self.decision_function(X) if decision.ndim == 1: @@ -239,6 +325,14 @@ def predict_proba(self, X): return scipy.special.softmax(decision_2d, axis=1) def predict(self, X): + """Predict class labels for the given samples. + + Args: + X (array-like): Test samples. + + Returns: + numpy.ndarray: Predicted class label per sample. 
+ """ scores = self.decision_function(X) if scores.ndim == 1: indices = (scores > 0).astype(np.int) From 8be64498e13e1dba3ffcbb44dd29b3c52a99115e Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 13 Dec 2019 13:57:27 -0500 Subject: [PATCH 46/61] docstrings and input is now predict_proba output also added score function (compute weighted cost) --- aif360/sklearn/postprocessing/__init__.py | 127 ++++++++++-- .../calibrated_equalized_odds.py | 189 +++++++++++++----- .../tests/test_calibrated_equalized_odds.py | 6 +- 3 files changed, 247 insertions(+), 75 deletions(-) diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py index 49e89d42..acc63020 100644 --- a/aif360/sklearn/postprocessing/__init__.py +++ b/aif360/sklearn/postprocessing/__init__.py @@ -1,3 +1,7 @@ +""" +Post-processing algorithms modify predictions to be more fair (predictions in, +predictions out). +""" from logging import warning import numpy as np @@ -11,7 +15,16 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): - """ + """A meta-estimator which wraps a given estimator with a post-processing + step. + + The post-processor trains on a separate training set from the estimator to + prevent leakage. + + Note: + Because of the dataset splitting, if a Pipeline is necessary it should + be used as the input to this meta-estimator not the other way around. + Attributes: estimator_: Cloned ``estimator``. postprocessor_: Cloned ``postprocessor``. 
@@ -40,6 +53,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), """ self.estimator = estimator self.postprocessor = postprocessor + self.use_proba = use_proba self.val_size = val_size self.options = options @@ -47,9 +61,26 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), def _estimator_type(self): return self.postprocessor._estimator_type - def fit(self, X, y, pos_label=1, sample_weight=None): - self.pos_label_ = pos_label - self.use_proba_ = isinstance(self.postprocessor, CalibratedEqualizedOdds) + def fit(self, X, y, sample_weight=None, **fit_params): + """Splits the training samples with + :func:`~sklearn.model_selection.train_test_split` and uses the resultant + 'train' portion to train the estimator. Then the estimator predicts on + the 'test' portion of the split data and the post-processor is trained + with those prediction-ground-truth target pairs. + + Args: + X (array-like): Training samples. + y (array-like): Training labels. + sample_weight (array-like, optional): Sample weights. + **fit_params: Parameters passed to the post-processor ``fit`` + method. Note: these do not need to be prefixed with ``__`` + notation. + + Returns: + PostProcessingMeta: self. 
+ """ + self.use_proba_ = (self.use_proba if self.use_proba is not None else + isinstance(self.postprocessor, CalibratedEqualizedOdds)) if self.use_proba_ and not hasattr(self.estimator, 'predict_proba'): raise TypeError("`estimator` (type: {}) does not implement method " "`predict_proba()`.".format(type(self.estimator))) @@ -72,48 +103,100 @@ def fit(self, X, y, pos_label=1, sample_weight=None): X_est, X_post, y_est, y_post = train_test_split(X, y, **options_) self.estimator_.fit(X_est, y_est) - pos_idx = np.nonzero(self.estimator_.classes_ == pos_label)[0][0] y_pred = (self.estimator_.predict(X_post) if not self.use_proba_ else - self.estimator_.predict_proba(X_post)[:, pos_idx]) - self.postprocessor_.fit(y_post, y_pred, pos_label=pos_label, - sample_weight=None if sample_weight is None else sw_post) + self.estimator_.predict_proba(X_post)) + # fit_params = fit_params.copy().update(labels=self.estimator_.classes_) + self.postprocessor_.fit(y_pred, y_post, sample_weight=sw_post + if sample_weight is not None else None, + **fit_params) return self - @property - def classes_(self): - # order of postprocessor.classes_ may differ from estimator_.classes_ - check_is_fitted(self.postprocessor_, 'classes_') - return self.postprocessor_.classes_ - @if_delegate_has_method('postprocessor_') def predict(self, X): - pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + """Predict class labels for the given samples. + + First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + ``self.use_proba_`` is ``True``) then returns the post-processed output + from those predictions. + + Args: + X (array-like): Test samples. + + Returns: + numpy.ndarray: Predicted class label per sample. 
+ """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else - self.estimator_.predict_proba(X)[:, pos_idx]) + self.estimator_.predict_proba(X)) y_pred = pd.Series(y_pred, index=X.index) return self.postprocessor_.predict(y_pred) @if_delegate_has_method('postprocessor_') def predict_proba(self, X): - pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + """Probability estimates. + + First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + ``self.use_proba_`` is ``True``) then returns the post-processed output + from those predictions. + + The returned estimates for all classes are ordered by the label of + classes. + + Args: + X (array-like): Test samples. + + Returns: + numpy.ndarray: Returns the probability of the sample for each class + in the model, where classes are ordered as they are in + ``self.classes_``. + """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else - self.estimator_.predict_proba(X)[:, pos_idx]) + self.estimator_.predict_proba(X)) y_pred = pd.Series(y_pred, index=X.index) return self.postprocessor_.predict_proba(y_pred) @if_delegate_has_method('postprocessor_') def predict_log_proba(self, X): - pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + """Log of probability estimates. + + First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + ``self.use_proba_`` is ``True``) then returns the post-processed output + from those predictions. + + The returned estimates for all classes are ordered by the label of + classes. + + Args: + X (array-like): Test samples. + + Returns: + array: Returns the log-probability of the sample for each class in + the model, where classes are ordered as they are in + ``self.classes_``. 
+ """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else - self.estimator_.predict_proba(X)[:, pos_idx]) + self.estimator_.predict_proba(X)) y_pred = pd.Series(y_pred, index=X.index) return self.postprocessor_.predict_log_proba(y_pred) @if_delegate_has_method('postprocessor_') def score(self, X, y, sample_weight=None): - pos_idx = np.nonzero(self.estimator_.classes_ == self.pos_label_)[0][0] + """Returns the output of the post-processor's score function on the + given test data and labels. + + First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + ``self.use_proba_`` is ``True``) then gets the post-processed output + from those predictions and scores it. + + Args: + X (array-like): Test samples. + y (array-like): True labels for ``X``. + sample_weight (array-like, optional): Sample weights. + + Returns: + float: Score value. + """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else - self.estimator_.predict_proba(X)[:, pos_idx]) + self.estimator_.predict_proba(X)) y_pred = pd.Series(y_pred, index=X.index) return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py index 143ed423..088a84a3 100644 --- a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -1,16 +1,24 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils import check_random_state +from sklearn.utils.validation import check_is_fitted -from aif360.sklearn.metrics import base_rate, generalized_fnr, generalized_fpr +from aif360.sklearn.metrics import difference, base_rate +from aif360.sklearn.metrics import generalized_fnr, generalized_fpr from aif360.sklearn.utils import check_groups class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): - """Calibrated equalized odds postprocessing is a 
post-processing technique - that optimizes over calibrated classifier score outputs to find - probabilities with which to change output labels with an equalized odds - objective [#pleiss17]_. + """Calibrated equalized odds post-processor. + + Calibrated equalized odds is a post-processing technique that optimizes over + calibrated classifier score outputs to find probabilities with which to + change output labels with an equalized odds objective [#pleiss17]_. + + Note: + This breaks the scikit-learn API by requiring fit params ``y_true``, + ``y_pred``, and ``pos_label`` and predict param ``y_pred``. See + :class:`PostProcessingMeta` for a workaround. References: .. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and @@ -20,78 +28,125 @@ class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): Adapted from: https://github.com/gpleiss/equalized_odds_and_calibration/blob/master/calib_eq_odds.py + + Attributes: + prot_attr_ (str or list(str)): Protected attribute(s) used for post- + processing. + groups_ (array, shape (2,)): A list of group labels known to the + classifier. Note: this algorithm requires a binary division of the + data. + classes_ (array, shape (num_classes,)): A list of class labels known to + the classifier. Note: this algorithm treats all non-positive + outcomes as negative (binary classification only). + pos_label_ (scalar): The label of the positive class. + mix_rates_ (array, shape (2,)): The interpolation parameters -- the + probability of randomly returning the group's base rate. The group + for which the cost function is higher is set to 0. """ def __init__(self, prot_attr=None, cost_constraint='weighted', random_state=None): """ Args: prot_attr (single label or list-like, optional): Protected - attribute(s) to use as sensitive attribute(s) in the post- - processing. If more than one attribute, all combinations of - values (intersections) are considered.
Default is ``None`` - meaning all protected attributes from the dataset are used. - Note: This algorithm requires there be exactly 2 groups - (privileged and unprivileged). - cost_constraint ('fpr', 'fnr', or 'weighted'): - random_state (int or numpy.RandomState, optional): + attribute(s) to use in the post-processing. If more than one + attribute, all combinations of values (intersections) are + considered. Default is ``None`` meaning all protected attributes + from the dataset are used. Note: This algorithm requires there + be exactly 2 groups (privileged and unprivileged). + cost_constraint ('fpr', 'fnr', or 'weighted'): Which equal-cost + constraint to satisfy: generalized false positive rate ('fpr'), + generalized false negative rate ('fnr'), or a weighted + combination of both ('weighted'). + random_state (int or numpy.RandomState, optional): Seed of pseudo- + random number generator for shuffling data. """ self.prot_attr = prot_attr self.cost_constraint = cost_constraint self.random_state = random_state - def fit(self, y_true, y_pred, pos_label=1, sample_weight=None): + def _weighted_cost(self, y_true, probas_pred, pos_label, sample_weight): + """Evaluates the cost function specified by ``self.cost_constraint``.""" + fpr = generalized_fpr(y_true, probas_pred, pos_label, sample_weight) + fnr = generalized_fnr(y_true, probas_pred, pos_label, sample_weight) + br = base_rate(y_true, probas_pred, pos_label, sample_weight) + if self.cost_constraint == 'fpr': + return fpr + elif self.cost_constraint == 'fnr': + return fnr + elif self.cost_constraint == 'weighted': + return fpr * (1 - br) + fnr * br + else: + raise ValueError("`cost_constraint` must be one of: 'fpr', 'fnr', " + "or 'weighted'") + + def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None): + """Compute the mixing rates required to satisfy the cost constraint. 
+ + Args: + y_pred (array-like): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. + y_true (array-like): Ground-truth (correct) target values. + labels (list, optional): The ordered set of labels values. Must + match the order of columns in ``y_pred`` if provided. By + default, all labels in ``y_true`` are used in sorted order. + pos_label (scalar, optional): The label of the positive class. + sample_weight (array-like, optional): Sample weights. + + Returns: + CalibratedEqualizedOdds: self. + """ groups, self.prot_attr_ = check_groups(y_true, self.prot_attr) - self.classes_ = np.unique(y_true) + self.classes_ = labels if labels is not None else np.unique(y_true) self.groups_ = np.unique(groups) + self.pos_label_ = pos_label + + if len(self.classes_) > 2: + raise ValueError('Only binary classification is supported.') if pos_label not in self.classes_: - raise ValueError('pos_label={} is not present in y_true. The valid ' - 'values are:\n{}'.format(pos_label, self.classes_)) + raise ValueError('pos_label={} is not in the set of labels. 
The ' + 'valid values are:\n{}'.format(pos_label, self.classes_)) if len(self.groups_) != 2: raise ValueError('prot_attr={}\nyielded {} groups:\n{}\nbut this ' - 'algorithm requires a binary division of the ' - 'data.'.format(self.prot_attr_, len(self.groups_), - self.groups_)) + 'algorithm requires a binary division of the data.'.format( + self.prot_attr_, len(self.groups_), self.groups_)) - # ensure self.classes_ = [neg_label, pos_label] - self.classes_ = np.append(np.delete(self.classes_, pos_label), - pos_label) + y_pred = y_pred[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]] - def args(grp_idx, triv=False): - i = (groups == self.groups_[grp_idx]) + # local function to return corresponding args for metric evaluation + def _args(grp_idx, triv=False): + idx = (groups == self.groups_[grp_idx]) pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else y_pred) - return dict(y_true=y_true[i], y_pred=pred[i], pos_label=pos_label, - sample_weight=None if sample_weight is None - else sample_weight[i]) - - self.base_rates_ = [base_rate(**args(i)) for i in range(2)] - - def weighted_cost(grp_idx, triv=False): - fpr = generalized_fpr(**args(grp_idx, triv=triv)) - fnr = generalized_fnr(**args(grp_idx, triv=triv)) - base_rate = self.base_rates_[grp_idx] - if self.cost_constraint == 'fpr': - return fpr - elif self.cost_constraint == 'fnr': - return fnr - elif self.cost_constraint == 'weighted': - return fpr * (1 - base_rate) + fnr * base_rate - else: - raise ValueError("`cost_constraint` must be one of: 'fpr', " - "'fnr', or 'weighted'") - - costs = [weighted_cost(i) for i in range(2)] + return [y_true[idx], pred[idx], pos_label, + sample_weight[idx] if sample_weight is not None else None] + + self.base_rates_ = [base_rate(*_args(i)) for i in range(2)] + + costs = [self._weighted_cost(*_args(i)) for i in range(2)] self.mix_rates_ = [(costs[1] - costs[0]) - / (weighted_cost(0, triv=True) - costs[0]), + / (self._weighted_cost(*_args(0, True)) - costs[0]), 
(costs[0] - costs[1]) - / (weighted_cost(1, triv=True) - costs[1])] + / (self._weighted_cost(*_args(1, True)) - costs[1])] self.mix_rates_[np.argmax(costs)] = 0 return self def predict_proba(self, y_pred): + """The returned estimates for all classes are ordered by the label of + classes. + + Args: + y_pred (array-like): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. + + Returns: + numpy.ndarray: Returns the probability of the sample for each class + in the model, where classes are ordered as they are in + ``self.classes_``. + """ + check_is_fitted(self, 'mix_rates_') rng = check_random_state(self.random_state) groups, _ = check_groups(y_pred, self.prot_attr_) @@ -100,6 +155,9 @@ def predict_proba(self, y_pred): 'match those from the training set:\n{}'.format( np.unique(groups), self.groups_)) + pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0] + y_pred = y_pred[:, pos_idx] + yt = np.empty_like(y_pred) for grp_idx in range(2): i = (groups == self.groups_[grp_idx]) @@ -108,8 +166,39 @@ def predict_proba(self, y_pred): new_preds[to_replace] = self.base_rates_[grp_idx] yt[i] = new_preds - return np.stack([1 - yt, yt], axis=-1) + return np.c_[1 - yt, yt] if pos_idx == 1 else np.c_[yt, 1 - yt] def predict(self, y_pred): + """Predict class labels for the given scores. + + Args: + y_pred (array-like): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. + + Returns: + numpy.ndarray: Predicted class label per sample. + """ scores = self.predict_proba(y_pred) - return self.classes_[scores.argmax(axis=1)] + return self.classes[scores.argmax(axis=1)] + + def score(self, y_pred, y_true, sample_weight=None): + """Score the predictions according to the cost constraint specified. + + Args: + y_pred (array-like): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. + y_true (array-like): Ground-truth (correct) target values. 
+ sample_weight (array-like, optional): Sample weights. + + Returns: + float: Absolute value of the difference in cost function for the two + groups (e.g. :func:`~aif360.sklearn.metrics.generalized_fpr` if + ``self.cost_constraint`` is 'fpr') + """ + check_is_fitted(self, ['classes_', 'pos_label_']) + pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0] + probas_pred = self.predict_proba(y_pred)[:, pos_idx] + + return abs(difference(self._weighted_cost, y_true, probas_pred, + prot_attr=self.prot_attr_, priv_group=self.groups_[1], + sample_weight=sample_weight)) diff --git a/aif360/sklearn/tests/test_calibrated_equalized_odds.py b/aif360/sklearn/tests/test_calibrated_equalized_odds.py index f1a6f3b3..1cba4391 100644 --- a/aif360/sklearn/tests/test_calibrated_equalized_odds.py +++ b/aif360/sklearn/tests/test_calibrated_equalized_odds.py @@ -15,14 +15,14 @@ def test_calib_eq_odds_sex(): logreg = LogisticRegression(solver='lbfgs', max_iter=500) - y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X)[:, 1] + y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X) adult_pred = adult.copy() - adult_pred.scores = y_pred + adult_pred.scores = y_pred[:, 1] orig_cal_eq_odds = CalibratedEqOddsPostprocessing( unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) orig_cal_eq_odds.fit(adult, adult_pred) cal_eq_odds = CalibratedEqualizedOdds('sex') - cal_eq_odds.fit(y, y_pred, sample_weight=sample_weight) + cal_eq_odds.fit(y_pred, y, sample_weight=sample_weight) assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1]) assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0]) From 994bdf0457a90def05db458f15bcc3d37a54d4ff Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 18 Dec 2019 15:00:59 -0500 Subject: [PATCH 47/61] moved tests to main test folder --- .travis.yml | 2 +- .../tests => tests/sklearn}/test_adversarial_debiasing.py | 0 .../tests => 
tests/sklearn}/test_calibrated_equalized_odds.py | 0 {aif360/sklearn/tests => tests/sklearn}/test_datasets.py | 0 {aif360/sklearn/tests => tests/sklearn}/test_metrics.py | 0 {aif360/sklearn/tests => tests/sklearn}/test_reweighing.py | 0 6 files changed, 1 insertion(+), 1 deletion(-) rename {aif360/sklearn/tests => tests/sklearn}/test_adversarial_debiasing.py (100%) rename {aif360/sklearn/tests => tests/sklearn}/test_calibrated_equalized_odds.py (100%) rename {aif360/sklearn/tests => tests/sklearn}/test_datasets.py (100%) rename {aif360/sklearn/tests => tests/sklearn}/test_metrics.py (100%) rename {aif360/sklearn/tests => tests/sklearn}/test_reweighing.py (100%) diff --git a/.travis.yml b/.travis.yml index fdfa087e..a9c99eda 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,4 +28,4 @@ before_script: script: # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics - - travis_wait python -m pytest aif360/sklearn/tests + - travis_wait python -m pytest tests/sklearn diff --git a/aif360/sklearn/tests/test_adversarial_debiasing.py b/tests/sklearn/test_adversarial_debiasing.py similarity index 100% rename from aif360/sklearn/tests/test_adversarial_debiasing.py rename to tests/sklearn/test_adversarial_debiasing.py diff --git a/aif360/sklearn/tests/test_calibrated_equalized_odds.py b/tests/sklearn/test_calibrated_equalized_odds.py similarity index 100% rename from aif360/sklearn/tests/test_calibrated_equalized_odds.py rename to tests/sklearn/test_calibrated_equalized_odds.py diff --git a/aif360/sklearn/tests/test_datasets.py b/tests/sklearn/test_datasets.py similarity index 100% rename from aif360/sklearn/tests/test_datasets.py rename to tests/sklearn/test_datasets.py diff --git a/aif360/sklearn/tests/test_metrics.py b/tests/sklearn/test_metrics.py similarity index 100% rename from aif360/sklearn/tests/test_metrics.py rename to tests/sklearn/test_metrics.py diff --git 
a/aif360/sklearn/tests/test_reweighing.py b/tests/sklearn/test_reweighing.py similarity index 100% rename from aif360/sklearn/tests/test_reweighing.py rename to tests/sklearn/test_reweighing.py From 372e1116e0f458ea027e94900704acab2be31ad9 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 19 Dec 2019 11:53:00 -0500 Subject: [PATCH 48/61] more docs and formatting changes --- aif360/datasets/structured_dataset.py | 23 ++++-- aif360/sklearn/datasets/compas_dataset.py | 17 ++-- aif360/sklearn/datasets/openml_datasets.py | 11 ++- aif360/sklearn/datasets/utils.py | 10 +-- .../inprocessing/adversarial_debiasing.py | 10 +-- aif360/sklearn/metrics/metrics.py | 58 +++++++------- aif360/sklearn/preprocessing/reweighing.py | 15 ++-- aif360/sklearn/utils.py | 35 +++++++-- docs/source/conf.py | 4 +- docs/source/index.rst | 10 ++- docs/source/modules/algorithms.rst | 78 +++++++++++++++---- docs/source/modules/datasets.rst | 44 ++++++----- docs/source/modules/explainers.rst | 21 ++--- docs/source/modules/inprocessing.rst | 35 --------- docs/source/modules/metrics.rst | 53 +++++++------ docs/source/modules/postprocessing.rst | 22 ------ docs/source/modules/preprocessing.rst | 30 ------- docs/source/modules/sklearn.rst | 42 +++++----- docs/source/modules/standard_datasets.rst | 32 -------- docs/source/static/style.css | 6 +- 20 files changed, 271 insertions(+), 285 deletions(-) delete mode 100644 docs/source/modules/inprocessing.rst delete mode 100644 docs/source/modules/postprocessing.rst delete mode 100644 docs/source/modules/preprocessing.rst delete mode 100644 docs/source/modules/standard_datasets.rst diff --git a/aif360/datasets/structured_dataset.py b/aif360/datasets/structured_dataset.py index c36e2308..3f03b94a 100644 --- a/aif360/datasets/structured_dataset.py +++ b/aif360/datasets/structured_dataset.py @@ -411,14 +411,25 @@ def import_dataset(self, import_metadata=False): return None def split(self, num_or_size_splits, shuffle=False, seed=None): - """Split the 
dataset into multiple datasets + """Split this dataset into multiple partitions. + Args: - num_or_size_splits (list or int): - shuffle (bool): - seed (int or array_like): takes the same argument as `numpy.random.seed()` - function + num_or_size_splits (array or int): If `num_or_size_splits` is an + int, *k*, the value is the number of equal-sized folds to make + (if *k* does not evenly divide the dataset these folds are + approximately equal-sized). If `num_or_size_splits` is an array + of type int, the values are taken as the indices at which to + split the dataset. If the values are floats (< 1.), they are + considered to be fractional proportions of the dataset at which + to split. + shuffle (bool, optional): Randomly shuffle the dataset before + splitting. + seed (int or array_like): Takes the same argument as + :func:`numpy.random.seed()`. + Returns: - list: Each element of this list is a dataset obtained during the split + list: Splits. Contains *k* or `len(num_or_size_splits) + 1` + datasets depending on `num_or_size_splits`. """ # Set seed diff --git a/aif360/sklearn/datasets/compas_dataset.py b/aif360/sklearn/datasets/compas_dataset.py index 76a0d9df..81578ef5 100644 --- a/aif360/sklearn/datasets/compas_dataset.py +++ b/aif360/sklearn/datasets/compas_dataset.py @@ -17,11 +17,11 @@ def fetch_compas(data_home=None, binary_race=False, dropcols=[], numeric_only=False, dropna=True): """Load the COMPAS Recidivism Risk Scores dataset. - Optionally binarizes 'race' to 'Caucasian' (privileged) or 'African-American' - (unprivileged). The other protected attribute is 'sex' ('Male' is - *unprivileged* and 'Female' is *privileged*). The outcome variable is - 'no recid.' (favorable) if the person was not accused of a crime within two - years or 'did recid.' (unfavorable) if they were. + Optionally binarizes 'race' to 'Caucasian' (privileged) or + 'African-American' (unprivileged). 
The other protected attribute is 'sex' + ('Male' is *unprivileged* and 'Female' is *privileged*). The outcome + variable is 'no recid.' (favorable) if the person was not accused of a crime + within two years or 'did recid.' (unfavorable) if they were. Args: data_home (string, optional): Specify another download and cache folder @@ -59,11 +59,14 @@ def fetch_compas(data_home=None, binary_race=False, for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']: df[col] = df[col].astype('category') - df.two_year_recid = df.two_year_recid.replace({0: 'no recid.', 1: 'did recid.'}).astype('category').cat.as_ordered() # 'did recid' < 'no recid' + # 'did recid' < 'no recid' + df.two_year_recid = df.two_year_recid.replace({0: 'no recid.', + 1: 'did recid.'}).astype('category').cat.as_ordered() if binary_race: + # 'African-American' < 'Caucasian' df.race = df.race.cat.set_categories(['African-American', 'Caucasian'], - ordered=True) # 'African-American' < 'Caucasian' + ordered=True) df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male' diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 6decfcb7..1bfa24e7 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -99,10 +99,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], Protected attributes are 'sex' ('male' is privileged and 'female' is unprivileged) and 'age' (binarized by default as recommended by - [#kamiran09]_: ``age >= 25`` is considered privileged and ``age < 25`` is - considered unprivileged; see the ``binary_age`` flag to keep this - continuous). The outcome variable is 'credit-risk': 'good' (favorable) or - 'bad' (unfavorable). + [#kamiran09]_: age >= 25 is considered privileged and age < 25 is considered + unprivileged; see the binary_age flag to keep this continuous). 
The outcome + variable is 'credit-risk': 'good' (favorable) or 'bad' (unfavorable). References: .. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without @@ -115,8 +114,8 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], for the datasets. By default all AIF360 datasets are stored in 'aif360/sklearn/data/raw' subfolders. binary_age (bool, optional): If ``True``, split protected attribute, - ``age``, into 'aged' (privileged) and 'youth' (unprivileged). The - ``age`` feature remains continuous. + 'age', into 'aged' (privileged) and 'youth' (unprivileged). The + 'age' feature remains continuous. usecols (single label or list-like, optional): Column name(s) to keep. All others are dropped. dropcols (single label or list-like, optional): Column name(s) to drop. diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index db88ea46..a39d5fb3 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -16,15 +16,15 @@ def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only', Args: labels (single label or list-like): Column labels to check. dropped_cols (set or pandas.Index): Columns that were already dropped. - name (str): Original arg that triggered the check (e.g. ``dropcols``). - dropped_by (str, optional): Original arg that caused ``dropped_cols`` - (e.g. ``numeric_only``). + name (str): Original arg that triggered the check (e.g. dropcols). + dropped_by (str, optional): Original arg that caused dropped_cols + (e.g. numeric_only). warn (bool, optional): If ``True``, produces a :class:`ColumnAlreadyDroppedWarning` if there are columns in the - intersection of ``dropped_cols`` and ``labels``. + intersection of dropped_cols and labels. Returns: - list: Columns in ``labels`` which are not in ``dropped_cols``. + list: Columns in labels which are not in dropped_cols.
""" if not is_list_like(labels): labels = [labels] diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py index 66014d6c..ca3de37d 100644 --- a/aif360/sklearn/inprocessing/adversarial_debiasing.py +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -85,11 +85,11 @@ def fit(self, X, y): given training data. Args: - X (array-like): Training samples. + X (pandas.DataFrame): Training samples. y (array-like): Training labels. Returns: - AdversarialDebiasing: self. + self """ X, y, _ = check_inputs(X, y) rng = check_random_state(self.random_state) @@ -268,7 +268,7 @@ def decision_function(self, X): """Soft prediction scores. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Confidence scores per (sample, class) combination. In @@ -309,7 +309,7 @@ def predict_proba(self, X): classes. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Returns the probability of the sample for each class @@ -328,7 +328,7 @@ def predict(self, X): """Predict class labels for the given samples. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Predicted class label per sample. diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index f100a012..7b954bf5 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -36,21 +36,21 @@ def difference(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, arbitrary metric. Note: The optimal value of a difference is 0. To make it a scorer, one must - take the absolute value and set ``greater_is_better`` to False. + take the absolute value and set greater_is_better to False. Unprivileged group is taken to be the inverse of the privileged group. Args: func (function): A metric function from :mod:`sklearn.metrics` or :mod:`aif360.sklearn.metrics.metrics`. 
- y (array-like): Outcome vector with protected attributes as index. - *args: Additional positional args to be passed through to ``func``. + y (pandas.Series): Outcome vector with protected attributes as index. + *args: Additional positional args to be passed through to func. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y`` are used. + ``None``, all protected attributes in y are used. priv_group (scalar, optional): The label of the privileged group. sample_weight (array-like, optional): Sample weights passed through to - ``func``. - **kwargs: Additional keyword args to be passed through to ``func``. + func. + **kwargs: Additional keyword args to be passed through to func. Returns: scalar: Difference in metric value for unprivileged and privileged @@ -85,14 +85,14 @@ def ratio(func, y, *args, prot_attr=None, priv_group=1, sample_weight=None, Args: func (function): A metric function from :mod:`sklearn.metrics` or :mod:`aif360.sklearn.metrics.metrics`. - y (array-like): Outcome vector with protected attributes as index. - *args: Additional positional args to be passed through to ``func``. + y (pandas.Series): Outcome vector with protected attributes as index. + *args: Additional positional args to be passed through to func. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y`` are used. + ``None``, all protected attributes in y are used. priv_group (scalar, optional): The label of the privileged group. sample_weight (array-like, optional): Sample weights passed through to - ``func``. - **kwargs: Additional keyword args to be passed through to ``func``. + func. + **kwargs: Additional keyword args to be passed through to func. Returns: scalar: Ratio of metric values for unprivileged and privileged groups. @@ -123,7 +123,7 @@ def make_difference_scorer(diff_func): :func:`statistical_parity_difference`). 
Since the optimal value of a difference metric is 0, this function takes the - absolute value and sets ``greater_is_better`` to ``False``. + absolute value and sets greater_is_better to ``False``. See also: :func:`~sklearn.metrics.make_scorer` @@ -214,7 +214,7 @@ def generalized_fpr(y_true, probas_pred, pos_label=1, sample_weight=None): Returns: float: Generalized false positive rate. If there are no negative samples - in ``y_true``, this will raise an + in y_true, this will raise an :class:`~sklearn.exceptions.UndefinedMetricWarning` and return 0. """ idx = (y_true != pos_label) @@ -241,7 +241,7 @@ def generalized_fnr(y_true, probas_pred, pos_label=1, sample_weight=None): Returns: float: Generalized false negative rate. If there are no positive samples - in ``y_true``, this will raise an + in y_true, this will raise an :class:`~sklearn.exceptions.UndefinedMetricWarning` and return 0. """ idx = (y_true == pos_label) @@ -264,16 +264,16 @@ def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, - Pr(\hat{Y} = \text{pos_label} | D = \text{privileged}) Note: - If only ``y_true`` is provided, this will return the difference in base + If only y_true is provided, this will return the difference in base rates (statistical parity difference of the original dataset). Args: - y_true (array-like): Ground truth (correct) target values. If ``y_pred`` + y_true (pandas.Series): Ground truth (correct) target values. If y_pred is provided, this is ignored. y_pred (array-like, optional): Estimated targets as returned by a classifier. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y_true`` are used. + ``None``, all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. 
@@ -294,16 +294,16 @@ def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1, {Pr(\hat{Y} = \text{pos_label} | D = \text{privileged})} Note: - If only ``y_true`` is provided, this will return the ratio of base rates + If only y_true is provided, this will return the ratio of base rates (disparate impact of the original dataset). Args: - y_true (array-like): Ground truth (correct) target values. If ``y_pred`` + y_true (pandas.Series): Ground truth (correct) target values. If y_pred is provided, this is ignored. y_pred (array-like, optional): Estimated targets as returned by a classifier. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y_true`` are used. + ``None``, all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. @@ -323,10 +323,10 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, privileged groups. A value of 0 indicates equality of opportunity. Args: - y_true (array-like): Ground truth (correct) target values. + y_true (pandas.Series): Ground truth (correct) target values. y_pred (array-like): Estimated targets as returned by a classifier. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y_true`` are used. + ``None``, all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. @@ -353,10 +353,10 @@ def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, A value of 0 indicates equality of odds. Args: - y_true (array-like): Ground truth (correct) target values. + y_true (pandas.Series): Ground truth (correct) target values. 
y_pred (array-like): Estimated targets as returned by a classifier. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y_true`` are used. + ``None``, all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. @@ -387,10 +387,10 @@ def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, A value of 0 indicates equality of odds. Args: - y_true (array-like): Ground truth (correct) target values. + y_true (pandas.Series): Ground truth (correct) target values. y_pred (array-like): Estimated targets as returned by a classifier. prot_attr (array-like, keyword-only): Protected attribute(s). If - ``None``, all protected attributes in ``y_true`` are used. + ``None``, all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. @@ -473,14 +473,14 @@ def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, generalized entropy index decomposes to. Args: - y_true (array-like): Ground truth (correct) target values. + y_true (pandas.Series): Ground truth (correct) target values. y_pred (array-like): Estimated targets as returned by a classifier. prot_attr (array-like, optional): Protected attribute(s). If ``None``, - all protected attributes in ``y_true`` are used. + all protected attributes in y_true are used. priv_group (scalar, optional): The label of the privileged group. If provided, the index will be computed between only the privileged and unprivileged groups. Otherwise, the index will be computed between - all groups defined by the ``prot_attr``. + all groups defined by the prot_attr. 
alpha (scalar, optional): Parameter that regulates the weight given to distances between values at different parts of the distribution. A value of 0 is equivalent to the mean log deviation, 1 is the Theil diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index dcb1d906..d4f782b0 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -48,7 +48,7 @@ def __init__(self, prot_attr=None): self.prot_attr = prot_attr def fit(self, X, y, sample_weight=None): - """Only ``fit_transform`` is allowed for this algorithm.""" + """Only :meth:`fit_transform` is allowed for this algorithm.""" self.fit_transform(X, y, sample_weight=sample_weight) return self @@ -57,15 +57,16 @@ def fit_transform(self, X, y, sample_weight=None): sample weights. Args: - X (array-like): Training samples. + X (pandas.DataFrame): Training samples. y (array-like): Training labels. sample_weight (array-like, optional): Sample weights. Returns: tuple: + Samples and their weights. - **X** -- Unchanged samples. - **sample_weight** -- Transformed sample weights. + * **X** -- Unchanged samples. + * **sample_weight** -- Transformed sample weights. """ X, y, sample_weight = check_inputs(X, y, sample_weight) @@ -120,12 +121,12 @@ def fit(self, X, y, sample_weight=None): samples. Args: - X (array-like): Training samples. + X (pandas.DataFrame): Training samples. y (array-like): Training labels. sample_weight (array-like, optional): Sample weights. Returns: - ReweighingMeta: self. + self """ if not has_fit_parameter(self.estimator, 'sample_weight'): raise TypeError("`estimator` (type: {}) does not have fit parameter" @@ -191,7 +192,7 @@ def score(self, X, y, sample_weight=None): Args: X (array-like): Test samples. - y (array-like): True labels for ``X``. + y (array-like): True labels for X. sample_weight (array-like, optional): Sample weights. 
Returns: diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index 28db1e61..13ad3820 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -1,14 +1,28 @@ import numpy as np +import pandas as pd from pandas.core.dtypes.common import is_list_like from sklearn.utils import check_consistent_length from sklearn.utils.validation import column_or_1d -def check_inputs(X, y, sample_weight=None): - if not hasattr(X, 'index'): - raise TypeError("Expected `DataFrame`, got {} instead.".format( - type(X).__name__)) - y = column_or_1d(y) +def check_inputs(X, y, sample_weight=None, ensure_2d=True): + """Input validation for debiasing algorithms. + + Checks all inputs for consistent length, validates shapes (optional for X), + and returns an array of all ones if sample_weight is ``None``. + + Args: + X (array-like): Input data. + y (array-like, shape = (n_samples,)): Target values. + sample_weight (array-like): Sample weights. + ensure_2d (bool, optional): Whether to raise a ValueError if X is not + 2D. + """ + if ensure_2d and X.ndim != 2: + raise ValueError("Expected X to be 2D, got ndim == {} instead.".format( + X.ndim)) + if not isinstance(y, pd.Series): # don't cast Series -> ndarray + y = column_or_1d(y) if sample_weight is not None: sample_weight = column_or_1d(sample_weight) else: @@ -17,13 +31,18 @@ def check_inputs(X, y, sample_weight=None): return X, y, sample_weight def check_groups(arr, prot_attr, ensure_binary=False): - """Validates ``arr`` and returns ``groups`` and ``prot_attr``. + """Get groups from the index of arr. + + If there are multiple protected attributes provided, the index is flattened + to be a 1-D Index of tuples. If ensure_binary is ``True``, raises a + ValueError if there are not exactly two unique groups. Also checks that all + provided protected attributes are in the index. Args: arr (`pandas.Series` or `pandas.DataFrame`): A Pandas object containing protected attribute information in the index. 
prot_attr (single label or list-like): Protected attribute(s). If - ``None``, all protected attributes in ``arr`` are used. + ``None``, all protected attributes in arr are used. ensure_binary (bool): Raise an error if the resultant groups are not binary. @@ -31,7 +50,7 @@ def check_groups(arr, prot_attr, ensure_binary=False): tuple: * **groups** (`pandas.Index`) -- Label (or tuple of labels) of - protected attribute for each sample in ``arr``. + protected attribute for each sample in arr. * **prot_attr** (list-like) -- Modified input. If input is a single label, returns single-item list. If input is ``None`` returns list of all protected attributes. diff --git a/docs/source/conf.py b/docs/source/conf.py index 0f850880..b6695bc9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -59,7 +59,7 @@ autodoc_default_options = { 'members': True, - 'inherited-members': True + # 'inherited-members': True } # The suffix(es) of source filenames. @@ -102,7 +102,7 @@ default_role = 'literal' # If true, '()' will be appended to :func: etc. cross-reference text. -add_function_parentheses = False +add_function_parentheses = True # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' diff --git a/docs/source/index.rst b/docs/source/index.rst index 37ba7078..532c0eb5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,17 +3,21 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to AI Fairness 360's documentation! -=========================================== +AI Fairness 360 documentation +============================= .. toctree:: - :maxdepth: 3 + :maxdepth: 2 :caption: Modules modules/algorithms modules/datasets modules/explainers modules/metrics + +.. 
toctree:: + :maxdepth: 3 + modules/sklearn diff --git a/docs/source/modules/algorithms.rst b/docs/source/modules/algorithms.rst index 16d70441..6842064c 100644 --- a/docs/source/modules/algorithms.rst +++ b/docs/source/modules/algorithms.rst @@ -1,25 +1,71 @@ -:mod:`aif360.algorithms` -======================== +========== +Algorithms +========== -.. automodule:: aif360.algorithms +:mod:`aif360.algorithms.preprocessing` +====================================== + +.. automodule:: aif360.algorithms.preprocessing + :no-members: + :no-inherited-members: + +.. currentmodule:: aif360 + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + algorithms.preprocessing.DisparateImpactRemover + algorithms.preprocessing.LFR + algorithms.preprocessing.OptimPreproc + algorithms.preprocessing.Reweighing + +:mod:`aif360.algorithms.inprocessing` +===================================== + +.. automodule:: aif360.algorithms.inprocessing + :no-members: + :no-inherited-members: -.. toctree:: - :maxdepth: 2 +.. currentmodule:: aif360 - preprocessing +.. autosummary:: + :toctree: generated/ + :template: class.rst -.. toctree:: - :maxdepth: 2 + algorithms.inprocessing.AdversarialDebiasing + algorithms.inprocessing.ARTClassifier + algorithms.inprocessing.MetaFairClassifier + algorithms.inprocessing.PrejudiceRemover - inprocessing +:mod:`aif360.algorithms.postprocessing` +======================================= -.. toctree:: - :maxdepth: 2 +.. automodule:: aif360.algorithms.postprocessing + :no-members: + :no-inherited-members: + +.. currentmodule:: aif360 + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + algorithms.postprocessing.CalibratedEqOddsPostprocessing + algorithms.postprocessing.EqOddsPostprocessing + algorithms.postprocessing.RejectOptionClassification + +:mod:`aif360.algorithms` +======================== + +.. automodule:: aif360.algorithms + :no-members: + :no-inherited-members: - postprocessing +.. currentmodule:: aif360 -Base Class ----------- +.. 
autosummary:: + :toctree: generated/ + :template: class.rst -.. autoclass:: Transformer - :members: + algorithms.Transformer diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index 4ef46368..140ef25b 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -1,30 +1,38 @@ +======== +Datasets +======== + :mod:`aif360.datasets` ====================== .. automodule:: aif360.datasets + :no-members: + :no-inherited-members: -Base Class ----------- - -.. autoclass:: Dataset - :members: +Base classes +------------ -Structured Dataset ------------------- +.. currentmodule:: aif360 -.. autoclass:: StructuredDataset - :members: +.. autosummary:: + :toctree: generated/ + :template: class.rst -Binary Label Dataset --------------------- + datasets.Dataset + datasets.StructuredDataset + datasets.BinaryLabelDataset + datasets.StandardDataset -.. autoclass:: BinaryLabelDataset - :members: +Common datasets +--------------- -Standard Datasets ------------------ +.. currentmodule:: aif360 -.. toctree:: - :maxdepth: 2 +.. autosummary:: + :toctree: generated/ + :template: class.rst - standard_datasets + datasets.AdultDataset + datasets.BankDataset + datasets.CompasDataset + datasets.GermanDataset diff --git a/docs/source/modules/explainers.rst b/docs/source/modules/explainers.rst index c0053e81..f5e14832 100644 --- a/docs/source/modules/explainers.rst +++ b/docs/source/modules/explainers.rst @@ -1,16 +1,19 @@ +========== +Explainers +========== + :mod:`aif360.explainers` ======================== .. automodule:: aif360.explainers + :no-members: + :no-inherited-members: -Metric Text Explainer ---------------------- - -.. autoclass:: MetricTextExplainer - :members: +.. currentmodule:: aif360 -Metric JSON Explainer ---------------------- +.. autosummary:: + :toctree: generated/ + :template: class.rst -.. 
autoclass:: MetricJSONExplainer - :members: + explainers.MetricTextExplainer + explainers.MetricJSONExplainer diff --git a/docs/source/modules/inprocessing.rst b/docs/source/modules/inprocessing.rst deleted file mode 100644 index 1ae7ce3d..00000000 --- a/docs/source/modules/inprocessing.rst +++ /dev/null @@ -1,35 +0,0 @@ -:mod:`aif360.algorithms.inprocessing` -===================================== - -.. automodule:: aif360.algorithms.inprocessing - -Adversarial Debiasing ---------------------- - -.. autoclass:: AdversarialDebiasing - :members: - :inherited-members: - :exclude-members: transform, fit_transform - -ART Classifier --------------- - -.. autoclass:: ARTClassifier - :members: - :inherited-members: - :exclude-members: transform, fit_transform - -Meta Fair Classifier --------------------- -.. autoclass:: MetaFairClassifier - :members: - :inherited-members: - :exclude-members: transform, fit_transform - -Prejudice Remover ------------------ - -.. autoclass:: PrejudiceRemover - :members: - :inherited-members: - :exclude-members: transform, fit_transform diff --git a/docs/source/modules/metrics.rst b/docs/source/modules/metrics.rst index ea5171f0..4be7ed49 100644 --- a/docs/source/modules/metrics.rst +++ b/docs/source/modules/metrics.rst @@ -1,36 +1,41 @@ +================ +Fairness Metrics +================ + :mod:`aif360.metrics` ===================== .. automodule:: aif360.metrics + :no-members: + :no-inherited-members: -Dataset Metric --------------- - -.. autoclass:: DatasetMetric - :members: - :exclude-members: difference, ratio - -Binary Label Dataset Metric ---------------------------- +.. currentmodule:: aif360 -.. autoclass:: BinaryLabelDatasetMetric - :members: +.. autosummary:: + :toctree: generated/ + :template: class.rst -Classification Metric ---------------------- + metrics.DatasetMetric + metrics.BinaryLabelDatasetMetric + metrics.ClassificationMetric + metrics.SampleDistortionMetric -.. 
autoclass:: ClassificationMetric - :private-members: - :members: +:mod:`aif360.metrics.utils` +=========================== -Sample Distortion Metric ------------------------- +.. automodule:: aif360.metrics.utils + :no-members: + :no-inherited-members: -.. autoclass:: SampleDistortionMetric - :members: +.. currentmodule:: aif360 -Utility Functions ------------------ +.. autosummary:: + :toctree: generated/ + :template: base.rst -.. automodule:: aif360.metrics.utils - :members: + metrics.utils.compute_boolean_conditioning_vector + metrics.utils.compute_num_instances + metrics.utils.compute_num_pos_neg + metrics.utils.compute_num_TF_PN + metrics.utils.compute_num_gen_TF_PN + metrics.utils.compute_distance \ No newline at end of file diff --git a/docs/source/modules/postprocessing.rst b/docs/source/modules/postprocessing.rst deleted file mode 100644 index 18b924db..00000000 --- a/docs/source/modules/postprocessing.rst +++ /dev/null @@ -1,22 +0,0 @@ -:mod:`aif360.algorithms.postprocessing` -======================================= - -.. automodule:: aif360.algorithms.postprocessing - -Calibrated Equality of Odds ---------------------------- - -.. autoclass:: CalibratedEqOddsPostprocessing - :members: - -Equality of Odds ----------------- - -.. autoclass:: EqOddsPostprocessing - :members: - -Reject Option Classification ----------------------------- - -.. autoclass:: RejectOptionClassification - :members: diff --git a/docs/source/modules/preprocessing.rst b/docs/source/modules/preprocessing.rst deleted file mode 100644 index a99006f0..00000000 --- a/docs/source/modules/preprocessing.rst +++ /dev/null @@ -1,30 +0,0 @@ -:mod:`aif360.algorithms.preprocessing` -====================================== - -.. automodule:: aif360.algorithms.preprocessing - -Disparate Impact Remover ------------------------- - -.. autoclass:: DisparateImpactRemover - :members: - -Learning Fair Representations ------------------------------ - -.. 
autoclass:: LFR - :members: - -Optimized Preprocessing ------------------------ - -.. autoclass:: OptimPreproc - :members: - -Reweighing ----------- - -.. autoclass:: Reweighing - :members: - :inherited-members: - :exclude-members: predict, fit_predict diff --git a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst index 757c0ef8..a61283de 100644 --- a/docs/source/modules/sklearn.rst +++ b/docs/source/modules/sklearn.rst @@ -4,7 +4,7 @@ This is the class and function reference for the `scikit-learn`-compatible version of the AIF360 API. It is functionally equivalent to the normal API but -it uses scikit-learn paradigms (where possible) and Pandas `DataFrames` for +it uses scikit-learn paradigms (where possible) and :class:`pandas.DataFrame` for datasets. Not all functionality from AIF360 is supported yet. See `Getting Started `_ for a demo of the capabilities. @@ -27,14 +27,12 @@ Utils .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: datasets.ColumnAlreadyDroppedWarning .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: datasets.check_already_dropped datasets.standardize_dataset @@ -46,7 +44,6 @@ Loaders .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: datasets.fetch_adult datasets.fetch_german @@ -67,7 +64,6 @@ Meta-metrics .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: metrics.difference metrics.ratio @@ -79,7 +75,6 @@ Scorers .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: metrics.make_difference_scorer metrics.make_ratio_scorer @@ -91,7 +86,6 @@ Generic metrics .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: metrics.specificity_score metrics.sensitivity_score @@ -107,7 +101,6 @@ Group fairness metrics .. autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: metrics.statistical_parity_difference metrics.mean_difference @@ -124,7 +117,6 @@ Individual fairness metrics .. 
autosummary:: :toctree: generated/ :template: base.rst - :nosignatures: metrics.generalized_entropy_index metrics.generalized_entropy_error @@ -132,7 +124,7 @@ Individual fairness metrics metrics.coefficient_of_variation metrics.consistency_score -:mod:`aif360.sklearn.preprocessing`: Pre-processing Algorithms +:mod:`aif360.sklearn.preprocessing`: Pre-processing algorithms ============================================================== .. automodule:: aif360.sklearn.preprocessing @@ -146,7 +138,6 @@ Pre-processors .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: preprocessing.Reweighing @@ -157,11 +148,10 @@ Meta-Estimator .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: preprocessing.ReweighingMeta -:mod:`aif360.sklearn.inprocessing`: In-processing Algorithms +:mod:`aif360.sklearn.inprocessing`: In-processing algorithms ============================================================ .. automodule:: aif360.sklearn.inprocessing @@ -175,11 +165,10 @@ In-processors .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: inprocessing.AdversarialDebiasing -:mod:`aif360.sklearn.postprocessing`: Post-processing Algorithms +:mod:`aif360.sklearn.postprocessing`: Post-processing algorithms ================================================================ .. automodule:: aif360.sklearn.postprocessing @@ -193,7 +182,6 @@ Post-processors .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: postprocessing.CalibratedEqualizedOdds @@ -204,6 +192,24 @@ Meta-Estimator .. autosummary:: :toctree: generated/ :template: class.rst - :nosignatures: - postprocessing.PostProcessingMeta \ No newline at end of file + postprocessing.PostProcessingMeta + +:mod:`aif360.sklearn.utils`: Utility functions +============================================== + +.. automodule:: aif360.sklearn.utils + :no-members: + :no-inherited-members: + +Validation +---------- + +.. currentmodule:: aif360.sklearn + +.. 
autosummary:: + :toctree: generated/ + :template: base.rst + + utils.check_inputs + utils.check_groups \ No newline at end of file diff --git a/docs/source/modules/standard_datasets.rst b/docs/source/modules/standard_datasets.rst deleted file mode 100644 index 3f6f5622..00000000 --- a/docs/source/modules/standard_datasets.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. module:: aif360.datasets - :noindex: - -Base Class ----------- - -.. autoclass:: StandardDataset - :members: - -Adult Dataset -------------- - -.. autoclass:: AdultDataset - :members: - -Bank Dataset ------------- - -.. autoclass:: BankDataset - :members: - -Compas Dataset --------------- - -.. autoclass:: CompasDataset - :members: - -German Dataset --------------- - -.. autoclass:: GermanDataset - :members: diff --git a/docs/source/static/style.css b/docs/source/static/style.css index 460fdebc..db0cc5f8 100644 --- a/docs/source/static/style.css +++ b/docs/source/static/style.css @@ -1,6 +1,6 @@ -/* .wy-nav-content { - max-width: 1000px !important; -} */ +.wy-nav-content { + max-width: 900px !important; +} /* override table width restrictions */ .wy-table-responsive table td, .wy-table-responsive table th { From 8d108935cb2761be33b79dbdf9c430c46ec3bb60 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 19 Dec 2019 12:03:24 -0500 Subject: [PATCH 49/61] postprocessor takes DataFrame if use_proba added additional tests to check this --- .../calibrated_eq_odds_postprocessing.py | 10 ++-- aif360/sklearn/postprocessing/__init__.py | 38 +++++++-------- .../calibrated_equalized_odds.py | 46 +++++++++---------- .../sklearn/test_calibrated_equalized_odds.py | 46 ++++++++++++------- 4 files changed, 77 insertions(+), 63 deletions(-) diff --git a/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py b/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py index 4bae2ed9..ba240d72 100644 --- a/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py +++ 
b/aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py @@ -171,16 +171,16 @@ def predict(self, dataset, threshold=0.5): dataset.protected_attribute_names, self.unprivileged_groups) - priv_indices = (np.random.random(sum(cond_vec_priv)) - <= self.priv_mix_rate) - priv_new_pred = dataset.scores[cond_vec_priv].copy() - priv_new_pred[priv_indices] = self.base_rate_priv - unpriv_indices = (np.random.random(sum(cond_vec_unpriv)) <= self.unpriv_mix_rate) unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy() unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv + priv_indices = (np.random.random(sum(cond_vec_priv)) + <= self.priv_mix_rate) + priv_new_pred = dataset.scores[cond_vec_priv].copy() + priv_new_pred[priv_indices] = self.base_rate_priv + dataset_new = dataset.copy(deepcopy=True) dataset_new.scores = np.zeros_like(dataset.scores, dtype=np.float64) diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py index acc63020..d1b0465f 100644 --- a/aif360/sklearn/postprocessing/__init__.py +++ b/aif360/sklearn/postprocessing/__init__.py @@ -26,8 +26,8 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): be used as the input to this meta-estimator not the other way around. Attributes: - estimator_: Cloned ``estimator``. - postprocessor_: Cloned ``postprocessor``. + estimator_: Fitted estimator. + postprocessor_: Fitted postprocessor. use_proba_ (bool): Determined depending on the postprocessor type if `use_proba` is None. """ @@ -49,7 +49,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), **options: Keyword options passed through to :func:`~sklearn.model_selection.train_test_split`. Note: 'train_size' and 'test_size' will be ignored in favor of - ``val_size``. + 'val_size'. """ self.estimator = estimator self.postprocessor = postprocessor @@ -70,14 +70,14 @@ def fit(self, X, y, sample_weight=None, **fit_params): Args: X (array-like): Training samples. 
- y (array-like): Training labels. + y (pandas.Series): Training labels. sample_weight (array-like, optional): Sample weights. - **fit_params: Parameters passed to the post-processor ``fit`` + **fit_params: Parameters passed to the post-processor ``fit()`` method. Note: these do not need to be prefixed with ``__`` notation. Returns: - PostProcessingMeta: self. + self """ self.use_proba_ = (self.use_proba if self.use_proba is not None else isinstance(self.postprocessor, CalibratedEqualizedOdds)) @@ -115,26 +115,26 @@ def fit(self, X, y, sample_weight=None, **fit_params): def predict(self, X): """Predict class labels for the given samples. - First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if ``self.use_proba_`` is ``True``) then returns the post-processed output from those predictions. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Predicted class label per sample. """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else self.estimator_.predict_proba(X)) - y_pred = pd.Series(y_pred, index=X.index) + y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict(y_pred) @if_delegate_has_method('postprocessor_') def predict_proba(self, X): """Probability estimates. - First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if ``self.use_proba_`` is ``True``) then returns the post-processed output from those predictions. @@ -142,7 +142,7 @@ def predict_proba(self, X): classes. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. 
Returns: numpy.ndarray: Returns the probability of the sample for each class @@ -151,14 +151,14 @@ def predict_proba(self, X): """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else self.estimator_.predict_proba(X)) - y_pred = pd.Series(y_pred, index=X.index) + y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict_proba(y_pred) @if_delegate_has_method('postprocessor_') def predict_log_proba(self, X): """Log of probability estimates. - First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if ``self.use_proba_`` is ``True``) then returns the post-processed output from those predictions. @@ -166,7 +166,7 @@ def predict_log_proba(self, X): classes. Args: - X (array-like): Test samples. + X (pandas.DataFrame): Test samples. Returns: array: Returns the log-probability of the sample for each class in @@ -175,7 +175,7 @@ def predict_log_proba(self, X): """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else self.estimator_.predict_proba(X)) - y_pred = pd.Series(y_pred, index=X.index) + y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict_log_proba(y_pred) @if_delegate_has_method('postprocessor_') @@ -183,13 +183,13 @@ def score(self, X, y, sample_weight=None): """Returns the output of the post-processor's score function on the given test data and labels. - First, runs ``self.estimator_.predict`` (or ``predict_proba`` if + First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if ``self.use_proba_`` is ``True``) then gets the post-processed output from those predictions and scores it. Args: - X (array-like): Test samples. - y (array-like): True labels for ``X``. + X (pandas.DataFrame): Test samples. + y (array-like): True labels for X. sample_weight (array-like, optional): Sample weights. 
Returns: @@ -197,7 +197,7 @@ def score(self, X, y, sample_weight=None): """ y_pred = (self.estimator_.predict(X) if not self.use_proba_ else self.estimator_.predict_proba(X)) - y_pred = pd.Series(y_pred, index=X.index) + y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight) diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py index 088a84a3..a648d13b 100644 --- a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -5,7 +5,7 @@ from aif360.sklearn.metrics import difference, base_rate from aif360.sklearn.metrics import generalized_fnr, generalized_fpr -from aif360.sklearn.utils import check_groups +from aif360.sklearn.utils import check_inputs, check_groups class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): @@ -16,9 +16,9 @@ class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): change output labels with an equalized odds objective [#pleiss17]_. Note: - This breaks the sckit-learn API by requiring fit params ``y_true``, - ``y_pred``, and ``pos_label`` and predict param ``y_pred``. See - :class:`PostProcessingMeta` for a workaround. + This breaks the scikit-learn API by requiring fit params y_true, y_pred, + and pos_label and predict param y_pred. See :class:`PostProcessingMeta` + for a workaround. References: .. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and @@ -85,17 +85,20 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None): Args: y_pred (array-like): Probability estimates of the targets as returned by a ``predict_proba()`` call or equivalent. - y_true (array-like): Ground-truth (correct) target values. + y_true (pandas.Series): Ground-truth (correct) target values. labels (list, optional): The ordered set of labels values.
Must - match the order of columns in ``y_pred`` if provided. By - default, all labels in ``y_true`` are used in sorted order. + match the order of columns in y_pred if provided. By default, + all labels in y_true are used in sorted order. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. Returns: - CalibratedEqualizedOdds: self. + self """ - groups, self.prot_attr_ = check_groups(y_true, self.prot_attr) + y_pred, y_true, sample_weight = check_inputs(y_pred, y_true, + sample_weight) + groups, self.prot_attr_ = check_groups(y_true, self.prot_attr, + ensure_binary=True) self.classes_ = labels if labels is not None else np.unique(y_true) self.groups_ = np.unique(groups) self.pos_label_ = pos_label @@ -107,11 +110,6 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None): raise ValueError('pos_label={} is not in the set of labels. The ' 'valid values are:\n{}'.format(pos_label, self.classes_)) - if len(self.groups_) != 2: - raise ValueError('prot_attr={}\nyielded {} groups:\n{}\nbut this ' - 'algorithm requires a binary division of the data.'.format( - self.prot_attr_, len(self.groups_), self.groups_)) - y_pred = y_pred[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]] # local function to return corresponding args for metric evaluation @@ -119,8 +117,7 @@ def _args(grp_idx, triv=False): idx = (groups == self.groups_[grp_idx]) pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else y_pred) - return [y_true[idx], pred[idx], pos_label, - sample_weight[idx] if sample_weight is not None else None] + return [y_true[idx], pred[idx], pos_label, sample_weight[idx]] self.base_rates_ = [base_rate(*_args(i)) for i in range(2)] @@ -138,8 +135,9 @@ def predict_proba(self, y_pred): classes. Args: - y_pred (array-like): Probability estimates of the targets as - returned by a ``predict_proba()`` call or equivalent. 
+ y_pred (pandas.DataFrame): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. Note: must + include protected attributes in the index. Returns: numpy.ndarray: Returns the probability of the sample for each class @@ -156,7 +154,7 @@ def predict_proba(self, y_pred): np.unique(groups), self.groups_)) pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0] - y_pred = y_pred[:, pos_idx] + y_pred = y_pred.iloc[:, pos_idx] yt = np.empty_like(y_pred) for grp_idx in range(2): @@ -172,8 +170,9 @@ def predict(self, y_pred): """Predict class labels for the given scores. Args: - y_pred (array-like): Probability estimates of the targets as - returned by a ``predict_proba()`` call or equivalent. + y_pred (pandas.DataFrame): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. Note: must + include protected attributes in the index. Returns: numpy.ndarray: Predicted class label per sample. @@ -185,8 +184,9 @@ def score(self, y_pred, y_true, sample_weight=None): """Score the predictions according to the cost constraint specified. Args: - y_pred (array-like): Probability estimates of the targets as - returned by a ``predict_proba()`` call or equivalent. + y_pred (pandas.DataFrame): Probability estimates of the targets as + returned by a ``predict_proba()`` call or equivalent. Note: must + include protected attributes in the index. y_true (array-like): Ground-truth (correct) target values. sample_weight (array-like, optional): Sample weights. 
diff --git a/tests/sklearn/test_calibrated_equalized_odds.py b/tests/sklearn/test_calibrated_equalized_odds.py index 1cba4391..3352b548 100644 --- a/tests/sklearn/test_calibrated_equalized_odds.py +++ b/tests/sklearn/test_calibrated_equalized_odds.py @@ -13,7 +13,7 @@ features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], features_to_drop=[]) -def test_calib_eq_odds_sex(): +def test_calib_eq_odds_sex_weighted(): logreg = LogisticRegression(solver='lbfgs', max_iter=500) y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X) adult_pred = adult.copy() @@ -27,31 +27,45 @@ def test_calib_eq_odds_sex(): assert np.isclose(orig_cal_eq_odds.priv_mix_rate, cal_eq_odds.mix_rates_[1]) assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0]) -def test_split(): - adult_est, adult_post = adult.split([0.75], shuffle=False) - X_est, X_post, y_est, y_post = train_test_split(X, y, shuffle=False) +def test_postprocessingmeta_fnr(): + adult_train, adult_test = adult.split([0.9], shuffle=False) + X_tr, X_te, y_tr, _, sw_tr, _ = train_test_split(X, y, sample_weight, + train_size=0.9, shuffle=False) - assert np.all(adult_est.features == X_est) - assert np.all(adult_est.labels.ravel() == y_est) - assert np.all(adult_post.features == X_post) - assert np.all(adult_post.labels.ravel() == y_post) + assert np.all(adult_train.features == X_tr) + assert np.all(adult_test.features == X_te) + assert np.all(adult_train.labels.ravel() == y_tr) + + adult_est, adult_post = adult_train.split([0.75], shuffle=False) -def test_postprocessingmeta(): logreg = LogisticRegression(solver='lbfgs', max_iter=500) + logreg.fit(adult_est.features, adult_est.labels.ravel(), + sample_weight=adult_est.instance_weights) + probas_pred = logreg.predict_proba(adult_post.features)[:, 1] - adult_est, adult_post = adult.split([0.75], shuffle=False) - logreg.fit(adult_est.features, adult_est.labels.ravel()) - y_pred = 
logreg.predict_proba(adult_post.features)[:, 1] adult_pred = adult_post.copy() - adult_pred.scores = y_pred + adult_pred.scores = probas_pred + orig_cal_eq_odds = CalibratedEqOddsPostprocessing( - unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) + unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}], + cost_constraint='fnr', seed=0) orig_cal_eq_odds.fit(adult_post, adult_pred) cal_eq_odds = PostProcessingMeta(estimator=logreg, - postprocessor=CalibratedEqualizedOdds('sex'), shuffle=False) - cal_eq_odds.fit(X, y, sample_weight=sample_weight) + postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=0), + shuffle=False) + cal_eq_odds.fit(X_tr, y_tr, sample_weight=sw_tr) + + assert np.allclose(logreg.coef_, cal_eq_odds.estimator_.coef_) assert np.allclose([orig_cal_eq_odds.unpriv_mix_rate, orig_cal_eq_odds.priv_mix_rate], cal_eq_odds.postprocessor_.mix_rates_) + + adult_test_pred = adult_test.copy() + adult_test_pred.scores = logreg.predict_proba(adult_test.features)[:, 1] + adult_test_pred = orig_cal_eq_odds.predict(adult_test_pred) + + y_test_pred = cal_eq_odds.predict_proba(X_te) + + assert np.allclose(adult_test_pred.scores, y_test_pred[:, 1]) From e0ff2b66b5aa4b866a15ccfe51ad57816e85d704 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 19 Dec 2019 13:04:31 -0500 Subject: [PATCH 50/61] readme changes overwritten in the merge --- aif360/sklearn/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index 98497eb9..f895d171 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -28,18 +28,18 @@ objects with sample properties (protected attributes) as the index - [x] Adversarial debiasing - [ ] **[External]** `get_feature_names()` from data preprocessing steps that would remove DataFrame formatting - - [ ] SLEP007/8 + - [ ] 
[SLEP007](https://github.com/scikit-learn/enhancement_proposals/pull/17)/[SLEP008](https://github.com/scikit-learn/enhancement_proposals/pull/18) - feature names - [ ] Prejudice remover - [ ] Meta-fair classifier - [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s - [ ] **[External]** Add functionality to modify X and y - - [ ] [SLEP001](https://github.com/scikit-learn/enhancement_proposals/blob/master/slep001/proposal.rst) + - [ ] [SLEP005](https://github.com/scikit-learn/enhancement_proposals/pull/15) - Resampler API (see discussion; meta-estimator workaround may be enough) - [ ] Disparate impact remover - [ ] Learning fair representations - [ ] Optimized preprocessing - [X] Reweighing - [X] Meta-estimator workaround - - [ ] **[External]** SLEP006 - Sample properties + - [ ] **[External]** [SLEP006](https://github.com/scikit-learn/enhancement_proposals/pull/16) - Sample properties (meta-estimator works but would be very nice to have) - [ ] Make postprocessing algorithms compatible - [x] Calibrated equalized odds postprocessing - [x] Meta-estimator workaround again From a2cd77ee4369f7b70de153edd96c678a67070e1a Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 19 Dec 2019 14:56:37 -0500 Subject: [PATCH 51/61] train, test were swapped for adult --- examples/demo_disparate_impact_remover.ipynb | 2 +- tests/test_differential_fairness.py | 4 ++-- tests/test_disparate_impact_remover.py | 2 +- tests/test_meta_classifier.py | 2 +- tests/test_standard_datasets.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/demo_disparate_impact_remover.ipynb b/examples/demo_disparate_impact_remover.ipynb index 6d6a8bbd..ba5948fd 100644 --- a/examples/demo_disparate_impact_remover.ipynb +++ b/examples/demo_disparate_impact_remover.ipynb @@ -64,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "train, test = ad.split([32561])\n", + "test, train = ad.split([16281])\n", "train.features = 
scaler.fit_transform(train.features)\n", "test.features = scaler.fit_transform(test.features)\n", "\n", diff --git a/tests/test_differential_fairness.py b/tests/test_differential_fairness.py index 26d1a489..b586c1d5 100644 --- a/tests/test_differential_fairness.py +++ b/tests/test_differential_fairness.py @@ -10,7 +10,7 @@ categorical_features=['workclass', 'education', 'marital-status', 'occupation', 'relationship'], custom_preprocessing=lambda df: df.fillna('Unknown')) -adult_train, adult_test = ad.split([32561], shuffle=False) +adult_test, adult_train = ad.split([16281], shuffle=False) scaler = StandardScaler() X = scaler.fit_transform(adult_train.features) @@ -58,7 +58,7 @@ def custom_preprocessing(df): nonbinary_ad.features = np.delete(nonbinary_ad.features, index, axis=1) nonbinary_ad.feature_names = np.delete(nonbinary_ad.feature_names, index) - _, nonbinary_test = nonbinary_ad.split([32561], shuffle=False) + nonbinary_test, _ = nonbinary_ad.split([16281], shuffle=False) dataset_metric = BinaryLabelDatasetMetric(nonbinary_test) eps_data = dataset_metric.smoothed_empirical_differential_fairness() assert eps_data == 2.063813731996515 # verified with reference implementation diff --git a/tests/test_disparate_impact_remover.py b/tests/test_disparate_impact_remover.py index 3f74ff0d..0b3d4973 100644 --- a/tests/test_disparate_impact_remover.py +++ b/tests/test_disparate_impact_remover.py @@ -28,7 +28,7 @@ def test_adult(): scaler = MinMaxScaler(copy=False) # ad.features = scaler.fit_transform(ad.features) - train, test = ad.split([32561]) + test, train = ad.split([16281]) assert np.any(test.labels) train.features = scaler.fit_transform(train.features) diff --git a/tests/test_meta_classifier.py b/tests/test_meta_classifier.py index 8195650e..45931cdd 100644 --- a/tests/test_meta_classifier.py +++ b/tests/test_meta_classifier.py @@ -19,7 +19,7 @@ def test_adult(): #scaler = MinMaxScaler(copy=False) # ad.features = scaler.fit_transform(ad.features) - train, test = 
ad.split([32561]) + test, train = ad.split([16281]) biased_model = MetaFairClassifier(tau=0, sensitive_attr=protected) biased_model.fit(train) diff --git a/tests/test_standard_datasets.py b/tests/test_standard_datasets.py index 3aa7b984..0f09d0aa 100644 --- a/tests/test_standard_datasets.py +++ b/tests/test_standard_datasets.py @@ -24,8 +24,8 @@ def test_german(): def test_adult_test_set(): ad = AdultDataset() - # train, test = ad.split([32561]) - train, test = ad.split([30162]) + # test, train = ad.split([16281]) + test, train = ad.split([15060]) assert np.any(test.labels) def test_adult(): From ee7f23c90697095628934814f0d37bb582a1c40a Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 19 Dec 2019 15:20:23 -0500 Subject: [PATCH 52/61] remove branch mentions --- .travis.yml | 4 ++-- aif360/sklearn/README.md | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index a9c99eda..9aa44262 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ env: branches: only: - - sklearn-compat + - master install: - pip install -r requirements.txt @@ -28,4 +28,4 @@ before_script: script: # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics - - travis_wait python -m pytest tests/sklearn + - travis_wait pytest tests diff --git a/aif360/sklearn/README.md b/aif360/sklearn/README.md index f895d171..0bc7f189 100644 --- a/aif360/sklearn/README.md +++ b/aif360/sklearn/README.md @@ -1,7 +1,5 @@ ## `aif360.sklearn` -[![Build Status](https://travis-ci.org/IBM/AIF360.svg?branch=sklearn-compat)](https://travis-ci.org/IBM/AIF360) - This is a wholly separate interface for interacting with data, viewing metrics, and running debiasing algorithms than the main AIF360 package. 
The purpose of this sub-package is to match scikit-learn paradigms/APIs for easier integration From c8154ec0969896cc6d9889c1d2bebf8161d76f95 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 20 Dec 2019 10:47:34 -0500 Subject: [PATCH 53/61] remove "attributes" line if none present --- docs/source/templates/class.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/templates/class.rst b/docs/source/templates/class.rst index 5f46cabb..9ce9f9e6 100644 --- a/docs/source/templates/class.rst +++ b/docs/source/templates/class.rst @@ -20,10 +20,12 @@ {% endblock %} {% block attributes %} + {% if attributes %} .. rubric:: Attributes .. autosummary:: {% for item in attributes %} ~{{ name }}.{{ item }} {%- endfor %} + {%- endif %} {% endblock %} From 7ef94e71ccba3ee16da718d1465dc035245d7785 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 27 Dec 2019 19:06:14 -0500 Subject: [PATCH 54/61] moved example to main folder --- docs/source/modules/sklearn.rst | 2 +- examples/README.md | 7 +++++-- .../sklearn/demo_new_features.ipynb | 0 3 files changed, 6 insertions(+), 3 deletions(-) rename aif360/sklearn/examples/Getting Started.ipynb => examples/sklearn/demo_new_features.ipynb (100%) diff --git a/docs/source/modules/sklearn.rst b/docs/source/modules/sklearn.rst index a61283de..1401e2eb 100644 --- a/docs/source/modules/sklearn.rst +++ b/docs/source/modules/sklearn.rst @@ -6,7 +6,7 @@ This is the class and function reference for the `scikit-learn`-compatible version of the AIF360 API. It is functionally equivalent to the normal API but it uses scikit-learn paradigms (where possible) and :class:`pandas.DataFrame` for datasets. Not all functionality from AIF360 is supported yet. See -`Getting Started `_ +`Getting Started `_ for a demo of the capabilities. Note: This is under active development. 
Visit our diff --git a/examples/README.md b/examples/README.md index a9bc4f59..026e0ade 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,7 +1,7 @@ # AI Fairness 360 Examples (Tutorials and Demos) This directory contains a diverse collection of jupyter notebooks that use [AI Fairness 360](http://aif360.mybluemix.net/) in various ways. -Both tutorials and demos illustrate working code using AIF360. Tutorials provide additional discussion that walks +Both tutorials and demos illustrate working code using AIF360. Tutorials provide additional discussion that walks the user through the various steps of the notebook. ## Tutorials @@ -13,6 +13,9 @@ Data from the Medical Expenditure Panel Survey ([2015](https://meps.ahrq.gov/mep ## Demos Below is a list of additional notebooks that demonstrate the use of AIF360. +**NEW:** [sklearn/demo_new_features.ipynb](sklearn/demo_new_features.ipynb): highlights the +features of the new `scikit-learn`-compatible API + [demo_optim_data_preproc.ipynb](demo_optim_data_preproc.ipynb): demonstrates a generalization of the credit scoring tutorial that shows the full machine learning workflow for the optimized data pre-processing algorithm for bias mitigation on several datasets [demo_adversarial_debiasing.ipynb](demo_adversarial_debiasing.ipynb): demonstrates the use of the adversarial debiasing in-processing algorithm to learn a fair classifier @@ -21,7 +24,7 @@ Below is a list of additional notebooks that demonstrate the use of AIF360. 
[demo_disparate_impact_remover.ipynb](demo_calibrated_eqodds_postprocessing.ipynb): demonstrates the use of a disparate impact remover pre-processing algorithm for bias mitigiation -[demo_json_explainers.ipynb](demo_json_explainers.ipynb): +[demo_json_explainers.ipynb](demo_json_explainers.ipynb): [demo_lfr.ipynb](demo_lfr.ipynb): demonstrates the use of the learning fair representations algorithm for bias mitigation diff --git a/aif360/sklearn/examples/Getting Started.ipynb b/examples/sklearn/demo_new_features.ipynb similarity index 100% rename from aif360/sklearn/examples/Getting Started.ipynb rename to examples/sklearn/demo_new_features.ipynb From c5af6479b82193f93dd417098b442a9fbbcc7adb Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 31 Jan 2020 16:40:01 -0500 Subject: [PATCH 55/61] use_proba -> needs_proba --- aif360/sklearn/postprocessing/__init__.py | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py index d1b0465f..9af0db10 100644 --- a/aif360/sklearn/postprocessing/__init__.py +++ b/aif360/sklearn/postprocessing/__init__.py @@ -28,17 +28,17 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): Attributes: estimator_: Fitted estimator. postprocessor_: Fitted postprocessor. - use_proba_ (bool): Determined depending on the postprocessor type if - `use_proba` is None. + needs_proba_ (bool): Determined depending on the postprocessor type if + `needs_proba` is None. """ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), - use_proba=None, val_size=0.25, **options): + needs_proba=None, val_size=0.25, **options): """ Args: estimator (sklearn.BaseEstimator): Original estimator. postprocessor: Post-processing algorithm. 
- use_proba (bool): Use ``self.estimator_.predict_proba()`` instead of + needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead of ``self.estimator_.predict()`` as input to postprocessor. If ``None``, defaults to ``True`` if the postprocessor supports it. val_size (int or float): Size of validation set used to fit the @@ -53,7 +53,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), """ self.estimator = estimator self.postprocessor = postprocessor - self.use_proba = use_proba + self.needs_proba = needs_proba self.val_size = val_size self.options = options @@ -79,9 +79,9 @@ def fit(self, X, y, sample_weight=None, **fit_params): Returns: self """ - self.use_proba_ = (self.use_proba if self.use_proba is not None else + self.needs_proba_ = (self.needs_proba if self.needs_proba is not None else isinstance(self.postprocessor, CalibratedEqualizedOdds)) - if self.use_proba_ and not hasattr(self.estimator, 'predict_proba'): + if self.needs_proba_ and not hasattr(self.estimator, 'predict_proba'): raise TypeError("`estimator` (type: {}) does not implement method " "`predict_proba()`.".format(type(self.estimator))) @@ -103,7 +103,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): X_est, X_post, y_est, y_post = train_test_split(X, y, **options_) self.estimator_.fit(X_est, y_est) - y_pred = (self.estimator_.predict(X_post) if not self.use_proba_ else + y_pred = (self.estimator_.predict(X_post) if not self.needs_proba_ else self.estimator_.predict_proba(X_post)) # fit_params = fit_params.copy().update(labels=self.estimator_.classes_) self.postprocessor_.fit(y_pred, y_post, sample_weight=sw_post @@ -116,7 +116,7 @@ def predict(self, X): """Predict class labels for the given samples. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.use_proba_`` is ``True``) then returns the post-processed output + ``self.needs_proba_`` is ``True``) then returns the post-processed output from those predictions. 
Args: @@ -125,7 +125,7 @@ def predict(self, X): Returns: numpy.ndarray: Predicted class label per sample. """ - y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else self.estimator_.predict_proba(X)) y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict(y_pred) @@ -135,7 +135,7 @@ def predict_proba(self, X): """Probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.use_proba_`` is ``True``) then returns the post-processed output + ``self.needs_proba_`` is ``True``) then returns the post-processed output from those predictions. The returned estimates for all classes are ordered by the label of @@ -149,7 +149,7 @@ def predict_proba(self, X): in the model, where classes are ordered as they are in ``self.classes_``. """ - y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else self.estimator_.predict_proba(X)) y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict_proba(y_pred) @@ -159,7 +159,7 @@ def predict_log_proba(self, X): """Log of probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.use_proba_`` is ``True``) then returns the post-processed output + ``self.needs_proba_`` is ``True``) then returns the post-processed output from those predictions. The returned estimates for all classes are ordered by the label of @@ -173,7 +173,7 @@ def predict_log_proba(self, X): the model, where classes are ordered as they are in ``self.classes_``. 
""" - y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else self.estimator_.predict_proba(X)) y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.predict_log_proba(y_pred) @@ -184,7 +184,7 @@ def score(self, X, y, sample_weight=None): given test data and labels. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.use_proba_`` is ``True``) then gets the post-processed output + ``self.needs_proba_`` is ``True``) then gets the post-processed output from those predictions and scores it. Args: @@ -195,7 +195,7 @@ def score(self, X, y, sample_weight=None): Returns: float: Score value. """ - y_pred = (self.estimator_.predict(X) if not self.use_proba_ else + y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else self.estimator_.predict_proba(X)) y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight) From 042bb1215fd0c85221266d70f33e5647a845211d Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Fri, 31 Jan 2020 18:10:57 -0500 Subject: [PATCH 56/61] fixed/renamed/reordered/added some attributes * fixed German 'age' from being dropped * renamed two_year_recid labels to 'Survived' and 'Recidivated' to match ProPublica article * reordered COMPAS categories to 'Male' < 'Female' * added 'foreign_worker' protected attribute for German --- aif360/sklearn/datasets/compas_dataset.py | 19 +++++++++++++------ aif360/sklearn/datasets/openml_datasets.py | 14 ++++++++------ aif360/sklearn/datasets/utils.py | 8 ++++---- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/aif360/sklearn/datasets/compas_dataset.py b/aif360/sklearn/datasets/compas_dataset.py index 81578ef5..c909692d 100644 --- a/aif360/sklearn/datasets/compas_dataset.py +++ b/aif360/sklearn/datasets/compas_dataset.py @@ -20,8 +20,12 @@ def fetch_compas(data_home=None, 
binary_race=False, Optionally binarizes 'race' to 'Caucasian' (privileged) or 'African-American' (unprivileged). The other protected attribute is 'sex' ('Male' is *unprivileged* and 'Female' is *privileged*). The outcome - variable is 'no recid.' (favorable) if the person was not accused of a crime - within two years or 'did recid.' (unfavorable) if they were. + variable is 'Survived' (favorable) if the person was not accused of a crime + within two years or 'Recidivated' (unfavorable) if they were. + + Note: + The values for the 'sex' variable if numeric_only is ``True`` are 1 for + 'Female' and 0 for 'Male' -- opposite the convention of other datasets. Args: data_home (string, optional): Specify another download and cache folder @@ -59,16 +63,19 @@ def fetch_compas(data_home=None, binary_race=False, for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']: df[col] = df[col].astype('category') - # 'did recid' < 'no recid' - df.two_year_recid = df.two_year_recid.replace({0: 'no recid.', - 1: 'did recid.'}).astype('category').cat.as_ordered() + # 'Survived' < 'Recidivated' + cats = ['Survived', 'Recidivated'] + df.two_year_recid = df.two_year_recid.replace([0, 1], cats).astype('category') + df.two_year_recid = df.two_year_recid.cat.set_categories(cats, ordered=True) if binary_race: # 'African-American' < 'Caucasian' df.race = df.race.cat.set_categories(['African-American', 'Caucasian'], ordered=True) - df.sex = df.sex.astype('category').cat.as_ordered() # 'Female' < 'Male' + # 'Male' < 'Female' + df.sex = df.sex.astype('category').cat.reorder_categories( + ['Male', 'Female'], ordered=True) return standardize_dataset(df, prot_attr=['sex', 'race'], target='two_year_recid', usecols=usecols, diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 1bfa24e7..2e6f73d6 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -36,8 +36,8 @@ def
fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], dropcols=[], numeric_only=False, dropna=True): """Load the Adult Census Income Dataset. - Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). - The other protected attribute is 'sex' ('Male' is privileged and 'Female' is + Binarizes 'race' to 'White' (privileged) or 'Non-white' (unprivileged). The + other protected attribute is 'sex' ('Male' is privileged and 'Female' is unprivileged). The outcome variable is 'annual-income': '>50K' (favorable) or '<=50K' (unfavorable). @@ -151,7 +151,8 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df['credit-risk'] = df['credit-risk'].cat.as_ordered() # 'bad' < 'good' # binarize protected attribute (but not corresponding feature) - age = (pd.cut(df.age, [0, 25, 100], labels=numeric_only and ['young', 'aged']) + age = (pd.cut(df.age, [0, 25, 100], + labels=False if numeric_only else ['young', 'aged']) if binary_age else 'age') # Note: marital_status directly implies sex. i.e. 
'div/dep/mar' => 'female' @@ -161,9 +162,10 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df = df.join(personal_status.astype('category')) df.sex = df.sex.cat.as_ordered() # 'female' < 'male' - return standardize_dataset(df, prot_attr=['sex', age], target='credit-risk', - usecols=usecols, dropcols=dropcols, - numeric_only=numeric_only, dropna=dropna) + return standardize_dataset(df, prot_attr=['sex', age, 'foreign_worker'], + target='credit-risk', usecols=usecols, + dropcols=dropcols, numeric_only=numeric_only, + dropna=dropna) def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', numeric_only=False, dropna=False): diff --git a/aif360/sklearn/datasets/utils.py b/aif360/sklearn/datasets/utils.py index a39d5fb3..a776ff16 100644 --- a/aif360/sklearn/datasets/utils.py +++ b/aif360/sklearn/datasets/utils.py @@ -28,13 +28,13 @@ def check_already_dropped(labels, dropped_cols, name, dropped_by='numeric_only', """ if not is_list_like(labels): labels = [labels] - labels = [c for c in labels if isinstance(c, str)] - already_dropped = dropped_cols.intersection(labels) - if warn and already_dropped.any(): + str_labels = [c for c in labels if isinstance(c, str)] + already_dropped = dropped_cols.intersection(str_labels) + if warn and any(already_dropped): warnings.warn("Some column labels from `{}` were already dropped by " "`{}`:\n{}".format(name, dropped_by, already_dropped.tolist()), ColumnAlreadyDroppedWarning, stacklevel=2) - return [c for c in labels if c not in already_dropped] + return [c for c in labels if not isinstance(c, str) or c not in already_dropped] def standardize_dataset(df, prot_attr, target, sample_weight=None, usecols=[], dropcols=[], numeric_only=False, dropna=True): From ff9e70c0170c3a6178e00e094aeb370f6daa9535 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 5 Feb 2020 18:39:29 -0500 Subject: [PATCH 57/61] fixed sample_weight=None bug and classes_ typo --- 
aif360/sklearn/postprocessing/calibrated_equalized_odds.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py index a648d13b..94f8d5ef 100644 --- a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -64,7 +64,8 @@ def __init__(self, prot_attr=None, cost_constraint='weighted', self.cost_constraint = cost_constraint self.random_state = random_state - def _weighted_cost(self, y_true, probas_pred, pos_label, sample_weight): + def _weighted_cost(self, y_true, probas_pred, pos_label=1, + sample_weight=None): """Evaluates the cost function specified by ``self.cost_constraint``.""" fpr = generalized_fpr(y_true, probas_pred, pos_label, sample_weight) fnr = generalized_fnr(y_true, probas_pred, pos_label, sample_weight) @@ -178,7 +179,7 @@ def predict(self, y_pred): numpy.ndarray: Predicted class label per sample. """ scores = self.predict_proba(y_pred) - return self.classes[scores.argmax(axis=1)] + return self.classes_[scores.argmax(axis=1)] def score(self, y_pred, y_true, sample_weight=None): """Score the predictions according to the cost constraint specified. 
@@ -201,4 +202,4 @@ def score(self, y_pred, y_true, sample_weight=None): return abs(difference(self._weighted_cost, y_true, probas_pred, prot_attr=self.prot_attr_, priv_group=self.groups_[1], - sample_weight=sample_weight)) + pos_label=self.pos_label_, sample_weight=sample_weight)) From 57b2ab51efa5725396818cdb67070aafdbd80b37 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 6 Feb 2020 12:54:00 -0500 Subject: [PATCH 58/61] improved specificity_score and added fpr/fnr error --- aif360/sklearn/metrics/metrics.py | 44 ++++++++++++++++++------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 7b954bf5..4fda5c67 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -2,6 +2,7 @@ import numpy as np from sklearn.metrics import make_scorer, recall_score +from sklearn.metrics import multilabel_confusion_matrix from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_X_y from sklearn.exceptions import UndefinedMetricWarning @@ -26,7 +27,8 @@ 'between_group_generalized_entropy_error', 'theil_index', 'coefficient_of_variation', 'consistency_score', # aliases - 'sensitivity_score', 'mean_difference', + 'sensitivity_score', 'mean_difference', 'false_negative_rate_error', + 'false_positive_rate_error' ] # ============================= META-METRICS =================================== @@ -155,19 +157,24 @@ def score_fn(y, y_pred, **kwargs): # ================================ HELPERS ===================================== -# TODO: make this more general -def specificity_score(y_true, y_pred, neg_label=0, sample_weight=None): +def specificity_score(y_true, y_pred, pos_label=1, sample_weight=None): """Compute the specificity or true negative rate. Args: y_true (array-like): Ground truth (correct) target values. y_pred (array-like): Estimated targets as returned by a classifier. 
- neg_label (scalar, optional): The label of the negative class. Note: - the data should be binary. + pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. """ - return recall_score(y_true, y_pred, pos_label=neg_label, - sample_weight=sample_weight) + MCM = multilabel_confusion_matrix(y_true, y_pred, labels=[pos_label], + sample_weight=sample_weight) + tn, fp, fn, tp = MCM.ravel() + negs = tn + fp + if negs == 0: + warnings.warn('specificity_score is ill-defined and being set to 0.0 ' + 'due to no negative samples.', UndefinedMetricWarning) + return 0. + return tn / negs def base_rate(y_true, y_pred=None, pos_label=1, sample_weight=None): r"""Compute the base rate, :math:`Pr(Y = \text{pos_label}) = \frac{P}{P+N}`. @@ -339,7 +346,7 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, sample_weight=sample_weight) def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, - pos_label=1, neg_label=0, sample_weight=None): + pos_label=1, sample_weight=None): r"""A relaxed version of equality of odds. Returns the average of the difference in FPR and TPR for the unprivileged @@ -366,14 +373,14 @@ def average_odds_difference(y_true, y_pred, prot_attr=None, priv_group=1, """ fpr_diff = -difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, - neg_label=neg_label, sample_weight=sample_weight) + pos_label=pos_label, sample_weight=sample_weight) tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) return (tpr_diff + fpr_diff) / 2 -def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, - pos_label=1, neg_label=0, sample_weight=None): +def average_odds_error(y_true, y_pred, prot_attr=None, pos_label=1, + sample_weight=None): r"""A relaxed version of equality of odds. 
Returns the average of the absolute difference in FPR and TPR for the @@ -398,9 +405,10 @@ def average_odds_error(y_true, y_pred, prot_attr=None, priv_group=1, Returns: float: Average odds error. """ + priv_group = check_groups(y_true, prot_attr=prot_attr)[0][0] fpr_diff = -difference(specificity_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, - neg_label=neg_label, sample_weight=sample_weight) + pos_label=pos_label, sample_weight=sample_weight) tpr_diff = difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, sample_weight=sample_weight) @@ -561,13 +569,13 @@ def sensitivity_score(y_true, y_pred, pos_label=1, sample_weight=None): return recall_score(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight) -# def false_negative_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): -# return 1 - recall_score(y_true, y_pred, pos_label=pos_label, -# sample_weight=sample_weight) +def false_negative_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): + return 1 - recall_score(y_true, y_pred, pos_label=pos_label, + sample_weight=sample_weight) -# def false_positive_rate_error(y_true, y_pred, neg_label=0, sample_weight=None): -# return 1 - specificity_score(y_true, y_pred, neg_label=neg_label, -# sample_weight=sample_weight) +def false_positive_rate_error(y_true, y_pred, pos_label=1, sample_weight=None): + return 1 - specificity_score(y_true, y_pred, pos_label=pos_label, + sample_weight=sample_weight) def mean_difference(*y, prot_attr=None, priv_group=1, pos_label=1, sample_weight=None): From 8fdd6dc1460ff0c891b1ff4bd04bb7e3a3d18a1c Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Thu, 6 Feb 2020 12:56:43 -0500 Subject: [PATCH 59/61] made foreign_worker and education (bank) ordered --- aif360/sklearn/datasets/openml_datasets.py | 6 ++++++ tests/sklearn/test_datasets.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git 
a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 2e6f73d6..16d3165f 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -162,6 +162,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], df = df.join(personal_status.astype('category')) df.sex = df.sex.cat.as_ordered() # 'female' < 'male' + # 'no' < 'yes' + df.foreign_worker = df.foreign_worker.astype('category').cat.as_ordered() + return standardize_dataset(df, prot_attr=['sex', age, 'foreign_worker'], target='credit-risk', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, @@ -215,6 +218,9 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) + # 'primary' < 'secondary' < 'tertiary' + df.education = df.education.astype('category').cat.as_ordered() + return standardize_dataset(df, prot_attr='age', target='deposit', usecols=usecols, dropcols=dropcols, numeric_only=numeric_only, dropna=dropna) diff --git a/tests/sklearn/test_datasets.py b/tests/sklearn/test_datasets.py index 0cd13a6c..1d2ec6a0 100644 --- a/tests/sklearn/test_datasets.py +++ b/tests/sklearn/test_datasets.py @@ -65,14 +65,14 @@ def test_fetch_german(): german = fetch_german() assert len(german) == 2 assert german.X.shape == (1000, 21) - assert fetch_german(numeric_only=True).X.shape == (1000, 8) + assert fetch_german(numeric_only=True).X.shape == (1000, 9) def test_fetch_bank(): bank = fetch_bank() assert len(bank) == 2 assert bank.X.shape == (45211, 15) assert fetch_bank(dropcols=[]).X.shape == (45211, 16) - assert fetch_bank(numeric_only=True).X.shape == (45211, 6) + assert fetch_bank(numeric_only=True).X.shape == (45211, 7) @pytest.mark.filterwarnings('error', category=ColumnAlreadyDroppedWarning) def 
test_fetch_compas(): From 2cf455f186181d72fc6bcd155d730dc42eeb5062 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 19 Feb 2020 15:29:32 -0500 Subject: [PATCH 60/61] various fixes to address PR comments * added one-hot encoding example and random_states to demo notebook * added 'prefit' option to PostProcessingMeta * multiple fixes to docstring wordings * added additional links/disclaimers in docstrings * renamed CalibratedEqualizedOdds args to X and y --- aif360/sklearn/datasets/openml_datasets.py | 37 +- .../inprocessing/adversarial_debiasing.py | 2 +- aif360/sklearn/metrics/metrics.py | 35 +- aif360/sklearn/postprocessing/__init__.py | 81 ++- .../calibrated_equalized_odds.py | 69 +-- aif360/sklearn/preprocessing/reweighing.py | 3 + aif360/sklearn/utils.py | 27 +- examples/sklearn/demo_new_features.ipynb | 557 +++++------------- 8 files changed, 324 insertions(+), 487 deletions(-) diff --git a/aif360/sklearn/datasets/openml_datasets.py b/aif360/sklearn/datasets/openml_datasets.py index 16d3165f..f4c78e67 100644 --- a/aif360/sklearn/datasets/openml_datasets.py +++ b/aif360/sklearn/datasets/openml_datasets.py @@ -41,6 +41,10 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], unprivileged). The outcome variable is 'annual-income': '>50K' (favorable) or '<=50K' (unfavorable). + Note: + By default, the data is downloaded from OpenML. See the `adult + `_ page for details. + Args: subset ({'train', 'test', or 'all'}, optional): Select the dataset to load: 'train' for the training set, 'test' for the test set, 'all' @@ -60,6 +64,9 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[], namedtuple: Tuple containing X, y, and sample_weights for the Adult dataset accessible by index or name. 
+ See also: + :func:`sklearn.datasets.fetch_openml` + Examples: >>> adult = fetch_adult() >>> adult.X.shape @@ -103,11 +110,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], unprivileged; see the binary_age flag to keep this continuous). The outcome variable is 'credit-risk': 'good' (favorable) or 'bad' (unfavorable). - References: - .. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without - discriminating," 2nd International Conference on Computer, - Control and Communication, 2009. - `_ + Note: + By default, the data is downloaded from OpenML. See the `credit-g + `_ page for details. Args: data_home (string, optional): Specify another download and cache folder @@ -126,6 +131,15 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], namedtuple: Tuple containing X and y for the German dataset accessible by index or name. + See also: + :func:`sklearn.datasets.fetch_openml` + + References: + .. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without + discriminating," 2nd International Conference on Computer, + Control and Communication, 2009. + `_ + Examples: >>> german = fetch_german() >>> german.X.shape @@ -142,7 +156,6 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[], >>> disparate_impact_ratio(y, y_pred, prot_attr='age', priv_group=True, ... pos_label='good') 0.9483094846144106 - """ df = to_dataframe(fetch_openml(data_id=31, target_column=None, data_home=data_home or DATA_HOME_DEFAULT)) @@ -175,7 +188,11 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', """Load the Bank Marketing Dataset. The protected attribute is 'age' (left as continuous). The outcome variable - is 'deposit': ``True`` or ``False``. + is 'deposit': 'yes' or 'no'. + + Note: + By default, the data is downloaded from OpenML. See the `bank-marketing + `_ page for details. 
Args: data_home (string, optional): Specify another download and cache folder @@ -193,6 +210,9 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', namedtuple: Tuple containing X and y for the Bank dataset accessible by index or name. + See also: + :func:`sklearn.datasets.fetch_openml` + Examples: >>> bank = fetch_bank() >>> bank.X.shape @@ -214,7 +234,8 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit'] # remap target - df.deposit = df.deposit.map({'1': False, '2': True}).astype('bool') + df.deposit = df.deposit.map({'1': 'no', '2': 'yes'}).astype('category') + df.deposit = df.deposit.cat.as_ordered() # 'no' < 'yes' # replace 'unknown' marker with NaN df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True) if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s) diff --git a/aif360/sklearn/inprocessing/adversarial_debiasing.py b/aif360/sklearn/inprocessing/adversarial_debiasing.py index ca3de37d..e2328e00 100644 --- a/aif360/sklearn/inprocessing/adversarial_debiasing.py +++ b/aif360/sklearn/inprocessing/adversarial_debiasing.py @@ -67,7 +67,7 @@ def __init__(self, prot_attr=None, scope_name='classifier', adversary. verbose (bool, optional): If ``True``, print losses every 200 steps. random_state (int or numpy.RandomState, optional): Seed of pseudo- - random number generator for shuffling data. + random number generator for shuffling data and seeding weights. 
""" self.prot_attr = prot_attr diff --git a/aif360/sklearn/metrics/metrics.py b/aif360/sklearn/metrics/metrics.py index 4fda5c67..956621c0 100644 --- a/aif360/sklearn/metrics/metrics.py +++ b/aif360/sklearn/metrics/metrics.py @@ -210,8 +210,8 @@ def generalized_fpr(y_true, probas_pred, pos_label=1, sample_weight=None): r"""Return the ratio of generalized false positives to negative examples in the dataset, :math:`GFPR = \tfrac{GFP}{N}`. - The generalized confusion matrix is calculated by summing the probabilities - of the positive class instead of the hard predictions. + Generalized confusion matrix measures such as this are calculated by summing + the probabilities of the positive class instead of the hard predictions. Args: y_true (array-like): Ground-truth (correct) target values. @@ -237,8 +237,8 @@ def generalized_fnr(y_true, probas_pred, pos_label=1, sample_weight=None): r"""Return the ratio of generalized false negatives to positive examples in the dataset, :math:`GFNR = \tfrac{GFN}{P}`. - The generalized confusion matrix is calculated by summing the probabilities - of the positive class instead of the hard predictions. + Generalized confusion matrix measures such as this are calculated by summing + the probabilities of the positive class instead of the hard predictions. Args: y_true (array-like): Ground-truth (correct) target values. @@ -272,7 +272,8 @@ def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, Note: If only y_true is provided, this will return the difference in base - rates (statistical parity difference of the original dataset). + rates (statistical parity difference of the original dataset). If both + y_true and y_pred are provided, only y_pred is used. Args: y_true (pandas.Series): Ground truth (correct) target values. If y_pred @@ -287,6 +288,9 @@ def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1, Returns: float: Statistical parity difference. 
+ + See also: + :func:`selection_rate`, :func:`base_rate` """ rate = base_rate if len(y) == 1 or y[1] is None else selection_rate return difference(rate, *y, prot_attr=prot_attr, priv_group=priv_group, @@ -302,7 +306,8 @@ def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1, Note: If only y_true is provided, this will return the ratio of base rates - (disparate impact of the original dataset). + (disparate impact of the original dataset). If both y_true and y_pred + are provided, only y_pred is used. Args: y_true (pandas.Series): Ground truth (correct) target values. If y_pred @@ -317,6 +322,9 @@ def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1, Returns: float: Disparate impact. + + See also: + :func:`selection_rate`, :func:`base_rate` """ rate = base_rate if len(y) == 1 or y[1] is None else selection_rate return ratio(rate, *y, prot_attr=prot_attr, priv_group=priv_group, @@ -340,6 +348,9 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1, Returns: float: Equal opportunity difference. + + See also: + :func:`~sklearn.metrics.recall_score` """ return difference(recall_score, y_true, y_pred, prot_attr=prot_attr, priv_group=priv_group, pos_label=pos_label, @@ -461,6 +472,9 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1): index, and 2 is half the squared coefficient of variation. pos_label (scalar, optional): The label of the positive class. + See also: + :func:`generalized_entropy_index` + References: .. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca, K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified @@ -495,6 +509,9 @@ def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None, index, and 2 is half the squared coefficient of variation. pos_label (scalar, optional): The label of the positive class. + See also: + :func:`generalized_entropy_index` + References: .. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca, K. P. Gummadi, A. 
Singla, A. Weller, and M. B. Zafar, "A Unified @@ -518,6 +535,9 @@ def theil_index(b): Args: b (array-like): Parameter over which to calculate the entropy index. + + See also: + :func:`generalized_entropy_index` """ return generalized_entropy_index(b, alpha=1) @@ -527,6 +547,9 @@ def coefficient_of_variation(b): Args: b (array-like): Parameter over which to calculate the entropy index. + + See also: + :func:`generalized_entropy_index` """ return 2 * np.sqrt(generalized_entropy_index(b, alpha=2)) diff --git a/aif360/sklearn/postprocessing/__init__.py b/aif360/sklearn/postprocessing/__init__.py index 9af0db10..c45f4e4b 100644 --- a/aif360/sklearn/postprocessing/__init__.py +++ b/aif360/sklearn/postprocessing/__init__.py @@ -33,14 +33,16 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): """ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), - needs_proba=None, val_size=0.25, **options): + needs_proba=None, prefit=False, val_size=0.25, **options): """ Args: estimator (sklearn.BaseEstimator): Original estimator. postprocessor: Post-processing algorithm. - needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead of - ``self.estimator_.predict()`` as input to postprocessor. If + needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead + of ``self.estimator_.predict()`` as input to postprocessor. If ``None``, defaults to ``True`` if the postprocessor supports it. + prefit (bool): If ``True``, it is assumed that estimator has been + fitted already and all data is used to train postprocessor. val_size (int or float): Size of validation set used to fit the postprocessor. The estimator fits on the remainder of the training set. 
@@ -54,6 +56,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(), self.estimator = estimator self.postprocessor = postprocessor self.needs_proba = needs_proba + self.prefit = prefit self.val_size = val_size self.options = options @@ -79,14 +82,28 @@ def fit(self, X, y, sample_weight=None, **fit_params): Returns: self """ - self.needs_proba_ = (self.needs_proba if self.needs_proba is not None else - isinstance(self.postprocessor, CalibratedEqualizedOdds)) + self.needs_proba_ = (self.needs_proba if self.needs_proba is not None + else isinstance(self.postprocessor, CalibratedEqualizedOdds)) if self.needs_proba_ and not hasattr(self.estimator, 'predict_proba'): raise TypeError("`estimator` (type: {}) does not implement method " "`predict_proba()`.".format(type(self.estimator))) + if self.prefit: + if len(self.options): + warning("Splitting options were passed but prefit is True so " + "these are ignored.") + self.postprocessor_ = clone(self.postprocessor) + y_score = (self.estimator.predict(X) if not self.needs_proba_ else + self.estimator.predict_proba(X)) + fit_params = fit_params.copy() + fit_params.update(labels=self.estimator_.classes_) + self.postprocessor_.fit(y_score, y, sample_weight=sample_weight, + **fit_params) + return self + if 'train_size' in self.options or 'test_size' in self.options: - warning("'train_size' and 'test_size' are ignored in favor of 'val_size'") + warning("'train_size' and 'test_size' are ignored in favor of " + "'val_size'") options_ = self.options.copy() options_['test_size'] = self.val_size if 'train_size' in options_: @@ -103,10 +120,11 @@ def fit(self, X, y, sample_weight=None, **fit_params): X_est, X_post, y_est, y_post = train_test_split(X, y, **options_) self.estimator_.fit(X_est, y_est) - y_pred = (self.estimator_.predict(X_post) if not self.needs_proba_ else + y_score = (self.estimator_.predict(X_post) if not self.needs_proba_ else self.estimator_.predict_proba(X_post)) - # fit_params = 
fit_params.copy().update(labels=self.estimator_.classes_) - self.postprocessor_.fit(y_pred, y_post, sample_weight=sw_post + fit_params = fit_params.copy() + fit_params.update(labels=self.estimator_.classes_) + self.postprocessor_.fit(y_score, y_post, sample_weight=sw_post if sample_weight is not None else None, **fit_params) return self @@ -116,8 +134,8 @@ def predict(self, X): """Predict class labels for the given samples. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.needs_proba_`` is ``True``) then returns the post-processed output - from those predictions. + ``self.needs_proba_`` is ``True``) then returns the post-processed + output from those predictions. Args: X (pandas.DataFrame): Test samples. @@ -125,18 +143,18 @@ def predict(self, X): Returns: numpy.ndarray: Predicted class label per sample. """ - y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else - self.estimator_.predict_proba(X)) - y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') - return self.postprocessor_.predict(y_pred) + y_score = (self.estimator_.predict(X) if not self.needs_proba_ else + self.estimator_.predict_proba(X)) + y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') + return self.postprocessor_.predict(y_score) @if_delegate_has_method('postprocessor_') def predict_proba(self, X): """Probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.needs_proba_`` is ``True``) then returns the post-processed output - from those predictions. + ``self.needs_proba_`` is ``True``) then returns the post-processed + output from those predictions. The returned estimates for all classes are ordered by the label of classes. @@ -149,18 +167,18 @@ def predict_proba(self, X): in the model, where classes are ordered as they are in ``self.classes_``. 
""" - y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else - self.estimator_.predict_proba(X)) - y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') - return self.postprocessor_.predict_proba(y_pred) + y_score = (self.estimator_.predict(X) if not self.needs_proba_ else + self.estimator_.predict_proba(X)) + y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') + return self.postprocessor_.predict_proba(y_score) @if_delegate_has_method('postprocessor_') def predict_log_proba(self, X): """Log of probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if - ``self.needs_proba_`` is ``True``) then returns the post-processed output - from those predictions. + ``self.needs_proba_`` is ``True``) then returns the post-processed + output from those predictions. The returned estimates for all classes are ordered by the label of classes. @@ -173,10 +191,10 @@ def predict_log_proba(self, X): the model, where classes are ordered as they are in ``self.classes_``. """ - y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else - self.estimator_.predict_proba(X)) - y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') - return self.postprocessor_.predict_log_proba(y_pred) + y_score = (self.estimator_.predict(X) if not self.needs_proba_ else + self.estimator_.predict_proba(X)) + y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') + return self.postprocessor_.predict_log_proba(y_score) @if_delegate_has_method('postprocessor_') def score(self, X, y, sample_weight=None): @@ -195,10 +213,11 @@ def score(self, X, y, sample_weight=None): Returns: float: Score value. 
""" - y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else - self.estimator_.predict_proba(X)) - y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns') - return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight) + y_score = (self.estimator_.predict(X) if not self.needs_proba_ else + self.estimator_.predict_proba(X)) + y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') + return self.postprocessor_.score(y_score, y, + sample_weight=sample_weight) __all__ = [ diff --git a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py index 94f8d5ef..0b3bdf01 100644 --- a/aif360/sklearn/postprocessing/calibrated_equalized_odds.py +++ b/aif360/sklearn/postprocessing/calibrated_equalized_odds.py @@ -16,15 +16,18 @@ class CalibratedEqualizedOdds(BaseEstimator, ClassifierMixin): change output labels with an equalized odds objective [#pleiss17]_. Note: - This breaks the sckit-learn API by requiring fit params y_true, y_pred, - and pos_label and predict param y_pred. See :class:`PostProcessingMeta` - for a workaround. + A :class:`~sklearn.pipeline.Pipeline` expects a single estimation step + but this class requires an estimator's predictions as input. See + :class:`PostProcessingMeta` for a workaround. + + See also: + :class:`PostProcessingMeta` References: .. [#pleiss17] `G. Pleiss, M. Raghavan, F. Wu, J. Kleinberg, and K. Q. Weinberger, "On Fairness and Calibration," Conference on Neural Information Processing Systems, 2017. - `_ + `_ Adapted from: https://github.com/gpleiss/equalized_odds_and_calibration/blob/master/calib_eq_odds.py @@ -58,7 +61,7 @@ def __init__(self, prot_attr=None, cost_constraint='weighted', generalized false negative rate ('fnr'), or a weighted combination of both ('weighted'). random_state (int or numpy.RandomState, optional): Seed of pseudo- - random number generator for shuffling data. 
+ random number generator for sampling from the mix rates. """ self.prot_attr = prot_attr self.cost_constraint = cost_constraint @@ -80,27 +83,26 @@ def _weighted_cost(self, y_true, probas_pred, pos_label=1, raise ValueError("`cost_constraint` must be one of: 'fpr', 'fnr', " "or 'weighted'") - def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None): + def fit(self, X, y, labels=None, pos_label=1, sample_weight=None): """Compute the mixing rates required to satisfy the cost constraint. Args: - y_pred (array-like): Probability estimates of the targets as - returned by a ``predict_proba()`` call or equivalent. - y_true (pandas.Series): Ground-truth (correct) target values. + X (array-like): Probability estimates of the targets as returned by + a ``predict_proba()`` call or equivalent. + y (pandas.Series): Ground-truth (correct) target values. labels (list, optional): The ordered set of labels values. Must - match the order of columns in y_pred if provided. By default, - all labels in y_true are used in sorted order. + match the order of columns in X if provided. By default, + all labels in y are used in sorted order. pos_label (scalar, optional): The label of the positive class. sample_weight (array-like, optional): Sample weights. Returns: self """ - y_pred, y_true, sample_weight = check_inputs(y_pred, y_true, - sample_weight) - groups, self.prot_attr_ = check_groups(y_true, self.prot_attr, + X, y, sample_weight = check_inputs(X, y, sample_weight) + groups, self.prot_attr_ = check_groups(y, self.prot_attr, ensure_binary=True) - self.classes_ = labels if labels is not None else np.unique(y_true) + self.classes_ = labels if labels is not None else np.unique(y) self.groups_ = np.unique(groups) self.pos_label_ = pos_label @@ -111,14 +113,13 @@ def fit(self, y_pred, y_true, labels=None, pos_label=1, sample_weight=None): raise ValueError('pos_label={} is not in the set of labels. 
The ' 'valid values are:\n{}'.format(pos_label, self.classes_)) - y_pred = y_pred[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]] + X = X[:, np.nonzero(self.classes_ == self.pos_label_)[0][0]] # local function to return corresponding args for metric evaluation def _args(grp_idx, triv=False): idx = (groups == self.groups_[grp_idx]) - pred = (np.full_like(y_pred, self.base_rates_[grp_idx]) if triv else - y_pred) - return [y_true[idx], pred[idx], pos_label, sample_weight[idx]] + pred = np.full_like(X, self.base_rates_[grp_idx]) if triv else X + return [y[idx], pred[idx], pos_label, sample_weight[idx]] self.base_rates_ = [base_rate(*_args(i)) for i in range(2)] @@ -131,12 +132,12 @@ def _args(grp_idx, triv=False): return self - def predict_proba(self, y_pred): + def predict_proba(self, X): """The returned estimates for all classes are ordered by the label of classes. Args: - y_pred (pandas.DataFrame): Probability estimates of the targets as + X (pandas.DataFrame): Probability estimates of the targets as returned by a ``predict_proba()`` call or equivalent. Note: must include protected attributes in the index. 
@@ -148,47 +149,47 @@ def predict_proba(self, y_pred): check_is_fitted(self, 'mix_rates_') rng = check_random_state(self.random_state) - groups, _ = check_groups(y_pred, self.prot_attr_) + groups, _ = check_groups(X, self.prot_attr_) if not set(np.unique(groups)) <= set(self.groups_): - raise ValueError('The protected groups from y_pred:\n{}\ndo not ' + raise ValueError('The protected groups from X:\n{}\ndo not ' 'match those from the training set:\n{}'.format( np.unique(groups), self.groups_)) pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0] - y_pred = y_pred.iloc[:, pos_idx] + X = X.iloc[:, pos_idx] - yt = np.empty_like(y_pred) + yt = np.empty_like(X) for grp_idx in range(2): i = (groups == self.groups_[grp_idx]) to_replace = (rng.rand(sum(i)) < self.mix_rates_[grp_idx]) - new_preds = y_pred[i].copy() + new_preds = X[i].copy() new_preds[to_replace] = self.base_rates_[grp_idx] yt[i] = new_preds return np.c_[1 - yt, yt] if pos_idx == 1 else np.c_[yt, 1 - yt] - def predict(self, y_pred): + def predict(self, X): """Predict class labels for the given scores. Args: - y_pred (pandas.DataFrame): Probability estimates of the targets as + X (pandas.DataFrame): Probability estimates of the targets as returned by a ``predict_proba()`` call or equivalent. Note: must include protected attributes in the index. Returns: numpy.ndarray: Predicted class label per sample. """ - scores = self.predict_proba(y_pred) + scores = self.predict_proba(X) return self.classes_[scores.argmax(axis=1)] - def score(self, y_pred, y_true, sample_weight=None): + def score(self, X, y, sample_weight=None): """Score the predictions according to the cost constraint specified. Args: - y_pred (pandas.DataFrame): Probability estimates of the targets as + X (pandas.DataFrame): Probability estimates of the targets as returned by a ``predict_proba()`` call or equivalent. Note: must include protected attributes in the index. - y_true (array-like): Ground-truth (correct) target values. 
+ y (array-like): Ground-truth (correct) target values. sample_weight (array-like, optional): Sample weights. Returns: @@ -198,8 +199,8 @@ def score(self, y_pred, y_true, sample_weight=None): """ check_is_fitted(self, ['classes_', 'pos_label_']) pos_idx = np.nonzero(self.classes_ == self.pos_label_)[0][0] - probas_pred = self.predict_proba(y_pred)[:, pos_idx] + probas_pred = self.predict_proba(X)[:, pos_idx] - return abs(difference(self._weighted_cost, y_true, probas_pred, + return abs(difference(self._weighted_cost, y, probas_pred, prot_attr=self.prot_attr_, priv_group=self.groups_[1], pos_label=self.pos_label_, sample_weight=sample_weight)) diff --git a/aif360/sklearn/preprocessing/reweighing.py b/aif360/sklearn/preprocessing/reweighing.py index d4f782b0..f29653ae 100644 --- a/aif360/sklearn/preprocessing/reweighing.py +++ b/aif360/sklearn/preprocessing/reweighing.py @@ -17,6 +17,9 @@ class Reweighing(BaseEstimator): This breaks the scikit-learn API by returning new sample weights from ``fit_transform()``. See :class:`ReweighingMeta` for a workaround. + See also: + :class:`ReweighingMeta` + References: .. [#kamiran12] `F. Kamiran and T. Calders, "Data Preprocessing Techniques for Classification without Discrimination," Knowledge and diff --git a/aif360/sklearn/utils.py b/aif360/sklearn/utils.py index 13ad3820..604b1202 100644 --- a/aif360/sklearn/utils.py +++ b/aif360/sklearn/utils.py @@ -14,9 +14,20 @@ def check_inputs(X, y, sample_weight=None, ensure_2d=True): Args: X (array-like): Input data. y (array-like, shape = (n_samples,)): Target values. - sample_weight (array-like): Sample weights. + sample_weight (array-like, optional): Sample weights. ensure_2d (bool, optional): Whether to raise a ValueError if X is not 2D. + + Returns: + tuple: + + * **X** (`array-like`) -- Validated X. Unchanged. + + * **y** (`array-like`) -- Validated y. Possibly converted to 1D if + not a :class:`pandas.Series`. + * **sample_weight** (`array-like`) -- Validated sample_weight. 
If no + sample_weight is provided, returns a consistent-length array of + ones. """ if ensure_2d and X.ndim != 2: raise ValueError("Expected X to be 2D, got ndim == {} instead.".format( @@ -39,8 +50,8 @@ def check_groups(arr, prot_attr, ensure_binary=False): provided protected attributes are in the index. Args: - arr (`pandas.Series` or `pandas.DataFrame`): A Pandas object containing - protected attribute information in the index. + arr (:class:`pandas.Series` or :class:`pandas.DataFrame`): A Pandas + object containing protected attribute information in the index. prot_attr (single label or list-like): Protected attribute(s). If ``None``, all protected attributes in arr are used. ensure_binary (bool): Raise an error if the resultant groups are not @@ -49,11 +60,11 @@ def check_groups(arr, prot_attr, ensure_binary=False): Returns: tuple: - * **groups** (`pandas.Index`) -- Label (or tuple of labels) of - protected attribute for each sample in arr. - * **prot_attr** (list-like) -- Modified input. If input is a single - label, returns single-item list. If input is ``None`` returns list - of all protected attributes. + * **groups** (:class:`pandas.Index`) -- Label (or tuple of labels) + of protected attribute for each sample in arr. + * **prot_attr** (`list-like`) -- Modified input. If input is a + single label, returns single-item list. If input is ``None`` + returns list of all protected attributes. 
""" if not hasattr(arr, 'index'): raise TypeError( diff --git a/examples/sklearn/demo_new_features.ipynb b/examples/sklearn/demo_new_features.ipynb index 026bf790..34a6c087 100644 --- a/examples/sklearn/demo_new_features.ipynb +++ b/examples/sklearn/demo_new_features.ipynb @@ -18,15 +18,20 @@ "import numpy as np\n", "import pandas as pd\n", "import tensorflow as tf\n", + "tf.logging.set_verbosity(tf.logging.ERROR)\n", + "\n", + "from sklearn.compose import make_column_transformer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import GridSearchCV, train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder\n", "\n", "from aif360.sklearn.preprocessing import ReweighingMeta\n", "from aif360.sklearn.inprocessing import AdversarialDebiasing\n", "from aif360.sklearn.postprocessing import CalibratedEqualizedOdds, PostProcessingMeta\n", "from aif360.sklearn.datasets import fetch_adult\n", - "from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr, generalized_fnr" + "from aif360.sklearn.metrics import disparate_impact_ratio, average_odds_error, generalized_fpr\n", + "from aif360.sklearn.metrics import generalized_fnr, difference" ] }, { @@ -52,188 +57,8 @@ "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ageworkclasseducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
racesex
0Non-whiteMale25.0Private11th7.0Never-marriedMachine-op-inspctOwn-childNon-whiteMale0.00.040.0United-States
1WhiteMale38.0PrivateHS-grad9.0Married-civ-spouseFarming-fishingHusbandWhiteMale0.00.050.0United-States
2WhiteMale28.0Local-govAssoc-acdm12.0Married-civ-spouseProtective-servHusbandWhiteMale0.00.040.0United-States
3Non-whiteMale44.0PrivateSome-college10.0Married-civ-spouseMachine-op-inspctHusbandNon-whiteMale7688.00.040.0United-States
5WhiteMale34.0Private10th6.0Never-marriedOther-serviceNot-in-familyWhiteMale0.00.030.0United-States
\n", - "
" - ], - "text/plain": [ - " age workclass education education-num \\\n", - " race sex \n", - "0 Non-white Male 25.0 Private 11th 7.0 \n", - "1 White Male 38.0 Private HS-grad 9.0 \n", - "2 White Male 28.0 Local-gov Assoc-acdm 12.0 \n", - "3 Non-white Male 44.0 Private Some-college 10.0 \n", - "5 White Male 34.0 Private 10th 6.0 \n", - "\n", - " marital-status occupation relationship \\\n", - " race sex \n", - "0 Non-white Male Never-married Machine-op-inspct Own-child \n", - "1 White Male Married-civ-spouse Farming-fishing Husband \n", - "2 White Male Married-civ-spouse Protective-serv Husband \n", - "3 Non-white Male Married-civ-spouse Machine-op-inspct Husband \n", - "5 White Male Never-married Other-service Not-in-family \n", - "\n", - " race sex capital-gain capital-loss hours-per-week \\\n", - " race sex \n", - "0 Non-white Male Non-white Male 0.0 0.0 40.0 \n", - "1 White Male White Male 0.0 0.0 50.0 \n", - "2 White Male White Male 0.0 0.0 40.0 \n", - "3 Non-white Male Non-white Male 7688.0 0.0 40.0 \n", - "5 White Male White Male 0.0 0.0 30.0 \n", - "\n", - " native-country \n", - " race sex \n", - "0 Non-white Male United-States \n", - "1 White Male United-States \n", - "2 White Male United-States \n", - "3 Non-white Male United-States \n", - "5 White Male United-States " - ] + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ageworkclasseducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-country
racesex
0Non-whiteMale25.0Private11th7.0Never-marriedMachine-op-inspctOwn-childNon-whiteMale0.00.040.0United-States
1WhiteMale38.0PrivateHS-grad9.0Married-civ-spouseFarming-fishingHusbandWhiteMale0.00.050.0United-States
2WhiteMale28.0Local-govAssoc-acdm12.0Married-civ-spouseProtective-servHusbandWhiteMale0.00.040.0United-States
3Non-whiteMale44.0PrivateSome-college10.0Married-civ-spouseMachine-op-inspctHusbandNon-whiteMale7688.00.040.0United-States
5WhiteMale34.0Private10th6.0Never-marriedOther-serviceNot-in-familyWhiteMale0.00.030.0United-States
\n
", + "text/plain": " age workclass education education-num \\\n race sex \n0 Non-white Male 25.0 Private 11th 7.0 \n1 White Male 38.0 Private HS-grad 9.0 \n2 White Male 28.0 Local-gov Assoc-acdm 12.0 \n3 Non-white Male 44.0 Private Some-college 10.0 \n5 White Male 34.0 Private 10th 6.0 \n\n marital-status occupation relationship \\\n race sex \n0 Non-white Male Never-married Machine-op-inspct Own-child \n1 White Male Married-civ-spouse Farming-fishing Husband \n2 White Male Married-civ-spouse Protective-serv Husband \n3 Non-white Male Married-civ-spouse Machine-op-inspct Husband \n5 White Male Never-married Other-service Not-in-family \n\n race sex capital-gain capital-loss hours-per-week \\\n race sex \n0 Non-white Male Non-white Male 0.0 0.0 40.0 \n1 White Male White Male 0.0 0.0 50.0 \n2 White Male White Male 0.0 0.0 40.0 \n3 Non-white Male Non-white Male 7688.0 0.0 40.0 \n5 White Male White Male 0.0 0.0 30.0 \n\n native-country \n race sex \n0 Non-white Male United-States \n1 White Male United-States \n2 White Male United-States \n3 Non-white Male United-States \n5 White Male United-States " }, "execution_count": 2, "metadata": {}, @@ -249,150 +74,81 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can also easily load a version of the dataset which only contains numeric or binary columns and split it with scikit-learn:" + "We can then map the protected attributes to integers," ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [], + "source": [ + "X.index = pd.MultiIndex.from_arrays(X.index.codes, names=X.index.names)\n", + "y.index = pd.MultiIndex.from_arrays(y.index.codes, names=y.index.names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and the target classes to 0/1," + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "y = pd.Series(y.factorize(sort=True)[0], index=y.index)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "split the dataset," + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "(X_train, X_test,\n", + " y_train, y_test) = train_test_split(X, y, train_size=0.7, random_state=1234567)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and finally, one-hot encode the categorical features:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ageeducation-numracesexcapital-gaincapital-losshours-per-week
racesex
00125.07.0010.00.040.0
11138.09.0110.00.050.0
21128.012.0110.00.040.0
30144.010.0017688.00.040.0
41018.010.0100.00.030.0
\n", - "
" - ], - "text/plain": [ - " age education-num race sex capital-gain capital-loss \\\n", - " race sex \n", - "0 0 1 25.0 7.0 0 1 0.0 0.0 \n", - "1 1 1 38.0 9.0 1 1 0.0 0.0 \n", - "2 1 1 28.0 12.0 1 1 0.0 0.0 \n", - "3 0 1 44.0 10.0 0 1 7688.0 0.0 \n", - "4 1 0 18.0 10.0 1 0 0.0 0.0 \n", - "\n", - " hours-per-week \n", - " race sex \n", - "0 0 1 40.0 \n", - "1 1 1 50.0 \n", - "2 1 1 40.0 \n", - "3 0 1 40.0 \n", - "4 1 0 30.0 " - ] + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
0123456789...90919293949596979899
racesex
30149110.00.00.00.01.00.00.00.00.00.0...0.00.01.00.00.058.011.00.00.042.0
12028100.00.00.00.01.00.00.00.00.00.0...0.00.00.00.00.051.012.00.00.030.0
36374110.00.01.00.00.00.00.00.00.00.0...0.00.01.00.00.026.014.00.01887.040.0
8055110.00.01.00.00.00.00.00.00.00.0...0.00.00.00.00.044.03.00.00.040.0
38108110.00.01.00.00.00.00.01.00.00.0...0.00.01.00.00.033.06.00.00.040.0
\n

5 rows × 100 columns

\n
", + "text/plain": " 0 1 2 3 4 5 6 7 8 9 ... 90 \\\n race sex ... \n30149 1 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n12028 1 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n36374 1 1 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n8055 1 1 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n38108 1 1 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 \n\n 91 92 93 94 95 96 97 98 99 \n race sex \n30149 1 1 0.0 1.0 0.0 0.0 58.0 11.0 0.0 0.0 42.0 \n12028 1 0 0.0 0.0 0.0 0.0 51.0 12.0 0.0 0.0 30.0 \n36374 1 1 0.0 1.0 0.0 0.0 26.0 14.0 0.0 1887.0 40.0 \n8055 1 1 0.0 0.0 0.0 0.0 44.0 3.0 0.0 0.0 40.0 \n38108 1 1 0.0 1.0 0.0 0.0 33.0 6.0 0.0 0.0 40.0 \n\n[5 rows x 100 columns]" }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X, y, sample_weight = fetch_adult(numeric_only=True)\n", - "(X_train, X_test,\n", - " y_train, y_test) = train_test_split(X, y, train_size=0.7, shuffle=False)\n", + "ohe = make_column_transformer(\n", + " (OneHotEncoder(sparse=False), X_train.dtypes == 'category'),\n", + " remainder='passthrough')\n", + "X_train = pd.DataFrame(ohe.fit_transform(X_train), index=X_train.index)\n", + "X_test = pd.DataFrame(ohe.transform(X_test), index=X_test.index)\n", + "\n", "X_train.head()" ] }, @@ -400,27 +156,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "the protected attribute information is replicated in the labels:" + "Note: the column names are lost in this transformation. The same encoding can be done with Pandas, but this cannot be combined with other preprocessing in a Pipeline." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ageeducation-numcapital-gaincapital-losshours-per-weekworkclass_Federal-govworkclass_Local-govworkclass_Privateworkclass_Self-emp-incworkclass_Self-emp-not-inc...native-country_Portugalnative-country_Puerto-Riconative-country_Scotlandnative-country_Southnative-country_Taiwannative-country_Thailandnative-country_Trinadad&Tobagonative-country_United-Statesnative-country_Vietnamnative-country_Yugoslavia
racesex
00125.07.00.00.040.000100...0000000100
11138.09.00.00.050.000100...0000000100
21128.012.00.00.040.001000...0000000100
30144.010.07688.00.040.000100...0000000100
51134.06.00.00.030.000100...0000000100
\n

5 rows × 100 columns

\n
", + "text/plain": " age education-num capital-gain capital-loss hours-per-week \\\n race sex \n0 0 1 25.0 7.0 0.0 0.0 40.0 \n1 1 1 38.0 9.0 0.0 0.0 50.0 \n2 1 1 28.0 12.0 0.0 0.0 40.0 \n3 0 1 44.0 10.0 7688.0 0.0 40.0 \n5 1 1 34.0 6.0 0.0 0.0 30.0 \n\n workclass_Federal-gov workclass_Local-gov workclass_Private \\\n race sex \n0 0 1 0 0 1 \n1 1 1 0 0 1 \n2 1 1 0 1 0 \n3 0 1 0 0 1 \n5 1 1 0 0 1 \n\n workclass_Self-emp-inc workclass_Self-emp-not-inc ... \\\n race sex ... \n0 0 1 0 0 ... \n1 1 1 0 0 ... \n2 1 1 0 0 ... \n3 0 1 0 0 ... \n5 1 1 0 0 ... \n\n native-country_Portugal native-country_Puerto-Rico \\\n race sex \n0 0 1 0 0 \n1 1 1 0 0 \n2 1 1 0 0 \n3 0 1 0 0 \n5 1 1 0 0 \n\n native-country_Scotland native-country_South \\\n race sex \n0 0 1 0 0 \n1 1 1 0 0 \n2 1 1 0 0 \n3 0 1 0 0 \n5 1 1 0 0 \n\n native-country_Taiwan native-country_Thailand \\\n race sex \n0 0 1 0 0 \n1 1 1 0 0 \n2 1 1 0 0 \n3 0 1 0 0 \n5 1 1 0 0 \n\n native-country_Trinadad&Tobago native-country_United-States \\\n race sex \n0 0 1 0 1 \n1 1 1 0 1 \n2 1 1 0 1 \n3 0 1 0 1 \n5 1 1 0 1 \n\n native-country_Vietnam native-country_Yugoslavia \n race sex \n0 0 1 0 0 \n1 1 1 0 0 \n2 1 1 0 0 \n3 0 1 0 0 \n5 1 1 0 0 \n\n[5 rows x 100 columns]" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# there is one unused category ('Never-worked') that was dropped during dropna\n", + "X.workclass.cat.remove_unused_categories(inplace=True)\n", + "pd.get_dummies(X).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The protected attribute information is also replicated in the labels:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - " race sex\n", - "0 0 1 0\n", - "1 1 1 0\n", - "2 1 1 1\n", - "3 0 1 1\n", - "4 1 0 0\n", - "Name: annual-income, dtype: int64" - ] + "text/plain": " race sex\n30149 1 1 0\n12028 1 0 1\n36374 1 1 1\n8055 1 1 0\n38108 1 1 
0\ndtype: int64" }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -445,22 +221,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.823858595509452" - ] + "text/plain": "0.8375469890174688" }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "y_pred = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)\n", + "y_pred = LogisticRegression(solver='lbfgs').fit(X_train, y_train).predict(X_test)\n", "accuracy_score(y_test, y_pred)" ] }, @@ -473,16 +247,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.19826239080897468" - ] + "text/plain": "0.2905425926727236" }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -499,22 +271,19 @@ "\n", "`average_odds_error()` computes the (unweighted) average of the absolute values of the true positive rate (TPR) difference and false positive rate (FPR) difference, i.e.:\n", "\n", - "$\\tfrac{1}{2}\\left(|FPR_{D = \\text{unprivileged}} - FPR_{D = \\text{privileged}}|\n", - " + |TPR_{D = \\text{unprivileged}} - TPR_{D = \\text{privileged}}|\\right)$" + "$$ \\tfrac{1}{2}\\left(|FPR_{D = \\text{unprivileged}} - FPR_{D = \\text{privileged}}| + |TPR_{D = \\text{unprivileged}} - TPR_{D = \\text{privileged}}|\\right) $$" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.12427040384779571" - ] + "text/plain": "0.09372170954260936" }, - "execution_count": 7, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -539,22 +308,17 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": false - }, + "execution_count": 12, + 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", - "text": [ - "0.8147819559134648\n", - "{'estimator__C': 10, 'reweigher__prot_attr': 'sex'}\n" - ] + "text": "0.8279649148669566\n{'estimator__C': 10, 'reweigher__prot_attr': 'sex'}\n" } ], "source": [ - "rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))\n", + "rew = ReweighingMeta(estimator=LogisticRegression(solver='lbfgs'))\n", "\n", "params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']}\n", "\n", @@ -566,16 +330,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.639237550613212" - ] + "text/plain": "0.5676803237673037" }, - "execution_count": 9, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -593,47 +355,34 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /anaconda/envs/aif360/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Colocations handled automatically by placer.\n" - ] - }, { "data": { - "text/plain": [ - "0.8218794786050638" - ] + "text/plain": "0.8399056534237488" }, - "execution_count": 10, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "adv_deb = AdversarialDebiasing(prot_attr='sex')\n", + "adv_deb = AdversarialDebiasing(prot_attr='sex', random_state=1234567)\n", "adv_deb.fit(X_train, y_train)\n", "adv_deb.score(X_test, y_test)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.022611763594614448" - ] + "text/plain": "0.060623189820735834" }, - "execution_count": 
11, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -651,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -669,24 +418,22 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "0.7676926226711254" - ] + "text/plain": "0.8163190093609494" }, - "execution_count": 13, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cal_eq_odds = CalibratedEqualizedOdds('sex', cost_constraint='fnr')\n", - "log_reg = LogisticRegression(solver='liblinear')\n", - "postproc = PostProcessingMeta(estimator=log_reg, postprocessor=cal_eq_odds)\n", + "cal_eq_odds = CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=1234567)\n", + "log_reg = LogisticRegression(solver='lbfgs')\n", + "postproc = PostProcessingMeta(estimator=log_reg, postprocessor=cal_eq_odds, random_state=1234567)\n", "\n", "postproc.fit(X_train, y_train)\n", "accuracy_score(y_test, postproc.predict(X_test))" @@ -694,15 +441,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfUAAAEKCAYAAAALjMzdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3Xdck1f7P/DPCXsLiGwRgRDCcFEUR92K/VkUseJotY66H63Vjm+X1qq1j9papFq11Yrax1W1rmprK9hqawVF2UslArIEwoaEnN8fSWyAAEEJCXjer1dekHvlusM4Oec+93URSikYhmEYhun8OJoOgGEYhmGY9sEadYZhGIbpIlijzjAMwzBdBGvUGYZhGKaLYI06wzAMw3QRrFFnGIZhmC5CrY06ISSIEJJKCMkghLynZH1PQsgVQshtQshdQshLCuv+T7ZfKiFkvDrjZBiGYZiugKjrPnVCiA6ANABjAWQDuAlgBqU0SWGbPQBuU0p3EUL4AC5QSnvJvv8fgAAADgAuA+BSSuvVEizDMAzDdAHq7KkHAMiglN6jlNYBOAJgUqNtKABz2fcWAHJl308CcIRSWkspvQ8gQ3Y8hmEYhmGaoavGYzsCeKjwPBvAwEbbrAPwCyHkPwBMAIxR2PfvRvs6Nn4BQshCAAsBwMTEZACPx2uXwDUtIQEwNgZ699Z0JAzTUGxsbBGl1EbTcTAMo5w6G3VVzADwPaV0GyEkEMBBQoiPqjtTSvcA2AMA/v7+NCYmRk1hdhyJRNqgv/EG8Pnnmo6GYRoihGRpOgaGYZqnzkY9B4CzwnMn2TJF8wEEAQCl9C9CiCGA7iru2yUVFgK1tUDPnpqOhGEYhuls1HlN/SYAD0KIKyFEH8B0AGcabSMAMBoACCFeAAwBFMq2m04IMSCEuALwAPCPGmPVGlmyfhBr1BmGYZi2UltPnVIqJoQsB3AJgA6AfZTSRELIegAxlNIzAFYD2EsIWQXppLnXqXQ6fiIh5BiAJABiAMuel5nvAoH0q4uLZuNgGIZhOh+1XlOnlF4AcKHRso8Vvk8CMKSZfTcC2KjO+LSRvFFnPXWmK4mNje2hq6v7LQAfsKRXDPO0JAASxGLxggEDBhQo20DTE+WYRrKyADMzwMJC05EwTPvR1dX91s7OzsvGxqaEw+GoJzkGw3RxEomEFBYW8vPy8r4FEKxsG/aJWcsIBNKhd0I0HQnDtCsfGxubMtagM8zT43A41MbGRgjpiJfybTowHkYFAgEbeme6JA5r0Bnm2cn+jpptu1mjrmVYo84wDMM8Ldaoa5HKSqCoiDXqDMMwzNNhjboWeShLqstuZ2MY9Th48GA3QsiA27dvG8qXpaam6nt4eHgDwLlz58xGjhzp/qyvExoa2mv//v2WABAWFuYSGxtrCADGxsb9nuW4586dM/v1119N2rqfo6Oj76NHj1SaGB0eHm49e/bsdutaDB8+3L2oqEgHADZs2NCjd+/e3sHBwa6HDx+2eP/99+3a63XkJBIJBg0axC0uLuYAgI6OzgAej8eXP1JTU/Xb+zXlnva9y83N1R02bJhHe8TAZr9rEXY7G8Oo15EjR6z69+9fERkZadWvX7/c1vd4dkePHm1Tal2RSAQ9PT2l637//XczU1PT+rFjx1a2S3AdIDo6OkP+/XfffWdz+fLlNDc3N5FskVDV47T0vig6duyYhbe3d7WVlZUEAAwMDCQpKSlJre2nSQ4ODmJbW1vRL7/8YjJu3Lhn+tmyRl2LsGxyzPNg3jw4JyTAuD2P6eODqn37GhSQakIoFHJu3rxpevny5dTg4GCPL7/8UuVGXSwWY+nSpU5XrlyxIITQOXPmFH3wwQcFa9assb948WK32tpajr+/f8Xhw4ezOJyGA6ABAQGeW7duffjiiy9WAcD8+fOdo6OjzW1sbEQ//vjjPQcHB3FAQICnj49P1T///GMaGhpa7OnpWbN582Z7kUjEsbS
0FB89evReVVUVJzIy0obD4dBjx45Zb9++XeDn51czd+5cl5ycHH0A+OKLLwTjxo2rzMvL0wkNDe2dn5+vP2DAgIrmSmyfOHHC/OOPP3asr68nVlZW4r/++itNcf0PP/xg0TgOZ2dn8fnz501Xr17dEwAIIbh+/XpKWVmZTmhoaO+Kigqd+vp6smPHjqygoKAKR0dH35iYmOTVq1c7ZGdnG0yYMMFj1qxZRZaWlvUxMTEmkZGRgtzcXF1l5/HWW2853Lt3z0AgEBg4OjrWrl279tHcuXNdRSIRkUgk+PHHHzN9fX1rFWM+fPiw1aJFi4pa+3kuW7bM6dq1a2Z1dXXkjTfeKHj77beLzp07Z/bJJ584mJubi1NTU42Dg4OLfX19q3fu3GlbW1tLTp06lent7V3b3Pui+BrNnZOy987S0lIyefLk0sjISOtnbdTZ8LsWEQgAHR3AwUHTkTBM1/PDDz90GzFihNDPz6/W0tJS/Mcff6j8wWLbtm02AoFAPykpKTEtLS1pwYIFjwHg7bffLkhISEhOT09PrK6u5hw5cqTFDBPV1dUcf3//yoyMjMQhQ4aUv/fee0/+2uvq6khCQkLyJ598kj927NiKuLi4lOTk5KSpU6cWr1+/3s7T07Nu9uzZhYsXL85PSUlJCgoKqli0aJHzW2+9lZ+QkJB86tSpzMWLF/cCgPfee88hMDCwIiMjIzEkJKT00aNHTYacc3NzdZcvX97r5MmTmampqUmnT5/ObLyNsjhk74ddeHh4VkpKStLff/+dYmpqKtm3b5/V6NGjhSkpKUnJycmJAwcOrGr0/gt69Oghio6OTlu7dm2DxCnNnQcApKenG169ejX17Nmz93fs2GGzdOnS/JSUlKS7d+8mu7q61jWOOTY21nTIkCFPGsba2lqOfOh97NixbgCwffv27hYWFvUJCQnJd+7cST5w4IBNSkqKPgCkpKQY7du3T5Cenp5w4sQJ67S0NMP4+Pjk1157rWjbtm09WnpfVDknZe8dAAwZMqTyn3/+MVXya9MmrKeuRQQCwNER0GU/FaYLa61HrS7Hjh2zWrFiRQEAhIaGFh88eNBq2LBhVa3tBwC///67+eLFiwvlw7+2trb1APDzzz+bffHFF3Y1NTWc0tJSXT6fX40WhpQ5HA4WLFhQDADz5s17PGXKlCfX72fMmFEs//7+/fv6kydPdiosLNSrq6vjODs71yo73rVr18zT09ON5M8rKip0hEIh5++//zY7efJkBgBMnz5duGjRoiZptqOiokwCAgLKeTxeneI5KWoujkGDBlWsWbPGedq0acUzZswocXNzkwwaNKhy0aJFvUQiEWfq1KklgwcPrm75XW39PAAgKCio1NTUlAJAYGBg5datW+2zs7P1p0+fXtK4lw4AQqFQ19LSUiJ/rmz4/fLly+YpKSnGZ86csQSA8vJynaSkJEN9fX3q6+tb6eLiIgKAnj171k6YMEEIAH369KmOjo42a+l9UeWclL13gHQIvqCg4Jmv97OeuhbJymJD7wyjDvn5+Tp///232bJly1wcHR19IyIi7M6ePWspkUha37kZVVVVZPXq1S4nT57MTEtLS3r11VeLampq2vQ/lShkmTIzM3sSzPLly3suXbq0IC0tLSkiIiKrtrZW6XEppbh161ZySkpKUkpKSlJBQcFdCwuLpz+pRpqLY9OmTXnffvttVnV1NWfYsGG827dvG06YMKHi6tWrqY6OjnXz5s1zjYiIsFb1dVo6DxMTkyfns3jx4uKffvopw8jISDJx4kSPM2fOmDU+lo6ODq2vb7lUCKWUbNu2TSB/vZycnPgpU6aUAYCBgcGTaxUcDgeGhoZU/n19fT1p6X1R5ZyUvXeA9PfJwMDgmX92rFHXIvJscgzDtK+DBw9ahoSEFOfm5sbn5OTE5+Xl3XVycqq7dOmSSsOdo0ePLtu9e3d3kUg6vys/P1+nqqqKAwB2dnZioVDIOXv2rGVrx5FIJJDPiv/
++++tAwICypVtV15ertOzZ0+RfDv5cjMzs/ry8nId+fOhQ4eWffbZZz3kz69fv24EAIMGDSqX73fs2DHzsrIyHTQyYsSIyn/++cdMPuycn5/fZJvm4khMTDQICAio3rhxY56fn19lQkKCYVpamr6Tk5No9erVRbNnzy68deuWypc3mjuPxpKSkvS9vLxqP/zww4Lx48eXxsXFNdnO1dW1Jjk52aCl1xs7dqxw165dNrW1tQQA7t69a1BWVqZye9jc+6LKOSl77wAgISHBkMvlqjy60RzWqGuJ+nogO5v11BlGHY4fP241ZcqUEsVlkyZNKjl06JCVKvuvWrWq0MnJqY7H43l7enryv/vuO6vu3bvXz5o1q9DLy8t75MiR3D59+rQ6wcnIyEjyzz//mHh4eHhfvXrV7LPPPnukbLsPPvggd8aMGW7e3t5e1tbWTyZghYaGlp4/f74bj8fjX7x40XTPnj0Pb926ZcLlcvlubm7eERERNgCwefPm3GvXrpm6u7t7nzx50tLe3r7JtWcHBwdxeHj4g5CQEHdPT09+SEhIb1Xj+O9//9vDw8PDm8vl8vX09OjUqVOFly5dMvPy8vL28vLi//jjj1bvvPNOvirvLQA0dx6NHTp0yIrL5XrzeDx+cnKy0aJFix433mbcuHHCX375pUkPXtGqVauKeDxeja+vr5eHh4f3G2+84SISiVROzt3c+6LKOSl77wDg119/NQsKClL5boDmkOZmRXY2/v7+NCYmRtNhPLXcXOn19F27gMWLNR0NwyhHCImllPq3db87d+486NOnT4szkhmmPWRlZenNmDGj1/Xr19M1HUtb+Pv7e/78888ZNjY2rZYZv3PnTvc+ffr0UraO9dS1BLudjWEY5tm5uLiI5s2bVyRPPtMZ5Obm6q5cuTJflQa9NWyetZaQJ55h19QZhmGezYIFC0pa30p7ODg4iF977bXS9jhWp/kk09XJG3VnZ83GwTAMw3RerFHXEllZQLdugLm5piNhGIZhOiu1NuqEkCBCSCohJIMQ8p6S9V8SQuJkjzRCSKnCunqFdWfUGac2YLezMQzDMM9KbdfUCSE6AL4GMBZANoCbhJAzlNInmX0opasUtv8PAMUKRtWU0r7qik/bsDrqDMMwzLNSZ089AEAGpfQepbQOwBEAk1rYfgaA/6kxHq3GsskxjPqx0qut62qlVwkhAyZNmuQqXy8SiWBpadmntZ/z0/4u1NTUEH9/f095oqKOps7Z745AgxzP2QAGKtuQEOICwBXA7wqLDQkhMQDEADZTSk+rK1BNKysDSkvZ8DvDqBsrvdrxNF161cjISJKammpUUVFBTE1N6alTp8xtbW3V1uIaGhrS4cOHl3377bdWS5YsKW59j/alLRPlpgM4QSlVvEfPRZbkYiaA7YQQt8Y7EUIWEkJiCCExhYWFHRVru3so++jDeurMc2HePGcEBHi262PevFbvG5GXXt2/f/+DU6dOqZRJTk4sFmPhwoVO8kxgGzdu7AEAa9assffx8fHy8PDwnjFjhouyXPIBAQGeV69efZIydf78+c7u7u7egYGB3NzcXF35NvPmzXP28fHx2rBhg+0PP/xg4efnx/Py8uIPHjyY+/DhQ93U1FT9yMhIm2+++cZWnlEuNzdXd/z48W4+Pj5ePj4+Xr/88osJAOTl5ekMGTLEw93d3TssLMylpdKrfD7fy9PTkx8YGMhtvF5ZHABw/vx5U3nlMy8vL35JSQknKytLz9/f35PH4/E9PDy8L168aAr8O0owc+bMnvLSq5988kkPxRGB5s7jrbfecpg8ebJr//79eVOmTHGNiYkx9PX19eLxeHwul8uPj49vkg728OHDViEhIQ1uDxszZozw+PHj3QDgf//7n1VoaOiTxvbKlSvGffv25Xl5efH79evHu3PnTpNjlpWVcV555ZVevr6+Xl5eXvxDhw51A4Dm4pk6dWrpkSNH2vQ71l7U2ajnAFD8Q3OSLVNmOho
NvVNKc2Rf7wGIQsPr7fJt9lBK/Sml/jY2SrMKdgry29lYo84w6sNKrzb0vJReBYDXXnut+OjRo5ZVVVUkOTnZODAw8Mn6Pn361Ny8eTMlOTk5ae3atTnvvPOOU+Njvv/++/YjR44si4+PT/7jjz9SP/zwQ6eysjJOc/G88MIL1Xfv3m3zZZL2oM7h95sAPAghrpA25tMh7XU3QAjhAbAE8JfCMksAVZTSWkJIdwBDAPxXjbFqFMsmxzxX9u1jpVfBSq+qch7As5deBYCBAwdWZ2dnG+zdu9dqzJgxDX4+xcXFOmFhYa4PHjwwJIRQZTngo6KizC9dutQtPDzcDgBqa2tJRkaGfnPx6OrqQk9Pj5aUlHAax6JuauupU0rFAJYDuAQgGcAxSmkiIWQ9ISRYYdPpAI7QhuNDXgBiCCF3AFyB9Jp6g3q4XYlAAOjpAfb2mo6EYbomVnr16XSl0qtBQUGla9eudZ49e3aD69zvvvuu4/Dhw8vT09MTz549m1FXV6e0jOqJEycy5PE9evQovn///jUtxSMSiYixsXGHF1dR6zV1SukFSimXUupGKd0oW/YxpfSMwjbrKKXvNdrvOqXUl1LaR/b1O3XGqWkCAeDkBHC0ZYYDw3QxrPQqK726ZMmSojVr1uQGBAQ0GEEoKyvTcXJyqgOA3bt3d1f2uiNHjizbtm2brfxD4LVr14xaiicvL0+nW7duYsXa7B2FNSNagN3OxjDqxUqvstKrbm5uog8//LCg8fJ33303b926dU5eXl58sVhpFVVs3rw5VywWEx6Px3d3d/f+8MMPHVuK5+effzZvPMzfUVjpVS3g4gKMGAEcOKDpSBimZaz0KqPttKH06rhx49y2bt2a7efnp3QuxLNipVe1mFgM5OSwnjrDMEx70HTp1ZqaGhIcHFyqrga9Naz0qobl5gL19axRZxiGaS+aLL1qaGhIly9f3uSyQEdhPXUNY3XUGYZhmPbCGnUNY4lnGIZhmPbCGnUNkzfqzq0muWQYhmGYlrFGXcOysgBra8BEIwkFGUY7SSTAb7/BJDIS3X77DSbPkCPmiczMTL3Ro0e7ubi4+Dg7O/vMnTvXuaampkn2MAB48OCBXlBQUJNbvBpTrEDWVm+99ZbDxx9/bKvq9s9a4U3Rf//7Xxt5cpjbt28bynO4JyYmGvTr14/3rMcPCgrqnZSUpA9Ic79zuVy+PFf801SZU1VnrazWnlijrmECAbuezjCKjh6FhYMD/IKDwV26FL1efhlcBwf4HT2KFvOqt0QikWDy5MnuwcHBpVlZWQn3799PqKys5KxcudKx8bYikQi9evUSXbx48V5rx42Ojs7o3r170/RlWu6dd94plE/mOn78eLfg4OCS5OTkJG9v79rbt2+nqHociUSCxtnbYmJiDOvr6wmfz39yb3x0dHSaPBubNlaYU6yspulYnhVr1DVMIGDX0xlG7uhRWMyZg975+dCrqgKnshI61dXg5OdDb84c9H7ahv3s2bNmBgYGkpUrVz4GpLm5v/nmm4dHjx7tXl5ezgkPD7ceNWqU+6BBg7iDBw/2VKyxXl5eznnppZd6u7m5eY8dO9bNz8+PJ6+6Jq9Alpqaqt+7d2/v6dOnu7i7u3sPGTLEo6KiggDAtm3buvv4+Hh5enryx48f71ZeXt7i/92HDx/qjh071s3T05Pv6enZpGcrFAo5gYGBXD6f78Xlcp9UDCsrK+OMGDHC3dPTk+/h4eG9d+9eSwBYunSpo5ubmzeXy+UvXLjQCfh3lODo0aMWe/bssf3+++9tBg4cyAUajgh89NFHtj4+Pl5cLpe/atUqB0Baf75Xr14+ISEhvbhcrndmZmaDYjHff/+99csvv9ygSpoyzR3b1dXVOzQ0tFevXr18goODXU+fPm3Wv39/nouLi8+VK1eMga5XWa09sUZdgyhl2eQYRk4iAVasgEttrfL/S7W14KxcCZenGYqPj4836tOnT4PiLVZWVhJ7e/u6pKQ
kAwBITEw0/umnnzJv3ryZqrjdli1bbLp161afmZmZuGnTppykpCSlw8cCgcBwxYoVBRkZGYkWFhb1kZGRlgAwa9askoSEhOTU1NQkT0/P6vDwcKWpSOUWL17cc9iwYeWpqalJiYmJSf37969RXG9sbCw5f/58RlJSUnJ0dHTa+++/7ySRSHDy5ElzOzs7UWpqalJ6enrilClTyvLy8nQuXLhgmZ6enpiWlpa0adOmBhnswsLChPLKbzdu3EhTXHfy5EnzjIwMw7t37yYnJycnxcXFGf/888+msnM1WL58eWFGRkYil8ttkK3uxo0bpoMGDWrwXg8fPpzL4/H4fn5+vNaO/fDhQ8N33303PzMzMyEzM9Pw8OHD1jExMSkbN27M3rhxoz3Q9SqrtSd2n7oGCYVAeTkbfmcYALhyBSYVFWjx+nR5OXSiomAyahTafQh32LBhZcoqlV2/ft105cqVBQDwwgsv1HC5XKWV3RwdHWvllcn69etX9eDBAwMAiI2NNfr4448dy8vLdSorK3WGDx/eYvrQ69evm504ceI+IB1RsLa2bhCTRCIhb775ptPff/9tyuFwUFBQoJ+dna3bv3//6g8++MB5yZIljpMmTRIGBQVViEQiGBgYSMLCwnpNnDixNCwsTOXUpRcvXjS/evWqOZ/P5wNAVVUVJyUlxbB379519vb2daNHj1b6MygsLNSzs7NrcHE6Ojo6zd7e/kkO1paO7ejoWCvPz87lcqtHjRpVxuFw0L9//6oNGzY4AF2vslp7Yj11DWK3szHMv3JyoEcIWsxbTQhodjb02npsHx+f6jt37jQoMFJcXMx59OiRPp/PrwWkPeC2HleRvr7+k9h1dHSoWCwmALBw4ULXiIgIQVpaWtK7776b21zFNVXt3r3b6vHjx7rx8fHJKSkpSdbW1qLq6mqOn59f7a1bt5J8fX2rP/roI8c1a9bY6+npIS4uLnnq1Kkl586d6zZixAgPVV+HUoo333zzkfxauEAgSFi1alUR0PJ7ZWBgIKmurm7xHFs6tuL7yOFwYGhoSAFAR0cH9fX1BOh6ldXaE2vUNYjVUWeYfzk6QiSRQOlsdDlKQZyc0OYpysHBweU1NTUc+YxvsViMpUuXOr/yyitFiiVPlQkMDKw4cuSIJQDExsYapqWlKa0g1pyqqipOz549RbW1tUSVa7ZDhgwp37Jli408zsePHzcYvRAKhTrdu3cXGRgY0LNnz5rl5ubqA9IZ+2ZmZpKlS5cWv/XWW3lxcXHGQqGQI+vVCr/55puHKSkpKldOmzBhQtnBgwe7y+ua379/Xy8nJ6fV0V0PDw+lVdLa49hyXa2yWntiw+8axLLJMcy/Ro5EpZkZ6qurm+9smJmhfsSItg+9czgcnD59OmPhwoUuW7ZssZdIJBg1apQwPDw8p7V933777cJp06b1cnNz83Zzc6txd3evsbS0VHnG+3vvvZcbEBDgZWVlJe7fv39FRUVFi5cYdu3aJXj99ddduFxudw6Hg4iIiKwxY8Y8OecFCxYUT5gwwZ3L5fL9/PyqXF1dawDpMP///d//OXE4HOjq6tKdO3dmlZaW6kycONG9traWAMCnn376UNW4p0yZUpaYmGj4wgsv8ABp7/zw4cP3dXV1W2z0JkyYUPr777+bTZ48WWlZ2Wc5tty7776bt2DBAtfPP//cYezYsUon5W3evDl34cKFPXk8Hl8ikRBnZ+faK1euZBw6dMjq2LFj1rq6utTGxkb06aefPgI0W1mtPbEqbRr07rvAV18BVVWsljrTOai7Spt89ruyyXIGBpAcOIB7YWHo0H+8YrEYdXV1xNjYmCYmJhqMGzeOm5mZmSAfFmYaqqioIEOGDPGMjY1N0dXtPP1GdVdWa08tVWnrPO94FyQQSDPJsQadYaRkDfa9lSvhUl4OHUJAKQUxM0P9V18hq6MbdEB6S9uwYcM8RSIRoZTiyy+/zGINevNMTU3pxx9/nHv//n19Dw+PJnX
ctZGmK6u1J9ZT16DBgwEjI+C33zQdCcOopqPqqUskQFQUTLKzoefkBNGIEahkH34ZRor11LWUQACMG6fpKBhG+3A4gDpuW2OYrk6tn30JIUGEkFRCSAYh5D0l678khMTJHmmEkFKFdXMIIemyxxx1xqkJIpG0ljqb+d6FPH4M/PWXNKsQwzCMBqitUSeE6AD4GsAEAHwAMwghfMVtKKWrKKV9KaV9AewAcFK2rxWAtQAGAggAsJYQYqmuWDUhO1v6v5816l3I3r3SayoZGZqOhGGY55Q6e+oBADIopfcopXUAjgCY1ML2MwD8T/b9eAC/UkqLKaUlAH4FEKTGWDscu52tixGLgV27gFGjAA+V83swDMO0K3U26o4AFO+JzJYta4IQ4gLAFcDvbdmXELKQEBJDCIkpLCxsl6A7Cssm18WcPSv9of7nP5qOpGuQ1l41QWRkN/z2mwnaofYqK736r44uvTpgwABPxfU8Ho8vL5jTHMWiOm01ePBgbmFh4VP9XDo7bZlPOh3ACUppm0oYUkr3UEr9KaX+NjY2agpNPeSNulOTMgRMpxQRIf2ENnGipiPp/I4etYCDgx+Cg7lYurQXXn6ZCwcHPxw9ykqvtpOOLr1aWVmpk5GRoQcAt27dMmyn02jWjBkzHm/durVzNQrtRJ2Neg4AZ4XnTrJlykzHv0Pvbd23U8rKAnr0kN7SxnRyiYnA778DS5cCnSjZhlY6etQCc+b0Rn6+HqqqOKis1EF1NQf5+XqYM6f30zbsrPSqZkuvTp48uTgyMtIKACIjI61CQ0OL5etSU1P1BwwY4Mnn8734fL5X4/MFpAmAFi1a5CSPZcuWLd0BICsrS8/f399T3vO/ePGiKQBMnz699OTJk9Ytvc9dlTob9ZsAPAghroQQfUgb7jONNyKE8ABYAvhLYfElAOMIIZayCXLjZMu6DIGAXU/v9OQ9lIgIwMAAmD9fs/F0dtLaqy5oruBJbS0HK1e6PM1QPCu9qtnSqzNmzCg5e/asJQBcunSp25QpU540+g4ODuI//vgjLSkpKfno0aP3Vq1a1eSi5Pbt27tbWFjUJyQkJN+5cyf5wIEDNikpKfr79u2zGj16tDAlJSUpOTlVXsC/AAAgAElEQVQ5ceDAgVUAYGNjU19XV0fy8vKeuyF4tXUrKKViQshySBtjHQD7KKWJhJD1AGIopfIGfjqAI1QhCw6ltJgQ8imkHwwAYD2ltBhdiEAA8PkNl9XXAzrP3a9gJ5WSAgQGAr/8AkRGAjNnAt1b/F/NtObKFRO0khcd5eU6iIoywahRrPRqJyq92qNHj3oLCwvxnj17LN3d3atNTU2ffDKrq6sj8+fPd0lKSjLicDjIyspqUgzm8uXL5ikpKcZnzpyxBIDy8nKdpKQkw0GDBlUuWrSol0gk4kydOrVE/v4DgLW1tVggEOjb2dlVNz5eV6bWa+qU0guUUi6l1I1SulG27GOFBh2U0nWU0ib3sFNK91FK3WWP/eqMs6NRKh1+V5wkl5IibRNSU5vfj9ESlALz5gFlZcC0adLk/cuXazqqzi8nRw+EtHyTPyEU2dms9GonLL06derUknfeecdlxowZDTpoGzdutO3Ro4coOTk5KT4+PkkkEikro0q2bdsmkMeSk5MTP2XKlLIJEyZUXL16NdXR0bFu3rx5rvLJf4C0fvqz/kw7I22ZKPdcKS6WtgPy4XfFNmLePJa7ROv9+CNw9650uDgrC/D0BPr313RUnZ+jowgSSYulV0EpgZMTK73aCUuvzpo1q2TZsmV5U6ZMKWt8Pvb29iIdHR3s3LnTuvHEOwAYO3ascNeuXTbyanN37941KCsr46Slpek7OTmJVq9eXTR79uzCW7duGQPSCXyFhYV6np6enT6Xe1uxWT0a0Ph2thMngNu3pW3ErVvSNmPqVM3Fx7SgogJYvBiolI08Ugrk5Eifmyi91MqoauTISpiZ1UNJL+8JM7N6jBjBSq92wtKrlpaWko0
bN+Y13v7NN98sCA0NdTty5Ij1qFGjhEZGRk0+ZK1atarowYMHBr6+vl6UUmJlZSW6cOFC5qVLl8zCw8PtdHV1qbGxcf3hw4fvA8Cff/5p3K9fv0o9vTYP6nR6rKCLBpw+DYSEADExwIMH0hFcxbk/HA5w/DgwZYrGQmSas3q1NMlMtcJlOkND6cz3bds0F1cHUXtBF/nsd2VD1AYGEhw4cA9tuC7cHljp1bbRhtKrc+fOdZ48eXLppEmTmq3p3pm1VNCFDb9rgLynnpYGhIWhyWReiUTa0F+40PGxMS1ISWnaoANATY10OZsQ8ezCwoQ4cOAebG1FMDaWwMSkHsbGEtjaijTRoAPSW9oCAgJ4np6e/JCQEDdWerVliqVXNRWDj49PdVdt0FvTZXrqrq6udO3atQ2WeXt744UXXoBIJMLhw4eb7NO3b1/07dsXVVVVOHbsWJP1/v7+8PHxgVAoxKlTp5qsDwwMhKenJ4qKinDu3Lkm61988UX07t0beXl5uHjx4pPlmZnSYi43b45GfLwznJ0fYvTopvVX//knCAkJdrh//x6uXr3aZP3EiRPRvXt3pKam4q+//mqyPiQkBBYWFkhISICyUYxp06bB2NgYcXFxiIuLa7J+1qxZ0NPTw82bN5GYmNhk/euvvw4AuH79OtLSGtwNAz09PcyaNQsAEB0djfv37zdYb2xsjGnTpgEALl++jOzs7Abrzc3NMUU2VHHx4kXk5TUctbO2tsbLL78MADh79iweP37cYL2dnR2CgqSZhU+ePImysgaX8eDk5IQxY8YAAI4dO4aqqoYTml1dXTF8+HAAwOHDhyESiaTXSGTH4aalYfD16wCA72XvA8zNgX7SW3y19XdPbvTo0XB2dsbDhw/xm5Lav0FBQbCzs8O9ew1/9+bOndshpVdltVdNkJ2tBycnEUaMqASrvcowAFjpVa1TWwvo6QHCVvocJSXAjRvSJDWMhlVXA+WtfPAvL5duxzIKPTtp7VVWepVh2qjL9NQ70zX1gQOlHb4UFZIx7twJLFmi/piYVlAKDBki/ZSlLPkJhwMMGgT8+SdAWp7A3Zmp/Zp6I2IxS9LHMI2xa+paRiCQFvJqbTSRw3kymstoGiHAvn2AfjOXCQ0MpOu7cIPe0W7fhqGVFfreuYMmt0cxDKMca9Q7WG0tkJcH+Pu3noDMxkbaq2e0BI+nvGCLkZF0OMXTs+k65qlIJMDcuehVUQGd119Hr3Yo0sYwzwXWqHewh7K7RF1cgP37m08Lq6PDOn5aqa6u6Q/F2BhYv14z8XRRBw7AMi0NRpQCqakwjoxEt2c9po6OzgB54Y8JEyb0bq2wijLr16/v8TT7dQZtLXUaGhraa//+/Zbt8dqNS92+/PLLrlwul//JJ5/0ePPNNx1Onz5t9izHP3jwYLc1a9bYA9JiNj169PDj8Xh8Ho/HX7p0qdKS4O1FXvSnrfstXLjQ6cyZM20+7y75y6nN5LezubgAL70EHDvWdBiew5Euf+mljo+PaYFAAJw7B0ya9G+iGRMTYPdulnimHQmF4KxahZ7V1dL/T9XV4Lz5JlzKyp7t/5WBgYEkJSUlKT09PVFPT49u27atzaU5d+/ebVtRUfHUcSgrVcqgQalbgUCge+fOHZO0tLSktWvXFmzfvj23cSKblohETRMOfvHFF3arV68ulD9fvHhxvjzl7M6dO7WyAuiaNWsKPv/8c7u27sca9Q7WOJvclCnAkSP/Tpg2NASOHmWJZ7TSrl3Sr19+Cfj5ST999enDfljtbM0aONTUNPzfVFMDzurVcGiv1xg6dGhFRkaGAQCsW7fO1sPDw9vDw8N7/fr1PQDlZUw3bNjQo6CgQG/48OFceZlSReHh4dajR492CwgI8HRxcfFZvXq1PaC8VOnu3butuFwu38PDw3vJkiVPeoonTpww5/P5Xp6envzAwECuPJZXXnmll6+vr5eXl9eTUqsxMTGGvr6+Xjwej8/lcvnx8fEGzZV
f/eOPP4xfeOEFT29vb6+hQ4d6ZGVl6cmXy0u8fvHFF83eZ/PBBx/Ycblcvqenp9Ke7Zo1a+x9fHy8PDw8vGfMmOEikV0v2bBhQw952deJEyf2BoDz58+bynvJXl5e/JKSEo7iKMGYMWO4BQUF+jwej3/x4kVTxRGB5s4jICDAc968ec4+Pj5eGzZssFWM7e7duwb6+voSe3t7cUu/Ey0de/78+c4+Pj5evXv39o6OjjYeN26cm4uLi8+KFSue/E6OGTPGzdvb28vd3d1769atSi+u7ty500r+M5s5c6aLWCyGWCxGaGhoLw8PD2/56AQAcLncutLSUl2BQNC2Xj6ltNkHpNXVrrS0jbY8BgwYQDuDdesoJYTS2tp/l0kklAYGUsrhUDp4sPQ5o2Wqqym1tqY0JET6PDmZ0m7dKE1J0WxcHQzSCott/vuMi4t7QCmNae1x6xZNMDCgEuntBg0fBgZUEhdH41U5jrKHkZFRPaU0pq6uLmbUqFElmzdvzrp69WqSh4dHlVAovFVaWnrLzc2t+s8//0zcv39/RlhYWKF836KiotuU0hgHB4fa3NzcOGXH/+qrr+5379697tGjR7fLy8tj3d3dq6Ojo5NSUlLuEkLo5cuXkymlMffv379jZ2dXm5OTE1dXVxczcODAssjIyIycnJw4W1vbuuTk5LuU0pi8vLzblNKYZcuWPfr666/vUUpjCgsLb7u4uNQIhcJbs2fPzt+5c+c9SmlMdXV1bHl5eayyuGtqamL79u1bkZOTE0cpjdmzZ0/m1KlTiyilMR4eHlUXLlxIoZTGLFy4MM/d3b268XkdPXo0rW/fvhVlZWW3FOOaMmVK0b59+zIVl1FKYyZNmvT48OHD6ZTSGBsbm7qqqqpYeeyU0piRI0eWXrp0KZlSGlNaWnqrrq4uJiUl5a78tRW/V3ydls7jhRdeKJ81a1aBsp/L9u3b7y9YsCBP/nzVqlW5NjY2dZ6enlWenp5VJ06cSGvt2IsXL35EKY1Zv369wMbGpu7Bgwd3qqqqYnv06FH36NGj24rvgfxnL18u/52JjY1NGDlyZGlNTU0spTRm1qxZBTt27Lh/9erVpMDAQKE8Pvn7RCmNCQsLK9y/f39G43OS/T0p/Vtr8RMApbSeECIhhFhQSjs8k1NXJBAA9vYNJ1HLJ1YHBrLr6FrryBHg8eN/q7HxeEBREauV247kk+OUjJ4CAEQi4PXX0Ss2FqlPk4emtraWw+Px+AAwcODA8pUrVxZt2bLF5qWXXio1NzeXAMD/+3//r+TKlStmwcHBwsZlTFV5jaFDh5bZ2dnVy48VFRVlGhYWVqpYqvTPP/80GTRoULmDg4MYAMLCwoqjo6NNdXR0aEBAQDmPx6sDAHkZ2KioKPNLly51Cw8Pt5OdB8nIyNAPDAys3Lp1q312drb+9OnTS3x9fWuVlV+9efOmYXp6utGoUaO40vdZAhsbG1FRUZFOeXm5zoQJEyoAYN68eY9///13i8bn9Ouvv5q/+uqrTwrfKCtP+/PPP5t98cUXdjU1NZzS0lJdPp9fDUDo6elZHRIS4hocHFw6a9asUgAYNGhQxZo1a5ynTZtWPGPGjBI3NzeVpkHevXvXQNl5yNc3rv4m9+jRIz0bG5sGvfTFixfnr1+/Pl/+vLn3SL4+JCSkFAD69OlT7e7uXu3i4iICAGdn59p79+7p29nZVX/++ee258+f7wYAeXl5eomJiYZ2dnZPci1cvHjRLCEhwbhPnz5eAFBTU8Pp0aOHOCwsrPThw4cGc+bMcX755ZeFISEhTzJl2djYiHNyctqUmU+Vbn0FgHhCyK8AngRIKV3RlhdipASChiVX5VgbocUoBXbsALy9gZEj/13OfljtKikJBgkJMGluprtEAhIfD9PkZBh4e6PN1bfk19RV2VZexvTHH3+0+OijjxwvX75ctnXr1keK20RGRnbbtGmTAwDs2bPnAQCQRp/I5c+fpQQopRQnTpzI6NOnT4N
z7t+/f82wYcMqT506ZTFx4kSPHTt2ZAUHB5c3jnvatGml7u7u1XFxcQ0yYxQVFbXLL3BVVRVZvXq1y40bN5Lc3d1Fb731lkNNTQ0HAK5cuZL+888/m/30008WW7dutU9NTU3ctGlT3uTJk4U//fSTxbBhw3jnz59PV+X9oZQSZech11y1PSMjI4lQKGytA9viseVpgTkcDgwMDJ4kd+FwOBCLxeTcuXNm0dHRZjExMSlmZmaSgIAAz8blZyml5JVXXnn89ddfN7mGn5CQkHTq1Cnzb775xubo0aNWx48ffwAANTU1RFmBm5ao8nn3JICPAFwFEKvwYJ5Cc406wNoIrfX339LyecuXs2EUNeLzUevjg0oOB0ozYnE4oL6+qPDyanuD3pyRI0dWXLhwoVt5eTmnrKyMc+HCBcuRI0eWKytjCgAmJib18lKks2fPLpVPtnrxxRerAODPP/80z8/P16moqCAXLlzoNnz48CY9/GHDhlXeuHHD7NGjR7pisRjHjx+3GjFiRMWIESMq//nnH7OUlBR9AMjPz9eRxVi2bds2W/l16mvXrhkBQFJSkr6Xl1fthx9+WDB+/PjSuLg4I2Vx+/n51RQXF+tevnzZBJD29GNiYgy7d+9eb2ZmVn/p0iVTAPj++++VloUdP3582aFDh7rLZ/3L45KrqqriAICdnZ1YKBRyzp49awkA9fX1yMzM1H/55ZfLv/7665yKigodoVCok5iYaBAQEFC9cePGPD8/v8qEhARDVX5WzZ1Ha/t5e3vXZGZmtpjr4GmPLVdaWqpjYWFRb2ZmJrl9+7bhnTt3msycDQoKKjt37pylvHxtfn6+Tlpamv6jR4906+vr8frrr5d+9tlnOfHx8U/K42ZmZhr26dOnuvGxWtJqT51SeqAtB2SaR6m0UQ8O1nQkTJtERAAWFsCrr2o6ki6NwwH278eDwEDwa5U023p6wPff40F7poAfOnRo1cyZMx/379/fCwBee+21wiFDhlT/+OOP5o3LmALAnDlzioKCgri2trZ1N27cSGt8PD8/v8rg4GC3vLw8/alTpz5+8cUXq1JTUxsMn7q4uIjWrl2bM3z4cC6llIwZM6b01VdfLQWA8PDwByEhIe4SiQTW1tai69evp2/evDl34cKFPXk8Hl8ikRBnZ+faK1euZBw6dMjq2LFj1rq6utTGxkb06aefPvrzzz9NGsdtaGhIjxw5krlixYqe5eXlOvX19WTJkiX5/v7+Nd99992DBQsW9CKEYMSIEWWNzwcApk6dWnbr1i3jvn37eunp6dExY8YIIyIinvQ2u3fvXj9r1qxCLy8vbxsbG3GfPn0qAUAsFpOZM2e6lpeX61BKyYIFCwq6d+9ev3r1aofr16+bE0Kop6dn9dSpU4UCgaDVGqktnUdL+40fP77ivffec5ZIJOA088vztMeWCw0NFe7Zs8emd+/e3r17966RvweKBgwYUPPhhx/mjB49miuRSKCnp0fDw8MFxsbGkvnz5/eSSCQEANavX58NSD9YPHjwwODFF19sU7rkVtPEEkKGAFgHwAXSDwEEAKWU9m5pv47WGdLEFhQAtrbSkVz5pVlGy+XlSYdWli2Tznp/znVEmtg33oDTwYOwqa39dyTRwACS115D4d69yG5pX00KDw+3jomJMYmMjBRoOhamoblz5zpPmjSptC23xmlaZGRkt9jYWOOvvvoqt/G6Z00T+x2ALwAMBfACAH/Z11YRQoIIIamEkAxCyHvNbDONEJJECEkkhPygsLyeEBIne5xR5fW0XePb2ZhOYPdu6QytpUs1HclzY9s25BoaosF1RENDSLZtQ5N/bgyjivXr1z+qrKzsVLdwi8Vi8tFHH+W3vmVDqkyUE1JKf27rgQkhOgC+BjAWQDaAm4SQM5TSJIVtPAD8H4AhlNISQojifZLVlNK+bX1dbZaVJf3KGvVOoq4O+OYbYMIEabJ+pkOYm0Py5ZcQLFuGXtXV4BgZQbJ9O7LMzaH
VyWJXrFjxGMDjVjdkOpyzs7N41qxZneoOrnnz5pU8zX6qfHK5QgjZQggJJIT0lz9U2C8AQAal9B6ltA7AEQCTGm3zBoCvKaUlAEApLWhT9J2MYjY5phM4eVI6/M6ulXS4OXNQwuWimhDA0xNVs2ejVNMxMUxnoEpPXV5SRPE6GgUwqpX9HAE8VHierXAsOS4AEEKuQZroZh2l9KJsnSEhJAaAGMBmSunpxi9ACFkIYCEA9OwE3V+BADA1Bbo9cxZrpkNERADu7kBQkKYjee7IJ82NfFHM+/573XadHMcwXVmzjTohZCWl9CsAH1FK/1Tj63sAGAHACcBVQogvpbQUgAulNIcQ0hvA74SQeEpppuLOlNI9APYA0olyaoqx3WRlSYfe2V1RncDt28C1a9LJcaxF0Yh+uI0SMhwEfwDoo+lwGKZTaOm/1VzZ1/CnPHYOAGeF506yZYqyAZyhlIoopfcBpEHayINSmiP7eg9AFIBOX1lcIGBD751GRIS0+trrr2s6kueTNL1cL1JRoYPXX+8FVnuVYVTSUqOeTAhJB+BJCLmr8IgnhNxV4dg3AXgQQlwJIfoApgNoPIv9NKS9dBBCukM6HH+PEGJJCDFQWD4EgEqZoLRZS4lnGC3y+DHwww/A7NnsWommHDhgibQ0I0hrrxojMpKVXlWz56n0KiFkQEJCwpOENOvXr+9BCBlw9epV4+aPIi3u0to2ymzatMlm+/bt1m2PvO2a/eWklM4AMAxABoCXFR4TZV9bRCkVA1gO4BKAZADHKKWJhJD1hBB5+pVLAB4TQpIAXAHwNqX0MQAvADGEkDuy5ZsVZ813RtXVQGEha9Q7hW+/BWpqpPemMx1PKORg1aqekKfZrK7m4M03XVBWxkqvdlEdXXrVw8OjOjIy8kkGvdOnT1u5u7urlGjmafznP/95vHv3btvWt3x2Lf5yUkrzKKV9KKVZjR+qHJxSeoFSyqWUulFKN8qWfUwpPSP7nlJK36KU8imlvpTSI7Ll12XP+8i+fvesJ6pp7B71TqK+Hti5U5rj3cdH09E8n9ascYAsd/gTNTUcrF7NSq+y0qvtUnr1pZdeKr1w4UI3AEhMTDQwMzMTW1paPlk/a9asnj4+Pl7u7u7eq1atUvp7d/LkSfO+ffvy+Hy+14QJE3rL0wcvXbrUUX7OCxcudAKkeemdnJxqr1y50uZeflu1rU4r89TY7WydxNmz0h8Wyx6nGbdvG+LgwR6orW04nbS2loODB3tg+fJCNCps0lYikQiXLl0yHzduXNkff/xh/MMPP1jHxsYmU2kJZ6/Ro0eXp6enG9jZ2YmioqIyAODx48c61tbW9bt27bKNjo5Oa6429927d03i4+MTTU1NJf369eNPmjRJaGtrKxYIBAbffffd/dGjRz948OCB3rp16xxjY2OTbWxsxMOGDeMePHiw2+jRoyuWL1/eKyoqKoXH49XJc6y///779iNHjiw7fvz4g6KiIh1/f3+v4ODgsh07dtgsXbo0f8mSJcU1NTVELBbjxIkTFo3jrq2tJStWrOh5/vz5DAcHB/HevXst16xZ43j8+PEH8+fP7/XVV18JJkyYULFo0SInZed07Ngx8wsXLnSLjY1NMTMzkzTO/Q4Ab7/9doG84M3kyZNdjxw5YjFz5kxheHi4XVZWVryRkRGVF5DZtm2bXXh4eNa4ceMqhUIhx9jYWFJQ8O/dzGfPns2YOHGih7z4zt69e7sD0rSpzZ0HANTV1ZGEhITkxrFduXLF1M/Pr0pxmbm5eb2Dg0PdzZs3DU+cONFt6tSpJQcPHnxSA/2LL77IsbW1rReLxRg8eLDnjRs3jAYOHPgkB/ujR490N23aZH/16tU0c3NzyQcffGD36aef2q5Zs6bgwoULlvfu3UvgcDgNiub079+/MioqymzkyJENYmlvXfLakDZiPfVOIiICcHZmCfo1QTY5Di3XXn3qSXPy0qu+vr58JyenupUrVxZFRUWZykuvWlh
YSOSlV/v371/9xx9/mC9ZssTx4sWLptbW1iqNmctLr5qamlJ56VUAaK70qp6e3pPSq1FRUSbNlV798ssv7Xk8Hn/o0KGeiqVXt23bZv/BBx/Ypaen65uamlJlcSuWLOXxePwtW7bY5+bm6ikrvarsnFQtvern58fjcrn869evmyUkJBgBgLz06s6dO6309PQo8G/p1Q0bNvQoKirS0dNrNe07gIalVxXPQ76+LaVXAWDatGnFBw8etDp//rzlrFmzGiR6OXDggBWfz/fi8/n89PR0wzt37jQo7hIVFWWSmZlpGBAQwOPxePwjR45YCwQCfWtr63oDAwNJWFhYrwMHDnQzNTV98svao0cPsWK86sJ66h0kK0t6Z5RDuw0gMu0uKQn47Tfgs88AXfan0eGSkgyQkGDSbKMtkRDEx5siOdkA3t6s9CorvdpAW0uvhoWFCT/++GMnX1/fKisrqyf7pqSk6EdERNjKRlLqQ0NDe9U0uhxEKcXQoUPLzp49e7/xcePi4pLPnDljfuLECctdu3b1+Pvvv9MAaf30tpZRfRrN9tQJIWcJIWeae6g7sK5GIAAcHaWVphgt9fXXgIEBsGCBpiN5PvH5tfDxqQSHozznBIdD4etbAS8vVnoVrPTqs5ZeNTMzk6xbty77o48+avBhraSkRMfIyEhiZWVV//DhQ92oqCiLxvuOGDGiMiYmxlQ+g76srIxz9+5dA6FQyCkuLtYJCwsTfvPNNw9TUlKeXENPS0sz8PHxaVMZ1afRUndkq+zrFAB2AA7Jns8A0OYk8887djublhMKgQMHgBkzgO7dW9+eaX/SNHIPEBjIR/O1Vx+0ZzIgVnr1+S69unDhwib51QMDA6t9fHyq3NzcfOzt7esGDBjQ5IOZg4ODePfu3Q+mT5/eu66ujgDA2rVrcywsLCQTJ050r5XNCfn000+fZFW9efOm6eeff672okSqlF6NaVxqUdkyTdP20qtubsDAgdLbnxkt9NVXwJtvAjExwIABmo5Ga3VE6VW88YYTDh60QW3tv/+BDQwkeO21Quzdy0qvMm2m6dKr165dM9qyZYvd6dOnmwzXP41nLb1qIkvVCgAghLgCMGmPwJ4XEgnw8CGb+a61JBLp0HtgIGvQtcG2bbkwNGx47dHQUIJt21jpVeapaLr0akFBgd7nn3/eOKOqWqgyG2gVgChCyD0ABIALgEVqjaqLyc+XTtxlw+9a6pdfgPR0YN06TUfCAIC5uQRffinAsmW9UF3NgZGRBNu3Z8HcXKtzxbLSq9pL06VXQ0JClF7aUIdWP7nIqqZ5AFgJYAUAT0rpJXUH1pWw29m0XEQEYGcHTJ2q6Ui6MolEIlG9lNGcOSXgcqshrb1ahdmzWelVhgEg+ztq9gNuq406IcQYwNsAllNK7wDoSQiZ2H4hdn1Zsvx7rFHXQpmZwIULwKJFgL5+69szTyuhsLDQQuWGXT5pztS0vr0nxzFMZyWRSEhhYaEFgITmtlFl+H0/gFgAgbLnOQCOAzj3zBE+J1g2OS22cyegowMsXKjpSLo0sVi8IC8v79u8vDwfqJr0isMBoqKyAZjhzp1nKujBMF2EBECCWCxu9r5bVRp1N0ppGCFkBgBQSqtI4wwLTIsEAsDCAjA313QkTAOVlcC+fdJhd5YVSK0GDBhQAICl6WMYNVPlE3MdIcQIAAUAQogbgHZL/vA8yMpiQ+9a6dAhoLQUWL5c05EwDMO0C1V66usAXATgTAg5DGlt89fVGFOXIxCwoXetQ6l0gly/fsDgwZqOhmEYpl202qhTSn8hhMQCGATpLW0rKaWqJZFgAEgbddZuaJnoaCAhAfjuO4BdTWIYpotQZfb7bwAGUkrPU0rPUUqLCCF7OiC2LqGiAiguZsPvWiciArC2lqaFZRiG6SJUuabuCuBdQshahWValSJWm7GZ71ro4UPg9Glp4RYjI01HwzAM025UadRLAYwGYCur3NakYg3TPJZ4Rgt98430mvq
SJZqOhGEYpl2p0qgTSqmYUroUwI8A/gTQQ5WDE0KCCCGphJAMQsh7zWwzjXt1Np4AABokSURBVBCSRAhJJIT8oLB8DiEkXfaYo8rraSPWqGuZmhpgzx4gOJgNnzAM0+WoMvv9G/k3lNLvCSHxAJa1thMhRAfA1wDGAsgGcJMQcoZSmqSwjQeA/wMwhFJaQgjpIVtuBWAtpMP8FECsbN8mZfK0XVYWoKsL2NtrOhIGAHD0KFBUxG5jYximS2q2p04IkadKOU4IsZI/ANwHsEaFYwcAyKCU3qOU1gE4AmBSo23eAPC1vLGmlBbIlo8H8CultFi27lcAQSqflRYRCAAnJ2nSMkbDKAV27AC8vIBRozQdDcMwTLtrqaf+A4CJkKaIpZDeziZHAfRWtpMCRwAPFZ5nAxjYaBsuABBCrgHQAbBOVkBG2b6OjV+AELIQwEIA6Kml49sCARt61xo3bgCxsdIyq+w2NoZhuqBmG3VK6UTZV1c1v74HgBEAnABcJYT4qrozpXQPgD0A4O/vT9UR4LPKygKGDdN0FAwA6W1s5ubA7NmajoRhGEYtmm3UCSH9W9qRUnqrlWPnAHBWeO4kW6YoG8ANSqkIwH1CSBqkjXwOpA294r5Rrbye1qmvB7Kz2XwsrZCXBxw7BixdCpiaajoahmEYtWhp+H1bC+sogNYuSt4E4EEIcYW0kZ4OYGajbU4DmAFgPyGkO6TD8fcAZALYRAixlG03DtIJdZ3Ko0fShp0Nv2uBvXsBkUjaqDMMw3RRLQ2/j3yWA1NKxYSQ5QAuQXq9fB+lNJEQsh5ADKX0jGzdOEJIEoB6AG9TSh8DACHkU0g/GADAekpp8bPEownsdjYtIRJJ700PCgK4XE1HwzAMozaq3NIGQogPAD4AQ/kySmlka/tRSi8AuNBo2ccK31MAb8kejffdB2CfKvFpq6ws6VfWqGvYyZNAbq70/nSGYZgurNVGXZYedgSkjfoFABMgTUDTaqP+vGM9dS0REQH07g1MmKDpSBiGYdRKlYxyUyFNE5tHKZ0LoA8AlipWBQIBYGXF5mVpVFwc8OefwLJlAEeVX3eGYZjOS5X/ctWUUgkAsSwhTQEazmpnmpGVxXrpGhcRARgbA/PmaToShmEYtVPlmnoMIaQbgL2QJqKpAPCXWqPqIgQC6agvoyGPHwOHDwNz5gDdumk6GoZhGLVrsVEnhBAAn1FKSwF8Qwi5CMCcUnq3Q6Lr5AQCYMQITUfxHNu3T1rAZVmrpQoYhmG6hBYbdUopJYRcAOAre/6gI4LqCoRC6YMNv2tIfT2wc6f0U5WvykkKGYZhOjVVrqnfIoS8oPZIuhj5zHeWTU5Dzp0DHjxg1dgYhnmuqHJNfSCAWYSQLACVkBZ2oZRSP7VG1smx29k0LCJCWh5vUuPCgAzDMF2XKo36eLVH0QWxRl2DkpOBy5eBjRulxewZhmGeE60Ov1NKsyC9hW2U7PsqVfZ73mVlAfr6gK2tpiN5Dn39NWBgALzxhqYjYRiG6VCtNs6yjHLv4t+CKnoADqkzqK5AIACcnVm+kw5XVgYcOABMnw7Y2Gg6GoZhmA6lSpMTAiAY0uvpoJTmAjBTZ1BdgUDAht414sABoKKCTZBjGOa5pEqjXicrvEIBgBBiot6QugaWTU4DJBLpBLlBgwB/f01HwzAM0+FUmUV0jBCyG0A3QsgbAOZBml2OaYZIJC0Kxm5n62C//gqkpQGH2NUhhmGeT6026pTSrYSQsQDKAHgC+JhS+qvaI+vEcnOlnUbWU+9gERHSmYmvvKLpSBiGYTRCpft9ZI04a8hVxOqoa8C9e8D588CHH0pvO2AYhnkOqTL7fQohJJ0QIiSElBFCygkhZR0RXGfFsslpwM6dgI4OsHixpiNhGIbRGFV66v8F8DKlNFndwXQV8kbdmRWo7RiVlcB33wGhoYCDg6ajYRiG0RhVZr/nP22DTggJIoSkEkIyCCHvKVn/OiGkkBASJ3ssUFhXr7D
8zNO8vqYIBNJbpI2MNB3Jc+KHH4DSUnYbG8Mwzz1V66kfBXAaQK18IaX0ZEs7EUJ0AHwNYCyAbAA3CSFnKKVJjTY9SilV9t+4mlLaV4X4tA67na0DUQrs2AH07QsMGaLpaBiGYTRKlUbdHNLUsOMUllEALTbqAAIAZFBK7wEAIeQIgEkAGjfqXY5AAPB4mo7iOXH1KhAfD3z7LUCIpqNhGIbRKFVuaZv7lMd2BPBQ4Xk2pBXfGgslhLwIIA3AKkqpfB9DQkgMADGAzZTS0413JIQsBLAQAHpqSdeYUmmjPm5c69sy7SAiArCyAmbO1HQkDMMwGqfK7HcuIeQ3QkiC7LkfIeTDdnr9swB6ycq4/grggMI6F0qpP4CZALYTQtwa70wp3UMp9aeU+ttoSZ7vkhJpllIt+YzRtT18CJw6BcyfzyYwMAzDQLWJcnshLeYiAgD6/9u7/2iryjqP4+8PKJCCgwaGgs3FQs1KTS/YL5nWSk1LwdLK0hUsLGIp1RonZ2rV1KR/TL+Ws5oLEmBkZqn5YymOOGpNpjkJXFMxTSZU7gW0QH6JCsiF7/yx92WdbvfHPvecffc5535ea511795n732/z4V1v+fZ+3m+T8Qq4MIM520gWd2t04R0334RsTkiOp/TXwucUvLehvTrc8ADwLsy/MzCeTrbAFq4MKnyc+mlRUdiZlYTsiT1gyJiRZd9HRnOWwlMkjRR0jCSDwJ/NYpd0hElm9OAP6b7D5U0PP1+DPA+6uRZvNdRHyC7dsGiRXDuudDUVHQ0ZmY1IctAuZfSW9+dC7pcALzY10kR0SFpLnAvMBRYEhFPSboSaI2IpcAXJU0j+ZCwBZiZnv42YKGkfSQfPL7dzaj5muRqcgPklltg0yb4wheKjsTMrGYoWYCtlwOko4FFwHuBrcDzwMURsTb36MrQ3Nwcra2tRYfBFVckY7dee82DsXM1ZQrs2AFPP+1f9ACS9Gg61sXMalCW0e/PAaenS64OiYgd+YdVvzrXUXeeydHy5bByZfLpyb9oM7P9+kzqki7vsg2wHXg0Ih7PKa661ZnULUfz5sGoUfCZzxQdiZlZTckyUK4ZmEMy73w88HngLGCxpH/OMba65GpyOfvLX+Dmm2HmzCSxm5nZflkGyk0ATo6IVwAkfRO4G5gKPEqy4IsBu3fDiy96OluuFi+GPXvgssuKjsTMrOZk6akfTknNd5L56m+KiJ1d9g96G9JZ+O6p52TPHliwICnXd+yxRUdjZlZzsvTUfwYsl3Rnun0u8PN04FxdTDMbKJ7OlrM77oAXXkiKzpiZ2d/IMvr9Kkn3kBSAAZgTEZ1zxy7KLbI65GpyOWtpgaOPhrPPLjoSM7OalKWnTprEi58EXuM6k/qECcXG0ZCeeAIeegi+/30YOrToaMzMalKWZ+qWUVsbjBsHw4cXHUkDmjcvWbRl1qyiIzEzq1lO6lXU3u5b77nYsgV+9jO4+GI49NCiozEzq1lO6lXkwjM5WbIEdu6EuXOLjsTMrKY5qVdJhJN6LvbuhfnzYepUOOGEoqMxM6tpTupV8tJLSWfSSb3Kli2DtWu9GpuZWQZO6lXi6Ww5aWlJphOcd17RkZiZ1Twn9SrpTOruqVfRM8/A/ffDnDlwQKbZl2Zmg5qTepW4mlwO5s+HYcPgc58rOhIzs7rgpF4l7e1w8MFw2GFFR9IgXn4ZrrsOPvlJOPzwoqMxM6sLTupV0jnyPVlu3ip2/fXwyiseIGdmVoZck7qksyStlrRG0le6eX+mpE2SHk9fny15b4akP6WvGXnGWQ1eR72K9u1LKshNmQKTJxcdjZlZ3cgtqUsaCswHzgaOBz4l6fhuDr05Ik5KX9em5x4GfBM4FZgCfFNSTZcSczW5KvrlL2H1anjPe+CRR5IiAGZm1qc8e+pTgDUR8VxEvA7cBEzPeO6HgPsjYktEbAXuB87KKc6K7dwJGze6p14Vy5bB9OnJc4wlS+CMM5Jf7LJlRUdmZlb
z8kzq44F1Jdvr031dnS9plaRbJR1VzrmSZktqldS6adOmasVdtvXrk69O6hVatgw+9jHYtSvpne/YkTxXX78eLrjAid3MrA9FD5S7C2iKiBNIeuM/KefkiFgUEc0R0Tx27NhcAszC09mqIAJmz4bdu7t/f+dO+PznfSvezKwXeSb1DcBRJdsT0n37RcTmiOj8K34tcErWc2uJq8lVwfLlsG1b78ds2wYrVgxMPGZmdSjPpL4SmCRpoqRhwIXA0tIDJB1RsjkN+GP6/b3AmZIOTQfInZnuq0nt7ckj4PHdPVywbF58MVm8pTdDhsALLwxMPGZmdSi32psR0SFpLkkyHgosiYinJF0JtEbEUuCLkqYBHcAWYGZ67hZJV5F8MAC4MiK25BVrpdra4Mgj4cADi46kjo0bB6+/3vsx+/Ylv2gzM+tWrgW1I2IZsKzLvm+UfP9V4Ks9nLsEWJJnfNXi6WxVsGdPkrR7M3p0MnfdzMy6VfRAuYbgddSrYN48GDkSRozo/v03vAEWLnTJPjOzXjipV2jfPli3zkm9IuvXw+23J6ux3XZbstTqyJFwyCHJ1wkT4NZb4cMfLjpSM7Oa5vUsK7RxYzILy0m9AgsXJp+OLr0UJk5Mbn2sWJEMijvyyOSWu3voZmZ9clKvkKezVWj3bli0CM45J0nokCTwU08tNi4zszrk2+8V6kzq7qn30y23JLc7vBqbmVnFnNQr5GpyFWppgWOPhQ9+sOhIzMzqnpN6hdrbk/Fco0cXHUkdWrEiec2dmxSWMTOzivgvaYU8na0C8+bBqFEwY0bRkZiZNQQn9Qq1tTmp98vGjXDzzUlCHzWq6GjMzBqCk3qFXE2unxYvTsrCXnZZ0ZGYmTUMJ/UKvPoqbN7snnrZ9uyBBQvgjDPguOOKjsbMrGF4nnoF1q1Lvjqpl+nOO2HDhiSxm5lZ1binXgFPZ+unlhZoanLZVzOzKnNSr4CryfXDqlXw4IPJs/ShQ4uOxsysoTipV6C9PclLRxxRdCR1ZN68ZMW1WbOKjsTMrOE4qVegrQ3Gj4cDPDIhmy1b4IYb4KKL4LDDio7GzKzhOKlXwNPZyvTjH8POnUkFOTMzqzon9Qq4mlwZ9u6F+fPhtNPgxBOLjsbMrCHlmtQlnSVptaQ1kr7Sy3HnSwpJzel2k6Sdkh5PXz/MM87+2Ls3mdLmpJ7RPffA8897NTYzsxzl9jRY0lBgPnAGsB5YKWlpRDzd5bhRwJeA5V0u8WxEnJRXfJX685+ho8O33zNraUkGIJx3XtGRmJk1rDx76lOANRHxXES8DtwETO/muKuA7wC7coyl6ryOehlWr4b77oM5c+DAA4uOxsysYeWZ1McD60q216f79pN0MnBURNzdzfkTJT0m6TeSTssxzn5xUi/D/PkwbBjMnl10JGZmDa2wyViShgBXAzO7eftF4M0RsVnSKcAdkt4eES93ucZsYDbAmwc4u7qaXEY7dsB118EnPgGHH150NGZmDS3PnvoG4KiS7Qnpvk6jgHcAD0haC7wbWCqpOSJ2R8RmgIh4FHgWOKbrD4iIRRHRHBHNY8eOzakZ3Wtvh0MP9aqhfbr++iSxe4CcmVnu8kzqK4FJkiZKGgZcCCztfDMitkfEmIhoiogm4BFgWkS0ShqbDrRD0tHAJOC5HGMtm6ezZRCRVJCbPBmmTCk6GjOzhpfb7feI6JA0F7gXGAosiYinJF0JtEbE0l5OnwpcKWkPsA+YExFb8oq1P9raPPK9T7/6FTzzTNJbNzOz3OX6TD0ilgHLuuz7Rg/HfqDk+9uA2/KMrVLt7TB1atFR1LiWFhg7NnmebmZmuXNFuX54+WXYts2333u1di3cdVcy4n348KKjMTMbFJzU+8HT2TK45hoYMiSZm25mZgPCSb0fvI56H157Da69Fj76UZgwoehozMwGDSf1fnBPvQ833ghbt3oam5nZAHNS74f29qTa6bhxRUdSgyKSAXLvfGeyIpuZmQ2Ywir
K1bO2tuSu8hB/JPpbDz8MTzwBixaBVHQ0ZmaDitNSP7S3+3l6j1paYPRo+PSni47EzGzQcVLvB1eT68GGDXD77XDJJXDwwUVHY2Y26Dipl6mjI8ldTurdWLgQ9u6FSy8tOhIzs0HJSb1ML7yQ5C3ffu9i9+4kqX/kI3D00UVHY2Y2KDmpl8nT2Xpw662wcaOnsZmZFchJvUxeR70HLS1wzDFw+ulFR2JmNmg5qZfJPfVurFwJy5fD3Lme52dmViD/BS5TezuMGQMHHVR0JDVk3jwYORJmzCg6EjOzQc1JvUyeztbFpk1w001JQj/kkKKjMTMb1JzUy9TW5qT+VxYvhtdfT269m5lZoZzUyxCRJHVPZ0t1dMCCBcnguOOOKzoaM7NBz7Xfy7B9O7zyinvq+915J6xfD/PnFx2JmZnhnnpZPJ2ti5YWaGpKCs6YmVnhck3qks6StFrSGklf6eW48yWFpOaSfV9Nz1st6UN5xplV53Q2334HnnwSfvObpCTs0KFFR2NmZuR4+13SUGA+cAawHlgpaWlEPN3luFHAl4DlJfuOBy4E3g4cCfxS0jERsTeveLPwHPUS8+bBiBEwa1bRkZiZWSrPnvoUYE1EPBcRrwM3AdO7Oe4q4DvArpJ904GbImJ3RDwPrEmvV6j2dhg+HMaOLTqSgm3dCjfcABddBG98Y9HRmJlZKs+BcuOBdSXb64FTSw+QdDJwVETcLemKLuc+0uXc8V1/gKTZwOx0c7ekP1Qj8L4UcLd5DPDSgP/UvvzoR8mr/2qzXdXRqG07tugAzKxnhY1+lzQEuBqY2d9rRMQiYFF6vdaIaO7jlLrUqG1r1HZB47ZNUmvRMZhZz/JM6huAo0q2J6T7Oo0C3gE8IAlgHLBU0rQM55qZmVkXeT5TXwlMkjRR0jCSgW9LO9+MiO0RMSYimiKiieR2+7SIaE2Pu1DScEkTgUnAihxjNTMzq3u59dQjokPSXOBeYCiwJCKeknQl0BoRS3s59ylJvwCeBjqAyzKMfF9UrdhrUKO2rVHbBY3btkZtl1lDUEQUHYOZmZlVgSvKmZmZNQgndTMzswZRd0m9r9Kz6eC6m9P3l0tqGvgoy5ehXVMl/V5Sh6QLioixvzK07XJJT0taJelXkuqiEG+Gds2R9KSkxyX9Nq2UWBcqKfFsZsWpq6ReUnr2bOB44FPd/KG8BNgaEW8F/oOkWl1Ny9iudpI5/T8f2Ogqk7FtjwHNEXECcCvw3YGNsnwZ2/XziHhnRJxE0qarBzjMfsnYtm5LPJtZseoqqZOt9Ox04Cfp97cCH1Q6Eb6G9dmuiFgbEauAfUUEWIEsbft1RLyWbj5CUpeg1mVp18slmwcD9TIqtZISz2ZWoHpL6t2Vnu1aPnb/MRHRAWwHar1AeZZ21aty23YJcE+uEVVHpnZJukzSsyQ99S8OUGyV6rNtpSWeBzIwM+tdvSV1a2CSLgaage8VHUu1RMT8iHgL8C/A14uOpxpKSjz/U9GxmNlfq7eknqV87P5jJB0A/B2weUCi679GLoubqW2STge+RlJVcPcAxVaJcv/NbgLOyzWi6imnxPNa4N0kJZ49WM6sYPWW1HstPZtaCsxIv78A+J+o/Qo7WdpVr/psm6R3AQtJEvrGAmLsjyztmlSy+RHgTwMYXyUqKfFsZgWqq6SePiPvLD37R+AXnaVn04VgAH4EvFHSGuByoMfpOLUiS7skTZa0Hvg4sFDSU8VFnF3Gf7PvASOBW9LpXzX/gSZju+ZKekrS4yT/F2f0cLmakrFtZlaDXCbWzMysQdRVT93MzMx65qRuZmbWIJzUzczMGoSTupmZWYNwUjczM2sQTuo24CQ90FmoRNIySaMrvN4HJP1XD+/dmK7+9o+V/Awzs3pwQNEBWONJF9BRRPS5+ExEfDjHOMYBk9MV+7Kec0A6T9vMrO64pz5ISPrXdH3s36a91y+n+98i6b8lPSrpIUnHpfuvk/Sfkv5X0nOla7hLukL
SyrQH/K10X1N6/euBPwBHSVogqTUtwPKtHuJaK2lMuvb44+nreUm/Tt8/U9LvlKwlf4ukken+syQ9I+n3wMd6aPZ9wPj0mqeldwh+kG7/QdKU9Fr/Jumnkh4GflqN37eZWRGc1AcBSZOB84ETSdbILq3RvQj4QkScAnwZuKbkvSOA9wPnAN9Or3UmMIlkec6TgFMkTU2PnwRcExFvj4g24GsR0QycAPyDpBN6ijEifpiuOz6ZZFWwqyWNIVkE5fSIOBloBS6XNAJYDJwLnAKM6+Gy04BnI+KkiHgo3XdQ+nMuBZaUHHt8+nM+1VOMZma1zrffB4f3AXdGxC5gl6S7ANJe73tJyrN2Hju85Lw70lvoT0t6U7rvzPT1WLo9kiSZtwNtEfFIyfmfkDSb5P/ZESSJc1Ufsf6ApF7/XZLOSc95OI1vGPA74Djg+Yj4U9qOG4DZGX8XNwJExIOSDil5nr80InZmvIaZWU1yUh/chgDb0p5rd0pXS1PJ13+PiIWlB0pqAl4t2Z5I0vOfHBFbJV0HjOgtGEkzgb8nqTve+bPu79p7ltRTvFl0rYvcuf1q1wPNzOqNb78PDg8D50oakfbOzwGIiJeB5yV9HJIBbpJO7ONa9wKzSp5tj5d0eDfHHUKSKLenvfyze7uopM7b/xeXDLB7BHifpLemxxws6RjgGaBJ0lvS48q5Zf7J9FrvB7ZHxPYyzjUzq2nuqQ8CEbEyXflsFfAX4EmgM5ldBCyQ9HXgQJJ1v5/o5Vr3SXob8Lv0lvgrwMXA3i7HPSHpMZIEvI7kg0Vv5gKHAb9Or9saEZ9Ne+83Sup8LPD1iPi/9Lb+3ZJeAx4iWeM7i11pXAcCszKeY2ZWF7xK2yAhaWREvCLpIOBBYHZE/L7ouAaSpAeAL3vdbzNrVO6pDx6LJB1P8lz7J4MtoZuZDQbuqZuZmTUID5QzMzNrEE7qZmZmDcJJ3czMrEE4qZuZmTUIJ3UzM7MG8f+4tntQUvTdwQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfUAAAEKCAYAAAALjMzdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOydd1gU1/7/32fpXTpSBAWWZSk2ghI1dsX8DBZUVG5sMbZ4NUZTboomxniTqxhDjFETG2q+9hi7NyaCSbwxgiJSlqIU6SCydNhlz++P3SULLLBI2QXP63nmWWbmzMxnhoXPnPZ+E0opGAwGg8Fg9Hw46g6AwWAwGAxG58CSOoPBYDAYvQSW1BkMBoPB6CWwpM5gMBgMRi+BJXUGg8FgMHoJLKkzGAwGg9FL6NKkTggJJIQkE0LSCCHvKdn/JSEkVrakEEJKFfYtJISkypaFXRkng8FgMBi9AdJV89QJIVoAUgBMBJAN4A6AeZTSxBbK/xPAYErpEkKIBYBoAH4AKIAYAEMppU+7JFgGg8FgMHoBXVlT9weQRil9RCmtA3AcwLRWys8D8H+ynycD+JlSWiJL5D8DCOzCWBkMBoPB6PFod+G5HQA8VljPBjBMWUFCiDOA/gB+beVYByXHLQOwDACMjIyG8ni8jketJvLygNxcYPBggMNGOjA0lJiYmGJKqbW642AwGMrpyqTeHuYCOE0prW/PQZTSfQD2AYCfnx+Njo7uiti6hddeAy5dAu7eVXckDEbLEEIy1R0Dg8Foma6sE+YAcFJYd5RtU8Zc/N303t5jewXp6UD//uqOgsFgMBg9ma5M6ncAuBNC+hNCdCFN3OebFiKE8ACYA/ifwuZrACYRQswJIeYAJsm29VpYUmcwGAxGR+mypE4pFQNYDWkyTgJwklKaQAjZTAgJUig6F8BxqjAMn1JaAuBTSF8M7gDYLNvWKxGLgcePWVJnMBgMRsfo0j51SullAJebbNvYZP3jFo49AOBAlwWnQTx+DNTXs6TO6L3ExMTYaGtrfw/AG0z0isF4ViQA4sVi8dKhQ4cWKiugKQPlnmvS06WfLKn3cqKjge+/Bz79FLB+vgaQa2trf29nZ+dpbW39lMPhdI04BoPRy5FIJKSoqIifn5//PYAgZWXYG7MGkJEh/WRJvZezbRtw/Digr6/uSNSBt7W1dRlL6AzGs8PhcKi1tbUQ0hYv5WW6MR5GC6SnS+emOzm1XZbRQ0lPB06fBpYvB0xM1B2NOuCwhM5gdBzZ31GLuZsldQ0gPR1wdAR0dNQdCaPL2LkT0NIC1qxRdyQMBqMXw5K6BsCms/VySkqkfenz5wMOzYQRGQwGo9NgSV0DYEm9l7NnD1BVBaxfr+5InnuOHDnShxAy9N69ew0DG5KTk3Xd3d29AODixYsmY8eOdevodYKDg10OHjxoDgAhISHOMTEx+gBgaGg4uCPnvXjxosnPP/9s1N7jHBwcfPLy8lQaGB0eHm65YMGCfu2PTjmjR492Ky4u1gKALVu22AwYMMArKCio/7Fjx8zef/99u866jhyJRILhw4dzS0pKOACgpaU1lMfj8eVLcnKybmdfU86zPrvc3FztUaNGuXdGDGz0u5qprpbqvrOk3kuprQXCw4HAQMDHR93RPPccP37cYsiQIRUREREWgwcPzu2Oa544caJd0roikQg6LfTF/frrrybGxsb1EydOrOyU4LqBqKioNPnP+/fvt75+/XqKq6urSLZJqOp5Wnsuipw8edLMy8ur2sLCQgIAenp6EoFAoNQdVFOwt7cX29raiv773/8aTZo0qUO/W5bU1Uym7M+dJfVeytGjQEEBsGGDuiPRGJYsgVN8PAw785ze3qg6cKCRCVQzhEIh586dO8bXr19PDgoKcv/yyy9VTupisRirVq1yvHHjhhkhhC5cuLD4gw8+KNywYUPfq1ev9qmtreX4+flVHD
t2LJPTxJHJ39/fY/v27Y9feumlKgB47bXXnKKiokytra1FZ86ceWRvby/29/f38Pb2rvrrr7+Mg4ODSzw8PGo+//zzviKRiGNubi4+ceLEo6qqKk5ERIQ1h8OhJ0+etNy5c2eWr69vzeLFi51zcnJ0AWDHjh1ZkyZNqszPz9cKDg4eUFBQoDt06NCKliy2T58+bbpx40aH+vp6YmFhIf7f//6Xorj/hx9+MGsah5OTk/jSpUvG69ev7wcAhBDcunVLUFZWphUcHDygoqJCq76+nnz99deZgYGBFQ4ODj7R0dFJ69evt8/OztabMmWKe2hoaLG5uXl9dHS0UURERFZubq62svt466237B89eqSXlZWl5+DgULtp06a8xYsX9xeJREQikeDMmTMPfXx8ahVjPnbsmMXy5cuL2/p9vvHGG45//PGHSV1dHXn99dcL33777eKLFy+afPLJJ/ampqbi5ORkw6CgoBIfH5/q3bt329bW1pIff/zxoZeXV21Lz0XxGi3dk7JnZ25uLpk+fXppRESEZUeTOmt+VzNsOlsvRiIBwsKAQYOAcePUHc1zzw8//NBnzJgxQl9f31pzc3Pxb7/9pvKLRVhYmHVWVpZuYmJiQkpKSuLSpUufAMDbb79dGB8fn5SamppQXV3NOX78uFlr56murub4+flVpqWlJYwYMaL8vffes5fvq6urI/Hx8UmffPJJwcSJEytiY2MFSUlJibNmzSrZvHmznYeHR92CBQuKVqxYUSAQCBIDAwMrli9f7vTWW28VxMfHJ/34448PV6xY4QIA7733nn1AQEBFWlpawowZM0rz8vKaNTnn5uZqr1692uXs2bMPk5OTE8+dO/ewaRllccieh114eHimQCBI/PPPPwXGxsaSAwcOWIwfP14oEAgSk5KSEoYNG1bV5Pln2djYiKKiolI2bdrUSDilpfsAgNTUVP2bN28mX7hwIf3rr7+2XrVqVYFAIEiMi4tL6t+/f13TmGNiYoxHjBjRkBhra2s58qb3iRMnugLAzp07rczMzOrj4+OT7t+/n3T48GFrgUCgCwACgcDgwIEDWampqfGnT5+2TElJ0X/w4EHSq6++WhwWFmbT2nNR5Z6UPTsAGDFiROVff/1lrORr0y5YTV3NyIVnXFzUGgajK7hyBUhKAo4dAwhRdzQaQ1s16q7i5MmTFmvWrCkEgODg4JIjR45YjBo1qqqt4wDg119/NV2xYkWRvPnX1ta2HgCuXLlismPHDruamhpOaWmpNp/Pr0YrTcocDgdLly4tAYAlS5Y8mTlzZkP//bx58xqksNPT03WnT5/uWFRUpFNXV8dxcnKqVXa+P/74wzQ1NdVAvl5RUaElFAo5f/75p8nZs2fTAGDu3LnC5cuXN3PAjIyMNPL39y/n8Xh1ivekSEtxDB8+vGLDhg1Oc+bMKZk3b95TV1dXyfDhwyuXL1/uIhKJOLNmzXr64osvVrf+VNu+DwAIDAwsNTY2pgAQEBBQuX379r7Z2dm6c+fOfdq0lg4AQqFQ29zcXCJfV9b8fv36dVOBQGB4/vx5cwAoLy/XSkxM1NfV1aU+Pj6Vzs7OIgDo169f7ZQpU4QAMHDgwOqoqCiT1p6LKvek7NkB0ib4wsLCDvf3s5q6mklPB3R1AXv7tssyehjbtknFB2bPVnckzz0FBQVaf/75p8kbb7zh7ODg4LNr1y67CxcumEskkrYPboGqqiqyfv1657Nnzz5MSUlJ/Mc//lFcU1PTrv+pROFlz8TEpCGY1atX91u1alVhSkpK4q5duzJra2uVnpdSirt37yYJBIJEgUCQWFhYGGdmZvbsN9WEluLYunVr/vfff59ZXV3NGTVqFO/evXv6U6ZMqbh582ayg4ND3ZIlS/rv2rXLUtXrtHYfRkZGDfezYsWKkp9++inNwMBAMnXqVPfz5883E33Q0tKi9fWtu3hTSklYWFiW/Ho5OTkPZs6cWQYAenp6DX0VHA4H+vr6VP5zfX
09ae25qHJPyp4dIP0+6enpdfh3x5K6mklPB5ydpeIzjF7EnTtAVBSwbh0TINAAjhw5Yj5jxoyS3NzcBzk5OQ/y8/PjHB0d665du6ZSc+f48ePL9u7dayUSScd3FRQUaFVVVXEAwM7OTiwUCjkXLlwwb+s8EokE8lHxhw4dsvT39y9XVq68vFyrX79+Ink5+XYTE5P68vJyLfn6yJEjy/7973/byNdv3bplAADDhw8vlx938uRJ07KyMi00YcyYMZV//fWXibzZuaCgoFmZluJISEjQ8/f3r/7ss8/yfX19K+Pj4/VTUlJ0HR0dRevXry9esGBB0d27d1Xu3mjpPpqSmJio6+npWfvhhx8WTp48uTQ2NrZZuf79+9ckJSXptXa9iRMnCr/99lvr2tpaAgBxcXF6ZWVlKv8Xbum5qHJPyp4dAMTHx+tzuVyVWzdagqUSNcOms/VStm8HzMyApUvVHQkDwKlTpyxmzpz5VHHbtGnTnh49etRClePXrVtX5OjoWMfj8bw8PDz4+/fvt7CysqoPDQ0t8vT09Bo7dix34MCBbQ5wMjAwkPz1119G7u7uXjdv3jT597//naes3AcffJA7b948Vy8vL09LS8uGAVjBwcGlly5d6sPj8fhXr1413rdv3+O7d+8acblcvqurq9euXbusAeDzzz/P/eOPP4zd3Ny8zp49a963b99mfc/29vbi8PDwjBkzZrh5eHjwZ8yYMUDVOP7zn//YuLu7e3G5XL6Ojg6dNWuW8Nq1ayaenp5enp6e/DNnzli88847Bao8WwBo6T6acvToUQsul+vF4/H4SUlJBsuXL3/StMykSZOE//3vf1uVbVy3bl0xj8er8fHx8XR3d/d6/fXXnUUikcp9ZC09F1XuSdmzA4Cff/7ZJDAwUOXZAC1BWhoV2dPw8/Oj0dHR6g6j3VhaSltn9+xRdySMTiM9HXBzk454/+ILdUfTqRBCYiilfu097v79+xkDBw5sdUQyg9EZZGZm6sybN8/l1q1bqeqOpT34+fl5XLlyJc3a2rr1vgMA9+/ftxo4cKCLsn2spq5GysqkYmOspt7LYJKwDIbacHZ2Fi1ZsqRYLj7TE8jNzdVeu3ZtgSoJvS3Y6Hc1wqaz9UKYJCyDoXaWLl36tO1SmoO9vb341VdfLe2Mc/WYN5neCJvO1gthkrAMBkONsKSuRuRJndXUewlMEpbBYKiZLk3qhJBAQkgyISSNEPJeC2XmEEISCSEJhJAfFLbXE0JiZcv5roxTXaSnA0ZGgJWVuiNhdApMEpbBYKiZLutTJ4RoAfgGwEQA2QDuEELOU0oTFcq4A/gXgBGU0qeEEBuFU1RTSgd1VXyagHw6GxMb6wUwSVgGg6EBdGVN3R9AGqX0EaW0DsBxANOalHkdwDeU0qcAQCktxHMEm6Pei5BLwr79NntL02CY9Wrb9DbrVULI0GnTpjX8pxWJRDA3Nx/Y1u/5Wb8LNTU1xM/Pz0MuVNTddOXodwegkcZzNoBhTcpwAYAQ8gcALQAfU0qvyvbpE0KiAYgBfE4pPdeFsXY7lEqTOqvU9RKYJGyPgFmvdj/qtl41MDCQJCcnG1RUVBBjY2P6448/mtra2nZZxtXX16ejR48u+/777y1WrlxZ0vYRnYu6B8ppA3AHMAbAPADfEUL6yPY5y0Qu5gPYSQhxbXowIWQZISSaEBJdVFTUXTF3Ck+eAJWVrKbeK2CSsO1jyRIn+Pt7dOqyZIlTW5eVW68ePHgw48cff1RJSU6OWCzGsmXLHOVKYJ999pkNAGzYsKGvt7e3p7u7u9e8efOclWnJ+/v7e9y8ebNBMvW1115zcnNz8woICODm5uZqy8ssWbLEydvb23PLli22P/zwg5mvry/P09OT/+KLL3IfP36snZycrBsREWG9Z88eW7miXG5urvbkyZNdvb29Pb29vT3/+9//GgFAfn6+1ogRI9zd3Ny8QkJCnFuzXuXz+Z4eHh78gIAAbtP9yuIAgE
uXLhnLnc88PT35T58+5WRmZur4+fl58Hg8vru7u9fVq1eNgb9bCebPn99Pbr36ySef2Ci2CLR0H2+99Zb99OnT+w8ZMoQ3c+bM/tHR0fo+Pj6ePB6Pz+Vy+Q8ePGgmB3vs2DGLGTNmNJoeNmHCBOGpU6f6AMD//d//WQQHBzck2xs3bhgOGjSI5+npyR88eDDv/v37zc5ZVlbGmT17touPj4+np6cn/+jRo30AoKV4Zs2aVXr8+PF2fcc6i65M6jkAFP/QHGXbFMkGcJ5SKqKUpgNIgTTJg1KaI/t8BCASQLNmK0rpPkqpH6XUz9paqaqgxsJGvvcimCRsj4BZrzbmebFeBYBXX3215MSJE+ZVVVUkKSnJMCAgoGH/wIEDa+7cuSNISkpK3LRpU84777zj2PSc77//ft+xY8eWPXjwIOm3335L/vDDDx3Lyso4LcXzwgsvVMfFxbW7m6Qz6Mrm9zsA3Akh/SFN5nMhrXUrcg7SGvpBQogVpM3xjwgh5gCqKKW1su0jAPynC2Ptdtgc9V5Cejpw+rR0xLtJq3LTDDkHDjDrVTDrVVXuA+i49SoADBs2rDo7O1vvu+++s5gwYUKj309JSYlWSEhI/4yMDH1CCFWmAR8ZGWl67dq1PuHh4XYAUFtbS9LS0nRbikdbWxs6Ojr06dOnnKaxdDVdVlOnlIoBrAZwDUASgJOU0gRCyGZCSJCs2DUATwghiQBuAHibUvoEgCeAaELIfdn2zxVHzfcGWE29l8AkYXsEzHr12ehN1quBgYGlmzZtclqwYEGjfu53333XYfTo0eWpqakJFy5cSKurq1Nqo3r69Ok0eXx5eXkPhgwZUtNaPCKRiBgaGna7uUqX9qlTSi9TSrmUUldK6WeybRsppedlP1NK6VuUUj6l1IdSely2/ZZsfaDsc39XxqkO0tMBCwvA1FTdkTCeGSYJ22Ng1qvMenXlypXFGzZsyPX392/UglBWVqbl6OhYBwB79+5VqhoyduzYsrCwMFv5S+Aff/xh0Fo8+fn5Wn369BErerN3F+oeKPfcwqaz9QKYJGyPgVmvMutVV1dX0Ycffths2vS7776b//HHHzt6enryxWKlLqr4/PPPc8ViMeHxeHw3NzevDz/80KG1eK5cuWLatJm/u2DWq2qCywUGDgROnVJ3JIxnorYWcHYGBg+WzlF/TmDWqwxNRxOsVydNmuS6ffv2bF9fX6VjIToKs17VMCQSIDOT1dR7NEwSlsHQSNRtvVpTU0OCgoJKuyqhtwWzXlUDeXlAXR1L6j0WJgnLYGg06rRe1dfXp6tXr27WLdBdsKSuBth0th6OXBL22DEmCctgMDQK1vyuBth0th4Ok4RlMBgaCkvqaoDV1HswTBKWwWBoMCypq4H0dKBvX0Bfv+2yDA2DScJ2CxIJ8MsvMIqIQJ9ffoFRBzRiGnj48KHO+PHjXZ2dnb2dnJy8Fy9e7FRTU6O0/yQjI0MnMDCw2RSvpig6kLWXt956y37jxo22qpbvqMObIv/5z3+s5eIw9+7d05druCckJOgNHjyY19HzBwYGDkhMTNQFpNrvXC6XL9eKfxaXOVXpqc5qnQlL6mqAzVHvocglYZcvZ5KwXciJEzCzt4dvUBC4q1bB5ZVXwLW3h++JE2hVV701JBIJpk+f7hYUFFSamZkZn56eHl9ZWclZu3ZtM9UgkUgEFxcX0dWrVx+1dd6oqKg0Kyur5vJlGs4777xTJB/MderUqT5BQUFPk5KSEr28vGrv3bsnUPU8EokETdXboqOj9evr6wmfz2+YGx8VFZUiV2PTRIc5RWc1dcfSUVhSVwMZGSyp90iYJGyXc+IEzBYuxICCAuhUVYFTWQmt6mpwCgqgs3AhBjxrYr9w4YKJnp6eZO3atU8AqTb3nj17Hp84ccKqvLycEx4ebjlu3Di34cOHc1988UUPRY/18vJyzssvvzzA1dXVa+LEia6+vr48ueua3IEsOTlZd8CAAV5z5851dn
Nz8xoxYoR7RUUFAYCwsDArb29vTw8PD/7kyZNdy8vLW/2/+/jxY+2JEye6enh48D08PJrVbIVCIScgIIDL5/M9uVxug2NYWVkZZ8yYMW4eHh58d3d3r++++84cAFatWuXg6urqxeVy+cuWLXME/m4lOHHihNm+fftsDx06ZD1s2DAu0LhF4KOPPrL19vb25HK5/HXr1tkDUv95FxcX7xkzZrhwuVyvhw8fNjKLOXTokOUrr7zSyCVNGS2du3///l7BwcEuLi4u3kFBQf3PnTtnMmTIEJ6zs7P3jRs3DIHe56zWmbDR792MSAQ8fsySeo+DScJ2ORIJsGYNnGtrlVc2amvBWbsWzrNnI47TzurIgwcPDAYOHNjIvMXCwkLSt2/fusTERD0ASEhIMIyLi0uwtbWtT05ObkhU27Zts+7Tp0/9w4cPE+7cuaMfEBDgpewaWVlZ+kePHn304osvZr788ssDIiIizFetWlUSGhr6dP369cUAsGbNGvvw8HCrDz74oJmymZwVK1b0GzVqVPnGjRsfisViCIXCRs37hoaGkkuXLqVZWFhI8vLytIcNG8abP39+6dmzZ03t7OxEkZGRaQDw5MkTrfz8fK3Lly+bP3r0KJ7D4aBpV0FISIjw9u3bRcbGxvWbN29upAB39uxZ07S0NP24uLgkSikmTJjgduXKFeMBAwbUZWVl6e3fvz99/PjxGU3jv337tnFTffXRo0dzORwOdHV1JXFxcYLWzv348WP9EydOPBo6dGiGr6+v57Fjxyyjo6MFP/zwQ5/PPvus79ixYx/KndV0dHRw7tw5k3feecfx2rVrjVzm5M5qp06dyiguLtby8/PzDAoKKpM7q61cubKkpqaGyFXk1Oms1pmwpN7NPH4s/efFBsn1MJgkbJdz4waMKirQav90eTm0IiNhNG4cOr0Jd9SoUWXKnMpu3bplvHbt2kIAeOGFF2q4XK5SZzcHB4dauTPZ4MGDqzIyMvQAICYmxmDjxo0O5eXlWpWVlVqjR49uVT701q1bJqdPn04HpC0KlpaWjWKSSCTkzTffdPzzzz+NORwOCgsLdbOzs7WHDBlS/cEHHzitXLnSYdq0acLAwMAKkUgEPT09SUhIiMvUqVNLQ0JCVJYuvXr1qunNmzdN+Xw+HwCqqqo4AoFAf8CAAXV9+/atGz9+vNLfQVFRkY6dnV2jzumoqKiUvn37NmiwtnZuBweHWrk+O5fLrR43blwZh8PBkCFDqrZs2WIP9D5ntc6ENb93M2w6Ww+kthYIDwcCAwEfH3VH02vJyYEOIWhVt5oQ0OxstHvagbe3d/X9+/cbGYyUlJRw8vLydPl8fi0grQG397yK6OrqNsSupaVFxWIxAYBly5b137VrV1ZKSkriu+++m9uS45qq7N271+LJkyfaDx48SBIIBImWlpai6upqjq+vb+3du3cTfXx8qj/66COHDRs29NXR0UFsbGzSrFmznl68eLHPmDFj3FW9DqUUb775Zp68LzwrKyt+3bp1xUDrz0pPT09SXV3d6j22dm7F58jhcKCvr08BQEtLC/X19QTofc5qnQlL6t0MS+o9ECYJ2y04OEAkkaBVNR9KQRwd0e4hykFBQeU1NTUc+YhvsViMVatWOc2ePbtY0fJUGQEBARXHjx83B4CYmBj9lJQUpQ5iLVFVVcXp16+fqLa2lqjSZztixIjybdu2WcvjfPLkSaPWC6FQqGVlZSXS09OjFy5cMMnNzdUFpCP2TUxMJKtWrSp566238mNjYw2FQiFHVqsV7tmz57FAIFDZOW3KlCllR44csZL7mqenp+vk5OS02brr7u6u1CWtM84tp7c5q3UmrPm9m0lPl461cnJSdyQMlWCSsN3G2LGoNDFBfXV1y5UNExPUjxnT/qZ3DoeDc+fOpS1btsx527ZtfSUSCcaNGycMDw/PaevYt99+u2jOnDkurq6uXq6urjVubm415ubmKo94f++993L9/f09LSwsxEOGDK
moqKhotYvh22+/zVq0aJEzl8u14nA42LVrV+aECRMa7nnp0qUlU6ZMceNyuXxfX9+q/v371wDSZv5//etfjhwOB9ra2nT37t2ZpaWlWlOnTnWrra0lAPDpp58+VjXumTNnliUkJOi/8MILPEBaOz927Fi6trZ2q0lvypQppb/++qvJ9OnTldrKduTcct599938pUuX9v/iiy/sJ06cqHRQ3ueff567bNmyfjwejy+RSIiTk1PtjRs30o4ePWpx8uRJS21tbWptbS369NNP8wD1Oqt1JsylrZuZPx/43//+rrEzNJxLl4CpU6WSsPPnqzsatdPVLm3y0e/KBsvp6UFy+DAehYSgW//xisVi1NXVEUNDQ5qQkKA3adIk7sOHD+PlzcKMxlRUVJARI0Z4xMTECLS1e069saud1TqT1lzaes4T7yWw6Ww9DCYJ263IEvajtWvhXF4OLUJAKQUxMUH9V18hs7sTOiCd0jZq1CgPkUhEKKX48ssvM1lCbxljY2O6cePG3PT0dF13d/dmPu6aiLqd1ToTltS7mfR04OWX1R0FQyXkkrA7djBJ2G4kJATC2bMRFxkJo+xs6Dg6QjRmDCrbO42tszA3N5fEx8cnqefqPZPg4OAydcfQHtTtrNaZsKTejVRXA/n5bDpbjyEsjEnCqgkOB+iKaWsMRm+nS999CSGBhJBkQkgaIeS9FsrMIYQkEkISCCE/KGxfSAhJlS0LuzLO7iIjQ/rJmt97AOnpwKlTTBKWwWD0KLqspk4I0QLwDYCJALIB3CGEnKeUJiqUcQfwLwAjKKVPCSE2su0WADYB8ANAAcTIjlWb8X1nwKaz9SCYJCyDweiBdGVN3R9AGqX0EaW0DsBxANOalHkdwDfyZE0plUsnTgbwM6W0RLbvZwCBXRhrt8CSeg+BScIyGIweSlcmdQcAinMis2XbFOEC4BJC/iCE/EkICWzHsSCELCOERBNCoouKijox9K4hPR3Q0wPs7NQdCaNVmCSs+pF6rxohIqIPfvnFCJ3gvcqsV/+mu61Xhw4d6qG4n8fj8eWGOS2haKrTXl588UVuUVHRM/1eejrqVpTTBuAOYAyAeQC+I4T0UfVgSuk+SqkfpdTP2tq6i0LsPDIypIPk1DWKl5SXMIYAACAASURBVKECTBJW/Zw4YQZ7e18EBXGxapULXnmFC3t7X5w4waxXO4nutl6trKzUSktL0wGAu3fv6nfSbbTIvHnznmzfvl3zk0IX0JXpJQeAom6ao2ybItkAzlNKRZTSdAApkCZ5VY7tcTAf9R4Ak4RVLydOmGHhwgEoKNBBVRUHlZVaqK7moKBABwsXDnjWxM6sV9VrvTp9+vSSiIgICwCIiIiwCA4ObnBxS05O1h06dKgHn8/35PP5nk3vF5AKAC1fvtxRHsu2bdusACAzM1PHz8/PQ17zv3r1qjEAzJ07t/Ts2bOWrT3n3kpXJvU7ANwJIf0JIboA5gI436TMOUhr6SCEWEHaHP8IwDUAkwgh5oQQcwCTZNt6NCypazhMEla9SL1XndGS4UltLQdr1zo/S1O8qtarP/3008M7d+4kK5ZTtF7dunVrTmJiolJ7zqysLP01a9YUpqWlJZiZmdVHRESYA0BoaOjT+Pj4pOTk5EQPD4/q8PBwpVrlcuTWq8nJyYkJCQmJQ4YMqVHcL7deTUxMTIqKikp5//33HSUSCeTWq8nJyYmpqakJM2fOLJNbr6ampiakpKQkbt26NU/xXCEhIcIFCxYUrVixouD27dspivsU7VGTkpISY2NjDa9cuWIsu1e91atXF6WlpSVwudxGAjO3b982Hj58eKNnPW/evKcXLlwwB4Br1671mTlzZkPSt7e3F//2228piYmJSSdOnHi0bt26fk2fyc6dO63MzMzq4+Pjk+7fv590+PBha4FAoHvgwAGL8ePHCwUCQWJSUlLCsGHDqgDA2tq6vq6ujuTn5z93TfBdNvqdUiomhKyGNBlrAThAKU0ghG
wGEE0pPY+/k3cigHoAb1NKnwAAIeRTSF8MAGAzpbSk+VV6DkIh8PQpm6Ou0Vy5AiQlSSVhSau+Ioyu4MYNI7Shi47yci1ERhph3DhmvdqDrFdtbGzqzczMxPv27TN3c3OrNjY2bngzq6urI6+99ppzYmKiAYfDQWZmZjMzmOvXr5sKBALD8+fPmwNAeXm5VmJiov7w4cMrly9f7iISiTizZs16Kn/+AGBpaSnOysrStbOzq256vt5Ml/buUkovU0q5lFJXSulnsm0bZQkdVMpblFI+pdSHUnpc4dgDlFI32XKwK+PsDtjI9x4Ak4RVLzk5OiCkdflVQiiys5n1ag+0Xp01a9bTd955x3nevHmNKmifffaZrY2NjSgpKSnxwYMHiSKRSJmNKgkLC8uSx5KTk/Ng5syZZVOmTKm4efNmsoODQ92SJUv6ywf/AVL/9I7+TnsibMhWN8GSuoYjl4Rdt45JwqoLBwcRJJLWm0goJXB0ZNarPdB6NTQ09Okbb7yRP3PmzEYSskKhUKtv374iLS0t7N6927LpwDsAmDhxovDbb7+1lrvNxcXF6ZWVlXFSUlJ0HR0dRevXry9esGBB0d27dw0B6QC+oqIiHQ8Pjx6v5d5emExsN8GSuobDJGHVz9ixlTAxqYeSWl4DJib1GDOGWa/2QOtVc3NzyWeffZbftPybb75ZGBwc7Hr8+HHLcePGCQ0MDJq9ZK1bt644IyNDz8fHx5NSSiwsLESXL19+eO3aNZPw8HA7bW1tamhoWH/s2LF0APj9998NBw8eXKnzHL6gM+vVbmLNGuDQIWnfOuuu1TDS0wE3N+mI9y++UHc0Gk1XW682jH5X1kStpyfB4cOP0I5+4c6AWa+2D02wXl28eLHT9OnTS6dNm9aip3tPhlmvagDyke8soWsgTBJWc5Am7EdYu9YZ5eVaIISCUgITk3p89VVmdyd0gFmvthdNsF719vau7q0JvS1YUu8m5JVBhobBJGE1j5AQIWbPjkNkpBGys3Xg6CjCmDGV6lJtYtar7Ufd1qvr169vu1Wol8KSejdAqTSpT5ig7kgYzWCSsJqJ1HuVWa8yGO2EjX7vBoqKpHmDDZLTMJgkrMYjFqs7AgajZ8GSejfARr5rKEwSVqO5dw/6FhYYdP8+mk2PYjAYymFJvRtgSV0DYZKwGo1EAixeDJeKCmgtWgSXTjBpYzCeC1hS7wYyMqSfLKlrEHJJ2LffZlMSNJDDh2GekgIDSoHkZBhGREBl98aW0NLSGio3/pgyZcqAtoxVlLF582abZzmuJ9Beq9Pg4GCXgwcPmnfGtZta3b7yyiv9uVwu/5NPPrF588037c+dO2fSkfMfOXKkz4YNG/oCUjMbGxsbXx6Px+fxePxVq1Z16QhZuelPe49btmyZ4/nz59t932ygXDeQng5YWQHGxuqOhNEAk4TVWIRCcNatQ7/qammlo7oanDffhPPMmSgzNcUz19n19PQkAoEgEQCCgoL6h4WFWX/88ccF7TnH3r17bV9//fWStlToWkIikYBSCi2t585npFUUrW6zsrK079+/b5SVlRX/LOcSiURoKjqzY8cOu8uXL6fJ11esWFGwefPmdv3uu5sNGzYULl682DkoKKhdU/N65RunpsHc2TQMJgmr0WzYAPuamsb/m2pqwFm/HvaddY2RI0dWpKWl6QHAxx9/bOvu7u7l7u7utXnzZhtAuY3pli1bbAoLC3VGjx7NlduUKhIeHm45fvx4V39/fw9nZ2fv9evX9wWUW5Xu3bvXgsvl8t3d3b1WrlzZUFM8ffq0KZ/P9/Tw8OAHBARw5bHMnj3bxcfHx9PT07PBajU6Olrfx8fHk8fj8blcLv/Bgwd6Ldmv/vbbb4YvvPCCh5eXl+fIkSPdMzMzdeTb5RavO3bssGnpeX3wwQd2XC6X7+HhobRmu2HDhr7e3t6e7u7uXvPmzXOWyPpLtmzZYiO3fZ06deoAALh06ZKxvJbs6enJf/r0KU
exlWDChAncwsJCXR6Px7969aqxYotAS/fh7+/vsWTJEidvb2/PLVu22CrGFhcXp6erqyvp27dvq8MuWzv3a6+95uTt7e05YMAAr6ioKMNJkya5Ojs7e69Zs6bhOzlhwgRXLy8vTzc3N6/t27crdeLbvXu3hfx3Nn/+fGexWAyxWIzg4GAXd3d3L3nrBABwudy60tJS7aysrPZVvimlLS6QuqvdaK2MpixDhw6lmoqbG6WzZ6s7CkYDISGUmplRWlam7kh6HJA6LLb77zM2NjaDUhrd1nL3Lo3X06MS6UTQxoueHpXExtIHqpxH2WJgYFBPKY2uq6uLHjdu3NPPP/888+bNm4nu7u5VQqHwbmlp6V1XV9fq33//PeHgwYNpISEhRfJji4uL71FKo+3t7Wtzc3NjlZ3/q6++SreysqrLy8u7V15eHuPm5lYdFRWVKBAI4ggh9Pr160mU0uj09PT7dnZ2tTk5ObF1dXXRw4YNK4uIiEjLycmJtbW1rUtKSoqjlEbn5+ffo5RGv/HGG3nffPPNI0ppdFFR0T1nZ+caoVB4d8GCBQW7d+9+RCmNrq6ujikvL49RFndNTU3MoEGDKnJycmIppdH79u17OGvWrGJKabS7u3vV5cuXBZTS6GXLluW7ublVN72vEydOpAwaNKiirKzsrmJcM2fOLD5w4MBDxW2U0uhp06Y9OXbsWCqlNNra2rquqqoqRh47pTR67NixpdeuXUuilEaXlpberaurixYIBHHyayv+rHid1u7jhRdeKA8NDS1U9nvZuXNn+tKlS/Pl6+vWrcu1trau8/DwqPLw8Kg6ffp0SlvnXrFiRR6lNHrz5s1Z1tbWdRkZGferqqpibGxs6vLy8u4pPgP5716+Xf6diYmJiR87dmxpTU1NDKU0OjQ0tPDrr79Ov3nzZmJAQIBQHp/8OVFKo0NCQooOHjyY1vSeZH9PSv/WWn0DoJTWE0IkhBAzSmm3Kzn1BurrgcxMYOZMdUfCACBtNjl1Sjri3aRD3XSMTkY+OE7Ugl2LSAQsWgSXmBgkP4sOTW1tLYfH4/EBYNiwYeVr164t3rZtm/XLL79campqKgGA//f//t/TGzdumAQFBQmb2piqco2RI0eW2dnZ1cvPFRkZaRwSElKqaFX6+++/Gw0fPrzc3t5eDAAhISElUVFRxlpaWtTf37+cx+PVAYDcBjYyMtL02rVrfcLDw+1k90HS0tJ0AwICKrdv3943Oztbd+7cuU99fHxqldmv3rlzRz81NdVg3LhxXOlzlsDa2lpUXFysVV5erjVlypQKAFiyZMmTX3/91azpPf3888+m//jHPxqMb5TZ0165csVkx44ddjU1NZzS0lJtPp9fDUDo4eFRPWPGjP5BQUGloaGhpQAwfPjwig0bNjjNmTOnZN68eU9dXV1V6sqIi4vTU3Yf8v1N3d/k5OXl6VhbWzeqpTdtfm/pGcn3z5gxoxQABg4cWO3m5lbt7OwsAgAnJ6faR48e6drZ2VV/8cUXtpcuXeoDAPn5+ToJCQn6dnZ2DVoLV69eNYmPjzccOHCgJwDU1NRwbGxsxCEhIaWPHz/WW7hwodMrr7winDFjRoNwj7W1tTgnJ0dXlecjR5VqfQWAB4SQnwE0BEgpZZqaKpCbK/1nxJrfNQQmCauxJCZCLz4eRi2NdJdIQB48gHFSEvS8vNBu9y3FPvW2kNuYnjlzxuyjjz5yuH79etn27dvzFMtERET02bp1qz0A7Nu3LwMASJNBl/L1jliAUkpx+vTptIEDBza65yFDhtSMGjWq8scffzSbOnWq+9dff50ZFBRU3jTuOXPmlLq5uVXHxsYKFI8vLi7ulI79qqoqsn79eufbt28nurm5id566y37mpoaDgDcuHEj9cqVKyY//fST2fbt2/smJycnbN26NX/69OnCn376yWzUqFG8S5cuparyfCilRNl9yGlpnIOBgYFEKBS2VYFt9dxyWWAOhwM9Pb0GiWAOhwOxWE
wuXrxoEhUVZRIdHS0wMTGR+Pv7ezS1n6WUktmzZz/55ptvmpkIxcfHJ/7444+me/bssT5x4oTFqVOnMgCgpqaGKDO4aQ1V3nfPAvgIwE0AMQoLQwXYdDYNgknCajR8Pmq9vVHJ4UCprjqHA+rjgwpPz/Yn9JYYO3ZsxeXLl/uUl5dzysrKOJcvXzYfO3ZsuTIbUwAwMjKql1uRLliwoFTu7/3SSy9VAcDvv/9uWlBQoFVRUUEuX77cZ/To0c1q+KNGjaq8ffu2SV5enrZYLMapU6csxowZUzFmzJjKv/76y0QgEOgCQEFBgZYsxrKwsDBbeT/1H3/8YQAAiYmJup6enrUffvhh4eTJk0tjY2MNlMXt6+tbU1JSon39+nUjQFrTj46O1reysqo3MTGpv3btmjEAHDp0SKkt7OTJk8uOHj1qJR/1L49LTlVVFQcA7OzsxEKhkHPhwgVzAKivr8fDhw91X3nllfJvvvkmp6KiQksoFGolJCTo+fv7V3/22Wf5vr6+lfHx8fqq/K5auo+2jvPy8qp5+PBhq1oHz3puOaWlpVpmZmb1JiYmknv37unfv3/fqGmZwMDAsosXL5rL7WsLCgq0UlJSdPPy8rTr6+uxaNGi0n//+985Dx48aLDHffjwof7AgQOrVY0DUKGmTik93J4TMhrDprNpEEwSVqPhcICDB5EREAB+rZK0raMDHDqEjM6UgB85cmTV/PnznwwZMsQTAF599dWiESNGVJ85c8a0qY0pACxcuLA4MDCQa2trW3f79u2Upufz9fWtDAoKcs3Pz9edNWvWk5deeqkqOTm5UfOps7OzaNOmTTmjR4/mUkrJhAkTSv/xj3+UAkB4eHjGjBkz3CQSCSwtLUW3bt1K/fzzz3OXLVvWj8fj8SUSCXFycqq9ceNG2tGjRy1Onjxpqa2tTa2trUWffvpp3u+//27UNG59fX16/Pjxh2vWrOlXXl6uVV9fT1auXFng5+dXs3///oylS5e6EEIwZswYpXrts2bNKrt7967hoEGDPHV0dOiECROEu3btaqhtWllZ1YeGhhZ5enp6WVtbiwcOHFgJAGKxmMyfP79/eXm5FqWULF26tNDKyqp+/fr19rdu3TIlhFAPD4/qWbNmCbOystocsdrafbR23OTJkyvee+89J4lEAk4LX55nPbec4OBg4b59+6wHDBjgNWDAgBr5M1Bk6NChNR9++GHO+PHjuRKJBDo6OjQ8PDzL0NBQ8tprr7lIJBICAJs3b84GpC8WGRkZei+99FK75JLbtF4lhIwA8DEAZ0hfAggASikd0Npx3Y2mWq9+8ol0qa4G9JgulvqorQWcnYHBg6Vz1BnPRJdbrwJ4/XU4HjkC69rav1sS9fQgefVVFH33HbLbe+3uIjw83DI6OtooIiIiS92xMBqzePFip2nTppU29XjXZCIiIvrExMQYfvXVV7lN97VmvarKO+9+ADsAjATwAgA/2WebEEICCSHJhJA0Qsh7SvYvIoQUEUJiZctShX31CtvPq3I9TSQ9HbC3Zwld7TBJ2B5DWBhy9fUbz0fX14ckLAzN/rkxGKqwefPmvMrKyh41hVssFpOPPvqo3XPpVamp36aUDmv3iQnRApACYCKAbAB3AMyjlCYqlFkEwI9SulrJ8RWUUpXlWjS1pj56tHQE/O+/qzuS5xiJBPD2lr5Z3b3LFOQ6QHfU1AHg4EGYv/EGXKqrwTEwgGT3bqQvWoTS9l6XweiNdLSmfoMQso0QEkAIGSJfVDjOH0AapfQRpbQOwHEA01QPu3fAhGc0ACYJ2+NYuBBPuVxUEwJ4eKBqwQKW0BkMVVBlSpu8lq74dk4BtOWC4QDgscJ6tsK5FAkmhLwEaa1+HaVUfow+ISQagBjA55TSc00PJIQsA7AMAPr169fWfXQ7dXVAdjZL6mqHScL2OOSD5sa+JOYdOqTdqYPjGIzeTIt/KoSQtbIfP6KUjm2ydJat1QUALpRSXwA/A1Acae8sa+
abD2AnIcS16cGU0n2UUj9KqZ+1tXUnhdR5ZGVJtbBYUlcjTBK2xzIY9/CUWGAg7qs7FAajx9Da++9i2Wf4M547B4CTwrqjbFsDlNInlFL55JXvAQxV2Jcj+3wEIBLA4GeMQ22w6WwaQFgYYGYGLF3adlmG5iCVl3MhFRVaWLTIBcx7lcFQidaSehIhJBWAByEkTmF5QAiJU+HcdwC4E0L6E0J0AcwF0GgUOyGkr8JqEIAk2XZzQoie7GcrACMAqKQEpUkw4Rk1I5eEXb6cScL2NA4fNkdKigGk3quGiIhg1qtdzPNkvUoIGRofH98wJ2nz5s02hJChN2/eNGz5LFJzl7bKKGPr1q3WO3futGx/5O2nxS8npXQegFEA0gC8orBMlX22CqVUDGA1gGuQJuuTlNIEQshmQkiQrNgaQkgCIeQ+gDUAFsm2ewKIlm2/AWmfeo9M6tragKOjuiN5TmGSsD0ToZCDdev6QS6zWV3NwZtvOqOsrEPJVC4Tm5qamqCjo0PDwsLa3We3d+9e24qKimeOQyKRoL6+mXT6c48y69WUlJTETZs2Fe7cuTO3PfPLRUrMA3bs2GG3fv36Ivm6u7t7dURERIOC3rlz5yzc3NxUEpp5Fv75z38+2bt3r23bJTtOq19OSmk+pXQgpTSz6aLKySmllymlXEqpK6X0M9m2jZTS87Kf/0Up9ZJdYyylVCDbfotS6iPb7kMp3d/RG1UH6elAv37SvMLoZpgkbM9lwwZ7yLTDG6ip4WD9ema9yqxXO8V69eWXXy69fPlyHwBISEjQMzExEZubmzfsDw0N7eft7e3p5ubmtW7dOqXfu7Nnz5oOGjSIx+fzPadMmTJALh+8atUqB/k9L1u2zBGQ6tI7OjrW3rhxo921/PbSPp9WRrtITwdcXNQdxXMKk4Ttmdy7p48jR2xQW9t47mFtLQdHjthg9eoiNDE2aS8ikQjXrl0znTRpUtlvv/1m+MMPP1jGxMQkUamFs+f48ePLU1NT9ezs7ESRkZFpAPDkyRMtS0vL+m+//dY2KioqpSVv7ri4OKMHDx4kGBsbSwYPHsyfNm2a0NbWVpyVlaW3f//+9PHjx2dkZGTofPzxxw4xMTFJ1tbW4lGjRnGPHDnSZ/z48RWrV692iYyMFPB4vDq5xvr777/fd+zYsWWnTp3KKC4u1vLz8/MMCgoq+/rrr61XrVpVsHLlypKamhoiFotx+vRps6Zx19bWkjVr1vS7dOlSmr29vfi7774z37Bhg8OpU6cyXnvtNZevvvoqa8qUKRXLly9X2qZ48uRJ08uXL/eJiYkRmJiYSJpqvwPA22+/XSg3vJk+fXr/48ePm82fP18YHh5ul5mZ+cDAwIDKDWTCwsLswsPDMydNmlQpFAo5hoaGksLCwoZzXbhwIW3q1KnucvOd7777zgqQyqa2dB8AUFdXR+Lj45Oaxnbjxg1jX1/fKsVtpqam9fb29nV37tzRP336dJ9Zs2Y9PXLkSIMH+o4dO3JsbW3rxWIxXnzxRY/bt28bDBs2rEGDPS8vT3vr1q19b968mWJqair54IMP7D799FPbDRs2FF6+fNn80aNH8RwOp5FpzpAhQyojIyNNxo4d2yiWzqZX9g1pCmyOupqorQXCw4HAQMDHR93RMFRFNjgOrXuvPvOgObn1qo+PD9/R0bFu7dq1xZGRkcZy61UzMzOJ3Hp1yJAh1b/99pvpypUrHa5evWpsaWmpUpu53HrV2NiYyq1XAaAl61UdHZ0G69XIyEijlqxXv/zyy748Ho8/cuRID0Xr1bCwsL4ffPCBXWpqqq6xsTFVFreiZSmPx+Nv27atb25uro4y61Vl96Sq9aqvry+Py+Xyb926ZRIfH28AAHLr1d27d1vo6OhQ4G/r1S1bttgUFxdr6ag4K6Wl+5Dvb4/1KgDMmTOn5MiRIxaXLl0yDw0Nfaq47/DhwxZ8Pt+Tz+fzU1NT9e/fv9/I3CUyMtLo4cOH+v7+/jwej8
c/fvy4ZVZWlq6lpWW9np6eJCQkxOXw4cN9jI2NG76sNjY2YsV4uwpWU+8iKiuBwkKW1NUCk4TtmSQm6iE+3qjFpC2REDx4YIykJD14eTHrVWa92oj2Wq+GhIQIN27c6Ojj41NlYWHRcKxAINDdtWuXrawlpT44ONilpkl3EKUUI0eOLLtw4UJ60/PGxsYmnT9/3vT06dPm3377rc2ff/6ZAkj909tro/ostDZP/QIh5HxLS1cH1tPJlI06YEm9m5FIpNPYBg0CxnWWnAKjW+Dza+HtXQkOR7l2NYdD4eNTAU9PZr0KZr3aUetVExMTyccff5z90UcfNXpZe/r0qZaBgYHEwsKi/vHjx9qRkZFmTY8dM2ZMZXR0tLF8BH1ZWRknLi5OTygUckpKSrRCQkKEe/bseSwQCBr60FNSUvS8vb3bZaP6LLRWU98u+5wJwA7AUdn6PADtFpl/3mDT2dSEXBL22DEmCdvTkMrIZSAggI+WvVcz0Inycsx69fm2Xl22bNnTpuUDAgKqvb29q1xdXb379u1bN3To0GYvZvb29uK9e/dmzJ07d0BdXR0BgE2bNuWYmZlJpk6d6lYrGxPy6aefNqiq3rlzx/iLL77oclMiVQxdopsaOCjbpm40zdBl1y7gn/8E8vIAOzt1R/McMWYM8OgR8PAhU5DrArrF0OX11x1x5Ig1amv//g+spyfBq68W4bvvmPUqo92o23r1jz/+MNi2bZvduXPnmjXXPwsdNXQxIoQ0iAIQQvoDMOqMwHoz6emAvj5g2y0zExkAmCRsbyEsLBf6+o37HvX1JQgLY9arjGdC3darhYWFOl988UVO2yU7jioD5dYBiCSEPAJAADgDWN6lUfUC5NPZWAtwN8IkYXsHpqYSfPllFt54wwXV1RwYGEiwc2cmTE01Wit2zZo1TwAoHUHOUC9OTk7i0NBQobquP2PGDKVdG11Bm28ulNKrANwBrIVU9c2DUnqtqwPr6bDpbN0Mk4TVdCQSiUT1V9yFC5+Cy62G1Hu1CgsWMOtVBgOA7O+oxRfcNpM6IcQQwNsAVlNK7wPoRwiZ2nkh9k5YUu9mmCSsphNfVFRkpnJilw+aMzau7+zBcQxGT0UikZCioiIzAPEtlVGl+f0ggBgAAbL1HACnAFzscIS9lNJSQChkSb3bKCkB9u9nkrAajFgsXpqfn/99fn6+N1QVveJwgMjIbAAmuH+fNb8wGNIaerxYLG6xj1GVpO5KKQ0hhMwDAEppFWmqsMBoBJvO1s3s2SNV+2GSsBrL0KFDCyF1YmQwGF2IKm/MdYQQAwAUAAghrgA6TfyhN9KRpM4MnNoJk4RlMBiMBlRJ6h8DuArAiRByDMAvAN7pyqB6Os+a1AUCwMoKSE7u/Jh6JfX1TBKWwWAwFGiz+Z1S+l9CSAyA4ZBOaVtLKVVNROI5JT0dMDUF+vRR/RhKgSVLgLIy6efvv7PpcK0iEADDh0vfgpgkLIPBYABQbfT7LwCGUUovUUovUkqLCSH7uiG2Hot85Ht7kvLp08C9e1Lp8rt3gTNnui6+Ho/iG9DDh9JaOnsDYjAYDJWa3/sDeJcQsklhm0ZJxGoa7Z3OduYMMHcuUCNTMK6pAUJCgLNnuya+Hs+ZM0BcnDS5EwJoM7NBBoPBAFRL6qUAxgOwlTm3NXOsYfwNpUBGhupJ/fJlaQJv6jYpkQBz5kj3MxSoqABWrJCOdgekD/yNN/5eZzAYjOcYVZI6oZSKKaWrAJwB8DsAG1VOTggJJIQkE0LSCCHvKdm/iBBSRAiJlS1LFfYtJISkypaFqt6QuiksBKqrVUvqlAKLF7c84r2+XtrK3IbnzvPFpk1AVVXjbVVVwMaN6omHwWAwNAhVkvoe+Q+U0kMAFgH4b1sHEUK0AHwDYAoAPoB5hBC+kqInKKWDZMv3smMtAGwCMAyAP4BNhBBzFWJVO+0Z+f7nn0BxG0MOi4qA27c7HlevQCAAvv1W+takSHW1dDubNsBgMJ
5zWuyMJISYUkrLAJySJVk56QBUmT/kDyCNUvpIdr7jAKYBSFTh2MkAfqaUlsiO/RlAIID/a+mAJ0+efLdabAAAGX9JREFU4NChQ422eXl54YUXXoBIJMKxY8eaHTNo0CAMGjQIVVVVOHnyZLP9fn5+8Pb2hlAoxI8//thsf0BAADw8PFBcXIyLF6UCe4WFwKJFQGYm8OjRSxgwYADy8/Nx9erVZscLheMhkTjByekxxo//pdn+q1cDkZ9vh9u3H0EguNls/9SpU2FlZYXk5GT873//a7Z/xowZMDMzQ3x8PJTZ0s6ZMweGhoaIjY1FbGxss/2hoaHQ0dHBnTt3kJCQ0Gz/okWLAAC3bt1CSkpja2kdHR2EhoYCAKKiopCe3thx0NDQEHPmzAEAXL9+HdnZjR01TU1NMXPmTNlzuIr8/HzpSMKQEACA5ZMneOXCBQDAhVdewRNLS2D3bmDwYACAnZ0dAgMDAQBnz55FWVljPwVHR0dMmDABAHDy5ElUNan99+/fH6NHjwYAHDt2DCKRqNF+LpeLF198EQCafe8A9Xz3FHnppda/e+PHj4eTkxMeP36MX35p/t0LDAyEnZ0dHj16hJs3m3/3GAyGZtJaTf0H2WcMgGjZZ4zCels4AHissJ4t29aUYEJIHCHkNCHEqT3HEkKWEUKiCSHRTf/pqgv5YDd9/bbLcrloU9KawwHc3DoeV4+nuhoob8MKuby8eS2ewWAwniMI7aIOW0LILACBlNKlsvVXIZ0at1qhjCWACkppLSFkOYAQSuk4QsgGAPqU0i2ych8BqKaUbm/pen5+flRZbbS7ef114KefpDX2tqAUsLNrvaytLZCXx2ZsgVJgxAhpn4Wy7yyHI523zib4dymEkBhKKZv9wmBoKC3WEwkhQ1pbVDh3DgAnhXVH2bYGKKVPKKVyydnvAQxV9VhNpT3T2QgBDh6UmospQ0sLOHCA5SgA0oewZ0/Lowb19NjDYjAYzz2tTfANa2UfBdCWhNcdAO6EkP6QJuS5AOYrFiCE9KWU5slWgwAkyX6+BmCrwuC4SQD+1cb1NIKMDMCvHfWYl18GTp4EZs9uPK2Nw5Fuf/nlTg+x53LnjvRTVxeoq/t7u4EBsHIl4OGhnrgYDAZDQ2gxqVNKx3bkxJRSMSFkNaQJWgvAAUppAiFkM4BoSul5AGsIIUEAxABKIB1ZD0ppCSHkU0hfDABgs3zQnCZTXw9kZUkTdHuYORM4fhxYuFDaJayvDxw5It3OkCGRAGFhgK8vkJ0ttVuVY2gIbN6svtgYDAZDQ1CpT50Q4g3ptLSG4V+U0ogujKvdaEKfelYW4OwM7N0LLFvWvmPlXca3b7OuYaVcugRMnQocOyatqS9aJBWcMTICDh8GgoPVHeFzAetTZzA0G1W03zcB+Fq2jAXwHzBfZKV0xHKVEGmXsKkp6xpWyrZtgJOTtBkkOFhaY+dwgIEDWZMGg8FgyFBFfGYWpDKx+ZTSxQAGAmBSsUroSFIHAB5PKkbDuoabcOcOEBUFvPkmoKPD3oAYDAajBVRxwqimlEoIIWJCiCmAQjQemc6QkZ4uzS9OHXg6LY2Ef64JCwPMzKTzBeXI34DYA2MwGIwGVEnq0YSQPgC+g1R4pgJAc/kyBtLTAQcH6ewqRieRng6cOiW1VzUxabyPJXQGg8FoRKtJnRBCAPybUloKYA8h5CoAU0ppXLdE18NojzsbQ0V27pQm7zVr1B0Jg8FgaDyt9qlT6dD4ywrrGSyht0x7fdQZbVBSAuzfD8yfL20CYTAYDEarqDJQ7i4h5IUuj6SHU1sL5OSwpN6p7Nkjnba2fr26I2EwGIwegSp96sMAhBJCMgFUAiCQVuJ9uzSyHkZWlnSuOUvqnURtLRAeDgQGAj4+6o6GwWAwegSqJPXJXR5FL6Cj09kYTTh6FCgokA6QYzAYDIZKtNn8TinNhHQK2zjZz1WqHPe8IU/qLi5qDaN3IJeEHTQIGNeWxQCDwWAw5L
RZU5cpyvkB8ABwEIAOgKMARnRtaD2L9HSpLgobz9UJXLkCJCVJJWGZsAyDwWCojCo17hmQysJWAgClNBeASatHPIdkZAD9+rGp052CoiQsg8FgMFRGlaReJ5vaRgGAEGLUtSH1TNh0tk6iqSQsg8FgMFRGlaR+khCyF0AfQsjrAK5Dqi7HUIAl9U5CmSQsg8FgMFSizT51Sul2QshEAGWQ9qtvpJT+3OWR9SAqKoCiIpbUO0xrkrAMBoPBaBNVprRBlsRZIm+BjAzpJ0vqHYRJwjIYDEaHUMVPfSYhJJUQIiSElBFCygkhZd0RXE+BTWfrBJgkLIPBYHQYVWrq/wHwCqU0qauD6akw4ZlOgEnCMhgMRodRZaBcwbMmdEJIICEkmRCSRgh5r5VywYQQSgjxk627EEKqCSGxsmXPs1y/u8jIAAwNARsbdUfSQ2GSsAwGg9EpqOqnfgLAOQC18o2U0rOtHUQI0QLwDYCJALIB3CGEnKeUJjYpZwJgLYDbTU7xkFI6SIX41E56urTpnemkPCPHjjFJWAaDwegEVEnqppBKw05S2EYBtJrUAfgDSKOUPgIAQshxANMAJDYp9ymALwC8rUrAmgibztYBJBJg+3YmCctgMBidgCpT2hY/47kdADxWWM+G1PGtAULIEABOlNJLhJCmSb0/IeQepFPpPqSU/tb0AoSQZQCWAUC/fv2eMcyOQak0qY8apZbL93yYJCyDwWB0GqqMfucSQn4hhMTL1n0JIR929MKEEA6AHQCUjYzKA9CPUjoYwFsA/n979x9kVXnfcfz9QQWVH2UMqAgIaLAWf4Toipma2IxVQ4xipmKLxpk4SYNMoTUqNjqxmUb/sFHGTpzBKI5OYqpSgzN1UyGmP8SoFWEVgoKxAhcBoxYVQRNAgW//OOea68rde+7u3l/nfl4zO7vn3Oec+31WnO8+5z7P93lA0rDujSJiQUR0RETHyJEj+xpSr2zbBjt2eKTeay4Ja2bWb7JMlLsbuB74ECAiVgMzMlz3GsnubkVj0nNFQ4ETgaWSNgKfAzoldUTE7oh4O32/54D1wHEZ3rPuPPO9D1wS1sysX2VJ6odGxPJu5/ZkuG4FMFHSBEkDSf4Q6Cy+GBHbI2JERIyPiPHAMmBaRHRJGplOtEPSMcBEYEOG96w7r1HvA5eENTPrV1mS+luSjuUPG7pMJ3k83qOI2APMAR4DXgIeiog1km6UNK3C5WcCqyWtAhYBsyLinQyx1p2ryfVSsSTsFVe4JKyZWT/JMvt9NrAAOF7Sa0ABuCzLzSNiMbC427nvlWn7xZKfHwYezvIejVYowPDhyZdVwSVhzcz6XZbZ7xuAs9MtVwdExHu1D6t1eDlbLxRLwp5zDixfDqNGwemne/a7mVkfVUzqkq7udgywHXguIlbVKK6WUSjApEmNjqLFXHVVUhL28cfhqaeSterDh8Ndd8F55zU6OjOzlpXlM/UOYBbJuvPRwBXAVOBuSX9fw9iaXkTymbpH6lV45BG4777k5507k/WA778PW7bA9OmweHHP15uZWVlZkvoY4JSIuCYirgFOBQ4nmcx2eQ1ja3pvvAG7djmpZxYBl19e/vWdO5OJcxF1C8nMLE+yJPXDKan5TrJe/YiI2NntfNvxcrYqPfMMbN/ec5t3300+Zzczs6plmf1+P/CspEfS4wtIKrwN5pN13NuKl7NV6dFHK4/CBwyA3/62PvGYmeVMltnvN0laApyRnpoVEV3pz1+rWWQtwCP1Ki1Zksxw7ymx79sHRx1Vv5jMzHIky0idNIl3VWzYZgoFOOKIZC91q2DFCli5EoYNSybHlTN8OEyZUr+4zMxyJMtn6laG16hXoVgS9p574JBD9t/mkEOSZW1er25m1itO6n3gpJ5RaUnY6dNh0SIYMwaGDElG7kOGJMeLFnmduplZH2R6/G6ftGcPbNoEl1zS6EhaQPeSsOedl/zyli9PJsUddVTyyN0jdDOzPnFS76UtW2DvXk
+Sq6hYEvbSS2H06D+cl5LSsGZm1m/8+L2XvJwtozvvTErCXnNNoyMxM8s9J/VeKi5nc1Lvwe7dcPvtMHUqnHRSo6MxM8s9J/VeKhSSOilHH93oSJrY/ffDm2/C3LmNjsTMrC04qfdSoZBM2D7ooEZH0qT27YN582DyZDjrrEZHY2bWFjxRrpe8nK2CJUvgpZeS0bpntZuZ1YVH6r3kpF7BrbfC2LFw8cWNjsTMrG3UNKlLmirpZUnrJF3XQ7uLJIWkjpJz16fXvSzpS7WMs1q7diXLq72crYwVK+CJJ+Db3/bnE2ZmdVSzx++SDgDmA+cAW4AVkjojYm23dkOBK4FnS85NAmYAJwBHAf8p6biI2FureKuxaVPy3SP1MoolYb/1rUZHYmbWVmo5Up8CrIuIDRHxAbAQuHA/7W4CfgDsKjl3IbAwInZHRAFYl96vKXg5Ww9KS8IOHdroaMzM2kotk/poYHPJ8Zb03EcknQKMjYhHq702vX6mpC5JXVu3bu2fqDNwUu9B95KwZmZWNw2bKCdpAHAb0OtSYxGxICI6IqJj5MiR/RdcBYUCDBzobb8/oVxJWDMzq4taLml7DRhbcjwmPVc0FDgRWKpkydORQKekaRmubahCAcaNS4rPWAmXhDUza6hapqUVwERJEyQNJJn41ll8MSK2R8SIiBgfEeOBZcC0iOhK282QNEjSBGAisLyGsVbFy9n2wyVhzcwarmZJPSL2AHOAx4CXgIciYo2kG9PReE/XrgEeAtYCvwBmN8vMd3BS3y+XhDUzazhFRKNj6BcdHR3R1dVV8/d57z0YNgxuvhmuK7vyvs3s2wcnngiDBsHzz7uCXI5Jei4iOiq3NLNGcJnYKnnL1f1wSVgzs6bgqV5V8nK2/XBJWDOzpuCkXiUn9W5cEtbMrGk4qVepUIDBg2HEiEZH0iRcEtbMrGk4qVepOPPdHx3jkrBmZk3GSb1KXs5WwiVhzcyaipN6FSKS2e/echWXhDUza0JO6lV4551knbpH6rgkrJlZE3JSr4JnvqdcEtbMrCk5qVfBST3lkrBmZk3JSb0KTuokJWHnzYPJk+GssxodjZmZlXCZ2CoUCnDYYUnt97blkrBmZk3LI/UqeDkbySjdJWHNzJqSk3oV2n45W1cXLF3qkrBmZk3KST2jffuSpN7WI/V581wS1sysiTmpZ/TGG8lKrrZN6i4Ja2bW9JzUM2r7me8uCWtm1vSc1DNq66TukrBmZi2hpkld0lRJL0taJ+m6/bw+S9ILklZJekrSpPT8eEk70/OrJN1ZyzizKCb1tpwo55KwZmYtoWbr1CUdAMwHzgG2ACskdUbE2pJmD0TEnWn7acBtwNT0tfURMblW8VWrUIBRo+DggxsdSZ25JKyZWcuo5Uh9CrAuIjZExAfAQuDC0gYRsaPkcDAQNYynT9p2OZtLwpqZtYxaJvXRwOaS4y3puY+RNFvSeuAWoHQW1gRJKyU9IekLNYwzk7YsPOOSsGZmLaXhE+UiYn5EHAt8B7ghPf06cHREfBa4GnhA0ieKs0qaKalLUtfWrVtrFuOePbB5cxsm9WJJ2GuvdUlYM7MWUMuk/howtuR4THqunIXAVwEiYndEvJ3+/BywHjiu+wURsSAiOiKiY+TIkf0WeHebN8PevW2Y1F0S1syspdQyqa8AJkqaIGkgMAPoLG0gaWLJ4VeAV9LzI9OJdkg6BpgIbKhhrD1qy+VsLglrZtZyajb7PSL2SJoDPAYcANwbEWsk3Qh0RUQnMEfS2cCHwDbg6+nlZwI3SvoQ2AfMioh3ahVrJW2Z1F0S1sys5dR069WIWAws7nbueyU/X1nmuoeBh2sZWzUKhaSY2tixldvmQrEk7Ny5LglrZtZCGj5RrhVs3Jgk9APbZfd5l4Q1M2tJTuoZFApttEbdJWHNzFqWk3oGbbVG3SVhzcxalpN6BTt3wuuvt0lSd0lYM7OW5qRewauvJt/bIqm7JKyZWUtzUq+gbZazuS
SsmVnLa5f53L3WNkm9WBL2/vtdEtbMrEV5pF7Bxo0waBAceWSjI6kxl4Q1M2t5TuoVFAowbhwMyPNvyiVhzcxyIc+pql+0xXI2l4Q1M8sFJ/UKcp/UiyVhr7jCJWHNzFqck3oPduxICqzlOqm7JKyZWW44qfcg9zPfXRLWzCxXnNR7kPuk7pKwZma54qTeg40bk++5TOouCWtmljtO6j0oFGDIEDjssEZHUgMuCWtmljtO6j0oznzPXYE1l4Q1M8sll4ntQaEAxxzT6ChqwCVhzcxyySP1MiJyvEbdJWHNzHKppkld0lRJL0taJ+m6/bw+S9ILklZJekrSpJLXrk+ve1nSl2oZ5/689VYyMTx3Sd0lYc3McqtmSV3SAcB84MvAJOCS0qSdeiAiToqIycAtwG3ptZOAGcAJwFTgjvR+dZPb5WwuCWtmllu1HKlPAdZFxIaI+ABYCFxY2iAidpQcDgYi/flCYGFE7I6IArAuvV/d5HI528aNLglrZpZjtZwoNxrYXHK8BTi9eyNJs4GrgYFAcSr2aGBZt2s/UfJM0kxgZnq4W9KLfQ/7404+ub/v2CsjgLf67W633JJ8NV7/9qu55LVvf9zoAMysvIbPfo+I+cB8SZcCNwBfr+LaBcACAEldEdFRmygbK699y2u/IL99k9TV6BjMrLxaPn5/DRhbcjwmPVfOQuCrvbzWzMys7dUyqa8AJkqaIGkgycS3ztIGkiaWHH4FeCX9uROYIWmQpAnARGB5DWM1MzNreTV7/B4ReyTNAR4DDgDujYg1km4EuiKiE5gj6WzgQ2Ab6aP3tN1DwFpgDzA7IvZWeMsFtepLE8hr3/LaL8hv3/LaL7NcUERUbmVmZmZNzxXlzMzMcsJJ3czMLCdaLqlnKD07SNK/pq8/K2l8/aOsXoZ+nSnpeUl7JE1vRIy9laFvV0taK2m1pP+SNK4RcVarL2WQm12lvpW0u0hSSMrd8j2zVtRSST1j6dlvAtsi4tPAPwM/qG+U1cvYr03A5cAD9Y2ubzL2bSXQEREnA4tISgY3tb6UQW52GfuGpKHAlcCz9Y3QzMppqaROhtKz6fFP0p8XAX8uNf3+ollK6m6MiNXAvkYE2AdZ+vZ4RPw+PVxGUpeg2fWlDHKzy/L/GcBNJH8076pncGZWXqsl9f2Vnu1ePvajNhGxB9gOfKou0fVeln61qmr79k1gSU0j6h+Z+iVptqT1JCP1v6tTbH1VsW+STgHGRsSj9QzMzHrWakndckzSZUAHcGujY+kvETE/Io4FvkNSBrnlSRpA8lHCNY2Oxcw+rtWSepbysR+1kXQg8EfA23WJrvfyXBY3U9/SIkTfBaZFxO46xdYXfSmD3Owq9W0ocCKwVNJG4HNApyfLmTVeqyX1iqVn0+PipjDTgf+O5q+wk6VfrSpLueDPAneRJPT/a0CMvdGXMsjNrse+RcT2iBgREeMjYjzJPIhpEeHNXswarKWSevoZebH07EvAQ8XSs5Kmpc3uAT4laR3Jlq5ll+M0iyz9knSapC3AxcBdktY0LuLsMv43uxUYAvwsXf7V9H/QZOzXHElrJK0i+beYeQfCRsrYNzNrQi4Ta2ZmlhMtNVI3MzOz8pzUzczMcsJJ3czMLCec1M3MzHLCSd3MzCwnnNSt7iQtLRYqkbRY0vA+3u+Lkv69zGsPpru/XdWX9zAzawUHNjoAy590Ax1FRMXNZyLivBrGcSRwWrpjX9ZrDkzXaZuZtRyP1NuEpH9I98d+Kh29zk3PHyvpF5Kek/SkpOPT8z+WdLuk/5G0oXQPd0nXSlqRjoC/n54bn97/PuBFYKykH0nqSguwfL9MXBsljUj3Hl+VfhUkPZ6+fq6kZ5TsJf8zSUPS81Ml/UbS88BflOn2L4HR6T2/kD4h+GF6/KKkKem9/lHSTyU9Dfy0P37fZmaN4KTeBiSdBlwEfIZkj+zSGt0LgL+NiFOBucAdJa+NAj4PnA/8U3qvc4GJJNtzTgZOlX
Rm2n4icEdEnBARrwLfjYgO4GTgzySdXC7GiLgz3Xf8NJJdwW6TNIJkE5SzI+IUoAu4WtLBwN3ABcCpwJFlbjsNWB8RkyPiyfTcoen7/A1wb0nbSen7XFIuRjOzZufH7+3hDOCRiNgF7JL0c4B01PunJOVZi20HlVz3b+kj9LWSjkjPnZt+rUyPh5Ak803AqxGxrOT6v5Q0k+Tf2SiSxLm6Qqw/JKnX/3NJ56fXPJ3GNxB4BjgeKETEK2k//gWYmfF38SBARPxK0rCSz/M7I2JnxnuYmTUlJ/X2NgB4Nx257k/pbmkq+X5zRNxV2lDSeOB3JccTSEb+p0XENkk/Bg7uKRhJlwPjSOqOF9/rP7qPniWVizeL7nWRi8e/697QzKzV+PF7e3gauEDSweno/HyAiNgBFCRdDMkEN0mfqXCvx4BvlHy2PVrS4ftpN4wkUW5PR/lf7ummkoqP/y8rmWC3DDhD0qfTNoMlHQf8Bhgv6di0XTWPzP8qvdfnge0Rsb2Ka83MmppH6m0gIlakO5+tBt4EXgCKyexrwI8k3QAcRLLv9697uNcvJf0J8Ez6SPx94DJgb7d2v5a0kiQBbyb5w6Inc4DDgMfT+3ZFxF+no/cHJRU/FrghIv43faz/qKTfA0+S7PGdxa40roOAb2S8xsysJXiXtjYhaUhEvC/pUOBXwMyIeL7RcdWTpKXAXO/7bWZ55ZF6+1ggaRLJ59o/abeEbmbWDjxSNzMzywlPlDMzM8sJJ3UzM7OccFI3MzPLCSd1MzOznHBSNzMzy4n/ByovRNlIJOlHAAAAAElFTkSuQmCC\n", + "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", + "text/plain": "
" }, "metadata": { "needs_background": "light" @@ -714,7 +460,7 @@ "y_pred = postproc.predict_proba(X_test)[:, 1]\n", "y_lr = postproc.estimator_.predict_proba(X_test)[:, 1]\n", "br = postproc.postprocessor_.base_rates_\n", - "i = X_test.sex == 1\n", + "i = X_test.index.get_level_values('sex') == 1\n", "\n", "plt.plot([0, br[0]], [0, 1-br[0]], '-b', label='All calibrated classifiers (Females)')\n", "plt.plot([0, br[1]], [0, 1-br[1]], '-r', label='All calibrated classifiers (Males)')\n", @@ -736,8 +482,8 @@ "plt.plot([0, 1], [generalized_fnr(y_test, y_pred)]*2, '--', c='0.5')\n", "\n", "plt.axis('square')\n", - "plt.xlim([0, 0.4])\n", - "plt.ylim([0.4, 0.8])\n", + "plt.xlim([0.0, 0.4])\n", + "plt.ylim([0.3, 0.7])\n", "plt.xlabel('generalized fpr');\n", "plt.ylabel('generalized fnr');\n", "plt.legend(bbox_to_anchor=(1.04,1), loc='upper left');" @@ -747,15 +493,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see the generalized false negative rate is approximately equalized and the classifiers remain close to the calibration lines." 
+ "We can see the generalized false negative rate is approximately equalized and the classifiers remain close to the calibration lines.\n", + "\n", + "We can quantify the discrepancy between protected groups using the `difference` operator:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": "0.0027891187222710556" + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "difference(generalized_fnr, y_test, y_pred, prot_attr='sex')" + ] } ], "metadata": { @@ -774,9 +533,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.6" + "version": "3.6.9-final" } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 789e96b099b28e98109dd7a436dce8610e6a1229 Mon Sep 17 00:00:00 2001 From: Samuel Hoffman Date: Wed, 19 Feb 2020 17:10:23 -0500 Subject: [PATCH 61/61] added comments to tests --- tests/sklearn/test_adversarial_debiasing.py | 8 ++++++++ tests/sklearn/test_calibrated_equalized_odds.py | 12 +++++++++++- tests/sklearn/test_datasets.py | 10 ++++++++++ tests/sklearn/test_metrics.py | 14 ++++++++++++++ tests/sklearn/test_reweighing.py | 17 ++++++----------- 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/tests/sklearn/test_adversarial_debiasing.py b/tests/sklearn/test_adversarial_debiasing.py index c28fb17c..f7dd19d4 100644 --- a/tests/sklearn/test_adversarial_debiasing.py +++ b/tests/sklearn/test_adversarial_debiasing.py @@ -15,6 +15,7 @@ 'hours-per-week'], features_to_drop=[]) def test_adv_debias_old_reproduce(): + """Test that the old AdversarialDebiasing is reproducible.""" sess = tf.Session() old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}], @@ -34,6 +35,8 @@ def test_adv_debias_old_reproduce(): assert np.allclose(old_preds.labels, old_preds2.labels) def 
test_adv_debias_old(): + """Test that the predictions of the old and new AdversarialDebiasing match. + """ tf.reset_default_graph() sess = tf.Session() old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}], @@ -48,6 +51,7 @@ def test_adv_debias_old(): assert np.allclose(old_preds.labels.flatten(), new_preds) def test_adv_debias_reproduce(): + """Test that the new AdversarialDebiasing is reproducible.""" adv_deb = AdversarialDebiasing('sex', num_epochs=5, random_state=123) new_preds = adv_deb.fit(X, y).predict(X) adv_deb.sess_.close() @@ -60,12 +64,16 @@ def test_adv_debias_reproduce(): assert new_acc == accuracy_score(y, new_preds) def test_adv_debias_intersection(): + """Test that the new AdversarialDebiasing runs with >2 protected groups.""" adv_deb = AdversarialDebiasing(scope_name='intersect', num_epochs=5) adv_deb.fit(X, y) adv_deb.sess_.close() assert adv_deb.adversary_logits_.shape[1] == 4 def test_adv_debias_grid(): + """Test that the new AdversarialDebiasing works in a grid search (and that + debiasing results in reduced accuracy). + """ adv_deb = AdversarialDebiasing('sex', num_epochs=10, random_state=123) params = {'debias': [True, False]} diff --git a/tests/sklearn/test_calibrated_equalized_odds.py b/tests/sklearn/test_calibrated_equalized_odds.py index 3352b548..3bfffaf5 100644 --- a/tests/sklearn/test_calibrated_equalized_odds.py +++ b/tests/sklearn/test_calibrated_equalized_odds.py @@ -14,6 +14,9 @@ 'hours-per-week'], features_to_drop=[]) def test_calib_eq_odds_sex_weighted(): + """Test that the old and new CalibratedEqualizedOdds produce the same mix + rates. 
+ """ logreg = LogisticRegression(solver='lbfgs', max_iter=500) y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X) adult_pred = adult.copy() @@ -28,6 +31,12 @@ def test_calib_eq_odds_sex_weighted(): assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0]) def test_postprocessingmeta_fnr(): + """Test that the old and new CalibratedEqualizedOdds produce the same + probability predictions. + + This tests the whole "pipeline": splitting the data the same way, training a + LogisticRegression classifier, and training the post-processor the same way. + """ adult_train, adult_test = adult.split([0.9], shuffle=False) X_tr, X_te, y_tr, _, sw_tr, _ = train_test_split(X, y, sample_weight, train_size=0.9, shuffle=False) @@ -52,7 +61,8 @@ def test_postprocessingmeta_fnr(): orig_cal_eq_odds.fit(adult_post, adult_pred) cal_eq_odds = PostProcessingMeta(estimator=logreg, - postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=0), + postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', + random_state=0), shuffle=False) cal_eq_odds.fit(X_tr, y_tr, sample_weight=sw_tr) diff --git a/tests/sklearn/test_datasets.py b/tests/sklearn/test_datasets.py index 1d2ec6a0..2b0fb2c5 100644 --- a/tests/sklearn/test_datasets.py +++ b/tests/sklearn/test_datasets.py @@ -15,6 +15,7 @@ dropna=False) def test_standardize_dataset_basic(): + """Tests standardize_dataset on a toy example.""" dataset = basic() X, y = dataset X, y = dataset.X, dataset.y @@ -28,11 +29,13 @@ def test_standardize_dataset_basic(): assert X.shape == (3, 3) def test_sample_weight_basic(): + """Tests returning sample_weight on a toy example.""" with_weights = basic(sample_weight='X2') assert len(with_weights) == 3 assert with_weights.X.shape == (3, 2) def test_usecols_dropcols_basic(): + """Tests various combinations of usecols and dropcols on a toy example.""" assert basic(usecols='X1').X.columns.tolist() == ['X1'] assert basic(usecols=['X1', 
'Z']).X.columns.tolist() == ['X1', 'Z'] @@ -44,17 +47,20 @@ def test_usecols_dropcols_basic(): pd.DataFrame) def test_dropna_basic(): + """Tests dropna on a toy example.""" basic_dropna = partial(standardize_dataset, df=df, prot_attr='Z', target='y', dropna=True) assert basic_dropna().X.shape == (2, 3) assert basic(dropcols='X1').X.shape == (3, 2) def test_numeric_only_basic(): + """Tests numeric_only on a toy example.""" assert basic(prot_attr='X2', numeric_only=True).X.shape == (3, 2) assert (basic(prot_attr='X2', dropcols='Z', numeric_only=True).X.shape == (3, 2)) def test_fetch_adult(): + """Tests Adult Income dataset shapes with various options.""" adult = fetch_adult() assert len(adult) == 3 assert adult.X.shape == (45222, 13) @@ -62,12 +68,14 @@ def test_fetch_adult(): assert fetch_adult(numeric_only=True).X.shape == (48842, 7) def test_fetch_german(): + """Tests German Credit dataset shapes with various options.""" german = fetch_german() assert len(german) == 2 assert german.X.shape == (1000, 21) assert fetch_german(numeric_only=True).X.shape == (1000, 9) def test_fetch_bank(): + """Tests Bank Marketing dataset shapes with various options.""" bank = fetch_bank() assert len(bank) == 2 assert bank.X.shape == (45211, 15) @@ -76,6 +84,7 @@ def test_fetch_bank(): @pytest.mark.filterwarnings('error', category=ColumnAlreadyDroppedWarning) def test_fetch_compas(): + """Tests COMPAS Recidivism dataset shapes with various options.""" compas = fetch_compas() assert len(compas) == 2 assert compas.X.shape == (6167, 10) @@ -84,5 +93,6 @@ def test_fetch_compas(): assert fetch_compas(numeric_only=True).X.shape == (6172, 6) def test_onehot_transformer(): + """Tests that categorical features can be correctly one-hot encoded.""" X, y = fetch_german() assert len(pd.get_dummies(X).columns) == 63 diff --git a/tests/sklearn/test_metrics.py b/tests/sklearn/test_metrics.py index 326c7c8b..916d2ce5 100644 --- a/tests/sklearn/test_metrics.py +++ b/tests/sklearn/test_metrics.py @@ 
-29,61 +29,75 @@ privileged_groups=[{'sex': 1}]) def test_dataset_equality(): + """Tests that the old and new datasets match exactly.""" assert (adult.features == X.values).all() assert (adult.labels.ravel() == y).all() def test_consistency(): + """Tests that the old and new consistency_score matches exactly.""" assert np.isclose(consistency_score(X, y), cm.consistency()) def test_specificity(): + """Tests that the old and new specificity_score matches exactly.""" spec = specificity_score(y, y_pred, sample_weight=sample_weight) assert spec == cm.specificity() def test_base_rate(): + """Tests that the old and new base_rate matches exactly.""" base = base_rate(y, y_pred, sample_weight=sample_weight) assert base == cm.base_rate() def test_selection_rate(): + """Tests that the old and new selection_rate matches exactly.""" select = selection_rate(y, y_pred, sample_weight=sample_weight) assert select == cm.selection_rate() def test_generalized_fpr(): + """Tests that the old and new generalized_fpr matches exactly.""" gfpr = generalized_fpr(y, y_proba, sample_weight=sample_weight) assert np.isclose(gfpr, cm.generalized_false_positive_rate()) def test_generalized_fnr(): + """Tests that the old and new generalized_fnr matches exactly.""" gfnr = generalized_fnr(y, y_proba, sample_weight=sample_weight) assert np.isclose(gfnr, cm.generalized_false_negative_rate()) def test_disparate_impact(): + """Tests that the old and new disparate_impact matches exactly.""" di = disparate_impact_ratio(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert di == cm.disparate_impact() def test_statistical_parity(): + """Tests that the old and new statistical_parity matches exactly.""" stat = statistical_parity_difference(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert stat == cm.statistical_parity_difference() def test_equal_opportunity(): + """Tests that the old and new equal_opportunity matches exactly.""" eopp = equal_opportunity_difference(y, y_pred, 
prot_attr='sex', sample_weight=sample_weight) assert eopp == cm.equal_opportunity_difference() def test_average_odds_difference(): + """Tests that the old and new average_odds_difference matches exactly.""" aod = average_odds_difference(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert np.isclose(aod, cm.average_odds_difference()) def test_average_odds_error(): + """Tests that the old and new average_odds_error matches exactly.""" aoe = average_odds_error(y, y_pred, prot_attr='sex', sample_weight=sample_weight) assert np.isclose(aoe, cm.average_abs_odds_difference()) def test_generalized_entropy_index(): + """Tests that the old and new generalized_entropy_index matches exactly.""" gei = generalized_entropy_error(y, y_pred) assert np.isclose(gei, cm.generalized_entropy_index()) def test_between_group_generalized_entropy_index(): + """Tests that the old and new between_group_GEI matches exactly.""" bggei = between_group_generalized_entropy_error(y, y_pred, prot_attr='sex') assert bggei == cm.between_group_generalized_entropy_index() diff --git a/tests/sklearn/test_reweighing.py b/tests/sklearn/test_reweighing.py index 97631043..f8046fe9 100644 --- a/tests/sklearn/test_reweighing.py +++ b/tests/sklearn/test_reweighing.py @@ -9,36 +9,32 @@ from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta -# X, y = fetch_german(numeric_only=True, dropcols='duration') -# X.age = (X.age >= 25).astype('int') -# german = GermanDataset(categorical_features=[], features_to_keep=[ -# 'credit_amount', 'investment_as_income_percentage', 'residence_since', -# 'age', 'number_of_credits', 'people_liable_for', 'sex']) X, y, sample_weight = fetch_adult(numeric_only=True) adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[], features_to_keep=['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], features_to_drop=[]) def test_reweighing_sex(): + """Test that the old and new Reweighing produce the same sample_weights.""" 
orig_rew = OrigReweighing(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}]) adult_fair = orig_rew.fit_transform(adult) rew = Reweighing('sex') _, new_sample_weight = rew.fit_transform(X, y, sample_weight=sample_weight) - # assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav], - # [orig_rew.w_p_unfav, orig_rew.w_p_fav]], - # rew.reweigh_factors_) + assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav], + [orig_rew.w_p_unfav, orig_rew.w_p_fav]], + rew.reweigh_factors_) assert np.allclose(adult_fair.instance_weights, new_sample_weight) def test_reweighing_intersection(): + """Test that the new Reweighing runs with >2 protected groups.""" rew = Reweighing() rew.fit_transform(X, y) assert rew.reweigh_factors_.shape == (4, 2) def test_gridsearch(): - # logreg = LogisticRegression(solver='lbfgs', max_iter=500) - # rew = ReweighingMeta(estimator=logreg, reweigher=Reweighing('sex')) + """Test that ReweighingMeta works in a grid search.""" rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear')) # UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597 @@ -51,4 +47,3 @@ def score_func(y_true, y_pred, sample_weight): clf = GridSearchCV(rew, params, scoring=scoring, cv=5, iid=False) clf.fit(X, y, **{'sample_weight': sample_weight}) - # print(clf.best_score_)