Commit
various fixes to address PR comments
* added one-hot encoding example and random_states to demo notebook
* added 'prefit' option to PostProcessingMeta
* multiple fixes to docstring wordings
* added additional links/disclaimers in docstrings
* renamed CalibratedEqualizedOdds args to X and y
hoffmansc committed Feb 19, 2020
1 parent c34d39f commit 1b829d7
Showing 8 changed files with 324 additions and 487 deletions.
37 changes: 29 additions & 8 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -41,6 +41,10 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
unprivileged). The outcome variable is 'annual-income': '>50K' (favorable)
or '<=50K' (unfavorable).
Note:
By default, the data is downloaded from OpenML. See the `adult
<https://www.openml.org/d/1590>`_ page for details.
Args:
subset ({'train', 'test', or 'all'}, optional): Select the dataset to
load: 'train' for the training set, 'test' for the test set, 'all'
@@ -60,6 +64,9 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
namedtuple: Tuple containing X, y, and sample_weights for the Adult
dataset accessible by index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
Examples:
>>> adult = fetch_adult()
>>> adult.X.shape
@@ -103,11 +110,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
unprivileged; see the binary_age flag to keep this continuous). The outcome
variable is 'credit-risk': 'good' (favorable) or 'bad' (unfavorable).
References:
.. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without
discriminating," 2nd International Conference on Computer,
Control and Communication, 2009.
<https://ieeexplore.ieee.org/abstract/document/4909197>`_
Note:
By default, the data is downloaded from OpenML. See the `credit-g
<https://www.openml.org/d/31>`_ page for details.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -126,6 +131,15 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
namedtuple: Tuple containing X and y for the German dataset accessible
by index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
References:
.. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without
discriminating," 2nd International Conference on Computer,
Control and Communication, 2009.
<https://ieeexplore.ieee.org/abstract/document/4909197>`_
Examples:
>>> german = fetch_german()
>>> german.X.shape
@@ -142,7 +156,6 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
>>> disparate_impact_ratio(y, y_pred, prot_attr='age', priv_group=True,
... pos_label='good')
0.9483094846144106
"""
df = to_dataframe(fetch_openml(data_id=31, target_column=None,
data_home=data_home or DATA_HOME_DEFAULT))
@@ -175,7 +188,11 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
"""Load the Bank Marketing Dataset.
The protected attribute is 'age' (left as continuous). The outcome variable
is 'deposit': ``True`` or ``False``.
is 'deposit': 'yes' or 'no'.
Note:
By default, the data is downloaded from OpenML. See the `bank-marketing
<https://www.openml.org/d/1461>`_ page for details.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -193,6 +210,9 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
namedtuple: Tuple containing X and y for the Bank dataset accessible by
index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
Examples:
>>> bank = fetch_bank()
>>> bank.X.shape
@@ -214,7 +234,8 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
'housing', 'loan', 'contact', 'day', 'month', 'duration',
'campaign', 'pdays', 'previous', 'poutcome', 'deposit']
# remap target
df.deposit = df.deposit.map({'1': False, '2': True}).astype('bool')
df.deposit = df.deposit.map({'1': 'no', '2': 'yes'}).astype('category')
df.deposit = df.deposit.cat.as_ordered() # 'no' < 'yes'
# replace 'unknown' marker with NaN
df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True)
if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s)
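For readers skimming the diff, a tiny standalone sketch (toy data, not the OpenML download) of what the new target remapping does: the former boolean column becomes an ordered 'no' < 'yes' categorical.

    import pandas as pd

    deposit = pd.Series(['1', '2', '2', '1'])                # raw OpenML coding
    deposit = deposit.map({'1': 'no', '2': 'yes'}).astype('category')
    deposit = deposit.cat.as_ordered()                       # 'no' < 'yes'
    print(deposit.cat.categories)   # Index(['no', 'yes'], dtype='object')
    print(deposit.cat.ordered)      # True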
2 changes: 1 addition & 1 deletion aif360/sklearn/inprocessing/adversarial_debiasing.py
@@ -67,7 +67,7 @@ def __init__(self, prot_attr=None, scope_name='classifier',
adversary.
verbose (bool, optional): If ``True``, print losses every 200 steps.
random_state (int or numpy.RandomState, optional): Seed of pseudo-
random number generator for shuffling data.
random number generator for shuffling data and seeding weights.
"""

self.prot_attr = prot_attr
35 changes: 29 additions & 6 deletions aif360/sklearn/metrics/metrics.py
@@ -210,8 +210,8 @@ def generalized_fpr(y_true, probas_pred, pos_label=1, sample_weight=None):
r"""Return the ratio of generalized false positives to negative examples in
the dataset, :math:`GFPR = \tfrac{GFP}{N}`.
The generalized confusion matrix is calculated by summing the probabilities
of the positive class instead of the hard predictions.
Generalized confusion matrix measures such as this are calculated by summing
the probabilities of the positive class instead of the hard predictions.
Args:
y_true (array-like): Ground-truth (correct) target values.
@@ -237,8 +237,8 @@ def generalized_fnr(y_true, probas_pred, pos_label=1, sample_weight=None):
r"""Return the ratio of generalized false negatives to positive examples in
the dataset, :math:`GFNR = \tfrac{GFN}{P}`.
The generalized confusion matrix is calculated by summing the probabilities
of the positive class instead of the hard predictions.
Generalized confusion matrix measures such as this are calculated by summing
the probabilities of the positive class instead of the hard predictions.
Args:
y_true (array-like): Ground-truth (correct) target values.
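Since both docstrings now describe generalized confusion-matrix measures, a short NumPy sketch of the underlying formulas, GFPR = GFP/N and GFNR = GFN/P, on toy scores (an illustration, not the library implementation):

    import numpy as np

    y_true = np.array([0, 0, 1, 1, 1])                   # ground truth
    probas_pred = np.array([0.2, 0.6, 0.9, 0.4, 0.7])    # P(y = 1) per sample

    neg, pos = y_true == 0, y_true == 1
    gfpr = probas_pred[neg].sum() / neg.sum()             # (0.2 + 0.6) / 2 = 0.4
    gfnr = (1 - probas_pred[pos]).sum() / pos.sum()       # (0.1 + 0.6 + 0.3) / 3 = 0.333...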
@@ -272,7 +272,8 @@ def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1,
Note:
If only y_true is provided, this will return the difference in base
rates (statistical parity difference of the original dataset).
rates (statistical parity difference of the original dataset). If both
y_true and y_pred are provided, only y_pred is used.
Args:
y_true (pandas.Series): Ground truth (correct) target values. If y_pred
@@ -287,6 +288,9 @@
Returns:
float: Statistical parity difference.
See also:
:func:`selection_rate`, :func:`base_rate`
"""
rate = base_rate if len(y) == 1 or y[1] is None else selection_rate
return difference(rate, *y, prot_attr=prot_attr, priv_group=priv_group,
@@ -302,7 +306,8 @@ def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1,
Note:
If only y_true is provided, this will return the ratio of base rates
(disparate impact of the original dataset).
(disparate impact of the original dataset). If both y_true and y_pred
are provided, only y_pred is used.
Args:
y_true (pandas.Series): Ground truth (correct) target values. If y_pred
@@ -317,6 +322,9 @@
Returns:
float: Disparate impact.
See also:
:func:`selection_rate`, :func:`base_rate`
"""
rate = base_rate if len(y) == 1 or y[1] is None else selection_rate
return ratio(rate, *y, prot_attr=prot_attr, priv_group=priv_group,
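A toy illustration of the single-argument (y_true only) form of these two metrics, assuming the functions are importable from aif360.sklearn.metrics and that, as elsewhere in this module, the protected attribute lives in the Series index:

    import pandas as pd
    from aif360.sklearn.metrics import (statistical_parity_difference,
                                        disparate_impact_ratio)

    # 4 unprivileged samples (base rate 0.25) and 2 privileged samples (base rate 0.5)
    y = pd.Series([1, 0, 0, 0, 1, 0],
                  index=pd.Index([0, 0, 0, 0, 1, 1], name='prot'))

    disparate_impact_ratio(y, prot_attr='prot', priv_group=1, pos_label=1)         # 0.25 / 0.5 = 0.5
    statistical_parity_difference(y, prot_attr='prot', priv_group=1, pos_label=1)  # 0.25 - 0.5 = -0.25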
@@ -340,6 +348,9 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1,
Returns:
float: Equal opportunity difference.
See also:
:func:`~sklearn.metrics.recall_score`
"""
return difference(recall_score, y_true, y_pred, prot_attr=prot_attr,
priv_group=priv_group, pos_label=pos_label,
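Equal opportunity difference is just the difference in recall between the unprivileged and privileged groups; a minimal sketch using only scikit-learn's recall_score on toy arrays:

    import numpy as np
    from sklearn.metrics import recall_score

    y_true = np.array([1, 1, 0, 1, 1, 0])
    y_pred = np.array([1, 0, 0, 1, 1, 0])
    prot   = np.array([0, 0, 0, 1, 1, 1])   # 0 = unprivileged, 1 = privileged

    recall_unpriv = recall_score(y_true[prot == 0], y_pred[prot == 0])   # 1/2
    recall_priv   = recall_score(y_true[prot == 1], y_pred[prot == 1])   # 2/2
    recall_unpriv - recall_priv                                          # -0.5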
@@ -461,6 +472,9 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1):
index, and 2 is half the squared coefficient of variation.
pos_label (scalar, optional): The label of the positive class.
See also:
:func:`generalized_entropy_index`
References:
.. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca,
K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified
@@ -495,6 +509,9 @@ def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None,
index, and 2 is half the squared coefficient of variation.
pos_label (scalar, optional): The label of the positive class.
See also:
:func:`generalized_entropy_index`
References:
.. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca,
K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified
@@ -518,6 +535,9 @@ def theil_index(b):
Args:
b (array-like): Parameter over which to calculate the entropy index.
See also:
:func:`generalized_entropy_index`
"""
return generalized_entropy_index(b, alpha=1)

@@ -527,6 +547,9 @@ def coefficient_of_variation(b):
Args:
b (array-like): Parameter over which to calculate the entropy index.
See also:
:func:`generalized_entropy_index`
"""
return 2 * np.sqrt(generalized_entropy_index(b, alpha=2))
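Both helpers above are thin wrappers around generalized_entropy_index; a quick sketch on a toy benefit vector (how generalized_entropy_error constructs the benefits from predictions is not reproduced here):

    import numpy as np
    from aif360.sklearn.metrics import (generalized_entropy_index, theil_index,
                                        coefficient_of_variation)

    b = np.array([1.0, 1.0, 2.0])    # toy per-sample "benefit" values

    theil_index(b)                   # same as generalized_entropy_index(b, alpha=1)
    coefficient_of_variation(b)      # same as 2 * sqrt(generalized_entropy_index(b, alpha=2))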

81 changes: 50 additions & 31 deletions aif360/sklearn/postprocessing/__init__.py
@@ -33,14 +33,16 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin):
"""

def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
needs_proba=None, val_size=0.25, **options):
needs_proba=None, prefit=False, val_size=0.25, **options):
"""
Args:
estimator (sklearn.BaseEstimator): Original estimator.
postprocessor: Post-processing algorithm.
needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead of
``self.estimator_.predict()`` as input to postprocessor. If
needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead
of ``self.estimator_.predict()`` as input to postprocessor. If
``None``, defaults to ``True`` if the postprocessor supports it.
prefit (bool): If ``True``, ``estimator`` is assumed to be already fitted
and all of the data is used to train the postprocessor.
val_size (int or float): Size of validation set used to fit the
postprocessor. The estimator fits on the remainder of the
training set.
@@ -54,6 +56,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
self.estimator = estimator
self.postprocessor = postprocessor
self.needs_proba = needs_proba
self.prefit = prefit
self.val_size = val_size
self.options = options

@@ -79,14 +82,28 @@ def fit(self, X, y, sample_weight=None, **fit_params):
Returns:
self
"""
self.needs_proba_ = (self.needs_proba if self.needs_proba is not None else
isinstance(self.postprocessor, CalibratedEqualizedOdds))
self.needs_proba_ = (self.needs_proba if self.needs_proba is not None
else isinstance(self.postprocessor, CalibratedEqualizedOdds))
if self.needs_proba_ and not hasattr(self.estimator, 'predict_proba'):
raise TypeError("`estimator` (type: {}) does not implement method "
"`predict_proba()`.".format(type(self.estimator)))

if self.prefit:
if len(self.options):
warning("Splitting options were passed but prefit is True so "
"these are ignored.")
self.postprocessor_ = clone(self.postprocessor)
y_score = (self.estimator.predict(X) if not self.needs_proba_ else
self.estimator.predict_proba(X))
fit_params = fit_params.copy()
fit_params.update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_score, y, sample_weight=sample_weight,
**fit_params)
return self

if 'train_size' in self.options or 'test_size' in self.options:
warning("'train_size' and 'test_size' are ignored in favor of 'val_size'")
warning("'train_size' and 'test_size' are ignored in favor of "
"'val_size'")
options_ = self.options.copy()
options_['test_size'] = self.val_size
if 'train_size' in options_:
@@ -103,10 +120,11 @@ def fit(self, X, y, sample_weight=None, **fit_params):
X_est, X_post, y_est, y_post = train_test_split(X, y, **options_)
self.estimator_.fit(X_est, y_est)

y_pred = (self.estimator_.predict(X_post) if not self.needs_proba_ else
y_score = (self.estimator_.predict(X_post) if not self.needs_proba_ else
self.estimator_.predict_proba(X_post))
# fit_params = fit_params.copy().update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_pred, y_post, sample_weight=sw_post
fit_params = fit_params.copy()
fit_params.update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_score, y_post, sample_weight=sw_post
if sample_weight is not None else None,
**fit_params)
return self
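A hypothetical end-to-end sketch of the new prefit option. Names like X_train, y_train, and X_test are placeholders for pandas objects with the protected attribute in the index, and passing random_state through to the internal train_test_split is an assumption based on the **options forwarding above:

    from sklearn.linear_model import LogisticRegression
    from aif360.sklearn.postprocessing import PostProcessingMeta, CalibratedEqualizedOdds

    # X_train, y_train, X_test: placeholders for pandas objects with the
    # protected attribute carried in the index.

    # Default: an internal validation split (val_size) fits the estimator and
    # the post-processor on disjoint portions of the training data.
    ppm = PostProcessingMeta(LogisticRegression(max_iter=1000),
                             CalibratedEqualizedOdds(),
                             val_size=0.25, random_state=1234)
    ppm.fit(X_train, y_train)
    y_pred = ppm.predict(X_test)

    # New in this commit: prefit=True reuses an already-fitted estimator and
    # trains only the post-processor, on all of the supplied data.
    pretrained = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    ppm_prefit = PostProcessingMeta(pretrained, CalibratedEqualizedOdds(), prefit=True)
    ppm_prefit.fit(X_train, y_train)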
@@ -116,27 +134,27 @@ def predict(self, X):
"""Predict class labels for the given samples.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
Args:
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: Predicted class label per sample.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict(y_score)

@if_delegate_has_method('postprocessor_')
def predict_proba(self, X):
"""Probability estimates.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
@@ -149,18 +167,18 @@ def predict_proba(self, X):
in the model, where classes are ordered as they are in
``self.classes_``.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_proba(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict_proba(y_score)

@if_delegate_has_method('postprocessor_')
def predict_log_proba(self, X):
"""Log of probability estimates.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
@@ -173,10 +191,10 @@ def predict_log_proba(self, X):
the model, where classes are ordered as they are in
``self.classes_``.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_log_proba(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict_log_proba(y_score)

@if_delegate_has_method('postprocessor_')
def score(self, X, y, sample_weight=None):
@@ -195,10 +213,11 @@ def score(self, X, y, sample_weight=None):
Returns:
float: Score value.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.score(y_score, y,
sample_weight=sample_weight)


__all__ = [