Commit
various fixes to address PR comments
* added one-hot encoding example and random_states to demo notebook
* added 'prefit' option to PostProcessingMeta
* multiple fixes to docstring wordings
* added additional links/disclaimers in docstrings
* renamed CalibratedEqualizedOdds args to X and y
hoffmansc committed Feb 19, 2020
1 parent c34d39f commit 1b829d7
Showing 8 changed files with 324 additions and 487 deletions.
37 changes: 29 additions & 8 deletions aif360/sklearn/datasets/openml_datasets.py
@@ -41,6 +41,10 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
unprivileged). The outcome variable is 'annual-income': '>50K' (favorable)
or '<=50K' (unfavorable).
Note:
By default, the data is downloaded from OpenML. See the `adult
<https://www.openml.org/d/1590>`_ page for details.
Args:
subset ({'train', 'test', or 'all'}, optional): Select the dataset to
load: 'train' for the training set, 'test' for the test set, 'all'
@@ -60,6 +64,9 @@ def fetch_adult(subset='all', data_home=None, binary_race=True, usecols=[],
namedtuple: Tuple containing X, y, and sample_weights for the Adult
dataset accessible by index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
Examples:
>>> adult = fetch_adult()
>>> adult.X.shape
@@ -103,11 +110,9 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
unprivileged; see the binary_age flag to keep this continuous). The outcome
variable is 'credit-risk': 'good' (favorable) or 'bad' (unfavorable).
References:
.. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without
discriminating," 2nd International Conference on Computer,
Control and Communication, 2009.
<https://ieeexplore.ieee.org/abstract/document/4909197>`_
Note:
By default, the data is downloaded from OpenML. See the `credit-g
<https://www.openml.org/d/31>`_ page for details.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -126,6 +131,15 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
namedtuple: Tuple containing X and y for the German dataset accessible
by index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
References:
.. [#kamiran09] `F. Kamiran and T. Calders, "Classifying without
discriminating," 2nd International Conference on Computer,
Control and Communication, 2009.
<https://ieeexplore.ieee.org/abstract/document/4909197>`_
Examples:
>>> german = fetch_german()
>>> german.X.shape
@@ -142,7 +156,6 @@ def fetch_german(data_home=None, binary_age=True, usecols=[], dropcols=[],
>>> disparate_impact_ratio(y, y_pred, prot_attr='age', priv_group=True,
... pos_label='good')
0.9483094846144106
"""
df = to_dataframe(fetch_openml(data_id=31, target_column=None,
data_home=data_home or DATA_HOME_DEFAULT))
@@ -175,7 +188,11 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
"""Load the Bank Marketing Dataset.
The protected attribute is 'age' (left as continuous). The outcome variable
is 'deposit': ``True`` or ``False``.
is 'deposit': 'yes' or 'no'.
Note:
By default, the data is downloaded from OpenML. See the `bank-marketing
<https://www.openml.org/d/1461>`_ page for details.
Args:
data_home (string, optional): Specify another download and cache folder
@@ -193,6 +210,9 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
namedtuple: Tuple containing X and y for the Bank dataset accessible by
index or name.
See also:
:func:`sklearn.datasets.fetch_openml`
Examples:
>>> bank = fetch_bank()
>>> bank.X.shape
@@ -214,7 +234,8 @@ def fetch_bank(data_home=None, percent10=False, usecols=[], dropcols='duration',
'housing', 'loan', 'contact', 'day', 'month', 'duration',
'campaign', 'pdays', 'previous', 'poutcome', 'deposit']
# remap target
df.deposit = df.deposit.map({'1': False, '2': True}).astype('bool')
df.deposit = df.deposit.map({'1': 'no', '2': 'yes'}).astype('category')
df.deposit = df.deposit.cat.as_ordered() # 'no' < 'yes'
# replace 'unknown' marker with NaN
df.apply(lambda s: s.cat.remove_categories('unknown', inplace=True)
if hasattr(s, 'cat') and 'unknown' in s.cat.categories else s)
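For readers skimming the diff, a tiny standalone sketch (toy data, not the OpenML download) of what the new target remapping does: the former boolean column becomes an ordered 'no' < 'yes' categorical.

    import pandas as pd

    deposit = pd.Series(['1', '2', '2', '1'])                # raw OpenML coding
    deposit = deposit.map({'1': 'no', '2': 'yes'}).astype('category')
    deposit = deposit.cat.as_ordered()                       # 'no' < 'yes'
    print(deposit.cat.categories)   # Index(['no', 'yes'], dtype='object')
    print(deposit.cat.ordered)      # True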
2 changes: 1 addition & 1 deletion aif360/sklearn/inprocessing/adversarial_debiasing.py
@@ -67,7 +67,7 @@ def __init__(self, prot_attr=None, scope_name='classifier',
adversary.
verbose (bool, optional): If ``True``, print losses every 200 steps.
random_state (int or numpy.RandomState, optional): Seed of pseudo-
random number generator for shuffling data.
random number generator for shuffling data and seeding weights.
"""

self.prot_attr = prot_attr
35 changes: 29 additions & 6 deletions aif360/sklearn/metrics/metrics.py
@@ -210,8 +210,8 @@ def generalized_fpr(y_true, probas_pred, pos_label=1, sample_weight=None):
r"""Return the ratio of generalized false positives to negative examples in
the dataset, :math:`GFPR = \tfrac{GFP}{N}`.
The generalized confusion matrix is calculated by summing the probabilities
of the positive class instead of the hard predictions.
Generalized confusion matrix measures such as this are calculated by summing
the probabilities of the positive class instead of the hard predictions.
Args:
y_true (array-like): Ground-truth (correct) target values.
@@ -237,8 +237,8 @@ def generalized_fnr(y_true, probas_pred, pos_label=1, sample_weight=None):
r"""Return the ratio of generalized false negatives to positive examples in
the dataset, :math:`GFNR = \tfrac{GFN}{P}`.
The generalized confusion matrix is calculated by summing the probabilities
of the positive class instead of the hard predictions.
Generalized confusion matrix measures such as this are calculated by summing
the probabilities of the positive class instead of the hard predictions.
Args:
y_true (array-like): Ground-truth (correct) target values.
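Since both docstrings now describe generalized confusion-matrix measures, a short NumPy sketch of the underlying formulas, GFPR = GFP/N and GFNR = GFN/P, on toy scores (an illustration, not the library implementation):

    import numpy as np

    y_true = np.array([0, 0, 1, 1, 1])                   # ground truth
    probas_pred = np.array([0.2, 0.6, 0.9, 0.4, 0.7])    # P(y = 1) per sample

    neg, pos = y_true == 0, y_true == 1
    gfpr = probas_pred[neg].sum() / neg.sum()             # (0.2 + 0.6) / 2 = 0.4
    gfnr = (1 - probas_pred[pos]).sum() / pos.sum()       # (0.1 + 0.6 + 0.3) / 3 = 0.333...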
@@ -272,7 +272,8 @@ def statistical_parity_difference(*y, prot_attr=None, priv_group=1, pos_label=1,
Note:
If only y_true is provided, this will return the difference in base
rates (statistical parity difference of the original dataset).
rates (statistical parity difference of the original dataset). If both
y_true and y_pred are provided, only y_pred is used.
Args:
y_true (pandas.Series): Ground truth (correct) target values. If y_pred
@@ -287,6 +288,9 @@
Returns:
float: Statistical parity difference.
See also:
:func:`selection_rate`, :func:`base_rate`
"""
rate = base_rate if len(y) == 1 or y[1] is None else selection_rate
return difference(rate, *y, prot_attr=prot_attr, priv_group=priv_group,
@@ -302,7 +306,8 @@ def disparate_impact_ratio(*y, prot_attr=None, priv_group=1, pos_label=1,
Note:
If only y_true is provided, this will return the ratio of base rates
(disparate impact of the original dataset).
(disparate impact of the original dataset). If both y_true and y_pred
are provided, only y_pred is used.
Args:
y_true (pandas.Series): Ground truth (correct) target values. If y_pred
@@ -317,6 +322,9 @@
Returns:
float: Disparate impact.
See also:
:func:`selection_rate`, :func:`base_rate`
"""
rate = base_rate if len(y) == 1 or y[1] is None else selection_rate
return ratio(rate, *y, prot_attr=prot_attr, priv_group=priv_group,
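A toy illustration of the single-argument (y_true only) form of these two metrics, assuming the functions are importable from aif360.sklearn.metrics and that, as elsewhere in this module, the protected attribute lives in the Series index:

    import pandas as pd
    from aif360.sklearn.metrics import (statistical_parity_difference,
                                        disparate_impact_ratio)

    # 4 unprivileged samples (base rate 0.25) and 2 privileged samples (base rate 0.5)
    y = pd.Series([1, 0, 0, 0, 1, 0],
                  index=pd.Index([0, 0, 0, 0, 1, 1], name='prot'))

    disparate_impact_ratio(y, prot_attr='prot', priv_group=1, pos_label=1)         # 0.25 / 0.5 = 0.5
    statistical_parity_difference(y, prot_attr='prot', priv_group=1, pos_label=1)  # 0.25 - 0.5 = -0.25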
@@ -340,6 +348,9 @@ def equal_opportunity_difference(y_true, y_pred, prot_attr=None, priv_group=1,
Returns:
float: Equal opportunity difference.
See also:
:func:`~sklearn.metrics.recall_score`
"""
return difference(recall_score, y_true, y_pred, prot_attr=prot_attr,
priv_group=priv_group, pos_label=pos_label,
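Equal opportunity difference is just the difference in recall between the unprivileged and privileged groups; a minimal sketch using only scikit-learn's recall_score on toy arrays:

    import numpy as np
    from sklearn.metrics import recall_score

    y_true = np.array([1, 1, 0, 1, 1, 0])
    y_pred = np.array([1, 0, 0, 1, 1, 0])
    prot   = np.array([0, 0, 0, 1, 1, 1])   # 0 = unprivileged, 1 = privileged

    recall_unpriv = recall_score(y_true[prot == 0], y_pred[prot == 0])   # 1/2
    recall_priv   = recall_score(y_true[prot == 1], y_pred[prot == 1])   # 2/2
    recall_unpriv - recall_priv                                          # -0.5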
@@ -461,6 +472,9 @@ def generalized_entropy_error(y_true, y_pred, alpha=2, pos_label=1):
index, and 2 is half the squared coefficient of variation.
pos_label (scalar, optional): The label of the positive class.
See also:
:func:`generalized_entropy_index`
References:
.. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca,
K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified
@@ -495,6 +509,9 @@ def between_group_generalized_entropy_error(y_true, y_pred, prot_attr=None,
index, and 2 is half the squared coefficient of variation.
pos_label (scalar, optional): The label of the positive class.
See also:
:func:`generalized_entropy_index`
References:
.. [#speicher18] `T. Speicher, H. Heidari, N. Grgic-Hlaca,
K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar, "A Unified
@@ -518,6 +535,9 @@ def theil_index(b):
Args:
b (array-like): Parameter over which to calculate the entropy index.
See also:
:func:`generalized_entropy_index`
"""
return generalized_entropy_index(b, alpha=1)

@@ -527,6 +547,9 @@ def coefficient_of_variation(b):
Args:
b (array-like): Parameter over which to calculate the entropy index.
See also:
:func:`generalized_entropy_index`
"""
return 2 * np.sqrt(generalized_entropy_index(b, alpha=2))
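Both helpers above are thin wrappers around generalized_entropy_index; a quick sketch on a toy benefit vector (how generalized_entropy_error constructs the benefits from predictions is not reproduced here):

    import numpy as np
    from aif360.sklearn.metrics import (generalized_entropy_index, theil_index,
                                        coefficient_of_variation)

    b = np.array([1.0, 1.0, 2.0])    # toy per-sample "benefit" values

    theil_index(b)                   # same as generalized_entropy_index(b, alpha=1)
    coefficient_of_variation(b)      # same as 2 * sqrt(generalized_entropy_index(b, alpha=2))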

81 changes: 50 additions & 31 deletions aif360/sklearn/postprocessing/__init__.py
@@ -33,14 +33,16 @@ class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin):
"""

def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
needs_proba=None, val_size=0.25, **options):
needs_proba=None, prefit=False, val_size=0.25, **options):
"""
Args:
estimator (sklearn.BaseEstimator): Original estimator.
postprocessor: Post-processing algorithm.
needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead of
``self.estimator_.predict()`` as input to postprocessor. If
needs_proba (bool): Use ``self.estimator_.predict_proba()`` instead
of ``self.estimator_.predict()`` as input to postprocessor. If
``None``, defaults to ``True`` if the postprocessor supports it.
prefit (bool): If ``True``, ``estimator`` is assumed to be already fitted
and all of the data is used to train the postprocessor.
val_size (int or float): Size of validation set used to fit the
postprocessor. The estimator fits on the remainder of the
training set.
@@ -54,6 +56,7 @@ def __init__(self, estimator, postprocessor=CalibratedEqualizedOdds(),
self.estimator = estimator
self.postprocessor = postprocessor
self.needs_proba = needs_proba
self.prefit = prefit
self.val_size = val_size
self.options = options

@@ -79,14 +82,28 @@ def fit(self, X, y, sample_weight=None, **fit_params):
Returns:
self
"""
self.needs_proba_ = (self.needs_proba if self.needs_proba is not None else
isinstance(self.postprocessor, CalibratedEqualizedOdds))
self.needs_proba_ = (self.needs_proba if self.needs_proba is not None
else isinstance(self.postprocessor, CalibratedEqualizedOdds))
if self.needs_proba_ and not hasattr(self.estimator, 'predict_proba'):
raise TypeError("`estimator` (type: {}) does not implement method "
"`predict_proba()`.".format(type(self.estimator)))

if self.prefit:
if len(self.options):
warning("Splitting options were passed but prefit is True so "
"these are ignored.")
self.postprocessor_ = clone(self.postprocessor)
y_score = (self.estimator.predict(X) if not self.needs_proba_ else
self.estimator.predict_proba(X))
fit_params = fit_params.copy()
fit_params.update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_score, y, sample_weight=sample_weight,
**fit_params)
return self

if 'train_size' in self.options or 'test_size' in self.options:
warning("'train_size' and 'test_size' are ignored in favor of 'val_size'")
warning("'train_size' and 'test_size' are ignored in favor of "
"'val_size'")
options_ = self.options.copy()
options_['test_size'] = self.val_size
if 'train_size' in options_:
@@ -103,10 +120,11 @@ def fit(self, X, y, sample_weight=None, **fit_params):
X_est, X_post, y_est, y_post = train_test_split(X, y, **options_)
self.estimator_.fit(X_est, y_est)

y_pred = (self.estimator_.predict(X_post) if not self.needs_proba_ else
y_score = (self.estimator_.predict(X_post) if not self.needs_proba_ else
self.estimator_.predict_proba(X_post))
# fit_params = fit_params.copy().update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_pred, y_post, sample_weight=sw_post
fit_params = fit_params.copy()
fit_params.update(labels=self.estimator_.classes_)
self.postprocessor_.fit(y_score, y_post, sample_weight=sw_post
if sample_weight is not None else None,
**fit_params)
return self
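A hypothetical end-to-end sketch of the new prefit option. Names like X_train, y_train, and X_test are placeholders for pandas objects with the protected attribute in the index, and passing random_state through to the internal train_test_split is an assumption based on the **options forwarding above:

    from sklearn.linear_model import LogisticRegression
    from aif360.sklearn.postprocessing import PostProcessingMeta, CalibratedEqualizedOdds

    # X_train, y_train, X_test: placeholders for pandas objects with the
    # protected attribute carried in the index.

    # Default: an internal validation split (val_size) fits the estimator and
    # the post-processor on disjoint portions of the training data.
    ppm = PostProcessingMeta(LogisticRegression(max_iter=1000),
                             CalibratedEqualizedOdds(),
                             val_size=0.25, random_state=1234)
    ppm.fit(X_train, y_train)
    y_pred = ppm.predict(X_test)

    # New in this commit: prefit=True reuses an already-fitted estimator and
    # trains only the post-processor, on all of the supplied data.
    pretrained = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    ppm_prefit = PostProcessingMeta(pretrained, CalibratedEqualizedOdds(), prefit=True)
    ppm_prefit.fit(X_train, y_train)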
@@ -116,27 +134,27 @@ def predict(self, X):
"""Predict class labels for the given samples.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
Args:
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: Predicted class label per sample.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict(y_score)

@if_delegate_has_method('postprocessor_')
def predict_proba(self, X):
"""Probability estimates.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
@@ -149,18 +167,18 @@ def predict_proba(self, X):
in the model, where classes are ordered as they are in
``self.classes_``.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_proba(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict_proba(y_score)

@if_delegate_has_method('postprocessor_')
def predict_log_proba(self, X):
"""Log of probability estimates.
First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if
``self.needs_proba_`` is ``True``) then returns the post-processed output
from those predictions.
``self.needs_proba_`` is ``True``) then returns the post-processed
output from those predictions.
The returned estimates for all classes are ordered by the label of
classes.
@@ -173,10 +191,10 @@ def predict_log_proba(self, X):
the model, where classes are ordered as they are in
``self.classes_``.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.predict_log_proba(y_pred)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.predict_log_proba(y_score)

@if_delegate_has_method('postprocessor_')
def score(self, X, y, sample_weight=None):
@@ -195,10 +213,11 @@ def score(self, X, y, sample_weight=None):
Returns:
float: Score value.
"""
y_pred = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_pred = pd.DataFrame(y_pred, index=X.index).squeeze('columns')
return self.postprocessor_.score(y_pred, y, sample_weight=sample_weight)
y_score = (self.estimator_.predict(X) if not self.needs_proba_ else
self.estimator_.predict_proba(X))
y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns')
return self.postprocessor_.score(y_score, y,
sample_weight=sample_weight)


__all__ = [