scikit-learn-compatible API #134

Merged 61 commits on Feb 19, 2020

Commits
8cfa9de
Initial sklearn-compatible datasets and metrics
May 16, 2019
1f4ae57
added initial dataset tests
May 16, 2019
2aef3fc
fixed to_list for older pandas versions
May 17, 2019
2b1799a
added metrics tests
hoffmansc May 17, 2019
9da5abd
added README and docs
hoffmansc May 21, 2019
025ecc1
simpler dataset loading and 'groups' for metrics
hoffmansc May 23, 2019
8e96177
fixes to categoricals
hoffmansc Jun 5, 2019
8abb897
fixes for tests, updated README
hoffmansc Jun 5, 2019
15a8eb2
added travis badge to README
hoffmansc Jun 6, 2019
3f594a4
updated todo with external blockers
hoffmansc Jun 13, 2019
7754b32
added reweighing workaround to example
hoffmansc Jun 13, 2019
17b0c95
added Reweighing algorithm
hoffmansc Jun 18, 2019
cc9246f
clean up comments
hoffmansc Jun 18, 2019
8c58f65
fixed package version in docs
hoffmansc Jun 18, 2019
1e7899c
adding hyperlinks to SLEPs
animeshsingh Jun 20, 2019
c1c1e40
added binary_age opt to german; fixed NAs in bank
hoffmansc Jun 24, 2019
93a7cdf
modified onehot_transformer to return DataFrame
hoffmansc Jun 24, 2019
8e52268
tweaks to reweighing to conform with sklearn
hoffmansc Jun 24, 2019
0183449
updated README
hoffmansc Jun 24, 2019
89b4a79
fixed docstring formatting
hoffmansc Jun 24, 2019
d57b6df
changed metrics to use prot_attr
hoffmansc Jun 24, 2019
d8958bb
added __all__ to __init__s
hoffmansc Jun 24, 2019
0bd3837
updated notebook with reweighing example
hoffmansc Jun 27, 2019
4107dd7
initial adversarial debiasing port
hoffmansc Jul 11, 2019
df85e42
multiclass/multigroup support for adv debiasing
hoffmansc Jul 16, 2019
d2d0ddc
fix build errors
hoffmansc Jul 30, 2019
7a2414a
Add ensure_binary option to check_groups
hoffmansc Aug 12, 2019
aac9954
`numeric_only` converts index and label as well
hoffmansc Oct 29, 2019
dc317cf
changed Reweighing to return X, sample_weight
hoffmansc Oct 29, 2019
0f184c3
made sample_weight optional in check_inputs
hoffmansc Oct 29, 2019
ec4a1de
matched tests to new numeric dataset format
hoffmansc Oct 29, 2019
f8c4fc5
added generalized_fnr/fpr metrics
hoffmansc Oct 29, 2019
7ce2f42
fixed dataset_processing
hoffmansc Oct 29, 2019
973a774
initial calibrated equalized odds port
hoffmansc Oct 29, 2019
40cad96
fixed adversarial debiasing reproducibility
hoffmansc Oct 30, 2019
dc410a2
updated Getting Started notebook
hoffmansc Oct 30, 2019
e0856e3
updated readme
hoffmansc Oct 31, 2019
8f8cd76
fixed tests and added additional tests
hoffmansc Oct 31, 2019
e01f23f
added COMPAS and other dataset fixes; fixed german dataset to match p…
hoffmansc Nov 11, 2019
e92f846
fix more edge cases in metrics
hoffmansc Nov 12, 2019
27aa55c
removed unused import
hoffmansc Nov 12, 2019
831775c
make cache dir if necessary
hoffmansc Dec 9, 2019
a0e56b0
docstring, formatting, and typo fixes
hoffmansc Dec 13, 2019
0e48ead
more gitignores
hoffmansc Dec 13, 2019
0cbc3f4
docstrings and add alpha=sqrt(global_step) option
hoffmansc Dec 13, 2019
8be6449
docstrings and input is now predict_proba output
hoffmansc Dec 13, 2019
994bdf0
moved tests to main test folder
hoffmansc Dec 18, 2019
372e111
more docs and formatting changes
hoffmansc Dec 19, 2019
8d10893
postprocessor takes DataFrame if use_proba
hoffmansc Dec 19, 2019
e0ff2b6
readme changes overwritten in the merge
hoffmansc Dec 19, 2019
a2cd77e
train, test were swapped for adult
hoffmansc Dec 19, 2019
ee7f23c
remove branch mentions
hoffmansc Dec 19, 2019
c8154ec
remove "attributes" line if none present
hoffmansc Dec 20, 2019
7ef94e7
moved example to main folder
hoffmansc Dec 28, 2019
c5af647
use_proba -> needs_proba
hoffmansc Jan 31, 2020
042bb12
fixed/renamed/reordered/added some attributes
hoffmansc Jan 31, 2020
ff9e70c
fixed sample_weight=None bug and classes_ typo
hoffmansc Feb 5, 2020
57b2ab5
improved specificity_score and added fpr/fnr error
hoffmansc Feb 6, 2020
8fdd6dc
made foreign_worker and education (bank) ordered
hoffmansc Feb 6, 2020
2cf455f
various fixes to address PR comments
hoffmansc Feb 19, 2020
789e96b
added comments to tests
hoffmansc Feb 19, 2020
17 changes: 16 additions & 1 deletion .gitignore
@@ -5,8 +5,23 @@
.cache/
.ipynb_checkpoints/
.pytest_cache/
__pycache__/

.idea/
.vscode/

.eggs/
aif360.egg-info
build/
dist/

.coverage*
coverage.txt

docs/build/
docs/source/modules/generated

aif360/version.py
aif360/data/raw/**
!aif360/data/raw/*/*.md
aif360/version.py
aif360/sklearn/data/
15 changes: 9 additions & 6 deletions aif360/algorithms/inprocessing/adversarial_debiasing.py
@@ -80,14 +80,14 @@ def _classifier_model(self, features, features_dim, keep_prob):
"""
with tf.variable_scope("classifier_model"):
W1 = tf.get_variable('W1', [features_dim, self.classifier_num_hidden_units],
initializer=tf.contrib.layers.xavier_initializer())
initializer=tf.contrib.layers.xavier_initializer(seed=self.seed1))
b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1')

h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
h1 = tf.nn.dropout(h1, keep_prob=keep_prob)
h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2)

W2 = tf.get_variable('W2', [self.classifier_num_hidden_units, 1],
initializer=tf.contrib.layers.xavier_initializer())
initializer=tf.contrib.layers.xavier_initializer(seed=self.seed3))
b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

pred_logit = tf.matmul(h1, W2) + b2
@@ -103,7 +103,7 @@ def _adversary_model(self, pred_logits, true_labels):
s = tf.sigmoid((1 + tf.abs(c)) * pred_logits)

W2 = tf.get_variable('W2', [3, 1],
initializer=tf.contrib.layers.xavier_initializer())
initializer=tf.contrib.layers.xavier_initializer(seed=self.seed4))
b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

pred_protected_attribute_logit = tf.matmul(tf.concat([s, s * true_labels, s * (1.0 - true_labels)], axis=1), W2) + b2
Expand All @@ -123,6 +123,8 @@ def fit(self, dataset):
"""
if self.seed is not None:
np.random.seed(self.seed)
ii32 = np.iinfo(np.int32)
self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(ii32.min, ii32.max, size=4)

# Map the dataset labels to 0 and 1.
temp_labels = dataset.labels.copy()
@@ -177,14 +179,15 @@ def fit(self, dataset):

if self.debias:
# Update adversary parameters
adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars, global_step=global_step)
with tf.control_dependencies([classifier_minimizer]):
adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step)

self.sess.run(tf.global_variables_initializer())
self.sess.run(tf.local_variables_initializer())

# Begin training
for epoch in range(self.num_epochs):
shuffled_ids = np.random.choice(num_train_samples, num_train_samples)
shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False)
for i in range(num_train_samples//self.batch_size):
batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)]
batch_features = dataset.features[batch_ids]
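The seeding changes in this file address a reproducibility gap: TF1's `xavier_initializer` and `tf.nn.dropout` draw from TensorFlow's own random state unless given an explicit `seed`, so calling `np.random.seed` alone did not make training repeatable. A minimal sketch of the derived-seed pattern, in plain NumPy rather than the PR's TensorFlow code:

```python
import numpy as np

def derive_op_seeds(master_seed, n_ops=4):
    """Derive deterministic per-operation seeds from one master seed,
    mirroring how the diff above produces seed1..seed4."""
    ii32 = np.iinfo(np.int32)
    np.random.seed(master_seed)
    return np.random.randint(ii32.min, ii32.max, size=n_ops)

# Same master seed -> identical per-op seeds -> reproducible initializers/dropout.
assert (derive_op_seeds(1234) == derive_op_seeds(1234)).all()
```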
aif360/algorithms/postprocessing/calibrated_eq_odds_postprocessing.py
@@ -171,16 +171,16 @@ def predict(self, dataset, threshold=0.5):
dataset.protected_attribute_names,
self.unprivileged_groups)

priv_indices = (np.random.random(sum(cond_vec_priv))
<= self.priv_mix_rate)
priv_new_pred = dataset.scores[cond_vec_priv].copy()
priv_new_pred[priv_indices] = self.base_rate_priv

unpriv_indices = (np.random.random(sum(cond_vec_unpriv))
<= self.unpriv_mix_rate)
unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy()
unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv

priv_indices = (np.random.random(sum(cond_vec_priv))
<= self.priv_mix_rate)
priv_new_pred = dataset.scores[cond_vec_priv].copy()
priv_new_pred[priv_indices] = self.base_rate_priv

dataset_new = dataset.copy(deepcopy=True)

dataset_new.scores = np.zeros_like(dataset.scores, dtype=np.float64)
@@ -208,4 +208,4 @@ def weighted_cost(fp_rate, fn_rate, cm, privileged):
* (1 - cm.base_rate(privileged=privileged))) +
(fn_rate / norm_const
* cm.generalized_false_negative_rate(privileged=privileged)
* (1 - cm.base_rate(privileged=privileged))))
* cm.base_rate(privileged=privileged)))
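The one-line change above fixes the false-negative term of the cost: it should be weighted by the positive-class base rate, not its complement. A hedged sketch of the corrected expression (the `norm_const` handling here is an assumption; the actual value is computed earlier in `weighted_cost`):

```python
def weighted_cost_sketch(fp_rate, fn_rate, gfpr, gfnr, base_rate):
    """FP term scaled by negative-class prevalence (1 - base_rate),
    FN term by positive-class prevalence (base_rate)."""
    # Assumed normalizer; falls back to 1 when either rate is zero.
    norm_const = float(fp_rate + fn_rate) if fp_rate and fn_rate else 1.0
    return ((fp_rate / norm_const) * gfpr * (1 - base_rate)
            + (fn_rate / norm_const) * gfnr * base_rate)
```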
2 changes: 1 addition & 1 deletion aif360/datasets/adult_dataset.py
@@ -99,7 +99,7 @@ def __init__(self, label_name='income-per-year',
import sys
sys.exit(1)

df = pd.concat([train, test], ignore_index=True)
df = pd.concat([test, train], ignore_index=True)

super(AdultDataset, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
23 changes: 17 additions & 6 deletions aif360/datasets/structured_dataset.py
@@ -411,14 +411,25 @@ def import_dataset(self, import_metadata=False):
return None

def split(self, num_or_size_splits, shuffle=False, seed=None):
"""Split the dataset into multiple datasets
"""Split this dataset into multiple partitions.

Args:
num_or_size_splits (list or int):
shuffle (bool):
seed (int or array_like): takes the same argument as `numpy.random.seed()`
function
num_or_size_splits (array or int): If `num_or_size_splits` is an
int, *k*, the value is the number of equal-sized folds to make
(if *k* does not evenly divide the dataset these folds are
approximately equal-sized). If `num_or_size_splits` is an array
of type int, the values are taken as the indices at which to
split the dataset. If the values are floats (< 1.), they are
considered to be fractional proportions of the dataset at which
to split.
shuffle (bool, optional): Randomly shuffle the dataset before
splitting.
seed (int or array_like): Takes the same argument as
:func:`numpy.random.seed()`.

Returns:
list: Each element of this list is a dataset obtained during the split
list: Splits. Contains *k* or `len(num_or_size_splits) + 1`
datasets depending on `num_or_size_splits`.
"""

# Set seed
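The rewritten docstring above distinguishes three forms of `num_or_size_splits`. A small NumPy illustration of the same semantics (the real method partitions a `StructuredDataset`, not an array):

```python
import numpy as np

data = np.arange(10)

np.array_split(data, 3)          # int k: 3 approximately equal-sized folds
np.split(data, [3, 7])           # list of ints: split at indices 3 and 7
fracs = np.array([0.5, 0.8])     # floats < 1: fractional proportions
np.split(data, (fracs * len(data)).astype(int))
```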
47 changes: 47 additions & 0 deletions aif360/sklearn/README.md
@@ -0,0 +1,47 @@
## `aif360.sklearn`

This is a wholly separate interface from the main AIF360 package for
interacting with data, viewing metrics, and running debiasing algorithms. The
purpose of this sub-package is to match scikit-learn paradigms/APIs for easier
integration into typical machine learning workflows.

See [Getting Started](examples/Getting%20Started.ipynb) to see `aif360.sklearn`
in action.

To do:

- [x] Reformat datasets as separate X and y (and sample_weight) DataFrame
objects with sample properties (protected attributes) as the index
- [ ] Load included datasets in the above format
- [x] Use `sklearn.datasets.fetch_openml` to load UCI datasets (#53)
- [ ] COMPAS
- [ ] MEPS
- [ ] Implement metrics as individual functions instead of instance methods
- [x] Make certain metrics compatible as sklearn scorers
- [x] Use "prot_attr" and "priv_group" keywords to specify protected attributes to
functions
- [x] Generalized confusion matrix
- [ ] Sample distortion metrics
- [ ] Make inprocessing algorithms compatible as sklearn `Estimator`s
- [x] Adversarial debiasing
- [ ] **[External]** `get_feature_names()` from data preprocessing
steps that would remove DataFrame formatting
- [ ] [SLEP007](https://github.com/scikit-learn/enhancement_proposals/pull/17)/[SLEP008](https://github.com/scikit-learn/enhancement_proposals/pull/18) - feature names
- [ ] Prejudice remover
- [ ] Meta-fair classifier
- [ ] Make preprocessing algorithms compatible as sklearn `Transformer`s
- [ ] **[External]** Add functionality to modify X and y
- [ ] [SLEP005](https://github.com/scikit-learn/enhancement_proposals/pull/15) - Resampler API (see discussion; meta-estimator workaround may be enough)
- [ ] Disparate impact remover
- [ ] Learning fair representations
- [ ] Optimized preprocessing
- [X] Reweighing
- [X] Meta-estimator workaround
- [ ] **[External]** [SLEP006](https://github.com/scikit-learn/enhancement_proposals/pull/16) - Sample properties (meta-estimator works but would be very nice to have)
- [ ] Make postprocessing algorithms compatible
- [x] Calibrated equalized odds postprocessing
- [x] Meta-estimator workaround again
- [ ] Equalized odds postprocessing
- [ ] Reject option classification
- [ ] Miscellaneous:
- [ ] Explainers
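A minimal sketch of the workflow this README targets. `fetch_compas` is added in this PR; the metric name `statistical_parity_difference` and its exact signature are assumptions based on the `prot_attr`/`priv_group` convention listed above.

```python
from sklearn.linear_model import LogisticRegression

from aif360.sklearn.datasets import fetch_compas
# Assumed metric name/signature -- not shown in this diff.
from aif360.sklearn.metrics import statistical_parity_difference

# Protected attributes ('sex', 'race') live in the index of X and y.
X, y = fetch_compas(numeric_only=True, binary_race=True)

y_pred = LogisticRegression(max_iter=1000).fit(X, y).predict(X)
print(statistical_parity_difference(y, y_pred, prot_attr='race'))
```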
Empty file added aif360/sklearn/__init__.py
13 changes: 13 additions & 0 deletions aif360/sklearn/datasets/__init__.py
@@ -0,0 +1,13 @@
"""
The dataset format for ``aif360.sklearn`` is a :class:`pandas.DataFrame` with
protected attributes in the index.

Warning:
Currently, while all scikit-learn classes will accept DataFrames as inputs,
most classes will return a :class:`numpy.ndarray`. Therefore, many pre-
processing steps, when placed before an ``aif360.sklearn`` step in a
Pipeline, will cause errors.
"""
from aif360.sklearn.datasets.utils import *
from aif360.sklearn.datasets.openml_datasets import *
from aif360.sklearn.datasets.compas_dataset import fetch_compas
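A toy illustration of the format this docstring describes, with made-up values: protected attributes sit in the (Multi)Index of both `X` and `y`.

```python
import pandas as pd

idx = pd.MultiIndex.from_arrays(
    [['Female', 'Male', 'Male'],
     ['Caucasian', 'African-American', 'Caucasian']],
    names=['sex', 'race'])
X = pd.DataFrame({'age': [25, 40, 33], 'priors_count': [0, 2, 1]}, index=idx)
y = pd.Series([0, 1, 0], index=idx, name='two_year_recid')

# Workaround for the warning above: re-attach the index after a scikit-learn
# step that returns an ndarray, e.g.
# X_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
```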
83 changes: 83 additions & 0 deletions aif360/sklearn/datasets/compas_dataset.py
@@ -0,0 +1,83 @@
import os

import pandas as pd

from aif360.sklearn.datasets.utils import standardize_dataset


# cache location
DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'data', 'raw')
COMPAS_URL = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'

def fetch_compas(data_home=None, binary_race=False,
usecols=['sex', 'age', 'age_cat', 'race', 'juv_fel_count',
'juv_misd_count', 'juv_other_count', 'priors_count',
'c_charge_degree', 'c_charge_desc'],
dropcols=[], numeric_only=False, dropna=True):
"""Load the COMPAS Recidivism Risk Scores dataset.

Optionally binarizes 'race' to 'Caucasian' (privileged) or
'African-American' (unprivileged). The other protected attribute is 'sex'
('Male' is *unprivileged* and 'Female' is *privileged*). The outcome
variable is 'Survived' (favorable) if the person was not accused of a crime
within two years or 'Recidivated' (unfavorable) if they were.

Note:
The values for the 'sex' variable if numeric_only is ``True`` are 1 for
'Female' and 0 for 'Male' -- opposite the convention of other datasets.

Args:
data_home (string, optional): Specify another download and cache folder
for the datasets. By default all AIF360 datasets are stored in
'aif360/sklearn/data/raw' subfolders.
binary_race (bool, optional): Filter only White and Black defendants.
usecols (single label or list-like, optional): Feature column(s) to
keep. All others are dropped.
dropcols (single label or list-like, optional): Feature column(s) to
drop.
numeric_only (bool): Drop all non-numeric feature columns.
dropna (bool): Drop rows with NAs.

Returns:
namedtuple: Tuple containing X and y for the COMPAS dataset accessible
by index or name.
"""
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
os.path.basename(COMPAS_URL))
if os.path.isfile(cache_path):
df = pd.read_csv(cache_path, index_col='id')
else:
df = pd.read_csv(COMPAS_URL, index_col='id')
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
df.to_csv(cache_path)

# Perform the same preprocessing as the original analysis:
# https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
df = df[(df.days_b_screening_arrest <= 30)
& (df.days_b_screening_arrest >= -30)
& (df.is_recid != -1)
& (df.c_charge_degree != 'O')
& (df.score_text != 'N/A')]

for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']:
df[col] = df[col].astype('category')

# 'Survived' < 'Recidivated'
cats = ['Survived', 'Recidivated']
df.two_year_recid = df.two_year_recid.replace([0, 1], cats).astype('category')
df.two_year_recid = df.two_year_recid.cat.set_categories(cats, ordered=True)

if binary_race:
# 'African-American' < 'Caucasian'
df.race = df.race.cat.set_categories(['African-American', 'Caucasian'],
ordered=True)

# 'Male' < 'Female'
df.sex = df.sex.astype('category').cat.reorder_categories(
['Male', 'Female'], ordered=True)

return standardize_dataset(df, prot_attr=['sex', 'race'],
target='two_year_recid', usecols=usecols,
dropcols=dropcols, numeric_only=numeric_only,
dropna=dropna)
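A brief usage sketch for the loader above; the commented output is illustrative, not verified here.

```python
from aif360.sklearn.datasets import fetch_compas

X, y = fetch_compas(binary_race=True)  # downloads and caches on first call
print(X.index.names)                   # protected attributes, e.g. ['sex', 'race']
print(y.cat.categories.tolist())       # ['Survived', 'Recidivated']
```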