diff --git a/tests/sklearn/test_adversarial_debiasing.py b/tests/sklearn/test_adversarial_debiasing.py
index c28fb17c..f7dd19d4 100644
--- a/tests/sklearn/test_adversarial_debiasing.py
+++ b/tests/sklearn/test_adversarial_debiasing.py
@@ -15,6 +15,7 @@
                      'hours-per-week'], features_to_drop=[])
 
 def test_adv_debias_old_reproduce():
+    """Test that the old AdversarialDebiasing is reproducible."""
     sess = tf.Session()
     old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}],
                                           privileged_groups=[{'sex': 1}],
@@ -34,6 +35,8 @@ def test_adv_debias_old_reproduce():
     assert np.allclose(old_preds.labels, old_preds2.labels)
 
 def test_adv_debias_old():
+    """Test that the predictions of the old and new AdversarialDebiasing match.
+    """
     tf.reset_default_graph()
     sess = tf.Session()
     old_adv_deb = OldAdversarialDebiasing(unprivileged_groups=[{'sex': 0}],
@@ -48,6 +51,7 @@ def test_adv_debias_old():
     assert np.allclose(old_preds.labels.flatten(), new_preds)
 
 def test_adv_debias_reproduce():
+    """Test that the new AdversarialDebiasing is reproducible."""
     adv_deb = AdversarialDebiasing('sex', num_epochs=5, random_state=123)
     new_preds = adv_deb.fit(X, y).predict(X)
     adv_deb.sess_.close()
@@ -60,12 +64,16 @@
     assert new_acc == accuracy_score(y, new_preds)
 
 def test_adv_debias_intersection():
+    """Test that the new AdversarialDebiasing runs with >2 protected groups."""
     adv_deb = AdversarialDebiasing(scope_name='intersect', num_epochs=5)
     adv_deb.fit(X, y)
     adv_deb.sess_.close()
     assert adv_deb.adversary_logits_.shape[1] == 4
 
 def test_adv_debias_grid():
+    """Test that the new AdversarialDebiasing works in a grid search (and that
+    debiasing results in reduced accuracy).
+    """
     adv_deb = AdversarialDebiasing('sex', num_epochs=10, random_state=123)
 
     params = {'debias': [True, False]}
diff --git a/tests/sklearn/test_calibrated_equalized_odds.py b/tests/sklearn/test_calibrated_equalized_odds.py
index 3352b548..3bfffaf5 100644
--- a/tests/sklearn/test_calibrated_equalized_odds.py
+++ b/tests/sklearn/test_calibrated_equalized_odds.py
@@ -14,6 +14,9 @@
                      'hours-per-week'], features_to_drop=[])
 
 def test_calib_eq_odds_sex_weighted():
+    """Test that the old and new CalibratedEqualizedOdds produce the same mix
+    rates.
+    """
     logreg = LogisticRegression(solver='lbfgs', max_iter=500)
     y_pred = logreg.fit(X, y, sample_weight=sample_weight).predict_proba(X)
     adult_pred = adult.copy()
@@ -28,6 +31,12 @@ def test_calib_eq_odds_sex_weighted():
     assert np.isclose(orig_cal_eq_odds.unpriv_mix_rate, cal_eq_odds.mix_rates_[0])
 
 def test_postprocessingmeta_fnr():
+    """Test that the old and new CalibratedEqualizedOdds produce the same
+    probability predictions.
+
+    This tests the whole "pipeline": splitting the data the same way, training a
+    LogisticRegression classifier, and training the post-processor the same way.
+ """ adult_train, adult_test = adult.split([0.9], shuffle=False) X_tr, X_te, y_tr, _, sw_tr, _ = train_test_split(X, y, sample_weight, train_size=0.9, shuffle=False) @@ -52,7 +61,8 @@ def test_postprocessingmeta_fnr(): orig_cal_eq_odds.fit(adult_post, adult_pred) cal_eq_odds = PostProcessingMeta(estimator=logreg, - postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', random_state=0), + postprocessor=CalibratedEqualizedOdds('sex', cost_constraint='fnr', + random_state=0), shuffle=False) cal_eq_odds.fit(X_tr, y_tr, sample_weight=sw_tr) diff --git a/tests/sklearn/test_datasets.py b/tests/sklearn/test_datasets.py index 1d2ec6a0..2b0fb2c5 100644 --- a/tests/sklearn/test_datasets.py +++ b/tests/sklearn/test_datasets.py @@ -15,6 +15,7 @@ dropna=False) def test_standardize_dataset_basic(): + """Tests standardize_dataset on a toy example.""" dataset = basic() X, y = dataset X, y = dataset.X, dataset.y @@ -28,11 +29,13 @@ def test_standardize_dataset_basic(): assert X.shape == (3, 3) def test_sample_weight_basic(): + """Tests returning sample_weight on a toy example.""" with_weights = basic(sample_weight='X2') assert len(with_weights) == 3 assert with_weights.X.shape == (3, 2) def test_usecols_dropcols_basic(): + """Tests various combinations of usecols and dropcols on a toy example.""" assert basic(usecols='X1').X.columns.tolist() == ['X1'] assert basic(usecols=['X1', 'Z']).X.columns.tolist() == ['X1', 'Z'] @@ -44,17 +47,20 @@ def test_usecols_dropcols_basic(): pd.DataFrame) def test_dropna_basic(): + """Tests dropna on a toy example.""" basic_dropna = partial(standardize_dataset, df=df, prot_attr='Z', target='y', dropna=True) assert basic_dropna().X.shape == (2, 3) assert basic(dropcols='X1').X.shape == (3, 2) def test_numeric_only_basic(): + """Tests numeric_only on a toy example.""" assert basic(prot_attr='X2', numeric_only=True).X.shape == (3, 2) assert (basic(prot_attr='X2', dropcols='Z', numeric_only=True).X.shape == (3, 2)) def test_fetch_adult(): + """Tests Adult Income dataset shapes with various options.""" adult = fetch_adult() assert len(adult) == 3 assert adult.X.shape == (45222, 13) @@ -62,12 +68,14 @@ def test_fetch_adult(): assert fetch_adult(numeric_only=True).X.shape == (48842, 7) def test_fetch_german(): + """Tests German Credit dataset shapes with various options.""" german = fetch_german() assert len(german) == 2 assert german.X.shape == (1000, 21) assert fetch_german(numeric_only=True).X.shape == (1000, 9) def test_fetch_bank(): + """Tests Bank Marketing dataset shapes with various options.""" bank = fetch_bank() assert len(bank) == 2 assert bank.X.shape == (45211, 15) @@ -76,6 +84,7 @@ def test_fetch_bank(): @pytest.mark.filterwarnings('error', category=ColumnAlreadyDroppedWarning) def test_fetch_compas(): + """Tests COMPAS Recidivism dataset shapes with various options.""" compas = fetch_compas() assert len(compas) == 2 assert compas.X.shape == (6167, 10) @@ -84,5 +93,6 @@ def test_fetch_compas(): assert fetch_compas(numeric_only=True).X.shape == (6172, 6) def test_onehot_transformer(): + """Tests that categorical features can be correctly one-hot encoded.""" X, y = fetch_german() assert len(pd.get_dummies(X).columns) == 63 diff --git a/tests/sklearn/test_metrics.py b/tests/sklearn/test_metrics.py index 326c7c8b..916d2ce5 100644 --- a/tests/sklearn/test_metrics.py +++ b/tests/sklearn/test_metrics.py @@ -29,61 +29,75 @@ privileged_groups=[{'sex': 1}]) def test_dataset_equality(): + """Tests that the old and new datasets match exactly.""" assert (adult.features 
     assert (adult.labels.ravel() == y).all()
 
 def test_consistency():
+    """Test that the old and new consistency_score match exactly."""
     assert np.isclose(consistency_score(X, y), cm.consistency())
 
 def test_specificity():
+    """Test that the old and new specificity_score match exactly."""
     spec = specificity_score(y, y_pred, sample_weight=sample_weight)
     assert spec == cm.specificity()
 
 def test_base_rate():
+    """Test that the old and new base_rate match exactly."""
     base = base_rate(y, y_pred, sample_weight=sample_weight)
     assert base == cm.base_rate()
 
 def test_selection_rate():
+    """Test that the old and new selection_rate match exactly."""
     select = selection_rate(y, y_pred, sample_weight=sample_weight)
     assert select == cm.selection_rate()
 
 def test_generalized_fpr():
+    """Test that the old and new generalized_fpr match exactly."""
     gfpr = generalized_fpr(y, y_proba, sample_weight=sample_weight)
     assert np.isclose(gfpr, cm.generalized_false_positive_rate())
 
 def test_generalized_fnr():
+    """Test that the old and new generalized_fnr match exactly."""
     gfnr = generalized_fnr(y, y_proba, sample_weight=sample_weight)
     assert np.isclose(gfnr, cm.generalized_false_negative_rate())
 
 def test_disparate_impact():
+    """Test that the old and new disparate_impact match exactly."""
     di = disparate_impact_ratio(y, y_pred, prot_attr='sex',
                                 sample_weight=sample_weight)
     assert di == cm.disparate_impact()
 
 def test_statistical_parity():
+    """Test that the old and new statistical_parity match exactly."""
     stat = statistical_parity_difference(y, y_pred, prot_attr='sex',
                                          sample_weight=sample_weight)
     assert stat == cm.statistical_parity_difference()
 
 def test_equal_opportunity():
+    """Test that the old and new equal_opportunity match exactly."""
     eopp = equal_opportunity_difference(y, y_pred, prot_attr='sex',
                                         sample_weight=sample_weight)
     assert eopp == cm.equal_opportunity_difference()
 
 def test_average_odds_difference():
+    """Test that the old and new average_odds_difference match exactly."""
     aod = average_odds_difference(y, y_pred, prot_attr='sex',
                                   sample_weight=sample_weight)
     assert np.isclose(aod, cm.average_odds_difference())
 
 def test_average_odds_error():
+    """Test that the old and new average_odds_error match exactly."""
     aoe = average_odds_error(y, y_pred, prot_attr='sex',
                              sample_weight=sample_weight)
     assert np.isclose(aoe, cm.average_abs_odds_difference())
 
 def test_generalized_entropy_index():
+    """Test that the old and new generalized_entropy_index match exactly."""
     gei = generalized_entropy_error(y, y_pred)
     assert np.isclose(gei, cm.generalized_entropy_index())
 
 def test_between_group_generalized_entropy_index():
+    """Test that the old and new between_group_GEI match exactly."""
     bggei = between_group_generalized_entropy_error(y, y_pred, prot_attr='sex')
     assert bggei == cm.between_group_generalized_entropy_index()
diff --git a/tests/sklearn/test_reweighing.py b/tests/sklearn/test_reweighing.py
index 97631043..f8046fe9 100644
--- a/tests/sklearn/test_reweighing.py
+++ b/tests/sklearn/test_reweighing.py
@@ -9,36 +9,32 @@
 from aif360.sklearn.preprocessing import Reweighing, ReweighingMeta
 
-# X, y = fetch_german(numeric_only=True, dropcols='duration')
-# X.age = (X.age >= 25).astype('int')
-# german = GermanDataset(categorical_features=[], features_to_keep=[
-#     'credit_amount', 'investment_as_income_percentage', 'residence_since',
-#     'age', 'number_of_credits', 'people_liable_for', 'sex'])
 X, y, sample_weight = fetch_adult(numeric_only=True)
 adult = AdultDataset(instance_weights_name='fnlwgt', categorical_features=[],
                      features_to_keep=['age', 'education-num', 'capital-gain',
                                        'capital-loss', 'hours-per-week'],
                      features_to_drop=[])
 
 def test_reweighing_sex():
+    """Test that the old and new Reweighing produce the same sample_weights."""
     orig_rew = OrigReweighing(unprivileged_groups=[{'sex': 0}],
                               privileged_groups=[{'sex': 1}])
     adult_fair = orig_rew.fit_transform(adult)
 
     rew = Reweighing('sex')
     _, new_sample_weight = rew.fit_transform(X, y, sample_weight=sample_weight)
-    # assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav],
-    #                     [orig_rew.w_p_unfav, orig_rew.w_p_fav]],
-    #                    rew.reweigh_factors_)
+    assert np.allclose([[orig_rew.w_up_unfav, orig_rew.w_up_fav],
+                        [orig_rew.w_p_unfav, orig_rew.w_p_fav]],
+                       rew.reweigh_factors_)
     assert np.allclose(adult_fair.instance_weights, new_sample_weight)
 
 def test_reweighing_intersection():
+    """Test that the new Reweighing runs with >2 protected groups."""
     rew = Reweighing()
     rew.fit_transform(X, y)
     assert rew.reweigh_factors_.shape == (4, 2)
 
 def test_gridsearch():
-    # logreg = LogisticRegression(solver='lbfgs', max_iter=500)
-    # rew = ReweighingMeta(estimator=logreg, reweigher=Reweighing('sex'))
+    """Test that ReweighingMeta works in a grid search."""
     rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))
 
     # UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597
@@ -51,4 +47,3 @@ def score_func(y_true, y_pred, sample_weight):
 
     clf = GridSearchCV(rew, params, scoring=scoring, cv=5, iid=False)
     clf.fit(X, y, **{'sample_weight': sample_weight})
-    # print(clf.best_score_)