From 1669b39467d5986dec30151e18b28a4518cf5316 Mon Sep 17 00:00:00 2001 From: Vadym Doroshenko Date: Mon, 27 Nov 2023 15:28:04 +0100 Subject: [PATCH 1/2] Fix of setting partition selection strategy --- analysis/utility_analysis.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/analysis/utility_analysis.py b/analysis/utility_analysis.py index 198b7df9..1f584bd3 100644 --- a/analysis/utility_analysis.py +++ b/analysis/utility_analysis.py @@ -115,21 +115,6 @@ def perform_utility_analysis( "Compute cross-partition metrics") # ((configuration_index, bucket), UtilityReport) - if public_partitions is None: - # Add partition selection strategy for private partitions. - strategies = data_structures.get_partition_selection_strategy(options) - - def add_partition_selection_strategy(report: metrics.UtilityReport): - # Beam does not allow to change input arguments in map, so copy it. - report = copy.deepcopy(report) - report.partitions_info.strategy = strategies[ - report.configuration_index] - return report - - cross_partition_metrics = backend.map_values( - cross_partition_metrics, add_partition_selection_strategy, - "Add Partition Selection Strategy") - cross_partition_metrics = backend.map_tuple( cross_partition_metrics, lambda key, value: (key[0], (key[1], value)), "Rekey") @@ -140,6 +125,22 @@ def add_partition_selection_strategy(report: metrics.UtilityReport): # (configuration_index, Iterable[(bucket, UtilityReport)]) result = backend.map_tuple(cross_partition_metrics, _group_utility_reports, "Group utility reports") + if public_partitions is None: + # Add partition selection strategy for private partitions. + strategies = data_structures.get_partition_selection_strategy(options) + + def add_partition_selection_strategy(report: metrics.UtilityReport): + # Beam does not allow to change input arguments in map, so copy it. + report = copy.deepcopy(report) + strategy = strategies[report.configuration_index] + report.partitions_info.strategy = strategy + for bin in report.utility_report_histogram: + bin.report.partitions_info.strategy = strategy + return report + + result = backend.map(result, add_partition_selection_strategy, + "Add Partition Selection Strategy") + # result: (UtilityReport) # result: (UtilityReport) return result, per_partition_result From f288b3be792587ec6f6c1cd93c858b4d903f07d5 Mon Sep 17 00:00:00 2001 From: Vadym Doroshenko Date: Fri, 1 Dec 2023 15:46:54 +0100 Subject: [PATCH 2/2] tests --- analysis/tests/utility_analysis_test.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/analysis/tests/utility_analysis_test.py b/analysis/tests/utility_analysis_test.py index 3bd9951b..961fc08e 100644 --- a/analysis/tests/utility_analysis_test.py +++ b/analysis/tests/utility_analysis_test.py @@ -243,7 +243,11 @@ def test_multi_parameters(self): multi_param = analysis.MultiParameterConfiguration( max_partitions_contributed=[1, 2], - max_contributions_per_partition=[1, 2]) + max_contributions_per_partition=[1, 2], + partition_selection_strategy=[ + pipeline_dp.PartitionSelectionStrategy.TRUNCATED_GEOMETRIC, + pipeline_dp.PartitionSelectionStrategy.GAUSSIAN_THRESHOLDING + ]) # Input collection has 1 privacy id, which contributes to 2 partitions # 1 and 2 times correspondingly. @@ -253,8 +257,6 @@ def test_multi_parameters(self): partition_extractor=lambda x: x[1], value_extractor=lambda x: 0) - public_partitions = ["pk0", "pk1"] - output, _ = analysis.perform_utility_analysis( col=input, backend=pipeline_dp.LocalBackend(), @@ -264,7 +266,6 @@ def test_multi_parameters(self): aggregate_params=aggregate_params, multi_param_configuration=multi_param), data_extractors=data_extractors, - public_partitions=public_partitions, ) utility_reports = list(output) @@ -273,16 +274,16 @@ def test_multi_parameters(self): self.assertLen(utility_reports, 2) # one report per each configuration. # Check the parameter configuration - expected_noise_std = [3.02734375, 8.56262117843085] + expected_noise_std = [5.9765625, 16.904271487740903] expected_l0_error = [-0.5, 0] - expected_partition_info = metrics.PartitionsInfo( - public_partitions=True, - num_dataset_partitions=2, - num_non_public_partitions=0, - num_empty_partitions=0) for i_configuration, report in enumerate(utility_reports): self.assertEqual(report.configuration_index, i_configuration) - self.assertEqual(report.partitions_info, expected_partition_info) + self.assertFalse(report.partitions_info.public_partitions) + self.assertEqual(report.partitions_info.num_dataset_partitions, 2) + self.assertEqual(report.partitions_info.num_dataset_partitions, 2) + self.assertEqual( + report.partitions_info.strategy, + multi_param.partition_selection_strategy[i_configuration]) self.assertLen(report.metric_errors, 1) # metrics for COUNT errors = report.metric_errors[0] self.assertEqual(errors.metric, pipeline_dp.Metrics.COUNT)