forked from cmla-psu/statdp
-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
__init__.py
192 lines (165 loc) · 8.93 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# MIT License
#
# Copyright (c) 2018 Yuxin Wang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import matplotlib.pyplot as plt
import matplotlib
import coloredlogs
from pathlib import Path
from jsonpickle import encode
import json
import time
from textwrap import dedent
import argparse
import logging
import multiprocessing as mp
from tqdm import tqdm
from pystatdp.generators import generate_arguments, generate_databases, ALL_DIFFER, ONE_DIFFER
from pystatdp.hypotest import hypothesis_test
from pystatdp.selectors import select_event
from pystatdp.algorithms import generic_method
logger = logging.getLogger(__name__)
matplotlib.use('agg')
matplotlib.rcParams['xtick.labelsize'] = '12'
matplotlib.rcParams['ytick.labelsize'] = '12'
coloredlogs.install(
'INFO', fmt='%(asctime)s [0x%(process)x] %(levelname)s %(message)s')
class pystatdp:
def plot_result(self, data, xlabel, ylabel, title, output_filename):
"""plot the results similar to the figures in our paper
:param data: The input data sets to plots. e.g., {algorithm_epsilon: [(test_epsilon, pvalue), ...]}
:param xlabel: The label for x axis.
:param ylabel: The label for y axis.
:param title: The title of the figure.
:param output_filename: The output file name.
:return: None
"""
# setup the figure
plt.ylim(0.0, 1.0)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.title(title)
# colors and markers for each claimed epsilon
markers = ['s', 'o', '^', 'x', '*', '+', 'p']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
'#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# add an auxiliary line for p-value=0.05
plt.axhline(y=0.05, color='black', linestyle='dashed', linewidth=1.2)
for i, (epsilon, points) in enumerate(data.items()):
# add an auxiliary vertical line for the claimed privacy
plt.axvline(x=float(epsilon), color=colors[i % len(
colors)], linestyle='dashed', linewidth=1.2)
# plot the
x = [item[0] for item in points]
p = [item[1] for item in points]
plt.plot(x, p, 'o-',
label=f'$\\epsilon_0$ = {epsilon}', markersize=8, marker=markers[i % len(markers)], linewidth=3)
# plot legends
legend = plt.legend()
legend.get_frame().set_linewidth(0.0)
# save the figure and clear the canvas for next draw
plt.savefig(output_filename, bbox_inches='tight')
plt.gcf().clear()
def detect_counterexample(self, algorithm, test_epsilon, default_kwargs=None, databases=None, num_input=(5, 10),
event_iterations=100000, detect_iterations=500000, cores=None, sensitivity=ALL_DIFFER,
quiet=False, loglevel=logging.INFO):
"""
:param algorithm: The algorithm to test for.
:param test_epsilon: The privacy budget to test for, can either be a number or a tuple/list.
:param default_kwargs: The default arguments the algorithm needs except the first Queries argument.
:param databases: The databases to run for detection, optional.
:param num_input: The length of input to generate, not used if database param is specified.
:param event_iterations: The iterations for event selector to run.
:param detect_iterations: The iterations for detector to run.
:param cores: The number of max processes to set for multiprocessing.Pool(), os.cpu_count() is used if None.
:param sensitivity: The sensitivity setting, all queries can differ by one or just one query can differ by one.
:param quiet: Do not print progress bar or messages, logs are not affected.
:param loglevel: The loglevel for logging package.
:return: [(epsilon, p, d1, d2, kwargs, event)] The epsilon-p pairs along with databases/arguments/selected event.
"""
# initialize an empty default kwargs if None is given
default_kwargs = default_kwargs if default_kwargs else {}
logging.basicConfig(level=loglevel)
logger.info(
f'Start detection for counterexample on {algorithm.__name__} with test epsilon {test_epsilon}')
logger.info(
f'Options -> default_kwargs: {default_kwargs} | databases: {databases} | cores:{cores}')
input_list = []
if databases is not None:
d1, d2 = databases
kwargs = generate_arguments(
algorithm, d1, d2, default_kwargs=default_kwargs)
input_list = ((d1, d2, kwargs),)
else:
num_input = (int(num_input), ) if isinstance(
num_input, (int, float)) else num_input
for num in num_input:
input_list.extend(
generate_databases(algorithm, num, default_kwargs=default_kwargs, sensitivity=sensitivity))
result = []
# convert int/float or iterable into tuple (so that it has length information)
test_epsilon = (test_epsilon, ) if isinstance(
test_epsilon, (int, float)) else test_epsilon
with mp.Pool(cores) as pool:
for _, epsilon in tqdm(enumerate(test_epsilon), total=len(test_epsilon), unit='test', desc='Detection',
disable=quiet):
d1, d2, kwargs, event = select_event(algorithm, input_list, epsilon, event_iterations, quiet=quiet,
process_pool=pool)
p = hypothesis_test(algorithm, d1, d2, kwargs, event, epsilon, detect_iterations, report_p2=False,
process_pool=pool)
result.append((epsilon, float(p), d1.tolist(),
d2.tolist(), kwargs, event))
if not quiet:
tqdm.write(
f'Epsilon: {epsilon} | p-value: {p:5.3f} | Event: {event}')
logger.debug(f'D1: {d1} | D2: {d2} | kwargs: {kwargs}')
return result
def main(self, algo, param, epsilon):
# list of tasks to test, each tuple contains (function, extra_args, sensitivity)
tasks = [
(generic_method, {'algorithm': algo,
'param_for_algorithm': param}, ALL_DIFFER)
]
# claimed privacy level to check
claimed_privacy = epsilon
for i, (algorithm, kwargs, sensitivity) in enumerate(tasks):
start_time = time.time()
results = {}
flag_file = time.ctime().replace(' ', '_')
for privacy_budget in claimed_privacy:
# # privacy levels to test, here we test the claimed privacy plus .01 above and below
test_privacy = (privacy_budget - .09,
privacy_budget, privacy_budget + .09)
# set the second argument of the function (assumed to be `epsilon`) to the claimed privacy level
kwargs[algorithm.__code__.co_varnames[1]] = privacy_budget
results[privacy_budget] = self.detect_counterexample(
algorithm, test_privacy, kwargs, sensitivity=sensitivity)
# dump the results to file
json_file = Path.cwd() / f'{algorithm.__name__}_{flag_file}.json'
with json_file.open('w') as f:
json.dump(encode(results, unpicklable=False), f)
# plot and save to file
plot_file = Path.cwd() / f'{algorithm.__name__}_{flag_file}.pdf'
self.plot_result(results, r'Test $\epsilon$', 'P Value',
algorithm.__name__.replace('_', ' ').title(), plot_file)
total_time, total_detections = time.time() - start_time, len(claimed_privacy) * \
len(test_privacy)
logger.info(f'[{i + 1} / {len(tasks)}]: {algorithm.__name__} | Time elapsed: {total_time:5.3f}s | '
f'Average time per detection: {total_time / total_detections:5.3f}s')