# test_sparse.py
#get_ipython().run_line_magic('matplotlib', 'inline')
#import Ipynb_importer
#import Classic
import RR_RAPPOR
import Subsetselection
import k2k_hadamard
import hadamard_quantize
import timeit
import scipy.io as io
import datetime
import numpy as np
import random
import math
import matplotlib.pyplot as plt
from functions import *
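# RR_RAPPOR, Subsetselection, k2k_hadamard, hadamard_quantize and functions are
# local modules from this repository, not PyPI packages.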
def test(n, k, eps, rep, point_num, sparsity_sz, init, dist, encode_acc = 1, encode_mode = 0):
    """Fix the alphabet size and privacy level; measure the estimation error
    as the sparsity of the underlying distribution grows.

    Args:
        n: sample size
        k: alphabet size
        eps: privacy level
        rep: number of repetitions used to compute each point
        point_num: number of points to compute
        sparsity_sz: distance between two consecutive sparsity levels
        init: initial sparsity = init * sparsity_sz
        dist: underlying data distribution; choose from 'Uniform', 'Two_steps',
            'Zipf', 'Dirichlet', 'Geometric'
        encode_acc: controls whether to use fast encoding for Hadamard response.
            Recommended and default: 1 (use fast encoding) when k < 10000;
            if memory use is high, disable this.
        encode_mode: controls the encoding method for RAPPOR and subset selection.
            0 for standard, which is fast but memory intensive;
            1 for light, which is relatively slow but not memory intensive;
            2 for compress, which compresses the output of RAPPOR and subset
            selection into the locations of ones.
            Recommended and default: 0 when k <= 5000 and n <= 1000000;
            if memory use is high, use the light mode.
            You can also create other modes by modifying the code.
    """
print('Alphabet size:', k)
print('Privacy level:', eps)
    indices = [(init - 1 + i) * sparsity_sz for i in range(1, point_num + 1)]  # all the sparsity levels
subset = Subsetselection.Subsetselection(k,eps) #class for subset selection algorithm
rappor = RR_RAPPOR.RAPPOR(k,eps) #class for RAPPOR
rr = RR_RAPPOR.Randomized_Response(k, eps) #class for Randomized Response
if encode_acc == 1:
hr = k2k_hadamard.Hadamard_Rand_general_original(k,eps,1) #initialize hadamard response
else:
hr = k2k_hadamard.Hadamard_Rand_general_original(k,eps,0) #initialize hadamard response
hq_1 = hadamard_quantize.Hadamard_Quantize(k, eps, comm = 1, proj = False)
hq_2 = hadamard_quantize.Hadamard_Quantize(k, eps, comm = 2, proj = False)
hq_3 = hadamard_quantize.Hadamard_Quantize(k, eps, comm = 3, proj = False)
hq = hadamard_quantize.Hadamard_Quantize(k, eps, comm = 100, proj = False)
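    # Note on the comm parameter: judging from the labels used for the saved
    # results below, comm appears to set the per-user communication budget in
    # bits for Hadamard quantization, with comm = 100 serving as an effectively
    # unconstrained budget. This reading is an assumption about the
    # hadamard_quantize module, not confirmed by its source.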
prob1 = generate_uniform_distribution(k)
#prob1 = []
prob2 = generate_two_steps_distribution(k)
prob3 = generate_Zipf_distribution(k,1.0)
prob4 = generate_Dirichlet_distribution(k,1.0)
prob5 = generate_geometric_distribution(k,0.8)
prob_list = {
'Uniform' : prob1,
'Two_steps' : prob2,
'Zipf' : prob3,
        'Dirichlet' : prob4,
'Geometric' : prob5,
}
    # underlying distribution (note: prob is overwritten inside the loop below
    # by an s-sparse distribution, so this choice is only a placeholder here)
    prob = prob_list[dist]
# to store l1 errors for each method
l1_1 = [0]*point_num
l1_2 = [0]*point_num
l1_3 = [0]*point_num
l1_4 = [0]*point_num
l1_5 = [0]*point_num
l1_6 = [0]*point_num
l1_7 = [0]*point_num
l1_8 = [0]*point_num
# to store l2 errors for each method
l2_1 = [0]*point_num
l2_2 = [0]*point_num
l2_3 = [0]*point_num
l2_4 = [0]*point_num
l2_5 = [0]*point_num
l2_6 = [0]*point_num
l2_7 = [0]*point_num
l2_8 = [0]*point_num
    # to store decoding time for each method
t1_1 = [0]*point_num
t1_2 = [0]*point_num
t1_3 = [0]*point_num
t1_4 = [0]*point_num
t1_5 = [0]*point_num
t1_6 = [0]*point_num
t1_7 = [0]*point_num
t1_8 = [0]*point_num
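    # Method index convention (matches the saved keys and plot labels below):
    # 1 = subset selection, 2 = randomized response, 3 = RAPPOR,
    # 4 = Hadamard response, 5/6/7 = 1-/2-/3-bit Hadamard quantization,
    # 8 = Hadamard quantization with a large communication budget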
for r in range(init, point_num + init):
print('Iteration:', r)
s = r*sparsity_sz
count1 = 0
count2 = 0
count3 = 0
count4 = 0
count5 = 0
count6 = 0
count7 = 0
count8 = 0
count2_1 = 0
count2_2 = 0
count2_3 = 0
count2_4 = 0
count2_5 = 0
count2_6 = 0
count2_7 = 0
count2_8 = 0
t1 = 0
t2 = 0
t3 = 0
t4 = 0
t5 = 0
t6 = 0
t7 = 0
t8 = 0
for t in range(0,rep):
#print(t)
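            # construct an s-sparse distribution: s symbols carry mass 1/s each,
            # placed at uniformly random positions in the alphabet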
elements = range(0,k)
prob = [1/s]*int(s)+[0]*(k-int(s))
prob = np.random.permutation(prob)
in_list = np.random.choice(elements, n, p=prob) #input symbols
'''
#subset selection
if encode_mode == 0: # standard mode
outp_1 = subset.encode_string_fast(in_list)
start_time = timeit.default_timer()
prob_est_1 = subset.decode_string(outp_1,n, normalization = 1) # estimate the original underlying distribution
t1 = t1 + timeit.default_timer() - start_time
if encode_mode == 1: # light mode
counts,time = subset.encode_string_light(in_list) #subset selection
start_time = timeit.default_timer()
prob_est_1 = subset.decode_counts(counts, n, normalization = 1) # estimate the original underlying distribution
t1 = t1 + time + timeit.default_timer() - start_time
if encode_mode == 2: # compress mode
out_list = subset.encode_string_compress(in_list) #subset selection
start_time = timeit.default_timer()
counts, temp = np.histogram(out_list,range(k+1))
prob_est_1 = subset.decode_counts(counts, n, normalization = 1) # estimate the original underlying distribution
t1 = t1 + timeit.default_timer() - start_time
count1 = count1 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_1)], ord=1)
count2_1 = count2_1 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_1)], ord=2)**2
'''
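            # Each mechanism below follows the same pattern: encode the inputs,
            # time only the decoding step, then accumulate the l1 error and the
            # squared l2 error of the estimated distribution.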
            #k-RR
sample = rr.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_2 = rr.decode_string(sample, normalization = 1) # estimate the original underlying distribution
t2 = t2 + timeit.default_timer() - start_time
count2 = count2 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_2)], ord=1)
count2_2 = count2_2 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_2)], ord=2)**2
'''
#k-RAPPOR
if encode_mode == 0:
sample = rappor.encode_string(in_list)
start_time = timeit.default_timer()
outp_3 = np.sum(sample, axis=0)
prob_est_3 = rappor.decode_counts(outp_3,n) # estimate the original underlying distribution
t3 = t3 + timeit.default_timer() - start_time
if encode_mode == 1:
counts, time = rappor.encode_string_light(in_list)
start_time = timeit.default_timer()
prob_est_3 = rappor.decode_counts(counts,n) # estimate the original underlying distribution
t3 = t3 + time + timeit.default_timer() - start_time
if encode_mode == 2:
out_list = rappor.encode_string_compress(in_list)
start_time = timeit.default_timer()
counts,temp = np.histogram(out_list,range(k+1))
prob_est_3 = rappor.decode_counts(counts,n) # estimate the original underlying distribution
t3 = t3 + timeit.default_timer() - start_time
count3 = count3 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_3)], ord=1)
count2_3 = count2_3 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_3)], ord=2)**2
'''
#k-HR
outp_4 = hr.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_4 = hr.decode_string(outp_4, normalization = 1) # estimate the original underlying distribution
t4 = t4 + timeit.default_timer() - start_time
count4 = count4 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_4)], ord=1)
count2_4 = count2_4 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_4)], ord=2)**2
#HQ_1
outp_5 = hq_1.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_5 = hq_1.decode_string_fast(outp_5, normalization = 1) # estimate the original underlying distribution
t5 = t5 + timeit.default_timer() - start_time
count5 = count5 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_5)], ord=1)
count2_5 = count2_5 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_5)], ord=2)**2
'''
#HQ_2
outp_6 = hq_2.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_6 = hq_2.decode_string(outp_6) # estimate the original underlying distribution
t6 = t6 + timeit.default_timer() - start_time
count6 = count6 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_6)], ord=1)
count2_6 = count2_6 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_6)], ord=2)**2
#HQ_3
outp_7 = hq_3.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_7 = hq_3.decode_string(outp_7) # estimate the original underlying distribution
t7 = t7 + timeit.default_timer() - start_time
count7 = count7 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_7)], ord=1)
count2_7 = count2_7 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_7)], ord=2)**2
'''
#HQ
outp_8 = hq.encode_string(in_list)
start_time = timeit.default_timer()
prob_est_8 = hq.decode_string_fast(outp_8, normalization = 1) # estimate the original underlying distribution
t8 = t8 + timeit.default_timer() - start_time
count8 = count8 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_8)], ord=1)
count2_8 = count2_8 + np.linalg.norm([a_i - b_i for a_i, b_i in zip(prob, prob_est_8)], ord=2)**2
        # average the accumulated errors and times over the repetitions;
        # r - init is the 0-based index of the current point (using r - 1
        # would go out of range whenever init != 1)
        l1_1[r-init] = count1/float(rep)
        l1_2[r-init] = count2/float(rep)
        l1_3[r-init] = count3/float(rep)
        l1_4[r-init] = count4/float(rep)
        l1_5[r-init] = count5/float(rep)
        l1_6[r-init] = count6/float(rep)
        l1_7[r-init] = count7/float(rep)
        l1_8[r-init] = count8/float(rep)
        l2_1[r-init] = count2_1/float(rep)
        l2_2[r-init] = count2_2/float(rep)
        l2_3[r-init] = count2_3/float(rep)
        l2_4[r-init] = count2_4/float(rep)
        l2_5[r-init] = count2_5/float(rep)
        l2_6[r-init] = count2_6/float(rep)
        l2_7[r-init] = count2_7/float(rep)
        l2_8[r-init] = count2_8/float(rep)
        t1_1[r-init] = t1/float(rep)
        t1_2[r-init] = t2/float(rep)
        t1_3[r-init] = t3/float(rep)
        t1_4[r-init] = t4/float(rep)
        t1_5[r-init] = t5/float(rep)
        t1_6[r-init] = t6/float(rep)
        t1_7[r-init] = t7/float(rep)
        t1_8[r-init] = t8/float(rep)
    '''
    plt.figure()
    plt.plot(indices, l1_1, label = 'subset')
    plt.plot(indices, l1_2, label = 'rr')
    plt.plot(indices, l1_3, label = 'rappor')
    plt.plot(indices, l1_4, label = 'hr')
    plt.plot(indices, l1_5, label = 'hq (1-bit)')
    plt.plot(indices, l1_6, label = 'hq (2-bit)')
    plt.plot(indices, l1_7, label = 'hq (3-bit)')
    plt.plot(indices, l1_8, label = 'hq')
    plt.legend()
    plt.figure()
    plt.plot(indices, l2_1, label = 'subset')
    plt.plot(indices, l2_2, label = 'rr')
    plt.plot(indices, l2_3, label = 'rappor')
    plt.plot(indices, l2_4, label = 'hr')
    plt.plot(indices, l2_5, label = 'hq (1-bit)')
    plt.plot(indices, l2_6, label = 'hq (2-bit)')
    plt.plot(indices, l2_7, label = 'hq (3-bit)')
    plt.plot(indices, l2_8, label = 'hq')
    plt.legend()
    plt.figure()
    plt.plot(indices, t1_1, label = 'subset')
    plt.plot(indices, t1_2, label = 'rr')
    plt.plot(indices, t1_3, label = 'rappor')
    plt.plot(indices, t1_4, label = 'hr')
    plt.plot(indices, t1_5, label = 'hq (1-bit)')
    plt.plot(indices, t1_6, label = 'hq (2-bit)')
    plt.plot(indices, t1_7, label = 'hq (3-bit)')
    plt.plot(indices, t1_8, label = 'hq')
    plt.legend()
    '''
    timestamp = datetime.datetime.now().strftime("%m_%d_%H_%M")
    # save all the data into a .mat file with a timestamp
    data = {
        'time' : timestamp,
'absz' : k,
#'sparsity' : s,
'privacy' : eps,
'repetition' : rep,
        'indices' : indices,  # sparsity level of each point
'subset_error': l1_1, #l1 error for each point
'rr_error': l1_2,
'rappor_error': l1_3,
'hr_error': l1_4,
'hq_1_error': l1_5,
'hq_2_error': l1_6,
'hq_3_error': l1_7,
'hq_error': l1_8,
'subset_error_l2': l2_1, #l2 error for each point
'rr_error_l2': l2_2,
'rappor_error_l2': l2_3,
'hr_error_l2': l2_4,
'hq_1_error_l2': l2_5,
'hq_2_error_l2': l2_6,
'hq_3_error_l2': l2_7,
'hq_error_l2': l2_8,
'subset_time': t1_1, #decoding time for each point
'rr_time': t1_2,
'rappor_time': t1_3,
'hr_time': t1_4,
'hq_1_time': t1_5,
'hq_2_time': t1_6,
'hq_3_time': t1_7,
'hq_time': t1_8,
'prob': prob,
'dist': dist
}
para = 'k_{}_eps_{}_'.format(k,eps)
filename = 'Data/data_sparse_' + dist + '_' + para + '.mat'
io.savemat(filename,data)
return data
# Testing script for comparison
k = 10000  # alphabet size
rep = 20  # number of repetitions for each point
points = 20  # total number of points
sparsity_sz = 100  # sparsity step between two points
init = 1  # initial sparsity multiplier
n = 500000  # sample size
for eps in [0.5, 1, 2, 5, 7]:
    dist = 'Uniform'
    print(datetime.datetime.now())
    print(k)
    # encode_acc = 0 (no fast Hadamard encoding), encode_mode = 0 (standard)
    data = test(n, k, eps, rep, points, sparsity_sz, init, dist, 0, 0)
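# A minimal sketch of reading the saved results back for plotting (hypothetical
# usage; assumes the Data/ directory and the filename pattern from test() above):
#
#   loaded = io.loadmat('Data/data_sparse_Uniform_k_10000_eps_2_.mat')
#   plt.plot(loaded['indices'][0], loaded['rr_error'][0], label='rr')
#   plt.plot(loaded['indices'][0], loaded['hr_error'][0], label='hr')
#   plt.xlabel('sparsity')
#   plt.ylabel('average l1 error')
#   plt.legend()
#   plt.show()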