In [22]:
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
%matplotlib

Using matplotlib backend: MacOSX


In [2]:
def load_clients(filename):
    """Load a pickled collection of client items and rank them by frequency.

    Prints the total number of items, the number of distinct items, and the
    variance of the per-item counts divided by the number of distinct items.

    Args:
        filename: Path to a pickle file holding a sized iterable of hashable
            items (one entry per client).

    Returns:
        A tuple ``(x, y)`` of NumPy arrays: ``x`` holds the distinct items
        ordered by descending count (ties keep first-seen order) and ``y``
        holds the matching integer counts.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(filename, 'rb') as fp:
        clients = pickle.load(fp)
    print("client size: ", len(clients))

    # most_common() sorts by descending count and is stable, so items with
    # equal counts stay in the order they were first seen.
    ranked = Counter(clients).most_common()
    x = np.array([item for item, _ in ranked])
    # Explicit dtype keeps the counts integral even when there are no items.
    y = np.array([count for _, count in ranked], dtype=int)
    print("distinct clients: ", len(x))

    # The variance of an empty array is undefined; report nan directly instead
    # of letting NumPy emit RuntimeWarnings.
    var_per_client = np.var(y) / len(y) if len(y) else float("nan")
    print("var/client: ", var_per_client)
    return x, y

In [3]:
def savefig_word_distribution(x, y, topk=50):
    """Plot the counts of the first ``topk`` words and save the figure as PNG.

    The figure is written to ``./clients_<total>.png`` in the current working
    directory, where ``<total>`` is the sum of all values in ``y`` (not just
    the plotted ones).

    Args:
        x: Sequence of words, expected to be ordered by descending count.
        y: Sequence of counts aligned with ``x``.
        topk: Number of leading entries of ``x``/``y`` to plot.
    """
    x_top = x[:topk]
    y_top = y[:topk]
    total_clients = np.sum(y)

    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*";
    # use the old name where it still exists and the new one otherwise.
    style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
    with plt.style.context(style):
        # Always draw on a fresh figure: with a persistent (GUI) backend the
        # implicit "current figure" survives between calls, so a second call
        # would otherwise overlay its curve on the previous one.
        fig, ax = plt.subplots()
        ax.plot(x_top, y_top, '^-')
        ax.tick_params(axis="x", labelrotation=90)
        ax.set_xlabel("words")
        ax.set_ylabel("word_counts")
        ax.set_title("Words Count Distribution")
        fig.savefig(f"./clients_{total_clients}.png", format="png")


In [4]:
# Rank the entries of ./triehh_clients.txt by frequency and save the plot of
# the 50 most frequent ones. `x` and `y` are notebook-level names that later
# cells rebind.
x, y = load_clients("./triehh_clients.txt")
savefig_word_distribution(x, y, topk=50)



client size:  99411
distinct clients:  15742
var/client:  0.13729282674948987


In [5]:
# Same ranking and top-50 plot for ./triehh_clients_remove_top5_90740.txt
# (presumably the dataset with its five most frequent words removed, going by
# the file name). This rebinds the notebook-level `x` and `y`.
x, y = load_clients("./triehh_clients_remove_top5_90740.txt")
savefig_word_distribution(x, y, topk=50)

client size:  90740
distinct clients:  15737
var/client:  0.06305916690756608


In [None]:
# First five entries of whatever `x` currently holds.
# NOTE(review): `x` is rebound by several cells, so the result depends on which
# cell ran last; presumably this should follow the load of ./triehh_clients.txt
# so that it captures the five most frequent words — TODO confirm.
removed_words = x[:5]


## VVR for TrieHH

In [21]:
# VVR

# (client size n, VRR value) pairs, listed in plotting order.
points = [
    ("2716", 5.916667),
    ("4642", 10.083333),
    ("9004", 19.666667),
    ("90740", 198.333333),
    ("99411", 217.333333),
]
x = [size for size, _ in points]
y = [value for _, value in points]

# Sizes are strings, so they are drawn as evenly spaced categories.
fig, ax = plt.subplots()
ax.plot(x, y, '^')
ax.tick_params(axis="x", labelrotation=10)
ax.set_xlabel("client_size (n)")
ax.set_ylabel("VRR")
ax.set_title(r"VRR ($\varepsilon=12, \delta=1/n^2$)")
fig.savefig("./VVR.png")

In [30]:
import os


# Pieces that identify the pickled result file to inspect. `filename` is used
# as the directory part of the path.
filename = "../results/connectionloss_2716"
alg = "triehh"
closs = 0.0
score = "F1"

# Result files are named <alg>_cls<closs with one decimal>_<score>.
score_path = os.path.join(filename, f'{alg}_cls{closs:.1f}_{score}')
with open(score_path, 'rb') as f:
    scores = pickle.load(f)
print(scores)

[[0.2, 0.8999999999999999, 1.5999999999999999, 2.3, 3.0, 3.7, 4.4, 5.1000000000000005, 5.800000000000001, 6.500000000000001, 7.200000000000001, 7.900000000000001, 8.600000000000001, 9.3, 10.0, 10.7, 11.399999999999999], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]


## Generate Zipf Distribution dataset

In [None]:
import pandas as pd

# Draw Zipf-distributed ranks (1-based): a sample equal to r selects the word
# of frequency rank r.
# NOTE(review): no random seed is set, so each run produces a different dataset.
# NOTE(review): 2000 samples are drawn here, but the output file name below
# says 20000 — confirm which one is intended.
a = np.random.zipf(1.5, 2000)

# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open("./triehh_clients_remove_top5_90740.txt", 'rb') as f:
    data = pickle.load(f)

# Distinct words ordered from most to least frequent; value_counts() already
# sorts in descending order, so no extra sort is needed.
value_counts = pd.Series(data).value_counts()
n_words = len(value_counts)

# k[r] is the number of samples equal to r + 1, i.e. how many times the word of
# rank r appears in the new dataset. Samples larger than n_words have no word
# to map to and are dropped before counting, which also keeps a rare huge
# sample from making bincount allocate an enormous array. `minlength` makes
# sure there is one entry per word even when the largest sample is small.
k = np.bincount(a[a <= n_words], minlength=n_words + 1)[1:]

clients = []
for item, count in zip(value_counts.index, k):
    clients += [item] * count

np.random.shuffle(clients)

with open("./zipf_remove_top5_20000.txt", 'wb') as f:
    pickle.dump(clients, f)
