TODO:
- Rewrite overbudgeted entities code to use vectorized code (replace rows with 0s)
- Figure out how to read/write to databases effectively?
    - Have the ledger keep track of which rows it got from the DB and which rows were added locally (e.g. keep a diff object holding only the local rows, and make sure that diff can be read back efficiently)

Later:
- Give each DataSubject a unique integer ID
- Custom deltas? (please no)

In [292]:
import time

class LedgerUpdate:
    """A batch of ledger rows packaged for transfer to or from the database.

    Every array-like argument holds one entry per ledger row; the instance
    simply stores them under attributes of the same name.

    Args:
        sigmas: per-row ``sigma`` values.
        l2_norms: per-row L2 norms.
        l2_norm_bounds: per-row L2 norm bounds.
        Ls: per-row ``L`` values.
        coeffs: per-row coefficients.
        entity_ids: per-row entity ids.
        update_number: sequence number of this update.
        timestamp: creation time of the update (seconds, as produced by
            ``time.time()``). If ``None``, the current time is used.
    """

    def __init__(self, sigmas, l2_norms, l2_norm_bounds, Ls, coeffs, entity_ids, update_number, timestamp):
        self.sigmas = sigmas
        self.l2_norms = l2_norms
        self.l2_norm_bounds = l2_norm_bounds
        self.Ls = Ls
        self.coeffs = coeffs
        self.entity_ids = entity_ids
        self.update_number = update_number
        # Store the timestamp the caller supplied. Previously the argument was
        # ignored and overwritten with time.time(), so a deserialized update
        # would lose its original creation time.
        self.timestamp = time.time() if timestamp is None else timestamp


In [293]:
import numpy as np
from scipy.optimize import minimize_scalar
import time

class DataSubjectLedger:
    """Vectorized ledger of mechanisms that released information about data subjects.

    Each ledger row is stored column-wise in parallel 1-D NumPy arrays
    (``sigmas``, ``l2_norms``, ``l2_norm_bounds``, ``Ls``, ``coeffs``,
    ``entity_ids``). ``cache_constant2epsilon[i]`` caches the epsilon computed
    for the integer RDP constant ``i + 1`` at the ledger's fixed ``delta``.
    """

    def __init__(self, default_cache_size=1e3):
        """Create an empty ledger and pre-compute the epsilon cache.

        Args:
            default_cache_size: number of integer constants (1..size) whose
                epsilon is pre-computed. One scalar minimization is run per
                entry, so large values make construction slow.
        """
        self.delta = 1e-6  # WARNING: CHANGING DELTA INVALIDATES THE CACHE
        self.reset()
        self.cache_constant2epsilon = np.array([])
        self.increase_max_cache(int(default_cache_size))

        # save initial size (number of rows from DB) when deserialized
        self.known_db_size = 0
        self.update_number = 0
        self.timestamp_of_last_update = None

    def write_to_db(self):
        """Package every row added since the previous write into a LedgerUpdate.

        Increments ``update_number`` and marks all current rows as known to
        the database.

        Returns:
            LedgerUpdate holding rows ``known_db_size:`` of every column.
        """
        self.update_number += 1
        start = self.known_db_size

        result = LedgerUpdate(
            sigmas=self.sigmas[start:],
            l2_norms=self.l2_norms[start:],
            # Was self.l2_norms (copy-paste error): bounds must come from
            # their own column.
            l2_norm_bounds=self.l2_norm_bounds[start:],
            Ls=self.Ls[start:],
            coeffs=self.coeffs[start:],
            entity_ids=self.entity_ids[start:],
            update_number=self.update_number,
            timestamp=time.time(),
        )
        # Everything currently in the ledger has now been sent. The previous
        # `+= len(self.sigmas)` overshot on every write after the first, so
        # later updates silently skipped rows.
        self.known_db_size = len(self.sigmas)
        return result

    def read_from_db(self, update: "LedgerUpdate"):
        """Append the rows of ``update`` to the ledger.

        Args:
            update: object exposing the LedgerUpdate attributes.

        Raises:
            Exception: if ``update.update_number`` is not exactly
                ``self.update_number + 1``, or if the update's timestamp is
                older than the timestamp of the last update that was applied.
        """
        if update.update_number != self.update_number + 1:
            raise Exception("Cannot add update to Ledger")

        # Compare against the attribute that is actually maintained; the old
        # code guarded on timestamp_of_last_update but never assigned it, so
        # this ordering check could never trigger.
        if (self.timestamp_of_last_update is not None
                and update.timestamp < self.timestamp_of_last_update):
            raise Exception("It appears that updates were created out of order." +
                            "This is probably due to multiple python threads creating updates- which should NOT happen." +
                            "This is a very serious error- please contact OpenMined immediately." + "Thank you!")

        self.sigmas = np.concatenate([self.sigmas, update.sigmas])
        self.l2_norms = np.concatenate([self.l2_norms, update.l2_norms])
        self.l2_norm_bounds = np.concatenate([self.l2_norm_bounds, update.l2_norm_bounds])
        self.Ls = np.concatenate([self.Ls, update.Ls])
        self.coeffs = np.concatenate([self.coeffs, update.coeffs])
        self.entity_ids = np.concatenate([self.entity_ids, update.entity_ids])
        self.update_number = update.update_number
        self.timestamp_of_last_update = update.timestamp
        # Kept for backward compatibility: earlier versions stored the value
        # under this name.
        self.timestamp = update.timestamp

    def reset(self):
        """Drop every ledger row (the epsilon cache and counters are untouched)."""
        self.sigmas = np.array([])
        self.l2_norms = np.array([])
        self.l2_norm_bounds = np.array([])
        self.Ls = np.array([])
        self.coeffs = np.array([])
        self.entity_ids = np.array([])
        self.entity2budget = np.array([])

    def batch_append(self,
                     sigmas: np.ndarray,
                     l2_norms: np.ndarray,
                     l2_norm_bounds: np.ndarray,
                     Ls: np.ndarray,
                     coeffs: np.ndarray,
                     entity_ids: np.ndarray):
        """Append a batch of rows; the i-th element of each argument forms one row."""
        self.sigmas = np.concatenate([self.sigmas, sigmas])
        self.l2_norms = np.concatenate([self.l2_norms, l2_norms])
        self.l2_norm_bounds = np.concatenate([self.l2_norm_bounds, l2_norm_bounds])
        self.Ls = np.concatenate([self.Ls, Ls])
        self.coeffs = np.concatenate([self.coeffs, coeffs])
        self.entity_ids = np.concatenate([self.entity_ids, entity_ids])

    def increase_max_cache(self, new_size):
        """Grow the epsilon cache so it covers the constants ``1..new_size``.

        Does nothing when the cache already has at least ``new_size`` entries.
        """
        current_size = len(self.cache_constant2epsilon)
        new_entries = [
            self.get_optimal_alpha_for_constant(constant)[1]
            for constant in range(current_size + 1, new_size + 1)
        ]
        self.cache_constant2epsilon = np.concatenate([self.cache_constant2epsilon, np.array(new_entries)])

    def get_fake_rdp_func(self, constant):
        """Return the linear function ``alpha -> alpha * constant``."""

        def func(alpha):
            return alpha * constant

        return func

    def get_alpha_search_function(self, rdp_compose_func):
        """Build the objective that is minimized over alpha.

        Args:
            rdp_compose_func: callable mapping alpha to an RDP value.

        Returns:
            Callable ``fun(alpha)`` that is ``inf`` for ``alpha <= 1`` and
            otherwise ``max(rdp(alpha) + log((alpha - 1) / alpha)
            - (log(delta) + log(alpha)) / (alpha - 1), 0)``.
        """
        log_delta = np.log(self.delta)

        def fun(alpha):  # the input is the RDP's \alpha
            if alpha <= 1:
                return np.inf
            alpha_minus_1 = alpha - 1
            return np.maximum(rdp_compose_func(alpha) + np.log(alpha_minus_1 / alpha)
                              - (log_delta + np.log(alpha)) / alpha_minus_1, 0)

        return fun

    def get_optimal_alpha_for_constant(self, constant=3):
        """Minimize the alpha-search objective for ``alpha -> alpha * constant``.

        Returns:
            Tuple ``(alpha, epsilon)``: the minimizer and the objective value
            at the minimizer.
        """
        f = self.get_fake_rdp_func(constant)
        f2 = self.get_alpha_search_function(rdp_compose_func=f)
        # Brent's method is unconstrained: it never used `bounds`, and recent
        # SciPy versions raise ValueError when `bounds` is combined with
        # method='brent'. The alpha > 1 constraint is enforced by the
        # objective itself, which returns inf for alpha <= 1.
        results = minimize_scalar(f2, method='Brent', bracket=(1, 2))

        return results.x, results.fun

    def get_batch_rdp_constants(self, entity_ids_query, private=True):
        """Sum the per-row RDP constants of each queried entity.

        The per-row constant is ``L**2 * norm**2 / (2 * sigma**2) * coeff``.

        Args:
            entity_ids_query: non-negative integer entity ids.
            private: if True use ``l2_norms`` as ``norm``, otherwise use
                ``l2_norm_bounds``.

        Returns:
            Float array aligned with ``entity_ids_query``. Entities without
            any ledger row get 0.
        """
        query = np.asarray(entity_ids_query).astype(np.int64)

        # get indices for all ledger rows corresponding to any of the entities in entity_ids_query
        indices_batch = np.where(np.isin(self.entity_ids, query))[0]

        # use the indices to get a "batch" of the full ledger. this is the only part
        # of the ledger we care about (the entries corresponding to specific entities)
        batch_sigmas = self.sigmas.take(indices_batch)
        batch_Ls = self.Ls.take(indices_batch)
        norm_column = self.l2_norms if private else self.l2_norm_bounds
        batch_norms = norm_column.take(indices_batch)
        batch_coeffs = self.coeffs.take(indices_batch)
        batch_entity_ids = self.entity_ids.take(indices_batch).astype(np.int64)

        constant = (batch_Ls**2 * batch_norms**2 / (2 * batch_sigmas**2)) * batch_coeffs

        # minlength guarantees every queried id is a valid index into the
        # bincount result, even if it is larger than any id in the ledger
        # (previously this raised IndexError in take()).
        minlength = max(int(query.max()) + 1, 0) if query.size else 0
        return np.bincount(batch_entity_ids, weights=constant, minlength=minlength).take(query)

    def get_epsilon_spend(self, entity_ids_query):
        """Look up the cached epsilon for each queried entity.

        The RDP constant of each entity is truncated to an integer ``c`` and
        mapped to ``cache_constant2epsilon[c - 1]``; the cache is grown on
        demand.

        Constants in ``(0, 1)`` use the entry for constant 1, and entities
        whose constant is not positive report 0. Previously both cases
        produced index -1, which wrapped around to the last (largest) cache
        entry.

        Returns:
            Float array aligned with ``entity_ids_query``.
        """
        rdp_constants = self.get_batch_rdp_constants(entity_ids_query=entity_ids_query)
        rdp_constants_lookup = np.maximum(rdp_constants.astype(np.int64) - 1, 0)

        # Size the cache from the number of entries actually required; the old
        # retry used int(max_index * 1.1), which is still too small whenever
        # max_index < 10.
        required = int(rdp_constants_lookup.max()) + 1 if rdp_constants_lookup.size else 0
        if required > len(self.cache_constant2epsilon):
            self.increase_max_cache(int(required * 1.1))

        eps_spend = self.cache_constant2epsilon.take(rdp_constants_lookup)
        return np.where(rdp_constants > 0, eps_spend, 0.0)

    def get_overbudgeted_entities(self, user_budget: float):
        """Return a boolean mask, one entry per ledger row, of rows whose entity exceeds the budget.

        TODO:
        In our current implementation, user_budget is obtained by querying the Adversarial Accountant's entity2ledger with the Data Scientist's User Key.
        When we replace the entity2ledger with something else, we could perhaps directly add it into this method
        """
        # Get the privacy budget spent by the entity of every ledger row
        epsilon_spent = self.get_epsilon_spend(self.entity_ids.astype(np.int64))

        # True where the spend is strictly greater than the budget
        return epsilon_spent > user_budget

In [241]:
from syft.core.tensor.autodp.gamma_tensor import GammaTensor

In [245]:
from random import gauss

In [246]:
def publish(gamma_tensor: GammaTensor, ledger: DataSubjectLedger, sigma:float = 2, user_budget = 100, private=False):
    """Incomplete draft: zero out over-budget entries of the tensor and add Gaussian noise.

    Args:
        gamma_tensor: tensor whose ``value`` is published.
        ledger: ledger used to find the over-budget rows.
        sigma: standard deviation of the Gaussian noise.
        user_budget: epsilon budget passed to ``get_overbudgeted_entities``.
        private: currently unused.

    Returns:
        ``gamma_tensor.value`` with over-budget entries multiplied by 0, plus noise.
    """
    # entities = gamma_tensor.data_subjects.entities_indexed

    # XOR with 1 flips the boolean mask: over-budget -> 0, within budget -> 1.
    keep_mask = ledger.get_overbudgeted_entities(user_budget=user_budget) ^ 1

    # random.gauss takes (mu, sigma); the previous gauss(sigma) call raised
    # TypeError because the mean was missing.
    # NOTE(review): gauss() returns one scalar, so the same noise value is
    # added to every element -- confirm whether per-element noise is intended.
    output = gamma_tensor.value * keep_mask + gauss(0, sigma)
    return output

### Testing DataSubjectLedger

In [250]:
ledger = DataSubjectLedger()

  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom


In [310]:
ledger.reset()
n = int(1e3)

In [305]:
# Fill the ledger with n synthetic rows, one per entity id 0..n-1:
# sigma and coeff are 1, the L2 norm is 10, the L2 norm bound is 40, and
# each L is drawn from a normal distribution scaled by 5.
ledger.batch_append(sigmas=np.ones(n),
                    l2_norms=np.ones(n)*10,
                    l2_norm_bounds=np.ones(n)*40,
                    Ls=np.random.randn(n)*5,
                    coeffs=np.ones(n),
                    entity_ids=np.arange(n))

In [262]:
query = np.arange(n)

In [263]:
%%time
eps = ledger.get_epsilon_spend(entity_ids_query=query)

CPU times: user 3.42 s, sys: 0 ns, total: 3.42 s
Wall time: 3.42 s


In [308]:
eps = ledger.get_epsilon_spend(entity_ids_query=query)

In [309]:
len(eps)

1000

In [267]:
min(eps)

7.766216625311721

In [268]:
max(eps)

14814.25443357862

In [269]:
ledger.get_overbudgeted_entities(1000)

array([False,  True, False,  True,  True,  True, False, False, False,
        True,  True, False, False, False,  True,  True, False, False,
       False, False, False,  True, False, False, False,  True,  True,
       False, False, False,  True, False,  True,  True,  True,  True,
        True,  True, False, False,  True,  True,  True, False, False,
        True,  True, False,  True, False, False,  True,  True, False,
       False, False, False,  True,  True, False, False, False,  True,
       False, False,  True,  True, False, False, False,  True, False,
        True, False,  True,  True,  True,  True, False,  True, False,
       False, False,  True,  True, False, False, False, False, False,
        True, False,  True,  True, False, False, False, False,  True,
        True, False, False, False, False, False, False, False,  True,
       False,  True, False, False,  True,  True,  True,  True, False,
       False, False,  True, False, False,  True,  True, False,  True,
        True, False,

In [278]:
test_values = np.random.randn(n) * 1000

In [279]:
test_values.min()

-2851.152450896803

In [280]:
test_values.max()

3621.2325581495047

In [281]:
test_values.shape

(1000,)

In [282]:
ledger.sigmas.shape

(1000,)

In [285]:
# `^1` flips the over-budget mask (True -> 0, False -> 1), so over-budget
# entries are multiplied by 0. gauss(0, 2) is evaluated once, so the same
# scalar is added to every element: in the output below every zeroed entry
# shows the identical value.
(test_values * (ledger.get_overbudgeted_entities(user_budget=1000)^1) + gauss(0, 2))

array([ 3.34124798e+02,  4.20372456e-01,  1.22164906e+03,  4.20372456e-01,
        4.20372456e-01,  4.20372456e-01, -4.10603554e+02, -8.70251422e+02,
       -3.85631358e+02,  4.20372456e-01,  4.20372456e-01, -3.04409431e+02,
        4.40686765e+02,  1.12595898e+02,  4.20372456e-01,  4.20372456e-01,
        9.28116733e+02, -3.44710671e+02, -2.72191436e+02,  2.19666545e+02,
        8.90626317e+02,  4.20372456e-01, -8.79948909e+02,  1.26564781e+03,
       -1.16540887e+03,  4.20372456e-01,  4.20372456e-01,  5.05205241e+02,
       -2.87578357e+02, -2.31595962e+02,  4.20372456e-01,  1.32666856e+03,
        4.20372456e-01,  4.20372456e-01,  4.20372456e-01,  4.20372456e-01,
        4.20372456e-01,  4.20372456e-01, -1.02788287e+03,  6.62813594e+02,
        4.20372456e-01,  4.20372456e-01,  4.20372456e-01, -1.36458721e+03,
       -7.04913202e+02,  4.20372456e-01,  4.20372456e-01,  1.37228130e+03,
        4.20372456e-01, -6.39644693e+02, -9.14918573e+02,  4.20372456e-01,
        4.20372456e-01,  

In [290]:
np.round(test_values * (ledger.get_overbudgeted_entities(user_budget=1000)^1) + gauss(0, 2), 1)

array([ 3.3350e+02, -2.0000e-01,  1.2211e+03, -2.0000e-01, -2.0000e-01,
       -2.0000e-01, -4.1120e+02, -8.7080e+02, -3.8620e+02, -2.0000e-01,
       -2.0000e-01, -3.0500e+02,  4.4010e+02,  1.1200e+02, -2.0000e-01,
       -2.0000e-01,  9.2750e+02, -3.4530e+02, -2.7280e+02,  2.1910e+02,
        8.9010e+02, -2.0000e-01, -8.8050e+02,  1.2651e+03, -1.1660e+03,
       -2.0000e-01, -2.0000e-01,  5.0460e+02, -2.8820e+02, -2.3220e+02,
       -2.0000e-01,  1.3261e+03, -2.0000e-01, -2.0000e-01, -2.0000e-01,
       -2.0000e-01, -2.0000e-01, -2.0000e-01, -1.0285e+03,  6.6220e+02,
       -2.0000e-01, -2.0000e-01, -2.0000e-01, -1.3652e+03, -7.0550e+02,
       -2.0000e-01, -2.0000e-01,  1.3717e+03, -2.0000e-01, -6.4020e+02,
       -9.1550e+02, -2.0000e-01, -2.0000e-01,  6.3740e+02, -5.8520e+02,
       -1.1200e+02, -4.8200e+01, -2.0000e-01, -2.0000e-01, -6.0150e+02,
        3.1700e+02, -4.8800e+01, -2.0000e-01, -1.4673e+03, -5.6430e+02,
       -2.0000e-01, -2.0000e-01,  6.5400e+02, -1.1407e+03, -1.34

In [220]:
update1 = ledger.write_to_db()

In [221]:
ledger.sigmas

array([1., 1., 1., ..., 1., 1., 1.])

In [222]:
update1.sigmas

array([1., 1., 1., ..., 1., 1., 1.])

In [223]:
# With the class as defined above this raises: write_to_db() already advanced
# ledger.update_number to update1.update_number, while read_from_db() only
# accepts an update numbered ledger.update_number + 1.
ledger.read_from_db(update1)

Exception: Cannot add update to Ledger

In [82]:
len(ledger.cache_constant2epsilon)

1373

In [31]:
eps.any()

False

In [None]:
len(ledger.sigmas)/1e8

In [None]:
%%time
eps = ledger.get_epsilon_spend(entity_ids_query=query)

In [None]:
eps[0:20]