In [38]:
import numpy as np
from scipy.optimize import minimize_scalar

class DataSubjectLedger:
    """for a particular data subject, this is the list
    of all mechanisms releasing informationo about this
    particular subject, stored in a vectorized form"""
    
    def __init__(self, default_delta=1e-6):
        
        self.default_delta = default_delta
        self.reset()
        self.cache_constant2epsilon = list()     
        
        for i in range(100000):
            alpha, eps = self.get_optimal_alpha_for_constant(i+1)
            self.cache_constant2epsilon.append(eps)
            
        self.cache_constant2epsilon = np.array(self.cache_constant2epsilon)        
        

    def reset(self):
        self.sigmas = np.array([])
        self.l2_norms = np.array([])
        self.l2_norm_bounds = np.array([])
        self.Ls = np.array([])
        self.coeffs = np.array([])
        self.deltas = np.array([])
        self.entity_ids = np.array([])
        self.entity2budget = np.array([])
        
    def append_batch(self, 
                     sigmas: np.ndarray, 
                     l2_norms: np.ndarray, 
                     l2_norm_bounds: np.ndarray, 
                     Ls: np.ndarray, 
                     coeffs: np.ndarray, 
                     deltas: np.ndarray, 
                     entity_ids: np.ndarray):
        
        self.sigmas = np.concatenate([self.sigmas, sigmas])
        self.l2_norms = np.concatenate([self.l2_norms, l2_norms])        
        self.l2_norm_bounds = np.concatenate([self.l2_norm_bounds, l2_norm_bounds])        
        self.Ls = np.concatenate([self.Ls, Ls])        
        self.coeffs = np.concatenate([self.coeffs, coeffs])        
        self.deltas = np.concatenate([self.deltas, deltas])        
        self.entity_ids = np.concatenate([self.entity_ids, entity_ids])                
        
    def get_rdp_func(self, entity_id, private=True):
        
            constant = self.get_rdp_constant(entity_id=entity_id, private=private)
            
            def rdp_func(alpha):
                return alpha * constant

            return rdp_func  
        
    def get_fake_rdp_func(self, constant):
        
        def func(alpha):
            return alpha * constant
        
        return func

    def get_alpha_search_function(self, entity_id, func_override=None):
        
        if func_override is None:
            rdp_compose_func = self.get_rdp_func(entity_id)
        else:
            rdp_compose_func = func_override
            
        if len(self.deltas) > 0:
            delta = np.max(self.deltas)
        else:
            delta = self.default_delta
            
        log_delta = np.log(delta)
        
        def fun(alpha):  # the input is the RDP's \alpha
            
            if alpha <= 1:
                return np.inf
            else:
                alpha_minus_1 = alpha-1
                return np.maximum(rdp_compose_func(alpha) + np.log(alpha_minus_1/alpha)
                                  - (log_delta + np.log(alpha))/alpha_minus_1, 0)
        return fun    
    
    def get_optimal_alpha_for_constant(self, constant=3):
        
        f = self.get_fake_rdp_func(constant)
        f2 = self.get_alpha_search_function(entity_id=1, func_override=f)
        results = minimize_scalar(f2, method='Brent', bracket=(1,2), bounds=[1, np.inf])
        
        return results.x, results.fun
    

    def get_rdp_constant(self, entity_id, private=True):
    
        squared_Ls = self.Ls**2
        squared_sigma = self.sigmas**2
        entity_mask = self.entity_ids == entity_id
        
        if private:
            
            squared_L2_norms = self.l2_norms**2            
            private_constant = (squared_Ls * squared_L2_norms / (2 * squared_sigma)) * entity_mask
            private_constant = private_constant * self.coeffs
            private_constant = np.sum(private_constant)
            
            return private_constant
        else:
            squared_L2_norm_bounds = self.l2_norms_bound**2            
            public_constant = (squared_Ls * squared_L2_norm_bounds / (2 * squared_sigma)) * entity_mask
            public_constant = public_constant * self.coeffs
            public_constant = np.sum(private_constant)
            return public_constant  
        
    def get_batch_rdp_constants(self, entity_ids_query, private=True):
        
        # get indices for all ledger rows corresponding to any of the entities in entity_ids_query
        indices_batch = np.where(np.in1d(self.entity_ids, entity_ids_query))[0]
        
        # use the indices to get a "batch" of the full ledger. this is the only part
        # of the ledger we care about (the entries corresponding to specific entities)
        batch_sigmas = self.sigmas.take(indices_batch)
        batch_Ls = self.Ls.take(indices_batch)
        batch_l2_norms = self.l2_norms.take(indices_batch)
        batch_l2_norm_bounds = self.l2_norm_bounds.take(indices_batch)
        batch_coeffs = self.coeffs.take(indices_batch)
        batch_entity_ids = self.entity_ids.take(indices_batch).astype(np.int64)
        
        
        squared_Ls = batch_Ls**2
        squared_sigma = batch_sigmas**2
        
        if private:
            squared_L2_norms = batch_l2_norms**2
            constant = (squared_Ls * squared_L2_norms / (2 * squared_sigma)) * batch_coeffs
            constant = np.bincount(batch_entity_ids, weights=constant).take(entity_ids_query)
            return constant
        else:
            squared_L2_norm_bounds = batch_l2_norm_bounds**2
            constant = (squared_Ls * squared_L2_norm_bounds / (2 * squared_sigma)) * batch_coeffs
            constant = np.bincount(batch_entity_ids, weights=constant).take(entity_ids_query)
            return constant
        
    def get_epsilon_spend(self, entity_ids_query):
        rdp_constants = self.get_batch_rdp_constants(entity_ids_query=entity_ids_query).astype(np.int64)
        rdp_constants_lookup = rdp_constants - 1
        eps_spend = self.cache_constant2epsilon.take(rdp_constants_lookup)
        return eps_spend

In [56]:
import mkl
mkl.set_num_threads(2)

ModuleNotFoundError: No module named 'mkl'

In [39]:
ledger = DataSubjectLedger()

In [52]:
ledger.reset()
n = int(1e7)

In [53]:
ledger.append_batch(sigmas=np.ones(n),
                    l2_norms=np.ones(n)*10,
                    l2_norm_bounds=np.ones(n)*40,
                    Ls=np.random.randn(n)*5,
                    coeffs=np.ones(n),
                    deltas=np.ones(n)*1e6,
                    entity_ids=np.arange(n))

In [54]:
query = np.arange(n)

In [51]:
%%time
eps = ledger.get_epsilon_spend(entity_ids_query=query)

CPU times: user 17.6 s, sys: 8.82 s, total: 26.4 s
Wall time: 27 s


In [55]:
%%time
eps = ledger.get_epsilon_spend(entity_ids_query=query)

CPU times: user 1.58 s, sys: 418 ms, total: 2 s
Wall time: 2.03 s


In [37]:
eps[0:20]

array([3725.40022315, 2263.91209203,   30.04967125, 2034.64885325,
        159.79325645,   99.16966854,  727.20567344, 1890.65316412,
       4849.06547809, 2601.18850134, 2072.75222106, 3372.47573355,
        196.52083601,   32.22166095,  192.49873732,  134.48558098,
        850.26206058, 3582.66288229,  308.96429586,  670.33884933])

array([2869.12969754, 7827.67597659])