# In [5]:
class HyperLogLog:
    """Approximate distinct-value counter in the style of HyperLogLog.

    The sketch keeps ``m = 2**b`` registers. Every call to ``update`` picks
    one register and stores the largest rank seen there, where the rank of a
    hash is the 1-based position of its first non-zero entry after the first
    ``b`` entries. ``estimate_DV`` turns the registers into a cardinality
    estimate (harmonic mean plus small- and large-range corrections).

    NOTE(review): both schedulers choose the register without looking at the
    hash, so feeding the same hash twice can touch two different registers.
    Classic HyperLogLog derives the register from the first ``b`` hash bits;
    confirm the scheduler-based selection is intended.
    """

    # Size of the hash space assumed by the large-range correction.
    _HASH_SPACE = 2.0 ** 32

    def __init__(self, b, scheduler_type="round_robin"):
        """
        b: number of leading hash entries skipped when ranking a hash; the
           sketch uses 2**b registers. Intended range is [4, 16].
        scheduler_type: "random" selects a register uniformly at random;
           any other value selects round-robin.
        """
        self.b = b
        m, alpha = self._config(b)
        self.alpha = alpha
        self.m = m
        self._registers = [0] * m
        # Number of update() calls so far; drives the round-robin scheduler.
        self.count = 0
        self._get_register_index = self._config_get_register_index(scheduler_type)

    def _config(self, b):
        """Return ``(m, alpha)``: the register count 2**b and its constant."""
        m = 2 ** b
        if m == 16:
            alpha = 0.673
        elif m == 32:
            alpha = 0.697
        elif m == 64:
            alpha = 0.709
        else:
            alpha = 0.7213 / (1 + 1.079 / m)

        return (m, alpha)

    def round_robin_scheduler(self, h=None):
        """Return ``count % m``; ``h`` is accepted but ignored."""
        return self.count % self.m

    def random_scheduler(self, h=None):
        """Return a uniformly random index in [0, m); ``h`` is ignored."""
        import random

        return random.randrange(self.m)

    def _config_get_register_index(self, scheduler_type="round_robin"):
        """Return the bound scheduler method matching ``scheduler_type``."""
        if scheduler_type == "random":
            return self.random_scheduler
        return self.round_robin_scheduler

    def _run_of_zeros(self, h):
        """Return the rank of ``h``: leading zeros after entry ``b``, plus one.

        h: sliceable sequence of numeric bits (entries are compared to the
           integer 0, so a str of "0"/"1" characters has no zero entries).

        An all-zero (or empty) tail yields ``len(h[b:]) + 1``.
        """
        rank = 1
        for bit in h[self.b:]:
            if bit != 0:
                break
            rank += 1
        return rank

    def _count_of_zero_registers(self):
        """Return how many registers still hold their initial value 0."""
        return sum(1 for r in self._registers if r == 0)

    def update(self, h):
        """
        h: binary-hashed data (sliceable sequence of numeric bits)

        Raises the chosen register to the rank of ``h`` if that is larger,
        then advances ``count``.
        """
        register_index = self._get_register_index(h)
        rank = self._run_of_zeros(h)
        if rank > self._registers[register_index]:
            self._registers[register_index] = rank
        # Advance after choosing the register so the first item lands in 0.
        self.count += 1

    def estimate_DV(self):
        """Return the estimated number of distinct values as a float.

        Exactly one of three regimes applies to the raw estimate
        ``alpha * m**2 / sum(2**-r)``:
          * below 2.5 * m: if any register is still 0, use the
            balls-and-bins estimate ``m * log(m / V)`` (V = empty
            registers); otherwise keep the raw estimate;
          * up to 2**32 / 30: the raw estimate is returned unchanged;
          * above that: ``-2**32 * log(1 - raw / 2**32)``.

        Raises ValueError (from math.log) if the raw estimate is >= 2**32.
        """
        import math

        # 2.0 ** -r, not 2 ^ -r: in Python ``^`` is bitwise XOR.
        harm_mean = self.m ** 2 / sum(2.0 ** -r for r in self._registers)
        DV_est = self.alpha * harm_mean

        if DV_est < 2.5 * self.m:  # small range correction
            V = self._count_of_zero_registers()
            if V == 0:  # no empty registers: keep the raw estimate
                return DV_est
            return self.m * math.log(float(self.m) / V)  # balls and bins

        if DV_est <= self._HASH_SPACE / 30.0:  # intermediate range
            return DV_est

        # large range correction
        return -self._HASH_SPACE * math.log(1.0 - DV_est / self._HASH_SPACE)

    
# b = 4 gives 2**4 = 16 registers, for which _config selects alpha = 0.673.
hll = HyperLogLog(b=4)
hll.alpha  # bare expression: shown as the cell result in a notebook

# Out: 0.673