In [None]:
from functools import cache
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import lognorm

In [2]:
from full_fm import FullFm

In [3]:
from typing import Callable

In [4]:
X = lognorm.rvs(5, size=int(1e6)).astype(int)

In [5]:
def seeded_hash(hash_seed: int) -> Callable[[int], float]:
    @cache
    def cool_hash(value: int) -> float:
        return np.random.default_rng(seed=value + hash_seed*31).uniform()
    return cool_hash

In [6]:
np.unique(X).shape

(37368,)

In [7]:
def run_experiment(values: np.ndarray, base_seed: int, num_hashes: int, num_betas: int) -> float:
    generator = np.random.default_rng(seed=base_seed)
    hash_funcs = [seeded_hash(generator.integers(1, 1000)) for _ in range(num_hashes)]
    ams = FullFm(hash_funcs, len(hash_funcs) // num_betas)
    return ams.update(values).estimate()
    

In [8]:
def run_experiments(values: np.ndarray, base_seeds: list[int], num_hashes: int, num_betas: int) -> list[float]:
    return [run_experiment(values, s, num_hashes, num_betas) for s in base_seeds]

In [9]:
results = run_experiments(X, np.arange(1, 26), 1, 1)

In [10]:
results

[12353.000828790382,
 41236.78282590656,
 12362.758069872927,
 19984.78935506544,
 19984.78935506544,
 31107.917983110747,
 38875.01039370618,
 4382038.053111839,
 26200.871695462032,
 20788.72444279912,
 201309.8023417926,
 11657.50739132869,
 4382038.053111839,
 201309.8023417926,
 41236.78282590656,
 863197.482082425,
 41138.24345705607,
 135757.79535488706,
 41236.78282590656,
 12362.758069872927,
 201309.8023417926,
 80108.46486556508,
 32151.98533865455,
 172792.19284583934,
 172792.19284583934]

In [24]:
np.var(results) / len(X)

1374206.5230752686

In [26]:
np.mean(results)

447813.29384408466