In [None]:
import hashlib
import numpy as np

class MinHash:
    """Estimate Jaccard similarity between collections using MinHash signatures.

    Each of the ``num_hashes`` hash functions is MD5 salted with a random
    integer seed. The signature of a collection is the list of per-seed
    minimum hash values; the share of positions where two signatures agree
    is the similarity estimate.

    Attributes:
        num_hashes: Number of salted hash functions (signature length).
        seeds: NumPy array of ``num_hashes`` integer salts.
    """

    def __init__(self, num_hashes, seed=None):
        """Create the salts for the hash family.

        Args:
            num_hashes: Signature length; must be at least 1.
            seed: Optional seed for a private ``np.random.RandomState`` so the
                salts (and therefore signatures) are reproducible. When
                ``None`` the global NumPy random state is used, exactly as
                before, so ``np.random.seed(...)`` still has an effect.

        Raises:
            ValueError: If ``num_hashes`` is smaller than 1.
        """
        if num_hashes < 1:
            # A zero-length signature would make jaccard_similarity divide by zero.
            raise ValueError(
                f"num_hashes must be a positive integer, got {num_hashes!r}"
            )
        self.num_hashes = num_hashes
        rng = np.random if seed is None else np.random.RandomState(seed)
        # Explicit 64-bit dtype: the upper bound 2**32 - 1 does not fit into a
        # 32-bit default integer (Windows with NumPy < 2) and would raise.
        self.seeds = rng.randint(0, 2**32 - 1, size=num_hashes, dtype=np.int64)

    def _hash(self, x, seed):
        """Return the MD5 digest of ``f"{seed}{x}"`` as a non-negative int.

        NOTE(review): seed and element are concatenated without a separator,
        so e.g. (seed=1, x="23") and (seed=12, x="3") hash identically. Left
        unchanged here to keep existing hash values stable.
        """
        return int(hashlib.md5(f"{seed}{x}".encode('utf8')).hexdigest(), 16)

    def compute(self, set_input):
        """Return the MinHash signature of ``set_input``.

        Args:
            set_input: Any iterable of elements; each element is hashed via
                its ``str()`` form. One-shot iterators are supported.

        Returns:
            A list of ``num_hashes`` integers, one minimum per seed.

        Raises:
            ValueError: If ``set_input`` contains no elements.
        """
        # Materialise once: the elements are scanned once per seed, which
        # would exhaust a generator after the first pass.
        elements = list(set_input)
        if not elements:
            raise ValueError(
                "cannot compute a MinHash signature for an empty input"
            )
        return [
            min(self._hash(el, seed) for el in elements)
            for seed in self.seeds
        ]

    def jaccard_similarity(self, set_a, set_b):
        """Estimate the Jaccard similarity of two collections.

        Args:
            set_a: First iterable of elements.
            set_b: Second iterable of elements.

        Returns:
            A float in ``[0.0, 1.0]``: the fraction of signature positions on
            which both inputs agree. By convention two empty inputs yield
            ``1.0`` and an empty input compared with a non-empty one yields
            ``0.0`` (their exact Jaccard similarity).
        """
        elements_a = list(set_a)
        elements_b = list(set_b)
        if not elements_a or not elements_b:
            # Signatures are undefined for empty inputs; answer directly.
            return 1.0 if not elements_a and not elements_b else 0.0

        signature_a = self.compute(elements_a)
        signature_b = self.compute(elements_b)
        matches = sum(1 for a, b in zip(signature_a, signature_b) if a == b)
        return matches / self.num_hashes


In [None]:
import matplotlib.pyplot as plt

# Number of hash functions (x-axis) and the false-positive rate plotted for each.
# NOTE(review): these rates are hard-coded literals, not measured with MinHash.
hash_counts = list(range(10, 51, 10))
false_positive_rates = [1e-4, 1e-5, 1e-6, 1e-7, 1e-8]

# Draw on the current Axes explicitly instead of through the pyplot wrappers.
ax = plt.gca()
ax.plot(hash_counts, false_positive_rates, marker='o')
ax.set_yscale('log')  # the rates span several orders of magnitude
ax.set_title('Зависимость ложноположительных срабатываний от количества хэш-функций')
ax.set_xlabel('Количество хэш-функций')
# NOTE(review): the label says "percent" but the values look like fractions — confirm units.
ax.set_ylabel('Процент ложноположительных срабатываний')
plt.show()
