In [1]:
import numpy as np

import typing as tp

import math
import zlib
import hashlib

In [2]:
P = 0.0001

source_path = 'catboost.txt'

**1.** Make a generic solution to generate Countable Bloom Filter structure based on parameters

    a) amount of objects 
    b) amount of hash functions (thinking about dynamic hash functions generating)
    c) length of CBF array.

In [3]:
def read_file(source_path: str) -> str:
    with open(source_path, encoding="utf8") as file:
        return file.read()

a) amount of objects 

In [4]:
def unique_words(text: str) -> tp.Set[str]:
    return set(text.split())

In [5]:
unique_words = unique_words(read_file(source_path))

In [6]:
n = len(unique_words)
n

895

c) length of CBF array.

${\displaystyle m=-{\frac {n \ln p}{(\ln 2)^2} }}$

In [7]:
def optimal_CBF_len(count: int, precision: float) -> int:
    return -math.ceil((count * np.log(precision)) / (np.log(2) * np.log(2)))

In [8]:
m = optimal_CBF_len(count=n, precision=P)
m

17157

b) amount of hash functions (thinking about dynamic hash functions generating)

${\displaystyle k={\frac {m}{n} {\ln 2}}}$

In [9]:
def hash_funcs_number(
    length: int,
    count: int,
) -> int:
    return round((length / count) * math.log(2))

In [10]:
k = hash_funcs_number(length=m, count=n)
print(k)

13


**2.** To create a CBF based with precision = 0.0001 and for words for any internet post

Link to the article: https://habr.com/ru/company/yandex/blog/333522/

In [11]:
def random_salts(hashes_count: int) -> tp.List[str]:
        salts = [
            hashlib.sha224(bytes(np.random.RandomState(42).randint(
                    0, 999_999))).hexdigest() for _ in range(hashes_count)
        ]
        return salts

In [12]:
salts = random_salts(m)

In [13]:
def hash_index(obj: str, salt: str, cbf_length: int) -> int:
    return zlib.crc32(bytes(obj + salt, encoding='utf8')) % cbf_length

In [14]:
def countable_bloom_filter(objects: set, cbf_length: int, hashes_count: int):
    cbf = [0] * cbf_length
    salts = random_salts(hashes_count)
    for obj in objects:
        for i in range(hashes_count):
            index = hash_index(obj=obj, salt=salts[i], cbf_length=m)
            cbf[index] += 1
    return cbf

In [15]:
def word_prob(word: str, cbf: list, salts: list, hashes_count: int) -> float:
    minimal_val = 999999990
    for i in range(hashes_count):
        index = hash_index(obj=word, salt=salts[i], cbf_length=len(cbf))
        if cbf[index] < minimal_val:
            minimal_val = cbf[index]

    if minimal_val > 0:
        return 1 / minimal_val
    else:
        return 0

In [16]:
cbf = countable_bloom_filter(objects=unique_words,
                             cbf_length=m,
                             hashes_count=k)
print(cbf)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 0, 0, 0, 

**3.** To check for existing any 10 words in CBF

In [17]:
words = ['Яндекс', 'бустинг', 'CatBoost', 'Yandex', 'машинное', 'обучение',
         'кулинария', 'этнографический', 'оранжерея', 'трансцендентность']

In [18]:
for word in words:
    print(word_prob(word=word,
                    cbf=cbf,
                    salts=salts,
                    hashes_count=k))

0.07692307692307693
0.07692307692307693
0.07692307692307693
0.07692307692307693
0.07692307692307693
0.07692307692307693
0
0
0
0
