## Bloom Filters

A bloom filter is a set data structure with constant $\Omicron(1)$ lookup performance that has no false negatives (always detects memberships if actually a member) but can have false positives (can detect membership but actually not a member), along with $\Omicron(1)$ insert complexity.

It specializes in having a constant memory cost, of which the size directly corrosponds with the false positive rate assuming a perfect hashing function.


In [28]:
import hashlib


class BloomFilter:
    memory: bytearray
    size: int

    def __init__(self, size: int):
        self.memory = bytearray(size)
        self.size = size

    def add(self, filter: str):
        h = hashlib.sha512(filter.encode('utf-8')).digest()

        for i in range(16):
            pos = int.from_bytes(h[32*i:32*(i+1)], 'little') % (8*self.size)
            self.memory[pos//8] |= (1 << (pos % 8))

    def __contains__(self, item: str):
        h = hashlib.sha512(item.encode('utf-8')).digest()

        for i in range(16):
            pos = int.from_bytes(h[32*i:32*(i+1)], 'little') % (8*self.size)
            if self.memory[pos//8] & (1 << (pos % 8)) == 0:
                return False

        return True



fil = BloomFilter(10)
fil.add("hi")
fil.add("hello")
fil.add("what")
fil.add("fdsa")
fil.add("ere")
fil.add("adsf")
fil.add("bfdbfd")
fil.add("uytuty")


print("o98" in fil)  # collision
print("hello" in fil)
print("hi" in fil)
print(fil.memory)
# f"{hashlib.sha512('hi'.encode('utf-8')).digest().hex()}"
# split the sha512 hash into 512/32 16 hash functions, each of 32 bits, and set those bits mod the memory size

True
True
True
bytearray(b'\x07\x00\x08E\x00\x01\x0c\x020\xc8')
