## ILAS - Data Mining (summer 2019) - Assignment 4
#### by Andreas Hene, Niklas Mertens, Richard Palme

In [2]:
import time
import math

import sympy
import pandas
import numpy as np
import matplotlib.pyplot as plt

First we write a function `get_repr` that returns the representative of an element `val` of $[n]$, i.e. of the dyadic interval of level `lvl` that contains `val`. The representative is always chosen to be the smallest element of the dyadic interval.
$$ \text{val} = j \cdot 2^{h-\text{lvl}} + \text{rest} $$
The representative can be computed by
$$ j \cdot 2^{h-\text{lvl}} + 1 $$
The formula won't work in the case `h == lvl`. But in this case we can just take `val` as its own representative and not call `get_repr` at all.

In [4]:
def get_repr(val, lvl, h):
    """Return the representative of the level-`lvl` dyadic interval containing `val`.

    The universe is 1-indexed: on level `lvl` it is split into consecutive
    intervals of ``2**(h - lvl)`` elements, the first one starting at 1. The
    representative of an interval is its smallest element.

    Parameters
    ----------
    val : int
        Element of the universe (``val >= 1``).
    lvl : int
        Level of the dyadic interval, ``0 <= lvl <= h``.
    h : int
        Height of the dyadic hierarchy (level ``h`` holds single elements).

    Returns
    -------
    int
        Smallest element of the interval; for ``lvl == h`` this is `val` itself.
    """
    block = 2**(h - lvl)
    # Shift to 0-based before dividing: otherwise the last element of every
    # interval (a multiple of `block`) is assigned to the following interval.
    # int() plus floor division keeps the arithmetic exact and avoids
    # fixed-width NumPy scalars overflowing against a large `block`.
    j = (int(val) - 1) // block
    return j * block + 1

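Next we write a function `get_children` that computes the children of the dyadic interval with representative `u` on level `lvl`. The children are again given by their representatives. Writing $u = j \cdot 2^{h-\text{lvl}} + 1$, the children are `u` itself and
$$ u + 2^{h-(\text{lvl}+1)} = (2j+1) \cdot 2^{h-(\text{lvl} + 1)} + 1 $$
Since the left child is `u` itself, the function only returns the representative of the right child.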

In [1]:
def get_children(u, lvl, h):
    """Return the representative of the right child of a dyadic interval.

    The interval with representative `u` on level `lvl` covers
    ``2**(h - lvl)`` consecutive elements starting at `u`. Its two children
    on level ``lvl + 1`` each cover half of that range: the left child
    starts at `u` itself, the right child half an interval further.

    Parameters
    ----------
    u : int
        Representative (smallest element) of an interval on level `lvl`.
    lvl : int
        Level of the interval, ``0 <= lvl < h``.
    h : int
        Height of the dyadic hierarchy.

    Returns
    -------
    int
        Representative of the right child; the left child is `u`.

    Raises
    ------
    ValueError
        If ``lvl >= h``: intervals on the last level have no children.
    """
    if lvl >= h:
        raise ValueError("intervals on level h have no children")
    # The right half starts exactly half an interval length after `u`.
    return u + 2**(h - (lvl + 1))

In [14]:
def base_p(u, p, size):
    """Return the base-`p` digits of `u`, least significant digit first.

    The digits are written into an int16 array of length `size`; positions
    beyond the last digit stay zero. If `u` needs more than `size` digits,
    the write past the end of the array raises IndexError.
    """
    digits = np.zeros(size, dtype=np.int16)
    position = 0
    remaining = u
    # Peel off one digit per iteration until nothing is left.
    while remaining != 0:
        remaining, digit = np.divmod(remaining, p)
        digits[position] = digit
        position += 1
    return digits

def hashfunc(u, a, b, p, k):
    """Hash `u` into the range ``[0, p)``.

    `u` is expanded into `k` base-`p` digits, the digits are combined with
    the coefficient vector `a` by a dot product, the offset `b` is added and
    the result is reduced modulo `p`.
    """
    digits = base_p(u, p, k)
    combined = np.dot(a, digits) + b
    return combined % p


In [3]:
def count_min_sketch(filepath, eps, delta, n, t=None):
    """Find the heavy hitters of a stream with a hierarchy of Count-Min sketches.

    One Count-Min sketch is kept for every dyadic level ``0..h`` with
    ``h = ceil(log2(n))``. Each stream element increments, on every level,
    the counters of the dyadic interval that contains it (intervals are
    identified by their smallest element). Afterwards a breadth-first search
    descends from the root interval and only follows intervals whose
    estimated count reaches the threshold `t`.

    Parameters
    ----------
    filepath : str
        Text file with three header lines followed by the whitespace
        separated stream elements.
    eps : float
        Accuracy parameter; the sketch width is derived from
        ``w = ceil(2 / eps)``.
    delta : float
        Failure-probability parameter; ``d = ceil(log2(1 / delta))`` hash
        functions are used per level (at least one).
    n : int
        Size of the universe. Stream elements are assumed to lie in
        ``1..n`` -- TODO confirm against the data files.
    t : int, optional
        Threshold an estimated count has to reach. If omitted it is read
        from the third header line of `filepath`.

    Returns
    -------
    list of int
        Elements of ``1..n`` whose estimated count is at least `t`.
    """
    if t is None:
        # The three skipped header lines hold n, m and t in this order.
        with open(filepath) as f:
            f.readline()
            f.readline()
            t = int(f.readline())

    # At least one hash function, otherwise no estimate can be formed.
    d = max(1, math.ceil(math.log(1.0 / delta, 2)))
    w = math.ceil(2.0 / eps)

    # find a prime p >= w + 1:
    p = w + 1
    while not sympy.isprime(p):
        p += 1

    # Exact ceil(log2(n)) without floating point rounding.
    h = (n - 1).bit_length()

    # Number of base-p digits needed for every value up to 2**h, which
    # bounds both the stream elements and all representatives.
    k = 1
    while p**k <= 2**h:
        k += 1

    # One (a, b) pair per hash function. Keeping the parameters as data
    # avoids lambdas that would all share the last loop values.
    hash_params = [
        (np.random.randint(p, size=k, dtype=np.int64), int(np.random.randint(p)))
        for _ in range(d)
    ]

    def representative(val, lvl):
        # Smallest element of the level-lvl dyadic interval containing val
        # (1-indexed universe); on level h this is val itself.
        block = 2**(h - lvl)
        return (val - 1) // block * block + 1

    def estimate(lvl, u):
        # Count-Min estimate: the minimum over all rows of the sketch.
        return min(
            C[lvl, i, hashfunc(u, a, b, p, k)]
            for i, (a, b) in enumerate(hash_params)
        )

    # hashfunc maps into [0, p), so every table is p wide (p >= w + 1 can
    # only lower the collision probability). int64 counters do not overflow
    # on long streams the way int16 would.
    C = np.zeros((h + 1, d, p), dtype=np.int64)

    # Read the stream in chunks of 5*10**5 int64 values (about 4 MB each)
    # so the whole file never has to be held in memory at once.
    chunks = pandas.read_csv(
        filepath,
        header=None,
        skiprows=3,
        dtype=np.int64,
        sep=r"\s+",
        chunksize=5*10**5
    )

    # process the stream. Each chunk is a pandas DataFrame; tolist() yields
    # plain Python ints.
    for chunk in chunks:
        for x in chunk.to_numpy().ravel().tolist():
            for lvl in range(h + 1):
                u = representative(x, lvl)
                for i, (a, b) in enumerate(hash_params):
                    C[lvl, i, hashfunc(u, a, b, p, k)] += 1

    # now we do BFS. the values in explore_current are
    # the representatives of the dyadic intervals that
    # currently get explored on this level.
    explore_current = [1]
    for lvl in range(h):
        half = 2**(h - lvl - 1)
        explore_next = []
        for u in explore_current:
            if estimate(lvl, u) >= t:
                # Left child shares the representative, the right child
                # starts half an interval later. Intervals lying entirely
                # beyond n cannot contain stream elements.
                explore_next.append(u)
                if u + half <= n:
                    explore_next.append(u + half)
        explore_current = explore_next

    # The candidates are now single elements; keep those that pass the
    # threshold on the last level as well.
    return [u for u in explore_current if estimate(h, u) >= t]

In [None]:
# Input file: three header lines, then the stream.
# Alternative data set: 'data/largest_40k.txt'
filepath = 'data/easy.txt'

# The header consists of three integers, in this order: n, m, t.
with open(filepath) as header:
    n, m, t = (int(header.readline()) for _ in range(3))

# Wall-clock timestamps in milliseconds.
# NOTE(review): nothing runs between the two timestamps in this cell, so
# the elapsed time only measures the timer overhead.
start_ms = 1000 * time.time()

end_ms = 1000 * time.time()
elapsed_time_ms = int(round(end_ms - start_ms))