## ILAS - Data Mining (summer 2019) - Assignment 4
#### by Andreas Hene, Niklas Mertens, Richard Palme

In [1]:
import time
import math

import sympy
import pandas
import numpy as np
import matplotlib.pyplot as plt

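First we write a function `get_repr` that returns the representative of an element `val` of $[n] = \{1, \dots, n\}$ within its dyadic interval on level `lvl`. On level `lvl` the dyadic intervals have length $2^{h-\text{lvl}}$, and the representative of an interval is always chosen to be its smallest element. To compute it, write `val` with $0 \le \text{rest} < 2^{h-\text{lvl}}$ as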
$$ \text{val} = j \cdot 2^{h-\text{lvl}} + \text{rest} $$
If rest >= 1, the representative can be computed by
$$ j \cdot 2^{h-\text{lvl}} + 1 $$
If rest == 0, the representative can be computed by
$$ (j-1) \cdot 2^{h - \text{lvl}} + 1 $$

In [2]:
def get_repr(val, lvl, h):
    """Return the representative of `val` on dyadic level `lvl`.

    Level `lvl` splits the universe into consecutive intervals of
    length 2**(h - lvl); the representative of an interval is its
    smallest element.

    Parameters
    ----------
    val : int
        Element whose interval is looked up (1-based).
    lvl : int
        Level of the dyadic hierarchy, 0 <= lvl <= h.
    h : int
        Depth of the hierarchy (the leaf level).

    Returns
    -------
    int
        Smallest element of the level-`lvl` interval containing `val`.
    """
    width = 2 ** (h - lvl)
    offset = np.divmod(val, width)[1]
    if offset < 1:
        # val is a multiple of the interval width, i.e. it is the last
        # element of its interval: step back one full width.
        return val - width + 1
    return val - offset + 1

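Next we write a function `get_children` that returns the child dyadic intervals of the interval represented by `u`. The intervals are again given by their representatives. On level `h` every interval consists of a single element, so `u` is its own only child. Otherwise, writing $u = j \cdot 2^{h-\text{lvl}} + 1$, the children of `u` are `u` itself and
$$ u + 2^{h-(\text{lvl} + 1)} = (2j+1) \cdot 2^{h-(\text{lvl} + 1)} + 1$$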

In [3]:
def get_children(u, lvl, h):
    """Return the representatives of the child intervals of `u`.

    Parameters
    ----------
    u : int
        Representative (smallest element) of a dyadic interval on
        level `lvl`.
    lvl : int
        Level of the interval represented by `u`.
    h : int
        Depth of the hierarchy (the leaf level).

    Returns
    -------
    list of int
        `[u]` on the leaf level (`lvl == h`), where an interval is a
        single element; otherwise the two halves of the interval,
        `[u, u + 2**(h - (lvl + 1))]`.
    """
    if lvl == h:
        return [u]
    # The left child starts where the parent starts; the right child
    # starts half a parent-width (= one child-width) further on.
    return [u, u + 2**(h - (lvl + 1))]

In [4]:
def base_p(u, p, size):
    """Return the base-`p` digits of `u`, least significant digit first.

    Parameters
    ----------
    u : int
        Non-negative integer to decompose.
    p : int
        Base of the expansion.
    size : int
        Length of the returned array; positions above the highest
        non-zero digit stay 0.

    Returns
    -------
    numpy.ndarray
        Array of shape `(size,)` and dtype int64 with
        `u == sum(res[i] * p**i)`.

    Raises
    ------
    IndexError
        If `u` needs more than `size` digits in base `p`.
    """
    # int64 instead of int16: a digit can be as large as p - 1, which
    # does not fit into 16 bits once p exceeds 32767.
    res = np.zeros(size, dtype=np.int64)
    i = 0
    while u != 0:
        u, r = np.divmod(u, p)
        res[i] = r
        i += 1
    return res

def _hashfunc(u, a, b, p, k):
    """Hash `u` to a value in `0 .. p-1`.

    `u` is expanded into `k` base-`p` digits, the digits are combined
    linearly with the coefficients `a`, and `b` is added; the result is
    reduced modulo `p`.
    """
    digits = base_p(u, p, k)
    linear_part = np.dot(a, digits)
    return (linear_part + b) % p


In [5]:
# Sanity check of _hashfunc: hash the values 0..8 using two base-3 digits
# with coefficients a = [1, 2] and offset b = 1, printed on a single line.
for u in range(9):
    print(_hashfunc(u, [1, 2], 1, p=3, k=2), end=' ')

1 2 0 0 1 2 2 0 1 

In [6]:
def count_min_sketch(filepath, eps, delta, n, threshold):
    """Find frequent stream elements with one Count-Min sketch per dyadic level.

    The stream is read from `filepath` (the first three lines are
    skipped). For every stream element the counters of its dyadic
    interval are incremented on each of the levels 0..h. Afterwards the
    hierarchy is searched breadth-first from the root interval; an
    interval is only descended into if its estimated count reaches
    `threshold`.

    Parameters
    ----------
    filepath : str
        Whitespace-separated text file; the stream values are taken from
        the first column after the three skipped lines and are expected
        to lie in 1..n. Zeros are reported and skipped.
    eps : float
        Accuracy parameter (> 0); the sketch width is the smallest prime
        that is at least ceil(2 / eps).
    delta : float
        Failure-probability parameter (0 < delta < 1); the number of
        hash functions per level is ceil(log2(1 / delta)).
    n : int
        Size of the universe (>= 1).
    threshold : int
        Minimum estimated count for an interval to be explored further /
        for an element to be reported.

    Returns
    -------
    (list of int, list of int)
        The reported elements and their estimated frequencies, in
        matching order.

    Raises
    ------
    ValueError
        If `eps`, `delta` or `n` is outside its valid range.
    """
    if eps <= 0:
        raise ValueError('eps must be positive')
    if not 0 < delta < 1:
        raise ValueError('delta must lie strictly between 0 and 1')
    if n < 1:
        raise ValueError('n must be at least 1')

    d = math.ceil(math.log(1.0 / delta, 2))
    w = math.ceil(2.0 / eps)

    # find a prime p with p >= w. Since w isn't necessarily prime, the
    # sketch is p columns wide instead of w.
    p = w
    while not sympy.isprime(p):
        p += 1

    # smallest h with 2^h >= n, computed exactly on integers (a float
    # log can be off by one for large powers of two).
    h = (int(n) - 1).bit_length()

    # choose k such that p^k - 1 >= 2^h. The search below hashes interval
    # representatives, and those can be as large as 2^h (not just n), so
    # every one of them must fit into k base-p digits.
    k = 1
    while p**k <= 2**h:
        k += 1

    hashfuncs = []
    for _ in range(d):
        a = np.asarray([np.random.randint(p) for _ in range(k)], dtype=np.int64)
        b = np.random.randint(p)
        # bind a and b as defaults so each lambda keeps its own pair
        hashfuncs.append(lambda u, a=a, b=b: _hashfunc(u, a, b, p, k))

    # One (d x p) sketch per level. int64 counters: the level-0 counters
    # are incremented once per stream element and would overflow a
    # 16-bit integer after 32767 elements.
    C = np.zeros((h+1, d, p), dtype=np.int64)

    # Read the stream in chunks so the whole file never has to be in
    # memory (5*10**5 int64 values are 4 MB per chunk).
    chunks = pandas.read_csv(
        filepath,
        header=None,
        skiprows=3,
        dtype=np.int64,
        sep=r'\s+',
        chunksize=5*10**5
    )

    # process the stream. Each chunk is a DataFrame whose first column
    # holds the stream values.
    try:
        for chunk in chunks:
            for x in chunk.iloc[:, 0]:
                if x == 0:
                    print('found a 0.')
                    continue
                for lvl in range(h+1):
                    u = get_repr(x, lvl, h)
                    for i, hashfunc in enumerate(hashfuncs):
                        C[lvl, i, hashfunc(u)] += 1
    finally:
        # release the file handle even if processing fails midway
        chunks.close()

    def estimate(lvl, u):
        # Count-Min estimate: the smallest of the d counters that the
        # interval represented by u maps to on this level
        return min(
            int(C[lvl, i, hashfunc(u)])
            for i, hashfunc in enumerate(hashfuncs)
        )

    # now we do BFS. the values in explore_current are the
    # representatives of the dyadic intervals that currently get
    # explored on this level. On the last level get_children returns
    # the interval itself, so after the loop explore_current holds the
    # leaves whose estimate reached the threshold.
    explore_current = [1]
    for lvl in range(h + 1):
        explore_next = []
        for u in explore_current:
            if estimate(lvl, u) >= threshold:
                explore_next.extend(get_children(u, lvl, h))
        explore_current = explore_next

    # prepare the output:
    approx_frequency = [estimate(h, u) for u in explore_current]

    return explore_current, approx_frequency

In [14]:
# Input file; the commented-out lines are alternative data sets.
filepath = 'data/supereasy'
#filepath = 'data/easy.txt'
#filepath = 'data/largest_40k.txt'

# The first three lines of the file hold three integers. n and t are
# passed on as the universe size and the threshold; m is read but not
# used in this cell.
with open(filepath) as f:
    n = int(f.readline())
    m = int(f.readline())
    t = int(f.readline())
    
# Time the whole run (reading the stream plus the search), in whole seconds.
start = time.time()
result, freq = count_min_sketch(filepath, eps=0.6, delta=0.1, n=n, threshold=t)
end = time.time()
elapsed_time = int(round(end - start))

# Reported elements with their estimated frequencies, then the runtime.
print(result, freq)
print(elapsed_time)

[2] [4]
0
