## ILAS - Data Mining (summer 2019) - Assignment 4
#### by Andreas Hene, Niklas Mertens, Richard Palme

In [1]:
import time
import math

import sympy
import pandas
import numpy as np
import matplotlib.pyplot as plt

First we write a function `get_repr` that returns the representative of the level-`lvl` dyadic interval containing an element `val` of [n]. The representative is always chosen to be the smallest element of that dyadic interval. Divide `val` by the interval length $2^{h-\text{lvl}}$ with remainder:
$$ \text{val} = j \cdot 2^{h-\text{lvl}} + \text{rest} $$
If rest >= 1, the representative can be computed by
$$ j \cdot 2^{h-\text{lvl}} + 1 $$
If rest == 0, the representative can be computed by
$$ (j-1) \cdot 2^{h - \text{lvl}} + 1 $$

In [2]:
def get_repr(val, lvl, h):
    """Return the representative of the level-`lvl` dyadic interval holding `val`.

    On level `lvl` the values are grouped into consecutive intervals of
    length ``2**(h - lvl)``: ``[1, size]``, ``[size + 1, 2 * size]``, ...
    The representative of an interval is its smallest element.

    Args:
        val: Integer value to locate.
        lvl: Level in the dyadic tree, expected to satisfy ``0 <= lvl <= h``
            (level ``h`` has intervals of length 1).
        h: Height of the dyadic tree.

    Returns:
        The smallest element of the interval that contains `val`.
    """
    size = 2 ** (h - lvl)
    # Shifting by one makes the intervals start at multiples of `size`, so a
    # single floor division finds the interval index for both the
    # "remainder >= 1" and the "remainder == 0" case.
    return (val - 1) // size * size + 1

Next we write a function `get_children` that returns the child dyadic intervals of the interval with representative `u`. The intervals are again given by their representatives. Writing $u = j \cdot 2^{h-\text{lvl}} + 1$, the children of `u` are `u` itself and
$$ u + 2^{h-(\text{lvl} + 1)} = (2j+1) \cdot 2^{h-(\text{lvl} + 1)} + 1$$

In [3]:
def get_children(u, lvl, h):
    """Return the representatives of the children of dyadic interval `u`.

    Args:
        u: Representative (smallest element) of an interval on level `lvl`.
        lvl: Level of that interval in the dyadic tree.
        h: Height of the dyadic tree; intervals on level `h` are leaves.

    Returns:
        ``[u]`` for a leaf (``lvl == h``); otherwise a two-element list with
        the representatives of the left and right half of the interval.
    """
    if lvl == h:
        return [u]
    # The left half keeps the representative; the right half starts one
    # child-interval length (2**(h - (lvl + 1))) further to the right.
    return [u, u + 2**(h - (lvl+1))]

In [4]:
def base_p(u, p, size):
    """Return the base-`p` digits of `u`, least significant digit first.

    Args:
        u: Non-negative integer to expand.
        p: Base of the expansion.
        size: Length of the returned digit array; unused high digits are 0.

    Returns:
        A numpy array of length `size` with ``res[i]`` being the coefficient
        of ``p**i``.

    Raises:
        IndexError: If `u` needs more than `size` digits in base `p`.
    """
    # Digits range up to p - 1, so a 16-bit buffer would overflow as soon as
    # the base exceeds 32768; use 64-bit integers instead.
    res = np.zeros(size, dtype=np.int64)
    i = 0
    while u != 0:
        u, r = np.divmod(u, p)
        res[i] = r
        i += 1
    return res

def _hashfunc(u, a, b, p, k):
    """Hash `u` to ``(a . digits + b) mod p``.

    ``digits`` are the `k` base-`p` digits of `u` as produced by `base_p`,
    `a` is a coefficient vector of length `k` and `b` a scalar offset.
    """
    digits = base_p(u, p, k)
    weighted_sum = np.dot(a, digits)
    return (weighted_sum + b) % p


In [5]:
# Smoke test of _hashfunc: hash u = 0..8 with coefficients a = [1, 2],
# offset b = 1 and k = 2 digits in base p = 3, printing the values on one line.
for u in range(9):
    print(_hashfunc(u, [1, 2], 1, p=3, k=2), end=' ')

1 2 0 0 1 2 2 0 1 

In [6]:
def print_hashfuncs(hashfuncs, n):
    """Print a table of hash values for the inputs ``0 .. n-1``.

    The first line lists the inputs, followed by an empty line; after that
    comes one line per hash function with its value for every input. Every
    entry is followed by a single space.
    """
    inputs = range(n)
    header = ''.join(f'{value} ' for value in inputs)
    print(header, end='')
    print('\n')
    for func in hashfuncs:
        row = ''.join(f'{func(value)} ' for value in inputs)
        print(row)

In [23]:
def count_min_sketch(filepath, eps, delta, n, threshold):
    """Find frequent stream values with one count-min sketch per dyadic level.

    A sketch with ``d`` rows and ``p`` columns is kept for every level
    ``0..h`` of the dyadic tree. Each non-zero stream value increments, on
    every level, the counters of the representative of the dyadic interval
    that contains it. Afterwards the tree is searched top-down starting at
    representative ``1``; an interval is only expanded if its estimated count
    (the minimum over the ``d`` rows) is at least `threshold`.

    The hash table, the parameters, periodic snapshots of the level-0 sketch
    and the explored representatives are printed along the way.

    Args:
        filepath: Text file; its first three lines are skipped and the first
            whitespace-separated column of the remaining lines is the stream.
        eps: Determines the sketch width ``w = ceil(2 / eps)``; the actual
            number of columns is the smallest prime ``p >= w``.
        delta: Determines the number of rows ``d = ceil(log2(1 / delta))``.
        n: Upper bound on the stream values; fixes the number ``k`` of
            base-``p`` digits hashed and the tree height ``h = ceil(log2(n))``.
        threshold: Minimum estimated count for an interval to be explored.

    Returns:
        A tuple ``(values, approx_frequency)``: the leaf representatives that
        survived the search and their estimated counts, in the same order.
    """
    d = math.ceil(math.log(1.0 / delta, 2))
    w = math.ceil(2.0 / eps)

    # find a prime p with p >= w
    p = w
    while not sympy.isprime(p):
        p += 1

    # choose k such that p^k - 1 >= n
    # i.e. k >= log(n+1, p)
    k = math.ceil(math.log(n+1, p))

    hashfuncs = []
    for _ in range(d):
        a = np.asarray([np.random.randint(p) for _ in range(k)])
        b = np.random.randint(p)
        # a and b are bound as defaults so that every lambda keeps its own
        # coefficients instead of the ones drawn in the last iteration.
        hashfuncs.append(lambda u, a=a, b=b: _hashfunc(u, a, b, p, k))
    print_hashfuncs(hashfuncs, n)

    h = math.ceil(math.log(n, 2))

    print('eps', eps, 'delta', delta, '\n',
          'n', n, 'threshold', threshold, '\n',
          'd', d, 'w', w, '\n',
          'p', p, 'k', k, 'h', h)

    # note that, since w isn't necessarily a prime number,
    # we might have to make w larger (i.e. take p instead of w)
    C = np.zeros((h+1, d, p), dtype=np.int32)

    def estimate(lvl, u):
        # count-min estimate: the smallest counter of u over all d rows
        return min(C[(lvl, i, hashfunc(u))]
                   for i, hashfunc in enumerate(hashfuncs))

    # dtype is int16, so 2 bytes. chunksize is 5*10**5,
    # so one chunk is exactly 1 MB and
    # should fit comfortably into cache.
    # NOTE(review): int16 caps the readable stream values at 32767 -- confirm
    # this is enough for the largest input file.
    # `sep=r'\s+'` replaces `delim_whitespace=True` and the first column is
    # selected explicitly below instead of `squeeze=True`; both keywords are
    # no longer accepted by current pandas releases.
    chunks = pandas.read_csv(
        filepath,
        header=None,
        skiprows=3,
        dtype=np.int16,
        sep=r'\s+',
        chunksize=5*10**5
    )
    n_seen = 0
    # process the stream chunk by chunk
    for chunk in chunks:
        for x in chunk.iloc[:, 0]:
            if x == 0:
                continue
            n_seen += 1
            # debug output: level-0 sketch before the current value is added
            if (n_seen % 10000) == 0:
                print(C[0, :, :])
            for lvl in range(h+1):
                u = get_repr(x, lvl, h)
                for i, hashfunc in enumerate(hashfuncs):
                    C[(lvl, i, hashfunc(u))] += 1

    # now we do BFS. the values in explore_current are
    # the representatives of the dyadic intervals that
    # currently get explored on this level.
    explore_current = [1]
    for lvl in range(h + 1):
        print(explore_current)
        explore_next = []
        for u in explore_current:
            if estimate(lvl, u) >= threshold:
                explore_next.extend(get_children(u, lvl, h))
        explore_current = explore_next

    # prepare the output: on the last level get_children returns the leaf
    # itself, so explore_current now holds the surviving leaves.
    approx_frequency = [estimate(h, u) for u in explore_current]

    return explore_current, approx_frequency

In [24]:
#filepath = 'data/supereasy'
filepath = 'data/easy.txt'
#filepath = 'data/largest_40k.txt'

# The first three lines of the input file are header values. n and t are
# passed on as `n` and `threshold`; m is read but not used in this cell.
with open(filepath) as f:
    n = int(f.readline())
    m = int(f.readline())
    t = int(f.readline())

# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by adjustments of the system time.
start = time.perf_counter()
result, freq = count_min_sketch(filepath, eps=0.6, delta=0.1, n=n, threshold=t)
end = time.perf_counter()
elapsed_time = round(end - start)

print(result, freq)
print('time in seconds:', elapsed_time)

0 1 2 3 4 5 6 7 8 9 10 11 

2 0 3 1 4 2 0 3 1 4 2 0 
4 2 0 3 1 3 1 4 2 0 2 0 
1 0 4 3 2 1 0 4 3 2 1 0 
2 1 0 4 3 3 2 1 0 4 4 3 
eps 0.6 delta 0.1 
 n 12 threshold 30000 
 d 4 w 4 
 p 5 k 2 h 4
[[9999    0    0    0    0]
 [   0    0 9999    0    0]
 [9999    0    0    0    0]
 [   0 9999    0    0    0]]
[[19999     0     0     0     0]
 [    0     0 19999     0     0]
 [19999     0     0     0     0]
 [    0 19999     0     0     0]]
[[29999     0     0     0     0]
 [    0     0 29999     0     0]
 [29999     0     0     0     0]
 [    0 29999     0     0     0]]
[[39999     0     0     0     0]
 [    0     0 39999     0     0]
 [39999     0     0     0     0]
 [    0 39999     0     0     0]]
[[49999     0     0     0     0]
 [    0     0 49999     0     0]
 [49999     0     0     0     0]
 [    0 49999     0     0     0]]
[[59999     0     0     0     0]
 [    0     0 59999     0     0]
 [59999     0     0     0     0]
 [    0 59999     0     0     0]]
[[69999     0     0     0    

In [25]:
def brute(filepath, n):
    """Count how often each value ``0..n`` occurs in the stream and print it.

    Prints two arrays: the values ``0..n`` and, aligned with them, their
    exact number of occurrences.

    Args:
        filepath: Text file; its first three lines are skipped and the first
            whitespace-separated column of the remaining lines is the stream.
        n: Largest value to count; the stream values are used directly as
            indices into an array of length ``n + 1``.

    Raises:
        IndexError: If the stream contains a value larger than `n`.
    """
    # `sep=r'\s+'` replaces `delim_whitespace=True` and the first column is
    # selected explicitly below instead of `squeeze=True`; both keywords are
    # no longer accepted by current pandas releases.
    chunks = pandas.read_csv(
        filepath,
        header=None,
        skiprows=3,
        dtype=np.int16,
        sep=r'\s+',
        chunksize=5*10**5
    )

    # does count 0 as well
    idx = np.arange(n+1)
    freq = np.zeros(n+1, dtype=np.int32)

    for chunk in chunks:
        # np.add.at accumulates repeated indices, unlike `freq[values] += 1`
        # which would count every distinct value only once per chunk.
        np.add.at(freq, chunk.iloc[:, 0].to_numpy(), 1)
    print(idx)
    print(freq)

In [26]:
filepath = 'data/easy.txt'

# Only the first header line of the input file is needed here; it is passed
# to brute as n.
with open(filepath) as f:
    n = int(f.readline())

# Print the exact per-value counts for the same input file.
brute(filepath, n)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12]
[   23   596  5954 23847 38313 24983 12249 24878 38298 24068  6174   590
    24]
