## Pattern sampling and Apriori frequent-itemset mining

In [17]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Names of the transaction files to load from datasets/<name>.dat.
# Other available names: 'chess', 'connect', 'pumsb', 'pumsb_star', 'retail', 'kosarak'
datasets_names = ['mushroom', 'small']
datasets = {}

for name in datasets_names:
    # Each line of the file is one transaction: whitespace-separated integers.
    # The context manager guarantees the file handle is closed after reading.
    with open('datasets/' + name + '.dat') as dat_file:
        datasets[name] = [[int(item) for item in line.split()] for line in dat_file]

In [9]:
def freq_sampling(dataset, w, normalised_w):
    """Draw one random pattern from ``dataset``.

    A transaction is selected at random using the probabilities in
    ``normalised_w``; every item of that transaction is then kept or dropped
    by an independent fair coin flip.

    Parameters
    ----------
    dataset : sequence of sequences
        The transactions to sample from.
    w : any
        Accepted for signature compatibility; not used by this function.
    normalised_w : sequence of float
        Selection probability of each transaction (passed as ``p`` to
        ``np.random.choice``).

    Returns
    -------
    list
        The retained items, in the order they appear in the transaction.
    """
    chosen = dataset[np.random.choice(len(dataset), p=normalised_w)]
    pattern = []
    for attr in chosen:
        # randint(2) yields 0 or 1, i.e. keep the item with probability 1/2.
        if np.random.randint(2):
            pattern.append(attr)
    return pattern

def freq_sampling_k(dataset, k):
    """Draw ``k`` patterns with ``freq_sampling`` and count how often each occurs.

    Every transaction gets the weight ``2 ** len(transaction)`` (the number of
    its subsets); the weights are normalised into selection probabilities and
    handed to ``freq_sampling`` for each draw.

    Parameters
    ----------
    dataset : sequence of sequences
        The transactions to sample from.
    k : int
        Number of patterns to draw.

    Returns
    -------
    dict
        Maps each sampled pattern (as a tuple) to the number of times it was
        drawn.
    """
    w = [2 ** len(motif) for motif in dataset]
    # Sum once up front; recomputing it per element made normalisation quadratic.
    total = sum(w)
    normalised_w = [weight / total for weight in w]

    patterns = {}
    for _ in range(k):
        p = tuple(freq_sampling(dataset, w, normalised_w))
        patterns[p] = patterns.get(p, 0) + 1
    return patterns

def freq_sampling_all_datasets(k):
    """Run ``freq_sampling_k`` on every entry of the global ``datasets`` dict.

    Prints a progress line after each dataset is processed.

    Parameters
    ----------
    k : int
        Number of patterns to draw per dataset.

    Returns
    -------
    dict
        Maps each dataset name to the pattern-count dict returned by
        ``freq_sampling_k``.
    """
    patterns_freq = {}
    for dataset_name, transactions in datasets.items():
        patterns_freq[dataset_name] = freq_sampling_k(transactions, k)
        print('dataset ' + dataset_name + ' done')
    return patterns_freq

In [6]:
def area_sampling(dataset, w, normalised_w):
    """Draw one random pattern, favouring larger patterns (area-based sampling).

    Steps:
      1. pick a transaction using the probabilities in ``normalised_w``;
      2. pick a pattern size ``k`` in ``1..n`` (``n`` = transaction length)
         with probability proportional to ``k * C(n, k)``;
      3. pick ``k`` distinct items of the transaction uniformly at random.

    The ``C(n, k)`` factor in step 2 is required because a transaction has
    ``C(n, k)`` different sub-patterns of size ``k``. With it, the size
    weights sum to ``n * 2 ** (n - 1)``, which is exactly the per-transaction
    weight computed in ``area_sampling_k``, so each individual sub-pattern is
    drawn with probability proportional to its size.

    Parameters
    ----------
    dataset : sequence of sequences
        The transactions to sample from.
    w : any
        Accepted for signature compatibility; not used by this function.
    normalised_w : sequence of float
        Selection probability of each transaction.

    Returns
    -------
    list
        The selected items of the transaction, sorted ascending.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` if the selected transaction is empty.
    """
    transaction = dataset[np.random.choice(len(dataset), p=normalised_w)]
    n = len(transaction)

    sizes = list(range(1, n + 1))
    size_weights = []
    binom = 1  # C(n, 0)
    for size in sizes:
        # C(n, size) = C(n, size - 1) * (n - size + 1) / size, exact in integers.
        binom = binom * (n - size + 1) // size
        size_weights.append(size * binom)
    total = sum(size_weights)
    size_probs = [weight / total for weight in size_weights]

    k = np.random.choice(sizes, p=size_probs)
    # Sample positions rather than values so the original items (not numpy
    # scalars) are returned.
    picked = np.random.choice(n, size=k, replace=False)
    return sorted(transaction[i] for i in picked)

def area_sampling_k(dataset, k):
    """Draw ``k`` patterns with ``area_sampling`` and count how often each occurs.

    Every transaction of length ``n`` gets the weight ``n * 2 ** (n - 1)``;
    the weights are normalised into selection probabilities and handed to
    ``area_sampling`` for each draw. Empty transactions get weight 0.

    Parameters
    ----------
    dataset : sequence of sequences
        The transactions to sample from.
    k : int
        Number of patterns to draw.

    Returns
    -------
    dict
        Maps each sampled pattern (as a tuple) to the number of times it was
        drawn.
    """
    w = [len(motif) * 2 ** (len(motif) - 1) for motif in dataset]
    # Sum once up front; recomputing it per element made normalisation quadratic.
    total = sum(w)
    normalised_w = [weight / total for weight in w]

    patterns = {}
    for _ in range(k):
        p = tuple(area_sampling(dataset, w, normalised_w))
        patterns[p] = patterns.get(p, 0) + 1
    return patterns

def area_sampling_all_datasets(k):
    """Run ``area_sampling_k`` on every entry of the global ``datasets`` dict.

    Prints a progress line after each dataset is processed.

    Parameters
    ----------
    k : int
        Number of patterns to draw per dataset.

    Returns
    -------
    dict
        Maps each dataset name to the pattern-count dict returned by
        ``area_sampling_k``.
    """
    patterns_area = {}
    for dataset_name in datasets:
        # Use the area-based sampler here (this previously called
        # freq_sampling_k, so the area sampler was never exercised).
        patterns_area[dataset_name] = area_sampling_k(datasets[dataset_name], k)
        print('dataset ' + dataset_name + ' done')
    return patterns_area

In [67]:
def a_priori(dataset, minsup):
    """Level-wise search for itemsets supported by at least ``minsup`` transactions.

    Level 0 candidates are the singleton itemsets of every distinct item in
    ``dataset``, in sorted order. At each level the candidates are filtered
    with ``evaluate`` and the survivors are combined by ``gen_candidates``
    into the next level's candidates, until no new candidate is produced.

    Parameters
    ----------
    dataset : sequence of sequences
        The transactions.
    minsup : int
        Minimum number of transactions that must contain an itemset.

    Returns
    -------
    tuple
        ``(c, f)`` where ``c[i]`` is the list of candidate itemsets examined
        at level ``i`` and ``f[i]`` the subset of them that met ``minsup``.
    """
    distinct_items = sorted({item for line in dataset for item in line})
    c = [[[item] for item in distinct_items]]
    f = []
    while True:
        # The newest candidate level is always the last entry of c.
        f.append(evaluate(c[-1], dataset, minsup))
        next_level = gen_candidates(f[-1])
        if len(next_level) == 0:
            break
        c.append(next_level)
    return (c, f)

def evaluate(ci, dataset, minsup):
    """Keep the candidates contained in at least ``minsup`` transactions.

    Parameters
    ----------
    ci : iterable of sequences
        Candidate itemsets.
    dataset : iterable of iterables
        The transactions; items must be hashable.
    minsup : int
        Minimum number of transactions that must contain a candidate.

    Returns
    -------
    list
        The candidates (same objects, same order) whose support count is
        ``>= minsup``.
    """
    # Build the sets once so each membership test is O(1) instead of a linear
    # scan of the transaction list for every candidate item.
    transactions = [set(line) for line in dataset]

    fi = []
    for candidate in ci:
        count = sum(
            1 for items in transactions
            if all(attr in items for attr in candidate)
        )
        if count >= minsup:
            fi.append(candidate)
    return fi

def gen_candidates(fi):
    """Build the next level's candidate itemsets from the itemsets in ``fi``.

    Every ordered pair ``(left, right)`` with ``left`` appearing before
    ``right`` in ``fi`` is considered:

    * if ``left`` has a single item, the pair of first items is a candidate;
    * otherwise the two are joined when ``left`` without its first item
      equals ``right`` without its last item, giving ``left[0]`` followed by
      all of ``right``.

    Parameters
    ----------
    fi : list of lists
        Itemsets of the current level.

    Returns
    -------
    list of lists
        The generated candidates; empty when ``fi`` has fewer than two entries.
    """
    candidates = []
    for position, left in enumerate(fi):
        for right in fi[position + 1:]:
            if len(left) == 1:
                candidates.append([left[0], right[0]])
            elif left[1:] == right[:-1]:
                candidates.append([left[0], *right])
    return candidates

In [12]:
# Run freq_sampling_all_datasets with k=1000 (1000 sampled patterns per loaded dataset).
patterns_freq = freq_sampling_all_datasets(1000)

dataset mushroom done


In [None]:
# Run area_sampling_all_datasets with k=1000 and keep the per-dataset pattern counts.
patterns_area = area_sampling_all_datasets(1000)

In [13]:
# Rank the patterns sampled from 'mushroom' by draw count, highest first.
# Reversing a stable ascending sort keeps the same tie order as before.
sorted_patterns = sorted(patterns_freq['mushroom'].items(), key=lambda kv: kv[1])[::-1]
for entry in sorted_patterns:
    print(entry)

((3, 10, 16, 34, 36, 56, 59, 63, 67, 85, 86, 91), 1)
((1, 17, 24, 31, 38, 58, 63, 67, 77, 86, 119), 1)
((10, 13, 23, 34, 38, 40, 54, 63, 67, 85, 86, 90, 93, 98), 1)
((6, 9, 26, 37, 38, 59, 63, 85, 86, 90, 99, 110, 116), 1)
((3, 15, 26, 39, 67, 93, 107), 1)
((1, 3, 17, 31, 34, 38, 77, 90, 94, 102, 110, 117), 1)
((6, 24, 36, 42, 52, 56, 61, 66, 69, 77, 110), 1)
((2, 17, 23, 28, 34, 36, 39, 53, 59, 63, 67, 76, 86, 90, 93, 116), 1)
((6, 23, 27, 43, 56, 59, 67, 85, 93, 99, 116), 1)
((1, 24, 29, 39, 45, 52, 56, 61, 70, 80, 85, 86, 90, 95), 1)
((2, 24, 34, 37, 39, 41, 53, 54, 60, 63, 85, 90, 94, 99, 114), 1)
((6, 11, 28, 34, 39, 46, 56, 59, 63, 86, 90, 99, 111), 1)
((2, 3, 11, 24, 28, 37, 39, 42, 52, 58, 85, 86, 102), 1)
((1, 3, 9, 34, 36, 48, 63, 67, 77, 85, 86, 90, 110, 116), 1)
((1, 3, 29, 34, 52, 61, 77, 85, 86, 95, 101, 111, 117), 1)
((2, 10, 23, 28, 34, 39, 53, 63, 93, 110, 116), 1)
((3, 13, 24, 31, 34, 36, 38, 53, 61, 69, 90, 94, 102), 1)
((6, 10, 14, 24, 29, 34, 36, 56, 66, 71, 85, 10

((10, 39, 57, 59, 76, 85, 86, 90, 107, 114), 1)
((17, 34, 39, 41, 53, 59, 86, 90, 110), 1)
((9, 30, 36, 38, 42, 56, 63, 86, 90, 99, 107, 116), 1)
((10, 31, 53, 59, 77, 90, 102, 110), 1)
((11, 24, 34, 36, 45, 69, 80, 86, 90, 111), 1)
((11, 17, 23, 39, 53, 56, 90, 93, 111, 116), 1)
((1, 16, 37, 38, 46, 52, 56, 59, 63, 85, 90, 93), 1)
((16, 24, 42, 67, 85, 91, 93, 107), 1)
((2, 11, 23, 28, 36, 39, 43, 53, 56, 78, 85, 90, 93, 116), 1)
((2, 3, 9, 15, 23, 27, 34, 40, 52, 63, 67, 85, 90, 93), 1)
((14, 24, 29, 45, 61, 66, 70, 90, 116), 1)
((11, 14, 24, 29, 34, 45, 71, 95, 101), 1)
((6, 13, 24, 34, 37, 52, 67, 80, 85, 86, 110, 119), 1)
((1, 9, 23, 34, 39, 60, 67, 86, 93, 110, 114), 1)
((10, 17, 24, 32, 34, 36, 53, 85, 86, 102), 1)
((2, 11, 24, 28, 39, 53, 63, 67, 76, 85, 86, 94, 107, 114), 1)
((3, 15, 23, 29, 34, 36, 39, 56, 64, 67, 76, 85, 93, 113), 1)
((2, 13, 23, 28, 34, 36, 39, 44, 53, 59, 69, 85, 99, 110, 116), 1)
((3, 9, 15, 24, 28, 37, 39, 43, 54, 64, 76, 86, 90, 99, 107, 114), 1)
((1, 2

In [68]:
# Mine the 'small' dataset with a minimum support count of 3.
# c holds the candidate itemsets per level, f the ones that met the threshold.
c, f = a_priori(datasets['small'], 3)

In [69]:
# Show the candidate itemsets, one line per level.
for cand in c:
    print(cand)

[[1], [2], [3], [4], [5], [6], [7], [9], [10], [16], [24], [65], [66], [78], [120]]
[[1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 16], [1, 65], [2, 3], [2, 4], [2, 5], [2, 6], [2, 16], [2, 65], [3, 4], [3, 5], [3, 6], [3, 16], [3, 65], [4, 5], [4, 6], [4, 16], [4, 65], [5, 6], [5, 16], [5, 65], [6, 16], [6, 65], [16, 65]]
[[1, 3, 4], [1, 3, 5], [1, 3, 6], [1, 3, 16], [1, 3, 65], [1, 4, 5], [1, 4, 6], [1, 4, 16], [1, 16, 65], [3, 4, 5], [3, 4, 6], [3, 4, 16], [3, 5, 6], [3, 16, 65], [4, 5, 6], [4, 16, 65]]
[[1, 3, 4, 5], [1, 3, 4, 6], [1, 3, 16, 65], [3, 4, 5, 6]]


In [70]:
# Show the itemsets that met the minimum support, one line per level.
for pattern in f:
    print(pattern)

[[1], [2], [3], [4], [5], [6], [16], [65]]
[[1, 3], [1, 4], [1, 16], [1, 65], [3, 4], [3, 5], [3, 6], [3, 16], [3, 65], [4, 5], [4, 6], [4, 16], [5, 6], [16, 65]]
[[1, 3, 4], [1, 3, 16], [1, 3, 65], [1, 16, 65], [3, 4, 5], [3, 4, 6], [3, 5, 6], [3, 16, 65], [4, 5, 6]]
[[1, 3, 16, 65], [3, 4, 5, 6]]


In [71]:
# Same mining run on the 'mushroom' dataset (minimum support count 3).
# NOTE: the recorded execution of this cell ended with a KeyboardInterrupt.
c, f = a_priori(datasets['mushroom'], 3)

KeyboardInterrupt: 