In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd

## 1. Read Data and Arguments

### Read Arguments

In [2]:
# Input files: file_args is the parameter file parsed for MIS values, SDC and
# item constraints; file_data holds the transactions.
file_args = "proj1_parameter-file.txt"
file_data = "proj1_input-data.txt"
# Alternative input pair, kept commented out; uncomment both lines to use it
# instead of the pair above.
#file_args = "proj1_ex13_args.txt"
#file_data = "proj1_ex13_data.txt"

In [3]:
# Parse the parameter file into four module-level values:
#   ms        -- {item label: minimum item support (MIS)}
#   sdc       -- support difference constraint (stays 1 if the file has no SDC line)
#   x_exclude -- list of item-label groups listed on the "cannot_be_together" line
#   x_must    -- list of item labels listed (separated by " or ") on the "must..." line
ms = dict()
sdc = 1
x_exclude = []
x_must = []
# `with` guarantees the file handle is closed, even if a line fails to parse.
with open(file_args, "r") as args_file:
    for line in args_file:
        # strip() (rather than rstrip("\n")) also drops a trailing "\r" or spaces,
        # which would otherwise end up inside the last parsed label.
        line = line.strip()
        if line.startswith("MIS"):
            # Expected shape: "MIS(<item>) = <value>"; [4:-1] removes "MIS(" and ")".
            parts = line.split(" = ")
            ms[parts[0][4:-1]] = float(parts[1])
        elif line.startswith("SDC"):
            sdc = float(line.split("=")[1])
        elif line.startswith("cannot_be_together"):
            # Expected shape: "cannot_be_together: {a, b}, {c, d}"; [1:-1] removes
            # the outermost braces before splitting the groups apart.
            groups = line.split(": ")[1][1:-1].split("}, {")
            x_exclude = [group.split(", ") for group in groups]
        elif line.startswith("must"):
            x_must = line.split(": ")[1].split(" or ")
print("Minimum item support:", ms)
print("Support difference constraint:", sdc)
print("Exclude:", x_exclude)
print("Must-have:", x_must)

Minimum item support: {'60': 0.3, '100': 0.1, '80': 0.2, '40': 0.4, '30': 0.3, '10': 0.43, '20': 0.3, '90': 0.2, '120': 0.2, '70': 0.2, '140': 0.15, '50': 0.4}
Support difference constraint: 0.1
Exclude: [['20', '40'], ['70', '80']]
Must-have: ['20', '40', '50']


In [4]:
# Turn the {item: MIS} dict into a two-column table ("index" = item label,
# "MIS" = its minimum support) ordered by ascending MIS. The row position in
# this table is the integer id used for the item everywhere below.
# kind="mergesort" is a stable sort: items with equal MIS keep a reproducible
# relative order, which the default (unstable) quicksort does not guarantee.
# NOTE: this rebinds `ms` from the dict to a DataFrame, so re-running this cell
# alone would feed the DataFrame back into pd.Series; re-run the parsing cell first.
ms = pd.Series(ms, name = "MIS").sort_values(kind = "mergesort").reset_index()
ms

Unnamed: 0,index,MIS
0,100,0.1
1,140,0.15
2,120,0.2
3,70,0.2
4,80,0.2
5,90,0.2
6,20,0.3
7,30,0.3
8,60,0.3
9,40,0.4


### Index-ID Map based on sorted MIS

In [5]:
# Lookup tables keyed by row position in the MIS-sorted table `ms`:
#   id_dict     -- position -> item label
#   ms_dict     -- position -> MIS
#   id_dict_inv -- item label -> position
id_dict = ms["index"].to_dict()
ms_dict = ms["MIS"].to_dict()
id_dict_inv = dict(zip(id_dict.values(), id_dict.keys()))

# Re-express both constraints in positions instead of labels. Each exclusion
# group becomes a tuple sorted ascending.
# NOTE: x_must / x_exclude are overwritten, so this cell cannot be re-run
# without re-running the argument-parsing cell first.
x_must = [id_dict_inv[label] for label in x_must]
x_exclude = [
    tuple(np.sort([id_dict_inv[label] for label in group]))
    for group in x_exclude
]

print("Index-ID dictionary:", id_dict)
print("MIS index", ms_dict)
print("Must-have index:", x_must)
print("Exclude index:",  x_exclude)

Index-ID dictionary: {0: '100', 1: '140', 2: '120', 3: '70', 4: '80', 5: '90', 6: '20', 7: '30', 8: '60', 9: '40', 10: '50', 11: '10'}
MIS index {0: 0.10000000000000001, 1: 0.14999999999999999, 2: 0.20000000000000001, 3: 0.20000000000000001, 4: 0.20000000000000001, 5: 0.20000000000000001, 6: 0.29999999999999999, 7: 0.29999999999999999, 8: 0.29999999999999999, 9: 0.40000000000000002, 10: 0.40000000000000002, 11: 0.42999999999999999}
Must-have index: [6, 9, 10]
Exclude index: [(6, 9), (3, 4)]


### Read Transaction Data

In [6]:
# Read the transaction file as a Series of raw strings, one transaction per row.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and removed
# in 2.0; squeezing the column axis of the resulting DataFrame is the documented
# replacement and yields the same Series (named 0) for single-column input.
s = pd.read_csv(file_data, header = None, sep = "\t").squeeze("columns")
s

0                   {20, 30, 80, 70, 50, 90}
1                           {20, 10, 80, 70}
2                               {10, 20, 80}
3                               {20, 30, 80}
4                                   {20, 80}
5    {20, 30, 80, 70, 50, 90, 100, 120, 140}
Name: 0, dtype: object

In [7]:
# One-hot encode the transactions: drop the first and last character of each
# string, split on ", " into indicator columns, then order the columns like the
# MIS-sorted item table (items missing from the data become all-zero columns).
da = (
    s.str.slice(1, -1)
    .str.get_dummies(sep = ", ")
    .reindex(columns = ms["index"], fill_value = 0)
)
# Plain array view of the indicator table; column k corresponds to item id k.
X = da.values
da

index,100,140,120,70,80,90,20,30,60,40,50,10
0,0,0,0,1,1,1,1,1,0,0,1,0
1,0,0,0,1,1,0,1,0,0,0,0,1
2,0,0,0,0,1,0,1,0,0,0,0,1
3,0,0,0,0,1,0,1,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,0,0
5,1,1,1,1,1,1,1,1,0,0,1,0


In [8]:
# Every item as a 1-tuple itemset, identified by its row position in `ms`.
I = [(item_idx,) for item_idx in range(ms.shape[0])]
I

[(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,)]

In [9]:
def sup(xL):
    """Return the support of each itemset in ``xL``.

    The support of an itemset is the fraction of rows of the global 0/1
    matrix ``X`` in which every listed column is non-zero.

    Parameters
    ----------
    xL : iterable of index sequences
        Each element holds the column indices (into ``X``) of one itemset.
        Lists, sets and 2-D arrays (one itemset per row) are all accepted.

    Returns
    -------
    numpy.ndarray
        1-D float array with one support per itemset, in iteration order.
        An empty ``xL`` yields an empty array instead of raising.
    """
    # Materialise once so sets/generators are iterated a single time and the
    # empty case can be detected up front.
    itemsets = list(xL)
    if not itemsets:
        # np.mean(..., axis=1) on an empty list raises AxisError.
        return np.array([], dtype = float)
    op = np.mean([X[:, i].all(axis = 1) for i in itemsets], axis = 1)
    return op

In [10]:
# Support of every single-item itemset in I, in the same order as I.
Isup = sup(I)
Isup

array([ 0.16666667,  0.16666667,  0.16666667,  0.5       ,  1.        ,
        0.33333333,  1.        ,  0.5       ,  0.        ,  0.        ,
        0.33333333,  0.33333333])

In [11]:
# Running cache of itemset -> support, seeded with the single-item supports;
# frequent() adds to it as larger itemsets are evaluated.
sup_dict = {itemset: support for itemset, support in zip(I, Isup)}
sup_dict

{(0,): 0.16666666666666666,
 (1,): 0.16666666666666666,
 (2,): 0.16666666666666666,
 (3,): 0.5,
 (4,): 1.0,
 (5,): 0.33333333333333331,
 (6,): 1.0,
 (7,): 0.5,
 (8,): 0.0,
 (9,): 0.0,
 (10,): 0.33333333333333331,
 (11,): 0.33333333333333331}

## 2. Candidate Generation

### Level 1

In [12]:
# MS-Apriori initial pass over the MIS-sorted items:
#   Li -- position of the first item whose support reaches its own MIS
#   L  -- that item plus every later item whose support reaches MIS(Li)
# The comparison is inclusive (>=), matching frequent() and pair_sup_mis();
# an item whose support equals its MIS must not be dropped.
meets_own_mis = Isup >= ms["MIS"].values
if meets_own_mis.any():
    Li = meets_own_mis.argmax()
    L = [i for i in range(Li, ms.shape[0]) if Isup[i] >= ms["MIS"][Li]]
else:
    # argmax() of an all-False mask is 0, which would wrongly seed from item 0.
    Li, L = 0, []
Li, L

(0, [0, 1, 2, 3, 4, 5, 6, 7, 10, 11])

In [13]:
# F[k-1] holds the frequent k-itemsets. Level 1: every item whose support
# reaches its own MIS (inclusive, consistent with frequent()).
F = [[(i,) for i in np.where(Isup >= ms["MIS"])[0]]]
F

[[(0,), (1,), (3,), (4,), (5,), (6,), (7,)]]

### Level 2

In [14]:
def pair_sup_mis(x):
    """Pair up items of ``x`` that satisfy the MIS and SDC conditions.

    For every ordered pair of positions ``a < b`` in ``x`` the pair
    ``(x[a], x[b])`` is emitted when

    * ``sup(x[b]) >= MIS(x[a])`` and
    * ``|sup(x[a]) - sup(x[b])| <= sdc`` (inclusive, as in MS-Apriori).

    Supports come from the global ``sup``, MIS values from ``ms_dict`` and the
    support difference constraint from ``sdc``.

    Parameters
    ----------
    x : 1-D array-like of item ids
        Assumed to be ordered by ascending MIS so that ``a < b`` is meaningful.

    Returns
    -------
    list of tuple
        Qualifying pairs in row-major order of ``(a, b)``; empty when ``x`` has
        fewer than two items.

    NOTE(review): the level > 2 join also calls this function, so the MIS
    condition above is applied there as well -- confirm that is intended.
    """
    x = np.asarray(x)
    if x.size == 0:
        return []
    # sup() expects one itemset per row, hence the extra axis.
    x_sup = sup(x[:, np.newaxis])
    x_mis = np.array([ms_dict[i] for i in x])
    # meets_mis[a, b]  : sup(x[b]) >= MIS(x[a])
    meets_mis = x_sup[np.newaxis, :] >= x_mis[:, np.newaxis]
    # within_sdc[a, b] : |sup(x[a]) - sup(x[b])| <= sdc
    within_sdc = np.abs(x_sup[:, np.newaxis] - x_sup[np.newaxis, :]) <= sdc
    # Strict upper triangle keeps each unordered pair once, with a < b.
    first, second = np.nonzero(np.triu(meets_mis & within_sdc, 1))
    op = list(zip(x[first], x[second]))
    return op

In [15]:
# Level-2 candidates: pairs from the seed list L whose first item is itself a
# frequent 1-itemset. The set is built once (the original rebuilt an array for
# every pair) and also works when F[0] is empty.
seed_items = {i[0] for i in F[0]}
C = [i for i in pair_sup_mis(np.array(L)) if i[0] in seed_items]
C

[(0, 1), (0, 2), (1, 2), (3, 7), (4, 6), (5, 10), (5, 11)]

In [16]:
def frequent(xL):
    """Return the candidates in ``xL`` that are frequent, caching supports.

    A candidate is kept when its support is at least the MIS of its first
    item (``ms_dict[c[0]]``). As a side effect the global ``sup_dict`` receives
    the support of every candidate and of every candidate with its first item
    removed.

    Parameters
    ----------
    xL : sequence of tuples
        Candidate itemsets as tuples of item ids.

    Returns
    -------
    list
        The frequent candidates, in their original order; ``[]`` for empty input.
    """
    xL = list(xL)
    if not xL:
        # Nothing to evaluate; avoids calling sup() on an empty list.
        return []
    x_sup = sup(xL)
    sup_dict.update(zip(xL, x_sup))
    first_item_mis = np.array([ms_dict[i[0]] for i in xL])
    op = [cand for cand, keep in zip(xL, x_sup >= first_item_mis) if keep]
    # Supports of the "tail" itemsets (candidate minus its first item);
    # de-duplicated, then fixed into a list so keys and values line up.
    tails = list({tuple(i[1:]) for i in xL})
    sup_dict.update(zip(tails, sup(tails)))
    return op

In [17]:
# Keep the candidates in C that pass frequent() and store them as the next level of F.
# F.append mutates F in place, so re-running this cell appends the level again.
F.append(frequent(C))
F

[[(0,), (1,), (3,), (4,), (5,), (6,), (7,)],
 [(0, 1), (0, 2), (1, 2), (3, 7), (4, 6), (5, 10)]]

### Level > 2

In [19]:
def append_set(xL, x_base):
    """Prefix every itemset in ``xL`` with ``x_base``.

    ``x_base`` (a scalar or a sequence of item ids) is repeated once per
    entry of ``xL`` and glued in front of it column-wise; each resulting
    row is returned as a tuple. An empty ``xL`` gives an empty list.
    """
    n_rows = len(xL)
    if not n_rows:
        return []
    prefix = np.tile(x_base, (n_rows, 1))
    joined = np.hstack([prefix, xL])
    return [tuple(row) for row in joined]
def prune_candidate(xL):
    """Drop candidates that have an infrequent subset containing their first item.

    For each candidate, every subset obtained by deleting one item at position
    1 or later (so the first item always stays) must be covered by some itemset
    in the latest frequent level ``F[-1]``; otherwise the candidate is removed.

    Parameters
    ----------
    xL : sequence of equal-length tuples
        Candidate itemsets.

    Returns
    -------
    list
        The surviving candidates, in their original order; ``[]`` for empty input.
    """
    if len(xL) == 0:
        # xL[0] below would raise IndexError when no candidates were generated.
        return []
    # Build the comparison sets once instead of once per (candidate, subset).
    prev_level = [set(k) for k in F[-1]]
    n_items = len(xL[0])

    def _covered(subset):
        return any(k.issubset(subset) for k in prev_level)

    op = [
        cand for cand in xL
        if all(
            _covered(tuple(cand[:i]) + tuple(cand[i + 1:]))
            for i in range(1, n_items)
        )
    ]
    return op

In [18]:
# Tabulate the most recent frequent level: one row per itemset, one
# integer-named column per item position. The candidate-generation cell
# groups this table by its leading columns.
Ls = pd.DataFrame(F[-1])
Ls

Unnamed: 0,0,1
0,0,1
1,0,2
2,1,2
3,3,7
4,4,6
5,5,10


In [20]:
# Join step for the next level: group the rows of Ls by every column except the
# last (columns 0 .. len(F)-2), pair up the last-column items inside each group
# with pair_sup_mis, and prepend the group's shared prefix via append_set.
# sum(list_of_lists, []) flattens the per-group candidate lists into one list.
# NOTE(review): Ls is built in a separate cell from F[-1]; it must be refreshed
# after each F.append before this cell is run again.
C = sum([append_set(pair_sup_mis(group.values), name) for name, group in Ls.groupby(list(range(len(F)-1)))[len(F)-1]], [])
# Keep only candidates that pass prune_candidate's subset check against F[-1].
C = prune_candidate(C)
C

[(0, 1, 2)]

In [21]:
# Keep the candidates in C that pass frequent() and store them as the next level of F.
# F.append mutates F in place, so re-running this cell appends the level again.
F.append(frequent(C))
F

[[(0,), (1,), (3,), (4,), (5,), (6,), (7,)],
 [(0, 1), (0, 2), (1, 2), (3, 7), (4, 6), (5, 10)],
 [(0, 1, 2)]]

### Filter Frequent Itemsets by Must-Have and Cannot-Be-Together Constraints

In [22]:
# Apply the item constraints to every level of F. An itemset is kept when
#   * it contains at least one must-have item (skipped if no must-have items
#     were given, so a missing constraint does not discard everything), and
#   * it does not contain any complete cannot-be-together group.
# Uses boolean `and` / `not`: the previous `& ~any(...)` relied on integer bit
# arithmetic on bools, and `~` on a bool is deprecated since Python 3.12.
F_prune = [
    [
        j for j in i
        if (not x_must or any(k in j for k in x_must))
        and not any(set(k).issubset(j) for k in x_exclude)
    ]
    for i in F
]
F_prune

[[(6,)], [(4, 6), (5, 10)], []]

In [23]:
def id_name(xL):
    """Translate itemsets of item ids into itemsets of item labels.

    Each id is looked up in the global ``id_dict``; the result is a list of
    tuples mirroring the structure and order of ``xL``.
    """
    named = []
    for itemset in xL:
        named.append(tuple(map(id_dict.__getitem__, itemset)))
    return named

In [24]:
[id_name(i) for i in F_prune]

[[('20',)], [('80', '20'), ('90', '50')], []]

### Item Supports for Association Rule

In [25]:
sup_dict

{(0,): 0.16666666666666666,
 (0, 1): 0.16666666666666666,
 (0, 1, 2): 0.16666666666666666,
 (0, 2): 0.16666666666666666,
 (1,): 0.16666666666666666,
 (1, 2): 0.16666666666666666,
 (2,): 0.16666666666666666,
 (3,): 0.5,
 (3, 7): 0.33333333333333331,
 (4,): 1.0,
 (4, 6): 1.0,
 (5,): 0.33333333333333331,
 (5, 10): 0.33333333333333331,
 (5, 11): 0.0,
 (6,): 1.0,
 (7,): 0.5,
 (8,): 0.0,
 (9,): 0.0,
 (10,): 0.33333333333333331,
 (11,): 0.33333333333333331}