In [2]:
import numpy as np
from io import StringIO
from collections import deque
import pickle

## Apriori Rule 

In [3]:
def get_data(filename):
    """
    Purpose:
        Load a tab-separated text file and split it into features and labels.
    Input:
        filename: String, path of the file to read
    Output:
        data: a matrix of string, every column except the last one
        label: a vector of string, the last column
    """
    with open(filename) as handle:
        text = handle.read()
    # Every field is parsed as a string; the last column holds the label.
    table = np.genfromtxt(StringIO(text), delimiter="\t", dtype='str')
    return table[:, :-1], table[:, -1]

In [4]:
def pre_process(data):
    """
    Purpose:
        Preprocess data in place: the value in column j becomes 'G<j>_<value>',
        so equal values in different columns turn into distinct items.
        Columns are numbered from 0.
    Input:
        data: a matrix of string (2-D numpy array)
    Output:
        None
    Raises:
        ValueError: if data is a fixed-width unicode array that is too narrow
            to hold a prefixed value. Nothing is modified in that case.
    """
    dtype = getattr(data, "dtype", None)
    if dtype is not None and dtype.kind == "U":
        # Assigning a longer string into a fixed-width unicode array silently
        # truncates it, which would corrupt the items. Check before mutating.
        capacity = dtype.itemsize // np.dtype("U1").itemsize
        needed = max(
            (len('G' + str(j) + '_') + len(data[i, j])
             for i in range(len(data))
             for j in range(len(data[i]))),
            default=0,
        )
        if needed > capacity:
            raise ValueError(
                "data dtype {} holds {} characters but {} are needed for the "
                "prefixed values; widen it first, e.g. data.astype('U{}')".format(
                    dtype, capacity, needed, needed
                )
            )
    for i in range(len(data)):
        for j in range(len(data[i])):
            data[i, j] = 'G' + str(j) + '_' + data[i, j]

In [5]:
def get_C1(data):
    """
    Purpose:
        Build the length-one candidate itemsets: one singleton per distinct
        item found anywhere in data.
    input:
        data: a matrix of string (any iterable of rows of items)
    output:
        a set of frozenset, each holding exactly one item.
    """
    return {frozenset([item]) for row in data for item in row}

In [6]:
def get_freqI(data, Ck, min_support, record):
    """
    Purpose:
        Split the candidate itemsets into frequent and unfrequent ones, and
        store the support of every candidate in record.
    input:
        data: a matrix of string (numpy array or any sequence of rows)
        Ck : set of frozenset, current candidate itemsets
        min_support: float, the minimum support
        record: a dictionary updated in place. Key type is frozenset, value
                type is float (fraction of rows containing the itemset).
    Output:
        Fq: a list of frozenset, candidates with support >= min_support.
        UnFq: a list of frozenset, the remaining candidates.
    """
    total = len(data)
    # Convert every row to a set once, not once per candidate.
    transactions = [frozenset(row) for row in data]
    Fq = []
    UnFq = []
    for item in Ck:
        hits = sum(1 for transaction in transactions if item <= transaction)
        # With no rows at all nothing is supported; avoid dividing by zero.
        support = hits / total if total else 0.0
        if support >= min_support:
            Fq.append(item)
        else:
            UnFq.append(item)
        record[item] = support
    return Fq, UnFq

In [7]:
def get_Ck(Fq):
    """
    Purpose:
        Generate the next candidate itemsets by extending every previous
        frequent itemset with one item that occurs in some previous frequent
        itemset.
    input:
        Fq: list of frozenset, previous frequent itemsets (all of the same
            length; the length of Fq[0] is taken as the current level)
    output:
        Ck: set of frozenset, candidates one item longer than the current
            level. Empty when Fq is empty.
    """
    if len(Fq) == 0:
        # Nothing to extend; previously this raised IndexError on Fq[0].
        return set()
    level = len(Fq[0])
    singles = {frozenset([elem]) for itemset in Fq for elem in itemset}
    Ck = set()
    for itemset in Fq:
        for single in singles:
            candidate = itemset | single
            # Keep only unions that actually grew by exactly one item.
            if len(candidate) == level + 1:
                Ck.add(candidate)
    return Ck

In [8]:
def eliminate_infeq(Ck, unfq):
    """
    Purpose:
        Prune the candidates: drop every candidate that contains one of the
        previous unfrequent itemsets.
    Input:
        Ck: a set of frozenset, current candidate itemsets
        unfq: a list of frozenset, previous unfrequent itemsets.
    Output:
        a set of frozenset, the candidates that contain no itemset from unfq
    """
    return {
        candidate
        for candidate in Ck
        if not any(rejected.issubset(candidate) for rejected in unfq)
    }

In [9]:
def count(fq_list, support):
    """
    Purpose:
        Print how many frequent itemsets there are of each length, followed by
        the overall total.
    Input:
        fq_list: a two dimensions list of frozenset; its last entry is skipped
                 (apriori leaves an empty list there).
        support: float, the minimum support, printed as a percentage.
    Output:
        None
    """
    print("Support is set to be {}%".format(support * 100))
    total = 0
    for length, level in enumerate(fq_list[:-1], start=1):
        total += len(level)
        print("number of length-{} frequent itemsets: {}".format(length, len(level)))
    print("number of all lengths frequent itemsets: {}".format(total))

In [10]:
def apriori(filename, support=0.5):
    """
    Purpose:
        Run Apriori frequent-itemset mining on the data stored in a file.
    Input:
        filename: String, the filename of data
        support: float, the minimum support, default 0.5
    Output:
        fq_list: a two dimensions list of frozenset. fq_list[k] holds the
                 frequent itemsets of length k + 1; the last entry is the
                 empty list that stopped the search.
        record: a dictionary. Key type is frozenset, value type is float
                (the support of every candidate that was evaluated).
    """
    data, _ = get_data(filename)
    pre_process(data)
    record = {}
    frequent, infrequent = get_freqI(data, get_C1(data), support, record)
    fq_list = [frequent]
    # Grow one level at a time until a level yields no frequent itemsets.
    while fq_list[-1]:
        candidates = eliminate_infeq(get_Ck(fq_list[-1]), infrequent)
        frequent, infrequent = get_freqI(data, candidates, support, record)
        fq_list.append(frequent)
    return fq_list, record

## Association rule

In [79]:
def rule_gen(itemset, record, min_conf, res):
    """
    Purpose:
        Generate the association rules of one itemset. Each rule is stored
        under the key (head, body) with head | body == itemset, and its
        confidence is support(itemset) / support(head).
    Input:
        itemset: frozenset, the itemset to split into rules
        record: dictionary, Key is the frozenset of itemsets, value is the
                support. Must hold itemset and every head that is examined.
        min_conf: float, the minimum confidence
        res: dictionary updated in place, Key is the (head, body) tuple of
             frozensets, value is the confidence
    Output:
        res: the same dictionary that was passed in
    """
    full_support = record[itemset]
    queue = deque([itemset])
    # Several larger heads shrink to the same smaller head; remember what was
    # already queued so each head is evaluated once instead of once per path.
    seen = {itemset}
    while queue:
        head = queue.popleft()
        conf = full_support / record[head]
        # A head below min_conf is not shrunk further: a smaller head cannot
        # have a smaller support, so its confidence cannot be higher.
        if conf >= min_conf:
            res[(head, itemset - head)] = conf
            if len(head) > 1:
                for elem in head:
                    smaller = head - {elem}
                    if smaller not in seen:
                        seen.add(smaller)
                        queue.append(smaller)
    # The start state (itemset --> nothing) is not a rule. It is absent when
    # min_conf > 1, so remove it without raising.
    res.pop((itemset, frozenset()), None)
    return res

In [78]:
def get_rule(combinations, min_conf, record, res):
    """
    Purpose:
        Generate the association rules of every itemset in combinations and
        collect them in the res dictionary.
    Input:
        combinations: a list of lists of frozenset (itemsets grouped in rows).
        min_conf: float, the minimum confidence
        record: dictionary, Key is the frozenset of itemsets, value is the support
        res: dictionary updated in place, Key is the (head, body) tuple of
             frozensets, value is the confidence
    Output:
        None
    """
    itemsets = (itemset for group in combinations for itemset in group)
    for itemset in itemsets:
        rule_gen(itemset, record, min_conf, res)

In [13]:
def item_combinations(items):
    """
    Purpose:
        Enumerate the combinations of the given items, grouped by size.
    Input:
        items: list of string. Each element in list represents an item
    Output:
        a list of sets of frozenset; entry k holds the combinations of
        k + 1 items. The trailing empty level is dropped.
    """
    levels = [get_C1([items])]
    # Keep extending until a level comes back empty.
    while levels[-1]:
        levels.append(get_Ck(list(levels[-1])))
    return levels[:-1]

In [14]:
def combination_any(items, fq_list):
    """
    Purpose:
        Select the frequent itemsets that contain at least one of the items.
    Input:
        items: list of string. Each element in list represents an item
        fq_list: a two dimension list of the frozenset of itemsets. The first
                 level (single items) and the last entry are skipped.
    Output:
        combinations: a list of lists of frozenset, one list per level kept.
    """
    wanted = frozenset(items)
    return [
        [itemset for itemset in level if not wanted.isdisjoint(itemset)]
        for level in fq_list[1:-1]
    ]

In [15]:
def combination_none(items, fq_list):
    """
    Purpose:
        Select the frequent itemsets that contain none of the items.
    Input:
        items: list of string. Each element in list represents an item
        fq_list: a two dimension list of the frozenset of itemsets. The first
                 level (single items) and the last entry are skipped.
    Output:
        combinations: a list of lists of frozenset, one list per level kept.
    """
    excluded = frozenset(items)
    return [
        [itemset for itemset in level if excluded.isdisjoint(itemset)]
        for level in fq_list[1:-1]
    ]

In [16]:
def combination_num(cmd_2, items, fq_list):
    """
    Purpose:
        Select the frequent itemsets that contain exactly cmd_2 of the items:
        for every way of choosing cmd_2 items, keep the itemsets that hold all
        of the chosen items and none of the remaining ones.
    Input:
        cmd_2: int (or numeric string), target number
        items: list of string. Each element in list represents an item
        fq_list: a two dimension list of the frozenset of itemsets.
    Output:
        combinations: a list of lists of frozenset, one list per choice.
    """
    size = int(cmd_2)
    wanted = frozenset(items)
    chosen_subsets = item_combinations(items)[size - 1]
    combinations = []
    for left_out in [wanted - chosen for chosen in chosen_subsets]:
        # Itemsets free of the left-out items, still grouped by level.
        without_left_out = combination_none(list(left_out), fq_list)
        required = wanted - left_out
        combinations.append([
            itemset
            for level in without_left_out
            for itemset in level
            if required.issubset(itemset)
        ])
    return combinations

In [17]:
def head_body_num_dic(cmd_1, cmd_2, items, rule_any_dic):
    """
    Purpose:
        Keep the rules from rule_any_dic whose HEAD or BODY contains exactly
        cmd_2 of the given items.
    Input:
        cmd_1: String, "BODY" checks the second part of the rule key; "HEAD"
               (or anything else) checks the first part
        cmd_2: String or int, the required number of items in that part
        items: list of string. Each element in list represents an item
        rule_any_dic: dictionary, Key is the (head, body) tuple of frozensets,
                      value is the confidence.
    Output:
        res: dictionary with the matching rules and their confidence.
    """
    side = 1 if cmd_1 == "BODY" else 0
    target = int(cmd_2)
    wanted = frozenset(items)
    return {
        rule: conf
        for rule, conf in rule_any_dic.items()
        if len(wanted & rule[side]) == target
    }

In [18]:
def head_body_any_dic(cmd_1, items, rule_any_dic):
    """
    Purpose:
        Keep the rules from rule_any_dic whose HEAD or BODY contains at least
        one of the given items.
    Input:
        cmd_1: String, "BODY" checks the second part of the rule key; "HEAD"
               (or anything else) checks the first part
        items: list of string. Each element in list represents an item
        rule_any_dic: dictionary, Key is the (head, body) tuple of frozensets,
                      value is the confidence.
    Output:
        res: dictionary with the matching rules and their confidence.
    """
    side = 1 if cmd_1 == "BODY" else 0
    wanted = frozenset(items)
    return {
        rule: conf
        for rule, conf in rule_any_dic.items()
        if not rule[side].isdisjoint(wanted)
    }
            

In [19]:
def head_body_none_from_rule_any(cmd_1, items, rule_any_dic):
    """
    Purpose:
        Keep the rules from rule_any_dic whose HEAD or BODY contains none of
        the given items.
    Input:
        cmd_1: String, "BODY" checks the second part of the rule key; "HEAD"
               (or anything else) checks the first part
        items: list of string. Each element in list represents an item
        rule_any_dic: dictionary, Key is the (head, body) tuple of frozensets,
                      value is the confidence.
    Output:
        res: dictionary with the matching rules and their confidence.
    """
    side = 1 if cmd_1 == "BODY" else 0
    excluded = frozenset(items)
    return {
        rule: conf
        for rule, conf in rule_any_dic.items()
        if rule[side].isdisjoint(excluded)
    }
            

In [20]:
def print_dic(dic):
    """
    Purpose:
        Print every rule of dic on its own line: the first part of the key,
        an arrow, the second part of the key, and the stored value.
    """
    for rule in dic:
        print(rule[0], "-->", rule[1], " : ", dic[rule])

In [21]:
def template1(cmd_1, cmd_2, items, min_conf, record, fq_list):
    """
    Purpose:
        Mine association rules for a template1 query: which part of the rule
        (cmd_1) must contain how many (cmd_2) of the given items.
    Input :
        cmd_1: string, "RULE", "HEAD" or "BODY"; any other value yields no rules
        cmd_2: "ANY", "NONE", or a number given as int or string
        items: list of string. Each element in list represents an item
        min_conf: float, the minimum confidence
        record: dictionary, Key is the frozenset of itemsets, value is the support
        fq_list: a two dimension list of the frozenset of itemsets.
    Output :
        res: dictionary, Key is the (head, body) tuple of frozensets, value is
             the confidence
    """
    if cmd_1 == "RULE":
        # Pick the itemsets matching the query, then split them into rules.
        if cmd_2 == "ANY":
            combinations = combination_any(items, fq_list)
        elif cmd_2 == "NONE":
            combinations = combination_none(items, fq_list)
        else:
            combinations = combination_num(cmd_2, items, fq_list)
        res = {}
        get_rule(combinations, min_conf, record, res)
        return res

    if cmd_1 not in ("HEAD", "BODY"):
        return {}

    # HEAD and BODY queries filter the rules that mention any of the items.
    rule_any_dic = template1("RULE", "ANY", items, min_conf, record, fq_list)
    if cmd_2 == "ANY":
        return head_body_any_dic(cmd_1, items, rule_any_dic)
    if cmd_2 == "NONE":
        # Rules that mention none of the items anywhere qualify as well.
        rule_none_dic = template1("RULE", "NONE", items, min_conf, record, fq_list)
        dic_1 = head_body_none_from_rule_any(cmd_1, items, rule_any_dic)
        return {**dic_1, **rule_none_dic}
    return head_body_num_dic(cmd_1, cmd_2, items, rule_any_dic)

### template2

In [22]:
def template2(cmd_1, cmd_2, min_conf, record, fq_list):
    """
    Purpose:
        Mine association rules for a template2 query. Every frequent itemset
        of length cmd_2 is used in turn as the item list of a template1 query
        with the same cmd_1 and cmd_2, and the answers are merged.
    Input :
        cmd_1: string, "RULE", "HEAD" or "BODY"; any other value prints the
               accepted values and yields no rules
        cmd_2: int or numeric string, the itemset length to start from
        min_conf: float, the minimum confidence
        record: dictionary, Key is the frozenset of itemsets, value is the support
        fq_list: a two dimension list of the frozenset of itemsets.
    Output :
        res: dictionary, Key is the (head, body) tuple of frozensets, value is
             the confidence
    """
    num = int(cmd_2)
    # A length beyond the mined levels has no frequent itemsets, hence no
    # rules. Indexing fq_list directly raised IndexError here.
    seeds = fq_list[num - 1] if num <= len(fq_list) else []
    if cmd_1 not in ("RULE", "HEAD", "BODY"):
        print("RULE|HEAD|BODY")
        return {}
    res = {}
    for seed in seeds:
        res.update(template1(cmd_1, cmd_2, list(seed), min_conf, record, fq_list))
    return res

### template3

In [23]:
def template3(cmd, *cmds):
    """
    Purpose:
        Combine the answers of two template1/template2 queries.
    Input :
        cmd: string "<t><op><t>" where each <t> is 1 or 2 (the template of the
             first and second query) and <op> is "or" (union) or "and"
             (intersection). Supported: 1or1, 1and1, 1or2, 1and2, 2or2, 2and2;
             any other value yields no rules.
        cmds: the arguments of the first query, then of the second query
              (cmd_1, cmd_2, plus items for a template1 query), followed by
              min_conf, record, fq_list.
    Output :
        res: dictionary, Key is the (head, body) tuple of frozensets, value is
             the confidence
    """
    if cmd in ("1or1", "1and1"):
        cmd1_1, cmd1_2, items1, cmd2_1, cmd2_2, items2, min_conf, record, fq_list = cmds
        first = template1(cmd1_1, cmd1_2, items1, min_conf, record, fq_list)
        second = template1(cmd2_1, cmd2_2, items2, min_conf, record, fq_list)
    elif cmd in ("1or2", "1and2"):
        cmd1_1, cmd1_2, items1, cmd2_1, cmd2_2, min_conf, record, fq_list = cmds
        first = template1(cmd1_1, cmd1_2, items1, min_conf, record, fq_list)
        second = template2(cmd2_1, cmd2_2, min_conf, record, fq_list)
    elif cmd in ("2or2", "2and2"):
        cmd1_1, cmd1_2, cmd2_1, cmd2_2, min_conf, record, fq_list = cmds
        first = template2(cmd1_1, cmd1_2, min_conf, record, fq_list)
        second = template2(cmd2_1, cmd2_2, min_conf, record, fq_list)
    else:
        return {}

    if cmd in ("1or1", "1or2", "2or2"):
        return {**first, **second}
    # "and": rules present in both answers, with the first answer's value.
    return {rule: conf for rule, conf in first.items() if rule in second}

# Part 1 Result

In [22]:
# Mine the frequent itemsets at 50% minimum support straight from the raw data file.
# NOTE(review): the cells below read fq_listNN/recordNN from pickles rather than using
# this fq_list/record - presumably the pickles hold saved runs like this one; confirm.
fq_list, record = apriori('../data/associationruletestdata.txt', 0.5)

In [24]:
# Pickle paths of the saved (fq_list, record) results, one per support percentage.
f_30, f_40, f_50, f_60, f_70 = (
    "../data/support_{}.p".format(percent) for percent in (30, 40, 50, 60, 70)
)

In [25]:
# Load the saved (fq_list, record) pair for each support level. The files are
# opened in `with` blocks so every handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(f_30, "rb") as fh:
    fq_list30, record30 = pickle.load(fh)
with open(f_40, "rb") as fh:
    fq_list40, record40 = pickle.load(fh)
with open(f_50, "rb") as fh:
    fq_list50, record50 = pickle.load(fh)
with open(f_60, "rb") as fh:
    fq_list60, record60 = pickle.load(fh)
with open(f_70, "rb") as fh:
    fq_list70, record70 = pickle.load(fh)

In [27]:
count(fq_list30, 0.3)

Support is set to be 30.0%
number of length-1 frequent itemsets: 194
number of length-2 frequent itemsets: 5323
number of length-3 frequent itemsets: 5251
number of length-4 frequent itemsets: 1463
number of length-5 frequent itemsets: 388
number of length-6 frequent itemsets: 61
number of length-7 frequent itemsets: 3
number of all lengths frequent itemsets: 12683


In [28]:
count(fq_list40, 0.4)

Support is set to be 40.0%
number of length-1 frequent itemsets: 167
number of length-2 frequent itemsets: 753
number of length-3 frequent itemsets: 149
number of length-4 frequent itemsets: 7
number of length-5 frequent itemsets: 1
number of all lengths frequent itemsets: 1077


In [29]:
count(fq_list50, 0.5)

Support is set to be 50.0%
number of length-1 frequent itemsets: 109
number of length-2 frequent itemsets: 63
number of length-3 frequent itemsets: 2
number of all lengths frequent itemsets: 174


In [30]:
count(fq_list60, 0.6)

Support is set to be 60.0%
number of length-1 frequent itemsets: 34
number of length-2 frequent itemsets: 2
number of all lengths frequent itemsets: 36


In [31]:
count(fq_list70, 0.7)

Support is set to be 70.0%
number of length-1 frequent itemsets: 7
number of all lengths frequent itemsets: 7


# Part 2 Result

In [80]:
# Reload the 50%-support results used by all Part 2 queries. The `with` block
# closes the file handle after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(f_50, "rb") as fh:
    fq_list50, record50 = pickle.load(fh)

## template1 result

In [102]:
# Rules that mention G59_Up anywhere, at 70% minimum confidence.
# NOTE(review): pre_process tags columns starting from G0, so 'G59_Up' is the 60th
# column; the recorded output is 0 rules - confirm the gene numbering is not off by one.
result11 = template1("RULE", "ANY", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result11))

0


In [82]:
result12 = template1("RULE", "NONE", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result12))

117


In [83]:
result13 = template1("RULE", 1, ['G59_Up', 'G10_Down'], 0.7,record50, fq_list50)
print(len(result13))

0


In [84]:
result14 = template1("HEAD", "ANY", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result14))

0


In [85]:
result15 = template1("HEAD", "NONE", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result15))

117


In [86]:
result16 = template1("HEAD", "1", ['G59_Up', 'G10_Down'], 0.7,record50, fq_list50)
print(len(result16))

0


In [87]:
result17 = template1("BODY", "ANY", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result17))

0


In [88]:
result18 = template1("BODY", "NONE", ['G59_Up'], 0.7,record50, fq_list50)
print(len(result18))

117


In [89]:
result19 = template1("BODY", "1", ['G59_Up', 'G10_Down'], 0.7,record50, fq_list50)
print(len(result19))

0


## template2 result

In [90]:
result21 = template2("RULE", 3, 0.7, record50, fq_list50)
print(len(result21))

9


In [91]:
result22 = template2("HEAD", 2, 0.7, record50, fq_list50)
print(len(result22))

6


In [92]:
result23 = template2("BODY", 1, 0.7, record50, fq_list50)
print(len(result23))

117


In [93]:
# Same BODY/1 query, this time on the 40%-support itemsets.
# NOTE(review): this rebinds result23 (the 50%-support answer from the previous cell)
# and its size is never displayed - consider a separate name.
result23 = template2("BODY", 1, 0.7, record40, fq_list40)

## template3 result

In [94]:
result31 = template3("1or1", "HEAD", "ANY", ['G10_Down'], "BODY", 1, ['G59_Up'], 0.7 ,record50, fq_list50)
len(result31)

0

In [95]:
result32 = template3("1and1", "HEAD", "ANY", ['G10_Down'], "BODY", 1, ['G59_Up'], 0.7 ,record50, fq_list50)
len(result32)

0

In [96]:
result33 = template3("1or2", "HEAD", "ANY", ['G10_Down'], "BODY", 2, 0.7 ,record50, fq_list50)
len(result33)

3

In [97]:
result34 = template3("1and2", "HEAD", "ANY", ['G10_Down'], "BODY", 2, 0.7 ,record50, fq_list50)
len(result34)

0

In [98]:
result35 = template3("2or2", "HEAD", 1, "BODY", 2, 0.7 ,record50, fq_list50)
len(result35)

117

In [99]:
result36 = template3("2and2", "HEAD", 1, "BODY", 2, 0.7 ,record50, fq_list50)
len(result36)

3