In [207]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import defaultdict

In [208]:
# Load the basket data; later cells treat each column as an item and the value 't' as "item present".
df = pd.read_csv("test2.csv", low_memory=False)


In [209]:
# Preview the first rows to check the column layout before encoding the items.
df.head()

Unnamed: 0,milk,bread,biscuit,cornflakes,bournvita,jam,maggi,tea,coffee,cock,sugar
0,t,t,t,,,,,,,,
1,t,t,t,t,,,,,,,
2,,t,,,t,,,t,,,
3,t,t,,,,t,t,,,,
4,,,t,,,,t,t,,,


In [210]:
# Give every item (column name) a 1-based integer id, following the column order.
item_list = list(df.columns)
item_dict = {name: position for position, name in enumerate(item_list, start=1)}

item_dict

{'milk': 1,
 'bread': 2,
 'biscuit': 3,
 'cornflakes': 4,
 'bournvita': 5,
 'jam': 6,
 'maggi': 7,
 'tea': 8,
 'coffee': 9,
 'cock': 10,
 'sugar': 11}

In [211]:
# Encode each row as the set of item ids whose cell holds 't'.
transactions = [
    {item_dict[name] for name in item_dict if row[name] == 't'}
    for _, row in df.iterrows()
]

transactions

[{1, 2, 3},
 {1, 2, 3, 4},
 {2, 5, 8},
 {1, 2, 6, 7},
 {3, 7, 8},
 {2, 5, 8},
 {4, 7, 8},
 {2, 3, 7, 8},
 {2, 6, 7, 8},
 {1, 2},
 {3, 4, 9, 10},
 {3, 4, 9, 10},
 {5, 9, 11},
 {2, 9, 10},
 {2, 3, 11},
 {4, 9, 11},
 {2, 5, 11},
 {2, 9, 11},
 {2, 9, 11},
 {1, 4, 8, 9}]

In [212]:
def get_support(transactions, item_set):
    """Return the fraction of transactions that contain every item of ``item_set``.

    Parameters
    ----------
    transactions : sequence of set
        The transactions to scan; each one must be usable as the argument
        of ``item_set.issubset``.
    item_set : set
        The item set whose support is measured.

    Returns
    -------
    float
        Number of matching transactions divided by the total number of
        transactions, or ``0.0`` when ``transactions`` is empty.
    """
    total = len(transactions)
    if total == 0:
        # Nothing can be supported by an empty database; returning 0.0 avoids
        # a ZeroDivisionError in the division below.
        return 0.0

    match_count = sum(1 for transaction in transactions if item_set.issubset(transaction))
    return match_count / total

In [213]:
def get_TID_item(transactions, item):
    """Return the transaction ids (1-based positions) of the transactions containing ``item``.

    ``item`` is a set; a transaction matches when ``item`` is a subset of it.
    The ids are returned in ascending order.
    """
    return [
        tid
        for tid, basket in enumerate(transactions, start=1)
        if item.issubset(basket)
    ]

In [214]:
def self_join(frequent_item_sets_per_level, level):
    """Build the size-``level`` candidates by joining the frequent sets of the previous level.

    Parameters
    ----------
    frequent_item_sets_per_level : mapping of int -> list
        For each level, a list of ``(item_set, support)`` entries; only the
        entries stored under ``level - 1`` are read.
    level : int
        Size of the candidate item sets to generate.

    Returns
    -------
    list of set
        The distinct unions of two previous-level item sets that contain
        exactly ``level`` items, in order of first appearance.
    """
    last_level_items = frequent_item_sets_per_level[level - 1]

    current_level_candidates = list()
    # Sets are unhashable, so a frozenset copy of every accepted candidate is
    # kept here; this makes the duplicate check O(1) instead of a scan over
    # the whole candidate list for every pair.
    seen = set()

    # combinations() yields the pairs (i, j) with i < j in the same order as
    # two nested index loops would.
    for entry_i, entry_j in combinations(last_level_items, 2):
        union_set = entry_i[0].union(entry_j[0])
        if len(union_set) != level:
            continue

        key = frozenset(union_set)
        if key not in seen:
            seen.add(key)
            current_level_candidates.append(union_set)

    return current_level_candidates

In [215]:
def get_single_drop_subsets(item_set):
    """Return every subset obtained by removing exactly one element from ``item_set``.

    One subset is produced per element, in the iteration order of
    ``item_set``; ``item_set`` itself is left unmodified.
    """
    def without(dropped):
        # Work on a copy so the caller's set is never mutated.
        reduced = item_set.copy()
        reduced.remove(dropped)
        return reduced

    return [without(member) for member in item_set]

def is_valid_set(item_set, prev_level_sets):
    """Tell whether every one-element-smaller subset of ``item_set`` is in ``prev_level_sets``.

    Returns ``False`` as soon as one such subset is missing, ``True`` otherwise.
    """
    return all(
        subset in prev_level_sets
        for subset in get_single_drop_subsets(item_set)
    )

def pruning(frequent_item_sets_per_level, level, candidate_set):
    """Keep only the candidates whose one-element-smaller subsets were all frequent.

    Parameters
    ----------
    frequent_item_sets_per_level : mapping of int -> list
        For each level, a list of ``(item_set, support)`` pairs; the entries
        under ``level - 1`` are used as the reference.
    level : int
        Size of the item sets in ``candidate_set``.
    candidate_set : list of set
        Candidates to filter.

    Returns
    -------
    list of set
        The candidates accepted by ``is_valid_set``, in their original order.
    """
    if len(candidate_set) == 0:
        return list()

    prev_level_sets = [
        item_set for item_set, _ in frequent_item_sets_per_level[level - 1]
    ]

    return [
        candidate
        for candidate in candidate_set
        if is_valid_set(candidate, prev_level_sets)
    ]

In [216]:
def apriori(min_support):
    """Mine the frequent item sets of the notebook-level ``transactions`` level by level.

    Reads two notebook globals: ``transactions`` (the encoded baskets) and
    ``item_list`` (only its length is used, as the number of item ids and the
    highest level to try). Prints the levels visited and the item-id ->
    transaction-id mapping while running.

    Parameters
    ----------
    min_support : float
        Minimum support an item set needs to be kept.

    Returns
    -------
    defaultdict of int -> list
        For each level, the ``(item_set, support)`` pairs that reached
        ``min_support``.
    """
    frequent_item_sets_per_level = defaultdict(list)
    print("level : 1", end = " ")

    n_items = len(item_list)

    # Level 1: every item id is a candidate on its own.
    dict_TID = {}
    for item_id in range(1, n_items + 1):
        item_support = get_support(transactions, {item_id})
        dict_TID[item_id] = get_TID_item(transactions, {item_id})
        if item_support >= min_support:
            frequent_item_sets_per_level[1].append(({item_id}, item_support))
    print("Here is the dict_TID:")
    print(dict_TID)

    # Higher levels: join, prune, then count support of what is left.
    for level in range(2, n_items + 1):
        print(level, end = " ")
        survivors = pruning(
            frequent_item_sets_per_level,
            level,
            self_join(frequent_item_sets_per_level, level),
        )
        if not survivors:
            # No candidate left at this size, so larger sizes cannot exist either.
            break

        for candidate in survivors:
            candidate_support = get_support(transactions, candidate)
            if candidate_support >= min_support:
                frequent_item_sets_per_level[level].append((candidate, candidate_support))

    return frequent_item_sets_per_level

In [217]:
# Keep only item sets whose support (fraction of transactions containing them) is at least 0.2.
min_support = 0.2
frequent_item_sets_per_level = apriori(min_support)

level : 1 Here is the dict_TID:
{1: [1, 2, 4, 10, 20], 2: [1, 2, 3, 4, 6, 8, 9, 10, 14, 15, 17, 18, 19], 3: [1, 2, 5, 8, 11, 12, 15], 4: [2, 7, 11, 12, 16, 20], 5: [3, 6, 13, 17], 6: [4, 9], 7: [4, 5, 7, 8, 9], 8: [3, 5, 6, 7, 8, 9, 20], 9: [11, 12, 13, 14, 16, 18, 19, 20], 10: [11, 12, 14], 11: [13, 15, 16, 17, 18, 19]}
2 3 

In [218]:
# One line per level: the number of frequent item sets found at that level.
for level_sets in frequent_item_sets_per_level.values():
    print(len(level_sets))

9
7


In [219]:
# One line per level: the (item_set, support) pairs found at that level.
for level_sets in frequent_item_sets_per_level.values():
    print(level_sets)

[({1}, 0.25), ({2}, 0.65), ({3}, 0.35), ({4}, 0.3), ({5}, 0.2), ({7}, 0.25), ({8}, 0.35), ({9}, 0.4), ({11}, 0.3)]
[({1, 2}, 0.2), ({2, 3}, 0.2), ({8, 2}, 0.2), ({2, 11}, 0.2), ({9, 4}, 0.2), ({8, 7}, 0.2), ({9, 11}, 0.2)]


In [220]:
# Translate the integer-encoded frequent item sets back to item names.
#
# The global `item_list` is deliberately left untouched here: `apriori` reads
# its length, so rebinding it in this cell would make a later re-run of the
# mining cell silently find nothing.

# Invert item_dict once instead of searching the value list for every item.
id_to_name = {item_id: name for name, item_id in item_dict.items()}

item_support_dict = dict()

for level in frequent_item_sets_per_level:
    for id_set, set_support in frequent_item_sets_per_level[level]:
        # frozenset keys are hashable, so a named item set can index the dict.
        named_set = frozenset(id_to_name[item_id] for item_id in id_set)
        item_support_dict[named_set] = set_support

In [221]:
# Show the support of every frequent item set, keyed by a frozenset of item names.
item_support_dict

{frozenset({'milk'}): 0.25,
 frozenset({'bread'}): 0.65,
 frozenset({'biscuit'}): 0.35,
 frozenset({'cornflakes'}): 0.3,
 frozenset({'bournvita'}): 0.2,
 frozenset({'maggi'}): 0.25,
 frozenset({'tea'}): 0.35,
 frozenset({'coffee'}): 0.4,
 frozenset({'sugar'}): 0.3,
 frozenset({'bread', 'milk'}): 0.2,
 frozenset({'biscuit', 'bread'}): 0.2,
 frozenset({'bread', 'tea'}): 0.2,
 frozenset({'bread', 'sugar'}): 0.2,
 frozenset({'coffee', 'cornflakes'}): 0.2,
 frozenset({'maggi', 'tea'}): 0.2,
 frozenset({'coffee', 'sugar'}): 0.2}

In [222]:
def find_subset(item, item_length):
    """List the combinations of ``item`` of every size from 1 to ``item_length``.

    The result is a flat list of tuples, ordered by increasing size and,
    within a size, in the order produced by ``itertools.combinations``.
    """
    return [
        subset
        for size in range(1, item_length + 1)
        for subset in combinations(item, size)
    ]

In [223]:
def association_rules(min_confidence, support_dict):
    """Derive the rules ``A -> B`` whose confidence reaches ``min_confidence``.

    Parameters
    ----------
    min_confidence : float
        Minimum confidence a rule needs to be kept.
    support_dict : dict
        Maps each frequent item set (a frozenset) to its support. The
        antecedent of every rule is looked up in this dict as well.

    Returns
    -------
    list of tuple
        ``(A, B, confidence)`` triples, where ``A`` and ``B`` are disjoint
        non-empty frozensets whose union is one of the keys of
        ``support_dict`` and ``confidence = support(A | B) / support(A)``.
    """
    rules = list()

    for item, _ in support_dict.items():
        item_length = len(item)
        if item_length <= 1:
            # A single item cannot be split into antecedent and consequent.
            continue

        for candidate in find_subset(item, item_length):
            B = item.difference(candidate)
            if not B:
                # The candidate is the whole item set: no consequent left.
                continue

            A = frozenset(candidate)
            confidence = support_dict[A | B] / support_dict[A]
            if confidence >= min_confidence:
                rules.append((A, B, confidence))

    return rules

In [224]:
association_rules = association_rules(min_confidence = 0.6, support_dict = item_support_dict)

In [225]:
print("Number of rules: ", len(association_rules), "\n")

for rule in association_rules:
    print('{0} -> {1} <confidence: {2}>'.format(set(rule[0]), set(rule[1]), rule[2]))

Number of rules:  5 

{'milk'} -> {'bread'} <confidence: 0.8>
{'sugar'} -> {'bread'} <confidence: 0.6666666666666667>
{'cornflakes'} -> {'coffee'} <confidence: 0.6666666666666667>
{'maggi'} -> {'tea'} <confidence: 0.8>
{'sugar'} -> {'coffee'} <confidence: 0.6666666666666667>
