In [None]:
import math
from itertools import combinations

In [None]:
def read_file(file):
    """Read a comma-separated transaction file into a list of transactions.

    Every non-blank line of the file is one transaction whose items are
    separated by commas. Trailing commas and the line terminator are removed
    before splitting, and the items of each transaction are sorted.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One sorted list of items per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    transactions = []
    # The context manager closes the file even when reading raises.
    with open(file, "r") as data:
        for line in data:
            line = line.strip(",\n")
            # A blank (or comma-only) line carries no items; keeping it would
            # add a bogus transaction holding the empty string as an item.
            if not line:
                continue
            transactions.append(sorted(line.split(",")))
    return transactions

In [None]:
def get_itemset_count(itemset, transactions):
    """Return the number of transactions that contain every item of ``itemset``.

    Parameters
    ----------
    itemset : iterable
        Items that must all be present in a transaction for it to count.
    transactions : list of list
        The transactions to scan.

    Returns
    -------
    int
        How many transactions are supersets of ``itemset``. An empty
        ``itemset`` is contained in every transaction.
    """
    wanted = set(itemset)
    return sum(1 for transaction in transactions if wanted <= set(transaction))

In [None]:
def generate_candidate_itemsets(itemsets):
    """Build the candidate itemsets C(k+1) by joining the itemsets of L(k).

    Two itemsets of length n are joined when their first n-1 items are equal
    and the last item of the first is smaller than the last item of the
    second. The result is the first itemset followed by the last item of the
    second, so it has n+1 items.

    Parameters
    ----------
    itemsets : list of list
        Itemsets of equal length, each with its items in sorted order.

    Returns
    -------
    list of list
        The joined candidates, in the order the pairs are encountered.
        Empty when no pair can be joined.
    """
    candidate_itemsets = []
    for i, item1 in enumerate(itemsets):
        n = len(item1)
        prefix, last1 = item1[:n - 1], item1[n - 1]
        for item2 in itemsets[i + 1:]:
            if item2[:n - 1] == prefix and last1 < item2[n - 1]:
                # Append the item itself. Splitting it on "," would break up
                # an item that contains a comma and fails for non-string items.
                candidate_itemsets.append(item1 + [item2[n - 1]])
    return candidate_itemsets

In [None]:
def find_frequent_itemsets(itemsets, transactions, min_sup_count, pruned_itemsets):
    """Select the frequent itemsets from a list of candidates.

    A candidate that contains an itemset pruned at the previous level cannot
    be frequent, so it is rejected without scanning the transactions. Every
    other candidate is counted and kept when its support count reaches
    ``min_sup_count``.

    Parameters
    ----------
    itemsets : list of list
        Candidate itemsets to evaluate.
    transactions : list of list
        The transactions used to count support.
    min_sup_count : int
        Minimum number of transactions an itemset must appear in.
    pruned_itemsets : dict
        Maps a level number to the itemsets pruned at that level. The entry
        under key ``len(pruned_itemsets)`` is used as the previous level
        (``apriori`` numbers the levels 1, 2, ...). A missing entry is
        treated as "nothing pruned".

    Returns
    -------
    tuple of (list, list, list)
        ``freq_item``: the frequent candidates, in input order.
        ``sup_count``: their support counts, aligned with ``freq_item``.
        ``new_pruned_itemsets``: every rejected candidate, whether it was
        rejected by the subset check or by its support count.
    """
    freq_item = []  # newly found frequent itemsets
    sup_count = []  # support count of each itemset in freq_item
    new_pruned_itemsets = []  # candidates rejected at this level

    n = len(pruned_itemsets)
    # Convert once so the subset test below does not rebuild these sets
    # for every candidate.
    previously_pruned = [set(pruned) for pruned in pruned_itemsets.get(n, [])]

    for item in itemsets:
        item_set = set(item)
        if any(pruned <= item_set for pruned in previously_pruned):
            # A superset of an infrequent itemset is itself infrequent: skip
            # the expensive count and remember it for the next level.
            new_pruned_itemsets.append(item)
            continue
        count = get_itemset_count(item, transactions)
        if count >= min_sup_count:
            freq_item.append(item)
            sup_count.append(count)
        else:
            new_pruned_itemsets.append(item)

    return freq_item, sup_count, new_pruned_itemsets

In [None]:
def apriori(transactions, min_sup):
    """Run the Apriori algorithm and return the frequent itemsets per size.

    Parameters
    ----------
    transactions : list of list
        The transactions to mine.
    min_sup : int or float
        Minimum support as a percentage of the number of transactions.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``L``: ``L[k]`` is the list of frequent itemsets with k items.
        ``C``: ``C[k]`` is the list of candidate itemsets with k items.
        ``sup_count``: ``sup_count[k][i]`` is the support count of ``L[k][i]``.
        ``L`` and ``sup_count`` only hold levels that produced at least one
        frequent itemset; ``C`` also holds the last, unsuccessful level.

    Notes
    -----
    Prints the initial candidate list ``C[1]`` to standard output.
    """
    # In C, L, pruned_itemsets and sup_count the key is the itemset size.
    k = 1

    C = {}  # C[k]: candidate itemsets with k items
    L = {}  # L[k]: frequent itemsets with k items
    sup_count = {}  # sup_count[k]: support counts aligned with L[k]
    pruned_itemsets = {k: []}  # pruned_itemsets[k]: itemsets pruned from C[k]

    # Multiply before dividing. (min_sup / 100) * len(...) picks up float
    # error for whole-number percentages: 7% of 100 transactions evaluates to
    # 7.000000000000001 and would be rounded up to 8 instead of 7.
    min_sup_count = math.ceil(min_sup * len(transactions) / 100)

    # Collect every distinct item that occurs in the database.
    unique_items = set()
    for transaction in transactions:
        unique_items.update(transaction)

    # C[1] holds one single-item candidate per distinct item, in sorted order.
    C[k] = [[item] for item in sorted(unique_items)]
    print("C[1] : ",C[k])

    while True:
        if k > 1:
            C[k] = generate_candidate_itemsets(L[k-1])
        freq_item, item_count, new_pruned_items = find_frequent_itemsets(
            C[k], transactions, min_sup_count, pruned_itemsets
        )
        if not freq_item:
            break  # no frequent itemset at this size, so none at larger sizes
        L[k] = freq_item
        sup_count[k] = item_count
        pruned_itemsets[k] = new_pruned_items
        k += 1

    return L, C, sup_count

In [None]:
def generate_subsets(itemset):
    """Return every non-empty proper subset of ``itemset``.

    Subsets are listed by increasing size; within one size they follow the
    order produced by ``itertools.combinations``. Each subset is returned as
    a sorted list.

    Parameters
    ----------
    itemset : sequence
        The items to take subsets of.

    Returns
    -------
    list of list
        All subsets with 1 to ``len(itemset) - 1`` items. Empty when
        ``itemset`` has fewer than two items.
    """
    return [
        sorted(combo)
        for size in range(1, len(itemset))
        for combo in combinations(itemset, size)
    ]


In [None]:
def generate_association_rules(L, sup_count, min_confidence):
    """Derive the association rules that reach a minimum confidence.

    For every frequent itemset with two or more items, each non-empty proper
    subset is tried as the left-hand side of a rule whose right-hand side is
    the rest of the itemset. The confidence is the support count of the whole
    itemset divided by the support count of the left-hand side.

    Parameters
    ----------
    L : dict
        ``L[k]`` is the list of frequent itemsets with k items, for
        k = 1 .. len(L).
    sup_count : dict
        ``sup_count[k][i]`` is the support count of ``L[k][i]``.
    min_confidence : int or float
        Minimum confidence as a percentage.

    Returns
    -------
    list of str
        One string per accepted rule, formatted as
        ``<lhs set>--><rhs set> Confidence :<value>``.

    Raises
    ------
    ValueError
        If a subset of a frequent itemset is not present in ``L``.
    """
    rules = []
    for size in range(2, len(L.keys()) + 1):
        for position, itemset in enumerate(L[size]):
            itemset_support = sup_count[size][position]
            whole = set(itemset)
            for subset in generate_subsets(itemset):
                antecedent = set(subset)
                consequent = set(whole - antecedent)
                level = len(antecedent)
                # Find the antecedent among the frequent itemsets of its size
                # to look up its support count.
                where = L[level].index(sorted(list(antecedent)))
                confidence = itemset_support / sup_count[level][where]
                if confidence >= (min_confidence / 100):
                    rules.append(
                        str(antecedent) + "-->" + str(consequent)
                        + " Confidence :" + str(confidence)
                    )
    return rules

In [None]:
# Load the transactions from store_data.csv and print how many were read.
transactions = read_file("store_data.csv")
print(len(transactions))

7501


In [None]:
# Preview the first 10 transactions; read_file returns each one as a sorted
# list of items.
for i in range(10):
    print(transactions[i])

['almonds', 'antioxydant juice', 'avocado', 'cottage cheese', 'energy drink', 'frozen smoothie', 'green grapes', 'green tea', 'honey', 'low fat yogurt', 'mineral water', 'olive oil', 'salad', 'salmon', 'shrimp', 'spinach', 'tomato juice', 'vegetables mix', 'whole weat flour', 'yams']
['burgers', 'eggs', 'meatballs']
['chutney']
['avocado', 'turkey']
['energy bar', 'green tea', 'milk', 'mineral water', 'whole wheat rice']
['low fat yogurt']
['french fries', 'whole wheat pasta']
['light cream', 'shallot', 'soup']
['frozen vegetables', 'green tea', 'spaghetti']
['french fries']


# minimum support = 4%, minimum confidence = 40%

In [None]:
# min_sup is a percentage of the number of transactions (apriori divides by 100).
# L, C and sup_count are dictionaries keyed by itemset size.
min_sup = 4
L, C, sup_count = apriori(transactions, min_sup)

C[1] :  [['almonds'], ['antioxydant juice'], ['asparagus'], ['avocado'], ['babies food'], ['bacon'], ['barbecue sauce'], ['black tea'], ['blueberries'], ['body spray'], ['bramble'], ['brownies'], ['bug spray'], ['burger sauce'], ['burgers'], ['butter'], ['cake'], ['candy bars'], ['carrots'], ['cauliflower'], ['cereals'], ['champagne'], ['chicken'], ['chili'], ['chocolate'], ['chocolate bread'], ['chutney'], ['cider'], ['clothes accessories'], ['cookies'], ['cooking oil'], ['corn'], ['cottage cheese'], ['cream'], ['dessert wine'], ['eggplant'], ['eggs'], ['energy bar'], ['energy drink'], ['escalope'], ['extra dark chocolate'], ['flax seed'], ['french fries'], ['french wine'], ['fresh bread'], ['fresh tuna'], ['fromage blanc'], ['frozen smoothie'], ['frozen vegetables'], ['gluten free bar'], ['grated cheese'], ['green beans'], ['green grapes'], ['green tea'], ['ground beef'], ['gums'], ['ham'], ['hand protein bar'], ['herb & pepper'], ['honey'], ['hot dogs'], ['ketchup'], ['light cream']

In [None]:
# Print every frequent itemset next to its support count, grouped by itemset size.
for key in L.keys():
    print("L{k}".format(k=key))
    for i in range(len(L[key])):
        print(L[key][i],sup_count[key][i])

L1
['burgers'] 654
['cake'] 608
['champagne'] 351
['chicken'] 450
['chocolate'] 1229
['cookies'] 603
['cooking oil'] 383
['eggs'] 1348
['escalope'] 595
['french fries'] 1282
['fresh bread'] 323
['frozen smoothie'] 475
['frozen vegetables'] 715
['grated cheese'] 393
['green tea'] 991
['ground beef'] 737
['herb & pepper'] 371
['honey'] 356
['low fat yogurt'] 574
['milk'] 972
['mineral water'] 1788
['olive oil'] 494
['pancakes'] 713
['salmon'] 319
['shrimp'] 536
['soup'] 379
['spaghetti'] 1306
['tomatoes'] 513
['turkey'] 469
['whole wheat rice'] 439
L2
['chocolate', 'mineral water'] 395
['eggs', 'mineral water'] 382
['ground beef', 'mineral water'] 307
['milk', 'mineral water'] 360
['mineral water', 'spaghetti'] 448


In [None]:
# min_conf is a percentage; only rules whose confidence reaches it are kept.
min_conf = 40
association_rules = generate_association_rules(L, sup_count, min_conf)

In [None]:
# Report the thresholds in use, the number of rules found, and each rule.
print("Minimun support = {a}%, minimum confidence = {b}%".format(a=min_sup,b=min_conf))
print("Number of association rules is : ",len(association_rules))
for rule in association_rules:
    print(rule)

Minimun support = 4%, minimum confidence = 40%
Number of association rules is :  1
{'ground beef'}-->{'mineral water'} Confidence :0.41655359565807326


# minimum support = 0.4%, minimum confidence = 55%

In [None]:
# Re-run the mining with a lower support threshold (0.4% of the transactions).
# This overwrites L, C and sup_count from the previous run.
min_sup = 0.4
L, C, sup_count = apriori(transactions, min_sup)

C[1] :  [['almonds'], ['antioxydant juice'], ['asparagus'], ['avocado'], ['babies food'], ['bacon'], ['barbecue sauce'], ['black tea'], ['blueberries'], ['body spray'], ['bramble'], ['brownies'], ['bug spray'], ['burger sauce'], ['burgers'], ['butter'], ['cake'], ['candy bars'], ['carrots'], ['cauliflower'], ['cereals'], ['champagne'], ['chicken'], ['chili'], ['chocolate'], ['chocolate bread'], ['chutney'], ['cider'], ['clothes accessories'], ['cookies'], ['cooking oil'], ['corn'], ['cottage cheese'], ['cream'], ['dessert wine'], ['eggplant'], ['eggs'], ['energy bar'], ['energy drink'], ['escalope'], ['extra dark chocolate'], ['flax seed'], ['french fries'], ['french wine'], ['fresh bread'], ['fresh tuna'], ['fromage blanc'], ['frozen smoothie'], ['frozen vegetables'], ['gluten free bar'], ['grated cheese'], ['green beans'], ['green grapes'], ['green tea'], ['ground beef'], ['gums'], ['ham'], ['hand protein bar'], ['herb & pepper'], ['honey'], ['hot dogs'], ['ketchup'], ['light cream']

In [None]:
# Print the list of frequent itemsets found for each itemset size.
for key in L.keys():
    print(L[key])

[['almonds'], ['antioxydant juice'], ['asparagus'], ['avocado'], ['babies food'], ['bacon'], ['barbecue sauce'], ['black tea'], ['blueberries'], ['body spray'], ['brownies'], ['bug spray'], ['burger sauce'], ['burgers'], ['butter'], ['cake'], ['candy bars'], ['carrots'], ['cauliflower'], ['cereals'], ['champagne'], ['chicken'], ['chili'], ['chocolate'], ['chocolate bread'], ['chutney'], ['cider'], ['clothes accessories'], ['cookies'], ['cooking oil'], ['corn'], ['cottage cheese'], ['dessert wine'], ['eggplant'], ['eggs'], ['energy bar'], ['energy drink'], ['escalope'], ['extra dark chocolate'], ['flax seed'], ['french fries'], ['french wine'], ['fresh bread'], ['fresh tuna'], ['fromage blanc'], ['frozen smoothie'], ['frozen vegetables'], ['gluten free bar'], ['grated cheese'], ['green beans'], ['green grapes'], ['green tea'], ['ground beef'], ['gums'], ['ham'], ['hand protein bar'], ['herb & pepper'], ['honey'], ['hot dogs'], ['ketchup'], ['light cream'], ['light mayo'], ['low fat yogu

In [None]:
# Keep only rules with a confidence of at least 55%.
min_conf = 55
association_rules = generate_association_rules(L, sup_count, min_conf)

In [None]:
# Report the thresholds in use, the number of rules found, and each rule.
print("Minimun support = {a}%, minimum confidence = {b}%".format(a=min_sup,b=min_conf))
print("Number of association rules is : ",len(association_rules))
for rule in association_rules:
    print(rule)

Minimun support = 0.4%, minimum confidence = 55%
Number of association rules is :  12
{'chocolate', 'soup'}-->{'mineral water'} Confidence :0.5526315789473685
{'cooking oil', 'ground beef'}-->{'spaghetti'} Confidence :0.5714285714285714
{'cooking oil', 'pancakes'}-->{'mineral water'} Confidence :0.5932203389830508
{'frozen vegetables', 'olive oil'}-->{'mineral water'} Confidence :0.5764705882352941
{'soup', 'frozen vegetables'}-->{'mineral water'} Confidence :0.6333333333333333
{'soup', 'milk'}-->{'mineral water'} Confidence :0.5614035087719298
{'shrimp', 'olive oil'}-->{'mineral water'} Confidence :0.5573770491803278
{'soup', 'olive oil'}-->{'mineral water'} Confidence :0.582089552238806
{'olive oil', 'tomatoes'}-->{'mineral water'} Confidence :0.5740740740740741
{'soup', 'pancakes'}-->{'mineral water'} Confidence :0.6274509803921569
{'whole wheat rice', 'pancakes'}-->{'mineral water'} Confidence :0.5961538461538461
{'olive oil', 'tomatoes'}-->{'spaghetti'} Confidence :0.6111111111111

In [None]:
# Mine the two halves of the database separately, using the current min_sup.
# Both slices share the same boundary so that no transaction is left out
# (slicing with [:3750] and [3751:] skipped the transaction at index 3750).
midpoint = len(transactions) // 2
L1, C1, sup_count_1 = apriori(transactions[:midpoint], min_sup)
L2, C2, sup_count_2 = apriori(transactions[midpoint:], min_sup)

C[1] :  [['almonds'], ['antioxydant juice'], ['asparagus'], ['avocado'], ['babies food'], ['bacon'], ['barbecue sauce'], ['black tea'], ['blueberries'], ['body spray'], ['bramble'], ['brownies'], ['bug spray'], ['burger sauce'], ['burgers'], ['butter'], ['cake'], ['candy bars'], ['carrots'], ['cauliflower'], ['cereals'], ['champagne'], ['chicken'], ['chili'], ['chocolate'], ['chocolate bread'], ['chutney'], ['cider'], ['clothes accessories'], ['cookies'], ['cooking oil'], ['corn'], ['cottage cheese'], ['cream'], ['dessert wine'], ['eggplant'], ['eggs'], ['energy bar'], ['energy drink'], ['escalope'], ['extra dark chocolate'], ['flax seed'], ['french fries'], ['french wine'], ['fresh bread'], ['fresh tuna'], ['fromage blanc'], ['frozen smoothie'], ['frozen vegetables'], ['gluten free bar'], ['grated cheese'], ['green beans'], ['green grapes'], ['green tea'], ['ground beef'], ['gums'], ['ham'], ['hand protein bar'], ['herb & pepper'], ['honey'], ['hot dogs'], ['ketchup'], ['light cream']

In [None]:
# Generate the rules for each half separately, using the current min_conf.
association_rules_1 = generate_association_rules(L1, sup_count_1, min_conf)
association_rules_2 = generate_association_rules(L2, sup_count_2, min_conf)

In [None]:
# Rules found in the first part of the database.
for rules in association_rules_1:
    print(rules)

{'avocado', 'spaghetti'}-->{'milk'} Confidence :0.6451612903225806
{'brownies', 'ground beef'}-->{'mineral water'} Confidence :0.625
{'soup', 'cake'}-->{'mineral water'} Confidence :0.6
{'cereals', 'ground beef'}-->{'spaghetti'} Confidence :0.7142857142857143
{'pancakes', 'chicken'}-->{'mineral water'} Confidence :0.5581395348837209
{'soup', 'chicken'}-->{'mineral water'} Confidence :0.5517241379310345
{'chocolate', 'soup'}-->{'mineral water'} Confidence :0.5869565217391305
{'cooking oil', 'eggs'}-->{'mineral water'} Confidence :0.5714285714285714
{'cottage cheese', 'spaghetti'}-->{'mineral water'} Confidence :0.5555555555555556
{'soup', 'eggs'}-->{'mineral water'} Confidence :0.5581395348837209
{'soup', 'frozen vegetables'}-->{'milk'} Confidence :0.5625
{'frozen vegetables', 'olive oil'}-->{'mineral water'} Confidence :0.6666666666666666
{'frozen vegetables', 'salmon'}-->{'mineral water'} Confidence :0.5806451612903226
{'soup', 'frozen vegetables'}-->{'mineral water'} Confidence :0.71

In [None]:
# Rules found in the second part of the database.
for rules in association_rules_2:
    print(rules)

{'cake', 'ground beef'}-->{'mineral water'} Confidence :0.59375
{'cake', 'milk'}-->{'mineral water'} Confidence :0.5641025641025641
{'chocolate', 'chicken'}-->{'mineral water'} Confidence :0.5609756097560976
{'cooking oil', 'ground beef'}-->{'mineral water'} Confidence :0.6
{'cooking oil', 'ground beef'}-->{'spaghetti'} Confidence :0.64
{'cooking oil', 'pancakes'}-->{'mineral water'} Confidence :0.6666666666666666
{'eggs', 'grated cheese'}-->{'mineral water'} Confidence :0.5714285714285714
{'eggs', 'red wine'}-->{'spaghetti'} Confidence :0.6153846153846154
{'ground beef', 'frozen smoothie'}-->{'mineral water'} Confidence :0.5769230769230769
{'frozen vegetables', 'ground beef'}-->{'mineral water'} Confidence :0.5645161290322581
{'herb & pepper', 'grated cheese'}-->{'spaghetti'} Confidence :0.5769230769230769
{'pancakes', 'grated cheese'}-->{'mineral water'} Confidence :0.5666666666666667
{'pancakes', 'ground beef'}-->{'mineral water'} Confidence :0.603448275862069
{'soup', 'ground beef'

# minimum support = 0.4%, minimum confidence = 40%

In [None]:
# Mine the full database again with a support threshold of 0.4%.
# This overwrites L, C and sup_count.
min_sup = 0.4
L, C, sup_count = apriori(transactions, min_sup)

C[1] :  [['almonds'], ['antioxydant juice'], ['asparagus'], ['avocado'], ['babies food'], ['bacon'], ['barbecue sauce'], ['black tea'], ['blueberries'], ['body spray'], ['bramble'], ['brownies'], ['bug spray'], ['burger sauce'], ['burgers'], ['butter'], ['cake'], ['candy bars'], ['carrots'], ['cauliflower'], ['cereals'], ['champagne'], ['chicken'], ['chili'], ['chocolate'], ['chocolate bread'], ['chutney'], ['cider'], ['clothes accessories'], ['cookies'], ['cooking oil'], ['corn'], ['cottage cheese'], ['cream'], ['dessert wine'], ['eggplant'], ['eggs'], ['energy bar'], ['energy drink'], ['escalope'], ['extra dark chocolate'], ['flax seed'], ['french fries'], ['french wine'], ['fresh bread'], ['fresh tuna'], ['fromage blanc'], ['frozen smoothie'], ['frozen vegetables'], ['gluten free bar'], ['grated cheese'], ['green beans'], ['green grapes'], ['green tea'], ['ground beef'], ['gums'], ['ham'], ['hand protein bar'], ['herb & pepper'], ['honey'], ['hot dogs'], ['ketchup'], ['light cream']

In [None]:
# Keep only rules with a confidence of at least 40%.
min_conf = 40
association_rules = generate_association_rules(L, sup_count, min_conf)

In [None]:
# Report the thresholds in use, the number of rules found, and each rule.
print("Minimun support = {a}%, minimum confidence = {b}%".format(a=min_sup,b=min_conf))
print("Number of association rules is : ",len(association_rules))
for rule in association_rules:
    print(rule)

Minimun support = 0.4%, minimum confidence = 40%
Number of association rules is :  136
{'cider'}-->{'eggs'} Confidence :0.4050632911392405
{'extra dark chocolate'}-->{'mineral water'} Confidence :0.4777777777777778
{'ground beef'}-->{'mineral water'} Confidence :0.41655359565807326
{'light cream'}-->{'mineral water'} Confidence :0.4700854700854701
{'nonfat milk'}-->{'mineral water'} Confidence :0.48717948717948717
{'olive oil'}-->{'mineral water'} Confidence :0.4190283400809717
{'protein bar'}-->{'mineral water'} Confidence :0.4172661870503597
{'rice'}-->{'mineral water'} Confidence :0.41134751773049644
{'salmon'}-->{'mineral water'} Confidence :0.4012539184952978
{'soup'}-->{'mineral water'} Confidence :0.45646437994722955
{'tomato sauce'}-->{'mineral water'} Confidence :0.4056603773584906
{'tomato sauce'}-->{'spaghetti'} Confidence :0.44339622641509435
{'cake', 'burgers'}-->{'mineral water'} Confidence :0.4186046511627907
{'burgers', 'french fries'}-->{'eggs'} Confidence :0.412121212