## Apriori Algorithm
minsup: 0.3
minconf: 0.8
minlift: 1.0
filename: 'supermarket.csv'

In [1]:
import pandas as pd
import numpy as np
import itertools

def load_data_set(filename):
    """Read a transaction CSV and return it as a list of item lists.

    The file is read without a header row, so every row is treated as one
    transaction and every cell as a potential item.

    Args:
        filename: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A list with one entry per CSV row. Each entry is the list of that
        row's cells converted to ``str``, in column order, skipping every
        cell whose string form is ``'nan'`` (how missing cells show up).
    """
    data = pd.read_csv(filename, header=None)

    # Take the underlying array once instead of re-evaluating ``data.values``
    # for every single cell inside the nested loop.
    cells = data.values

    return [
        [text for text in map(str, row) if text != 'nan']
        for row in cells
    ]
 
def Create_C1(data_set):
    """Build the candidate 1-itemsets from the transactions.

    Args:
        data_set: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A set holding one single-element ``frozenset`` per distinct item.
    """
    return {
        frozenset([item])
        for transaction in data_set
        for item in transaction
    }
 
def is_apriori(Ck_item, Lk_sub_1):
    """Check that every subset of ``Ck_item`` with one item removed is frequent.

    Args:
        Ck_item: Candidate itemset (a ``frozenset``).
        Lk_sub_1: Collection of frequent itemsets one item smaller.

    Returns:
        ``True`` when removing any single item from ``Ck_item`` yields a
        member of ``Lk_sub_1`` (vacuously ``True`` for an empty candidate),
        ``False`` otherwise.
    """
    return all(
        Ck_item - frozenset([member]) in Lk_sub_1
        for member in Ck_item
    )
 
def Create_Ck(Lk_sub_1, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined (set union) when the first ``k - 2`` items of
    their sorted forms are equal. A joined candidate is kept only if
    ``is_apriori`` accepts it against ``Lk_sub_1``.

    Args:
        Lk_sub_1: Set of frequent itemsets (``frozenset`` objects) from the
            previous level.
        k: Size of the itemsets being generated; controls the length of the
            sorted prefix that must match.

    Returns:
        The set of candidate itemsets that passed the pruning check.
    """
    # Sort every itemset exactly once and keep only the prefix used for the
    # join test, instead of re-sorting both operands for every pair.
    keyed = [(itemset, sorted(itemset)[:k - 2]) for itemset in Lk_sub_1]

    Ck = set()
    for (left, left_prefix), (right, right_prefix) in itertools.combinations(keyed, 2):
        if left_prefix != right_prefix:
            continue
        candidate = left | right
        if is_apriori(candidate, Lk_sub_1):
            Ck.add(candidate)
    return Ck
 
def Generate_Lk_By_Ck(data_set, Ck, minsup, support_data):
    """Keep the candidates from ``Ck`` whose support reaches ``minsup``.

    Support is the number of transactions containing the candidate divided
    by the total number of transactions.

    Args:
        data_set: Sequence of transactions (iterables of hashable items).
        Ck: Iterable of candidate itemsets (``frozenset`` objects).
        minsup: Minimum support a candidate needs to be kept.
        support_data: Dict updated in place with ``itemset -> support`` for
            every candidate that is kept.

    Returns:
        The set of candidates whose support is at least ``minsup``.
    """
    item_count = {}
    for transaction in data_set:
        # Build the set once per transaction so each candidate's subset test
        # does not have to re-process the raw transaction list.
        basket = set(transaction)
        for candidate in Ck:
            if candidate.issubset(basket):
                item_count[candidate] = item_count.get(candidate, 0) + 1

    # Only reached as a divisor when at least one transaction was counted,
    # so an empty data set cannot cause a division by zero here.
    data_num = float(len(data_set))

    Lk = set()
    for candidate, count in item_count.items():
        support = count / data_num
        if support >= minsup:
            Lk.add(candidate)
            support_data[candidate] = support
    return Lk

def Generate_L(data_set, minsup, max_k = 10):
    """Compute the frequent itemsets of every size from 1 up to ``max_k``.

    Args:
        data_set: Sequence of transactions (iterables of hashable items).
        minsup: Minimum support an itemset needs to count as frequent.
        max_k: Largest itemset size to consider.

    Returns:
        A tuple ``(L, support_data)``. ``L`` is a list whose entry at index
        ``k - 1`` is the set of frequent k-itemsets; sizes with no frequent
        itemsets are represented by an empty set. ``support_data`` maps each
        frequent itemset to its support.
    """
    # Maps every frequent itemset found so far to its support.
    support_data = {}

    C1 = Create_C1(data_set)
    Lk_sub_1 = Generate_Lk_By_Ck(data_set, C1, minsup, support_data)
    L = [Lk_sub_1]

    for k in range(2, max_k + 1):
        # With no frequent itemsets (or no candidates) at this point, the
        # level is necessarily empty; skip candidate generation and the full
        # pass over the data set, but still record an empty level so the
        # returned list keeps one entry per size.
        Ck = Create_Ck(Lk_sub_1, k) if Lk_sub_1 else set()
        if Ck:
            Lk_sub_1 = Generate_Lk_By_Ck(data_set, Ck, minsup, support_data)
        else:
            Lk_sub_1 = set()
        L.append(Lk_sub_1)
    return L, support_data
 
def Generate_Rule(L, support_data, minconf, minlift):
    """Derive association rules from the frequent itemsets.

    Every frequent itemset is paired with each previously visited itemset
    that is a subset of it; the subset becomes the rule's left-hand side and
    the remaining items the right-hand side.

    Args:
        L: List of sets of frequent itemsets, visited in list order.
        support_data: Dict mapping each frequent itemset to its support.
        minconf: Minimum confidence a rule needs to be kept.
        minlift: Minimum lift a rule needs to be kept. Pass ``'NULL'`` or
            ``None`` to disable the lift filter.

    Returns:
        A list of rules, each of the form
        ``[lhs, rhs, lift, confidence, support]`` with the three metrics
        rounded to two decimals.

    Raises:
        KeyError: If a left- or right-hand side is missing from
            ``support_data``.
    """
    # ``None`` is accepted next to the original 'NULL' sentinel.
    skip_lift_filter = minlift is None or minlift == 'NULL'

    rule_list = []
    # A rule's metrics are fully determined by its (lhs, rhs) pair, so this
    # key gives the same de-duplication as scanning ``rule_list``, in O(1).
    seen_rules = set()
    sub_set_list = []

    for level in L:
        for frequent_set in level:
            for sub_set in sub_set_list:
                if not sub_set.issubset(frequent_set):
                    continue

                consequent = frequent_set - sub_set
                support = support_data[frequent_set]
                conf = support / support_data[sub_set]
                lift = conf / support_data[consequent]

                passes = conf >= minconf and (skip_lift_filter or lift >= minlift)
                rule_key = (sub_set, consequent)
                if passes and rule_key not in seen_rules:
                    seen_rules.add(rule_key)
                    rule_list.append([
                        sub_set,
                        consequent,
                        round(lift, 2),
                        round(conf, 2),
                        round(support, 2),
                    ])
            # Appended only after processing, so an itemset is never paired
            # with itself.
            sub_set_list.append(frequent_set)
    return rule_list
   
def main(minsup, minconf, minlift, filename):
    """Mine association rules from a CSV of transactions and print them.

    Rules are printed as a table ordered by total number of items
    (ascending), then lift, confidence and support (all descending).

    Args:
        minsup: Minimum support for frequent itemsets.
        minconf: Minimum confidence for rules.
        minlift: Minimum lift for rules, forwarded to ``Generate_Rule``.
        filename: Location of the transaction CSV.
    """
    data_set = load_data_set(filename)
    L, support_data = Generate_L(data_set, minsup)
    rule_list = Generate_Rule(L, support_data, minconf, minlift)

    # Naming the columns in the constructor (instead of assigning
    # ``df.columns`` afterwards) keeps this working when no rule passes the
    # thresholds and the frame would otherwise have zero columns.
    df = pd.DataFrame(
        rule_list,
        columns=['Freq', 'Conseq', 'Lift', 'Confidence', 'Support'],
    )
    # Temporary sort key: number of items on both sides of the rule.
    df['Num_items'] = [len(rule[0]) + len(rule[1]) for rule in rule_list]

    sorted_df = df.sort_values(
        ['Num_items', 'Lift', 'Confidence', 'Support'],
        ascending=[True, False, False, False],
    )
    sorted_df = sorted_df.drop('Num_items', axis=1)
    print(sorted_df.reset_index(drop=True))
    
# Run the analysis with the thresholds and input file listed in the header.
main(
    minsup=0.3,
    minconf=0.8,
    minlift=1.0,
    filename='supermarket.csv',
)

                            Freq            Conseq  Lift  Confidence  Support
0                 (total = high)  (bread and cake)  1.17        0.84     0.31
1                    (margarine)  (bread and cake)  1.11        0.80     0.40
2              (fruit, biscuits)  (bread and cake)  1.17        0.84     0.33
3         (vegetables, biscuits)  (bread and cake)  1.17        0.84     0.32
4         (biscuits, milk-cream)  (bread and cake)  1.17        0.84     0.32
5       (biscuits, frozen foods)  (bread and cake)  1.16        0.83     0.33
6          (fruit, frozen foods)  (bread and cake)  1.16        0.83     0.33
7            (fruit, milk-cream)  (bread and cake)  1.15        0.83     0.36
8     (baking needs, milk-cream)  (bread and cake)  1.15        0.83     0.34
9     (frozen foods, milk-cream)  (bread and cake)  1.15        0.83     0.33
10      (baking needs, biscuits)  (bread and cake)  1.15        0.83     0.31
11      (vegetables, milk-cream)  (bread and cake)  1.14        