In [1]:
import pandas as pd
import time
import pprint
import tqdm

pd.options.display.max_columns = 100

In [2]:
# load data
data = pd.read_csv('./Market_Basket_Optimisation.csv', header=None)

In [3]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,


In [4]:
# convert data to lower cases
for col in data.columns:
    data[col] = data[col].str.lower()
    
# fill nan
data = data.fillna(-1)

In [5]:
def rule_apriori():
    from efficient_apriori import apriori
    pp = pprint.PrettyPrinter()
    start = time.time()
    # extract transactions from dataframe
    transactions = []
    for i, r in data.iterrows():
        temp_set = set()
        for col in data.columns:
            if r[col] != -1:
                temp_set.add(r[col])
        if len(temp_set) > 0:
            transactions.append(temp_set)

    # print(transactions)
    # mine frequent item sets and frequent correlative rules
    itemset, rules = apriori(transactions, min_support=0.03, min_confidence=0.3)

    print('frequent items')
    pp.pprint(itemset)
    print('correlative rules')
    pp.pprint(rules)

    end = time.time()
    print('time spent', end - start)

In [6]:
def rule_mlxtend():
    
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pp = pprint.PrettyPrinter()
    
    start = time.time()
    
    # reconstruct columns
    items = set()
    for i, r in data.iterrows():
        for col in data.columns:
            if r[col] != -1:
                items.add(r[col])
                
    hot_encoded_df = pd.DataFrame(columns = items)
    
    # copy and convert data into new format
    for i, r in tqdm.tqdm(data.iterrows()):
        temp_set = set()
        temp_dict = dict()
        for col in data.columns:
            if r[col] != -1:
                temp_set.add(r[col])
        for item in items:
            if item in temp_set:
                temp_dict[item] = [1]
            else:
                temp_dict[item] = [0]
        hot_encoded_df = hot_encoded_df.append(pd.DataFrame(temp_dict), ignore_index = True)
    
    # apply apriori to extract frquent item sets and correlative rules
    frequent_itemsets = apriori(hot_encoded_df, min_support=0.02, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.3)
    print("requent item sets：")
    pp.pprint(frequent_itemsets)
    print("correlative rules:")
    pp.pprint(rules[(rules['lift'] >= 1) & (rules['confidence'] >= 0.3)].sort_values(by=['lift'], ascending = False))

    end = time.time()
    print("time spent: ", end - start)

In [7]:
rule_apriori()
print('-'*100)
rule_mlxtend()

frequent items
{1: {('avocado',): 250,
     ('brownies',): 253,
     ('burgers',): 654,
     ('butter',): 226,
     ('cake',): 608,
     ('champagne',): 351,
     ('chicken',): 450,
     ('chocolate',): 1229,
     ('cookies',): 603,
     ('cooking oil',): 383,
     ('cottage cheese',): 239,
     ('eggs',): 1348,
     ('escalope',): 595,
     ('french fries',): 1282,
     ('fresh bread',): 323,
     ('frozen smoothie',): 475,
     ('frozen vegetables',): 715,
     ('grated cheese',): 393,
     ('green tea',): 991,
     ('ground beef',): 737,
     ('herb & pepper',): 371,
     ('honey',): 356,
     ('hot dogs',): 243,
     ('low fat yogurt',): 574,
     ('milk',): 972,
     ('mineral water',): 1788,
     ('olive oil',): 494,
     ('pancakes',): 713,
     ('salmon',): 319,
     ('shrimp',): 536,
     ('soup',): 379,
     ('spaghetti',): 1306,
     ('tomato juice',): 228,
     ('tomatoes',): 513,
     ('turkey',): 469,
     ('whole wheat rice',): 439},
 2: {('chocolate', 'eggs'): 249,
    

7501it [00:54, 136.55it/s]


requent item sets：
       support                        itemsets
0     0.062525                        (turkey)
1    0.0265298                        (pepper)
2     0.059992                       (chicken)
3    0.0270631                    (energy bar)
4    0.0257299                       (cereals)
..         ...                             ...
98   0.0229303  (frozen vegetables, chocolate)
99   0.0343954       (french fries, chocolate)
100  0.0234635          (chocolate, green tea)
101  0.0201306        (french fries, pancakes)
102  0.0285295       (french fries, green tea)

[103 rows x 2 columns]
correlative rules:
            antecedents      consequents  antecedent support  \
25        (ground beef)      (spaghetti)            0.098254   
64          (olive oil)      (spaghetti)            0.065858   
41               (soup)  (mineral water)            0.050527   
7             (burgers)           (eggs)            0.087188   
34          (olive oil)  (mineral water)            0.