## Association Rule Mining on a grocery dataset. 

### Data aggregation

In [1]:
with open('grocery_dataset.csv') as ds:
    groceries_list = ds.read()

In [2]:
print (groceries_list[0:250] + "...\n... (etc.) ...") # Prints the first 250 characters only

citrus fruit,semi-finished bread,margarine,ready soups
tropical fruit,yogurt,coffee
whole milk
pip fruit,yogurt,cream cheese ,meat spreads
other vegetables,whole milk,condensed milk,long life bakery product
whole milk,butter,yogurt,rice,abrasive clea...
... (etc.) ...


## Data cleaning

In [3]:
import re
def normalize_grocery_list(dataset):
    i_list = [item_list.split(",") for item_list in dataset.split('\n')]
    itemset_list = [set(re.sub(r"[^a-zA-Z\s\/]", "", item.lower()) 
                        for item in items) for items in i_list]
    return itemset_list

grocery_itemset_list = normalize_grocery_list(groceries_list)
grocery_itemset_list

[{'citrus fruit', 'margarine', 'ready soups', 'semifinished bread'},
 {'coffee', 'tropical fruit', 'yogurt'},
 {'whole milk'},
 {'cream cheese ', 'meat spreads', 'pip fruit', 'yogurt'},
 {'condensed milk',
  'long life bakery product',
  'other vegetables',
  'whole milk'},
 {'abrasive cleaner', 'butter', 'rice', 'whole milk', 'yogurt'},
 {'x coffee powder'},
 {'bottled beer',
  'liquor appetizer',
  'other vegetables',
  'uhtmilk',
  'x coffee powder'},
 {'pot plants'},
 {'cereals', 'whole milk'},
 {'bottled water',
  'chocolate',
  'other vegetables',
  'tropical fruit',
  'white bread'},
 {'bottled water',
  'butter',
  'citrus fruit',
  'curd',
  'dishes',
  'flour',
  'tropical fruit',
  'whole milk',
  'yogurt'},
 {'beef'},
 {'frankfurter', 'soda', 'x coffee powder'},
 {'chicken', 'tropical fruit'},
 {'butter', 'fruit/vegetable juice', 'newspapers', 'sugar'},
 {'fruit/vegetable juice'},
 {'packaged fruit/vegetables'},
 {'chocolate'},
 {'specialty bar'},
 {'other vegetables'},
 {'

## Contingency table for the co-occurrence of items

In [4]:
from collections import defaultdict
from itertools  import combinations 

def create_pair_count(pair_counts, items_set):
    for a, b in combinations(items_set, 2):
        pair_counts[(a,b)]+=1
        pair_counts[(b,a)]+=1

In [5]:
def create_item_counts(item_counts, items_set):
    for item in items_set:
        item_counts[item]+=1

In [6]:
def filter_rules_by_conf(pair_counts, item_counts, threshold, min_count):
    rules = {} # (item_a, item_b) -> conf (item_a => item_b)
    for (a, b) in pair_counts:
        assert a in item_counts
        conf_ab = pair_counts[(a,b)] / item_counts[a]
        if (item_counts[a] >= min_count) and (conf_ab >= threshold):
            rules[(a,b)] = conf_ab
    return rules

In [7]:
def identify_assoc_rules(grocery_list, threshold, min_count):
    pair_counts = defaultdict(int)
    item_counts = defaultdict(int)
    for item_set in grocery_list:
        create_pair_count(pair_counts,item_set)
        create_item_counts(item_counts,item_set)
    rules = filter_rules_by_conf(pair_counts, item_counts, threshold, min_count)
    return rules

In [8]:
THRESHOLD = 0.5
MIN_COUNT = 10
grocery_rules = identify_assoc_rules(grocery_itemset_list, THRESHOLD, MIN_COUNT)

In [9]:
def gen_rule_str(a, b, val=None, val_fmt='{:.3f}', sep=" = "):
    text = "{} => {}".format(a, b)
    if val:
        text = "conf(" + text + ")"
        text += sep + val_fmt.format(val)
    return text

def print_rules(rules):
    if type(rules) is dict or type(rules) is defaultdict:
        from operator import itemgetter
        ordered_rules = sorted(rules.items(), key=itemgetter(1), reverse=True)
    else: # Assume rules is iterable
        ordered_rules = [((a, b), None) for a, b in rules]
    for (a, b), conf_ab in ordered_rules:
        print(gen_rule_str(a, b, conf_ab))

In [10]:
print_rules(grocery_rules)

conf(honey => whole milk) = 0.733
conf(frozen fruits => other vegetables) = 0.667
conf(cereals => whole milk) = 0.643
conf(rice => whole milk) = 0.613
conf(rubbing alcohol => whole milk) = 0.600
conf(cocoa drinks => whole milk) = 0.591
conf(pudding powder => whole milk) = 0.565
conf(jam => whole milk) = 0.547
conf(cream => other vegetables) = 0.538
conf(cream => sausage) = 0.538
conf(baking powder => whole milk) = 0.523
conf(x sugar => x coffee powder) = 0.522
conf(rice => other vegetables) = 0.520
conf(cooking chocolate => whole milk) = 0.520
conf(specialty cheese => other vegetables) = 0.500
conf(rubbing alcohol => butter) = 0.500
conf(rubbing alcohol => citrus fruit) = 0.500
conf(ready soups => x coffee powder) = 0.500
conf(frozen fruits => whipped/sour cream) = 0.500
