In [1]:
import pandas as pd
import numpy as np
import uuid
from collections import Counter
import math
from progressbar import ProgressBar, AdaptiveETA, Percentage, Bar, RotatingMarker

In [2]:
df = pd.read_hdf('../data/processed/epi_vector.h5')

In [3]:
dat = df.values[:,:-8]

In [4]:
columns = list(df)
new_vec = []
for recipe in dat:
    ind = uuid.uuid4().hex
    for i in range(len(recipe)):
        if recipe[i] != 0:
            new_vec.append([ind, columns[i]])

In [5]:
print(len(dat))
basketized = pd.DataFrame(new_vec, columns=["index", "name"]).set_index('index', drop=False)
del basketized["index"]
basketized.head()

34626


Unnamed: 0_level_0,name
index,Unnamed: 1_level_1
a54b174876bc4e80b58f4c93662cf6ce,coarsely grated sharp Cheddar
a54b174876bc4e80b58f4c93662cf6ce,garlic clove
70214f9b407f433780fa0f973342e477,almond
70214f9b407f433780fa0f973342e477,apricot jam
70214f9b407f433780fa0f973342e477,brandy


In [6]:
def generate_candidate_set(size, L):
    candidate_set = set()
    for i in range(len(L)):
        for j in range(i+1, len(L)):
            diff = L[i].difference(L[j])
            if len(diff) == 1:
                candidate_set.add(frozenset.union(L[i], L[j]))
    return candidate_set

def apriori(frame, support_per):
    candidate_sets = []
    frequent_sets = []
    thresh = math.floor(len(frame.groupby(level=0))*support_per)
    
    C1 = Counter()
    for ind, basket in frame.groupby(level=0):
        for item in basket.values:
            C1[item[0]] += 1
    
    L1 = []
    for item in C1.most_common():
        if item[1] > thresh:
            L1.append(set([item[0]]))
    print(L1)
    
    count = Counter()
    candidate = generate_candidate_set(2, L1)
    frequent = []
    size = 2
    while len(candidate) != 0:
        widgets = ['Counting C{}: '.format(size), Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', AdaptiveETA()]
        pbar = ProgressBar(widgets=widgets)
        for ind, basket in pbar(frame.groupby(level=0)):
            for c in candidate:
                if basket["name"].isin(c).sum() == len(c):
                    count[c] += 1
        for item in count.most_common():
            if item[1] > thresh:
                frequent.append(set([item[0]]))
        candidate_sets.append(candidate)
        frequent_sets.append(frequent)
        size += 1
        count = Counter()
        candidate = generate_candidate_set(size, frequent)
        frequent = []
        
    return candidate_sets, frequent_sets

def apriori_quick(frame, vec_frame, support_per):
    candidate_sets = []
    frequent_sets = []
    thresh = math.floor(len(frame.groupby(level=0))*support_per)
    ing_map = {}
    dat = vec_frame.values
    columns = list(vec_frame)
    for i in range(len(columns)-8):
        ing_map[columns[i]] = i
    
    C1 = Counter()
    for ind, basket in frame.groupby(level=0):
        for item in basket.values:
            C1[item[0]] += 1
    
    L1 = []
    for item in C1.most_common():
        if item[1] > thresh:
            L1.append(frozenset([item[0]]))
    print(L1)
    
    count = Counter()
    candidate = generate_candidate_set(2, L1)
    frequent = []
    size = 2
    while len(candidate) != 0:
        widgets = ['Counting C{}: '.format(size), Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', AdaptiveETA()]
        pbar = ProgressBar(widgets=widgets)
        for c in pbar(candidate):
            check_vec = []
            for ing in c:
                check_vec.append(ing_map[ing])
            for recipe in dat:
                if np.count_nonzero(recipe[check_vec]) == len(check_vec):
                    count[c] += 1
        for item in count.most_common():
            if item[1] > thresh:
                frequent.append(item[0])
        candidate_sets.append(candidate)
        frequent_sets.append(frequent)
        size += 1
        count = Counter()
        candidate = generate_candidate_set(size, frequent)
        frequent = []
        
    return candidate_sets, frequent_sets

In [7]:
c_sets, f_sets = apriori_quick(basketized, df, 0.01)

Counting C2:   0% |                                            | ETA:  --:--:--

[frozenset({'salt'}), frozenset({'olive oil'}), frozenset({'sugar'}), frozenset({'egg'}), frozenset({'garlic'}), frozenset({'unsalted butter'}), frozenset({'onion'}), frozenset({'water'}), frozenset({'lemon juice'}), frozenset({'black pepper'}), frozenset({'all - purpose flour'}), frozenset({'vegetable oil'}), frozenset({'unsalte butter'}), frozenset({'milk'}), frozenset({'butter'}), frozenset({'vanilla extract'}), frozenset({'ginger'}), frozenset({'shallot'}), frozenset({'chicken broth'}), frozenset({'garlic clove'}), frozenset({'cinnamon'}), frozenset({'thyme'}), frozenset({'carrot'}), frozenset({'baking powder'}), frozenset({'lime juice'}), frozenset({'tomato'}), frozenset({'all purpose flour'}), frozenset({'parsley'}), frozenset({'heavy cream'}), frozenset({'whip cream'}), frozenset({'red onion'}), frozenset({'cilantro'}), frozenset({'dry white wine'}), frozenset({'cumin'}), frozenset({'baking soda'}), frozenset({'potato'}), frozenset({'honey'}), frozenset({'scallion'}), frozenset(

Counting C2: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:13:43
Counting C3: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:04:59
Counting C4: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:55
Counting C5: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:11
Counting C6: 100% ||||||||||||||||||||||||||||||||||||||||||||||| Time: 0:00:00


In [8]:
for f in f_sets:
    print(f)

[frozenset({'salt', 'egg'}), frozenset({'sugar', 'egg'}), frozenset({'garlic', 'olive oil'}), frozenset({'salt', 'sugar'}), frozenset({'egg', 'unsalted butter'}), frozenset({'salt', 'unsalted butter'}), frozenset({'sugar', 'unsalted butter'}), frozenset({'onion', 'garlic'}), frozenset({'onion', 'olive oil'}), frozenset({'salt', 'black pepper'}), frozenset({'salt', 'olive oil'}), frozenset({'salt', 'water'}), frozenset({'sugar', 'water'}), frozenset({'salt', 'all - purpose flour'}), frozenset({'egg', 'all - purpose flour'}), frozenset({'egg', 'vanilla extract'}), frozenset({'salt', 'garlic'}), frozenset({'sugar', 'vanilla extract'}), frozenset({'egg', 'milk'}), frozenset({'lemon juice', 'olive oil'}), frozenset({'egg', 'baking powder'}), frozenset({'salt', 'onion'}), frozenset({'salt', 'baking powder'}), frozenset({'unsalted butter', 'all - purpose flour'}), frozenset({'sugar', 'all - purpose flour'}), frozenset({'unsalted butter', 'vanilla extract'}), frozenset({'black pepper', 'olive 

In [9]:
dfs = []
for f in f_sets:
    freq_vec = []
    for s in f:
        insert = list(s)
        insert.sort()
        freq_vec.append(insert)
    new_df = pd.DataFrame(freq_vec, columns=["Item {}".format(x) for x in range(1, len(insert) + 1)])
    dfs.append(new_df)

In [12]:
dfs[0].head(10)

Unnamed: 0,Item 1,Item 2
0,egg,salt
1,egg,sugar
2,garlic,olive oil
3,salt,sugar
4,egg,unsalted butter
5,salt,unsalted butter
6,sugar,unsalted butter
7,garlic,onion
8,olive oil,onion
9,black pepper,salt


In [13]:
dfs[1].head(10)

Unnamed: 0,Item 1,Item 2,Item 3
0,egg,salt,sugar
1,egg,salt,unsalted butter
2,egg,sugar,unsalted butter
3,salt,sugar,unsalted butter
4,egg,sugar,vanilla extract
5,all - purpose flour,egg,salt
6,baking powder,egg,salt
7,garlic,olive oil,onion
8,egg,unsalted butter,vanilla extract
9,baking powder,egg,sugar


In [14]:
dfs[2].head(10)

Unnamed: 0,Item 1,Item 2,Item 3,Item 4
0,egg,salt,sugar,unsalted butter
1,baking powder,egg,salt,sugar
2,egg,sugar,unsalted butter,vanilla extract
3,egg,salt,sugar,vanilla extract
4,all - purpose flour,egg,salt,unsalted butter
5,baking powder,egg,salt,unsalted butter
6,all - purpose flour,egg,salt,sugar
7,egg,salt,unsalted butter,vanilla extract
8,baking powder,egg,sugar,unsalted butter
9,salt,sugar,unsalted butter,vanilla extract


In [16]:
dfs[3].head(10)

Unnamed: 0,Item 1,Item 2,Item 3,Item 4,Item 5
0,egg,salt,sugar,unsalted butter,vanilla extract
1,baking powder,egg,salt,sugar,unsalted butter
2,all - purpose flour,egg,salt,sugar,unsalted butter
3,baking powder,egg,salt,sugar,vanilla extract
4,all purpose flour,egg,salt,sugar,unsalted butter
5,all - purpose flour,baking powder,egg,salt,sugar
